使用 BeautifulSoup 从 JavaScript 中提取指定键的数据并转换为 JSON 对象

but5z9lq  于 2021-07-14  发布在  Java
关注(0)|答案(1)|浏览(360)

我正在尝试使用 Python 和 BeautifulSoup,把网页 JavaScript 中的一组图像数组转换成 JSON。我试过很多方法,但都会出错。
我在网页上的js代码:

<script type="text/javascript">
P.when('A').register("ImageBlockATF", function(A){
var data = {
'colorImages': { 'initial': [{"hiRes":"https://images-na.ssl-images-amazon.com/images/I/61mw5BDEYoL._AC_SL1003_.jpg",
"thumb":"https://images-na.ssl-images-amazon.com/images/I/41lv4ReBL4L._AC_US40_.jpg",
"large":"https://images-na.ssl-images-amazon.com/images/I/41lv4ReBL4L._AC_.jpg",
"main":{"https://images-na.ssl-images-amazon.com/images/I/61mw5BDEYoL._AC_SY355_.jpg":[355,355],
"https://images-na.ssl-images-amazon.com/images/I/61mw5BDEYoL._AC_SY450_.jpg":[450,450],
"https://images-na.ssl-images-amazon.com/images/I/61mw5BDEYoL._AC_SX425_.jpg":[425,425],
"variant":"MAIN","lowRes":null},{"hiRes":"https://images-na.ssl-images-amazon.com/images/I/61kOw5lC%2B%2BL._AC_SL1005_.jpg","thumb":"https://images-na.ssl-images-amazon.com/images/I/41shdN1aAoL._AC_US40_.jpg","large":"https://images-na.ssl-images-amazon.com/images/I/41shdN1aAoL._AC_.jpg","main":{"https://images-na.ssl-images-amazon.com/images/I/61kOw5lC%2B%2BL._AC_SY355_.jpg":[355,355],"https://images-na.ssl-images-amazon.com/images/I/61kOw5lC%2B%2BL._AC_SY450_.jpg":[450,450],"https://images-na.ssl-images-amazon.com/images/I/61kOw5lC%2B%2BL._AC_SX425_.jpg":[425,425],"https://images-na.ssl-images-amazon.com/images/I/61kOw5lC%2B%2BL._AC_SX466_.jpg":[466,466],"https://images-na.ssl-images-amazon.com/images/I/61kOw5lC%2B%2BL._AC_SX522_.jpg":[522,522],"https://images-na.ssl-images-amazon.com/images/I/61kOw5lC%2B%2BL._AC_SX569_.jpg":[569,569],"https://images-na.ssl-images-amazon.com/images/I/61kOw5lC%2B%2BL._AC_SX679_.jpg":[679,679]},"variant":"PT01","lowRes":null},{"hiRes":"https://images-na.ssl-images-amazon.com/images/I/511019WE7xL._AC_SL1005_.jpg","thumb":"https://images-na.ssl-images-amazon.com/images/I/41pt8OOHsaL._AC_US40_.jpg","large":"https://images-na.ssl-images-amazon.com/images/I/41pt8OOHsaL._AC_.jpg","main":{"https://images-na.ssl-images-amazon.com/images/I/511019WE7xL._AC_SY355_.jpg":[355,355],"https://images-na.ssl-images-amazon.com/images/I/511019WE7xL._AC_SY450_.jpg":[450,450],"https://images-na.ssl-images-amazon.com/images/I/511019WE7xL._AC_SX425_.jpg":[425,425],"https://images-na.ssl-images-amazon.com/images/I/511019WE7xL._AC_SX466_.jpg":[466,466],"https://images-na.ssl-images-amazon.com/images/I/511019WE7xL._AC_SX522_.jpg":[522,522],"https://images-na.ssl-images-amazon.com/images/I/511019WE7xL._AC_SX569_.jpg":[569,569],"https://images-na.ssl-images-amazon.com/images/I/511019WE7xL._AC_SX679_.jpg":[679,679]},"variant":"PT02","lowRes":null}]},
'colorToAsin': {'initial': {}},

'airyConfig' :A.$.parseJSON('{"jsUrl":"https://images-na.ssl-images-amazon.com/images/G/01/vap/video/airy2/prod/2.0.1460.0/js/airy.skin._CB485981857_.js","cssUrl":"https://images-na.ssl-images-amazon.com/images/G/01/vap/video/airy2/prod/2.0.1460.0/css/beacon._CB485971591_.css","swfUrl":"https://images-na.ssl-images-amazon.com/images/G/01/vap/video/airy2/prod/2.0.1460.0/flash/AiryBasicRenderer._CB485925577_.swf","foresterMetadataParams":{"marketplaceId":"A2VIGQ35RCS4UG","method":"Kitchen.ImageBlock","requestId":"4MGH16D6R7WCR018779W","session":"259-8488476-1037262","client":"Dpx"}}')

};
A.trigger('P.AboveTheFold'); // trigger ATF event.
return data;
});
</script>

我想把键 'colorImages' 对应的数据提取到一个 JSON 对象中。我的目标是把所有图像都放进这个 JSON 对象,以便后续使用。
我的代码(我尝试的):

# Asker's attempt: fetch the product page, find the <script> that defines
# `var data = {...}` and parse the 'colorImages' value as JSON.
url = "https://www.amazon.ae/DubayVintage-Astronaut-Figurine-Spaceman-Sculpture/dp/B08373YYCM/ref=sr_1_1?dchild=1&keywords=B08373YYCM&qid=1619498604&sr=8-1"
# NOTE(review): getResponse and UserAgent are not defined in this snippet;
# presumably getResponse returns a BeautifulSoup object -- confirm.
soup_main = getResponse(url, UserAgent())

# This pattern expects "var data = { 'colorImages':" on a single line with
# single spaces. In the page source shown in the question there is a newline
# between "{" and "'colorImages'", so the pattern cannot match that text.
# Two further problems even if it did match: "." does not cross newlines
# without re.DOTALL, and the non-greedy "\{.*?\}" stops at the first "}",
# which would cut the nested object short.
pattern = re.compile(r"var data = { 'colorImages':(\{.*?\})")
# find() returns None when no <script> satisfies the filter.
script = soup_main.find("script", text=pattern)

# With script being None, `script.text` raises the AttributeError shown in
# the traceback.
data = pattern.search(script.text).group(1)
data = json.loads(data)
print(data)

错误(我得到的):

Traceback (most recent call last):
  File "/home/dobuyme/Desktop/Sharaf DG/scrap.py", line 51, in <module>
    data = pattern.search(script.text).group(1)
AttributeError: 'NoneType' object has no attribute 'text'

我需要获取“colorImages”键下的图像链接,如下所示:

[{"hiRes":"https://images-na.ssl-images-amazon.com/images/I/61mw5BDEYoL._AC_SL1003_.jpg",
"thumb":"https://images-na.ssl-images-amazon.com/images/I/41lv4ReBL4L._AC_US40_.jpg",
"large":"https://images-na.ssl-images-amazon.com/images/I/41lv4ReBL4L._AC_.jpg",
"main":{"https://images-na.ssl-images-amazon.com/images/I/61mw5BDEYoL._AC_SY355_.jpg":[355,355],
"https://images-na.ssl-images-amazon.com/images/I/61mw5BDEYoL._AC_SY450_.jpg":[450,450],
"https://images-na.ssl-images-amazon.com/images/I/61mw5BDEYoL._AC_SX425_.jpg":[425,425],
"variant":"MAIN","lowRes":null},{"hiRes":"https://images-na.ssl-images-amazon.com/images/I/61kOw5lC%2B%2BL._AC_SL1005_.jpg","thumb":"https://images-na.ssl-images-amazon.com/images/I/41shdN1aAoL._AC_US40_.jpg","large":"https://images-na.ssl-images-amazon.com/images/I/41shdN1aAoL._AC_.jpg","main":{"https://images-na.ssl-images-amazon.com/images/I/61kOw5lC%2B%2BL._AC_SY355_.jpg":[355,355],"https://images-na.ssl-images-amazon.com/images/I/61kOw5lC%2B%2BL._AC_SY450_.jpg":[450,450],"https://images-na.ssl-images-amazon.com/images/I/61kOw5lC%2B%2BL._AC_SX425_.jpg":[425,425],"https://images-na.ssl-images-amazon.com/images/I/61kOw5lC%2B%2BL._AC_SX466_.jpg":[466,466],"https://images-na.ssl-images-amazon.com/images/I/61kOw5lC%2B%2BL._AC_SX522_.jpg":[522,522],"https://images-na.ssl-images-amazon.com/images/I/61kOw5lC%2B%2BL._AC_SX569_.jpg":[569,569],"https://images-na.ssl-images-amazon.com/images/I/61kOw5lC%2B%2BL._AC_SX679_.jpg":[679,679]},"variant":"PT01","lowRes":null},{"hiRes":"https://images-na.ssl-images-amazon.com/images/I/511019WE7xL._AC_SL1005_.jpg","thumb":"https://images-na.ssl-images-amazon.com/images/I/41pt8OOHsaL._AC_US40_.jpg","large":"https://images-na.ssl-images-amazon.com/images/I/41pt8OOHsaL._AC_.jpg","main":{"https://images-na.ssl-images-amazon.com/images/I/511019WE7xL._AC_SY355_.jpg":[355,355],"https://images-na.ssl-images-amazon.com/images/I/511019WE7xL._AC_SY450_.jpg":[450,450],"https://images-na.ssl-images-amazon.com/images/I/511019WE7xL._AC_SX425_.jpg":[425,425],"https://images-na.ssl-images-amazon.com/images/I/511019WE7xL._AC_SX466_.jpg":[466,466],"https://images-na.ssl-images-amazon.com/images/I/511019WE7xL._AC_SX522_.jpg":[522,522],"https://images-na.ssl-images-amazon.com/images/I/511019WE7xL._AC_SX569_.jpg":[569,569],"https://images-na.ssl-images-amazon.com/images/I/511019WE7xL._AC_SX679_.jpg":[679,679]},"variant":"PT02","lowRes":null}]},
vhmi4jdf

vhmi4jdf1#

首先,你的正则表达式实际上并不起作用。其次,您可能会得到一个空的响应,所以一定要添加 user-agent 请求标头。
最后,脚本中的字符串需要先做一些处理,才能安全地交给 json.loads 之类的 JSON 解析器。
以下是我的看法:

import json
import re

import requests
from bs4 import BeautifulSoup

# Browser-like User-Agent; without one the site may return an empty page.
headers = {
    "user-agent": "Mozilla/5.0 (X11; Linux x86_64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/90.0.4430.85 Safari/537.36",
}

url = "https://www.amazon.ae/DubayVintage-Astronaut-Figurine-Spaceman-Sculpture/dp/B08373YYCM/ref=sr_1_1?dchild=1&keywords=B08373YYCM&qid=1619498604&sr=8-1"

# Matches the JavaScript prefix  'colorImages': { 'initial':  (either quote
# style, any whitespace) and ends exactly where the JSON array begins.
_COLOR_IMAGES_RE = re.compile(
    r"""['"]colorImages['"]\s*:\s*\{\s*['"]initial['"]\s*:\s*"""
)


def extract_color_images(script_text):
    """Return the ``colorImages.initial`` list embedded in a script body.

    The surrounding ``var data = {...}`` object is JavaScript, not JSON (it
    uses single-quoted keys), but the array stored under ``'initial'`` is
    plain JSON. Only that array is decoded, so the text is never rewritten
    and values containing spaces or apostrophes are preserved.

    Args:
        script_text: Text content of a ``<script>`` tag, or ``None``.

    Returns:
        The decoded list of image dicts, or ``None`` when ``script_text`` is
        empty/``None`` or does not contain a ``colorImages`` block.

    Raises:
        json.JSONDecodeError: If the block is found but the array that
            follows it is not valid JSON.
    """
    if not script_text:
        return None
    match = _COLOR_IMAGES_RE.search(script_text)
    if match is None:
        return None
    # raw_decode parses one JSON value starting at the given index and
    # ignores whatever JavaScript follows it.
    images, _end = json.JSONDecoder().raw_decode(script_text, match.end())
    return images


def main():
    """Download the product page and print every colorImages list found."""
    # A timeout keeps the script from hanging on an unresponsive server.
    response = requests.get(url, headers=headers, timeout=30)
    soup = BeautifulSoup(response.text, "lxml")
    for tag in soup.find_all("script", {"type": "text/javascript"}):
        # tag.string is None for empty or externally-sourced scripts; the
        # helper treats that as "nothing to extract".
        images = extract_color_images(tag.string)
        if images is not None:
            print(images)


if __name__ == "__main__":
    main()

输出:

[{'hiRes': 'https://images-na.ssl-images-amazon.com/images/I/61mw5BDEYoL._AC_SL1003_.jpg', 'thumb': 'https://images-na.ssl-images-amazon.com/images/I/41lv4ReBL4L._AC_US40_.jpg', 'large': 'https://images-na.ssl-images-amazon.com/images/I/41lv4ReBL4L._AC_.jpg', 'main': {'https://images-na.ssl-images-amazon.com/images/I/61mw5BDEYoL._AC_SY355_.jpg': [355, 355], 'https://images-na.ssl-images-amazon.com/images/I/61mw5BDEYoL._AC_SY450_.jpg': [450, 450], 'https://images-na.ssl-images-amazon.com/images/I/61mw5BDEYoL._AC_SX425_.jpg': [425, 425], 'https://images-na.ssl-images-amazon.com/images/I/61mw5BDEYoL._AC_SX466_.jpg': [466, 466], 'https://images-na.ssl-images-amazon.com/images/I/61mw5BDEYoL._AC_SX522_.jpg': [522, 522], 'https://images-na.ssl-images-amazon.com/images/I/61mw5BDEYoL._AC_SX569_.jpg': [569, 569], 'https://images-na.ssl-images-amazon.com/images/I/61mw5BDEYoL._AC_SX679_.jpg': [679, 679]}, 'variant': 'MAIN', 'lowRes': None}, {'hiRes': 'https://images-na.ssl-images-amazon.com/images/I/61kOw5lC%2B%2BL._AC_SL1005_.jpg', 'thumb': 'https://images-na.ssl-images-amazon.com/images/I/41shdN1aAoL._AC_US40_.jpg', 'large': 'https://images-na.ssl-images-amazon.com/images/I/41shdN1aAoL._AC_.jpg', 'main': {'https://images-na.ssl-images-amazon.com/images/I/61kOw5lC%2B%2BL._AC_SY355_.jpg': [355, 355], 'https://images-na.ssl-images-amazon.com/images/I/61kOw5lC%2B%2BL._AC_SY450_.jpg': [450, 450], 'https://images-na.ssl-images-amazon.com/images/I/61kOw5lC%2B%2BL._AC_SX425_.jpg': [425, 425], 'https://images-na.ssl-images-amazon.com/images/I/61kOw5lC%2B%2BL._AC_SX466_.jpg': [466, 466], 'https://images-na.ssl-images-amazon.com/images/I/61kOw5lC%2B%2BL._AC_SX522_.jpg': [522, 522], 'https://images-na.ssl-images-amazon.com/images/I/61kOw5lC%2B%2BL._AC_SX569_.jpg': [569, 569], 'https://images-na.ssl-images-amazon.com/images/I/61kOw5lC%2B%2BL._AC_SX679_.jpg': [679, 679]}, 'variant': 'PT01', 'lowRes': None}, {'hiRes': 
'https://images-na.ssl-images-amazon.com/images/I/511019WE7xL._AC_SL1005_.jpg', 'thumb': 'https://images-na.ssl-images-amazon.com/images/I/41pt8OOHsaL._AC_US40_.jpg', 'large': 'https://images-na.ssl-images-amazon.com/images/I/41pt8OOHsaL._AC_.jpg', 'main': {'https://images-na.ssl-images-amazon.com/images/I/511019WE7xL._AC_SY355_.jpg': [355, 355], 'https://images-na.ssl-images-amazon.com/images/I/511019WE7xL._AC_SY450_.jpg': [450, 450], 'https://images-na.ssl-images-amazon.com/images/I/511019WE7xL._AC_SX425_.jpg': [425, 425], 'https://images-na.ssl-images-amazon.com/images/I/511019WE7xL._AC_SX466_.jpg': [466, 466], 'https://images-na.ssl-images-amazon.com/images/I/511019WE7xL._AC_SX522_.jpg': [522, 522], 'https://images-na.ssl-images-amazon.com/images/I/511019WE7xL._AC_SX569_.jpg': [569, 569], 'https://images-na.ssl-images-amazon.com/images/I/511019WE7xL._AC_SX679_.jpg': [679, 679]}, 'variant': 'PT02', 'lowRes': None}]

相关问题