Python:尝试创建 Python 脚本以返回 Tonie 的时长等信息

2q5ifsrm  于 5个月前  发布在  Python
关注(0)|答案(1)|浏览(49)

我想创建一个脚本,返回 https://tonies.com/en-gb/tonies/ 上所有 Tonie 的时长。我还想返回每个 Tonie 的价格,但一直没有成功。我也尝试过用 Selenium 编写脚本,但卡在了接受 cookie 这一步,因为那个弹窗位于 Shadow DOM 中。我觉得自己可能把问题想得太复杂了。我是编程和 Python 新手,任何建议都非常感谢。目前的脚本似乎只抓取到了前 21 个商品。

import re
    import requests
    from bs4 import BeautifulSoup

    def get_tonie_info(tonie_url, timeout=10):
        """Fetch one product page and collect the ``runTime`` values in its scripts.

        Every ``<script>`` tag of the page is inspected; a tag whose text
        contains ``"runTime": <digits>`` contributes those numbers. If several
        tags match, the values from the last matching tag win.

        Args:
            tonie_url: Absolute URL of the product page to download.
            timeout: Seconds to wait for the server before giving up. Without
                a timeout a stalled connection would block the scrape forever.

        Returns:
            A dict with the keys ``'url'`` (the URL passed in) and
            ``'durations'`` (a list of ints, empty when nothing was found).
        """
        response = requests.get(tonie_url, timeout=timeout)
        soup = BeautifulSoup(response.text, 'html.parser')

        tonie_info = {'url': tonie_url, 'durations': []}

        for script_tag in soup.find_all('script'):
            # .string is None for empty tags and tags with several children.
            script_content = script_tag.string
            if not script_content or 'runTime' not in script_content:
                continue

            matches = re.findall(r'"runTime":\s*(\d+)', script_content)
            if matches:
                tonie_info['durations'] = list(map(int, matches))

        return tonie_info

   def scrape_tonies(page_number=9, timeout=10):
       """Scrape one listing page and return duration info for each linked product.

       Args:
           page_number: Listing page appended to the ``?page=`` query string.
               Defaults to 9, the page this function was previously
               hard-coded to.
           timeout: Seconds to wait for the listing request before giving up,
               so a stalled connection cannot hang the script.

       Returns:
           A list of dicts as returned by ``get_tonie_info``, each extended
           with ``'name'`` (the link text) and ``'duration'`` (the last
           collected value). Products without any duration are reported on
           stdout and left out of the list.
       """
       base_url = "https://tonies.com/en-gb/tonies/?page="
       current_url = base_url + str(page_number)
       response = requests.get(current_url, timeout=timeout)
       soup = BeautifulSoup(response.text, 'html.parser')

       # NOTE(review): this class string looks auto-generated and may change
       # between site builds; confirm it still matches the product links.
       tonie_links = soup.find_all('a', class_='View__StretchedLink-sc-5t9da0-0 ivnTIu')

       all_tonie_info = []
       for tonie_link in tonie_links:
           tonie_url = "https://tonies.com" + tonie_link['href']
           tonie_info = get_tonie_info(tonie_url)

           if not tonie_info['durations']:
               print(f"Could not retrieve information for {tonie_url}")
               continue

           tonie_info['name'] = tonie_link.text.strip()
           tonie_info['duration'] = tonie_info['durations'][-1]
           all_tonie_info.append(tonie_info)

       return all_tonie_info

   if __name__ == "__main__":
       # Run the scrape once, then print a numbered three-line summary
       # (name, URL, duration) for every product that was collected.
       tonies_info = scrape_tonies()

       for position, entry in enumerate(tonies_info, start=1):
           print(
               f"Toni {position} Name: {entry['name']}",
               f"   URL: {entry['url']}",
               f"   Duration: {entry['duration']}",
               sep="\n",
           )

字符串

r7xajy2e

r7xajy2e1#

您可以尝试以 JSON 格式收集 Tonie 的数据,然后再进行*后处理*:

import json

import requests
from bs4 import BeautifulSoup

url = "https://tonies.com/en-gb/tonies/"
# A timeout keeps the script from hanging forever on a stalled connection.
response = requests.get(url, timeout=10)  # with optional headers
soup = BeautifulSoup(response.text, "html.parser")

# The product data is read from the JSON text of the element with
# id="__NEXT_DATA__"; fail with a clear message if the page lacks it.
next_data = soup.select_one("#__NEXT_DATA__")
if next_data is None:
    raise RuntimeError(f"No #__NEXT_DATA__ element found at {url}")

data = (json.loads(next_data.text)
    ["props"]["pageProps"]["page"]["productList"]["normalizedProducts"])

use_keys = ["name", "price", "runTime"]  # << ask for more if needed

# One flat dict per product. "price" is a nested object, so only its
# "amount" is kept; a product without a price object yields None instead
# of raising AttributeError.
tonies = [
    {
        k: (d.get(k) or {}).get("amount") if k == "price" else d.get(k)
        for k in use_keys
    }
    for d in data
]

字符串
输出:

# len(tonies) was 196 for the run shown here.

# Pretty-print the collected records as JSON indented by four spaces.
print(json.dumps(tonies, indent=4))

[
    {
        "name": "Chase",
        "price": 14.99,
        "runTime": 54
    },
    {
        "name": "Elmer and Friends Story Collection",
        "price": 14.99,
        "runTime": 62
    },
    {
        "name": "Frozen",
        "price": 14.99,
        "runTime": 24
    },
    ...
]

相关问题