我正在尝试编写一个脚本,用来获取 https://tonies.com/en-gb/tonies/ 上所有 Tonies 的播放时长。我还想获取每个 Tonie 的价格,但一直没有成功。我也尝试过使用 Selenium 脚本,但卡在了接受 cookie 的弹窗上,因为它位于 shadow DOM 中。我觉得自己可能把问题想得太复杂了。我是编程和 Python 的新手,任何建议我都非常感激。目前这个脚本似乎只抓取到了前 21 个条目。
import re
import requests
from bs4 import BeautifulSoup
def get_tonie_info(tonie_url, timeout=10):
    """Fetch one Tonie product page and collect its ``runTime`` values.

    Every ``<script>`` tag on the page is inspected; inline script text that
    mentions ``runTime`` is searched for ``"runTime": <digits>`` entries.

    Args:
        tonie_url: URL of the product page to download.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block the whole
            scrape indefinitely.

    Returns:
        A dict with two keys: ``'url'`` (the URL that was passed in) and
        ``'durations'`` (a list of ints, empty when no script contained a
        ``runTime`` entry). If several scripts contain matches, the values
        from the last matching script replace the earlier ones.
    """
    response = requests.get(tonie_url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')
    tonie_info = {'url': tonie_url, 'durations': []}

    for script_tag in soup.find_all('script'):
        script_content = script_tag.string
        # Skip tags with no usable inline text, and cheaply skip scripts
        # that cannot contain a match before running the regex.
        if not script_content or 'runTime' not in script_content:
            continue
        matches = re.findall(r'"runTime":\s*(\d+)', script_content)
        if matches:
            tonie_info['durations'] = list(map(int, matches))

    return tonie_info
def scrape_tonies(page_number=9, timeout=10):
    """Scrape one listing page of tonies.com and gather per-product details.

    The listing page is downloaded, product links are located by their CSS
    class, and each linked product page is passed to ``get_tonie_info``.

    Args:
        page_number: Value appended to the ``?page=`` query parameter of the
            listing URL. Defaults to 9, the page that was previously
            hard-coded.
        timeout: Seconds to wait for the listing page before ``requests``
            gives up. Applies only to the listing request made here.

    Returns:
        A list of dicts, one per product for which at least one duration was
        found. Each dict is the result of ``get_tonie_info`` extended with
        ``'name'`` (the stripped link text) and ``'duration'`` (the last
        entry of ``'durations'``). Products without durations are reported
        on stdout and left out of the list.
    """
    base_url = "https://tonies.com/en-gb/tonies/?page="
    current_url = base_url + str(page_number)
    response = requests.get(current_url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    # NOTE(review): this class name looks auto-generated by the site's
    # styling tooling and may change between deployments -- confirm it still
    # matches if the function starts returning an empty list.
    tonie_links = soup.find_all('a', class_='View__StretchedLink-sc-5t9da0-0 ivnTIu')

    all_tonie_info = []
    for tonie_link in tonie_links:
        href = tonie_link.get('href')
        if not href:
            # An anchor without a target cannot be followed; skip it rather
            # than aborting the whole scrape with a KeyError.
            continue

        tonie_url = "https://tonies.com" + href
        tonie_info = get_tonie_info(tonie_url)
        if not tonie_info['durations']:
            print(f"Could not retrieve information for {tonie_url}")
            continue

        tonie_info['name'] = tonie_link.text.strip()
        tonie_info['duration'] = tonie_info['durations'][-1]
        all_tonie_info.append(tonie_info)

    return all_tonie_info
if __name__ == "__main__":
    # Script entry point: scrape the listing, then print a numbered
    # three-line summary (name, URL, duration) for every collected entry.
    tonies_info = scrape_tonies()
    for position, entry in enumerate(tonies_info, start=1):
        print(
            f"Toni {position} Name: {entry['name']}",
            f" URL: {entry['url']}",
            f" Duration: {entry['duration']}",
            sep="\n",
        )
字符串
1条答案
按热度按时间r7xajy2e1#
您可以尝试以 JSON 格式获取 Tonies 的数据,然后再对其进行*后处理*:
字符串
输出:
型