"""Scrape hazard details from a list of pages with Selenium.

Each link is loaded in its own Chrome session. If a page times out or is
missing an element, a message is printed, that browser is closed, and the
script moves on to the next link. The collected records are printed as JSON.
"""
import json

from selenium import webdriver
from selenium.common.exceptions import (
    NoSuchElementException,
    TimeoutException,
)
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

data = []
# Raw string: the original literal contained "\E", an invalid escape sequence.
# The resulting path value is unchanged.
driver_path = r"C:\Users\Engineer_Stephen\Downloads\Compressed\chromedriver"
chrome_options = webdriver.ChromeOptions()
# chrome_options.add_argument("--headless")
chrome_options.add_argument("--start-maximized")

links = ['link1', 'link1', 'link1', 'link1', 'link1', 'link1']

for link in links:
    # A new browser per link: once quit() has been called on a driver its
    # session is gone, so reusing it for the next link fails with a
    # "connection refused" error. A fresh Service is created each time so
    # that no stopped chromedriver process is reused.
    driver = webdriver.Chrome(
        service=Service(driver_path),
        options=chrome_options,
    )
    try:
        driver.get(link)
        wait = WebDriverWait(driver, 80)
        # until() returns the located element, so it is used directly
        # instead of looking the same XPath up a second time.
        disaster = wait.until(
            EC.visibility_of_element_located(
                (By.XPATH,
                 '//span[@class="contour-HazardInfo-name montage-Text"]')
            )
        )
        info = driver.find_element(
            By.XPATH,
            "//div[@class='contour-TabBarItem-label montage-Text']",
        )
        types = driver.find_element(
            By.XPATH, "//div[@data-montage-id='type']"
        )
        description = driver.find_element(
            By.XPATH, "//label/span[contains(text(), 'Description')]"
        )
        descriptionText = driver.find_element(
            By.XPATH,
            '//div[@class="contour-HazardInfo-descriptionText montage-Text"]',
        )
        data.append({
            'disaster': disaster.text,
            'info': info.text,
            'type': types.text,
            'description': description.text,
            'descriptionText': descriptionText.text
        })
    except NoSuchElementException:
        print(f"Data not available for link: {link}")
    except TimeoutException:
        print(f"Timeout occurred for link: {link}")
    except Exception as e:
        # Best-effort boundary: report the failure and keep going with the
        # remaining links rather than aborting the whole run.
        print(f"An error occurred for link: {link}")
        print(str(e))
    finally:
        # Always close this link's browser, on success or failure.
        driver.quit()

resource = json.dumps(data, indent=4)
print(resource)
堆栈跟踪是这样的:
链接超时:https://disasteralert.pdc.org/disasteralert/?hazard_id=202739
链接发生错误:https://disasteralert.pdc.org/disasteralert/?hazard_id=202738
HTTPConnectionPool(host='localhost', port=33660):超过URL的最大重试次数:/session/99df58ee2824dbdf4720427db546798d/url(由NewConnectionError('<urllib3.connection.HTTPConnection object at 0x0000027446C07A00>:无法建立新连接:[WinError 10061]无法建立连接,因为目标计算机主动拒绝连接')引起)
它只对第一个过期链接显示连接超时;对其余链接则无法继续抓取,而是抛出上面的第二个异常。
我提供了一个想要抓取的链接样本。我发现其中有些链接已经过期了。我正在努力改进这段代码,使其在遇到过期链接时关闭当前窗口并转到下一个链接,希望能得到您的帮助。
我期望代码从列表中列出的链接中抓取数据。如果链接过期,它将遇到超时异常。我希望Selenium关闭该浏览器窗口并转到下一个链接,也就是打开另一个浏览器窗口并继续抓取数据。
1条答案
按热度按时间yqyhoc1h1#
对该网页的初步分析显示实际信息的来源是
https://hpxml.pdc.org/public.xml
,这是一个被定期访问的端点,用于刷新页面(包括其中的地图等)上的信息。下面是获取该信息的一种方法:
终端结果:
您可以在此处找到pandas文档。