Fetch proxy IPs (Mogu Proxy) — the service gives you an extraction URL to request
import requests

def get_ip():
    response = requests.get('返回的网址')  # placeholder: the extraction URL the proxy service returns
    if response.text[0] == '{':
        # A JSON body means the service refused the request (rate limit)
        print('IP extraction too frequent! Try again in 10 seconds!')
        return None
    return [ip for ip in response.text.split('\n') if ip != '']
ips = get_ip()
proxies = {
    'http': ips[0],   # first extracted IP for HTTP traffic
    'https': ips[1],  # second extracted IP for HTTPS traffic
}
Add the proxy to the request
headers = {'User-Agent': 'Mozilla/5.0'}  # minimal stand-in; the original defines headers elsewhere
response = requests.get('http://www.gaoimg.com/photo/game/', headers=headers, proxies=proxies)
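If get_ip() returns None because of the rate limit, indexing ips above would crash. A minimal retry sketch building on the functions above (get_ip_with_retry is a hypothetical helper name, and httpbin.org/ip is just one convenient way to confirm the proxy is actually in use):

import time
import requests

def get_ip_with_retry(max_retries=3):
    # Retry get_ip(), honouring the 10-second cooldown the service asks for
    for _ in range(max_retries):
        ips = get_ip()
        if ips:
            return ips
        time.sleep(10)
    return None

ips = get_ip_with_retry()
if ips:
    proxies = {'http': ips[0], 'https': ips[1]}
    # httpbin echoes the requesting IP, so this confirms the proxy is in use
    print(requests.get('https://httpbin.org/ip', proxies=proxies, timeout=10).text)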
from selenium.webdriver import Chrome

# 1. Create the browser object (as a global variable the browser stays open
#    after the program ends; as a local variable it closes automatically)
driver = Chrome()
# 2. Open a URL
driver.get('https://huaban.com/explore/hunsha-1')
If running the program above opens a browser window, the installation succeeded.
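As the comment above notes, whether the browser closes depends on the variable's scope. A small sketch that closes the browser explicitly instead of relying on scope:

from selenium.webdriver import Chrome

driver = Chrome()
try:
    driver.get('https://huaban.com/explore/hunsha-1')
    print(driver.title)  # confirm the page actually loaded
finally:
    driver.quit()  # always close the browser and end the driver process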
from selenium.webdriver import Chrome
from selenium.webdriver.common.keys import Keys

driver = Chrome()
driver.get('https://www.51job.com')
search = driver.find_element_by_id('kwdselectid')
search.send_keys('数据分析')
# Press Enter to run the search
search.send_keys(Keys.ENTER)
print(driver.page_source)

next_button = driver.find_element_by_class_name('next')
# Click the "next page" button
next_button.click()
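Clicking the next button immediately after the search can fail if the results page has not finished rendering. A hedged alternative using Selenium's explicit waits (part of the same selenium package):

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# Wait up to 10 seconds for the "next" button to become clickable before clicking
wait = WebDriverWait(driver, 10)
next_button = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'next')))
next_button.click()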
from selenium.webdriver import Chrome, ChromeOptions

Create the browser configuration object
options = ChromeOptions()
Remove the automation-test switch (clears the "controlled by automated software" notice at the top of the page)
options.add_experimental_option('excludeSwitches', ['enable-automation'])
Disable image loading (on image-heavy pages, rendering the images only slows down loading)
options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
Set a proxy
options.add_argument(f'--proxy-server=http://{代理}')  # 代理 is a placeholder variable holding 'ip:port'
Create the browser object
driver = Chrome(options=options)
The configured options must be passed in when the browser object is created.
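Other arguments can be added the same way. For example, a sketch that also runs Chrome headless (no visible window), assuming a Chrome version that supports the --headless flag:

from selenium.webdriver import Chrome, ChromeOptions

options = ChromeOptions()
options.add_experimental_option('excludeSwitches', ['enable-automation'])
options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
options.add_argument('--headless')               # run without opening a window
options.add_argument('--window-size=1920,1080')  # some pages expect a real viewport
driver = Chrome(options=options)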
Sites with a login wall, such as Zhihu and Taobao, require bypassing the login.
Taobao is used as the example here.
from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.common.keys import Keys
import time

options = ChromeOptions()
options.add_experimental_option('excludeSwitches', ['enable-automation'])
options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
driver = Chrome(options=options)

def save_cookie(url='https://www.taobao.com/'):
    driver.get(url)
    search = driver.find_element_by_id('q')
    search.send_keys('鞋子')
    search.send_keys(Keys.ENTER)  # searching while logged out triggers the login wall
    time.sleep(15)  # the program waits 15 seconds so you can log in manually
    cookies = driver.get_cookies()  # after logging in, save the cookies for later scraping runs
    file = open('files/tb_cookies.txt', 'w', encoding='utf-8')
    file.write(str(cookies))
    file.close()  # close the file so the cookies are flushed to disk

save_cookie()
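Writing str(cookies) and reading it back with eval() (as done below) works, but eval() executes whatever is in the file. A safer sketch using json instead (save_cookie_json and the .json path are illustrative names, not from the original post):

import json

def save_cookie_json(path='files/tb_cookies.json'):
    # driver.get_cookies() returns a list of dicts, which is JSON-serializable
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(driver.get_cookies(), f)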
from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.common.keys import Keys

options = ChromeOptions()
options.add_experimental_option("excludeSwitches", ['enable-automation', 'enable-logging'])
options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
driver = Chrome(options=options)
driver.get('https://www.taobao.com/')
# Set the saved cookies (the file was written with str(), so eval() parses it back)
cookies = eval(open('files/tb_cookies.txt', 'r', encoding='utf-8').read())
for cookie in cookies:
    if cookie['secure']:
        driver.add_cookie(cookie)
driver.get('https://www.taobao.com/')
search = driver.find_element_by_id('q')
search.send_keys('鞋子')
search.send_keys(Keys.ENTER)
print(driver.page_source)
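The json counterpart for loading the cookies saved by the sketch above. Note that, depending on the Chrome/Selenium version, add_cookie() may reject cookies whose domain does not match the current page or whose expiry field it dislikes, so this sketch drops expiry and skips rejected entries:

import json

driver.get('https://www.taobao.com/')  # must be on the target domain before adding cookies
with open('files/tb_cookies.json', 'r', encoding='utf-8') as f:
    cookies = json.load(f)
for cookie in cookies:
    cookie.pop('expiry', None)  # some driver versions reject this field
    try:
        driver.add_cookie(cookie)
    except Exception:
        pass  # skip cookies the driver refuses (e.g. domain mismatch)
driver.get('https://www.taobao.com/')  # reload, now with the session cookies set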
This uses the saved cookies to skip the login. If, after logging in successfully, you hit a "take a break" block, logins have been too frequent; just wait a while and try again. (The original post showed a screenshot of the block here.)
Full Taobao scraping source code: https://pan.baidu.com/s/18Unb_k8YwHgspAOEXYN5oQ (extraction code: gelv)
Imports
from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import csv
import os
import time

Create the configuration object
options = ChromeOptions()
options.add_experimental_option('excludeSwitches', ['enable-automation'])

Create the browser object
driver = Chrome(options=options)
driver.get('https://www.51job.com')

Search for the job title
search = driver.find_element_by_id('kwdselectid')
search.send_keys('数据分析')
search.send_keys(Keys.ENTER)
Scrape the job listings
def get_messages():
    # Create the parser object
    soup = BeautifulSoup(driver.page_source, 'lxml')
    # Select one <div> per job posting
    messages = soup.select('div.j_joblist>div')
    jobs_names = []
    company_names = []
    salary = []
    details = []
    for message in messages:
        jobs_names.append(message.select_one('.t>span').get_text())
        company_names.append(message.select_one('.er>a').get_text())
        if message.select_one('.sal'):
            salary.append(message.select_one('.sal').get_text())
        else:
            salary.append('工资面议')  # "salary negotiable"
        details.append(message.select_one('.e>a').attrs['href'])
    return zip(jobs_names, company_names, salary, details)
Write the data
def write_message(jobs):
    # Append mode; write the header row only when the file does not exist yet,
    # otherwise every page would repeat it
    write_header = not os.path.exists('files/jobs51/jobs.csv')
    file = open('files/jobs51/jobs.csv', 'a', encoding='utf-8', newline='')
    writer = csv.writer(file)
    if write_header:
        writer.writerow(['工作岗位', '公司名称', '薪资', '岗位详情'])
    for job in jobs:
        writer.writerow(job)
    file.close()
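Because the file is opened in append mode, rerunning the scraper accumulates duplicate rows. An optional cleanup sketch using pandas (an extra dependency the original post does not use):

import pandas as pd

df = pd.read_csv('files/jobs51/jobs.csv')
df = df.drop_duplicates()                        # remove exact duplicate rows
df.to_csv('files/jobs51/jobs.csv', index=False)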
Go to the next page
def next_page():
    time.sleep(1)
    click_next = driver.find_element_by_class_name('next')
    click_next.click()
Main loop
index = 0
while True:
    try:
        jobs = get_messages()
        write_message(jobs)
        index += 1
        time.sleep(5)
        next_page()
        print(f'Page {index} written!')
    except Exception:
        # The "next" button disappears on the last page, which raises here
        print('All data written!')
        break
Opening 51job shows the total page count is a static value of 176, so the loop does not have to be written as an infinite one; see the bounded sketch below.
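A bounded version of the loop under that assumption (176 is the page count the author observed; adjust if the site changes):

TOTAL_PAGES = 176

for index in range(1, TOTAL_PAGES + 1):
    write_message(get_messages())
    print(f'Page {index} written!')
    if index < TOTAL_PAGES:  # no "next" button on the last page
        time.sleep(5)
        next_page()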
(The original post showed a screenshot of part of the scraped CSV file here.)
Reprinted article; original: https://blog.csdn.net/Lemon_Review/article/details/119656044