Using a proxy IP to access a web page

import requests

# 1. Get proxy IPs from the Mogu proxy service
def get_ip():
    response = requests.get('http://piping.mogumiao.com/proxy/api/get_ip_bs?appKey=775206edf3dc4329ba04568b75a66a30&count=4&expiryDate=0&format=2&newLine=3')
    # On failure the API returns a JSON object, so the body starts with '{'
    if response.text[0] == '{':
        print('Failed to extract IPs')
        return None
    # Each line of the response is one 'ip:port' entry; drop empty lines
    return [x for x in response.text.split('\n') if x]

# 2. Use the proxy IPs
def get_net_data():
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
    }
    # Accepted proxy value formats:
    # 'http': 'ip:port' or 'http': 'http://ip:port'
    # 'https': 'ip:port' or 'https': 'https://ip:port'
    ips = get_ip()
    # Check whether valid IPs were extracted
    if not ips:
        print('Failed to get IPs; wait 10 seconds and run again')
        return
    proxies = {
        'http': ips[0],
        'https': ips[1]
    }
    response = requests.get('https://movie.douban.com/top250', headers=headers, proxies=proxies)
    print(response.text)

if __name__ == '__main__':
    get_net_data()
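Before pointing the scraper at a real target, it can help to confirm that requests actually go out through the proxy. A minimal sketch, assuming the get_ip() helper above; httpbin.org and the check_proxy name are illustrative additions, not part of the original:

import requests

def check_proxy(proxy):
    # Ask httpbin.org to echo the IP it sees; through a working proxy
    # this should be the proxy's address, not your own
    proxies = {'http': proxy, 'https': proxy}
    try:
        response = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=5)
        print('Outbound IP seen by the server:', response.json()['origin'])
        return True
    except requests.RequestException as error:
        print('Proxy check failed:', error)
        return False

# ips = get_ip()
# if ips:
#     check_proxy(ips[0])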
Optimizing proxy IP usage

import requests
import time

def get_ip():
    response = requests.get('http://piping.mogumiao.com/proxy/api/get_ip_bs?appKey=775206edf3dc4329ba04568b75a66a30&count=4&expiryDate=0&format=2&newLine=3')
    if response.text[0] == '{':
        print('Failed to extract IPs')
        return None
    return [x for x in response.text.split('\n') if x]

def get_net_data():
    # Keep requesting IPs until extraction succeeds
    while True:
        ips = get_ip()
        if ips:
            break
        time.sleep(5)
    print('Got IPs:', ips)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
    }
    proxies = {
        'http': ips[0],
        'https': ips[1]
    }
    url = 'https://movie.douban.com/top250'
    response = requests.get(url, headers=headers, proxies=proxies)
    print(response.text)

get_net_data()
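The bare while True loop retries forever, which can hammer the proxy API when it is down. A hedged sketch of a capped retry with exponential backoff; the function name, max_attempts, and the doubling schedule are illustrative choices, not from the original:

import time

def get_ip_with_retry(max_attempts=5):
    # Retry get_ip() (defined above) up to max_attempts times,
    # doubling the wait between attempts
    delay = 1
    for attempt in range(1, max_attempts + 1):
        ips = get_ip()
        if ips:
            return ips
        print(f'Attempt {attempt} failed; retrying in {delay}s')
        time.sleep(delay)
        delay *= 2
    return None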
Basic selenium features

from selenium.webdriver import Chrome

# 1. Create a browser object (as a global variable, the browser stays open
#    when the program ends; as a local variable it closes automatically)
b = Chrome()
# 2. Open a URL
b.get('https://www.jd.com/')
# 3. Get the page source
print(b.page_source)
# Close the browser
# b.close()
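The note about global versus local variables can be shown directly: keep the browser object local to a function and, per the comment above, it closes on its own once the function returns. A small sketch; dump_page_source is an illustrative name, not from the original:

from selenium.webdriver import Chrome

def dump_page_source(url):
    # b is local here, so (per the note above) the browser closes
    # automatically when the function returns
    b = Chrome()
    b.get(url)
    return b.page_source

html = dump_page_source('https://www.jd.com/')
print(html[:200])  # print just the first 200 characters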
Common selenium interactions

from selenium.webdriver import Chrome
from selenium.webdriver.common.keys import Keys
import time

# 1. Create the browser
b = Chrome()
# 2. Open the page
b.get('https://www.51job.com/')
# 3. Get a tag (the search input field)
# Note: Selenium 4 removed find_element_by_*; there, use b.find_element(By.CSS_SELECTOR, ...)
# search_input = b.find_element_by_id('kwdselectid')
search_input = b.find_element_by_css_selector('#kwdselectid')
# print(search_input)
# 4. Type content into the input field
search_input.send_keys('数据分析')
# Press Enter while inside the input field
search_input.send_keys(Keys.ENTER)
# 5. Get the page data
print(b.page_source)
print('--------------------------------------------------------------------------------------------')
# 6. Get the tag for the "next page" button
# (named next_btn so it does not shadow the built-in next())
next_btn = b.find_element_by_css_selector('.next')
# 7. Click the button
next_btn.click()
print('+++++++++++++++++')
time.sleep(1)
print(b.page_source)
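The fixed time.sleep(1) after the click is fragile: the next page may take more or less time to render. Selenium's explicit waits poll until a condition holds. A sketch using WebDriverWait and expected_conditions, standard selenium APIs that the original does not use:

from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

b = Chrome()
b.get('https://www.51job.com/')
# Wait up to 10 seconds for the search box to appear before using it
search_input = WebDriverWait(b, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, '#kwdselectid'))
)
search_input.send_keys('数据分析')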
Common selenium configuration

from selenium.webdriver import Chrome, ChromeOptions
import requests
import time

def get_ip():
    response = requests.get('http://piping.mogumiao.com/proxy/api/get_ip_bs?appKey=775206edf3dc4329ba04568b75a66a30&count=4&expiryDate=0&format=2&newLine=3')
    if response.text[0] == '{':
        print('Failed to extract IPs')
        return None
    return [x for x in response.text.split('\n') if x]

# Keep requesting IPs until extraction succeeds
while True:
    ips = get_ip()
    if ips:
        break
    time.sleep(1)
print(ips)

# 1. Create a Chrome options object
options = ChromeOptions()
# 1) Hide the "controlled by automated test software" banner
options.add_experimental_option('excludeSwitches', ['enable-automation'])
# 2) Disable image loading (the value 2 blocks images)
options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
# 3) Set a proxy
options.add_argument(f'--proxy-server=http://{ips[0]}')

b = Chrome(options=options)
b.get('https://movie.douban.com/top250')
print(b.page_source)
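Other options are added the same way. For example, running without a visible window is a common addition for scraping; a hedged sketch using standard Chrome switches that the original does not set:

from selenium.webdriver import Chrome, ChromeOptions

options = ChromeOptions()
options.add_argument('--headless')     # run without opening a window
options.add_argument('--disable-gpu')  # commonly paired with headless mode
b = Chrome(options=options)
b.get('https://movie.douban.com/top250')
print(b.page_source[:200])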
Getting and saving cookies

from selenium.webdriver import Chrome
from selenium.webdriver.common.keys import Keys

def save_cookie():
    # Open the browser and navigate to a page that triggers the login flow
    b = Chrome()
    b.get('https://www.taobao.com')
    search_input = b.find_element_by_id('q')
    search_input.send_keys('鞋子')
    search_input.send_keys(Keys.ENTER)
    # Log in manually (leave yourself enough time)
    # time.sleep(10)
    input('Press Enter after logging in: ')
    # Get the cookies
    cookies = b.get_cookies()
    # print(cookies, type(cookies))
    # Use a with block so the file is closed after writing
    with open('files/taobao.txt', 'w', encoding='utf-8') as f:
        f.write(str(cookies))

save_cookie()
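Writing str(cookies) to disk works, but it forces the reader to eval() the file later. The json module round-trips the same list of dicts more safely; a sketch of the save step with JSON as a substitution for the original approach (the file path is illustrative):

import json

def save_cookies_json(browser, path='files/taobao_cookies.json'):
    # get_cookies() returns a list of dicts, which maps directly to JSON
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(browser.get_cookies(), f, ensure_ascii=False, indent=2)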
Scraping Taobao

from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.common.keys import Keys

options = ChromeOptions()
options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
# Open the page in the browser
b = Chrome(options=options)
b.get('https://www.taobao.com')
# Set the cookies (the file holds the Python repr of the cookie list, so eval() restores it)
cookies = eval(open('files/taobao.txt', encoding='utf-8').read())
for cookie in cookies:
    if cookie['secure']:
        b.add_cookie(cookie)
# Reload the page so the cookies take effect
b.get('https://www.taobao.com')
search_input = b.find_element_by_id('q')
search_input.send_keys('鞋子')
search_input.send_keys(Keys.ENTER)
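If the cookies were saved with the JSON variant sketched earlier, loading them is the mirror image: json.load replaces eval(), while the secure filter and the double b.get() match the code above. The file path is the same illustrative one:

import json
from selenium.webdriver import Chrome

b = Chrome()
b.get('https://www.taobao.com')
# Load the cookie list written by save_cookies_json() above
with open('files/taobao_cookies.json', encoding='utf-8') as f:
    cookies = json.load(f)
for cookie in cookies:
    if cookie.get('secure'):
        b.add_cookie(cookie)
# Reload so the restored session applies
b.get('https://www.taobao.com')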