Getting Started with Web Scraping, Part 3

1. The cookies parameter (the browser's login state; set it to avoid detection by anti-scraping mechanisms)

1.1 Setting cookies inside the headers dict

python
# -*- coding: utf-8 -*-
import requests

url = 'https://github.com/'
# Build the request headers (the Cookie value is copied from a logged-in browser session)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
    'Cookie': '_octo=GH1.1.186456086.1726021874; preferred_color_mode=light; tz=Asia%2FShanghai; _device_id=30702b425eb631902645d34d468c44f3; saved_user_sessions=169171441%3A9O71jCj_OP43YLGcjwEAZyoiu7MFkEV8-8uziErITUz9UtRk; user_session=9O71jCj_OP43YLGcjwEAZyoiu7MFkEV8-8uziErITUz9UtRk; __Host-user_session_same_site=9O71jCj_OP43YLGcjwEAZyoiu7MFkEV8-8uziErITUz9UtRk; tz=Asia%2FShanghai; color_mode=%7B%22color_mode%22%3A%22auto%22%2C%22light_theme%22%3A%7B%22name%22%3A%22light%22%2C%22color_mode%22%3A%22light%22%7D%2C%22dark_theme%22%3A%7B%22name%22%3A%22dark%22%2C%22color_mode%22%3A%22dark%22%7D%7D; logged_in=yes; dotcom_user=csqting; _gh_sess=37D6rCRVC2JibASPc1N6MTQ%2Fj978CXDTiDCv7QjSMLlWKqGGa1smeK%2B8xZ4fX1b%2BUZtSsFdupYGhCsbx4k7KkHrg834mEVQOzYg4fi26fzkCaB96MaPNs9DbPtwWqgEBgYCZHmqkaAgZTwF8Z6yMyeZofc%2FgqXRPl04gNfdkxHlKImQTfHwem%2Bi2jZIblkreQLuMR04D%2B41HT4jnaNph6ceYbeXx7tb%2BOCRYttZfKgwh6GIUenNsRu39u7jcrrRZQpSneIqSFofDe%2FoYuiOxRZdPPXivPlnEcgRwCEybob9TKkjGWW2KKiV1EaeMDlrb2ecgvBuG2nmjegsAFcwdoN%2FeVMZ%2Bhrk426b6uGsj%2FLK%2Bs70JeUslEcM65VTcEi7vd7cCWxN%2Bx12YQOn0KQ%2BDUzbix%2FCbZolCWWQs0%2Fw5GrFO2XXs98zPwQut%2FuUV2KGu5%2FXBpI7rUIQrrP063I8izSFVbYUJ51poKdpjmwg0O6VZXhixotftkPNXHO2NCJrzCV1IK9TVrjHeYQYalPokpTtINwh6TGuDmfJLdZnCDr%2F7CeHs6WaQMJ%2Bz4UboIsBUuQjkPHycBDNpQ3%2B3unD1SAQblWXdsW4IQJhgWWkojyfk70iONTDjoTNsFL2UGcMjDbgPAT70OCzyRCV7ZjK6lAeTmtKOkfxECEnfXJz%2FThhMiQUPm49cT4qxaAgkvoqRCNjb8o2l8Q1ZI%2B%2B83QhKZmOxmFDB%2BmVtVlCG%2FdRtbVbvNtOwYdiBQ2hwlYUoiZVw89t0fPTfzMnwneF3OIAyTq4n7ugooJqkCKKUpMWFT4PzReb59xnSfVNlROKm6B%2BIoNEkFnADWLbDMoh0jyRPT6Kzo6GZDXGPCn5Pdbj1QrtCa4I8thaHFw%3D%3D--TLPPQIkHZvUr%2FeNc--B%2FBP6cHV2VuqyNvzo7JmXQ%3D%3D'
}

response = requests.get(url, headers=headers)

with open('github_1.html', 'wb') as f:
    f.write(response.content)
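
To confirm that the cookies actually carried the login over, a quick check is to search the returned page for the account name. A minimal sketch, assuming the response object from the request above and the username csqting visible in the dotcom_user cookie:

python
# A logged-in GitHub homepage normally contains the account name
if 'csqting' in response.text:
    print('Logged in: the page shows the account name')
else:
    print('Not logged in: the cookies may have expired')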

1.2 Building a cookies dict

python
# -*- coding: utf-8 -*-
import requests

url = 'https://github.com/'
# Build the request headers; the Cookie header stays commented out because the cookies are passed separately below
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
    # 'Cookie': '_octo=GH1.1.186456086.1726021874; preferred_color_mode=light; tz=Asia%2FShanghai; _device_id=30702b425eb631902645d34d468c44f3; saved_user_sessions=169171441%3A9O71jCj_OP43YLGcjwEAZyoiu7MFkEV8-8uziErITUz9UtRk; user_session=9O71jCj_OP43YLGcjwEAZyoiu7MFkEV8-8uziErITUz9UtRk; __Host-user_session_same_site=9O71jCj_OP43YLGcjwEAZyoiu7MFkEV8-8uziErITUz9UtRk; tz=Asia%2FShanghai; color_mode=%7B%22color_mode%22%3A%22auto%22%2C%22light_theme%22%3A%7B%22name%22%3A%22light%22%2C%22color_mode%22%3A%22light%22%7D%2C%22dark_theme%22%3A%7B%22name%22%3A%22dark%22%2C%22color_mode%22%3A%22dark%22%7D%7D; logged_in=yes; dotcom_user=csqting; _gh_sess=37D6rCRVC2JibASPc1N6MTQ%2Fj978CXDTiDCv7QjSMLlWKqGGa1smeK%2B8xZ4fX1b%2BUZtSsFdupYGhCsbx4k7KkHrg834mEVQOzYg4fi26fzkCaB96MaPNs9DbPtwWqgEBgYCZHmqkaAgZTwF8Z6yMyeZofc%2FgqXRPl04gNfdkxHlKImQTfHwem%2Bi2jZIblkreQLuMR04D%2B41HT4jnaNph6ceYbeXx7tb%2BOCRYttZfKgwh6GIUenNsRu39u7jcrrRZQpSneIqSFofDe%2FoYuiOxRZdPPXivPlnEcgRwCEybob9TKkjGWW2KKiV1EaeMDlrb2ecgvBuG2nmjegsAFcwdoN%2FeVMZ%2Bhrk426b6uGsj%2FLK%2Bs70JeUslEcM65VTcEi7vd7cCWxN%2Bx12YQOn0KQ%2BDUzbix%2FCbZolCWWQs0%2Fw5GrFO2XXs98zPwQut%2FuUV2KGu5%2FXBpI7rUIQrrP063I8izSFVbYUJ51poKdpjmwg0O6VZXhixotftkPNXHO2NCJrzCV1IK9TVrjHeYQYalPokpTtINwh6TGuDmfJLdZnCDr%2F7CeHs6WaQMJ%2Bz4UboIsBUuQjkPHycBDNpQ3%2B3unD1SAQblWXdsW4IQJhgWWkojyfk70iONTDjoTNsFL2UGcMjDbgPAT70OCzyRCV7ZjK6lAeTmtKOkfxECEnfXJz%2FThhMiQUPm49cT4qxaAgkvoqRCNjb8o2l8Q1ZI%2B%2B83QhKZmOxmFDB%2BmVtVlCG%2FdRtbVbvNtOwYdiBQ2hwlYUoiZVw89t0fPTfzMnwneF3OIAyTq4n7ugooJqkCKKUpMWFT4PzReb59xnSfVNlROKm6B%2BIoNEkFnADWLbDMoh0jyRPT6Kzo6GZDXGPCn5Pdbj1QrtCa4I8thaHFw%3D%3D--TLPPQIkHZvUr%2FeNc--B%2FBP6cHV2VuqyNvzo7JmXQ%3D%3D'
}

# Build a cookies dict from the raw Cookie string copied out of the browser
temp = '_octo=GH1.1.186456086.1726021874; preferred_color_mode=light; tz=Asia%2FShanghai; _device_id=30702b425eb631902645d34d468c44f3; saved_user_sessions=169171441%3A9O71jCj_OP43YLGcjwEAZyoiu7MFkEV8-8uziErITUz9UtRk; user_session=9O71jCj_OP43YLGcjwEAZyoiu7MFkEV8-8uziErITUz9UtRk; __Host-user_session_same_site=9O71jCj_OP43YLGcjwEAZyoiu7MFkEV8-8uziErITUz9UtRk; tz=Asia%2FShanghai; color_mode=%7B%22color_mode%22%3A%22auto%22%2C%22light_theme%22%3A%7B%22name%22%3A%22light%22%2C%22color_mode%22%3A%22light%22%7D%2C%22dark_theme%22%3A%7B%22name%22%3A%22dark%22%2C%22color_mode%22%3A%22dark%22%7D%7D; logged_in=yes; dotcom_user=csqting; _gh_sess=37D6rCRVC2JibASPc1N6MTQ%2Fj978CXDTiDCv7QjSMLlWKqGGa1smeK%2B8xZ4fX1b%2BUZtSsFdupYGhCsbx4k7KkHrg834mEVQOzYg4fi26fzkCaB96MaPNs9DbPtwWqgEBgYCZHmqkaAgZTwF8Z6yMyeZofc%2FgqXRPl04gNfdkxHlKImQTfHwem%2Bi2jZIblkreQLuMR04D%2B41HT4jnaNph6ceYbeXx7tb%2BOCRYttZfKgwh6GIUenNsRu39u7jcrrRZQpSneIqSFofDe%2FoYuiOxRZdPPXivPlnEcgRwCEybob9TKkjGWW2KKiV1EaeMDlrb2ecgvBuG2nmjegsAFcwdoN%2FeVMZ%2Bhrk426b6uGsj%2FLK%2Bs70JeUslEcM65VTcEi7vd7cCWxN%2Bx12YQOn0KQ%2BDUzbix%2FCbZolCWWQs0%2Fw5GrFO2XXs98zPwQut%2FuUV2KGu5%2FXBpI7rUIQrrP063I8izSFVbYUJ51poKdpjmwg0O6VZXhixotftkPNXHO2NCJrzCV1IK9TVrjHeYQYalPokpTtINwh6TGuDmfJLdZnCDr%2F7CeHs6WaQMJ%2Bz4UboIsBUuQjkPHycBDNpQ3%2B3unD1SAQblWXdsW4IQJhgWWkojyfk70iONTDjoTNsFL2UGcMjDbgPAT70OCzyRCV7ZjK6lAeTmtKOkfxECEnfXJz%2FThhMiQUPm49cT4qxaAgkvoqRCNjb8o2l8Q1ZI%2B%2B83QhKZmOxmFDB%2BmVtVlCG%2FdRtbVbvNtOwYdiBQ2hwlYUoiZVw89t0fPTfzMnwneF3OIAyTq4n7ugooJqkCKKUpMWFT4PzReb59xnSfVNlROKm6B%2BIoNEkFnADWLbDMoh0jyRPT6Kzo6GZDXGPCn5Pdbj1QrtCa4I8thaHFw%3D%3D--TLPPQIkHZvUr%2FeNc--B%2FBP6cHV2VuqyNvzo7JmXQ%3D%3D'

# Approach 1: build the dict with a plain loop
# cookies_list = temp.split('; ')
# cookies = {}
# for cookie in cookies_list:
#     # split on the first '=' only, so values that contain '=' stay intact
#     cookies[cookie.split('=', 1)[0]] = cookie.split('=', 1)[1]
#
# response = requests.get(url, headers=headers, cookies=cookies)
#
# with open('github_2.html', 'wb') as f:
#     f.write(response.content)

# Approach 2: a dict comprehension (the form typically used in practice)
cookies_list = temp.split('; ')
cookies = {cookie.split('=', 1)[0]: cookie.split('=', 1)[1] for cookie in cookies_list}
print(cookies)

response = requests.get(url, headers=headers, cookies=cookies)


# with open('github_3.html', 'wb') as f:
#     f.write(response.content)
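
When a crawl makes many authenticated requests, rebuilding a cookies dict for every call gets repetitive. requests also provides Session, which stores cookies sent back by the server and resends them automatically on later requests. A minimal sketch, not from the original post; the seed cookie below is illustrative only:

python
import requests

session = requests.Session()
session.headers.update({'User-Agent': 'Mozilla/5.0'})
# Seed the session with cookies parsed from the browser; every request
# made through the session then carries (and updates) them
session.cookies.update({'logged_in': 'yes'})  # illustrative key/value only
response = session.get('https://github.com/')
print(response.status_code)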

2. Handling CookieJar objects (how requests stores and manages cookies)

python
# -*- coding: utf-8 -*-
import requests

url = 'https://www.baidu.com'

response = requests.get(url)
print(response.cookies)

# response.cookies is a CookieJar object; convert it to a plain dict:
dict_cookies = requests.utils.dict_from_cookiejar(response.cookies)
print(dict_cookies)

# Convert the dict back into a CookieJar
jar_cookies = requests.utils.cookiejar_from_dict(dict_cookies)
print(jar_cookies)
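
The reconstructed jar can be passed straight back into a request through the cookies parameter, and requests.utils also offers add_dict_to_cookiejar for merging a dict into an existing jar. A short sketch continuing from the variables above; the extra cookie is hypothetical:

python
# Reuse the reconstructed jar on a new request
response = requests.get(url, cookies=jar_cookies)

# Merge additional entries into an existing jar in place
requests.utils.add_dict_to_cookiejar(jar_cookies, {'demo_key': 'demo_value'})  # hypothetical cookie
print(jar_cookies)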

3. Simulating network issues: using the timeout parameter
python
# -*- coding: utf-8 -*-
import requests

url = 'https://twitter.com'  # a host that is often slow or unreachable, to exercise the timeout

# Raises requests.exceptions.Timeout if no response arrives within 3 seconds
response = requests.get(url, timeout=3)
print(response.cookies)
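
In practice the timeout is usually wrapped in exception handling so that one slow host does not crash the whole crawl. A minimal sketch, not from the original post, that retries up to three times:

python
import requests

url = 'https://twitter.com'

for attempt in range(3):
    try:
        response = requests.get(url, timeout=3)
        print(response.status_code)
        break  # success, stop retrying
    except requests.exceptions.Timeout:
        print(f'attempt {attempt + 1} timed out, retrying...')
else:
    print('all attempts timed out')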