初识爬虫3

1.cookies参数(浏览器的登录信息,需要设置,防止反爬机制检测)

1.1 headers中设置cookies参数

python 复制代码
# -*- coding: utf-8 -*-
import requests

url = 'https://github.com/'
# Build the request headers dict (User-Agent plus the logged-in Cookie header)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
    'Cookie': '_octo=GH1.1.186456086.1726021874; preferred_color_mode=light; tz=Asia%2FShanghai; _device_id=30702b425eb631902645d34d468c44f3; saved_user_sessions=169171441%3A9O71jCj_OP43YLGcjwEAZyoiu7MFkEV8-8uziErITUz9UtRk; user_session=9O71jCj_OP43YLGcjwEAZyoiu7MFkEV8-8uziErITUz9UtRk; __Host-user_session_same_site=9O71jCj_OP43YLGcjwEAZyoiu7MFkEV8-8uziErITUz9UtRk; tz=Asia%2FShanghai; color_mode=%7B%22color_mode%22%3A%22auto%22%2C%22light_theme%22%3A%7B%22name%22%3A%22light%22%2C%22color_mode%22%3A%22light%22%7D%2C%22dark_theme%22%3A%7B%22name%22%3A%22dark%22%2C%22color_mode%22%3A%22dark%22%7D%7D; logged_in=yes; dotcom_user=csqting; _gh_sess=37D6rCRVC2JibASPc1N6MTQ%2Fj978CXDTiDCv7QjSMLlWKqGGa1smeK%2B8xZ4fX1b%2BUZtSsFdupYGhCsbx4k7KkHrg834mEVQOzYg4fi26fzkCaB96MaPNs9DbPtwWqgEBgYCZHmqkaAgZTwF8Z6yMyeZofc%2FgqXRPl04gNfdkxHlKImQTfHwem%2Bi2jZIblkreQLuMR04D%2B41HT4jnaNph6ceYbeXx7tb%2BOCRYttZfKgwh6GIUenNsRu39u7jcrrRZQpSneIqSFofDe%2FoYuiOxRZdPPXivPlnEcgRwCEybob9TKkjGWW2KKiV1EaeMDlrb2ecgvBuG2nmjegsAFcwdoN%2FeVMZ%2Bhrk426b6uGsj%2FLK%2Bs70JeUslEcM65VTcEi7vd7cCWxN%2Bx12YQOn0KQ%2BDUzbix%2FCbZolCWWQs0%2Fw5GrFO2XXs98zPwQut%2FuUV2KGu5%2FXBpI7rUIQrrP063I8izSFVbYUJ51poKdpjmwg0O6VZXhixotftkPNXHO2NCJrzCV1IK9TVrjHeYQYalPokpTtINwh6TGuDmfJLdZnCDr%2F7CeHs6WaQMJ%2Bz4UboIsBUuQjkPHycBDNpQ3%2B3unD1SAQblWXdsW4IQJhgWWkojyfk70iONTDjoTNsFL2UGcMjDbgPAT70OCzyRCV7ZjK6lAeTmtKOkfxECEnfXJz%2FThhMiQUPm49cT4qxaAgkvoqRCNjb8o2l8Q1ZI%2B%2B83QhKZmOxmFDB%2BmVtVlCG%2FdRtbVbvNtOwYdiBQ2hwlYUoiZVw89t0fPTfzMnwneF3OIAyTq4n7ugooJqkCKKUpMWFT4PzReb59xnSfVNlROKm6B%2BIoNEkFnADWLbDMoh0jyRPT6Kzo6GZDXGPCn5Pdbj1QrtCa4I8thaHFw%3D%3D--TLPPQIkHZvUr%2FeNc--B%2FBP6cHV2VuqyNvzo7JmXQ%3D%3D'
}

# Send the GET request; the login state travels inside the 'Cookie' entry
# of the headers dict built above.
response = requests.get(url, headers=headers)

# Dump the raw response bytes to disk so the returned page can be opened in
# a browser to check whether it is the logged-in view.
with open('guitub_1.html', mode='wb') as html_file:
    html_file.write(response.content)

1.2 构建cookies字典

python 复制代码
# -*- coding: utf-8 -*-
import requests

url = 'https://github.com/'
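# Build the request headers dict (the Cookie header is commented out here; cookies are passed separately below)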
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
    # 'Cookie': '_octo=GH1.1.186456086.1726021874; preferred_color_mode=light; tz=Asia%2FShanghai; _device_id=30702b425eb631902645d34d468c44f3; saved_user_sessions=169171441%3A9O71jCj_OP43YLGcjwEAZyoiu7MFkEV8-8uziErITUz9UtRk; user_session=9O71jCj_OP43YLGcjwEAZyoiu7MFkEV8-8uziErITUz9UtRk; __Host-user_session_same_site=9O71jCj_OP43YLGcjwEAZyoiu7MFkEV8-8uziErITUz9UtRk; tz=Asia%2FShanghai; color_mode=%7B%22color_mode%22%3A%22auto%22%2C%22light_theme%22%3A%7B%22name%22%3A%22light%22%2C%22color_mode%22%3A%22light%22%7D%2C%22dark_theme%22%3A%7B%22name%22%3A%22dark%22%2C%22color_mode%22%3A%22dark%22%7D%7D; logged_in=yes; dotcom_user=csqting; _gh_sess=37D6rCRVC2JibASPc1N6MTQ%2Fj978CXDTiDCv7QjSMLlWKqGGa1smeK%2B8xZ4fX1b%2BUZtSsFdupYGhCsbx4k7KkHrg834mEVQOzYg4fi26fzkCaB96MaPNs9DbPtwWqgEBgYCZHmqkaAgZTwF8Z6yMyeZofc%2FgqXRPl04gNfdkxHlKImQTfHwem%2Bi2jZIblkreQLuMR04D%2B41HT4jnaNph6ceYbeXx7tb%2BOCRYttZfKgwh6GIUenNsRu39u7jcrrRZQpSneIqSFofDe%2FoYuiOxRZdPPXivPlnEcgRwCEybob9TKkjGWW2KKiV1EaeMDlrb2ecgvBuG2nmjegsAFcwdoN%2FeVMZ%2Bhrk426b6uGsj%2FLK%2Bs70JeUslEcM65VTcEi7vd7cCWxN%2Bx12YQOn0KQ%2BDUzbix%2FCbZolCWWQs0%2Fw5GrFO2XXs98zPwQut%2FuUV2KGu5%2FXBpI7rUIQrrP063I8izSFVbYUJ51poKdpjmwg0O6VZXhixotftkPNXHO2NCJrzCV1IK9TVrjHeYQYalPokpTtINwh6TGuDmfJLdZnCDr%2F7CeHs6WaQMJ%2Bz4UboIsBUuQjkPHycBDNpQ3%2B3unD1SAQblWXdsW4IQJhgWWkojyfk70iONTDjoTNsFL2UGcMjDbgPAT70OCzyRCV7ZjK6lAeTmtKOkfxECEnfXJz%2FThhMiQUPm49cT4qxaAgkvoqRCNjb8o2l8Q1ZI%2B%2B83QhKZmOxmFDB%2BmVtVlCG%2FdRtbVbvNtOwYdiBQ2hwlYUoiZVw89t0fPTfzMnwneF3OIAyTq4n7ugooJqkCKKUpMWFT4PzReb59xnSfVNlROKm6B%2BIoNEkFnADWLbDMoh0jyRPT6Kzo6GZDXGPCn5Pdbj1QrtCa4I8thaHFw%3D%3D--TLPPQIkHZvUr%2FeNc--B%2FBP6cHV2VuqyNvzo7JmXQ%3D%3D'
}

# 构建cookies字典
temp = '_octo=GH1.1.186456086.1726021874; preferred_color_mode=light; tz=Asia%2FShanghai; _device_id=30702b425eb631902645d34d468c44f3; saved_user_sessions=169171441%3A9O71jCj_OP43YLGcjwEAZyoiu7MFkEV8-8uziErITUz9UtRk; user_session=9O71jCj_OP43YLGcjwEAZyoiu7MFkEV8-8uziErITUz9UtRk; __Host-user_session_same_site=9O71jCj_OP43YLGcjwEAZyoiu7MFkEV8-8uziErITUz9UtRk; tz=Asia%2FShanghai; color_mode=%7B%22color_mode%22%3A%22auto%22%2C%22light_theme%22%3A%7B%22name%22%3A%22light%22%2C%22color_mode%22%3A%22light%22%7D%2C%22dark_theme%22%3A%7B%22name%22%3A%22dark%22%2C%22color_mode%22%3A%22dark%22%7D%7D; logged_in=yes; dotcom_user=csqting; _gh_sess=37D6rCRVC2JibASPc1N6MTQ%2Fj978CXDTiDCv7QjSMLlWKqGGa1smeK%2B8xZ4fX1b%2BUZtSsFdupYGhCsbx4k7KkHrg834mEVQOzYg4fi26fzkCaB96MaPNs9DbPtwWqgEBgYCZHmqkaAgZTwF8Z6yMyeZofc%2FgqXRPl04gNfdkxHlKImQTfHwem%2Bi2jZIblkreQLuMR04D%2B41HT4jnaNph6ceYbeXx7tb%2BOCRYttZfKgwh6GIUenNsRu39u7jcrrRZQpSneIqSFofDe%2FoYuiOxRZdPPXivPlnEcgRwCEybob9TKkjGWW2KKiV1EaeMDlrb2ecgvBuG2nmjegsAFcwdoN%2FeVMZ%2Bhrk426b6uGsj%2FLK%2Bs70JeUslEcM65VTcEi7vd7cCWxN%2Bx12YQOn0KQ%2BDUzbix%2FCbZolCWWQs0%2Fw5GrFO2XXs98zPwQut%2FuUV2KGu5%2FXBpI7rUIQrrP063I8izSFVbYUJ51poKdpjmwg0O6VZXhixotftkPNXHO2NCJrzCV1IK9TVrjHeYQYalPokpTtINwh6TGuDmfJLdZnCDr%2F7CeHs6WaQMJ%2Bz4UboIsBUuQjkPHycBDNpQ3%2B3unD1SAQblWXdsW4IQJhgWWkojyfk70iONTDjoTNsFL2UGcMjDbgPAT70OCzyRCV7ZjK6lAeTmtKOkfxECEnfXJz%2FThhMiQUPm49cT4qxaAgkvoqRCNjb8o2l8Q1ZI%2B%2B83QhKZmOxmFDB%2BmVtVlCG%2FdRtbVbvNtOwYdiBQ2hwlYUoiZVw89t0fPTfzMnwneF3OIAyTq4n7ugooJqkCKKUpMWFT4PzReb59xnSfVNlROKm6B%2BIoNEkFnADWLbDMoh0jyRPT6Kzo6GZDXGPCn5Pdbj1QrtCa4I8thaHFw%3D%3D--TLPPQIkHZvUr%2FeNc--B%2FBP6cHV2VuqyNvzo7JmXQ%3D%3D'

# 方案一
# cookies_list = temp.split('; ')
# cookies = {}
# for cookie in cookies_list:
#     name, _, value = cookie.partition('=')  # split on the first '=' only
#     cookies[name] = value
#
# response = requests.get(url, headers=headers, cookies=cookies)
#
#
# with open('guitub_2.html', 'wb') as f:
#     f.write(response.content)

# Option 2 (the one used in practice)
# Break the raw Cookie header string into "name=value" pairs.
cookies_list = temp.split('; ')
# Split each pair on its FIRST '=' only: a cookie value may itself contain
# '=' characters, and splitting on every '=' would silently truncate it.
# partition() also tolerates a pair with no '=' (the value becomes ''),
# and empty names (e.g. from an empty input string) are skipped.
# If a name appears more than once, the last occurrence wins.
cookies = {
    name: value
    for name, _, value in (cookie.partition('=') for cookie in cookies_list)
    if name
}
print(cookies)

# Hand the cookies to requests as a dict through the `cookies` parameter
# instead of embedding them in the headers.
response = requests.get(url, headers=headers, cookies=cookies)


# with open('guitub_3.html', 'wb') as f:
#     f.write(response.content)

2.cookiejar的处理(存储和管理 cookie)

python 复制代码
# -*- coding: utf-8 -*-
import requests

url = 'https://www.baidu.com'

# Fetch the page and keep a handle on the cookiejar attached to the response.
response = requests.get(url)
cookie_jar = response.cookies
print(cookie_jar)

# Cookiejar -> plain dict of cookie names and values.
dict_cookies = requests.utils.dict_from_cookiejar(cookie_jar)
print(dict_cookies)

# Plain dict -> cookiejar again. The new jar is built from the dict alone,
# so it only knows the names and values stored there.
jar_cookies = requests.utils.cookiejar_from_dict(dict_cookies)
print(jar_cookies)
3.模拟网络波动,timeout的使用
python 复制代码
# -*- coding: utf-8 -*-
import requests

# Target used to demonstrate a slow or unreachable host.
url = 'https://twitter.com'

# timeout=3 caps how long requests waits on the server (in seconds).
# NOTE(review): per the requests docs, exceeding it raises a
# requests.exceptions.Timeout subclass, which is not caught here, so the
# script would stop with a traceback in that case; confirm this is intended.
response = requests.get(url, timeout=3)
# Only reached when the request completed within the timeout.
print(response.cookies)
相关推荐
Tech Synapse5 小时前
Python网络爬虫实践案例:爬取猫眼电影Top100
开发语言·爬虫·python
数据小爬虫@6 小时前
利用Python爬虫获取淘宝店铺详情
开发语言·爬虫·python
B站计算机毕业设计超人12 小时前
计算机毕业设计SparkStreaming+Kafka新能源汽车推荐系统 汽车数据分析可视化大屏 新能源汽车推荐系统 汽车爬虫 汽车大数据 机器学习
数据仓库·爬虫·python·数据分析·kafka·数据可视化·推荐算法
易辰君14 小时前
【Python爬虫实战】深入解析 Scrapy 爬虫框架:高效抓取与实战搭建全指南
开发语言·爬虫·python
风动也无爱14 小时前
Java的正则表达式和爬虫
java·爬虫·正则表达式
数据小爬虫@15 小时前
如何利用Python爬虫精准获得1688店铺的所有商品信息
开发语言·爬虫·python
好看资源平台1 天前
动态网站数据爬取——Selenium的使用
爬虫·python
兆。1 天前
python实战案例----使用 PyQt5 构建简单的 HTTP 接口测试工具
爬虫·python·qt
吖吖耶3331 天前
【Python爬虫】Scrapy框架实战
爬虫·python·scrapy
Token_w1 天前
Python爬虫进阶实战项目:使用青果网代理高效爬取某手办网详情数据
大数据·网络·爬虫·python·tcp/ip·tcp