爬取某乎专栏文章,先保存为html文件,再转换成pdf保存

python 复制代码
import os
import re
import requests
import pdfkit
import parsel


# Workflow:
# 1. Fetch the article and save its content as an HTML file.
# 2. Convert that HTML file to PDF.

# Output directory prefixes. They are concatenated directly with the file name
# later in the script, so each one keeps a trailing path separator.
# os.path.join(name, '') yields 'html\\' on Windows (the original value) and
# 'html/' elsewhere, so the prefix is a real directory on every platform.
filename = os.path.join('html', '')
# exist_ok=True avoids the check-then-create race of exists() + mkdir().
os.makedirs(filename, exist_ok=True)

filename2 = os.path.join('pdf', '')
os.makedirs(filename2, exist_ok=True)

# Address of the article page that will be downloaded.
url = 'https://zhuanlan.zhihu.com/p/193129156'
# Cookies sent along with the request.
# NOTE(review): these look like values captured from a logged-in browser
# session (e.g. `z_c0`, `SESSIONID`) — presumably they expire and should not
# live in source control; confirm and load them from a private config instead.
cookies = {
    'SESSIONID': 'ZehN0iPTjKrdNi2WFa8ER0HYH12U9US4kgYo4OyrAw9',
    'JOID': 'UFkXAUwdzw4BSxxaBBwZFvYmUVAcTqd8awp4CkBenUlQPHAJRUHN4GRNHFkB00noQ-NEB7nC7Z2WgNoZlyTCwLM=',
    'osd': 'VVEWC0IYxw8LRRlSBRYXE_4nW14ZRqZ2ZQ9wC0pQmEFRNn4MTUDH7mFFHVMP1kHpSe1BD7jI45iegdAXkizDyr0=',
    '_zap': '5d19d70f-e34a-45e7-8cba-44d6328dbfa5',
    'd_c0': 'AGBYomVNDBiPTgMQXEjxJSr1GFG2DZT3PU8=|1705922455',
    '_xsrf': '4b939fa4-4fa2-46dc-82f6-2857cce6a020',
    'Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49': '1708692590,1709899965',
    'captcha_session_v2': '2|1:0|10:1709899965|18:captcha_session_v2|88:OXFTZEJJWktjZlIyNlBIQ0taVFBVeEl1SU56ODFSenFLVEhUb3owaTNBQ2FIMVZOVUI3VGVIbjdFMmRjOTQxRg==|db2630233d68810f30a1c841e1c55bbe6681e70720408bbc56daf1b71d4b2ffe',
    'tst': 'r',
    'Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49': '1709900122',
    'z_c0': '2|1:0|10:1709900123|4:z_c0|92:Mi4xbk1rS0V3QUFBQUFBWUZpaVpVME1HQ1lBQUFCZ0FsVk4wMDdZWmdBY19qeXVOczZhdnJRcUVFMlRZQ0t3NmwyMFR3|4a5a855581731f870a9bdd1b6451c482813a5701508ab9082e7019c802cd4b16',
    'KLBRSID': 'fe78dd346df712f9c4f126150949b853|1709900123|1709900121',
}

# HTTP request headers; the `user-agent` and `sec-ch-ua*` values identify the
# client as Chrome 120 on Windows.
headers = {
    'authority': 'zhuanlan.zhihu.com',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'accept-language': 'zh-CN,zh;q=0.9',
    'cache-control': 'max-age=0',
    # The raw 'cookie' header is not set here; the same values are supplied
    # through the `cookies` dict above.
    'referer': 'https://www.zhihu.com/column/c_1090924073042837504',
    'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-site',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
}

# Download the article page. The timeout keeps the script from hanging on a
# stalled connection, and raise_for_status() stops on an HTTP error instead of
# parsing an error page as if it were the article.
page = requests.get(url=url, cookies=cookies, headers=headers, timeout=30)
page.raise_for_status()
response = page.text
selector = parsel.Selector(response)
title = selector.css('.Post-Title::text').get()
html = selector.css('.css-376mun').get()
if html is None:
    # Fail with a clear message rather than a TypeError inside re.findall.
    raise RuntimeError('article body (.css-376mun) not found in page: ' + url)

# Every image is present twice in the markup: once inside <noscript> and once
# right after it. The <noscript> copy carries the URL that has to end up in
# the visible <img> (presumably the latter holds a lazy-loading placeholder).
img_url_list = re.findall(r'<noscript><img src="(.*?)" data-caption=', html)
_pending_img_urls = iter(img_url_list)


def _use_real_img_url(match):
    """Swap the src of one post-<noscript> <img> for the next real URL.

    Replacing by position instead of with str.replace() matters because
    several images can share an identical placeholder src; a global text
    replace would point all of them at the first image's URL.
    """
    prefix, current_src, suffix = match.groups()
    # When the real URLs run out, leave the tag as it was.
    return prefix + next(_pending_img_urls, current_src) + suffix


html = re.sub(
    r'(</noscript><img src=")(.*?)(" data-caption=)',
    _use_real_img_url,
    html,
)


# Save the article content as a standalone HTML file, then convert it to PDF.
html_str = '''
<!doctype html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <title>Document</title>
</head>
<body>
{article}
</body>
</html>
'''
# Drop the extracted article markup into the page template.
content = html_str.format(article=html)

# The title becomes a file name, so replace the characters Windows forbids in
# file names and fall back to a fixed name when no title was found.
safe_title = re.sub(r'[\\/:*?"<>|]', '_', title or '').strip() or 'untitled'
html_path = filename + safe_title + '.html'
pdf_path = filename2 + safe_title + '.pdf'
with open(html_path, mode='w', encoding='utf-8') as f:
    f.write(content)

# Location of the wkhtmltopdf executable. The WKHTMLTOPDF_PATH environment
# variable overrides the original hard-coded install path, so the script can
# run on other machines without editing the source.
wkhtmltopdf_path = os.environ.get(
    'WKHTMLTOPDF_PATH',
    r'C:\Users\86187\PycharmProjects\pythonProject2\wkhtmltoodf\wkhtmltopdf\bin\wkhtmltopdf.exe',
)
config = pdfkit.configuration(wkhtmltopdf=wkhtmltopdf_path)
pdfkit.from_file(html_path, pdf_path, configuration=config)


print(title)
print(content)

总结:

1.保存成html格式:首先写好html框架(模板),然后获取页面中的文章内容,将其填入模板后保存。

python 复制代码
# Select the element matching `.css-376mun` from the parsed page and take it
# as an HTML string.
html = selector.css('.css-376mun').get()
# The bare string below is a no-op statement used as a note; it reads
# "save the article HTML content to an HTML file".
'''把html文章内容,保存成html文件'''
# Minimal page template; `{article}` is the placeholder filled in by format().
html_str = '''
<!doctype html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <title>Document</title>
</head>
<body>
{article}
</body>
</html>
'''
# Insert the article markup into the template.
content = html_str.format(article=html) 

2.保存后会出现问题:正文里图片的链接会失效(无法显示),因此需要用<noscript>标签中的图片地址替换掉正文<img>标签里的src。

python 复制代码
# Collect the image URLs found inside the <noscript> tags.
img_url_list = re.findall(r'<noscript><img src="(.*?)" data-caption=', html)
_pending_img_urls = iter(img_url_list)


def _use_real_img_url(match):
    """Swap the src of one post-<noscript> <img> for the next collected URL.

    Replacing by position instead of with str.replace() matters because
    several images can share an identical src to be replaced; a global text
    replace would point all of them at the first image's URL.
    """
    prefix, current_src, suffix = match.groups()
    # When the collected URLs run out, leave the tag as it was.
    return prefix + next(_pending_img_urls, current_src) + suffix


# Rewrite the src of each <img> that directly follows a </noscript>.
html = re.sub(
    r'(</noscript><img src=")(.*?)(" data-caption=)',
    _use_real_img_url,
    html,
)

3.下载并安装转换工具wkhtmltopdf(下载链接:https://wkhtmltopdf.org/downloads.html ),然后通过pdfkit库调用它,把html文件转换成pdf。

相关推荐
结网的兔子39 分钟前
前端学习笔记——Element Plus 栅格布局系统示例
前端·javascript·css
l1t1 小时前
DeepSeek总结的用 C# 构建 DuckDB 插件说明
前端·数据库·c#·插件·duckdb
泯泷1 小时前
从零构建寄存器式 JSVMP:实战教程导读
前端·javascript·算法
开源盛世!!2 小时前
3.19-3.21
linux·服务器·前端
必胜刻2 小时前
AJAX 请求理解
前端·ajax·okhttp·前后端交互
小民AI实战笔记2 小时前
NVM实战指南:高效管理你的Node.js环境
前端·node.js
www_stdio2 小时前
前端异步核心:Promise 从入门到吃透
前端
朱建伟2 小时前
大神尤雨溪再次出手,前端工具链整合--该文章是对vite plus官方README文档进行了翻译
前端·vite
vball2 小时前
宏观数据从哪里来?——主流宏观经济数据库与API全景
前端
Predestination王瀞潞2 小时前
5.4.1 通信->WWW万维网内容访问标准(W3C):WWW(World Wide Web)基本信息&核心设计目标&现实意义
css·网络·网络协议·html·url·www