爬取某乎专栏文章html格式,并转到pdf保存

python 复制代码
import os
import re
import requests
import pdfkit
import parsel


'''
1.先获取html文章内容获取小赖
2.把html文件转成pdf
'''
filename = 'html\\'
if not os.path.exists(filename):
    os.mkdir(filename)

filename2 = 'pdf\\'
if not os.path.exists(filename2):
    os.mkdir(filename2)

url = 'https://zhuanlan.zhihu.com/p/193129156'
cookies = {
    'SESSIONID': 'ZehN0iPTjKrdNi2WFa8ER0HYH12U9US4kgYo4OyrAw9',
    'JOID': 'UFkXAUwdzw4BSxxaBBwZFvYmUVAcTqd8awp4CkBenUlQPHAJRUHN4GRNHFkB00noQ-NEB7nC7Z2WgNoZlyTCwLM=',
    'osd': 'VVEWC0IYxw8LRRlSBRYXE_4nW14ZRqZ2ZQ9wC0pQmEFRNn4MTUDH7mFFHVMP1kHpSe1BD7jI45iegdAXkizDyr0=',
    '_zap': '5d19d70f-e34a-45e7-8cba-44d6328dbfa5',
    'd_c0': 'AGBYomVNDBiPTgMQXEjxJSr1GFG2DZT3PU8=|1705922455',
    '_xsrf': '4b939fa4-4fa2-46dc-82f6-2857cce6a020',
    'Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49': '1708692590,1709899965',
    'captcha_session_v2': '2|1:0|10:1709899965|18:captcha_session_v2|88:OXFTZEJJWktjZlIyNlBIQ0taVFBVeEl1SU56ODFSenFLVEhUb3owaTNBQ2FIMVZOVUI3VGVIbjdFMmRjOTQxRg==|db2630233d68810f30a1c841e1c55bbe6681e70720408bbc56daf1b71d4b2ffe',
    'tst': 'r',
    'Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49': '1709900122',
    'z_c0': '2|1:0|10:1709900123|4:z_c0|92:Mi4xbk1rS0V3QUFBQUFBWUZpaVpVME1HQ1lBQUFCZ0FsVk4wMDdZWmdBY19qeXVOczZhdnJRcUVFMlRZQ0t3NmwyMFR3|4a5a855581731f870a9bdd1b6451c482813a5701508ab9082e7019c802cd4b16',
    'KLBRSID': 'fe78dd346df712f9c4f126150949b853|1709900123|1709900121',
}

headers = {
    'authority': 'zhuanlan.zhihu.com',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'accept-language': 'zh-CN,zh;q=0.9',
    'cache-control': 'max-age=0',
    # 'cookie': 'SESSIONID=ZehN0iPTjKrdNi2WFa8ER0HYH12U9US4kgYo4OyrAw9; JOID=UFkXAUwdzw4BSxxaBBwZFvYmUVAcTqd8awp4CkBenUlQPHAJRUHN4GRNHFkB00noQ-NEB7nC7Z2WgNoZlyTCwLM=; osd=VVEWC0IYxw8LRRlSBRYXE_4nW14ZRqZ2ZQ9wC0pQmEFRNn4MTUDH7mFFHVMP1kHpSe1BD7jI45iegdAXkizDyr0=; _zap=5d19d70f-e34a-45e7-8cba-44d6328dbfa5; d_c0=AGBYomVNDBiPTgMQXEjxJSr1GFG2DZT3PU8=|1705922455; _xsrf=4b939fa4-4fa2-46dc-82f6-2857cce6a020; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1708692590,1709899965; captcha_session_v2=2|1:0|10:1709899965|18:captcha_session_v2|88:OXFTZEJJWktjZlIyNlBIQ0taVFBVeEl1SU56ODFSenFLVEhUb3owaTNBQ2FIMVZOVUI3VGVIbjdFMmRjOTQxRg==|db2630233d68810f30a1c841e1c55bbe6681e70720408bbc56daf1b71d4b2ffe; tst=r; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1709900122; z_c0=2|1:0|10:1709900123|4:z_c0|92:Mi4xbk1rS0V3QUFBQUFBWUZpaVpVME1HQ1lBQUFCZ0FsVk4wMDdZWmdBY19qeXVOczZhdnJRcUVFMlRZQ0t3NmwyMFR3|4a5a855581731f870a9bdd1b6451c482813a5701508ab9082e7019c802cd4b16; KLBRSID=fe78dd346df712f9c4f126150949b853|1709900123|1709900121',
    'referer': 'https://www.zhihu.com/column/c_1090924073042837504',
    'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-site',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
}

response = requests.get(url=url, cookies=cookies, headers=headers).text
selector = parsel.Selector(response)
title = selector.css('.Post-Title::text').get()
html = selector.css('.css-376mun').get()
# 把文章链接获取出来,然后替换到显示图片的位置
img_url_list = re.findall('<noscript><img src="(.*?)" data-caption=',html)
# 提取需要被替换的内容
img_list = re.findall('</noscript><img src="(.*?)" data-caption=',html)
for img_url,img in zip(img_url_list,img_list):
    html = html.replace(img,img_url)


'''把html文章内容,保存成html文件'''
html_str = '''
<!doctype html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <title>Document</title>
</head>
<body>
{article}
</body>
</html>
'''
content = html_str.format(article=html)  # 字符串格式化方法,把获取到的文章内容传入到前端模板中
html_path = filename+title+'.html'
pdf_path = filename2+title+'.pdf'
with open(html_path,mode='w',encoding='utf-8') as f:
    f.write(content)
config = pdfkit.configuration(wkhtmltopdf=r'C:\Users\86187\PycharmProjects\pythonProject2\wkhtmltoodf\wkhtmltopdf\bin\wkhtmltopdf.exe')
pdfkit.from_file(html_path,pdf_path,configuration=config)


print(title)
print(content)

总结:

1.保存成html格式,首先写html框架,然后,然后获取页面,将页面格式化保存。

python 复制代码
html = selector.css('.css-376mun').get()
'''把html文章内容,保存成html文件'''
html_str = '''
<!doctype html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <title>Document</title>
</head>
<body>
{article}
</body>
</html>
'''
content = html_str.format(article=html) 

2.保存后会出现问题,图片的链接会失效,因此需要将链接进行替换。

python 复制代码
img_url_list = re.findall('<noscript><img src="(.*?)" data-caption=',html)
# 提取需要被替换的内容
img_list = re.findall('</noscript><img src="(.*?)" data-caption=',html)
for img_url,img in zip(img_url_list,img_list):
    html = html.replace(img,img_url)

3.下载一个驱动wkhtmltopdf,下载链接:wkhtmltopdf 。然后使用pdfkit库进行使用。

相关推荐
m0_7482361127 分钟前
Calcite Web 项目常见问题解决方案
开发语言·前端·rust
Watermelo61739 分钟前
详解js柯里化原理及用法,探究柯里化在Redux Selector 的场景模拟、构建复杂的数据流管道、优化深度嵌套函数中的精妙应用
开发语言·前端·javascript·算法·数据挖掘·数据分析·ecmascript
m0_7482489441 分钟前
HTML5系列(11)-- Web 无障碍开发指南
前端·html·html5
m0_748235611 小时前
从零开始学前端之HTML(三)
前端·html
LostSpeed2 小时前
在福昕(pdf)阅读器中导航到上次阅读页面的方法
pdf
旭久2 小时前
SpringBoot的Thymeleaf做一个可自定义合并td的pdf表格
pdf·html·springboot
一个处女座的程序猿O(∩_∩)O3 小时前
小型 Vue 项目,该不该用 Pinia 、Vuex呢?
前端·javascript·vue.js
hackeroink6 小时前
【2024版】最新推荐好用的XSS漏洞扫描利用工具_xss扫描工具
前端·xss
迷雾漫步者7 小时前
Flutter组件————FloatingActionButton
前端·flutter·dart
向前看-8 小时前
验证码机制
前端·后端