python
import os
import re
import requests
import pdfkit
import parsel
'''
1.先获取html文章内容获取小赖
2.把html文件转成pdf
'''
filename = 'html\\'
if not os.path.exists(filename):
os.mkdir(filename)
filename2 = 'pdf\\'
if not os.path.exists(filename2):
os.mkdir(filename2)
url = 'https://zhuanlan.zhihu.com/p/193129156'
cookies = {
'SESSIONID': 'ZehN0iPTjKrdNi2WFa8ER0HYH12U9US4kgYo4OyrAw9',
'JOID': 'UFkXAUwdzw4BSxxaBBwZFvYmUVAcTqd8awp4CkBenUlQPHAJRUHN4GRNHFkB00noQ-NEB7nC7Z2WgNoZlyTCwLM=',
'osd': 'VVEWC0IYxw8LRRlSBRYXE_4nW14ZRqZ2ZQ9wC0pQmEFRNn4MTUDH7mFFHVMP1kHpSe1BD7jI45iegdAXkizDyr0=',
'_zap': '5d19d70f-e34a-45e7-8cba-44d6328dbfa5',
'd_c0': 'AGBYomVNDBiPTgMQXEjxJSr1GFG2DZT3PU8=|1705922455',
'_xsrf': '4b939fa4-4fa2-46dc-82f6-2857cce6a020',
'Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49': '1708692590,1709899965',
'captcha_session_v2': '2|1:0|10:1709899965|18:captcha_session_v2|88:OXFTZEJJWktjZlIyNlBIQ0taVFBVeEl1SU56ODFSenFLVEhUb3owaTNBQ2FIMVZOVUI3VGVIbjdFMmRjOTQxRg==|db2630233d68810f30a1c841e1c55bbe6681e70720408bbc56daf1b71d4b2ffe',
'tst': 'r',
'Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49': '1709900122',
'z_c0': '2|1:0|10:1709900123|4:z_c0|92:Mi4xbk1rS0V3QUFBQUFBWUZpaVpVME1HQ1lBQUFCZ0FsVk4wMDdZWmdBY19qeXVOczZhdnJRcUVFMlRZQ0t3NmwyMFR3|4a5a855581731f870a9bdd1b6451c482813a5701508ab9082e7019c802cd4b16',
'KLBRSID': 'fe78dd346df712f9c4f126150949b853|1709900123|1709900121',
}
headers = {
'authority': 'zhuanlan.zhihu.com',
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'accept-language': 'zh-CN,zh;q=0.9',
'cache-control': 'max-age=0',
# 'cookie': 'SESSIONID=ZehN0iPTjKrdNi2WFa8ER0HYH12U9US4kgYo4OyrAw9; JOID=UFkXAUwdzw4BSxxaBBwZFvYmUVAcTqd8awp4CkBenUlQPHAJRUHN4GRNHFkB00noQ-NEB7nC7Z2WgNoZlyTCwLM=; osd=VVEWC0IYxw8LRRlSBRYXE_4nW14ZRqZ2ZQ9wC0pQmEFRNn4MTUDH7mFFHVMP1kHpSe1BD7jI45iegdAXkizDyr0=; _zap=5d19d70f-e34a-45e7-8cba-44d6328dbfa5; d_c0=AGBYomVNDBiPTgMQXEjxJSr1GFG2DZT3PU8=|1705922455; _xsrf=4b939fa4-4fa2-46dc-82f6-2857cce6a020; Hm_lvt_98beee57fd2ef70ccdd5ca52b9740c49=1708692590,1709899965; captcha_session_v2=2|1:0|10:1709899965|18:captcha_session_v2|88:OXFTZEJJWktjZlIyNlBIQ0taVFBVeEl1SU56ODFSenFLVEhUb3owaTNBQ2FIMVZOVUI3VGVIbjdFMmRjOTQxRg==|db2630233d68810f30a1c841e1c55bbe6681e70720408bbc56daf1b71d4b2ffe; tst=r; Hm_lpvt_98beee57fd2ef70ccdd5ca52b9740c49=1709900122; z_c0=2|1:0|10:1709900123|4:z_c0|92:Mi4xbk1rS0V3QUFBQUFBWUZpaVpVME1HQ1lBQUFCZ0FsVk4wMDdZWmdBY19qeXVOczZhdnJRcUVFMlRZQ0t3NmwyMFR3|4a5a855581731f870a9bdd1b6451c482813a5701508ab9082e7019c802cd4b16; KLBRSID=fe78dd346df712f9c4f126150949b853|1709900123|1709900121',
'referer': 'https://www.zhihu.com/column/c_1090924073042837504',
'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'sec-fetch-dest': 'document',
'sec-fetch-mode': 'navigate',
'sec-fetch-site': 'same-site',
'sec-fetch-user': '?1',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
}
response = requests.get(url=url, cookies=cookies, headers=headers).text
selector = parsel.Selector(response)
title = selector.css('.Post-Title::text').get()
html = selector.css('.css-376mun').get()
# 把文章链接获取出来,然后替换到显示图片的位置
img_url_list = re.findall('<noscript><img src="(.*?)" data-caption=',html)
# 提取需要被替换的内容
img_list = re.findall('</noscript><img src="(.*?)" data-caption=',html)
for img_url,img in zip(img_url_list,img_list):
html = html.replace(img,img_url)
'''把html文章内容,保存成html文件'''
html_str = '''
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Document</title>
</head>
<body>
{article}
</body>
</html>
'''
content = html_str.format(article=html) # 字符串格式化方法,把获取到的文章内容传入到前端模板中
html_path = filename+title+'.html'
pdf_path = filename2+title+'.pdf'
with open(html_path,mode='w',encoding='utf-8') as f:
f.write(content)
config = pdfkit.configuration(wkhtmltopdf=r'C:\Users\86187\PycharmProjects\pythonProject2\wkhtmltoodf\wkhtmltopdf\bin\wkhtmltopdf.exe')
pdfkit.from_file(html_path,pdf_path,configuration=config)
print(title)
print(content)
总结:
1.保存成html格式,首先写html框架,然后,然后获取页面,将页面格式化保存。
python
html = selector.css('.css-376mun').get()
'''把html文章内容,保存成html文件'''
html_str = '''
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Document</title>
</head>
<body>
{article}
</body>
</html>
'''
content = html_str.format(article=html)
2.保存后会出现问题,图片的链接会失效,因此需要将链接进行替换。
python
img_url_list = re.findall('<noscript><img src="(.*?)" data-caption=',html)
# 提取需要被替换的内容
img_list = re.findall('</noscript><img src="(.*?)" data-caption=',html)
for img_url,img in zip(img_url_list,img_list):
html = html.replace(img,img_url)
3.下载一个驱动wkhtmltopdf,下载链接:wkhtmltopdf 。然后使用pdfkit库进行使用。