# Source code (copied from a web page):
# 抓取政府工作报告的文本
import requests, os, jieba, numpy
from lxml import etree
from wordcloud import WordCloud
from PIL import Image # 装库:pip install pillow
class OneSpider(object):
    """Download a government work report and render it as a word cloud.

    The article text is fetched once and cached in ``TEXT_FILE``; the word
    cloud is then built from that cached text, a stop-word list and a mask
    image. All paths are relative to the current working directory.
    """

    # Page that hosts the article; the article id is sent as a query parameter.
    START_URL = 'https://www.ynbdm.cn/news.php'
    ARTICLE_ID = '31039'
    # Seconds to wait for the server before giving up instead of hanging.
    REQUEST_TIMEOUT = 10

    # Local cache of the scraped article text.
    TEXT_FILE = '政府工作报告.txt'
    # One stop word per line.
    STOP_WORDS_FILE = '../stop_words.txt'
    # Image whose pixel array is handed to WordCloud as the mask.
    MASK_IMAGE = '../Y.jpg'
    # Raw string: the backslashes are literal path separators, not escapes.
    FONT_PATH = r'C:\Windows\Fonts\msyh.ttc'
    # Where the rendered word cloud is written.
    OUTPUT_IMAGE = '第二个.png'

    def request_start_url(self) -> None:
        """Fetch the article page and pass its HTML to ``parse_response``.

        Raises:
            requests.RequestException: on connection problems, a timeout, or
                a non-2xx HTTP status.
        """
        # NOTE(review): session cookie captured from a browser; it has
        # presumably expired by now - confirm whether the site needs it.
        cookies = {
            'PHPSESSID': 'rpkr2o2rots8pe0mr9dp0kn0d1',
        }
        headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'max-age=0',
            'priority': 'u=0, i',
            'sec-ch-ua': '"Not/A)Brand";v="8", "Chromium";v="126", "Google Chrome";v="126"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'sec-fetch-dest': 'document',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-site': 'none',
            'sec-fetch-user': '?1',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
        }
        params = {
            'id': self.ARTICLE_ID,
        }
        resp = requests.get(
            self.START_URL,
            params=params,
            cookies=cookies,
            headers=headers,
            timeout=self.REQUEST_TIMEOUT,
        )
        # Fail loudly instead of caching an error page as the report.
        resp.raise_for_status()
        # When the server sends text/html without a charset, requests falls
        # back to ISO-8859-1, which garbles Chinese text; use the encoding
        # detected from the body in that case.
        if resp.encoding is None or resp.encoding.lower() == 'iso-8859-1':
            resp.encoding = resp.apparent_encoding
        self.parse_response(resp.text)

    def parse_response(self, response: str) -> None:
        """Extract the article text from the page HTML and cache it on disk.

        Args:
            response: HTML source of the article page.

        Raises:
            ValueError: if no text is found under the ``content_show`` div.
                Nothing is written in that case, so an empty cache file
                cannot block later runs from re-fetching.
        """
        tree = etree.HTML(response)
        # Guard against a None tree (seen when the document has no parsable
        # content) so the failure below is the explicit ValueError.
        if tree is None:
            fragments = []
        else:
            fragments = tree.xpath('//div[@class="content_show"]//text()')
        text = ''.join(fragments)
        if not text.strip():
            raise ValueError(
                'no text found under div.content_show; nothing was saved'
            )
        with open(self.TEXT_FILE, 'w', encoding='utf-8') as f:
            f.write(text)
        print('ok -- ' + self.TEXT_FILE)

    def show_image(self) -> None:
        """Build the word cloud from the cached text and save it as an image.

        Reads ``TEXT_FILE``, ``STOP_WORDS_FILE`` and ``MASK_IMAGE`` and writes
        ``OUTPUT_IMAGE``.

        Raises:
            OSError: if one of the input files cannot be opened.
        """
        # 1. Read the cached article text.
        with open(self.TEXT_FILE, 'r', encoding='utf-8') as f:
            report = f.read()

        # 2. Load the stop words into a set for O(1) membership tests.
        #    Stripping each line also removes '\r' left by CRLF files, which
        #    would otherwise stop those words from ever matching.
        with open(self.STOP_WORDS_FILE, 'r', encoding='utf-8') as f:
            stop_words = {line.strip() for line in f}

        # 3. Segment with jieba, dropping single-character tokens and
        #    stop words in one pass.
        words = [
            word
            for word in jieba.cut(report)
            if len(word) != 1 and word not in stop_words
        ]

        # 4. WordCloud takes one string; join the tokens with spaces.
        text = ' '.join(words)

        # 5. Load the mask image as a pixel array and close the file at once.
        with Image.open(self.MASK_IMAGE) as img:
            mask = numpy.array(img)

        # 6. Configure the cloud. Per the wordcloud documentation, width and
        #    height are ignored when a mask is supplied; they are kept to
        #    leave the call unchanged.
        cloud = WordCloud(
            width=500,
            height=500,
            background_color='white',
            mask=mask,
            font_path=self.FONT_PATH,
        )

        # 7. Lay out the words and write the result to disk.
        cloud.generate(text)
        cloud.to_file(self.OUTPUT_IMAGE)
        print('------词云图生成完毕-----------')

    def main(self) -> None:
        """Fetch the report if it is not cached yet, then draw the word cloud.

        The word cloud is generated on every run, including the first one
        that has to download the text.
        """
        if not os.path.exists(self.TEXT_FILE):
            self.request_start_url()
        self.show_image()
if __name__ == '__main__':
    # Script entry point: build a spider and run the whole pipeline.
    OneSpider().main()