Python Crawler: Scrapy Pipeline-Based Persistent Storage
This article builds on the earlier posts on terminal-command-based persistent storage and data parsing with Python crawlers.
Scrapy persistent storage
Pipeline-based:
Coding workflow:
1. Parse the data
2. Define the corresponding fields in the item class
3. Wrap the parsed data into an item-type object
4. In the pipeline class's process_item method, persist the data carried by the received item object
5. Enable the pipeline in the settings file
In practice:
1. Define the item class in items.py
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy


class QiushiproItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    content = scrapy.Field()
    # name = scrapy.Field()
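A scrapy.Item supports dict-style access, which is why the spider in the next step can assign and read its fields by key. A quick illustrative sketch with placeholder values (not real scraped data):

from qiushiPro.items import QiushiproItem

item = QiushiproItem()
item['title'] = 'a placeholder title'      # made-up value, just for illustration
item['content'] = 'a placeholder content'  # made-up value, just for illustration
print(item['title'])   # fields are read back by key
print(dict(item))      # an Item can also be converted into a plain dict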
2. In qiushi.py, wrap the parsed data into the item class
import scrapy
from qiushiPro.items import QiushiproItem


class QiushiSpider(scrapy.Spider):
    name = "qiushi"
    # allowed_domains = ["www.xxx.com"]
    start_urls = ["https://www.qiushile.com/duanzi/"]

    def parse(self, response):
        # Parse: joke title + joke content
        li_list = response.xpath('//*[@id="ct"]/div[1]/div[2]/ul/li')
        for li in li_list:
            # xpath returns a list, and its elements are always Selector objects
            # extract() pulls out the string stored in a Selector's data attribute
            # title = li.xpath('./div[2]/div[1]/a/text()')[0].extract()
            title = li.xpath('./div[2]/div[1]/a/text()').extract_first()
            # calling extract() on the whole list would extract the data string of every Selector in it
            content = li.xpath('./div[2]/div[2]//text()')[0].extract()

            item = QiushiproItem()
            item['title'] = title
            item['content'] = content
            yield item  # submit the item to the pipeline
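Note that [0].extract() keeps only the first text node under the content div. If a joke's body spans several tags, a common variant (a sketch, assuming the same page structure as above) is to extract every text node and join them, replacing the content line inside the loop:

            # join every text fragment under the content div instead of keeping only the first one
            content = ''.join(li.xpath('./div[2]/div[2]//text()').extract()).strip()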
3. Persist the data in the process_item method in pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter


class QiushiproPipeline:
    fp = None

    # Override a parent-class method: it is called only once, when the spider starts
    def open_spider(self, spider):
        print('Spider started...')
        self.fp = open('./qiushi.txt', 'w', encoding='utf-8')

    # Dedicated to handling item-type objects
    # This method receives the item objects submitted by the spider file
    # It is called once for every item it receives
    def process_item(self, item, spider):
        title = item['title']
        content = item['content']
        self.fp.write(title + ':' + content + '\n')
        return item

    def close_spider(self, spider):
        print('Spider finished!')
        self.fp.close()
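Because process_item returns the item, Scrapy can pass the same item on to further pipeline classes. If you also wanted to save the jokes into a database, a second pipeline class could be added next to the file pipeline. The following is only a sketch: the MySQL connection parameters and the duanzi(title, content) table are hypothetical and would need to match your own setup.

import pymysql


class MysqlPipeline:
    conn = None
    cursor = None

    def open_spider(self, spider):
        # hypothetical connection parameters; replace them with your own
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root',
                                    password='123456', db='qiushi', charset='utf8')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        try:
            # hypothetical table: duanzi(title varchar, content varchar)
            self.cursor.execute('insert into duanzi values (%s, %s)',
                                (item['title'], item['content']))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item  # hand the item on to the next pipeline, if any

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()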
4. In the settings.py config file, uncomment the pipeline setting to enable the pipeline
ITEM_PIPELINES = {
    "qiushiPro.pipelines.QiushiproPipeline": 300,
    # 300 is the priority; the smaller the number, the higher the priority
}
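If the hypothetical MysqlPipeline sketched above were added, both classes would be registered here; each item then passes through them in priority order (lower number first):

ITEM_PIPELINES = {
    "qiushiPro.pipelines.QiushiproPipeline": 300,  # runs first: file storage
    "qiushiPro.pipelines.MysqlPipeline": 301,      # runs second: hypothetical database storage
}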
Run: enter scrapy crawl qiushi in the terminal.
You should then see the qiushi.txt file being generated.