案例需求:
1.使用scrapy爬虫技术爬取链家租房网站(成都租房信息_成都出租房源|房屋出租价格【成都贝壳租房】 )的数据(包括标题、价格和链接)
2.利用XPath进行数据解析
3.保存为本地json文件
分析:
请求地址:
伪装浏览器
# settings.py: send a desktop Chrome User-Agent string with every request.
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"
关闭君子协议(robots.txt 协议)——在 settings.py 中注释掉下面这一行即可(或改为 ROBOTSTXT_OBEY = False)
# ROBOTSTXT_OBEY = True
XPath解析数据
# 2. Parse the data (excerpt from the body of ``parse``).
# Walk the listing nodes one by one and read every field relative to its own
# node, so a listing that lacks a field cannot shift the title/price/link
# pairing of the listings that follow it.
for listing in response.xpath('//div[@class="content__list--item"]'):
    names = listing.xpath('.//a[@class="content__list--item--aside"]/@title').get()
    prices = listing.xpath('.//span[@class="content__list--item-price"]/em/text()').get()
    links = listing.xpath('.//a[@class="content__list--item--aside"]/@href').get()
    # Only complete listings are reported.
    if names is None or prices is None or links is None:
        continue
    print(names)
    print(prices)
    print(links)
    print('=========================')
将数据打包并发送给item
#3.将数据打包 实例化类 item = MyspiderItem() # print(item) item['name']=names item['price']=prices item['link']=links #3.2返回给引擎 yield item
items
class MyspiderItem(scrapy.Item):
    """Container for the fields scraped from one rental listing."""

    name = scrapy.Field()  # title
    price = scrapy.Field()  # price
    link = scrapy.Field()  # link
保存数据:写在管道文件 pipelines.py 中
class MyspiderPipeline:
    """Item pipeline that appends every item to ``lianjia.json``, one JSON object per line."""

    def __init__(self):
        # The output file is opened in open_spider (or lazily by the first
        # process_item call) instead of at construction time.
        self.file = None

    def open_spider(self, spider):
        """Open (and truncate) the output file; safe to call more than once."""
        if self.file is None:
            self.file = open('lianjia.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Write ``item`` as one JSON line and return it unchanged."""
        # Keeps the pipeline usable when it is driven without open_spider.
        if self.file is None:
            self.open_spider(spider)
        print('管道文件的item', item)
        dict_data = dict(item)
        print(type(dict_data))
        # 2. Serialize; ensure_ascii=False keeps non-ASCII text (e.g. Chinese)
        #    readable instead of turning it into \uXXXX escapes.
        json_data = json.dumps(dict_data, ensure_ascii=False) + '\n'
        self.file.write(json_data)
        return item

    def close_spider(self, spider):
        """Close the output file; replaces ``__del__``, which is not guaranteed to run."""
        if self.file is not None:
            self.file.close()
            self.file = None
这时运行结果 item 为空字典——需要在 settings.py 中开启管道才可写入数据:
# settings.py: register the JSON pipeline with the value 300.
ITEM_PIPELINES = {
    "myspider.pipelines.MyspiderPipeline": 300,
}
创建项目:
代码示例:
import scrapy
from myspider.items import MyspiderItem
class LianjiaSpider(scrapy.Spider):
    """Crawl the Lianjia rental listing page and yield one item per listing.

    Every yielded ``MyspiderItem`` carries the listing title (``name``), the
    price text (``price``) and the href of the listing (``link``).
    """

    name = "lianjia"  # spider name, used by ``scrapy crawl lianjia``
    allowed_domains = ["lianjia.com"]  # domain range the spider may crawl
    # Start URL. The task is the Chengdu rental page, which lives on the "cd"
    # subdomain ("cs" is the Changsha site).
    start_urls = ["https://cd.lianjia.com/zufang/"]

    def parse(self, response):
        """Extract title, price and link of every listing in ``response``.

        Also dumps the raw page to ``lj.html`` as a debugging aid.

        Yields:
            MyspiderItem: one item per listing that has all three fields.
        """
        # 1. Save the page source locally first. The encoding is explicit
        #    because the platform default may be unable to encode the page
        #    text, which would raise UnicodeEncodeError.
        with open('lj.html', 'w', encoding='utf-8') as f:
            f.write(response.text)

        # 2. Parse the data. Each field is read relative to its own listing
        #    node, so a listing that lacks a field cannot shift the
        #    title/price/link pairing of the listings that follow it.
        for listing in response.xpath('//div[@class="content__list--item"]'):
            names = listing.xpath('.//a[@class="content__list--item--aside"]/@title').get()
            prices = listing.xpath('.//span[@class="content__list--item-price"]/em/text()').get()
            links = listing.xpath('.//a[@class="content__list--item--aside"]/@href').get()
            # Only complete listings are reported.
            if names is None or prices is None or links is None:
                continue
            print(names)
            print(prices)
            print(links)
            print('=========================')

            # 3. Pack the data: instantiate the item class and fill it in.
            item = MyspiderItem()
            item['name'] = names
            item['price'] = prices
            item['link'] = links
            # 3.2 Hand the item back to the engine.
            yield item
        # 4. Saving the data is done in the item pipeline (pipelines.py).
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class MyspiderItem(scrapy.Item):
    """Container for the fields scraped from one rental listing."""

    # define the fields for your item here like:
    name = scrapy.Field()  # title
    price = scrapy.Field()  # price
    link = scrapy.Field()  # link
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import json
#保存数据-json数据
class MyspiderPipeline:
    """Item pipeline that appends every item to ``lianjia.json``, one JSON object per line."""

    def __init__(self):
        # The output file is opened in open_spider (or lazily by the first
        # process_item call) instead of at construction time.
        self.file = None

    def open_spider(self, spider):
        """Open (and truncate) the output file; safe to call more than once."""
        if self.file is None:
            self.file = open('lianjia.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Write ``item`` as one JSON line and return it unchanged."""
        # Keeps the pipeline usable when it is driven without open_spider.
        if self.file is None:
            self.open_spider(spider)
        print('管道文件的item', item)
        dict_data = dict(item)
        print(type(dict_data))
        # 2. Serialize; ensure_ascii=False keeps non-ASCII text (e.g. Chinese)
        #    readable instead of turning it into \uXXXX escapes.
        json_data = json.dumps(dict_data, ensure_ascii=False) + '\n'
        # Write the line to the already-open file.
        self.file.write(json_data)
        return item

    def close_spider(self, spider):
        """Close the output file; replaces ``__del__``, which is not guaranteed to run."""
        if self.file is not None:
            self.file.close()
            self.file = None
from scrapy import cmdline

# Launcher script: passes the arguments of ``scrapy crawl lianjia --nolog``
# to Scrapy's command-line entry point.
crawl_args = ['scrapy', 'crawl', 'lianjia', '--nolog']
cmdline.execute(crawl_args)
运行结果: