项目名称:当当网的爬取(一)——爬取青春文学的书籍数据
案例需求:
1.使用scrapy爬虫技术爬取当当网中青春文学的书籍数据,包括(标题、现价、定价、作者、出版日期、出版社、书本详情和书本图片url)
2.将获取到的数据保存在数据库中
3.实现分页爬取
分析
1.数据包的获取
2.准备工作:
# ROBOTSTXT_OBEY = True
ITEM_PIPELINES = { "Dangd.pipelines.DangdPipeline": 300, }
USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"
3.解析数据
同理,用浏览器开发者工具复制各字段的 XPath:先取某一个 li 节点下的路径(如 li[3]、li[4]),再去掉 li 的下标即可匹配整页所有书籍:
标题: /html/body/div[2]/div/div[3]/div[1]/div[1]/div[2]/div/ul/li[3]/p[1]/a /html/body/div[2]/div/div[3]/div[1]/div[1]/div[2]/div/ul/li[4]/p[1]/a /html/body/div[2]/div/div[3]/div[1]/div[1]/div[2]/div/ul/li/p[1]/a 现价: /html/body/div[2]/div/div[3]/div[1]/div[1]/div[2]/div/ul/li/p[3]/span[1] 定价: /html/body/div[2]/div/div[3]/div[1]/div[1]/div[2]/div/ul/li/p[3]/span[2] 作者: /html/body/div[2]/div/div[3]/div[1]/div[1]/div[2]/div/ul/li/p[5]/span[1]/a[1] 出版日期: /html/body/div[2]/div/div[3]/div[1]/div[1]/div[2]/div/ul/li/p[5]/span[2] 出版社: /html/body/div[2]/div/div[3]/div[1]/div[1]/div[2]/div/ul/li/p[5]/span[3]/a 书本详情: /html/body/div[2]/div/div[3]/div[1]/div[1]/div[2]/div/ul/li/p[2]/text() 图片: /html/body/div[2]/div/div[3]/div[1]/div[1]/div[2]/div/ul/li/a/img
# Parse one category page: pull the eight book fields as parallel lists,
# then walk them in lockstep and print each book.
# The <img alt> attribute carries the title text for every book.
title = response.xpath('/html/body/div[2]/div/div[3]/div[1]/div[1]/div[2]/div/ul/li/a/img/@alt').extract()
# Current (discounted) price.
price_xz = response.xpath('/html/body/div[2]/div/div[3]/div[1]/div[1]/div[2]/div/ul/li/p[3]/span[1]/text()').extract()
# List (original) price.
price_dj = response.xpath('/html/body/div[2]/div/div[3]/div[1]/div[1]/div[2]/div/ul/li/p[3]/span[2]/text()').extract()
author = response.xpath('/html/body/div[2]/div/div[3]/div[1]/div[1]/div[2]/div/ul/li/p[5]/span[1]/a[1]/text()').extract()
date = response.xpath('/html/body/div[2]/div/div[3]/div[1]/div[1]/div[2]/div/ul/li/p[5]/span[2]/text()').extract()
# Publisher.
cbs = response.xpath('/html/body/div[2]/div/div[3]/div[1]/div[1]/div[2]/div/ul/li/p[5]/span[3]/a/text()').extract()
detail = response.xpath('/html/body/div[2]/div/div[3]/div[1]/div[1]/div[2]/div/ul/li/p[2]/text()').extract()
# Take the <li> selectors themselves (not a flat @src list) so each item's
# lazy-load attribute can be inspected individually below.
url = response.xpath('//ul[@id="component_59"]/li')
# NOTE(review): zip() truncates to the shortest list — if any field is
# missing on some <li>, items can silently pair up wrongly; verify that all
# eight lists have equal length for this page layout.
for t, px, pd, a, d, c, de, u in zip(title, price_xz, price_dj, author, date, cbs, detail, url):
    # The site lazy-loads cover images: the real URL lives in
    # data-original for most items, and only in src for the first few
    # images that load eagerly.
    src = u.xpath('.//img/@data-original').extract_first()
    if src:
        src = 'http:' + src
    else:
        src = 'http:' + u.xpath('.//img/@src').extract_first()
    print('标题:', t)
    print('现价:', px)
    print('定价:', pd)
    print('作者:', a)
    # Strip the leading '/' the site prepends to the publication date.
    print('出版日期:', d.replace('/', ''))
    print('出版社:', c)
    print('书本详情:', de)
    print('书本图片:', src)
    print('=====================================')
class DangdItem(scrapy.Item):
    """One book record scraped from Dangdang's youth-literature category.

    Field names match both the spider's extraction variables and the
    columns of the `qcwx` MySQL table the pipeline writes to.
    """

    title = scrapy.Field()     # book title
    price_xz = scrapy.Field()  # current (discounted) price
    price_dj = scrapy.Field()  # list (original) price
    author = scrapy.Field()    # author name
    date = scrapy.Field()      # publication date
    cbs = scrapy.Field()       # publisher
    detail = scrapy.Field()    # short description / blurb
    url = scrapy.Field()       # cover image URL
4.翻页
第一页 http://category.dangdang.com/cp01.01.00.00.00.00.html
第二页 http://category.dangdang.com/pg2-cp01.01.00.00.00.00.html
第三页 http://category.dangdang.com/pg3-cp01.01.00.00.00.00.html
第四页 http://category.dangdang.com/pg4-cp01.01.00.00.00.00.html
所以总结:
http://category.dangdang.com/pg{}-cp01.01.00.00.00.00.html
if self.page<10:#就不保存完了
self.page=self.page+1
url=self.base_url+str(self.page)+'-cp01.01.00.00.00.00.html'
print('+++++++++++++++++第{}页++++++++++++++++++'.format(self.page))
yield scrapy.Request(url=url,callback=self.parse)
5.保存至数据库
class DangdPipeline:
    """Scrapy item pipeline that persists scraped books into MySQL.

    One pymysql connection is opened when the pipeline is created and
    reused for every item; it is released in close_spider() so the
    connection and cursor are not leaked when the crawl ends.
    """

    def __init__(self):
        # Connect once up front instead of per item.
        # NOTE(review): credentials are hard-coded — move them into
        # settings.py / environment variables before sharing this code.
        self.conn = pymysql.connect(
            host='localhost',
            port=3306,
            user='root',
            passwd='wx990826',
            db='dangdang',
            charset='utf8mb4',  # titles/details are Chinese text
        )
        self.cur = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert one book row and commit.

        Parameterized SQL (%s placeholders) keeps the scraped strings
        safely escaped. On failure the transaction is rolled back so the
        connection stays usable for subsequent items.
        """
        sqli = "insert into qcwx(title,price_xz,price_dj,author,date,cbs,detail,url) values(%s,%s,%s,%s,%s,%s,%s,%s)"
        try:
            self.cur.execute(sqli, (
                item['title'], item['price_xz'], item['price_dj'], item['author'],
                item['date'], item['cbs'], item['detail'], item['url']))
            self.conn.commit()
        except Exception:
            self.conn.rollback()
            raise
        print('保存完毕')
        return item

    def close_spider(self, spider):
        """Release the cursor and connection when the spider finishes."""
        self.cur.close()
        self.conn.close()
6.运行
# Launch the spider programmatically — equivalent to running
# `scrapy crawl dangd --nolog` from the shell.
from scrapy import cmdline

cmdline.execute('scrapy crawl dangd --nolog'.split())
运行结果:
注意:该网站的图片为懒加载