前期我们介绍过使用xpath解析数据,这次在原基础上将爬取到的数据直接保存到MongoDB中。
参考代码如下:
from lxml import etree
import requests
import re
import pymongo
# Browser-like User-Agent so Douban does not reject the requests as a bot.
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:132.0) Gecko/20100101 Firefox/132.0'
}
# 根据url抓取网页内容
def getOnePage(url):
resp = requests.get(url, headers=headers)
try:
# 服务器响应正常
if resp.status_code == 200:
return resp.text
return None
except Exception:
return None
# 分析HTML代码 xpath 获取内容 使用正则表达式匹配所需字符串
def parseOnePage(html):
# 获取连接对象
client = pymongo.MongoClient()
# 获取数据库对象如果db-books不存在新建
db = client['db-novels']
# 获取要操作的集合 如果此集合不存在 会新建
collection = db['collection-book']
selector_html = etree.HTML(html)
#选取节点 获取所有的图书的div
items = selector_html.xpath('//div[@class="doulist-item"]')
# 遍历div
for item in items:
# 图书的图片地址
pic = item.xpath('.//div[@class="post"]/a/img/@src')[0]
bname = item.xpath('.//div[@class="title"]/a/text()')[0]
bname = re.search("\\w+", bname)
bname = bname.group()
rate = item.xpath('.//div[@class="rating"]/span[last()-1]/text()')[0]
author = item.xpath('.//div[@class="abstract"]/text()')[0]
author = re.search("(?<=作者:\\s)(.*)", author, re.M)
if author is not None:
author = author.group()
company = item.xpath('.//div[@class="abstract"]/text()')[1]
company = re.search("(?<=出版社:\\s)(.*)", company)
company = company.group()
date = item.xpath('.//div[@class="abstract"]/text()')[2]
date = re.search("\\d{4}(-\\d{1,2})?", date)
if date is not None:
date = date.group()
print(bname+'\t'+author+'\t'+company+'\t'+date+'\t'+rate+'\t'+pic)
# 将数据存储在列表中
list = [['bname',bname],['author',author],['company',company],['b-date',date],['rate',rate],['pic-url',pic]]
# 将列表转为字典类型
row = dict(list)
print(row)
# 将数据插入到数据库表中
collection.insert_one(row)
#抓取URL页面,并保存到文件中
def getTop100(url):
# 获取页面的数据
html = getOnePage(url)
# 从页面提取图书信息并保存到MongoDB数据库中
parseOnePage(html)
# 分页的四个Url地址
urls = ['https://www.douban.com/doulist/45004834/?start={}'.format( str(i) ) for i in range(0,100,25)]
for url in urls:
print(url)
getTop100(url)
运行结果如下: