创建movie_douban爬虫项目:
scrapy startproject movie_douban
进入spiders:
cd movie_douban/movie_douban/spiders
创建doubanMovieSpider爬虫:
scrapy genspider doubanMovieSpider movie.douban.com
修改items.py文件:
class MovieDoubanItem(scrapy.Item):
    # Container for one Douban Top250 movie entry scraped from the list page.
    number = scrapy.Field()         # ranking position on the Top250 list
    name = scrapy.Field()           # movie title
    grade = scrapy.Field()          # rating score
    move_describe = scrapy.Field()  # one-line quote/description
    evaluate = scrapy.Field()       # number-of-ratings text
    introduce = scrapy.Field()      # director / cast / year / genre summary
    image_url = scrapy.Field()      # poster image URL
修改doubanMovieSpider.py:
import scrapy
from ..items import MovieDoubanItem
import requests, os
from scrapy import Request
class DoubanmoviespiderSpider(scrapy.Spider):
    """Spider for the Douban Movie Top250 list.

    Yields one MovieDoubanItem per movie and follows the pagination links
    until all 250 entries have been scraped.
    """

    name = 'doubanMovieSpider'
    allowed_domains = ['movie.douban.com']
    start_urls = ['https://movie.douban.com/top250']

    # Poster image URLs collected during the crawl; save_image() downloads them.
    urllist = []
    # Directory containing this spider file.
    dir = os.path.dirname(__file__)
    # Directory where poster images are saved ('tupian' means 'pictures').
    # os.path.join is portable, unlike hand-built separator strings.
    dir_path = os.path.join(dir, 'tupian')

    def parse(self, response):
        """Parse one Top250 list page.

        Yields a MovieDoubanItem for every movie entry on the page, then a
        Request for the next page if one exists.
        """
        # A failed page has nothing to parse — bail out instead of falling
        # through (the original only printed and kept going).
        if response.status == 404:
            print('failed url')
            return

        # Each movie entry is an <li> inside the ordered list.
        move_lists = response.xpath("//ol[@class='grid_view']/li")
        for move in move_lists:
            item = MovieDoubanItem()
            # Movie title: the first <span class="title"> holds the Chinese title.
            item['name'] = move.xpath(".//span[@class='title'][1]/text()").extract_first()
            # Rating score.
            item['grade'] = move.xpath(".//div[@class='star']/span[@class='rating_num']/text()").extract_first()
            # Ranking number.
            item['number'] = move.xpath(".//div[@class='pic']/em/text()").extract_first()
            # One-line description (the quote under the rating).
            item['move_describe'] = move.xpath(".//p[@class='quote']/span[@class='inq']/text()").extract_first()
            # Number of ratings.
            item['evaluate'] = move.xpath(".//div[@class='star']/span[4]/text()").extract_first()
            # Introduction: the text fragments contain leading/trailing whitespace
            # and \xa0 runs — strip each fragment, then join with newlines.
            introduce_content = move.xpath(".//div[@class='bd']/p[1]/text()").extract()
            item['introduce'] = '\n'.join(content.strip() for content in introduce_content)
            # Poster image URL; also remembered so save_image() can download it later.
            image_url = move.xpath(".//div[@class='pic']/a/img/@src").extract_first()
            item['image_url'] = image_url
            self.urllist.append(image_url)
            print(item)
            yield item

        # Follow the "next page" link, if any.
        next_page = response.xpath("//div[@class='paginator']/span[@class='next']/a/@href").extract_first()
        if next_page:
            # urljoin resolves the relative '?start=...' href against the
            # current page URL (equivalent to the old hard-coded base + href).
            yield Request(url=response.urljoin(next_page), callback=self.parse)
        # else:  # to download all posters after the last page, uncomment:
        #     for url in self.urllist:
        #         self.save_image(url)

    def save_image(self, url):
        """Download one poster image into dir_path, skipping existing files."""
        # Timeout so a stalled server cannot hang the crawl indefinitely.
        response = requests.get(url, timeout=10)
        # Use the last URL path segment as the file name.
        img_name = url.split('/')[-1]
        img_path = os.path.join(self.dir_path, img_name)
        try:
            # exist_ok avoids a race between the existence check and creation.
            os.makedirs(self.dir_path, exist_ok=True)
            if not os.path.exists(img_path):
                # 'with' closes the file even on error; no explicit close needed.
                with open(img_path, 'wb') as f:
                    f.write(response.content)
            else:
                print("文件已存在")
        except OSError:
            # Narrowed from a bare except: only filesystem failures are expected here.
            print("执行出错")
修改pipelines.py,将数据保存到数据库:
from itemadapter import ItemAdapter
from pymongo import MongoClient
class DoubanMoviePipeline(object):
    """Item pipeline that stores scraped movie items in a MongoDB collection."""

    def open_spider(self, spider):
        """Connect to MongoDB and clear data left over from a previous crawl."""
        # MongoDB connection settings.
        self.MONGO_URI = 'mongodb://localhost:27017/'
        self.DB_NAME = 'movie'                 # database name
        self.COLLECTION_NAME = 'doubanMovie'   # collection name
        self.client = MongoClient(self.MONGO_URI)
        self.db = self.client[self.DB_NAME]
        self.collection = self.db[self.COLLECTION_NAME]
        # Start from a clean collection so reruns do not accumulate duplicates.
        self.collection.delete_many({})
        print('爬取开始')

    def process_item(self, item, spider):
        """Insert one movie item into MongoDB and pass it on unchanged."""
        # Copy the item's fields into a plain dict for insertion.
        item_dict = {
            field: item[field]
            for field in ('name', 'grade', 'number', 'move_describe',
                          'evaluate', 'introduce', 'image_url')
        }
        # Announce BEFORE inserting — the original printed "即将插入数据"
        # ("about to insert") after the insert had already happened.
        print('即将插入数据')
        print(item_dict)
        self.collection.insert_one(item_dict)
        print('数据插入成功')
        return item

    def close_spider(self, spider):
        """Dump the stored documents for inspection, then close the connection."""
        print('爬取结束,显示数据库中所有元素')
        for document in self.collection.find():
            print(document)
        self.client.close()
修改settings.py:
解除pipelines的注释并指定具体类名,在请求头中添加User-Agent字段。
# Enable the project's MongoDB pipeline; 300 is its priority among
# pipelines (lower numbers run first).
ITEM_PIPELINES = {
    'movie_douban.pipelines.DoubanMoviePipeline': 300,
}

# Headers sent with every request; the browser-like User-Agent keeps the
# site from rejecting the default Scrapy client string.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
}
运行项目:
在movie_douban/movie_douban文件夹下新建run.py文件:
from scrapy import cmdline
# Programmatic equivalent of running `scrapy crawl doubanMovieSpider` from a
# shell; `-s LOG_ENABLED=true` switches Scrapy's logging on for this run.
cmdline.execute("scrapy crawl doubanMovieSpider -s LOG_ENABLED=true".split())
运行run.py,得到结果如下:
爬取250条电影数据: