爬虫中间件
python
# 爬虫中间件 (了解) middlewares.py
class MysfirstscrapySpiderMiddleware:
    """Spider middleware: sits between the engine and the spider, observing
    responses on the way in and items/requests on the way out.

    This is the default Scrapy template; every hook is a pure passthrough.
    """

    @classmethod
    def from_crawler(cls, crawler):
        # Factory Scrapy uses to build the middleware; also subscribe to the
        # spider_opened signal so we get a log line when a spider starts.
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response, spider):
        # Called for each response routed into the spider.
        # None means "continue processing this response".
        return None

    def process_spider_output(self, response, result, spider):
        # Forward every item/request the spider produced, unchanged.
        yield from result

    def process_spider_exception(self, response, exception, spider):
        # No special handling; let other middlewares / the engine decide.
        pass

    def process_start_requests(self, start_requests, spider):
        # Pass the spider's start requests straight through.
        yield from start_requests

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
下载中间件
-进来request对象
-加代理
-加cookie
-加请求头
-出去response对象
-修改响应对象,最后进入到爬虫的parser中就是修改后的response
python
# 下载中间件
class MysfirstscrapyDownloaderMiddleware:
    """Downloader middleware: sits between the engine and the downloader.

    Outgoing requests pass through process_request (the place to add
    proxies, cookies, headers); incoming responses pass through
    process_response (the place to alter the response the spider's parse
    callback will ultimately receive).
    """

    @classmethod
    def from_crawler(cls, crawler):
        # Factory Scrapy uses to build the middleware; also subscribe to the
        # spider_opened signal for a startup log line.
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened, signal=signals.spider_opened)
        return middleware

    def process_request(self, request, spider):
        # Runs for every outgoing request. Allowed return values:
        #   None             -> continue with the next middleware's process_request
        #   Response object  -> skip the download; walk back up the
        #                       process_response chain and deliver it to the spider
        #   Request object   -> hand it back to the engine to be re-scheduled
        #   raise IgnoreRequest -> the process_exception chain runs instead
        return None

    def process_response(self, request, response, spider):
        # Runs for every response coming back. Allowed return values:
        #   Response object  -> keep going; the engine routes it to the spider
        #   Request object   -> send it back into the scheduler
        #   raise IgnoreRequest -> the process_exception chain runs instead
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or another middleware's
        # process_request raises. Must return None (keep processing the
        # exception), a Response, or a Request (either stops the chain).
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
# 在配置文件中配置
1.加代理
python
# 在下载中间件的def process_request(self, request, spider):写代码
# 第一步:
-在下载中间件写process_request方法
def get_proxy(self):
    """Fetch one proxy from the local proxy-pool service and return it as a
    fully qualified proxy URL (scheme included).

    Assumes a proxy pool is running on 127.0.0.1:5010 — TODO confirm.
    """
    import requests
    pool_entry = requests.get('http://127.0.0.1:5010/get/').json()
    # The pool marks HTTPS-capable proxies with a truthy 'https' field.
    scheme = 'https' if pool_entry.get('https') else 'http'
    return scheme + '://' + pool_entry.get('proxy')
def process_request(self, request, spider):
    # Route this request through a proxy taken from the local pool;
    # Scrapy's downloader honours request.meta['proxy'].
    proxy_url = self.get_proxy()
    request.meta['proxy'] = proxy_url
    # None -> let the request continue through the middleware chain.
    return None
# 第二步:代理可能不能用,会触发process_exception,在里面写
def process_exception(self, request, exception, spider):
    # A dead proxy typically lands here: log the URL that was never
    # fetched, then return the request so the engine schedules it again
    # (a fresh proxy will be attached on the retry).
    failed_url = request.url
    print('-----', failed_url)
    return request
2.加cookie,修改请求头,随机生成UserAgent
2.1加cookie
python
def process_request(self, request, spider):
    # Cookie-injection demo: show the cookies already attached to the
    # outgoing request, then add one of our own before it is sent.
    jar = request.cookies
    print(jar)
    jar['name'] = 'lqz'
    # None -> continue through the middleware chain.
    return None
2.2 修改请求头
python
def process_request(self, request, spider):
    # Header-manipulation demo: dump the current headers, then set a
    # Referer so the target site sees an expected origin page.
    hdrs = request.headers
    print(hdrs)
    hdrs['referer'] = 'http://www.lagou.com'
    # None -> continue through the middleware chain.
    return None
2.3 动态生成 User-Agent 的使用
需要先安装模块
python
pip install fake_useragent
python
def process_request(self, request, spider):
    # Random User-Agent demo: replace the UA header on every request so
    # successive requests do not all share one browser fingerprint.
    # Requires the third-party fake_useragent package (pip install).
    from fake_useragent import UserAgent
    request.headers['User-Agent'] = str(UserAgent().random)
    print(request.headers)
    # None -> continue through the middleware chain.
    return None