目录
在使用 scrapy-redis 这个库的时候,会出现以下的情况:
很烦,文档里也没说爬虫在任务队列空了之后会一直监听 Redis、不会自动退出啊,烦死了=。=。为此当时写了一个扩展 RedisSpiderIdleTimeoutExtension,专门用来在空闲超时后自动结束爬虫。注意:分布式部署的时候,爬虫一般需要常驻等待新任务,所以这个扩展是用不到的!!!
python
# extension.py
import time
from scrapy import signals
from scrapy.exceptions import NotConfigured
class RedisSpiderIdleTimeoutExtension:
    """Scrapy extension that closes a spider after a period of inactivity.

    "Activity" means a request being scheduled or an item being scraped.
    Each time the ``spider_idle`` signal fires, the time elapsed since the
    last activity is compared with ``timeout``; once it is exceeded the
    engine is asked to close the spider with reason ``'idle_timeout'``.

    Elapsed time is measured with ``time.monotonic()`` so that system clock
    adjustments (NTP sync, manual changes) can neither trigger a premature
    shutdown nor postpone it indefinitely.

    Settings read in ``from_crawler``:
        REDIS_IDLE_TIMEOUT_ENABLED: bool, default ``True``.
        REDIS_IDLE_TIMEOUT: int seconds, default ``300``.
    """

    def __init__(self, timeout, crawler):
        """Store the crawler and the idle threshold.

        Args:
            timeout: Idle threshold in seconds.
            crawler: The crawler whose engine is used to close the spider.
        """
        self.crawler = crawler
        self.timeout = timeout
        # Monotonic-clock reading of the most recent activity. Stays None
        # until spider_opened runs, so idle checks before that are no-ops.
        self.last_activity_time = None

    @classmethod
    def from_crawler(cls, crawler):
        """Build the extension from crawler settings and hook up its signals.

        Raises:
            NotConfigured: If ``REDIS_IDLE_TIMEOUT_ENABLED`` is false.
        """
        if not crawler.settings.getbool('REDIS_IDLE_TIMEOUT_ENABLED', True):
            raise NotConfigured

        # Defaults to 5 minutes when the setting is absent.
        timeout = crawler.settings.getint('REDIS_IDLE_TIMEOUT', 300)
        ext = cls(timeout, crawler)

        # Signals that start or refresh the activity clock, plus the idle
        # signal that checks it.
        crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(ext.request_scheduled, signal=signals.request_scheduled)
        crawler.signals.connect(ext.item_scraped, signal=signals.item_scraped)
        crawler.signals.connect(ext.spider_idle, signal=signals.spider_idle)
        return ext

    def spider_opened(self, spider):
        """Start the activity clock when the spider opens and log the timeout."""
        self._update_activity()
        spider.logger.info(f"启用空闲超时关闭,超时时间: {self.timeout}秒")

    def request_scheduled(self, request, spider):
        """Refresh the activity clock whenever a request is scheduled."""
        self._update_activity()

    def item_scraped(self, item, spider):
        """Refresh the activity clock whenever an item is scraped."""
        self._update_activity()

    def _update_activity(self):
        """Record "now" as the moment of the most recent activity."""
        # Monotonic rather than wall-clock time: only differences between
        # readings are used, and they must not be affected by clock changes.
        self.last_activity_time = time.monotonic()

    def spider_idle(self, spider):
        """Close the spider if it has been inactive longer than ``timeout``.

        Does nothing when the activity clock has not been started yet.
        """
        if self.last_activity_time is None:
            return

        idle_duration = time.monotonic() - self.last_activity_time
        if idle_duration > self.timeout:
            spider.logger.info(f"空闲超过 {self.timeout} 秒,触发关闭爬虫")
            self.crawler.engine.close_spider(spider, 'idle_timeout')
        else:
            spider.logger.debug(f"当前空闲时间: {idle_duration:.1f}秒 (超时阈值: {self.timeout}秒)")
当然,还需要在 settings.py 文件里面添加如下配置:
python
# settings.py
EXTENSIONS = {
    # A value of None disables the built-in telnet console extension.
    "scrapy.extensions.telnet.TelnetConsole": None,
    # Register the idle-timeout extension; the integer is its order value.
    "firstpc.middlewares.extensions.RedisSpiderIdleTimeoutExtension": 500,
}

# Switch for the idle-timeout extension.
REDIS_IDLE_TIMEOUT_ENABLED = True
# Idle threshold in seconds (5 minutes).
REDIS_IDLE_TIMEOUT = 5 * 60
