1. Implementing the earlier idea: dynamically creating queues via a class method
First, the worker part. Of course, an instance method could be more concise; feel free to test that yourselves. Mine is still under testing as well.
python
# -*- coding: utf-8 -*-
"""
Funboost worker - replaces Feapder's ParserControl
"""
import time
import uuid
from typing import Dict, Any, Optional
import requests
from funspider.network.request import Request
from funspider.network.response import Response
from funspider.network.item import Item, UpdateItem
from funspider.utils.fun_logger import logger
from funspider.core.trace_stats import TraceStatsManager
from funboost import fct

_spider_instance_cache = {}  # global cache holding spider instances

# Create a logger that picks up the task_id automatically; fun_logger is not
# used here because the task id has to be attached.
from funboost.core.task_id_logger import TaskIdLogger
from nb_log import LogManager
# TaskIdLogger extracts the task_id from the fct context automatically


def _find_cls_by_name(module_name: str, cls_name: str):
    """Dynamically look up a class by module and class name."""
    module = __import__(module_name, fromlist=[cls_name])
    return getattr(module, cls_name)
class FunboostWorker:
    """
    Funboost worker.

    Responsibilities:
    - consume tasks from Funboost queues
    - create spider instances dynamically
    - run the full request pipeline: request middleware, download, response middleware, parsing
    - send results to the ItemBuffer or to other queues
    """

    def __init__(self, item_buffer, engine):
        """
        Initialize the worker.

        Args:
            item_buffer: the item buffer
            engine: the engine instance
        """
        self.item_buffer = item_buffer
        self.engine = engine
        self.trace_stats = TraceStatsManager(
            spider_name=engine.spider_name,
            settings_manager=engine.settings_manager
        )
        self.stats = {
            'total_requests': 0,
            'success_requests': 0,
            'failed_requests': 0,
            'total_items': 0
        }
    def process_task(self, task_info: Dict[str, Any], extra: Optional[Dict[str, Any]] = None) -> Any:
        """
        Handle a Funboost task - the main entry point.

        Args:
            task_info: task info containing spider_meta, payload and target_method_name
            extra: extra parameters

        Returns:
            the processing result
        """
        # task_logger = fct.logger
        # trace_id = fct.full_msg.get('payload', {}).get('meta', {}).get('trace_id', None)
        # task_logger.extra['trace_id'] = trace_id
        self.stats['total_requests'] += 1
        logger.debug(f"Processing task with extra: {extra}")
        try:
            # Unpack the task info
            spider_meta = task_info.get('spider_meta', {})
            payload = task_info.get('payload', {})
            target_method_name = task_info.get('target_method_name', 'process_request')
            # Dynamically create or fetch the spider instance
            spider_instance = self._get_or_create_spider_instance(spider_meta)
            # Dispatch on target_method_name
            if target_method_name == 'process_request':
                # Handle a request task (the most common case)
                self._process_request_task(spider_instance, payload, spider_meta)
            else:
                # Handle other task types (if any)
                self._process_custom_task(spider_instance, target_method_name, payload)
            self.stats['success_requests'] += 1
            return self.get_stats()
        except Exception as e:
            logger.error(f"Task processing failed: {e}")
            self.stats['failed_requests'] += 1
            return self.get_stats()
    def _get_or_create_spider_instance(self, spider_meta: Dict[str, Any]):
        """Fetch a cached spider instance, or create one."""
        cache_key = f"{spider_meta['module_name']}.{spider_meta['cls_name']}"
        spider_instance = _spider_instance_cache.get(cache_key)
        if spider_instance is None:
            spider_cls = _find_cls_by_name(
                spider_meta['module_name'],
                spider_meta['cls_name']
            )
            obj_init_params = spider_meta.get('obj_init_params', {})
            spider_instance = spider_cls(**obj_init_params)
            spider_instance.processor_manager.discover_processors()  # initialize processors
            _spider_instance_cache[cache_key] = spider_instance
            logger.info(f"Spider instance created and cached: {cache_key}")
        return spider_instance
    def _process_request_task(self, spider_instance, payload: Dict[str, Any], spider_meta: Dict[str, Any]) -> Any:
        """
        Handle a request task - the full request pipeline.

        Args:
            spider_instance: the spider instance
            payload: request data (a serialized Request object)
            spider_meta: spider metadata

        Returns:
            the processing result
        """
        request = None
        start_time = time.time()
        try:
            # 1. Deserialize the request object
            request = Request.from_dict(payload)
            logger.debug(f"Processing request: {request.url}")
            # 2. Apply request processors (middleware)
            callback_name = request.callback or 'parse'
            # 📊 Record the request start
            if self.trace_stats:
                self.trace_stats.record_request(
                    trace_id=request.trace_id,
                    span_id=request.span_id,
                    parent_span_id=request.parent_span_id,
                    callback_name=callback_name,
                    url=request.url,
                    status="pending"
                )
            processed_request = spider_instance.process_request(request)
            if processed_request is None:
                logger.info(f"Request dropped by a processor: {callback_name}")
                return []
            # Replace request with the processed version
            request = processed_request
            # 3. Download
            try:
                response = spider_instance.download(request)
            except NotImplementedError:
                response = self._default_download(request)
            # 4. Apply response processors (middleware)
            processed_response = spider_instance.process_response(response, callback_name)
            if processed_response is None:
                logger.info(f"Response dropped by a processor: {callback_name}")
                return []
            # Replace response with the processed version
            response = processed_response
            # 5. Call the parse method
            if isinstance(callback_name, str):
                parse_method = getattr(spider_instance, callback_name, spider_instance.parse)
            else:
                parse_method = callback_name
            results = parse_method(request, response)
            if results is None:
                results = []
            # 6. Handle the parse results (parent_request is passed along to propagate the task_id)
            self._handle_parse_results(results, spider_instance, spider_meta, parent_request=request)
            duration_ms = (time.time() - start_time) * 1000
            # ✅ Record success
            if self.trace_stats:
                self.trace_stats.update_status(
                    trace_id=request.trace_id,
                    span_id=request.span_id,
                    callback_name=callback_name,
                    old_status="pending",
                    new_status="success",
                    duration_ms=duration_ms
                )
            return results
        except Exception as e:
            url_for_log = request.url if request else "unknown (deserialization failed)"
            import traceback
            traceback.print_exc()
            logger.error(f"Request handling failed: {url_for_log}: {e}", exc_info=True)
            self.stats['failed_requests'] += 1
            # Record the failure
            duration_ms = (time.time() - start_time) * 1000
            if self.trace_stats and request:
                self.trace_stats.update_status(
                    trace_id=request.trace_id,
                    span_id=request.span_id,
                    callback_name=request.callback or 'parse',
                    old_status="pending",
                    new_status="failed",
                    duration_ms=duration_ms,
                    error_msg=str(e)
                )
            raise
    def _process_custom_task(self, spider_instance, method_name: str, payload: Dict[str, Any]):
        """
        Handle a custom task.

        Args:
            spider_instance: the spider instance
            method_name: name of the method to invoke
            payload: the payload data

        Returns:
            the processing result
        """
        target_method = getattr(spider_instance, method_name, None)
        if target_method is None:
            logger.error(f"Target method '{method_name}' not found")
            self.stats['failed_requests'] += 1
            return []
        # Invoke the target method
        results = target_method(payload)
        return results if results is not None else []
    def _handle_parse_results(self, results, spider_instance, spider_meta: Dict[str, Any], parent_request: Optional[Request] = None):
        """
        Handle parse results.

        Supports OpenTelemetry-style trace/span tracking.

        Args:
            results: list of parse results
            spider_instance: the spider instance
            spider_meta: spider metadata
            parent_request: the parent request (source of trace_id and span_id)
        """
        items_yielded = 0
        requests_yielded = 0
        # Read the parent request's trace_id and span_id
        parent_trace_id = None
        parent_span_id = None
        if parent_request:
            # The trace_id comes from the parent (all child requests inherit it)
            parent_trace_id = parent_request.trace_id
            # The parent's span_id becomes the child's parent_span_id
            parent_span_id = parent_request.span_id
        for result in results:
            if isinstance(result, Request):
                # Set trace_id and parent_span_id on the child request (unless customized).
                # The trace_id never changes; it is carried along from the initial request.
                if parent_trace_id and not result.trace_id:
                    result.meta['trace_id'] = parent_trace_id
                    result.trace_id = parent_trace_id  # update the attribute
                # parent_span_id is set to the parent's span_id (unless customized)
                if parent_span_id and not result.parent_span_id:
                    result.meta['parent_span_id'] = parent_span_id
                    result.parent_span_id = parent_span_id  # update the attribute
                # Generate a new span_id only if the Request does not carry a custom one
                if not result.span_id:
                    new_span_id = uuid.uuid4().hex[:12]
                    result.span_id = new_span_id
                    result.meta['span_id'] = new_span_id
                # Build the task_id (format: parent_span_id:span_id)
                task_id = f"{result.parent_span_id or ''}:{result.span_id or ''}"
                # Log the trace/span info
                logger.debug(
                    f"[trace={result.trace_id}][span={result.span_id}][parent={result.parent_span_id}] "
                    f"publishing child request: {result.url}"
                )
                # Publish the new request to its queue
                queue_name = f"spider_{spider_instance.name}_{result.callback_name or 'parse'}"
                task_info = {
                    'spider_meta': spider_meta,
                    'payload': result.to_dict(),
                    'target_method_name': 'process_request'
                }
                self.engine.publish_request(task_info, queue_name, task_id=task_id)
                logger.debug(f"New task id {task_id}: published request to queue '{queue_name}': {result.url}")
                requests_yielded += 1
            elif isinstance(result, (Item, UpdateItem)):
                # Attach trace_id and span_id to the item
                # if parent_trace_id or parent_span_id:
                #     result.meta = result.meta or {}
                #     if parent_trace_id:
                #         result.meta['trace_id'] = parent_trace_id
                #     if parent_span_id:
                #         result.meta['span_id'] = parent_span_id  # link the item to the parent's span_id
                # Add the item to the buffer
                self.item_buffer.put_item(result)
                self.stats['total_items'] += 1
                # logger.debug(f"Item added to buffer: {result.table_name}")
                items_yielded += 1
            elif callable(result):
                # Add a callback to the buffer
                self.item_buffer.put_callback(result)
                logger.debug("Callback added to buffer")
            else:
                logger.warning(f"Unknown result type: {type(result)}")
        # Metrics: parse results
        if items_yielded > 0:
            spider_instance.stats_manager.emit("parse",
                                               tags={"result_type": "item"},
                                               fields={"count": items_yielded})
        if requests_yielded > 0:
            spider_instance.stats_manager.emit("parse",
                                               tags={"result_type": "request"},
                                               fields={"count": requests_yielded})
    def _default_download(self, request: Request) -> Response:
        """
        Default download implementation.

        Args:
            request: the request object

        Returns:
            the response object
        """
        try:
            # Apply the configured HTTP parameters
            http_params = request.get_http_params()
            url = http_params.pop('url')
            method = http_params.pop('method', 'GET')
            verify = http_params.pop('verify', False)
            timeout = http_params.pop('timeout', 10)
            response = requests.request(method=method, url=url, verify=verify, timeout=timeout, **http_params)
            response.raise_for_status()
            # print(response.text)
            return Response.from_requests_response(response)
        except Exception as e:
            logger.error(f"Download failed {request.url}: {e}")
            raise

    def get_stats(self) -> Dict[str, Any]:
        """Return a copy of the stats."""
        return self.stats.copy()

    def __str__(self) -> str:
        return "<FunboostWorker for multiple spiders>"
Of course, since this is a static class method, the class information passed in needs somewhere to be re-instantiated: _get_or_create_spider_instance. A layer of caching is added there so the spider is not instantiated on every task.
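For reference, this is roughly the shape of the message the worker consumes, mirroring what process_task and _get_or_create_spider_instance read. The module path and class name below are placeholders, not names from this project:

python
# Hypothetical task_info as consumed by FunboostWorker.process_task.
# 'myproject.spiders.hangye' / 'HangyeUpdateSpider' are illustrative placeholders.
task_info = {
    'spider_meta': {
        'module_name': 'myproject.spiders.hangye',  # importable module path
        'cls_name': 'HangyeUpdateSpider',           # class name inside that module
        'obj_init_params': {},                      # kwargs forwarded to the constructor
    },
    'payload': {},                                  # a serialized Request, i.e. Request.to_dict()
    'target_method_name': 'process_request',
}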
Next, the call site:
python
@staticmethod
def get_consumer_function(item_buffer, engine):
    """
    Build the consuming function for a queue - now a static method.

    Args:
        item_buffer: the ItemBuffer instance
        engine: the Engine instance

    Returns:
        the consuming function
    """
    def consumer_function(task_info: dict, extra: dict = None):
        """Funboost consuming function."""
        fct_context_obj = fct.function_result_status.get_status_dict()
        queue_name = fct_context_obj.get('queue_name', 'N/A')
        try:
            from .worker import FunboostWorker
            worker = FunboostWorker(item_buffer=item_buffer, engine=engine)
            # return worker.process_task(task_info, extra)
            result = worker.process_task(task_info, extra)
            # Metrics hook: the task succeeded
            if engine.stats_manager:
                engine.stats_manager.emit("task",
                                          tags={"status": "success"},
                                          fields={"count": 1})
            return result
        except Exception as e:
            if engine.stats_manager:
                engine.stats_manager.emit("task",
                                          tags={"status": "failed"},
                                          fields={"count": 1, "exception": type(e).__name__})
            logger.error(f"Consumer function error: {e}")
            raise
    return consumer_function
Then, when creating the booster, this function is what gets passed in:
python
def _create_booster(self, queue_name: str, callback_name: str = None):
    try:
        # Skip if the booster already exists in BoostersManager
        if self._get_booster(queue_name):
            logger.debug(f"Booster for queue '{queue_name}' already exists")
            return
        if not callback_name:
            logger.error("No callback name specified for booster")
            return
        base_params = {
            'queue_name': queue_name,
            'broker_kind': self.broker_kind,
            'is_auto_start_consuming_message': True,
            'consuming_function': FunboostEngine.get_consumer_function(self._item_buffer, self),
        }
        # get_consumer_function here is the class-level method shown above
        funboost_config = self.funboost_params.copy()
        funboost_config.update(base_params)
        # Decorator configuration
        decorator_funboost_params = {}
        decorator_priority_params = {}
        if callback_name in self.callback_configs:
            # Extract funboost_params and priority_params from the decorator config
            decorator_config = self.callback_configs.get(callback_name, {})
            decorator_funboost_params = decorator_config.get('funboost_params', {})
            decorator_priority_params = decorator_config.get('priority_params', {})
            logger.info(
                f"Applying decorator config to queue [{queue_name}]:\n"
                f"  - funboost_params: {decorator_funboost_params}\n"
                f"  - priority_params: {decorator_priority_params}"
            )
            # Only funboost_params are used to build the Booster
            funboost_config.update(decorator_funboost_params)
        if self.enable_pause_control:
            funboost_config['is_send_consumer_hearbeat_to_redis'] = True
        # 3. Validate and handle the priority configuration
        broker_kind = funboost_config.get('broker_kind', self.broker_kind)
        broker_exclusive_config = funboost_config.get('broker_exclusive_config', {})
        is_priority_queue = PriorityQueueConfig.validate_and_warn(
            queue_name=queue_name,
            broker_kind=broker_kind,
            broker_exclusive_config=broker_exclusive_config
        )
        # 4. If the config is invalid, strip the priority-related settings
        if not is_priority_queue and 'x-max-priority' in broker_exclusive_config:
            logger.warning(f"Removing invalid priority config from queue '{queue_name}'")
            funboost_config.pop('broker_exclusive_config', None)
        # 5. Save the queue config (consulted at publish time)
        self._queue_configs[queue_name] = {
            'is_priority_queue': is_priority_queue,
            'broker_kind': broker_kind,
            'max_priority': broker_exclusive_config.get('x-max-priority', 0),
            'decorator_priority_params': decorator_priority_params,  # keep the decorator's priority_params
        }
        # The final configuration for this queue
        params = BoosterParams(**funboost_config)
        booster = BoostersManager.build_booster(params)
        # No need for self._boosters[queue_name] = booster; BoostersManager.build_booster registers it already
        logger.info(f"Created booster for queue: {queue_name} (callback: {callback_name})")
    except Exception as e:
        logger.error(f"Failed to create booster for queue '{queue_name}': {e}")
2. The most important part: task publishing
python
import time
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Generator, List, Tuple, Type

from funboost import BoostersManager, BrokerEnum, PublisherParams
from funspider.network.request import Request
from funspider.utils.fun_logger import logger
# BaseSpider is this project's spider base class (import path omitted here)


class FunboostBatchJob:
    """
    A robust class with a single responsibility: reliably execute a given
    batch of tasks and handle retries for technical-level failures.
    """
    def __init__(self,
                 spider_class: Type[BaseSpider],
                 queue_name: str,
                 broker_kind: BrokerEnum = BrokerEnum.REDIS_ACK_ABLE,
                 broker_exclusive_config: dict = None):
        """
        Initialize a reusable batch-job executor.

        Args:
            spider_class: the spider class
            queue_name: the queue name
            broker_kind: the message broker type
            broker_exclusive_config: broker-specific queue config

        Notes:
            - The publisher is created once here; run(max_retries=N) can then
              be called repeatedly with different retry counts.
            - BoostersManager may cache internally, but stating the reuse intent
              explicitly at the business layer is clearer than an implicit optimization.
        """
        if not hasattr(spider_class, '__module__') or not hasattr(spider_class, '__name__'):
            raise TypeError("spider_class must be a valid class, not an instance")
        self.spider_module_name = spider_class.__module__
        self.spider_class_name = spider_class.__name__
        self.queue_name = queue_name
        # Fetch the Publisher once at init time to avoid repeated connection overhead
        self.publisher = BoostersManager.get_cross_project_publisher(
            PublisherParams(
                queue_name=self.queue_name,
                broker_kind=broker_kind,
                is_using_rpc_mode=True,
                broker_exclusive_config=broker_exclusive_config or {}
            )
        )
        logger.info(f"Job manager ready: [{self.spider_class_name}] -> queue: [{queue_name}]")
    def run(self, task_generator: Generator[Request, None, None], max_retries: int = 2) -> bool:
        """
        Execute a batch of tasks (supports a dynamic retry policy).

        Args:
            task_generator: the task generator
            max_retries: maximum retries for this run (tunable per call)
                - initial full crawl: 3
                - supplementary crawl: 5-6

        Returns:
            bool: whether every task succeeded
        """
        tasks_to_run = list(task_generator)
        if not tasks_to_run:
            logger.info("Task list is empty; nothing to do.")
            return True
        total_tasks = len(tasks_to_run)
        logger.info(f"Starting batch, task count: {total_tasks}, max retries: {max_retries}")
        for attempt in range(max_retries + 1):
            if not tasks_to_run:
                logger.info("All tasks completed successfully!")
                return True
            logger.info(f"--- [Round {attempt + 1}/{max_retries + 1}] ---")
            successful_tasks, failed_tasks = self._publish_and_wait(tasks_to_run)
            logger.info(f"Round result: {len(successful_tasks)} succeeded, {len(failed_tasks)} failed.")
            if not failed_tasks:
                logger.info(f"Batch finished! {total_tasks} tasks in total.")
                return True
            tasks_to_run = failed_tasks
            if attempt < max_retries:
                wait_time = 5 * (attempt + 1)  # linear backoff: wait 5s after round 1, 10s after round 2, 15s after round 3
                logger.warning(f"{len(failed_tasks)} tasks failed; retrying in {wait_time}s...")
                time.sleep(wait_time)
        logger.error(f"Tasks still failing at the end: {len(tasks_to_run)}")
        return False
    def _publish_and_wait(self, requests: List[Request], wait_for_flush: int = 5) -> Tuple[List[Request], List[Request]]:
        """Publish tasks concurrently and wait for their results."""
        successful_tasks = []
        failed_tasks = []
        lock = threading.Lock()
        logger.info(f"Pushing {len(requests)} tasks to queue [{self.queue_name}] and waiting...")

        def process_one_request(req):
            try:
                task_payload = {
                    # spider_meta lets the worker recreate the spider instance on the consumer side
                    'spider_meta': {
                        'module_name': self.spider_module_name,
                        'cls_name': self.spider_class_name,
                    },
                    'payload': req.to_dict(),
                    'target_method_name': 'process_request'
                }
                time.sleep(0.2)  # crude publish-side throttling
                async_result = self.publisher.publish({'task_info': task_payload})
                async_result.set_timeout(200)
                # Wait for the result
                try:
                    if async_result.is_success():
                        logger.debug(f"Task succeeded: {req.url}")
                        with lock:
                            successful_tasks.append(req)
                    else:
                        logger.warning(f"✗ Task failed: {req.url}, reason: {async_result.result}")
                        with lock:
                            failed_tasks.append(req)
                except Exception as e:
                    logger.warning(f"✗ Error while waiting for result: {req.url}, {e}")
                    with lock:
                        failed_tasks.append(req)
            except Exception as e:
                logger.error(f"✗ Error while publishing request: {req.url}, {e}")
                import traceback
                traceback.print_exc()
                with lock:
                    failed_tasks.append(req)

        # Process every request concurrently
        with ThreadPoolExecutor(max_workers=10) as executor:
            futures = [executor.submit(process_one_request, req) for req in requests]
            for future in as_completed(futures):
                try:
                    future.result()
                except Exception as e:
                    logger.error(f"Thread error: {e}")
        # Let buffered items flush to the database
        if successful_tasks:
            logger.info(f"⏰ Waiting {wait_for_flush}s for the ItemBuffer to flush to the database...")
            time.sleep(wait_for_flush)
        logger.info(f"Batch stats - succeeded: {len(successful_tasks)}, failed: {len(failed_tasks)}")
        return successful_tasks, failed_tasks
This is the task-publishing demo: by checking each task's is_success, we collect the outcome of every task published to this queue. For now the single-layer setup works quite well: single-layer requests go straight into the database, the consumer controls the QPS, and this side publishes on a schedule.
python
self.job_manager = FunboostBatchJob(
    spider_class=HangyeUpdateSpider,
    queue_name=f"spider_{HangyeUpdateSpider.name}_parse"
)
initial_success = self.job_manager.run(
    self.generate_requests_from_stocks(STOCK_LIST),  # essentially a copy of start_requests
    max_retries=3  # initial crawl: 3 retries
)
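Since publishing runs on a schedule, a minimal driving loop might look like the sketch below. The plain while loop and the daily interval are stand-ins for whatever scheduler is actually used, and generate_requests_from_stocks / STOCK_LIST are the same names as in the demo above:

python
# A minimal scheduling sketch (assumption: a bare loop instead of a real scheduler).
import time

job_manager = FunboostBatchJob(
    spider_class=HangyeUpdateSpider,
    queue_name=f"spider_{HangyeUpdateSpider.name}_parse",
)
while True:
    # Initial pass with 3 retries; a supplementary pass could re-run the
    # leftovers with a higher max_retries, e.g. 5.
    job_manager.run(generate_requests_from_stocks(STOCK_LIST), max_retries=3)
    time.sleep(24 * 3600)  # assumed daily cadence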
3. A new idea for improvement
其实之前写的redis 统计,虽然是看起来有用,但是后来有评论给了新的思路,就是发布的时候 直接新定义新的队列。即同一份代码,发布的时候 预定义新的队列比如 批次号1201009_task1 ,1201010_task1 ,然后只要确定能自己自动消费即可。用funboost 自带的监控是否结束即可。
目前正在想办法实现中~
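While that is still in progress, here is a minimal sketch of the naming side of the idea, with an assumed timestamp-derived batch number and a rough drain check against a Redis-backed queue (for the plain Redis brokers, pending messages sit in a list keyed by the queue name; the exact key layout depends on the broker kind, and funboost's own monitoring remains the authoritative completion check):

python
# Sketch only: batch-numbered queue names plus a naive completion probe.
import datetime
import redis

# Hypothetical batch number derived from the current date/time
batch_no = datetime.datetime.now().strftime('%m%d%H%M')  # e.g. '12010930'
queue_name = f"{batch_no}_task1"

# Naive drain check: no pending messages left in the queue's Redis list.
# Real completion detection must also account for in-flight/unacked messages.
r = redis.Redis()
is_drained = r.llen(queue_name) == 0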
For more articles, follow the WeChat official account (gzh): 零基础爬虫第一天
