Gartner将多智能体系统列为2026年十大战略技术趋势之一。本文深入解析多智能体系统的架构设计、通信协议、协作策略与生产部署,附完整代码实现。
1. 为什么需要多智能体系统
单一AI Agent的能力存在明显上限:面对"帮我完成一个IPO项目"这样的复杂任务,没有哪个单一Agent能同时具备财务分析、法律审查、市场调研和风险评估的全部能力。
多智能体系统(Multi-Agent System) 的核心思想是:让多个专业Agent协同工作,通过分工协作解决单一Agent无法处理的复杂问题。
单Agent瓶颈:
LLM ──x──> 财务分析 + 法律审查 + 市场调研 + 风险评估
(知识过载, 专业深度不足)
多Agent协作:
  Orchestrator ──┬──> Agent(财务专家) ──> 财务分析报告 ──┐
                 ├──> Agent(法律顾问) ──> 合规审查意见 ──┤
                 ├──> Agent(市场研究) ──> 市场调研报告 ──┤
                 └──> Agent(风险评估) ──> 风险评级报告 ──┘
                                                          ↓
                                        合并输出: IPO项目综合评估
1.1 多智能体系统的核心价值
| 能力 | 单Agent | 多智能体系统 |
|---|---|---|
| 专业深度 | 受限于单一模型能力 | 各Agent专注单一领域 |
| 可扩展性 | 受限于上下文窗口 | Agent可动态增减 |
| 容错性 | 单点故障 | 某Agent失败不影响整体 |
| 实时性 | 顺序处理 | 并行处理不同子任务 |
| 成本控制 | 单一复杂任务成本高 | 按需调用专业Agent |
2. Agent通信协议:Agent-to-Agent (A2A)
2.1 A2A协议设计
2026年,Anthropic、MCP社区和各大厂商正在推动A2A(Agent-to-Agent)协议的标准化。以下代码并非官方A2A规范的实现,而是借鉴MCP思路、用于说明核心概念(消息结构、能力卡片、注册中心与消息路由)的简化示例设计:
python
import asyncio
import json
import uuid
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
from typing import Any, Callable, Dict, List, Optional, Set, Tuple
class MessageType(Enum):
    """Kinds of messages agents exchange.

    The string values are what ``AgentMessage.to_dict`` writes into the
    ``msg_type`` field and what ``AgentMessage.from_dict`` parses back.
    """

    TASK_REQUEST = "task_request"  # default type of a new AgentMessage
    TASK_RESPONSE = "task_response"
    HEARTBEAT = "heartbeat"
    ERROR = "error"
    CAPABILITY_DISCOVERY = "capability_discovery"
    NEGOTIATION = "negotiation"
    FEEDBACK = "feedback"
class AgentCapability(Enum):
    """Capabilities an agent can advertise on its ``AgentCard``.

    ``AgentCard.matches`` compares these string values against a
    lower-cased query, so the values double as discovery keywords.
    """

    WEB_SEARCH = "web_search"
    CODE_EXECUTION = "code_execution"
    DATA_ANALYSIS = "data_analysis"
    FILE_OPERATION = "file_operation"
    DATABASE_QUERY = "database_query"
    API_CALL = "api_call"
    IMAGE_GENERATION = "image_generation"
    TEXT_GENERATION = "text_generation"
    REASONING = "reasoning"
    MATHEMATICS = "mathematics"
@dataclass
class AgentMessage:
    """Envelope for one message exchanged between agents.

    Every field has a default, so a message can be built with only the
    fields that matter to the caller. ``to_dict`` / ``from_dict`` convert
    to and from a plain dictionary in which ``msg_type`` is stored as the
    enum's string value.
    """

    # Unique id of this message; a random UUID4 string unless supplied.
    msg_id: str = field(default_factory=lambda: str(uuid.uuid4()))
    sender_id: str = ""
    receiver_id: str = ""
    msg_type: MessageType = MessageType.TASK_REQUEST
    content: Dict[str, Any] = field(default_factory=dict)
    metadata: Dict[str, Any] = field(default_factory=dict)
    # Creation time: ISO-8601 text of a naive UTC datetime.
    timestamp: str = field(default_factory=lambda: datetime.utcnow().isoformat())
    # msg_id of the message being answered, or None if this is not a reply.
    reply_to: Optional[str] = None

    def to_dict(self) -> dict:
        """Return the message as a plain dictionary.

        ``msg_type`` is converted to its string value; ``content`` and
        ``metadata`` are returned by reference, not copied.
        """
        return {
            "msg_id": self.msg_id,
            "sender_id": self.sender_id,
            "receiver_id": self.receiver_id,
            "msg_type": self.msg_type.value,
            "content": self.content,
            "metadata": self.metadata,
            "timestamp": self.timestamp,
            "reply_to": self.reply_to,
        }

    @classmethod
    def from_dict(cls, data: dict) -> 'AgentMessage':
        """Rebuild a message from a dictionary shaped like ``to_dict`` output.

        ``msg_id``, ``sender_id``, ``receiver_id`` and ``msg_type`` are
        required. ``content`` and ``metadata`` default to empty dicts (also
        when explicitly ``None``), and a missing or empty ``timestamp``
        falls back to the field default instead of becoming ``""``.

        Raises:
            KeyError: a required key is missing.
            ValueError: ``msg_type`` is not a valid ``MessageType`` value.
        """
        fields: Dict[str, Any] = {
            "msg_id": data["msg_id"],
            "sender_id": data["sender_id"],
            "receiver_id": data["receiver_id"],
            "msg_type": MessageType(data["msg_type"]),
            # `or {}` also covers an explicit null in the payload.
            "content": data.get("content") or {},
            "metadata": data.get("metadata") or {},
            "reply_to": data.get("reply_to"),
        }
        timestamp = data.get("timestamp")
        if timestamp:
            # Only override the generated default when a real value exists.
            fields["timestamp"] = timestamp
        return cls(**fields)
@dataclass
class AgentCard:
    """Descriptor an agent publishes so the registry can discover it."""

    agent_id: str
    name: str
    description: str
    capabilities: List[AgentCapability]
    endpoint: str
    auth_type: str = "bearer"
    rate_limit: int = 100  # maximum requests per minute
    supported_message_types: List[MessageType] = field(default_factory=list)
    input_schema: Dict[str, Any] = field(default_factory=dict)
    output_schema: Dict[str, Any] = field(default_factory=dict)

    def matches(self, query: str) -> bool:
        """Return True when this card is relevant to ``query``.

        The comparison is case-insensitive on the query and description.
        A card matches if the query is a substring of the description, or
        if any capability value occurs inside the query.
        """
        needle = query.lower()
        if needle in self.description.lower():
            return True
        for capability in self.capabilities:
            if capability.value in needle:
                return True
        return False
class AgentRegistry:
    """In-process directory of agents.

    Keeps, per agent id: the ``AgentCard``, the live instance, a health
    string ("healthy" / "unhealthy") and usage counters
    (``total_requests`` / ``success`` / ``failure``).
    """

    def __init__(self):
        self.agents: Dict[str, AgentCard] = {}
        self.agent_instances: Dict[str, Any] = {}
        self.health_status: Dict[str, str] = {}
        self.usage_stats: Dict[str, Dict] = {}
        # Guards the multi-dict updates in register()/unregister().
        self._lock = asyncio.Lock()

    @staticmethod
    def _new_stats() -> Dict[str, int]:
        """Return a zeroed usage-counter record."""
        return {"total_requests": 0, "success": 0, "failure": 0}

    @staticmethod
    def _error_reply(request: AgentMessage, reason: str) -> AgentMessage:
        """Build an ERROR message correlated with ``request``."""
        return AgentMessage(
            receiver_id=request.sender_id,
            msg_type=MessageType.ERROR,
            content={"error": reason},
            reply_to=request.msg_id,
        )

    async def register(self, agent_card: AgentCard, instance: Any):
        """Register (or replace) an agent; it starts healthy with zeroed stats."""
        agent_id = agent_card.agent_id
        async with self._lock:
            self.agents[agent_id] = agent_card
            self.agent_instances[agent_id] = instance
            self.health_status[agent_id] = "healthy"
            self.usage_stats[agent_id] = self._new_stats()

    async def unregister(self, agent_id: str) -> bool:
        """Remove an agent and all its bookkeeping.

        Returns:
            True if the agent was registered, False otherwise.
        """
        async with self._lock:
            was_known = agent_id in self.agents
            self.agents.pop(agent_id, None)
            self.agent_instances.pop(agent_id, None)
            self.health_status.pop(agent_id, None)
            self.usage_stats.pop(agent_id, None)
            return was_known

    async def discover(self, query: str, top_k: int = 5) -> List[AgentCard]:
        """Return up to ``top_k`` healthy agents whose card matches ``query``.

        Candidates are ordered by description length, longest first. This
        is only a rough stand-in for relevance.
        """
        candidates = [
            card
            for agent_id, card in self.agents.items()
            if self.health_status.get(agent_id) == "healthy" and card.matches(query)
        ]
        candidates.sort(key=lambda c: len(c.description), reverse=True)
        # A negative top_k would slice from the end; treat it as "none".
        return candidates[:max(top_k, 0)]

    async def get_agent(self, agent_id: str) -> Optional[AgentCard]:
        """Return the card registered under ``agent_id``, or None."""
        return self.agents.get(agent_id)

    async def heartbeat(self, agent_id: str):
        """Mark a known agent healthy; unknown ids are ignored."""
        if agent_id in self.health_status:
            self.health_status[agent_id] = "healthy"

    async def mark_unhealthy(self, agent_id: str):
        """Mark a known agent unhealthy; unknown ids are ignored.

        Ignoring unknown ids mirrors ``heartbeat`` and keeps
        ``health_status`` free of entries for agents that were never
        registered.
        """
        if agent_id in self.health_status:
            self.health_status[agent_id] = "unhealthy"

    async def route_message(self, message: AgentMessage) -> AgentMessage:
        """Deliver ``message`` to its receiver and return the reply.

        Updates the receiver's usage counters, which callers use to pick
        the least-loaded agent. Returns an ERROR message when the receiver
        is unknown, has no live instance, or produces no reply. Exceptions
        raised by the agent are counted as failures and re-raised.
        """
        receiver_id = message.receiver_id
        if receiver_id not in self.agents:
            return self._error_reply(message, f"Agent {receiver_id} not found")

        instance = self.agent_instances.get(receiver_id)
        if not instance:
            return self._error_reply(
                message, f"Agent instance {receiver_id} unavailable"
            )

        stats = self.usage_stats.setdefault(receiver_id, self._new_stats())
        stats["total_requests"] += 1
        try:
            response = await instance.handle_message(message)
        except Exception:
            stats["failure"] += 1
            raise

        if response is None:
            # The agent did not know how to answer this message type.
            stats["failure"] += 1
            return self._error_reply(
                message, f"Agent {receiver_id} returned no response"
            )

        outcome = "failure" if response.msg_type == MessageType.ERROR else "success"
        stats[outcome] += 1
        return response
2.2 异步消息队列
python
import asyncio
from collections import defaultdict
from typing import Callable, Awaitable
import heapq
class AgentMessageBus:
    """Asynchronous publish/subscribe bus with request/response support.

    Published messages are recorded in a priority heap and pushed to the
    callbacks subscribed under the receiver id. A callback's return value,
    if it is an ``AgentMessage``, is treated as the reply to the delivered
    message. A published message whose ``reply_to`` names a pending request
    also completes that request. Messages whose callback raises are moved
    to the dead-letter queue (``dlq``).
    """

    def __init__(self):
        self.subscribers: Dict[str, List[Callable]] = defaultdict(list)
        # Heap of ((-priority, sequence), message). Negating the priority
        # makes the min-heap serve higher priorities first. The sequence
        # number keeps FIFO order within a priority and guarantees the sort
        # key is unique, so two messages are never compared directly.
        self.message_queue: List[Tuple[Tuple[int, int], AgentMessage]] = []
        self.dlq: List[AgentMessage] = []  # dead-letter queue
        self.processing: Set[str] = set()  # ids of messages being processed
        # Pending request/response futures, keyed by request msg_id.
        self.acknowledgements: Dict[str, asyncio.Future] = {}
        self._lock = asyncio.Lock()
        self._sequence = 0
        # Strong references so delivery tasks are not garbage-collected
        # while still running.
        self._delivery_tasks: Set[asyncio.Task] = set()

    def _resolve(self, request_id: Optional[str], response: Any) -> None:
        """Complete the pending request ``request_id`` with ``response``."""
        if request_id is None or not isinstance(response, AgentMessage):
            return
        future = self.acknowledgements.get(request_id)
        if future is not None and not future.done():
            future.set_result(response)

    async def _deliver(self, callback: Callable, message: AgentMessage) -> None:
        """Run one async subscriber; dead-letter the message if it raises."""
        try:
            response = await callback(message)
        except Exception:
            self.dlq.append(message)
            return
        self._resolve(message.msg_id, response)

    async def publish(self, message: AgentMessage, priority: int = 0):
        """Publish ``message`` to the bus.

        Args:
            message: the message to enqueue and deliver.
            priority: 0 = normal, 1 = high, 2 = urgent. Higher values are
                returned first by ``pop_next``.
        """
        async with self._lock:
            self._sequence += 1
            heapq.heappush(
                self.message_queue, ((-priority, self._sequence), message)
            )
            # Snapshot so callbacks run outside the lock. A callback that
            # publishes in turn would otherwise deadlock on the
            # non-reentrant lock.
            callbacks = list(self.subscribers.get(message.receiver_id, []))

        # A message that answers a pending request completes it.
        self._resolve(message.reply_to, message)

        for callback in callbacks:
            if asyncio.iscoroutinefunction(callback):
                task = asyncio.create_task(self._deliver(callback, message))
                self._delivery_tasks.add(task)
                task.add_done_callback(self._delivery_tasks.discard)
            else:
                try:
                    response = callback(message)
                except Exception:
                    self.dlq.append(message)
                else:
                    self._resolve(message.msg_id, response)

    async def pop_next(self) -> Optional[AgentMessage]:
        """Remove and return the highest-priority queued message, or None."""
        async with self._lock:
            if not self.message_queue:
                return None
            _, message = heapq.heappop(self.message_queue)
            return message

    async def subscribe(self, agent_id: str, callback: Callable):
        """Register ``callback`` for messages addressed to ``agent_id``."""
        self.subscribers[agent_id].append(callback)

    async def request_response(self, message: AgentMessage,
                               timeout: float = 30.0) -> AgentMessage:
        """Publish ``message`` and wait for its reply (RPC style).

        Returns:
            The reply, or an ERROR message with ``"Request timeout"`` when
            nothing answers within ``timeout`` seconds.
        """
        future = asyncio.get_running_loop().create_future()
        self.acknowledgements[message.msg_id] = future
        try:
            await self.publish(message)
            return await asyncio.wait_for(future, timeout=timeout)
        except asyncio.TimeoutError:
            return AgentMessage(
                receiver_id=message.sender_id,
                msg_type=MessageType.ERROR,
                content={"error": "Request timeout"},
                reply_to=message.msg_id,
            )
        finally:
            # Always drop the bookkeeping entry, also when publish() raised.
            self.acknowledgements.pop(message.msg_id, None)

    async def retry_failed(self, max_retries: int = 3):
        """Re-publish dead-lettered messages with exponential backoff.

        Each message carries its attempt count in ``metadata["retries"]``.
        Messages that already reached ``max_retries`` are dropped with a
        printed notice.
        """
        # Work on a snapshot: messages that fail again during this pass land
        # in the fresh dlq and are handled by the next call.
        pending = list(self.dlq)
        self.dlq.clear()
        for message in pending:
            retries = message.metadata.get("retries", 0)
            if retries < max_retries:
                message.metadata["retries"] = retries + 1
                await asyncio.sleep(2 ** retries)  # exponential backoff
                await self.publish(message)
            else:
                print(f"Message {message.msg_id} permanently failed after {max_retries} retries")
class AgentCommunicationProtocol:
    """Standard interaction patterns layered on the message bus.

    Provides one-shot task dispatch (``send_task``) and a bounded
    propose / counter-propose exchange (``negotiate``).
    """

    def __init__(self, message_bus: AgentMessageBus, registry: AgentRegistry):
        self.bus = message_bus
        self.registry = registry

    async def send_task(self, sender_id: str, receiver_id: str,
                        task: Dict[str, Any],
                        require_response: bool = True) -> Optional[AgentMessage]:
        """Send a TASK_REQUEST carrying ``task`` under ``content["task"]``.

        Returns:
            The reply when ``require_response`` is true, otherwise None
            (the message is only published).
        """
        message = AgentMessage(
            sender_id=sender_id,
            receiver_id=receiver_id,
            msg_type=MessageType.TASK_REQUEST,
            content={"task": task},
        )
        if not require_response:
            await self.bus.publish(message)
            return None
        return await self.bus.request_response(message)

    async def negotiate(self, agent_a: str, agent_b: str,
                        context: Dict, max_rounds: int = 5) -> Dict:
        """Run a bounded negotiation between two agents.

        Phase 1: ``agent_a`` sends ``context`` as a proposal to ``agent_b``.
        Phase 2: while the latest reply is not accepted, its ``counter`` is
        forwarded to the other party, alternating direction each round, for
        at most ``max_rounds`` rounds.

        Returns:
            ``content["result"]`` of the accepting reply (``{}`` if absent),
            or ``{"status": "negotiation_failed"}``. An ``"error"`` key is
            added when an ERROR reply, such as a timeout, ended the exchange.
        """
        sender, receiver = agent_a, agent_b
        response = await self.bus.request_response(AgentMessage(
            sender_id=sender,
            receiver_id=receiver,
            msg_type=MessageType.NEGOTIATION,
            content={"phase": 1, "proposal": context},
        ))

        rounds_left = max_rounds
        while True:
            if response.msg_type == MessageType.ERROR:
                # A timeout or failure will not improve by re-sending.
                return {
                    "status": "negotiation_failed",
                    "error": response.content.get("error"),
                }
            if response.content.get("accepted"):
                return response.content.get("result", {})

            counter = response.content.get("counter")
            if counter is None or rounds_left <= 0:
                break
            rounds_left -= 1

            # The party that just answered authored the counter, so it
            # becomes the sender and the other party must evaluate it.
            sender, receiver = receiver, sender
            response = await self.bus.request_response(AgentMessage(
                sender_id=sender,
                receiver_id=receiver,
                msg_type=MessageType.NEGOTIATION,
                content={"phase": 2, "counter": counter},
            ))

        return {"status": "negotiation_failed"}
3. 多智能体协作框架
3.1 层级式编排器(Hierarchical Orchestrator)
python
from typing import List, Dict, Optional
import anthropic
class HierarchicalOrchestrator:
    """Coordinates a complex request across several agents.

    Pipeline: strategic layer (LLM plans a task graph) -> tactical layer
    (assign an agent per subtask) -> execution layer (run subtasks in
    dependency order, in parallel where possible) -> summary layer (LLM
    merges the results).
    """

    def __init__(self, registry: AgentRegistry, message_bus: AgentMessageBus,
                 model: str = "claude-sonnet-4-20250514"):
        self.registry = registry
        self.bus = message_bus
        self.protocol = AgentCommunicationProtocol(message_bus, registry)
        self.client = anthropic.Anthropic()
        # Model used for planning and summarizing.
        self.model = model

    async def _ask_llm(self, prompt: str, max_tokens: int) -> str:
        """Send one user prompt to the LLM and return the reply text.

        The SDK client is synchronous, so the call runs in the default
        executor to keep the event loop responsive.
        """
        loop = asyncio.get_running_loop()
        response = await loop.run_in_executor(
            None,
            lambda: self.client.messages.create(
                model=self.model,
                max_tokens=max_tokens,
                messages=[{"role": "user", "content": prompt}],
            ),
        )
        return response.content[0].text

    async def execute_task(self, user_request: str) -> dict:
        """Plan, allocate, execute and summarize ``user_request``.

        Returns:
            A dict with ``final_output``, ``task_graph`` and ``results``.
        """
        # Strategic layer: task planning.
        task_graph = await self._strategic_planning(user_request)
        # Tactical layer: agent allocation.
        agent_assignments = await self._tactical_allocation(task_graph)
        # Execution layer: dependency-ordered parallel execution.
        execution_results = await self._parallel_execution(
            agent_assignments, task_graph
        )
        # Summary layer: merge the results.
        return await self._summarize_results(
            user_request, task_graph, execution_results
        )

    @staticmethod
    def _parse_plan(text: str, request: str) -> Dict:
        """Extract the task-graph JSON from LLM output.

        Falls back to a single "reasoning" task covering the whole request
        when no JSON object with a ``task_graph`` list can be parsed.
        """
        fallback = {"task_graph": [{"id": "t1", "description": request,
                                    "capability_needed": "reasoning", "depends_on": []}]}
        # Models often wrap JSON in prose or code fences; take the outermost
        # object only.
        start, end = text.find("{"), text.rfind("}")
        if start == -1 or end <= start:
            return fallback
        try:
            plan = json.loads(text[start:end + 1])
        except json.JSONDecodeError:
            return fallback
        if not isinstance(plan, dict) or not isinstance(plan.get("task_graph"), list):
            return fallback
        return plan

    async def _strategic_planning(self, request: str) -> Dict:
        """Strategic layer: ask the LLM to split the request into subtasks."""
        # The JSON example must itself be valid JSON (no comments), otherwise
        # the model tends to echo the comment and the reply fails to parse.
        prompt = f"""将以下复杂任务拆解为可执行的子任务图。
要求:
1. 识别任务中的依赖关系(哪些子任务必须先完成)
2. 标记可并行执行的子任务
3. 指定每个子任务所需的专业能力
任务:{request}
以JSON格式返回:
{{
"task_graph": [
{{"id": "t1", "description": "描述", "capability_needed": "能力", "depends_on": []}},
{{"id": "t2", "description": "描述", "capability_needed": "能力", "depends_on": ["t1"]}}
],
"parallel_groups": [["t1"], ["t2", "t3"], ["t4"]],
"execution_order": ["t1", "t2", "t3", "t4"]
}}
其中 parallel_groups 的每个子数组是一组可并行执行的子任务。只返回JSON,不要添加注释或其他文字。"""
        text = await self._ask_llm(prompt, max_tokens=2048)
        return self._parse_plan(text, request)

    async def _tactical_allocation(self, task_graph: Dict) -> Dict[str, str]:
        """Tactical layer: pick an agent id for every subtask.

        Among the matching healthy agents, the one with the fewest recorded
        requests wins. Subtasks without a match get a placeholder id
        ``temp_<task_id>``, which is later executed by the LLM fallback.
        """
        assignments = {}
        for task in task_graph.get("task_graph", []):
            task_id = task["id"]
            capability = str(task.get("capability_needed", "reasoning"))
            candidates = await self.registry.discover(capability, top_k=3)
            if candidates:
                best_agent = min(
                    candidates,
                    key=lambda a: self.registry.usage_stats
                    .get(a.agent_id, {}).get("total_requests", 0),
                )
                assignments[task_id] = best_agent.agent_id
            else:
                assignments[task_id] = f"temp_{task_id}"
        return assignments

    async def _parallel_execution(self, assignments: Dict,
                                  task_graph: Optional[Dict] = None) -> Dict:
        """Execution layer: run subtasks batch by batch.

        A subtask is ready once every entry of its ``depends_on`` has
        finished. Dependencies on ids that were never assigned are ignored.
        Without ``task_graph`` all subtasks run in a single batch. Subtasks
        left behind by a dependency cycle are reported as errors.
        """
        tasks_by_id = {
            task["id"]: task
            for task in (task_graph or {}).get("task_graph", [])
            if isinstance(task, dict) and "id" in task
        }
        results: Dict[str, Any] = {}
        completed = set()
        pending = dict(assignments)

        def is_ready(task_id: str) -> bool:
            deps = tasks_by_id.get(task_id, {}).get("depends_on") or []
            return all(dep in completed or dep not in assignments for dep in deps)

        while pending:
            batch = [(tid, aid) for tid, aid in pending.items() if is_ready(tid)]
            if not batch:
                for tid in pending:
                    results[tid] = {"status": "error",
                                    "error": "Unresolvable task dependencies"}
                break

            # Snapshot so tasks of this batch see a stable view of earlier
            # results instead of a dict mutated underneath them.
            context = dict(results)
            outcomes = await asyncio.gather(
                *(self._execute_single_task(tid, aid, context, tasks_by_id.get(tid))
                  for tid, aid in batch),
                return_exceptions=True,
            )
            for (tid, _), outcome in zip(batch, outcomes):
                if isinstance(outcome, BaseException):
                    results[tid] = {"status": "error", "error": str(outcome)}
                else:
                    results[tid] = outcome
                completed.add(tid)
                del pending[tid]
        return results

    async def _execute_single_task(self, task_id: str, agent_id: str,
                                   all_results: Dict,
                                   task: Optional[Dict] = None) -> dict:
        """Run one subtask on its assigned agent and return the reply content.

        ``task`` is the subtask's entry from the task graph. It is sent
        under ``content["task"]``, the key task-handling agents read.
        """
        agent_card = await self.registry.get_agent(agent_id)
        if not agent_card:
            # Placeholder agent: handle the subtask with the LLM directly.
            return await self._execute_with_llm(task_id)

        message = AgentMessage(
            sender_id="orchestrator",
            receiver_id=agent_id,
            msg_type=MessageType.TASK_REQUEST,
            content={
                "task_id": task_id,
                "task": task if task is not None else {"id": task_id},
                "context": all_results,
            },
        )
        response = await self.bus.request_response(message)
        return response.content

    async def _execute_with_llm(self, task_id: str) -> dict:
        """Fallback used when no agent matches; currently a fixed stub."""
        return {"status": "processed_by_llm", "output": "processed"}

    async def _summarize_results(self, original_request: str,
                                 task_graph: Dict,
                                 results: Dict) -> dict:
        """Summary layer: have the LLM merge all subtask results."""
        # default=str keeps non-JSON values from aborting the summary.
        results_json = json.dumps(results, ensure_ascii=False, indent=2, default=str)
        summary_prompt = f"""原始任务:{original_request}
各子任务执行结果:
{results_json}
请综合以上结果,给出最终答案。"""
        final_text = await self._ask_llm(summary_prompt, max_tokens=4096)
        return {
            "final_output": final_text,
            "task_graph": task_graph,
            "results": results,
        }
3.2 多智能体辩论框架
python
class MultiAgentDebate:
    """Structured debate between several LLM personas, closed by a judge.

    Each entry of ``agents`` is a dict with at least ``role`` and ``name``,
    and optionally ``model``. Roles are used as dictionary keys, so they
    should be unique.
    """

    def __init__(self, agents: List[dict], judge_llm: str = "claude-sonnet-4-20250514"):
        self.agents = agents  # [{"role": "pro", "name": "支持者", "model": "..."}]
        self.judge_llm = judge_llm
        self.client = anthropic.Anthropic()

    async def _complete(self, model: str, prompt: str, max_tokens: int) -> str:
        """Send one user prompt to ``model`` and return the reply text.

        The SDK client is synchronous, so the call runs in the default
        executor to keep the event loop responsive.
        """
        loop = asyncio.get_running_loop()
        response = await loop.run_in_executor(
            None,
            lambda: self.client.messages.create(
                model=model,
                max_tokens=max_tokens,
                messages=[{"role": "user", "content": prompt}],
            ),
        )
        return response.content[0].text

    async def debate(self, topic: str, num_rounds: int = 3) -> dict:
        """Run the debate and return its transcript and verdict.

        Round 1: every agent gives an opening statement.
        Rounds 2..num_rounds: every agent rebuts the other agents' most
        recent arguments from the previous round.
        Final: the judge summarizes.

        Returns:
            Dict with ``debate_history``, ``judgment``,
            ``opening_statements`` and ``final_arguments`` (the last
            rebuttal per role; empty when ``num_rounds`` < 2).
        """
        debate_history = []

        # Round 1: opening statements are independent, so request them
        # concurrently.
        statements = await asyncio.gather(
            *(self._agent_statement(agent, topic, round=1) for agent in self.agents)
        )
        opening_statements = {}
        for agent, statement in zip(self.agents, statements):
            opening_statements[agent["role"]] = statement
            debate_history.append({
                "round": 1,
                "speaker": agent["role"],
                "type": "opening",
                "content": statement,
            })

        # Latest argument per role; rebuttals replace openings as rounds pass.
        latest_arguments = dict(opening_statements)
        cross_arguments = {}
        for round_num in range(2, num_rounds + 1):
            # Everyone answers the previous round, so the rebuttals of one
            # round do not depend on each other.
            previous = dict(latest_arguments)
            rebuttals = await asyncio.gather(*(
                self._agent_rebuttal(
                    agent,
                    topic,
                    [previous.get(other["role"], "")
                     for other in self.agents if other["role"] != agent["role"]],
                    round_num,
                )
                for agent in self.agents
            ))
            for agent, rebuttal in zip(self.agents, rebuttals):
                cross_arguments[agent["role"]] = rebuttal
                latest_arguments[agent["role"]] = rebuttal
                debate_history.append({
                    "round": round_num,
                    "speaker": agent["role"],
                    "type": "rebuttal",
                    "content": rebuttal,
                })

        judgment = await self._judge_summarize(topic, debate_history)
        return {
            "debate_history": debate_history,
            "judgment": judgment,
            "opening_statements": opening_statements,
            "final_arguments": cross_arguments,
        }

    async def _agent_statement(self, agent: dict, topic: str, round: int) -> str:
        """Ask ``agent`` for its opening statement on ``topic``."""
        prompt = f"""你是{agent['name']}({agent['role']}视角)。
请就以下议题发表开场陈述,阐述你的核心观点。
议题:{topic}
辩论轮次:第{round}轮
要求:
- 清晰表达你的立场
- 提供3个支持你观点的论据
- 预见对方可能的反驳
"""
        return await self._complete(
            agent.get("model", "claude-sonnet-4-20250514"), prompt, max_tokens=1024
        )

    async def _agent_rebuttal(self, agent: dict, topic: str,
                              opponent_args: List[str], round: int) -> str:
        """Ask ``agent`` to rebut ``opponent_args`` in the given round."""
        opponent_text = "\n".join(f"- {arg}" for arg in opponent_args)
        prompt = f"""你是{agent['name']}({agent['role']}视角)。
这是第{round}轮辩论,你需要反驳对方的观点。
议题:{topic}
对方观点:
{opponent_text}
要求:
- 指出对方论点的薄弱之处
- 用证据和逻辑反驳
- 坚守并深化自己的立场
"""
        return await self._complete(
            agent.get("model", "claude-sonnet-4-20250514"), prompt, max_tokens=1024
        )

    async def _judge_summarize(self, topic: str,
                               debate_history: List[dict]) -> dict:
        """Have the judge model assess the debate.

        Only the first 200 characters of each contribution are shown to the
        judge. ``winner`` is a fixed placeholder; the actual verdict is in
        the free-text ``summary``.
        """
        history_text = "\n".join([
            f"[Round {h['round']}] {h['speaker']} ({h['type']}): {h['content'][:200]}"
            for h in debate_history
        ])
        prompt = f"""你是这场辩论的裁判。请基于以下辩论记录,评判哪一方更有说服力,并给出理由。
议题:{topic}
辩论记录:
{history_text}
请给出:
1. 胜出一方及理由
2. 各方优缺点分析
3. 综合建议
"""
        summary = await self._complete(self.judge_llm, prompt, max_tokens=2048)
        return {"summary": summary, "winner": "determined"}
4. Agent生命周期管理
4.1 Agent基类与生命周期
python
import asyncio
from abc import ABC, abstractmethod
from typing import Any, Optional
from datetime import datetime, timedelta
class AgentLifecycle(Enum):
    """Lifecycle states an agent instance can be in."""

    CREATED = "created"  # state assigned by BaseAgent.__init__
    INITIALIZING = "initializing"
    READY = "ready"
    RUNNING = "running"
    SUSPENDED = "suspended"
    TERMINATED = "terminated"
    ERROR = "error"
class BaseAgent(ABC):
    """Common interface and lifecycle bookkeeping for every agent.

    Subclasses implement ``initialize`` and ``process``. ``handle_message``
    turns incoming ``AgentMessage`` objects into calls to ``process`` and
    wraps the outcome in a reply message.
    """

    def __init__(self, agent_id: str, name: str):
        self.agent_id = agent_id
        self.name = name
        self.state = AgentLifecycle.CREATED
        self.created_at = datetime.utcnow()
        self.last_active = datetime.utcnow()
        self.task_count = 0
        self.error_count = 0
        # Capability declaration (overridden by subclasses).
        self.capabilities: List[AgentCapability] = []
        self.description: str = ""

    @abstractmethod
    async def initialize(self, config: Dict[str, Any]):
        """Prepare the agent for work (load models, read configuration, ...)."""
        pass

    @abstractmethod
    async def process(self, task: Dict[str, Any]) -> Dict[str, Any]:
        """Execute one task and return its result (core business logic)."""
        pass

    def _reply(self, request: AgentMessage, msg_type: MessageType,
               content: Dict[str, Any]) -> AgentMessage:
        """Build a reply addressed to the sender of ``request``."""
        return AgentMessage(
            sender_id=self.agent_id,
            receiver_id=request.sender_id,
            msg_type=msg_type,
            content=content,
            reply_to=request.msg_id,
        )

    async def handle_message(self, message: AgentMessage) -> AgentMessage:
        """Handle one incoming message and always return a reply.

        * HEARTBEAT: reports the current state and task count without
          changing the state.
        * TASK_REQUEST: runs ``process(content["task"])``. Success yields a
          TASK_RESPONSE and state READY. Any exception yields an ERROR reply
          and state ERROR.
        * Any other type, or a task sent to a suspended or terminated agent:
          an ERROR reply, with the state left untouched.
        """
        self.last_active = datetime.utcnow()

        if message.msg_type == MessageType.HEARTBEAT:
            return self._reply(
                message,
                MessageType.HEARTBEAT,
                {"status": self.state.value, "task_count": self.task_count},
            )

        if message.msg_type != MessageType.TASK_REQUEST:
            return self._reply(
                message,
                MessageType.ERROR,
                {"error": f"Unsupported message type: {message.msg_type.value}"},
            )

        if self.state in (AgentLifecycle.SUSPENDED, AgentLifecycle.TERMINATED):
            return self._reply(
                message,
                MessageType.ERROR,
                {"error": f"Agent {self.agent_id} is {self.state.value}"},
            )

        self.state = AgentLifecycle.RUNNING
        try:
            result = await self.process(message.content["task"])
        except Exception as e:
            self.error_count += 1
            self.state = AgentLifecycle.ERROR
            return self._reply(message, MessageType.ERROR, {"error": str(e)})

        self.task_count += 1
        self.state = AgentLifecycle.READY
        return self._reply(
            message,
            MessageType.TASK_RESPONSE,
            {"result": result, "task_id": message.content.get("task_id")},
        )

    async def health_check(self) -> bool:
        """Return False when in ERROR state or idle for more than 5 minutes."""
        if self.state == AgentLifecycle.ERROR:
            return False
        if datetime.utcnow() - self.last_active > timedelta(minutes=5):
            return False
        return True

    async def suspend(self):
        """Put the agent into the SUSPENDED state."""
        self.state = AgentLifecycle.SUSPENDED

    async def resume(self):
        """Put the agent back into the READY state."""
        self.state = AgentLifecycle.READY

    async def terminate(self):
        """Put the agent into the TERMINATED state."""
        self.state = AgentLifecycle.TERMINATED
4.2 具体Agent实现
python
class WebSearchAgent(BaseAgent):
    """Agent that answers web-search tasks (search itself is a stub)."""

    def __init__(self, agent_id: str):
        super().__init__(agent_id, "WebSearchAgent")
        self.capabilities = [AgentCapability.WEB_SEARCH]
        self.description = "网络搜索和网页内容提取"
        # Name of the search backend; set by initialize().
        self.search_engine = None

    async def initialize(self, config: Dict):
        """Read the search backend from ``config`` and become READY."""
        self.search_engine = config.get("search_engine", "duckduckgo")
        self.state = AgentLifecycle.READY

    async def process(self, task: Dict) -> Dict:
        """Run the search described by ``task``.

        Reads ``query`` (default ``""``) and ``max_results`` (default 10).
        Returns the query, the result list and its length.
        """
        query = task.get("query", "")
        limit = task.get("max_results", 10)
        hits = await self._search(query, limit)
        return {"query": query, "results": hits, "count": len(hits)}

    async def _search(self, query: str, max_results: int) -> List[Dict]:
        """Placeholder search: returns one canned hit, ignores ``max_results``."""
        canned_hit = {
            "title": f"Result for {query}",
            "url": "https://example.com",
            "snippet": "",
        }
        return [canned_hit]
class DataAnalysisAgent(BaseAgent):
    """Agent that delegates data analysis to an LLM.

    Supported ``analysis_type`` values: ``summary``, ``correlation`` and
    ``trend``.
    """

    # Prompt template per analysis type; ``{data}`` is replaced by the task
    # data.
    _PROMPTS = {
        "summary": """对以下数据进行统计分析,生成摘要报告:
{data}
请输出:
1. 基本统计量(均值、中位数、标准差等)
2. 关键发现
3. 异常值检测""",
        "correlation": """对以下数据进行相关性分析:
{data}
请输出:
1. 主要变量之间的相关关系及强度
2. 关键发现
3. 可能的混杂因素与注意事项""",
        "trend": """对以下数据进行趋势分析:
{data}
请输出:
1. 整体趋势及变化方向
2. 关键转折点
3. 周期性或异常波动""",
    }

    def __init__(self, agent_id: str):
        super().__init__(agent_id, "DataAnalysisAgent")
        self.capabilities = [AgentCapability.DATA_ANALYSIS]
        self.description = "数据处理、统计分析和可视化"
        self.client = anthropic.Anthropic()

    async def initialize(self, config: Dict):
        """No configuration needed; just become READY."""
        self.state = AgentLifecycle.READY

    async def process(self, task: Dict) -> Dict:
        """Analyze ``task["data"]`` according to ``task["analysis_type"]``.

        Returns:
            ``{"analysis": <text>, "type": <analysis_type>}``, or
            ``{"status": "unknown_analysis_type"}`` for unsupported types.
        """
        data = task.get("data")
        analysis_type = task.get("analysis_type", "summary")
        handlers = {
            "summary": self._summary_analysis,
            "correlation": self._correlation_analysis,
            "trend": self._trend_analysis,
        }
        handler = handlers.get(analysis_type)
        if handler is None:
            return {"status": "unknown_analysis_type"}
        return await handler(data)

    async def _run_analysis(self, analysis_type: str, data: Any) -> Dict:
        """Fill the prompt for ``analysis_type`` and query the LLM.

        The SDK client is synchronous, so the call runs in the default
        executor to keep the event loop responsive.
        """
        prompt = self._PROMPTS[analysis_type].format(data=data)
        loop = asyncio.get_running_loop()
        response = await loop.run_in_executor(
            None,
            lambda: self.client.messages.create(
                model="claude-sonnet-4-20250514",
                max_tokens=2048,
                messages=[{"role": "user", "content": prompt}],
            ),
        )
        return {"analysis": response.content[0].text, "type": analysis_type}

    async def _summary_analysis(self, data: Any) -> Dict:
        """Produce a statistical summary report of ``data``."""
        return await self._run_analysis("summary", data)

    async def _correlation_analysis(self, data: Any) -> Dict:
        """Produce a correlation analysis of ``data``."""
        return await self._run_analysis("correlation", data)

    async def _trend_analysis(self, data: Any) -> Dict:
        """Produce a trend analysis of ``data``."""
        return await self._run_analysis("trend", data)
class CodeExecutionAgent(BaseAgent):
    """Agent that executes Python snippets and captures their stdout.

    SECURITY WARNING: the code runs with ``exec`` inside this process and
    with full builtins. This is NOT a sandbox. Executed code can import
    modules, touch the filesystem and block the event loop, and
    ``execution_timeout`` is not enforced. Only feed it trusted code, or
    replace ``_safe_execute`` with a real isolation mechanism (separate
    process or container with resource limits).
    """

    def __init__(self, agent_id: str):
        super().__init__(agent_id, "CodeExecutionAgent")
        self.capabilities = [AgentCapability.CODE_EXECUTION]
        self.description = "代码编写、调试和执行"
        self.execution_timeout = 30  # seconds; stored but not enforced

    async def initialize(self, config: Dict):
        """Read ``timeout`` from ``config`` (default 30) and become READY."""
        self.execution_timeout = config.get("timeout", 30)
        self.state = AgentLifecycle.READY

    async def process(self, task: Dict) -> Dict:
        """Execute ``task["code"]``.

        ``language`` defaults to ``"python"``. ``input`` defaults to ``{}``
        and is exposed to the code as the global ``input_data``.
        """
        code = task.get("code")
        language = task.get("language", "python")
        input_data = task.get("input", {})
        return await self._safe_execute(code, language, input_data)

    async def _safe_execute(self, code: str, language: str,
                            input_data: Dict) -> Dict:
        """Run ``code`` and report status plus captured stdout.

        Returns:
            ``{"status": "success", "output", "language"}`` on success;
            ``{"status": "error", "error", "output", "language"}`` when the
            code is missing, raises, or calls ``exit()``;
            ``{"status": "unsupported_language"}`` for non-Python code.
        """
        if language != "python":
            return {"status": "unsupported_language"}
        if code is None:
            return {
                "status": "error",
                "error": "No code provided",
                "output": "",
                "language": language,
            }

        import contextlib
        import io

        output = io.StringIO()
        # SECURITY: full builtins are exposed on purpose to keep existing
        # snippets working; see the class docstring.
        namespace = {"__builtins__": __builtins__, "input_data": input_data}
        try:
            with contextlib.redirect_stdout(output):
                exec(code, namespace)
        except SystemExit as e:
            # exit()/sys.exit() in user code must not take the service down.
            return {
                "status": "error",
                "error": f"SystemExit: {e}",
                "output": output.getvalue(),
                "language": language,
            }
        except Exception as e:
            return {
                "status": "error",
                "error": str(e),
                "output": output.getvalue(),
                "language": language,
            }
        return {
            "status": "success",
            "output": output.getvalue(),
            "language": language,
        }
5. 生产级部署架构
5.1 多智能体服务架构
python
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import uvicorn
# ASGI application object; the route decorators in this module attach to it.
app = FastAPI(title="Multi-Agent System API", version="1.0.0")


class TaskRequest(BaseModel):
    """Request body of the task-submission endpoint."""

    # Free-text description of what the agent system should do.
    request: str
    # NOTE(review): not read by any handler visible in this file.
    priority: int = 0
    # Seconds the endpoint waits for the result before answering 504.
    timeout: float = 60.0
    # NOTE(review): not read by any handler visible in this file.
    require_justification: bool = False
class AgentSystem:
    """Top-level controller wiring registry, message bus and orchestrator."""

    def __init__(self):
        self.registry = AgentRegistry()
        self.message_bus = AgentMessageBus()
        self.orchestrator = HierarchicalOrchestrator(self.registry, self.message_bus)
        self._initialized = False
        # Serializes initialize() so concurrent first requests set up once.
        self._init_lock = asyncio.Lock()

    async def initialize(self):
        """Create, initialize, register and subscribe the built-in agents.

        Idempotent: calls after the first successful one do nothing.
        """
        async with self._init_lock:
            if self._initialized:
                return

            builtin_agents = [
                (
                    AgentCard(
                        agent_id="web_search_1",
                        name="WebSearchAgent",
                        description="网络搜索和信息检索",
                        capabilities=[AgentCapability.WEB_SEARCH],
                        endpoint="internal",
                        supported_message_types=[MessageType.TASK_REQUEST],
                    ),
                    WebSearchAgent("web_search_1"),
                ),
                (
                    AgentCard(
                        agent_id="data_analyst_1",
                        name="DataAnalysisAgent",
                        description="数据分析和统计建模",
                        capabilities=[AgentCapability.DATA_ANALYSIS],
                        endpoint="internal",
                    ),
                    DataAnalysisAgent("data_analyst_1"),
                ),
                (
                    AgentCard(
                        agent_id="code_executor_1",
                        name="CodeExecutionAgent",
                        description="代码执行和程序验证",
                        capabilities=[AgentCapability.CODE_EXECUTION],
                        endpoint="internal",
                    ),
                    CodeExecutionAgent("code_executor_1"),
                ),
            ]

            for card, instance in builtin_agents:
                # Agents must be initialized before use (it moves them to
                # READY and sets their configuration defaults).
                await instance.initialize({})
                await self.registry.register(card, instance)
                # Messages published for this agent id reach its handler.
                await self.message_bus.subscribe(
                    card.agent_id, instance.handle_message
                )

            self._initialized = True

    async def process(self, request: str, timeout: float = 60.0) -> dict:
        """Run ``request`` through the orchestrator, initializing on first use.

        ``timeout`` is accepted for interface compatibility but not enforced
        here; callers wrap this coroutine in their own deadline.
        """
        if not self._initialized:
            await self.initialize()
        return await self.orchestrator.execute_task(request)
# Global agent-system instance, created at import time and shared by the
# request handlers of this module.
agent_system = AgentSystem()
@app.post("/agent/task")
async def submit_task(req: TaskRequest):
    """Submit a task to the multi-agent system and return its result.

    Raises:
        HTTPException 400: ``timeout`` is not positive.
        HTTPException 504: the task did not finish within ``timeout`` seconds.
    """
    if req.timeout <= 0:
        # A non-positive deadline would otherwise surface as a misleading 504.
        raise HTTPException(status_code=400, detail="timeout must be positive")
    try:
        return await asyncio.wait_for(
            agent_system.process(req.request, timeout=req.timeout),
            timeout=req.timeout,
        )
    except asyncio.TimeoutError:
        raise HTTPException(status_code=504, detail="Task timeout")
@app.get("/agent/capabilities")
async def list_capabilities():
    """Return id, name, capability values and health of every registered agent."""
    registry = agent_system.registry
    return {
        "agents": [
            {
                "agent_id": agent_id,
                "name": card.name,
                "capabilities": [cap.value for cap in card.capabilities],
                "health": registry.health_status.get(agent_id, "unknown"),
            }
            for agent_id, card in registry.agents.items()
        ]
    }
@app.get("/health")
async def health_check():
    """Report overall health: "healthy" unless some agent is not healthy."""
    registry = agent_system.registry
    unhealthy = []
    for agent_id, status in registry.health_status.items():
        if status != "healthy":
            unhealthy.append(agent_id)
    overall = "degraded" if unhealthy else "healthy"
    return {
        "status": overall,
        "total_agents": len(registry.agents),
        "unhealthy_agents": unhealthy,
    }
# Start the service when this file is run as a script.
if __name__ == "__main__":
    # Binds to all interfaces (0.0.0.0) on port 8000.
    uvicorn.run(app, host="0.0.0.0", port=8000)
5.2 负载均衡与容错
python
class AgentLoadBalancer:
    """Picks an agent per request from load, latency, health and breakers.

    The agent with the lowest score wins, where
    score = (open connections + 1) x mean recent latency x health factor.
    Agents whose circuit breaker is open are skipped.
    """

    # Number of most recent response times kept and averaged per agent.
    _LATENCY_WINDOW = 10
    # Responses slower than this many seconds count as breaker failures.
    _SLOW_RESPONSE_SECONDS = 5.0

    def __init__(self, registry: AgentRegistry):
        self.registry = registry
        self.connection_counts: Dict[str, int] = defaultdict(int)
        self.response_times: Dict[str, List[float]] = defaultdict(list)
        self.circuit_breakers: Dict[str, CircuitBreaker] = {}

    async def select_agent(self, capability: AgentCapability) -> Optional[str]:
        """Return the id of the best agent for ``capability``, or None.

        The chosen agent's connection count is incremented. Callers must
        report completion through ``record_response``.
        """
        candidates = await self.registry.discover(capability.value, top_k=10)
        available = [
            c for c in candidates
            if c.agent_id not in self.circuit_breakers
            or not self.circuit_breakers[c.agent_id].is_open
        ]
        if not available:
            return None

        # min() keeps the first of equally scored agents.
        best = min(available, key=lambda c: self._compute_score(c.agent_id))
        self.connection_counts[best.agent_id] += 1
        return best.agent_id

    def _compute_score(self, agent_id: str) -> float:
        """Combined score (lower is better): load x latency x health factor."""
        # +1 so an idle agent does not zero the product and hide latency
        # differences between idle agents.
        load_score = self.connection_counts.get(agent_id, 0) + 1

        # Plain mean over the most recent response times.
        recent_times = self.response_times.get(agent_id, [])[-self._LATENCY_WINDOW:]
        latency_score = sum(recent_times) / len(recent_times) if recent_times else 1.0

        health = self.registry.health_status.get(agent_id, "unknown")
        health_factor = 1.0 if health == "healthy" else 100.0
        return load_score * latency_score * health_factor

    def record_response(self, agent_id: str, duration: float,
                        success: bool = True):
        """Record a finished request and release its connection slot.

        Args:
            agent_id: agent that handled the request.
            duration: response time in seconds.
            success: pass False when the request failed. Slow responses are
                counted as failures regardless.
        """
        self.connection_counts[agent_id] = max(
            0, self.connection_counts.get(agent_id, 1) - 1
        )

        times = self.response_times[agent_id]
        times.append(duration)
        # Only the latest window is ever read; drop the rest to bound memory.
        del times[:-self._LATENCY_WINDOW]

        # Breakers are created on first use.
        breaker = self.circuit_breakers.get(agent_id)
        if breaker is None:
            breaker = self.circuit_breakers[agent_id] = CircuitBreaker()

        # Exactly one outcome per response. Recording a success first would
        # reset the failure count and the breaker could never open.
        if success and duration <= self._SLOW_RESPONSE_SECONDS:
            breaker.record_success()
        else:
            breaker.record_failure()
class CircuitBreaker:
    """Circuit breaker that stops calls to an agent after repeated failures.

    States: "closed" (calls allowed), "open" (calls blocked) and
    "half_open" (calls allowed again after the timeout elapsed).
    """

    def __init__(self, failure_threshold: int = 5, timeout: float = 60.0):
        # Failures needed to open the breaker.
        self.failure_threshold = failure_threshold
        # Seconds an open breaker waits before moving to half_open.
        self.timeout = timeout
        self.failure_count = 0
        self.last_failure_time = None
        self.state = "closed"  # closed, open, half_open

    def record_failure(self):
        """Count a failure; open the breaker once the threshold is reached."""
        self.failure_count += 1
        self.last_failure_time = datetime.utcnow()
        if self.failure_count < self.failure_threshold:
            return
        self.state = "open"

    def record_success(self):
        """Reset the failure count and close the breaker."""
        self.failure_count = 0
        self.state = "closed"

    @property
    def is_open(self) -> bool:
        """True while calls must be blocked.

        Reading this property moves an open breaker to "half_open" once more
        than ``timeout`` seconds have passed since the last failure.
        """
        if self.state != "open":
            return False
        elapsed = datetime.utcnow() - self.last_failure_time
        if elapsed.total_seconds() <= self.timeout:
            return True
        self.state = "half_open"
        return False
6. 协作策略深度解析
6.1 任务分配算法
python
class TaskAllocator:
    """Greedy, capability-aware assignment of tasks to agents.

    Tasks are handled in descending ``priority``. Each goes to the
    least-loaded agent that offers all required capabilities.
    """

    def __init__(self, registry: AgentRegistry):
        self.registry = registry

    async def allocate(self, tasks: List[Dict],
                       agents: List[str]) -> Dict[str, Optional[str]]:
        """Assign every task to an agent.

        Each task dict needs ``id`` and may carry ``priority`` (default 0),
        ``capabilities`` (list of ``AgentCapability`` values, default none)
        and ``complexity`` (load added to the chosen agent, default 1).

        Returns:
            ``{task_id: agent_id}``. The value is None when no agent
            qualifies or a required capability name is unknown.
        """
        assignments: Dict[str, Optional[str]] = {}
        agent_loads = {agent_id: 0 for agent_id in agents}

        # Look every card up once instead of once per task.
        agent_caps = {}
        for agent_id in agent_loads:
            card = await self.registry.get_agent(agent_id)
            if card:
                agent_caps[agent_id] = set(card.capabilities)

        for task in sorted(tasks, key=lambda t: t.get("priority", 0), reverse=True):
            task_id = task["id"]
            try:
                required = {AgentCapability(c) for c in task.get("capabilities", [])}
            except ValueError:
                # Unknown capability name: no agent can satisfy it, but the
                # remaining tasks must still be allocated.
                assignments[task_id] = None
                continue

            eligible = [a for a, caps in agent_caps.items() if required <= caps]
            if not eligible:
                assignments[task_id] = None  # cannot be assigned
                continue

            # min() keeps the first agent among equally loaded ones.
            best_agent = min(eligible, key=agent_loads.__getitem__)
            assignments[task_id] = best_agent
            agent_loads[best_agent] += task.get("complexity", 1)
        return assignments

    async def rebalance(self, current_assignments: Dict,
                        threshold: float = 0.3) -> Dict[str, str]:
        """Check load imbalance across agents.

        Load is the number of tasks per agent in ``current_assignments``.
        Agents with no tasks are not counted.

        NOTE: task migration is not implemented; the assignments are
        returned unchanged.
        """
        agent_loads = defaultdict(list)
        for task_id, agent_id in current_assignments.items():
            if agent_id:
                agent_loads[agent_id].append(task_id)

        if not agent_loads:
            # Nothing assigned: max()/min() below would raise on empty input.
            return current_assignments

        loads = {a: len(task_ids) for a, task_ids in agent_loads.items()}
        max_load = max(loads.values())
        min_load = min(loads.values())

        # Relative spread between the busiest and the idlest agent.
        if (max_load - min_load) / (max_load + 1e-8) > threshold:
            # Migration from high-load to low-load agents is not implemented.
            pass
        return current_assignments
7. 总结与实践建议
技术选型
| 场景 | 推荐架构 | 框架 |
|---|---|---|
| 简单任务流水线 | 顺序Pipeline | LangChain LCEL |
| 复杂多领域任务 | 层级式编排器 | 自研/OpenManus |
| 需要辩论/共识 | 多Agent辩论 | 自研 |
| 开放世界任务 | 联邦式MAS | CrewAI/Swarm |
| 高可靠生产系统 | 微服务架构 | FastAPI + MessageBus |
2026年多智能体生态
- 协议层:Agent之间的通信由逐渐标准化的A2A协议承担;Agent与工具、数据源的对接则以MCP为事实标准
- 编排层:对于复杂任务,层级式编排优于顺序Pipeline;简单流程仍可使用顺序Pipeline
- 通信层:异步消息总线 + RPC混合
- 监控层:全链路追踪、Agent级别SLA监控
工程实践
- 从简单开始:先用Pipeline模式验证业务流程,再升级为完整MAS
- 定义清晰边界:每个Agent职责单一,避免能力重叠
- 容错优先:每个Agent独立容错,编排器不依赖单点
- 可观测性:每条消息打标签,全链路追踪
- 成本控制:按token计费的Agent设置预算上限