作为 Agent 开发工程师,我来为你提供一套从用户交互到系统架构层面的完整停止方案,涵盖前端、后端、协议和运维各个维度。
一、用户层:即时中断机制
1. 信号中断(Signal-Based)
python
# Python 示例:捕获用户中断信号
import signal
import sys
class AgentController:
    """Run one task at a time and cancel it when the process is signalled.

    Constructing an instance installs handlers for SIGINT (Ctrl+C) and
    SIGTERM (``kill``). ``signal.signal`` may only be called from the main
    thread, so instances must be created there.
    """

    def __init__(self):
        self.current_task = None
        self.interrupted = False
        # Register the signal handlers.
        signal.signal(signal.SIGINT, self._handle_interrupt)  # Ctrl+C
        signal.signal(signal.SIGTERM, self._handle_terminate)  # kill

    def _handle_interrupt(self, signum, frame):
        """Handle Ctrl+C: cancel the running task, or exit when idle."""
        if self.current_task is not None:
            self.interrupted = True
            self.current_task.cancel()  # cancel the task currently running
            print("\n⚠️ 用户中断,正在安全停止...")
        else:
            sys.exit(0)

    def _handle_terminate(self, signum, frame):
        """Handle SIGTERM the same way as Ctrl+C.

        This handler was registered in ``__init__`` but never defined, which
        made construction fail; it delegates to the interrupt path so a
        ``kill`` also stops the running task gracefully.
        """
        self._handle_interrupt(signum, frame)

    def run_task(self, task):
        """Execute ``task`` synchronously and return its result.

        Args:
            task: Object exposing ``execute()`` and ``cancel()``.

        Returns:
            Whatever ``task.execute()`` returns, or a
            ``{"status": "cancelled", ...}`` dict when the task raises
            ``asyncio.CancelledError``.
        """
        # Imported locally: the snippet's import block does not provide asyncio.
        import asyncio

        self.interrupted = False  # clear the flag left over from an earlier run
        self.current_task = task
        try:
            return task.execute()
        except asyncio.CancelledError:
            return {"status": "cancelled", "reason": "user_interrupt"}
        finally:
            self.current_task = None
2. 流式响应中的取消令牌(Cancellation Token)
typescript
// TypeScript 示例:前端到后端的取消流
/**
 * Streams an agent response from the backend and lets the UI stop it.
 *
 * `streamResponse()` is an async generator: consume it with `for await`.
 * `stop()` aborts the request that is currently in flight.
 */
class AgentStreamController {
  private abortController: AbortController;
  private decoder: TextDecoder;

  constructor() {
    this.abortController = new AbortController();
    this.decoder = new TextDecoder();
  }

  /**
   * POST the prompt and yield the decoded response chunks as they arrive.
   *
   * @param prompt Text sent to the backend as `{ prompt }`.
   * @throws DOMException named `AbortError` when `stop()` is called mid-stream.
   * @throws Error when the response is not OK or carries no body.
   */
  async *streamResponse(prompt: string): AsyncGenerator<string> {
    // An AbortController cannot be reset once aborted; without a fresh one
    // every request after the first stop() would fail immediately.
    if (this.abortController.signal.aborted) {
      this.abortController = new AbortController();
    }
    const { signal } = this.abortController;
    // A fresh decoder per stream so a partial multi-byte sequence left over
    // from an aborted stream cannot leak into the next one.
    this.decoder = new TextDecoder();

    const response = await fetch('/api/agent/stream', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ prompt }),
      signal, // key point: pass the cancellation signal to fetch
    });
    if (!response.ok || !response.body) {
      throw new Error(`Agent stream request failed with status ${response.status}`);
    }

    const reader = response.body.getReader();
    try {
      while (true) {
        const { done, value } = await reader.read();
        if (done) break;
        // Check whether the stream was cancelled while this chunk was read.
        if (signal.aborted) {
          await reader.cancel(); // stop reading the stream
          throw new DOMException('Aborted', 'AbortError');
        }
        yield this.decodeChunk(value);
      }
    } finally {
      reader.releaseLock();
    }
  }

  /** Abort the in-flight request; call this from the stop button handler. */
  stop() {
    this.abortController.abort();
  }

  /** Decode one network chunk; `stream: true` keeps split UTF-8 sequences intact. */
  private decodeChunk(chunk: Uint8Array): string {
    return this.decoder.decode(chunk, { stream: true });
  }
}
二、协议层:SSE/WebSocket 中断设计
SSE(Server-Sent Events)方案
python
from flask import Flask, Response, stream_with_context
import json
import queue
app = Flask(__name__)
class AgentTask:
    """One cancellable streaming task served over SSE.

    ``generate()`` iterates ``self.llm_stream()``, which is not defined in
    this class and has to be supplied by a subclass or assigned on the
    instance.

    Attributes:
        id: Unique hex identifier, used as the key in the task registry.
        cancel_event: Set by another thread to request cancellation.
        output_queue: Queue created per task; not used by ``generate()``.
    """

    def __init__(self):
        # Imported locally: the snippet's import block provides neither module.
        import threading
        import uuid

        self.id = uuid.uuid4().hex
        self.cancel_event = threading.Event()
        self.output_queue = queue.Queue()

    def generate(self):
        """Yield SSE ``data:`` frames, checking for cancellation per chunk.

        Yields:
            One ``chunk`` frame per item from ``llm_stream()``, followed by a
            single terminal frame: ``cancelled`` if the cancel event was set,
            otherwise ``done``.
        """
        for chunk in self.llm_stream():
            if self.cancel_event.is_set():
                yield f"data: {json.dumps({'type': 'cancelled'})}\n\n"
                # Return rather than break, so a cancelled stream does not
                # also report 'done'.
                return
            yield f"data: {json.dumps({'type': 'chunk', 'content': chunk})}\n\n"
        yield f"data: {json.dumps({'type': 'done'})}\n\n"
# Registry of in-flight tasks keyed by task id; the cancel endpoint looks
# tasks up here.
active_tasks = {}


@app.route('/api/agent/stream', methods=['GET', 'POST'])
def agent_stream():
    """Start a task and stream its output as Server-Sent Events.

    POST is accepted as well as GET because the fetch-based client sends the
    prompt with POST. The task id is returned in the ``X-Task-Id`` response
    header so the client knows which id to pass to the cancel endpoint.
    """
    import uuid

    task = AgentTask()
    # Use the task's own id when it has one, otherwise mint one here.
    task_id = getattr(task, 'id', None) or uuid.uuid4().hex
    # Keep a reference so the cancel endpoint can reach this task.
    active_tasks[task_id] = task

    def event_stream():
        try:
            yield from task.generate()
        finally:
            # Drop the registry entry once the stream ends or is closed.
            active_tasks.pop(task_id, None)

    return Response(
        stream_with_context(event_stream()),
        mimetype='text/event-stream',
        headers={
            'Cache-Control': 'no-cache',
            'X-Accel-Buffering': 'no',  # disable Nginx buffering
            'X-Task-Id': task_id,
        },
    )
@app.route('/api/agent/<task_id>/cancel', methods=['POST'])
def cancel_task(task_id):
    """Request cancellation of one registered task.

    Returns ``{"status": "cancelled"}`` after setting the task's cancel
    event, or ``{"status": "not_found"}`` with HTTP 404 when ``task_id`` is
    not in ``active_tasks``.
    """
    target = active_tasks.get(task_id)
    if not target:
        return {"status": "not_found"}, 404
    target.cancel_event.set()
    return {"status": "cancelled"}
WebSocket 方案(更实时)
javascript
// WebSocket 服务端(Node.js 示例)
/**
 * One WebSocket session driving a cancellable LLM stream (server side).
 *
 * Incoming messages: `{type: 'prompt', content}` starts a run and
 * `{type: 'stop'}` cancels it. Outgoing messages: `chunk`, then exactly one
 * of `done`, `cancelled` or `error`.
 */
class AgentSession {
  /**
   * @param ws  WebSocket-like object exposing `on('message', cb)` and `send(str)`.
   * @param llm Optional client exposing `chat.completions.create`; when
   *            omitted, `this.llm` must be assigned before the first prompt.
   */
  constructor(ws, llm) {
    this.ws = ws;
    if (llm !== undefined) {
      this.llm = llm;
    }
    this.currentOperation = null;
    this.cancelled = false;

    ws.on('message', (data) => {
      let msg;
      try {
        msg = JSON.parse(data);
      } catch (err) {
        // A malformed frame must not take the whole handler down.
        this._send({ type: 'error', message: 'invalid JSON message' });
        return;
      }
      if (msg === null || typeof msg !== 'object') {
        this._send({ type: 'error', message: 'message must be a JSON object' });
        return;
      }
      if (msg.type === 'stop') {
        this.handleStop();
      } else if (msg.type === 'prompt') {
        // runAgent reports its own failures; nothing to await here.
        this.runAgent(msg.content);
      }
    });
  }

  /** Stream the completion for `prompt` to the client until done or stopped. */
  async runAgent(prompt) {
    // One run per session: abort a run that is still streaming so that its
    // controller is not silently overwritten.
    if (this.currentOperation) {
      this.currentOperation.abort();
    }
    // Wrap the LLM call in an AbortController.
    const controller = new AbortController();
    this.currentOperation = controller;
    this.cancelled = false;

    try {
      const stream = await this.llm.chat.completions.create({
        messages: [{ role: 'user', content: prompt }],
        stream: true,
      }, { signal: controller.signal });

      for await (const chunk of stream) {
        // Check this run's own signal, not the shared flag, so a newer run
        // resetting `cancelled` cannot revive an aborted one.
        if (controller.signal.aborted) break;
        const content = chunk.choices[0]?.delta?.content;
        if (content) {
          this._send({ type: 'chunk', content });
        }
      }
      // Always finish with a terminal message so the client can stop waiting.
      this._send({ type: controller.signal.aborted ? 'cancelled' : 'done' });
    } catch (err) {
      if (controller.signal.aborted || (err && err.name === 'AbortError')) {
        this._send({ type: 'cancelled' });
      } else {
        // Report real failures instead of dropping them.
        this._send({ type: 'error', message: String((err && err.message) || err) });
      }
    } finally {
      // Only clear the slot if a newer run has not replaced this one.
      if (this.currentOperation === controller) {
        this.currentOperation = null;
      }
    }
  }

  /** Cancel the run in flight, if any. */
  handleStop() {
    this.cancelled = true;
    if (this.currentOperation) {
      this.currentOperation.abort();
    }
  }

  /** Serialize `payload` and send it to the client. */
  _send(payload) {
    this.ws.send(JSON.stringify(payload));
  }
}
三、后端架构:分层终止策略
架构图
scss
┌─────────────────┐     ┌──────────────────┐     ┌─────────────────┐
│   用户界面层    │────▶│   API Gateway    │────▶│   Agent 服务    │
│   (停止按钮)    │     │ (路由/限流/认证) │     │   (业务逻辑)    │
└─────────────────┘     └──────────────────┘     └────────┬────────┘
                                                          │
                               ┌──────────────────────────┼────────────┐
                               ▼                          ▼            ▼
                          ┌──────────┐               ┌──────────┐ ┌──────────┐
                          │ 任务队列 │               │ LLM 服务 │ │ 工具调用 │
                          │ (Redis)  │               │(OpenAI等)│ │(代码执行)│
                          └────┬─────┘               └────┬─────┘ └────┬─────┘
                               │                          │            │
                               └──────────────────────────┴────────────┘
                                                          │
                                                          ▼
                                                   ┌─────────────┐
                                                   │  执行引擎   │
                                                   │  (Sandbox)  │
                                                   │ 可强制终止  │
                                                   └─────────────┘
核心组件:可中断的 Agent 执行器
python
import asyncio
from dataclasses import dataclass
from enum import Enum, auto
from typing import Optional, Callable
class TaskStatus(Enum):
    """Lifecycle states of an agent task.

    The values are the integers 1-6 in declaration order.
    """

    PENDING = 1     # created, not started yet
    RUNNING = 2     # steps are executing
    CANCELLING = 3  # cancellation requested, not yet observed
    CANCELLED = 4   # stopped because of a cancellation
    COMPLETED = 5   # every step finished
    FAILED = 6      # stopped because of an error or timeout
@dataclass
class TaskContext:
    """Mutable state tracked for a single task.

    Attributes:
        task_id: Identifier of the task.
        status: Current lifecycle state.
        cancel_event: Set to request cancellation of the task.
        heartbeat: Event intended for detecting a hung task.
        steps: Records of executed steps, kept so a run can be resumed
            from a checkpoint.
    """

    task_id: str
    status: TaskStatus
    cancel_event: asyncio.Event
    heartbeat: asyncio.Event
    steps: list
class StepTimeout(Exception):
    """Raised by ``InterruptibleAgent.execute`` when one step exceeds its timeout."""


class InterruptibleAgent:
    """Run a workflow of synchronous callables that can be cancelled between
    and during steps.

    Each step runs in a worker thread. Python cannot kill a running thread,
    so cancelling during a step stops *waiting* for it: the coroutine returns
    promptly while the step function itself runs to completion in the
    background.
    """

    def __init__(self):
        self.tasks: dict[str, TaskContext] = {}
        self._cleanup_interval = 30  # seconds

    async def execute(
        self,
        task_id: str,
        workflow: list[Callable],
        timeout: Optional[float] = None,
    ) -> dict:
        """Execute ``workflow`` step by step, honouring cancellation.

        Args:
            task_id: Key under which the task is registered in ``self.tasks``.
            workflow: Callables invoked in order, each with no arguments.
            timeout: Per-step timeout in seconds; ``None`` disables it.

        Returns:
            ``{"status": "completed", "result": [...]}`` on success, or
            ``{"status": "cancelled", "completed_steps": n, "checkpoint": {...}}``
            when the run was cancelled.

        Raises:
            StepTimeout: A step did not finish within ``timeout``.
            Exception: Whatever a step function raises; the task status is
                set to FAILED before the exception propagates.
        """
        ctx = TaskContext(
            task_id=task_id,
            status=TaskStatus.PENDING,
            cancel_event=asyncio.Event(),
            heartbeat=asyncio.Event(),
            steps=[],
        )
        self.tasks[task_id] = ctx
        try:
            ctx.status = TaskStatus.RUNNING
            for step_idx, step_func in enumerate(workflow):
                # Check for a cancellation request before starting a step.
                if ctx.cancel_event.is_set():
                    return self._cancelled_result(ctx)
                # Run the step, bounded by the optional timeout.
                try:
                    step_result = await asyncio.wait_for(
                        self._run_step(step_func, ctx),
                        timeout=timeout,
                    )
                except asyncio.TimeoutError as exc:
                    ctx.status = TaskStatus.FAILED
                    raise StepTimeout(f"Step {step_idx} timeout") from exc
                except Exception:
                    # A failing step must not leave the task marked RUNNING.
                    ctx.status = TaskStatus.FAILED
                    raise
                ctx.steps.append({
                    "step": step_idx,
                    "result": step_result,
                    "timestamp": self._now(),
                })
                # Pulse the heartbeat: set and immediately clear, so the
                # event does not stay latched between steps.
                ctx.heartbeat.set()
                ctx.heartbeat.clear()
            ctx.status = TaskStatus.COMPLETED
            return {"status": "completed", "result": ctx.steps}
        except asyncio.CancelledError:
            return self._cancelled_result(ctx)
        finally:
            # Release per-task resources.
            await self._cleanup(ctx)

    async def _run_step(self, func: Callable, ctx: TaskContext):
        """Run ``func`` in a worker thread and return its result.

        Waits for whichever happens first: the step finishing or
        ``ctx.cancel_event`` being set. The step result wins when both are
        ready together.

        Raises:
            asyncio.CancelledError: The cancel event was set before the step
                finished.
        """
        loop = asyncio.get_running_loop()
        step_future = loop.run_in_executor(None, func)
        cancel_waiter = asyncio.ensure_future(ctx.cancel_event.wait())
        try:
            done, _pending = await asyncio.wait(
                {step_future, cancel_waiter},
                return_when=asyncio.FIRST_COMPLETED,
            )
            if step_future in done:
                return step_future.result()
            raise asyncio.CancelledError()
        finally:
            # No executor is shut down here on purpose: waiting for the
            # worker thread would block the event loop until the step ends.
            cancel_waiter.cancel()
            step_future.cancel()  # has no effect once the step has finished

    async def cancel(self, task_id: str, force: bool = False) -> bool:
        """Request cancellation of a running task.

        Args:
            task_id: Task to cancel.
            force: Also invoke the ``_force_terminate`` hook.

        Returns:
            ``True`` if the request was issued; ``False`` if the task is
            unknown or not currently RUNNING.
        """
        ctx = self.tasks.get(task_id)
        if not ctx:
            return False
        if ctx.status != TaskStatus.RUNNING:
            return False
        ctx.status = TaskStatus.CANCELLING
        ctx.cancel_event.set()
        if force:
            await self._force_terminate(task_id)
        return True

    def _create_checkpoint(self, ctx: TaskContext) -> dict:
        """Snapshot the finished steps so the run can be resumed later."""
        return {
            "task_id": ctx.task_id,
            "completed_steps": len(ctx.steps),
            "steps_data": ctx.steps,
            "timestamp": self._now(),
        }

    async def resume_from_checkpoint(self, checkpoint: dict) -> dict:
        """Resume a run from ``checkpoint``, skipping the completed steps.

        The remaining steps run as a new task named ``"<task_id>_resumed"``
        with no per-step timeout; step indices in its result restart at 0.

        Raises:
            NotImplementedError: Unless ``_rebuild_workflow`` is overridden.
        """
        task_id = checkpoint["task_id"]
        workflow = self._rebuild_workflow(checkpoint)
        # Skip the steps that already completed.
        start_idx = checkpoint["completed_steps"]
        return await self.execute(
            task_id=f"{task_id}_resumed",
            workflow=workflow[start_idx:],
            timeout=None,
        )

    def _cancelled_result(self, ctx: TaskContext) -> dict:
        """Mark ``ctx`` as CANCELLED and build the cancelled result dict."""
        ctx.status = TaskStatus.CANCELLED
        return {
            "status": "cancelled",
            "completed_steps": len(ctx.steps),
            "checkpoint": self._create_checkpoint(ctx),
        }

    async def _cleanup(self, ctx: TaskContext) -> None:
        """Drop the finished task from the registry so it cannot grow unbounded."""
        if self.tasks.get(ctx.task_id) is ctx:
            del self.tasks[ctx.task_id]

    async def _force_terminate(self, task_id: str) -> None:
        """Hook for forceful termination; a no-op by default.

        Worker threads cannot be killed, so there is nothing to do here.
        Subclasses that run steps in processes or containers override this.
        """

    def _rebuild_workflow(self, checkpoint: dict) -> list:
        """Hook returning the full list of step callables for ``checkpoint``."""
        raise NotImplementedError(
            "override _rebuild_workflow() to map a checkpoint back to its steps"
        )

    @staticmethod
    def _now() -> float:
        """Return the current Unix timestamp."""
        # Imported locally: the snippet's import block does not provide time.
        import time

        return time.time()
四、工具/沙箱层:强制终止
Docker 容器化强制停止
python
import docker
from docker.errors import NotFound
class SandboxExecutor:
    """Run untrusted code in throwaway Docker containers that can be killed."""

    def __init__(self):
        self.client = docker.from_env()
        self.active_containers: dict[str, str] = {}  # task_id -> container_id

    async def execute_code(
        self,
        task_id: str,
        code: str,
        timeout: int = 60,
        memory_limit: str = "512m",
    ) -> dict:
        """Execute ``code`` with ``python -c`` inside an isolated container.

        The blocking Docker SDK calls run in worker threads so the event loop
        stays responsive while the container is running.

        Args:
            task_id: Key under which the container is tracked.
            code: Python source passed to ``python -c``.
            timeout: Seconds to wait for the container to exit.
            memory_limit: Docker memory limit, e.g. ``"512m"``.

        Returns:
            ``{"exit_code", "logs", "task_id"}`` on success, or
            ``{"error", "task_id"}`` if waiting or log retrieval failed.
        """
        # Imported locally: the snippet's import block does not provide asyncio.
        import asyncio

        container = await asyncio.to_thread(
            self.client.containers.run,
            image="python:3.11-slim",
            # Argument list instead of a shell string: quotes inside `code`
            # can neither break the command nor inject extra ones.
            command=["python", "-c", code],
            detach=True,
            mem_limit=memory_limit,
            cpu_quota=100000,  # cap CPU usage
            network_mode="none",  # no network access
            labels={"agent_task_id": task_id},
        )
        self.active_containers[task_id] = container.id
        try:
            result = await asyncio.to_thread(container.wait, timeout=timeout)
            raw_logs = await asyncio.to_thread(container.logs)
            return {
                "exit_code": result["StatusCode"],
                # Program output is arbitrary bytes; never fail on decoding.
                "logs": raw_logs.decode("utf-8", errors="replace"),
                "task_id": task_id,
            }
        except Exception as e:
            # Deliberate best-effort boundary: report the failure to the caller.
            return {"error": str(e), "task_id": task_id}
        finally:
            # Always make sure the container is gone.
            self.active_containers.pop(task_id, None)
            await asyncio.to_thread(self._discard_container, container)

    def force_stop(self, task_id: str) -> bool:
        """Kill and remove the container of ``task_id``.

        Returns:
            ``True`` if the container was removed; ``False`` if the task is
            not tracked or its container no longer exists.
        """
        container_id = self.active_containers.get(task_id)
        if not container_id:
            return False
        try:
            container = self.client.containers.get(container_id)
            try:
                container.kill(signal="SIGKILL")  # forceful termination
            except NotFound:
                raise
            except docker.errors.APIError:
                # The container has already exited; it still needs removing.
                pass
            container.remove(force=True)
        except NotFound:
            return False
        self.active_containers.pop(task_id, None)
        return True

    @staticmethod
    def _discard_container(container) -> None:
        """Stop and remove ``container``, tolerating one that is already gone."""
        try:
            container.stop(timeout=1)
        except docker.errors.APIError:
            # Already stopped or removed; remove(force=True) below covers
            # anything that is still running.
            pass
        try:
            container.remove(force=True)
        except NotFound:
            pass
进程树终止(非容器环境)
python
import psutil
import signal
import os
def kill_process_tree(pid: int, include_parent: bool = True) -> None:
    """Terminate a process and all of its descendants.

    Children receive SIGTERM first; any still alive after 3 seconds are
    killed. The parent is handled last, with the same SIGTERM-then-kill
    escalation. Processes that disappear along the way are ignored.

    Args:
        pid: Process id of the root of the tree.
        include_parent: Also terminate the process ``pid`` itself.
    """
    try:
        parent = psutil.Process(pid)
        children = parent.children(recursive=True)
    except psutil.NoSuchProcess:
        return

    # Ask the children to terminate first.
    for child in children:
        try:
            child.send_signal(signal.SIGTERM)
        except psutil.NoSuchProcess:
            pass

    # Give them time to exit gracefully.
    _gone, alive = psutil.wait_procs(children, timeout=3)

    # Kill whatever is left; one vanished process must not stop the rest.
    for proc in alive:
        try:
            proc.kill()
        except psutil.NoSuchProcess:
            pass

    # Terminate the parent last.
    if include_parent:
        try:
            parent.send_signal(signal.SIGTERM)
            parent.wait(timeout=3)
        except psutil.NoSuchProcess:
            pass
        except psutil.TimeoutExpired:
            # The parent ignored SIGTERM: escalate instead of raising.
            try:
                parent.kill()
            except psutil.NoSuchProcess:
                pass
五、监控与自愈
心跳检测 + 自动终止
python
import asyncio
from datetime import datetime, timedelta
class AgentMonitor:
    """Watch task heartbeats and tear down tasks that stop reporting.

    ``_handle_dead_task`` calls ``mark_failed``, ``force_cleanup`` and
    ``send_alert``; none of them is defined in this class, so a subclass
    has to provide them.
    """

    def __init__(self):
        self.heartbeats: dict[str, datetime] = {}
        self.timeout_seconds = 60  # no heartbeat for 60 s counts as hung
        self._monitor_task = None

    async def start_monitoring(self):
        """Start the monitoring loop unless it is already running."""
        if self._monitor_task is None or self._monitor_task.done():
            self._monitor_task = asyncio.create_task(self._monitor_loop())

    def _find_dead_tasks(self, now: datetime) -> list:
        """Return the ids whose last heartbeat is older than the timeout."""
        # total_seconds() is required here: `.seconds` drops whole days, so a
        # task silent for 24 h + 5 s would look only 5 s late.
        return [
            task_id
            for task_id, last_beat in self.heartbeats.items()
            if (now - last_beat).total_seconds() > self.timeout_seconds
        ]

    async def _monitor_loop(self):
        """Check all heartbeats every 10 seconds, forever."""
        while True:
            for task_id in self._find_dead_tasks(datetime.now()):
                try:
                    await self._handle_dead_task(task_id)
                except Exception as exc:
                    # One failing teardown must not stop the monitoring loop.
                    print(f"Failed to handle dead task {task_id}: {exc!r}")
            await asyncio.sleep(10)  # check every 10 seconds

    def update_heartbeat(self, task_id: str):
        """Record a heartbeat; called periodically by the agent."""
        self.heartbeats[task_id] = datetime.now()

    async def _handle_dead_task(self, task_id: str):
        """Fail, clean up and alert for a task whose heartbeat timed out."""
        print(f"⚠️ 任务 {task_id} 心跳超时,强制终止")
        try:
            # 1. Mark the task as failed.
            await self.mark_failed(task_id, reason="heartbeat_timeout")
            # 2. Force-release its resources.
            await self.force_cleanup(task_id)
            # 3. Raise an alert.
            self.send_alert(f"Agent task {task_id} killed due to timeout")
        finally:
            # 4. Drop the heartbeat record even if a step above failed, so
            #    the same task is not handled again on every pass.
            self.heartbeats.pop(task_id, None)
六、最佳实践总结
| 层级 | 策略 | 适用场景 |
|---|---|---|
| 用户层 | Esc / Ctrl+C / 停止按钮 | 正常交互中断 |
| 协议层 | SSE 取消端点 / WebSocket 信号 | 网络延迟/前端取消 |
| 任务层 | CancellationToken + 断点恢复 | 长任务优雅停止 |
| 进程层 | SIGTERM → SIGKILL | 代码执行超时 |
| 容器层 | Docker kill + 资源限制 | 隔离环境强制终止 |
| 系统层 | 心跳监控 + 自动清理 | 无人值守自愈 |
关键设计原则
- 分层防御:用户信号 → 软件取消 → 强制终止(进程/容器级 kill),层层递进
- 资源隔离:所有外部调用(LLM、代码执行、工具)必须在可取消的上下文中
- 状态可恢复:支持断点保存,取消后可从中间状态恢复而非完全重来
- 快速失败:设置合理的超时,避免无限等待
- 可观测性(observability):完善的日志和监控,能追踪"为什么卡住"
需要针对特定框架(如 LangChain、LlamaIndex、AutoGen)的实现细节吗?