从零实现SKILLHARNESS:让AI Agent学会安全地做事

本文是对前篇《SKILLHARNESS:让AI Agent学会"安全地做事"》(掘金)的进一步剖析。


前言

我们在开发AI Agent时经常遇到这个问题:Agent学会了一个技能,但这个技能在环境变化时要么失效,要么产生危险行为。

今天我们来复现浙江大学提出的SKILLHARNESS框架,它解决了一个核心问题:如何让Agent学会"什么时候可以用这个技能"。

读完这篇文章,你将知道:

  1. 如何用代码实现宏技能和微技能的解耦表示
  2. 如何实现"三源监督"的技能学习
  3. 如何实现"选择性激活"的安全利用机制
  4. 如何实现"模板重放+LLM回退"的双模式执行

一、核心概念速览

在开始写代码之前,先弄清楚SKILLHARNESS的两层设计:

arduino 复制代码
┌─────────────────────────────────────────────────────────────┐
│  宏技能层 (战略层)                                           │
│  "我知道这个技能的目标是什么"                                  │
│  "成功的时候是什么样子"                                       │
│  "有哪些坑不能踩"                                             │
│  "什么前提条件必须满足"                                        │
└─────────────────────────────────────────────────────────────┘
                           ↓ 关联微技能
┌─────────────────────────────────────────────────────────────┐
│  微技能层 (战术层)                                           │
│  "具体怎么操作"                                               │
│  "用模板 + 运行时参数绑定"                                    │
│  "模板失效了走LLM"                                           │
└─────────────────────────────────────────────────────────────┘

二、数据结构:技能如何表示

2.1 宏技能:战略层

python 复制代码
from dataclasses import dataclass, field
from typing import List, Optional, Set
from enum import Enum

class SkillStatus(Enum):
    """Status values a skill can carry."""
    ACTIVE = "active"
    SUSPENDED = "suspended"  # paused while the safety boundary is not satisfied
    DEPRECATED = "deprecated"

@dataclass
class SuccessPattern:
    """Success pattern: a `do` action path paired with a `done_when` condition."""
    do: str                              # description of the reusable action path
    done_when: str                       # observable completion condition
    confidence: float = 1.0              # confidence score

@dataclass
class Lesson:
    """Lesson: experience learned from a failure."""
    failure_type: str                   # kind of failure
    recovery_signal: str                 # recovery signal
    generalization: str                  # generalized description of the lesson

@dataclass
class RiskGuard:
    """Risk guard: a condition the environment must satisfy."""
    condition: str                       # description of the condition
    description: str                     # explanation of the risk
    severity: str = "medium"            # severity: low/medium/high

@dataclass
class MacroSkill:
    """Macro skill: the strategic layer that captures a reusable strategy.

    A macro skill stores the goal (intent), the observed success patterns,
    the lessons learned from failures, the risk guards that must hold before
    the skill is used, and the ids of the micro skills linked to it.
    """
    skill_id: str
    intent: str                          # φ: macro intent, a natural-language goal
    success_patterns: List[SuccessPattern] = field(default_factory=list)  # P
    lessons: List[Lesson] = field(default_factory=list)                   # L
    risk_guards: List[RiskGuard] = field(default_factory=list)            # R
    linked_micro_skills: Set[str] = field(default_factory=set)            # N_M
    status: SkillStatus = SkillStatus.ACTIVE

    def add_success_pattern(self, do: str, done_when: str, confidence: float = 1.0):
        """Append a success pattern.

        Args:
            do: Description of the reusable action path.
            done_when: Observable completion condition.
            confidence: Confidence of the pattern; defaults to 1.0, the
                value every pattern received before this parameter existed.
        """
        self.success_patterns.append(
            SuccessPattern(do=do, done_when=done_when, confidence=confidence)
        )

    def add_lesson(self, failure_type: str, recovery: str):
        """Append a lesson; the generalization text is derived from the inputs.

        Args:
            failure_type: Kind of failure that was observed.
            recovery: Recovery signal that resolved it.
        """
        self.lessons.append(
            Lesson(
                failure_type=failure_type,
                recovery_signal=recovery,
                generalization=f"遇到{failure_type}时,尝试{recovery}",
            )
        )

    def add_risk_guard(self, condition: str, description: str, severity: str = "medium"):
        """Append a risk guard.

        Args:
            condition: Condition the environment must satisfy.
            description: Explanation of the risk.
            severity: One of low/medium/high; defaults to "medium", the
                value every guard received before this parameter existed.
        """
        self.risk_guards.append(
            RiskGuard(condition=condition, description=description, severity=severity)
        )

    def __repr__(self):
        # Short form on purpose: this repr is embedded in the library's log lines.
        return f"MacroSkill(id={self.skill_id}, intent={self.intent})"

2.2 微技能:战术层

python 复制代码
from dataclasses import dataclass, field
from typing import Dict, Any, Optional
import re

@dataclass
class MicroSkill:
    """Micro skill: a parameterized action template (tactical layer).

    The template contains ``{name}`` placeholders that are filled from the
    current state at run time. Consecutive binding failures are counted so
    callers can stop replaying a template that no longer fits.
    """
    skill_id: str
    semantic_label: str                   # σ: semantic label, e.g. "click the submit button"
    execution_template: str               # E: template with placeholders, e.g. "click('{button_id}')"
    placeholders: Set[str] = field(default_factory=set)  # Θ: set of placeholder names
    bind_count: int = 0                  # number of successful binds
    consecutive_failures: int = 0         # number of consecutive failed binds
    max_failures_before_bypass: int = 3  # consecutive-failure threshold that triggers bypass

    @staticmethod
    def from_action(action: str, label: str) -> 'MicroSkill':
        """Create a micro skill from an action string that already uses placeholders.

        Every ``{name}`` occurrence (``name`` made of word characters) is
        recorded as a placeholder and the action itself becomes the template,
        e.g. ``"click('{button_id}')"`` yields the placeholder set
        ``{"button_id"}``. Concrete literals are NOT parameterized
        automatically: ``"click('submit_btn')"`` produces a template with no
        placeholders.

        Args:
            action: Action string, optionally containing ``{name}`` placeholders.
            label: Semantic label of the skill; also used in the skill id.

        Returns:
            A new MicroSkill whose id is stable across interpreter runs.
        """
        # Local import keeps this snippet self-contained.
        import zlib

        placeholders = set(re.findall(r'\{(\w+)\}', action))

        # The built-in hash() of a str is salted per process, which would make
        # ids differ between runs; CRC32 of the UTF-8 bytes is deterministic.
        digest = zlib.crc32(action.encode("utf-8")) % 10000

        return MicroSkill(
            skill_id=f"micro_{label}_{digest}",
            semantic_label=label,
            execution_template=action,
            placeholders=placeholders
        )

    def bind(self, state: Dict[str, Any]) -> Optional[str]:
        """Bind the template by replacing each placeholder with a state value.

        Args:
            state: Current state used to resolve placeholder values.

        Returns:
            The bound action string, or None when any placeholder cannot be
            resolved (or resolution raises).

        Side effects:
            On success ``bind_count`` is incremented and
            ``consecutive_failures`` is reset to 0; on failure
            ``consecutive_failures`` is incremented.
        """
        try:
            bound = self.execution_template
            for placeholder in self.placeholders:
                value = self._resolve_placeholder(placeholder, state)
                if value is None:
                    self.consecutive_failures += 1
                    return None
                bound = bound.replace(f'{{{placeholder}}}', str(value))
        except Exception:
            # Deliberately best-effort: any resolution error counts as a
            # failed bind so the caller can fall back instead of crashing.
            self.consecutive_failures += 1
            return None

        # Only a completed bind counts as successful.
        self.bind_count += 1
        self.consecutive_failures = 0
        return bound

    def _resolve_placeholder(self, placeholder: str, state: Dict) -> Optional[str]:
        """Resolve one placeholder from the state.

        Simplified implementation: a direct lookup by placeholder name.
        Returns None when the key is absent.
        """
        return state.get(placeholder)

    def should_bypass(self) -> bool:
        """Return True when the template should be skipped in favour of the LLM."""
        return self.consecutive_failures >= self.max_failures_before_bypass

    def __repr__(self):
        return f"MicroSkill(id={self.skill_id}, label={self.semantic_label})"

2.3 技能库

python 复制代码
from typing import Dict, List, Optional, Set
import json

class SkillLibrary:
    """Skill library: owns every macro skill and micro skill.

    Macro skills are keyed by ``skill_id`` in ``macro_skills``; micro skills
    are keyed by ``skill_id`` in ``micro_skills`` and linked to macro skills
    through ``MacroSkill.linked_micro_skills``.
    """

    def __init__(self):
        self.macro_skills: Dict[str, MacroSkill] = {}
        self.micro_skills: Dict[str, MicroSkill] = {}

    def add_macro_skill(self, skill: MacroSkill):
        """Register a macro skill, replacing any skill with the same id."""
        self.macro_skills[skill.skill_id] = skill
        print(f"[SkillLibrary] 添加宏技能: {skill}")

    def add_micro_skill(self, skill: MicroSkill, macro_skill_id: str):
        """Register a micro skill and link it to a macro skill.

        The micro skill is always stored; the link is only created when
        ``macro_skill_id`` is already present in the library.
        """
        self.micro_skills[skill.skill_id] = skill

        if macro_skill_id in self.macro_skills:
            self.macro_skills[macro_skill_id].linked_micro_skills.add(skill.skill_id)

        print(f"[SkillLibrary] 添加微技能: {skill} -> {macro_skill_id}")

    def get_macro_skill(self, skill_id: str) -> Optional[MacroSkill]:
        """Return the macro skill with this id, or None."""
        return self.macro_skills.get(skill_id)

    def get_micro_skill(self, skill_id: str) -> Optional[MicroSkill]:
        """Return the micro skill with this id, or None."""
        return self.micro_skills.get(skill_id)

    def get_micro_skills_for_macro(self, macro_skill_id: str) -> List[MicroSkill]:
        """Return the micro skills linked to a macro skill.

        Linked ids that are not present in the library are skipped; an
        unknown macro skill yields an empty list.
        """
        macro = self.get_macro_skill(macro_skill_id)
        if not macro:
            return []

        # One lookup per id instead of looking each id up twice.
        linked = (self.get_micro_skill(mid) for mid in macro.linked_micro_skills)
        return [micro for micro in linked if micro]

    def retrieve_relevant_macros(
        self,
        current_state: Dict,
        task_goal: str,
        top_k: int = 3
    ) -> List[MacroSkill]:
        """Retrieve macro skills relevant to a goal (simplified).

        A skill matches when any whitespace-separated word of the lowercased
        goal is a substring of the lowercased intent. Matches are ordered by
        number of success patterns, most first, and cut to ``top_k``.
        ``current_state`` is accepted but not used by this implementation;
        a real implementation would use embedding similarity.
        """
        # Loop-invariant: tokenize the goal once.
        goal_words = task_goal.lower().split()

        results = [
            skill
            for skill in self.macro_skills.values()
            if any(word in skill.intent.lower() for word in goal_words)
        ]

        # Skills with more recorded success patterns come first.
        results.sort(key=lambda s: len(s.success_patterns), reverse=True)
        return results[:top_k]

    def save(self, path: str):
        """Write the library to ``path`` as UTF-8 JSON.

        Both macro skills (including status, pattern confidence and guard
        severity) and micro skills are written, so the links stored in
        ``linked_micro_skills`` still resolve after ``load``.
        """
        data = {
            "macro_skills": {
                sid: {
                    "skill_id": s.skill_id,
                    "intent": s.intent,
                    "status": s.status.value,
                    "success_patterns": [
                        {"do": p.do, "done_when": p.done_when, "confidence": p.confidence}
                        for p in s.success_patterns
                    ],
                    "lessons": [
                        {"failure_type": l.failure_type, "recovery": l.recovery_signal}
                        for l in s.lessons
                    ],
                    "risk_guards": [
                        {
                            "condition": r.condition,
                            "description": r.description,
                            "severity": r.severity,
                        }
                        for r in s.risk_guards
                    ],
                    # Sorted so the file content is deterministic.
                    "linked_micro_skills": sorted(s.linked_micro_skills)
                }
                for sid, s in self.macro_skills.items()
            },
            "micro_skills": {
                mid: {
                    "skill_id": m.skill_id,
                    "semantic_label": m.semantic_label,
                    "execution_template": m.execution_template,
                    "placeholders": sorted(m.placeholders),
                    "bind_count": m.bind_count,
                    "consecutive_failures": m.consecutive_failures,
                    "max_failures_before_bypass": m.max_failures_before_bypass,
                }
                for mid, m in self.micro_skills.items()
            }
        }
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        print(f"[SkillLibrary] 已保存到 {path}")

    def load(self, path: str):
        """Load skills from a JSON file written by ``save``.

        Loaded skills are added to (and override same-id entries of) the
        current library. Files written by the older format, which lack
        ``micro_skills``, ``status``, ``confidence`` or ``severity``, still
        load; the missing values fall back to the dataclass defaults.

        Raises:
            OSError: If the file cannot be opened.
            KeyError: If ``macro_skills`` or a required skill field is missing.
            ValueError: If a stored status is not a valid SkillStatus value.
        """
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        for sdata in data["macro_skills"].values():
            skill = MacroSkill(skill_id=sdata["skill_id"], intent=sdata["intent"])

            if "status" in sdata:
                skill.status = SkillStatus(sdata["status"])

            for p in sdata.get("success_patterns", []):
                skill.success_patterns.append(
                    SuccessPattern(
                        do=p["do"],
                        done_when=p["done_when"],
                        confidence=p.get("confidence", 1.0),
                    )
                )

            for l in sdata.get("lessons", []):
                skill.add_lesson(l["failure_type"], l["recovery"])

            for r in sdata.get("risk_guards", []):
                skill.risk_guards.append(
                    RiskGuard(
                        condition=r["condition"],
                        description=r["description"],
                        severity=r.get("severity", "medium"),
                    )
                )

            skill.linked_micro_skills = set(sdata.get("linked_micro_skills", []))
            self.macro_skills[skill.skill_id] = skill

        for mdata in data.get("micro_skills", {}).values():
            micro = MicroSkill(
                skill_id=mdata["skill_id"],
                semantic_label=mdata["semantic_label"],
                execution_template=mdata["execution_template"],
                placeholders=set(mdata.get("placeholders", [])),
                bind_count=mdata.get("bind_count", 0),
                consecutive_failures=mdata.get("consecutive_failures", 0),
                max_failures_before_bypass=mdata.get("max_failures_before_bypass", 3),
            )
            self.micro_skills[micro.skill_id] = micro

        print(f"[SkillLibrary] 已从 {path} 加载")

三、核心组件实现

3.1 规划器:安全检查的关键

python 复制代码
from typing import Dict, Any, List, Optional, Tuple
from dataclasses import dataclass

@dataclass
class PlanningDecision:
    """Decision emitted by the planner for one step."""
    subtask: str                          # u_t: the next atomic subtask
    expected_effect: str                  # ê_t: the expected observable effect
    is_complete: bool = False             # y_t: whether the task is complete
    micro_skill_id: Optional[str] = None  # id_t: optional micro-skill reference
    constraints: List[str] = field(default_factory=list)  # constraints passed on to the executor

class Planner:
    """
    Planner: decides the next step and checks the safety boundary.

    For each step it retrieves the macro skills relevant to the goal, checks
    the risk guards of the best match against the current state, and only
    references a micro skill when every guard is satisfied.
    """

    def __init__(self, skill_library: SkillLibrary, llm_client=None):
        self.library = skill_library
        self.llm_client = llm_client  # optional; not used by this simplified implementation

    def plan(
        self,
        current_state: Dict[str, Any],
        history: List[Dict],
        task_goal: str
    ) -> PlanningDecision:
        """
        Plan the next action.

        Steps:
        1. Retrieve the relevant macro skills.
        2. Check the risk guards of the selected macro skill.
        3. Decide whether a micro skill may be activated.

        Args:
            current_state: Current environment state.
            history: Previous steps (not used by this implementation).
            task_goal: Natural-language goal.

        Returns:
            A PlanningDecision. ``micro_skill_id`` is None when no skill
            applies or when the safety check failed.
        """
        relevant_macros = self.library.retrieve_relevant_macros(
            current_state, task_goal
        )

        if not relevant_macros:
            # No matching skill: let the LLM plan freely.
            return self._plan_with_llm(current_state, task_goal)

        # Simplification: take the top-ranked macro skill.
        selected_macro = relevant_macros[0]

        safety_result = self._check_safety_boundary(selected_macro, current_state)

        if not safety_result.is_safe:
            # Safety boundary violated: suppress the micro skill.
            print(f"[Planner] 安全检查失败: {safety_result.reason}")
            # SafetyCheckResult has no `recommended_action` field; the
            # recommended actions are carried in `constraints`, so the
            # subtask is built from them (falling back to the reason text).
            subtask = "; ".join(safety_result.constraints) or (safety_result.reason or "")
            return PlanningDecision(
                subtask=subtask,
                expected_effect="安全执行",
                is_complete=False,
                micro_skill_id=None,
                constraints=safety_result.constraints
            )

        micro_skills = self.library.get_micro_skills_for_macro(selected_macro.skill_id)

        if micro_skills:
            # Simplification: take the first linked micro skill.
            selected_micro = micro_skills[0]
            return PlanningDecision(
                subtask=selected_micro.semantic_label,
                expected_effect=self._infer_expected_effect(selected_macro),
                is_complete=False,
                micro_skill_id=selected_micro.skill_id,
                constraints=[str(rg.condition) for rg in selected_macro.risk_guards]
            )

        # No micro skill available: fall back to LLM planning.
        return self._plan_with_llm(current_state, task_goal)

    def _check_safety_boundary(
        self,
        macro_skill: MacroSkill,
        current_state: Dict[str, Any]
    ) -> 'SafetyCheckResult':
        """
        Check whether every risk guard of the macro skill is satisfied.

        Returns:
            A SafetyCheckResult with ``is_safe=True`` when no guard is
            violated; otherwise ``is_safe=False`` with the violated guard
            descriptions in ``reason`` and one "需要满足: ..." entry per
            violated guard in ``constraints``.
        """
        violated = [
            guard
            for guard in macro_skill.risk_guards
            if not self._evaluate_guard(guard, current_state)
        ]

        if not violated:
            return SafetyCheckResult(is_safe=True)

        violations = [guard.description for guard in violated]
        recommended_actions = [f"需要满足: {guard.condition}" for guard in violated]
        return SafetyCheckResult(
            is_safe=False,
            reason=f"风险守卫不满足: {', '.join(violations)}",
            constraints=recommended_actions
        )

    def _evaluate_guard(self, guard: RiskGuard, state: Dict) -> bool:
        """
        Evaluate one risk guard against the state (simplified keyword rules).

        The lowercased condition is matched against keyword groups in order:
        user consent -> ``state["user_consent_verified"]`` (default False),
        boundary/scope -> ``state["within_boundary"]`` (default True),
        verify/confirm -> ``state["content_verified"]`` (default False).

        NOTE(review): a condition that matches no keyword group is treated
        as satisfied, and the boundary flag defaults to True, i.e. both fail
        open. Kept as-is because callers rely on it; a production
        implementation should evaluate conditions properly.
        """
        condition = guard.condition.lower()

        if "user consent" in condition or "用户同意" in condition:
            return state.get("user_consent_verified", False)

        if "boundary" in condition or "scope" in condition or "边界" in condition:
            return state.get("within_boundary", True)

        if "verify" in condition or "确认" in condition:
            return state.get("content_verified", False)

        return True

    def _infer_expected_effect(self, macro_skill: MacroSkill) -> str:
        """Return the first success pattern's ``done_when``, or a generic text."""
        if macro_skill.success_patterns:
            return macro_skill.success_patterns[0].done_when
        return "操作完成"

    def _plan_with_llm(
        self,
        state: Dict[str, Any],
        goal: str
    ) -> PlanningDecision:
        """Plan without a skill. Simplified: no LLM is actually called."""
        return PlanningDecision(
            subtask=f"执行任务: {goal}",
            expected_effect="任务完成",
            is_complete=False,
            micro_skill_id=None,
            constraints=[]
        )

@dataclass
class SafetyCheckResult:
    """Outcome of a safety-boundary check."""
    is_safe: bool                                          # False when at least one risk guard is violated
    reason: Optional[str] = None                           # text listing the violated guards, if any
    constraints: List[str] = field(default_factory=list)   # one entry per violated guard

3.2 执行器:双模式执行

python 复制代码
import asyncio
from typing import Dict, Any, Optional

class Executor:
    """
    Executor: carries out planner decisions.

    Two modes are supported:
    1. Template replay (deterministic, efficient)
    2. LLM fallback (flexible, safe)

    Project types are referenced as string annotations so the class body
    does not depend on definition order (``ExecutionResult`` is defined
    after this class).
    """

    def __init__(self, skill_library: "SkillLibrary", llm_client=None):
        self.library = skill_library
        self.llm_client = llm_client

    async def execute(
        self,
        decision: "PlanningDecision",
        current_state: Dict[str, Any]
    ) -> 'ExecutionResult':
        """
        Execute a planner decision.

        When the decision references a micro skill that exists and is not
        bypassed, the template is bound against the state and replayed.
        In every other case (no micro skill, unknown id, bypassed skill,
        failed binding) execution falls back to the LLM path.
        """
        if decision.micro_skill_id:
            micro_skill = self.library.get_micro_skill(decision.micro_skill_id)

            if micro_skill and not micro_skill.should_bypass():
                bound_action = micro_skill.bind(current_state)

                if bound_action:
                    result = await self._execute_template(bound_action, current_state)
                    return ExecutionResult(
                        success=True,
                        action_type="template",
                        action=bound_action,
                        output=result
                    )

                # Binding failed: report it and fall through to the LLM path.
                print(f"[Executor] 模板绑定失败,准备回退: {micro_skill.execution_template}")

        return await self._execute_with_llm(decision, current_state)

    async def _execute_template(
        self,
        bound_action: str,
        state: Dict[str, Any]
    ) -> Dict[str, Any]:
        """
        Run a bound template (simulated).

        A real implementation would drive a UI-automation framework here.
        Returns a dict with the executed action, the new state (the input
        state object, unchanged) and an effect message.
        """
        print(f"[Executor] 执行模板: {bound_action}")

        await asyncio.sleep(0.1)  # simulated operation latency

        return {
            "action_executed": bound_action,
            "new_state": state,  # simplification: state is assumed unchanged
            "effect": "模板执行成功"
        }

    async def _execute_with_llm(
        self,
        decision: "PlanningDecision",
        state: Dict[str, Any]
    ) -> "ExecutionResult":
        """
        Generate and execute an action through the LLM.

        With an ``llm_client`` its ``generate_action`` coroutine is awaited
        with the subtask, state and constraints; without one a placeholder
        action string is produced.
        """
        print(f"[Executor] LLM回退模式: {decision.subtask}")

        if self.llm_client:
            action = await self.llm_client.generate_action(
                subtask=decision.subtask,
                state=state,
                constraints=decision.constraints
            )
        else:
            # Simplified stand-in when no client is configured.
            action = f"LLM执行: {decision.subtask}"

        return ExecutionResult(
            success=True,
            action_type="llm_fallback",
            action=action,
            output={"effect": "LLM生成执行"}
        )

    def check_completion(
        self,
        macro_skill: "MacroSkill",
        state: Dict[str, Any]
    ) -> bool:
        """
        Return True when any success pattern's ``done_when`` holds in ``state``.

        A macro skill without success patterns is never considered complete.
        """
        return any(
            self._check_done_when(pattern.done_when, state)
            for pattern in macro_skill.success_patterns
        )

    def _check_done_when(self, condition: str, state: Dict) -> bool:
        """
        Check a ``done_when`` condition (simplified).

        The condition text is not parsed; the result is the state's
        ``task_completed`` flag (False when absent).
        """
        return state.get("task_completed", False)

@dataclass
class ExecutionResult:
    """Result of executing one planning decision."""
    success: bool
    action_type: str          # "template" or "llm_fallback"
    action: str
    output: Dict[str, Any]

3.3 技能演化:从经验中学习

python 复制代码
from typing import List, Tuple, Optional
from dataclasses import dataclass

@dataclass
class TrajectoryAnalysis:
    """Result of analysing one trajectory."""
    has_new_knowledge: bool
    new_success_patterns: List[Tuple[str, str]] = field(default_factory=list)  # (do, done_when) pairs
    new_lessons: List[Tuple[str, str]] = field(default_factory=list)           # (failure_type, recovery) pairs
    new_risk_guards: List[str] = field(default_factory=list)                   # policy-violation texts

class SkillEvolution:
    """
    Skill evolution: learn new skills, or update existing ones, from
    exploration trajectories.

    Project types are referenced as string annotations so the class body
    does not depend on definition order.
    """

    def __init__(self, skill_library: "SkillLibrary"):
        self.library = skill_library

    def analyze_trajectory(
        self,
        trajectory: List[Dict],
        task_goal: str,
        policy_violations: Optional[List[str]] = None
    ) -> "TrajectoryAnalysis":
        """
        Analyse a trajectory and extract the three supervision signals.

        Args:
            trajectory: Executed steps; each step is a dict that may contain
                action, observation, success, error_type and recovery.
            task_goal: Task goal (not used by this implementation).
            policy_violations: Policy-violation texts; None means none.

        Returns:
            A TrajectoryAnalysis whose ``has_new_knowledge`` is True when at
            least one success pattern, lesson or risk guard was extracted.
        """
        analysis = TrajectoryAnalysis(has_new_knowledge=False)

        # 1. Successful sub-sequences become success patterns.
        analysis.new_success_patterns.extend(self._extract_success_segments(trajectory))

        # 2. Failed steps with a recovery become lessons.
        analysis.new_lessons.extend(self._extract_failure_segments(trajectory))

        # 3. Policy violations become risk guards.
        analysis.new_risk_guards.extend(policy_violations or [])

        analysis.has_new_knowledge = bool(
            analysis.new_success_patterns
            or analysis.new_lessons
            or analysis.new_risk_guards
        )
        return analysis

    def _extract_success_segments(
        self,
        trajectory: List[Dict]
    ) -> List[Tuple[str, str]]:
        """Extract maximal runs of consecutive successful steps.

        Each run yields ``(do, done_when)`` where ``do`` joins the run's
        actions with " -> " and ``done_when`` is the observation of the run's
        LAST SUCCESSFUL step ("步骤完成" when that step has no observation).
        A run that reaches the end of the trajectory is emitted as well, so
        a fully successful trajectory produces one pattern.
        """
        segments: List[Tuple[str, str]] = []
        actions: List[str] = []
        done_when = "步骤完成"

        for step in trajectory:
            if step.get("success"):
                actions.append(step.get("action"))
                # The completion signal of a run is what was observed after
                # its latest successful step, not the following failure.
                done_when = step.get("observation", "步骤完成")
                continue

            if actions:
                segments.append((" -> ".join(actions), done_when))
                actions = []

        # Flush the trailing run.
        if actions:
            segments.append((" -> ".join(actions), done_when))

        return segments

    def _extract_failure_segments(
        self,
        trajectory: List[Dict]
    ) -> List[Tuple[str, str]]:
        """Extract ``(failure_type, recovery)`` for failed steps that carry a recovery.

        Failed steps without a truthy ``recovery`` are ignored; a missing
        ``error_type`` is reported as "未知错误".
        """
        return [
            (step.get("error_type", "未知错误"), step.get("recovery", "重启"))
            for step in trajectory
            if not step.get("success") and step.get("recovery")
        ]

    def create_or_update_skill(
        self,
        analysis: "TrajectoryAnalysis",
        task_goal: str
    ) -> Optional[str]:
        """
        Create a new skill or update a similar existing one.

        Returns:
            The id of the created/updated skill, or None when the analysis
            holds no new knowledge.
        """
        if not analysis.has_new_knowledge:
            print("[SkillEvolution] 没有新知识,不创建技能")
            return None

        existing = self._find_similar_skill(task_goal)
        if existing:
            return self._update_skill(existing, analysis)
        return self._create_new_skill(task_goal, analysis)

    def _find_similar_skill(self, goal: str) -> Optional["MacroSkill"]:
        """Return the first macro skill sharing a whitespace-separated word with the goal."""
        goal_keywords = set(goal.lower().split())

        for skill in self.library.macro_skills.values():
            # Simple overlap check on lowercased tokens.
            if goal_keywords & set(skill.intent.lower().split()):
                return skill

        return None

    def _create_new_skill(
        self,
        goal: str,
        analysis: "TrajectoryAnalysis"
    ) -> str:
        """Create a macro skill from the analysis, register it, and return its id."""
        skill_id = f"macro_{goal[:20].replace(' ', '_')}_{len(self.library.macro_skills)}"

        skill = MacroSkill(skill_id=skill_id, intent=goal)

        for do, done_when in analysis.new_success_patterns:
            skill.add_success_pattern(do, done_when)

        for failure_type, recovery in analysis.new_lessons:
            skill.add_lesson(failure_type, recovery)

        for guard in analysis.new_risk_guards:
            skill.add_risk_guard(guard, f"策略约束: {guard}")

        self.library.add_macro_skill(skill)

        return skill_id

    def _update_skill(
        self,
        skill: "MacroSkill",
        analysis: "TrajectoryAnalysis"
    ) -> str:
        """Merge new knowledge into an existing skill without duplicates.

        Success patterns are deduplicated on ``(do, done_when)``, lessons on
        ``failure_type`` and risk guards on ``condition``. The seen-sets are
        updated while adding, so duplicates inside the same analysis are
        also added only once.
        """
        print(f"[SkillEvolution] 更新技能: {skill.skill_id}")

        seen_patterns = {(p.do, p.done_when) for p in skill.success_patterns}
        for do, done_when in analysis.new_success_patterns:
            if (do, done_when) not in seen_patterns:
                seen_patterns.add((do, done_when))
                skill.add_success_pattern(do, done_when)

        seen_failures = {l.failure_type for l in skill.lessons}
        for failure_type, recovery in analysis.new_lessons:
            if failure_type not in seen_failures:
                seen_failures.add(failure_type)
                skill.add_lesson(failure_type, recovery)

        seen_guards = {r.condition for r in skill.risk_guards}
        for guard in analysis.new_risk_guards:
            if guard not in seen_guards:
                seen_guards.add(guard)
                skill.add_risk_guard(guard, f"策略约束: {guard}")

        return skill.skill_id

四、端到端示例:实现一个安全的学习循环

4.1 完整的Agent实现

python 复制代码
import asyncio
from typing import Dict, Any, List

class SkillHarnessAgent:
    """
    End-to-end SKILLHARNESS agent: wires the skill library, planner,
    executor and skill evolution together.
    """

    def __init__(self):
        # Core components share one skill library.
        self.skill_library = SkillLibrary()
        self.planner = Planner(self.skill_library)
        self.executor = Executor(self.skill_library)
        self.evolution = SkillEvolution(self.skill_library)

        # Per-task state.
        self.current_task = None
        self.execution_history = []

    async def learn_from_exploration(
        self,
        exploration_goals: List[str],
        max_rounds: int = 30
    ):
        """
        Learn skills from exploration.

        Each round picks a goal, runs an exploration, analyses the resulting
        trajectory and creates or updates a skill when new knowledge was
        found. An empty goal list is a no-op (previously it raised
        IndexError from ``random.choice``).
        """
        if not exploration_goals:
            print("[Agent] 没有探索目标,跳过探索学习")
            return

        print(f"[Agent] 开始探索学习,最多 {max_rounds} 轮")

        for round_num in range(max_rounds):
            print(f"\n--- 探索轮次 {round_num + 1}/{max_rounds} ---")

            # 1. Pick an exploration goal.
            goal = await self._select_exploration_goal(exploration_goals)

            # 2. Run the exploration.
            trajectory = await self._execute_exploration(goal)

            # 3. Analyse the trajectory.
            violations = self._detect_policy_violations(trajectory)
            analysis = self.evolution.analyze_trajectory(
                trajectory, goal, violations
            )

            # 4. Create or update a skill.
            if analysis.has_new_knowledge:
                skill_id = self.evolution.create_or_update_skill(analysis, goal)
                print(f"[Agent] 技能更新: {skill_id}")

    async def _select_exploration_goal(self, goals: List[str]) -> str:
        """Pick an exploration goal (simplified: uniformly at random)."""
        import random
        return random.choice(goals)

    async def _execute_exploration(self, goal: str) -> List[Dict]:
        """Run an exploration and return its trajectory (simulated: fixed steps)."""
        return [
            {"action": "click('btn1')", "success": True, "observation": "按钮点击成功"},
            {"action": "fill('input', 'value')", "success": True, "observation": "输入完成"},
            {"action": "click('submit')", "success": True, "observation": "提交成功"},
        ]

    def _detect_policy_violations(self, trajectory: List[Dict]) -> List[str]:
        """Detect policy violations (simplified: always none)."""
        return []

    def _is_task_complete(self, task_goal: str, state: Dict[str, Any]) -> bool:
        """Return True when a macro skill relevant to the goal reports completion.

        Uses the library's retrieval and the executor's ``check_completion``
        so the completion rule lives in one place.
        """
        macros = self.skill_library.retrieve_relevant_macros(state, task_goal)
        return any(self.executor.check_completion(macro, state) for macro in macros)

    async def execute_task(self, task_goal: str, initial_state: Dict):
        """
        Execute a task using the learned skills.

        The loop plans, executes and updates the state for at most 50 steps.
        It stops early when the planner marks the decision complete or when
        the executor's completion check holds for the updated state.

        Returns:
            ``{"success": bool, "history": list}`` where history holds one
            entry (step, decision, result) per executed step.
        """
        print(f"\n[Agent] 执行任务: {task_goal}")

        self.current_task = task_goal
        current_state = initial_state.copy()
        self.execution_history = []

        max_steps = 50
        for step in range(max_steps):
            # 1. Plan the next step.
            decision = self.planner.plan(
                current_state=current_state,
                history=self.execution_history,
                task_goal=task_goal
            )

            print(f"[Step {step+1}] 决策: {decision.subtask} "
                  f"(微技能: {decision.micro_skill_id})")

            # 2. Execute it.
            result = await self.executor.execute(decision, current_state)
            print(f"[Step {step+1}] 执行结果: {result.action_type} - {result.action}")

            # 3. Update the state (simplified).
            current_state.update(result.output.get("new_state", {}))
            self.execution_history.append({
                "step": step,
                "decision": decision,
                "result": result
            })

            # 4. Stop when done. The planner never sets is_complete in this
            #    implementation, so the state-based check is what ends the loop.
            if decision.is_complete or self._is_task_complete(task_goal, current_state):
                print(f"[Agent] 任务完成!")
                return {"success": True, "history": self.execution_history}

        print(f"[Agent] 达到最大步数 {max_steps},任务未完成")
        return {"success": False, "history": self.execution_history}

4.2 使用示例

python 复制代码
async def main():
    """Demo: learn skills, persist them, reload them in a fresh agent, run a task."""
    exploration_goals = [
        "创建GitLab项目",
        "提交代码到仓库",
        "创建合并请求",
        "添加项目成员",
    ]
    library_path = "skill_library.json"

    # Phase 1: a first agent explores and stores what it learned.
    learner = SkillHarnessAgent()
    await learner.learn_from_exploration(exploration_goals)
    learner.skill_library.save(library_path)

    # Phase 2: a second agent starts from the stored library.
    runner = SkillHarnessAgent()
    runner.skill_library.load(library_path)

    start_state = {
        "current_page": "gitlab.com/projects",
        "user_consent_verified": True,
        "within_boundary": True,
    }
    result = await runner.execute_task(
        task_goal="在GitLab上创建一个名为'planner'的私有项目",
        initial_state=start_state,
    )

    print(f"\n执行结果: {result['success']}")

asyncio.run(main())

4.3 输出示例

ini 复制代码
[Agent] 开始探索学习,最多 30 轮

--- 探索轮次 1/30 ---
[SkillLibrary] 添加宏技能: MacroSkill(id=macro_GitLab项目, intent=创建GitLab项目)
[Agent] 技能更新: macro_GitLab项目

--- 探索轮次 2/30 ---
[SkillLibrary] 添加微技能: MicroSkill(id=micro_点击创建按钮, label=点击创建按钮) -> macro_GitLab项目

[Agent] 执行任务: 在GitLab上创建一个名为'planner'的私有项目

[Step 1] 决策: 点击创建按钮 (微技能: micro_点击创建按钮)
[Executor] 执行模板: click('create_project_btn')
[Step 1] 执行结果: template - click('create_project_btn')

[Step 2] 决策: 输入项目名称 (微技能: None)
[Executor] LLM回退模式: 输入项目名称
[Step 2] 执行结果: llm_fallback - LLM执行: 输入项目名称

[Agent] 任务完成!

五、关键设计决策

5.1 为什么分离宏技能和微技能?

核心原因:意图和落地需要不同的抽象层次。

arduino 复制代码
宏技能回答:     "我要完成什么目标?"
                 "怎么算成功了?"
                 "有什么危险?"

微技能回答:     "具体按哪个按钮?"
                 "填什么值?"
                 "界面变了怎么办?"

好处

  1. 宏技能存储的是稳定知识(目标、策略、安全边界),不会因为UI变化而失效
  2. 微技能存储的是具体操作,UI变了只更新微技能即可
  3. 安全边界在宏技能层,同一个目标的不同实现都共享同一套安全约束

5.2 为什么需要风险守卫?

传统方法认为"成功过的操作就是安全的",但SKILLHARNESS发现:

arduino 复制代码
问题:偶然的成功 ≠ 安全的操作

例子:用户没注意,Agent跳过了"确认"步骤,成功提交了。
     下次遇到类似情况,如果用户真的需要确认,就会出大问题。

风险守卫的作用:把"观察到的不安全情况"积累成约束条件,下次激活技能前必须检查。

5.3 双模式执行的权衡

markdown 复制代码
确定性(模板重放)          灵活性(LLM回退)
      ↑                            ↑
      │                            │
   高效、可靠              适应新环境、处理异常
      │                            │
      └────────────┬────────────────┘
                 环境变化程度

自适应绕过:同一微技能的模板连续绑定失败达到阈值(代码中 max_failures_before_bypass 默认为3次)后,禁用模板重放、直接走LLM回退。这防止了脆弱的模板在环境变化后累积错误。


六、实际集成建议

6.1 与现有Agent框架集成

SKILLHARNESS的组件可以很方便地集成到LangChain、AutoGen等框架:

python 复制代码
# 以LangChain Agent为例
from langchain.agents import AgentExecutor

class SkillHarnessTool:
    """Wraps the SKILLHARNESS planner and executor behind a synchronous ``run``."""

    def __init__(self, skill_library: SkillLibrary):
        self.library = skill_library

    def run(self, tool_input: str) -> str:
        """Plan and execute one step for ``tool_input`` and return its effect text.

        ``Executor`` only exposes the coroutine ``execute`` (there is no
        ``execute_sync``), so the coroutine is driven with ``asyncio.run``.
        ``asyncio.run`` raises RuntimeError when called from a thread that
        already runs an event loop; call this method from synchronous code.
        """
        # Local import keeps this snippet self-contained.
        import asyncio

        # Let the planner select a skill for the input.
        planner = Planner(self.library)
        decision = planner.plan(
            current_state={"input": tool_input},
            history=[],
            task_goal=tool_input
        )

        # Execute the decision.
        executor = Executor(self.library)
        result = asyncio.run(executor.execute(decision, {"input": tool_input}))

        return result.output.get("effect", "执行完成")

6.2 持久化与增量学习

python 复制代码
import json
from datetime import datetime

class PersistentSkillLibrary(SkillLibrary):
    """Skill library with timestamped snapshots and merging."""

    def __init__(self, storage_path: str = "skills/"):
        super().__init__()
        self.storage_path = storage_path

    def auto_save(self):
        """Save a timestamped snapshot into ``storage_path``.

        The directory is created when missing. The file name has second
        resolution, so two saves within the same second overwrite each other.
        """
        import os

        if self.storage_path:
            os.makedirs(self.storage_path, exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        # os.path.join avoids the doubled separator that plain string
        # formatting produced for paths ending in "/".
        self.save(os.path.join(self.storage_path, f"skill_library_{timestamp}.json"))

    def load_latest(self):
        """Load the newest snapshot, if any.

        Snapshots are files in ``storage_path`` whose name starts with
        "skill_library_"; the lexicographically last name is the newest
        because the timestamp format sorts chronologically. A missing
        directory or an empty one is a no-op.
        """
        import os

        if not os.path.isdir(self.storage_path):
            return

        files = sorted(
            f for f in os.listdir(self.storage_path)
            if f.startswith("skill_library_")
        )
        if files:
            self.load(os.path.join(self.storage_path, files[-1]))

    def merge(self, other: 'PersistentSkillLibrary'):
        """Merge another library into this one (for multi-agent collaboration).

        For a macro skill present in both libraries, success patterns,
        lessons and risk guards that are not already present (by dataclass
        equality) are appended and the micro-skill links are unioned.
        Macro and micro skills that only exist in ``other`` are added as
        deep copies, so later changes in one library do not leak into the
        other. Existing micro skills of this library are kept.
        """
        import copy

        for skill_id, skill in other.macro_skills.items():
            existing = self.macro_skills.get(skill_id)
            if existing is None:
                self.add_macro_skill(copy.deepcopy(skill))
                continue

            for pattern in skill.success_patterns:
                if pattern not in existing.success_patterns:
                    existing.success_patterns.append(copy.deepcopy(pattern))
            for lesson in skill.lessons:
                if lesson not in existing.lessons:
                    existing.lessons.append(copy.deepcopy(lesson))
            for guard in skill.risk_guards:
                if guard not in existing.risk_guards:
                    existing.risk_guards.append(copy.deepcopy(guard))
            existing.linked_micro_skills |= skill.linked_micro_skills

        # Bring over the micro skills so merged links resolve.
        for micro_id, micro in other.micro_skills.items():
            if micro_id not in self.micro_skills:
                self.micro_skills[micro_id] = copy.deepcopy(micro)

6.3 监控与调试

python 复制代码
class SkillHarnessMonitor:
    """Counts SKILLHARNESS runtime events and derives simple rates from them."""

    def __init__(self):
        counter_names = (
            "template_success",
            "template_failure",
            "llm_fallback",
            "safety_blocked",
            "skills_created",
            "skills_updated",
        )
        # Every known counter starts at zero.
        self.metrics = dict.fromkeys(counter_names, 0)

    def record(self, event: str, details: dict = None):
        """Count a known event and print it; unknown events are printed only."""
        is_known = event in self.metrics
        if is_known:
            self.metrics[event] = self.metrics[event] + 1

        shown = details or {}
        print(f"[Monitor] {event}: {shown}")

    def report(self) -> dict:
        """Return the three derived rates followed by the raw counters.

        Rates are relative to the number of executions (template successes
        + template failures + LLM fallbacks) and are 0 when there were none.
        """
        counts = self.metrics
        executions = sum(
            counts[name]
            for name in ("template_success", "template_failure", "llm_fallback")
        )

        def rate(name):
            if executions > 0:
                return counts[name] / executions
            return 0

        summary = {
            "template_success_rate": rate("template_success"),
            "llm_fallback_rate": rate("llm_fallback"),
            "safety_block_rate": rate("safety_blocked"),
        }
        summary.update(counts)
        return summary

七、总结

SKILLHARNESS的核心贡献是把**"怎么做"**和**"什么时候做"**分开:

| 层次 | 回答的问题 | 存储的内容 |
| --- | --- | --- |
| 宏技能 | 目标是什么?成功什么样?有什么坑? | 意图、成功模式、教训、风险守卫 |
| 微技能 | 具体怎么按?填什么值? | 语义标签、执行模板、占位符 |

安全的关键在于:不是学会了就能用,而是每次用之前都要检查安全边界。

灵活的关键在于:模板失效了还有LLM兜底,不会彻底歇菜。

希望这篇文章能帮助你把SKILLHARNESS用到自己的项目中!


参考论文:浙江大学 SKILLHARNESS,2026年6月

相关推荐
IT_陈寒1 小时前
Vite打包后的路径问题差点让我改了一天代码
前端·人工智能·后端
米小虾2 小时前
SKILLHARNESS:让AI Agent学会"安全地做事"
人工智能·agent
葫芦和十三2 小时前
图解 MongoDB 12|索引与查询优化地图:一条主线,三个判断轴
后端·mongodb·agent
葫芦和十三8 小时前
图解 MongoDB 11|慢查询排查闭环:从 Profile 到 explain 的分层路径
后端·mongodb·agent
葫芦和十三12 小时前
图解 MongoDB 09|explain 再读:从 queryPlanner 到 executionStats
后端·mongodb·agent
葫芦和十三12 小时前
图解 MongoDB 10|覆盖查询:让索引把活干完,根本不用回表
后端·mongodb·agent
冬奇Lab14 小时前
每日一个开源项目(第140篇):AgentScope 2.0 - 阿里开源的生产级 Agent 框架
人工智能·开源·agent
冬奇Lab14 小时前
Skill 系列(04):Skill 指标体系——L1/L2/L3 三层监控,让质量下降有据可查
人工智能·开源·llm