PPO 示例

import gymnasium as gym
import numpy as np
import torch
import torch.nn.functional as F
# import matplotlib.pyplot as plt
from tqdm import tqdm
import time


# ==================== 1. PPO算法实现 ====================
class PolicyNet(torch.nn.Module):
    """Policy network: maps a state to a softmax distribution over actions."""

    def __init__(self, state_dim, hidden_dim, action_dim):
        # Restored __init__ (the scraped source had `init`, which would skip
        # nn.Module initialisation and crash on attribute assignment).
        super(PolicyNet, self).__init__()
        self.fc1 = torch.nn.Linear(state_dim, hidden_dim)
        self.fc2 = torch.nn.Linear(hidden_dim, action_dim)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        # dim=1: softmax over actions for each row of a (batch, action_dim) input.
        return F.softmax(self.fc2(x), dim=1)


class ValueNet(torch.nn.Module):
    """Value network: estimates a scalar state value V(s)."""

    def __init__(self, state_dim, hidden_dim):
        super(ValueNet, self).__init__()
        self.fc1 = torch.nn.Linear(state_dim, hidden_dim)
        self.fc2 = torch.nn.Linear(hidden_dim, 1)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        return self.fc2(x)


class PPO:
    """PPO with the clipped surrogate objective (PPO-Clip)."""

    def __init__(self, state_dim, hidden_dim, action_dim, actor_lr, critic_lr,
                 lmbda, epochs, eps, gamma, device):
        self.actor = PolicyNet(state_dim, hidden_dim, action_dim).to(device)
        self.critic = ValueNet(state_dim, hidden_dim).to(device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=actor_lr)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=critic_lr)
        self.gamma = gamma    # discount factor
        self.lmbda = lmbda    # GAE lambda
        self.epochs = epochs  # gradient passes per collected batch
        self.eps = eps        # clip range for the probability ratio
        self.device = device

    def save_model(self, filepath):
        """Save actor/critic weights and optimizer states to *filepath*."""
        torch.save({
            'actor_state_dict': self.actor.state_dict(),
            'critic_state_dict': self.critic.state_dict(),
            'actor_optimizer_state_dict': self.actor_optimizer.state_dict(),
            'critic_optimizer_state_dict': self.critic_optimizer.state_dict(),
        }, filepath)
        print(f"模型已保存到: {filepath}")

    def load_model(self, filepath):
        """Load a checkpoint produced by save_model onto the current device."""
        checkpoint = torch.load(filepath, map_location=self.device)
        self.actor.load_state_dict(checkpoint['actor_state_dict'])
        self.critic.load_state_dict(checkpoint['critic_state_dict'])
        self.actor_optimizer.load_state_dict(
            checkpoint['actor_optimizer_state_dict'])
        self.critic_optimizer.load_state_dict(
            checkpoint['critic_optimizer_state_dict'])
        print(f"模型已从 {filepath} 加载")

    def take_action(self, state):
        """Sample one action from the current policy for a single state."""
        # np.array first: building a tensor from a list of ndarrays is slow
        # and warns in recent PyTorch versions.
        state = torch.tensor(np.array([state]), dtype=torch.float).to(self.device)
        probs = self.actor(state)
        action_dist = torch.distributions.Categorical(probs)
        action = action_dist.sample()
        return action.item()

    def update(self, transition_dict):
        """Run `epochs` clipped-PPO gradient steps on one batch of transitions."""
        states = torch.tensor(np.array(transition_dict['states']),
                              dtype=torch.float).to(self.device)
        actions = torch.tensor(transition_dict['actions']).view(-1, 1).to(
            self.device)
        rewards = torch.tensor(transition_dict['rewards'],
                               dtype=torch.float).view(-1, 1).to(self.device)
        next_states = torch.tensor(np.array(transition_dict['next_states']),
                                   dtype=torch.float).to(self.device)
        dones = torch.tensor(transition_dict['dones'],
                             dtype=torch.float).view(-1, 1).to(self.device)
        # One-step TD target; (1 - dones) zeroes the bootstrap on terminal states.
        td_target = rewards + self.gamma * self.critic(next_states) * (1 - dones)
        td_delta = td_target - self.critic(states)
        # Advantages via GAE (computed on CPU, then moved back to the device).
        advantage = compute_advantage(self.gamma, self.lmbda,
                                      td_delta.cpu()).to(self.device)
        # Log-probs of the behavior policy, frozen for the ratio denominator.
        old_log_probs = torch.log(self.actor(states).gather(1,
                                                            actions)).detach()
        for _ in range(self.epochs):
            log_probs = torch.log(self.actor(states).gather(1, actions))
            ratio = torch.exp(log_probs - old_log_probs)
            surr1 = ratio * advantage
            surr2 = torch.clamp(ratio, 1 - self.eps, 1 + self.eps) * advantage
            # Pessimistic bound: minimize the negated minimum of the two surrogates.
            actor_loss = torch.mean(-torch.min(surr1, surr2))
            critic_loss = torch.mean(
                F.mse_loss(self.critic(states), td_target.detach()))
            self.actor_optimizer.zero_grad()
            self.critic_optimizer.zero_grad()
            actor_loss.backward()
            critic_loss.backward()
            self.actor_optimizer.step()
            self.critic_optimizer.step()


def compute_advantage(gamma, lmbda, td_delta):
    """Generalized Advantage Estimation (GAE) from per-step TD residuals."""
    td_delta = td_delta.detach().numpy()
    advantage_list = []
    advantage = 0.0
    # Accumulate discounted residuals backwards through the trajectory.
    for delta in td_delta[::-1]:
        advantage = gamma * lmbda * advantage + delta
        advantage_list.append(advantage)
    advantage_list.reverse()
    return torch.tensor(np.array(advantage_list), dtype=torch.float)
# ==================== 2. 简单策略演示 ====================
def simple_policy_demo(env_name="CartPole-v1", max_episodes=5):
    """Rule-based demo policy: push toward the side the pole leans to."""
    env = gym.make(env_name, render_mode="human")
    print("\n" + "=" * 60)
    print("1. 简单策略演示")
    print("=" * 60)
    print(f"环境: {env_name}")
    print("策略: 如果杆向右倾斜 → 向右移动")
    print(" 如果杆向左倾斜 → 向左移动")
    print("=" * 60)
    episode_rewards = []
    for episode in range(max_episodes):
        observation, info = env.reset()
        episode_reward = 0
        done = False
        step_count = 0
        print(f"\n--- 回合 {episode + 1} ---")
        while not done:
            position, velocity, angle, angular_velocity = observation
            # Simple rule: positive angle means the pole leans right.
            if angle > 0:
                action = 1  # push right
            else:
                action = 0  # push left
            observation, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated
            episode_reward += reward
            step_count += 1
            # Slow down so a human can follow the rendering.
            time.sleep(0.02)
        episode_rewards.append(episode_reward)
        print(f" 步数: {step_count:3d}, 奖励: {episode_reward:.0f}")
        time.sleep(1)
    env.close()
    print(f"\n简单策略平均奖励: {np.mean(episode_rewards):.2f}")
    return episode_rewards


# ==================== 3. PPO训练函数(支持保存模型) ====================
def train_ppo(env_name="CartPole-v1", num_episodes=500, hidden_dim=128,
              save_path=None):
    """Train a PPO agent on *env_name*; return (return_list, agent).

    If *save_path* is given, the best-episode checkpoint is written to
    ``<save_path stem>_best.pth`` during training and the final model to
    *save_path* afterwards.
    """
    print("\n" + "=" * 60)
    print("2. PPO算法训练")
    print("=" * 60)
    # Create the environment (no rendering during training).
    env = gym.make(env_name)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n
    # Pick GPU when available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"使用设备: {device}")
    agent = PPO(
        state_dim=state_dim,
        hidden_dim=hidden_dim,
        action_dim=action_dim,
        actor_lr=1e-3,
        critic_lr=1e-2,
        lmbda=0.95,
        epochs=10,
        eps=0.2,
        gamma=0.98,
        device=device
    )
    return_list = []
    best_reward = -float('inf')
    # Training loop: 10 stages, each with num_episodes/10 episodes.
    for i in range(10):
        with tqdm(total=int(num_episodes / 10), desc=f'阶段 {i+1}/10') as pbar:
            for i_episode in range(int(num_episodes / 10)):
                episode_return = 0
                transition_dict = {
                    'states': [], 'actions': [], 'next_states': [],
                    'rewards': [], 'dones': []
                }
                observation, info = env.reset()
                done = False
                # Collect one full episode of on-policy transitions.
                while not done:
                    action = agent.take_action(observation)
                    next_observation, reward, terminated, truncated, info = \
                        env.step(action)
                    done = terminated or truncated
                    transition_dict['states'].append(observation)
                    transition_dict['actions'].append(action)
                    transition_dict['next_states'].append(next_observation)
                    transition_dict['rewards'].append(reward)
                    transition_dict['dones'].append(done)
                    observation = next_observation
                    episode_return += reward
                return_list.append(episode_return)
                agent.update(transition_dict)
                # Checkpoint whenever a new best episode return is reached.
                if episode_return > best_reward:
                    best_reward = episode_return
                    if save_path:
                        best_path = save_path.replace('.pth', '_best.pth')
                        agent.save_model(best_path)
                if (i_episode + 1) % 10 == 0:
                    pbar.set_postfix({
                        '平均奖励': f'{np.mean(return_list[-10:]):.2f}'
                    })
                pbar.update(1)
    env.close()
    # Save the final model.
    if save_path:
        import os
        os.makedirs(os.path.dirname(save_path) if os.path.dirname(save_path)
                    else '.', exist_ok=True)
        agent.save_model(save_path)
    print(f"\nPPO训练完成!最终平均奖励: {np.mean(return_list[-50:]):.2f}")
    print(f"最佳奖励: {best_reward:.2f}")
    return return_list, agent
# ==================== 6. 测试训练好的PPO模型 ====================
def test_trained_ppo(agent, env_name="CartPole-v1", num_episodes=5,
                     render=True):
    """Run a trained PPO agent for *num_episodes*, optionally rendering."""
    print("\n" + "=" * 60)
    print("测试训练好的PPO模型")
    print("=" * 60)
    # Only open a render window when requested.
    render_mode = "human" if render else None
    env = gym.make(env_name, render_mode=render_mode)
    episode_rewards = []
    for episode in range(num_episodes):
        observation, info = env.reset()
        episode_reward = 0
        done = False
        step_count = 0
        while not done:
            action = agent.take_action(observation)
            observation, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated
            episode_reward += reward
            step_count += 1
            if render:
                time.sleep(0.02)
        episode_rewards.append(episode_reward)
        print(f"回合 {episode + 1}: 步数={step_count:3d}, 奖励={episode_reward:.0f}")
        if render:
            time.sleep(1)
    env.close()
    print(f"\nPPO模型测试平均奖励: {np.mean(episode_rewards):.2f}")
    print(f"最大奖励: {np.max(episode_rewards)}")
    print(f"最小奖励: {np.min(episode_rewards)}")
    return episode_rewards


def load_and_test_model(model_path, env_name="CartPole-v1", num_episodes=5):
    """Rebuild a PPO agent, load weights from *model_path*, and test it."""
    print("\n" + "=" * 60)
    print("加载保存的PPO模型")
    print("=" * 60)
    # Probe the environment once for state/action dimensions.
    env = gym.make(env_name)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n
    env.close()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Hyperparameters must match those used at training time.
    agent = PPO(
        state_dim=state_dim,
        hidden_dim=128,
        action_dim=action_dim,
        actor_lr=1e-3,
        critic_lr=1e-2,
        lmbda=0.95,
        epochs=10,
        eps=0.2,
        gamma=0.98,
        device=device
    )
    agent.load_model(model_path)
    test_trained_ppo(agent, num_episodes=num_episodes)
    return agent


# ==================== 4. 随机策略对比 ====================
def random_policy_demo(env_name="CartPole-v1", max_episodes=5):
    """Baseline demo: sample actions uniformly at random."""
    env = gym.make(env_name)
    print("\n" + "=" * 60)
    print("0. 随机策略演示")
    print("=" * 60)
    episode_rewards = []
    for episode in range(max_episodes):
        observation, info = env.reset()
        episode_reward = 0
        done = False
        step_count = 0
        while not done:
            action = env.action_space.sample()  # uniform random action
            observation, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated
            episode_reward += reward
            step_count += 1
        episode_rewards.append(episode_reward)
        print(f"回合 {episode + 1}: 步数={step_count:3d}, 奖励={episode_reward:.0f}")
    env.close()
    print(f"随机策略平均奖励: {np.mean(episode_rewards):.2f}")
    return episode_rewards


# ==================== 5. 可视化对比 ====================
def plot_comparison(random_rewards, simple_rewards, ppo_rewards):
    """Plot the three strategies' per-episode rewards side by side."""
    # The top-level matplotlib import is commented out; import lazily here so
    # this module stays usable without matplotlib installed.
    import matplotlib.pyplot as plt
    plt.figure(figsize=(15, 5))
    # Random policy
    plt.subplot(1, 3, 1)
    plt.plot(random_rewards, 'r-', alpha=0.7)
    plt.axhline(y=np.mean(random_rewards), color='r', linestyle='--',
                label=f'平均: {np.mean(random_rewards):.1f}')
    plt.title('随机策略')
    plt.xlabel('回合')
    plt.ylabel('奖励')
    plt.legend()
    plt.grid(True, alpha=0.3)
    # Simple rule-based policy
    plt.subplot(1, 3, 2)
    plt.plot(simple_rewards, 'g-', alpha=0.7)
    plt.axhline(y=np.mean(simple_rewards), color='g', linestyle='--',
                label=f'平均: {np.mean(simple_rewards):.1f}')
    plt.title('简单规则策略')
    plt.xlabel('回合')
    plt.ylabel('奖励')
    plt.legend()
    plt.grid(True, alpha=0.3)
    # PPO policy
    plt.subplot(1, 3, 3)
    plt.plot(ppo_rewards, 'b-', alpha=0.7)
    # Moving average smooths the training curve when enough episodes exist.
    if len(ppo_rewards) >= 10:
        moving_avg = np.convolve(ppo_rewards, np.ones(10) / 10, mode='valid')
        plt.plot(range(9, len(ppo_rewards)), moving_avg, 'b--',
                 label='滑动平均(10)')
    plt.axhline(y=np.mean(ppo_rewards[-50:]), color='b', linestyle='--',
                label=f'最终平均: {np.mean(ppo_rewards[-50:]):.1f}')
    plt.title('PPO算法')
    plt.xlabel('回合')
    plt.ylabel('奖励')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.suptitle('CartPole-v1 不同策略性能对比', fontsize=16)
    plt.tight_layout()
    plt.show()
# ==================== 7. 主程序 ====================
if __name__ == "__main__":
    print("=" * 60)
    print("CartPole 强化学习算法对比")
    print("=" * 60)
    import os
    # Where trained checkpoints are stored.
    model_dir = "./saved_models"
    os.makedirs(model_dir, exist_ok=True)
    model_path = os.path.join(model_dir, "ppo_cartpole_final.pth")
    # 1. Random policy baseline (5 episodes)
    print("\n正在运行随机策略...")
    random_rewards = random_policy_demo(max_episodes=5)
    # 2. Simple rule-based policy (5 episodes)
    print("\n正在运行简单策略...")
    simple_rewards = simple_policy_demo(max_episodes=5)
    # 3. Train a new model or load an existing checkpoint.
    train_new = input("\n是否要重新训练模型?(y/n,默认y): ").lower() != 'n'
    if train_new or not os.path.exists(model_path):
        print("\n正在训练PPO算法(300回合)...")
        ppo_rewards, trained_agent = train_ppo(num_episodes=300,
                                               save_path=model_path)
        # Plot the training curve when matplotlib is available.
        try:
            import matplotlib.pyplot as plt
            plt.figure(figsize=(10, 5))
            plt.plot(ppo_rewards, alpha=0.6, label='每回合奖励')
            if len(ppo_rewards) >= 50:
                moving_avg = np.convolve(ppo_rewards, np.ones(50) / 50,
                                         mode='valid')
                plt.plot(range(49, len(ppo_rewards)), moving_avg, 'r-',
                         linewidth=2, label='滑动平均(50)')
            plt.xlabel('回合')
            plt.ylabel('奖励')
            plt.title('PPO训练曲线')
            plt.legend()
            plt.grid(True, alpha=0.3)
            plt.savefig(os.path.join(model_dir, 'training_curve.png'))
            plt.show()
        except Exception:
            print("matplotlib不可用,跳过绘图")
    else:
        print(f"\n找到已保存的模型: {model_path}")
        load_model = input("是否加载已有模型?(y/n,默认y): ").lower() != 'n'
        if load_model:
            # Probe the environment for dimensions, then rebuild the agent.
            env = gym.make("CartPole-v1")
            state_dim = env.observation_space.shape[0]
            action_dim = env.action_space.n
            env.close()
            device = torch.device("cuda" if torch.cuda.is_available()
                                  else "cpu")
            trained_agent = PPO(
                state_dim=state_dim,
                hidden_dim=128,
                action_dim=action_dim,
                actor_lr=1e-3,
                critic_lr=1e-2,
                lmbda=0.95,
                epochs=10,
                eps=0.2,
                gamma=0.98,
                device=device
            )
            trained_agent.load_model(model_path)
        else:
            print("\n重新训练PPO算法...")
            ppo_rewards, trained_agent = train_ppo(num_episodes=300,
                                                   save_path=model_path)
    # 4. Test the trained model with rendering.
    print("\n" + "=" * 60)
    print("测试训练好的PPO模型")
    print("=" * 60)
    test_episodes = int(input("请输入要测试的回合数(默认5): ") or "5")
    test_trained_ppo(trained_agent, num_episodes=test_episodes, render=True)
    # 5. Visual side-by-side comparison of all three strategies.
    print("\n" + "=" * 60)
    print("三种策略可视化对比")
    print("=" * 60)
    compare_episodes = int(input("请输入每个策略对比的回合数(默认2): ") or "2")
    for strategy_name in ("随机策略", "简单策略", "PPO策略"):
        print(f"\n--- 观察 {strategy_name} ---")
        env = gym.make("CartPole-v1", render_mode="human")
        # Bind the action function to the live env (the original built a new
        # env on every single step for the random strategy — a resource leak).
        if strategy_name == "随机策略":
            strategy_func = lambda obs: env.action_space.sample()
        elif strategy_name == "简单策略":
            strategy_func = lambda obs: 1 if obs[2] > 0 else 0
        else:
            strategy_func = lambda obs: trained_agent.take_action(obs)
        for episode in range(compare_episodes):
            observation, info = env.reset()
            episode_reward = 0
            done = False
            step_count = 0
            while not done:
                action = strategy_func(observation)
                observation, reward, terminated, truncated, info = \
                    env.step(action)
                done = terminated or truncated
                episode_reward += reward
                step_count += 1
                time.sleep(0.02)
            print(f" 回合 {episode + 1}: 步数={step_count:3d}, 奖励={episode_reward:.0f}")
            time.sleep(1)
        env.close()
    print("\n" + "=" * 60)
    print("最终总结")
    print("=" * 60)
    print("1. 随机策略:表现最差,完全随机移动")
    print("2. 简单策略:基于规则的策略,能基本平衡")
    print("3. PPO策略:通过学习得到的最优策略,表现最好")
    print(f"\n模型已保存在: {model_path}")
    print("=" * 60)

一个完整的PPO(Proximal Policy Optimization,近端策略优化)算法实现,用于解决 Gymnasium(原 OpenAI Gym)中的CartPole-v1(车杆平衡)问题。让我详细解释整个代码结构和PPO算法原理:

📊 CartPole-v1 环境介绍

CartPole是一个经典的强化学习问题:

  • 状态空间(4维):小车位置、小车速度、杆的角度、杆的角速度

  • 动作空间(2个离散动作):向左移动(0)、向右移动(1)

  • 目标:尽可能长时间保持杆子直立,每坚持一步得1分,最多500分

  • 终止条件:杆倾斜超过12度或小车超出边界(±2.4)

🧠 PPO算法详解

PPO的核心思想

PPO是OpenAI在2017年提出的算法,它解决了传统策略梯度方法的两个关键问题:

  1. 更新步长难确定:步长太小学习慢,步长太大容易崩溃

  2. 样本效率低:每个样本只用一次就丢弃

PPO的创新点在于使用"裁剪"机制限制策略更新的幅度,确保新策略不会离旧策略太远。

PPO算法流程

text

复制代码
1. 收集一批经验数据(状态、动作、奖励等)
2. 计算优势函数(评估动作好坏)
3. 计算新旧策略的比率
4. 裁剪比率,限制更新幅度
5. 更新策略网络和价值网络
6. 重复以上步骤

📝 代码结构解析

1. 神经网络定义

python

复制代码
class PolicyNet(torch.nn.Module):  # 策略网络:决定要做什么动作
    # 输入:状态 -> 隐藏层 -> 输出:动作概率分布
    
class ValueNet(torch.nn.Module):   # 价值网络:评估当前状态的好坏
    # 输入:状态 -> 隐藏层 -> 输出:状态价值

策略网络 输出每个动作的概率(如向左60%,向右40%),价值网络输出一个数值表示当前状态的价值。

2. PPO主类

python

复制代码
class PPO:
    def take_action(self, state):  # 根据策略选择动作
        # 从概率分布中采样动作
        
    def update(self, transition_dict):  # 核心更新逻辑
        # 1. 计算TD目标和优势函数
        # 2. 计算新旧策略比率
        # 3. 裁剪比率,计算损失
        # 4. 更新网络

3. 关键数学概念

优势函数(GAE - Generalized Advantage Estimation)

python

复制代码
def compute_advantage(gamma, lmbda, td_delta):

优势函数回答:"在这个状态下采取这个动作,比平均情况好多少?"

  • 正值:这个动作比平均好,应该增加它的概率

  • 负值:这个动作比平均差,应该减少它的概率

PPO的核心损失函数

python

复制代码
ratio = torch.exp(log_probs - old_log_probs)  # 新旧策略比率
surr1 = ratio * advantage                      # 原始目标
surr2 = torch.clamp(ratio, 1-eps, 1+eps) * advantage  # 裁剪后的目标
actor_loss = torch.mean(-torch.min(surr1, surr2))  # 取最小值

这个设计的妙处在于:

  • 如果advantage > 0(好动作),但ratio > 1+eps(变化太大),就裁剪到1+eps

  • 如果advantage < 0(坏动作),但ratio < 1-eps(变化太大),就裁剪到1-eps

🔄 三种策略对比

1. 随机策略

python

复制代码
action = env.action_space.sample()  # 完全随机选择
  • 表现:平均20-30分

  • 特点:没有任何学习,纯随机探索

2. 简单规则策略

python

复制代码
if angle > 0: action = 1  # 向右
else: action = 0          # 向左
  • 表现:平均50-80分

  • 特点:基于简单规则,但无法处理复杂情况

3. PPO策略

  • 表现:平均300-500分

  • 特点:通过学习得到最优策略,能自适应各种情况

🎯 训练过程可视化

python

复制代码
def train_ppo(num_episodes=300):
    # 每个episode收集数据
    # 每10个episode计算平均奖励
    # 保存最佳模型

训练曲线通常呈现:

  1. 初期:表现类似随机策略(探索阶段)

  2. 中期:快速提升(找到有效策略)

  3. 后期:收敛到最优(稳定在450-500分)

💾 模型保存与加载

python

复制代码
def save_model(self, filepath):  # 保存模型权重和优化器状态
def load_model(self, filepath):  # 加载训练好的模型

这样可以在训练后反复测试,无需重新训练。

📈 代码执行流程

  1. 随机策略演示:展示基线表现

  2. 简单规则策略:展示简单启发式方法

  3. PPO训练:学习最优策略

  4. 模型保存:保存最佳模型

  5. 测试可视化:观察训练好的智能体表现

  6. 对比分析:直观比较三种策略

⚙️ 关键超参数解释

python

复制代码
actor_lr=1e-3,    # 策略网络学习率
critic_lr=1e-2,   # 价值网络学习率
lmbda=0.95,       # GAE参数,平衡方差和偏差
epochs=10,        # 每批数据复用次数
eps=0.2,          # 裁剪范围,控制更新幅度
gamma=0.98,       # 折扣因子,关注长期回报

🎨 代码特色

  1. 模块化设计:策略网络、价值网络、PPO算法分离

  2. 设备自适应:自动选择CPU/GPU

  3. 模型持久化:自动保存最佳模型

  4. 多策略对比:直观展示算法优势

  5. 可视化支持:训练曲线和实时渲染

💡 实际应用建议

  • 初学者:从简单策略开始理解

  • 研究者:可以调整超参数观察影响

  • 开发者:可以应用到其他Gym环境

相关推荐
ws2019072 小时前
湾区锚点,技术聚合:AUTO TECH China 2026广州汽车零部件展启幕在即
人工智能·科技·汽车
GISer_Jing2 小时前
AI Agent交互模式深度解析:浏览器书签&插件进行AI对话
前端·人工智能·aigc·交互
翱翔的苍鹰2 小时前
通过LangChain Agent模拟实现美团外卖下单场景
人工智能·深度学习·语言模型·自然语言处理·langchain·vllm
Agent产品评测局2 小时前
中国龙虾ai软件有哪些选择?2026自动化选型指南
运维·人工智能·ai·chatgpt·自动化
健康人猿2 小时前
Grok成年模式(Spicy Mode)深度解析 | 有多野?怎么启用?
人工智能
duyinbi75172 小时前
感受野坐标注意力卷积改进YOLOv26双向空间加权与自适应通道建模协同突破
人工智能·yolo·目标跟踪
炎爆的土豆翔2 小时前
bitwise_not`性能测试:手写普通循环、AVX2 与 OpenCV 内置实现对比
人工智能·opencv·webpack
INDEMIND2 小时前
牵手海尔、TCL,INDEMIND家用具身陪伴机器人AI平台加速家庭AI陪伴落地
人工智能·机器人·陪伴机器人
wheelmouse77882 小时前
AI 时代的 Git 进阶术:如何优雅地让多个 Agent 并行开发
人工智能·git·ai编程