import gymnasium as gym
import numpy as np
import torch
import torch.nn.functional as F
# import matplotlib.pyplot as plt
from tqdm import tqdm
import time


# ==================== 1. PPO algorithm implementation ====================
class PolicyNet(torch.nn.Module):
    """Policy network: maps a state vector to a probability distribution
    over the discrete actions."""

    def __init__(self, state_dim, hidden_dim, action_dim):
        # BUG FIX: original defined `init` (underscores lost in transit),
        # so the constructor never ran and no layers were created.
        super(PolicyNet, self).__init__()
        self.fc1 = torch.nn.Linear(state_dim, hidden_dim)
        self.fc2 = torch.nn.Linear(hidden_dim, action_dim)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        # Softmax over dim=1: input is expected to be (batch, state_dim).
        return F.softmax(self.fc2(x), dim=1)


class ValueNet(torch.nn.Module):
    """Value network: maps a state vector to a scalar state value V(s)."""

    def __init__(self, state_dim, hidden_dim):
        # BUG FIX: `init` -> `__init__` (see PolicyNet).
        super(ValueNet, self).__init__()
        self.fc1 = torch.nn.Linear(state_dim, hidden_dim)
        self.fc2 = torch.nn.Linear(hidden_dim, 1)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        return self.fc2(x)


class PPO:
    '''PPO algorithm using the clipped surrogate objective.'''

    def __init__(self, state_dim, hidden_dim, action_dim, actor_lr, critic_lr,
                 lmbda, epochs, eps, gamma, device):
        # BUG FIX: `init` -> `__init__` (underscores lost in transit).
        self.actor = PolicyNet(state_dim, hidden_dim, action_dim).to(device)
        self.critic = ValueNet(state_dim, hidden_dim).to(device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=actor_lr)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=critic_lr)
        self.gamma = gamma    # discount factor
        self.lmbda = lmbda    # GAE lambda (bias/variance trade-off)
        self.epochs = epochs  # gradient passes per collected batch
        self.eps = eps        # PPO clip range
        self.device = device

    def save_model(self, filepath):
        '''Save actor/critic weights and optimizer states to `filepath`.'''
        torch.save({
            'actor_state_dict': self.actor.state_dict(),
            'critic_state_dict': self.critic.state_dict(),
            'actor_optimizer_state_dict': self.actor_optimizer.state_dict(),
            'critic_optimizer_state_dict': self.critic_optimizer.state_dict(),
        }, filepath)
        print(f"模型已保存到: {filepath}")

    def load_model(self, filepath):
        '''Load actor/critic weights and optimizer states from `filepath`.'''
        checkpoint = torch.load(filepath, map_location=self.device)
        self.actor.load_state_dict(checkpoint['actor_state_dict'])
        self.critic.load_state_dict(checkpoint['critic_state_dict'])
        self.actor_optimizer.load_state_dict(
            checkpoint['actor_optimizer_state_dict'])
        self.critic_optimizer.load_state_dict(
            checkpoint['critic_optimizer_state_dict'])
        print(f"模型已从 {filepath} 加载")

    def take_action(self, state):
        '''Sample one action from the current policy for a single state.'''
        state = torch.tensor([state], dtype=torch.float).to(self.device)
        probs = self.actor(state)
        action_dist = torch.distributions.Categorical(probs)
        action = action_dist.sample()
        return action.item()

    def update(self, transition_dict):
        '''Run one PPO update from a full episode of transitions.

        `transition_dict` holds parallel lists under the keys
        'states', 'actions', 'next_states', 'rewards', 'dones'.
        '''
        states = torch.tensor(transition_dict['states'],
                              dtype=torch.float).to(self.device)
        actions = torch.tensor(transition_dict['actions']).view(-1, 1).to(
            self.device)
        rewards = torch.tensor(transition_dict['rewards'],
                               dtype=torch.float).view(-1, 1).to(self.device)
        next_states = torch.tensor(transition_dict['next_states'],
                                   dtype=torch.float).to(self.device)
        dones = torch.tensor(transition_dict['dones'],
                             dtype=torch.float).view(-1, 1).to(self.device)
        # TD target bootstraps from the critic; masked at episode end.
        td_target = rewards + self.gamma * self.critic(next_states) * (1 -
                                                                       dones)
        td_delta = td_target - self.critic(states)
        # Generalized Advantage Estimation over the whole episode.
        advantage = compute_advantage(self.gamma, self.lmbda,
                                      td_delta.cpu()).to(self.device)
        # Log-probs under the behavior policy, frozen for the ratio below.
        old_log_probs = torch.log(self.actor(states).gather(1,
                                                            actions)).detach()

        for _ in range(self.epochs):
            log_probs = torch.log(self.actor(states).gather(1, actions))
            ratio = torch.exp(log_probs - old_log_probs)
            surr1 = ratio * advantage
            # Clipping keeps the new policy close to the old one.
            surr2 = torch.clamp(ratio, 1 - self.eps, 1 + self.eps) * advantage
            actor_loss = torch.mean(-torch.min(surr1, surr2))
            critic_loss = torch.mean(
                F.mse_loss(self.critic(states), td_target.detach()))
            self.actor_optimizer.zero_grad()
            self.critic_optimizer.zero_grad()
            actor_loss.backward()
            critic_loss.backward()
            self.actor_optimizer.step()
            self.critic_optimizer.step()


def compute_advantage(gamma, lmbda, td_delta):
    '''Compute GAE (Generalized Advantage Estimation) from TD errors.

    Accumulates backwards: A_t = delta_t + gamma * lambda * A_{t+1}.
    '''
    td_delta = td_delta.detach().numpy()
    advantage_list = []
    advantage = 0.0
    for delta in td_delta[::-1]:
        advantage = gamma * lmbda * advantage + delta
        advantage_list.append(advantage)
    advantage_list.reverse()
    return torch.tensor(advantage_list, dtype=torch.float)
# ==================== 2. Simple rule-based policy demo ====================
def simple_policy_demo(env_name="CartPole-v1", max_episodes=5):
    '''Rule-based demo: push the cart toward whichever side the pole leans.'''
    env = gym.make(env_name, render_mode="human")

    print("\n" + "=" * 60)
    print("1. 简单策略演示")
    print("=" * 60)
    print(f"环境: {env_name}")
    print("策略: 如果杆向右倾斜 → 向右移动")
    print(" 如果杆向左倾斜 → 向左移动")
    print("=" * 60)

    episode_rewards = []
    for episode in range(max_episodes):
        obs, info = env.reset()
        episode_reward = 0
        step_count = 0
        finished = False
        print(f"\n--- 回合 {episode + 1} ---")
        while not finished:
            _pos, _vel, angle, _ang_vel = obs
            # Push right when the pole tilts right, left otherwise.
            action = 1 if angle > 0 else 0
            obs, reward, terminated, truncated, info = env.step(action)
            finished = terminated or truncated
            episode_reward += reward
            step_count += 1
            time.sleep(0.02)  # slow down so the rendering is watchable
        episode_rewards.append(episode_reward)
        print(f" 步数: {step_count:3d}, 奖励: {episode_reward:.0f}")
        time.sleep(1)

    env.close()
    print(f"\n简单策略平均奖励: {np.mean(episode_rewards):.2f}")
    return episode_rewards


# ==================== 3. PPO training (with model saving) ====================
def train_ppo(env_name="CartPole-v1", num_episodes=500, hidden_dim=128,
              save_path=None):
    '''Train an agent with PPO; return (per-episode returns, trained agent).'''
    print("\n" + "=" * 60)
    print("2. PPO算法训练")
    print("=" * 60)

    env = gym.make(env_name)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n

    # Pick GPU when available, otherwise fall back to CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"使用设备: {device}")

    agent = PPO(
        state_dim=state_dim,
        hidden_dim=hidden_dim,
        action_dim=action_dim,
        actor_lr=1e-3,
        critic_lr=1e-2,
        lmbda=0.95,
        epochs=10,
        eps=0.2,
        gamma=0.98,
        device=device,
    )

    return_list = []
    best_reward = -float('inf')

    # Training is split into 10 stages purely for progress reporting.
    for stage in range(10):
        episodes_per_stage = int(num_episodes / 10)
        with tqdm(total=episodes_per_stage, desc=f'阶段 {stage+1}/10') as pbar:
            for ep in range(episodes_per_stage):
                ep_return = 0
                batch = {
                    'states': [], 'actions': [], 'next_states': [],
                    'rewards': [], 'dones': []
                }
                obs, info = env.reset()
                finished = False
                while not finished:
                    action = agent.take_action(obs)
                    next_obs, reward, terminated, truncated, info = env.step(
                        action)
                    finished = terminated or truncated
                    batch['states'].append(obs)
                    batch['actions'].append(action)
                    batch['next_states'].append(next_obs)
                    batch['rewards'].append(reward)
                    batch['dones'].append(finished)
                    obs = next_obs
                    ep_return += reward
                return_list.append(ep_return)
                agent.update(batch)

                # Checkpoint whenever a new best episode return shows up.
                if ep_return > best_reward:
                    best_reward = ep_return
                    if save_path:
                        agent.save_model(
                            save_path.replace('.pth', '_best.pth'))

                if (ep + 1) % 10 == 0:
                    pbar.set_postfix({
                        '平均奖励': f'{np.mean(return_list[-10:]):.2f}'
                    })
                pbar.update(1)

    env.close()

    # Save the final model, creating the target directory if needed.
    if save_path:
        import os
        os.makedirs(os.path.dirname(save_path)
                    if os.path.dirname(save_path) else '.', exist_ok=True)
        agent.save_model(save_path)

    print(f"\nPPO训练完成!最终平均奖励: {np.mean(return_list[-50:]):.2f}")
    print(f"最佳奖励: {best_reward:.2f}")
    return return_list, agent
# ==================== Test the trained PPO model ====================
def test_trained_ppo(agent, env_name="CartPole-v1", num_episodes=5,
                     render=True):
    '''Evaluate a trained PPO agent, optionally rendering each episode.

    Returns the list of per-episode rewards.
    '''
    print("\n" + "=" * 60)
    print("测试训练好的PPO模型")
    print("=" * 60)

    # Only attach a renderer when visualisation is requested.
    render_mode = "human" if render else None
    env = gym.make(env_name, render_mode=render_mode)

    episode_rewards = []
    for episode in range(num_episodes):
        observation, info = env.reset()
        episode_reward = 0
        done = False
        step_count = 0
        while not done:
            action = agent.take_action(observation)
            observation, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated
            episode_reward += reward
            step_count += 1
            if render:
                time.sleep(0.02)  # slow down for human viewing
        episode_rewards.append(episode_reward)
        print(f"回合 {episode + 1}: 步数={step_count:3d}, 奖励={episode_reward:.0f}")
        if render:
            time.sleep(1)

    env.close()
    print(f"\nPPO模型测试平均奖励: {np.mean(episode_rewards):.2f}")
    print(f"最大奖励: {np.max(episode_rewards)}")
    print(f"最小奖励: {np.min(episode_rewards)}")
    return episode_rewards


def load_and_test_model(model_path, env_name="CartPole-v1", num_episodes=5):
    '''Load a saved PPO checkpoint and run the visual test on it.'''
    print("\n" + "=" * 60)
    print("加载保存的PPO模型")
    print("=" * 60)

    # Probe the environment once just to read the space dimensions.
    env = gym.make(env_name)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n
    env.close()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # NOTE: hyperparameters must mirror those used in train_ppo().
    agent = PPO(
        state_dim=state_dim,
        hidden_dim=128,
        action_dim=action_dim,
        actor_lr=1e-3,
        critic_lr=1e-2,
        lmbda=0.95,
        epochs=10,
        eps=0.2,
        gamma=0.98,
        device=device
    )
    agent.load_model(model_path)
    test_trained_ppo(agent, num_episodes=num_episodes)
    return agent


# ==================== 4. Random policy baseline ====================
def random_policy_demo(env_name="CartPole-v1", max_episodes=5):
    '''Baseline: sample actions uniformly at random (no rendering).'''
    env = gym.make(env_name)
    print("\n" + "=" * 60)
    print("0. 随机策略演示")
    print("=" * 60)
    episode_rewards = []
    for episode in range(max_episodes):
        observation, info = env.reset()
        episode_reward = 0
        done = False
        step_count = 0
        while not done:
            action = env.action_space.sample()  # uniformly random action
            observation, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated
            episode_reward += reward
            step_count += 1
        episode_rewards.append(episode_reward)
        print(f"回合 {episode + 1}: 步数={step_count:3d}, 奖励={episode_reward:.0f}")
    env.close()
    print(f"随机策略平均奖励: {np.mean(episode_rewards):.2f}")
    return episode_rewards


# ==================== 5. Comparison plots ====================
def plot_comparison(random_rewards, simple_rewards, ppo_rewards):
    '''Plot per-episode rewards of the three policies side by side.

    BUG FIX: the module-level matplotlib import is commented out, so `plt`
    was an undefined name here (NameError on call); import it locally.
    '''
    import matplotlib.pyplot as plt

    plt.figure(figsize=(15, 5))

    # Random policy
    plt.subplot(1, 3, 1)
    plt.plot(random_rewards, 'r-', alpha=0.7)
    plt.axhline(y=np.mean(random_rewards), color='r', linestyle='--',
                label=f'平均: {np.mean(random_rewards):.1f}')
    plt.title('随机策略')
    plt.xlabel('回合')
    plt.ylabel('奖励')
    plt.legend()
    plt.grid(True, alpha=0.3)

    # Simple rule-based policy
    plt.subplot(1, 3, 2)
    plt.plot(simple_rewards, 'g-', alpha=0.7)
    plt.axhline(y=np.mean(simple_rewards), color='g', linestyle='--',
                label=f'平均: {np.mean(simple_rewards):.1f}')
    plt.title('简单规则策略')
    plt.xlabel('回合')
    plt.ylabel('奖励')
    plt.legend()
    plt.grid(True, alpha=0.3)

    # PPO policy, with a moving average to smooth the noisy returns.
    plt.subplot(1, 3, 3)
    plt.plot(ppo_rewards, 'b-', alpha=0.7)
    if len(ppo_rewards) >= 10:
        moving_avg = np.convolve(ppo_rewards, np.ones(10) / 10, mode='valid')
        plt.plot(range(9, len(ppo_rewards)), moving_avg, 'b--',
                 label='滑动平均(10)')
    plt.axhline(y=np.mean(ppo_rewards[-50:]), color='b', linestyle='--',
                label=f'最终平均: {np.mean(ppo_rewards[-50:]):.1f}')
    plt.title('PPO算法')
    plt.xlabel('回合')
    plt.ylabel('奖励')
    plt.legend()
    plt.grid(True, alpha=0.3)

    plt.suptitle('CartPole-v1 不同策略性能对比', fontsize=16)
    plt.tight_layout()
    plt.show()
# ==================== 7. Main program ====================
if __name__ == "__main__":
    # BUG FIX: original guard was `if name == "main":` (underscores lost in
    # transit), which raises NameError at import time.
    print("=" * 60)
    print("CartPole 强化学习算法对比")
    print("=" * 60)

    import os

    # Where trained weights are checkpointed.
    model_dir = "./saved_models"
    os.makedirs(model_dir, exist_ok=True)
    model_path = os.path.join(model_dir, "ppo_cartpole_final.pth")

    # 1. Random policy (5 episodes)
    print("\n正在运行随机策略...")
    random_rewards = random_policy_demo(max_episodes=5)

    # 2. Simple rule-based policy (5 episodes)
    print("\n正在运行简单策略...")
    simple_rewards = simple_policy_demo(max_episodes=5)

    # 3. Ask whether to retrain or load an existing checkpoint.
    train_new = input("\n是否要重新训练模型?(y/n,默认y): ").lower() != 'n'

    if train_new or not os.path.exists(model_path):
        # Train a fresh model.
        print("\n正在训练PPO算法(300回合)...")
        ppo_rewards, trained_agent = train_ppo(num_episodes=300,
                                               save_path=model_path)
        # Plot the training curve if matplotlib is available.
        try:
            import matplotlib.pyplot as plt
            plt.figure(figsize=(10, 5))
            plt.plot(ppo_rewards, alpha=0.6, label='每回合奖励')
            if len(ppo_rewards) >= 50:
                moving_avg = np.convolve(ppo_rewards, np.ones(50) / 50,
                                         mode='valid')
                plt.plot(range(49, len(ppo_rewards)), moving_avg, 'r-',
                         linewidth=2, label='滑动平均(50)')
            plt.xlabel('回合')
            plt.ylabel('奖励')
            plt.title('PPO训练曲线')
            plt.legend()
            plt.grid(True, alpha=0.3)
            plt.savefig(os.path.join(model_dir, 'training_curve.png'))
            plt.show()
        except Exception:
            # FIX: bare `except:` also swallowed SystemExit/KeyboardInterrupt.
            print("matplotlib不可用,跳过绘图")
    else:
        # A checkpoint exists; offer to load it instead of retraining.
        print(f"\n找到已保存的模型: {model_path}")
        load_model = input("是否加载已有模型?(y/n,默认y): ").lower() != 'n'
        if load_model:
            # Probe the environment once for the space dimensions.
            env = gym.make("CartPole-v1")
            state_dim = env.observation_space.shape[0]
            action_dim = env.action_space.n
            env.close()
            device = torch.device(
                "cuda" if torch.cuda.is_available() else "cpu")
            # NOTE: hyperparameters mirror train_ppo().
            trained_agent = PPO(
                state_dim=state_dim,
                hidden_dim=128,
                action_dim=action_dim,
                actor_lr=1e-3,
                critic_lr=1e-2,
                lmbda=0.95,
                epochs=10,
                eps=0.2,
                gamma=0.98,
                device=device
            )
            trained_agent.load_model(model_path)
        else:
            # Retrain anyway.
            print("\n重新训练PPO算法...")
            ppo_rewards, trained_agent = train_ppo(num_episodes=300,
                                                   save_path=model_path)

    # 4. Test the trained model (with rendering).
    print("\n" + "=" * 60)
    print("测试训练好的PPO模型")
    print("=" * 60)
    test_episodes = int(input("请输入要测试的回合数(默认5): ") or "5")
    test_trained_ppo(trained_agent, num_episodes=test_episodes, render=True)

    # 5. Side-by-side visual comparison of the three policies.
    print("\n" + "=" * 60)
    print("三种策略可视化对比")
    print("=" * 60)
    compare_episodes = int(input("请输入每个策略对比的回合数(默认2): ") or "2")

    # BUG FIX: the original random-policy lambda called gym.make() on every
    # single step, creating (and never closing) a new environment each time.
    # Sample from one action space created up front instead.
    _space_env = gym.make("CartPole-v1")
    random_action_space = _space_env.action_space
    _space_env.close()

    strategies = [
        ("随机策略", lambda obs: random_action_space.sample()),
        ("简单策略", lambda obs: 1 if obs[2] > 0 else 0),
        ("PPO策略", lambda obs: trained_agent.take_action(obs))
    ]

    for strategy_name, strategy_func in strategies:
        print(f"\n--- 观察 {strategy_name} ---")
        env = gym.make("CartPole-v1", render_mode="human")
        for episode in range(compare_episodes):
            observation, info = env.reset()
            episode_reward = 0
            done = False
            step_count = 0
            while not done:
                action = strategy_func(observation)
                observation, reward, terminated, truncated, info = env.step(
                    action)
                done = terminated or truncated
                episode_reward += reward
                step_count += 1
                time.sleep(0.02)
            print(f" 回合 {episode + 1}: 步数={step_count:3d}, 奖励={episode_reward:.0f}")
            time.sleep(1)
        env.close()

    print("\n" + "=" * 60)
    print("最终总结")
    print("=" * 60)
    print("1. 随机策略:表现最差,完全随机移动")
    print("2. 简单策略:基于规则的策略,能基本平衡")
    print("3. PPO策略:通过学习得到的最优策略,表现最好")
    print(f"\n模型已保存在: {model_path}")
    print("=" * 60)
一个完整的PPO(Proximal Policy Optimization,近端策略优化)算法实现 ,用于解决OpenAI Gym中的CartPole-v1(车杆平衡)问题。让我详细解释整个代码结构和PPO算法原理:
📊 CartPole-v1 环境介绍
CartPole是一个经典的强化学习问题:
-
状态空间(4维):小车位置、小车速度、杆的角度、杆的角速度
-
动作空间(2个离散动作):向左移动(0)、向右移动(1)
-
目标:尽可能长时间保持杆子直立,每坚持一步得1分,最多500分
-
终止条件:杆倾斜超过12度或小车位置超出±2.4的边界
🧠 PPO算法详解
PPO的核心思想
PPO是OpenAI在2017年提出的算法,它解决了传统策略梯度方法的两个关键问题:
-
更新步长难确定:步长太小学习慢,步长太大容易崩溃
-
样本效率低:每个样本只用一次就丢弃
PPO的创新点在于使用"裁剪"机制限制策略更新的幅度,确保新策略不会离旧策略太远。
PPO算法流程
text
1. 收集一批经验数据(状态、动作、奖励等)
2. 计算优势函数(评估动作好坏)
3. 计算新旧策略的比率
4. 裁剪比率,限制更新幅度
5. 更新策略网络和价值网络
6. 重复以上步骤
📝 代码结构解析
1. 神经网络定义
python
class PolicyNet(torch.nn.Module): # 策略网络:决定要做什么动作
# 输入:状态 -> 隐藏层 -> 输出:动作概率分布
class ValueNet(torch.nn.Module): # 价值网络:评估当前状态的好坏
# 输入:状态 -> 隐藏层 -> 输出:状态价值
策略网络 输出每个动作的概率(如向左60%,向右40%),价值网络输出一个数值表示当前状态的价值。
2. PPO主类
python
class PPO:
def take_action(self, state): # 根据策略选择动作
# 从概率分布中采样动作
def update(self, transition_dict): # 核心更新逻辑
# 1. 计算TD目标和优势函数
# 2. 计算新旧策略比率
# 3. 裁剪比率,计算损失
# 4. 更新网络
3. 关键数学概念
优势函数(GAE - Generalized Advantage Estimation)
python
def compute_advantage(gamma, lmbda, td_delta):
优势函数回答:"在这个状态下采取这个动作,比平均情况好多少?"
-
正值:这个动作比平均好,应该增加它的概率
-
负值:这个动作比平均差,应该减少它的概率
PPO的核心损失函数
python
ratio = torch.exp(log_probs - old_log_probs) # 新旧策略比率
surr1 = ratio * advantage # 原始目标
surr2 = torch.clamp(ratio, 1-eps, 1+eps) * advantage # 裁剪后的目标
actor_loss = torch.mean(-torch.min(surr1, surr2)) # 取最小值
这个设计的妙处在于:
-
如果
advantage > 0(好动作),但ratio > 1+eps(变化太大),就裁剪到1+eps -
如果
advantage < 0(坏动作),但ratio < 1-eps(变化太大),就裁剪到1-eps
🔄 三种策略对比
1. 随机策略
python
action = env.action_space.sample() # 完全随机选择
-
表现:平均20-30分
-
特点:没有任何学习,纯随机探索
2. 简单规则策略
python
if angle > 0: action = 1 # 向右
else: action = 0 # 向左
-
表现:平均50-80分
-
特点:基于简单规则,但无法处理复杂情况
3. PPO策略
-
表现:平均300-500分
-
特点:通过学习得到最优策略,能自适应各种情况
🎯 训练过程可视化
python
def train_ppo(num_episodes=300):
# 每个episode收集数据
# 每10个episode计算平均奖励
# 保存最佳模型
训练曲线通常呈现:
-
初期:表现类似随机策略(探索阶段)
-
中期:快速提升(找到有效策略)
-
后期:收敛到最优(稳定在450-500分)
💾 模型保存与加载
python
def save_model(self, filepath): # 保存模型权重和优化器状态
def load_model(self, filepath): # 加载训练好的模型
这样可以在训练后反复测试,无需重新训练。
📈 代码执行流程
-
随机策略演示:展示基线表现
-
简单规则策略:展示简单启发式方法
-
PPO训练:学习最优策略
-
模型保存:保存最佳模型
-
测试可视化:观察训练好的智能体表现
-
对比分析:直观比较三种策略
⚙️ 关键超参数解释
python
actor_lr=1e-3, # 策略网络学习率
critic_lr=1e-2, # 价值网络学习率
lmbda=0.95, # GAE参数,平衡方差和偏差
epochs=10, # 每批数据复用次数
eps=0.2, # 裁剪范围,控制更新幅度
gamma=0.98, # 折扣因子,关注长期回报
🎨 代码特色
-
模块化设计:策略网络、价值网络、PPO算法分离
-
设备自适应:自动选择CPU/GPU
-
模型持久化:自动保存最佳模型
-
多策略对比:直观展示算法优势
-
可视化支持:训练曲线和实时渲染
💡 实际应用建议
-
初学者:从简单策略开始理解
-
研究者:可以调整超参数观察影响
-
开发者:可以应用到其他Gym环境