import gymnasium as gym
import numpy as np
import torch
import torch.nn.functional as F
# import matplotlib.pyplot as plt
from tqdm import tqdm
import time


# ==================== 1. PPO algorithm implementation ====================
class PolicyNet(torch.nn.Module):
    """Policy network: maps a state vector to a probability distribution
    over the discrete actions."""

    def __init__(self, state_dim, hidden_dim, action_dim):
        # BUG FIX: original defined `init` (underscores lost in transit),
        # so the constructor never ran and no layers were created.
        super(PolicyNet, self).__init__()
        self.fc1 = torch.nn.Linear(state_dim, hidden_dim)
        self.fc2 = torch.nn.Linear(hidden_dim, action_dim)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        # Softmax over dim=1: input is expected to be (batch, state_dim).
        return F.softmax(self.fc2(x), dim=1)


class ValueNet(torch.nn.Module):
    """Value network: maps a state vector to a scalar state value V(s)."""

    def __init__(self, state_dim, hidden_dim):
        # BUG FIX: `init` -> `__init__` (see PolicyNet).
        super(ValueNet, self).__init__()
        self.fc1 = torch.nn.Linear(state_dim, hidden_dim)
        self.fc2 = torch.nn.Linear(hidden_dim, 1)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        return self.fc2(x)


class PPO:
    '''PPO algorithm using the clipped surrogate objective.'''

    def __init__(self, state_dim, hidden_dim, action_dim, actor_lr, critic_lr,
                 lmbda, epochs, eps, gamma, device):
        # BUG FIX: `init` -> `__init__` (underscores lost in transit).
        self.actor = PolicyNet(state_dim, hidden_dim, action_dim).to(device)
        self.critic = ValueNet(state_dim, hidden_dim).to(device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=actor_lr)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=critic_lr)
        self.gamma = gamma    # discount factor
        self.lmbda = lmbda    # GAE lambda (bias/variance trade-off)
        self.epochs = epochs  # gradient passes per collected batch
        self.eps = eps        # PPO clip range
        self.device = device

    def save_model(self, filepath):
        '''Save actor/critic weights and optimizer states to `filepath`.'''
        torch.save({
            'actor_state_dict': self.actor.state_dict(),
            'critic_state_dict': self.critic.state_dict(),
            'actor_optimizer_state_dict': self.actor_optimizer.state_dict(),
            'critic_optimizer_state_dict': self.critic_optimizer.state_dict(),
        }, filepath)
        print(f"模型已保存到: {filepath}")

    def load_model(self, filepath):
        '''Load actor/critic weights and optimizer states from `filepath`.'''
        checkpoint = torch.load(filepath, map_location=self.device)
        self.actor.load_state_dict(checkpoint['actor_state_dict'])
        self.critic.load_state_dict(checkpoint['critic_state_dict'])
        self.actor_optimizer.load_state_dict(
            checkpoint['actor_optimizer_state_dict'])
        self.critic_optimizer.load_state_dict(
            checkpoint['critic_optimizer_state_dict'])
        print(f"模型已从 {filepath} 加载")

    def take_action(self, state):
        '''Sample one action from the current policy for a single state.'''
        state = torch.tensor([state], dtype=torch.float).to(self.device)
        probs = self.actor(state)
        action_dist = torch.distributions.Categorical(probs)
        action = action_dist.sample()
        return action.item()

    def update(self, transition_dict):
        '''Run one PPO update from a full episode of transitions.

        `transition_dict` holds parallel lists under the keys
        'states', 'actions', 'next_states', 'rewards', 'dones'.
        '''
        states = torch.tensor(transition_dict['states'],
                              dtype=torch.float).to(self.device)
        actions = torch.tensor(transition_dict['actions']).view(-1, 1).to(
            self.device)
        rewards = torch.tensor(transition_dict['rewards'],
                               dtype=torch.float).view(-1, 1).to(self.device)
        next_states = torch.tensor(transition_dict['next_states'],
                                   dtype=torch.float).to(self.device)
        dones = torch.tensor(transition_dict['dones'],
                             dtype=torch.float).view(-1, 1).to(self.device)
        # TD target bootstraps from the critic; masked at episode end.
        td_target = rewards + self.gamma * self.critic(next_states) * (1 -
                                                                       dones)
        td_delta = td_target - self.critic(states)
        # Generalized Advantage Estimation over the whole episode.
        advantage = compute_advantage(self.gamma, self.lmbda,
                                      td_delta.cpu()).to(self.device)
        # Log-probs under the behavior policy, frozen for the ratio below.
        old_log_probs = torch.log(self.actor(states).gather(1,
                                                            actions)).detach()

        for _ in range(self.epochs):
            log_probs = torch.log(self.actor(states).gather(1, actions))
            ratio = torch.exp(log_probs - old_log_probs)
            surr1 = ratio * advantage
            # Clipping keeps the new policy close to the old one.
            surr2 = torch.clamp(ratio, 1 - self.eps, 1 + self.eps) * advantage
            actor_loss = torch.mean(-torch.min(surr1, surr2))
            critic_loss = torch.mean(
                F.mse_loss(self.critic(states), td_target.detach()))
            self.actor_optimizer.zero_grad()
            self.critic_optimizer.zero_grad()
            actor_loss.backward()
            critic_loss.backward()
            self.actor_optimizer.step()
            self.critic_optimizer.step()


def compute_advantage(gamma, lmbda, td_delta):
    '''Compute GAE (Generalized Advantage Estimation) from TD errors.

    Accumulates backwards: A_t = delta_t + gamma * lambda * A_{t+1}.
    '''
    td_delta = td_delta.detach().numpy()
    advantage_list = []
    advantage = 0.0
    for delta in td_delta[::-1]:
        advantage = gamma * lmbda * advantage + delta
        advantage_list.append(advantage)
    advantage_list.reverse()
    return torch.tensor(advantage_list, dtype=torch.float)
# ==================== 2. Simple rule-based policy demo ====================
def simple_policy_demo(env_name="CartPole-v1", max_episodes=5):
    '''Rule-based demo: push the cart toward whichever side the pole leans.'''
    env = gym.make(env_name, render_mode="human")

    print("\n" + "=" * 60)
    print("1. 简单策略演示")
    print("=" * 60)
    print(f"环境: {env_name}")
    print("策略: 如果杆向右倾斜 → 向右移动")
    print(" 如果杆向左倾斜 → 向左移动")
    print("=" * 60)

    episode_rewards = []
    for episode in range(max_episodes):
        obs, info = env.reset()
        episode_reward = 0
        step_count = 0
        finished = False
        print(f"\n--- 回合 {episode + 1} ---")
        while not finished:
            _pos, _vel, angle, _ang_vel = obs
            # Push right when the pole tilts right, left otherwise.
            action = 1 if angle > 0 else 0
            obs, reward, terminated, truncated, info = env.step(action)
            finished = terminated or truncated
            episode_reward += reward
            step_count += 1
            time.sleep(0.02)  # slow down so the rendering is watchable
        episode_rewards.append(episode_reward)
        print(f" 步数: {step_count:3d}, 奖励: {episode_reward:.0f}")
        time.sleep(1)

    env.close()
    print(f"\n简单策略平均奖励: {np.mean(episode_rewards):.2f}")
    return episode_rewards


# ==================== 3. PPO training (with model saving) ====================
def train_ppo(env_name="CartPole-v1", num_episodes=500, hidden_dim=128,
              save_path=None):
    '''Train an agent with PPO; return (per-episode returns, trained agent).'''
    print("\n" + "=" * 60)
    print("2. PPO算法训练")
    print("=" * 60)

    env = gym.make(env_name)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n

    # Pick GPU when available, otherwise fall back to CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"使用设备: {device}")

    agent = PPO(
        state_dim=state_dim,
        hidden_dim=hidden_dim,
        action_dim=action_dim,
        actor_lr=1e-3,
        critic_lr=1e-2,
        lmbda=0.95,
        epochs=10,
        eps=0.2,
        gamma=0.98,
        device=device,
    )

    return_list = []
    best_reward = -float('inf')

    # Training is split into 10 stages purely for progress reporting.
    for stage in range(10):
        episodes_per_stage = int(num_episodes / 10)
        with tqdm(total=episodes_per_stage, desc=f'阶段 {stage+1}/10') as pbar:
            for ep in range(episodes_per_stage):
                ep_return = 0
                batch = {
                    'states': [], 'actions': [], 'next_states': [],
                    'rewards': [], 'dones': []
                }
                obs, info = env.reset()
                finished = False
                while not finished:
                    action = agent.take_action(obs)
                    next_obs, reward, terminated, truncated, info = env.step(
                        action)
                    finished = terminated or truncated
                    batch['states'].append(obs)
                    batch['actions'].append(action)
                    batch['next_states'].append(next_obs)
                    batch['rewards'].append(reward)
                    batch['dones'].append(finished)
                    obs = next_obs
                    ep_return += reward
                return_list.append(ep_return)
                agent.update(batch)

                # Checkpoint whenever a new best episode return shows up.
                if ep_return > best_reward:
                    best_reward = ep_return
                    if save_path:
                        agent.save_model(
                            save_path.replace('.pth', '_best.pth'))

                if (ep + 1) % 10 == 0:
                    pbar.set_postfix({
                        '平均奖励': f'{np.mean(return_list[-10:]):.2f}'
                    })
                pbar.update(1)

    env.close()

    # Save the final model, creating the target directory if needed.
    if save_path:
        import os
        os.makedirs(os.path.dirname(save_path)
                    if os.path.dirname(save_path) else '.', exist_ok=True)
        agent.save_model(save_path)

    print(f"\nPPO训练完成!最终平均奖励: {np.mean(return_list[-50:]):.2f}")
    print(f"最佳奖励: {best_reward:.2f}")
    return return_list, agent
# ==================== Test the trained PPO model ====================
def test_trained_ppo(agent, env_name="CartPole-v1", num_episodes=5,
                     render=True):
    '''Evaluate a trained PPO agent, optionally rendering each episode.

    Returns the list of per-episode rewards.
    '''
    print("\n" + "=" * 60)
    print("测试训练好的PPO模型")
    print("=" * 60)

    # Only attach a renderer when visualisation is requested.
    render_mode = "human" if render else None
    env = gym.make(env_name, render_mode=render_mode)

    episode_rewards = []
    for episode in range(num_episodes):
        observation, info = env.reset()
        episode_reward = 0
        done = False
        step_count = 0
        while not done:
            action = agent.take_action(observation)
            observation, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated
            episode_reward += reward
            step_count += 1
            if render:
                time.sleep(0.02)  # slow down for human viewing
        episode_rewards.append(episode_reward)
        print(f"回合 {episode + 1}: 步数={step_count:3d}, 奖励={episode_reward:.0f}")
        if render:
            time.sleep(1)

    env.close()
    print(f"\nPPO模型测试平均奖励: {np.mean(episode_rewards):.2f}")
    print(f"最大奖励: {np.max(episode_rewards)}")
    print(f"最小奖励: {np.min(episode_rewards)}")
    return episode_rewards


def load_and_test_model(model_path, env_name="CartPole-v1", num_episodes=5):
    '''Load a saved PPO checkpoint and run the visual test on it.'''
    print("\n" + "=" * 60)
    print("加载保存的PPO模型")
    print("=" * 60)

    # Probe the environment once just to read the space dimensions.
    env = gym.make(env_name)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n
    env.close()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # NOTE: hyperparameters must mirror those used in train_ppo().
    agent = PPO(
        state_dim=state_dim,
        hidden_dim=128,
        action_dim=action_dim,
        actor_lr=1e-3,
        critic_lr=1e-2,
        lmbda=0.95,
        epochs=10,
        eps=0.2,
        gamma=0.98,
        device=device
    )
    agent.load_model(model_path)
    test_trained_ppo(agent, num_episodes=num_episodes)
    return agent


# ==================== 4. Random policy baseline ====================
def random_policy_demo(env_name="CartPole-v1", max_episodes=5):
    '''Baseline: sample actions uniformly at random (no rendering).'''
    env = gym.make(env_name)
    print("\n" + "=" * 60)
    print("0. 随机策略演示")
    print("=" * 60)
    episode_rewards = []
    for episode in range(max_episodes):
        observation, info = env.reset()
        episode_reward = 0
        done = False
        step_count = 0
        while not done:
            action = env.action_space.sample()  # uniformly random action
            observation, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated
            episode_reward += reward
            step_count += 1
        episode_rewards.append(episode_reward)
        print(f"回合 {episode + 1}: 步数={step_count:3d}, 奖励={episode_reward:.0f}")
    env.close()
    print(f"随机策略平均奖励: {np.mean(episode_rewards):.2f}")
    return episode_rewards


# ==================== 5. Comparison plots ====================
def plot_comparison(random_rewards, simple_rewards, ppo_rewards):
    '''Plot per-episode rewards of the three policies side by side.

    BUG FIX: the module-level matplotlib import is commented out, so `plt`
    was an undefined name here (NameError on call); import it locally.
    '''
    import matplotlib.pyplot as plt

    plt.figure(figsize=(15, 5))

    # Random policy
    plt.subplot(1, 3, 1)
    plt.plot(random_rewards, 'r-', alpha=0.7)
    plt.axhline(y=np.mean(random_rewards), color='r', linestyle='--',
                label=f'平均: {np.mean(random_rewards):.1f}')
    plt.title('随机策略')
    plt.xlabel('回合')
    plt.ylabel('奖励')
    plt.legend()
    plt.grid(True, alpha=0.3)

    # Simple rule-based policy
    plt.subplot(1, 3, 2)
    plt.plot(simple_rewards, 'g-', alpha=0.7)
    plt.axhline(y=np.mean(simple_rewards), color='g', linestyle='--',
                label=f'平均: {np.mean(simple_rewards):.1f}')
    plt.title('简单规则策略')
    plt.xlabel('回合')
    plt.ylabel('奖励')
    plt.legend()
    plt.grid(True, alpha=0.3)

    # PPO policy, with a moving average to smooth the noisy returns.
    plt.subplot(1, 3, 3)
    plt.plot(ppo_rewards, 'b-', alpha=0.7)
    if len(ppo_rewards) >= 10:
        moving_avg = np.convolve(ppo_rewards, np.ones(10) / 10, mode='valid')
        plt.plot(range(9, len(ppo_rewards)), moving_avg, 'b--',
                 label='滑动平均(10)')
    plt.axhline(y=np.mean(ppo_rewards[-50:]), color='b', linestyle='--',
                label=f'最终平均: {np.mean(ppo_rewards[-50:]):.1f}')
    plt.title('PPO算法')
    plt.xlabel('回合')
    plt.ylabel('奖励')
    plt.legend()
    plt.grid(True, alpha=0.3)

    plt.suptitle('CartPole-v1 不同策略性能对比', fontsize=16)
    plt.tight_layout()
    plt.show()
# ==================== 7. Main program ====================
if __name__ == "__main__":
    # BUG FIX: original guard was `if name == "main":` (underscores lost in
    # transit), which raises NameError at import time.
    print("=" * 60)
    print("CartPole 强化学习算法对比")
    print("=" * 60)

    import os

    # Where trained weights are checkpointed.
    model_dir = "./saved_models"
    os.makedirs(model_dir, exist_ok=True)
    model_path = os.path.join(model_dir, "ppo_cartpole_final.pth")

    # 1. Random policy (5 episodes)
    print("\n正在运行随机策略...")
    random_rewards = random_policy_demo(max_episodes=5)

    # 2. Simple rule-based policy (5 episodes)
    print("\n正在运行简单策略...")
    simple_rewards = simple_policy_demo(max_episodes=5)

    # 3. Ask whether to retrain or load an existing checkpoint.
    train_new = input("\n是否要重新训练模型?(y/n,默认y): ").lower() != 'n'

    if train_new or not os.path.exists(model_path):
        # Train a fresh model.
        print("\n正在训练PPO算法(300回合)...")
        ppo_rewards, trained_agent = train_ppo(num_episodes=300,
                                               save_path=model_path)
        # Plot the training curve if matplotlib is available.
        try:
            import matplotlib.pyplot as plt
            plt.figure(figsize=(10, 5))
            plt.plot(ppo_rewards, alpha=0.6, label='每回合奖励')
            if len(ppo_rewards) >= 50:
                moving_avg = np.convolve(ppo_rewards, np.ones(50) / 50,
                                         mode='valid')
                plt.plot(range(49, len(ppo_rewards)), moving_avg, 'r-',
                         linewidth=2, label='滑动平均(50)')
            plt.xlabel('回合')
            plt.ylabel('奖励')
            plt.title('PPO训练曲线')
            plt.legend()
            plt.grid(True, alpha=0.3)
            plt.savefig(os.path.join(model_dir, 'training_curve.png'))
            plt.show()
        except Exception:
            # FIX: bare `except:` also swallowed SystemExit/KeyboardInterrupt.
            print("matplotlib不可用,跳过绘图")
    else:
        # A checkpoint exists; offer to load it instead of retraining.
        print(f"\n找到已保存的模型: {model_path}")
        load_model = input("是否加载已有模型?(y/n,默认y): ").lower() != 'n'
        if load_model:
            # Probe the environment once for the space dimensions.
            env = gym.make("CartPole-v1")
            state_dim = env.observation_space.shape[0]
            action_dim = env.action_space.n
            env.close()
            device = torch.device(
                "cuda" if torch.cuda.is_available() else "cpu")
            # NOTE: hyperparameters mirror train_ppo().
            trained_agent = PPO(
                state_dim=state_dim,
                hidden_dim=128,
                action_dim=action_dim,
                actor_lr=1e-3,
                critic_lr=1e-2,
                lmbda=0.95,
                epochs=10,
                eps=0.2,
                gamma=0.98,
                device=device
            )
            trained_agent.load_model(model_path)
        else:
            # Retrain anyway.
            print("\n重新训练PPO算法...")
            ppo_rewards, trained_agent = train_ppo(num_episodes=300,
                                                   save_path=model_path)

    # 4. Test the trained model (with rendering).
    print("\n" + "=" * 60)
    print("测试训练好的PPO模型")
    print("=" * 60)
    test_episodes = int(input("请输入要测试的回合数(默认5): ") or "5")
    test_trained_ppo(trained_agent, num_episodes=test_episodes, render=True)

    # 5. Side-by-side visual comparison of the three policies.
    print("\n" + "=" * 60)
    print("三种策略可视化对比")
    print("=" * 60)
    compare_episodes = int(input("请输入每个策略对比的回合数(默认2): ") or "2")

    # BUG FIX: the original random-policy lambda called gym.make() on every
    # single step, creating (and never closing) a new environment each time.
    # Sample from one action space created up front instead.
    _space_env = gym.make("CartPole-v1")
    random_action_space = _space_env.action_space
    _space_env.close()

    strategies = [
        ("随机策略", lambda obs: random_action_space.sample()),
        ("简单策略", lambda obs: 1 if obs[2] > 0 else 0),
        ("PPO策略", lambda obs: trained_agent.take_action(obs))
    ]

    for strategy_name, strategy_func in strategies:
        print(f"\n--- 观察 {strategy_name} ---")
        env = gym.make("CartPole-v1", render_mode="human")
        for episode in range(compare_episodes):
            observation, info = env.reset()
            episode_reward = 0
            done = False
            step_count = 0
            while not done:
                action = strategy_func(observation)
                observation, reward, terminated, truncated, info = env.step(
                    action)
                done = terminated or truncated
                episode_reward += reward
                step_count += 1
                time.sleep(0.02)
            print(f" 回合 {episode + 1}: 步数={step_count:3d}, 奖励={episode_reward:.0f}")
            time.sleep(1)
        env.close()

    print("\n" + "=" * 60)
    print("最终总结")
    print("=" * 60)
    print("1. 随机策略:表现最差,完全随机移动")
    print("2. 简单策略:基于规则的策略,能基本平衡")
    print("3. PPO策略:通过学习得到的最优策略,表现最好")
    print(f"\n模型已保存在: {model_path}")
    print("=" * 60)
一个完整的PPO(Proximal Policy Optimization,近端策略优化)算法实现 ,用于解决OpenAI Gym中的CartPole-v1(车杆平衡)问题。让我详细解释整个代码结构和PPO算法原理:
📊 CartPole-v1 环境介绍
CartPole是一个经典的强化学习问题:
-
状态空间(4维):小车位置、小车速度、杆的角度、杆的角速度
-
动作空间(2个离散动作):向左移动(0)、向右移动(1)
-
目标:尽可能长时间保持杆子直立,每坚持一步得1分,最多500分
-
终止条件:杆倾斜超过12度或小车位置超出±2.4的边界
🧠 PPO算法详解
PPO的核心思想
PPO是OpenAI在2017年提出的算法,它解决了传统策略梯度方法的两个关键问题:
-
更新步长难确定:步长太小学习慢,步长太大容易崩溃
-
样本效率低:每个样本只用一次就丢弃
PPO的创新点在于使用"裁剪"机制限制策略更新的幅度,确保新策略不会离旧策略太远。
PPO算法流程
text
1. 收集一批经验数据(状态、动作、奖励等)
2. 计算优势函数(评估动作好坏)
3. 计算新旧策略的比率
4. 裁剪比率,限制更新幅度
5. 更新策略网络和价值网络
6. 重复以上步骤
📝 代码结构解析
1. 神经网络定义
python
class PolicyNet(torch.nn.Module): # 策略网络:决定要做什么动作
# 输入:状态 -> 隐藏层 -> 输出:动作概率分布
class ValueNet(torch.nn.Module): # 价值网络:评估当前状态的好坏
# 输入:状态 -> 隐藏层 -> 输出:状态价值
策略网络 输出每个动作的概率(如向左60%,向右40%),价值网络输出一个数值表示当前状态的价值。
2. PPO主类
python
class PPO:
def take_action(self, state): # 根据策略选择动作
# 从概率分布中采样动作
def update(self, transition_dict): # 核心更新逻辑
# 1. 计算TD目标和优势函数
# 2. 计算新旧策略比率
# 3. 裁剪比率,计算损失
# 4. 更新网络
3. 关键数学概念
优势函数(GAE - Generalized Advantage Estimation)
python
def compute_advantage(gamma, lmbda, td_delta):
优势函数回答:"在这个状态下采取这个动作,比平均情况好多少?"
-
正值:这个动作比平均好,应该增加它的概率
-
负值:这个动作比平均差,应该减少它的概率
PPO的核心损失函数
python
ratio = torch.exp(log_probs - old_log_probs) # 新旧策略比率
surr1 = ratio * advantage # 原始目标
surr2 = torch.clamp(ratio, 1-eps, 1+eps) * advantage # 裁剪后的目标
actor_loss = torch.mean(-torch.min(surr1, surr2)) # 取最小值
这个设计的妙处在于:
-
如果
advantage > 0(好动作),但ratio > 1+eps(变化太大),就裁剪到1+eps -
如果
advantage < 0(坏动作),但ratio < 1-eps(变化太大),就裁剪到1-eps
🔄 三种策略对比
1. 随机策略
python
action = env.action_space.sample() # 完全随机选择
-
表现:平均20-30分
-
特点:没有任何学习,纯随机探索
2. 简单规则策略
python
if angle > 0: action = 1 # 向右
else: action = 0 # 向左
-
表现:平均50-80分
-
特点:基于简单规则,但无法处理复杂情况
3. PPO策略
-
表现:平均300-500分
-
特点:通过学习得到最优策略,能自适应各种情况
🎯 训练过程可视化
python
def train_ppo(num_episodes=300):
# 每个episode收集数据
# 每10个episode计算平均奖励
# 保存最佳模型
训练曲线通常呈现:
-
初期:表现类似随机策略(探索阶段)
-
中期:快速提升(找到有效策略)
-
后期:收敛到最优(稳定在450-500分)
💾 模型保存与加载
python
def save_model(self, filepath): # 保存模型权重和优化器状态
def load_model(self, filepath): # 加载训练好的模型
这样可以在训练后反复测试,无需重新训练。
📈 代码执行流程
-
随机策略演示:展示基线表现
-
简单规则策略:展示简单启发式方法
-
PPO训练:学习最优策略
-
模型保存:保存最佳模型
-
测试可视化:观察训练好的智能体表现
-
对比分析:直观比较三种策略
⚙️ 关键超参数解释
python
actor_lr=1e-3, # 策略网络学习率
critic_lr=1e-2, # 价值网络学习率
lmbda=0.95, # GAE参数,平衡方差和偏差
epochs=10, # 每批数据复用次数
eps=0.2, # 裁剪范围,控制更新幅度
gamma=0.98, # 折扣因子,关注长期回报
🎨 代码特色
-
模块化设计:策略网络、价值网络、PPO算法分离
-
设备自适应:自动选择CPU/GPU
-
模型持久化:自动保存最佳模型
-
多策略对比:直观展示算法优势
-
可视化支持:训练曲线和实时渲染
💡 实际应用建议
-
初学者:从简单策略开始理解
-
研究者:可以调整超参数观察影响
-
开发者:可以应用到其他Gym环境