Gazebo中使用PPO训练机械臂是一个复杂的系统工程。我来提供一个完整的、可操作的实现方案,从环境搭建到训练部署。
📋 整体架构图
text
┌─────────────────────────────────────────────────────────────┐
│ 训练系统架构 │
├─────────────────────────────────────────────────────────────┤
│ Gazebo仿真环境 ↔ Python训练脚本 │
│ ┌─────────────────┐ ┌─────────────────┐ │
│ │ 机械臂URDF模型 │ │ PPO算法实现 │ │
│ │ 物理引擎 │ ←─── ROS 2 ───→ │ 神经网络 │ │
│ │ 传感器数据 │ (或gym接口) │ 经验回放 │ │
│ │ 障碍物/目标物体 │ │ 模型保存/加载 │ │
│ └─────────────────┘ └─────────────────┘ │
└─────────────────────────────────────────────────────────────┘
🛠️ 第一步:环境搭建
1.1 安装必要软件
bash
# 1. 安装Gazebo(建议用Harmonic版本)
sudo apt install gz-harmonic
# 2. 安装ROS 2 Humble(用于通信)
sudo apt install ros-humble-desktop
# 注意:gazebo-ros-pkgs 仅适用于 Gazebo Classic;与上面安装的 gz-harmonic 配合,应安装 ros_gz 桥接包:
sudo apt install ros-humble-ros-gzharmonic
# 3. 安装Python依赖
pip install torch torchvision torchaudio
pip install gymnasium pybullet stable-baselines3
pip install numpy matplotlib tqdm
1.2 准备机械臂URDF模型
bash
# 下载URDF模型(以AUBO i5为例)
mkdir -p ~/robot_ws/src
cd ~/robot_ws/src
git clone https://github.com/auboliujie/aubo_robot.git
# 编译
cd ~/robot_ws
colcon build
source install/setup.bash
🎯 第二步:创建训练环境接口
2.1 Gymnasium风格环境包装器
python
#!/usr/bin/env python3
# robot_gazebo_env.py
import gymnasium as gym
from gymnasium import spaces
import numpy as np
import rclpy
from rclpy.node import Node
from sensor_msgs.msg import JointState
from geometry_msgs.msg import Pose, Point
from std_msgs.msg import Float64MultiArray
import time
class RobotArmGazeboEnv(gym.Env):
    """Gazebo robot-arm reaching environment with the Gymnasium API.

    Observation (18-D): 6 joint positions + 6 joint velocities +
    3-D target position + 3-D end-effector position.
    Action (6-D): per-joint angle increments in [-0.1, 0.1] rad.
    Talks to the simulated arm over ROS 2 topics.
    """

    def __init__(self, robot_name='aubo_i5', render_mode=None):
        super().__init__()
        # Spin up a dedicated ROS 2 node for this environment.
        rclpy.init(args=None)
        self.node = Node('robot_arm_env')
        # Robot parameters.
        self.robot_name = robot_name
        self.render_mode = render_mode
        self.num_joints = 6
        self.joint_names = [f'joint_{i+1}' for i in range(self.num_joints)]
        # Joint limits (radians).
        self.joint_limits = {
            'min': np.array([-3.14, -2.0, -2.5, -3.14, -2.0, -3.14]),
            'max': np.array([3.14, 2.0, 2.5, 3.14, 2.0, 3.14])
        }
        # Action space: 6 joint-angle increments in [-0.1, 0.1] rad.
        self.action_space = spaces.Box(
            low=-0.1, high=0.1, shape=(self.num_joints,), dtype=np.float32
        )
        # Observation space: 12 (joint pos + vel) + 3 target + 3 end-effector.
        # Obstacle information could be appended here if needed.
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(12 + 3 + 3,), dtype=np.float32
        )
        # ROS 2 communication: joint-state feedback in, position commands out.
        self.joint_state_sub = self.node.create_subscription(
            JointState, '/joint_states', self.joint_state_callback, 10
        )
        self.arm_pub = self.node.create_publisher(
            Float64MultiArray, '/position_controller/commands', 10
        )
        # Cached state.
        self.current_joint_positions = np.zeros(self.num_joints)
        self.current_joint_velocities = np.zeros(self.num_joints)
        self.target_position = np.array([0.5, 0.0, 0.5])  # reaching target
        # End-effector position (filled in via forward kinematics).
        self.ee_position = np.zeros(3)
        # FIX: initialize the success flag so step()/compute_reward() are
        # well-defined even if called before the first reset().
        self.success = False
        # Control period: 0.1 s (100 ms).
        self.dt = 0.1

    def joint_state_callback(self, msg):
        """Cache the latest joint positions/velocities from /joint_states.

        FIX: the message arrays must be indexed with the message-order
        index ``i``; the original used ``idx`` (the local joint order),
        which silently scrambles readings whenever /joint_states lists
        joints in a different order than ``self.joint_names``.
        """
        for i, name in enumerate(msg.name):
            if name in self.joint_names:
                idx = self.joint_names.index(name)
                self.current_joint_positions[idx] = msg.position[i]
                # Some publishers omit velocities; guard against short arrays.
                if i < len(msg.velocity):
                    self.current_joint_velocities[idx] = msg.velocity[i]

    def compute_forward_kinematics(self, joint_angles):
        """Return the (x, y, z) end-effector position (simplified model).

        NOTE(review): this is a 3-link approximation; a real deployment
        must use the arm's full DH parameters — confirm before relying on
        the computed positions.
        """
        l1, l2, l3 = 0.3, 0.3, 0.2  # link lengths (m)
        theta1, theta2, theta3, theta4, theta5, theta6 = joint_angles
        # Simplified planar-chain geometry rotated about the base joint.
        x = l2 * np.cos(theta1) * np.cos(theta2) + l3 * np.cos(theta1) * np.cos(theta2 + theta3)
        y = l2 * np.sin(theta1) * np.cos(theta2) + l3 * np.sin(theta1) * np.cos(theta2 + theta3)
        z = l1 + l2 * np.sin(theta2) + l3 * np.sin(theta2 + theta3)
        return np.array([x, y, z])

    def get_state(self):
        """Assemble and return the 18-D observation vector (float32)."""
        # Refresh the end-effector estimate from the cached joint angles.
        self.ee_position = self.compute_forward_kinematics(self.current_joint_positions)
        state = np.concatenate([
            self.current_joint_positions,   # 6
            self.current_joint_velocities,  # 6
            self.target_position,           # 3
            self.ee_position                # 3
        ])
        return state.astype(np.float32)

    def compute_reward(self):
        """Dense negative-distance reward + success bonus + limit penalties."""
        dist_to_target = np.linalg.norm(self.ee_position - self.target_position)
        # Base reward: negative distance to the target.
        reward = -dist_to_target
        # Success bonus: within 5 cm of the target counts as reached.
        if dist_to_target < 0.05:
            reward += 100
            self.success = True
        # Penalize joint-limit violations.
        for i in range(self.num_joints):
            if self.current_joint_positions[i] < self.joint_limits['min'][i]:
                reward -= 10
            if self.current_joint_positions[i] > self.joint_limits['max'][i]:
                reward -= 10
        return reward

    def step(self, action):
        """Apply a joint-increment action; return (obs, reward, terminated, truncated, info)."""
        # Action is an increment on the current joint angles, clipped to limits.
        target_positions = self.current_joint_positions + action
        target_positions = np.clip(target_positions,
                                   self.joint_limits['min'],
                                   self.joint_limits['max'])
        # Publish the position command.
        cmd_msg = Float64MultiArray()
        cmd_msg.data = target_positions.tolist()
        self.arm_pub.publish(cmd_msg)
        # Wait one control period, then pump ROS callbacks for fresh state.
        time.sleep(self.dt)
        rclpy.spin_once(self.node, timeout_sec=0.1)
        next_state = self.get_state()
        reward = self.compute_reward()
        # Episode terminates on success; no time-limit truncation here
        # (the training loop enforces max steps externally).
        terminated = self.success
        truncated = False
        return next_state, reward, terminated, truncated, {}

    def reset(self, seed=None, options=None):
        """Reset joints to a random mid-range pose; occasionally move the target.

        ``options`` is accepted for Gymnasium API compatibility and is
        currently unused.
        """
        super().reset(seed=seed)
        # Random initial joint angles, restricted to the middle of the range.
        random_angles = np.random.uniform(
            self.joint_limits['min'] * 0.5,
            self.joint_limits['max'] * 0.5,
            self.num_joints
        )
        cmd_msg = Float64MultiArray()
        cmd_msg.data = random_angles.tolist()
        self.arm_pub.publish(cmd_msg)
        time.sleep(1.0)  # give the arm time to reach the commanded pose
        # Occasionally (30%) randomize the target position for curriculum variety.
        if np.random.random() < 0.3:
            self.target_position = np.random.uniform([0.3, -0.3, 0.3],
                                                     [0.7, 0.3, 0.7])
        self.success = False
        # Pump callbacks once so the cached joint state is fresh.
        rclpy.spin_once(self.node, timeout_sec=0.1)
        state = self.get_state()
        return state, {}

    def close(self):
        """Destroy the ROS 2 node and shut down rclpy."""
        self.node.destroy_node()
        rclpy.shutdown()
🧠 第三步:PPO算法实现
python
#!/usr/bin/env python3
# ppo_robot_arm.py
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
from collections import deque
import random
class ActorNetwork(nn.Module):
    """Gaussian policy: maps a state to an action mean plus a learned std.

    The std is state-independent: a learnable per-dimension log-std
    parameter, exponentiated on each forward pass.
    """

    def __init__(self, state_dim, action_dim, hidden_dim=256):
        super().__init__()
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, hidden_dim)
        # Head producing the action mean.
        self.mean = nn.Linear(hidden_dim, action_dim)
        # Learnable log standard deviation (initialized to 0 -> std = 1).
        self.log_std = nn.Parameter(torch.zeros(action_dim))

    def forward(self, x):
        hidden = x
        # Three ReLU-activated hidden layers.
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = F.relu(layer(hidden))
        return self.mean(hidden), self.log_std.exp()
class CriticNetwork(nn.Module):
    """State-value network: maps a state to a single scalar V(s)."""

    def __init__(self, state_dim, hidden_dim=256):
        super().__init__()
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        # Two ReLU hidden layers, then a linear value head.
        activation = F.relu(self.fc2(F.relu(self.fc1(x))))
        return self.fc3(activation)
class PPOAgent:
    """PPO agent: Gaussian actor + value critic trained from on-policy rollouts."""

    def __init__(self, state_dim, action_dim, lr=3e-4, gamma=0.99,
                 clip_epsilon=0.2, epochs=10, batch_size=64):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"使用设备: {self.device}")
        self.actor = ActorNetwork(state_dim, action_dim).to(self.device)
        self.critic = CriticNetwork(state_dim).to(self.device)
        # Single optimizer over both networks (joint actor+critic loss).
        self.optimizer = optim.Adam(
            list(self.actor.parameters()) + list(self.critic.parameters()),
            lr=lr
        )
        self.gamma = gamma
        self.clip_epsilon = clip_epsilon
        self.epochs = epochs
        self.batch_size = batch_size
        # On-policy rollout buffers, cleared after each update().
        self.states = []
        self.actions = []
        self.rewards = []
        self.next_states = []
        self.dones = []
        self.log_probs = []

    def get_action(self, state, deterministic=False):
        """Return (action, log_prob).

        Deterministic mode (evaluation) returns the policy mean and
        log_prob=None; stochastic mode samples from the Gaussian policy.
        """
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        with torch.no_grad():
            mean, std = self.actor(state)
            if deterministic:
                action = mean
                log_prob = None
            else:
                dist = torch.distributions.Normal(mean, std)
                action = dist.sample()
                log_prob = dist.log_prob(action).sum(dim=-1)
        return action.cpu().numpy()[0], log_prob

    def store_transition(self, state, action, reward, next_state, done, log_prob):
        """Append one transition to the rollout buffers."""
        self.states.append(state)
        self.actions.append(action)
        self.rewards.append(reward)
        self.next_states.append(next_state)
        self.dones.append(done)
        self.log_probs.append(log_prob)

    def compute_gae(self, rewards, values, next_values, dones, gamma=0.99, lam=0.95):
        """Generalized Advantage Estimation over a stored rollout.

        FIX: bootstrap every step from ``next_values[t]`` (the value of the
        actually observed next state). The original used ``values[t+1]`` for
        non-final steps, which is wrong when the buffer spans an episode
        boundary without a terminal flag (e.g. a time-limit cut-off).
        """
        advantages = []
        gae = 0
        for t in reversed(range(len(rewards))):
            # TD residual: r_t + gamma * V(s_{t+1}) * (1 - done) - V(s_t)
            delta = rewards[t] + gamma * next_values[t] * (1 - dones[t]) - values[t]
            gae = delta + gamma * lam * (1 - dones[t]) * gae
            advantages.insert(0, gae)
        return advantages

    def update(self):
        """Run clipped-objective PPO epochs over the stored rollout, then clear it."""
        if len(self.states) < self.batch_size:
            return
        # Stack the buffers into device tensors.
        states = torch.FloatTensor(np.array(self.states)).to(self.device)
        actions = torch.FloatTensor(np.array(self.actions)).to(self.device)
        rewards = torch.FloatTensor(np.array(self.rewards)).unsqueeze(1).to(self.device)
        next_states = torch.FloatTensor(np.array(self.next_states)).to(self.device)
        dones = torch.FloatTensor(np.array(self.dones)).unsqueeze(1).to(self.device)
        old_log_probs = torch.FloatTensor(np.array(self.log_probs)).unsqueeze(1).to(self.device)
        # Value targets are computed without gradients.
        with torch.no_grad():
            values = self.critic(states)
            next_values = self.critic(next_states)
        advantages = self.compute_gae(
            rewards.cpu().numpy().flatten(),
            values.cpu().numpy().flatten(),
            next_values.cpu().numpy().flatten(),
            dones.cpu().numpy().flatten()
        )
        advantages = torch.FloatTensor(advantages).unsqueeze(1).to(self.device)
        # Returns = advantage + value baseline (critic regression target).
        returns = advantages + values
        for _ in range(self.epochs):
            # Fresh shuffle of the rollout each epoch.
            indices = np.random.permutation(len(states))
            for start in range(0, len(states), self.batch_size):
                end = start + self.batch_size
                batch_indices = indices[start:end]
                batch_states = states[batch_indices]
                batch_actions = actions[batch_indices]
                batch_returns = returns[batch_indices]
                batch_advantages = advantages[batch_indices]
                batch_old_log_probs = old_log_probs[batch_indices]
                # Normalize advantages per minibatch for stable gradients.
                batch_advantages = (batch_advantages - batch_advantages.mean()) / (batch_advantages.std() + 1e-8)
                # Log-probabilities under the current policy.
                mean, std = self.actor(batch_states)
                dist = torch.distributions.Normal(mean, std)
                new_log_probs = dist.log_prob(batch_actions).sum(dim=-1, keepdim=True)
                # Probability ratio and clipped surrogate objective.
                ratio = torch.exp(new_log_probs - batch_old_log_probs)
                surr1 = ratio * batch_advantages
                surr2 = torch.clamp(ratio, 1 - self.clip_epsilon, 1 + self.clip_epsilon) * batch_advantages
                actor_loss = -torch.min(surr1, surr2).mean()
                # Critic regression toward the computed returns.
                values_pred = self.critic(batch_states)
                critic_loss = F.mse_loss(values_pred, batch_returns)
                total_loss = actor_loss + 0.5 * critic_loss
                self.optimizer.zero_grad()
                total_loss.backward()
                # FIX: clip the global gradient norm to stabilize updates.
                torch.nn.utils.clip_grad_norm_(
                    list(self.actor.parameters()) + list(self.critic.parameters()),
                    0.5
                )
                self.optimizer.step()
        # Discard the now-stale on-policy data.
        self.states = []
        self.actions = []
        self.rewards = []
        self.next_states = []
        self.dones = []
        self.log_probs = []

    def save(self, path):
        """Save actor/critic/optimizer state dicts to ``path``."""
        torch.save({
            'actor': self.actor.state_dict(),
            'critic': self.critic.state_dict(),
            'optimizer': self.optimizer.state_dict()
        }, path)
        print(f"模型已保存到 {path}")

    def load(self, path):
        """Load a checkpoint written by save().

        FIX: ``map_location`` lets a GPU-trained checkpoint load on a
        CPU-only machine (and vice versa).
        """
        checkpoint = torch.load(path, map_location=self.device)
        self.actor.load_state_dict(checkpoint['actor'])
        self.critic.load_state_dict(checkpoint['critic'])
        self.optimizer.load_state_dict(checkpoint['optimizer'])
        print(f"模型已从 {path} 加载")
🚀 第四步:训练脚本
python
#!/usr/bin/env python3
# train_robot_arm.py
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import time
import os
from robot_gazebo_env import RobotArmGazeboEnv
from ppo_robot_arm import PPOAgent
def train():
    """Main PPO training loop for the Gazebo arm-reaching task.

    Side effects: writes model checkpoints (best_model.pth, periodic
    checkpoint_ep*.pth, final_model.pth) and training-curve PNGs to the
    current working directory, and prints progress to stdout.
    """
    # Create the Gazebo environment.
    env = RobotArmGazeboEnv(robot_name='aubo_i5')
    # Read state/action dimensions from the env's spaces.
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    print(f"状态维度: {state_dim}")
    print(f"动作维度: {action_dim}")
    # Build the PPO agent.
    agent = PPOAgent(
        state_dim=state_dim,
        action_dim=action_dim,
        lr=3e-4,
        gamma=0.99,
        clip_epsilon=0.2,
        epochs=10,
        batch_size=64
    )
    # Training hyperparameters.
    num_episodes = 2000
    max_steps_per_episode = 200
    save_interval = 100
    # Training statistics.
    episode_rewards = []
    success_rates = []
    best_reward = -float('inf')
    print("\n开始训练...")
    print("="*60)
    for episode in tqdm(range(num_episodes)):
        state, _ = env.reset()
        episode_reward = 0
        episode_success = False
        for step in range(max_steps_per_episode):
            # Sample an action from the current stochastic policy.
            action, log_prob = agent.get_action(state)
            # Step the simulation.
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            # Store the transition (deterministic actions carry no log-prob).
            if log_prob is not None:
                agent.store_transition(state, action, reward, next_state, done, log_prob.item())
            episode_reward += reward
            state = next_state
            if done:
                # terminated means the goal was reached; truncated does not.
                episode_success = terminated
                break
        episode_rewards.append(episode_reward)
        # One PPO update per episode rollout.
        agent.update()
        # Record success (1) / failure (0) for the moving success rate.
        if episode_success:
            success_rates.append(1)
        else:
            success_rates.append(0)
        # Keep the best-scoring model seen so far.
        if episode_reward > best_reward:
            best_reward = episode_reward
            agent.save("best_model.pth")
        # Periodic logging, checkpointing, and curve plotting.
        if (episode + 1) % save_interval == 0:
            avg_reward = np.mean(episode_rewards[-100:])
            avg_success = np.mean(success_rates[-100:])
            print(f"\n回合 {episode + 1}/{num_episodes}")
            print(f"最近100回合平均奖励: {avg_reward:.2f}")
            print(f"最近100回合成功率: {avg_success:.2%}")
            print(f"最佳奖励: {best_reward:.2f}")
            # Save a checkpoint for this interval.
            agent.save(f"checkpoint_ep{episode+1}.pth")
            # Refresh the training-curve plot.
            plot_training_curve(episode_rewards, success_rates, episode + 1)
    print("\n训练完成!")
    print(f"最终平均奖励: {np.mean(episode_rewards[-100:]):.2f}")
    print(f"最终成功率: {np.mean(success_rates[-100:]):.2%}")
    # Save the final model and plot the full curves.
    agent.save("final_model.pth")
    plot_training_curve(episode_rewards, success_rates, num_episodes)
    env.close()
def plot_training_curve(rewards, success_rates, episode):
    """Plot per-episode rewards and the 100-episode moving success rate.

    Saves the figure to ``training_curve_ep{episode}.png`` and closes it.
    """
    fig, (ax_reward, ax_success) = plt.subplots(2, 1, figsize=(12, 8))
    window = 100
    # Raw rewards, plus a moving average once enough episodes exist.
    ax_reward.plot(rewards, alpha=0.3, color='blue')
    if len(rewards) > window:
        kernel = np.ones(window) / window
        smoothed = np.convolve(rewards, kernel, mode='valid')
        ax_reward.plot(range(window - 1, len(rewards)), smoothed, color='red', linewidth=2)
    ax_reward.set_xlabel('Episode')
    ax_reward.set_ylabel('Reward')
    ax_reward.set_title('Training Rewards')
    ax_reward.grid(True, alpha=0.3)
    # Moving-average success rate (only drawn once the window is filled).
    if len(success_rates) > window:
        kernel = np.ones(window) / window
        rate_ma = np.convolve(success_rates, kernel, mode='valid')
        ax_success.plot(range(window - 1, len(success_rates)), rate_ma, color='green', linewidth=2)
    ax_success.set_xlabel('Episode')
    ax_success.set_ylabel('Success Rate')
    ax_success.set_title('Success Rate')
    ax_success.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(f'training_curve_ep{episode}.png')
    plt.close()
def test_model(model_path="best_model.pth", num_episodes=10):
    """Evaluate a trained policy in Gazebo using deterministic actions."""
    print("\n测试训练好的模型...")
    env = RobotArmGazeboEnv(robot_name='aubo_i5', render_mode='human')
    # Build an agent with matching dimensions and restore the weights.
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    agent = PPOAgent(obs_dim, act_dim)
    agent.load(model_path)
    success_count = 0
    for episode in range(num_episodes):
        state, _ = env.reset()
        episode_reward = 0
        terminated = False
        steps = 0
        # Roll out at most 200 deterministic steps per episode.
        while steps < 200:
            action, _ = agent.get_action(state, deterministic=True)
            state, reward, terminated, truncated, _ = env.step(action)
            episode_reward += reward
            steps += 1
            if terminated:
                success_count += 1
                break
        print(f"测试回合 {episode + 1}: 奖励 = {episode_reward:.2f}, 步数 = {steps}, 成功 = {terminated}")
    print(f"\n测试成功率: {success_count/num_episodes:.2%}")
    env.close()
if __name__ == "__main__":
    import argparse

    # Command-line entry point: choose training or evaluation mode.
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', type=str, default='train', choices=['train', 'test'])
    parser.add_argument('--model', type=str, default='best_model.pth')
    args = parser.parse_args()
    if args.mode == 'train':
        train()
    else:
        # Evaluate the model whose path was given by --model.
        test_model(args.model)
🔧 第五步:Gazebo启动配置
5.1 创建启动文件
python
# robot_arm.launch.py(注意:ROS 2 launch 文件是 Python 脚本,不是 XML)
from launch import LaunchDescription
from launch_ros.actions import Node
def generate_launch_description():
    """Launch Gazebo, spawn the arm, start controllers, and run training."""
    return LaunchDescription([
        # Gazebo server (with the ROS factory plugin) and client GUI.
        Node(
            package='gazebo_ros',
            executable='gzserver',
            arguments=['-s', 'libgazebo_ros_factory.so'],
            output='screen'
        ),
        Node(
            package='gazebo_ros',
            executable='gzclient',
            output='screen'
        ),
        # Spawn the arm model into the running simulation.
        # NOTE(review): '$(find ...)' is ROS 1 launch substitution syntax and
        # is NOT expanded by ROS 2 launch; this path should be built with
        # get_package_share_directory('aubo_description') — TODO confirm.
        Node(
            package='gazebo_ros',
            executable='spawn_entity.py',
            arguments=['-entity', 'aubo_i5',
                       '-file', '$(find aubo_description)/urdf/aubo_i5.urdf'],
            output='screen'
        ),
        # Load the joint-state broadcaster and position controller.
        Node(
            package='controller_manager',
            executable='spawner',
            arguments=['joint_state_controller', 'position_controller'],
            output='screen'
        ),
        # Start the RL training node.
        Node(
            package='robot_arm_training',
            executable='train_robot_arm.py',
            output='screen'
        )
    ])
5.2 运行训练
bash
# 1. 启动Gazebo和机械臂
ros2 launch robot_arm_training robot_arm.launch.py
# 2. 在另一个终端启动训练
python3 train_robot_arm.py --mode train
# 3. 测试模型
python3 train_robot_arm.py --mode test --model best_model.pth
📊 第六步:性能优化技巧
6.1 奖励函数优化
python
def compute_enhanced_reward(self):
    """Shaped reward: sparse success bonus + dense distance term +
    potential-based progress shaping - action-magnitude penalty.

    NOTE(review): relies on ``self.last_action`` and ``self.last_distance``
    being maintained elsewhere (e.g. updated in step()) — they are not set
    in this snippet; confirm before use.
    """
    dist_to_target = np.linalg.norm(self.ee_position - self.target_position)
    # Sparse success signal (within 5 cm) combined with a dense -distance term.
    sparse_reward = 100 if dist_to_target < 0.05 else -1
    dense_reward = -dist_to_target
    # Penalize large actions to encourage smooth motion.
    action_penalty = 0.01 * np.linalg.norm(self.last_action)
    # Potential-based shaping: reward reduction in distance since last step.
    potential_reward = -2 * (dist_to_target - self.last_distance)
    return sparse_reward + dense_reward + potential_reward - action_penalty
6.2 训练技巧
-
课程学习:先简单任务(固定目标),后复杂任务(随机目标)
-
经验回放:使用优先经验回放(PER)提高样本效率
-
多环境并行:使用多个Gazebo实例并行训练
-
动作平滑:对动作施加低通滤波
常见问题解决
| 问题 | 解决方案 |
| --- | --- |
| Gazebo启动失败 | 检查环境变量:`source /usr/share/gazebo/setup.sh` |
| 机械臂不受控制 | 检查控制器是否正确加载:`ros2 control list_controllers` |
| 训练不收敛 | 调整奖励函数,增加稀疏奖励 |
| 仿真速度慢 | 降低 `max_step_size`,使用headless模式 |