
Training a robot arm with PPO in Gazebo is a sizable systems-engineering effort. This post walks through a complete, actionable implementation, from environment setup through training and deployment.

📋 Overall Architecture

```text
┌─────────────────────────────────────────────────────────────┐
│                Training System Architecture                  │
├─────────────────────────────────────────────────────────────┤
│  Gazebo simulation          ↔         Python training code   │
│  ┌─────────────────┐                  ┌─────────────────┐   │
│  │ Arm URDF model  │                  │ PPO algorithm   │   │
│  │ Physics engine  │ ←─── ROS 2 ───→  │ Neural networks │   │
│  │ Sensor data     │   (or gym API)   │ Rollout buffer  │   │
│  │ Obstacles/goal  │                  │ Model save/load │   │
│  └─────────────────┘                  └─────────────────┘   │
└─────────────────────────────────────────────────────────────┘
```

🛠️ Step 1: Environment Setup

1.1 Install the Required Software

```bash
# 1. Install Gazebo. The launch files below use Gazebo Classic via
#    gazebo_ros_pkgs (installed in step 2). If you prefer modern Gazebo
#    (e.g. Harmonic: sudo apt install gz-harmonic, after adding the OSRF
#    apt repository), you will need the ros_gz bridge instead.

# 2. Install ROS 2 Humble (used for communication)
sudo apt install ros-humble-desktop
sudo apt install ros-humble-gazebo-ros-pkgs

# 3. Install Python dependencies
pip install torch torchvision torchaudio
pip install gymnasium   # stable-baselines3 is an optional alternative to the hand-written PPO below
pip install numpy matplotlib tqdm
```

1.2 Prepare the Arm's URDF Model

```bash
# Download the URDF model (using the AUBO i5 as an example)
mkdir -p ~/robot_ws/src
cd ~/robot_ws/src
git clone https://github.com/auboliujie/aubo_robot.git

# Build
cd ~/robot_ws
colcon build
source install/setup.bash
```

🎯 Step 2: Create the Training Environment Interface

2.1 A Gymnasium-Style Environment Wrapper

```python
#!/usr/bin/env python3
# robot_gazebo_env.py

import gymnasium as gym
from gymnasium import spaces
import numpy as np
import rclpy
from rclpy.node import Node
from sensor_msgs.msg import JointState
from std_msgs.msg import Float64MultiArray
import time

class RobotArmGazeboEnv(gym.Env):
    """Gazebo robot-arm training environment"""
    
    def __init__(self, robot_name='aubo_i5', render_mode=None):
        super().__init__()
        
        # Initialize the ROS 2 context (guard against double initialization)
        if not rclpy.ok():
            rclpy.init(args=None)
        self.node = Node('robot_arm_env')
        
        # Robot parameters
        self.robot_name = robot_name
        self.num_joints = 6
        self.joint_names = [f'joint_{i+1}' for i in range(self.num_joints)]
        
        # Joint limits (radians)
        self.joint_limits = {
            'min': np.array([-3.14, -2.0, -2.5, -3.14, -2.0, -3.14]),
            'max': np.array([3.14, 2.0, 2.5, 3.14, 2.0, 3.14])
        }
        
        # Action space: per-joint angle increments in [-0.1, 0.1] rad
        self.action_space = spaces.Box(
            low=-0.1, high=0.1, shape=(self.num_joints,), dtype=np.float32
        )
        
        # Observation space: 12 dims (6 joint angles + 6 joint velocities)
        # + 3-dim target position + 3-dim end-effector position.
        # Obstacle information can be appended as needed.
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(12 + 3 + 3,), dtype=np.float32
        )
        
        # ROS 2 communication
        self.joint_state_sub = self.node.create_subscription(
            JointState, '/joint_states', self.joint_state_callback, 10
        )
        self.arm_pub = self.node.create_publisher(
            Float64MultiArray, '/position_controller/commands', 10
        )
        
        # State variables
        self.current_joint_positions = np.zeros(self.num_joints)
        self.current_joint_velocities = np.zeros(self.num_joints)
        self.target_position = np.array([0.5, 0.0, 0.5])  # goal position
        self.success = False
        
        # End-effector position (computed via forward kinematics)
        self.ee_position = np.zeros(3)
        
        # Control period
        self.dt = 0.1  # 100 ms
        
    def joint_state_callback(self, msg):
        """Joint-state callback"""
        for i, name in enumerate(msg.name):
            if name in self.joint_names:
                idx = self.joint_names.index(name)
                # Index the message arrays by i (the message's own ordering), not by idx
                self.current_joint_positions[idx] = msg.position[i]
                if msg.velocity:
                    self.current_joint_velocities[idx] = msg.velocity[i]
    
    def compute_forward_kinematics(self, joint_angles):
        """Forward kinematics for the end-effector position (simplified)"""
        # A real implementation should use the arm's DH parameters.
        # This is a simplified 3-link approximation; joints 4-6 are ignored.
        l1, l2, l3 = 0.3, 0.3, 0.2  # link lengths
        theta1, theta2, theta3 = joint_angles[:3]
        
        # Simplified computation (use the full DH chain in practice)
        x = l2 * np.cos(theta1) * np.cos(theta2) + l3 * np.cos(theta1) * np.cos(theta2 + theta3)
        y = l2 * np.sin(theta1) * np.cos(theta2) + l3 * np.sin(theta1) * np.cos(theta2 + theta3)
        z = l1 + l2 * np.sin(theta2) + l3 * np.sin(theta2 + theta3)
        
        return np.array([x, y, z])
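    
    # The comment above defers to the full DH chain; here is a sketch of what
    # that looks like. The standard-DH link transform below is textbook, but
    # the DH table you pass in must come from the arm's datasheet (the real
    # AUBO i5 parameters are NOT reproduced here).
    @staticmethod
    def dh_transform(a, alpha, d, theta):
        """Homogeneous transform for one link (standard DH convention)."""
        ct, st = np.cos(theta), np.sin(theta)
        ca, sa = np.cos(alpha), np.sin(alpha)
        return np.array([
            [ct, -st * ca,  st * sa, a * ct],
            [st,  ct * ca, -ct * sa, a * st],
            [0.0,      sa,       ca,      d],
            [0.0,     0.0,      0.0,    1.0],
        ])
    
    def compute_fk_dh(self, joint_angles, dh_params):
        """Chain per-link transforms; dh_params is a list of (a, alpha, d) tuples."""
        T = np.eye(4)
        for (a, alpha, d), theta in zip(dh_params, joint_angles):
            T = T @ self.dh_transform(a, alpha, d, theta)
        return T[:3, 3]  # end-effector position in the base frame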
    
    def get_state(self):
        """Build the current observation"""
        # Compute the end-effector position
        self.ee_position = self.compute_forward_kinematics(self.current_joint_positions)
        
        # Assemble the observation vector
        state = np.concatenate([
            self.current_joint_positions,      # 6 dims
            self.current_joint_velocities,     # 6 dims
            self.target_position,              # 3 dims
            self.ee_position                   # 3 dims
        ])
        
        return state.astype(np.float32)
    
    def compute_reward(self):
        """Compute the reward"""
        # Distance to the target
        dist_to_target = np.linalg.norm(self.ee_position - self.target_position)
        
        # Base reward: negative distance
        reward = -dist_to_target
        
        # Goal-reached bonus
        if dist_to_target < 0.05:  # within 5 cm counts as success
            reward += 100
            self.success = True
        
        # Joint-limit penalty
        for i in range(self.num_joints):
            if self.current_joint_positions[i] < self.joint_limits['min'][i]:
                reward -= 10
            if self.current_joint_positions[i] > self.joint_limits['max'][i]:
                reward -= 10
        
        return reward
    
    def step(self, action):
        """Execute one action step"""
        # Apply the action: joint-angle increments
        target_positions = self.current_joint_positions + action
        target_positions = np.clip(target_positions, 
                                   self.joint_limits['min'], 
                                   self.joint_limits['max'])
        
        # Publish the control command
        cmd_msg = Float64MultiArray()
        cmd_msg.data = target_positions.tolist()
        self.arm_pub.publish(cmd_msg)
        
        # Wait one control period
        time.sleep(self.dt)
        
        # Get the new observation
        rclpy.spin_once(self.node, timeout_sec=0.1)
        next_state = self.get_state()
        
        # Compute the reward
        reward = self.compute_reward()
        
        # Check termination
        terminated = self.success
        truncated = False
        
        return next_state, reward, terminated, truncated, {}
    
    def reset(self, seed=None, options=None):
        """Reset the environment"""
        super().reset(seed=seed)
        
        # Randomize the initial joint angles
        random_angles = np.random.uniform(
            self.joint_limits['min'] * 0.5,  # stay in the middle of the range
            self.joint_limits['max'] * 0.5,
            self.num_joints
        )
        
        # Publish the initial position
        cmd_msg = Float64MultiArray()
        cmd_msg.data = random_angles.tolist()
        self.arm_pub.publish(cmd_msg)
        time.sleep(1.0)  # wait for the arm to settle
        
        # Randomize the target position (optional)
        if np.random.random() < 0.3:
            self.target_position = np.random.uniform([0.3, -0.3, 0.3], 
                                                     [0.7, 0.3, 0.7])
        
        self.success = False
        
        # Get the initial observation
        rclpy.spin_once(self.node, timeout_sec=0.1)
        state = self.get_state()
        
        return state, {}
    
    def close(self):
        """Shut down the environment"""
        self.node.destroy_node()
        rclpy.shutdown()
```
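Before wiring the environment into a training loop, it helps to smoke-test it with random actions. A minimal sketch, assuming Gazebo and the controllers from Step 5 are already up:

```python
# smoke_test_env.py -- drive the environment with random actions
from robot_gazebo_env import RobotArmGazeboEnv

env = RobotArmGazeboEnv(robot_name='aubo_i5')
obs, _ = env.reset()
print("observation shape:", obs.shape)  # expect (18,)

for t in range(20):
    action = env.action_space.sample()
    obs, reward, terminated, truncated, _ = env.step(action)
    print(f"step {t}: reward = {reward:.3f}")
    if terminated or truncated:
        break

env.close()
```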

🧠 Step 3: PPO Implementation

```python
#!/usr/bin/env python3
# ppo_robot_arm.py

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np

class ActorNetwork(nn.Module):
    """Policy network"""
    def __init__(self, state_dim, action_dim, hidden_dim=256):
        super().__init__()
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, hidden_dim)
        
        # Action mean
        self.mean = nn.Linear(hidden_dim, action_dim)
        # Log standard deviation (a learnable parameter)
        self.log_std = nn.Parameter(torch.zeros(action_dim))
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        mean = self.mean(x)
        std = self.log_std.exp()
        return mean, std

class CriticNetwork(nn.Module):
    """价值网络"""
    def __init__(self, state_dim, hidden_dim=256):
        super().__init__()
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, 1)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

class PPOAgent:
    """PPO智能体"""
    def __init__(self, state_dim, action_dim, lr=3e-4, gamma=0.99, 
                 clip_epsilon=0.2, epochs=10, batch_size=64):
        
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"使用设备: {self.device}")
        
        self.actor = ActorNetwork(state_dim, action_dim).to(self.device)
        self.critic = CriticNetwork(state_dim).to(self.device)
        self.optimizer = optim.Adam(
            list(self.actor.parameters()) + list(self.critic.parameters()), 
            lr=lr
        )
        
        self.gamma = gamma
        self.clip_epsilon = clip_epsilon
        self.epochs = epochs
        self.batch_size = batch_size
        
        # Rollout buffer (cleared after each update)
        self.states = []
        self.actions = []
        self.rewards = []
        self.next_states = []
        self.dones = []
        self.log_probs = []
        
    def get_action(self, state, deterministic=False):
        """选择动作"""
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        
        with torch.no_grad():
            mean, std = self.actor(state)
            
        if deterministic:
            # At test time, use the mean action
            action = mean
            log_prob = None
        else:
            # During training, sample from the policy
            dist = torch.distributions.Normal(mean, std)
            action = dist.sample()
            log_prob = dist.log_prob(action).sum(dim=-1)
            
        return action.cpu().numpy()[0], log_prob
    
    def store_transition(self, state, action, reward, next_state, done, log_prob):
        """存储经验"""
        self.states.append(state)
        self.actions.append(action)
        self.rewards.append(reward)
        self.next_states.append(next_state)
        self.dones.append(done)
        self.log_probs.append(log_prob)
    
    def compute_gae(self, rewards, values, next_values, dones, gamma=0.99, lam=0.95):
        """Generalized Advantage Estimation (GAE).
        
        delta_t = r_t + gamma * V(s_{t+1}) * (1 - done_t) - V(s_t)
        A_t     = delta_t + gamma * lam * (1 - done_t) * A_{t+1}
        """
        advantages = []
        gae = 0
        
        for t in reversed(range(len(rewards))):
            if t == len(rewards) - 1:
                delta = rewards[t] + gamma * next_values[t] * (1 - dones[t]) - values[t]
            else:
                delta = rewards[t] + gamma * values[t+1] * (1 - dones[t]) - values[t]
            
            gae = delta + gamma * lam * (1 - dones[t]) * gae
            advantages.insert(0, gae)
        
        return advantages
    
    def update(self):
        """Update the policy"""
        if len(self.states) < self.batch_size:
            return
        
        # Convert to tensors
        states = torch.FloatTensor(np.array(self.states)).to(self.device)
        actions = torch.FloatTensor(np.array(self.actions)).to(self.device)
        rewards = torch.FloatTensor(np.array(self.rewards)).unsqueeze(1).to(self.device)
        next_states = torch.FloatTensor(np.array(self.next_states)).to(self.device)
        dones = torch.FloatTensor(np.array(self.dones)).unsqueeze(1).to(self.device)
        old_log_probs = torch.FloatTensor(np.array(self.log_probs)).unsqueeze(1).to(self.device)
        
        # Compute values and advantages
        with torch.no_grad():
            values = self.critic(states)
            next_values = self.critic(next_states)
            
        advantages = self.compute_gae(
            rewards.cpu().numpy().flatten(),
            values.cpu().numpy().flatten(),
            next_values.cpu().numpy().flatten(),
            dones.cpu().numpy().flatten()
        )
        advantages = torch.FloatTensor(advantages).unsqueeze(1).to(self.device)
        
        # Compute returns
        returns = advantages + values
        
        # PPO update
        for _ in range(self.epochs):
            # Shuffle and iterate over minibatches
            indices = np.random.permutation(len(states))
            for start in range(0, len(states), self.batch_size):
                end = start + self.batch_size
                batch_indices = indices[start:end]
                if len(batch_indices) < 2:
                    continue  # a single-sample batch makes the std below NaN
                
                batch_states = states[batch_indices]
                batch_actions = actions[batch_indices]
                batch_returns = returns[batch_indices]
                batch_advantages = advantages[batch_indices]
                batch_old_log_probs = old_log_probs[batch_indices]
                
                # Normalize the advantages
                batch_advantages = (batch_advantages - batch_advantages.mean()) / (batch_advantages.std() + 1e-8)
                
                # Log-probabilities under the current policy
                mean, std = self.actor(batch_states)
                dist = torch.distributions.Normal(mean, std)
                new_log_probs = dist.log_prob(batch_actions).sum(dim=-1, keepdim=True)
                
                # Probability ratio
                ratio = torch.exp(new_log_probs - batch_old_log_probs)
                
                # PPO clipped surrogate objective
                surr1 = ratio * batch_advantages
                surr2 = torch.clamp(ratio, 1 - self.clip_epsilon, 1 + self.clip_epsilon) * batch_advantages
                actor_loss = -torch.min(surr1, surr2).mean()
                
                # Value loss
                values_pred = self.critic(batch_states)
                critic_loss = F.mse_loss(values_pred, batch_returns)
                
                # Total loss (an entropy bonus could also be added here)
                total_loss = actor_loss + 0.5 * critic_loss
                
                # Backpropagate
                self.optimizer.zero_grad()
                total_loss.backward()
                self.optimizer.step()
        
        # Clear the rollout buffer
        self.states = []
        self.actions = []
        self.rewards = []
        self.next_states = []
        self.dones = []
        self.log_probs = []
    
    def save(self, path):
        """Save the model"""
        torch.save({
            'actor': self.actor.state_dict(),
            'critic': self.critic.state_dict(),
            'optimizer': self.optimizer.state_dict()
        }, path)
        print(f"Model saved to {path}")
    
    def load(self, path):
        """Load the model"""
        checkpoint = torch.load(path, map_location=self.device)
        self.actor.load_state_dict(checkpoint['actor'])
        self.critic.load_state_dict(checkpoint['critic'])
        self.optimizer.load_state_dict(checkpoint['optimizer'])
        print(f"Model loaded from {path}")
```

🚀 Step 4: Training Script

```python
#!/usr/bin/env python3
# train_robot_arm.py

import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

from robot_gazebo_env import RobotArmGazeboEnv
from ppo_robot_arm import PPOAgent

def train():
    """Main training loop"""
    
    # Create the environment
    env = RobotArmGazeboEnv(robot_name='aubo_i5')
    
    # Get state and action dimensions
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    
    print(f"State dim: {state_dim}")
    print(f"Action dim: {action_dim}")
    
    # Create the PPO agent
    agent = PPOAgent(
        state_dim=state_dim,
        action_dim=action_dim,
        lr=3e-4,
        gamma=0.99,
        clip_epsilon=0.2,
        epochs=10,
        batch_size=64
    )
    
    # Training parameters
    num_episodes = 2000
    max_steps_per_episode = 200
    save_interval = 100
    
    # Training statistics
    episode_rewards = []
    success_rates = []
    best_reward = -float('inf')
    
    print("\nStarting training...")
    print("="*60)
    
    for episode in tqdm(range(num_episodes)):
        state, _ = env.reset()
        episode_reward = 0
        episode_success = False
        
        for step in range(max_steps_per_episode):
            # Select an action
            action, log_prob = agent.get_action(state)
            
            # Execute it
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            
            # Store the transition
            if log_prob is not None:
                agent.store_transition(state, action, reward, next_state, done, log_prob.item())
            
            episode_reward += reward
            state = next_state
            
            if done:
                episode_success = terminated
                break
        
        episode_rewards.append(episode_reward)
        
        # Update the policy
        agent.update()
        
        # Record success
        if episode_success:
            success_rates.append(1)
        else:
            success_rates.append(0)
        
        # Save the best model
        if episode_reward > best_reward:
            best_reward = episode_reward
            agent.save("best_model.pth")
        
        # Periodic checkpointing and logging
        if (episode + 1) % save_interval == 0:
            avg_reward = np.mean(episode_rewards[-100:])
            avg_success = np.mean(success_rates[-100:])
            
            print(f"\nEpisode {episode + 1}/{num_episodes}")
            print(f"Mean reward (last 100 episodes): {avg_reward:.2f}")
            print(f"Success rate (last 100 episodes): {avg_success:.2%}")
            print(f"Best reward: {best_reward:.2f}")
            
            # Save a checkpoint
            agent.save(f"checkpoint_ep{episode+1}.pth")
            
            # Plot training curves
            plot_training_curve(episode_rewards, success_rates, episode + 1)
    
    print("\nTraining finished!")
    print(f"Final mean reward: {np.mean(episode_rewards[-100:]):.2f}")
    print(f"Final success rate: {np.mean(success_rates[-100:]):.2%}")
    
    # Save the final model
    agent.save("final_model.pth")
    
    # Plot the final curves
    plot_training_curve(episode_rewards, success_rates, num_episodes)
    
    env.close()

def plot_training_curve(rewards, success_rates, episode):
    """绘制训练曲线"""
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 8))
    
    # Reward curve
    ax1.plot(rewards, alpha=0.3, color='blue')
    if len(rewards) > 100:
        moving_avg = np.convolve(rewards, np.ones(100)/100, mode='valid')
        ax1.plot(range(99, len(rewards)), moving_avg, color='red', linewidth=2)
    ax1.set_xlabel('Episode')
    ax1.set_ylabel('Reward')
    ax1.set_title('Training Rewards')
    ax1.grid(True, alpha=0.3)
    
    # Success-rate curve
    if len(success_rates) > 100:
        success_ma = np.convolve(success_rates, np.ones(100)/100, mode='valid')
        ax2.plot(range(99, len(success_rates)), success_ma, color='green', linewidth=2)
    ax2.set_xlabel('Episode')
    ax2.set_ylabel('Success Rate')
    ax2.set_title('Success Rate')
    ax2.grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.savefig(f'training_curve_ep{episode}.png')
    plt.close()

def test_model(model_path="best_model.pth", num_episodes=10):
    """测试训练好的模型"""
    print("\n测试训练好的模型...")
    
    env = RobotArmGazeboEnv(robot_name='aubo_i5', render_mode='human')
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    
    agent = PPOAgent(state_dim, action_dim)
    agent.load(model_path)
    
    success_count = 0
    
    for episode in range(num_episodes):
        state, _ = env.reset()
        episode_reward = 0
        steps = 0
        
        while steps < 200:
            action, _ = agent.get_action(state, deterministic=True)
            next_state, reward, terminated, truncated, _ = env.step(action)
            
            state = next_state
            episode_reward += reward
            steps += 1
            
            if terminated:
                success_count += 1
                break
        
        print(f"测试回合 {episode + 1}: 奖励 = {episode_reward:.2f}, 步数 = {steps}, 成功 = {terminated}")
    
    print(f"\n测试成功率: {success_count/num_episodes:.2%}")
    env.close()

if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', type=str, default='train', choices=['train', 'test'])
    parser.add_argument('--model', type=str, default='best_model.pth')
    args = parser.parse_args()
    
    if args.mode == 'train':
        train()
    else:
        test_model(args.model)
```

🔧 Step 5: Gazebo Launch Configuration

5.1 Create the Launch File

```python
<!-- robot_arm.launch.py -->
from launch import LaunchDescription
from launch_ros.actions import Node

def generate_launch_description():
    return LaunchDescription([
        # 启动Gazebo
        Node(
            package='gazebo_ros',
            executable='gzserver',
            arguments=['-s', 'libgazebo_ros_factory.so'],
            output='screen'
        ),
        Node(
            package='gazebo_ros',
            executable='gzclient',
            output='screen'
        ),
        
        # 加载机械臂模型
        Node(
            package='gazebo_ros',
            executable='spawn_entity.py',
            arguments=['-entity', 'aubo_i5', 
                      '-file', '$(find aubo_description)/urdf/aubo_i5.urdf'],
            output='screen'
        ),
        
        # 启动关节控制器
        Node(
            package='controller_manager',
            executable='spawner',
            arguments=['joint_state_controller', 'position_controller'],
            output='screen'
        ),
        
        # 启动训练节点
        Node(
            package='robot_arm_training',
            executable='train_robot_arm.py',
            output='screen'
        )
    ])

5.2 Run Training

```bash
# 1. Start Gazebo and the arm
ros2 launch robot_arm_training robot_arm.launch.py

# 2. In another terminal, start training
python3 train_robot_arm.py --mode train

# 3. Test the trained model
python3 train_robot_arm.py --mode test --model best_model.pth
```

📊 Step 6: Performance Optimization Tips

6.1 Reward-Function Tuning

```python
def compute_enhanced_reward(self):
    # Note: self.last_action and self.last_distance are extra bookkeeping;
    # see the sketch below for where to maintain them.
    dist_to_target = np.linalg.norm(self.ee_position - self.target_position)
    
    # Sparse + dense reward combination
    sparse_reward = 100 if dist_to_target < 0.05 else -1
    dense_reward = -dist_to_target
    
    # Action-smoothness penalty
    action_penalty = 0.01 * np.linalg.norm(self.last_action)
    
    # Potential-style shaping: reward progress toward the target
    potential_reward = -2 * (dist_to_target - self.last_distance)
    self.last_distance = dist_to_target
    
    return sparse_reward + dense_reward + potential_reward - action_penalty
```
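This shaped reward reads two attributes the base environment does not maintain. A minimal sketch of the bookkeeping to add (attribute names match the snippet above):

```python
# In RobotArmGazeboEnv.reset(), after the arm has settled:
self.last_action = np.zeros(self.num_joints)
self.last_distance = np.linalg.norm(self.ee_position - self.target_position)

# In RobotArmGazeboEnv.step(), right after receiving `action`:
self.last_action = np.asarray(action)
```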

6.2 Training Tips

  1. Curriculum learning: start with an easy task (fixed target), then move to harder ones (randomized targets)

  2. Experience replay: prioritized experience replay (PER) can improve sample efficiency (note that PPO is on-policy, so this mainly applies if you switch to an off-policy algorithm)

  3. Parallel environments: run several Gazebo instances to collect experience in parallel

  4. Action smoothing: apply a low-pass filter to the actions, as sketched below
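For tip 4, a first-order low-pass filter is easy to drop in as a gymnasium ActionWrapper; a minimal sketch (the smoothing factor alpha is a tuning knob, not a value from this tutorial):

```python
import gymnasium as gym
import numpy as np

class ActionLowPassFilter(gym.ActionWrapper):
    """First-order low-pass filter: a_t = alpha * a_raw + (1 - alpha) * a_{t-1}."""

    def __init__(self, env, alpha=0.5):
        super().__init__(env)
        self.alpha = alpha
        self.prev_action = np.zeros(env.action_space.shape, dtype=np.float32)

    def reset(self, **kwargs):
        self.prev_action[:] = 0.0  # forget the previous episode's action
        return self.env.reset(**kwargs)

    def action(self, action):
        smoothed = self.alpha * np.asarray(action) + (1 - self.alpha) * self.prev_action
        self.prev_action = smoothed
        return smoothed

# Usage:
# env = ActionLowPassFilter(RobotArmGazeboEnv(), alpha=0.5)
```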

6.3 Troubleshooting

| Problem | Solution |
| --- | --- |
| Gazebo fails to start | Check the environment: `source /usr/share/gazebo/setup.sh` |
| Arm does not respond to commands | Verify the controllers loaded: `ros2 control list_controllers` |
| Training does not converge | Rework the reward function; add a sparse success bonus |
| Simulation runs slowly | Lower `max_step_size`; run Gazebo headless |