Gazebo中使用PPO训练机械臂是一个复杂的系统工程。我来提供一个完整的、可操作的实现方案,从环境搭建到训练部署。
📋 整体架构图
text
┌─────────────────────────────────────────────────────────────┐
│ 训练系统架构 │
├─────────────────────────────────────────────────────────────┤
│ Gazebo仿真环境 ↔ Python训练脚本 │
│ ┌─────────────────┐ ┌─────────────────┐ │
│ │ 机械臂URDF模型 │ │ PPO算法实现 │ │
│ │ 物理引擎 │ ←─── ROS 2 ───→ │ 神经网络 │ │
│ │ 传感器数据 │ (或gym接口) │ 经验回放 │ │
│ │ 障碍物/目标物体 │ │ 模型保存/加载 │ │
│ └─────────────────┘ └─────────────────┘ │
└─────────────────────────────────────────────────────────────┘
🛠️ 第一步:环境搭建
1.1 安装必要软件
bash
# 1. 安装Gazebo(建议用Harmonic版本)
sudo apt install gz-harmonic
# 2. 安装ROS 2 Humble(用于通信)
sudo apt install ros-humble-desktop
# 注意:gazebo-ros-pkgs 仅适用于 Gazebo Classic;与上面安装的 gz-harmonic 配合,应安装 ros_gz 桥接包:
sudo apt install ros-humble-ros-gzharmonic
# 3. 安装Python依赖
pip install torch torchvision torchaudio
pip install gymnasium pybullet stable-baselines3
pip install numpy matplotlib tqdm
1.2 准备机械臂URDF模型
bash
# 下载URDF模型(以AUBO i5为例)
mkdir -p ~/robot_ws/src
cd ~/robot_ws/src
git clone https://github.com/auboliujie/aubo_robot.git
# 编译
cd ~/robot_ws
colcon build
source install/setup.bash
🎯 第二步:创建训练环境接口
2.1 Gymnasium风格环境包装器
python
#!/usr/bin/env python3
# robot_gazebo_env.py
import gymnasium as gym
from gymnasium import spaces
import numpy as np
import rclpy
from rclpy.node import Node
from sensor_msgs.msg import JointState
from geometry_msgs.msg import Pose, Point
from std_msgs.msg import Float64MultiArray
import time
class RobotArmGazeboEnv(gym.Env):
    """Gazebo robot-arm reaching environment with the Gymnasium API.

    Observation (18-D): 6 joint positions + 6 joint velocities +
    3-D target position + 3-D end-effector position.
    Action (6-D): per-joint angle increments in [-0.1, 0.1] rad.
    Talks to the simulated arm over ROS 2 topics.
    """

    def __init__(self, robot_name='aubo_i5', render_mode=None):
        super().__init__()
        # Spin up a dedicated ROS 2 node for this environment.
        rclpy.init(args=None)
        self.node = Node('robot_arm_env')
        # Robot parameters.
        self.robot_name = robot_name
        self.render_mode = render_mode
        self.num_joints = 6
        self.joint_names = [f'joint_{i+1}' for i in range(self.num_joints)]
        # Joint limits (radians).
        self.joint_limits = {
            'min': np.array([-3.14, -2.0, -2.5, -3.14, -2.0, -3.14]),
            'max': np.array([3.14, 2.0, 2.5, 3.14, 2.0, 3.14])
        }
        # Action space: 6 joint-angle increments in [-0.1, 0.1] rad.
        self.action_space = spaces.Box(
            low=-0.1, high=0.1, shape=(self.num_joints,), dtype=np.float32
        )
        # Observation space: 12 (joint pos + vel) + 3 target + 3 end-effector.
        # Obstacle information could be appended here if needed.
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(12 + 3 + 3,), dtype=np.float32
        )
        # ROS 2 communication: joint-state feedback in, position commands out.
        self.joint_state_sub = self.node.create_subscription(
            JointState, '/joint_states', self.joint_state_callback, 10
        )
        self.arm_pub = self.node.create_publisher(
            Float64MultiArray, '/position_controller/commands', 10
        )
        # Cached state.
        self.current_joint_positions = np.zeros(self.num_joints)
        self.current_joint_velocities = np.zeros(self.num_joints)
        self.target_position = np.array([0.5, 0.0, 0.5])  # reaching target
        # End-effector position (filled in via forward kinematics).
        self.ee_position = np.zeros(3)
        # FIX: initialize the success flag so step()/compute_reward() are
        # well-defined even if called before the first reset().
        self.success = False
        # Control period: 0.1 s (100 ms).
        self.dt = 0.1

    def joint_state_callback(self, msg):
        """Cache the latest joint positions/velocities from /joint_states.

        FIX: the message arrays must be indexed with the message-order
        index ``i``; the original used ``idx`` (the local joint order),
        which silently scrambles readings whenever /joint_states lists
        joints in a different order than ``self.joint_names``.
        """
        for i, name in enumerate(msg.name):
            if name in self.joint_names:
                idx = self.joint_names.index(name)
                self.current_joint_positions[idx] = msg.position[i]
                # Some publishers omit velocities; guard against short arrays.
                if i < len(msg.velocity):
                    self.current_joint_velocities[idx] = msg.velocity[i]

    def compute_forward_kinematics(self, joint_angles):
        """Return the (x, y, z) end-effector position (simplified model).

        NOTE(review): this is a 3-link approximation; a real deployment
        must use the arm's full DH parameters — confirm before relying on
        the computed positions.
        """
        l1, l2, l3 = 0.3, 0.3, 0.2  # link lengths (m)
        theta1, theta2, theta3, theta4, theta5, theta6 = joint_angles
        # Simplified planar-chain geometry rotated about the base joint.
        x = l2 * np.cos(theta1) * np.cos(theta2) + l3 * np.cos(theta1) * np.cos(theta2 + theta3)
        y = l2 * np.sin(theta1) * np.cos(theta2) + l3 * np.sin(theta1) * np.cos(theta2 + theta3)
        z = l1 + l2 * np.sin(theta2) + l3 * np.sin(theta2 + theta3)
        return np.array([x, y, z])

    def get_state(self):
        """Assemble and return the 18-D observation vector (float32)."""
        # Refresh the end-effector estimate from the cached joint angles.
        self.ee_position = self.compute_forward_kinematics(self.current_joint_positions)
        state = np.concatenate([
            self.current_joint_positions,   # 6
            self.current_joint_velocities,  # 6
            self.target_position,           # 3
            self.ee_position                # 3
        ])
        return state.astype(np.float32)

    def compute_reward(self):
        """Dense negative-distance reward + success bonus + limit penalties."""
        dist_to_target = np.linalg.norm(self.ee_position - self.target_position)
        # Base reward: negative distance to the target.
        reward = -dist_to_target
        # Success bonus: within 5 cm of the target counts as reached.
        if dist_to_target < 0.05:
            reward += 100
            self.success = True
        # Penalize joint-limit violations.
        for i in range(self.num_joints):
            if self.current_joint_positions[i] < self.joint_limits['min'][i]:
                reward -= 10
            if self.current_joint_positions[i] > self.joint_limits['max'][i]:
                reward -= 10
        return reward

    def step(self, action):
        """Apply a joint-increment action; return (obs, reward, terminated, truncated, info)."""
        # Action is an increment on the current joint angles, clipped to limits.
        target_positions = self.current_joint_positions + action
        target_positions = np.clip(target_positions,
                                   self.joint_limits['min'],
                                   self.joint_limits['max'])
        # Publish the position command.
        cmd_msg = Float64MultiArray()
        cmd_msg.data = target_positions.tolist()
        self.arm_pub.publish(cmd_msg)
        # Wait one control period, then pump ROS callbacks for fresh state.
        time.sleep(self.dt)
        rclpy.spin_once(self.node, timeout_sec=0.1)
        next_state = self.get_state()
        reward = self.compute_reward()
        # Episode terminates on success; no time-limit truncation here
        # (the training loop enforces max steps externally).
        terminated = self.success
        truncated = False
        return next_state, reward, terminated, truncated, {}

    def reset(self, seed=None, options=None):
        """Reset joints to a random mid-range pose; occasionally move the target.

        ``options`` is accepted for Gymnasium API compatibility and is
        currently unused.
        """
        super().reset(seed=seed)
        # Random initial joint angles, restricted to the middle of the range.
        random_angles = np.random.uniform(
            self.joint_limits['min'] * 0.5,
            self.joint_limits['max'] * 0.5,
            self.num_joints
        )
        cmd_msg = Float64MultiArray()
        cmd_msg.data = random_angles.tolist()
        self.arm_pub.publish(cmd_msg)
        time.sleep(1.0)  # give the arm time to reach the commanded pose
        # Occasionally (30%) randomize the target position for curriculum variety.
        if np.random.random() < 0.3:
            self.target_position = np.random.uniform([0.3, -0.3, 0.3],
                                                     [0.7, 0.3, 0.7])
        self.success = False
        # Pump callbacks once so the cached joint state is fresh.
        rclpy.spin_once(self.node, timeout_sec=0.1)
        state = self.get_state()
        return state, {}

    def close(self):
        """Destroy the ROS 2 node and shut down rclpy."""
        self.node.destroy_node()
        rclpy.shutdown()
🧠 第三步:PPO算法实现
python
#!/usr/bin/env python3
# ppo_robot_arm.py
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
from collections import deque
import random
class ActorNetwork(nn.Module):
    """Gaussian policy: maps a state to an action mean plus a learned std.

    The std is state-independent: a learnable per-dimension log-std
    parameter, exponentiated on each forward pass.
    """

    def __init__(self, state_dim, action_dim, hidden_dim=256):
        super().__init__()
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, hidden_dim)
        # Head producing the action mean.
        self.mean = nn.Linear(hidden_dim, action_dim)
        # Learnable log standard deviation (initialized to 0 -> std = 1).
        self.log_std = nn.Parameter(torch.zeros(action_dim))

    def forward(self, x):
        hidden = x
        # Three ReLU-activated hidden layers.
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = F.relu(layer(hidden))
        return self.mean(hidden), self.log_std.exp()
class CriticNetwork(nn.Module):
    """State-value network: maps a state to a single scalar V(s)."""

    def __init__(self, state_dim, hidden_dim=256):
        super().__init__()
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        # Two ReLU hidden layers, then a linear value head.
        activation = F.relu(self.fc2(F.relu(self.fc1(x))))
        return self.fc3(activation)
class PPOAgent:
    """PPO agent: Gaussian actor + value critic trained from on-policy rollouts."""

    def __init__(self, state_dim, action_dim, lr=3e-4, gamma=0.99,
                 clip_epsilon=0.2, epochs=10, batch_size=64):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"使用设备: {self.device}")
        self.actor = ActorNetwork(state_dim, action_dim).to(self.device)
        self.critic = CriticNetwork(state_dim).to(self.device)
        # Single optimizer over both networks (joint actor+critic loss).
        self.optimizer = optim.Adam(
            list(self.actor.parameters()) + list(self.critic.parameters()),
            lr=lr
        )
        self.gamma = gamma
        self.clip_epsilon = clip_epsilon
        self.epochs = epochs
        self.batch_size = batch_size
        # On-policy rollout buffers, cleared after each update().
        self.states = []
        self.actions = []
        self.rewards = []
        self.next_states = []
        self.dones = []
        self.log_probs = []

    def get_action(self, state, deterministic=False):
        """Return (action, log_prob).

        Deterministic mode (evaluation) returns the policy mean and
        log_prob=None; stochastic mode samples from the Gaussian policy.
        """
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        with torch.no_grad():
            mean, std = self.actor(state)
            if deterministic:
                action = mean
                log_prob = None
            else:
                dist = torch.distributions.Normal(mean, std)
                action = dist.sample()
                log_prob = dist.log_prob(action).sum(dim=-1)
        return action.cpu().numpy()[0], log_prob

    def store_transition(self, state, action, reward, next_state, done, log_prob):
        """Append one transition to the rollout buffers."""
        self.states.append(state)
        self.actions.append(action)
        self.rewards.append(reward)
        self.next_states.append(next_state)
        self.dones.append(done)
        self.log_probs.append(log_prob)

    def compute_gae(self, rewards, values, next_values, dones, gamma=0.99, lam=0.95):
        """Generalized Advantage Estimation over a stored rollout.

        FIX: bootstrap every step from ``next_values[t]`` (the value of the
        actually observed next state). The original used ``values[t+1]`` for
        non-final steps, which is wrong when the buffer spans an episode
        boundary without a terminal flag (e.g. a time-limit cut-off).
        """
        advantages = []
        gae = 0
        for t in reversed(range(len(rewards))):
            # TD residual: r_t + gamma * V(s_{t+1}) * (1 - done) - V(s_t)
            delta = rewards[t] + gamma * next_values[t] * (1 - dones[t]) - values[t]
            gae = delta + gamma * lam * (1 - dones[t]) * gae
            advantages.insert(0, gae)
        return advantages

    def update(self):
        """Run clipped-objective PPO epochs over the stored rollout, then clear it."""
        if len(self.states) < self.batch_size:
            return
        # Stack the buffers into device tensors.
        states = torch.FloatTensor(np.array(self.states)).to(self.device)
        actions = torch.FloatTensor(np.array(self.actions)).to(self.device)
        rewards = torch.FloatTensor(np.array(self.rewards)).unsqueeze(1).to(self.device)
        next_states = torch.FloatTensor(np.array(self.next_states)).to(self.device)
        dones = torch.FloatTensor(np.array(self.dones)).unsqueeze(1).to(self.device)
        old_log_probs = torch.FloatTensor(np.array(self.log_probs)).unsqueeze(1).to(self.device)
        # Value targets are computed without gradients.
        with torch.no_grad():
            values = self.critic(states)
            next_values = self.critic(next_states)
        advantages = self.compute_gae(
            rewards.cpu().numpy().flatten(),
            values.cpu().numpy().flatten(),
            next_values.cpu().numpy().flatten(),
            dones.cpu().numpy().flatten()
        )
        advantages = torch.FloatTensor(advantages).unsqueeze(1).to(self.device)
        # Returns = advantage + value baseline (critic regression target).
        returns = advantages + values
        for _ in range(self.epochs):
            # Fresh shuffle of the rollout each epoch.
            indices = np.random.permutation(len(states))
            for start in range(0, len(states), self.batch_size):
                end = start + self.batch_size
                batch_indices = indices[start:end]
                batch_states = states[batch_indices]
                batch_actions = actions[batch_indices]
                batch_returns = returns[batch_indices]
                batch_advantages = advantages[batch_indices]
                batch_old_log_probs = old_log_probs[batch_indices]
                # Normalize advantages per minibatch for stable gradients.
                batch_advantages = (batch_advantages - batch_advantages.mean()) / (batch_advantages.std() + 1e-8)
                # Log-probabilities under the current policy.
                mean, std = self.actor(batch_states)
                dist = torch.distributions.Normal(mean, std)
                new_log_probs = dist.log_prob(batch_actions).sum(dim=-1, keepdim=True)
                # Probability ratio and clipped surrogate objective.
                ratio = torch.exp(new_log_probs - batch_old_log_probs)
                surr1 = ratio * batch_advantages
                surr2 = torch.clamp(ratio, 1 - self.clip_epsilon, 1 + self.clip_epsilon) * batch_advantages
                actor_loss = -torch.min(surr1, surr2).mean()
                # Critic regression toward the computed returns.
                values_pred = self.critic(batch_states)
                critic_loss = F.mse_loss(values_pred, batch_returns)
                total_loss = actor_loss + 0.5 * critic_loss
                self.optimizer.zero_grad()
                total_loss.backward()
                # FIX: clip the global gradient norm to stabilize updates.
                torch.nn.utils.clip_grad_norm_(
                    list(self.actor.parameters()) + list(self.critic.parameters()),
                    0.5
                )
                self.optimizer.step()
        # Discard the now-stale on-policy data.
        self.states = []
        self.actions = []
        self.rewards = []
        self.next_states = []
        self.dones = []
        self.log_probs = []

    def save(self, path):
        """Save actor/critic/optimizer state dicts to ``path``."""
        torch.save({
            'actor': self.actor.state_dict(),
            'critic': self.critic.state_dict(),
            'optimizer': self.optimizer.state_dict()
        }, path)
        print(f"模型已保存到 {path}")

    def load(self, path):
        """Load a checkpoint written by save().

        FIX: ``map_location`` lets a GPU-trained checkpoint load on a
        CPU-only machine (and vice versa).
        """
        checkpoint = torch.load(path, map_location=self.device)
        self.actor.load_state_dict(checkpoint['actor'])
        self.critic.load_state_dict(checkpoint['critic'])
        self.optimizer.load_state_dict(checkpoint['optimizer'])
        print(f"模型已从 {path} 加载")
🚀 第四步:训练脚本
python
#!/usr/bin/env python3
# train_robot_arm.py
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import time
import os
from robot_gazebo_env import RobotArmGazeboEnv
from ppo_robot_arm import PPOAgent
def train():
    """Main PPO training loop for the Gazebo arm-reaching task.

    Side effects: writes model checkpoints (best_model.pth, periodic
    checkpoint_ep*.pth, final_model.pth) and training-curve PNGs to the
    current working directory, and prints progress to stdout.
    """
    # Create the Gazebo environment.
    env = RobotArmGazeboEnv(robot_name='aubo_i5')
    # Read state/action dimensions from the env's spaces.
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    print(f"状态维度: {state_dim}")
    print(f"动作维度: {action_dim}")
    # Build the PPO agent.
    agent = PPOAgent(
        state_dim=state_dim,
        action_dim=action_dim,
        lr=3e-4,
        gamma=0.99,
        clip_epsilon=0.2,
        epochs=10,
        batch_size=64
    )
    # Training hyperparameters.
    num_episodes = 2000
    max_steps_per_episode = 200
    save_interval = 100
    # Training statistics.
    episode_rewards = []
    success_rates = []
    best_reward = -float('inf')
    print("\n开始训练...")
    print("="*60)
    for episode in tqdm(range(num_episodes)):
        state, _ = env.reset()
        episode_reward = 0
        episode_success = False
        for step in range(max_steps_per_episode):
            # Sample an action from the current stochastic policy.
            action, log_prob = agent.get_action(state)
            # Step the simulation.
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            # Store the transition (deterministic actions carry no log-prob).
            if log_prob is not None:
                agent.store_transition(state, action, reward, next_state, done, log_prob.item())
            episode_reward += reward
            state = next_state
            if done:
                # terminated means the goal was reached; truncated does not.
                episode_success = terminated
                break
        episode_rewards.append(episode_reward)
        # One PPO update per episode rollout.
        agent.update()
        # Record success (1) / failure (0) for the moving success rate.
        if episode_success:
            success_rates.append(1)
        else:
            success_rates.append(0)
        # Keep the best-scoring model seen so far.
        if episode_reward > best_reward:
            best_reward = episode_reward
            agent.save("best_model.pth")
        # Periodic logging, checkpointing, and curve plotting.
        if (episode + 1) % save_interval == 0:
            avg_reward = np.mean(episode_rewards[-100:])
            avg_success = np.mean(success_rates[-100:])
            print(f"\n回合 {episode + 1}/{num_episodes}")
            print(f"最近100回合平均奖励: {avg_reward:.2f}")
            print(f"最近100回合成功率: {avg_success:.2%}")
            print(f"最佳奖励: {best_reward:.2f}")
            # Save a checkpoint for this interval.
            agent.save(f"checkpoint_ep{episode+1}.pth")
            # Refresh the training-curve plot.
            plot_training_curve(episode_rewards, success_rates, episode + 1)
    print("\n训练完成!")
    print(f"最终平均奖励: {np.mean(episode_rewards[-100:]):.2f}")
    print(f"最终成功率: {np.mean(success_rates[-100:]):.2%}")
    # Save the final model and plot the full curves.
    agent.save("final_model.pth")
    plot_training_curve(episode_rewards, success_rates, num_episodes)
    env.close()
def plot_training_curve(rewards, success_rates, episode):
    """Plot per-episode rewards and the 100-episode moving success rate.

    Saves the figure to ``training_curve_ep{episode}.png`` and closes it.
    """
    fig, (ax_reward, ax_success) = plt.subplots(2, 1, figsize=(12, 8))
    window = 100
    # Raw rewards, plus a moving average once enough episodes exist.
    ax_reward.plot(rewards, alpha=0.3, color='blue')
    if len(rewards) > window:
        kernel = np.ones(window) / window
        smoothed = np.convolve(rewards, kernel, mode='valid')
        ax_reward.plot(range(window - 1, len(rewards)), smoothed, color='red', linewidth=2)
    ax_reward.set_xlabel('Episode')
    ax_reward.set_ylabel('Reward')
    ax_reward.set_title('Training Rewards')
    ax_reward.grid(True, alpha=0.3)
    # Moving-average success rate (only drawn once the window is filled).
    if len(success_rates) > window:
        kernel = np.ones(window) / window
        rate_ma = np.convolve(success_rates, kernel, mode='valid')
        ax_success.plot(range(window - 1, len(success_rates)), rate_ma, color='green', linewidth=2)
    ax_success.set_xlabel('Episode')
    ax_success.set_ylabel('Success Rate')
    ax_success.set_title('Success Rate')
    ax_success.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(f'training_curve_ep{episode}.png')
    plt.close()
def test_model(model_path="best_model.pth", num_episodes=10):
    """Evaluate a trained policy in Gazebo using deterministic actions."""
    print("\n测试训练好的模型...")
    env = RobotArmGazeboEnv(robot_name='aubo_i5', render_mode='human')
    # Build an agent with matching dimensions and restore the weights.
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    agent = PPOAgent(obs_dim, act_dim)
    agent.load(model_path)
    success_count = 0
    for episode in range(num_episodes):
        state, _ = env.reset()
        episode_reward = 0
        terminated = False
        steps = 0
        # Roll out at most 200 deterministic steps per episode.
        while steps < 200:
            action, _ = agent.get_action(state, deterministic=True)
            state, reward, terminated, truncated, _ = env.step(action)
            episode_reward += reward
            steps += 1
            if terminated:
                success_count += 1
                break
        print(f"测试回合 {episode + 1}: 奖励 = {episode_reward:.2f}, 步数 = {steps}, 成功 = {terminated}")
    print(f"\n测试成功率: {success_count/num_episodes:.2%}")
    env.close()
if __name__ == "__main__":
    import argparse

    # Command-line entry point: choose training or evaluation mode.
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', type=str, default='train', choices=['train', 'test'])
    parser.add_argument('--model', type=str, default='best_model.pth')
    args = parser.parse_args()
    if args.mode == 'train':
        train()
    else:
        # Evaluate the model whose path was given by --model.
        test_model(args.model)
🔧 第五步:Gazebo启动配置
5.1 创建启动文件
python
# robot_arm.launch.py(注意:ROS 2 launch 文件是 Python 脚本,不是 XML)
from launch import LaunchDescription
from launch_ros.actions import Node
def generate_launch_description():
    """Launch Gazebo, spawn the arm, start controllers, and run training."""
    return LaunchDescription([
        # Gazebo server (with the ROS factory plugin) and client GUI.
        Node(
            package='gazebo_ros',
            executable='gzserver',
            arguments=['-s', 'libgazebo_ros_factory.so'],
            output='screen'
        ),
        Node(
            package='gazebo_ros',
            executable='gzclient',
            output='screen'
        ),
        # Spawn the arm model into the running simulation.
        # NOTE(review): '$(find ...)' is ROS 1 launch substitution syntax and
        # is NOT expanded by ROS 2 launch; this path should be built with
        # get_package_share_directory('aubo_description') — TODO confirm.
        Node(
            package='gazebo_ros',
            executable='spawn_entity.py',
            arguments=['-entity', 'aubo_i5',
                       '-file', '$(find aubo_description)/urdf/aubo_i5.urdf'],
            output='screen'
        ),
        # Load the joint-state broadcaster and position controller.
        Node(
            package='controller_manager',
            executable='spawner',
            arguments=['joint_state_controller', 'position_controller'],
            output='screen'
        ),
        # Start the RL training node.
        Node(
            package='robot_arm_training',
            executable='train_robot_arm.py',
            output='screen'
        )
    ])
5.2 运行训练
bash
# 1. 启动Gazebo和机械臂
ros2 launch robot_arm_training robot_arm.launch.py
# 2. 在另一个终端启动训练
python3 train_robot_arm.py --mode train
# 3. 测试模型
python3 train_robot_arm.py --mode test --model best_model.pth
📊 第六步:性能优化技巧
6.1 奖励函数优化
python
def compute_enhanced_reward(self):
    """Shaped reward: sparse success bonus + dense distance term +
    potential-based progress shaping - action-magnitude penalty.

    NOTE(review): relies on ``self.last_action`` and ``self.last_distance``
    being maintained elsewhere (e.g. updated in step()) — they are not set
    in this snippet; confirm before use.
    """
    dist_to_target = np.linalg.norm(self.ee_position - self.target_position)
    # Sparse success signal (within 5 cm) combined with a dense -distance term.
    sparse_reward = 100 if dist_to_target < 0.05 else -1
    dense_reward = -dist_to_target
    # Penalize large actions to encourage smooth motion.
    action_penalty = 0.01 * np.linalg.norm(self.last_action)
    # Potential-based shaping: reward reduction in distance since last step.
    potential_reward = -2 * (dist_to_target - self.last_distance)
    return sparse_reward + dense_reward + potential_reward - action_penalty
6.2 训练技巧
-
课程学习:先简单任务(固定目标),后复杂任务(随机目标)
-
经验回放:使用优先经验回放(PER)提高样本效率
-
多环境并行:使用多个Gazebo实例并行训练
-
动作平滑:对动作施加低通滤波
常见问题解决
| 问题 | 解决方案 |
| --- | --- |
| Gazebo启动失败 | 检查环境变量:`source /usr/share/gazebo/setup.sh` |
| 机械臂不受控制 | 检查控制器是否正确加载:`ros2 control list_controllers` |
| 训练不收敛 | 调整奖励函数,增加稀疏奖励 |
| 仿真速度慢 | 降低 `max_step_size`,使用headless模式 |