
Video walkthrough: https://www.bilibili.com/video/BV1bd6sBpEvT/?vd_source=5ba34935b7845cd15c65ef62c64ba82f
Code repository: https://github.com/LitchiCheng/mujoco-learning
In an earlier post, "MuJoCo robot arm PPO reinforcement learning for inverse kinematics (IK)", we saw that as long as the workspace is free of obstacles, any IK solution can be sent to the arm and it will reach the target point.
In practice, however, with human-robot interaction or a dynamically changing environment, any object entering the workspace constrains how the joints may move and can prevent the arm from ever reaching the target. Planning that avoids and detours around obstacles therefore becomes essential.
A traditional path-planning approach was covered earlier in "MuJoCo robot arm joint-space path planning with OMPL RRT for obstacle avoidance". The question here is: with reinforcement learning, how do we get the policy network to learn this behaviour and "master" obstacle avoidance?
First, the observation has to be designed so that the agent can relate the obstacle, the collision state, and the goal position to its reward. The observation space is defined as follows:
# 7 joint angles + 3D goal position + 3D obstacle position + obstacle sphere radius
self.obs_size = 7 + 3 + 3 + 1
self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(self.obs_size,), dtype=np.float32)

def _get_observation(self) -> np.ndarray:
    joint_pos = self.data.qpos[:7].copy().astype(np.float32)
    # the obstacle position is observed with small Gaussian noise to mimic imperfect perception
    return np.concatenate([joint_pos, self.goal_position, self.obstacle_position + np.random.normal(0, 0.001, size=3), np.array([self.obstacle_size], dtype=np.float32)])
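As a quick sanity check of this layout (a minimal usage sketch, assuming the PandaObstacleEnv class from the full listing below has been imported), the observation comes back as a flat 14-dimensional vector:

env = PandaObstacleEnv()
obs, _ = env.reset()
print(obs.shape)  # (14,): 7 joint angles, 3 goal position, 3 noisy obstacle position, 1 obstacle radius
env.close()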
The reward is designed from several considerations:
- the goal position has an accuracy requirement;
- the arm must not hit the obstacle;
- the joints must not exceed their limits;
- the arm should not jitter too much;
- the episode should not take too long.
def _calc_reward(self, joint_angles: np.ndarray, action: np.ndarray) -> tuple[float, float, bool]:
    now_ee_pos = self.data.body(self.end_effector_id).xpos.copy()
    dist_to_goal = np.linalg.norm(now_ee_pos - self.goal_position)
    # tiered, non-linear distance reward
    if dist_to_goal < self.goal_arrival_threshold:
        distance_reward = 20.0 * (1.0 + (1.0 - (dist_to_goal / self.goal_arrival_threshold)))
    elif dist_to_goal < 2 * self.goal_arrival_threshold:
        distance_reward = 10.0 * (1.0 + (1.0 - (dist_to_goal / (2 * self.goal_arrival_threshold))))
    elif dist_to_goal < 3 * self.goal_arrival_threshold:
        distance_reward = 5.0 * (1.0 + (1.0 - (dist_to_goal / (3 * self.goal_arrival_threshold))))
    else:
        distance_reward = 1.0 / (1.0 + dist_to_goal)
    # smoothness penalty on the change in action
    smooth_penalty = 0.001 * np.linalg.norm(action - self.last_action)
    # collision penalty, proportional to the number of contacts
    contact_reward = 10.0 * self.data.ncon
    # joint-limit penalty
    joint_penalty = 0.0
    for i in range(7):
        min_angle, max_angle = self.model.jnt_range[:7][i]
        if joint_angles[i] < min_angle:
            joint_penalty += 0.5 * (min_angle - joint_angles[i])
        elif joint_angles[i] > max_angle:
            joint_penalty += 0.5 * (joint_angles[i] - max_angle)
    # time penalty
    time_penalty = 0.001 * (time.time() - self.start_t)
    total_reward = (distance_reward - contact_reward - smooth_penalty - joint_penalty - time_penalty)
    self.last_action = action.copy()
    return total_reward, dist_to_goal, self.data.ncon > 0
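The tiered distance term is worth inspecting on its own before training. Below is a small standalone sketch (not part of the environment; the threshold value is taken from goal_arrival_threshold = 0.005) that prints the reward for a few sample distances:

# standalone re-implementation of the tiered distance reward, for inspection only
def distance_reward(d, thr=0.005):
    if d < thr:
        return 20.0 * (1.0 + (1.0 - d / thr))
    elif d < 2 * thr:
        return 10.0 * (1.0 + (1.0 - d / (2 * thr)))
    elif d < 3 * thr:
        return 5.0 * (1.0 + (1.0 - d / (3 * thr)))
    return 1.0 / (1.0 + d)

for d in [0.002, 0.006, 0.012, 0.05, 0.3]:
    print(f"dist = {d:.3f} m -> distance_reward = {distance_reward(d):.2f}")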
The network is trained with PPO, using symmetric policy and value networks; the hyperparameters need tuning against the actual setup. In the environment, both the obstacle position and the goal position are randomized on every reset. The full training code follows; set TRAIN_MODE to True to train and to False to test.
import numpy as np
import mujoco
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import SubprocVecEnv
import torch.nn as nn
import warnings
import torch
import mujoco.viewer
import time
from typing import Optional
from scipy.spatial.transform import Rotation as R
# silence the noisy UserWarning emitted by stable-baselines3's on-policy algorithm
warnings.filterwarnings("ignore", category=UserWarning, module="stable_baselines3.common.on_policy_algorithm")
import os
# Flag file in /tmp: only the first environment process that manages to create it
# opens a MuJoCo viewer, so parallel training does not spawn dozens of windows.
def write_flag_file(flag_filename="rl_visu_flag"):
    flag_path = os.path.join("/tmp", flag_filename)
    try:
        with open(flag_path, "w") as f:
            f.write("This is a flag file")
        return True
    except Exception:
        return False

def check_flag_file(flag_filename="rl_visu_flag"):
    flag_path = os.path.join("/tmp", flag_filename)
    return os.path.exists(flag_path)

def delete_flag_file(flag_filename="rl_visu_flag"):
    flag_path = os.path.join("/tmp", flag_filename)
    if not os.path.exists(flag_path):
        return True
    try:
        os.remove(flag_path)
        return True
    except Exception:
        return False
class PandaObstacleEnv(gym.Env):
    def __init__(self, visualize: bool = False):
        super(PandaObstacleEnv, self).__init__()
        # only the first process that creates the flag file gets the viewer
        if not check_flag_file():
            write_flag_file()
            self.visualize = visualize
        else:
            self.visualize = False
        self.handle = None
        self.model = mujoco.MjModel.from_xml_path('./model/franka_emika_panda/scene_pos_with_obstacles.xml')
        self.data = mujoco.MjData(self.model)
        # for i in range(self.model.ngeom):
        #     if self.model.geom_group[i] == 3:
        #         self.model.geom_conaffinity[i] = 0
        if self.visualize:
            self.handle = mujoco.viewer.launch_passive(self.model, self.data)
            self.handle.cam.distance = 3.0
            self.handle.cam.azimuth = 0.0
            self.handle.cam.elevation = -30.0
            self.handle.cam.lookat = np.array([0.2, 0.0, 0.4])
        self.np_random = np.random.default_rng(None)
        self.end_effector_id = mujoco.mj_name2id(self.model, mujoco.mjtObj.mjOBJ_BODY, 'ee_center_body')
        self.home_joint_pos = np.array(self.model.key_qpos[0][:7], dtype=np.float32)
        self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(7,), dtype=np.float32)
        # 7 joint angles, goal position, obstacle position and obstacle sphere radius
        self.obs_size = 7 + 3 + 3 + 1
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(self.obs_size,), dtype=np.float32)
        # self.goal_position = np.array([0.4, 0.3, 0.4], dtype=np.float32)
        self.goal_position = np.array([0.4, -0.3, 0.4], dtype=np.float32)
        self.goal_arrival_threshold = 0.005
        self.goal_visu_size = 0.02
        self.goal_visu_rgba = [0.1, 0.3, 0.3, 0.8]
        # Add the obstacle in the XML by placing the following geom inside <worldbody>:
        # <geom name="obstacle_0"
        #       type="sphere"
        #       size="0.060"
        #       pos="0.300 0.200 0.500"
        #       contype="1"
        #       conaffinity="1"
        #       mass="0.0"
        #       rgba="0.300 0.300 0.300 0.800"
        # />
        # then read the obstacle position and size back here in __init__
        for i in range(self.model.ngeom):
            name = mujoco.mj_id2name(self.model, mujoco.mjtObj.mjOBJ_GEOM, i)
            if name == "obstacle_0":
                self.obstacle_id_1 = i
        self.obstacle_position = np.array(self.model.geom_pos[self.obstacle_id_1], dtype=np.float32)
        self.obstacle_size = self.model.geom_size[self.obstacle_id_1][0]
        self.last_action = self.home_joint_pos
    def _render_scene(self) -> None:
        if not self.visualize or self.handle is None:
            return
        self.handle.user_scn.ngeom = 0
        total_geoms = 1
        self.handle.user_scn.ngeom = total_geoms
        mujoco.mjv_initGeom(
            self.handle.user_scn.geoms[0],
            mujoco.mjtGeom.mjGEOM_SPHERE,
            size=[self.goal_visu_size, 0.0, 0.0],
            pos=self.goal_position,
            mat=np.eye(3).flatten(),
            rgba=np.array(self.goal_visu_rgba, dtype=np.float32)
        )
    def reset(self, seed: Optional[int] = None, options: Optional[dict] = None) -> tuple[np.ndarray, dict]:
        super().reset(seed=seed)
        if seed is not None:
            self.np_random = np.random.default_rng(seed)
        # reset the joints to the home pose
        mujoco.mj_resetData(self.model, self.data)
        self.data.qpos[:7] = self.home_joint_pos
        self.data.qpos[7:] = [0.04, 0.04]
        mujoco.mj_forward(self.model, self.data)
        # randomize the goal and the obstacle along the y axis
        self.goal_position = np.array([self.goal_position[0], self.np_random.uniform(-0.3, 0.3), self.goal_position[2]], dtype=np.float32)
        self.obstacle_position = np.array([self.obstacle_position[0], self.np_random.uniform(-0.3, 0.3), self.obstacle_position[2]], dtype=np.float32)
        self.model.geom_pos[self.obstacle_id_1] = self.obstacle_position
        mujoco.mj_step(self.model, self.data)
        if self.visualize:
            self._render_scene()
        obs = self._get_observation()
        self.start_t = time.time()
        return obs, {}
    def _get_observation(self) -> np.ndarray:
        joint_pos = self.data.qpos[:7].copy().astype(np.float32)
        return np.concatenate([joint_pos, self.goal_position, self.obstacle_position + np.random.normal(0, 0.001, size=3), np.array([self.obstacle_size], dtype=np.float32)])
    def _calc_reward(self, joint_angles: np.ndarray, action: np.ndarray) -> tuple[float, float, bool]:
        now_ee_pos = self.data.body(self.end_effector_id).xpos.copy()
        dist_to_goal = np.linalg.norm(now_ee_pos - self.goal_position)
        # tiered, non-linear distance reward
        if dist_to_goal < self.goal_arrival_threshold:
            distance_reward = 20.0 * (1.0 + (1.0 - (dist_to_goal / self.goal_arrival_threshold)))
        elif dist_to_goal < 2 * self.goal_arrival_threshold:
            distance_reward = 10.0 * (1.0 + (1.0 - (dist_to_goal / (2 * self.goal_arrival_threshold))))
        elif dist_to_goal < 3 * self.goal_arrival_threshold:
            distance_reward = 5.0 * (1.0 + (1.0 - (dist_to_goal / (3 * self.goal_arrival_threshold))))
        else:
            distance_reward = 1.0 / (1.0 + dist_to_goal)
        # smoothness penalty on the change in action
        smooth_penalty = 0.001 * np.linalg.norm(action - self.last_action)
        # collision penalty, proportional to the number of contacts
        contact_reward = 10.0 * self.data.ncon
        # joint-limit penalty
        joint_penalty = 0.0
        for i in range(7):
            min_angle, max_angle = self.model.jnt_range[:7][i]
            if joint_angles[i] < min_angle:
                joint_penalty += 0.5 * (min_angle - joint_angles[i])
            elif joint_angles[i] > max_angle:
                joint_penalty += 0.5 * (joint_angles[i] - max_angle)
        # time penalty
        time_penalty = 0.001 * (time.time() - self.start_t)
        total_reward = (distance_reward
                        - contact_reward
                        - smooth_penalty
                        - joint_penalty
                        - time_penalty)
        self.last_action = action.copy()
        return total_reward, dist_to_goal, self.data.ncon > 0
    def step(self, action: np.ndarray) -> tuple[np.ndarray, np.float32, bool, bool, dict]:
        # map the normalized action in [-1, 1] onto the joint ranges
        joint_ranges = self.model.jnt_range[:7]
        scaled_action = np.zeros(7, dtype=np.float32)
        for i in range(7):
            scaled_action[i] = joint_ranges[i][0] + (action[i] + 1) * 0.5 * (joint_ranges[i][1] - joint_ranges[i][0])
        self.data.ctrl[:7] = scaled_action
        self.data.qpos[7:] = [0.04, 0.04]
        mujoco.mj_step(self.model, self.data)
        reward, dist_to_goal, collision = self._calc_reward(self.data.qpos[:7], action)
        terminated = False
        if collision:
            # print("collision happened, ", self.data.ncon)
            # info = {}
            # for i in range(self.data.ncon):
            #     contact = self.data.contact[i]
            #     # look up the body id of each geom in the contact pair
            #     body1_id = self.model.geom_bodyid[contact.geom1]
            #     body2_id = self.model.geom_bodyid[contact.geom2]
            #     # convert the body ids to names with mj_id2name
            #     body1_name = mujoco.mj_id2name(self.model, mujoco.mjtObj.mjOBJ_BODY, body1_id)
            #     body2_name = mujoco.mj_id2name(self.model, mujoco.mjtObj.mjOBJ_BODY, body2_id)
            #     info["pair" + str(i)] = {}
            #     info["pair" + str(i)]["geom1"] = contact.geom1
            #     info["pair" + str(i)]["geom2"] = contact.geom2
            #     info["pair" + str(i)]["pos"] = contact.pos.copy()
            #     info["pair" + str(i)]["body1_name"] = body1_name
            #     info["pair" + str(i)]["body2_name"] = body2_name
            # print(info)
            reward -= 10.0
            terminated = True
        if dist_to_goal < self.goal_arrival_threshold:
            terminated = True
            print(f"[Success] distance to goal: {dist_to_goal:.3f}, reward: {reward:.3f}")
        # else:
        #     print(f"[Fail] distance to goal: {dist_to_goal:.3f}, reward: {reward:.3f}")
        if not terminated:
            if time.time() - self.start_t > 20.0:
                reward -= 10.0
                print("[Timeout] episode took too long, extra penalty applied")
                terminated = True
        if self.visualize and self.handle is not None:
            self.handle.sync()
            time.sleep(0.01)
        obs = self._get_observation()
        info = {
            'is_success': not collision and terminated and (dist_to_goal < self.goal_arrival_threshold),
            'distance_to_goal': dist_to_goal,
            'collision': collision
        }
        return obs, reward.astype(np.float32), terminated, False, info
    def seed(self, seed: Optional[int] = None) -> list[Optional[int]]:
        self.np_random = np.random.default_rng(seed)
        return [seed]

    def close(self) -> None:
        if self.visualize and self.handle is not None:
            self.handle.close()
            self.handle = None
def train_ppo(
    n_envs: int = 24,
    total_timesteps: int = 80_000_000,  # number of additional timesteps for this run
    model_save_path: str = "panda_ppo_reach_target",
    visualize: bool = False,
    resume_from: Optional[str] = None
) -> None:
    ENV_KWARGS = {'visualize': visualize}
    env = make_vec_env(
        env_id=lambda: PandaObstacleEnv(**ENV_KWARGS),
        n_envs=n_envs,
        seed=42,
        vec_env_cls=SubprocVecEnv,
        vec_env_kwargs={"start_method": "fork"}
    )
    if resume_from is not None:
        model = PPO.load(resume_from, env=env)  # pass the current env when resuming
    else:
        # POLICY_KWARGS = dict(
        #     activation_fn=nn.ReLU,
        #     net_arch=[dict(pi=[256, 128], vf=[256, 128])]
        # )
        POLICY_KWARGS = dict(
            activation_fn=nn.LeakyReLU,
            net_arch=[
                dict(
                    pi=[512, 256, 128],
                    vf=[512, 256, 128]
                )
            ]
        )
        model = PPO(
            policy="MlpPolicy",
            env=env,
            policy_kwargs=POLICY_KWARGS,
            verbose=1,
            n_steps=2048,
            batch_size=2048,
            n_epochs=10,
            gamma=0.99,
            # ent_coef=0.02,  # a larger entropy coefficient keeps late-stage exploration for better generalization
            ent_coef=0.001,
            clip_range=0.15,  # limit the size of each policy update
            max_grad_norm=0.5,  # gradient clipping to avoid exploding gradients
            learning_rate=lambda f: 1e-4 * f,  # linear decay: SB3 passes the remaining progress (1 -> 0), so the rate starts at 1e-4 and falls to 0
device="cuda" if torch.cuda.is_available() else "cpu",
tensorboard_log="./tensorboard/panda_obstacle_avoidance/"
)
print(f"并行环境数: {n_envs}, 本次训练新增步数: {total_timesteps}")
model.learn(
total_timesteps=total_timesteps,
progress_bar=True
)
model.save(model_save_path)
env.close()
print(f"模型已保存至: {model_save_path}")
def test_ppo(
    model_path: str = "panda_obstacle_avoidance",
    total_episodes: int = 5,
) -> None:
    env = PandaObstacleEnv(visualize=True)
    model = PPO.load(model_path, env=env)
    success_count = 0
    print(f"Test episodes: {total_episodes}")
    for ep in range(total_episodes):
        obs, _ = env.reset()
        done = False
        episode_reward = 0.0
        while not done:
            obs = env._get_observation()
            # print(f"observation: {obs}")
            action, _states = model.predict(obs, deterministic=True)
            # action += np.random.normal(0, 0.002, size=7)  # inject action noise
            obs, reward, terminated, truncated, info = env.step(action)
            # print(f"action: {action}, reward: {reward}, terminated: {terminated}, truncated: {truncated}, info: {info}")
            episode_reward += reward
            done = terminated or truncated
        if info['is_success']:
            success_count += 1
        print(f"Episode {ep+1:2d} | total reward: {episode_reward:6.2f} | result: {'success' if info['is_success'] else 'collision/failure'}")
    success_rate = (success_count / total_episodes) * 100
    print(f"Overall success rate: {success_rate:.1f}%")
    env.close()
if __name__ == "__main__":
TRAIN_MODE = True # 设为True开启训练模式
if TRAIN_MODE:
import os
os.system("rm -rf /home/dar/mujoco-bin/mujoco-learning/tensorboard*")
delete_flag_file()
MODEL_PATH = "assets/model/rl_obstacle_avoidance_checkpoint/panda_obstacle_avoidance_v3"
RESUME_MODEL_PATH = "assets/model/rl_obstacle_avoidance_checkpoint/panda_obstacle_avoidance_v2"
if TRAIN_MODE:
train_ppo(
n_envs=64,
total_timesteps=60_000_000,
model_save_path=MODEL_PATH,
visualize=True,
# resume_from=RESUME_MODEL_PATH
resume_from=None
)
else:
test_ppo(
model_path=MODEL_PATH,
total_episodes=100,
)
os.system("date")
Watching the training curves in TensorBoard, the success rate approaches 100% towards the end of training.
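For reference, the logs land in the directory passed to tensorboard_log above, so a command along the lines of tensorboard --logdir ./tensorboard/panda_obstacle_avoidance/ brings the curves up in the browser.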


