Gymnasium is a standard environment-interface library for Reinforcement Learning (RL). It is the community-maintained successor to OpenAI Gym, developed under the Farama Foundation, and is widely used in RL research, teaching, and engineering practice.
1. Overview of Gymnasium
- Positioning: a general-purpose RL environment library.
- Role: provides the standardized env.reset(), env.step(action), and env.render() interfaces, so algorithm developers can switch seamlessly between task environments (CartPole, Atari, MuJoCo, etc.); a minimal usage sketch follows below.
- Features:
  - fully compatible with the OpenAI Gym API;
  - type hints and built-in error checking;
  - actively maintained by the community, with continuous updates;
  - custom environment registration;
  - seamless integration with mainstream RL frameworks (e.g. Stable-Baselines3, RLlib, CleanRL).
✅ In short: Gymnasium is the "official successor" to OpenAI Gym; it is not a database but a Python software library.
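To make the interface concrete, here is a minimal sketch of the standard Gymnasium loop using the built-in CartPole-v1 task (a random policy is used purely to exercise the API; any registered environment id works the same way):

```python
import gymnasium as gym

# Create a registered environment by id
env = gym.make("CartPole-v1")

# reset() returns an initial observation and an info dict
obs, info = env.reset(seed=42)

total_reward = 0.0
for _ in range(200):
    action = env.action_space.sample()  # random policy, just to exercise the API
    # step() returns (obs, reward, terminated, truncated, info)
    obs, reward, terminated, truncated, info = env.step(action)
    total_reward += reward
    if terminated or truncated:
        obs, info = env.reset()

env.close()
print("total reward over 200 random steps:", total_reward)
```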
2. Gymnasium vs. Isaac Gym / Isaac Sim
All three belong to the reinforcement-learning simulation ecosystem, but they differ markedly in positioning, performance, and intended use:
| Feature | Gymnasium | Isaac Gym | Isaac Sim |
|---|---|---|---|
| Developer | Community (Farama Foundation) | NVIDIA | NVIDIA |
| Underlying physics engine | Multiple (Box2D, MuJoCo, etc.) | PhysX (GPU-accelerated) | PhysX + Omniverse (high fidelity) |
| Hardware | CPU (an ordinary computer is enough) | NVIDIA GPU required | NVIDIA GPU required (high-end) |
| Main use | General RL tasks (CartPole, Atari, simple robots) | Large-scale parallel robot RL training | High-fidelity robot simulation + AI training + sensor modeling |
| API style | Standard Gym interface | Gym-like (but vectorized environments) | Built on Omniverse; RL interface provided through Isaac Lab |
| Actively developed? | ✅ Yes | ⚠️ No longer updated (superseded by Isaac Sim / Isaac Lab) | ✅ Yes (flagship product) |
3. How Gymnasium Relates to the Isaac Series
Isaac Gym was once a "high-performance alternative" to Gymnasium.
It mimics the Gym API but is designed for GPU-parallel simulation, making it suitable for training hundreds or thousands of robot instances at once.
For example, Isaac Gym can simulate 10,000 robot arms simultaneously, hundreds of times faster than MuJoCo + Gym. (A sketch of Gymnasium's own vectorized-environment API appears at the end of this section.)
Isaac Sim is the successor to Isaac Gym.
Isaac Sim is built on NVIDIA Omniverse and supports more realistic rendering, sensors (cameras, LiDAR), multi-robot collaboration, and more.
Reinforcement-learning functionality is now provided by Isaac Lab (integrated with Isaac Sim), which replaces Isaac Gym.
NVIDIA has been migrating Isaac Gym's capabilities into the Isaac Sim + Isaac Lab architecture.
Gymnasium has no direct dependency on the Isaac series:
- Gymnasium is a general RL interface standard;
- Isaac Gym / Isaac Sim are concrete simulation platforms;
- you can wrap an Isaac Gym environment behind the Gymnasium interface (the community has made such attempts), but the two are not officially coupled.
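To make the "vectorized environment" style mentioned above concrete, Gymnasium itself ships a CPU-based vector API. The sketch below uses gymnasium.vector.SyncVectorEnv to step several CartPole instances in lockstep; it only illustrates the batched interface and is not an Isaac Gym example:

```python
import gymnasium as gym
import numpy as np

# Run 8 CartPole instances in lockstep; observations, rewards, and flags are batched arrays.
envs = gym.vector.SyncVectorEnv([lambda: gym.make("CartPole-v1") for _ in range(8)])

obs, infos = envs.reset(seed=0)              # obs has shape (8, 4)
for _ in range(100):
    actions = envs.action_space.sample()     # one action per sub-environment
    obs, rewards, terminateds, truncateds, infos = envs.step(actions)
    # Sub-environments that finish an episode are reset automatically by the wrapper.

print("batched obs shape:", obs.shape, "mean step reward:", float(np.mean(rewards)))
envs.close()
```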
4. Which Tool for Which Scenario
- Beginners / general RL tasks → use Gymnasium.
- High-performance robot RL (legacy projects) → use Isaac Gym (Linux + NVIDIA GPU only).
- Cutting-edge robot simulation + multimodal sensing + a closed deployment loop → use Isaac Sim + Isaac Lab.
5. Usage Example
Environment setup: MuJoCo + Gymnasium (Stable-Baselines3 provides the PPO implementation used below).
Required assets: a motion-capture CSV file and a robot model XML (MuJoCo MJCF).
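Before wiring these assets into an environment, a quick sanity check helps. The sketch below (my addition; it assumes the robot.xml and Walk.csv file names used in the scripts that follow) loads the MuJoCo model and the reference motion and confirms that the CSV provides 7 pelvis columns plus one column per actuator:

```python
import mujoco
import pandas as pd

# Load the robot model and report its basic dimensions
model = mujoco.MjModel.from_xml_path("robot.xml")
print("nq (generalized positions):", model.nq)
print("nv (generalized velocities):", model.nv)
print("nu (actuators):", model.nu)

# Load the reference motion and check its shape against the model
traj = pd.read_csv("Walk.csv").values
print("reference frames x columns:", traj.shape)
assert traj.shape[1] == 7 + model.nu, "expected 7 pelvis columns + one column per actuator"
```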
[A] Building the training environment
```python
# motion_trans.py
from dance_env import DanceEnv
from stable_baselines3 import PPO
from stable_baselines3.common.monitor import Monitor
import os
import logging
import time
# ===================== Fixed configuration =====================
MODEL_SAVE_DIR = "./trained_robot_model"
LOG_DIR = "./training_logs"
os.makedirs(MODEL_SAVE_DIR, exist_ok=True)
os.makedirs(LOG_DIR, exist_ok=True)
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(levelname)s - %(message)s",
handlers=[
logging.FileHandler(os.path.join(LOG_DIR, "training.log")),
logging.StreamHandler()
]
)
logger = logging.getLogger(__name__)
def create_robot_env():
env = DanceEnv(
xml_path="robot.xml",
ref_csv_path="Walk.csv",
max_episode_steps=500,
render_mode="human",
domain_randomization=False
)
env = Monitor(env, LOG_DIR)
return env
# ===================== Early-stopping callback =====================
from stable_baselines3.common.callbacks import BaseCallback
class EarlyStopCallback(BaseCallback):
def __init__(self, target_reward, verbose=0):
super().__init__(verbose)
self.target_reward = target_reward
def _on_step(self):
if self.locals["rewards"].mean() >= self.target_reward:
self.logger.info(f"奖励达到 {self.target_reward}(当前平均奖励:{self.locals['rewards'].mean():.3f}),提前终止训练!")
return False
return True
def main():
try:
logger.info("开始创建机器人环境...")
env = create_robot_env()
obs, info = env.reset(seed=42)
logger.info(f"环境创建成功!观测空间形状: {obs.shape}")
logger.info(f"动作空间维度: {env.action_space.shape}")
logger.info("初始化PPO神经网络模型...")
# 临时调小训练步数,快速观察可视化效果
TRAIN_TIMESTEPS = 50000 # 从1000000改为10000,后续可按需调大
model = PPO(
"MlpPolicy",
env,
learning_rate=3e-4,
n_steps=2048,
batch_size=64,
gamma=0.99,
verbose=1,
tensorboard_log=LOG_DIR,
device="cpu",
policy_kwargs=dict(
net_arch=dict(pi=[512, 512], vf=[512, 512])
)
)
logger.info("PPO模型初始化成功!开始训练...")
callback = EarlyStopCallback(target_reward=-100.0, verbose=1)
model.learn(
total_timesteps=TRAIN_TIMESTEPS,
callback=callback,
tb_log_name="robot_walk_ppo",
            progress_bar=True  # show a training progress bar
)
        # Save the model
model_save_path = os.path.join(MODEL_SAVE_DIR, "robot_walk_ppo_model.zip")
model.save(model_save_path)
logger.info(f"训练完成!模型已保存至: {model_save_path}")
# 模型验证(添加渲染延迟)
logger.info("开始验证训练好的模型...")
obs, info = env.reset(seed=42)
total_reward = 0.0
episode_count = 0
for step in range(500):
action, _states = model.predict(obs, deterministic=True)
obs, reward, terminated, truncated, info = env.step(action)
total_reward += reward
time.sleep(0.05)
if step % 100 == 0:
logger.info(f"验证步骤 {step}, 即时奖励: {reward:.3f}, 累计奖励: {total_reward:.3f}")
if terminated or truncated:
logger.info(f"验证终止,步骤: {step}, 原因: {'达到最大步数' if terminated else '机器人倒地'}")
obs, info = env.reset()
total_reward = 0.0
episode_count += 1
time.sleep(0.5)
logger.info("模型验证完成!")
except Exception as e:
logger.error(f"训练过程发生错误!错误信息: {str(e)}", exc_info=True)
raise e
    finally:
        try:
            env.close()
            logger.info("Environment closed cleanly")
        except Exception:
            pass
if __name__ == "__main__":
    main()
```
[B] Environment and reference-data loading file
```python
# dance_env.py
import gymnasium as gym
from gymnasium import spaces
import numpy as np
import pandas as pd
import mujoco
class DanceEnv(gym.Env):
metadata = {"render_modes": ["human"], "render_fps": 50}
def __init__(
self,
xml_path: str,
ref_csv_path: str,
max_episode_steps: int = 1000,
render_mode: str = None,
domain_randomization: bool = False,
):
super().__init__()
self.xml_path = xml_path
raw_traj = pd.read_csv(ref_csv_path).values.astype(np.float32)
        self.ref_pelvis = raw_traj[:, :7]      # [T, 7] pelvis pose (position + quaternion)
        self.ref_joints_raw = raw_traj[:, 7:]  # [T, 28] raw joint angles from the CSV
        csv_joint_names = [
            # Left leg (strictly matching the MuJoCo actuator order: yaw -> roll -> pitch -> knee -> ankle pitch -> ankle roll)
            "dof_left_hip_yaw_link", "dof_left_hip_roll_link", "dof_left_hip_pitch_link",
            "dof_left_knee_link", "dof_left_ankle_pitch_link", "dof_left_ankle_roll_link",
            # Right leg (same ordering convention)
            "dof_right_hip_yaw_link", "dof_right_hip_roll_link", "dof_right_hip_pitch_link",
            "dof_right_knee_link", "dof_right_ankle_pitch_link", "dof_right_ankle_roll_link",
            # Left arm (strictly matching the MuJoCo actuator order: pitch -> roll -> yaw -> elbow -> wrist roll -> wrist pitch -> wrist yaw)
            "dof_left_shoulder_pitch_link", "dof_left_shoulder_roll_link", "dof_left_shoulder_yaw_link",
            "dof_left_elbow_link", "dof_left_wrist_roll_link", "dof_left_wrist_pitch_link", "dof_left_wrist_yaw_link",
            # Right arm (same ordering convention)
            "dof_right_shoulder_pitch_link", "dof_right_shoulder_roll_link", "dof_right_shoulder_yaw_link",
            "dof_right_elbow_link", "dof_right_wrist_roll_link", "dof_right_wrist_pitch_link", "dof_right_wrist_yaw_link",
            # Torso (matching the MuJoCo actuator order: roll -> yaw)
            "dof_waist_roll_link", "dof_waist_yaw_link"
        ]
        # 2. Load the model and read the MuJoCo controlled joint names (in actuator order)
self.model = mujoco.MjModel.from_xml_path(xml_path)
self.data = mujoco.MjData(self.model)
controlled_joint_ids = self.model.actuator_trnid[:, 0]
self.controlled_joint_ids = controlled_joint_ids
        # Get the list of MuJoCo joint names
mujoco_joint_names = [self.model.joint(jid).name for jid in controlled_joint_ids]
print("MuJoCo受控关节顺序:", mujoco_joint_names)
        csv_joint_names_clean = []
        for name in csv_joint_names:
            # Step 1: strip the "dof_" prefix
            name_without_dof = name.replace("dof_", "")
            # Step 2: strip the "_link" suffix
            name_without_dof_link = name_without_dof.replace("_link", "")
            # Step 3: append "_joint" to match the MuJoCo joint names
            name_matching_mujoco = f"{name_without_dof_link}_joint"
            csv_joint_names_clean.append(name_matching_mujoco)
        # 3. Build the CSV-to-MuJoCo joint index mapping (to align ordering)
csv_to_mujoco_idx = []
for mujoco_joint_name in mujoco_joint_names:
if mujoco_joint_name in csv_joint_names_clean:
csv_idx = csv_joint_names_clean.index(mujoco_joint_name)
csv_to_mujoco_idx.append(csv_idx)
else:
raise ValueError(f"MuJoCo关节{mujoco_joint_name}未在CSV中找到对应项")
        # 4. Reorder the reference joint angles to match the MuJoCo joint order
        self.ref_joints = self.ref_joints_raw[:, csv_to_mujoco_idx]  # [T, 28] aligned joint angles
assert self.ref_joints.shape[1] == 28, f"Expected 28 joint columns, got {self.ref_joints.shape[1]}"
self.T_ref = raw_traj.shape[0]
self.n_joints = self.ref_joints.shape[1]
assert self.model.nu == self.n_joints, f"Actuator count {self.model.nu} != CSV joints {self.n_joints}"
self.joint_ranges = np.array([
self.model.jnt_range[j_id][1] - self.model.jnt_range[j_id][0]
for j_id in controlled_joint_ids
], dtype=np.float32)
self.joint_mins = np.array([
self.model.jnt_range[j_id][0]
for j_id in controlled_joint_ids
], dtype=np.float32)
self.controlled_joint_qpos_adrs = self.model.jnt_qposadr[controlled_joint_ids]
self.controlled_joint_dof_adrs = self.model.jnt_dofadr[controlled_joint_ids]
self.nominal_mass = self.model.body_mass.copy()
self.nominal_friction = self.model.geom_friction[:, 0].copy()
        self.nominal_kp = np.array([200.0]*12 + [100.0]*16, dtype=np.float32)  # kp=200 for the 12 leg joints, 100 for torso/arm joints
        self.nominal_kd = np.array([10.0]*12 + [5.0]*16, dtype=np.float32)     # kd=10 for the 12 leg joints, 5 for torso/arm joints
self.kp = self.nominal_kp.copy()
self.kd = self.nominal_kd.copy()
self.obs_dim = self.model.nq + self.model.nv + self.n_joints + 1
self.observation_space = spaces.Box(
low=-np.inf, high=np.inf, shape=(self.obs_dim,), dtype=np.float32
)
self.action_space = spaces.Box(
low=-1.0, high=1.0, shape=(self.n_joints,), dtype=np.float32
)
self.max_episode_steps = max_episode_steps
self.render_mode = render_mode
self.domain_randomization = domain_randomization
self.viewer = None
self.timestep = 0
if self.domain_randomization:
self._apply_domain_randomization()
print("Controlled joint names:", [self.model.joint(jid).name for jid in controlled_joint_ids])
def _apply_domain_randomization(self):
rng = np.random.default_rng()
# Mass
for i in range(1, self.model.nbody):
self.model.body_mass[i] = self.nominal_mass[i] * rng.uniform(0.8, 1.2)
# Friction
for i in range(self.model.ngeom):
self.model.geom_friction[i][0] = self.nominal_friction[i] * rng.uniform(0.7, 1.3)
# Actuator gains
self.kp = self.nominal_kp * rng.uniform(0.85, 1.15, size=self.n_joints)
self.kd = self.nominal_kd * rng.uniform(0.85, 1.15, size=self.n_joints)
# Gravity
self.model.opt.gravity[2] = -9.81 * rng.uniform(0.95, 1.05)
def _get_obs(self):
phase = float(self.timestep % self.T_ref) / self.T_ref
target_idx = self.timestep % self.T_ref
target_q = self.ref_joints[target_idx]
        obs = np.concatenate([
            self.data.qpos,   # shape: (35,) = 7 (pelvis) + 28 (joints)
            self.data.qvel,   # shape: (34,)
            target_q,         # shape: (28,) reference joint targets for this phase
            [phase]           # scalar phase in [0, 1)
        ]).astype(np.float32)  # cast once so the observation matches the float32 Box space
        return obs
def _pd_control(self, action):
assert action.shape == (self.n_joints,), f"Action shape {action.shape} != ({self.n_joints},)"
assert len(self.joint_mins) == self.n_joints
target_q = self.joint_mins + (action + 1.0) * 0.5 * self.joint_ranges
joint_maxs = self.joint_mins + self.joint_ranges
target_q = np.clip(target_q, self.joint_mins, joint_maxs)
q_now = self.data.qpos[self.controlled_joint_qpos_adrs]
qdot_now = self.data.qvel[self.controlled_joint_dof_adrs]
torque = self.kp * (target_q - q_now) - self.kd * qdot_now
        torque = np.clip(torque, -200.0, 200.0)  # widened from ±100 to ±200
return torque
def _compute_reward(self, target_q):
q_now = self.data.qpos[self.controlled_joint_qpos_adrs]
track_err = np.linalg.norm(target_q - q_now, ord=2)
effort = np.sum(np.square(self.data.ctrl))
        upright = self.data.qpos[2]          # pelvis z height
        com_height = self.data.xipos[1, 2]   # center-of-mass height of body 1 (pelvis)
        # The root is a free joint, so qpos[3:7] is its orientation quaternion (w, x, y, z);
        # convert it to roll/pitch before applying the tilt check (the raw quaternion
        # components are not angles and would trip the threshold even when upright).
        qw, qx, qy, qz = self.data.qpos[3:7]
        pelvis_roll = np.arctan2(2.0 * (qw * qx + qy * qz), 1.0 - 2.0 * (qx * qx + qy * qy))
        pelvis_pitch = np.arcsin(np.clip(2.0 * (qw * qy - qz * qx), -1.0, 1.0))
        # Relaxed tilt threshold (0.5 -> 0.8 rad) to reduce false positives
        tilt_penalty = -50.0 if (np.abs(pelvis_roll) > 0.8 or np.abs(pelvis_pitch) > 0.8) else 0.0
        # Relaxed height threshold (0.4 -> 0.6 m) so the initial pose is not counted as a fall
        fall_penalty = -200.0 if com_height < 0.6 else 0.0
        reward = -0.5 * track_err - 0.001 * effort + 5.0 * upright + tilt_penalty + fall_penalty
        # The fall flag uses the same relaxed thresholds
        self.is_fallen = (com_height < 0.6) or (np.abs(pelvis_roll) > 0.8) or (np.abs(pelvis_pitch) > 0.8)
return reward
def step(self, action):
torque = self._pd_control(action)
        # Limit the control torque so excessive torques do not destabilize the joints
        # (this ±100 cap is tighter than the ±200 clip in _pd_control)
        torque = np.clip(torque, -100.0, 100.0)
self.data.ctrl[:] = torque
mujoco.mj_step(self.model, self.data)
target_idx = self.timestep % self.T_ref
target_q = self.ref_joints[target_idx]
reward = self._compute_reward(target_q)
self.timestep += 1
        # Gymnasium convention: `terminated` marks a terminal state (the robot fell),
        # `truncated` marks hitting the episode time limit
        terminated = self.is_fallen
        truncated = self.timestep >= self.max_episode_steps
obs = self._get_obs()
if self.render_mode == "human":
self._render_frame()
return obs, reward, terminated, truncated, {}
def reset(self, seed=None, options=None):
super().reset(seed=seed)
self.timestep = 0
mujoco.mj_resetData(self.model, self.data)
if self.domain_randomization:
self._apply_domain_randomization()
        # Initial pose: frame 0 of the reference trajectory, plus a small random perturbation for robustness
initial_pelvis = self.ref_pelvis[0]
initial_joints = self.ref_joints[0]
rng = np.random.default_rng(seed)
initial_pelvis += rng.normal(0, 0.001, size=initial_pelvis.shape)
initial_joints += rng.normal(0, 0.001, size=initial_joints.shape)
self.data.qpos[:7] = initial_pelvis
self.data.qpos[7:] = initial_joints
self.data.qvel[:] = 0.0
mujoco.mj_forward(self.model, self.data)
self.is_fallen = False
obs = self._get_obs()
return obs, {}
def _render_frame(self):
if self.viewer is None:
from mujoco import viewer
try:
self.viewer = viewer.launch_passive(self.model, self.data)
self.viewer._loop_rate = self.metadata["render_fps"]
except Exception as e:
print(f"启动Viewer失败: {e}")
self.render_mode = None
return
else:
try:
if self.viewer.is_running():
self.viewer.sync()
import time
                    time.sleep(1 / self.metadata["render_fps"])  # fixed delay matching 50 fps
else:
self.viewer.close()
self.viewer = None
self.render_mode = None
except Exception as e:
print(f"Viewer同步失败: {e}")
self.viewer = None
self.render_mode = None
def close(self):
if self.viewer is not None:
try:
if self.viewer.is_running():
self.viewer.close()
except Exception as e:
print(f"关闭Viewer失败: {e}")
finally:
                self.viewer = None
```
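With dance_env.py in place, the environment can be checked against the Gymnasium API and, optionally, registered so that it can be created by id. This is a minimal sketch; the environment id "DanceEnv-v0" and the registration kwargs are illustrative and not part of the original scripts:

```python
import gymnasium as gym
from gymnasium.utils.env_checker import check_env
from dance_env import DanceEnv

# Verify that DanceEnv follows the Gymnasium API (spaces, reset/step signatures, dtypes)
env = DanceEnv(xml_path="robot.xml", ref_csv_path="Walk.csv", max_episode_steps=500)
check_env(env, skip_render_check=True)
env.close()

# Optional: register the environment so it can be created via gym.make
gym.register(
    id="DanceEnv-v0",
    entry_point="dance_env:DanceEnv",
    kwargs={"xml_path": "robot.xml", "ref_csv_path": "Walk.csv", "max_episode_steps": 500},
)
env = gym.make("DanceEnv-v0")
obs, info = env.reset(seed=0)
env.close()
```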
[C] Deploying the trained policy
```python
from dance_env import DanceEnv
from stable_baselines3 import PPO
import logging
# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
def main():
MODEL_PATH = "./trained_robot_model/robot_walk_ppo_model.zip"
try:
        # 1. Load the trained policy
        logger.info(f"Loading model: {MODEL_PATH}")
        model = PPO.load(MODEL_PATH)
        logger.info("Model loaded.")
        # 2. Create the deployment environment (with visualization enabled)
        logger.info("Creating the deployment environment...")
env = DanceEnv(
xml_path="robot.xml",
ref_csv_path="Walk.csv",
max_episode_steps=500,
render_mode="human", # 开启可视化
domain_randomization=False
)
obs, info = env.reset(seed=42)
logger.info("部署环境创建成功!开始运行模型...")
# 3. 运行模型
total_reward = 0.0
for step in range(500):
action, _states = model.predict(obs, deterministic=True)
obs, reward, terminated, truncated, info = env.step(action)
total_reward += reward
if step % 100 == 0:
logger.info(f"部署步骤 {step}, 即时奖励: {reward:.3f}, 累计奖励: {total_reward:.3f}")
if terminated or truncated:
logger.info(f"部署运行终止,步骤: {step}, 原因: {'达到最大步数' if terminated else '机器人倒地'}")
obs, info = env.reset()
total_reward = 0.0
logger.info("模型部署运行完成!")
except Exception as e:
logger.error(f"模型部署失败!错误信息: {str(e)}", exc_info=True)
    finally:
        try:
            env.close()
            logger.info("Deployment environment closed")
        except Exception:
            pass
if __name__ == "__main__":
    main()
```
6. Closing Remarks
Training a deployable policy with RL, driven by motion-capture files (CSV, PKL, or NPZ) and a simulated environment (MuJoCo, Animate, or Isaac Sim), is the bridge between sim2sim and sim2real; it is both necessary and important. There are many ways to implement it, the most recent being the Isaac Sim approach.