The consolidated and extended example code from the earlier post, PyTorch 深度学习实战 (23): Multi-Task Reinforcement Learning (Multi-Task RL), is as follows:
```python
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.distributions import Normal
from torch.amp import autocast, GradScaler
from metaworld.envs import ALL_V2_ENVIRONMENTS_GOAL_OBSERVABLE
import time
from collections import deque
# ================== Configuration ==================
class MultiTaskPPOConfig:
    # Task configuration
task_names = [
'reach-v2-goal-observable',
'push-v2-goal-observable',
'pick-place-v2-goal-observable'
]
num_tasks = 3
    # Network architecture
shared_dim = 512
task_specific_dim = 256
meta_controller_dim = 128
shared_layers = 2
task_specific_layers = 1
    # Training parameters
lr = 5e-5
meta_lr = 1e-5
gamma = 0.99
gae_lambda = 0.97
clip_epsilon = 0.15
ppo_epochs = 5
batch_size = 4096
max_episodes = 10000
max_steps = 200
grad_clip = 0.5
entropy_coef = 0.1
    # Exploration parameters
initial_std = 1.5
min_std = 0.2
std_decay = 0.999
    # Curriculum learning schedule
curriculum_schedule = {
0: ['reach-v2-goal-observable'],
1000: ['reach-v2-goal-observable', 'push-v2-goal-observable'],
3000: ['push-v2-goal-observable', 'pick-place-v2-goal-observable'],
6000: ['reach-v2-goal-observable', 'push-v2-goal-observable', 'pick-place-v2-goal-observable']
}
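    # How the schedule is consumed: the keys are episode thresholds, and get_current_tasks()
    # keeps a phase index into the sorted thresholds, advancing it once the current episode
    # reaches the next threshold. Concretely: episodes 0-999 train reach only, 1000-2999 add
    # push, 3000-5999 switch to push + pick-place, and from 6000 on all three tasks are mixed.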
    # Monitoring configuration
log_interval = 50
eval_interval = 500
eval_episodes = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# ================== MetaController ==================
class MetaController(nn.Module):
def __init__(self, num_tasks, state_dim):
super().__init__()
self.net = nn.Sequential(
nn.Linear(state_dim, MultiTaskPPOConfig.meta_controller_dim),
nn.LayerNorm(MultiTaskPPOConfig.meta_controller_dim),
nn.GELU(),
nn.Linear(MultiTaskPPOConfig.meta_controller_dim, num_tasks)
)
        # Initialize parameters
for layer in self.net:
if isinstance(layer, nn.Linear):
nn.init.orthogonal_(layer.weight, gain=0.01)
nn.init.constant_(layer.bias, 0.0)
def forward(self, state):
logits = self.net(state)
return torch.softmax(logits, -1), logits
# ================== Shared Policy Network ==================
class SharedPolicy(nn.Module):
def __init__(self, state_dim, action_dim):
super().__init__()
self.action_dim = action_dim
self.current_std = MultiTaskPPOConfig.initial_std
        # Shared trunk layers
self.shared_net = nn.Sequential(
nn.Linear(state_dim, MultiTaskPPOConfig.shared_dim),
nn.LayerNorm(MultiTaskPPOConfig.shared_dim),
nn.GELU(),
nn.Linear(MultiTaskPPOConfig.shared_dim, MultiTaskPPOConfig.shared_dim),
nn.GELU()
)
        # Per-task heads
self.task_heads = nn.ModuleList()
self.value_heads = nn.ModuleList()
for _ in range(MultiTaskPPOConfig.num_tasks):
            # Action head
task_head = nn.Sequential(
nn.Linear(MultiTaskPPOConfig.shared_dim, MultiTaskPPOConfig.task_specific_dim),
nn.GELU(),
nn.Linear(MultiTaskPPOConfig.task_specific_dim, action_dim)
)
self.task_heads.append(task_head)
            # Value head
value_head = nn.Sequential(
nn.Linear(MultiTaskPPOConfig.shared_dim, MultiTaskPPOConfig.task_specific_dim),
nn.GELU(),
nn.Linear(MultiTaskPPOConfig.task_specific_dim, 1)
)
self.value_heads.append(value_head)
        # Learnable log standard deviation
self.log_std = nn.Parameter(torch.zeros(1, action_dim))
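        # The sampling std used in forward() is exp(log_std) * current_std, combining this
        # learnable per-dimension term with the externally decayed exploration schedule.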
        # Initialize parameters
self._init_weights()
def _init_weights(self):
for head in self.task_heads:
for layer in head:
if isinstance(layer, nn.Linear):
nn.init.orthogonal_(layer.weight, gain=0.01)
nn.init.constant_(layer.bias, 0.0)
for head in self.value_heads:
for layer in head:
if isinstance(layer, nn.Linear):
nn.init.orthogonal_(layer.weight, gain=1.0)
nn.init.constant_(layer.bias, 0.0)
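    # The effective exploration std follows current_std = max(initial_std * std_decay**k, min_std)
    # after k decays (one per training episode): 1.5 * 0.999**50 ≈ 1.43, 1.5 * 0.999**100 ≈ 1.36,
    # and the 0.2 floor is reached after roughly ln(0.2 / 1.5) / ln(0.999) ≈ 2014 episodes,
    # which matches the action-std values in the training log below.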
def decay_action_std(self):
"""衰减动作标准差"""
self.current_std = max(self.current_std * MultiTaskPPOConfig.std_decay,
MultiTaskPPOConfig.min_std)
def forward(self, states, task_ids):
        # Ensure the input is float32
states = states.float() if states.dtype != torch.float32 else states
shared_features = self.shared_net(states)
batch_size = states.size(0)
        # Initialize output tensors
action_means = torch.zeros(
batch_size, self.action_dim,
dtype=torch.float32,
device=states.device
)
action_stds = torch.exp(self.log_std).expand(batch_size, -1) * self.current_std
values = torch.zeros(
batch_size, 1,
dtype=torch.float32,
device=states.device
)
unique_task_ids = torch.unique(task_ids)
for task_id_tensor in unique_task_ids:
task_id = task_id_tensor.item()
mask = (task_ids == task_id_tensor)
if not mask.any():
continue
selected_features = shared_features[mask]
            # Compute the task-specific outputs
            with autocast(device_type=states.device.type, enabled=False):  # disable mixed precision for the heads
task_action = self.task_heads[task_id](selected_features.float())
task_value = self.value_heads[task_id](selected_features.float())
action_means[mask] = task_action
values[mask] = task_value
return action_means, action_stds, values
# ================== Training System ==================
class EnhancedMultiTaskPPOTrainer:
def __init__(self):
        # Initialize the multi-task environments
self.envs = []
self.state_dim = None
self.action_dim = None
        # Validate the environments and read off the dimensions
for task_name in MultiTaskPPOConfig.task_names:
env = ALL_V2_ENVIRONMENTS_GOAL_OBSERVABLE[task_name]()
obs, _ = env.reset()
if self.state_dim is None:
self.state_dim = obs.shape[0]
self.action_dim = env.action_space.shape[0]
else:
                assert obs.shape[0] == self.state_dim, f"Inconsistent state dimension: {task_name}"
self.envs.append(env)
        # Initialize the policy network
self.policy = SharedPolicy(self.state_dim, self.action_dim).to(MultiTaskPPOConfig.device)
self.optimizer = optim.AdamW(self.policy.parameters(), lr=MultiTaskPPOConfig.lr)
self.scheduler = optim.lr_scheduler.CosineAnnealingLR(
self.optimizer,
T_max=MultiTaskPPOConfig.max_episodes,
eta_min=1e-6
)
self.scaler = GradScaler(enabled=MultiTaskPPOConfig.device.type == 'cuda')
        # Initialize the MetaController
self.meta_controller = MetaController(
MultiTaskPPOConfig.num_tasks,
self.state_dim
).to(MultiTaskPPOConfig.device)
self.meta_optimizer = optim.Adam(
self.meta_controller.parameters(),
lr=MultiTaskPPOConfig.meta_lr
)
        # Initialize the experience buffer
self.buffer = deque(maxlen=MultiTaskPPOConfig.max_steps)
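        # Note: the buffer holds at most max_steps (200) transitions, which is smaller than
        # batch_size (4096), so each PPO epoch in update_policy runs as a single full-batch update.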
        # Curriculum learning state
self.current_phase = 0
self.phase_thresholds = sorted(MultiTaskPPOConfig.curriculum_schedule.keys())
        # Training statistics
self.episode_rewards = {i: deque(maxlen=100) for i in range(MultiTaskPPOConfig.num_tasks)}
self.episode_lengths = {i: deque(maxlen=100) for i in range(MultiTaskPPOConfig.num_tasks)}
self.meta_data = {
'states': [],
'chosen_tasks': [],
'rewards': []
}
        # Evaluation statistics
self.eval_rewards = {i: [] for i in range(MultiTaskPPOConfig.num_tasks)}
self.eval_success = {i: [] for i in range(MultiTaskPPOConfig.num_tasks)}
def get_current_tasks(self, episode):
"""获取当前课程阶段的任务列表"""
if len(self.phase_thresholds) > 1 and self.current_phase < len(self.phase_thresholds) - 1:
if episode >= self.phase_thresholds[self.current_phase + 1]:
self.current_phase += 1
task_names = MultiTaskPPOConfig.curriculum_schedule[
self.phase_thresholds[self.current_phase]
]
return [MultiTaskPPOConfig.task_names.index(name) for name in task_names]
def collect_experience(self, num_steps, episode):
"""集成课程学习和meta controller的经验收集"""
current_tasks = self.get_current_tasks(episode)
for _ in range(num_steps):
            # Randomly pick a base task from the current curriculum
base_task_id = np.random.choice(current_tasks)
env = self.envs[base_task_id]
if not hasattr(env, '_last_obs'):
state, _ = env.reset()
else:
state = env._last_obs
            # MetaController adjustment
state_tensor = torch.FloatTensor(state).unsqueeze(0).to(MultiTaskPPOConfig.device)
with torch.no_grad():
task_probs, _ = self.meta_controller(state_tensor)
task_probs = task_probs.squeeze().cpu().numpy()
            # Restrict the probability distribution to the current curriculum tasks
mask = np.zeros_like(task_probs)
mask[current_tasks] = 1
filtered_probs = task_probs * mask
filtered_probs = filtered_probs / (filtered_probs.sum() + 1e-6)
            # Task selection: 70% follow the meta controller, 30% uniform exploration
            if np.random.rand() < 0.7:
                # Renormalize the sliced probabilities so they sum to exactly 1 for np.random.choice
                probs = filtered_probs[current_tasks].astype(np.float64)
                task_id = np.random.choice(current_tasks, p=probs / probs.sum())
            else:
                task_id = np.random.choice(current_tasks)
            # Record the meta controller decision
self.meta_data['states'].append(state_tensor)
self.meta_data['chosen_tasks'].append(task_id)
            # Execute the selected task
env = self.envs[task_id]
with torch.no_grad():
task_id_tensor = torch.tensor([task_id], dtype=torch.long, device=MultiTaskPPOConfig.device)
action_mean, action_std, value = self.policy(state_tensor, task_id_tensor)
                dist = Normal(action_mean.float(), action_std.float())  # ensure the distribution params are float32
action = dist.sample().squeeze(0)
log_prob = dist.log_prob(action).sum(-1, keepdim=True)
action_np = action.cpu().numpy()
next_state, reward, done, trunc, info = env.step(action_np)
            # Record the transition
self.buffer.append({
'state': state,
'action': action_np,
'log_prob': log_prob.cpu(),
'reward': float(reward),
'done': bool(done),
'task_id': task_id,
'value': float(value.item()),
'success': info.get('success', False)
})
            # Record feedback for the meta controller
self.meta_data['rewards'].append(reward)
            # Persist the latest observation on the env that was stepped so the
            # hasattr check above can resume from it on the next iteration
            env._last_obs = next_state if not (done or trunc) else env.reset()[0]
def compute_gae(self, values, rewards, dones):
"""计算广义优势估计(GAE)"""
advantages = []
last_advantage = 0
next_value = 0
next_non_terminal = 1.0
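        # The loop below evaluates the standard GAE recursion:
        #   delta_t = r_t + gamma * V(s_{t+1}) * nonterminal - V(s_t)
        #   A_t     = delta_t + gamma * lambda * nonterminal * A_{t+1}
        #   R_t     = A_t + V(s_t)   (used as the value-function target)
        # iterating in reverse so A_{t+1} can be reused as `last_advantage`.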
for t in reversed(range(len(rewards))):
delta = rewards[t] + MultiTaskPPOConfig.gamma * next_value * next_non_terminal - values[t]
last_advantage = delta + MultiTaskPPOConfig.gamma * MultiTaskPPOConfig.gae_lambda * next_non_terminal * last_advantage
advantages.append(last_advantage)
next_value = values[t]
next_non_terminal = 1.0 - dones[t]
advantages = torch.tensor(advantages[::-1], dtype=torch.float32).to(MultiTaskPPOConfig.device)
returns = advantages + torch.tensor(values, dtype=torch.float32).to(MultiTaskPPOConfig.device)
return (advantages - advantages.mean()) / (advantages.std() + 1e-8), returns
def calculate_task_weights(self):
"""基于最近表现计算任务权重"""
task_weights = torch.ones(MultiTaskPPOConfig.num_tasks,
device=MultiTaskPPOConfig.device)
for task_id in range(MultiTaskPPOConfig.num_tasks):
if len(self.episode_rewards[task_id]) > 10:
                # Success rate over the last 10 episodes
recent_rewards = list(self.episode_rewards[task_id])[-10:]
success_rate = sum(1 for r in recent_rewards if r > 0) / len(recent_rewards)
                # Dynamically adjust the weights
if success_rate < 0.3:
                    task_weights[task_id] = 2.0  # double the weight of hard tasks
elif success_rate > 0.8:
                    task_weights[task_id] = 0.5  # halve the weight of easy tasks
return task_weights / task_weights.sum()
def update_meta_controller(self):
"""更新任务选择策略"""
if len(self.meta_data['states']) == 0:
return
states = torch.cat(self.meta_data['states'])
chosen_tasks = torch.tensor(
self.meta_data['chosen_tasks'],
device=MultiTaskPPOConfig.device
)
rewards = torch.tensor(
self.meta_data['rewards'],
dtype=torch.float32,
device=MultiTaskPPOConfig.device
)
        # Clear the stored data
self.meta_data = {
'states': [],
'chosen_tasks': [],
'rewards': []
}
        # Normalize the rewards
rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-6)
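        # REINFORCE-style update for the task-selection policy: maximize
        # E[log pi(chosen_task | state) * normalized_reward], i.e. minimize its negative below.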
        # Update the MetaController
task_probs, logits = self.meta_controller(states)
selected_probs = task_probs.gather(1, chosen_tasks.unsqueeze(1))
loss = -torch.log(selected_probs + 1e-6) * rewards.unsqueeze(1)
loss = loss.mean()
self.meta_optimizer.zero_grad()
loss.backward()
torch.nn.utils.clip_grad_norm_(
self.meta_controller.parameters(),
MultiTaskPPOConfig.grad_clip
)
self.meta_optimizer.step()
def update_policy(self):
"""策略更新方法"""
if not self.buffer:
return 0, 0, 0
        # Extract data from the buffer
batch = list(self.buffer)
states = torch.tensor(
[x['state'] for x in batch],
dtype=torch.float32,
device=MultiTaskPPOConfig.device
)
actions = torch.FloatTensor(np.array([x['action'] for x in batch])).to(MultiTaskPPOConfig.device)
old_log_probs = torch.cat([x['log_prob'] for x in batch]).to(MultiTaskPPOConfig.device)
rewards = torch.FloatTensor([x['reward'] for x in batch]).to(MultiTaskPPOConfig.device)
dones = torch.FloatTensor([x['done'] for x in batch]).to(MultiTaskPPOConfig.device)
task_ids = torch.tensor(
[x['task_id'] for x in batch],
dtype=torch.long,
device=MultiTaskPPOConfig.device
)
values = torch.FloatTensor([x['value'] for x in batch]).to(MultiTaskPPOConfig.device)
successes = torch.FloatTensor([x['success'] for x in batch]).to(MultiTaskPPOConfig.device)
        # Compute GAE and returns
advantages, returns = self.compute_gae(values.cpu().numpy(), rewards.cpu().numpy(), dones.cpu().numpy())
        # Compute task weights
task_weights = self.calculate_task_weights()
        # Automatic mixed-precision training
total_policy_loss = 0
total_value_loss = 0
total_entropy = 0
for _ in range(MultiTaskPPOConfig.ppo_epochs):
            # Shuffle the data
perm = torch.randperm(len(batch))
for i in range(0, len(batch), MultiTaskPPOConfig.batch_size):
idx = perm[i:i + MultiTaskPPOConfig.batch_size]
                # Slice out the mini-batch
batch_states = states[idx]
batch_actions = actions[idx]
batch_old_log_probs = old_log_probs[idx]
batch_returns = returns[idx]
batch_advantages = advantages[idx]
batch_task_ids = task_ids[idx]
with autocast(device_type=MultiTaskPPOConfig.device.type,
enabled=MultiTaskPPOConfig.device.type == 'cuda'):
                    # Forward pass
action_means, action_stds, new_values = self.policy(batch_states, batch_task_ids)
dist = Normal(action_means, action_stds)
new_log_probs = dist.log_prob(batch_actions).sum(-1, keepdim=True)
entropy = dist.entropy().mean()
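                    # PPO clipped surrogate, weighted per task (see calculate_task_weights):
                    #   L_policy = -E[ w_task * min(r_t * A_t, clip(r_t, 1-eps, 1+eps) * A_t) ] - c_ent * H
                    # with r_t = exp(new_log_prob - old_log_prob) computed next.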
                    # Importance-sampling ratio
ratio = (new_log_probs - batch_old_log_probs).exp()
                    # Policy loss (clipped surrogate)
surr1 = ratio * batch_advantages.unsqueeze(-1)
surr2 = torch.clamp(ratio, 1 - MultiTaskPPOConfig.clip_epsilon,
1 + MultiTaskPPOConfig.clip_epsilon) * batch_advantages.unsqueeze(-1)
policy_loss_per_task = -torch.min(surr1, surr2)
                    # Apply the task weights
selected_weights = task_weights[batch_task_ids].unsqueeze(-1)
policy_loss = (policy_loss_per_task * selected_weights).mean()
policy_loss -= MultiTaskPPOConfig.entropy_coef * entropy
                    # Value loss (clipped). Flatten new_values to 1-D first so the arithmetic
                    # stays element-wise instead of broadcasting to a (batch, batch) matrix.
                    old_values = values[idx]
                    new_values_flat = new_values.squeeze(-1)
                    value_pred_clipped = old_values + (new_values_flat - old_values).clamp(
                        -MultiTaskPPOConfig.clip_epsilon,
                        MultiTaskPPOConfig.clip_epsilon
                    )
                    value_loss1 = (new_values_flat - batch_returns).pow(2)
                    value_loss2 = (value_pred_clipped - batch_returns).pow(2)
                    value_loss = 0.5 * torch.max(value_loss1, value_loss2).mean()
                    # Total loss
loss = policy_loss + value_loss
                # Backward pass
self.scaler.scale(loss).backward()
total_policy_loss += policy_loss.item()
total_value_loss += value_loss.item()
total_entropy += entropy.item()
            # Gradient clipping and parameter update
self.scaler.unscale_(self.optimizer)
torch.nn.utils.clip_grad_norm_(self.policy.shared_net.parameters(), 1.0)
torch.nn.utils.clip_grad_norm_(
list(self.policy.task_heads.parameters()) +
list(self.policy.value_heads.parameters()),
0.5
)
self.scaler.step(self.optimizer)
self.scaler.update()
self.optimizer.zero_grad()
self.scheduler.step()
        # Decay the action noise
self.policy.decay_action_std()
return (total_policy_loss / MultiTaskPPOConfig.ppo_epochs,
total_value_loss / MultiTaskPPOConfig.ppo_epochs,
total_entropy / MultiTaskPPOConfig.ppo_epochs)
def evaluate_policy(self):
"""评估当前策略性能"""
eval_results = {i: {'rewards': [], 'successes': []} for i in range(MultiTaskPPOConfig.num_tasks)}
for task_id in range(MultiTaskPPOConfig.num_tasks):
env = self.envs[task_id]
for _ in range(MultiTaskPPOConfig.eval_episodes):
state, _ = env.reset()
episode_reward = 0
done = False
success = False
for _ in range(MultiTaskPPOConfig.max_steps):
with torch.no_grad():
state_tensor = torch.FloatTensor(state).unsqueeze(0).to(MultiTaskPPOConfig.device)
task_id_tensor = torch.tensor([task_id], dtype=torch.long, device=MultiTaskPPOConfig.device)
action_mean, _, _ = self.policy(state_tensor, task_id_tensor)
action = action_mean.squeeze(0).cpu().numpy()
state, reward, done, trunc, info = env.step(action)
episode_reward += reward
success = success or info.get('success', False)
if done or trunc:
break
eval_results[task_id]['rewards'].append(episode_reward)
eval_results[task_id]['successes'].append(success)
        # Record the evaluation results
for task_id in range(MultiTaskPPOConfig.num_tasks):
avg_reward = np.mean(eval_results[task_id]['rewards'])
success_rate = np.mean(eval_results[task_id]['successes'])
self.eval_rewards[task_id].append(avg_reward)
self.eval_success[task_id].append(success_rate)
return eval_results
def train(self):
print(f"开始训练,设备:{MultiTaskPPOConfig.device}")
print(f"课程安排:{MultiTaskPPOConfig.curriculum_schedule}")
start_time = time.time()
        # Initial evaluation
self.evaluate_policy()
for episode in range(MultiTaskPPOConfig.max_episodes):
            # Experience collection phase
self.collect_experience(MultiTaskPPOConfig.max_steps, episode)
            # Policy optimization phase
policy_loss, value_loss, entropy = self.update_policy()
            # MetaController update
self.update_meta_controller()
            # Record statistics
for exp in self.buffer:
task_id = exp['task_id']
self.episode_rewards[task_id].append(exp['reward'])
self.episode_lengths[task_id].append(1)
            # Periodic logging
if (episode + 1) % MultiTaskPPOConfig.log_interval == 0:
avg_rewards = {k: np.mean(v) if v else 0 for k, v in self.episode_rewards.items()}
success_rates = {
k: np.mean([1 if r > 0 else 0 for r in v]) if v else 0
for k, v in self.episode_rewards.items()
}
time_cost = time.time() - start_time
                # Print the current curriculum phase
current_task_names = MultiTaskPPOConfig.curriculum_schedule[
self.phase_thresholds[self.current_phase]
]
print(f"\nEpisode {episode + 1:5d} | Time: {time_cost:6.1f}s")
print(f"当前课程阶段: {current_task_names} (Phase {self.current_phase})")
print(f"动作标准差: {self.policy.current_std:.3f} | 学习率: {self.scheduler.get_last_lr()[0]:.2e}")
for task_id in range(MultiTaskPPOConfig.num_tasks):
task_name = MultiTaskPPOConfig.task_names[task_id]
print(
f" {task_name:25s} | Avg Reward: {avg_rewards[task_id]:7.2f} | Success Rate: {success_rates[task_id]:.2f}")
print(f" Policy Loss: {policy_loss:.4f} | Value Loss: {value_loss:.4f} | Entropy: {entropy:.4f}")
start_time = time.time()
            # Periodic evaluation
if (episode + 1) % MultiTaskPPOConfig.eval_interval == 0:
eval_results = self.evaluate_policy()
if (episode + 1) % 1000 == 0:
print("\n评估结果:")
for task_id in range(MultiTaskPPOConfig.num_tasks):
task_name = MultiTaskPPOConfig.task_names[task_id]
avg_reward = np.mean(eval_results[task_id]['rewards'])
success_rate = np.mean(eval_results[task_id]['successes'])
print(f" {task_name:25s} | Avg Reward: {avg_reward:7.2f} | Success Rate: {success_rate:.2f}")
        # Save the model when training finishes
torch.save({
'policy_state_dict': self.policy.state_dict(),
'meta_controller_state_dict': self.meta_controller.state_dict(),
'optimizer_state_dict': self.optimizer.state_dict()
}, "multitask_ppo_model.pth")
if __name__ == "__main__":
trainer = EnhancedMultiTaskPPOTrainer()
print(f"状态维度: {trainer.state_dim}, 动作维度: {trainer.action_dim}")
trainer.train()
```

Partial output:

```
Episode 50 | Time: 216.6s
Current curriculum phase: ['reach-v2-goal-observable'] (Phase 0)
Action std: 1.427 | Learning rate: 5.00e-05
reach-v2-goal-observable | Avg Reward: 1.42 | Success Rate: 1.00
push-v2-goal-observable | Avg Reward: 0.00 | Success Rate: 0.00
pick-place-v2-goal-observable | Avg Reward: 0.00 | Success Rate: 0.00
Policy Loss: -0.1777 | Value Loss: 471.4303 | Entropy: 1.7773
Episode 100 | Time: 193.3s
Current curriculum phase: ['reach-v2-goal-observable'] (Phase 0)
Action std: 1.357 | Learning rate: 5.00e-05
reach-v2-goal-observable | Avg Reward: 1.42 | Success Rate: 1.00
push-v2-goal-observable | Avg Reward: 0.00 | Success Rate: 0.00
pick-place-v2-goal-observable | Avg Reward: 0.00 | Success Rate: 0.00
Policy Loss: -0.1729 | Value Loss: 357.7264 | Entropy: 1.7293
......
Episode 2800 | Time: 198.6s
Current curriculum phase: ['reach-v2-goal-observable', 'push-v2-goal-observable'] (Phase 1)
Action std: 0.200 | Learning rate: 4.11e-05
reach-v2-goal-observable | Avg Reward: 1.44 | Success Rate: 1.00
push-v2-goal-observable | Avg Reward: 0.05 | Success Rate: 1.00
pick-place-v2-goal-observable | Avg Reward: 0.00 | Success Rate: 0.00
Policy Loss: 0.0092 | Value Loss: 191.3147 | Entropy: -0.0918
Episode 2850 | Time: 212.2s
Current curriculum phase: ['reach-v2-goal-observable', 'push-v2-goal-observable'] (Phase 1)
Action std: 0.200 | Learning rate: 4.08e-05
reach-v2-goal-observable | Avg Reward: 1.44 | Success Rate: 1.00
push-v2-goal-observable | Avg Reward: 0.05 | Success Rate: 1.00
pick-place-v2-goal-observable | Avg Reward: 0.00 | Success Rate: 0.00
Policy Loss: 0.0090 | Value Loss: 183.6324 | Entropy: -0.0902
Episode 2900 | Time: 210.4s
Current curriculum phase: ['reach-v2-goal-observable', 'push-v2-goal-observable'] (Phase 1)
Action std: 0.200 | Learning rate: 4.05e-05
reach-v2-goal-observable | Avg Reward: 1.44 | Success Rate: 1.00
push-v2-goal-observable | Avg Reward: 0.05 | Success Rate: 1.00
pick-place-v2-goal-observable | Avg Reward: 0.00 | Success Rate: 0.00
Policy Loss: 0.0089 | Value Loss: 188.5185 | Entropy: -0.0889
Episode 2950 | Time: 210.1s
Current curriculum phase: ['reach-v2-goal-observable', 'push-v2-goal-observable'] (Phase 1)
Action std: 0.200 | Learning rate: 4.02e-05
reach-v2-goal-observable | Avg Reward: 1.44 | Success Rate: 1.00
push-v2-goal-observable | Avg Reward: 0.05 | Success Rate: 1.00
pick-place-v2-goal-observable | Avg Reward: 0.00 | Success Rate: 0.00
Policy Loss: 0.0087 | Value Loss: 183.0386 | Entropy: -0.0874
Episode 3000 | Time: 212.0s
Current curriculum phase: ['reach-v2-goal-observable', 'push-v2-goal-observable'] (Phase 1)
Action std: 0.200 | Learning rate: 3.99e-05
reach-v2-goal-observable | Avg Reward: 1.45 | Success Rate: 1.00
push-v2-goal-observable | Avg Reward: 0.05 | Success Rate: 1.00
pick-place-v2-goal-observable | Avg Reward: 0.00 | Success Rate: 0.00
Policy Loss: 0.0086 | Value Loss: 182.9761 | Entropy: -0.0858
Evaluation results:
reach-v2-goal-observable | Avg Reward: 106.66 | Success Rate: 0.00
push-v2-goal-observable | Avg Reward: 3.99 | Success Rate: 0.00
pick-place-v2-goal-observable | Avg Reward: 4.49 | Success Rate: 0.00
Episode 3050 | Time: 234.3s
Current curriculum phase: ['push-v2-goal-observable', 'pick-place-v2-goal-observable'] (Phase 2)
Action std: 0.200 | Learning rate: 3.96e-05
reach-v2-goal-observable | Avg Reward: 1.45 | Success Rate: 1.00
push-v2-goal-observable | Avg Reward: 0.05 | Success Rate: 1.00
pick-place-v2-goal-observable | Avg Reward: 0.02 | Success Rate: 1.00
Policy Loss: 0.0084 | Value Loss: 28.1028 | Entropy: -0.0843
Episode 3100 | Time: 210.3s
Current curriculum phase: ['push-v2-goal-observable', 'pick-place-v2-goal-observable'] (Phase 2)
Action std: 0.200 | Learning rate: 3.93e-05
reach-v2-goal-observable | Avg Reward: 1.45 | Success Rate: 1.00
push-v2-goal-observable | Avg Reward: 0.05 | Success Rate: 1.00
pick-place-v2-goal-observable | Avg Reward: 0.02 | Success Rate: 1.00
Policy Loss: 0.0083 | Value Loss: 0.1660 | Entropy: -0.0829
Episode 3150 | Time: 209.8s
Current curriculum phase: ['push-v2-goal-observable', 'pick-place-v2-goal-observable'] (Phase 2)
Action std: 0.200 | Learning rate: 3.90e-05
reach-v2-goal-observable | Avg Reward: 1.45 | Success Rate: 1.00
push-v2-goal-observable | Avg Reward: 0.05 | Success Rate: 1.00
pick-place-v2-goal-observable | Avg Reward: 0.02 | Success Rate: 1.00
Policy Loss: 0.0082 | Value Loss: 0.1506 | Entropy: -0.0818
Episode 3200 | Time: 210.2s
Current curriculum phase: ['push-v2-goal-observable', 'pick-place-v2-goal-observable'] (Phase 2)
Action std: 0.200 | Learning rate: 3.86e-05
reach-v2-goal-observable | Avg Reward: 1.45 | Success Rate: 1.00
push-v2-goal-observable | Avg Reward: 0.05 | Success Rate: 1.00
pick-place-v2-goal-observable | Avg Reward: 0.02 | Success Rate: 1.00
Policy Loss: 0.0080 | Value Loss: 0.1429 | Entropy: -0.0801
Episode 3250 | Time: 210.3s
Current curriculum phase: ['push-v2-goal-observable', 'pick-place-v2-goal-observable'] (Phase 2)
Action std: 0.200 | Learning rate: 3.83e-05
reach-v2-goal-observable | Avg Reward: 1.45 | Success Rate: 1.00
push-v2-goal-observable | Avg Reward: 0.05 | Success Rate: 1.00
pick-place-v2-goal-observable | Avg Reward: 0.02 | Success Rate: 1.00
Policy Loss: 0.0079 | Value Loss: 0.1725 | Entropy: -0.0785
Episode 3300 | Time: 209.7s
Current curriculum phase: ['push-v2-goal-observable', 'pick-place-v2-goal-observable'] (Phase 2)
Action std: 0.200 | Learning rate: 3.80e-05
reach-v2-goal-observable | Avg Reward: 1.45 | Success Rate: 1.00
push-v2-goal-observable | Avg Reward: 0.05 | Success Rate: 1.00
pick-place-v2-goal-observable | Avg Reward: 0.02 | Success Rate: 1.00
Policy Loss: 0.0077 | Value Loss: 0.1990 | Entropy: -0.0771
Episode 3350 | Time: 209.5s
Current curriculum phase: ['push-v2-goal-observable', 'pick-place-v2-goal-observable'] (Phase 2)
Action std: 0.200 | Learning rate: 3.76e-05
reach-v2-goal-observable | Avg Reward: 1.45 | Success Rate: 1.00
push-v2-goal-observable | Avg Reward: 0.05 | Success Rate: 1.00
pick-place-v2-goal-observable | Avg Reward: 0.02 | Success Rate: 1.00
Policy Loss: 0.0076 | Value Loss: 0.2084 | Entropy: -0.0758
Episode 3400 | Time: 210.1s
Current curriculum phase: ['push-v2-goal-observable', 'pick-place-v2-goal-observable'] (Phase 2)
Action std: 0.200 | Learning rate: 3.73e-05
reach-v2-goal-observable | Avg Reward: 1.45 | Success Rate: 1.00
push-v2-goal-observable | Avg Reward: 0.05 | Success Rate: 1.00
pick-place-v2-goal-observable | Avg Reward: 0.02 | Success Rate: 1.00
Policy Loss: 0.0075 | Value Loss: 0.2057 | Entropy: -0.0745
Episode 3450 | Time: 210.9s
Current curriculum phase: ['push-v2-goal-observable', 'pick-place-v2-goal-observable'] (Phase 2)
Action std: 0.200 | Learning rate: 3.70e-05
reach-v2-goal-observable | Avg Reward: 1.45 | Success Rate: 1.00
push-v2-goal-observable | Avg Reward: 0.05 | Success Rate: 1.00
pick-place-v2-goal-observable | Avg Reward: 0.02 | Success Rate: 1.00
Policy Loss: 0.0073 | Value Loss: 0.2251 | Entropy: -0.0733
Episode 3500 | Time: 210.1s
Current curriculum phase: ['push-v2-goal-observable', 'pick-place-v2-goal-observable'] (Phase 2)
Action std: 0.200 | Learning rate: 3.66e-05
reach-v2-goal-observable | Avg Reward: 1.45 | Success Rate: 1.00
push-v2-goal-observable | Avg Reward: 0.05 | Success Rate: 1.00
pick-place-v2-goal-observable | Avg Reward: 0.02 | Success Rate: 1.00
Policy Loss: 0.0072 | Value Loss: 0.2199 | Entropy: -0.0723
......
```
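For readers who want to check the saved weights afterwards, here is a minimal sketch of loading the checkpoint written at the end of train() and running one deterministic rollout. It assumes this snippet lives in the same file as (or imports) the MultiTaskPPOConfig and SharedPolicy classes above; load_and_rollout and its parameters are illustrative names, while the file name multitask_ppo_model.pth and the key 'policy_state_dict' come from the listing.

```python
import torch
from metaworld.envs import ALL_V2_ENVIRONMENTS_GOAL_OBSERVABLE

# Assumes MultiTaskPPOConfig and SharedPolicy from the listing above are in scope.
def load_and_rollout(task_id: int = 0, checkpoint_path: str = "multitask_ppo_model.pth"):
    device = MultiTaskPPOConfig.device
    env = ALL_V2_ENVIRONMENTS_GOAL_OBSERVABLE[MultiTaskPPOConfig.task_names[task_id]]()
    obs, _ = env.reset()

    # Rebuild the policy and restore the trained weights
    policy = SharedPolicy(obs.shape[0], env.action_space.shape[0]).to(device)
    checkpoint = torch.load(checkpoint_path, map_location=device)
    policy.load_state_dict(checkpoint['policy_state_dict'])
    policy.eval()

    episode_reward, success = 0.0, False
    for _ in range(MultiTaskPPOConfig.max_steps):
        with torch.no_grad():
            state_tensor = torch.FloatTensor(obs).unsqueeze(0).to(device)
            task_tensor = torch.tensor([task_id], dtype=torch.long, device=device)
            # Act on the mean, i.e. no exploration noise, mirroring evaluate_policy()
            action_mean, _, _ = policy(state_tensor, task_tensor)
        obs, reward, done, trunc, info = env.step(action_mean.squeeze(0).cpu().numpy())
        episode_reward += reward
        success = success or bool(info.get('success', False))
        if done or trunc:
            break
    return episode_reward, success
```

For example, load_and_rollout(task_id=0) would replay reach-v2-goal-observable with the greedy policy, mirroring what evaluate_policy does during training.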