PyTorch Deep Learning in Practice (23): Extending Multi-Task Reinforcement Learning (Multi-Task RL)

The code below extends the example from the earlier post, PyTorch Deep Learning in Practice (23): Multi-Task Reinforcement Learning (Multi-Task RL), combining a shared PPO policy with per-task heads, a MetaController for task selection, and a curriculum learning schedule:

python
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.distributions import Normal
from torch.amp import autocast, GradScaler
from metaworld.envs import ALL_V2_ENVIRONMENTS_GOAL_OBSERVABLE
import time
from collections import deque


# ================== Configuration ==================
class MultiTaskPPOConfig:
    # Task configuration
    task_names = [
        'reach-v2-goal-observable',
        'push-v2-goal-observable',
        'pick-place-v2-goal-observable'
    ]
    num_tasks = 3

    # Network architecture
    shared_dim = 512
    task_specific_dim = 256
    meta_controller_dim = 128
    shared_layers = 2
    task_specific_layers = 1

    # Training hyperparameters
    lr = 5e-5
    meta_lr = 1e-5
    gamma = 0.99
    gae_lambda = 0.97
    clip_epsilon = 0.15
    ppo_epochs = 5
    batch_size = 4096
    max_episodes = 10000
    max_steps = 200
    grad_clip = 0.5
    entropy_coef = 0.1

    # Exploration parameters
    initial_std = 1.5
    min_std = 0.2
    std_decay = 0.999

    # Curriculum learning schedule
    curriculum_schedule = {
        0: ['reach-v2-goal-observable'],
        1000: ['reach-v2-goal-observable', 'push-v2-goal-observable'],
        3000: ['push-v2-goal-observable', 'pick-place-v2-goal-observable'],
        6000: ['reach-v2-goal-observable', 'push-v2-goal-observable', 'pick-place-v2-goal-observable']
    }

    # Logging and evaluation settings
    log_interval = 50
    eval_interval = 500
    eval_episodes = 10

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# ================== MetaController ==================
class MetaController(nn.Module):
    def __init__(self, num_tasks, state_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim, MultiTaskPPOConfig.meta_controller_dim),
            nn.LayerNorm(MultiTaskPPOConfig.meta_controller_dim),
            nn.GELU(),
            nn.Linear(MultiTaskPPOConfig.meta_controller_dim, num_tasks)
        )

        # Initialize parameters
        for layer in self.net:
            if isinstance(layer, nn.Linear):
                nn.init.orthogonal_(layer.weight, gain=0.01)
                nn.init.constant_(layer.bias, 0.0)

    def forward(self, state):
        logits = self.net(state)
        return torch.softmax(logits, -1), logits


# ================== Shared policy network ==================
class SharedPolicy(nn.Module):
    def __init__(self, state_dim, action_dim):
        super().__init__()
        self.action_dim = action_dim
        self.current_std = MultiTaskPPOConfig.initial_std

        # Shared trunk
        self.shared_net = nn.Sequential(
            nn.Linear(state_dim, MultiTaskPPOConfig.shared_dim),
            nn.LayerNorm(MultiTaskPPOConfig.shared_dim),
            nn.GELU(),
            nn.Linear(MultiTaskPPOConfig.shared_dim, MultiTaskPPOConfig.shared_dim),
            nn.GELU()
        )

        # Per-task heads
        self.task_heads = nn.ModuleList()
        self.value_heads = nn.ModuleList()

        for _ in range(MultiTaskPPOConfig.num_tasks):
            # Action head
            task_head = nn.Sequential(
                nn.Linear(MultiTaskPPOConfig.shared_dim, MultiTaskPPOConfig.task_specific_dim),
                nn.GELU(),
                nn.Linear(MultiTaskPPOConfig.task_specific_dim, action_dim)
            )
            self.task_heads.append(task_head)

            # Value head
            value_head = nn.Sequential(
                nn.Linear(MultiTaskPPOConfig.shared_dim, MultiTaskPPOConfig.task_specific_dim),
                nn.GELU(),
                nn.Linear(MultiTaskPPOConfig.task_specific_dim, 1)
            )
            self.value_heads.append(value_head)

        # Learnable log standard deviation
        self.log_std = nn.Parameter(torch.zeros(1, action_dim))

        # Initialize parameters
        self._init_weights()

    def _init_weights(self):
        for head in self.task_heads:
            for layer in head:
                if isinstance(layer, nn.Linear):
                    nn.init.orthogonal_(layer.weight, gain=0.01)
                    nn.init.constant_(layer.bias, 0.0)

        for head in self.value_heads:
            for layer in head:
                if isinstance(layer, nn.Linear):
                    nn.init.orthogonal_(layer.weight, gain=1.0)
                    nn.init.constant_(layer.bias, 0.0)

    def decay_action_std(self):
        """衰减动作标准差"""
        self.current_std = max(self.current_std * MultiTaskPPOConfig.std_decay,
                               MultiTaskPPOConfig.min_std)

    def forward(self, states, task_ids):
        # Ensure inputs are float32
        states = states.float() if states.dtype != torch.float32 else states

        shared_features = self.shared_net(states)
        batch_size = states.size(0)

        # Pre-allocate output tensors
        action_means = torch.zeros(
            batch_size, self.action_dim,
            dtype=torch.float32,
            device=states.device
        )
        action_stds = torch.exp(self.log_std).expand(batch_size, -1) * self.current_std
        values = torch.zeros(
            batch_size, 1,
            dtype=torch.float32,
            device=states.device
        )

        unique_task_ids = torch.unique(task_ids)

        for task_id_tensor in unique_task_ids:
            task_id = task_id_tensor.item()
            mask = (task_ids == task_id_tensor)

            if not mask.any():
                continue

            selected_features = shared_features[mask]

            # Compute task-specific outputs
            with autocast(device_type=states.device.type, enabled=False):  # disable mixed precision for the task heads
                task_action = self.task_heads[task_id](selected_features.float())
                task_value = self.value_heads[task_id](selected_features.float())

            action_means[mask] = task_action
            values[mask] = task_value

        return action_means, action_stds, values


# ================== Training system ==================
class EnhancedMultiTaskPPOTrainer:
    def __init__(self):
        # Initialize multi-task environments
        self.envs = []
        self.state_dim = None
        self.action_dim = None

        # Validate environments and record dimensions
        for task_name in MultiTaskPPOConfig.task_names:
            env = ALL_V2_ENVIRONMENTS_GOAL_OBSERVABLE[task_name]()
            obs, _ = env.reset()

            if self.state_dim is None:
                self.state_dim = obs.shape[0]
                self.action_dim = env.action_space.shape[0]
            else:
                assert obs.shape[0] == self.state_dim, f"Inconsistent state dimension: {task_name}"

            self.envs.append(env)

        # Initialize the policy network
        self.policy = SharedPolicy(self.state_dim, self.action_dim).to(MultiTaskPPOConfig.device)
        self.optimizer = optim.AdamW(self.policy.parameters(), lr=MultiTaskPPOConfig.lr)
        self.scheduler = optim.lr_scheduler.CosineAnnealingLR(
            self.optimizer,
            T_max=MultiTaskPPOConfig.max_episodes,
            eta_min=1e-6
        )
        self.scaler = GradScaler(enabled=MultiTaskPPOConfig.device.type == 'cuda')

        # Initialize the MetaController
        self.meta_controller = MetaController(
            MultiTaskPPOConfig.num_tasks,
            self.state_dim
        ).to(MultiTaskPPOConfig.device)
        self.meta_optimizer = optim.Adam(
            self.meta_controller.parameters(),
            lr=MultiTaskPPOConfig.meta_lr
        )

        # Initialize the experience buffer
        self.buffer = deque(maxlen=MultiTaskPPOConfig.max_steps)

        # Curriculum learning state
        self.current_phase = 0
        self.phase_thresholds = sorted(MultiTaskPPOConfig.curriculum_schedule.keys())

        # Training statistics
        self.episode_rewards = {i: deque(maxlen=100) for i in range(MultiTaskPPOConfig.num_tasks)}
        self.episode_lengths = {i: deque(maxlen=100) for i in range(MultiTaskPPOConfig.num_tasks)}
        self.meta_data = {
            'states': [],
            'chosen_tasks': [],
            'rewards': []
        }

        # Evaluation statistics
        self.eval_rewards = {i: [] for i in range(MultiTaskPPOConfig.num_tasks)}
        self.eval_success = {i: [] for i in range(MultiTaskPPOConfig.num_tasks)}

    def get_current_tasks(self, episode):
        """获取当前课程阶段的任务列表"""
        if len(self.phase_thresholds) > 1 and self.current_phase < len(self.phase_thresholds) - 1:
            if episode >= self.phase_thresholds[self.current_phase + 1]:
                self.current_phase += 1

        task_names = MultiTaskPPOConfig.curriculum_schedule[
            self.phase_thresholds[self.current_phase]
        ]
        return [MultiTaskPPOConfig.task_names.index(name) for name in task_names]

    def collect_experience(self, num_steps, episode):
        """集成课程学习和meta controller的经验收集"""
        current_tasks = self.get_current_tasks(episode)

        for _ in range(num_steps):
            # Randomly pick a base task from the current curriculum tasks
            base_task_id = np.random.choice(current_tasks)
            env = self.envs[base_task_id]

            if not hasattr(env, '_last_obs'):
                state, _ = env.reset()
            else:
                state = env._last_obs

            # MetaController adjustment
            state_tensor = torch.FloatTensor(state).unsqueeze(0).to(MultiTaskPPOConfig.device)

            with torch.no_grad():
                task_probs, _ = self.meta_controller(state_tensor)
                task_probs = task_probs.squeeze().cpu().numpy()

                # Mask the probability distribution to the current curriculum tasks
                mask = np.zeros_like(task_probs)
                mask[current_tasks] = 1
                filtered_probs = task_probs * mask
                filtered_probs = filtered_probs / (filtered_probs.sum() + 1e-6)

                # Task selection: 70% guided by the meta controller, 30% uniform
                if np.random.rand() < 0.7:
                    # Renormalize over the allowed tasks so the probabilities sum to exactly 1
                    sub_probs = filtered_probs[current_tasks]
                    task_id = np.random.choice(current_tasks, p=sub_probs / sub_probs.sum())
                else:
                    task_id = np.random.choice(current_tasks)

                # Record the meta controller decision
                self.meta_data['states'].append(state_tensor)
                self.meta_data['chosen_tasks'].append(task_id)

            # Step the selected task's environment
            env = self.envs[task_id]
            with torch.no_grad():
                task_id_tensor = torch.tensor([task_id], dtype=torch.long, device=MultiTaskPPOConfig.device)
                action_mean, action_std, value = self.policy(state_tensor, task_id_tensor)
                dist = Normal(action_mean.float(), action_std.float())  # ensure distribution parameters are float32
                action = dist.sample().squeeze(0)
                log_prob = dist.log_prob(action).sum(-1, keepdim=True)
                action_np = action.cpu().numpy()

            next_state, reward, done, trunc, info = env.step(action_np)

            # Store the transition
            self.buffer.append({
                'state': state,
                'action': action_np,
                'log_prob': log_prob.cpu(),
                'reward': float(reward),
                'done': bool(done),
                'task_id': task_id,
                'value': float(value.item()),
                'success': info.get('success', False)
            })

            # Record feedback for the meta controller
            self.meta_data['rewards'].append(reward)

            state = next_state if not (done or trunc) else env.reset()[0]

    def compute_gae(self, values, rewards, dones):
        """计算广义优势估计(GAE)"""
        advantages = []
        last_advantage = 0
        next_value = 0
        next_non_terminal = 1.0

        for t in reversed(range(len(rewards))):
            delta = rewards[t] + MultiTaskPPOConfig.gamma * next_value * next_non_terminal - values[t]
            last_advantage = delta + MultiTaskPPOConfig.gamma * MultiTaskPPOConfig.gae_lambda * next_non_terminal * last_advantage
            advantages.append(last_advantage)
            next_value = values[t]
            next_non_terminal = 1.0 - dones[t]

        advantages = torch.tensor(advantages[::-1], dtype=torch.float32).to(MultiTaskPPOConfig.device)
        returns = advantages + torch.tensor(values, dtype=torch.float32).to(MultiTaskPPOConfig.device)
        return (advantages - advantages.mean()) / (advantages.std() + 1e-8), returns

    def calculate_task_weights(self):
        """基于最近表现计算任务权重"""
        task_weights = torch.ones(MultiTaskPPOConfig.num_tasks,
                                  device=MultiTaskPPOConfig.device)

        for task_id in range(MultiTaskPPOConfig.num_tasks):
            if len(self.episode_rewards[task_id]) > 10:
                # Success rate over the most recent 10 episodes
                recent_rewards = list(self.episode_rewards[task_id])[-10:]
                success_rate = sum(1 for r in recent_rewards if r > 0) / len(recent_rewards)

                # Dynamically adjust weights
                if success_rate < 0.3:
                    task_weights[task_id] = 2.0  # double the weight of hard tasks
                elif success_rate > 0.8:
                    task_weights[task_id] = 0.5  # halve the weight of easy tasks

        return task_weights / task_weights.sum()

    def update_meta_controller(self):
        """更新任务选择策略"""
        if len(self.meta_data['states']) == 0:
            return

        states = torch.cat(self.meta_data['states'])
        chosen_tasks = torch.tensor(
            self.meta_data['chosen_tasks'],
            device=MultiTaskPPOConfig.device
        )
        rewards = torch.tensor(
            self.meta_data['rewards'],
            dtype=torch.float32,
            device=MultiTaskPPOConfig.device
        )

        # Clear the buffered meta-controller data
        self.meta_data = {
            'states': [],
            'chosen_tasks': [],
            'rewards': []
        }

        # Normalize rewards
        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-6)

        # Update the MetaController
        task_probs, logits = self.meta_controller(states)
        selected_probs = task_probs.gather(1, chosen_tasks.unsqueeze(1))
        loss = -torch.log(selected_probs + 1e-6) * rewards.unsqueeze(1)
        loss = loss.mean()

        self.meta_optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(
            self.meta_controller.parameters(),
            MultiTaskPPOConfig.grad_clip
        )
        self.meta_optimizer.step()

    def update_policy(self):
        """策略更新方法"""
        if not self.buffer:
            return 0, 0, 0

        # Extract data from the buffer
        batch = list(self.buffer)
        states = torch.tensor(
            [x['state'] for x in batch],
            dtype=torch.float32,
            device=MultiTaskPPOConfig.device
        )
        actions = torch.FloatTensor(np.array([x['action'] for x in batch])).to(MultiTaskPPOConfig.device)
        old_log_probs = torch.cat([x['log_prob'] for x in batch]).to(MultiTaskPPOConfig.device)
        rewards = torch.FloatTensor([x['reward'] for x in batch]).to(MultiTaskPPOConfig.device)
        dones = torch.FloatTensor([x['done'] for x in batch]).to(MultiTaskPPOConfig.device)
        task_ids = torch.tensor(
            [x['task_id'] for x in batch],
            dtype=torch.long,
            device=MultiTaskPPOConfig.device
        )
        values = torch.FloatTensor([x['value'] for x in batch]).to(MultiTaskPPOConfig.device)
        successes = torch.FloatTensor([x['success'] for x in batch]).to(MultiTaskPPOConfig.device)

        # Compute GAE advantages and returns
        advantages, returns = self.compute_gae(values.cpu().numpy(), rewards.cpu().numpy(), dones.cpu().numpy())

        # Compute task weights
        task_weights = self.calculate_task_weights()

        # Automatic mixed-precision training
        total_policy_loss = 0
        total_value_loss = 0
        total_entropy = 0

        for _ in range(MultiTaskPPOConfig.ppo_epochs):
            # Shuffle the data
            perm = torch.randperm(len(batch))

            for i in range(0, len(batch), MultiTaskPPOConfig.batch_size):
                idx = perm[i:i + MultiTaskPPOConfig.batch_size]

                # Slice out a minibatch
                batch_states = states[idx]
                batch_actions = actions[idx]
                batch_old_log_probs = old_log_probs[idx]
                batch_returns = returns[idx]
                batch_advantages = advantages[idx]
                batch_task_ids = task_ids[idx]

                with autocast(device_type=MultiTaskPPOConfig.device.type,
                              enabled=MultiTaskPPOConfig.device.type == 'cuda'):
                    # Forward pass
                    action_means, action_stds, new_values = self.policy(batch_states, batch_task_ids)
                    dist = Normal(action_means, action_stds)
                    new_log_probs = dist.log_prob(batch_actions).sum(-1, keepdim=True)
                    entropy = dist.entropy().mean()

                    # Importance-sampling ratio
                    ratio = (new_log_probs - batch_old_log_probs).exp()

                    # Policy loss
                    surr1 = ratio * batch_advantages.unsqueeze(-1)
                    surr2 = torch.clamp(ratio, 1 - MultiTaskPPOConfig.clip_epsilon,
                                        1 + MultiTaskPPOConfig.clip_epsilon) * batch_advantages.unsqueeze(-1)
                    policy_loss_per_task = -torch.min(surr1, surr2)

                    # Apply task weights
                    selected_weights = task_weights[batch_task_ids].unsqueeze(-1)
                    policy_loss = (policy_loss_per_task * selected_weights).mean()
                    policy_loss -= MultiTaskPPOConfig.entropy_coef * entropy

                    # Value loss (clipped), keeping every term shaped (batch,)
                    old_values = values[idx]
                    new_values = new_values.squeeze(-1)
                    value_pred_clipped = old_values + (new_values - old_values).clamp(
                        -MultiTaskPPOConfig.clip_epsilon,
                        MultiTaskPPOConfig.clip_epsilon
                    )
                    value_loss1 = (new_values - batch_returns).pow(2)
                    value_loss2 = (value_pred_clipped - batch_returns).pow(2)
                    value_loss = 0.5 * torch.max(value_loss1, value_loss2).mean()

                    # Total loss
                    loss = policy_loss + value_loss

                # Backward pass (gradients accumulate across all PPO epochs; a single optimizer step follows below)
                self.scaler.scale(loss).backward()
                total_policy_loss += policy_loss.item()
                total_value_loss += value_loss.item()
                total_entropy += entropy.item()

        # Gradient clipping and a single parameter update
        self.scaler.unscale_(self.optimizer)
        torch.nn.utils.clip_grad_norm_(self.policy.shared_net.parameters(), 1.0)
        torch.nn.utils.clip_grad_norm_(
            list(self.policy.task_heads.parameters()) +
            list(self.policy.value_heads.parameters()),
            0.5
        )
        self.scaler.step(self.optimizer)
        self.scaler.update()
        self.optimizer.zero_grad()
        self.scheduler.step()

        # Decay the action noise
        self.policy.decay_action_std()

        return (total_policy_loss / MultiTaskPPOConfig.ppo_epochs,
                total_value_loss / MultiTaskPPOConfig.ppo_epochs,
                total_entropy / MultiTaskPPOConfig.ppo_epochs)

    def evaluate_policy(self):
        """评估当前策略性能"""
        eval_results = {i: {'rewards': [], 'successes': []} for i in range(MultiTaskPPOConfig.num_tasks)}

        for task_id in range(MultiTaskPPOConfig.num_tasks):
            env = self.envs[task_id]

            for _ in range(MultiTaskPPOConfig.eval_episodes):
                state, _ = env.reset()
                episode_reward = 0
                done = False
                success = False

                for _ in range(MultiTaskPPOConfig.max_steps):
                    with torch.no_grad():
                        state_tensor = torch.FloatTensor(state).unsqueeze(0).to(MultiTaskPPOConfig.device)
                        task_id_tensor = torch.tensor([task_id], dtype=torch.long, device=MultiTaskPPOConfig.device)
                        action_mean, _, _ = self.policy(state_tensor, task_id_tensor)
                        action = action_mean.squeeze(0).cpu().numpy()

                    state, reward, done, trunc, info = env.step(action)
                    episode_reward += reward
                    success = success or info.get('success', False)

                    if done or trunc:
                        break

                eval_results[task_id]['rewards'].append(episode_reward)
                eval_results[task_id]['successes'].append(success)

        # Record evaluation results
        for task_id in range(MultiTaskPPOConfig.num_tasks):
            avg_reward = np.mean(eval_results[task_id]['rewards'])
            success_rate = np.mean(eval_results[task_id]['successes'])
            self.eval_rewards[task_id].append(avg_reward)
            self.eval_success[task_id].append(success_rate)

        return eval_results

    def train(self):
        print(f"开始训练,设备:{MultiTaskPPOConfig.device}")
        print(f"课程安排:{MultiTaskPPOConfig.curriculum_schedule}")
        start_time = time.time()

        # Initial evaluation
        self.evaluate_policy()

        for episode in range(MultiTaskPPOConfig.max_episodes):
            # Experience collection
            self.collect_experience(MultiTaskPPOConfig.max_steps, episode)

            # Policy optimization
            policy_loss, value_loss, entropy = self.update_policy()

            # MetaController update
            self.update_meta_controller()

            # Record statistics
            for exp in self.buffer:
                task_id = exp['task_id']
                self.episode_rewards[task_id].append(exp['reward'])
                self.episode_lengths[task_id].append(1)

            # Periodic logging
            if (episode + 1) % MultiTaskPPOConfig.log_interval == 0:
                avg_rewards = {k: np.mean(v) if v else 0 for k, v in self.episode_rewards.items()}
                success_rates = {
                    k: np.mean([1 if r > 0 else 0 for r in v]) if v else 0
                    for k, v in self.episode_rewards.items()
                }
                time_cost = time.time() - start_time

                # Print the current curriculum phase
                current_task_names = MultiTaskPPOConfig.curriculum_schedule[
                    self.phase_thresholds[self.current_phase]
                ]

                print(f"\nEpisode {episode + 1:5d} | Time: {time_cost:6.1f}s")
                print(f"当前课程阶段: {current_task_names} (Phase {self.current_phase})")
                print(f"动作标准差: {self.policy.current_std:.3f} | 学习率: {self.scheduler.get_last_lr()[0]:.2e}")

                for task_id in range(MultiTaskPPOConfig.num_tasks):
                    task_name = MultiTaskPPOConfig.task_names[task_id]
                    print(
                        f"  {task_name:25s} | Avg Reward: {avg_rewards[task_id]:7.2f} | Success Rate: {success_rates[task_id]:.2f}")

                print(f"  Policy Loss: {policy_loss:.4f} | Value Loss: {value_loss:.4f} | Entropy: {entropy:.4f}")

                start_time = time.time()

            # Periodic evaluation
            if (episode + 1) % MultiTaskPPOConfig.eval_interval == 0:
                eval_results = self.evaluate_policy()

                if (episode + 1) % 1000 == 0:
                    print("\n评估结果:")
                    for task_id in range(MultiTaskPPOConfig.num_tasks):
                        task_name = MultiTaskPPOConfig.task_names[task_id]
                        avg_reward = np.mean(eval_results[task_id]['rewards'])
                        success_rate = np.mean(eval_results[task_id]['successes'])
                        print(f"  {task_name:25s} | Avg Reward: {avg_reward:7.2f} | Success Rate: {success_rate:.2f}")

        # Save the model at the end of training
        torch.save({
            'policy_state_dict': self.policy.state_dict(),
            'meta_controller_state_dict': self.meta_controller.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict()
        }, "multitask_ppo_model.pth")


if __name__ == "__main__":
    trainer = EnhancedMultiTaskPPOTrainer()
    print(f"状态维度: {trainer.state_dim}, 动作维度: {trainer.action_dim}")
    trainer.train()
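
Before launching the full 10,000-episode run, a short smoke test that exercises collection, one PPO update, one MetaController update, and one evaluation pass can catch wiring problems early. The sketch below is only one way you might drive the classes defined above; the module name multitask_ppo.py and the reduced step count are hypothetical choices, not part of the original script.

python
import numpy as np

# Hypothetical module name: assumes the script above was saved as multitask_ppo.py
from multitask_ppo import EnhancedMultiTaskPPOTrainer, MultiTaskPPOConfig

trainer = EnhancedMultiTaskPPOTrainer()
print(f"State dim: {trainer.state_dim}, Action dim: {trainer.action_dim}")

# Collect a handful of steps under the phase-0 curriculum (episode index 0)
trainer.collect_experience(num_steps=32, episode=0)
print(f"Buffer size after collection: {len(trainer.buffer)}")

# One PPO update and one MetaController update on the collected batch
policy_loss, value_loss, entropy = trainer.update_policy()
trainer.update_meta_controller()
print(f"policy_loss={policy_loss:.4f} value_loss={value_loss:.4f} entropy={entropy:.4f}")

# One evaluation pass (eval_episodes rollouts per task, so this takes a while)
eval_results = trainer.evaluate_policy()
for task_id, name in enumerate(MultiTaskPPOConfig.task_names):
    print(name, float(np.mean(eval_results[task_id]['rewards'])))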

Partial training output:

Episode    50 | Time:  216.6s
Current curriculum phase: ['reach-v2-goal-observable'] (Phase 0)
Action std: 1.427 | Learning rate: 5.00e-05
  reach-v2-goal-observable  | Avg Reward:    1.42 | Success Rate: 1.00
  push-v2-goal-observable   | Avg Reward:    0.00 | Success Rate: 0.00
  pick-place-v2-goal-observable | Avg Reward:    0.00 | Success Rate: 0.00
  Policy Loss: -0.1777 | Value Loss: 471.4303 | Entropy: 1.7773

Episode   100 | Time:  193.3s
Current curriculum phase: ['reach-v2-goal-observable'] (Phase 0)
Action std: 1.357 | Learning rate: 5.00e-05
  reach-v2-goal-observable  | Avg Reward:    1.42 | Success Rate: 1.00
  push-v2-goal-observable   | Avg Reward:    0.00 | Success Rate: 0.00
  pick-place-v2-goal-observable | Avg Reward:    0.00 | Success Rate: 0.00
  Policy Loss: -0.1729 | Value Loss: 357.7264 | Entropy: 1.7293

......

Episode  2800 | Time:  198.6s
Current curriculum phase: ['reach-v2-goal-observable', 'push-v2-goal-observable'] (Phase 1)
Action std: 0.200 | Learning rate: 4.11e-05
  reach-v2-goal-observable  | Avg Reward:    1.44 | Success Rate: 1.00
  push-v2-goal-observable   | Avg Reward:    0.05 | Success Rate: 1.00
  pick-place-v2-goal-observable | Avg Reward:    0.00 | Success Rate: 0.00
  Policy Loss: 0.0092 | Value Loss: 191.3147 | Entropy: -0.0918

Episode  2850 | Time:  212.2s
Current curriculum phase: ['reach-v2-goal-observable', 'push-v2-goal-observable'] (Phase 1)
Action std: 0.200 | Learning rate: 4.08e-05
  reach-v2-goal-observable  | Avg Reward:    1.44 | Success Rate: 1.00
  push-v2-goal-observable   | Avg Reward:    0.05 | Success Rate: 1.00
  pick-place-v2-goal-observable | Avg Reward:    0.00 | Success Rate: 0.00
  Policy Loss: 0.0090 | Value Loss: 183.6324 | Entropy: -0.0902

Episode  2900 | Time:  210.4s
Current curriculum phase: ['reach-v2-goal-observable', 'push-v2-goal-observable'] (Phase 1)
Action std: 0.200 | Learning rate: 4.05e-05
  reach-v2-goal-observable  | Avg Reward:    1.44 | Success Rate: 1.00
  push-v2-goal-observable   | Avg Reward:    0.05 | Success Rate: 1.00
  pick-place-v2-goal-observable | Avg Reward:    0.00 | Success Rate: 0.00
  Policy Loss: 0.0089 | Value Loss: 188.5185 | Entropy: -0.0889

Episode  2950 | Time:  210.1s
Current curriculum phase: ['reach-v2-goal-observable', 'push-v2-goal-observable'] (Phase 1)
Action std: 0.200 | Learning rate: 4.02e-05
  reach-v2-goal-observable  | Avg Reward:    1.44 | Success Rate: 1.00
  push-v2-goal-observable   | Avg Reward:    0.05 | Success Rate: 1.00
  pick-place-v2-goal-observable | Avg Reward:    0.00 | Success Rate: 0.00
  Policy Loss: 0.0087 | Value Loss: 183.0386 | Entropy: -0.0874

Episode  3000 | Time:  212.0s
Current curriculum phase: ['reach-v2-goal-observable', 'push-v2-goal-observable'] (Phase 1)
Action std: 0.200 | Learning rate: 3.99e-05
  reach-v2-goal-observable  | Avg Reward:    1.45 | Success Rate: 1.00
  push-v2-goal-observable   | Avg Reward:    0.05 | Success Rate: 1.00
  pick-place-v2-goal-observable | Avg Reward:    0.00 | Success Rate: 0.00
  Policy Loss: 0.0086 | Value Loss: 182.9761 | Entropy: -0.0858

Evaluation results:
  reach-v2-goal-observable  | Avg Reward:  106.66 | Success Rate: 0.00
  push-v2-goal-observable   | Avg Reward:    3.99 | Success Rate: 0.00
  pick-place-v2-goal-observable | Avg Reward:    4.49 | Success Rate: 0.00

Episode  3050 | Time:  234.3s
Current curriculum phase: ['push-v2-goal-observable', 'pick-place-v2-goal-observable'] (Phase 2)
Action std: 0.200 | Learning rate: 3.96e-05
  reach-v2-goal-observable  | Avg Reward:    1.45 | Success Rate: 1.00
  push-v2-goal-observable   | Avg Reward:    0.05 | Success Rate: 1.00
  pick-place-v2-goal-observable | Avg Reward:    0.02 | Success Rate: 1.00
  Policy Loss: 0.0084 | Value Loss: 28.1028 | Entropy: -0.0843

Episode  3100 | Time:  210.3s
Current curriculum phase: ['push-v2-goal-observable', 'pick-place-v2-goal-observable'] (Phase 2)
Action std: 0.200 | Learning rate: 3.93e-05
  reach-v2-goal-observable  | Avg Reward:    1.45 | Success Rate: 1.00
  push-v2-goal-observable   | Avg Reward:    0.05 | Success Rate: 1.00
  pick-place-v2-goal-observable | Avg Reward:    0.02 | Success Rate: 1.00
  Policy Loss: 0.0083 | Value Loss: 0.1660 | Entropy: -0.0829

Episode  3150 | Time:  209.8s
Current curriculum phase: ['push-v2-goal-observable', 'pick-place-v2-goal-observable'] (Phase 2)
Action std: 0.200 | Learning rate: 3.90e-05
  reach-v2-goal-observable  | Avg Reward:    1.45 | Success Rate: 1.00
  push-v2-goal-observable   | Avg Reward:    0.05 | Success Rate: 1.00
  pick-place-v2-goal-observable | Avg Reward:    0.02 | Success Rate: 1.00
  Policy Loss: 0.0082 | Value Loss: 0.1506 | Entropy: -0.0818

Episode  3200 | Time:  210.2s
Current curriculum phase: ['push-v2-goal-observable', 'pick-place-v2-goal-observable'] (Phase 2)
Action std: 0.200 | Learning rate: 3.86e-05
  reach-v2-goal-observable  | Avg Reward:    1.45 | Success Rate: 1.00
  push-v2-goal-observable   | Avg Reward:    0.05 | Success Rate: 1.00
  pick-place-v2-goal-observable | Avg Reward:    0.02 | Success Rate: 1.00
  Policy Loss: 0.0080 | Value Loss: 0.1429 | Entropy: -0.0801

Episode  3250 | Time:  210.3s
Current curriculum phase: ['push-v2-goal-observable', 'pick-place-v2-goal-observable'] (Phase 2)
Action std: 0.200 | Learning rate: 3.83e-05
  reach-v2-goal-observable  | Avg Reward:    1.45 | Success Rate: 1.00
  push-v2-goal-observable   | Avg Reward:    0.05 | Success Rate: 1.00
  pick-place-v2-goal-observable | Avg Reward:    0.02 | Success Rate: 1.00
  Policy Loss: 0.0079 | Value Loss: 0.1725 | Entropy: -0.0785

Episode  3300 | Time:  209.7s
Current curriculum phase: ['push-v2-goal-observable', 'pick-place-v2-goal-observable'] (Phase 2)
Action std: 0.200 | Learning rate: 3.80e-05
  reach-v2-goal-observable  | Avg Reward:    1.45 | Success Rate: 1.00
  push-v2-goal-observable   | Avg Reward:    0.05 | Success Rate: 1.00
  pick-place-v2-goal-observable | Avg Reward:    0.02 | Success Rate: 1.00
  Policy Loss: 0.0077 | Value Loss: 0.1990 | Entropy: -0.0771

Episode  3350 | Time:  209.5s
Current curriculum phase: ['push-v2-goal-observable', 'pick-place-v2-goal-observable'] (Phase 2)
Action std: 0.200 | Learning rate: 3.76e-05
  reach-v2-goal-observable  | Avg Reward:    1.45 | Success Rate: 1.00
  push-v2-goal-observable   | Avg Reward:    0.05 | Success Rate: 1.00
  pick-place-v2-goal-observable | Avg Reward:    0.02 | Success Rate: 1.00
  Policy Loss: 0.0076 | Value Loss: 0.2084 | Entropy: -0.0758

Episode  3400 | Time:  210.1s
Current curriculum phase: ['push-v2-goal-observable', 'pick-place-v2-goal-observable'] (Phase 2)
Action std: 0.200 | Learning rate: 3.73e-05
  reach-v2-goal-observable  | Avg Reward:    1.45 | Success Rate: 1.00
  push-v2-goal-observable   | Avg Reward:    0.05 | Success Rate: 1.00
  pick-place-v2-goal-observable | Avg Reward:    0.02 | Success Rate: 1.00
  Policy Loss: 0.0075 | Value Loss: 0.2057 | Entropy: -0.0745

Episode  3450 | Time:  210.9s
Current curriculum phase: ['push-v2-goal-observable', 'pick-place-v2-goal-observable'] (Phase 2)
Action std: 0.200 | Learning rate: 3.70e-05
  reach-v2-goal-observable  | Avg Reward:    1.45 | Success Rate: 1.00
  push-v2-goal-observable   | Avg Reward:    0.05 | Success Rate: 1.00
  pick-place-v2-goal-observable | Avg Reward:    0.02 | Success Rate: 1.00
  Policy Loss: 0.0073 | Value Loss: 0.2251 | Entropy: -0.0733

Episode  3500 | Time:  210.1s
Current curriculum phase: ['push-v2-goal-observable', 'pick-place-v2-goal-observable'] (Phase 2)
Action std: 0.200 | Learning rate: 3.66e-05
  reach-v2-goal-observable  | Avg Reward:    1.45 | Success Rate: 1.00
  push-v2-goal-observable   | Avg Reward:    0.05 | Success Rate: 1.00
  pick-place-v2-goal-observable | Avg Reward:    0.02 | Success Rate: 1.00
  Policy Loss: 0.0072 | Value Loss: 0.2199 | Entropy: -0.0723

......
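
After training completes, the script saves the policy, MetaController, and optimizer state dicts to multitask_ppo_model.pth. A minimal sketch for restoring that checkpoint and running a deterministic (mean-action) rollout on a single task might look like the following; it again assumes the classes above live in a hypothetical multitask_ppo.py module.

python
import torch
from metaworld.envs import ALL_V2_ENVIRONMENTS_GOAL_OBSERVABLE

# Hypothetical module name: assumes the script above was saved as multitask_ppo.py
from multitask_ppo import MultiTaskPPOConfig, SharedPolicy, MetaController

device = MultiTaskPPOConfig.device

# Infer state/action dimensions the same way the trainer does
probe_env = ALL_V2_ENVIRONMENTS_GOAL_OBSERVABLE[MultiTaskPPOConfig.task_names[0]]()
obs, _ = probe_env.reset()
state_dim, action_dim = obs.shape[0], probe_env.action_space.shape[0]

# Rebuild the networks and restore the saved weights
checkpoint = torch.load("multitask_ppo_model.pth", map_location=device)
policy = SharedPolicy(state_dim, action_dim).to(device)
policy.load_state_dict(checkpoint['policy_state_dict'])
policy.eval()

meta_controller = MetaController(MultiTaskPPOConfig.num_tasks, state_dim).to(device)
meta_controller.load_state_dict(checkpoint['meta_controller_state_dict'])

# Deterministic rollout with the mean action on one task
task_id = 0
env = ALL_V2_ENVIRONMENTS_GOAL_OBSERVABLE[MultiTaskPPOConfig.task_names[task_id]]()
state, _ = env.reset()
total_reward, success = 0.0, False
for _ in range(MultiTaskPPOConfig.max_steps):
    with torch.no_grad():
        state_t = torch.FloatTensor(state).unsqueeze(0).to(device)
        task_t = torch.tensor([task_id], dtype=torch.long, device=device)
        action_mean, _, _ = policy(state_t, task_t)
    state, reward, done, trunc, info = env.step(action_mean.squeeze(0).cpu().numpy())
    total_reward += reward
    success = success or info.get('success', False)
    if done or trunc:
        break
print(f"{MultiTaskPPOConfig.task_names[task_id]} | return: {total_reward:.2f} | success: {bool(success)}")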