EGPO 运行 train_egpo 训练时报错 KeyError: "replay_sequence_length"

def execution_plan(workers: WorkerSet,
                   config: TrainerConfigDict) -> LocalIterator[dict]:
    """Build the training dataflow: collect rollouts, replay, and train.

    Two operations are created and interleaved round-robin:

    1. ``store_op``: sample rollouts from ``workers``, apply
       ``UpdateSaverPenalty``, and store the result in a local replay buffer.
    2. ``replay_op``: read batches from that buffer, run the optional
       ``before_learn_on_batch`` hook, take one training step, refresh
       priorities (when prioritized replay is on), and update the target
       network.

    Args:
        workers: Worker set used for sampling and training.
        config: Trainer config. Keys read directly here are
            ``learning_starts``, ``buffer_size``, ``train_batch_size``,
            ``multiagent.replay_mode`` and ``target_network_update_freq``;
            optional keys are ``prioritized_replay`` (plus its
            ``prioritized_replay_alpha/beta/eps`` settings),
            ``replay_sequence_length`` and ``before_learn_on_batch``.

    Returns:
        The result of ``StandardMetricsReporting`` wrapped around the
        combined operation; only the output of ``replay_op`` is reported.
    """
    if config.get("prioritized_replay"):
        prio_args = {
            "prioritized_replay_alpha": config["prioritized_replay_alpha"],
            "prioritized_replay_beta": config["prioritized_replay_beta"],
            "prioritized_replay_eps": config["prioritized_replay_eps"],
        }
    else:
        prio_args = {}

    # Indexing config["replay_sequence_length"] unconditionally raised a
    # KeyError when the config does not define that key (reported with
    # Ray 1.4.1). Forward the setting only when it is present, so that a
    # config without it leaves the buffer on its own default instead of
    # crashing, and a config with it is still honoured.
    seq_args = {}
    if "replay_sequence_length" in config:
        seq_args["replay_sequence_length"] = config["replay_sequence_length"]

    local_replay_buffer = LocalReplayBuffer(
        num_shards=1,
        learning_starts=config["learning_starts"],
        buffer_size=config["buffer_size"],
        replay_batch_size=config["train_batch_size"],
        replay_mode=config["multiagent"]["replay_mode"],
        **seq_args,
        **prio_args)

    rollouts = ParallelRollouts(workers, mode="bulk_sync")

    # Update penalty
    rollouts = rollouts.for_each(UpdateSaverPenalty(workers))

    # We execute the following steps concurrently:
    # (1) Generate rollouts and store them in our local replay buffer. Calling
    # next() on store_op drives this.
    store_op = rollouts.for_each(
        StoreToReplayBuffer(local_buffer=local_replay_buffer))

    def update_prio(item):
        """Refresh replay priorities from TD errors; return the info dict.

        ``item`` is a ``(samples, info_dict)`` pair. When prioritized replay
        is disabled the buffer is left untouched and ``info_dict`` is passed
        through unchanged.
        """
        samples, info_dict = item
        if config.get("prioritized_replay"):
            prio_dict = {}
            for policy_id, info in info_dict.items():
                # TODO(sven): This is currently structured differently for
                #  torch/tf. Clean up these results/info dicts across
                #  policies (note: fixing this in torch_policy.py will
                #  break e.g. DDPPO!).
                td_error = info.get("td_error",
                                    info[LEARNER_STATS_KEY].get("td_error"))
                prio_dict[policy_id] = (samples.policy_batches[policy_id]
                                        .data.get("batch_indexes"), td_error)
            local_replay_buffer.update_priorities(prio_dict)
        return info_dict

    # (2) Read and train on experiences from the replay buffer. Every batch
    # returned from the LocalReplay() iterator is passed to TrainOneStep to
    # take a SGD step, and then we decide whether to update the target network.
    # Without a user hook, fall back to an identity function on the batch.
    post_fn = config.get("before_learn_on_batch") or (lambda b, *a: b)
    replay_op = Replay(local_buffer=local_replay_buffer) \
        .for_each(lambda x: post_fn(x, workers, config)) \
        .for_each(TrainOneStep(workers)) \
        .for_each(update_prio) \
        .for_each(UpdateTargetNetwork(
            workers, config["target_network_update_freq"]))

    # Alternate deterministically between (1) and (2). Only return the output
    # of (2) since training metrics are not available until (2) runs.
    train_op = Concurrently(
        [store_op, replay_op],
        mode="round_robin",
        output_indexes=[1],
        round_robin_weights=calculate_rr_weights(config))

    return StandardMetricsReporting(train_op, workers, config)
相关推荐
m0_748554817 小时前
golang如何实现用户订阅偏好管理_golang用户订阅偏好管理实现总结
jvm·数据库·python
smj2302_796826528 小时前
解决leetcode第3911题.移除子数组元素后第k小偶数
数据结构·python·算法·leetcode
阿正呀9 小时前
Redis怎样实现本地缓存的高效失效通知
jvm·数据库·python
2501_901200539 小时前
mysql如何设置InnoDB引擎参数_优化innodb_buffer_pool
jvm·数据库·python
_.Switch9 小时前
东方财富股票数据JS逆向:secids字段和AES加密实战
开发语言·前端·javascript·网络·爬虫·python·ecmascript
Mr_sst9 小时前
Claude Code 部署与使用保姆级教程(2026 最新)
python·ai
瞎某某Blinder9 小时前
DFT学习记录[6]基于 HES06的能带计算+有效质量计算
python·学习·程序人生·数据挖掘·云计算·学习方法
m0_4954964110 小时前
mysql处理复杂SQL性能_InnoDB优化器与MyISAM差异
jvm·数据库·python
forEverPlume11 小时前
PHP怎么使用Eloquent Attribute Composition属性组合_Laravel通过组合构建复杂属性【方法】
jvm·数据库·python
Aleeeeex11 小时前
RAG 那点事:从 8 份企业文档到能用的问答系统,全过程拆给你看
人工智能·python·ai编程