```python
# config.py

# --- Yard configuration ---
YARD_CONFIG = {
    "BLOCKS": ['A', 'B', 'C'],
    "ROWS_PER_BLOCK": 5,     # scaled down to speed up training
    "COLS_PER_ROW": 4,
    "TIERS_PER_SLOT": 3,
    "CHE_POSITIONS": {       # yard-crane (CHE) position in each block
        'A': (2, 1),
        'B': (1, 1),
        'C': (3, 2),
    },
    "MAX_CONTAINERS": 200,   # maximum yard capacity (note: only 3*5*4*3 = 180 physical slots)
}

# --- MCTS configuration ---
MCTS_CONFIG = {
    "SIMULATION_COUNT": 100,   # fewer simulations to keep pace with RL training
    "TIME_LIMIT": 1.0,         # search time limit (seconds)
    "EXPLORATION_PARAM": 1.41,
    "DISCOUNT_FACTOR": 0.95,
    "PRIOR_WEIGHT": 0.25,
    "USE_RL_PRIOR": True,      # whether MCTS uses the RL model's output as a prior
    "RL_PRIOR_WEIGHT": 0.5,    # weight of the RL prior
}

# --- RL configuration ---
RL_CONFIG = {
    "LEARNING_RATE": 0.001,
    "BATCH_SIZE": 32,
    "GAMMA": 0.99,                 # discount factor
    "EPSILON_START": 1.0,
    "EPSILON_END": 0.1,
    "EPSILON_DECAY": 0.995,
    "TARGET_UPDATE_FREQ": 100,     # how often to sync the target network
    "MEMORY_SIZE": 10000,          # replay buffer capacity
    "TRAIN_EVERY_N_STEPS": 10,     # train once every N environment steps
    "EPISODES": 500,               # number of training episodes
    "MODEL_SAVE_PATH": "yard_rl_model.pth",
    "MODEL_LOAD_PATH": "yard_rl_model.pth",  # pretrained model to load
}

# --- Rule configuration ---
RULE_CONFIG = {
    "MAX_STACK_HEIGHT": YARD_CONFIG["TIERS_PER_SLOT"],
    "PREFER_NEAR_CHE": True,
    "BALANCE_CHE_LOAD": True,
    "AVOID_REHANDLING": True,
    "REWARD_DISTANCE_TO_CHE": -0.1,   # penalty per unit of distance to the CHE
    "REWARD_BALANCE_LOAD": -0.5,      # penalty for load imbalance
    "REWARD_SPACE_UTILIZATION": 0.1,  # reward for space utilization
    "REWARD_AVOID_REHANDLING": 0.2,   # reward for avoiding rehandling
    "PENALTY_COLLISION": -10.0,       # penalty for an invalid placement
    "REWARD_SUCCESSFUL_PLACE": 1.0,   # reward for a successful placement
}
```
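The yard dimensions above fix the size of the flat action space that the environment and both agents share (one index per block/row/col/tier slot). A quick standalone sanity check, assuming `config.py` is importable from the working directory:

```python
from config import YARD_CONFIG

n_blocks = len(YARD_CONFIG["BLOCKS"])
n_slots = (n_blocks
           * YARD_CONFIG["ROWS_PER_BLOCK"]
           * YARD_CONFIG["COLS_PER_ROW"]
           * YARD_CONFIG["TIERS_PER_SLOT"])
print(n_slots)  # 3 * 5 * 4 * 3 = 180 discrete placement actions
```

Note that `MAX_CONTAINERS` (200) is larger than the 180 physical slots, so in practice an episode ends when no legal slot remains or the step cap is reached, not when the container counter hits 200.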
```python
# main.py
import numpy as np
import time
from yard_environment import YardEnvironment
from rule_engine import RuleEngine
from mcts_agent import MCTSAgent
from rl_agent import RLYardAgent
from config import RL_CONFIG
def train_rl_agent():
    print("=== Training the RL agent ===")
    env = YardEnvironment(render_mode="human")  # render_mode is optional
    rule_eng = RuleEngine(env)
    rl_agent = RLYardAgent(env)
    # Try to load a pretrained model
    rl_agent.load_model()
    episodes = RL_CONFIG["EPISODES"]
    scores = []
    for e in range(episodes):
        state, _ = env.reset()
        total_reward = 0
        step_count = 0
        done = False
        while not done:
            available_actions = env.get_free_slots_as_actions()
            if not available_actions:
                break  # the yard is full
            action = rl_agent.act(state, available_actions)
            next_state, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated
            rl_agent.train_step(state, action, reward, next_state, done)
            state = next_state
            total_reward += reward
            step_count += 1
        scores.append(total_reward)
        avg_score = np.mean(scores[-100:])  # average score over the last 100 episodes
        print(f"Episode {e+1}/{episodes}, Score: {total_reward:.2f}, Avg Score (100): {avg_score:.2f}, Epsilon: {rl_agent.epsilon:.3f}, Step Count: {step_count}")
    # Save the trained model
    rl_agent.save_model()
    print("=== RL agent training finished ===")
    return rl_agent
def run_mcts_with_rl_prior(rl_agent):
    print("\n=== Running the MCTS + RL-prior system ===")
    env = YardEnvironment(render_mode="human")  # make sure the environment is initialized properly
    rule_eng = RuleEngine(env)
    mcts_agent = MCTSAgent(env, rule_eng, rl_agent)
    state, _ = env.reset()  # reset once so the five containers accumulate in the yard
    for i in range(5):
        print(f"\n--- Allocating new container {i+1} ---")
        container_info = {"id": f"NEW_{i}", "expected_pickup_time": np.random.randint(1, 10)}
        start_time = time.time()
        best_action, value = mcts_agent.search(container_info)
        end_time = time.time()
        if best_action is None:
            print("Error: MCTS could not find a suitable slot!")
            break
        # Unpack the Gymnasium-style step() return value
        next_state, reward, terminated, truncated, info = env.step(best_action)
        # A placement counts as successful only if the environment reports it as such
        if info.get("reason") == "Successful Place":
            block_idx, r, c, t = env._decode_action(best_action)
            block = env.blocks[block_idx]
            print(f"MCTS-RL recommended slot: Block {block}, Row {r}, Col {c}, Tier {t}")
            print(f"Estimated value: {value:.2f}")
            print(f"Search time: {end_time - start_time:.2f}s")
        else:
            print("Error: placement failed or the environment has terminated!")
        print(f"Current CHE loads: {env._get_che_loads()}")
        env.render()
        time.sleep(1)
    print("\n--- MCTS-RL run finished ---")
def run_mcts_without_rl():
    print("\n=== Running the pure MCTS system (baseline) ===")
    env = YardEnvironment(render_mode="human")
    rule_eng = RuleEngine(env)
    mcts_agent = MCTSAgent(env, rule_eng, None)
    state, _ = env.reset()  # reset once so the five containers accumulate in the yard
    for i in range(5):
        print(f"\n--- Allocating new container {i+1} (pure MCTS) ---")
        container_info = {"id": f"NEW_{i}_pure_mcts", "expected_pickup_time": np.random.randint(1, 10)}
        start_time = time.time()
        best_action, value = mcts_agent.search(container_info)
        end_time = time.time()
        if best_action is None:
            print("Error: MCTS could not find a suitable slot!")
            break
        # Unpack the Gymnasium-style step() return value
        next_state, reward, terminated, truncated, info = env.step(best_action)
        # A placement counts as successful only if the environment reports it as such
        if info.get("reason") == "Successful Place":
            block_idx, r, c, t = env._decode_action(best_action)
            block = env.blocks[block_idx]
            print(f"Pure-MCTS recommended slot: Block {block}, Row {r}, Col {c}, Tier {t}")
            print(f"Estimated value: {value:.2f}")
            print(f"Search time: {end_time - start_time:.2f}s")
        else:
            print("Error: placement failed or the environment has terminated!")
        print(f"Current CHE loads: {env._get_che_loads()}")
        env.render()
        time.sleep(1)
    print("\n--- Pure MCTS run finished ---")
def main():
    print("=== Container yard MCTS + RL intelligent slot-selection system ===")
    # 1. Train the RL agent
    trained_rl_agent = train_rl_agent()
    # 2. Use the trained RL agent to guide MCTS
    run_mcts_with_rl_prior(trained_rl_agent)
    # 3. Run pure MCTS as a baseline
    run_mcts_without_rl()

if __name__ == "__main__":
    main()
```
mcts_agent.py
```python
import math
import time
import random
import numpy as np
from config import MCTS_CONFIG
class MCTSNode:
    def __init__(self, state_snapshot, parent=None, action=None):
        self.state = state_snapshot     # snapshot of the yard layout (numpy array)
        self.parent = parent
        self.action = action            # flat action index that led to this node
        self.children = []
        self.visits = 0
        self.value = 0.0
        self.prior_prob = 0.0           # prior probability (from the rules or the RL model)
        self.untried_actions = None
        self._actions_initialized = False
def is_fully_expanded(self):
return self._actions_initialized and len(self.untried_actions) == 0
def ucb1_score(self):
if self.visits == 0:
return float('inf')
exploitation = self.value / self.visits
if self.parent is None or self.parent.visits == 0:
exploration = 0
else:
exploration = MCTS_CONFIG["EXPLORATION_PARAM"] * \
math.sqrt(math.log(self.parent.visits) / self.visits)
return exploitation + exploration + MCTS_CONFIG["PRIOR_WEIGHT"] * self.prior_prob
class MCTSAgent:
    def __init__(self, simulator, rule_engine, rl_agent=None):
        self.sim = simulator
        self.rule_engine = rule_engine
        self.rl_agent = rl_agent        # optional RL agent used as a prior
    def search(self, container_info, time_limit=None):
        if time_limit is None:
            time_limit = MCTS_CONFIG["TIME_LIMIT"]
        start_time = time.time()
        root = MCTSNode(self._get_state_snapshot())
        # --- Initialize the root node ---
        free_actions = self.sim.get_free_slots_as_actions()
        if not free_actions:
            print("Warning: the yard is full, no free slots!")
            return None, -float('inf')
        # Apply rule-based filtering
        filtered_actions = self.rule_engine.apply_filters(free_actions, container_info)
        if not filtered_actions:
            print("Warning: no slot passed the rule filters!")
            return None, -float('inf')
        root.untried_actions = filtered_actions[:]
        root._actions_initialized = True
        # If the RL prior is enabled, compute the RL action probabilities once
        rl_probs = None
        if MCTS_CONFIG["USE_RL_PRIOR"] and self.rl_agent is not None:
            state_for_rl = self._get_state_for_rl()
            rl_probs = self.rl_agent.get_action_probs(state_for_rl, filtered_actions)
        iteration_count = 0
        while time.time() - start_time < time_limit and iteration_count < MCTS_CONFIG["SIMULATION_COUNT"]:
            # Selection
            node = self._tree_policy(root)
            # Expansion (only if the node is non-terminal and has untried actions)
            if not self._is_terminal(node.state) and not node.is_fully_expanded():
                new_node = self._expand(node, container_info, rl_probs)
                # Simulation
                reward = self._default_policy(new_node.state, container_info)
                # Backpropagation
                self._backup_negamax(new_node, reward)
            else:
                reward = self._default_policy(node.state, container_info)
                self._backup_negamax(node, reward)
            iteration_count += 1
        if not root.children:
            return None, -float('inf')
        best_child = max(root.children, key=lambda c: c.visits)
        return best_child.action, best_child.value / best_child.visits if best_child.visits > 0 else 0
def _tree_policy(self, node):
current = node
while not self._is_terminal(current.state) and current.is_fully_expanded():
current = self._best_child_uct(current)
return current
    def _expand(self, node, container_info, rl_probs=None):
        if not node._actions_initialized:
            free_actions = self._get_free_slots_from_snapshot(node.state)
            filtered_actions = self.rule_engine.apply_filters(free_actions, container_info)
            node.untried_actions = filtered_actions[:]
            node._actions_initialized = True
        if node.untried_actions:
            action = node.untried_actions.pop()
            new_state = self._apply_action_to_state(node.state, action, container_info['id'])
            child = MCTSNode(new_state, parent=node, action=action)
            prior_score = 0.0
            if MCTS_CONFIG["USE_RL_PRIOR"] and rl_probs is not None:
                prior_score = rl_probs[action] * MCTS_CONFIG["RL_PRIOR_WEIGHT"]
            else:
                # Wrap the numpy snapshot as a dict so the rule engine can evaluate it
                decoded_slot = self.sim._decode_action(action)
                snapshot_dict = {i: node.state[i] for i in range(len(self.sim.blocks))}
                prior_score = self.rule_engine.evaluate_slot(snapshot_dict, decoded_slot, container_info)
            child.prior_prob = max(0, prior_score)
            node.children.append(child)
            return child
        else:
            return node
def _best_child_uct(self, node):
if not node.children:
return node
return max(node.children, key=lambda c: c.ucb1_score())
    def _default_policy(self, state_snapshot, container_info):
        # Rollout heuristic that scores the layout snapshot (numpy array) directly
        # Load per yard crane (one crane per block)
        loads = {}
        for i, block in enumerate(self.sim.blocks):
            loads[block] = np.sum(state_snapshot[i])  # sum over block i's slice
        load_variance = np.var(list(loads.values()))
        total_slots = len(self.sim.blocks) * self.sim.rows * self.sim.cols * self.sim.tiers
        # Total number of occupied slots across the whole yard
        occupied_slots = np.sum(state_snapshot != 0)
        space_util = occupied_slots / total_slots if total_slots > 0 else 0
        reward = -load_variance * 0.5 + space_util * 10
        return reward
    def _backup_negamax(self, node, reward):
        # Single-player backpropagation: propagate the rollout reward up to the root
        # (no sign flip is performed despite the "negamax" name)
        current = node
        while current is not None:
            current.visits += 1
            current.value += reward
            current = current.parent
def _is_terminal(self, state_snapshot):
return len(self._get_free_slots_from_snapshot(state_snapshot)) == 0
    def _get_state_snapshot(self):
        # Snapshot of the current yard layout
        return self.sim.layout.copy()
    def _get_state_for_rl(self):
        # State in the format the RL agent expects
        return self.sim._get_obs()
    def _apply_action_to_state(self, state_snapshot, action, container_id):
        # Apply the placement to a copy of the snapshot
        block_idx, r, c, t = self.sim._decode_action(action)
        new_snapshot = state_snapshot.copy()
        new_snapshot[block_idx, r, c, t] = 1
        return new_snapshot
def _get_free_slots_from_snapshot(self, snap_layout):
free_actions = []
for i in range(len(self.sim.blocks)):
for r in range(self.sim.rows):
for c in range(self.sim.cols):
for t in range(self.sim.tiers):
if snap_layout[i, r, c, t] == 0:
can_place = True
for lower_t in range(t):
if snap_layout[i, r, c, lower_t] == 0:
can_place = False
break
if can_place:
free_actions.append(self.sim._encode_action(i, r, c, t))
return free_actions
    # def _get_che_loads_from_snapshot(self, snap_layout):
    #     # Works directly on the numpy.ndarray snapshot
    #     loads = {block: 0 for block in self.sim.blocks}
    #     for i, block in enumerate(self.sim.blocks):
    #         loads[block] = np.sum(snap_layout[i])
    #     return loads
```
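The selection rule in `MCTSNode.ucb1_score` is a UCB1 term plus a prior bonus: the mean value, plus `EXPLORATION_PARAM * sqrt(ln(parent visits) / visits)`, plus `PRIOR_WEIGHT * prior_prob`. A toy, self-contained illustration of how the three terms trade off (the node statistics below are made up for the example):

```python
import math

C = 1.41        # MCTS_CONFIG["EXPLORATION_PARAM"]
PRIOR_W = 0.25  # MCTS_CONFIG["PRIOR_WEIGHT"]

def selection_score(total_value, visits, parent_visits, prior_prob):
    if visits == 0:
        return float("inf")  # unvisited children are always tried first
    exploitation = total_value / visits
    exploration = C * math.sqrt(math.log(parent_visits) / visits)
    return exploitation + exploration + PRIOR_W * prior_prob

# A child visited 4 times with mean value 0.5, under a parent visited 20 times,
# carrying an RL/rule prior of 0.8:
print(selection_score(2.0, 4, 20, 0.8))  # ~0.50 + 1.22 + 0.20 = ~1.92
```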
```python
# model.py
import torch
import torch.nn as nn
import torch.nn.functional as F
class YardQNetwork(nn.Module):
    """
    A small 3D convolutional network that estimates state-action values Q(s, a).
    Input:  (batch_size, n_blocks, rows, cols, tiers)
    Output: (batch_size, n_actions)
    """
    def __init__(self, n_blocks, rows, cols, tiers, n_actions):
        super(YardQNetwork, self).__init__()
        self.conv1 = nn.Conv3d(in_channels=n_blocks, out_channels=16, kernel_size=3, padding=1)
        self.conv2 = nn.Conv3d(in_channels=16, out_channels=32, kernel_size=3, padding=1)
        self.pool = nn.AdaptiveAvgPool3d((1, 1, 1))  # global average pooling
        self.fc1 = nn.Linear(32, 64)
        self.fc2 = nn.Linear(64, n_actions)          # one Q-value per action
    def forward(self, x):
        # x shape: (batch_size, n_blocks, rows, cols, tiers)
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = self.pool(x)           # (batch_size, 32, 1, 1, 1)
        x = x.view(x.size(0), -1)  # (batch_size, 32)
        x = F.relu(self.fc1(x))    # (batch_size, 64)
        x = self.fc2(x)            # (batch_size, n_actions)
        return x
```
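A standalone shape check (not part of the project files), assuming `model.py` and `config.py` are importable: the network treats the blocks as the input channels of a 3-D volume and returns one Q-value per flat action index.

```python
import torch
from config import YARD_CONFIG
from model import YardQNetwork

n_blocks = len(YARD_CONFIG["BLOCKS"])
rows = YARD_CONFIG["ROWS_PER_BLOCK"]
cols = YARD_CONFIG["COLS_PER_ROW"]
tiers = YARD_CONFIG["TIERS_PER_SLOT"]
n_actions = n_blocks * rows * cols * tiers

net = YardQNetwork(n_blocks, rows, cols, tiers, n_actions)
dummy = torch.zeros(2, n_blocks, rows, cols, tiers)  # a batch of two empty yards
print(net(dummy).shape)  # torch.Size([2, 180])
```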
rl_agent.py
```python
# rl_agent.py
import torch
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import random
from collections import deque
from model import YardQNetwork
from config import RL_CONFIG
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class ReplayBuffer:
def __init__(self, capacity=RL_CONFIG["MEMORY_SIZE"]):
self.buffer = deque(maxlen=capacity)
def push(self, state, action, reward, next_state, done):
self.buffer.append((state, action, reward, next_state, done))
def sample(self, batch_size):
batch = random.sample(self.buffer, batch_size)
state, action, reward, next_state, done = map(np.stack, zip(*batch))
return state, action, reward, next_state, done
def __len__(self):
return len(self.buffer)
class RLYardAgent:
def __init__(self, env):
self.env = env
self.n_actions = env.action_space.n
self.state_shape = env.observation_space.shape # (n_blocks, rows, cols, tiers)
self.n_blocks, self.rows, self.cols, self.tiers = self.state_shape
self.q_network = YardQNetwork(self.n_blocks, self.rows, self.cols, self.tiers, self.n_actions).to(device)
self.target_network = YardQNetwork(self.n_blocks, self.rows, self.cols, self.tiers, self.n_actions).to(device)
self.optimizer = optim.Adam(self.q_network.parameters(), lr=RL_CONFIG["LEARNING_RATE"])
self.memory = ReplayBuffer()
self.epsilon = RL_CONFIG["EPSILON_START"]
self.gamma = RL_CONFIG["GAMMA"]
self.batch_size = RL_CONFIG["BATCH_SIZE"]
self.target_update_freq = RL_CONFIG["TARGET_UPDATE_FREQ"]
self.train_every_n_steps = RL_CONFIG["TRAIN_EVERY_N_STEPS"]
self.step_counter = 0
    def act(self, state, available_actions=None):
        """Select an action with the current policy."""
        state_tensor = torch.FloatTensor(state).unsqueeze(0).to(device)  # (1, n_blocks, rows, cols, tiers)
        # Epsilon-greedy exploration
        if random.random() < self.epsilon:
            if available_actions:
                return random.choice(available_actions)
            else:
                return self.env.action_space.sample()
        with torch.no_grad():
            q_values = self.q_network(state_tensor).cpu().numpy()[0]
        if available_actions:
            # Restrict the argmax to the currently legal actions
            masked_q_values = np.full(self.n_actions, -np.inf)
            for a in available_actions:
                masked_q_values[a] = q_values[a]
            return np.argmax(masked_q_values)
        else:
            return np.argmax(q_values)
    def remember(self, state, action, reward, next_state, done):
        """Store a transition in the replay buffer."""
        self.memory.push(state, action, reward, next_state, done)
    def replay(self):
        """Learn from a minibatch sampled from the replay buffer."""
if len(self.memory) < self.batch_size:
return
states, actions, rewards, next_states, dones = self.memory.sample(self.batch_size)
states = torch.FloatTensor(states).to(device)
actions = torch.LongTensor(actions).to(device)
rewards = torch.FloatTensor(rewards).to(device)
next_states = torch.FloatTensor(next_states).to(device)
dones = torch.BoolTensor(dones).to(device)
current_q_values = self.q_network(states).gather(1, actions.unsqueeze(1)) # (batch_size, 1)
next_q_values = self.target_network(next_states).max(1)[0].detach() # (batch_size,)
target_q_values = rewards + (self.gamma * next_q_values * ~dones) # (batch_size,)
loss = F.mse_loss(current_q_values.squeeze(), target_q_values)
self.optimizer.zero_grad()
loss.backward()
self.optimizer.step()
    def update_target_network(self):
        """Copy the online network's weights into the target network."""
        self.target_network.load_state_dict(self.q_network.state_dict())
    def decay_epsilon(self):
        """Decay the exploration rate."""
        self.epsilon = max(RL_CONFIG["EPSILON_END"], self.epsilon * RL_CONFIG["EPSILON_DECAY"])
    def train_step(self, state, action, reward, next_state, done):
        """Store one transition and run the periodic training/update hooks."""
        self.remember(state, action, reward, next_state, done)
        self.step_counter += 1
        if self.step_counter % self.train_every_n_steps == 0:
            self.replay()
        if self.step_counter % self.target_update_freq == 0:
            self.update_target_network()
        self.decay_epsilon()
    def save_model(self, path=RL_CONFIG["MODEL_SAVE_PATH"]):
        """Save the online network's weights."""
        torch.save(self.q_network.state_dict(), path)
        print(f"Model saved to {path}")
    def load_model(self, path=RL_CONFIG["MODEL_LOAD_PATH"]):
        """Load pretrained weights if they exist."""
        try:
            self.q_network.load_state_dict(torch.load(path, map_location=device))
            self.target_network.load_state_dict(self.q_network.state_dict())
            print(f"Model loaded from {path}")
        except FileNotFoundError:
            print(f"Model file {path} not found. Starting from scratch.")
    def get_action_probs(self, state, available_actions=None):
        """Return a probability distribution over actions (used as the MCTS prior)."""
        with torch.no_grad():
            state_tensor = torch.FloatTensor(state).unsqueeze(0).to(device)
            q_values = self.q_network(state_tensor).cpu().numpy()[0]
        if available_actions:
            masked_q_values = np.full(self.n_actions, -np.inf)
            for a in available_actions:
                masked_q_values[a] = q_values[a]
            # Softmax over the masked Q-values (subtract the max for numerical stability)
            probs = np.exp(masked_q_values - np.max(masked_q_values))
            probs = probs / probs.sum()
            return probs
        else:
            # No restriction: softmax over all actions
            exp_q = np.exp(q_values - np.max(q_values))
            return exp_q / exp_q.sum()
```
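The bridge between the two agents is `get_action_probs`, which converts masked Q-values into a softmax distribution that MCTS can use as a prior. A small numpy-only illustration with made-up Q-values (four actions, action 1 currently illegal):

```python
import numpy as np

q_values = np.array([1.0, 2.0, 0.5, 3.0])  # hypothetical Q(s, a) for four actions
available = [0, 2, 3]                      # action 1 is not placeable right now

masked = np.full_like(q_values, -np.inf)
masked[available] = q_values[available]

probs = np.exp(masked - masked.max())      # subtract the max for numerical stability
probs /= probs.sum()
print(probs)  # ~[0.111, 0.0, 0.067, 0.821]; the illegal action gets zero probability
```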
rule_engine.py
```python
# rule_engine.py
import numpy as np
from config import RULE_CONFIG, YARD_CONFIG
class RuleEngine:
def __init__(self, simulator):
self.sim = simulator # simulator is the environment instance here
self.blocks = YARD_CONFIG["BLOCKS"]
self.rows = YARD_CONFIG["ROWS_PER_BLOCK"]
self.cols = YARD_CONFIG["COLS_PER_ROW"]
self.che_positions = YARD_CONFIG["CHE_POSITIONS"]
    def apply_filters(self, free_actions, container_info):
        """Apply placement rules to narrow down the candidate actions."""
        # free_actions is a list of flat action indices; decode to (block_idx, r, c, t)
        free_slots = [self.sim._decode_action(a) for a in free_actions]
        filtered_slots = free_slots.copy()
        # Rule 1: prefer slots close to the yard crane (CHE)
        if RULE_CONFIG["PREFER_NEAR_CHE"]:
            filtered_slots = self._filter_near_che(filtered_slots)
        # Rule 2: balance the load across yard cranes
        if RULE_CONFIG["BALANCE_CHE_LOAD"]:
            filtered_slots = self._filter_balance_che_load(filtered_slots)
        # Rule 3: avoid increasing the rehandling risk (simplified)
        if RULE_CONFIG["AVOID_REHANDLING"]:
            filtered_slots = self._filter_avoid_rehandling(filtered_slots, container_info)
        # Encode the surviving slots back into action indices
        filtered_actions = [self.sim._encode_action(*slot) for slot in filtered_slots]
        return filtered_actions
def _filter_near_che(self, slots):
if not slots: return slots
distances = []
for block_idx, r, c, t in slots:
block = self.blocks[block_idx]
che_r, che_c = self.che_positions[block]
dist = abs(r - che_r) + abs(c - che_c)
distances.append(dist)
min_dist = min(distances)
closest_slots = [slots[i] for i, d in enumerate(distances) if d == min_dist]
return closest_slots
    def _filter_balance_che_load(self, slots):
        if not slots: return slots
        # Current load of each yard crane
        loads = self._get_current_che_loads()
        load_scores = []
        for block_idx, r, c, t in slots:
            block = self.blocks[block_idx]
            hypothetical_load = {k: v for k, v in loads.items()}
            hypothetical_load[block] += 1
            load_variance = np.var(list(hypothetical_load.values()))
            # Negative variance: the smaller the variance, the higher the score
            load_scores.append(-load_variance)
        max_score = max(load_scores)
        balanced_slots = [slots[i] for i, s in enumerate(load_scores) if s == max_score]
        return balanced_slots
    def _filter_avoid_rehandling(self, slots, container_info):
        if not slots: return slots
        # Simplification: prefer the highest available tier
        target_tier = max(slot[3] for slot in slots)
        rehandling_safe_slots = [slot for slot in slots if slot[3] == target_tier]
        if not rehandling_safe_slots:
            return slots
        return rehandling_safe_slots
    def _get_current_che_loads(self):
        loads = {block: 0 for block in self.blocks}
        for i, block in enumerate(self.blocks):
            # Read the current load from the environment's layout
            # (sim is the environment instance; layout shape: (n_blocks, rows, cols, tiers))
            loads[block] = np.sum(self.sim.layout[i])
        return loads
    def evaluate_slot(self, state_snapshot, slot, container_info):
        """Score a candidate slot (used as a prior during MCTS simulation)."""
        block_idx, r, c, t = slot
        block = self.blocks[block_idx]
        score = 0.0
        # Distance to the yard crane
        che_r, che_c = self.che_positions[block]
        dist_to_che = abs(r - che_r) + abs(c - che_c)
        score += RULE_CONFIG["REWARD_DISTANCE_TO_CHE"] * dist_to_che
        # Load balance (based on the passed-in state_snapshot)
        loads = self._get_che_loads_from_snapshot(state_snapshot)
        hypothetical_load = {k: v for k, v in loads.items()}
        hypothetical_load[block] += 1
        load_variance = np.var(list(hypothetical_load.values()))
        score += RULE_CONFIG["REWARD_BALANCE_LOAD"] * load_variance
        # Tier reward (higher tiers are treated as lower rehandling risk)
        score += RULE_CONFIG["REWARD_AVOID_REHANDLING"] * t
        # Space-utilization reward (based on the passed-in state_snapshot)
        total_slots = len(self.sim.blocks) * self.sim.rows * self.sim.cols * self.sim.tiers
        occupied_slots = sum(np.sum(layout != 0) for layout in state_snapshot.values())
        util = occupied_slots / total_slots if total_slots > 0 else 0
        score += RULE_CONFIG["REWARD_SPACE_UTILIZATION"] * util
        return score
def _get_che_loads_from_snapshot(self, snap_layout):
loads = {block: 0 for block in self.sim.blocks}
for i, block in enumerate(self.sim.blocks):
loads[block] = np.sum(snap_layout[i])
        return loads
```
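For intuition, here is the arithmetic `evaluate_slot` performs for one hypothetical candidate, using the `RULE_CONFIG` weights from `config.py` (distance to the crane 2, post-placement load variance 0.25, tier 1, 10% occupancy); the numbers are illustrative only:

```python
score = (
    -0.1 * 2       # REWARD_DISTANCE_TO_CHE   * Manhattan distance to the CHE
    + -0.5 * 0.25  # REWARD_BALANCE_LOAD      * hypothetical load variance
    + 0.2 * 1      # REWARD_AVOID_REHANDLING  * tier index (higher tiers score better)
    + 0.1 * 0.10   # REWARD_SPACE_UTILIZATION * occupancy ratio
)
print(score)  # -0.2 - 0.125 + 0.2 + 0.01 = about -0.115
```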
yard_environment.py
```python
# yard_environment.py
import gymnasium as gym
import numpy as np
import random
from config import YARD_CONFIG, RULE_CONFIG
from rule_engine import RuleEngine
class YardEnvironment(gym.Env):
metadata = {"render_modes": ["human", "ansi"]}
def __init__(self, render_mode=None):
super(YardEnvironment, self).__init__()
self.render_mode = render_mode
self.blocks = YARD_CONFIG["BLOCKS"]
self.rows = YARD_CONFIG["ROWS_PER_BLOCK"]
self.cols = YARD_CONFIG["COLS_PER_ROW"]
self.tiers = YARD_CONFIG["TIERS_PER_SLOT"]
self.che_positions = YARD_CONFIG["CHE_POSITIONS"]
self.max_containers = YARD_CONFIG["MAX_CONTAINERS"]
        # Action space: every possible placement position (block_idx, row, col, tier),
        # with the 4-D position encoded as a single flat index
        self.n_blocks = len(self.blocks)
        self.action_space = gym.spaces.Discrete(self.n_blocks * self.rows * self.cols * self.tiers)
        # Observation space: yard occupancy (blocks, rows, cols, tiers); 0 = empty, 1 = occupied.
        # Further features (e.g. crane positions) could also be added here.
        obs_shape = (self.n_blocks, self.rows, self.cols, self.tiers)
        self.observation_space = gym.spaces.Box(low=0, high=1, shape=obs_shape, dtype=np.float32)
        self.reset()
def reset(self, seed=None, options=None):
super().reset(seed=seed)
self.layout = np.zeros((self.n_blocks, self.rows, self.cols, self.tiers), dtype=np.int8)
self.container_count = 0
self.next_container_id = 0
self.episode_reward = 0
self.step_count = 0
self.rule_engine = RuleEngine(self)
return self._get_obs(), {}
    def step(self, action):
        info = {}
        terminated = False
        truncated = False
        reward = 0
        # Decode and validate the action
        if not self._is_valid_action(action):
            # Invalid action: collision penalty
            reward = RULE_CONFIG["PENALTY_COLLISION"]
            info["reason"] = "Invalid Action"
        else:
            block_idx, r, c, t = self._decode_action(action)
            block = self.blocks[block_idx]
            # Check that every tier below is already occupied (stacking rule)
            can_place = True
            for lower_t in range(t):
                if self.layout[block_idx, r, c, lower_t] == 0:
                    can_place = False
                    break
            if can_place:
                # Execute the placement
                self.layout[block_idx, r, c, t] = 1
                self.container_count += 1
                self.step_count += 1
                # Reward shaping
                reward += RULE_CONFIG["REWARD_SUCCESSFUL_PLACE"]
                # Distance to the yard crane
                che_r, che_c = self.che_positions[block]
                dist_to_che = abs(r - che_r) + abs(c - che_c)
                reward += RULE_CONFIG["REWARD_DISTANCE_TO_CHE"] * dist_to_che
                # Load balance
                loads = self._get_che_loads()
                load_variance = np.var(list(loads.values()))
                reward += RULE_CONFIG["REWARD_BALANCE_LOAD"] * load_variance
                # Space utilization
                util = self.container_count / self.max_containers
                reward += RULE_CONFIG["REWARD_SPACE_UTILIZATION"] * util
                # Rehandling avoidance (simplified: higher tiers score better)
                reward += RULE_CONFIG["REWARD_AVOID_REHANDLING"] * t
                info["reason"] = "Successful Place"
            else:
                # Stacking rule violated: collision penalty
                reward = RULE_CONFIG["PENALTY_COLLISION"]
                info["reason"] = "Cannot Place Here (Stacking Rule Violated)"
        self.episode_reward += reward
        # Termination conditions
        if self.container_count >= self.max_containers:
            terminated = True
            info["final_reason"] = "Yard Full"
        elif self.step_count > 200:  # cap the episode length to avoid endless loops
            truncated = True
            info["final_reason"] = "Max Steps Reached"
        observation = self._get_obs()
        return observation, reward, terminated, truncated, info
    def _get_obs(self):
        # Return the current yard layout as the observation
        return self.layout.astype(np.float32)
    def _decode_action(self, action):
        # Convert a flat action index into (block_idx, r, c, t)
        t = action % self.tiers
        action //= self.tiers
        c = action % self.cols
        action //= self.cols
        r = action % self.rows
        action //= self.rows
        block_idx = action
        return block_idx, r, c, t
    def _encode_action(self, block_idx, r, c, t):
        # Convert (block_idx, r, c, t) into a flat action index
        return block_idx * (self.rows * self.cols * self.tiers) + \
               r * (self.cols * self.tiers) + \
               c * self.tiers + t
    def _is_valid_action(self, action):
        # Check that the action index lies within the action space
        if not (0 <= action < self.action_space.n):
            return False
        block_idx, r, c, t = self._decode_action(action)
        # Check that the target slot is empty
        return self.layout[block_idx, r, c, t] == 0
def _get_che_loads(self):
loads = {block: 0 for block in self.blocks}
for i, block in enumerate(self.blocks):
loads[block] = np.sum(self.layout[i])
return loads
    def render(self):
        if self.render_mode == "human":
            print("\n--- Yard layout ---")
for i, block in enumerate(self.blocks):
print(f"\nBlock {block}:")
print(" ", end="")
for c in range(self.cols):
print(f"{c:4}", end="")
print()
for r in range(self.rows):
print(f"{r:2}|", end="")
for c in range(self.cols):
occupied = any(self.layout[i, r, c, t] == 1 for t in range(self.tiers))
if (r, c) == self.che_positions[block]:
symbol = " H " if occupied else " h "
else:
symbol = " X " if occupied else " . "
print(symbol, end="")
print()
print(f"Total Containers: {self.container_count}/{self.max_containers}")
elif self.render_mode == "ansi":
# Return a string representation
output = ""
for i, block in enumerate(self.blocks):
output += f"\nBlock {block}:\n"
output += " " + "".join([f"{c:4}" for c in range(self.cols)]) + "\n"
for r in range(self.rows):
output += f"{r:2}|" + "".join([
(" H " if any(self.layout[i, r, c, t] == 1 for t in range(self.tiers)) else " h ") if (r, c) == self.che_positions[block] else
(" X " if any(self.layout[i, r, c, t] == 1 for t in range(self.tiers)) else " . ")
for c in range(self.cols)
]) + "\n"
output += f"Total Containers: {self.container_count}/{self.max_containers}\n"
return output
    def get_free_slots_as_actions(self):
        """Return the action indices of every slot that can currently be filled."""
        free_actions = []
        for i in range(self.n_blocks):
            for r in range(self.rows):
                for c in range(self.cols):
                    for t in range(self.tiers):
                        if self.layout[i, r, c, t] == 0:
                            # A slot is placeable only if every tier below it is occupied
                            can_place = True
                            for lower_t in range(t):
                                if self.layout[i, r, c, lower_t] == 0:
                                    can_place = False
                                    break
                            if can_place:
                                free_actions.append(self._encode_action(i, r, c, t))
        return free_actions
```
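Because every module relies on `_encode_action` and `_decode_action` agreeing with each other, a round-trip check makes a useful smoke test. A minimal sketch, assuming `yard_environment.py` and its dependencies (gymnasium, numpy) are installed and importable:

```python
from yard_environment import YardEnvironment

env = YardEnvironment()  # render_mode defaults to None, so nothing is drawn
for a in range(env.action_space.n):
    assert env._encode_action(*env._decode_action(a)) == a
print("encode/decode round-trip OK for", env.action_space.n, "actions")
```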