A Transportation Route Optimization System Based on Deep Reinforcement Learning (Deep Q-Network, DQN)

This is a transportation route optimization system based on deep reinforcement learning (Deep Q-Network, DQN). The code consists of the following parts:

1. Imports

import numpy as np
import gym
from gym import spaces
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import random
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import deque, namedtuple
  • Imports the required Python libraries: NumPy, the Gym environment library, Matplotlib for plotting, the standard random module, and the PyTorch deep learning framework. A quick sanity check is sketched below.
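
As an optional sanity check (my own addition, not part of the original script), the imports and GPU availability can be verified up front:

import torch
import gym

print("PyTorch:", torch.__version__)                  # library versions
print("Gym:", gym.__version__)
print("CUDA available:", torch.cuda.is_available())   # True if a GPU is usable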

2. Chinese Font Configuration

plt.rcParams['font.sans-serif'] = ['SimSun']  # use the SimSun font for Chinese text
plt.rcParams['axes.unicode_minus'] = False     # render the minus sign correctly
  • Configures Matplotlib so that the Chinese plot labels used later (and the minus sign) render correctly; a fallback variant is sketched below.
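
SimSun ships with Windows but may be missing elsewhere; a hedged variant (my own sketch, not from the original code) lists several candidate CJK fonts so Matplotlib falls back to whichever is installed:

# Matplotlib uses the first font in the list that it can actually find.
plt.rcParams['font.sans-serif'] = ['SimSun', 'SimHei', 'Microsoft YaHei', 'Noto Sans CJK SC']
plt.rcParams['axes.unicode_minus'] = False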

3. Experience Replay Buffer

Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward', 'done'))

class ReplayMemory:
    """经验回放缓冲区"""
    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)
    
    def push(self, *args):
        """保存一个转换"""
        self.memory.append(Transition(*args))
    
    def sample(self, batch_size):
        """随机采样一批转换"""
        return random.sample(self.memory, batch_size)
    
    def __len__(self):
        return len(self.memory)
  • Defines the Transition namedtuple that stores a single state transition.
  • The ReplayMemory class implements the experience replay buffer used to store and sample training data; a usage sketch follows.
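
A minimal usage sketch (illustrative only; assumes the imports from section 1 and the ReplayMemory class above) fills the buffer with dummy transitions and draws a mini-batch:

# Dummy 5-dimensional states, matching this environment's state vector.
memory = ReplayMemory(capacity=1000)
for _ in range(100):
    s = np.random.rand(5).astype(np.float32)
    s_next = np.random.rand(5).astype(np.float32)
    memory.push(s, random.randrange(4), s_next, 0.0, False)

batch = memory.sample(64)                        # list of 64 Transition tuples
states, actions, next_states, rewards, dones = zip(*batch)
print(len(memory), len(batch))                   # 100 64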

4. DQN Network Architecture

class DQN(nn.Module):
    """深度Q网络模型"""
    def __init__(self, state_size, action_size):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(state_size, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, action_size)
    
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)
  • Defines a simple three-layer fully connected network that approximates the Q-function; a shape check is sketched below.
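
A quick shape check (illustrative, assuming the imports above): a batch of random 5-dimensional states should produce one Q-value per action.

net = DQN(state_size=5, action_size=4)
dummy_states = torch.randn(8, 5)      # batch of 8 states
q_values = net(dummy_states)
print(q_values.shape)                 # torch.Size([8, 4])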

5. Transportation Route Optimization Environment

class TransportationEnv(gym.Env):
    """Transportation route optimization environment"""
    metadata = {'render.modes': ['human']}
    
    def __init__(self, grid_size=(10, 10), num_obstacles=15, congestion_factor=0.3):
        # Environment parameters
        self.grid_size = grid_size
        self.num_obstacles = num_obstacles
        self.congestion_factor = congestion_factor
        
        # Action and observation spaces
        self.action_space = spaces.Discrete(4)  # up, down, left, right
        low = np.array([0, 0, 0, 0, 0])
        high = np.array([grid_size[0]-1, grid_size[1]-1, 
                         grid_size[0]-1, grid_size[1]-1, 
                         grid_size[0]+grid_size[1]])
        self.observation_space = spaces.Box(low, high, dtype=np.float32)  # matches the float32 vector returned by get_state()
        
        # Initialize the episode
        self.reset()
  • Defines a custom Gym environment that models the transportation routing problem.
  • The grid contains obstacles, congestion areas, a start position and a goal; a short interaction sketch follows.
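
An interaction sketch (illustrative): create the environment, inspect the spaces and take one random step.

env = TransportationEnv(grid_size=(10, 10))
print(env.action_space)          # 4 discrete actions: up, down, left, right
print(env.observation_space)     # 5-dimensional observation box
state = env.reset()
next_state, reward, done, info = env.step(env.action_space.sample())
print(reward, done)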

6. Environment Methods

    def reset(self):
        """Reset the environment"""
        # Randomly place obstacles, the start and the goal
        # Initialize the path and step counter
        # Generate the congestion areas
        return self.get_state()
    
    def generate_congestion(self):
        """Generate congestion areas"""
        # Place congestion cells near the centre of the grid
    
    def step(self, action):
        """Execute an action and return the next state, reward and done flag"""
        # Compute the new position
        # Check for collisions with obstacles
        # Compute the reward
        # Return the next state, reward, done flag and extra info
    
    def get_state(self):
        """Return the current state"""
        # The state holds the current position, the goal position and the distance between them
    
    def manhattan_distance(self, pos1, pos2):
        """Compute the Manhattan distance"""
        return abs(pos1[0] - pos2[0]) + abs(pos1[1] - pos2[1])
    
    def render(self, mode='human'):
        """Render the environment"""
        # Visualize the grid, obstacles, congestion, path, current position and goal
  • Implements the standard Gym environment methods reset(), step() and render(); a random-policy rollout using them is sketched below.
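
As a usage sketch (illustrative), a purely random policy can exercise the reset()/step()/render() loop; the methods themselves are filled in fully in the complete listing below.

env = TransportationEnv()
state = env.reset()
done = False
total_reward = 0.0
while not done:
    action = env.action_space.sample()            # random action
    state, reward, done, _ = env.step(action)
    total_reward += reward
print("episode return:", total_reward)
env.render()    # plots the grid, obstacles, congestion and the path taken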

7. DQN Agent

class DQNAgent:
    """Deep Q-network agent"""
    def __init__(self, state_size, action_size, device,
                 learning_rate=0.001, discount_factor=0.99,
                 exploration_rate=1.0, exploration_decay=0.995,
                 min_exploration_rate=0.01, memory_size=10000,
                 batch_size=64, target_update=10):
        # Store hyperparameters
        # Create the policy network and the target network
        # Set up the optimizer and the replay buffer
    
    def select_action(self, state, training=True):
        """Select an action"""
        # ε-greedy action selection
    
    def learn(self):
        """Learn from replayed experience"""
        # Sample a batch of transitions
        # Compute current and target Q-values
        # Compute the loss and update the network
        # Decay the exploration rate
        # Periodically update the target network
  • Implements the DQN agent with a policy network and a target network.
  • Training uses experience replay and an ε-greedy exploration policy; the resulting exploration schedule is sketched below.
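
With the default hyperparameters (exploration_rate=1.0, exploration_decay=0.995, min_exploration_rate=0.01) and one decay per learn() call, ε stays above its floor while 0.995^n > 0.01, i.e. for roughly 919 updates. A small sketch of that schedule (illustrative):

eps, eps_min, decay = 1.0, 0.01, 0.995
updates = 0
while eps > eps_min:
    eps *= decay
    updates += 1
print("learn() calls until the epsilon floor:", updates)   # 919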

8. Training and Evaluation Functions

def train_agent(env, agent, num_episodes=1000):
    """Train the agent"""
    # Run multiple episodes
    # Interact with the environment during each episode
    # Store experience and learn from it
    # Print training progress

def evaluate_agent(env, agent, num_episodes=10):
    """Evaluate the agent's performance"""
    # Run the agent greedily
    # Compute the success rate and the average number of steps
  • train_agent() trains the agent.
  • evaluate_agent() evaluates the trained agent; a reward-smoothing helper for the training curve is sketched below.
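
Per-episode returns are noisy, so a common add-on (not in the original code) is to smooth the reward history with a moving average before plotting:

def moving_average(values, window=50):
    """Trailing moving average; assumes len(values) >= window."""
    return np.convolve(values, np.ones(window) / window, mode='valid')

# e.g. plt.plot(moving_average(rewards)) after training, instead of the raw curve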

9. Main Program

if __name__ == "__main__":
    # Select the device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    
    # Create the environment and the agent
    env = TransportationEnv(grid_size=(10, 10))
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    agent = DQNAgent(state_size, action_size, device)
    
    # Train the agent
    print("开始训练智能体...")
    rewards = train_agent(env, agent, num_episodes=500)
    
    # Evaluate the agent
    print("\n评估智能体性能...")
    evaluate_agent(env, agent, num_episodes=5)
    
    # Visualize training progress and the learned path
    plt.figure(figsize=(10, 6))
    plt.plot(rewards)
    plt.title('训练奖励')
    plt.xlabel('回合')
    plt.ylabel('总奖励')
    plt.grid(True)
    plt.show()
    
    print("\n可视化最优路径...")
    state = env.reset()
    done = False
    
    while not done:
        action = agent.select_action(state, training=False)
        state, _, done, _ = env.step(action)
    
    env.render()
  • The main program builds the environment and the agent, trains and evaluates the agent, and visualizes the results; an optional save/load sketch follows.
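
One optional extension (not in the original code) is to persist the trained policy network so evaluation can be rerun without retraining; a minimal sketch using standard PyTorch serialization (the file name is arbitrary):

# Save after training ...
torch.save(agent.policy_net.state_dict(), "dqn_transport.pt")

# ... and reload into an agent built with the same state/action sizes.
agent.policy_net.load_state_dict(torch.load("dqn_transport.pt", map_location=device))
agent.policy_net.eval()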

The system is a complete deep reinforcement learning solution to the transportation routing problem: the agent learns to find a path from a start cell to a goal cell in a grid world that contains obstacles and congestion areas.

Complete Code

import numpy as np
import gym
from gym import spaces
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import random
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import deque, namedtuple

# Chinese font configuration
plt.rcParams['font.sans-serif'] = ['SimSun']  # use the SimSun font for Chinese text
plt.rcParams['axes.unicode_minus'] = False     # render the minus sign correctly

# Data structure for experience replay
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward', 'done'))

class ReplayMemory:
    """经验回放缓冲区"""
    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)
    
    def push(self, *args):
        """保存一个转换"""
        self.memory.append(Transition(*args))
    
    def sample(self, batch_size):
        """随机采样一批转换"""
        return random.sample(self.memory, batch_size)
    
    def __len__(self):
        return len(self.memory)

class DQN(nn.Module):
    """深度Q网络模型"""
    def __init__(self, state_size, action_size):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(state_size, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, action_size)
    
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

class TransportationEnv(gym.Env):
    """运输路径优化环境"""
    metadata = {'render.modes': ['human']}
    
    def __init__(self, grid_size=(10, 10), num_obstacles=15, congestion_factor=0.3):
        super(TransportationEnv, self).__init__()
        
        # Environment parameters
        self.grid_size = grid_size
        self.num_obstacles = num_obstacles
        self.congestion_factor = congestion_factor
        
        # Action space: up, down, left, right
        self.action_space = spaces.Discrete(4)
        
        # State space: agent position (x, y) + goal position (x, y) + congestion info (simplified to the Manhattan distance to the goal)
        low = np.array([0, 0, 0, 0, 0])
        high = np.array([grid_size[0]-1, grid_size[1]-1, 
                         grid_size[0]-1, grid_size[1]-1, 
                         grid_size[0]+grid_size[1]])
        self.observation_space = spaces.Box(low, high, dtype=np.float32)  # matches the float32 vector returned by get_state()
        
        # Initialize the episode
        self.reset()
        
    def reset(self):
        """重置环境"""
        # 随机生成障碍物
        self.obstacles = set()
        while len(self.obstacles) < self.num_obstacles:
            x = random.randint(0, self.grid_size[0]-1)
            y = random.randint(0, self.grid_size[1]-1)
            self.obstacles.add((x, y))
        
        # Randomly place the start and the goal, avoiding obstacles
        while True:
            self.agent_pos = (random.randint(0, self.grid_size[0]-1), 
                             random.randint(0, self.grid_size[1]-1))
            if self.agent_pos not in self.obstacles:
                break
                
        while True:
            self.goal_pos = (random.randint(0, self.grid_size[0]-1), 
                            random.randint(0, self.grid_size[1]-1))
            if self.goal_pos != self.agent_pos and self.goal_pos not in self.obstacles:
                break
        
        # Initialize the path and step counter
        self.path = [self.agent_pos]
        self.steps = 0
        self.max_steps = self.grid_size[0] * self.grid_size[1] * 2
        
        # Generate the congestion areas
        self.generate_congestion()
        
        # Return the initial state
        return self.get_state()
    
    def generate_congestion(self):
        """生成拥堵区域"""
        self.congestion = set()
        num_congestion = int(self.grid_size[0] * self.grid_size[1] * self.congestion_factor)
        
        # Congestion tends to cluster around the grid centre
        center_x, center_y = self.grid_size[0]//2, self.grid_size[1]//2
        for _ in range(num_congestion):
            while True:
                # Sample coordinates from a Gaussian centred on the grid centre
                x = int(np.clip(np.random.normal(center_x, self.grid_size[0]/4), 0, self.grid_size[0]-1))
                y = int(np.clip(np.random.normal(center_y, self.grid_size[1]/4), 0, self.grid_size[1]-1))
                pos = (x, y)
                if pos not in self.obstacles and pos not in self.congestion:
                    self.congestion.add(pos)
                    break
    
    def step(self, action):
        """Execute an action and return the next state, reward and done flag"""
        self.steps += 1
        
        # Compute the new position
        x, y = self.agent_pos
        if action == 0:  # up
            y = max(y-1, 0)
        elif action == 1:  # down
            y = min(y+1, self.grid_size[1]-1)
        elif action == 2:  # left
            x = max(x-1, 0)
        elif action == 3:  # right
            x = min(x+1, self.grid_size[0]-1)
            
        new_pos = (x, y)
        
        # Check for a collision with an obstacle
        if new_pos in self.obstacles:
            reward = -10  # penalty for hitting an obstacle
            # The move is rejected, but the episode must still end on timeout;
            # otherwise a greedy policy that keeps hitting the same obstacle
            # would loop forever during evaluation.
            done = self.steps >= self.max_steps
        else:
            self.agent_pos = new_pos
            self.path.append(new_pos)
            
            # Compute the reward
            prev_dist = self.manhattan_distance(self.path[-2], self.goal_pos)
            curr_dist = self.manhattan_distance(self.agent_pos, self.goal_pos)
            
            if new_pos == self.goal_pos:
                reward = 100  # reward for reaching the goal
                done = True
            elif self.steps >= self.max_steps:
                reward = -50  # timeout penalty
                done = True
            else:
                # Reward for getting closer, penalty for entering congestion
                reward = (prev_dist - curr_dist) * 2
                if new_pos in self.congestion:
                    reward -= 3  # congestion penalty
                done = False
        
        # Return the next state, reward, done flag and extra info
        return self.get_state(), reward, done, {}
    
    def get_state(self):
        """获取当前状态"""
        dist_to_goal = self.manhattan_distance(self.agent_pos, self.goal_pos)
        return np.array([self.agent_pos[0], self.agent_pos[1], 
                         self.goal_pos[0], self.goal_pos[1], 
                         dist_to_goal], dtype=np.float32)
    
    def manhattan_distance(self, pos1, pos2):
        """计算曼哈顿距离"""
        return abs(pos1[0] - pos2[0]) + abs(pos1[1] - pos2[1])
    
    def render(self, mode='human'):
        """渲染环境"""
        if mode == 'human':
            # 创建网格
            grid = np.zeros(self.grid_size)
            
            # 标记障碍物
            for x, y in self.obstacles:
                grid[y, x] = -1  # 障碍物用-1表示
                
            # 标记拥堵区域
            for x, y in self.congestion:
                if grid[y, x] != -1:  # 不覆盖障碍物
                    grid[y, x] = 0.5  # 拥堵区域用0.5表示
            
            # 标记路径
            for x, y in self.path:
                if grid[y, x] not in [-1, 0.5]:  # 不覆盖障碍物和拥堵
                    grid[y, x] = 0.2  # 路径用0.2表示
            
            # 标记当前位置和目标位置
            grid[self.agent_pos[1], self.agent_pos[0]] = 1  # 当前位置用1表示
            grid[self.goal_pos[1], self.goal_pos[0]] = 2  # 目标位置用2表示
            
            # 创建自定义颜色映射
            colors = [(0, 0, 0), (0.5, 0.5, 0.5), (0.7, 0.7, 0.7), (0, 1, 0), (1, 0, 0)]  # 黑, 灰, 浅灰, 绿, 红
            cmap_name = 'custom_cmap'
            cmap = LinearSegmentedColormap.from_list(cmap_name, colors, N=5)
            
            # 显示网格
            plt.figure(figsize=(10, 8))
            plt.imshow(grid, cmap=cmap, interpolation='nearest')
            plt.colorbar(ticks=[-1, 0, 0.5, 1, 2], 
                         label='-1:障碍物, 0:空, 0.5:拥堵, 1:当前位置, 2:目标')
            plt.title('运输路径优化环境')
            plt.grid(True, which='both', linestyle='-', color='gray', alpha=0.5)
            plt.show()
        else:
            super(TransportationEnv, self).render(mode=mode)

class DQNAgent:
    """深度Q网络智能体"""
    def __init__(self, state_size, action_size, device,
                 learning_rate=0.001, discount_factor=0.99,
                 exploration_rate=1.0, exploration_decay=0.995,
                 min_exploration_rate=0.01, memory_size=10000,
                 batch_size=64, target_update=10):
        self.state_size = state_size
        self.action_size = action_size
        self.device = device
        self.gamma = discount_factor
        self.epsilon = exploration_rate
        self.epsilon_min = min_exploration_rate
        self.epsilon_decay = exploration_decay
        self.batch_size = batch_size
        self.target_update = target_update
        
        # Create the policy network and the target network
        self.policy_net = DQN(state_size, action_size).to(device)
        self.target_net = DQN(state_size, action_size).to(device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()  # the target network is only used for inference
        
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=learning_rate)
        self.memory = ReplayMemory(memory_size)
        
        self.steps_done = 0
    
    def select_action(self, state, training=True):
        """选择动作"""
        if training and np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        
        with torch.no_grad():
            state_tensor = torch.FloatTensor(state).unsqueeze(0).to(self.device)
            q_values = self.policy_net(state_tensor)
            return q_values.max(1)[1].item()
    
    def learn(self):
        """从经验回放中学习"""
        if len(self.memory) < self.batch_size:
            return
        
        transitions = self.memory.sample(self.batch_size)
        batch = Transition(*zip(*transitions))
        
        # Mask of non-terminal next states
        non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                              batch.next_state)), device=self.device, dtype=torch.bool)
        non_final_next_states = torch.FloatTensor([s for s in batch.next_state
                                                if s is not None]).to(self.device)
        
        state_batch = torch.FloatTensor(batch.state).to(self.device)
        action_batch = torch.LongTensor(batch.action).unsqueeze(1).to(self.device)
        reward_batch = torch.FloatTensor(batch.reward).to(self.device)
        done_batch = torch.FloatTensor(batch.done).to(self.device)
        
        # Current Q-values for the taken actions
        state_action_values = self.policy_net(state_batch).gather(1, action_batch)
        
        # Q-values of the next states from the target network
        next_state_values = torch.zeros(self.batch_size, device=self.device)
        next_state_values[non_final_mask] = self.target_net(non_final_next_states).max(1)[0].detach()
        
        # Expected Q-values (Bellman target)
        expected_state_action_values = (next_state_values * self.gamma * (1 - done_batch)) + reward_batch
        
        # Huber loss
        loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1))
        
        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)  # gradient clipping
        self.optimizer.step()
        
        # Decay the exploration rate
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
        
        # Periodically update the target network
        self.steps_done += 1
        if self.steps_done % self.target_update == 0:
            self.target_net.load_state_dict(self.policy_net.state_dict())

def train_agent(env, agent, num_episodes=1000):
    """训练智能体"""
    rewards_history = []
    
    for episode in range(num_episodes):
        state = env.reset()
        total_reward = 0
        done = False
        
        while not done:
            # Select an action
            action = agent.select_action(state)
            
            # Execute the action
            next_state, reward, done, _ = env.step(action)
            
            # Store the experience
            agent.memory.push(state, action, next_state, reward, done)
            
            # Update the state
            state = next_state
            total_reward += reward
            
            # Learn from the replay buffer
            agent.learn()
            
        rewards_history.append(total_reward)
        
        if (episode + 1) % 100 == 0:
            avg_reward = np.mean(rewards_history[-100:])
            print(f"Episode {episode+1}/{num_episodes}, Average Reward: {avg_reward:.2f}, Epsilon: {agent.epsilon:.4f}")
    
    return rewards_history

def evaluate_agent(env, agent, num_episodes=10):
    """评估智能体性能"""
    success_count = 0
    total_steps = []
    
    for episode in range(num_episodes):
        state = env.reset()
        done = False
        steps = 0
        
        while not done:
            action = agent.select_action(state, training=False)  # evaluation mode, no exploration
            state, _, done, _ = env.step(action)
            steps += 1
            
        if env.agent_pos == env.goal_pos:
            success_count += 1
            total_steps.append(steps)
        
        print(f"Evaluation Episode {episode+1}: {'Success' if done and env.agent_pos == env.goal_pos else 'Failure'}")
    
    success_rate = success_count / num_episodes
    avg_steps = np.mean(total_steps) if total_steps else float('inf')
    
    print(f"Success Rate: {success_rate:.2%}, Average Steps: {avg_steps:.2f}")
    return success_rate, avg_steps

# Main program
if __name__ == "__main__":
    # Select the device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    
    # Create the environment and the agent
    env = TransportationEnv(grid_size=(10, 10))
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    agent = DQNAgent(state_size, action_size, device)
    
    # Train the agent
    print("开始训练智能体...")
    rewards = train_agent(env, agent, num_episodes=500)
    
    # Evaluate the agent
    print("\n评估智能体性能...")
    evaluate_agent(env, agent, num_episodes=5)
    
    # Visualize the training progress
    plt.figure(figsize=(10, 6))
    plt.plot(rewards)
    plt.title('训练奖励')
    plt.xlabel('回合')
    plt.ylabel('总奖励')
    plt.grid(True)
    plt.show()
    
    # Visualize the learned path
    print("\n可视化最优路径...")
    state = env.reset()
    done = False
    
    while not done:
        action = agent.select_action(state, training=False)  # evaluation mode, no exploration
        state, _, done, _ = env.step(action)
    
    env.render()