包含环境建模、Q-learning 智能体、训练流程、最优配时选取
一、系统架构(MATLAB 实现)
TrafficEnvironment.m % 交通环境(简化交叉口)
QLearningAgent.m % Q-learning 智能体
train_traffic_rl.m % 训练主脚本
evaluate_policy.m % 评估最优配时
compare_fixed_timing.m % 对比固定配时
二、交通环境模型 TrafficEnvironment.m
matlab
classdef TrafficEnvironment < handle
% 单交叉口交通环境(四相位)
properties
% 道路参数
arrival_rate = [0.3, 0.25, 0.2, 0.15]; % 东、南、西、北到达率(veh/s)
max_queue = 50; % 最大队列长度
% 信号配时参数
green_options = [15, 30, 45]; % 绿灯时长选项(秒)
yellow_time = 3; % 黄灯时间
cycle_length = 120; % 周期长度
% 状态
queue_length = zeros(4,1); % 各方向队列长度
waiting_time = zeros(4,1); % 各方向等待时间
step_count = 0;
max_steps = 3600; % 仿真时长(秒)
% 统计
total_delay = 0;
total_throughput = 0;
end
methods
function obj = TrafficEnvironment()
obj.reset();
end
function reset(obj)
obj.queue_length = zeros(4,1);
obj.waiting_time = zeros(4,1);
obj.step_count = 0;
obj.total_delay = 0;
obj.total_throughput = 0;
end
function [state, reward, done] = step(obj, action)
% action: [相位索引(1-4), 绿灯时长索引(1-3)]
phase = action(1);
green_duration = obj.green_options(action(2));
% 1. 车辆到达(泊松过程)
arrivals = poissrnd(obj.arrival_rate * green_duration);
% 2. 放行当前相位的车辆
if phase == 1 % 东西直行
departures = min(obj.queue_length(1), arrivals(1));
obj.queue_length(1) = obj.queue_length(1) - departures;
obj.total_throughput = obj.total_throughput + departures;
departures = min(obj.queue_length(3), arrivals(3));
obj.queue_length(3) = obj.queue_length(3) - departures;
obj.total_throughput = obj.total_throughput + departures;
elseif phase == 2 % 南北直行
departures = min(obj.queue_length(2), arrivals(2));
obj.queue_length(2) = obj.queue_length(2) - departures;
obj.total_throughput = obj.total_throughput + departures;
departures = min(obj.queue_length(4), arrivals(4));
obj.queue_length(4) = obj.queue_length(4) - departures;
obj.total_throughput = obj.total_throughput + departures;
elseif phase == 3 % 东西左转
% 简化处理:左转占用部分绿灯时间
departures = min(obj.queue_length(1)*0.3, arrivals(1)*0.3);
obj.queue_length(1) = obj.queue_length(1) - departures;
obj.total_throughput = obj.total_throughput + departures;
elseif phase == 4 % 南北左转
departures = min(obj.queue_length(2)*0.3, arrivals(2)*0.3);
obj.queue_length(2) = obj.queue_length(2) - departures;
obj.total_throughput = obj.total_throughput + departures;
end
% 3. 其他方向车辆累积
for i = 1:4
if ~(phase == 1 && (i == 1 || i == 3)) && ...
~(phase == 2 && (i == 2 || i == 4)) && ...
~(phase == 3 && i == 1) && ...
~(phase == 4 && i == 2)
obj.queue_length(i) = min(obj.queue_length(i) + arrivals(i), obj.max_queue);
end
end
% 4. 更新等待时间
obj.waiting_time = obj.waiting_time + obj.queue_length * green_duration;
obj.total_delay = obj.total_delay + sum(obj.queue_length) * green_duration;
% 5. 状态(归一化队列长度)
state = obj.queue_length / obj.max_queue;
% 6. 奖励(负的延误)
reward = -sum(obj.queue_length) * 0.1 - sum(obj.waiting_time) * 0.01;
% 7. 检查是否结束
obj.step_count = obj.step_count + green_duration + obj.yellow_time;
done = obj.step_count >= obj.max_steps;
end
function state = getState(obj)
state = obj.queue_length / obj.max_queue;
end
end
end
三、Q-learning 智能体 QLearningAgent.m
matlab
classdef QLearningAgent < handle
properties
% Q-table: [状态离散化 × 动作]
Q_table
state_bins = 10; % 状态离散化区间数
n_actions = 12; % 4相位 × 3绿灯时长 = 12个动作
% 学习参数
alpha = 0.1; % 学习率
gamma = 0.99; % 折扣因子
epsilon = 1.0; % 探索率
epsilon_min = 0.05;
epsilon_decay = 0.995;
% 统计
training_rewards = [];
end
methods
function obj = QLearningAgent()
% 初始化 Q-table
% 状态:4个方向 × state_bins 离散化
state_size = obj.state_bins ^ 4;
obj.Q_table = zeros(state_size, obj.n_actions);
end
function action = selectAction(obj, state)
% ε-贪婪策略
state_idx = obj.discretizeState(state);
if rand() < obj.epsilon
action = randi(obj.n_actions); % 探索
else
[~, action] = max(obj.Q_table(state_idx, :)); % 利用
end
end
function update(obj, state, action, reward, next_state, done)
state_idx = obj.discretizeState(state);
next_state_idx = obj.discretizeState(next_state);
% Q-learning 更新
current_q = obj.Q_table(state_idx, action);
if done
target_q = reward;
else
target_q = reward + obj.gamma * max(obj.Q_table(next_state_idx, :));
end
% 更新 Q 值
obj.Q_table(state_idx, action) = current_q + ...
obj.alpha * (target_q - current_q);
% ε 衰减
if obj.epsilon > obj.epsilon_min
obj.epsilon = obj.epsilon * obj.epsilon_decay;
end
end
function idx = discretizeState(obj, state)
% 将连续状态离散化
% state: [q1, q2, q3, q4] ∈ [0,1]
bins = floor(state * obj.state_bins);
bins = min(bins, obj.state_bins - 1);
bins = max(bins, 0);
% 将多维索引转为线性索引
idx = 1;
multiplier = 1;
for i = 1:4
idx = idx + bins(i) * multiplier;
multiplier = multiplier * obj.state_bins;
end
end
function optimal_action = getOptimalAction(obj, state)
state_idx = obj.discretizeState(state);
[~, optimal_action] = max(obj.Q_table(state_idx, :));
end
end
end
四、训练主脚本 train_traffic_rl.m
matlab
%% 强化学习交通配时训练
clear; clc; close all;
% 参数
episodes = 500;
max_steps_per_episode = 100;
% 创建环境和智能体
env = TrafficEnvironment();
agent = QLearningAgent();
% 训练记录
rewards_history = zeros(episodes, 1);
delay_history = zeros(episodes, 1);
fprintf('开始训练...\n');
for ep = 1:episodes
env.reset();
state = env.getState();
total_reward = 0;
total_delay = 0;
step_count = 0;
while step_count < max_steps_per_episode
% 选择动作
action_idx = agent.selectAction(state);
% 解析动作:[相位, 绿灯时长]
phase = ceil(action_idx / 3);
green_idx = mod(action_idx - 1, 3) + 1;
action = [phase, green_idx];
% 执行动作
[next_state, reward, done] = env.step(action);
% 更新 Q 表
agent.update(state, action_idx, reward, next_state, done);
% 累计
total_reward = total_reward + reward;
total_delay = total_delay + env.total_delay;
state = next_state;
step_count = step_count + 1;
if done
break;
end
end
rewards_history(ep) = total_reward;
delay_history(ep) = total_delay;
if mod(ep, 50) == 0
fprintf('Episode %d/%d, Reward: %.2f, Delay: %.2f, ε: %.3f\n', ...
ep, episodes, total_reward, total_delay, agent.epsilon);
end
end
fprintf('训练完成!\n');
% 保存模型
save('traffic_qlearning_model.mat', 'agent', 'rewards_history');
% 绘制训练曲线
figure('Position', [100, 100, 1200, 400]);
subplot(1,2,1);
plot(rewards_history, 'b-', 'LineWidth', 1.5);
xlabel('Episode'); ylabel('Total Reward');
title('训练奖励曲线'); grid on;
subplot(1,2,2);
plot(delay_history, 'r-', 'LineWidth', 1.5);
xlabel('Episode'); ylabel('Total Delay');
title('总延误变化'); grid on;
五、最优配时方案选取 evaluate_policy.m
matlab
%% 评估最优配时方案
clear; clc;
% 加载训练好的模型
load('traffic_qlearning_model.mat');
% 创建测试环境
env = TrafficEnvironment();
env.max_steps = 7200; % 测试2小时
% 运行最优策略
env.reset();
state = env.getState();
schedule = [];
step = 0;
total_delay_rl = 0;
total_throughput_rl = 0;
fprintf('执行最优配时方案...\n');
while env.step_count < env.max_steps
% 选择最优动作
action_idx = agent.getOptimalAction(state);
phase = ceil(action_idx / 3);
green_idx = mod(action_idx - 1, 3) + 1;
action = [phase, green_idx];
% 记录配时方案
schedule = [schedule; step+1, phase, env.green_options(green_idx)];
% 执行
[next_state, reward, done] = env.step(action);
total_delay_rl = total_delay_rl + env.total_delay;
total_throughput_rl = total_throughput_rl + env.total_throughput;
state = next_state;
step = step + 1;
if done
break;
end
end
fprintf('RL配时方案评估结果:\n');
fprintf('总延误: %.2f 车·秒\n', total_delay_rl);
fprintf('总通行量: %d 辆车\n', total_throughput_rl);
fprintf('平均延误: %.2f 秒/车\n', total_delay_rl/max(total_throughput_rl,1));
% 输出最优配时表
fprintf('\n最优配时方案(前10个周期):\n');
fprintf('%-6s %-10s %-10s\n', '周期', '相位', '绿灯时长(s)');
phase_names = {'东西直行', '南北直行', '东西左转', '南北左转'};
for i = 1:min(10, size(schedule,1))
fprintf('%-6d %-10s %-10d\n', ...
schedule(i,1), phase_names{schedule(i,2)}, schedule(i,3));
end
% 保存结果
save('optimal_timing.mat', 'schedule', 'total_delay_rl', 'total_throughput_rl');
六、与固定配时对比 compare_fixed_timing.m
matlab
%% 对比固定配时
clear; clc;
load('optimal_timing.mat');
% 固定配时方案(典型:东西30s,南北30s,周期120s)
env = TrafficEnvironment();
env.max_steps = 7200;
env.reset();
fixed_phase = [1, 2]; % 东西直行、南北直行交替
fixed_green = [2, 2]; % 对应45秒绿灯(索引2)
step = 0;
total_delay_fixed = 0;
total_throughput_fixed = 0;
while env.step_count < env.max_steps
for i = 1:2
action = [fixed_phase(i), fixed_green(i)];
[~, reward, done] = env.step(action);
total_delay_fixed = total_delay_fixed + env.total_delay;
total_throughput_fixed = total_throughput_fixed + env.total_throughput;
step = step + 1;
if done, break; end
end
end
% 对比结果
fprintf('\n========== 配时方案对比 ==========\n');
fprintf('指标\t\tRL优化\t\t固定配时\t提升比例\n');
fprintf('总延误(车·秒)\t%.2f\t\t%.2f\t\t%.1f%%\n', ...
total_delay_rl, total_delay_fixed, ...
(total_delay_fixed-total_delay_rl)/total_delay_fixed*100);
fprintf('通行量(辆)\t%d\t\t%d\t\t%.1f%%\n', ...
total_throughput_rl, total_throughput_fixed, ...
(total_throughput_rl-total_throughput_fixed)/max(total_throughput_fixed,1)*100);
fprintf('平均延误(s/车)\t%.2f\t\t%.2f\t\t%.1f%%\n', ...
total_delay_rl/max(total_throughput_rl,1), ...
total_delay_fixed/max(total_throughput_fixed,1), ...
(total_delay_fixed/total_throughput_fixed - total_delay_rl/total_throughput_rl) / ...
(total_delay_fixed/total_throughput_fixed)*100);
% 绘制对比柱状图
figure('Position', [100, 100, 800, 400]);
subplot(1,2,1);
bar([total_delay_rl, total_delay_fixed]);
set(gca, 'XTickLabel', {'RL优化', '固定配时'});
ylabel('总延误(车·秒)'); title('延误对比'); grid on;
subplot(1,2,2);
bar([total_throughput_rl, total_throughput_fixed]);
set(gca, 'XTickLabel', {'RL优化', '固定配时'});
ylabel('通行量(辆)'); title('通行量对比'); grid on;
参考代码 实现强化学习交通配时,选取最优的配时方案 www.youwenfan.com/contentcsw/81770.html
七、运行说明
-
依次运行:
matlabtrain_traffic_rl.m % 训练(约5-10分钟) evaluate_policy.m % 评估最优配时 compare_fixed_timing.m % 对比实验 -
关键输出:
optimal_timing.mat:最优配时方案- 训练曲线:奖励和延误变化
- 对比表格:RL vs 固定配时
-
参数调优:
- 修改
TrafficEnvironment.m中的arrival_rate适应不同流量 - 调整
QLearningAgent.m中的学习率、折扣因子 - 增加
state_bins提高状态分辨率
- 修改