强化学习交通信号配时优化(MATLAB 实现)

包含环境建模、Q-learning 智能体、训练流程、最优配时选取


一、系统架构(MATLAB 实现)

复制代码
TrafficEnvironment.m   % 交通环境(简化交叉口)
QLearningAgent.m      % Q-learning 智能体
train_traffic_rl.m    % 训练主脚本
evaluate_policy.m    % 评估最优配时
compare_fixed_timing.m % 对比固定配时

二、交通环境模型 TrafficEnvironment.m

matlab 复制代码
classdef TrafficEnvironment < handle
    % 单交叉口交通环境(四相位)
    properties
        % 道路参数
        arrival_rate = [0.3, 0.25, 0.2, 0.15]; % 东、南、西、北到达率(veh/s)
        max_queue = 50;                        % 最大队列长度
        
        % 信号配时参数
        green_options = [15, 30, 45];           % 绿灯时长选项(秒)
        yellow_time = 3;                       % 黄灯时间
        cycle_length = 120;                    % 周期长度
        
        % 状态
        queue_length = zeros(4,1);             % 各方向队列长度
        waiting_time = zeros(4,1);             % 各方向等待时间
        step_count = 0;
        max_steps = 3600;                      % 仿真时长(秒)
        
        % 统计
        total_delay = 0;
        total_throughput = 0;
    end
    
    methods
        function obj = TrafficEnvironment()
            obj.reset();
        end
        
        function reset(obj)
            obj.queue_length = zeros(4,1);
            obj.waiting_time = zeros(4,1);
            obj.step_count = 0;
            obj.total_delay = 0;
            obj.total_throughput = 0;
        end
        
        function [state, reward, done] = step(obj, action)
            % action: [相位索引(1-4), 绿灯时长索引(1-3)]
            phase = action(1);
            green_duration = obj.green_options(action(2));
            
            % 1. 车辆到达(泊松过程)
            arrivals = poissrnd(obj.arrival_rate * green_duration);
            
            % 2. 放行当前相位的车辆
            if phase == 1  % 东西直行
                departures = min(obj.queue_length(1), arrivals(1));
                obj.queue_length(1) = obj.queue_length(1) - departures;
                obj.total_throughput = obj.total_throughput + departures;
                
                departures = min(obj.queue_length(3), arrivals(3));
                obj.queue_length(3) = obj.queue_length(3) - departures;
                obj.total_throughput = obj.total_throughput + departures;
                
            elseif phase == 2  % 南北直行
                departures = min(obj.queue_length(2), arrivals(2));
                obj.queue_length(2) = obj.queue_length(2) - departures;
                obj.total_throughput = obj.total_throughput + departures;
                
                departures = min(obj.queue_length(4), arrivals(4));
                obj.queue_length(4) = obj.queue_length(4) - departures;
                obj.total_throughput = obj.total_throughput + departures;
                
            elseif phase == 3  % 东西左转
                % 简化处理:左转占用部分绿灯时间
                departures = min(obj.queue_length(1)*0.3, arrivals(1)*0.3);
                obj.queue_length(1) = obj.queue_length(1) - departures;
                obj.total_throughput = obj.total_throughput + departures;
                
            elseif phase == 4  % 南北左转
                departures = min(obj.queue_length(2)*0.3, arrivals(2)*0.3);
                obj.queue_length(2) = obj.queue_length(2) - departures;
                obj.total_throughput = obj.total_throughput + departures;
            end
            
            % 3. 其他方向车辆累积
            for i = 1:4
                if ~(phase == 1 && (i == 1 || i == 3)) && ...
                   ~(phase == 2 && (i == 2 || i == 4)) && ...
                   ~(phase == 3 && i == 1) && ...
                   ~(phase == 4 && i == 2)
                    obj.queue_length(i) = min(obj.queue_length(i) + arrivals(i), obj.max_queue);
                end
            end
            
            % 4. 更新等待时间
            obj.waiting_time = obj.waiting_time + obj.queue_length * green_duration;
            obj.total_delay = obj.total_delay + sum(obj.queue_length) * green_duration;
            
            % 5. 状态(归一化队列长度)
            state = obj.queue_length / obj.max_queue;
            
            % 6. 奖励(负的延误)
            reward = -sum(obj.queue_length) * 0.1 - sum(obj.waiting_time) * 0.01;
            
            % 7. 检查是否结束
            obj.step_count = obj.step_count + green_duration + obj.yellow_time;
            done = obj.step_count >= obj.max_steps;
        end
        
        function state = getState(obj)
            state = obj.queue_length / obj.max_queue;
        end
    end
end

三、Q-learning 智能体 QLearningAgent.m

matlab 复制代码
classdef QLearningAgent < handle
    properties
        % Q-table: [状态离散化 × 动作]
        Q_table
        state_bins = 10;          % 状态离散化区间数
        n_actions = 12;          % 4相位 × 3绿灯时长 = 12个动作
        
        % 学习参数
        alpha = 0.1;             % 学习率
        gamma = 0.99;            % 折扣因子
        epsilon = 1.0;           % 探索率
        epsilon_min = 0.05;
        epsilon_decay = 0.995;
        
        % 统计
        training_rewards = [];
    end
    
    methods
        function obj = QLearningAgent()
            % 初始化 Q-table
            % 状态:4个方向 × state_bins 离散化
            state_size = obj.state_bins ^ 4;
            obj.Q_table = zeros(state_size, obj.n_actions);
        end
        
        function action = selectAction(obj, state)
            % ε-贪婪策略
            state_idx = obj.discretizeState(state);
            
            if rand() < obj.epsilon
                action = randi(obj.n_actions);  % 探索
            else
                [~, action] = max(obj.Q_table(state_idx, :));  % 利用
            end
        end
        
        function update(obj, state, action, reward, next_state, done)
            state_idx = obj.discretizeState(state);
            next_state_idx = obj.discretizeState(next_state);
            
            % Q-learning 更新
            current_q = obj.Q_table(state_idx, action);
            
            if done
                target_q = reward;
            else
                target_q = reward + obj.gamma * max(obj.Q_table(next_state_idx, :));
            end
            
            % 更新 Q 值
            obj.Q_table(state_idx, action) = current_q + ...
                obj.alpha * (target_q - current_q);
            
            % ε 衰减
            if obj.epsilon > obj.epsilon_min
                obj.epsilon = obj.epsilon * obj.epsilon_decay;
            end
        end
        
        function idx = discretizeState(obj, state)
            % 将连续状态离散化
            % state: [q1, q2, q3, q4] ∈ [0,1]
            bins = floor(state * obj.state_bins);
            bins = min(bins, obj.state_bins - 1);
            bins = max(bins, 0);
            
            % 将多维索引转为线性索引
            idx = 1;
            multiplier = 1;
            for i = 1:4
                idx = idx + bins(i) * multiplier;
                multiplier = multiplier * obj.state_bins;
            end
        end
        
        function optimal_action = getOptimalAction(obj, state)
            state_idx = obj.discretizeState(state);
            [~, optimal_action] = max(obj.Q_table(state_idx, :));
        end
    end
end

四、训练主脚本 train_traffic_rl.m

matlab 复制代码
%% 强化学习交通配时训练
clear; clc; close all;

% 参数
episodes = 500;
max_steps_per_episode = 100;

% 创建环境和智能体
env = TrafficEnvironment();
agent = QLearningAgent();

% 训练记录
rewards_history = zeros(episodes, 1);
delay_history = zeros(episodes, 1);

fprintf('开始训练...\n');
for ep = 1:episodes
    env.reset();
    state = env.getState();
    total_reward = 0;
    total_delay = 0;
    step_count = 0;
    
    while step_count < max_steps_per_episode
        % 选择动作
        action_idx = agent.selectAction(state);
        
        % 解析动作:[相位, 绿灯时长]
        phase = ceil(action_idx / 3);
        green_idx = mod(action_idx - 1, 3) + 1;
        action = [phase, green_idx];
        
        % 执行动作
        [next_state, reward, done] = env.step(action);
        
        % 更新 Q 表
        agent.update(state, action_idx, reward, next_state, done);
        
        % 累计
        total_reward = total_reward + reward;
        total_delay = total_delay + env.total_delay;
        
        state = next_state;
        step_count = step_count + 1;
        
        if done
            break;
        end
    end
    
    rewards_history(ep) = total_reward;
    delay_history(ep) = total_delay;
    
    if mod(ep, 50) == 0
        fprintf('Episode %d/%d, Reward: %.2f, Delay: %.2f, ε: %.3f\n', ...
                ep, episodes, total_reward, total_delay, agent.epsilon);
    end
end

fprintf('训练完成!\n');

% 保存模型
save('traffic_qlearning_model.mat', 'agent', 'rewards_history');

% 绘制训练曲线
figure('Position', [100, 100, 1200, 400]);
subplot(1,2,1);
plot(rewards_history, 'b-', 'LineWidth', 1.5);
xlabel('Episode'); ylabel('Total Reward');
title('训练奖励曲线'); grid on;

subplot(1,2,2);
plot(delay_history, 'r-', 'LineWidth', 1.5);
xlabel('Episode'); ylabel('Total Delay');
title('总延误变化'); grid on;

五、最优配时方案选取 evaluate_policy.m

matlab 复制代码
%% 评估最优配时方案
clear; clc;

% 加载训练好的模型
load('traffic_qlearning_model.mat');

% 创建测试环境
env = TrafficEnvironment();
env.max_steps = 7200;  % 测试2小时

% 运行最优策略
env.reset();
state = env.getState();
schedule = [];
step = 0;
total_delay_rl = 0;
total_throughput_rl = 0;

fprintf('执行最优配时方案...\n');

while env.step_count < env.max_steps
    % 选择最优动作
    action_idx = agent.getOptimalAction(state);
    phase = ceil(action_idx / 3);
    green_idx = mod(action_idx - 1, 3) + 1;
    action = [phase, green_idx];
    
    % 记录配时方案
    schedule = [schedule; step+1, phase, env.green_options(green_idx)];
    
    % 执行
    [next_state, reward, done] = env.step(action);
    
    total_delay_rl = total_delay_rl + env.total_delay;
    total_throughput_rl = total_throughput_rl + env.total_throughput;
    
    state = next_state;
    step = step + 1;
    
    if done
        break;
    end
end

fprintf('RL配时方案评估结果:\n');
fprintf('总延误: %.2f 车·秒\n', total_delay_rl);
fprintf('总通行量: %d 辆车\n', total_throughput_rl);
fprintf('平均延误: %.2f 秒/车\n', total_delay_rl/max(total_throughput_rl,1));

% 输出最优配时表
fprintf('\n最优配时方案(前10个周期):\n');
fprintf('%-6s %-10s %-10s\n', '周期', '相位', '绿灯时长(s)');
phase_names = {'东西直行', '南北直行', '东西左转', '南北左转'};
for i = 1:min(10, size(schedule,1))
    fprintf('%-6d %-10s %-10d\n', ...
            schedule(i,1), phase_names{schedule(i,2)}, schedule(i,3));
end

% 保存结果
save('optimal_timing.mat', 'schedule', 'total_delay_rl', 'total_throughput_rl');

六、与固定配时对比 compare_fixed_timing.m

matlab 复制代码
%% 对比固定配时
clear; clc;

load('optimal_timing.mat');

% 固定配时方案(典型:东西30s,南北30s,周期120s)
env = TrafficEnvironment();
env.max_steps = 7200;
env.reset();

fixed_phase = [1, 2];  % 东西直行、南北直行交替
fixed_green = [2, 2]; % 对应45秒绿灯(索引2)
step = 0;
total_delay_fixed = 0;
total_throughput_fixed = 0;

while env.step_count < env.max_steps
    for i = 1:2
        action = [fixed_phase(i), fixed_green(i)];
        [~, reward, done] = env.step(action);
        total_delay_fixed = total_delay_fixed + env.total_delay;
        total_throughput_fixed = total_throughput_fixed + env.total_throughput;
        step = step + 1;
        if done, break; end
    end
end

% 对比结果
fprintf('\n========== 配时方案对比 ==========\n');
fprintf('指标\t\tRL优化\t\t固定配时\t提升比例\n');
fprintf('总延误(车·秒)\t%.2f\t\t%.2f\t\t%.1f%%\n', ...
        total_delay_rl, total_delay_fixed, ...
        (total_delay_fixed-total_delay_rl)/total_delay_fixed*100);
fprintf('通行量(辆)\t%d\t\t%d\t\t%.1f%%\n', ...
        total_throughput_rl, total_throughput_fixed, ...
        (total_throughput_rl-total_throughput_fixed)/max(total_throughput_fixed,1)*100);
fprintf('平均延误(s/车)\t%.2f\t\t%.2f\t\t%.1f%%\n', ...
        total_delay_rl/max(total_throughput_rl,1), ...
        total_delay_fixed/max(total_throughput_fixed,1), ...
        (total_delay_fixed/total_throughput_fixed - total_delay_rl/total_throughput_rl) / ...
        (total_delay_fixed/total_throughput_fixed)*100);

% 绘制对比柱状图
figure('Position', [100, 100, 800, 400]);
subplot(1,2,1);
bar([total_delay_rl, total_delay_fixed]);
set(gca, 'XTickLabel', {'RL优化', '固定配时'});
ylabel('总延误(车·秒)'); title('延误对比'); grid on;

subplot(1,2,2);
bar([total_throughput_rl, total_throughput_fixed]);
set(gca, 'XTickLabel', {'RL优化', '固定配时'});
ylabel('通行量(辆)'); title('通行量对比'); grid on;

参考代码 实现强化学习交通配时,选取最优的配时方案 www.youwenfan.com/contentcsw/81770.html

七、运行说明

  1. 依次运行

    matlab 复制代码
    train_traffic_rl.m      % 训练(约5-10分钟)
    evaluate_policy.m       % 评估最优配时
    compare_fixed_timing.m  % 对比实验
  2. 关键输出

    • optimal_timing.mat:最优配时方案
    • 训练曲线:奖励和延误变化
    • 对比表格:RL vs 固定配时
  3. 参数调优

    • 修改 TrafficEnvironment.m 中的 arrival_rate 适应不同流量
    • 调整 QLearningAgent.m 中的学习率、折扣因子
    • 增加 state_bins 提高状态分辨率