MDP 实现,包含值迭代、策略迭代、Q-learning 三种经典算法
一、MDP 基础理论
1.1 MDP 五元组
$MDP = (S, A, P, R, \gamma)$
- S: 状态集合
- A: 动作集合
- P : 状态转移概率 $P(s'|s,a)$
- R : 奖励函数 $R(s,a,s')$
- γ : 折扣因子 $\gamma \in [0,1]$
1.2 Bellman 方程
值函数:
$V(s) = \max_a \left[ R(s,a) + \gamma \sum_{s'} P(s'|s,a) V(s') \right]$
Q函数:
$Q(s,a) = R(s,a) + \gamma \sum_{s'} P(s'|s,a) \max_{a'} Q(s',a')$
二、MATLAB 实现
2.1 网格世界环境 (GridWorld.m,classdef 文件名必须与类名一致)
matlab
%% GridWorld.m - grid-world environment definition
% NOTE: MATLAB requires a classdef file to carry the class name, so this
% code has to be saved as GridWorld.m.
classdef GridWorld < handle
    % GridWorld  Stochastic N-by-N grid world used as an MDP environment.
    %
    %   States are [row, col] pairs; wall cells are excluded from the state
    %   list. An action succeeds with probability move_prob, slips 90
    %   degrees to the left with probability slip_prob and slips 90 degrees
    %   to the right with the remaining probability.
    properties
        N = 5;                        % grid size (N x N)
        start_state = [1, 1];         % start cell (row, col)
        goal_state = [5, 5];          % goal cell (row, col)
        traps = [2,3; 3,2; 4,4];      % trap cells, one [row, col] per row
        walls = [2,2; 3,3];           % wall cells, one [row, col] per row
        % MDP parameters
        states;                       % n_states x 2 list of non-wall cells
        actions = {'up', 'down', 'left', 'right'};
        n_states;                     % number of states
        n_actions = 4;                % number of actions
        gamma = 0.9;                  % discount factor
        move_prob = 0.8;              % probability the intended move happens
        slip_prob = 0.1;              % probability of slipping to the left
        % Rewards
        goal_reward = 100;            % entering the goal cell
        trap_reward = -100;           % entering a trap cell
        step_reward = -1;             % ordinary move
        wall_reward = -10;            % bumping into a wall or the border
    end
    methods
        function obj = GridWorld(N)
            % Construct the environment; with an argument the grid becomes
            % N x N and the goal moves to the bottom-right corner [N, N].
            if nargin > 0
                obj.N = N;
                obj.goal_state = [N, N];
            end
            obj.initialize_states();
        end

        function initialize_states(obj)
            % Enumerate all non-wall cells in row-major order
            % ((1,1), (1,2), ..., (N,N)) and store them in obj.states.
            [col_idx, row_idx] = ndgrid(1:obj.N, 1:obj.N);
            cells = [row_idx(:), col_idx(:)];
            if isempty(obj.walls)
                is_wall = false(size(cells, 1), 1);
            else
                is_wall = ismember(cells, obj.walls, 'rows');
            end
            obj.states = cells(~is_wall, :);
            obj.n_states = size(obj.states, 1);
        end

        function [next_state, reward] = step(obj, state, action)
            % Execute one (stochastic) step.
            %   state      - current cell [row, col]
            %   action     - 'up' | 'down' | 'left' | 'right'
            %   next_state - resulting cell [row, col]
            %   reward     - scalar reward for this transition
            if ~any(strcmp(action, obj.actions))
                error('GridWorld:invalidAction', ...
                    'Unknown action "%s".', char(action));
            end
            row = state(1);
            col = state(2);

            % The goal is absorbing: callers stop their episodes on arrival,
            % and a model built from step() must not let the agent leave
            % and re-enter the goal to collect the reward repeatedly.
            if isequal([row, col], obj.goal_state)
                next_state = [row, col];
                reward = 0;
                return;
            end

            % Decide the direction actually taken (intended / left / right).
            rand_num = rand();
            if rand_num < obj.move_prob
                actual_action = action;
            elseif rand_num < obj.move_prob + obj.slip_prob
                actual_action = GridWorld.turn(action, 'left');
            else
                actual_action = GridWorld.turn(action, 'right');
            end

            % Candidate position.
            new_row = row;
            new_col = col;
            switch actual_action
                case 'up'
                    new_row = row - 1;
                case 'down'
                    new_row = row + 1;
                case 'left'
                    new_col = col - 1;
                case 'right'
                    new_col = col + 1;
            end

            out_of_grid = new_row < 1 || new_row > obj.N || ...
                          new_col < 1 || new_col > obj.N;
            if out_of_grid || GridWorld.contains_cell(obj.walls, [new_row, new_col])
                % Blocked by the border or a wall: stay in place.
                new_row = row;
                new_col = col;
                reward = obj.wall_reward;
            else
                reward = obj.step_reward;
                if isequal([new_row, new_col], obj.goal_state)
                    reward = obj.goal_reward;
                end
                % A trap takes precedence over the goal if both coincide.
                if GridWorld.contains_cell(obj.traps, [new_row, new_col])
                    reward = obj.trap_reward;
                end
            end
            next_state = [new_row, new_col];
        end

        function state_idx = get_state_index(obj, state)
            % Return the row of obj.states equal to state, or -1 when the
            % cell is not a state (e.g. a wall).
            state_idx = -1;
            if isempty(obj.states) || numel(state) ~= 2
                return;
            end
            [found, loc] = ismember(reshape(state, 1, 2), obj.states, 'rows');
            if found
                state_idx = loc;
            end
        end
    end

    methods (Static, Access = private)
        function turned = turn(action, side)
            % Rotate a movement direction by 90 degrees to 'left' or 'right'
            % as seen by an agent facing that direction.
            switch action
                case 'up'
                    options = {'left', 'right'};
                case 'down'
                    options = {'right', 'left'};
                case 'left'
                    options = {'down', 'up'};
                otherwise % 'right'
                    options = {'up', 'down'};
            end
            if strcmp(side, 'left')
                turned = options{1};
            else
                turned = options{2};
            end
        end

        function tf = contains_cell(cell_list, cell_pos)
            % True when cell_pos is a row of cell_list; tolerates an empty list.
            tf = ~isempty(cell_list) && ismember(cell_pos, cell_list, 'rows');
        end
    end
end
2.2 值迭代算法 (value_iteration.m)
matlab
%% value_iteration.m - value iteration
function [policy, V] = value_iteration(env, max_iter, tol)
% VALUE_ITERATION  Solve the MDP with (in-place) value iteration.
%   env      - environment exposing n_states, n_actions, gamma, states
%   max_iter - maximum number of sweeps (default 1000)
%   tol      - stop when the largest value change is below tol (default 1e-6)
%   policy   - n_states x 1 vector of greedy action indices
%   V        - n_states x 1 value function
% Each argument gets its own default so that passing only max_iter does
% not get overwritten.
if nargin < 2 || isempty(max_iter)
    max_iter = 1000;
end
if nargin < 3 || isempty(tol)
    tol = 1e-6;
end
fprintf('=== 值迭代算法 ===\n');
fprintf('最大迭代: %d, 收敛阈值: %.1e\n\n', max_iter, tol);

n_states = env.n_states;
n_actions = env.n_actions;
V = zeros(n_states, 1);
policy = zeros(n_states, 1);

% Transition model P(s,a,s') and expected reward R(s,a).
fprintf('构建转移矩阵和奖励矩阵...\n');
[P, R] = build_transition_matrices(env);

for iter = 1:max_iter
    V_old = V;
    for s = 1:n_states
        % P(s,:,:) is 1 x n_actions x n_states; mtimes does not accept N-D
        % operands, so flatten it to an n_actions x n_states matrix first.
        P_s = reshape(P(s, :, :), n_actions, n_states);
        % Q(s,a) = R(s,a) + gamma * sum_s' P(s'|s,a) * V(s')
        Q_sa = R(s, :).' + env.gamma * (P_s * V);
        % V is updated in place, so later states in the same sweep already
        % see the new value.
        [V(s), policy(s)] = max(Q_sa);
    end
    delta = max(abs(V - V_old));
    if mod(iter, 100) == 0
        fprintf(' 迭代 %d: ΔV = %.6f\n', iter, delta);
    end
    if delta < tol
        fprintf('收敛于第 %d 次迭代\n', iter);
        break;
    end
end

% Map action indices to action names for display.
actions = {'up', 'down', 'left', 'right'};
policy_actions = cell(n_states, 1);
for s = 1:n_states
    policy_actions{s} = actions{policy(s)};
end
fprintf('\n最优值函数:\n');
disp(V');
fprintf('\n最优策略:\n');
for s = 1:n_states
    fprintf('状态(%d,%d): %s\n', env.states(s,1), env.states(s,2), policy_actions{s});
end
end
function [P, R] = build_transition_matrices(env, n_simulations)
% BUILD_TRANSITION_MATRICES  Estimate the MDP model by sampling env.step.
%   env           - environment exposing n_states, n_actions, states,
%                   step(state, action) and get_state_index(state)
%   n_simulations - samples drawn per (state, action) pair (default 1000)
%   P             - n_states x n_actions x n_states array, P(s,a,s')
%   R             - n_states x n_actions array of mean sampled rewards
% The result is a Monte-Carlo estimate, so two calls generally return
% slightly different matrices.
% NOTE: to call this from other files (e.g. policy_iteration.m) it must be
% saved as its own file build_transition_matrices.m on the MATLAB path.
if nargin < 2 || isempty(n_simulations)
    n_simulations = 1000;
end
n_states = env.n_states;
n_actions = env.n_actions;
action_names = {'up', 'down', 'left', 'right'};

P = zeros(n_states, n_actions, n_states); % P(s,a,s')
R = zeros(n_states, n_actions);           % R(s,a)
fprintf(' 计算转移概率...');
for s = 1:n_states
    state = env.states(s, :);
    for a = 1:n_actions
        action = action_names{a};
        next_states = zeros(n_simulations, 2);
        rewards = zeros(n_simulations, 1);
        for sim = 1:n_simulations
            [next_state, reward] = env.step(state, action);
            next_states(sim, :) = next_state;
            rewards(sim) = reward;
        end
        % Relative frequency of every distinct successor state; group(k)
        % is the row of unique_states that sample k landed in.
        [unique_states, ~, group] = unique(next_states, 'rows');
        for us = 1:size(unique_states, 1)
            next_state_idx = env.get_state_index(unique_states(us, :));
            if next_state_idx > 0
                P(s, a, next_state_idx) = sum(group == us) / n_simulations;
            end
        end
        R(s, a) = mean(rewards);
    end
end
fprintf('完成\n');
end
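2.3 策略迭代算法 (policy_iteration.m)
注意:本函数调用的 build_transition_matrices 在 2.2 节中是 value_iteration.m 内的局部函数,对其他文件不可见;请将其另存为 build_transition_matrices.m 并放在 MATLAB 路径上。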
matlab
%% policy_iteration.m - policy iteration
function [policy, V] = policy_iteration(env, max_iter, tol)
% POLICY_ITERATION  Solve the MDP with policy iteration.
%   env      - environment exposing n_states, n_actions, gamma, states
%   max_iter - maximum number of evaluate/improve rounds (default 100)
%   tol      - convergence threshold of the iterative policy evaluation,
%              also used as the minimum Q-value gain required to switch an
%              action (default 1e-6)
%   policy   - n_states x 1 vector of action indices
%   V        - n_states x 1 value function of the returned policy
% Requires build_transition_matrices to be reachable on the MATLAB path.
if nargin < 2 || isempty(max_iter)
    max_iter = 100;
end
if nargin < 3 || isempty(tol)
    tol = 1e-6;
end
fprintf('=== 策略迭代算法 ===\n');
fprintf('最大迭代: %d, 收敛阈值: %.1e\n\n', max_iter, tol);

n_states = env.n_states;
n_actions = env.n_actions;

% Start from a random policy.
policy = randi(n_actions, n_states, 1);
V = zeros(n_states, 1);

% Transition model P(s,a,s') and expected reward R(s,a).
[P, R] = build_transition_matrices(env);

for iter = 1:max_iter
    fprintf('迭代 %d:\n', iter);

    % --- Policy evaluation: iterate V <- R_pi + gamma * P_pi * V ---------
    eval_delta = 0;
    for eval_iter = 1:1000
        V_new = zeros(n_states, 1);
        for s = 1:n_states
            a = policy(s);
            % P(s,a,:) is 1x1xn; reshape to a row vector because mtimes
            % does not accept N-D operands.
            V_new(s) = R(s, a) + env.gamma * (reshape(P(s, a, :), 1, n_states) * V);
        end
        eval_delta = max(abs(V_new - V));
        V = V_new;
        if eval_delta < tol
            break;
        end
    end

    % --- Policy improvement ---------------------------------------------
    policy_stable = true;
    new_policy = policy;
    for s = 1:n_states
        P_s = reshape(P(s, :, :), n_actions, n_states);
        Q_sa = R(s, :).' + env.gamma * (P_s * V);
        [best_q, best_action] = max(Q_sa);
        % Only switch when the gain is real; otherwise ties between equally
        % good actions could flip the policy forever.
        if best_action ~= policy(s) && best_q > Q_sa(policy(s)) + tol
            new_policy(s) = best_action;
            policy_stable = false;
        end
    end
    fprintf(' 策略评估完成,最大变化: %.6f\n', eval_delta);
    fprintf(' 策略改进: %d 个状态被改进\n', sum(policy ~= new_policy));
    policy = new_policy;
    if policy_stable
        fprintf('策略稳定,收敛于第 %d 次迭代\n', iter);
        break;
    end
end

% Report the result.
actions = {'up', 'down', 'left', 'right'};
policy_actions = cell(n_states, 1);
for s = 1:n_states
    policy_actions{s} = actions{policy(s)};
end
fprintf('\n最优值函数:\n');
disp(V');
fprintf('\n最优策略:\n');
for s = 1:n_states
    fprintf('状态(%d,%d): %s\n', env.states(s,1), env.states(s,2), policy_actions{s});
end
end
2.4 Q-learning 算法 (q_learning.m)
matlab
%% q_learning.m - Q-learning
function [Q, policy] = q_learning(env, episodes, max_steps, alpha, epsilon)
% Q_LEARNING  Tabular, model-free Q-learning with an epsilon-greedy policy.
%   env       - environment exposing n_states, n_actions, gamma, states,
%               start_state, goal_state, step() and get_state_index()
%   episodes  - number of training episodes (default 1000)
%   max_steps - step limit per episode (default 100)
%   alpha     - learning rate (default 0.1)
%   epsilon   - exploration probability (default 0.1)
%   Q         - n_states x n_actions table of action values
%   policy    - n_states x 1 vector of greedy action indices
% Defaults are applied per argument; a single "nargin < 6" test would always
% be true for this 5-parameter function and discard the caller's values.
if nargin < 2 || isempty(episodes)
    episodes = 1000;
end
if nargin < 3 || isempty(max_steps)
    max_steps = 100;
end
if nargin < 4 || isempty(alpha)
    alpha = 0.1;
end
if nargin < 5 || isempty(epsilon)
    epsilon = 0.1;
end
fprintf('=== Q-learning算法 ===\n');
fprintf('Episodes: %d, Max steps: %d, α=%.2f, ε=%.2f\n\n', episodes, max_steps, alpha, epsilon);

actions = {'up', 'down', 'left', 'right'};
Q = zeros(env.n_states, env.n_actions);

for ep = 1:episodes
    % Every episode restarts from the start cell.
    state = env.start_state;
    s_idx = env.get_state_index(state);
    for t = 1:max_steps
        % Epsilon-greedy action selection.
        if rand() < epsilon
            a_idx = randi(env.n_actions);       % explore
        else
            [~, a_idx] = max(Q(s_idx, :));      % exploit
        end

        [next_state, reward] = env.step(state, actions{a_idx});
        next_s_idx = env.get_state_index(next_state);
        if next_s_idx <= 0
            % Fail loudly instead of indexing Q with -1 on the next step.
            error('q_learning:invalidState', ...
                'Environment returned a cell that is not a state: (%d,%d).', ...
                next_state(1), next_state(2));
        end

        % Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))
        td_target = reward + env.gamma * max(Q(next_s_idx, :));
        Q(s_idx, a_idx) = Q(s_idx, a_idx) + alpha * (td_target - Q(s_idx, a_idx));

        state = next_state;
        s_idx = next_s_idx;
        if isequal(state, env.goal_state)
            break;  % episode ends at the goal
        end
    end
    if mod(ep, 100) == 0
        fprintf('Episode %d/%d 完成\n', ep, episodes);
    end
end

% Greedy policy with respect to the learned Q table.
[~, policy] = max(Q, [], 2);

policy_actions = cell(env.n_states, 1);
for s = 1:env.n_states
    policy_actions{s} = actions{policy(s)};
end
fprintf('\nQ-learning学习到的策略:\n');
for s = 1:env.n_states
    fprintf('状态(%d,%d): %s\n', env.states(s,1), env.states(s,2), policy_actions{s});
end
end
2.5 主程序 (mdp_main.m)
matlab
%% mdp_main.m - driver script comparing the three MDP solvers
% Builds the grid world, runs value iteration, policy iteration and
% Q-learning, prints a value comparison, plots each policy and finally rolls
% out each policy from the start cell.
clear; clc; close all;
fprintf('=== 马尔可夫决策过程(MDP)示例 ===\n\n');

%% 1. Create the grid-world environment
env = GridWorld(5); % 5x5 grid
fprintf('网格世界大小: %d x %d\n', env.N, env.N);
fprintf('起点: (%d,%d)\n', env.start_state(1), env.start_state(2));
fprintf('终点: (%d,%d)\n', env.goal_state(1), env.goal_state(2));
fprintf('陷阱: %d 个\n', size(env.traps, 1));
fprintf('墙壁: %d 个\n\n', size(env.walls, 1));

%% 2. Value iteration
fprintf('\n%s\n', repmat('=', 1, 50));
[policy_vi, V_vi] = value_iteration(env, 500, 1e-6);

%% 3. Policy iteration
fprintf('\n%s\n', repmat('=', 1, 50));
[policy_pi, V_pi] = policy_iteration(env, 50, 1e-6);

%% 4. Q-learning
fprintf('\n%s\n', repmat('=', 1, 50));
[Q_ql, policy_ql] = q_learning(env, 2000, 100, 0.1, 0.1);

%% 5. Comparison
fprintf('\n%s\n', repmat('=', 1, 50));
fprintf('=== 算法比较 ===\n\n');
fprintf('值函数比较(前10个状态):\n');
fprintf('%-10s %-12s %-12s %-12s\n', '状态', '值迭代', '策略迭代', 'Q-learning');
fprintf('%-10s %-12s %-12s %-12s\n', '----', '------', '------', '---------');
% The state value implied by a Q table is max_a Q(s,a), not the value of
% one fixed action.
V_ql = max(Q_ql, [], 2);
for i = 1:min(10, env.n_states)
    state_label = sprintf('(%d,%d)', env.states(i,1), env.states(i,2));
    fprintf('%-10s %-12.2f %-12.2f %-12.2f\n', ...
        state_label, V_vi(i), V_pi(i), V_ql(i));
end

% Plot the policies.
visualize_policy(env, policy_vi, '值迭代最优策略');
visualize_policy(env, policy_pi, '策略迭代最优策略');
visualize_policy(env, policy_ql, 'Q-learning最优策略');

%% 6. Roll out each policy from the start cell
fprintf('\n%s\n', repmat('=', 1, 50));
fprintf('=== 策略验证 ===\n\n');
state = env.start_state;
fprintf('从起点 (%d,%d) 开始:\n', state(1), state(2));
actions = {'up', 'down', 'left', 'right'};
policy_names = {'值迭代', '策略迭代', 'Q-learning'};
policies = {policy_vi, policy_pi, policy_ql};
for p = 1:3
    fprintf('\n%s 策略:\n', policy_names{p});
    state = env.start_state;
    total_reward = 0;
    for step_no = 1:20 % at most 20 steps
        s_idx = env.get_state_index(state);
        if s_idx <= 0
            fprintf(' 步骤 %d: 撞墙,结束\n', step_no);
            break;
        end
        % policies{p} is a numeric vector: index it with (), not {}.
        a_idx = policies{p}(s_idx);
        action = actions{a_idx};
        [next_state, reward] = env.step(state, action);
        total_reward = total_reward + reward;
        % %g keeps non-integer rewards (e.g. -0.1) readable.
        fprintf(' 步骤 %d: (%d,%d) -> %s -> (%d,%d), 奖励=%g\n', ...
            step_no, state(1), state(2), action, ...
            next_state(1), next_state(2), reward);
        state = next_state;
        if isequal(state, env.goal_state)
            fprintf(' 🎉 到达终点!总奖励: %g\n', total_reward);
            break;
        end
    end
end
2.6 策略可视化 (visualize_policy.m)
matlab
%% visualize_policy.m - plot a policy on the grid
function visualize_policy(env, policy, title_str)
% VISUALIZE_POLICY  Draw the grid, its special cells and one arrow per state.
%   env       - environment exposing N, walls, traps, goal_state,
%               start_state, states and n_states
%   policy    - n_states x 1 vector of action indices
%               (1 = up, 2 = down, 3 = left, 4 = right)
%   title_str - figure name and axes title
figure('Name', title_str, 'NumberTitle', 'off', 'Position', [100, 100, 600, 600]);
hold on; grid on; axis equal;

% Columns run along x and rows along y. The y axis is reversed so row 1 is
% drawn at the top and 'up' (row - 1) really points up on screen.
xlim([0.5, env.N+0.5]);
ylim([0.5, env.N+0.5]);
set(gca, 'XTick', 1:env.N, 'YTick', 1:env.N, 'YDir', 'reverse');
xlabel('列'); ylabel('行');
title(title_str);

wall_color = [0.5, 0.5, 0.5];

% Walls
for i = 1:size(env.walls, 1)
    wall = env.walls(i,:);
    rectangle('Position', [wall(2)-0.5, wall(1)-0.5, 1, 1], ...
        'FaceColor', wall_color, 'EdgeColor', 'k');
end
% Traps
for i = 1:size(env.traps, 1)
    trap = env.traps(i,:);
    rectangle('Position', [trap(2)-0.5, trap(1)-0.5, 1, 1], ...
        'FaceColor', 'r', 'EdgeColor', 'k');
end
% Goal
rectangle('Position', [env.goal_state(2)-0.5, env.goal_state(1)-0.5, 1, 1], ...
    'FaceColor', 'g', 'EdgeColor', 'k');
% Start
rectangle('Position', [env.start_state(2)-0.5, env.start_state(1)-0.5, 1, 1], ...
    'FaceColor', 'b', 'EdgeColor', 'k');

% Policy arrows, in data coordinates: 'up' decreases the row index, so its
% dy is negative.
arrow_dx = [0, 0, -0.3, 0.3];
arrow_dy = [-0.3, 0.3, 0, 0];
for s = 1:env.n_states
    state = env.states(s,:);
    a_idx = policy(s);
    % Defensive: never draw on a wall cell.
    if ~isempty(env.walls) && ismember(state, env.walls, 'rows')
        continue;
    end
    % Scale 0 disables quiver's automatic scaling so every arrow has the
    % same length; black stays visible on the red trap cells.
    quiver(state(2), state(1), arrow_dx(a_idx), arrow_dy(a_idx), ...
        0, 'k', 'LineWidth', 2, 'MaxHeadSize', 0.5);
end

% Legend: rectangle objects cannot be legend entries, so invisible
% NaN-coordinate proxies carry the colours instead.
legend_items = {'墙壁', '陷阱', '终点', '起点', '最优动作'};
legend_handles = gobjects(5,1);
legend_handles(1) = patch(NaN, NaN, wall_color, 'EdgeColor', 'k');
legend_handles(2) = patch(NaN, NaN, 'r', 'EdgeColor', 'k');
legend_handles(3) = patch(NaN, NaN, 'g', 'EdgeColor', 'k');
legend_handles(4) = patch(NaN, NaN, 'b', 'EdgeColor', 'k');
legend_handles(5) = plot(NaN, NaN, 'k-', 'LineWidth', 2);
legend(legend_handles, legend_items, 'Location', 'southwest');
end
三、运行示例与结果
3.1 控制台输出示例
=== 马尔可夫决策过程(MDP)示例 ===
网格世界大小: 5 x 5
起点: (1,1)
终点: (5,5)
陷阱: 3 个
墙壁: 2 个
==================================================
=== 值迭代算法 ===
最大迭代: 500, 收敛阈值: 1.0e-06
构建转移矩阵和奖励矩阵...
计算转移概率...完成
迭代 100: ΔV = 0.123456
迭代 200: ΔV = 0.045678
收敛于第 234 次迭代
最优值函数:
Columns 1 through 10
12.34 23.45 34.56 45.67 56.78 67.89 78.90 89.01 90.12 91.23
最优策略:
状态(1,1): right
状态(1,2): right
状态(1,3): down
...
==================================================
=== 策略迭代算法 ===
最大迭代: 50, 收敛阈值: 1.0e-06
迭代 1:
策略评估完成,最大变化: 45.678901
策略改进: 12 个状态被改进
迭代 2:
策略评估完成,最大变化: 12.345678
策略改进: 3 个状态被改进
策略稳定,收敛于第 5 次迭代
==================================================
=== Q-learning算法 ===
Episodes: 1000, Max steps: 100, α=0.10, ε=0.10
Episode 100/1000 完成
Episode 200/1000 完成
...
3.2 算法性能比较
| 算法 | 类型 | 收敛速度 | 内存需求 | 适用场景 |
|---|---|---|---|---|
| 值迭代 | 基于模型 | 慢 | 低 | 小状态空间 |
| 策略迭代 | 基于模型 | 快 | 中 | 中等状态空间 |
| Q-learning | 无模型 | 最慢 | 高 | 大状态空间,未知模型 |
参考代码 马尔可夫决策过程的例程,使用matlab实现 www.youwenfan.com/contentcsv/113036.html
四、扩展应用
4.1 自定义奖励函数
matlab
% Adjust the reward properties of an existing environment object `env` to
% suit a different objective.
env.goal_reward = 1000; % much larger reward for reaching the goal
env.step_reward = -0.1; % smaller per-step penalty, to encourage exploration
env.trap_reward = -500; % harsher penalty for falling into a trap
4.2 连续状态空间 MDP
matlab
% Discretise a continuous state
function discrete_state = discretize_state(continuous_state, bins)
% DISCRETIZE_STATE  Map continuous coordinates to 1-based bin indices.
%   continuous_state - vector such as [x, y, vx, vy]; the mapping treats
%                      [-1, 1] as the nominal range of every component
%   bins             - number of bins, either one scalar for all dimensions
%                      or one entry per dimension
%   discrete_state   - bin index per component, always within 1..bins
normalized = (continuous_state + 1) / 2;          % [-1, 1] -> [0, 1]
% Element-wise product so that a per-dimension bins vector works too.
discrete_state = floor(normalized .* bins) + 1;
% Clamp on both sides: the value 1 maps to bins + 1, and anything below -1
% would otherwise give an index <= 0.
discrete_state = min(max(discrete_state, 1), bins);
end
4.3 深度 Q-learning(DQN)
matlab
% Approximate the Q function with a neural network
function Q_values = dqn(state, theta)
% DQN  Forward pass of a small network with one tanh hidden layer.
%   state    - input passed to the first layer
%   theta    - struct with fields w1, b1 (hidden layer) and w2, b2 (output)
%   Q_values - network output, hidden * w2 + b2
pre_activation = state * theta.w1 + theta.b1;
hidden_layer = tanh(pre_activation);
output_layer = hidden_layer * theta.w2;
Q_values = output_layer + theta.b2;
end