1. Overview of Adaptive Dynamic Programming
1.1 Core Idea of ADP
Adaptive Dynamic Programming (ADP) combines dynamic programming with function approximation to solve optimal control problems over high-dimensional, continuous state spaces.
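For the linear-quadratic setting used throughout this article, one Bellman backup can be written in a few lines. A minimal sketch, assuming `A`, `B`, `Q`, `R`, `gamma`, and a current value matrix `P` already exist in the workspace:

```matlab
% One Bellman backup for a quadratic value V(x) = x'*P*x (discounted LQR):
%   V_next(x) = min_u [ x'*Q*x + u'*R*u + gamma * V(A*x + B*u) ]
% The minimizer is linear in x, so the backup stays in the quadratic family:
K = (R + gamma*B'*P*B) \ (gamma*B'*P*A);         % greedy feedback gain
P = Q + K'*R*K + gamma*(A - B*K)'*P*(A - B*K);   % backed-up value matrix
```

Iterating this backup to a fixed point yields the optimal P and K, which is exactly what Section 2 implements.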
1.2 The Three Core Components of ADP
| Component | Role | MATLAB realization |
|---|---|---|
| Critic network | Approximates the value function | Neural network, linear approximation |
| Actor network | Generates the control policy | Neural network, linear feedback |
| Model network | Learns the system dynamics | Neural network, ARX model |
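How the three components interact in one learning step can be sketched as follows. The quadratic critic, linear actor, and linear model below are illustrative stand-ins chosen for brevity (assumptions, not the networks built in Section 3):

```matlab
% Schematic of one ADP learning step with linear placeholder approximators
P = eye(2); K = [0.5, 0.5];                  % critic V(x)=x'*P*x, actor u=-K*x
A = [1, 0.01; 0.098, 0.99]; B = [0; 0.01];   % model (assumed known here)
Q = diag([1, 0.1]); R = 0.01; gamma = 0.95;
x = [0.1; 0.05];
u      = -K * x;                              % Actor: propose a control
x_next = A * x + B * u;                       % Model: predict the next state
cost   = x'*Q*x + u'*R*u;                     % stage cost
td     = cost + gamma*(x_next'*P*x_next) - x'*P*x;   % Critic: Bellman (TD) error
% A learning step would now adjust P to shrink td, and K to improve the
% action according to the critic; Section 2 does this exactly for LQR.
```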
2. Basic ADP Implementation (Linear Quadratic Regulator)
2.1 System Model and Cost Function
```matlab
%% Linear system: x_{k+1} = A*x_k + B*u_k
% Linearized, discretized inverted-pendulum model
A = [1, 0.01; 0.098, 0.99];   % discrete-time system matrix
B = [0; 0.01];                % input matrix
% Cost-function weights
Q = diag([1, 0.1]);           % state weights
R = 0.01;                     % control weight
gamma = 0.95;                 % discount factor
% Initial state
x0 = [0.1; 0.05];             % small perturbation from the equilibrium
```
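Before designing the controller it is worth checking that the pair (A, B) is controllable and that the open-loop plant actually needs stabilizing. A quick check (assumes the Control System Toolbox for `ctrb`):

```matlab
% Controllability and open-loop stability check
fprintf('Controllability matrix rank: %d (need %d)\n', rank(ctrb(A, B)), size(A,1));
fprintf('Open-loop |eig(A)|: %s\n', mat2str(abs(eig(A)), 4));
% One eigenvalue of A lies slightly outside the unit circle, so the
% uncontrolled pendulum drifts away from the equilibrium.
```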
2.2 Value-Iteration-Based ADP
```matlab
%% Value-iteration ADP algorithm
classdef ValueIterationADP
    % ADP via alternating policy evaluation and policy improvement
    % (strictly speaking, the loop below is a policy-iteration scheme).
    properties
        A, B, Q, R, gamma;      % system and cost parameters
        P;                      % value-function matrix (V(x) = x'*P*x)
        K;                      % feedback gain (u = -K*x)
        max_iter = 1000;        % maximum number of iterations
        tol = 1e-6;             % convergence tolerance
        learning_rate = 0.01;   % learning rate (not used by this solver)
    end
    methods
        function obj = ValueIterationADP(A, B, Q, R, gamma)
            obj.A = A; obj.B = B; obj.Q = Q; obj.R = R; obj.gamma = gamma;
            obj.P = eye(size(A,1));                % initialize P to the identity
            obj.K = zeros(size(B,2), size(B,1));   % initialize K to zero
            % Note: policy iteration formally needs an initial stabilizing
            % gain; if the iteration diverges, seed K with one.
        end
        function [P_opt, K_opt, history] = solve(obj)
            % Iterate evaluation/improvement until the gain stops changing
            history = zeros(obj.max_iter, 1);
            for k = 1:obj.max_iter
                % 1. Policy evaluation: solve the discounted Lyapunov equation
                %    V(x) = x'*Q*x + u'*R*u + gamma*V(x_next), with u = -K*x.
                K_old = obj.K;
                A_cl = obj.A - obj.B * obj.K;
                Q_cl = obj.Q + obj.K' * obj.R * obj.K;
                % gamma*A_cl'*P*A_cl - P = -Q_cl is a standard discrete
                % Lyapunov equation in the scaled matrix sqrt(gamma)*A_cl
                P_new = dlyap(sqrt(obj.gamma) * A_cl', Q_cl);
                % 2. Policy improvement: update the feedback gain
                K_new = (obj.R + obj.gamma * obj.B' * P_new * obj.B) \ ...
                        (obj.gamma * obj.B' * P_new * obj.A);
                % 3. Commit the update
                obj.P = P_new;
                obj.K = K_new;
                % 4. Convergence check on the change in gain
                err = norm(obj.K - K_old, 'fro');
                history(k) = err;
                if err < obj.tol
                    fprintf('Converged at iteration %d, error: %.2e\n', k, err);
                    history = history(1:k);
                    break;
                end
                if mod(k, 100) == 0
                    fprintf('Iteration %d: error = %.2e\n', k, err);
                end
            end
            P_opt = obj.P;
            K_opt = obj.K;
        end
    end
end
```
2.3 Simulation and Verification
```matlab
%% Main script: ADP control of the linear system
clear; clc;
% System parameters
A = [1, 0.01; 0.098, 0.99];
B = [0; 0.01];
Q = diag([1, 0.1]);
R = 0.01;
gamma = 0.95;
% Create the ADP solver
adp = ValueIterationADP(A, B, Q, R, gamma);
% Solve for the optimal control
[P_opt, K_opt, history] = adp.solve();
fprintf('Optimal feedback gain K = [%f, %f]\n', K_opt(1), K_opt(2));
fprintf('Optimal value matrix P:\n'); disp(P_opt);
% Simulate the closed loop
T = 100;               % number of simulation steps
x = zeros(2, T);
u = zeros(1, T);
x(:,1) = [0.1; 0.05];  % initial state
for k = 1:T-1
    u(k) = -K_opt * x(:,k);
    x(:,k+1) = A * x(:,k) + B * u(k);
end
u(end) = -K_opt * x(:,end);
% Plot the results
figure('Position', [100, 100, 1200, 400]);
subplot(1,3,1);
plot(1:T, x(1,:), 'b-', 'LineWidth', 2); hold on;
plot(1:T, x(2,:), 'r--', 'LineWidth', 2);
xlabel('Time step'); ylabel('State');
legend('Angle', 'Angular rate');
title('State response');
grid on;
subplot(1,3,2);
plot(1:T, u, 'k-', 'LineWidth', 2);
xlabel('Time step'); ylabel('Control input');
title('Control input');
grid on;
subplot(1,3,3);
semilogy(1:length(history), history, 'b-o', 'LineWidth', 2);
xlabel('Iteration'); ylabel('Convergence error');
title('ADP convergence');
grid on;
sgtitle('Value-iteration ADP control results');
```
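As a sanity check, the accumulated discounted cost of the simulated run should be close to the value predicted by the learned quadratic value function (only approximately, since the simulation horizon is finite). A sketch appended to the script above:

```matlab
% Accumulated discounted cost vs. the critic's prediction V(x0) = x0'*P*x0
J = 0;
for k = 1:T
    J = J + gamma^(k-1) * (x(:,k)'*Q*x(:,k) + u(k)'*R*u(k));
end
fprintf('Simulated cost J = %.4f, predicted x0''*P_opt*x0 = %.4f\n', ...
        J, x(:,1)' * P_opt * x(:,1));
```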
3. Neural-Network ADP Implementation
3.1 Neural-Network Critic-Actor Structure
```matlab
%% Neural-network ADP (Critic-Actor structure)
classdef NeuralADP < handle
    % ADP with neural-network approximators. Declared as a handle class so
    % that store_transition/train_step update the object in place.
    properties
        % System and cost parameters
        A, B, Q, R, gamma;
        % Neural networks
        critic_net;   % critic: Q(x, u)
        actor_net;    % actor: u(x)
        model_net;    % model: x_{k+1} = f(x_k, u_k) (placeholder, untrained)
        % Training parameters
        learning_rate = 0.01;   % step size for the actor update
        batch_size = 64;
        memory_size = 10000;
        memory_counter = 0;
        state_memory;
        action_memory;
        next_state_memory;
        reward_memory;
    end
    methods
        function obj = NeuralADP(A, B, Q, R, gamma)
            obj.A = A; obj.B = B; obj.Q = Q; obj.R = R; obj.gamma = gamma;
            % Build the neural networks
            obj.init_networks();
            % Allocate the experience-replay buffer
            state_dim = size(A,1);
            action_dim = size(B,2);
            obj.state_memory = zeros(obj.memory_size, state_dim);
            obj.action_memory = zeros(obj.memory_size, action_dim);
            obj.next_state_memory = zeros(obj.memory_size, state_dim);
            obj.reward_memory = zeros(obj.memory_size, 1);
        end
        function init_networks(obj)
            % Build and configure the three networks (the default trainlm
            % training function takes no lr parameter, so none is set here)
            state_dim = size(obj.A,1);
            action_dim = size(obj.B,2);
            % 1. Critic network: Q(x, u), input [x; u], scalar output
            obj.critic_net = feedforwardnet([64, 32]);
            obj.critic_net.trainParam.epochs = 100;
            obj.critic_net.trainParam.showWindow = false;
            obj.critic_net = configure(obj.critic_net, ...
                zeros(state_dim + action_dim, 1), 0);
            % 2. Actor network: u(x)
            obj.actor_net = feedforwardnet([32, 16]);
            obj.actor_net.trainParam.epochs = 100;
            obj.actor_net.trainParam.showWindow = false;
            obj.actor_net = configure(obj.actor_net, ...
                zeros(state_dim, 1), zeros(action_dim, 1));
            % 3. Model network (kept for completeness; not trained below)
            obj.model_net = feedforwardnet([64, 32]);
            obj.model_net.trainParam.epochs = 100;
            obj.model_net.trainParam.showWindow = false;
        end
        function store_transition(obj, state, action, next_state, reward)
            % Store one transition in the ring-buffer replay memory
            idx = mod(obj.memory_counter, obj.memory_size) + 1;
            obj.state_memory(idx,:) = state';
            obj.action_memory(idx,:) = action';
            obj.next_state_memory(idx,:) = next_state';
            obj.reward_memory(idx) = reward;
            obj.memory_counter = obj.memory_counter + 1;
        end
        function train_step(obj)
            % One training step on a random minibatch
            if obj.memory_counter < obj.batch_size
                return;
            end
            indices = randperm(min(obj.memory_counter, obj.memory_size), obj.batch_size);
            states = obj.state_memory(indices,:)';
            actions = obj.action_memory(indices,:)';
            next_states = obj.next_state_memory(indices,:)';
            rewards = obj.reward_memory(indices)';
            % 1. Train the critic on the Bellman target
            %    Q(s,a) = r + gamma * Q(s', actor(s'))
            next_actions = obj.actor_net(next_states);
            next_Q = obj.critic_net([next_states; next_actions]);
            target_Q = rewards + obj.gamma * next_Q;
            obj.critic_net = train(obj.critic_net, [states; actions], target_Q);
            % 2. Train the actor to maximize the critic's evaluation
            %    (rewards are negative costs, so ascending Q lowers the cost)
            for i = 1:obj.batch_size
                state = states(:,i);
                action = obj.actor_net(state);
                Qval = obj.critic_net([state; action]);
                % Finite-difference gradient of Q w.r.t. the action
                eps_fd = 1e-4;
                grad = zeros(size(action));
                for j = 1:length(action)
                    action_perturbed = action;
                    action_perturbed(j) = action_perturbed(j) + eps_fd;
                    Q_perturbed = obj.critic_net([state; action_perturbed]);
                    grad(j) = (Q_perturbed - Qval) / eps_fd;
                end
                % Nudge the actor output one ascent step along the gradient
                % by fitting a shifted target (a simple derivative-free update)
                target_action = action + obj.learning_rate * grad;
                obj.actor_net = adapt(obj.actor_net, state, target_action);
            end
        end
        function action = get_action(obj, state)
            % Query the actor for the current state
            action = obj.actor_net(state);
        end
    end
end
```
3.2 ADP Control of a Nonlinear System
```matlab
%% ADP control of a nonlinear system
clear; clc;
% Nonlinear system: x_{k+1} = f(x_k) + g(x_k)*u_k
% Nonlinear inverted-pendulum model
f = @(x) [x(1) + 0.01*x(2); 0.098*x(1) + 0.99*x(2) + 0.01*sin(x(1))];
g = @(x) [0; 0.01];
% Cost function
Q = diag([1, 0.1]);
R = 0.01;
gamma = 0.95;
% Create the neural-network ADP agent (A and B only set the dimensions here)
adp = NeuralADP(eye(2), [0; 1], Q, R, gamma);
% Training loop
num_episodes = 1000;
episode_rewards = zeros(num_episodes, 1);
for episode = 1:num_episodes
    x = [0.1; 0.05];   % initial state
    total_reward = 0;
    for step = 1:200   % at most 200 steps per episode
        % Select an action
        action = adp.get_action(x);
        % Propagate the system
        next_x = f(x) + g(x)*action;
        % Reward = negative quadratic cost
        reward = -(x'*Q*x + action'*R*action);
        total_reward = total_reward + reward;
        % Store the transition
        adp.store_transition(x, action, next_x, reward);
        % One training step
        adp.train_step();
        x = next_x;
        % Stop early once the state has settled
        if norm(x) < 1e-3
            break;
        end
    end
    episode_rewards(episode) = total_reward;
    if mod(episode, 100) == 0
        fprintf('Episode %d: Total Reward = %.2f\n', episode, total_reward);
    end
end
% Plot the training curve
figure;
plot(1:num_episodes, episode_rewards, 'b-', 'LineWidth', 2);
xlabel('Episode');
ylabel('Total Reward');
title('ADP training progress');
grid on;
```
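Episode returns from this kind of training are usually noisy. Overlaying a moving average (here `movmean` with an assumed 20-episode window) makes the trend easier to read:

```matlab
% Overlay a moving average on the raw episode returns
window = 20;
hold on;
plot(1:num_episodes, movmean(episode_rewards, window), 'r-', 'LineWidth', 2);
legend('Raw return', sprintf('%d-episode moving average', window));
```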
4. Combining ADP with Model Predictive Control
4.1 Hybrid ADP-MPC Control
```matlab
%% Controller combining ADP and MPC
classdef ADPMPCController
    % MPC controller warm-started and tail-stabilized by an ADP solution
    properties
        % ADP component
        adp_solver;
        % MPC parameters
        Np = 10;                  % prediction horizon
        Nc = 5;                   % control horizon
        Q_mpc = diag([1, 0.1]);
        R_mpc = 0.01;
        % System model
        A, B;
    end
    methods
        function obj = ADPMPCController(A, B, Q, R)
            obj.A = A; obj.B = B;
            obj.Q_mpc = Q; obj.R_mpc = R;   % keep MPC weights consistent with the ADP cost
            obj.adp_solver = ValueIterationADP(A, B, Q, R, 0.95);
            % Pre-train the ADP solver (value class: keep the solved copy)
            [P_opt, K_opt, ~] = obj.adp_solver.solve();
            obj.adp_solver.P = P_opt;
            obj.adp_solver.K = K_opt;
        end
        function [u_opt, predicted_states] = compute_control(obj, x0)
            % Solve the finite-horizon QP, warm-started by the ADP gain:
            %   min sum(x'Qx + u'Ru)  s.t.  x_{k+1} = A*x_k + B*u_k
            K_adp = obj.adp_solver.K;
            nx = size(obj.A, 1);
            nu = size(obj.B, 2);
            % Condensed prediction model over the horizon: X = Sx*x0 + Su*U.
            % Inputs beyond the control horizon Nc are frozen at u_{Nc}
            % (move blocking), so their effect folds into column block Nc.
            Sx = zeros(obj.Np*nx, nx);
            Su = zeros(obj.Np*nx, obj.Nc*nu);
            for i = 1:obj.Np
                Sx((i-1)*nx+1:i*nx, :) = obj.A^i;
                for j = 1:i
                    col = min(j, obj.Nc);
                    rows = (i-1)*nx+1:i*nx;
                    cols = (col-1)*nu+1:col*nu;
                    Su(rows, cols) = Su(rows, cols) + obj.A^(i-j) * obj.B;
                end
            end
            % Quadratic cost in U: 0.5*U'*H*U + f'*U
            Qbar = kron(eye(obj.Np), obj.Q_mpc);
            Rbar = kron(eye(obj.Nc), obj.R_mpc);
            H = 2 * (Su' * Qbar * Su + Rbar);
            f = 2 * Su' * Qbar * Sx * x0;
            % Input bounds
            lb = -10 * ones(obj.Nc*nu, 1);
            ub =  10 * ones(obj.Nc*nu, 1);
            % Warm start: roll the ADP feedback law forward
            u0 = zeros(obj.Nc*nu, 1);
            x_ws = x0;
            for i = 1:obj.Nc
                u0((i-1)*nu+1:i*nu) = -K_adp * x_ws;
                x_ws = obj.A * x_ws + obj.B * u0((i-1)*nu+1:i*nu);
            end
            % Solve the QP
            options = optimoptions('quadprog', 'Display', 'off');
            U = quadprog(H, f, [], [], [], [], lb, ub, u0, options);
            % Predict the state trajectory under the optimized sequence
            predicted_states = zeros(nx, obj.Np);
            x = x0;
            for k = 1:obj.Np
                if k <= obj.Nc
                    u = U((k-1)*nu+1:k*nu);
                else
                    u = -K_adp * x;   % ADP feedback beyond the control horizon
                end
                x = obj.A * x + obj.B * u;
                predicted_states(:,k) = x;
            end
            % Return only the first control move (receding horizon)
            u_opt = U(1:nu);
        end
    end
end
```
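A receding-horizon usage sketch of the hybrid controller, assuming the system matrices from Section 2:

```matlab
% Closed-loop simulation with the ADP-warm-started MPC
A = [1, 0.01; 0.098, 0.99]; B = [0; 0.01];
ctrl = ADPMPCController(A, B, diag([1, 0.1]), 0.01);
x = [0.1; 0.05];
for k = 1:50
    u = ctrl.compute_control(x);   % apply only the first move
    x = A * x + B * u;
end
fprintf('State norm after 50 steps: %.4f\n', norm(x));
```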
5. ADP with Adaptive Weight Adjustment
5.1 ADP with an Adaptive Learning Rate
```matlab
%% Adaptive learning-rate adjustment
function adaptive_adp_training()
    % ADP training with an adaptive learning rate and momentum
    % System parameters
    A = [1, 0.01; 0.098, 0.99];
    B = [0; 0.01];
    Q = diag([1, 0.1]);
    R = 0.01;
    % Initialization
    P = eye(2);
    K = zeros(1,2);
    learning_rate = 0.01;
    momentum = 0.9;
    velocity_P = zeros(2,2);
    velocity_K = zeros(1,2);
    % Training history
    errors = zeros(1000, 1);
    for iter = 1:1000
        % 1. Policy evaluation
        A_cl = A - B*K;
        Q_cl = Q + K'*R*K;
        % Solve the discrete Lyapunov equation (undiscounted here)
        P_new = dlyap(A_cl', Q_cl);
        % 2. Policy improvement
        K_new = (R + B'*P_new*B) \ (B'*P_new*A);
        % 3. Adapt the learning rate from the error trend
        err = norm(K_new - K, 'fro');
        if iter > 1
            if err > errors(iter-1)
                learning_rate = learning_rate * 0.9;    % shrink on regression
            else
                learning_rate = learning_rate * 1.05;   % grow on progress
            end
        end
        learning_rate = max(0.001, min(0.1, learning_rate));
        % 4. Momentum update toward the new solution
        velocity_P = momentum * velocity_P + (1-momentum) * (P_new - P);
        velocity_K = momentum * velocity_K + (1-momentum) * (K_new - K);
        P = P + learning_rate * velocity_P;
        K = K + learning_rate * velocity_K;
        errors(iter) = err;
        if err < 1e-6
            fprintf('Converged at iteration %d\n', iter);
            break;
        end
    end
    % Plot the convergence curve
    figure;
    semilogy(1:iter, errors(1:iter), 'b-', 'LineWidth', 2);
    xlabel('Iteration');
    ylabel('Error');
    title('Adaptive-learning-rate ADP convergence');
    grid on;
end
```
6. Practical Application Example
6.1 UAV Attitude Control
```matlab
%% ADP control of quadrotor attitude
classdef DroneAttitudeADP
    % Attitude control of a quadrotor UAV
    properties
        % Vehicle parameters
        Ixx = 0.01; Iyy = 0.01; Izz = 0.02;   % moments of inertia (kg*m^2)
        m = 0.5;    % mass (kg)
        g = 9.81;   % gravitational acceleration (m/s^2)
        Ts = 0.01;  % sample time for discretization (s)
        % State:   [roll, pitch, yaw, roll_rate, pitch_rate, yaw_rate]
        % Control: [M_roll, M_pitch, M_yaw, thrust]
        % ADP controllers
        adp_controllers;   % one independent ADP controller per axis
    end
    methods
        function obj = DroneAttitudeADP()
            % Create and pre-solve one ADP controller per attitude channel
            obj.adp_controllers = cell(3,1);
            for i = 1:3
                [A, B] = obj.get_channel_model(i);
                Q = diag([10, 1]);   % state weights
                R = 0.1;             % control weight
                solver = ValueIterationADP(A, B, Q, R, 0.95);
                % ValueIterationADP is a value class, so keep the solved copy
                [P_opt, K_opt, ~] = solver.solve();
                solver.P = P_opt;
                solver.K = K_opt;
                obj.adp_controllers{i} = solver;
            end
        end
        function [A, B] = get_channel_model(obj, channel)
            % Per-channel linearized model, Euler-discretized with Ts
            % (the ADP solver expects a discrete-time model)
            switch channel
                case 1   % roll channel
                    J = obj.Ixx;
                case 2   % pitch channel
                    J = obj.Iyy;
                case 3   % yaw channel
                    J = obj.Izz;
            end
            A = [1, obj.Ts; 0, 1 - 0.1*obj.Ts];   % small damping on the rate
            B = [0; obj.Ts / J];
        end
        function control = compute_control(obj, state_desired, state_current)
            % Compute the four control outputs
            control = zeros(4,1);
            for i = 1:3
                % Attitude tracking error for this channel
                err = state_desired(i) - state_current(i);
                err_rate = state_desired(i+3) - state_current(i+3);
                state_error = [err; err_rate];
                % Stabilizing law: u = -K*(x - x_d) = K*(x_d - x)
                K = obj.adp_controllers{i}.K;
                control(i) = K * state_error;
            end
            % Thrust channel (altitude hold): hover thrust
            control(4) = obj.m * obj.g;
        end
    end
end
```
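A minimal usage sketch: drive a small roll offset back to level hover (the numbers below are illustrative):

```matlab
% Hover with a small roll disturbance
drone = DroneAttitudeADP();
state_desired = zeros(6,1);             % level hover
state_current = [0.1; 0; 0; 0; 0; 0];   % 0.1 rad roll offset
u = drone.compute_control(state_desired, state_current);
fprintf('Moments: [%.3f, %.3f, %.3f] N*m, thrust: %.2f N\n', u(1), u(2), u(3), u(4));
```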
7. Performance Evaluation and Debugging
7.1 ADP Performance-Evaluation Function
```matlab
%% ADP performance evaluation
function evaluate_adp_performance(adp_solver, test_cases)
    % Evaluate an ADP controller (expects the ValueIterationADP interface)
    fprintf('=== ADP performance report ===\n');
    % 1. Stability test
    fprintf('1. Stability test:\n');
    stable_count = 0;
    for i = 1:length(test_cases)
        x = test_cases{i};
        stable = true;
        for k = 1:100
            u = -adp_solver.K * x;
            x = adp_solver.A * x + adp_solver.B * u;
            if norm(x) > 10   % diverged
                stable = false;
                break;
            end
        end
        if stable
            stable_count = stable_count + 1;
            fprintf('  Test case %d: stable\n', i);
        else
            fprintf('  Test case %d: unstable\n', i);
        end
    end
    fprintf('  Stability: %d/%d passed\n\n', stable_count, length(test_cases));
    % 2. Optimality check
    fprintf('2. Optimality check:\n');
    % Theoretical solution of the *discounted* problem: scaling A and B by
    % sqrt(gamma) turns the discounted DARE into a standard one
    sg = sqrt(adp_solver.gamma);
    P_theoretical = dare(sg*adp_solver.A, sg*adp_solver.B, adp_solver.Q, adp_solver.R);
    K_theoretical = (adp_solver.R + adp_solver.gamma*adp_solver.B'*P_theoretical*adp_solver.B) \ ...
                    (adp_solver.gamma*adp_solver.B'*P_theoretical*adp_solver.A);
    % Compare the ADP solution with the theoretical one
    error_K = norm(adp_solver.K - K_theoretical, 'fro');
    error_P = norm(adp_solver.P - P_theoretical, 'fro');
    fprintf('  K error: %.2e\n', error_K);
    fprintf('  P error: %.2e\n', error_P);
    if error_K < 1e-4 && error_P < 1e-4
        fprintf('  Optimality: passed\n\n');
    else
        fprintf('  Optimality: warning - large error\n\n');
    end
    % 3. Robustness test
    fprintf('3. Robustness test:\n');
    % Test stability under random perturbations of the plant matrix
    perturbation_levels = [0.05, 0.1, 0.2];
    for p = 1:length(perturbation_levels)
        delta = perturbation_levels(p);
        A_pert = adp_solver.A + delta * randn(size(adp_solver.A));
        x = [0.1; 0.05];
        stable = true;
        for k = 1:50
            u = -adp_solver.K * x;
            x = A_pert * x + adp_solver.B * u;
            if norm(x) > 5
                stable = false;
                break;
            end
        end
        if stable
            fprintf('  %.0f%% perturbation: stable\n', delta*100);
        else
            fprintf('  %.0f%% perturbation: unstable\n', delta*100);
        end
    end
    fprintf('=== Evaluation complete ===\n');
end
```
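A usage sketch, reusing the solver from Section 2 on a few hand-picked initial conditions (the test cases are arbitrary examples):

```matlab
% Evaluate the Section 2 solver on several initial states
adp = ValueIterationADP([1, 0.01; 0.098, 0.99], [0; 0.01], diag([1, 0.1]), 0.01, 0.95);
[P_opt, K_opt, ~] = adp.solve();
adp.P = P_opt; adp.K = K_opt;   % value class: keep the solved copy
test_cases = {[0.1; 0.05], [-0.2; 0.1], [0.3; -0.2]};
evaluate_adp_performance(adp, test_cases);
```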
8. Summary
8.1 Characteristics of the ADP Methods
| Method | Strengths | Weaknesses | Suitable scenarios |
|---|---|---|---|
| Value-iteration ADP | Simple to implement, good convergence | Requires a known system model | Linear systems with known models |
| Neural-network ADP | Handles nonlinearity, needs no exact model | Training is complex and can be unstable | Nonlinear systems, complex environments |
| ADP-MPC | Combines long-horizon optimization with short-horizon constraints | Computationally expensive | Constrained systems, high-performance requirements |
| Adaptive ADP | Self-tuning parameters, robust | Parameter tuning is involved | Time-varying and uncertain systems |
8.2 Implementation Suggestions
- Start small: solve simple linear problems with value-iteration ADP first
- Verify convergence: make sure the algorithm actually reaches the optimal solution (a quick check follows this list)
- Increase complexity gradually: extend to nonlinear systems and neural-network approximators
- Test thoroughly: check robustness across different initial conditions and parameter perturbations
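One quick convergence check, assuming the Control System Toolbox's `idare` and the variables from the Section 2 script; the sqrt(gamma) scaling converts the discounted problem into a standard Riccati equation:

```matlab
% Compare the converged ADP gain against the Riccati solution
[~, K_riccati] = idare(sqrt(gamma)*A, sqrt(gamma)*B, Q, R);
fprintf('||K_adp - K_riccati|| = %.2e\n', norm(K_opt - K_riccati));
```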
8.3 Debugging Tips
```matlab
% Common problems and remedies
debugging_guide = {
    'No convergence',   'Reduce the learning rate; check system stabilizability';
    'Oscillation',      'Add a momentum term; adjust the discount factor';
    'Divergence',       'Check the reward/cost design; ensure positive definiteness';
    'Slow training',    'Increase the batch size; use an adaptive learning rate';
    'Poor performance', 'Increase network capacity; improve the feature engineering';
};
```
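A small usage sketch that prints the lookup table at the command line:

```matlab
% Render the symptom/remedy table
for i = 1:size(debugging_guide, 1)
    fprintf('%-16s -> %s\n', debugging_guide{i,1}, debugging_guide{i,2});
end
```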