强化学习之Dueling DQN对DQN的改进——以倒立摆环境(Inverted Pendulum)为例

0.简介


参考博客来源:DeepRL系列(10): Dueling DQN(DDQN)原理及实现https://zhuanlan.zhihu.com/p/114834834
通过前面的推导,我们得到了Dueling Network的数学形式为

实际中将最大化形式变成均值形式效果更好,更稳定,其数学形式如下



1.导库

python 复制代码
import torch
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import gym
import collections
import random

2.神经网络Qnet和VAnet构建

python 复制代码
class VAnet(torch.nn.Module):
    """ 只有一层隐藏层的A网络和V网络 """
    def __init__(self,statedim,hiddendim,actiondim):
        super(VAnet,self).__init__()
        self.fc1=torch.nn.Linear(statedim,hiddendim)
        self.fcA=torch.nn.Linear(hiddendim,actiondim)
        self.fcV=torch.nn.Linear(hiddendim,1)
    def forward(self,x):
        A=self.fcA(torch.nn.functional.relu(self.fc1(x)))
        V=self.fcV(torch.nn.functional.relu(self.fc1(x)))
        # Q=V+A-A.mean(1).unsqueeze(1)#unsqueeze 则用于在指定位置增加一个维度 本式子相当于下式
        Q=V+A-A.mean(1).view(-1,1)
        return Q
    def save(self, path):
        torch.save(self.state_dict(), path)
    def load(self, path):
        self.load_state_dict(torch.load(path))
class Qnet(torch.nn.Module):
    """ 只有一层隐藏层的Q网络 """
    def __init__(self,statedim,hiddendim,actiondim):
        super(Qnet,self).__init()
        self.fc1=torch.nn.Linear(statedim,hiddendim)
        self.fc2=torch.nn.Linear(hiddendim,actiondim)
    def forward(self,x):
        x=torch.nn.functional.relu(self.fc1(x))
        return self.fc2(x)
    def save(self, path):
        torch.save(self.state_dict(), path)
    def load(self, path):
        self.load_state_dict(torch.load(path))

3.经验回放池实现

python 复制代码
class ReplayBuffer:
    """ 经验回放池 """
    def __init__(self,capacity):
        self.buffer=collections.deque(maxlen=capacity)
    def add(self,state,action,reward,nextstate,done):
        self.buffer.append((state,action,reward,nextstate,done))
    def sample(self,batchsize):
        transitions=random.sample(self.buffer,batchsize)
        state,action,reward,nextstate,done=zip(*transitions)
        return np.array(state),action,reward,np.array(nextstate),done
    def size(self):
        return len(self.buffer)

当然我们神经网络也可以写成如下形式,是等价的。

python 复制代码
class VAnet(torch.nn.Module):
    def __init__(self, statedim, hiddendim, actiondim):
        super(VAnet, self).__init__()
        self.A = torch.nn.Sequential(
            torch.nn.Linear(statedim, hiddendim),
            torch.nn.ReLU(),
            torch.nn.Linear(hiddendim, actiondim),
            # torch.nn.Softmax(dim=1)
        )
        self.V = torch.nn.Sequential(
            torch.nn.Linear(statedim, hiddendim),
            torch.nn.ReLU(),
            torch.nn.Linear(hiddendim, 1)
        )

    def forward(self, x):
        a_output = self.A(x)
        v_output = self.V(x)
        a_mean = a_output.mean(1).view(-1, 1)
        return a_output + v_output - a_mean

4.离散动作转为连续函数的实现函数

python 复制代码
def dis_to_con(actionid,env,actiondim):#离散动作转回连续函数
    actionlowbound=env.action_space.low[0]#连续动作最小值
    actionupbound=env.action_space.high[0]#连续动作最大值
    return actionlowbound+actionid*(actionupbound-actionlowbound)/(actiondim-1)

5.DQN算法实现

python 复制代码
class DQN:
    """ DQN算法,包括DoubleDQN和DuelingDQN """
    def __init__(self,statedim,hiddendim,actiondim,learningrate,gamma,epsilon,targetupdate,device,dqntype='VanillaDQN'):
        self.actiondim=actiondim
        self.gamma=gamma
        self.epsilon=epsilon
        self.targetupdate=targetupdate
        self.device=device
        self.dqntype=dqntype
        self.count=0
        if self.dqntype=='DuelingDQN':#Dueling DQN采取不一样的网络框架
            self.qnet=VAnet(statedim=statedim,hiddendim=hiddendim,actiondim=actiondim).to(self.device)
            self.targetqnet=VAnet(statedim=statedim,hiddendim=hiddendim,actiondim=actiondim).to(self.device)
        else:
            self.qnet=Qnet(statedim=statedim,hiddendim=hiddendim,actiondim=actiondim).to(self.device)
            self.targetqnet=Qnet(statedim=statedim,hiddendim=hiddendim,actiondim=actiondim).to(self.device)
        self.optimizer=torch.optim.Adam(self.qnet.parameters(),lr=learningrate)
    def takeaction(self,state):
        if np.random.random()<self.epsilon:
            action=np.random.randint(self.actiondim)
        else:
            state=torch.tensor([state],dtype=torch.float).to(self.device)
            action=self.qnet(state).argmax().item()
        return action
    def max_qvalue(self,state):
        state=torch.tensor([state],dtype=torch.float).to(self.device)
        return self.qnet(state).max().item()
    def update(self,transition_dict):
        states=torch.tensor(transition_dict['states'],dtype=torch.float).to(self.device)
        actions=torch.tensor(transition_dict['actions']).view(-1,1).to(self.device)
        rewards=torch.tensor(transition_dict['rewards'],dtype=torch.float).view(-1,1).to(self.device)
        nextstates=torch.tensor(transition_dict['nextstates'],dtype=torch.float).to(self.device)
        dones=torch.tensor(transition_dict['dones'],dtype=torch.float).view(-1,1).to(self.device)
        qvalues=self.qnet(states).gather(1,actions)#gather(1, actions) 中的参数 1 表示沿着第 1 维度(即列维度)进行收集操作,根据 actions 提供的索引来收集相应的 qvalues 。
        if self.dqntype=='DoubleDQN':
            maxaction=self.qnet(nextstates).max(1)[1].view(-1,1)#max(1)表示在第 1 个维度(通常是列维度)上求最大值;max(1)会返回两个值,第一个是每行的最大值,第二个是最大值所在的索引[1]。
            maxnextqvalues=self.targetqnet(nextstates).gather(1,maxaction)
        else:
            maxnextqvalues=self.targetqnet(nextstates).max(1)[0].view(-1,1)
        targetqvalues=rewards+self.gamma*maxnextqvalues*(1-dones)
        dqnloss=torch.mean(torch.nn.functional.mse_loss(qvalues,targetqvalues))
        self.optimizer.zero_grad()
        dqnloss.backward()
        self.optimizer.step()
        if self.count % self.targetupdate==0:
            self.targetqnet.load_state_dict(self.qnet.state_dict())
        self.count+=1

6.训练DQN函数实现

python 复制代码
def trainDQN(agent,env,episodesnum,pbarnum,printreturnnum,replaybuffer,minimalsize,batchsize):
    returnlist=[]
    maxqvaluelist=[]
    maxqvalue=0
    for k in range(pbarnum):
        with tqdm(total=int(episodesnum/pbarnum),desc='Iteration %d'%k) as pbar:
            for episode in range(int(episodesnum/pbarnum)):
                episodereturn=0
                state=env.reset(seed=10)[0]
                done=False
                while not done:
                    action=agent.takeaction(state)
                    maxqvalue=agent.max_qvalue(state)*0.005+maxqvalue*0.995#平滑处理
                    maxqvaluelist.append(maxqvalue)#记录最大q值
                    action_continuous=dis_to_con(actionid=action,env=env,actiondim=agent.actiondim)
                    nextstate,reward,done,truncated,_=env.step([action_continuous])
                    done=done or truncated
                    replaybuffer.add(state,action,reward,nextstate,done)
                    state=nextstate
                    episodereturn+=reward
                    if replaybuffer.size()>minimalsize:
                        bs,ba,br,bns,bd=replaybuffer.sample(batchsize)
                        transitiondict={'states':bs,'actions':ba,'rewards':br,'nextstates':bns,'dones':bd}
                        agent.update(transitiondict)
                returnlist.append(episodereturn)
                if (episode+1)%printreturnnum==0:
                    pbar.set_postfix({'episode':'%d'%(int(episodesnum/pbarnum)*k+episode+1),'return':'%.3f'%np.mean(returnlist[-printreturnnum:])})
                pbar.update(1)
    return returnlist,maxqvaluelist

7.移动平均函数实现

python 复制代码
def moving_average(a, window_size):
    cumulative_sum = np.cumsum(np.insert(a, 0, 0)) 
    middle = (cumulative_sum[window_size:] - cumulative_sum[:-window_size]) / window_size
    r = np.arange(1, window_size-1, 2)
    begin = np.cumsum(a[:window_size-1])[::2] / r
    end = (np.cumsum(a[:-window_size:-1])[::2] / r)[::-1]
    return np.concatenate((begin, middle, end))

8.参数设置

python 复制代码
lr=1e-2
gamma=0.98
epsilon=0.01
target_update=10
batchsize=64
minimalsize=500
episodesnum=500
buffersize=10000
hiddendim=128
actiondim=11
pbarnum=10
printreturnnum=10
device=torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

9.倒立摆环境下训练并实现可视化

python 复制代码
random.seed(10)
np.random.seed(10)
torch.manual_seed(10)
replaybuffer=ReplayBuffer(buffersize)
env=gym.make('Pendulum-v1')
env.reset(seed=10)
statedim=env.observation_space.shape[0]
agent=DQN(statedim=statedim,hiddendim=hiddendim,actiondim=actiondim,learningrate=lr,gamma=gamma,epsilon=epsilon,targetupdate=target_update,device=device,dqntype='DuelingDQN')
returnlist,maxqvaluelist=trainDQN(agent=agent,env=env,episodesnum=episodesnum,pbarnum=pbarnum,printreturnnum=printreturnnum,replaybuffer=replaybuffer,minimalsize=minimalsize,batchsize=batchsize)

episodelist=np.arange(len(returnlist))#等价于np.linspace(0,len(returnlist)-1,len(returnlist))以及list(range(len(returnlist)))
plt.plot(episodelist,returnlist)
plt.xlabel('Episodes')
plt.ylabel('Return')
plt.title(f'{agent.dqntype} on {env.spec.name}')
plt.show()
framslist=np.arange(len(maxqvaluelist))
plt.plot(framslist,maxqvaluelist)
plt.axhline(y=0,color='purple',linestyle='--')
plt.axhline(y=10,c='red',ls='--')

plt.xlabel('Frames')
plt.ylabel('Q value')
plt.title(f'{agent.dqntype} on {env.spec.name}')
plt.show()
env.close()

9.可视化结果显示以及结论



结论:相比传统的DQN,Dueing DQN在多个动作选择下的学习更加稳定,得到的回报最大值也更大,由Dueling DQN 原理知随着动作空间增大,Dueling DQN相比DQN优势更加明显。本实验中离散动作数设置为11,可以增加动作数(例如15,25,30等),继续对比实验,实验效果更为明显。
当然我们可以改变网络结构,加大隐藏层数量, 更改神经网络结构如下所示。

python 复制代码
class VAnet(torch.nn.Module):
    def __init__(self, statedim, hiddendim, actiondim):
        super(VAnet, self).__init__()
        self.A = torch.nn.Sequential(
            torch.nn.Linear(statedim, hiddendim),
            torch.nn.Tanh(),  # 改变激活函数为 Tanh
            torch.nn.Linear(hiddendim, hiddendim),  # 增加一层隐藏层
            torch.nn.ReLU(),
            torch.nn.Linear(hiddendim, actiondim),
            # torch.nn.Softmax(dim=1)
        )
        self.V = torch.nn.Sequential(
            torch.nn.Linear(statedim, hiddendim),
            torch.nn.Tanh(),  # 改变激活函数为 Tanh
            torch.nn.Linear(hiddendim, hiddendim),  # 增加一层隐藏层
            torch.nn.ReLU(),
            torch.nn.Linear(hiddendim, 1)
        )
    def forward(self, x):
        return self.A(x) + self.V(x) - self.A(x).mean(1).view(-1, 1)
    def save(self, path):
        torch.save(self.state_dict(), path)
    def load(self, path):
        self.load_state_dict(torch.load(path))

结果如下所示:

相关推荐
nbsaas-boot1 小时前
Java 正则表达式白皮书:语法详解、工程实践与常用表达式库
开发语言·python·mysql
岁忧1 小时前
(LeetCode 面试经典 150 题 ) 11. 盛最多水的容器 (贪心+双指针)
java·c++·算法·leetcode·面试·go
仗剑_走天涯1 小时前
基于pytorch.nn模块实现线性模型
人工智能·pytorch·python·深度学习
chao_7891 小时前
二分查找篇——搜索旋转排序数组【LeetCode】两次二分查找
开发语言·数据结构·python·算法·leetcode
数字芯片实验室1 小时前
分享一个可以学习正则表达式的网址:Pythex.org
学习·正则表达式
陈洪奇1 小时前
注册中心学习笔记整理
笔记·学习
光影少年1 小时前
从前端转go开发的学习路线
前端·学习·golang
秋说3 小时前
【PTA数据结构 | C语言版】一元多项式求导
c语言·数据结构·算法
Maybyy3 小时前
力扣61.旋转链表
算法·leetcode·链表
兴趣使然_5 小时前
【笔记】使用 html 创建网址快捷方式
笔记·html·js