强化学习之REINFORCE策略梯度算法——以CartPole环境为例

整体代码如下:

python 复制代码
import gym
import numpy as np
import torch
import matplotlib.pyplot as plt
from tqdm import tqdm
# Run on the first CUDA device when one is available, otherwise on the CPU,
# and report the choice.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)
def moving_average(a, window_size):
    """Return the centred moving average of a 1-D sequence.

    Each output element is the mean of a window centred on the matching
    input element. Away from the edges the window spans ``window_size``
    samples; near either edge it shrinks symmetrically (1, 3, 5, ...
    samples) so the output always has the same length as the input.

    Args:
        a: 1-D sequence (list or array) of numbers.
        window_size: Full window width; must be a positive odd integer so
            the window can be centred on a sample.

    Returns:
        A float ``numpy.ndarray`` with ``len(a)`` elements.

    Raises:
        ValueError: If ``window_size`` is not a positive odd integer.
    """
    if window_size < 1 or window_size % 2 == 0:
        raise ValueError(
            "window_size must be a positive odd integer, got %r" % (window_size,)
        )

    values = np.asarray(a, dtype=float)
    count = values.size
    if count == 0:
        return values

    half = (window_size - 1) // 2
    centers = np.arange(count)
    # Half-width usable at each position: limited by the distance to the
    # nearer edge and capped at the requested half window. This also covers
    # inputs shorter than window_size.
    reach = np.minimum(np.minimum(centers, count - 1 - centers), half)

    # prefix[i] holds the sum of the first i samples, so any window sum is a
    # difference of two prefix entries.
    prefix = np.concatenate(([0.0], np.cumsum(values)))
    window_sums = prefix[centers + reach + 1] - prefix[centers - reach]
    return window_sums / (2 * reach + 1)
class PolicyNetwork(torch.nn.Module):
    """Two-layer perceptron that maps a batch of states to action probabilities.

    Args:
        statedim: Number of input features per state.
        hiddendim: Width of the hidden layer.
        actiondim: Number of discrete actions (size of the output).
    """

    def __init__(self, statedim, hiddendim, actiondim):
        super().__init__()
        # Layers are created in this order and under these names so that
        # initialisation and state_dict keys stay the same.
        self.cf1 = torch.nn.Linear(statedim, hiddendim)
        self.cf2 = torch.nn.Linear(hiddendim, actiondim)

    def forward(self, x):
        """Return softmax action probabilities for a 2-D input batch ``x``."""
        hidden = torch.relu(self.cf1(x))
        logits = self.cf2(hidden)
        # Softmax is taken over dim 1, so a leading batch dimension is required.
        return torch.softmax(logits, dim=1)
class REINFORCE:
    """REINFORCE (Monte-Carlo policy gradient) agent for discrete actions.

    Args:
        statedim: Number of features in one state.
        hiddendim: Hidden-layer width of the policy network.
        actiondim: Number of discrete actions.
        learningrate: Learning rate passed to the Adam optimizer.
        gamma: Discount factor used when accumulating returns.
        device: ``torch.device`` on which the network and tensors live.
    """

    def __init__(self, statedim, hiddendim, actiondim, learningrate, gamma, device):
        self.policynet = PolicyNetwork(statedim, hiddendim, actiondim).to(device)
        self.gamma = gamma
        self.device = device
        self.optimizer = torch.optim.Adam(self.policynet.parameters(), lr=learningrate)

    def takeaction(self, state):
        """Sample an action index from the current policy for one state.

        Args:
            state: A single state as a 1-D array or sequence of numbers.

        Returns:
            The sampled action as a Python ``int``.
        """
        # as_tensor converts an ndarray directly; wrapping it in a list first
        # (torch.tensor([state])) takes PyTorch's slow list-of-arrays path.
        batch = torch.as_tensor(state, dtype=torch.float, device=self.device).unsqueeze(0)
        # Sampling needs no gradients, so skip building the autograd graph.
        with torch.no_grad():
            probs = self.policynet(batch)
        actiondist = torch.distributions.Categorical(probs)
        return actiondist.sample().item()

    def update(self, transitiondist):
        """Run one policy-gradient step on a completed episode.

        The loss is ``-sum_t log pi(a_t | s_t) * G_t`` where ``G_t`` is the
        discounted return from step ``t`` onward. All steps are evaluated in
        one batched forward/backward pass, which gives the same summed
        gradient as back-propagating each step separately.

        Args:
            transitiondist: Dict with equally long lists under the keys
                ``'states'``, ``'actions'`` and ``'rewards'``.
        """
        rewardlist = transitiondist['rewards']
        steps = len(rewardlist)
        statelist = transitiondist['states'][:steps]
        actionlist = transitiondist['actions'][:steps]

        self.optimizer.zero_grad()
        if steps == 0:
            # Nothing to learn from an empty trajectory.
            return

        # Accumulate discounted returns backwards from the final step.
        returns = []
        G = 0
        for reward in reversed(rewardlist):
            G = self.gamma * G + reward
            returns.append(G)
        returns.reverse()

        states = torch.stack(
            [torch.as_tensor(s, dtype=torch.float) for s in statelist]
        ).to(self.device)
        actions = torch.as_tensor(
            actionlist, dtype=torch.int64, device=self.device
        ).view(-1, 1)
        returns = torch.as_tensor(returns, dtype=torch.float, device=self.device)

        # gather(1, actions) picks, per row, the probability of the action taken.
        logprobs = torch.log(self.policynet(states).gather(1, actions)).squeeze(1)
        loss = -(logprobs * returns).sum()
        loss.backward()
        self.optimizer.step()

# ---- Training configuration ----
learningrate = 4e-3   # learning rate handed to the agent's optimizer
episodesnum = 1000    # total number of training episodes
hiddendim = 128       # hidden-layer width of the policy network
gamma = 0.99          # discount factor
pbarnum = 10          # the run is split into this many progress bars
printreturnnum = 10   # refresh the displayed mean return every N episodes
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

env = gym.make('CartPole-v1')
# Seed the environment and PyTorch once, up front, for reproducibility.
env.reset(seed=880)
torch.manual_seed(880)
statedim = env.observation_space.shape[0]
actiondim = env.action_space.n
agent = REINFORCE(statedim=statedim, hiddendim=hiddendim, actiondim=actiondim,
                  learningrate=learningrate, gamma=gamma, device=device)

returnlist = []  # undiscounted return of every episode, in order
episodesperbar = episodesnum // pbarnum
for k in range(pbarnum):
    with tqdm(total=episodesperbar, desc='Iteration %d' % k) as pbar:
        for episode in range(episodesperbar):
            g = 0
            transitiondist = {'states': [], 'actions': [], 'nextstates': [], 'rewards': []}
            # Do not pass the seed again here: re-seeding on every reset
            # would start every episode from the identical initial state.
            state, _ = env.reset()
            done = False
            while not done:
                action = agent.takeaction(state)
                nextstate, reward, done, truncated, _ = env.step(action)
                # Treat a time-limit truncation as the end of the episode too.
                done = done or truncated

                transitiondist['states'].append(state)
                transitiondist['actions'].append(action)
                transitiondist['nextstates'].append(nextstate)
                transitiondist['rewards'].append(reward)
                state = nextstate
                g = g + reward
            returnlist.append(g)
            agent.update(transitiondist)
            if (episode + 1) % printreturnnum == 0:
                # Global episode index: episodes in earlier bars plus the
                # position within the current bar.
                pbar.set_postfix({
                    'Episode': '%d' % (episodesperbar * k + episode + 1),
                    'Return': '%.3f' % np.mean(returnlist[-printreturnnum:]),
                })
            pbar.update(1)

# ---- Plot raw per-episode returns ----
episodelist = list(range(len(returnlist)))
plt.plot(episodelist, returnlist)
plt.xlabel('Episodes')
plt.ylabel('Returns')
plt.title('REINFORCE on {}'.format(env.spec.name))
plt.show()

# ---- Plot returns smoothed with a 9-episode moving average ----
mvreturn = moving_average(returnlist, 9)
plt.plot(episodelist, mvreturn)
plt.xlabel('Episodes')
plt.ylabel('Returns')
plt.title('REINFORCE on {}'.format(env.spec.name))
plt.show()

# Release the environment's resources now that training and plotting are done.
env.close()

                
            

效果:

相关推荐
陈鋆几秒前
智慧城市初探与解决方案
人工智能·智慧城市
qdprobot1 分钟前
ESP32桌面天气摆件加文心一言AI大模型对话Mixly图形化编程STEAM创客教育
网络·人工智能·百度·文心一言·arduino
QQ39575332372 分钟前
金融量化交易模型的突破与前景分析
人工智能·金融
QQ39575332372 分钟前
金融量化交易:技术突破与模型优化
人工智能·金融
-一杯为品-4 分钟前
【51单片机】程序实验5&6.独立按键-矩阵按键
c语言·笔记·学习·51单片机·硬件工程
The_Ticker15 分钟前
CFD平台如何接入实时行情源
java·大数据·数据库·人工智能·算法·区块链·软件工程
Elastic 中国社区官方博客21 分钟前
Elasticsearch 开放推理 API 增加了对 IBM watsonx.ai Slate 嵌入模型的支持
大数据·数据库·人工智能·elasticsearch·搜索引擎·ai·全文检索
jwolf221 分钟前
摸一下elasticsearch8的AI能力:语义搜索/vector向量搜索案例
人工智能·搜索引擎
有Li30 分钟前
跨视角差异-依赖网络用于体积医学图像分割|文献速递-生成式模型与transformer在医学影像中的应用
人工智能·计算机视觉
傻啦嘿哟33 分钟前
如何使用 Python 开发一个简单的文本数据转换为 Excel 工具
开发语言·python·excel