DQN Reinforcement Learning

This is the first reinforcement learning environment I've written myself. It still has plenty of rough edges, and I'm improving it step by step.

I hope to have it finished within two weeks.

python
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque
import matplotlib.pyplot as plt
import time
from tqdm import tqdm
import pandas as pd


def moving_average(data, window_size):
    """
    Moving-average smoothing.
    :param data: list of numbers to smooth
    :param window_size: size of the sliding window
    :return: list of window averages (length len(data) - window_size + 1)
    """
    if window_size <= 0:
        raise ValueError("Window size should be greater than 0.")

    if window_size > len(data):
        raise ValueError("Window size should not be greater than the length of data.")

    # Cumulative sum of data elements
    cumsum = [0]
    for i, x in enumerate(data):
        cumsum.append(cumsum[i] + x)

    # Compute moving averages
    ma_values = []
    for i in range(len(data) - window_size + 1):
        average = (cumsum[i + window_size] - cumsum[i]) / window_size
        ma_values.append(average)

    return ma_values
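
# A quick illustration (not part of the original script): with window_size=2,
# moving_average([1, 2, 3, 4, 5], 2) returns [1.5, 2.5, 3.5, 4.5] -- each output
# is the mean of two consecutive inputs.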


def plot_data(data, title="Data Plot", x_label="X-axis", y_label="Y-axis"):
    """
    画图
    :param data:
    :param title:
    :param x_label:
    :param y_label:
    :return:

    Plots a simple line graph based on the provided data.

    Parameters:
    - data (list): A list of integers or floats to be plotted.
    - title (str): The title of the plot.
    - x_label (str): The label for the x-axis.
    - y_label (str): The label for the y-axis.
    """
    plt.figure(figsize=(10, 5))  # Set the figure size
    plt.plot(data)  # Plot the data
    plt.title(title)  # Set the title
    plt.xlabel(x_label)  # Set x-axis label
    plt.ylabel(y_label)  # Set y-axis label
    plt.grid(True, which='both', linestyle='--', linewidth=0.5)  # Add a grid
    plt.tight_layout()  # Adjust subplot parameters to give specified padding
    plt.show()


class TransportMatchingEnv:

    def __init__(self, num_drivers=5, num_goods=5, max_price=10, max_time=5):
        """

        :param num_drivers: 货车数量
        :param num_goods: 货物数量
        :param max_price: 最大价格
        :param max_time: 最大时间
        """
        self.num_drivers = num_drivers
        self.num_goods = num_goods
        self.max_price = max_price
        self.max_time = max_time
        # 动作空间
        self.action_dim = self.num_drivers * self.num_goods * self.max_price * self.max_time
        # 当前协商状态 TODO: 状态,需要加很多东西
        self.current_negotiation = None
        # 状态
        self.combined_state = self.reset()
        # 距离矩阵,表示货与车之间的距离
        self.distance_matrix = np.random.randint(0, 100, (self.num_goods, self.num_drivers))
        # 货主期望抵达时间
        self.goods_time_preferences = np.random.randint(0, self.max_time, self.num_goods)
        # 货主期望价格
        self.goods_expected_prices = np.random.randint(0, self.max_price, self.num_goods)
        # 车主是否空闲
        self.driver_availabilities = np.random.choice([0, 1], self.num_drivers)
        # 货物是否有特殊需求
        self.goods_special_requirements = np.random.choice([0, 1], self.num_goods)
        # 车主是否有接受特殊货物的能力
        self.driver_special_capabilities = np.random.choice([0, 1])

    def decode_action(self, encoded_action):
        """
        将action解码为人类可以读懂的形式
        :param encoded_action:
        :return:
        """
        total_actions_for_price_time = self.max_price * self.max_time
        total_actions_per_good = self.num_drivers * total_actions_for_price_time
        total_actions = self.num_goods * total_actions_per_good

        if encoded_action >= total_actions:
            raise ValueError("Encoded action is out of bounds!")

        good_index = encoded_action // total_actions_per_good
        residual = encoded_action % total_actions_per_good

        driver_index = residual // total_actions_for_price_time
        residual = residual % total_actions_for_price_time

        price = residual // self.max_time
        time = residual % self.max_time

        return driver_index, good_index, price, time
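
    # Illustration (not part of the original code): the inverse of this decoding would be
    #   encoded_action = ((good_index * num_drivers + driver_index) * max_price + price) * max_time + time
    # so every (good, driver, price, time) combination maps to exactly one flat index.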

    def compute_reward(self, driver_index, good_index, price, time):
        """
        计算reward,
        :param driver_index:
        :param good_index:
        :param price:
        :param time:
        :return:
        """
        # 1. Distance factor (assuming you have a distance matrix or function to compute distance)
        # distance_matrix = ... # a matrix containing distances between goods and drivers
        distance = self.distance_matrix[good_index][driver_index]
        distance_factor = -distance  # negative reward for longer distances

        # 2. Time factor
        delivery_time_preference = self.goods_time_preferences[good_index]  # assuming you have this data
        time_penalty = -abs(delivery_time_preference - time) * 2  # penalize based on how far from preferred time

        # 3. Price factor
        expected_price = self.goods_expected_prices[good_index]  # assuming you have this data
        price_difference = price - expected_price
        price_factor = -abs(price_difference)  # prefer prices close to expected

        # 4. Availability of the driver (assuming you have this data)
        driver_availability = self.driver_availabilities[driver_index]  # e.g., 0 for not available, 1 for available
        availability_factor = driver_availability * 10  # give a bonus for available drivers

        # 5. Special requirements (assuming you have this data)
        # e.g., 0 for no requirement, 1 for special storage
        good_requirement = self.goods_special_requirements[good_index]
        # e.g., 0 for no capability, 1 for special storage
        driver_capability = self.driver_special_capabilities[driver_index]
        requirement_factor = 0
        if good_requirement > 0 and driver_capability < good_requirement:
            requirement_factor = -20  # huge penalty if driver can't meet the special requirement

        total_reward = distance_factor + time_penalty + price_factor + availability_factor + requirement_factor
        return total_reward
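
    # A worked example with illustrative numbers (not from the script): distance 40 -> -40,
    # preferred time 2 vs. offered time 4 -> -4, expected price 5 vs. offered price 3 -> -2,
    # driver available -> +10, special requirement met -> 0, for a total reward of -36.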

    def reset(self):
        """
        重置环境
        :return:
        """
        random.seed(0)
        self.current_negotiation = np.zeros((self.num_goods, self.num_drivers))

        # Refresh all the parameters every time you reset the environment
        self.distance_matrix = np.random.randint(0, 100, (self.num_goods, self.num_drivers))
        self.goods_time_preferences = np.random.randint(0, self.max_time, self.num_goods)
        self.goods_expected_prices = np.random.randint(0, self.max_price, self.num_goods)
        self.driver_availabilities = np.random.choice([0, 1], self.num_drivers)
        self.goods_special_requirements = np.random.choice([0, 1], self.num_goods)
        self.driver_special_capabilities = np.random.choice([0, 1], self.num_drivers)
        # print(f'self.distance_matrix:{self.distance_matrix}')
        # print(f'goods_time_preferences:{self.goods_time_preferences}')
        # print(f'goods_expected_prices:{self.goods_expected_prices}')
        # print(f'driver_availabilities:{self.driver_availabilities}')
        # print(f'goods_special_requirements:{self.goods_special_requirements}')
        # print(f'driver_special_capabilities:{self.driver_special_capabilities}')

        # self.distance_matrix = np.array([[67, 53, 24, 68, 92, 64, 85, 6, 77, 43],
        #                                  [40, 78, 48, 31, 14, 6, 7, 37, 26, 67],
        #                                  [96, 43, 73, 2, 71, 74, 37, 87, 17, 64],
        #                                  [28, 25, 84, 62, 51, 28, 32, 58, 98, 72],
        #                                  [13, 52, 38, 44, 11, 49, 11, 56, 80, 25],
        #                                  [3, 68, 25, 65, 50, 64, 2, 22, 40, 46],
        #                                  [98, 1, 9, 45, 80, 51, 86, 65, 22, 50],
        #                                  [98, 6, 73, 22, 12, 58, 84, 13, 38, 79],
        #                                  [78, 48, 52, 21, 36, 92, 71, 1, 22, 33],
        #                                  [43, 76, 74, 89, 19, 51, 34, 63, 11, 99]])
        # self.goods_time_preferences = [1, 1, 3, 4, 1, 1, 1, 3, 0, 4]
        # self.goods_expected_prices = [3, 4, 7, 1, 2, 2, 7, 5, 8, 2]
        # self.driver_availabilities = [1, 1, 0, 1, 0, 0, 1, 1, 0, 0]
        # self.goods_special_requirements = [0, 1, 0, 0, 1, 1, 1, 1, 0, 0]
        # self.driver_special_capabilities = [1, 1, 0, 0, 0, 1, 0, 0, 1, 1]
        # Combine everything into a single flattened state
        combined_state = np.concatenate((
            self.current_negotiation.flatten(),
            self.distance_matrix.flatten(),
            self.goods_time_preferences,
            self.goods_expected_prices,
            self.driver_availabilities,
            self.goods_special_requirements,
            self.driver_special_capabilities
        ))
        # print(f'combined_state.shape:{combined_state.shape}')
        return combined_state
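
    # For reference (derived from the concatenation above): the combined state has
    # 2 * num_goods * num_drivers + 3 * num_goods + 2 * num_drivers entries,
    # e.g. 250 for the 10-goods / 10-drivers setup used in __main__.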

    def driver_satisfaction(self, fee_received, expected_fee, distance_travelled, max_distance,
                            wait_time, max_wait_time, goods_condition):
        """
        Satisfaction score for the driver side.
        :param fee_received: fee actually received
        :param expected_fee: expected fee
        :param distance_travelled: distance travelled
        :param max_distance: maximum distance
        :param wait_time: waiting time
        :param max_wait_time: maximum waiting time
        :param goods_condition: condition of the goods
        :return: total satisfaction score (0-100)
        """
        # Price satisfaction (max weight 40)
        price_satisfaction = (fee_received / expected_fee) * 40
        # Distance satisfaction (max weight 30)
        distance_satisfaction = ((max_distance - distance_travelled) / max_distance) * 30
        # Waiting-time satisfaction (max weight 20)
        wait_satisfaction = ((max_wait_time - wait_time) / max_wait_time) * 20
        # Goods-condition satisfaction (max weight 10)
        goods_satisfaction = 10 if goods_condition == 'good' else 0
        # Total satisfaction
        total_satisfaction = price_satisfaction + distance_satisfaction + wait_satisfaction + goods_satisfaction
        return total_satisfaction

    def shipper_satisfaction(self, fee_paid, expected_fee, delivery_time, expected_delivery_time, goods_condition,
                             driver_service_quality):
        """
        为货主设计的满意度计算
        :param fee_paid: 已付费用
        :param expected_fee: 预期费用
        :param delivery_time: 运输时间
        :param expected_delivery_time: 期望运输时间
        :param goods_condition: 货物状况
        :param driver_service_quality: 司机服务质量
        :return:
        """
        # 价格满意度
        price_satisfaction = (expected_fee / fee_paid) * 30  # assuming max weightage of 30 for price
        # 时间满意度
        time_satisfaction = ((
                                     expected_delivery_time - delivery_time) / expected_delivery_time) * 30  # assuming max weightage of 30 for delivery time
        # 货物状况满意度
        goods_satisfaction = 20 if goods_condition == 'good' else 0
        # 服务满意度
        service_satisfaction = driver_service_quality * 20 / 100
        # 总满意度
        total_satisfaction = price_satisfaction + time_satisfaction + goods_satisfaction + service_satisfaction
        return total_satisfaction

    def successOrFailure(self):
        # TODO: decide whether the negotiation succeeded based on both sides' satisfaction.
        # Currently a stub that always reports success (1).
        return 1

    def step(self, encoded_action):
        """ TODO
        核心逻辑部分
        首先,明确何时协商成功,何时协商失败
        :param encoded_action: 待被decode的action
        :return:
        """
        driver_index, good_index, price, time = self.decode_action(encoded_action)
        # print(f'driver_index, good_index, price, time:{driver_index, good_index, price, time}')
        # if self.current_negotiation[good_index][driver_index] == 1 or price >= self.max_price and time >= self.max_time:
        #     # 如果已经被匹配
        #     reward = 0
        #     state = self.current_negotiation.flatten()
        #     done = np.sum(self.current_negotiation) == self.num_goods
        #     return state, reward, done, {}
        # self.shipper_satisfaction()
        # if self.successOrFailure() == 1:
        #     # 如果协商成功
        #     pass
        # elif self.successOrFailure() == 2:
        #     # 协商失败,进行报价与反报价
        #     pass
        # else:
        #     # 协商失败,直接结束
        #     pass
        if price <= self.max_price and time <= self.max_time:
            self.current_negotiation[good_index][driver_index] = 1

        reward = self.compute_reward(driver_index, good_index, price, time)
        combined_state = np.concatenate((
            self.current_negotiation.flatten(),
            self.distance_matrix.flatten(),
            self.goods_time_preferences,
            self.goods_expected_prices,
            self.driver_availabilities,
            self.goods_special_requirements,
            self.driver_special_capabilities
        ))
        # The episode ends once every good has been matched; cast to a plain Python bool
        done = bool(np.sum(self.current_negotiation) == self.num_goods)
        # print(f'reward, combined_state, done:{reward, combined_state, done}')
        return combined_state, reward, done, {}

    def render(self):
        print(self.current_negotiation)


# Simple random agent for testing
class RandomAgent:
    def __init__(self, action_dim):
        self.action_dim = action_dim

    def act(self):
        return np.random.choice(self.action_dim)


class DQN(nn.Module):
    def __init__(self, input_dim, output_dim):
        # print(f'input_dim,output_dim:{input_dim, output_dim}')
        super(DQN, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, output_dim)
        )

    def forward(self, x):
        # print(f'x.shape:{x.shape}')
        return self.fc(x)


class DQNAgent:
    def __init__(self, input_dim, action_dim, gamma=0.99, epsilon=0.99, lr=0.001):
        self.input_dim = input_dim
        self.action_dim = action_dim
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.network = DQN(input_dim, action_dim).float().to(device)
        self.target_network = DQN(input_dim, action_dim).float().to(device)
        self.target_network.load_state_dict(self.network.state_dict())
        self.optimizer = optim.Adam(self.network.parameters(), lr=self.lr)
        self.memory = deque(maxlen=2000)

    def act(self, state):
        # Epsilon-greedy: with probability epsilon take a random action, otherwise act greedily
        if np.random.random() > self.epsilon:
            state = torch.tensor(np.array([state]), dtype=torch.float32).to(device)
            with torch.no_grad():
                action = self.network(state).argmax().item()
            return action
        else:
            return np.random.choice(self.action_dim)

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def train(self, batch_size=64):
        if len(self.memory) < batch_size:
            return

        batch = random.sample(self.memory, batch_size)
        # print(f'batch:{len(batch)}')
        states, actions, rewards, next_states, dones = zip(*batch)

        # Convert the batch to tensors (stacking into NumPy arrays first avoids the slow list-of-arrays path)
        states = torch.tensor(np.array(states), dtype=torch.float32).to(device)
        actions = torch.tensor(actions, dtype=torch.int64).to(device)
        rewards = torch.tensor(rewards, dtype=torch.float32).to(device)
        next_states = torch.tensor(np.array(next_states), dtype=torch.float32).to(device)
        dones = torch.tensor(dones, dtype=torch.float32).to(device)

        current_values = self.network(states).gather(1, actions.unsqueeze(-1)).squeeze(-1)
        next_values = self.target_network(next_states).max(1)[0].detach()
        target_values = rewards + self.gamma * next_values * (1 - dones)
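        # target_values is the standard DQN (Bellman) target:
        #   r + gamma * max_a' Q_target(s', a'), with (1 - dones) zeroing the bootstrap term
        #   on terminal transitions.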

        loss = nn.MSELoss()(current_values, target_values)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def update_target_network(self):
        self.target_network.load_state_dict(self.network.state_dict())

    def decrease_epsilon(self, decrement_value=0.001, min_epsilon=0.1):
        self.epsilon = max(self.epsilon - decrement_value, min_epsilon)
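
    # With the defaults above (epsilon starts at 0.99, decreases by 0.001 per episode,
    # floor 0.1), exploration reaches its minimum after roughly 890 episodes.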


if __name__ == '__main__':
    start = time.time()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    rewards = []
    env = TransportMatchingEnv(num_drivers=10, num_goods=10)
    agent = DQNAgent(env.combined_state.flatten().shape[0], env.action_dim)
    # agent = DQNAgent(env, env.action_dim)
    # Number of training episodes
    episodes = 2000
    for episode in tqdm(range(episodes)):
        state = env.reset()
        done = False
        episode_reward = 0
        total_reward = 0
        while not done:
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            agent.remember(state, action, reward, next_state, done)
            agent.train()
            episode_reward += reward
            total_reward += reward
            state = next_state
            # print(f'done:{type(done)}')
            # if done is True:
            # print(f'state:{state}')
        agent.decrease_epsilon()
        rewards.append(total_reward)
        if episode % 50 == 0:
            agent.update_target_network()

        # print(f"Episode {episode + 1}/{episodes} - Reward: {episode_reward}")
    # Save the per-episode rewards to a DataFrame
    df = pd.DataFrame(data=rewards)
    # Write the DataFrame to an Excel file (requires openpyxl)
    df.to_excel('sample.xlsx', index=True)
    # window_size=1 means no smoothing; increase it for a smoother reward curve
    plot_data(moving_average(data=rewards, window_size=1), title='reward', x_label='episode', y_label='reward')

    end = time.time()
    print(f'device: {device}')
    print(f'time: {end - start}')