DQN强化学习

这是我自己写的第一个强化学习环境,目前还有不少纰漏,正在逐步改进中。

希望能在两周内施工完成。

Python 代码如下:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import deque
import matplotlib.pyplot as plt
import time
from tqdm import tqdm
import pandas as pd


def moving_average(data, window_size):
    """
    Smooth a sequence with a simple (unweighted) sliding-window mean.

    :param data: sequence of numbers to smooth
    :param window_size: number of consecutive elements averaged per output value
    :return: list with ``len(data) - window_size + 1`` window means
    :raises ValueError: if ``window_size`` is not positive or exceeds ``len(data)``
    """
    if window_size <= 0:
        raise ValueError("Window size should be greater than 0.")

    n = len(data)
    if window_size > n:
        raise ValueError("Window size should not be greater than the length of data.")

    # prefix[k] is the sum of the first k elements, so every window sum is
    # the difference of two prefix entries.
    prefix = [0]
    for value in data:
        prefix.append(prefix[-1] + value)

    return [
        (prefix[end] - prefix[end - window_size]) / window_size
        for end in range(window_size, n + 1)
    ]


def plot_data(data, title="Data Plot", x_label="X-axis", y_label="Y-axis"):
    """
    Draw ``data`` as a single line chart and display it.

    :param data: list of ints or floats to plot (values are plotted against their index)
    :param title: title shown above the plot
    :param x_label: label for the x-axis
    :param y_label: label for the y-axis
    :return: None
    """
    figure = plt.figure(figsize=(10, 5))
    axes = figure.add_subplot(1, 1, 1)

    axes.plot(data)
    axes.set_title(title)
    axes.set_xlabel(x_label)
    axes.set_ylabel(y_label)
    # Thin dashed grid on both major and minor ticks.
    axes.grid(True, which='both', linestyle='--', linewidth=0.5)

    # Fit labels and title inside the figure before showing it.
    figure.tight_layout()
    plt.show()


class TransportMatchingEnv:
    """
    Toy environment for matching goods (shipments) with truck drivers.

    One action jointly selects a good, a driver, a price and a delivery time and
    is encoded as a single integer in ``[0, action_dim)``. The observation is a
    flat vector made of the negotiation matrix followed by every randomly drawn
    scenario attribute (see ``_build_state``).
    """

    def __init__(self, num_drivers=5, num_goods=5, max_price=10, max_time=5):
        """
        :param num_drivers: number of trucks / drivers
        :param num_goods: number of goods
        :param max_price: exclusive upper bound of the price component of an action
        :param max_time: exclusive upper bound of the time component of an action
        """
        self.num_drivers = num_drivers
        self.num_goods = num_goods
        self.max_price = max_price
        self.max_time = max_time
        # Size of the flat action space: (good, driver, price, time).
        self.action_dim = self.num_drivers * self.num_goods * self.max_price * self.max_time
        # Negotiation matrix. TODO: the state still needs many more features.
        self.current_negotiation = None
        # reset() draws every scenario attribute and returns the state built
        # from exactly those attributes, so the stored state and the
        # environment always agree (and every per-driver attribute is an array).
        self.combined_state = self.reset()

    def _build_state(self):
        """Flatten the negotiation matrix and all scenario attributes into one 1-D array."""
        return np.concatenate((
            self.current_negotiation.flatten(),
            self.distance_matrix.flatten(),
            self.goods_time_preferences,
            self.goods_expected_prices,
            self.driver_availabilities,
            self.goods_special_requirements,
            self.driver_special_capabilities,
        ))

    def decode_action(self, encoded_action):
        """
        Decode a flat action index into its human-readable components.

        :param encoded_action: integer in ``[0, action_dim)``
        :return: tuple ``(driver_index, good_index, price, time)``
        :raises ValueError: if ``encoded_action`` is negative or not below ``action_dim``
        """
        total_actions_for_price_time = self.max_price * self.max_time
        total_actions_per_good = self.num_drivers * total_actions_for_price_time
        total_actions = self.num_goods * total_actions_per_good

        # Negative values would silently decode to negative indices, so reject
        # them together with values past the end of the action space.
        if not 0 <= encoded_action < total_actions:
            raise ValueError("Encoded action is out of bounds!")

        good_index, residual = divmod(encoded_action, total_actions_per_good)
        driver_index, residual = divmod(residual, total_actions_for_price_time)
        price, delivery_time = divmod(residual, self.max_time)

        return driver_index, good_index, price, delivery_time

    def compute_reward(self, driver_index, good_index, price, time):
        """
        Compute the reward of proposing ``price`` and ``time`` for one good/driver pair.

        The reward is the sum of five terms:
        minus the good-to-driver distance, minus twice the absolute gap to the
        shipper's preferred time, minus the absolute gap to the shipper's expected
        price, +10 if the driver is available, and -20 if the good has a special
        requirement the driver cannot handle.

        :param driver_index: index of the driver
        :param good_index: index of the good
        :param price: proposed price
        :param time: proposed delivery time
        :return: total reward
        """
        # 1. Distance: longer distances are penalised.
        distance_factor = -self.distance_matrix[good_index][driver_index]

        # 2. Time: penalise the gap to the shipper's preferred time.
        delivery_time_preference = self.goods_time_preferences[good_index]
        time_penalty = -abs(delivery_time_preference - time) * 2

        # 3. Price: prefer prices close to the shipper's expectation.
        expected_price = self.goods_expected_prices[good_index]
        price_factor = -abs(price - expected_price)

        # 4. Availability: bonus when the driver is free (1) rather than busy (0).
        availability_factor = self.driver_availabilities[driver_index] * 10

        # 5. Special requirements: large penalty when the driver lacks the
        #    capability the good requires.
        good_requirement = self.goods_special_requirements[good_index]
        driver_capability = self.driver_special_capabilities[driver_index]
        requirement_factor = 0
        if good_requirement > 0 and driver_capability < good_requirement:
            requirement_factor = -20

        return distance_factor + time_penalty + price_factor + availability_factor + requirement_factor

    def reset(self):
        """
        Reset the environment: clear the negotiation matrix and redraw the scenario.

        All draws use NumPy's global RNG (seed it with ``np.random.seed`` for
        reproducible scenarios). Python's global ``random`` module is deliberately
        left untouched, so resetting the environment does not rewind random
        streams used elsewhere in the program.

        :return: the initial flat state vector
        """
        self.current_negotiation = np.zeros((self.num_goods, self.num_drivers))

        # Distance between every good and every driver.
        self.distance_matrix = np.random.randint(0, 100, (self.num_goods, self.num_drivers))
        # Delivery time each shipper would like.
        self.goods_time_preferences = np.random.randint(0, self.max_time, self.num_goods)
        # Price each shipper expects.
        self.goods_expected_prices = np.random.randint(0, self.max_price, self.num_goods)
        # Whether each driver is free (1) or busy (0).
        self.driver_availabilities = np.random.choice([0, 1], self.num_drivers)
        # Whether each good has a special requirement.
        self.goods_special_requirements = np.random.choice([0, 1], self.num_goods)
        # Whether each driver can carry goods with a special requirement.
        self.driver_special_capabilities = np.random.choice([0, 1], self.num_drivers)

        return self._build_state()

    def driver_satisfaction(self, fee_received, expected_fee, distance_travelled, max_distance, wait_time,
                            max_wait_time,
                            goods_condition):
        """
        Satisfaction score from the driver's point of view.

        :param fee_received: fee actually received
        :param expected_fee: fee the driver expected
        :param distance_travelled: distance driven
        :param max_distance: maximum distance
        :param wait_time: time spent waiting
        :param max_wait_time: maximum waiting time
        :param goods_condition: condition of the goods; only ``'good'`` earns points
        :return: sum of the price, distance, wait and goods-condition scores
        :raises ZeroDivisionError: for plain Python numbers when ``expected_fee``,
            ``max_distance`` or ``max_wait_time`` is 0
        """
        # Price score (weight 40).
        price_satisfaction = (fee_received / expected_fee) * 40
        # Distance score (weight 30): shorter trips score higher.
        distance_satisfaction = ((max_distance - distance_travelled) / max_distance) * 30
        # Waiting score (weight 20): shorter waits score higher.
        wait_satisfaction = ((max_wait_time - wait_time) / max_wait_time) * 20
        # Goods-condition score (weight 10).
        goods_satisfaction = 10 if goods_condition == 'good' else 0
        return price_satisfaction + distance_satisfaction + wait_satisfaction + goods_satisfaction

    def shipper_satisfaction(self, fee_paid, expected_fee, delivery_time, expected_delivery_time, goods_condition,
                             driver_service_quality):
        """
        Satisfaction score from the shipper's point of view.

        :param fee_paid: fee actually paid
        :param expected_fee: fee the shipper expected
        :param delivery_time: actual transport time
        :param expected_delivery_time: transport time the shipper expected
        :param goods_condition: condition of the goods; only ``'good'`` earns points
        :param driver_service_quality: service quality of the driver, scaled by 20 / 100
        :return: sum of the price, time, goods-condition and service scores
        :raises ZeroDivisionError: for plain Python numbers when ``fee_paid`` or
            ``expected_delivery_time`` is 0
        """
        # Price score (weight 30): paying less than expected scores higher.
        price_satisfaction = (expected_fee / fee_paid) * 30
        # Time score (weight 30): faster than expected scores higher.
        time_satisfaction = ((expected_delivery_time - delivery_time) / expected_delivery_time) * 30
        # Goods-condition score (weight 20).
        goods_satisfaction = 20 if goods_condition == 'good' else 0
        # Service score.
        service_satisfaction = driver_service_quality * 20 / 100
        return price_satisfaction + time_satisfaction + goods_satisfaction + service_satisfaction

    def successOrFailure(self):
        """
        Placeholder for the negotiation-outcome decision; currently always returns 1.

        TODO: decide success or failure from the satisfaction of both parties.
        """
        return 1

    def step(self, encoded_action):
        """
        Apply one encoded action.

        TODO: core logic is unfinished -- define when a negotiation succeeds,
        when it fails and triggers an offer / counter-offer round, and when it
        fails outright (based on the satisfaction scores).

        :param encoded_action: flat action index, decoded with ``decode_action``
        :return: tuple ``(state, reward, done, info)``; ``done`` is True once the
            number of marked good/driver pairs equals ``num_goods``; ``info`` is
            an empty dict
        :raises ValueError: if ``encoded_action`` is outside the action space
        """
        driver_index, good_index, price, delivery_time = self.decode_action(encoded_action)

        # Decoded values are always below their bounds, so this guard currently
        # always passes; it is kept as the hook for the future acceptance rule.
        if price <= self.max_price and delivery_time <= self.max_time:
            self.current_negotiation[good_index][driver_index] = 1

        reward = self.compute_reward(driver_index, good_index, price, delivery_time)
        done = np.sum(self.current_negotiation) == self.num_goods
        return self._build_state(), reward, done, {}

    def render(self):
        """Print the current negotiation matrix."""
        print(self.current_negotiation)


# Baseline agent used for smoke-testing the environment
class RandomAgent:
    """Agent that ignores the state and samples an action index with ``np.random.choice``."""

    def __init__(self, action_dim):
        """
        :param action_dim: number of discrete actions; actions are ``0 .. action_dim - 1``
        """
        self.action_dim = action_dim

    def act(self):
        """
        :return: a randomly sampled action index in ``[0, action_dim)``
        """
        sampled_action = np.random.choice(self.action_dim)
        return sampled_action


class DQN(nn.Module):
    """Fully connected Q-network: two hidden ReLU layers of 128 units, one output per action."""

    def __init__(self, input_dim, output_dim):
        """
        :param input_dim: size of the flat state vector
        :param output_dim: number of discrete actions (one Q-value each)
        """
        super().__init__()
        hidden_units = 128
        layers = [
            nn.Linear(input_dim, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, hidden_units),
            nn.ReLU(),
            nn.Linear(hidden_units, output_dim),
        ]
        self.fc = nn.Sequential(*layers)

    def forward(self, x):
        """
        :param x: input tensor whose last dimension is ``input_dim``
        :return: tensor of Q-values whose last dimension is ``output_dim``
        """
        q_values = self.fc(x)
        return q_values


class DQNAgent:
    """
    Epsilon-greedy DQN agent with a replay buffer and a separate target network.
    """

    def __init__(self, input_dim, action_dim, gamma=0.99, epsilon=0.99, lr=0.001,
                 device=None, memory_size=2000):
        """
        :param input_dim: size of the flat state vector
        :param action_dim: number of discrete actions
        :param gamma: discount factor used in the TD target
        :param epsilon: initial probability of taking a random action
        :param lr: Adam learning rate
        :param device: torch device (or device string) for the networks; when
            None, CUDA is used if available, otherwise the CPU
        :param memory_size: maximum number of transitions kept in the replay buffer
        """
        self.input_dim = input_dim
        self.action_dim = action_dim
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        # Resolve the device here instead of relying on a module-level global,
        # so the class also works when imported from another module.
        if device is None:
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.device = torch.device(device)
        self.network = DQN(input_dim, action_dim).float().to(self.device)
        self.target_network = DQN(input_dim, action_dim).float().to(self.device)
        # Start the target network as an exact copy of the online network.
        self.target_network.load_state_dict(self.network.state_dict())
        self.optimizer = optim.Adam(self.network.parameters(), lr=self.lr)
        self.loss_fn = nn.MSELoss()
        self.memory = deque(maxlen=memory_size)

    def _to_tensor(self, values, dtype):
        """Stack a sequence of scalars/arrays with NumPy and move it to ``self.device``."""
        # Going through one contiguous ndarray avoids the slow element-by-element
        # path torch takes for a list/tuple of ndarrays.
        return torch.as_tensor(np.array(values), dtype=dtype, device=self.device)

    def act(self, state):
        """
        Choose an action epsilon-greedily.

        :param state: flat state vector
        :return: greedy action of the online network with probability
            ``1 - epsilon``, otherwise a random action index
        """
        if np.random.random() > self.epsilon:
            state_batch = self._to_tensor(state, torch.float32).unsqueeze(0)
            with torch.no_grad():
                return self.network(state_batch).argmax().item()
        return np.random.choice(self.action_dim)

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer (oldest entries are dropped when full)."""
        self.memory.append((state, action, reward, next_state, done))

    def train(self, batch_size=64):
        """
        Run one gradient step on a random minibatch from the replay buffer.

        Does nothing while the buffer holds fewer than ``batch_size`` transitions.

        :param batch_size: number of transitions sampled per update
        """
        if len(self.memory) < batch_size:
            return

        batch = random.sample(self.memory, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)

        states = self._to_tensor(states, torch.float32)
        actions = self._to_tensor(actions, torch.int64)
        rewards = self._to_tensor(rewards, torch.float32)
        next_states = self._to_tensor(next_states, torch.float32)
        dones = self._to_tensor(dones, torch.float32)

        # Q(s, a) of the actions that were actually taken.
        current_values = self.network(states).gather(1, actions.unsqueeze(-1)).squeeze(-1)
        # Bootstrap from the target network; terminal transitions get no bootstrap term.
        next_values = self.target_network(next_states).max(1)[0].detach()
        target_values = rewards + self.gamma * next_values * (1 - dones)

        loss = self.loss_fn(current_values, target_values)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def update_target_network(self):
        """Copy the online network's weights into the target network."""
        self.target_network.load_state_dict(self.network.state_dict())

    def decrease_epsilon(self, decrement_value=0.001, min_epsilon=0.1):
        """
        Linearly decay epsilon.

        :param decrement_value: amount subtracted from epsilon
        :param min_epsilon: lower bound epsilon is clamped to
        """
        self.epsilon = max(self.epsilon - decrement_value, min_epsilon)


if __name__ == '__main__':
    # Train a DQN agent on TransportMatchingEnv, save the per-episode rewards
    # to an Excel file and plot them.
    start = time.time()
    # Kept as a module-level name: code elsewhere in this file may look up
    # `device` as a global.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    rewards = []
    env = TransportMatchingEnv(num_drivers=10, num_goods=10)
    agent = DQNAgent(env.combined_state.flatten().shape[0], env.action_dim)
    # Number of training episodes.
    episodes = 2000
    for episode in tqdm(range(episodes)):
        state = env.reset()
        done = False
        total_reward = 0
        while not done:
            action = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            # bool() accepts both Python bools and NumPy boolean scalars.
            done = bool(done)
            agent.remember(state, action, reward, next_state, done)
            agent.train()
            total_reward += reward
            state = next_state
        agent.decrease_epsilon()
        rewards.append(total_reward)
        # Sync the target network every 50 episodes.
        if episode % 50 == 0:
            agent.update_target_network()

    # Take the timing before saving/plotting: plt.show() blocks until the
    # window is closed, which would otherwise be counted as run time.
    elapsed = time.time() - start

    # Save the per-episode rewards as an Excel file.
    df = pd.DataFrame(data=rewards)
    df.to_excel('sample.xlsx', index=True)
    plot_data(moving_average(data=rewards, window_size=1), title='reward', x_label='epoch', y_label='reward')

    print(f'device: {device}')
    print(f'time: {elapsed}')
相关推荐
Echo_Lee04 分钟前
C#与Python脚本使用共享内存通信
开发语言·python·c#
python之行14 分钟前
python 环境问题
开发语言·python
hakesashou30 分钟前
python怎么写csv文件
开发语言·python
欧阳枫落35 分钟前
pip 换源
开发语言·python·pip
学步_技术1 小时前
Python编码系列—Python组合模式:构建灵活的对象组合
开发语言·python·组合模式
ac-er88882 小时前
在Flask中处理后台任务
后端·python·flask
ac-er88882 小时前
Flask中的钩子函数
后端·python·flask
Book_熬夜!2 小时前
Python基础(六)——PyEcharts数据可视化初级版
开发语言·python·信息可视化·echarts·数据可视化
我的运维人生2 小时前
利用Python与Ansible实现高效网络配置管理
网络·python·ansible·运维开发·技术共享