Background
Use a fund's historical net asset values (NAV) to learn the optimal policy for buying, selling, or holding the fund, with the goal of maximizing long-term return. This project is mainly for learning purposes.
Reinforcement Learning
Based on changes in the environment, the agent chooses different actions and receives different rewards, then picks the action with the largest reward as the optimal one. For example: the weather turns cold, and we can choose to wear short sleeves or long sleeves (the actions). Wearing short sleeves feels cold (the reward), so a sensible person wears long sleeves, which feels comfortable (the optimal action).
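As a toy sketch of "pick the action with the largest reward" for the clothing example (the reward numbers are made up purely for illustration, and no learning is involved):
Python
# Hypothetical reward estimates for the two actions in the example above.
estimated_reward = {"short_sleeves": -1.0, "long_sleeves": 1.0}

# Greedy choice: take the action with the highest estimated reward.
best_action = max(estimated_reward, key=estimated_reward.get)
print(best_action)  # long_sleeves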
Fund Trading
RL element | Counterpart in fund trading |
---|---|
Environment | net asset value, current position |
Action | buy or sell (at 50% or 100%), or hold (5 actions in total) |
Reward | profit |
Policy | trading strategy |
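The five discrete actions are encoded as trade ratios; this is the same mapping the environment's `step` function uses in the code below:
Python
# Action index -> trade ratio: negative = fraction of current shares to sell,
# positive = fraction of available cash to buy, 0 = hold.
ACTIONS = {
    0: -1.0,  # sell 100%
    1: -0.5,  # sell 50%
    2:  0.0,  # hold
    3:  0.5,  # buy with 50% of cash
    4:  1.0,  # buy with 100% of cash
}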
Historical fund net values can be fetched from this URL:
fund.eastmoney.com/pingzhongda...
Flow Chart
The model outputs a predicted reward (Q-value) for each action. The actual reward of the chosen action is then computed, and the gap between the predicted and actual values is backpropagated to adjust the model's weights and biases.
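A minimal sketch of that single update step, using standalone toy tensors rather than the full network, just to show the shape of the computation:
Python
import torch
import torch.nn.functional as F

# Q-values the model might predict for the 5 actions; suppose we took action 3.
q_pred = torch.tensor([0.1, -0.2, 0.0, 0.3, 0.05], requires_grad=True)
action, reward, gamma = 3, 0.02, 0.99

q_next_max = torch.tensor(0.25)          # best Q-value of the next state (from the target network)
td_target = reward + gamma * q_next_max  # the "true" value the prediction is pulled towards
loss = F.mse_loss(q_pred[action], td_target)
loss.backward()                          # backpropagation; in the real model this adjusts the weights and biases
The full training loop below does the same thing in batches, with a separate target network providing the next-state Q-value.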

Model
- Historical net values: sequential data with local trends, so they are fed through Conv1d convolutions.
- Current position: a single structured value, so it is concatenated directly into the Linear layers.
Convolution: it automatically learns the key features of the time series.
Suppose you have a 1D convolution kernel [0.2, 0.5, 0.3] (randomly initialized). By learning from the historical net values it might end up with features like the ones below (a concrete sketch follows the table):
Learned kernel (example) | Meaning |
---|---|
[0.1, 0.2, 0.7] | detects a "rapid rise" pattern |
[-0.2, -0.3, 0.5] | detects a "V-shaped rebound" pattern |
[0.33, 0.33, 0.33] | averaging, i.e. a smoothing filter |
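To make the table concrete, here is a small sketch (illustrative numbers, not learned weights) that applies the "rapid rise" kernel from the first row with `F.conv1d`; the rising window produces a noticeably larger response than the same values in reverse:
Python
import torch
import torch.nn.functional as F

kernel = torch.tensor([0.1, 0.2, 0.7]).view(1, 1, 3)      # the "rapid rise" detector from the table
rising = torch.tensor([0.00, 0.01, 0.03]).view(1, 1, 3)   # returns accelerating upwards
falling = torch.tensor([0.03, 0.01, 0.00]).view(1, 1, 3)  # the same values reversed

print(F.conv1d(rising, kernel))   # ~0.023: strong response
print(F.conv1d(falling, kernel))  # ~0.005: weak response
During training the network learns 16 and then 32 such kernels on its own (the Conv1d layers in the model below) instead of us hand-picking them.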
Code
Fund code: 012414 (招商中证白酒指数(LOF)C), date range 2022-2024; the overall trend over this period is downward.

Below is the complete training code:
Python
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import random
import requests
import re
import json
import matplotlib.pyplot as plt
from collections import deque
# ======== 1. Fetch fund net values and convert to log returns =========
def get_log_return_series(fund_code: str, start_year=None, end_year=None) -> list[float]:
    url = f"https://fund.eastmoney.com/pingzhongdata/{fund_code}.js"
    resp = requests.get(url)
    js_text = resp.text
    match = re.search(r'var Data_netWorthTrend = (.*?);', js_text, re.S)
    if not match:
        raise ValueError("Failed to extract net value data")
    net_worth_data = json.loads(match.group(1))
    df = pd.DataFrame(net_worth_data)
    df['date'] = pd.to_datetime(df['x'], unit='ms')
    df.set_index('date', inplace=True)
    df.rename(columns={'y': 'net_value'}, inplace=True)
    df = df[['net_value']]
    if start_year is not None:
        df = df[df.index.year >= start_year]
    if end_year is not None:
        df = df[df.index.year <= end_year]
    prices = df['net_value'].tolist()
    log_returns = [np.log(prices[i+1] / prices[i]) for i in range(len(prices)-1)]
    return log_returns
# ======== 2. DQN model =========
class DQNConvModel(nn.Module):
    def __init__(self, seq_len=21, n_actions=5):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv1d(1, 16, kernel_size=3),
            nn.ReLU(),
            nn.Conv1d(16, 32, kernel_size=3),
            nn.ReLU()
        )
        conv_out_size = 32 * (seq_len - 4)
        self.fc = nn.Sequential(
            nn.Linear(conv_out_size + 1, 64),
            nn.ReLU(),
            nn.Linear(64, n_actions)
        )

    def forward(self, x_seq, total_share):
        x = self.conv(x_seq)
        x = x.view(x.size(0), -1)
        x = torch.cat([x, total_share], dim=1)
        return self.fc(x)
# ======== 3. FIFO trading environment (log-return series) =========
class SimpleTradingEnv:
    def __init__(self, log_return_series, window_size=21, initial_cash=1.0):
        self.return_series = log_return_series
        self.window_size = window_size
        self.initial_cash = initial_cash
        self.reset()

    def reset(self):
        self.t = self.window_size
        self.portfolio = self.initial_cash
        self.holdings = deque()
        self.total_shares = 0.0
        self.history = [self.portfolio]
        self.done = False
        return self._get_state()

    def _get_state(self):
        seq = self.return_series[self.t - self.window_size:self.t]
        return np.array(seq), np.array([self.total_shares], dtype=np.float32)

    def step(self, action_idx):
        actions = {
            0: -1.0,  # sell all
            1: -0.5,
            2: 0.0,   # hold
            3: 0.5,
            4: 1.0    # buy all
        }
        action_ratio = actions[action_idx]
        reward = 0.0
        curr_price = 1.0  # the log-return environment needs no real price; 1.0 is enough
        # Buy
        if action_ratio > 0:
            max_shares = self.portfolio / curr_price
            buy_shares = max_shares * action_ratio
            cost = buy_shares * curr_price
            self.portfolio -= cost
            self.holdings.append({"shares": buy_shares, "day": self.t})
            self.total_shares += buy_shares
        # Sell (FIFO over the held lots)
        elif action_ratio < 0 and self.total_shares > 0:
            sell_ratio = -action_ratio
            sell_shares = self.total_shares * sell_ratio
            remaining = sell_shares
            new_holdings = deque()
            for lot in self.holdings:
                if remaining <= 0:
                    new_holdings.append(lot)
                    continue
                lot_shares = lot["shares"]
                held_days = self.t - lot["day"]
                sell_from_lot = min(lot_shares, remaining)
                revenue = sell_from_lot * curr_price
                if held_days < 7:
                    revenue *= 0.985  # 1.5% fee for lots held less than 7 days
                self.portfolio += revenue
                lot["shares"] -= sell_from_lot
                remaining -= sell_from_lot
                self.total_shares -= sell_from_lot
                if lot["shares"] > 0:
                    new_holdings.append(lot)
            self.holdings = new_holdings
        # Next day's return
        if self.t + 1 < len(self.return_series):
            log_r = self.return_series[self.t + 1]
            reward = self.total_shares * log_r
            self.portfolio *= np.exp(self.total_shares * log_r)
            self.history.append(self.portfolio)
        self.t += 1
        self.done = self.t >= len(self.return_series) - 1
        return self._get_state(), reward, self.done
# ======== 4. Replay Buffer =========
class ReplayBuffer:
    def __init__(self, capacity=10000):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        return zip(*batch)

    def __len__(self):
        return len(self.buffer)
# ======== 5. Training main =========
if __name__ == '__main__':
    fund_code = "012414"
    returns = get_log_return_series(fund_code, start_year=2022, end_year=2024)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    env = SimpleTradingEnv(returns, window_size=21)
    model = DQNConvModel(seq_len=21, n_actions=5).to(device)
    target_model = DQNConvModel(seq_len=21, n_actions=5).to(device)
    target_model.load_state_dict(model.state_dict())
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    buffer = ReplayBuffer()

    BATCH_SIZE = 32
    GAMMA = 0.99
    EPSILON = 0.1
    TARGET_UPDATE_FREQ = 10

    episode_rewards = []
    portfolio_histories = []

    for episode in range(1000):
        state = env.reset()
        total_reward = 0
        while True:
            seq, shares = state
            seq_tensor = torch.tensor(seq, dtype=torch.float32).unsqueeze(0).unsqueeze(0).to(device)
            share_tensor = torch.tensor(shares, dtype=torch.float32).unsqueeze(0).to(device)
            if np.random.rand() < EPSILON:
                action = np.random.randint(5)
            else:
                with torch.no_grad():
                    q_values = model(seq_tensor, share_tensor)
                    action = q_values.argmax().item()
            next_state, reward, done = env.step(action)
            buffer.push(state, action, reward, next_state, done)
            total_reward += reward
            state = next_state

            if len(buffer) >= BATCH_SIZE:
                batch = buffer.sample(BATCH_SIZE)
                states, actions, rewards, next_states, dones = batch
                seq_batch = torch.tensor([s[0] for s in states], dtype=torch.float32).unsqueeze(1).to(device)
                share_batch = torch.tensor([s[1] for s in states], dtype=torch.float32).to(device)
                action_batch = torch.tensor(actions).unsqueeze(1).to(device)
                reward_batch = torch.tensor(rewards, dtype=torch.float32).unsqueeze(1).to(device)
                next_seq_batch = torch.tensor([s[0] for s in next_states], dtype=torch.float32).unsqueeze(1).to(device)
                next_share_batch = torch.tensor([s[1] for s in next_states], dtype=torch.float32).to(device)
                done_batch = torch.tensor(dones, dtype=torch.float32).unsqueeze(1).to(device)

                q_eval = model(seq_batch, share_batch).gather(1, action_batch)
                with torch.no_grad():
                    q_next = target_model(next_seq_batch, next_share_batch).max(1, keepdim=True)[0]
                    q_target = reward_batch + GAMMA * q_next * (1 - done_batch)
                loss = F.mse_loss(q_eval, q_target)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            if done:
                break

        if episode % TARGET_UPDATE_FREQ == 0:
            target_model.load_state_dict(model.state_dict())

        avg_reward = total_reward / (env.t - env.window_size)
        episode_rewards.append(avg_reward)
        portfolio_histories.append(env.history)
        print(f"Ep {episode} | AvgReward: {avg_reward:.4f} | Final: {env.portfolio:.4f}")

    # Save the model
    torch.save(model.state_dict(), "dqn_fifo_logreturn.pt")

    # ====== Plot results ======
    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(episode_rewards)
    plt.title("Average Reward per Episode")
    plt.grid(True)
    plt.subplot(1, 2, 2)
    for h in portfolio_histories[-10:]:
        plt.plot(h, alpha=0.7)
    plt.title("Portfolio History (Last 10 Episodes)")
    plt.grid(True)
    plt.tight_layout()
    plt.show()
Training Results
After 1000 training episodes, the reward curve also trends downward overall.

Prediction
Below is the line chart for 2025; it trends slightly upward.

Prediction code:
Python
import torch
import matplotlib.pyplot as plt

# Reuses get_log_return_series, SimpleTradingEnv and DQNConvModel
# defined in the training script above.
FUND_CODE = "012414"
MODEL_PATH = "dqn_fifo_logreturn.pt"
WINDOW_SIZE = 21  # changed to 21

# ===== 1. Fetch the 2025 log-return data =====
returns_2025 = get_log_return_series(FUND_CODE, start_year=2025, end_year=2025)  # changed to 2025

# ===== 2. Initialize environment and model =====
env = SimpleTradingEnv(returns_2025, window_size=WINDOW_SIZE)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DQNConvModel(seq_len=WINDOW_SIZE, n_actions=5).to(device)  # seq_len changed to 21
model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
model.eval()

# ===== 3. Run the backtest =====
state = env.reset()
history = [env.portfolio]
actions_taken = []
while not env.done:
    seq, shares = state
    seq_tensor = torch.tensor(seq, dtype=torch.float32).unsqueeze(0).unsqueeze(0).to(device)
    share_tensor = torch.tensor(shares, dtype=torch.float32).unsqueeze(0).to(device)
    with torch.no_grad():
        q_values = model(seq_tensor, share_tensor)
        action = q_values.argmax().item()
    actions_taken.append(action)
    next_state, reward, done = env.step(action)
    state = next_state
    history.append(env.portfolio)

# ===== 4. Output and visualization =====
print(f"2025 backtest finished: final portfolio value = {env.portfolio:.4f}")
plt.figure(figsize=(10, 5))
plt.plot(history, label="DQN Portfolio Value")
plt.title("Portfolio Value in 2025")
plt.xlabel("Time step (day)")
plt.ylabel("Value")
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()
Prediction result: 2025 backtest finished: final portfolio value = 0.0000.

Summary
The prediction result is not ideal. One likely reason is that a model trained on a downward trend was used to predict a slightly upward one, but then how would I have known in advance that 2025 would trend up? It feels like bringing in financial news to predict whether the trend is up or down would be more reliable.
Investing involves risk; be cautious.