Below is a complete Python implementation of a multivariate time series forecasting model based on Fuzzy C-Means (FCM) clustering optimized by a Genetic Algorithm (GA). The method combines:
- FCM clustering: soft-clusters historical time series patterns to extract typical patterns;
- Genetic Algorithm (GA): automatically optimizes the key FCM parameters, i.e. the number of clusters c and the fuzziness exponent m, which are the forecaster's hyperparameters;
- Pattern matching + weighted prediction: predicts future values from the cluster centers and their associated target means.
✅ Suitable for multivariate time series such as electric load, PV power, and traffic flow
✅ End-to-end runnable code included (with simulated data)
✅ Uses skfuzzy (scikit-fuzzy) and DEAP (a GA library)
📦 Installing dependencies
```bash
pip install numpy pandas scikit-fuzzy deap matplotlib scikit-learn
```
🧠 Method overview
- Sliding-window sample construction: convert the multivariate time series into a supervised-learning format.
- FCM clustering: cluster the historical samples (input windows) to obtain c cluster centers.
- Prediction mechanism (see the formula after this list):
  - For a new input window, compute its membership degree with respect to each cluster center;
  - Take the membership-weighted average of the clusters' associated "future values" as the prediction.
- GA optimization:
  - Decision variables: number of clusters c ∈ [2, 10], fuzziness exponent m ∈ [1.1, 5.0]
  - Objective function: mean squared error (MSE) on the validation set
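In symbols, the forecast for a new window x is the membership-weighted combination of the per-cluster target means (this simply restates the mechanism above; u_i is the standard FCM membership):

$$
\hat{y}(x) = \sum_{i=1}^{c} u_i(x)\,\bar{y}_i,
\qquad
u_i(x) = \left[\sum_{k=1}^{c}\left(\frac{\lVert x - v_i\rVert}{\lVert x - v_k\rVert}\right)^{\frac{2}{m-1}}\right]^{-1},
$$

where v_i are the cluster centers and ȳ_i is the membership-weighted mean of the training targets associated with cluster i.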
📜 Full Python source code
```python
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import skfuzzy as fuzz
from deap import base, creator, tools, algorithms
import random
import matplotlib.pyplot as plt
# ----------------------------
# 1. Data generation (replace with real data)
# ----------------------------
def generate_multivariate_time_series(n_samples=1000):
t = np.arange(n_samples)
    # Multivariate series: temperature, humidity, wind speed, load (target)
temp = 20 + 10 * np.sin(2 * np.pi * t / 24) + np.random.normal(0, 1, n_samples)
humidity = 60 + 20 * np.cos(2 * np.pi * t / 24) + np.random.normal(0, 3, n_samples)
wind = 5 + 3 * np.sin(2 * np.pi * t / 12) + np.random.normal(0, 0.5, n_samples)
load = (temp * 0.8 + humidity * (-0.2) + wind * (-0.5) +
50 * np.sin(2 * np.pi * t / 24) + np.random.normal(0, 2, n_samples))
    load = np.clip(load, 0, None)  # load is non-negative
return np.column_stack([temp, humidity, wind, load])
# ----------------------------
# 2. Build the supervised-learning dataset
# ----------------------------
def create_dataset(data, lookback, horizon=1):
X, Y = [], []
for i in range(len(data) - lookback - horizon + 1):
        X.append(data[i:i+lookback].flatten())  # flatten the multivariate window
        Y.append(data[i+lookback:i+lookback+horizon, -1])  # predict the last column (load)
return np.array(X), np.array(Y).squeeze()
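# Illustrative shape check (with the simulated data below: 1200 time steps, 4 variables,
# lookback=24, horizon=1): create_dataset returns
#   X of shape (1176, 96)   # 24 steps x 4 variables, flattened per sample
#   Y of shape (1176,)      # next-step load values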
# ----------------------------
# 3. FCM forecaster class
# ----------------------------
class FCMForecaster:
def __init__(self, c, m, lookback, n_features):
self.c = int(c)
self.m = m
self.lookback = lookback
self.n_features = n_features
self.centers_ = None
self.future_means_ = None
def fit(self, X_train, Y_train):
# X_train: [N, lookback * n_features]
        # Run FCM clustering on the training windows (skfuzzy expects samples in columns)
cntr, u, _, _, _, _, _ = fuzz.cmeans(
data=X_train.T,
c=self.c,
m=self.m,
error=1e-5,
maxiter=1000,
init=None
)
self.centers_ = cntr # [c, D]
        # Compute the membership-weighted mean of the future values for each cluster
future_means = np.zeros(self.c)
for i in range(self.c):
            # Weight every training target by its membership in cluster i
weights = u[i, :]
if np.sum(weights) > 1e-6:
future_means[i] = np.average(Y_train, weights=weights)
else:
future_means[i] = np.mean(Y_train)
self.future_means_ = future_means
def predict(self, X_test):
        # Compute memberships of the test samples w.r.t. the trained centers
u, _, _, _, _, _ = fuzz.cmeans_predict(
test_data=X_test.T,
cntr_trained=self.centers_,
m=self.m,
error=1e-5,
maxiter=1000
)
        # Membership-weighted prediction
pred = np.dot(u.T, self.future_means_)
return pred
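# Minimal standalone usage sketch (illustrative parameter values; the GA below searches c and m):
#   model = FCMForecaster(c=4, m=2.0, lookback=24, n_features=4)
#   model.fit(X_train, Y_train)
#   y_hat = model.predict(X_val)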
# ----------------------------
# 4. GA fitness (evaluation) function
# ----------------------------
def evaluate_ga(individual, X_train, Y_train, X_val, Y_val, lookback, n_features):
c, m = individual
c = int(round(c))
    c = max(2, min(10, c))  # constrain the number of clusters
m = max(1.1, min(5.0, m))
try:
model = FCMForecaster(c=c, m=m, lookback=lookback, n_features=n_features)
model.fit(X_train, Y_train)
y_pred = model.predict(X_val)
mse = mean_squared_error(Y_val, y_pred)
return (mse,)
    except Exception:
        # If FCM fails (e.g. degenerate clustering), assign the worst possible fitness
        return (float('inf'),)
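# Example (hypothetical individual): evaluate_ga([5, 2.3], ...) builds an FCMForecaster
# with c=5, m=2.3, fits it on the training split, and returns the validation MSE as a
# one-element tuple, as DEAP expects.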
# ----------------------------
# 5. Main script
# ----------------------------
if __name__ == "__main__":
    # --- Data preparation ---
data = generate_multivariate_time_series(1200)
    lookback = 24  # use the previous 24 hours to predict the next hour
X, Y = create_dataset(data, lookback, horizon=1)
    # Split into train / validation / test sets
split1 = int(0.7 * len(X))
split2 = int(0.85 * len(X))
X_train, Y_train = X[:split1], Y[:split1]
X_val, Y_val = X[split1:split2], Y[split1:split2]
X_test, Y_test = X[split2:], Y[split2:]
    # Standardize (inputs X only)
scaler_X = StandardScaler()
X_train = scaler_X.fit_transform(X_train)
X_val = scaler_X.transform(X_val)
X_test = scaler_X.transform(X_test)
n_features = data.shape[1]
    # --- GA setup ---
creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
creator.create("Individual", list, fitness=creator.FitnessMin)
toolbox = base.Toolbox()
toolbox.register("c", random.randint, 2, 10)
toolbox.register("m", random.uniform, 1.1, 5.0)
toolbox.register("individual", tools.initCycle, creator.Individual,
(toolbox.c, toolbox.m), n=1)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)
    # Bind the data splits to the evaluation function
toolbox.register("evaluate", evaluate_ga,
X_train=X_train, Y_train=Y_train,
X_val=X_val, Y_val=Y_val,
lookback=lookback, n_features=n_features)
toolbox.register("mate", tools.cxBlend, alpha=0.5)
toolbox.register("mutate", tools.mutGaussian, mu=0, sigma=0.5, indpb=0.2)
toolbox.register("select", tools.selTournament, tournsize=3)
    # --- Run the GA ---
pop = toolbox.population(n=20)
hof = tools.HallOfFame(1)
stats = tools.Statistics(lambda ind: ind.fitness.values)
stats.register("avg", np.mean)
stats.register("min", np.min)
print("🚀 开始遗传算法优化 FCM 参数...")
algorithms.eaSimple(pop, toolbox, cxpb=0.7, mutpb=0.2, ngen=15,
stats=stats, halloffame=hof, verbose=True)
    best_c, best_m = hof[0]
    best_c = max(2, min(10, int(round(best_c))))  # clamp to the same range used in evaluation
    best_m = max(1.1, min(5.0, best_m))
    print(f"\n✅ Best parameters: c={best_c}, m={best_m:.2f}")
    # --- Train and test the final model ---
final_model = FCMForecaster(c=best_c, m=best_m, lookback=lookback, n_features=n_features)
final_model.fit(X_train, Y_train)
y_pred_test = final_model.predict(X_test)
test_mse = mean_squared_error(Y_test, y_pred_test)
test_mae = np.mean(np.abs(Y_test - y_pred_test))
print(f"\n📊 测试集性能:")
print(f" MSE = {test_mse:.4f}")
print(f" MAE = {test_mae:.4f}")
    # --- Visualization ---
plt.figure(figsize=(12, 5))
plt.plot(Y_test[:200], label='Actual', linewidth=1.5)
plt.plot(y_pred_test[:200], '--', label='Predicted (GA-FCM)', linewidth=1.5)
plt.title(f'GA-FCM Time Series Prediction (MAE={test_mae:.2f})')
plt.xlabel('Time Step')
plt.ylabel('Target Value')
plt.legend()
plt.grid(True)
plt.tight_layout()
    plt.show()
```
🔍 Key notes
1. Why use FCM for forecasting?
- FCM provides soft clustering: each sample can belong to several patterns at once, which suits the fuzziness of time series;
- Each cluster center represents a "typical historical pattern", and the future values associated with it serve as the basis for prediction.
2. What does the GA optimize? (The standard FCM objective is given below.)
- Number of clusters c: too few underfits, too many overfits;
- Fuzziness exponent m: controls how soft the memberships are (m → 1: hard clustering; m → ∞: uniform memberships).
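For reference, FCM chooses the centers v_i and memberships u_ij by minimizing the standard fuzzy c-means objective (this is what skfuzzy's cmeans implements):

$$
J_m = \sum_{i=1}^{c}\sum_{j=1}^{N} u_{ij}^{\,m}\,\lVert x_j - v_i\rVert^2,
\qquad \sum_{i=1}^{c} u_{ij} = 1 \quad \forall j,
$$

so a larger m spreads each sample's membership more evenly across the clusters.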
3. Prediction mechanism details (a tiny numeric illustration follows)
- Training: record the membership-weighted average of the future values for each cluster;
- Prediction: weight these averages by the new sample's memberships.
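A quick numeric illustration with made-up numbers (c = 2):

```python
import numpy as np

u = np.array([0.7, 0.3])                 # memberships of a new window in 2 clusters
future_means = np.array([100.0, 60.0])   # per-cluster weighted target means
print(u @ future_means)                  # 0.7*100 + 0.3*60 = 88.0
```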
4. Limitations and possible improvements

| Problem | Possible improvement |
|---|---|
| Only cluster centers are used; temporal dynamics are ignored | Combine with LSTM/Transformer feature extraction before clustering |
| Single-step forecast only | Extend to multi-step forecasting (recursive prediction or seq2seq; a recursive sketch follows this table) |
| No uncertainty quantification | Report the memberships as a confidence measure |
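As an example of the multi-step extension, here is a minimal recursive sketch. It assumes the future exogenous variables (temperature, humidity, wind) are known in advance and only the predicted load is fed back into the window; recursive_forecast, window, and future_exog are illustrative names, not part of the code above.

```python
import numpy as np

def recursive_forecast(model, scaler_X, window, future_exog):
    """Recursive multi-step forecast with a fitted FCMForecaster.

    window      : array (lookback, n_features) with the most recent raw values
    future_exog : array (steps, n_features - 1) of known future exogenous values
    """
    preds = []
    win = window.copy()
    for exog in future_exog:
        x = scaler_X.transform(win.flatten().reshape(1, -1))  # same scaling as in training
        y_hat = float(model.predict(x)[0])
        preds.append(y_hat)
        # Slide the window: append the known exogenous values plus the predicted target
        win = np.vstack([win[1:], np.concatenate([exog, [y_hat]])])
    return np.array(preds)

# e.g. forecast 6 steps ahead from the last raw window (future_exog must be supplied):
# preds = recursive_forecast(final_model, scaler_X, data[-lookback:], future_exog)
```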