机器学习：经验误差与过拟合（Python）

以目标函数

$$f(x) = 3e^{-x}\sin x$$

为例，采样数据并添加噪声，进行不同阶次的多项式曲线拟合，分析欠拟合和过拟合。

polynomial_feature.py

python 复制代码

import numpy as np

class PolynomialFeatureData:
    """
    Build a polynomial feature matrix from a vector of samples.

    Each row corresponds to one sample. The columns hold consecutive integer
    powers of that sample: ``x^0 .. x^degree`` when ``with_bias`` is true,
    ``x^1 .. x^degree`` otherwise.
    """

    def __init__(self, x, degree, with_bias=False):
        """
        Store the samples and allocate the output matrix.

        :param x: sample data; it is flattened to a vector before use
        :param degree: highest polynomial power (non-negative integer)
        :param with_bias: whether to include the constant (all-ones) column
        :raises ValueError: if ``degree`` is negative
        """
        if degree < 0:
            raise ValueError("degree must be non-negative, got %r" % (degree,))
        self.x = np.asarray(x)
        self.degree = degree
        self.with_bias = with_bias
        n_columns = degree + 1 if with_bias else degree
        # Size the rows from the flattened sample count so that row vectors,
        # column vectors and plain lists are all accepted.
        self.data = np.zeros((self.x.size, n_columns))

    def fit_transform(self):
        """
        Fill and return the polynomial feature matrix.

        :return: array of shape ``(n_samples, n_columns)`` whose columns are
                 increasing powers of the samples
        """
        # The bias column is simply the zeroth power, so a single loop covers
        # both layouts and also handles degree 0 without special-casing.
        first_power = 0 if self.with_bias else 1
        samples = self.x.reshape(-1)
        for column, power in enumerate(range(first_power, self.degree + 1)):
            self.data[:, column] = samples ** power

        return self.data


if __name__ == '__main__':
    # Smoke demo: expand five standard-normal samples into a degree-5
    # feature matrix (bias column included) and print it.
    samples = np.random.randn(5)
    print(PolynomialFeatureData(samples, 5, with_bias=True).fit_transform())

polynomial_regression_curve.py

python 复制代码

import numpy as np
from polynomial_feature import PolynomialFeatureData

class PolynomialRegressionCurve:
    """
    Polynomial curve fitting by linear regression, solved in closed form
    with a small ridge term added to the normal equations.
    """

    def __init__(self, X, y, fit_intercept=False):
        """
        Store the training data.

        :param X: training feature matrix (one row per sample)
        :param y: target values, as a vector
        :param fit_intercept: whether the first column of ``X`` is the bias
                              (constant) term
        """
        self.X, self.y = np.asarray(X), np.asarray(y)
        self.fit_intercept = fit_intercept
        self.theta = None  # fitted coefficients; set by fit()

    def fit(self):
        """
        Solve the regularised normal equations for the coefficients.

        :return: the fitted coefficient vector ``theta``
        """
        # The 0.01 * I ridge term keeps the system matrix invertible.
        gram = np.dot(self.X.T, self.X) + 0.01 * np.eye(self.X.shape[1])
        # Solve the linear system directly instead of forming an explicit
        # inverse; this is cheaper and numerically better behaved.
        self.theta = np.linalg.solve(gram, np.dot(self.X.T, self.y))
        return self.theta

    def predict(self, x_test):
        """
        Predict targets for new samples.

        :param x_test: either raw samples (scalar, list or 1-D array, or a
                       single-column 2-D array), which are expanded into
                       polynomial features to match the training matrix, or
                       a 2-D feature matrix that already has the same number
                       of columns as the training matrix
        :return: 1-D array of predictions
        :raises ValueError: if a multi-column ``x_test`` does not match the
                            number of training features
        """
        x_test = np.atleast_1d(np.asarray(x_test))
        if x_test.ndim == 1:
            # Only raw sample vectors need to become a column; a feature
            # matrix passed by the caller is used as given.
            x_test = x_test[:, np.newaxis]

        n_features = self.X.shape[1]
        if x_test.shape[1] != n_features:
            if x_test.shape[1] != 1:
                raise ValueError(
                    "x_test has %d columns but the model was trained on %d features"
                    % (x_test.shape[1], n_features))
            if self.fit_intercept:
                # One training column is the bias, so the degree is one less.
                feat_obj = PolynomialFeatureData(x_test, n_features - 1, with_bias=True)
            else:
                feat_obj = PolynomialFeatureData(x_test, n_features)
            x_test = feat_obj.fit_transform()

        if self.theta is None:
            self.fit()  # fit lazily if the caller has not done so
        y_pred = np.dot(self.theta, x_test.T)
        return y_pred.reshape(-1)

test_poly_regression.py

python 复制代码

import matplotlib.pyplot as plt
import numpy as np
from polynomial_feature import PolynomialFeatureData
from polynomial_regression_curve import PolynomialRegressionCurve

def objective_function(x):
    """Target function f(x) = 3 * exp(-x) * sin(x) that the data is sampled from."""
    return 3 * np.exp(-x) * np.sin(x)


np.random.seed(0)  # fixed seed so that the results are reproducible
n = 10  # number of training samples
raw_x = np.linspace(0, 6, n)
raw_y = objective_function(raw_x) + 0.1 * np.random.randn(n)  # target + noise, simulating real sampled data

# The test grid and its noise-free targets do not depend on the degree,
# so they are computed once, outside the loop.
x_test = np.linspace(0, 6, 150)
y_true = objective_function(x_test)

degrees = [1, 3, 5, 7, 10, 12]  # polynomial degrees to compare
plt.figure(figsize=(15, 7))
for i, degree in enumerate(degrees):
    feat_data = PolynomialFeatureData(raw_x, degree, with_bias=True)  # features for this degree
    X_sample = feat_data.fit_transform()

    poly_obj = PolynomialRegressionCurve(X_sample, raw_y, fit_intercept=True)
    theta = poly_obj.fit()  # closed-form solution for the coefficients
    print("degree: %d, theta is " % degree, theta)

    y_pred = poly_obj.predict(x_test)  # predictions on the test grid

    # Plot the samples, the true objective function and the fitted model
    # in a 2 x 3 grid, one panel per degree.
    plt.subplot(2, 3, i + 1)
    plt.scatter(raw_x, raw_y, edgecolors="k", s=16, label="Raw Data")  # sampled data
    plt.plot(x_test, y_true, "k-", lw=1, label="Objective Fun")  # true function
    plt.plot(x_test, y_pred, "r--", lw=1.5, label="Model Fitting")
    plt.legend(frameon=False)  # legend without a border
    plt.grid(ls=":")
    plt.xlabel("$x$", fontdict={"fontsize": 12})
    plt.ylabel("$y$", fontdict={"fontsize": 12})
    test_ess = (y_pred - y_true) ** 2  # per-sample squared errors on the test grid
    mse_score, mse_std = np.mean(test_ess), np.std(test_ess)
    # Training error must be evaluated at the training inputs raw_x;
    # feeding the targets raw_y to predict() gives a meaningless number.
    train_mse = ((raw_y - poly_obj.predict(raw_x)) ** 2).mean()
    plt.title("Degree {} Test_MSE = {:.2e}(+/-{:.2e}) \n Train_MSE = {:.2e}".
              format(degree, mse_score, mse_std, train_mse), fontdict={"fontsize": 12})
plt.tight_layout()
plt.show()