Experiment Objectives
Understand the basic principles and methods of linear regression, learn to model and predict a given problem with a linear regression model, and master methods for evaluating models on linear problems.
Experiment Content
Assume the linear model y = w_1 x + w_2. Train the model on the given dataset to obtain its parameters, compute the mean squared error (MSE) of the model on the test set, and plot the training data, the test data, and the fitted model in a single figure.
Assume the quadratic model y = w_1 x^2 + w_2 x + w_3 (still linear in the parameters). Train the model on the given dataset to obtain its parameters, compute the MSE on the test set, and plot the training data, the test data, and the fitted model in a single figure.
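Both models are linear in their parameters, so training reduces to ordinary least squares. As a minimal sketch of the math that both scripts below implement (X is the design matrix whose last column is all ones, y is the target vector, and N is the number of test samples):

\hat{w} = (X^{\top} X)^{-1} X^{\top} y, \qquad \mathrm{MSE} = \frac{1}{N} \sum_{i=1}^{N} \left( \hat{y}_i - y_i \right)^2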
Experiment Environment
Python
NumPy
Matplotlib
Experiment Code
Code 1: linear model (Python)
import numpy as np
import matplotlib
matplotlib.use('TkAgg')  # select the GUI backend before importing pyplot
import matplotlib.pyplot as plt
# Load the training data
train_dataset = np.genfromtxt("experiment_02_training_set.csv", delimiter=',', skip_header=1)
# Number of training samples
numberOfTrainData = train_dataset.shape[0]
# Concatenate the first column of the training data with a column of ones to build the design matrix x
x = np.hstack((train_dataset[:, 0:1], np.ones((numberOfTrainData, 1))))
# Solve for w with the normal equation (closed-form least squares)
w = np.linalg.inv(x.T @ x) @ x.T @ train_dataset[:, 1:2]
# Print the model parameters
print(f"Model parameters: w1: {w[0, 0]: .4f} w2: {w[1, 0]: .4f}")
# Load the test data
test_dataset = np.genfromtxt("experiment_02_testing_set.csv", delimiter=',', skip_header=1)
# Number of test samples
numberOfTestData = test_dataset.shape[0]
# Build the test design matrix X in the same way
X = np.hstack((test_dataset[:, 0:1], np.ones((numberOfTestData, 1))))
# Predict Y on the test set
Y = X @ w
# Compute the mean squared error on the test set
diff = Y - test_dataset[:, 1:2]
diff = diff * diff
MSE = np.sum(diff) / numberOfTestData
print(f"MSE: {MSE: .4f}")
# Plotting: prepare the training data, test data, and model curve
train_x_scatter = train_dataset[:, 0:1]
train_y_scatter = train_dataset[:, 1:2]
test_x_scatter = test_dataset[:, 0:1]
test_y_scatter = test_dataset[:, 1:2]
x_line = np.linspace(0, 1, 50)
y_line = w[0] * x_line + w[1]
# Style the plot and render the figure
plt.scatter(train_x_scatter, train_y_scatter, color='r', alpha=0.7, edgecolors='white', s=10, label='Train Data')
plt.scatter(test_x_scatter, test_y_scatter, color='g', alpha=0.7, edgecolors='white', s=10, label='Test Data')
plt.plot(x_line, y_line, color='b', linewidth=1.5, label='Model')
plt.title("Train Test Model", fontsize=14)
plt.xlabel("X-axis", fontsize=12)
plt.ylabel("Y-axis", fontsize=12)
plt.legend(loc='upper right', frameon=True)
plt.grid(alpha=0.3, linestyle=':')
plt.tight_layout()
plt.show()
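As a quick sanity check on the closed-form fit above, the same coefficients can also be recovered with np.polyfit. This is a minimal sketch, assuming the same CSV file and column layout (first column x, second column y) as in the script:

import numpy as np

# Load the training data exactly as in the script above.
train = np.genfromtxt("experiment_02_training_set.csv", delimiter=',', skip_header=1)

# Degree-1 least-squares fit; np.polyfit returns coefficients from the highest
# power down, so coef[0] plays the role of w1 and coef[1] of w2.
coef = np.polyfit(train[:, 0], train[:, 1], deg=1)
print(f"w1: {coef[0]:.4f}  w2: {coef[1]:.4f}")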
Code 2: quadratic model (Python)
import numpy as np
import matplotlib
matplotlib.use("TkAgg")  # select the GUI backend before importing pyplot
import matplotlib.pyplot as plt
# Load the training data
train_dataset = np.genfromtxt("experiment_02_training_set.csv", delimiter=',', skip_header=1)
# Number of training samples
numberOfTrainData = train_dataset.shape[0]
# First-order term: the raw x column
x2 = train_dataset[:, 0:1]
# Second-order term x^2, used as x1
x1 = x2 * x2
# Concatenate x1, x2, and a column of ones to build the design matrix x
x = np.hstack((x1, x2, np.ones((numberOfTrainData, 1))))
# Solve for w with the normal equation (closed-form least squares)
w = np.linalg.inv(x.T @ x) @ x.T @ train_dataset[:, 1:2]
# Print the model parameters
print(f"Model parameters: w1: {w[0, 0]: .4f} w2: {w[1, 0]: .4f} w3: {w[2, 0]: .4f}")
# Load the test data
test_dataset = np.genfromtxt("experiment_02_testing_set.csv", delimiter=',', skip_header=1)
# Number of test samples
numberOfTestData = test_dataset.shape[0]
# First-order term of the test data
X2 = test_dataset[:, 0:1]
# Second-order term, used as X1
X1 = X2 * X2
# Concatenate the columns to build the test design matrix X
X = np.hstack((X1, X2, np.ones((numberOfTestData, 1))))
# Predict Y on the test set
Y = X @ w
# Compute the mean squared error on the test set
diff = Y - test_dataset[:, 1:2]
diff = diff * diff
MSE = np.sum(diff) / numberOfTestData
print(f"MSE: {MSE: .4f}")
# Plotting: prepare the training data, test data, and model curve
train_x_scatter = train_dataset[:, 0:1]
train_y_scatter = train_dataset[:, 1:2]
test_x_scatter = test_dataset[:, 0:1]
test_y_scatter = test_dataset[:, 1:2]
x_line = np.linspace(0, 1, 50)
y_line = w[0] * x_line * x_line + w[1] * x_line + w[2]
# Style the plot and render the figure
plt.scatter(train_x_scatter, train_y_scatter, color='r', alpha=0.7, edgecolors='white', s=10, label='Train Data')
plt.scatter(test_x_scatter, test_y_scatter, color='g', alpha=0.7, edgecolors='white', s=10, label='Test Data')
plt.plot(x_line, y_line, color='b', linewidth=1.5, label="Model")
plt.title("Train Test Model", fontsize=14)
plt.xlabel("X-axis", fontsize=12)
plt.ylabel("Y-axis", fontsize=12)
plt.legend(loc='upper right', frameon=True)
plt.grid(alpha=0.3, linestyle=':')
plt.tight_layout()
plt.show()
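Forming (X^T X)^{-1} explicitly works fine on a dataset this small, but np.linalg.lstsq solves the same least-squares problem without an explicit inverse and is numerically more stable on ill-conditioned design matrices. A minimal sketch for the quadratic model, assuming the same training file and column order as above:

import numpy as np

# Rebuild the quadratic design matrix [x^2, x, 1] from the training set, as above.
train = np.genfromtxt("experiment_02_training_set.csv", delimiter=',', skip_header=1)
x = train[:, 0]
X = np.column_stack((x * x, x, np.ones_like(x)))

# Solve min_w ||Xw - y||^2 directly; w comes back in the order [w1, w2, w3].
w, residuals, rank, sv = np.linalg.lstsq(X, train[:, 1], rcond=None)
print(f"w1: {w[0]:.4f}  w2: {w[1]:.4f}  w3: {w[2]:.4f}")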
Results and Analysis
Linear model parameters: w1 = -20.1656, w2 = 205.4981
Test-set mean squared error: MSE = 4.6256
The resulting plot (training data, test data, and the fitted line in one figure):

Quadratic model parameters: w1 = -30.7577, w2 = 10.7791, w3 = 200.3408
Test-set mean squared error: MSE = 0.1031
The resulting plot (training data, test data, and the fitted curve in one figure):

The quadratic model's test MSE (0.1031) is far lower than the linear model's (4.6256), so the quadratic model fits this dataset noticeably better.