神经网络实验3-线性回归

线性回归模型

Python 代码：

import torch
import numpy as np
import matplotlib.pyplot as plt

# -----------------------------
# 1. 定义线性函数 (真实分布)
# -----------------------------
def linear_func(x, w=1.2, b=0.5):
    """Evaluate the ground-truth line ``w * x + b``.

    Args:
        x: a number or tensor; anything supporting ``*`` and ``+`` with
            the given ``w`` and ``b``.
        w: slope of the line.
        b: intercept of the line.

    Returns:
        The value of the line at ``x``, same kind of object as ``w * x``.
    """
    scaled = w * x
    return scaled + b


# -----------------------------
# 2. 数据生成函数
# -----------------------------
def create_toy_data(func, interval, sample_num, noise=0.0, add_outlier=False, outlier_ratio=0.001):
    """Sample a noisy 1-D regression dataset from ``func``.

    Args:
        func: callable mapping a tensor of inputs to a tensor of targets.
        interval: pair ``(low, high)``; inputs are drawn uniformly from it.
        sample_num: number of samples to draw.
        noise: standard deviation of the Gaussian noise added to the targets.
        add_outlier: when True, a fraction of the targets is multiplied by 5.
        outlier_ratio: fraction of samples turned into outliers; the count
            is ``int(sample_num * outlier_ratio)`` and may round down to 0.

    Returns:
        Tuple ``(X, y)`` of tensors with ``sample_num`` entries each, sharing
        the same dtype.
    """
    low, high = interval[0], interval[1]
    # torch.rand is uniform on [0, 1); rescale it onto the requested interval.
    X = torch.rand(size=[sample_num]) * (high - low) + low
    y = func(X)

    # Draw the noise with torch in y's own dtype. Building it from a NumPy
    # array would yield float64 and silently promote y away from X's dtype,
    # and it would not be reproducible through torch.manual_seed.
    epsilon = torch.randn_like(y) * noise
    # Out-of-place add so a tensor returned by func is never mutated below.
    y = y + epsilon

    if add_outlier:
        outlier_num = int(len(y) * outlier_ratio)
        if outlier_num > 0:
            # randperm yields distinct indices, so exactly outlier_num targets
            # are scaled (randint could repeat an index and scale fewer).
            outlier_idx = torch.randperm(len(y))[:outlier_num]
            y[outlier_idx] = y[outlier_idx] * 5

    return X, y


# -----------------------------
# 3. 构建数据集
# -----------------------------
# Sampling range for the inputs, passed to create_toy_data as (low, high).
interval = (-10, 10)
# Sample counts for the training and test splits.
train_num = 100
test_num = 50
# Standard deviation of the Gaussian noise added to the targets.
noise = 2

# Three independent draws from the same noisy line: a train split, a test
# split and a larger 5000-sample training set. The order of these calls
# determines which random numbers each split receives.
X_train, y_train = create_toy_data(linear_func, interval, train_num, noise=noise)
X_test, y_test = create_toy_data(linear_func, interval, test_num, noise=noise)
X_train_large, y_train_large = create_toy_data(linear_func, interval, 5000, noise=noise)

# Noise-free points on an evenly spaced grid, used to draw the true line.
X_underlying = torch.linspace(interval[0], interval[1], train_num)
y_underlying = linear_func(X_underlying)


# -----------------------------
# 4. 定义线性算子 (手写模型)
# -----------------------------
class Op:
    """Minimal operator base class.

    Calling an instance dispatches to ``forward``; subclasses are expected
    to override ``forward`` (and ``backward`` if they need it), since both
    raise ``NotImplementedError`` here.
    """

    def forward(self, inputs):
        """Compute the operator's output; must be overridden."""
        raise NotImplementedError

    def backward(self, inputs):
        """Backward-pass hook; must be overridden by subclasses that use it."""
        raise NotImplementedError

    def __call__(self, inputs):
        """Make ``op(inputs)`` shorthand for ``op.forward(inputs)``."""
        result = self.forward(inputs)
        return result


class Linear(Op):
    """Affine operator computing ``X @ w + b`` with a single output column."""

    def __init__(self, input_size):
        """Create the parameters.

        Args:
            input_size: number of features per sample.
        """
        self.input_size = input_size
        # Weights are drawn from a standard normal with shape
        # (input_size, 1); the bias is a single zero.
        self.params = {
            'w': torch.randn(self.input_size, 1),
            'b': torch.zeros([1]),
        }

    def forward(self, X):
        """Apply the affine map to a batch.

        Args:
            X: 2-D tensor of shape (N, input_size).

        Returns:
            Tensor of shape (N, 1).
        """
        _, feature_dim = X.shape
        assert feature_dim == self.input_size
        weight, bias = self.params['w'], self.params['b']
        return torch.matmul(X, weight) + bias


# Smoke-test the Linear operator on a small random batch.
input_size = 3
N = 2
X = torch.randn(N, input_size)  # 2 samples with 3 features each
model = Linear(input_size)
# Parameters are random, so the printed predictions differ from run to run.
y_pred = model.forward(X)
print("模型预测 y_pred:", y_pred)


# -----------------------------
# 5. 可视化数据
# -----------------------------
# Plot both splits as hollow markers together with the noise-free line.
plt.figure(figsize=(8, 6))
plt.scatter(X_train, y_train, marker='*', facecolor="none", edgecolor='#1EA5EE', s=50, label="train data")
plt.scatter(X_test, y_test,facecolor="none", edgecolor='#2BCD76', s=50, label="test data")
plt.plot(X_underlying, y_underlying, c='#000000', label=r"underlying distribution")
plt.legend(fontsize='large')
plt.title("Toy Data for Linear Regression")
# Save before show(): the figure is written to ml-vis.pdf in the working
# directory while it is still the current figure.
plt.savefig('ml-vis.pdf')
plt.show()


import random
from cProfile import label

import torch
from d2l import torch as d2l

def synthetic_data(w, b, num_examples):
    """Generate ``y = X w + b`` plus small Gaussian noise.

    Args:
        w: 1-D weight tensor; its length sets the number of features.
        b: scalar bias.
        num_examples: number of rows to generate.

    Returns:
        Tuple ``(X, y)`` where ``X`` has shape (num_examples, len(w)) with
        standard-normal entries and ``y`` is a (num_examples, 1) column.
    """
    num_features = len(w)
    X = torch.normal(0, 1, (num_examples, num_features))
    clean = torch.matmul(X, w) + b
    # Observation noise with standard deviation 0.01.
    noisy = clean + torch.normal(0, 0.01, clean.shape)
    return X, noisy.reshape((-1, 1))

# Ground-truth parameters the training loop is expected to recover.
true_w=torch.tensor([2,-3.4])
true_b=4.2
# 1000 samples with two features each (one per entry of true_w).
features,labels=synthetic_data(true_w, true_b, 1000)

# Scatter the second feature (column index 1) against the labels.
d2l.set_figsize()
d2l.plt.scatter(features[:,1].detach().numpy(),labels.detach().numpy(),marker='*',facecolors='none',edgecolors='#e4007f',s=1,label='train data')
d2l.plt.show()

def data_iter(batch_size, features, labels):
    """Yield shuffled minibatches of ``(features, labels)``.

    The sample order is reshuffled on every call using the ``random``
    module. The final batch is smaller when the number of samples is not a
    multiple of ``batch_size``.
    """
    total = len(features)
    order = list(range(total))
    random.shuffle(order)
    for start in range(0, total, batch_size):
        # Slicing clamps at the end of the list, so the last batch is
        # simply shorter.
        picked = torch.tensor(order[start:start + batch_size])
        yield features[picked], labels[picked]

# Minibatch size used by the peek below and by the training loop.
batch_size=10


# Print only the first minibatch as a sanity check on the iterator.
for X,y in data_iter(batch_size,features,labels):
    print(X,'\n',y)
    break

# Trainable parameters: a (2, 1) weight column initialised with small
# Gaussian values (std 0.01) and a zero bias, both tracked by autograd.
w=torch.normal(0,0.01,size=(2,1),requires_grad=True)
b=torch.zeros(1,requires_grad=True)
def linreg(X, w, b):
    """Linear regression model: return ``X @ w + b``."""
    projection = torch.matmul(X, w)
    return projection + b

def squared_loss(y_hat, y):
    """Per-sample halved squared error, ``(y_hat - y)**2 / 2``.

    ``y`` is reshaped to ``y_hat``'s shape first, so the subtraction is
    element-wise rather than broadcast. No reduction is applied.
    """
    target = y.reshape(y_hat.shape)
    residual = y_hat - target
    return residual ** 2 / 2

def sgd(params, lr, batch_size):
    """Apply one minibatch SGD step in place, then clear the gradients.

    Each parameter moves by ``lr * grad / batch_size``. Dividing by the
    batch size turns a gradient accumulated from a summed loss into an
    average.
    """
    # The update itself must not be recorded by autograd.
    with torch.no_grad():
        for p in params:
            p.sub_(lr * p.grad / batch_size)
            p.grad.zero_()

# Step size for minibatch SGD. The features are standard normal, so the
# averaged batch gradient has curvature of order 1 and a step of 10
# overshoots on every update until the loss overflows to inf/nan. 0.03 is
# the value used by the d2l reference implementation and converges here.
lr = 0.03
num_epochs = 10
net = linreg
loss = squared_loss

for epoch in range(num_epochs):
    for X, y in data_iter(batch_size, features, labels):
        # Per-sample losses for the minibatch, shape (batch, 1).
        l = loss(net(X, w, b), y)
        # Backpropagate the summed loss; sgd() divides by batch_size to
        # turn the accumulated gradient into an average.
        l.sum().backward()
        sgd([w, b], lr, batch_size)
    # Report the mean loss over the full dataset without tracking gradients.
    with torch.no_grad():
        train_1 = loss(net(features, w, b), labels)
        print(f'epoch {epoch+1},loss {float(train_1.mean()):f}')

# Compare the fitted parameters against the generating ones.
print("learned w:", w.reshape(-1).detach().numpy())
print("true w:", true_w.numpy())
print("learned b:", b.item())
print("true b:", true_b)

多元线性回归模型

Python 代码：

import random
import torch
from d2l import torch as d2l

# -----------------------------
# 1. 构造多项式数据集
# -----------------------------
def synthetic_poly_data(num_examples):
    """Generate data from the cubic ``y = 5 + 1.2x - 3.4x^2 + 0.5x^3`` plus noise.

    Args:
        num_examples: number of samples to draw.

    Returns:
        Tuple ``(poly_features, y, true_w, true_b)``: the (num_examples, 3)
        matrix of columns ``[x, x^2, x^3]``, the (num_examples, 1) noisy
        targets, and the generating weight tensor and bias.
    """
    true_w = torch.tensor([1.2, -3.4, 0.5])
    true_b = 5.0

    # Standard-normal inputs as a single column.
    X = torch.normal(0, 1, (num_examples, 1))
    # One column per power of x, from 1 up to the number of weights.
    degrees = range(1, len(true_w) + 1)
    columns = [torch.pow(X, d) for d in degrees]
    poly_features = torch.cat(columns, 1)

    clean = torch.matmul(poly_features, true_w) + true_b
    # Observation noise with standard deviation 0.1.
    noisy = clean + torch.normal(0, 0.1, clean.shape)
    return poly_features, noisy.reshape((-1, 1)), true_w, true_b

# 1000 samples of the cubic, together with the generating parameters.
features, labels, true_w, true_b = synthetic_poly_data(1000)

# Visualise only the raw input x (feature column 0) against the labels.
d2l.set_figsize()
d2l.plt.scatter(features[:, 0].detach().numpy(), labels.detach().numpy(),
                marker='*', facecolors='none', edgecolors='#e4007f', s=1, label='train data')
d2l.plt.show()

# -----------------------------
# 2. 数据迭代器
# -----------------------------
def data_iter(batch_size, features, labels):
    """Iterate over the dataset in shuffled minibatches.

    Yields ``(features_batch, labels_batch)`` pairs. The order is
    reshuffled on each call via ``random.shuffle``, and the last batch may
    hold fewer than ``batch_size`` samples.
    """
    sample_count = len(features)
    shuffled = list(range(sample_count))
    random.shuffle(shuffled)
    for offset in range(0, sample_count, batch_size):
        # List slicing stops at the end of the list, so no explicit upper
        # bound is needed for the final batch.
        chosen = torch.tensor(shuffled[offset:offset + batch_size])
        yield features[chosen], labels[chosen]

# -----------------------------
# 3. 初始化参数
# -----------------------------
# One weight per polynomial feature, initialised with small Gaussian values
# (std 0.01), plus a zero bias; both are tracked by autograd.
w = torch.normal(0, 0.01, size=(3, 1), requires_grad=True)
b = torch.zeros(1, requires_grad=True)

# -----------------------------
# 4. 定义模型 & 损失 & 优化器
# -----------------------------
def polyreg(X, w, b):
    """Polynomial regression model: return ``X @ w + b``.

    ``X`` is used as given; any polynomial feature expansion must already
    have been applied by the caller.
    """
    weighted = torch.matmul(X, w)
    return weighted + b

def squared_loss(y_hat, y):
    """Return the element-wise loss ``(y_hat - y)**2 / 2`` without reduction.

    ``y`` is first reshaped to match ``y_hat`` so that the difference is
    taken sample by sample.
    """
    aligned = y.reshape(y_hat.shape)
    error = y_hat - aligned
    return error ** 2 / 2

def sgd(params, lr, batch_size):
    """Minibatch stochastic gradient descent, updating ``params`` in place.

    Every parameter is decreased by ``lr * grad / batch_size`` and its
    gradient buffer is zeroed afterwards, ready for the next backward pass.
    """
    # Parameter updates are excluded from the autograd graph.
    with torch.no_grad():
        for tensor in params:
            tensor.sub_(lr * tensor.grad / batch_size)
            tensor.grad.zero_()

# -----------------------------
# 5. 训练
# -----------------------------
# Hyperparameters for minibatch SGD on the polynomial features.
lr = 0.01
num_epochs = 20
batch_size = 10
net = polyreg
loss = squared_loss

for epoch in range(num_epochs):
    for X, y in data_iter(batch_size, features, labels):
        # Per-sample losses for the minibatch.
        l = loss(net(X, w, b), y)
        # Backpropagate the summed loss; sgd() divides by batch_size.
        l.sum().backward()
        sgd([w, b], lr, batch_size)
    # Report the mean loss over the whole dataset without tracking gradients.
    with torch.no_grad():
        train_l = loss(net(features, w, b), labels)
        print(f'epoch {epoch+1}, loss {float(train_l.mean()):f}')

# Compare the fitted parameters against the generating ones.
print("learned w:", w.reshape(-1).detach().numpy())
print("true w:", true_w.numpy())
print("learned b:", b.item())
print("true b:", true_b)

基于线性回归的波士顿房价预测模型

Python 代码：

import pandas as pd
# Load the Boston house-price dataset with pandas; the CSV is read from
# the current working directory.
data=pd.read_csv("boston_house_prices.csv")
# Preview the first 5 rows. As a bare expression this only displays output
# in an interactive session; in a script the result is discarded.
data.head()

print(data.head())
# Count the missing values in each column.
print(data.isna().sum())
#
# import matplotlib.pyplot as plt  # 可视化工具
#
#
# # 箱线图查看异常值分布
# def boxplot(data, fig_name):
#     # 绘制每个属性的箱线图
#     data_col = list(data.columns)
#
#     # 连续画几个图片
#     plt.figure(figsize=(5, 5), dpi=300)
#     # 子图调整
#     plt.subplots_adjust(wspace=0.6)
#     # 每个特征画一个箱线图
#     for i, col_name in enumerate(data_col):
#         plt.subplot(3, 5, i + 1)
#         # 画箱线图
#         plt.boxplot(data[col_name],
#                     showmeans=True,
#                     meanprops={"markersize": 1, "marker": "D", "markeredgecolor": '#f19ec2'},  # 均值的属性
#                     medianprops={"color": '#e4007f'},  # 中位数线的属性
#                     whiskerprops={"color": '#e4007f', "linewidth": 0.4, 'linestyle': "--"},
#                     flierprops={"markersize": 0.4},
#                     )
#         # 图名
#         plt.title(col_name, fontdict={"size": 5}, pad=2)
#         # y方向刻度
#         plt.yticks(fontsize=4, rotation=90)
#         plt.tick_params(pad=0.5)
#         # x方向刻度
#         plt.xticks([])
#     plt.savefig(fig_name)
#     plt.show()
#
#
# boxplot(data, 'ml-vis5.pdf')
#
#
import matplotlib.pyplot as plt
import pandas as pd

#
# def boxplot_enhanced_large_subplots(data, fig_name, figsize=(16, 12)):
#     """
#     美化版箱线图可视化函数 - 子图更大
#     用于检测数据中的异常值分布情况
#
#     Parameters:
#     data: DataFrame, 需要可视化的数据
#     fig_name: str, 保存图片的文件名
#     figsize: tuple, 图片尺寸 (宽, 高)
#     """
#
#     # 获取数据列名
#     data_col = list(data.columns)
#     num_features = len(data_col)
#
#     # 创建更大的画布，提高分辨率
#     plt.figure(figsize=figsize, dpi=300)
#     plt.suptitle('Boston Housing Dataset - Outlier Detection Boxplot',
#                  fontsize=18, fontweight='bold', y=0.99)
#
#     # 减少每行的子图数量，让每个子图更大
#     cols = 4  # 从5减少到4，让子图更宽
#     rows = (num_features + cols - 1) // cols
#
#     # 设置更大的子图间距
#     plt.subplots_adjust(wspace=0.4, hspace=0.5, top=0.92, bottom=0.08, left=0.06, right=0.98)
#
#     # 为每个特征绘制箱线图
#     for i, col_name in enumerate(data_col):
#         plt.subplot(rows, cols, i + 1)
#
#         # 绘制美化箱线图 - 使用patch_artist来设置填充色
#         box_plot = plt.boxplot(data[col_name],
#                                patch_artist=True,  # 启用填充色
#                                showmeans=True,
#                                meanprops={"markersize": 6,  # 增大标记尺寸
#                                           "marker": "D",
#                                           "markeredgecolor": '#FF6B9D',
#                                           "markerfacecolor": '#FFE4EC'},
#                                medianprops={"color": '#E4007F', "linewidth": 2},  # 加粗中线
#                                whiskerprops={"color": '#E4007F', "linewidth": 1.2},  # 加粗须线
#                                capprops={"color": '#E4007F', "linewidth": 1.2},
#                                flierprops={"markersize": 4,  # 增大异常点尺寸
#                                            "marker": "o",
#                                            "markerfacecolor": '#888888',
#                                            "markeredgecolor": '#666666',
#                                            "alpha": 0.6})
#
#         # 手动设置箱体填充色（避免facecolor错误）
#         for box in box_plot['boxes']:
#             box.set(facecolor='#FFF0F5', alpha=0.7)
#
#         # 设置更大的标题
#         plt.title(f'{col_name}',
#                   fontdict={"size": 12, "fontweight": "bold"},  # 增大字体
#                   pad=15,  # 增加标题间距
#                   color='#2F2F2F')
#
#         # 设置更大的y轴刻度
#         plt.yticks(fontsize=10)  # 增大刻度字体
#         plt.tick_params(axis='y', which='major', pad=4, length=6)  # 增大刻度尺寸
#
#         # 隐藏x轴刻度
#         plt.xticks([])
#
#         # 添加更明显的网格线
#         plt.grid(axis='y',
#                  alpha=0.4,
#                  linestyle='--',
#                  linewidth=0.8,  # 加粗网格线
#                  color='#CCCCCC')
#
#         # 设置背景色
#         plt.gca().set_facecolor('#F8F8F8')
#
#         # 添加更粗的边框
#         for spine in plt.gca().spines.values():
#             spine.set_color('#DDDDDD')
#             spine.set_linewidth(1.2)  # 加粗边框
#
#     # 保存图片
#     plt.savefig(fig_name,
#                 dpi=300,
#                 bbox_inches='tight',
#                 facecolor='white',
#                 edgecolor='none')
#
#     plt.show()
#
#     print(f"箱线图已保存为: {fig_name}")
#     print(f"画布尺寸: {figsize}")
#     print(f"布局: {rows}行 × {cols}列")
#     print(f"共可视化 {num_features} 个特征")
#
#
# # 使用示例 - 使用更大的画布
# boxplot_enhanced_large_subplots(data, 'boston_housing_boxplot_large.pdf', figsize=(16, 12))
#
# # 四分位处理异常值
# num_features = data.select_dtypes(exclude=['object', 'bool']).columns.tolist()
#
# for feature in num_features:
#     if feature == 'CHAS':
#         continue
#
#     Q1 = data[feature].quantile(q=0.25)  # 下四分位
#     Q3 = data[feature].quantile(q=0.75)  # 上四分位
#
#     IQR = Q3 - Q1
#     top = Q3 + 1.5 * IQR  # 最大估计值
#     bot = Q1 - 1.5 * IQR  # 最小估计值
#     values = data[feature].values
#     values[values > top] = top  # 临界值取代噪声
#     values[values < bot] = bot  # 临界值取代噪声
#     data[feature] = values.astype(data[feature].dtypes)
#
# # 再次查看箱线图，异常值已被临界值替换（数据量较多或本身异常值较少时，箱线图展示会不容易体现出来）
# boxplot_enhanced_large_subplots(data, 'ml-vis6.pdf')

# Summary statistics for every column.
print(data.describe())
# Inspect the pairwise correlations.
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# `boston` is another name for the same DataFrame object, not a copy.
boston = data
# Correlation matrix between columns; in the code visible here it is only
# consumed by the disabled plotting code that follows.
corrboston = boston.corr()

# plt.figure(figsize=(14, 10))
#
# # 使用自定义颜色映射
# colors = ['#2E86AB', '#A8DADC', '#FFFFFF', '#FFB6C1', '#E4007F']
# from matplotlib.colors import LinearSegmentedColormap
# custom_cmap = LinearSegmentedColormap.from_list('custom_pink', colors, N=256)
#
# # 使用matshow绘制热力图
# im = plt.matshow(corrboston, cmap=custom_cmap, fignum=1, aspect='auto')
#
# # 美化颜色条
# cbar = plt.colorbar(im, shrink=0.8)
# cbar.set_label('Correlation Coefficient', rotation=270, labelpad=15, fontsize=12)
#
# # 设置刻度标签
# plt.xticks(range(len(corrboston.columns)), corrboston.columns,
#            rotation=45, ha='right', fontsize=10)
# plt.yticks(range(len(corrboston.columns)), corrboston.columns, fontsize=10)
#
# # 添加相关系数值标注，根据背景色调整文字颜色
# for i in range(len(corrboston.columns)):
#     for j in range(len(corrboston.columns)):
#         corr_value = corrboston.iloc[i, j]
#         # 根据相关系数值选择文字颜色
#         text_color = 'white' if abs(corr_value) > 0.3 else 'black'
#         plt.text(j, i, f'{corr_value:.2f}',
#                 ha='center', va='center',
#                 fontsize=9, fontweight='bold',
#                 color=text_color)
#
# # 美化标题
# plt.title('Boston Housing Dataset - Correlation Matrix',
#           pad=30, fontsize=16, fontweight='bold', color='#2F2F2F')
#
# # 添加网格线
# plt.grid(True, color='#DDDDDD', linewidth=0.5, alpha=0.3)
#
# # 设置背景色
# plt.gca().set_facecolor('#FAFAFA')
#
# # 美化边框
# for spine in plt.gca().spines.values():
#     spine.set_color('#CCCCCC')
#     spine.set_linewidth(1)
#
# plt.tight_layout()
# plt.show()
# # 查看是否穿过查尔斯河的两类占比
# # 可以看到被河流穿过的豪宅仅占比6.92%
#
# fig, ax = plt.subplots(1, 2, figsize=(10, 5))
#
# # 左侧：饼图
# boston['CHAS'].value_counts().plot.pie(ax=ax[0], shadow=False, autopct='%1.2f%%')
# ax[0].set_ylabel('')    # 设置y轴标签
# ax[0].set_xlabel('CHAS')    # 设置x轴标签
#
# # 右侧：使用matplotlib的柱状图（替代sns.countplot）
# chas_counts = boston['CHAS'].value_counts().sort_index()
# ax[1].bar(chas_counts.index, chas_counts.values, color=['skyblue', 'lightcoral'])
# ax[1].set_ylabel('Count')
# ax[1].set_xlabel('CHAS')
# ax[1].set_xticks(chas_counts.index)  # 设置x轴刻度
#
# plt.tight_layout()
# plt.show()
# import pandas as pd
# import matplotlib.pyplot as plt
# import numpy as np
# import os
#
# # 首先检查文件是否存在
# file_path = "boston_house_prices.csv"  # 使用当前目录下的文件
#
# if not os.path.exists(file_path):
#     print(f"错误：文件 {file_path} 不存在！")
#     print("请确保 boston_house_prices.csv 文件在当前目录下")
#     # 或者您可以创建一个示例数据
#     print("正在创建示例数据...")
#     from sklearn.datasets import fetch_california_housing
#
#     housing = fetch_california_housing()
#     boston = pd.DataFrame(housing.data, columns=housing.feature_names)
#     boston['PRICE'] = housing.target * 100000  # 将价格转换为实际数值
# else:
#     # 读取数据
#     boston = pd.read_csv(file_path)
#
# print("数据形状:", boston.shape)
# print("数据列名:", boston.columns.tolist())
#
# # 准备数据 - 排除目标变量PRICE，获取所有数值特征
# # 首先检查PRICE列是否存在
# if 'PRICE' in boston.columns:
#     features = boston.columns.drop(['PRICE']).tolist()
# else:
#     # 如果没有PRICE列，使用所有列
#     features = boston.columns.tolist()
#     print("警告：未找到PRICE列，将使用所有特征")
#
# print("使用的特征:", features)
# print("特征数量:", len(features))
#
# # 定义新的颜色方案
# scatter_colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4', '#FFEAA7', '#DDA0DD', '#98D8C8', '#F7DC6F']
# heatmap_colors = ['#2E86AB', '#A8DADC', '#F4F4F4', '#FFB6C1', '#E4007F']
#
# from matplotlib.colors import LinearSegmentedColormap
#
# custom_cmap = LinearSegmentedColormap.from_list('custom_blue_pink', heatmap_colors, N=256)
#
# # 计算合适的子图布局
# num_features = len(features)
# # 计算行数和列数
# cols = 4
# rows = (num_features + cols - 1) // cols  # 向上取整
#
# plt.figure(figsize=(16, 4 * rows))  # 动态调整高度
#
# # 绘制每个特征与房价的关系
# for i, feature in enumerate(features):
#     plt.subplot(rows, cols, i + 1)
#
#     if 'PRICE' in boston.columns:
#         y_values = boston['PRICE']
#         y_label = 'Price'
#     else:
#         # 如果没有PRICE列，使用第一个特征作为y轴
#         y_values = boston[features[0]]
#         y_label = features[0]
#
#     if boston[feature].dtype in ['object', 'bool'] or boston[feature].nunique() < 10:
#         # 对于分类变量或取值较少的变量，使用箱线图
#         if 'PRICE' in boston.columns:
#             # 美化箱线图颜色
#             box_plot = boston.boxplot(column='PRICE', by=feature, ax=plt.gca(),
#                                       patch_artist=True,
#                                       boxprops=dict(facecolor='#A8DADC', color='#2E86AB'),
#                                       medianprops=dict(color='#E4007F'),
#                                       whiskerprops=dict(color='#2E86AB'),
#                                       capprops=dict(color='#2E86AB'),
#                                       flierprops=dict(markerfacecolor='#FF6B6B', markeredgecolor='#FF6B6B'))
#         plt.title(f'{i + 1}. {feature}', color='#2E86AB', fontweight='bold')
#     else:
#         # 对于连续变量，使用散点图 - 使用循环颜色
#         color_idx = i % len(scatter_colors)
#         plt.scatter(boston[feature], y_values, s=8, alpha=0.7,
#                     color=scatter_colors[color_idx], edgecolors='white', linewidth=0.3)
#         plt.xlabel(feature, color='#2E86AB')
#         plt.ylabel(y_label, color='#2E86AB')
#         plt.title(f'{i + 1}. {feature}', color='#2E86AB', fontweight='bold')
#
#     # 美化子图
#     plt.gca().set_facecolor('#F8F9FA')
#     plt.grid(True, alpha=0.3, color='#E9ECEF')
#
# plt.suptitle('Feature vs Price Relationship', fontsize=16, fontweight='bold', color='#2E86AB')
# plt.tight_layout()
# plt.show()
#
# # 单独显示相关矩阵（如果有足够的数据）
# if boston.shape[1] > 1:
#     # 计算相关系数矩阵
#     corr_matrix = boston.corr(numeric_only=True)
#
#     plt.figure(figsize=(12, 10))
#     # 使用自定义颜色映射
#     im = plt.imshow(corr_matrix, cmap=custom_cmap, aspect='auto', vmin=-1, vmax=1)
#
#     # 美化颜色条
#     cbar = plt.colorbar(im, shrink=0.8)
#     cbar.set_label('Correlation Coefficient', rotation=270, labelpad=20,
#                    fontsize=12, fontweight='bold', color='#2E86AB')
#
#     # 设置刻度
#     plt.xticks(range(len(corr_matrix.columns)), corr_matrix.columns,
#                rotation=45, ha='right', fontsize=10, color='#2E86AB')
#     plt.yticks(range(len(corr_matrix.columns)), corr_matrix.columns,
#                fontsize=10, color='#2E86AB')
#
#     # 添加相关系数值，根据背景色调整文字颜色
#     for i in range(len(corr_matrix.columns)):
#         for j in range(len(corr_matrix.columns)):
#             corr_value = corr_matrix.iloc[i, j]
#             # 根据相关系数值选择文字颜色
#             text_color = 'white' if abs(corr_value) > 0.5 else '#2E86AB'
#             font_weight = 'bold' if abs(corr_value) > 0.7 else 'normal'
#             plt.text(j, i, f'{corr_value:.2f}',
#                      ha='center', va='center',
#                      fontsize=9, fontweight=font_weight,
#                      color=text_color)
#
#     # 美化标题和背景
#     plt.title('Correlation Matrix - Boston Housing Dataset',
#               pad=20, fontsize=16, fontweight='bold', color='#2E86AB')
#     plt.gca().set_facecolor('#F8F9FA')
#
#     # 添加网格线
#     plt.grid(True, color='#E9ECEF', linewidth=0.5, alpha=0.5)
#
#     plt.tight_layout()
#     plt.show()
#导入Python常用数据分析的库

import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model

# Read the data without the `names` argument, because the file already has
# a header row with the column names.
boston = pd.read_csv("boston_house_prices.csv")
print("数据列名:", boston.columns.tolist())
print("\n前5行数据:")
print(boston.head())

# Select the features and the target by their actual column names.
# Note: the target column is MEDV, not PRICE.
# NOTE(review): 12 feature columns are listed; confirm against the CSV
# header whether any remaining column was left out on purpose.
x = boston[['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'LSTAT']]
y = boston['MEDV']

print(f"\n特征数据形状: {x.shape}")
print(f"目标变量形状: {y.shape}")

# Fit an ordinary least-squares linear regression on the full dataset.
clf = linear_model.LinearRegression()
clf.fit(x, y)

# Inspect the fitted coefficients. The score is computed on the same rows
# the model was fitted on, so it is a training score, not a held-out one.
print('\n回归系数:', clf.coef_)
print('截距:', clf.intercept_)
print('R²得分:', clf.score(x, y))
from sklearn.metrics import r2_score
# Disabled: this model's predictions are only computed on the next line.
# score = r2_score(y, y_pred)
y_pred =clf.predict(x)
print(y_pred)