线性回归模型
import random
from cProfile import label  # NOTE(review): unused below; looks like an accidental IDE auto-import

import matplotlib.pyplot as plt
import numpy as np
import torch
from d2l import torch as d2l


# -----------------------------
# 1. Linear function (ground-truth distribution)
# -----------------------------
def linear_func(x, w=1.2, b=0.5):
    """Return ``w * x + b``, the noise-free target used for the toy data."""
    return w * x + b


# -----------------------------
# 2. Data generation
# -----------------------------
def create_toy_data(func, interval, sample_num, noise=0.0,
                    add_outlier=False, outlier_ratio=0.001):
    """Sample a noisy 1-D regression data set from ``func``.

    Args:
        func: Callable mapping a tensor of inputs to a tensor of targets.
        interval: ``(low, high)`` range the inputs are drawn from uniformly.
        sample_num: Number of samples to draw.
        noise: Standard deviation of the Gaussian noise added to the targets.
        add_outlier: If true, multiply a random subset of the targets by 5.
        outlier_ratio: Fraction of the samples turned into outliers.

    Returns:
        Tuple ``(X, y)`` of 1-D tensors with ``sample_num`` elements each.
    """
    # Uniform sampling over the interval.
    X = torch.rand(size=[sample_num]) * (interval[1] - interval[0]) + interval[0]
    y = func(X)

    # Gaussian noise. NumPy draws float64, so cast to y's dtype; otherwise
    # the targets are silently promoted to float64 while X stays float32.
    epsilon = torch.tensor(
        np.random.normal(0, noise, size=y.shape[0]), dtype=y.dtype
    )
    y = y + epsilon

    # Optional outliers.
    if add_outlier:
        outlier_num = int(len(y) * outlier_ratio)
        if outlier_num > 0:
            outlier_idx = torch.randint(0, len(y), size=[outlier_num])
            y[outlier_idx] = y[outlier_idx] * 5
    return X, y


# -----------------------------
# 3. Build the data sets
# -----------------------------
interval = (-10, 10)
train_num = 100
test_num = 50
noise = 2

X_train, y_train = create_toy_data(linear_func, interval, train_num, noise=noise)
X_test, y_test = create_toy_data(linear_func, interval, test_num, noise=noise)
X_train_large, y_train_large = create_toy_data(linear_func, interval, 5000, noise=noise)

# Noise-free points used to draw the underlying line.
X_underlying = torch.linspace(interval[0], interval[1], train_num)
y_underlying = linear_func(X_underlying)


# -----------------------------
# 4. Linear operator (hand-written model)
# -----------------------------
class Op(object):
    """Base class for operators: calling the object runs ``forward``."""

    def __call__(self, inputs):
        return self.forward(inputs)

    def forward(self, inputs):
        raise NotImplementedError

    def backward(self, inputs):
        raise NotImplementedError


class Linear(Op):
    """Affine map ``X @ w + b`` with randomly initialised ``w`` and zero ``b``."""

    def __init__(self, input_size):
        """Create the parameters.

        Args:
            input_size: Number of input features.
        """
        self.input_size = input_size
        self.params = {}
        self.params['w'] = torch.randn(self.input_size, 1)
        self.params['b'] = torch.zeros([1])

    def forward(self, X):
        """Return predictions of shape ``(N, 1)`` for ``X`` of shape ``(N, input_size)``."""
        _, D = X.shape
        assert D == self.input_size
        return torch.matmul(X, self.params['w']) + self.params['b']


# Smoke-test the Linear model on 2 samples with 3 features each.
input_size = 3
N = 2
X = torch.randn(N, input_size)
model = Linear(input_size)
y_pred = model.forward(X)
print("模型预测 y_pred:", y_pred)


# -----------------------------
# 5. Visualise the data
# -----------------------------
plt.figure(figsize=(8, 6))
plt.scatter(X_train, y_train, marker='*', facecolor="none", edgecolor='#1EA5EE', s=50, label="train data")
plt.scatter(X_test, y_test, facecolor="none", edgecolor='#2BCD76', s=50, label="test data")
plt.plot(X_underlying, y_underlying, c='#000000', label=r"underlying distribution")
plt.legend(fontsize='large')
plt.title("Toy Data for Linear Regression")
plt.savefig('ml-vis.pdf')
plt.show()


# -----------------------------
# 6. Linear regression from scratch with autograd
# -----------------------------
def synthetic_data(w, b, num_examples):
    """Generate ``y = Xw + b + noise`` with standard-normal features.

    Returns:
        ``(X, y)`` where ``X`` is ``(num_examples, len(w))`` and ``y`` is
        reshaped to a column ``(num_examples, 1)``.
    """
    X = torch.normal(0, 1, (num_examples, len(w)))
    y = torch.matmul(X, w) + b
    y += torch.normal(0, 0.01, y.shape)
    return X, y.reshape((-1, 1))


true_w = torch.tensor([2, -3.4])
true_b = 4.2
features, labels = synthetic_data(true_w, true_b, 1000)

d2l.set_figsize()
d2l.plt.scatter(features[:, 1].detach().numpy(), labels.detach().numpy(),
                marker='*', facecolors='none', edgecolors='#e4007f', s=1,
                label='train data')
d2l.plt.show()


def data_iter(batch_size, features, labels):
    """Yield shuffled mini-batches ``(features, labels)``; the last may be short."""
    num_examples = len(features)
    indices = list(range(num_examples))
    random.shuffle(indices)
    for i in range(0, num_examples, batch_size):
        batch_indices = torch.tensor(indices[i:min(i + batch_size, num_examples)])
        yield features[batch_indices], labels[batch_indices]


batch_size = 10
for X, y in data_iter(batch_size, features, labels):
    print(X, '\n', y)
    break

w = torch.normal(0, 0.01, size=(2, 1), requires_grad=True)
b = torch.zeros(1, requires_grad=True)


def linreg(X, w, b):
    """Linear model ``X @ w + b``."""
    return torch.matmul(X, w) + b


def squared_loss(y_hat, y):
    """Element-wise halved squared error; ``y`` is reshaped to match ``y_hat``."""
    return (y_hat - y.reshape(y_hat.shape)) ** 2 / 2


def sgd(params, lr, batch_size):
    """Apply one in-place SGD step to ``params`` and zero their gradients.

    The gradient is divided by ``batch_size`` because the caller
    back-propagates the *sum* of the per-sample losses.
    """
    with torch.no_grad():
        for param in params:
            param -= lr * param.grad / batch_size
            param.grad.zero_()


# The original step size of 10 makes this SGD diverge on standard-normal
# features (the loss overflows to inf/nan); 0.03 converges.
lr = 0.03
num_epochs = 10
net = linreg
loss = squared_loss

for epoch in range(num_epochs):
    for X, y in data_iter(batch_size, features, labels):
        l = loss(net(X, w, b), y)
        l.sum().backward()
        sgd([w, b], lr, batch_size)
    # Report the mean loss over the full training set after each epoch.
    with torch.no_grad():
        train_1 = loss(net(features, w, b), labels)
        print(f'epoch {epoch+1},loss {float(train_1.mean()):f}')

print("learned w:", w.reshape(-1).detach().numpy())
print("true w:", true_w.numpy())
print("learned b:", b.item())
print("true b:", true_b)

# Next section: multivariate linear regression model
import random

import torch
from d2l import torch as d2l


# -----------------------------
# 1. Build the polynomial data set
# -----------------------------
def synthetic_poly_data(num_examples):
    """Generate data for ``y = 5 + 1.2*x - 3.4*x^2 + 0.5*x^3 + noise``.

    Args:
        num_examples: Number of samples to draw.

    Returns:
        ``(poly_features, y, true_w, true_b)`` where ``poly_features`` is
        ``(num_examples, 3)`` holding ``[x, x^2, x^3]`` and ``y`` is a column
        ``(num_examples, 1)``.
    """
    # Random scalar input x.
    X = torch.normal(0, 1, (num_examples, 1))

    true_w = torch.tensor([1.2, -3.4, 0.5])
    true_b = 5.0

    # Columns are x, x^2, x^3.
    poly_features = torch.cat([X ** i for i in range(1, len(true_w) + 1)], 1)
    y = torch.matmul(poly_features, true_w) + true_b
    y += torch.normal(0, 0.1, y.shape)  # additive Gaussian noise
    return poly_features, y.reshape((-1, 1)), true_w, true_b


features, labels, true_w, true_b = synthetic_poly_data(1000)

# Plot only the raw input x (first column) against the labels.
d2l.set_figsize()
d2l.plt.scatter(features[:, 0].detach().numpy(), labels.detach().numpy(),
                marker='*', facecolors='none', edgecolors='#e4007f', s=1,
                label='train data')
d2l.plt.show()


# -----------------------------
# 2. Data iterator
# -----------------------------
def data_iter(batch_size, features, labels):
    """Yield shuffled mini-batches ``(features, labels)``; the last may be short."""
    num_examples = len(features)
    indices = list(range(num_examples))
    random.shuffle(indices)
    for i in range(0, num_examples, batch_size):
        # Slicing already clamps at the end of the list.
        batch_indices = torch.tensor(indices[i:i + batch_size])
        yield features[batch_indices], labels[batch_indices]


# -----------------------------
# 3. Initialise the parameters
# -----------------------------
w = torch.normal(0, 0.01, size=(3, 1), requires_grad=True)  # one weight per polynomial term
b = torch.zeros(1, requires_grad=True)


# -----------------------------
# 4. Model, loss and optimiser
# -----------------------------
def polyreg(X, w, b):
    """Linear model on the polynomial features: ``X @ w + b``."""
    return torch.matmul(X, w) + b


def squared_loss(y_hat, y):
    """Element-wise halved squared error; ``y`` is reshaped to match ``y_hat``."""
    return (y_hat - y.reshape(y_hat.shape)) ** 2 / 2


def sgd(params, lr, batch_size):
    """Apply one in-place SGD step to ``params`` and zero their gradients.

    The gradient is divided by ``batch_size`` because the caller
    back-propagates the *sum* of the per-sample losses.
    """
    with torch.no_grad():
        for param in params:
            param -= lr * param.grad / batch_size
            param.grad.zero_()


# -----------------------------
# 5. Training
# -----------------------------
lr = 0.01
num_epochs = 20
batch_size = 10
net = polyreg
loss = squared_loss

for epoch in range(num_epochs):
    for X, y in data_iter(batch_size, features, labels):
        l = loss(net(X, w, b), y)
        l.sum().backward()
        sgd([w, b], lr, batch_size)
    # Report the mean loss over the full training set after each epoch.
    with torch.no_grad():
        train_l = loss(net(features, w, b), labels)
        print(f'epoch {epoch+1}, loss {float(train_l.mean()):f}')

print("learned w:", w.reshape(-1).detach().numpy())
print("true w:", true_w.numpy())
print("learned b:", b.item())
print("true b:", true_b)

# Next section: Boston house-price prediction with linear regression
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.metrics import r2_score

# Load the Boston house-price data set. No `names=` argument is passed
# because the CSV already carries a header row.
data = pd.read_csv("boston_house_prices.csv")

# Preview the first five rows.
print(data.head())

# Count missing values per column.
print(data.isna().sum())

# NOTE: the commented-out exploratory code that used to live here was
# removed as dead code. It covered: per-feature box plots (`boxplot`,
# `boxplot_enhanced_large_subplots`), IQR-based clipping of outliers for
# every numeric column except CHAS, a correlation heat map, a pie/bar chart
# of the CHAS split, and per-feature scatter/box plots against the price.
# Recover it from version control if the plots are needed again.

print(data.describe())

# Pairwise correlations between all columns.
boston = data
corrboston = boston.corr()

print("数据列名:", boston.columns.tolist())
print("\n前5行数据:")
print(boston.head())

# Features and target, using the column names actually present in the file.
# The target column is MEDV, not PRICE.
# NOTE(review): the classic 13-feature Boston data also has a 'B' column that
# is not selected here - confirm whether that omission is intentional.
x = boston[['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'LSTAT']]
y = boston['MEDV']

print(f"\n特征数据形状: {x.shape}")
print(f"目标变量形状: {y.shape}")

# Fit an ordinary least-squares linear regression.
clf = linear_model.LinearRegression()
clf.fit(x, y)

# Inspect the fitted coefficients and the in-sample R^2.
print('\n回归系数:', clf.coef_)
print('截距:', clf.intercept_)
print('R²得分:', clf.score(x, y))

# In-sample predictions. `r2_score(y, y_pred)` would reproduce the value
# printed by `clf.score(x, y)` above.
y_pred = clf.predict(x)
print(y_pred)
神经网络实验3-线性回归
丰海洋2025-10-06 23:04
相关推荐
文火冰糖的硅基工坊1 天前
[人工智能-大模型-66]:模型层技术 - 两种编程范式:数学函数式编程与逻辑推理式编程,构建起截然不同的智能系统。AntBlack1 天前
不当韭菜 : 好像真有点效果 ,想藏起来自己用了百锦再1 天前
破茧成蝶:全方位解析Java学习难点与征服之路可触的未来,发芽的智生1 天前
触摸未来2025-10-25:蓝图绘制新手村领路人1 天前
python opencv gpu加速 cmake msvc cuda编译问题和设置暴风鱼划水1 天前
卡码网语言基础课(Python) | 19.洗盘子Gitpchy1 天前
Day 23 机器学习管道 pipeline程序员小远1 天前
使用Jmeter进行http接口测试B站_计算机毕业设计之家1 天前
spark实战:python股票数据分析可视化系统 Flask框架 金融数据分析 Echarts可视化 大数据技术 ✅百锦再1 天前
低代码开发的约束性及ABP框架的实践解析