02-机器学习基础：监督学习——线性回归

线性回归：预测连续值的基础模型

一、线性回归要解决什么问题？

1.1 问题场景

python 复制代码

import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')

# Section banner
banner = "=" * 60
print(banner)
print("线性回归：预测连续值")
print(banner)

# Toy example: house price as a noisy linear function of floor area.
np.random.seed(42)
house_size = np.random.rand(100) * 150 + 50  # floor area in [50, 200) m^2
price_noise = np.random.randn(100) * 10  # Gaussian noise, std 10
house_price = house_size * 0.8 + 20 + price_noise  # price = 0.8 * area + 20 + noise

plt.figure(figsize=(12, 5))

# Left panel: the raw scatter we want to fit a line through.
plt.subplot(1, 2, 1)
plt.scatter(house_size, house_price, alpha=0.6)
plt.xlabel('房屋面积 (平方米)')
plt.ylabel('价格 (万元)')
plt.title('房屋面积 vs 价格\n我们想要找到一条最佳拟合直线')
plt.grid(True, alpha=0.3)

# Right panel: three candidate lines drawn over the same data.
plt.subplot(1, 2, 2)
x_line = np.linspace(50, 200, 100)
candidate_lines = (
    (0.5, 30, 'gray'),
    (0.8, 20, 'red'),   # the parameters the data was generated with
    (1.0, 10, 'blue'),
)
for slope, intercept, color in candidate_lines:
    y_line = slope * x_line + intercept
    plt.plot(
        x_line,
        y_line,
        color=color,
        linestyle='--',
        alpha=0.7,
        label=f'y={slope}x+{intercept}',
    )
plt.scatter(house_size, house_price, alpha=0.3)
plt.xlabel('房屋面积 (平方米)')
plt.ylabel('价格 (万元)')
plt.title('哪条直线最好？')
plt.legend()
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Problem statement, printed line by line.
for message in (
    "\n💡 线性回归要解决的问题:",
    "   给定一组数据点 (x₁, y₁), (x₂, y₂), ..., (xₙ, yₙ)",
    "   找到一条直线 y = wx + b，使得它最能代表数据的趋势",
    "   然后用这条直线预测新数据点的 y 值",
):
    print(message)

二、核心原理：最小二乘法

2.1 什么是"最好"的直线？

python 复制代码

def demonstrate_cost_function():
    """Plot four candidate lines over a tiny data set and show each one's MSE.

    Each of the 2x2 panels draws the data points, one candidate line, and a
    dashed green segment per point for the residual; the panel title reports
    the mean squared error. A short explanation of MSE is printed afterwards.
    """
    # Five-point toy data set
    x = np.array([1, 2, 3, 4, 5])
    y = np.array([2, 4, 5, 4, 5])

    # (label, slope, intercept) for each candidate line
    lines = (
        ("y = 0.5x + 2", 0.5, 2),
        ("y = 0.7x + 1.5", 0.7, 1.5),
        # Lowest MSE of these four candidates; the exact least-squares
        # line for this data is y = 0.6x + 2.2.
        ("y = 0.6x + 1.8", 0.6, 1.8),
        ("y = 1.0x + 1.0", 1.0, 1.0),
    )

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    for ax, (name, w, b) in zip(axes.flatten(), lines):
        y_pred = w * x + b

        # Data points and the candidate line
        ax.scatter(x, y, s=80, c='blue', alpha=0.7, label='数据点')
        ax.plot(x, y_pred, 'r-', linewidth=2, label='拟合线')

        # One vertical dashed segment per point: the residual
        for xi, yi, yi_hat in zip(x, y, y_pred):
            ax.plot([xi, xi], [yi, yi_hat], 'g--', alpha=0.5, linewidth=1)

        # Mean squared error of this candidate
        residuals = y - y_pred
        mse = np.mean(residuals ** 2)
        ax.set_title(f'{name}\nMSE = {mse:.2f}')
        ax.set_xlabel('x')
        ax.set_ylabel('y')
        ax.legend()
        ax.grid(True, alpha=0.3)

    plt.suptitle('不同直线的拟合效果（绿色虚线=预测误差）', fontsize=14)
    plt.tight_layout()
    plt.show()

    for message in (
        "\n📊 损失函数（MSE）的解释:",
        "   均方误差 MSE = (1/n) × Σ(y_true - y_pred)²",
        "   - 误差越大，MSE越大",
        "   - MSE越小，直线拟合越好",
        "   - 平方的作用：让大误差惩罚更重，且误差正负不会抵消",
    ):
        print(message)

demonstrate_cost_function()

2.2 最小二乘法的数学原理

python 复制代码

def explain_least_squares():
    """Show the MSE loss surface over (w, b) next to the closed-form derivation.

    Left panel: a 3D surface of the mean squared error for a five-point data
    set, evaluated on a 50x50 grid of slope/intercept values.
    Right panel: the least-squares derivation rendered as monospace text.
    """
    # Registers the '3d' projection on older matplotlib releases; harmless on
    # newer ones where it is registered automatically.
    from mpl_toolkits.mplot3d import Axes3D  # noqa: F401

    fig = plt.figure(figsize=(14, 5))

    # 1. Loss surface. plot_surface / set_zlabel only exist on 3D axes, so
    #    this panel must be created with projection='3d' (plt.subplots(1, 2)
    #    would hand back ordinary 2D axes).
    ax1 = fig.add_subplot(1, 2, 1, projection='3d')

    # Parameter grid: W[r, c] = w_range[c], B[r, c] = b_range[r]
    w_range = np.linspace(-1, 3, 50)
    b_range = np.linspace(-1, 4, 50)
    W, B = np.meshgrid(w_range, b_range)

    # Example data
    x = np.array([1, 2, 3, 4, 5])
    y = np.array([2, 4, 5, 4, 5])

    # Broadcast over the data axis so MSE[r, c] is the loss at exactly
    # (W[r, c], B[r, c]); this keeps the surface aligned with the grid.
    y_pred = W[..., np.newaxis] * x + B[..., np.newaxis]
    MSE = np.mean((y - y_pred) ** 2, axis=-1)

    # Draw the 3D surface
    ax1.plot_surface(W, B, MSE, cmap='viridis', alpha=0.8)
    ax1.set_xlabel('斜率 w')
    ax1.set_ylabel('截距 b')
    ax1.set_zlabel('MSE')
    ax1.set_title('损失函数曲面：有唯一最小值点')

    # 2. Derivation shown as plain text on an axis-free panel
    ax2 = fig.add_subplot(1, 2, 2)
    ax2.axis('off')
    ax2.set_title('最小二乘法公式推导', fontsize=12)

    formula = """
    🎯 目标：最小化 MSE = (1/n) Σ (y_i - (w x_i + b))²
    
    📐 求偏导数为0：
    
    ∂MSE/∂w = -2/n Σ x_i (y_i - w x_i - b) = 0
    ∂MSE/∂b = -2/n Σ (y_i - w x_i - b) = 0
    
    🔧 解得闭式解（正规方程）：
    
    w = (n Σ x_i y_i - Σ x_i Σ y_i) / (n Σ x_i² - (Σ x_i)²)
    b = (Σ y_i - w Σ x_i) / n
    
    💡 这意味着：我们可以直接计算最优的 w 和 b，
       不需要像梯度下降那样迭代！
    """

    ax2.text(0.05, 0.95, formula, transform=ax2.transAxes, fontsize=10,
            verticalalignment='top', fontfamily='monospace')

    plt.suptitle('最小二乘法：通过求导找到最优解', fontsize=14)
    plt.tight_layout()
    plt.show()

explain_least_squares()

三、从零实现线性回归

3.1 使用正规方程（最小二乘法）

python 复制代码

class LinearRegressionNormalEquation:
    """Ordinary least-squares linear regression solved in closed form.

    After ``fit`` the weights ``w`` are always a 1-D array of length
    ``n_features`` (also for a single feature) and ``b`` is a scalar, so
    ``predict`` and ``w[0]`` indexing work for every input width.
    """

    def __init__(self):
        self.w = None  # weights (slopes); 1-D array after fit
        self.b = None  # bias (intercept); scalar after fit

    def fit(self, X, y):
        """Compute the optimal parameters directly from the normal equation.

        Args:
            X: feature matrix of shape (n_samples, n_features).
            y: target values of shape (n_samples,).

        Returns:
            self, so calls can be chained.

        Raises:
            numpy.linalg.LinAlgError: in the multi-feature branch when
                X^T X (with the bias column) is singular.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)
        n_samples = X.shape[0]

        if X.shape[1] == 1:
            # Simple (single-feature) regression: closed-form sums.
            x = X.ravel()
            sum_x = np.sum(x)
            sum_y = np.sum(y)
            sum_xy = np.sum(x * y)
            sum_x2 = np.sum(x ** 2)

            slope = (n_samples * sum_xy - sum_x * sum_y) / (n_samples * sum_x2 - sum_x ** 2)
            # Store the slope as a length-1 array: a bare scalar cannot be
            # used in `X @ self.w` and cannot be indexed as `w[0]`.
            self.w = np.array([slope])
            self.b = (sum_y - slope * sum_x) / n_samples
        else:
            # Multiple features: append a column of ones for the bias and
            # solve theta = (X^T X)^(-1) X^T y.
            X_with_bias = np.column_stack([X, np.ones(n_samples)])
            theta = np.linalg.inv(X_with_bias.T @ X_with_bias) @ X_with_bias.T @ y
            self.w = theta[:-1]
            self.b = theta[-1]

        return self

    def predict(self, X):
        """Return predictions X @ w + b for a (n_samples, n_features) matrix."""
        return np.asarray(X, dtype=float) @ self.w + self.b

    def score(self, X, y):
        """Return the R² score, 1 - SS_res / SS_tot, of the predictions on X."""
        y = np.asarray(y, dtype=float)
        y_pred = self.predict(X)
        ss_res = np.sum((y - y_pred) ** 2)
        ss_tot = np.sum((y - np.mean(y)) ** 2)
        return 1 - (ss_res / ss_tot)

# Synthetic single-feature data: y = 2.5 * x + 1.8 plus Gaussian noise (std 1.5)
np.random.seed(42)
X = np.random.rand(100, 1) * 10
y = 2.5 * X.flatten() + 1.8 + np.random.randn(100) * 1.5

# Fit the closed-form model
lr_norm = LinearRegressionNormalEquation()
lr_norm.fit(X, y)

# Report the recovered parameters next to the generating ones
separator = "=" * 60
print("\n" + separator)
print("正规方程求解结果")
print(separator)
print("真实值: w=2.5, b=1.8")
print(f"计算值: w={lr_norm.w[0]:.4f}, b={lr_norm.b:.4f}")
print(f"R²分数: {lr_norm.score(X, y):.4f}")

3.2 使用梯度下降（迭代方法）

python 复制代码

class LinearRegressionGD:
    """Linear regression trained with full-batch gradient descent on MSE."""

    def __init__(self, learning_rate=0.01, n_iterations=1000):
        self.lr = learning_rate           # step size of each update
        self.n_iterations = n_iterations  # number of full-batch updates per fit
        self.w = None                     # weights; 1-D array after fit
        self.b = None                     # bias; scalar after fit
        self.loss_history = []            # MSE before each update of the latest fit

    def fit(self, X, y):
        """Optimise w and b by iterating gradient-descent updates.

        Steps:
        1. Initialise the weights with small random values and the bias at 0.
        2. Compute the gradient of the MSE loss with respect to w and b.
        3. Move the parameters against the gradient, scaled by the learning rate.
        4. Repeat for ``n_iterations`` updates.

        Progress is printed every 200 iterations.

        Args:
            X: feature matrix of shape (n_samples, n_features).
            y: target values of shape (n_samples,).

        Returns:
            self, so calls can be chained.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)
        n_samples, n_features = X.shape

        # 1. Initialise parameters
        self.w = np.random.randn(n_features) * 0.01
        self.b = 0.0
        # Start a fresh history so a second fit() does not append to the
        # losses recorded by an earlier run.
        self.loss_history = []

        # 2. Gradient-descent loop
        for i in range(self.n_iterations):
            # Forward pass and residual (reused by loss and both gradients)
            y_pred = X @ self.w + self.b
            error = y_pred - y

            # Loss (MSE) before this update
            loss = np.mean(error ** 2)
            self.loss_history.append(loss)

            # dLoss/dw = (2/n) * X^T (y_pred - y)
            dw = (2 / n_samples) * (X.T @ error)
            # dLoss/db = (2/n) * sum(y_pred - y)
            db = (2 / n_samples) * np.sum(error)

            # Step against the gradient
            self.w -= self.lr * dw
            self.b -= self.lr * db

            # Progress report
            if i % 200 == 0:
                print(f"Epoch {i:4d}, Loss: {loss:.6f}, w={self.w[0]:.4f}, b={self.b:.4f}")

        return self

    def predict(self, X):
        """Return predictions X @ w + b."""
        return X @ self.w + self.b

    def score(self, X, y):
        """Return the R² score, 1 - SS_res / SS_tot, of the predictions on X."""
        y_pred = self.predict(X)
        ss_res = np.sum((y - y_pred) ** 2)
        ss_tot = np.sum((y - np.mean(y)) ** 2)
        return 1 - (ss_res / ss_tot)

# Train the gradient-descent model on the same X, y as above
separator = "=" * 60
print("\n" + separator)
print("梯度下降求解过程")
print(separator)
lr_gd = LinearRegressionGD(learning_rate=0.02, n_iterations=1000)
lr_gd.fit(X, y)

print(f"\n最终结果: w={lr_gd.w[0]:.4f}, b={lr_gd.b:.4f}")
print(f"R²分数: {lr_gd.score(X, y):.4f}")

# Visualise training: loss curve on the left, fitted line on the right
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
loss_ax, fit_ax = axes[0], axes[1]

# Loss curve (log-scaled y axis)
loss_ax.plot(lr_gd.loss_history, 'b-', linewidth=2)
loss_ax.set_xlabel('迭代次数')
loss_ax.set_ylabel('MSE损失')
loss_ax.set_title('梯度下降：损失下降曲线')
loss_ax.grid(True, alpha=0.3)
loss_ax.set_yscale('log')

# Fitted line over the data; X is sorted so the line is drawn left to right
fit_ax.scatter(X, y, alpha=0.5, label='数据点')
X_sorted = np.sort(X, axis=0)
y_pred_line = lr_gd.predict(X_sorted)
fit_ax.plot(X_sorted, y_pred_line, 'r-', linewidth=2, label='拟合线')
fit_ax.set_xlabel('X')
fit_ax.set_ylabel('y')
fit_ax.set_title(
    f'线性回归拟合结果\nw={lr_gd.w[0]:.3f}, b={lr_gd.b:.3f}, R²={lr_gd.score(X, y):.3f}'
)
fit_ax.legend()
fit_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

四、评估指标：R²分数详解

4.1 R²的含义

python 复制代码

def explain_r2_score():
    """Explain the R² score: formula panel on the left, noise demo on the right.

    The right panel overlays four synthetic data sets generated from the same
    line (y = 2x + 5) with increasing Gaussian noise, and annotates each with
    the R² measured against that line.
    """
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # 1. Formula breakdown rendered as text
    ax1 = axes[0]
    ax1.axis('off')
    ax1.set_title('R²分数的含义', fontsize=12)

    r2_formula = """
    📊 R² = 1 - (SS_res / SS_tot)
    
    其中：
    • SS_res = Σ (y_true - y_pred)²  (残差平方和)
    • SS_tot = Σ (y_true - ȳ)²       (总平方和)
    • ȳ = 所有y的平均值
    
    解读：
    • R² = 1：完美拟合（SS_res = 0）
    • R² = 0：模型预测 = 简单取平均值
    • R² < 0：模型比取平均值还差
    • R² 越大，模型解释的方差越多
    
    直观理解：
    用平均值预测 → 误差 = SS_tot
    用模型预测   → 误差 = SS_res
    模型减少的误差比例 = (SS_tot - SS_res) / SS_tot = R²
    """

    ax1.text(0.05, 0.95, r2_formula, transform=ax1.transAxes, fontsize=10,
            verticalalignment='top', fontfamily='monospace')

    # 2. Same line, different noise levels
    ax2 = axes[1]

    x = np.linspace(0, 10, 50)
    np.random.seed(42)

    # (caption, noise std). Captions describe the noise level only: the
    # earlier hard-coded "R² ≈ 0.98 / 0.85 / 0.50 / 0.10" captions did not
    # match the R² computed below (the largest noise level still scores
    # well above 0.5), so the measured value is shown instead.
    scenarios = [
        ("噪声很小", 0.2),
        ("噪声较小", 1.0),
        ("噪声中等", 2.5),
        ("噪声较大", 4.0),
    ]

    # The noise-free line doubles as the "model" whose R² is measured
    slope = 2.0
    y_pred = slope * x + 5

    for i, (title, noise_scale) in enumerate(scenarios):
        y = y_pred + np.random.randn(50) * noise_scale
        # R² of the true line on this noisy sample
        ss_res = np.sum((y - y_pred) ** 2)
        ss_tot = np.sum((y - np.mean(y)) ** 2)
        r2 = 1 - ss_res / ss_tot

        ax2.scatter(x, y, alpha=0.5, s=15)
        ax2.plot(x, y_pred, 'r-', linewidth=1.5, alpha=0.7)
        ax2.text(0.5, 0.92 - i*0.22, f'{title} (σ={noise_scale}): R²={r2:.2f}',
                transform=ax2.transAxes, fontsize=9,
                bbox=dict(boxstyle='round', facecolor='lightyellow'))

    ax2.set_xlabel('X')
    ax2.set_ylabel('y')
    ax2.set_title('不同R²值对应的拟合质量')
    ax2.set_ylim(0, 30)
    ax2.grid(True, alpha=0.3)

    plt.suptitle('R²：模型解释了多少数据变化', fontsize=14)
    plt.tight_layout()
    plt.show()

explain_r2_score()

五、使用scikit-learn实现

python 复制代码

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

# Synthetic multi-feature data: 3 features, known weights and bias, noise std 0.5
np.random.seed(42)
n_samples = 200
X_multi = np.random.randn(n_samples, 3)
true_w = np.array([2.5, -1.3, 0.8])
true_b = 1.5
y_multi = X_multi @ true_w + true_b + np.random.randn(n_samples) * 0.5

# 80/20 train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X_multi, y_multi, test_size=0.2, random_state=42
)

# Fit scikit-learn's LinearRegression on the training split
sklearn_lr = LinearRegression()
sklearn_lr.fit(X_train, y_train)

# Predictions on both splits
y_train_pred = sklearn_lr.predict(X_train)
y_test_pred = sklearn_lr.predict(X_test)

# Metrics on both splits
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)
train_mse = mean_squared_error(y_train, y_train_pred)
test_mse = mean_squared_error(y_test, y_test_pred)

separator = "=" * 60
print("\n" + separator)
print("scikit-learn 线性回归结果")
print(separator)
print(f"真实权重: {true_w}")
print(f"学习权重: {sklearn_lr.coef_}")
print(f"真实偏置: {true_b}")
print(f"学习偏置: {sklearn_lr.intercept_:.4f}")
print(f"\n训练集: MSE={train_mse:.4f}, R²={train_r2:.4f}")
print(f"测试集: MSE={test_mse:.4f}, R²={test_r2:.4f}")

# Predicted-vs-true scatter for each split; the dashed diagonal marks a perfect prediction
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

panels = (
    (axes[0], y_train, y_train_pred, 'blue', '训练集', train_r2),
    (axes[1], y_test, y_test_pred, 'green', '测试集', test_r2),
)
for ax, y_true, y_hat, point_color, split_name, split_r2 in panels:
    ax.scatter(y_true, y_hat, alpha=0.5, c=point_color, label=split_name)
    diagonal = [y_true.min(), y_true.max()]
    ax.plot(diagonal, diagonal, 'r--', linewidth=2, label='完美预测')
    ax.set_xlabel('真实值')
    ax.set_ylabel('预测值')
    ax.set_title(f'{split_name}预测 (R²={split_r2:.3f})')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.suptitle('线性回归预测效果', fontsize=14)
plt.tight_layout()
plt.show()

六、代码讲解与关键点总结

python 复制代码

def code_explanation():
    """Print a short walkthrough of the three key pieces of the tutorial.

    Covers the normal equation, gradient descent and the R² score. Each entry
    is printed under a "📌 <title>" heading in insertion order.
    """
    print("\n" + "=" * 60)
    print("代码关键点讲解")
    print("=" * 60)

    # Notes on the corrected wording below:
    # - The cubic cost of the normal equation comes from inverting the
    #   (d+1)x(d+1) matrix X^T X, so it scales with the number of features d,
    #   not with the number of samples.
    # - The MSE of linear regression is convex, so gradient descent cannot
    #   get stuck in a local optimum here.
    # - SS_res and SS_tot are sums of squares, not variances.
    explanations = {
        "正规方程": """
    # 正规方程代码详解
    X_with_bias = np.column_stack([X, np.ones(n_samples)])
    w_opt = np.linalg.inv(X_with_bias.T @ X_with_bias) @ X_with_bias.T @ y
    
    步骤：
    1. np.column_stack([X, np.ones]) - 添加一列1，对应偏置项b
    2. X_with_bias.T @ X_with_bias - 计算 X^T X
    3. np.linalg.inv() - 计算矩阵的逆
    4. @ X_with_bias.T @ y - 完成正规方程计算
    
    优点：一步到位，不需要迭代
    缺点：求逆的时间复杂度约为O(d³)（d为特征数），特征很多时很慢；X^T X 可能不可逆
    """,

        "梯度下降": """
    # 梯度下降代码详解
    y_pred = X @ self.w + self.b          # 前向传播
    loss = np.mean((y_pred - y) ** 2)      # 计算损失
    dw = (2/n_samples) * X.T @ (y_pred - y)  # 计算w的梯度
    db = (2/n_samples) * np.sum(y_pred - y)  # 计算b的梯度
    self.w -= self.lr * dw                 # 更新w
    self.b -= self.lr * db                 # 更新b
    
    理解：
    - 梯度指向损失增加最快的方向
    - 减去梯度 × 学习率 = 向损失减小方向移动
    - 迭代直到损失不再下降
    
    优点：适合大数据集，可以在线学习
    缺点：需要选择学习率和迭代次数，对特征尺度敏感
          （线性回归的MSE是凸函数，只有一个全局最优）
    """,

        "R²分数": """
    # R²计算详解
    ss_res = np.sum((y - y_pred) ** 2)    # 残差平方和（模型未能解释的部分）
    ss_tot = np.sum((y - np.mean(y)) ** 2) # 总平方和
    r2 = 1 - ss_res / ss_tot
    
    解读：
    - R² = 0.8 表示模型解释了80%的数据变化
    - 比较基线：如果总是预测平均值，R²=0
    - R²可以是负数（模型比平均值还差）
    """
    }

    for title, content in explanations.items():
        print(f"\n📌 {title}")
        print(content)

code_explanation()

七、总结

线性回归核心要点：

概念	公式	作用
模型	ŷ = wx + b	预测连续值
损失函数	MSE = (1/n)Σ(y - ŷ)²	衡量预测好坏
优化方法	正规方程或梯度下降	找到最优参数
评估指标	R² = 1 - SS_res/SS_tot	评估拟合质量

正规方程 vs 梯度下降：

特性	正规方程	梯度下降
计算方式	直接计算（闭式解）	迭代优化
数据规模	适合中小规模数据	适合大规模数据
需要调参	否	是（学习率、迭代次数）
特征数量	特征很多时很慢（求逆约 O(d³)，d 为特征数）	特征很多时仍可用

记住：

线性回归假设线性关系
最小二乘法找到最佳直线
R²衡量模型解释了多少变化
线性回归是理解更复杂模型的基础

02-机器学习基础：监督学习——线性回归

线性回归：预测连续值的基础模型

一、线性回归要解决什么问题？

1.1 问题场景

二、核心原理：最小二乘法

2.1 什么是"最好"的直线？

2.2 最小二乘法的数学原理

三、从零实现线性回归

3.1 使用正规方程（最小二乘法）

3.2 使用梯度下降（迭代方法）

四、评估指标：R²分数详解

4.1 R²的含义

五、使用scikit-learn实现

六、代码讲解与关键点总结

七、总结

02-机器学习基础：监督学习——线性回归