03-深度学习基础:循环神经网络(RNN)

循环神经网络(RNN):处理序列数据

一、为什么需要RNN?

1.1 传统网络的局限

python 复制代码
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import warnings
warnings.filterwarnings('ignore')

# --- Motivation: why feed-forward networks fall short on sequence data ---
banner = "=" * 60
print(banner)
print("为什么需要RNN?")
print(banner)

# Next-word prediction needs context carried over from earlier tokens.
for line in (
    "\n📝 序列预测示例:",
    "   输入: '我 喜欢 吃' → 下一个词应该是?",
    "   传统网络: 无法记住前面的信息",
    "   RNN: 有记忆能力,能利用上下文",
):
    print(line)

# Side-by-side sketch: stateless feed-forward net vs. recurrent net with memory.
fig, (ax_ff, ax_rnn) = plt.subplots(1, 2, figsize=(12, 5))

ax_ff.axis('off')
ax_ff.set_title('传统网络:无记忆')
ax_ff.text(0.5, 0.5, '输入 → 输出\n每个输入独立处理\n没有时间概念',
           ha='center', va='center', fontsize=12)

ax_rnn.axis('off')
ax_rnn.set_title('RNN:有记忆')
ax_rnn.text(0.5, 0.5, '输入 → 输出\n      ↑\n     记忆\n信息在时间步之间传递',
            ha='center', va='center', fontsize=12)

plt.tight_layout()
plt.show()

# Typical tasks where sequence memory matters.
print("\n📊 RNN的应用场景:")
for app in (
    "文本生成: 根据前文生成下一个字",
    "机器翻译: 中文 → 英文",
    "情感分析: 判断评论是好评还是差评",
    "时间序列预测: 股票价格、天气预测",
    "语音识别: 音频信号 → 文字",
):
    print(f"   • {app}")

二、RNN原理

2.1 RNN的基本结构

python 复制代码
def rnn_principle():
    """Walk through the vanilla (Elman) RNN update rule.

    Prints the recurrence formula, builds a minimal single-step RNN cell
    from scratch, runs one step on random data to show the tensor shapes,
    and returns the cell.
    """
    header = "=" * 60
    print("\n" + header)
    print("RNN原理")
    print(header)

    # The core recurrence: new hidden state from current input + previous state.
    for line in (
        "\n📐 RNN核心公式:",
        "   h_t = tanh(W_x * x_t + W_h * h_{t-1} + b)",
        "   其中:",
        "     x_t: 当前时间步的输入",
        "     h_{t-1}: 上一时间步的隐藏状态(记忆)",
        "     h_t: 当前时间步的输出(更新后的记忆)",
    ):
        print(line)

    class SimpleRNNCell(nn.Module):
        """Single-step RNN cell: h_t = tanh(W_x x_t + W_h h_{t-1} + b)."""

        def __init__(self, input_size, hidden_size):
            super().__init__()
            self.input_size = input_size
            self.hidden_size = hidden_size
            # W_x has no bias of its own; W_h's Linear carries a bias in
            # addition to the explicit parameter b below.
            self.W_x = nn.Linear(input_size, hidden_size, bias=False)
            self.W_h = nn.Linear(hidden_size, hidden_size)
            self.b = nn.Parameter(torch.zeros(hidden_size))

        def forward(self, x, h_prev):
            # One recurrence step; output doubles as the next hidden state.
            return torch.tanh(self.W_x(x) + self.W_h(h_prev) + self.b)

    # One forward step on a toy batch to demonstrate the shapes involved.
    cell = SimpleRNNCell(10, 20)
    inputs = torch.randn(3, 10)   # (batch=3, input_size=10)
    state = torch.zeros(3, 20)    # (batch=3, hidden_size=20)
    new_state = cell(inputs, state)

    print(f"\n输入形状: {inputs.shape}")
    print(f"隐藏状态形状: {state.shape}")
    print(f"输出形状: {new_state.shape}")

    return cell

rnn_principle()

2.2 梯度消失与爆炸

python 复制代码
def vanishing_gradient_demo():
    """Illustrate vanishing/exploding gradients in backprop through time.

    Plots a stylized picture of how the backpropagated gradient magnitude
    decays with the number of time steps for tanh vs. ReLU activations,
    then prints the causes and the standard mitigations.
    """

    print("\n" + "=" * 60)
    print("梯度消失与梯度爆炸")
    print("=" * 60)

    # Simulated gradient magnitude vs. how far back the error must travel.
    time_steps = np.arange(1, 51)

    # tanh: derivative < 1, so repeated multiplication decays exponentially.
    tanh_grad = np.exp(-time_steps / 10)
    # ReLU: derivative is 1 on the active region, so the decay is much milder.
    # BUGFIX: dtype must be float — np.ones_like(time_steps) inherits the int
    # dtype, and the 0.5 assignment below would silently truncate to 0,
    # plotting the ReLU gradient as collapsing to zero.
    relu_grad = np.ones_like(time_steps, dtype=float)
    relu_grad[20:] = 0.5

    plt.figure(figsize=(12, 5))

    plt.plot(time_steps, tanh_grad, 'r-', linewidth=2, label='Tanh (梯度消失)')
    plt.plot(time_steps, relu_grad, 'b-', linewidth=2, label='ReLU (缓解)')
    plt.xlabel('时间步')
    plt.ylabel('梯度大小')
    plt.title('RNN中的梯度消失问题')
    plt.legend()
    plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    print("\n💡 梯度消失/爆炸的原因:")
    print("   1. 梯度消失: 激活函数导数<1,连乘导致梯度指数衰减")
    print("   2. 梯度爆炸: 权重矩阵特征值>1,连乘导致梯度指数增长")
    print("\n🔧 解决方案:")
    print("   1. 使用LSTM/GRU(门控机制)")
    print("   2. 梯度裁剪(Gradient Clipping)")
    print("   3. 使用ReLU激活函数")
    print("   4. 残差连接")

vanishing_gradient_demo()

三、LSTM:解决长期依赖

3.1 LSTM门控机制

python 复制代码
def lstm_principle():
    """Explain the LSTM cell.

    Prints the gate equations, implements a single-step LSTM cell from
    scratch (with the four gate projections fused into one Linear), runs
    one step on toy data to show tensor shapes, draws a text panel for
    each of the three gates, and returns the cell.
    """

    print("\n" + "=" * 60)
    print("LSTM:长短时记忆网络")
    print("=" * 60)

    # The LSTM gate equations (σ = sigmoid, * = elementwise product).
    print("\n📐 LSTM核心公式:")
    print("   遗忘门: f_t = σ(W_f·[h_{t-1}, x_t] + b_f)")
    print("   输入门: i_t = σ(W_i·[h_{t-1}, x_t] + b_i)")
    print("   候选值: C̃_t = tanh(W_c·[h_{t-1}, x_t] + b_c)")
    print("   细胞状态: C_t = f_t * C_{t-1} + i_t * C̃_t")
    print("   输出门: o_t = σ(W_o·[h_{t-1}, x_t] + b_o)")
    print("   隐藏状态: h_t = o_t * tanh(C_t)")

    # A from-scratch single-step LSTM cell.
    class LSTMCell(nn.Module):
        def __init__(self, input_size, hidden_size):
            """Fuse all four gate projections into one Linear of width
            4*hidden_size (one big matmul instead of four small ones)."""
            super(LSTMCell, self).__init__()
            self.input_size = input_size
            self.hidden_size = hidden_size

            # Single fused weight matrix for input/forget/candidate/output gates.
            self.W = nn.Linear(input_size + hidden_size, 4 * hidden_size)

        def forward(self, x, h_prev, c_prev):
            """Advance one time step; returns (h, c) for the next step."""
            # Concatenate input and previous hidden state along the feature dim.
            combined = torch.cat([x, h_prev], dim=1)

            # One projection, then split into the four gate pre-activations
            # (order here: input, forget, candidate, output).
            gates = self.W(combined)
            i, f, g, o = torch.chunk(gates, 4, dim=1)

            # Gate activations: sigmoid for gates, tanh for the candidate value.
            i = torch.sigmoid(i)  # input gate
            f = torch.sigmoid(f)  # forget gate
            g = torch.tanh(g)     # candidate cell value
            o = torch.sigmoid(o)  # output gate

            # New cell state keeps a gated fraction of the old memory and adds
            # the gated candidate; hidden state is the gated tanh of the cell.
            c = f * c_prev + i * g
            h = o * torch.tanh(c)

            return h, c

    # Run one step on toy data to show the shapes involved.
    lstm_cell = LSTMCell(10, 20)
    x = torch.randn(3, 10)
    h = torch.zeros(3, 20)
    c = torch.zeros(3, 20)
    h_next, c_next = lstm_cell(x, h, c)

    print(f"\n输入形状: {x.shape}")
    print(f"隐藏状态形状: {h.shape}")
    print(f"细胞状态形状: {c.shape}")
    print(f"输出隐藏状态: {h_next.shape}")

    # One descriptive text panel per gate.
    fig, axes = plt.subplots(1, 3, figsize=(15, 4))

    titles = ['遗忘门', '输入门', '输出门']
    colors = ['lightcoral', 'lightgreen', 'lightblue']
    descs = [
        '决定丢弃哪些旧信息\nsigmoid输出0(忘记)到1(保留)',
        '决定存储哪些新信息\nsigmoid+tanh生成候选值',
        '决定输出哪些信息\n基于细胞状态的tanh'
    ]

    for i, (title, color, desc) in enumerate(zip(titles, colors, descs)):
        ax = axes[i]
        ax.axis('off')
        ax.set_title(title, fontsize=12)
        ax.text(0.5, 0.5, desc, ha='center', va='center', fontsize=10,
               bbox=dict(boxstyle='round', facecolor=color, alpha=0.5))

    plt.suptitle('LSTM的三个门控机制', fontsize=14)
    plt.tight_layout()
    plt.show()

    return lstm_cell

lstm_principle()

3.2 PyTorch LSTM实现

python 复制代码
def lstm_pytorch():
    """Demonstrate the built-in torch.nn.LSTM.

    Shows the shapes of outputs and final states for a stacked LSTM, its
    parameter count, and a small embedding+LSTM text classifier, which is
    returned.
    """
    divider = "=" * 60
    print("\n" + divider)
    print("PyTorch LSTM")
    print(divider)

    # Two stacked LSTM layers; batch_first puts the batch on dim 0.
    lstm = nn.LSTM(input_size=10, hidden_size=20, num_layers=2, batch_first=True)

    print(f"LSTM层: {lstm}")
    print(f"参数量: {sum(p.numel() for p in lstm.parameters()):,}")

    # One forward pass with explicit zero initial states.
    seq = torch.randn(32, 50, 10)        # (batch, seq_len, input_size)
    state0 = (torch.zeros(2, 32, 20),    # h0: (num_layers, batch, hidden)
              torch.zeros(2, 32, 20))    # c0: same shape as h0
    output, (h_n, c_n) = lstm(seq, state0)

    print(f"\n输入形状: {seq.shape}")
    print(f"输出形状: {output.shape}")  # (batch, seq_len, hidden)
    print(f"最后隐藏状态: {h_n.shape}")
    print(f"最后细胞状态: {c_n.shape}")

    class LSTMClassifier(nn.Module):
        """Embedding → stacked LSTM → dropout → linear head over the top
        layer's final hidden state."""

        def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers=2):
            super().__init__()
            self.embedding = nn.Embedding(vocab_size, embedding_dim)
            self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers,
                                batch_first=True, dropout=0.2)
            self.fc = nn.Linear(hidden_dim, output_dim)
            self.dropout = nn.Dropout(0.2)

        def forward(self, x):
            # Summarize the sequence by the top layer's last hidden state.
            _, (hidden, _) = self.lstm(self.embedding(x))
            return self.fc(self.dropout(hidden[-1]))

    model = LSTMClassifier(10000, 128, 256, 2)
    print(f"\nLSTM分类器参数量: {sum(p.numel() for p in model.parameters()):,}")

    return model

lstm_pytorch()

四、GRU:简化版LSTM

4.1 GRU原理

python 复制代码
def gru_principle():
    """Introduce the GRU.

    Prints its gate equations, contrasts it with the LSTM in structure and
    parameter count, and builds a torch.nn.GRU for comparison.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("GRU:门控循环单元")
    print(rule)

    # GRU update equations: two gates, no separate cell state.
    for line in (
        "\n📐 GRU核心公式:",
        "   更新门: z_t = σ(W_z·[h_{t-1}, x_t])",
        "   重置门: r_t = σ(W_r·[h_{t-1}, x_t])",
        "   候选值: h̃_t = tanh(W_h·[r_t * h_{t-1}, x_t])",
        "   隐藏状态: h_t = (1 - z_t) * h_{t-1} + z_t * h̃_t",
    ):
        print(line)

    # Side-by-side text panels comparing the two architectures.
    fig, (lstm_ax, gru_ax) = plt.subplots(1, 2, figsize=(12, 5))

    lstm_ax.axis('off')
    lstm_ax.set_title('LSTM (3个门控)', fontsize=12)
    lstm_ax.text(0.5, 0.5, '遗忘门 + 输入门 + 输出门\n独立细胞状态\n参数量更多',
                 ha='center', va='center', fontsize=10)

    gru_ax.axis('off')
    gru_ax.set_title('GRU (2个门控)', fontsize=12)
    gru_ax.text(0.5, 0.5, '更新门 + 重置门\n无独立细胞状态\n参数量更少',
                ha='center', va='center', fontsize=10)

    plt.tight_layout()
    plt.show()

    print("\n📊 LSTM vs GRU对比:")
    print("   LSTM: 3个门控,有独立细胞状态,参数量多,效果好")
    print("   GRU: 2个门控,无独立细胞状态,参数量少,训练快")
    print("   选择: 数据量大选LSTM,数据量小/速度优先选GRU")

    # Parameter-count comparison between equally sized GRU and LSTM stacks.
    gru = nn.GRU(input_size=10, hidden_size=20, num_layers=2, batch_first=True)
    print(f"\nGRU参数量: {sum(p.numel() for p in gru.parameters()):,}")
    print(f"LSTM参数量对比: {sum(p.numel() for p in nn.LSTM(10,20,2).parameters()):,}")

gru_principle()

五、双向RNN

5.1 双向RNN原理

python 复制代码
def bidirectional_rnn():
    """Demonstrate bidirectional LSTMs.

    Shows output/state shapes of a bidirectional stack, sketches the
    forward and backward passes over a short sentence, and prints when a
    bidirectional model is (and is not) appropriate.
    """
    bar = "=" * 60
    print("\n" + bar)
    print("双向RNN")
    print(bar)

    # bidirectional=True runs a forward and a backward LSTM per layer and
    # concatenates their outputs, doubling the output feature dimension.
    bi_lstm = nn.LSTM(input_size=10, hidden_size=20, num_layers=2,
                      batch_first=True, bidirectional=True)

    print(f"双向LSTM: {bi_lstm}")
    print(f"参数量: {sum(p.numel() for p in bi_lstm.parameters()):,}")

    batch = torch.randn(32, 50, 10)
    output, (h_n, c_n) = bi_lstm(batch)

    print(f"\n输入形状: {batch.shape}")
    print(f"输出形状: {output.shape}")  # (batch, seq_len, 2*hidden)
    print(f"隐藏状态形状: {h_n.shape}")  # (2*num_layers, batch, hidden)

    # Sketch: tokens as circles, forward pass in blue above, backward in red below.
    fig, ax = plt.subplots(figsize=(12, 4))
    ax.axis('off')

    words = ['我', '爱', '北京', '天安门']
    x_pos = np.linspace(0.1, 0.9, len(words))

    for token, cx in zip(words, x_pos):
        ax.add_patch(plt.Circle((cx, 0.5), 0.08, color='lightblue', ec='black'))
        ax.text(cx, 0.5, token, ha='center', va='center', fontsize=10)

    # Forward direction (blue, left to right).
    for j in range(len(words) - 1):
        ax.annotate('', xy=(x_pos[j + 1], 0.6), xytext=(x_pos[j], 0.6),
                    arrowprops=dict(arrowstyle='->', color='blue', lw=2))

    # Backward direction (red, right to left).
    for j in range(len(words) - 1, 0, -1):
        ax.annotate('', xy=(x_pos[j - 1], 0.4), xytext=(x_pos[j], 0.4),
                    arrowprops=dict(arrowstyle='->', color='red', lw=2))

    ax.set_title('双向RNN:同时从两个方向处理序列')
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)

    plt.tight_layout()
    plt.show()

    print("\n💡 双向RNN的优势:")
    print("   1. 同时利用过去和未来的信息")
    print("   2. 适合需要完整上下文的任务(文本分类、命名实体识别)")
    print("   3. 不适合实时预测(因为需要未来信息)")

bidirectional_rnn()

六、实战应用

6.1 文本分类(情感分析)

python 复制代码
def text_classification_demo():
    """Train an LSTM sentiment-style classifier on synthetic token data.

    Generates random token sequences with a sum-based label, trains an
    embedding+LSTM model for 30 epochs on an 80/20 split, prints progress
    every 10 epochs, and plots the loss and accuracy curves.

    Fixes vs. the original:
      * `train_test_split` was called but never imported anywhere in this
        file (sklearn) — a NameError at runtime; replaced with a NumPy
        permutation split.
      * the labeling threshold of 100 made essentially every label 1
        (the expected token sum is ~990), so the task was degenerate; the
        threshold is now the expected sum, giving roughly balanced classes.
    """

    print("\n" + "=" * 60)
    print("文本分类:情感分析")
    print("=" * 60)

    # ----- synthetic data -----
    np.random.seed(42)
    n_samples = 2000
    seq_length = 20
    vocab_size = 100

    # Random token ids; label = 1 when the token sum exceeds its expectation.
    X = np.random.randint(0, vocab_size, (n_samples, seq_length))
    threshold = seq_length * (vocab_size - 1) / 2  # expected sum -> ~50/50 labels
    y = (X.sum(axis=1) > threshold).astype(np.int64)

    # 80/20 shuffled split (NumPy replacement for sklearn's train_test_split).
    perm = np.random.permutation(n_samples)
    n_train = int(n_samples * 0.8)
    X_train, X_test = X[perm[:n_train]], X[perm[n_train:]]
    y_train, y_test = y[perm[:n_train]], y[perm[n_train:]]

    # Tensors + DataLoaders.
    X_train_t = torch.LongTensor(X_train)
    y_train_t = torch.LongTensor(y_train)
    X_test_t = torch.LongTensor(X_test)
    y_test_t = torch.LongTensor(y_test)

    train_dataset = TensorDataset(X_train_t, y_train_t)
    test_dataset = TensorDataset(X_test_t, y_test_t)
    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

    class TextLSTM(nn.Module):
        """Embedding → stacked LSTM → dropout → linear head over the top
        layer's final hidden state."""

        def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers=2):
            super(TextLSTM, self).__init__()
            self.embedding = nn.Embedding(vocab_size, embedding_dim)
            self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers,
                                batch_first=True, dropout=0.2)
            self.fc = nn.Linear(hidden_dim, output_dim)
            self.dropout = nn.Dropout(0.2)

        def forward(self, x):
            embedded = self.embedding(x)
            lstm_out, (hidden, cell) = self.lstm(embedded)
            # hidden[-1]: final hidden state of the top layer, one per sequence.
            hidden = self.dropout(hidden[-1])
            output = self.fc(hidden)
            return output

    # ----- training setup -----
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = TextLSTM(vocab_size, 64, 128, 2).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    print(f"使用设备: {device}")
    print(f"模型参数量: {sum(p.numel() for p in model.parameters()):,}")

    epochs = 30
    train_losses = []
    test_accs = []

    for epoch in range(epochs):
        # --- train one epoch ---
        model.train()
        total_loss = 0
        for batch_X, batch_y in train_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)

            optimizer.zero_grad()
            output = model(batch_X)
            loss = criterion(output, batch_y)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        train_losses.append(avg_loss)

        # --- evaluate on the held-out split ---
        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for batch_X, batch_y in test_loader:
                batch_X, batch_y = batch_X.to(device), batch_y.to(device)
                output = model(batch_X)
                _, predicted = torch.max(output, 1)
                total += batch_y.size(0)
                correct += (predicted == batch_y).sum().item()

        accuracy = correct / total
        test_accs.append(accuracy)

        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch+1}/{epochs}], Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}')

    print(f"\n最终测试准确率: {test_accs[-1]:.4f}")

    # ----- learning curves -----
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))

    axes[0].plot(train_losses, 'b-', linewidth=2)
    axes[0].set_xlabel('Epoch')
    axes[0].set_ylabel('Loss')
    axes[0].set_title('训练损失曲线')
    axes[0].grid(True, alpha=0.3)

    axes[1].plot(test_accs, 'r-', linewidth=2)
    axes[1].set_xlabel('Epoch')
    axes[1].set_ylabel('Accuracy')
    axes[1].set_title('测试准确率曲线')
    axes[1].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

# text_classification_demo()

6.2 时间序列预测

python 复制代码
def time_series_demo():
    """LSTM one-step-ahead forecasting on synthetic sine-wave windows.

    Each sample is 30 consecutive sine values starting at a random phase;
    the target is the next value. Trains a 2-layer LSTM regressor for 100
    epochs, plots predictions vs. ground truth, and reports the test MSE.

    Fixes vs. the original:
      * `train_test_split` was called but never imported anywhere in this
        file (sklearn) — a NameError at runtime; replaced with a NumPy
        permutation split.
      * the MSE subtracted shapes (n, 1) and (n,), which NumPy broadcasts
        to an (n, n) outer-difference matrix, so the reported MSE was
        wrong; predictions are now flattened to (n,) first.
    """

    print("\n" + "=" * 60)
    print("时间序列预测")
    print("=" * 60)

    # ----- synthetic sine-wave windows -----
    np.random.seed(42)
    seq_length = 30
    n_samples = 1000

    def generate_sine_sequence(seq_length, n_samples):
        """Return (X, y): X[i] is a (seq_length, 1) sine window starting at a
        random phase; y[i] is the value immediately after the window."""
        X = []
        y = []
        for _ in range(n_samples):
            start = np.random.uniform(0, 2*np.pi)
            t = np.linspace(start, start + seq_length * 0.1, seq_length + 1)
            wave = np.sin(t)
            X.append(wave[:-1].reshape(-1, 1))
            y.append(wave[-1])
        return np.array(X), np.array(y)

    X, y = generate_sine_sequence(seq_length, n_samples)

    # 80/20 shuffled split (NumPy replacement for sklearn's train_test_split).
    perm = np.random.permutation(n_samples)
    n_train = int(n_samples * 0.8)
    X_train, X_test = X[perm[:n_train]], X[perm[n_train:]]
    y_train, y_test = y[perm[:n_train]], y[perm[n_train:]]

    X_train_t = torch.FloatTensor(X_train)
    y_train_t = torch.FloatTensor(y_train).view(-1, 1)
    X_test_t = torch.FloatTensor(X_test)
    y_test_t = torch.FloatTensor(y_test).view(-1, 1)

    class TimeSeriesLSTM(nn.Module):
        """2-layer LSTM whose final time-step output feeds a linear regressor."""

        def __init__(self, input_size=1, hidden_size=64, num_layers=2, output_size=1):
            super(TimeSeriesLSTM, self).__init__()
            self.lstm = nn.LSTM(input_size, hidden_size, num_layers,
                                batch_first=True, dropout=0.2)
            self.fc = nn.Linear(hidden_size, output_size)

        def forward(self, x):
            lstm_out, _ = self.lstm(x)
            # Only the representation at the final time step is used.
            last_output = lstm_out[:, -1, :]
            return self.fc(last_output)

    # ----- training -----
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = TimeSeriesLSTM().to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    print(f"模型参数量: {sum(p.numel() for p in model.parameters()):,}")

    train_dataset = TensorDataset(X_train_t, y_train_t)
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

    epochs = 100
    train_losses = []

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch_X, batch_y in train_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)

            optimizer.zero_grad()
            output = model(batch_X)
            loss = criterion(output, batch_y)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        train_losses.append(avg_loss)

        if (epoch + 1) % 20 == 0:
            print(f'Epoch [{epoch+1}/{epochs}], Loss: {avg_loss:.6f}')

    # ----- prediction and visualization -----
    model.eval()
    with torch.no_grad():
        # .ravel() flattens (n, 1) -> (n,) so later arithmetic is elementwise.
        predictions = model(X_test_t.to(device)).cpu().numpy().ravel()

    plt.figure(figsize=(12, 4))
    plt.plot(y_test[:100], 'b-', label='真实值', linewidth=1.5)
    plt.plot(predictions[:100], 'r--', label='预测值', linewidth=1.5)
    plt.xlabel('时间步')
    plt.ylabel('值')
    plt.title('时间序列预测结果')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

    # Both arrays are (n,), so this is a true elementwise MSE.
    mse = np.mean((predictions - y_test) ** 2)
    print(f"\n测试集MSE: {mse:.6f}")

# time_series_demo()

七、总结

| 网络 | 特点 | 适用场景 | 参数量 |
| --- | --- | --- | --- |
| RNN | 简单循环结构 | 短序列 | 少 |
| LSTM | 3个门控+细胞状态 | 长序列、复杂依赖 | 多 |
| GRU | 2个门控 | 适中序列 | 中 |
| 双向RNN | 前后向处理 | 完整上下文 | 2倍 |

选择指南:

  • 短序列、简单任务 → RNN
  • 长序列、复杂依赖 → LSTM
  • 速度优先 → GRU
  • 需要上下文 → 双向LSTM

记住:

  • RNN适合一切序列数据
  • LSTM解决了长期依赖
  • 梯度消失是核心问题
  • 门控机制是关键创新
相关推荐
渣渣xiong2 小时前
从零开始:前端转型AI agent直到就业第十四天-第十七天
前端·人工智能
逍遥归来2 小时前
窥探Claude Code源码:Context上下文管理机制
人工智能
workflower2 小时前
机器人应用-楼宇室内巡逻
大数据·人工智能·算法·microsoft·机器人·动态规划·享元模式
电子科技圈2 小时前
从进迭时空K3看RISC-V CPU与Imagination GPU协同:如何构建高性能SoC能力
大数据·图像处理·人工智能·嵌入式硬件·边缘计算·智能硬件·risc-v
ZPC82102 小时前
fanuc 机器人通过PR寄存器实现轨迹控制
人工智能·算法·计算机视觉·机器人
阿里云大数据AI技术2 小时前
EMR Serverless Spark 推出 Spark 4.0,加速湖仓架构下的数据处理升级
大数据·人工智能·spark
hERS EOUS2 小时前
Spring Boot + Spring AI快速体验
人工智能·spring boot·spring
JAVA学习通2 小时前
LangChain4j 与 Spring AI 的技术选型深度对比:2026 年 Java AI 工程化实践指南
java·人工智能·spring
机器之心2 小时前
特斯拉开源硬件,中国团队开源大脑!首个具身智能顶配全家桶上线
人工智能·openai