Datawhale 大模型算法全栈基础篇 202602第3次笔记

第三节 循环神经网络

作业:

my_rnn.py

python
import numpy as np
import torch
import torch.nn as nn

# ---------- 数据准备(直接复制)----------
def prepare_inputs():
    np.random.seed(42)
    vocab = {"播放": 0, "周杰伦": 1, "的": 2, "《稻香》": 3}
    tokens = ["播放", "周杰伦", "的", "《稻香》"]
    ids = [vocab[t] for t in tokens]

    V = len(vocab)          # 词表大小
    E = 128                 # 词向量维度
    H = 3                   # 隐藏状态维度(为了好算,设小一点)
    emb_table = np.random.randn(V, E).astype(np.float32)

    # 取出序列词向量,加上 batch 维度 → (B, T, E)  B=1
    x_np = emb_table[ids][None]   # shape: (1, 4, 128)
    return tokens, x_np, H, E

# ---------- 手写 RNN(用 NumPy)----------
def manual_rnn_numpy(x_np, U_np, W_np):
    B, T, E = x_np.shape
    H = W_np.shape[0]       # 隐藏维度
    h_prev = np.zeros((B, H), dtype=np.float32)   # 初始记忆为0
    steps = []
    for t in range(T):
        x_t = x_np[:, t, :]                         # 当前词向量
        h_t = np.tanh(x_t @ U_np + h_prev @ W_np)   # 核心公式
        steps.append(h_t)
        h_prev = h_t                                 # 更新记忆
    return np.stack(steps, axis=1), h_prev           # 返回所有步的输出和最后记忆

# ---------- PyTorch 官方 RNN ----------
def pytorch_rnn_forward(x_torch, U_torch, W_torch):
    E, H = U_torch.shape   # U 的尺寸是 (E, H)
    rnn = nn.RNN(
        input_size=E,
        hidden_size=H,
        num_layers=1,
        nonlinearity='tanh',
        bias=False,
        batch_first=True,
        bidirectional=False,
    )
    with torch.no_grad():
        # PyTorch 内部权重是转置后的,所以这里要转置
        rnn.weight_ih_l0.copy_(U_torch.T)
        rnn.weight_hh_l0.copy_(W_torch.T)
    y, h_n = rnn(x_torch)
    return y, h_n.squeeze(0)   # h_n 形状是 (1, B, H),去掉第一维

# ---------- 主程序 ----------
if __name__ == "__main__":
    # 1. 准备数据
    tokens, x_np, H, E = prepare_inputs()
    print("句子:", tokens)
    print("输入形状 (B,T,E):", x_np.shape)

    # 2. 初始化 RNN 权重(随机,但要固定种子)
    np.random.seed(42)
    U_np = np.random.randn(E, H).astype(np.float32)   # 输入权重
    W_np = np.random.randn(H, H).astype(np.float32)   # 循环权重

    # 3. 手写 RNN 计算
    out_manual_np, last_manual = manual_rnn_numpy(x_np, U_np, W_np)
    print("手写 RNN 输出形状:", out_manual_np.shape)   # (1,4,3)

    # 4. PyTorch RNN 计算
    x_torch = torch.from_numpy(x_np)
    U_torch = torch.from_numpy(U_np)
    W_torch = torch.from_numpy(W_np)
    out_torch, last_torch = pytorch_rnn_forward(x_torch, U_torch, W_torch)
    print("PyTorch RNN 输出形状:", out_torch.shape)

    # 5. 验证是否一致
    out_manual = torch.from_numpy(out_manual_np)
    print("两个结果是否一致?", torch.allclose(out_manual, out_torch, atol=1e-6))

输出:

(base) PS E:\Datawhale 2026\base-llm202602> & D:/Users/app/miniconda3/envs/base-llm/python.exe "e:/Datawhale 2026/base-llm202602/03_my_rnn.py"

句子: ['播放', '周杰伦', '的', '《稻香》']

输入形状 (B,T,E): (1, 4, 128)

手写 RNN 输出形状: (1, 4, 3)

PyTorch RNN 输出形状: torch.Size([1, 4, 3])

两个结果是否一致? True
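补充:上面的脚本只比对了逐步输出 y(核心递推式为 h_t = tanh(x_t U + h_{t-1} W),本例不带偏置),其实手写版和官方版的"最后记忆"也应当一致,可以在主程序末尾顺手加一行校验(沿用脚本里已有的 last_manual 和 last_torch 变量):

python
# 校验最后一个时间步的隐藏状态是否一致(last_manual 是 NumPy 数组,需先转成张量)
print("最后隐藏状态是否一致?", torch.allclose(torch.from_numpy(last_manual), last_torch, atol=1e-6))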


第二节 LSTM 与 GRU

pip install scikit-learn torch numpy

lstm_text_classifier.py

python
"""
lstm_text_classifier.py
基于 LSTM 的 20newsgroups 文本分类器
(对比之前的全连接模型,LSTM 能利用词语顺序信息)
"""

# ==================== 1. 导入所需库 ====================
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# ==================== 2. 加载数据 ====================
print("正在加载 20newsgroups 数据...")
# 训练集和测试集
train_data = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)
test_data = fetch_20newsgroups(subset='test', shuffle=True, random_state=42)

X_train_texts = train_data.data          # 原始文本列表
y_train = train_data.target               # 标签(0~19)
X_test_texts = test_data.data
y_test = test_data.target

num_classes = len(train_data.target_names)  # 20
print(f"训练样本数: {len(X_train_texts)}")
print(f"测试样本数: {len(X_test_texts)}")
print(f"类别数: {num_classes}")
print("类别名称:", train_data.target_names)

# ==================== 3. 文本预处理:词表构建与序列化 ====================
# 使用 CountVectorizer 构建词表(只保留最常用的 10000 个词)
# 注意:CountVectorizer 默认会对英文文本进行合理的分词(按单词边界分割,忽略标点)
max_features = 10000
vectorizer = CountVectorizer(max_features=max_features, stop_words='english')
vectorizer.fit(X_train_texts)   # 只在训练集上拟合,构建词表

# 获取词表(特征名称)
feature_names = vectorizer.get_feature_names_out()
# 构建单词到索引的映射,索引从1开始,保留0给填充符和未知词
word2idx = {word: i+1 for i, word in enumerate(feature_names)}
vocab_size = len(word2idx) + 1   # 词表大小(含0)
print(f"词表大小(含填充符0): {vocab_size}")

# 定义将文本转为固定长度整数序列的函数
def text_to_sequence(text, max_len=200):
    """
    将输入文本转为整数ID序列,并进行截断/填充至固定长度max_len
    - 未知词用0表示(填充符)
    - 已知词用word2idx中的ID表示
    """
    # 简单按空格分词(CountVectorizer内部的分词方式更复杂,但这里为了快速对齐,直接split)
    # 注意:实际应用时建议使用与CountVectorizer一致的分词器,但为了简洁我们采用简单方式
    tokens = text.lower().split()
    ids = [word2idx.get(token, 0) for token in tokens]   # 未知词映射为0
    # 截断或填充
    if len(ids) > max_len:
        ids = ids[:max_len]
    else:
        ids = ids + [0] * (max_len - len(ids))
    return ids

# 设置最大序列长度(根据数据分布可调整,这里取200,覆盖大部分文章)
max_len = 200

# 将训练集和测试集全部转换为整数序列
print("正在转换文本为整数序列...")
X_train_seq = np.array([text_to_sequence(text, max_len) for text in X_train_texts])
X_test_seq = np.array([text_to_sequence(text, max_len) for text in X_test_texts])

# ==================== 4. 转换为 PyTorch 张量 ====================
X_train_tensor = torch.tensor(X_train_seq, dtype=torch.long)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test_seq, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# 创建数据集和数据加载器
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# ==================== 5. 定义 LSTM 模型 ====================
class LSTMClassifier(nn.Module):
    """
    基于 LSTM 的文本分类器
    结构:Embedding -> LSTM -> Dropout -> Linear
    """
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes, num_layers=1):
        super(LSTMClassifier, self).__init__()
        # 词嵌入层:将单词ID映射为稠密向量
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        # LSTM层:batch_first=True 表示输入形状为 (batch, seq_len, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, batch_first=True)
        # Dropout层,防止过拟合
        self.dropout = nn.Dropout(0.5)
        # 全连接分类层
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        # x 形状: (batch_size, seq_len)
        embedded = self.embedding(x)            # (batch, seq_len, embed_dim)
        lstm_out, (h_n, c_n) = self.lstm(embedded)   # lstm_out: (batch, seq_len, hidden_dim)
        # 取最后一个时间步的隐藏状态作为整句的表示
        last_hidden = lstm_out[:, -1, :]         # (batch, hidden_dim)
        last_hidden = self.dropout(last_hidden)
        logits = self.fc(last_hidden)            # (batch, num_classes)
        return logits

# 超参数设置
embed_dim = 128          # 词嵌入维度
hidden_dim = 64          # LSTM 隐藏状态维度
num_layers = 1           # LSTM 层数
model = LSTMClassifier(vocab_size, embed_dim, hidden_dim, num_classes, num_layers)
print("模型结构:\n", model)

# ==================== 6. 训练准备 ====================
# 选择设备(GPU if available)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(f"使用设备: {device}")

# 损失函数和优化器
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# ==================== 7. 训练循环 ====================
num_epochs = 10
print("开始训练...")
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        # 前向传播
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # 反向传播与优化
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

# ==================== 8. 测试评估 ====================
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = 100 * correct / total
print(f"\n测试集准确率: {accuracy:.2f}%")

# ==================== 9. 推理示例 ====================
def predict(text, model, word2idx, max_len=200):
    """对单个文本进行预测,返回类别名称"""
    model.eval()
    seq = text_to_sequence(text, max_len)   # 转换为整数序列
    input_tensor = torch.tensor([seq], dtype=torch.long).to(device)
    with torch.no_grad():
        logits = model(input_tensor)
        pred_idx = torch.argmax(logits, dim=1).item()
    return train_data.target_names[pred_idx]

# 用测试集中的第一个文本进行演示
sample_text = test_data.data[0]
true_label = test_data.target_names[test_data.target[0]]
pred_label = predict(sample_text, model, word2idx)
print("\n推理示例:")
print(f"文本预览: {sample_text[:200]}...")
print(f"真实类别: {true_label}")
print(f"预测类别: {pred_label}")

# (可选)保存模型
# torch.save(model.state_dict(), "lstm_classifier.pth")
# print("模型已保存为 lstm_classifier.pth")

输出:

(base) PS E:\Datawhale 2026\base-llm202602> & D:/Users/app/miniconda3/envs/base-llm/python.exe "e:/Datawhale 2026/base-llm202602/03_2_lstm_text_classifier.py"

正在加载 20newsgroups 数据...

训练样本数: 11314

测试样本数: 7532

类别数: 20

类别名称: ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos',

'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']

词表大小(含填充符0): 10001

正在转换文本为整数序列...

模型结构:

LSTMClassifier(

(embedding): Embedding(10001, 128, padding_idx=0)

(lstm): LSTM(128, 64, batch_first=True)

(dropout): Dropout(p=0.5, inplace=False)

(fc): Linear(in_features=64, out_features=20, bias=True)

)

使用设备: cuda

开始训练...

Epoch [1/10], Loss: 2.9933

Epoch [2/10], Loss: 2.9779

Epoch [3/10], Loss: 2.9484

Epoch [4/10], Loss: 2.8657

Epoch [5/10], Loss: 2.8117

Epoch [6/10], Loss: 2.8019

Epoch [7/10], Loss: 2.7235

Epoch [8/10], Loss: 2.7693

Epoch [9/10], Loss: 2.6299

Epoch [10/10], Loss: 2.6277

测试集准确率: 8.95%

推理示例:

文本预览: From: v064mb9k@ubvmsd.cc.buffalo.edu (NEIL B. GANDLER)

Subject: Need info on 88-89 Bonneville

Organization: University at Buffalo

Lines: 10

News-Software: VAX/VMS VNEWS 1.41

Nntp-Posting-Host: ubvmsd....

真实类别: rec.autos

预测类别: rec.autos

📋 输出解释

1. 数据加载

text
训练样本数: 11314
测试样本数: 7532
类别数: 20
类别名称: ['alt.atheism', 'comp.graphics', ...]

正常,数据加载成功。

2. 词表大小

text
词表大小(含填充符0): 10001

因为设置了 max_features=10000,加上填充符0,词表共10001个词。

3. 模型结构

text
LSTMClassifier(
  (embedding): Embedding(10001, 128, padding_idx=0)
  (lstm): LSTM(128, 64, batch_first=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=64, out_features=20, bias=True)
)
  • 词嵌入维度 128

  • LSTM 隐藏层维度 64

  • Dropout 0.5

  • 全连接输出 20 类

    结构正确。

4. 训练损失

text
Epoch [1/10], Loss: 2.9933
...
Epoch [10/10], Loss: 2.6277

损失从 2.99 降到 2.62,下降非常缓慢,说明模型几乎没有学到有效特征(正常文本分类损失应该能降到 1.0 以下)。

5. 测试准确率

text
测试集准确率: 8.95%

20 个类别,随机猜测的正确率是 5%,8.95% 仅比随机好一点点,基本等于没学会。

6. 推理示例

text
文本预览: From: v064mb9k@ubvmsd.cc.buffalo.edu (NEIL B. GANDLER)...
真实类别: rec.autos
预测类别: rec.autos

这个样本预测正确,但整体准确率极低,说明只是个别样本运气好。
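回头检查预处理可以发现,问题很可能出在分词不一致上:词表是按 CountVectorizer 的规则(按单词边界切分、去掉标点、剔除英文停用词)建立的,而 text_to_sequence 里却用 text.lower().split() 简单切分,带着标点的 token(例如 "bonneville.")在词表里查不到,于是大量词被映射成了 0。下面用一个小例子示意两种切分方式的差别(注释中的结果是按 CountVectorizer 默认分词规则推断的):

python
# tokenizer 即 vectorizer.build_tokenizer(),与构建词表时的切分规则一致
text = "Need info on 88-89 Bonneville."
print(text.lower().split())     # ['need', 'info', 'on', '88-89', 'bonneville.'] 标点粘在词上,查词表时命中不了
print(tokenizer(text.lower()))  # ['need', 'info', 'on', '88', '89', 'bonneville'] 与词表构建时一致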

修复方法:使用 vectorizer.build_tokenizer() 获取与词表构建一致的分词器,保证构建词表和把文本转成整数序列时使用同一套分词规则。修改为:

python
"""
lstm_classifier_fixed.py
修复版 LSTM 文本分类器(20newsgroups)
主要修复:使用与 CountVectorizer 一致的 tokenizer,确保分词一致性
"""

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# ==================== 1. 加载数据 ====================
print("正在加载 20newsgroups 数据...")
train_data = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)
test_data = fetch_20newsgroups(subset='test', shuffle=True, random_state=42)

X_train_texts = train_data.data
y_train = train_data.target
X_test_texts = test_data.data
y_test = test_data.target

num_classes = len(train_data.target_names)
print(f"训练样本数: {len(X_train_texts)}")
print(f"测试样本数: {len(X_test_texts)}")
print(f"类别数: {num_classes}")

# ==================== 2. 构建词表与分词器 ====================
max_features = 10000
vectorizer = CountVectorizer(max_features=max_features, stop_words='english')
vectorizer.fit(X_train_texts)   # 只拟合训练集

# 获取词表(特征名称)
feature_names = vectorizer.get_feature_names_out()
word2idx = {word: i+1 for i, word in enumerate(feature_names)}  # 索引从1开始,0留给填充符/未知词
vocab_size = len(word2idx) + 1
print(f"词表大小(含填充符0): {vocab_size}")

# 获取与 CountVectorizer 一致的分词器
tokenizer = vectorizer.build_tokenizer()

# ==================== 3. 文本转整数序列 ====================
max_len = 200   # 统一序列长度

def text_to_sequence(text, max_len=200):
    """
    将文本转为整数ID序列,使用与词表一致的分词器
    """
    tokens = tokenizer(text.lower())   # 先小写,再分词(CountVectorizer默认也会小写)
    ids = [word2idx.get(token, 0) for token in tokens]   # 未知词用0
    # 截断或填充
    if len(ids) > max_len:
        ids = ids[:max_len]
    else:
        ids = ids + [0] * (max_len - len(ids))
    return ids

print("正在转换训练集...")
X_train_seq = np.array([text_to_sequence(text, max_len) for text in X_train_texts])
print("正在转换测试集...")
X_test_seq = np.array([text_to_sequence(text, max_len) for text in X_test_texts])

# 可选:检查一下第一条序列的有效词比例(非零)
nonzero_ratio = np.count_nonzero(X_train_seq[0]) / max_len
print(f"第一条训练样本的有效词比例: {nonzero_ratio:.2f}")

# ==================== 4. 转换为 PyTorch 张量 ====================
X_train_tensor = torch.tensor(X_train_seq, dtype=torch.long)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test_seq, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# ==================== 5. 定义 LSTM 模型 ====================
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes, num_layers=1):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        # 可选用双向LSTM(注释掉其中一行)
        # self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, batch_first=True, bidirectional=False)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(0.5)
        # 如果使用双向LSTM,则隐藏维度为 hidden_dim * 2
        lstm_output_dim = hidden_dim * (2 if self.lstm.bidirectional else 1)
        self.fc = nn.Linear(lstm_output_dim, num_classes)

    def forward(self, x):
        embedded = self.embedding(x)                # (batch, seq_len, embed_dim)
        lstm_out, (h_n, c_n) = self.lstm(embedded)  # lstm_out: (batch, seq_len, hidden_dim*num_directions)
        # 简化处理:直接取最后一个时间步的输出作为整句表示;严格来说(尤其是双向、末尾有填充时)应从 h_n 中提取最终隐藏状态,文末的参考实现会修正这一点
        last_hidden = lstm_out[:, -1, :]             # (batch, hidden_dim * num_directions)
        last_hidden = self.dropout(last_hidden)
        logits = self.fc(last_hidden)
        return logits

# 超参数设置
embed_dim = 128
hidden_dim = 64
num_layers = 1
model = LSTMClassifier(vocab_size, embed_dim, hidden_dim, num_classes, num_layers)
print("模型结构:\n", model)

# ==================== 6. 训练准备 ====================
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(f"使用设备: {device}")

criterion = nn.CrossEntropyLoss()
# 可以适当调高学习率,加快收敛
optimizer = optim.Adam(model.parameters(), lr=0.005)

# ==================== 7. 训练循环 ====================
num_epochs = 30   # 增加训练轮数,让损失充分下降
print("开始训练...")
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

# ==================== 8. 测试评估 ====================
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
accuracy = 100 * correct / total
print(f"\n测试集准确率: {accuracy:.2f}%")

# ==================== 9. 推理示例 ====================
def predict(text, model, max_len=200):
    model.eval()
    seq = text_to_sequence(text, max_len)
    input_tensor = torch.tensor([seq], dtype=torch.long).to(device)
    with torch.no_grad():
        logits = model(input_tensor)
        pred_idx = torch.argmax(logits, dim=1).item()
    return train_data.target_names[pred_idx]

sample_text = test_data.data[0]
true_label = test_data.target_names[test_data.target[0]]
pred_label = predict(sample_text, model)
print("\n推理示例:")
print(f"文本预览: {sample_text[:200]}...")
print(f"真实类别: {true_label}")
print(f"预测类别: {pred_label}")

输出:

(base) PS E:\Datawhale 2026\base-llm202602> & D:/Users/app/miniconda3/envs/base-llm/python.exe "e:/Datawhale 2026/base-llm202602/03_3_lstm_classifier_fixed.py"

正在加载 20newsgroups 数据...

训练样本数: 11314

测试样本数: 7532

类别数: 20

词表大小(含填充符0): 10001

正在转换训练集...

正在转换测试集...

第一条训练样本的有效词比例: 0.28

模型结构:

LSTMClassifier(

(embedding): Embedding(10001, 128, padding_idx=0)

(lstm): LSTM(128, 64, batch_first=True, bidirectional=True)

(dropout): Dropout(p=0.5, inplace=False)

(fc): Linear(in_features=128, out_features=20, bias=True)

)

使用设备: cuda

开始训练...

Epoch [1/30], Loss: 2.9771

Epoch [2/30], Loss: 2.8490

Epoch [3/30], Loss: 2.6860

Epoch [4/30], Loss: 2.6366

Epoch [5/30], Loss: 2.5454

Epoch [6/30], Loss: 2.4195

Epoch [7/30], Loss: 2.3379

Epoch [8/30], Loss: 2.2556

Epoch [9/30], Loss: 2.3179

Epoch [10/30], Loss: 2.3072

Epoch [11/30], Loss: 2.0003

Epoch [12/30], Loss: 1.7037

Epoch [13/30], Loss: 1.3988

Epoch [14/30], Loss: 1.1945

Epoch [15/30], Loss: 1.1195

Epoch [16/30], Loss: 0.9292

Epoch [17/30], Loss: 0.8037

Epoch [18/30], Loss: 0.7597

Epoch [19/30], Loss: 0.6750

Epoch [20/30], Loss: 0.6337

Epoch [21/30], Loss: 0.5436

Epoch [22/30], Loss: 0.4728

Epoch [23/30], Loss: 0.4769

Epoch [24/30], Loss: 0.5828

Epoch [25/30], Loss: 0.5785

Epoch [26/30], Loss: 0.4222

Epoch [27/30], Loss: 0.3580

Epoch [28/30], Loss: 0.3220

Epoch [29/30], Loss: 0.2826

Epoch [30/30], Loss: 0.2464

测试集准确率: 50.57%

推理示例:

文本预览: From: v064mb9k@ubvmsd.cc.buffalo.edu (NEIL B. GANDLER)

Subject: Need info on 88-89 Bonneville

Organization: University at Buffalo

Lines: 10

News-Software: VAX/VMS VNEWS 1.41

Nntp-Posting-Host: ubvmsd....

真实类别: rec.autos

预测类别: comp.sys.mac.hardware

📊 输出结果解释

1. 数据预处理

text
第一条训练样本的有效词比例: 0.28
  • 这意味着第一条训练样本中,有 28% 的词(56 个)是有效词(非填充符 0),其余是填充。这个比例正常,说明文本没有因为分词问题变成全零。

2. 模型结构

text
LSTMClassifier(
  (embedding): Embedding(10001, 128, padding_idx=0)
  (lstm): LSTM(128, 64, batch_first=True, bidirectional=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=128, out_features=20, bias=True)
)
  • 使用双向 LSTM(bidirectional=True),因此最后全连接层的输入维度是 hidden_dim * 2 = 128,正确。

3. 训练损失

text
Epoch [1/30], Loss: 2.9771
...
Epoch [30/30], Loss: 0.2464
  • 损失从 2.98 稳步下降到 0.25,说明模型在学习,并且拟合得不错(训练集上损失很低)。

4. 测试准确率

text
测试集准确率: 50.57%
  • 在 20 个类别中,随机猜对的概率是 5%,50.57% 意味着模型确实学到了很多东西,但还有很大提升空间。

5. 推理示例

text
真实类别: rec.autos
预测类别: comp.sys.mac.hardware
  • 模型把一篇关于汽车的文章误判为计算机硬件,说明它还没有完全理解这两个领域的细微差别,或者训练数据中这两个类别有相似之处。

🔍 为什么准确率只有 50%?

这是一个典型的过拟合现象:训练损失降到了 0.25(非常低),但测试准确率只有 50%,说明模型在训练集上表现得很好,但在未见过的测试集上泛化能力不足。可能的原因有:

  1. 缺少验证与模型选择:训练一口气跑满 30 个 epoch、只在最后评估一次,没有用验证集挑选最佳模型;双向 LSTM 的参数量并不小(注意 hidden_dim 是 64,双向拼接后才是 128),仅靠一层 Dropout 难以完全抑制对训练集的记忆。

  2. 超参数未调优:学习率、隐藏层大小、dropout 比例等可以进一步优化。

  3. 文本表示仍较简单:我们只用了简单的词 ID 序列,没有使用预训练的词向量(如 GloVe 或 fastText),也没有考虑词的重要性(如 TF-IDF 加权)。

  4. 序列长度截断:max_len=200 可能丢失了部分长文本的尾部信息,但影响不会太大。


🚀 改进建议

1. 增加训练轮数

当前 30 个 epoch 可能还不够,可以尝试 50 或 100 个 epoch,同时观察验证集损失(可以用一部分训练数据作为验证集),防止过拟合。

2. 使用验证集和早停(Early Stopping)

从训练集中分出一部分(比如 20%)作为验证集,在每个 epoch 后计算验证集准确率,当验证集准确率不再提升时停止训练,可以有效防止过拟合。

3. 调整 Dropout 比例

当前 Dropout=0.5,可以尝试增大到 0.6 或 0.7,或者增加 Dropout 的位置(例如在 embedding 后也加 Dropout)。

4. 使用预训练词向量

加载预训练的 GloVe 或 fastText 词向量作为 Embedding 的初始值,可以大大提升模型的泛化能力(尤其对于新闻文本)。
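下面是一个把预训练词向量灌入 Embedding 层的示意写法(假设已经把 GloVe 文件解析成 {词: 向量} 的字典 glove_dict,向量维度与 embed_dim 一致,且 model 已创建;glove_dict 的读取不在本文代码中):

python
import numpy as np
import torch

# 先用小随机数初始化整张嵌入表,能在 GloVe 中查到的词再用预训练向量覆盖
embedding_matrix = np.random.randn(vocab_size, embed_dim).astype(np.float32) * 0.01
for word, idx in word2idx.items():
    if word in glove_dict:
        embedding_matrix[idx] = glove_dict[word]
embedding_matrix[0] = 0.0          # 0 号位置是填充符,保持全零

model.embedding.weight.data.copy_(torch.from_numpy(embedding_matrix))
# 若希望词向量在训练中固定不动,可以再加:
# model.embedding.weight.requires_grad = False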

5. 调整 LSTM 参数

  • 增加 hidden_dim 到 128 或 256。

  • 增加 LSTM 层数(num_layers=2),但要注意过拟合风险。

6. 使用梯度裁剪(Gradient Clipping)
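用法是在 loss.backward() 之后、optimizer.step() 之前把梯度范数裁剪到一个上限,防止 LSTM 训练中偶发的梯度爆炸,例如:

python
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5)   # 把整体梯度范数限制在 5 以内
optimizer.step()

把以上几点(验证集 + 早停、梯度裁剪、更大的 Dropout、更多训练轮数)整合起来,得到下面的改进版脚本: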

python
"""
lstm_classifier_improved.py
改进版 LSTM 文本分类器(20newsgroups)
- 增加了验证集和早停机制
- 增加了梯度裁剪
- 调整了 Dropout 和训练轮数
- 保留了双向 LSTM 结构
"""

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import numpy as np

# ==================== 1. 加载数据 ====================
print("正在加载 20newsgroups 数据...")
train_data = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)
test_data = fetch_20newsgroups(subset='test', shuffle=True, random_state=42)

X_train_texts_full = train_data.data      # 原始训练文本(全部)
y_train_full = train_data.target          # 原始训练标签
X_test_texts = test_data.data
y_test = test_data.target

num_classes = len(train_data.target_names)
print(f"原始训练样本数: {len(X_train_texts_full)}")
print(f"测试样本数: {len(X_test_texts)}")
print(f"类别数: {num_classes}")

# ==================== 2. 划分训练集和验证集 ====================
# 从原始训练集中分出 20% 作为验证集
X_train_texts, X_val_texts, y_train, y_val = train_test_split(
    X_train_texts_full, y_train_full, test_size=0.2, random_state=42
)
print(f"训练集大小: {len(X_train_texts)}")
print(f"验证集大小: {len(X_val_texts)}")

# ==================== 3. 构建词表与分词器 ====================
max_features = 10000
# 词表必须在全部原始训练集上构建,以保证覆盖所有可能出现的词(验证集和测试集可能出现训练集没见过的词,但这是合理的)
vectorizer = CountVectorizer(max_features=max_features, stop_words='english')
vectorizer.fit(X_train_texts_full)   # 使用全部原始训练集构建词表

# 获取词表映射
feature_names = vectorizer.get_feature_names_out()
word2idx = {word: i+1 for i, word in enumerate(feature_names)}  # 0 留给填充和未知词
vocab_size = len(word2idx) + 1
print(f"词表大小(含填充符0): {vocab_size}")

# 获取与 CountVectorizer 一致的分词器
tokenizer = vectorizer.build_tokenizer()

# ==================== 4. 文本转整数序列 ====================
max_len = 200  # 统一序列长度

def text_to_sequence(text, max_len=200):
    """将文本转为整数ID序列,使用与词表一致的分词器"""
    tokens = tokenizer(text.lower())
    ids = [word2idx.get(token, 0) for token in tokens]   # 未知词用0
    # 截断或填充
    if len(ids) > max_len:
        ids = ids[:max_len]
    else:
        ids = ids + [0] * (max_len - len(ids))
    return ids

print("正在转换训练集...")
X_train_seq = np.array([text_to_sequence(text, max_len) for text in X_train_texts])
print("正在转换验证集...")
X_val_seq = np.array([text_to_sequence(text, max_len) for text in X_val_texts])
print("正在转换测试集...")
X_test_seq = np.array([text_to_sequence(text, max_len) for text in X_test_texts])

# 检查一下有效词比例(可选)
nonzero_ratio = np.count_nonzero(X_train_seq[0]) / max_len
print(f"第一条训练样本的有效词比例: {nonzero_ratio:.2f}")

# ==================== 5. 转换为 PyTorch 张量 ====================
X_train_tensor = torch.tensor(X_train_seq, dtype=torch.long)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_val_tensor = torch.tensor(X_val_seq, dtype=torch.long)
y_val_tensor = torch.tensor(y_val, dtype=torch.long)
X_test_tensor = torch.tensor(X_test_seq, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

batch_size = 64
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# ==================== 6. 定义 LSTM 模型 ====================
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes, num_layers=1, dropout=0.5):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        # 使用双向 LSTM
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, batch_first=True, bidirectional=True, dropout=dropout if num_layers>1 else 0)
        self.dropout = nn.Dropout(dropout)
        # 双向LSTM的输出维度是 hidden_dim * 2
        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, x):
        embedded = self.embedding(x)                     # (batch, seq_len, embed_dim)
        lstm_out, (h_n, c_n) = self.lstm(embedded)       # lstm_out: (batch, seq_len, hidden_dim*2)
        # 取最后一个时间步的输出作为整句表示
        last_hidden = lstm_out[:, -1, :]                  # (batch, hidden_dim*2)
        last_hidden = self.dropout(last_hidden)
        logits = self.fc(last_hidden)
        return logits

# 超参数设置
embed_dim = 128
hidden_dim = 64
num_layers = 1
dropout = 0.6        # 增加 dropout 防止过拟合
model = LSTMClassifier(vocab_size, embed_dim, hidden_dim, num_classes, num_layers, dropout)
print("模型结构:\n", model)

# ==================== 7. 训练准备 ====================
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(f"使用设备: {device}")

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.005)

# 早停相关参数
best_val_acc = 0.0
patience = 5         # 连续5个epoch验证准确率不提升则停止
patience_counter = 0
num_epochs = 50      # 最大训练轮数

# ==================== 8. 训练循环(含验证和早停) ====================
print("开始训练...")
for epoch in range(num_epochs):
    # 训练阶段
    model.train()
    total_loss = 0.0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        # 梯度裁剪(防止梯度爆炸)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5)
        optimizer.step()
        total_loss += loss.item()
    avg_train_loss = total_loss / len(train_loader)

    # 验证阶段
    model.eval()
    correct = 0
    total = 0
    val_loss = 0.0
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    val_acc = 100 * correct / total
    avg_val_loss = val_loss / len(val_loader)

    print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Val Acc: {val_acc:.2f}%")

    # 早停判断:保存最佳模型
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        patience_counter = 0
        # 保存最佳模型参数(注意:state_dict() 返回的是参数引用,这里复制一份快照,避免被后续训练覆盖)
        best_model_state = {k: v.detach().clone() for k, v in model.state_dict().items()}
        print(f"  -> 新的最佳验证准确率: {best_val_acc:.2f}%")
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"验证准确率连续 {patience} 个epoch未提升,提前停止训练。")
            break

# 加载最佳模型(用于测试)
model.load_state_dict(best_model_state)

# ==================== 9. 测试评估 ====================
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
test_acc = 100 * correct / total
print(f"\n测试集准确率: {test_acc:.2f}%")

# ==================== 10. 推理示例 ====================
def predict(text, model, max_len=200):
    model.eval()
    seq = text_to_sequence(text, max_len)
    input_tensor = torch.tensor([seq], dtype=torch.long).to(device)
    with torch.no_grad():
        logits = model(input_tensor)
        pred_idx = torch.argmax(logits, dim=1).item()
    return train_data.target_names[pred_idx]

sample_text = test_data.data[0]
true_label = train_data.target_names[test_data.target[0]]
pred_label = predict(sample_text, model)
print("\n推理示例:")
print(f"文本预览: {sample_text[:200]}...")
print(f"真实类别: {true_label}")
print(f"预测类别: {pred_label}")

# (可选)保存最终模型
# torch.save(model.state_dict(), "lstm_classifier_best.pth")
# print("最佳模型已保存为 lstm_classifier_best.pth")

输出:

(base) PS E:\Datawhale 2026\base-llm202602> & D:/Users/app/miniconda3/envs/base-llm/python.exe "e:/Datawhale 2026/base-llm202602/03_4_lstm_classifier_improved.py"

正在加载 20newsgroups 数据...

原始训练样本数: 11314

测试样本数: 7532

类别数: 20

训练集大小: 9051

验证集大小: 2263

词表大小(含填充符0): 10001

正在转换训练集...

正在转换验证集...

正在转换测试集...

第一条训练样本的有效词比例: 0.16

模型结构:

LSTMClassifier(

(embedding): Embedding(10001, 128, padding_idx=0)

(lstm): LSTM(128, 64, batch_first=True, bidirectional=True)

(dropout): Dropout(p=0.6, inplace=False)

(fc): Linear(in_features=128, out_features=20, bias=True)

)

使用设备: cuda

开始训练...

Epoch [1/50], Train Loss: 3.0008, Val Loss: 2.9961, Val Acc: 5.88%

-> 新的最佳验证准确率: 5.88%

Epoch [2/50], Train Loss: 2.9373, Val Loss: 2.9798, Val Acc: 8.13%

-> 新的最佳验证准确率: 8.13%

Epoch [3/50], Train Loss: 2.7867, Val Loss: 2.9330, Val Acc: 9.50%

-> 新的最佳验证准确率: 9.50%

Epoch [4/50], Train Loss: 2.6558, Val Loss: 2.9762, Val Acc: 9.59%

-> 新的最佳验证准确率: 9.59%

Epoch [5/50], Train Loss: 2.4854, Val Loss: 2.9485, Val Acc: 11.27%

-> 新的最佳验证准确率: 11.27%

Epoch [6/50], Train Loss: 2.3580, Val Loss: 2.9656, Val Acc: 12.15%

-> 新的最佳验证准确率: 12.15%

Epoch [7/50], Train Loss: 2.2207, Val Loss: 2.9712, Val Acc: 12.86%

-> 新的最佳验证准确率: 12.86%

Epoch [8/50], Train Loss: 2.1157, Val Loss: 2.8745, Val Acc: 15.73%

-> 新的最佳验证准确率: 15.73%

Epoch [9/50], Train Loss: 2.0042, Val Loss: 2.7749, Val Acc: 18.74%

-> 新的最佳验证准确率: 18.74%

Epoch [10/50], Train Loss: 1.9047, Val Loss: 2.8088, Val Acc: 20.15%

-> 新的最佳验证准确率: 20.15%

Epoch [11/50], Train Loss: 1.8215, Val Loss: 2.7210, Val Acc: 22.85%

-> 新的最佳验证准确率: 22.85%

Epoch [12/50], Train Loss: 1.7812, Val Loss: 2.7424, Val Acc: 21.70%

Epoch [13/50], Train Loss: 1.6840, Val Loss: 2.7344, Val Acc: 23.20%

-> 新的最佳验证准确率: 23.20%

Epoch [14/50], Train Loss: 1.6036, Val Loss: 2.7079, Val Acc: 24.88%

-> 新的最佳验证准确率: 24.88%

Epoch [15/50], Train Loss: 1.5676, Val Loss: 2.7765, Val Acc: 23.42%

Epoch [16/50], Train Loss: 1.5323, Val Loss: 2.7000, Val Acc: 25.89%

-> 新的最佳验证准确率: 25.89%

Epoch [17/50], Train Loss: 1.5011, Val Loss: 2.7656, Val Acc: 25.76%

Epoch [18/50], Train Loss: 1.4730, Val Loss: 2.6730, Val Acc: 27.31%

-> 新的最佳验证准确率: 27.31%

Epoch [19/50], Train Loss: 1.3944, Val Loss: 2.6543, Val Acc: 28.68%

-> 新的最佳验证准确率: 28.68%

Epoch [20/50], Train Loss: 1.3701, Val Loss: 2.7498, Val Acc: 27.04%

Epoch [21/50], Train Loss: 1.3178, Val Loss: 2.6921, Val Acc: 27.00%

Epoch [22/50], Train Loss: 1.3029, Val Loss: 2.7079, Val Acc: 28.37%

Epoch [23/50], Train Loss: 1.2716, Val Loss: 2.7714, Val Acc: 28.68%

Epoch [24/50], Train Loss: 1.2674, Val Loss: 2.7252, Val Acc: 29.03%

-> 新的最佳验证准确率: 29.03%

Epoch [25/50], Train Loss: 1.2675, Val Loss: 2.7603, Val Acc: 28.28%

Epoch [26/50], Train Loss: 1.2475, Val Loss: 2.7091, Val Acc: 29.03%

Epoch [27/50], Train Loss: 1.1894, Val Loss: 2.7516, Val Acc: 31.06%

-> 新的最佳验证准确率: 31.06%

Epoch [28/50], Train Loss: 1.1878, Val Loss: 2.7256, Val Acc: 30.76%

Epoch [29/50], Train Loss: 1.1927, Val Loss: 2.8200, Val Acc: 30.49%

Epoch [30/50], Train Loss: 1.1764, Val Loss: 2.7709, Val Acc: 30.89%

Epoch [31/50], Train Loss: 1.1317, Val Loss: 2.9031, Val Acc: 29.12%

Epoch [32/50], Train Loss: 1.1335, Val Loss: 2.8258, Val Acc: 30.40%

验证准确率连续 5 个epoch未提升,提前停止训练。

测试集准确率: 26.83%

推理示例:

文本预览: From: v064mb9k@ubvmsd.cc.buffalo.edu (NEIL B. GANDLER)

Subject: Need info on 88-89 Bonneville

Organization: University at Buffalo

Lines: 10

News-Software: VAX/VMS VNEWS 1.41

Nntp-Posting-Host: ubvmsd....

真实类别: rec.autos

预测类别: sci.med

从输出结果可以看出,模型仍处于欠拟合状态:训练损失下降缓慢,验证准确率最高只有 31%,测试准确率 26.8%;而且早停在第 32 个 epoch 就触发,此时训练损失还在缓慢下降,训练远未收敛。这说明模型还没有从数据中学到足够有效的分类特征。下面在保持 LSTM 模型结构不变的前提下,调整数据预处理和超参数来提升性能。

python
"""
lstm_classifier_tuned.py
调优版 LSTM 文本分类器(20newsgroups)
- 增大 max_len, max_features, hidden_dim
- 降低学习率,增大 patience
- 保持 LSTM 结构不变,仅优化超参数
"""

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import numpy as np

# ==================== 1. 加载数据 ====================
print("正在加载 20newsgroups 数据...")
train_data = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)
test_data = fetch_20newsgroups(subset='test', shuffle=True, random_state=42)

X_train_texts_full = train_data.data
y_train_full = train_data.target
X_test_texts = test_data.data
y_test = test_data.target

num_classes = len(train_data.target_names)
print(f"原始训练样本数: {len(X_train_texts_full)}")
print(f"测试样本数: {len(X_test_texts)}")
print(f"类别数: {num_classes}")

# ==================== 2. 划分训练集和验证集 ====================
X_train_texts, X_val_texts, y_train, y_val = train_test_split(
    X_train_texts_full, y_train_full, test_size=0.2, random_state=42
)
print(f"训练集大小: {len(X_train_texts)}")
print(f"验证集大小: {len(X_val_texts)}")

# ==================== 3. 构建词表与分词器(增大 max_features)====================
max_features = 20000   # 从 10000 增大到 20000
vectorizer = CountVectorizer(max_features=max_features, stop_words='english')
vectorizer.fit(X_train_texts_full)   # 使用全部原始训练集构建词表

feature_names = vectorizer.get_feature_names_out()
word2idx = {word: i+1 for i, word in enumerate(feature_names)}
vocab_size = len(word2idx) + 1
print(f"词表大小(含填充符0): {vocab_size}")

tokenizer = vectorizer.build_tokenizer()

# ==================== 4. 文本转整数序列(增大 max_len)====================
max_len = 400   # 从 200 增大到 400

def text_to_sequence(text, max_len=400):
    tokens = tokenizer(text.lower())
    ids = [word2idx.get(token, 0) for token in tokens]
    if len(ids) > max_len:
        ids = ids[:max_len]
    else:
        ids = ids + [0] * (max_len - len(ids))
    return ids

print("正在转换训练集...")
X_train_seq = np.array([text_to_sequence(text, max_len) for text in X_train_texts])
print("正在转换验证集...")
X_val_seq = np.array([text_to_sequence(text, max_len) for text in X_val_texts])
print("正在转换测试集...")
X_test_seq = np.array([text_to_sequence(text, max_len) for text in X_test_texts])

# 检查有效词比例
nonzero_ratio = np.count_nonzero(X_train_seq[0]) / max_len
print(f"第一条训练样本的有效词比例: {nonzero_ratio:.2f}")

# ==================== 5. 转换为 PyTorch 张量 ====================
X_train_tensor = torch.tensor(X_train_seq, dtype=torch.long)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_val_tensor = torch.tensor(X_val_seq, dtype=torch.long)
y_val_tensor = torch.tensor(y_val, dtype=torch.long)
X_test_tensor = torch.tensor(X_test_seq, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

batch_size = 64
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# ==================== 6. 定义 LSTM 模型(增大 hidden_dim)====================
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes, num_layers=1, dropout=0.5):
        super(LSTMClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        # 在 Embedding 后增加 Dropout(可选)
        self.embed_dropout = nn.Dropout(dropout)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, batch_first=True, 
                            bidirectional=True, dropout=dropout if num_layers>1 else 0)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        x = self.embed_dropout(x)          # 新增:embedding 后 dropout
        lstm_out, (h_n, c_n) = self.lstm(x)
        last_hidden = lstm_out[:, -1, :]
        last_hidden = self.dropout(last_hidden)
        logits = self.fc(last_hidden)
        return logits

# 超参数设置
embed_dim = 128
hidden_dim = 128          # 从 64 增大到 128
num_layers = 1
dropout = 0.6
model = LSTMClassifier(vocab_size, embed_dim, hidden_dim, num_classes, num_layers, dropout)
print("模型结构:\n", model)

# ==================== 7. 训练准备 ====================
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(f"使用设备: {device}")

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)   # 学习率从 0.005 降到 0.001

# 早停参数
best_val_acc = 0.0
patience = 10          # 从 5 增大到 10
patience_counter = 0
num_epochs = 100       # 最大训练轮数增加到 100

# ==================== 8. 训练循环 ====================
print("开始训练...")
for epoch in range(num_epochs):
    # 训练
    model.train()
    total_loss = 0.0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5)
        optimizer.step()
        total_loss += loss.item()
    avg_train_loss = total_loss / len(train_loader)

    # 验证
    model.eval()
    correct = 0
    total = 0
    val_loss = 0.0
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    val_acc = 100 * correct / total
    avg_val_loss = val_loss / len(val_loader)

    print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Val Acc: {val_acc:.2f}%")

    # 早停判断
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        patience_counter = 0
        # 复制一份参数快照(state_dict() 返回的是参数引用)
        best_model_state = {k: v.detach().clone() for k, v in model.state_dict().items()}
        print(f"  -> 新的最佳验证准确率: {best_val_acc:.2f}%")
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"验证准确率连续 {patience} 个epoch未提升,提前停止训练。")
            break

# 加载最佳模型
model.load_state_dict(best_model_state)

# ==================== 9. 测试评估 ====================
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
test_acc = 100 * correct / total
print(f"\n测试集准确率: {test_acc:.2f}%")

# ==================== 10. 推理示例 ====================
def predict(text, model, max_len=400):
    model.eval()
    seq = text_to_sequence(text, max_len)
    input_tensor = torch.tensor([seq], dtype=torch.long).to(device)
    with torch.no_grad():
        logits = model(input_tensor)
        pred_idx = torch.argmax(logits, dim=1).item()
    return train_data.target_names[pred_idx]

sample_text = test_data.data[0]
true_label = train_data.target_names[test_data.target[0]]
pred_label = predict(sample_text, model)
print("\n推理示例:")
print(f"文本预览: {sample_text[:200]}...")
print(f"真实类别: {true_label}")
print(f"预测类别: {pred_label}")

# 可选保存模型
# torch.save(model.state_dict(), "lstm_classifier_tuned.pth")

输出:

(base) PS E:\Datawhale 2026\base-llm202602> & D:/Users/app/miniconda3/envs/base-llm/python.exe "e:/Datawhale 2026/base-llm202602/03_5_lstm_classifier_tuned.py"

正在加载 20newsgroups 数据...

原始训练样本数: 11314

测试样本数: 7532

类别数: 20

训练集大小: 9051

验证集大小: 2263

词表大小(含填充符0): 20001

正在转换训练集...

正在转换验证集...

正在转换测试集...

第一条训练样本的有效词比例: 0.09

模型结构:

LSTMClassifier(

(embedding): Embedding(20001, 128, padding_idx=0)

(embed_dropout): Dropout(p=0.6, inplace=False)

(lstm): LSTM(128, 128, batch_first=True, bidirectional=True)

(dropout): Dropout(p=0.6, inplace=False)

(fc): Linear(in_features=256, out_features=20, bias=True)

)

使用设备: cuda

开始训练...

Epoch [1/100], Train Loss: 2.9934, Val Loss: 2.9925, Val Acc: 5.30%

-> 新的最佳验证准确率: 5.30%

Epoch [2/100], Train Loss: 2.9904, Val Loss: 2.9934, Val Acc: 5.04%

Epoch [3/100], Train Loss: 2.9851, Val Loss: 2.9895, Val Acc: 5.52%

-> 新的最佳验证准确率: 5.52%

Epoch [4/100], Train Loss: 2.9772, Val Loss: 2.9923, Val Acc: 5.92%

-> 新的最佳验证准确率: 5.92%

Epoch [5/100], Train Loss: 2.9688, Val Loss: 2.9891, Val Acc: 5.92%

Epoch [6/100], Train Loss: 2.9609, Val Loss: 3.0199, Val Acc: 5.66%

Epoch [7/100], Train Loss: 2.9602, Val Loss: 2.9890, Val Acc: 5.52%

Epoch [8/100], Train Loss: 2.9398, Val Loss: 2.9826, Val Acc: 5.92%

Epoch [9/100], Train Loss: 2.9257, Val Loss: 2.9847, Val Acc: 6.50%

-> 新的最佳验证准确率: 6.50%

Epoch [10/100], Train Loss: 2.9158, Val Loss: 2.9717, Val Acc: 6.05%

Epoch [11/100], Train Loss: 2.9083, Val Loss: 2.9788, Val Acc: 6.27%

Epoch [12/100], Train Loss: 2.8983, Val Loss: 2.9757, Val Acc: 5.74%

Epoch [13/100], Train Loss: 2.8955, Val Loss: 2.9722, Val Acc: 7.03%

-> 新的最佳验证准确率: 7.03%

Epoch [14/100], Train Loss: 2.8858, Val Loss: 2.9745, Val Acc: 6.54%

Epoch [15/100], Train Loss: 2.8636, Val Loss: 2.9735, Val Acc: 6.50%

Epoch [16/100], Train Loss: 2.8723, Val Loss: 2.9780, Val Acc: 6.45%

Epoch [17/100], Train Loss: 2.8414, Val Loss: 2.9722, Val Acc: 6.76%

Epoch [18/100], Train Loss: 2.8444, Val Loss: 2.9771, Val Acc: 6.72%

Epoch [19/100], Train Loss: 2.8255, Val Loss: 2.9778, Val Acc: 6.63%

Epoch [20/100], Train Loss: 2.8099, Val Loss: 2.9900, Val Acc: 6.89%

Epoch [21/100], Train Loss: 2.8012, Val Loss: 2.9472, Val Acc: 7.56%

-> 新的最佳验证准确率: 7.56%

Epoch [22/100], Train Loss: 2.7923, Val Loss: 2.9121, Val Acc: 8.53%

-> 新的最佳验证准确率: 8.53%

Epoch [23/100], Train Loss: 2.7379, Val Loss: 2.8737, Val Acc: 9.54%

-> 新的最佳验证准确率: 9.54%

Epoch [24/100], Train Loss: 2.7570, Val Loss: 2.8585, Val Acc: 7.78%

Epoch [25/100], Train Loss: 2.7018, Val Loss: 2.7924, Val Acc: 10.34%

-> 新的最佳验证准确率: 10.34%

Epoch [26/100], Train Loss: 2.6723, Val Loss: 2.7729, Val Acc: 10.30%

Epoch [27/100], Train Loss: 2.6525, Val Loss: 2.7687, Val Acc: 10.47%

-> 新的最佳验证准确率: 10.47%

Epoch [28/100], Train Loss: 2.6096, Val Loss: 2.7053, Val Acc: 12.77%

-> 新的最佳验证准确率: 12.77%

Epoch [29/100], Train Loss: 2.5758, Val Loss: 2.6341, Val Acc: 15.33%

-> 新的最佳验证准确率: 15.33%

Epoch [30/100], Train Loss: 2.5617, Val Loss: 2.6345, Val Acc: 14.94%

Epoch [31/100], Train Loss: 2.5537, Val Loss: 2.6234, Val Acc: 14.76%

Epoch [32/100], Train Loss: 2.5496, Val Loss: 2.5999, Val Acc: 15.25%

Epoch [33/100], Train Loss: 2.4813, Val Loss: 2.5597, Val Acc: 17.54%

-> 新的最佳验证准确率: 17.54%

Epoch [34/100], Train Loss: 2.4723, Val Loss: 2.5195, Val Acc: 17.32%

Epoch [35/100], Train Loss: 2.4294, Val Loss: 2.5008, Val Acc: 18.34%

-> 新的最佳验证准确率: 18.34%

Epoch [36/100], Train Loss: 2.3932, Val Loss: 2.4861, Val Acc: 20.19%

-> 新的最佳验证准确率: 20.19%

Epoch [37/100], Train Loss: 2.3806, Val Loss: 2.3994, Val Acc: 21.21%

-> 新的最佳验证准确率: 21.21%

Epoch [38/100], Train Loss: 2.3594, Val Loss: 2.4138, Val Acc: 19.97%

Epoch [39/100], Train Loss: 2.3324, Val Loss: 2.3612, Val Acc: 22.05%

-> 新的最佳验证准确率: 22.05%

Epoch [40/100], Train Loss: 2.3284, Val Loss: 2.3772, Val Acc: 21.87%

Epoch [41/100], Train Loss: 2.3104, Val Loss: 2.4241, Val Acc: 21.08%

Epoch [42/100], Train Loss: 2.2869, Val Loss: 2.3247, Val Acc: 22.62%

-> 新的最佳验证准确率: 22.62%

Epoch [43/100], Train Loss: 2.2690, Val Loss: 2.3552, Val Acc: 21.48%

Epoch [44/100], Train Loss: 2.2540, Val Loss: 2.2830, Val Acc: 22.98%

-> 新的最佳验证准确率: 22.98%

Epoch [45/100], Train Loss: 2.2201, Val Loss: 2.2846, Val Acc: 24.48%

-> 新的最佳验证准确率: 24.48%

Epoch [46/100], Train Loss: 2.2100, Val Loss: 2.2677, Val Acc: 25.01%

-> 新的最佳验证准确率: 25.01%

Epoch [47/100], Train Loss: 2.2050, Val Loss: 2.2163, Val Acc: 25.36%

-> 新的最佳验证准确率: 25.36%

Epoch [48/100], Train Loss: 2.1736, Val Loss: 2.2280, Val Acc: 24.66%

Epoch [49/100], Train Loss: 2.1752, Val Loss: 2.2007, Val Acc: 25.50%

-> 新的最佳验证准确率: 25.50%

Epoch [50/100], Train Loss: 2.1482, Val Loss: 2.1867, Val Acc: 25.01%

Epoch [51/100], Train Loss: 2.1357, Val Loss: 2.1761, Val Acc: 27.35%

-> 新的最佳验证准确率: 27.35%

Epoch [52/100], Train Loss: 2.1297, Val Loss: 2.1515, Val Acc: 27.35%

Epoch [53/100], Train Loss: 2.1202, Val Loss: 2.1627, Val Acc: 27.79%

-> 新的最佳验证准确率: 27.79%

Epoch [54/100], Train Loss: 2.1075, Val Loss: 2.1540, Val Acc: 26.69%

Epoch [55/100], Train Loss: 2.0704, Val Loss: 2.1215, Val Acc: 30.05%

-> 新的最佳验证准确率: 30.05%

Epoch [56/100], Train Loss: 2.0713, Val Loss: 2.1024, Val Acc: 28.24%

Epoch [57/100], Train Loss: 2.0405, Val Loss: 2.1385, Val Acc: 28.06%

Epoch [58/100], Train Loss: 2.0408, Val Loss: 2.1059, Val Acc: 28.02%

Epoch [59/100], Train Loss: 2.0342, Val Loss: 2.0812, Val Acc: 29.43%

Epoch [60/100], Train Loss: 2.0772, Val Loss: 2.0958, Val Acc: 28.46%

Epoch [61/100], Train Loss: 2.0226, Val Loss: 2.0878, Val Acc: 28.81%

Epoch [62/100], Train Loss: 2.0026, Val Loss: 2.0587, Val Acc: 29.47%

Epoch [63/100], Train Loss: 1.9920, Val Loss: 2.0425, Val Acc: 30.53%

-> 新的最佳验证准确率: 30.53%

Epoch [64/100], Train Loss: 1.9639, Val Loss: 2.0443, Val Acc: 31.29%

-> 新的最佳验证准确率: 31.29%

Epoch [65/100], Train Loss: 1.9620, Val Loss: 2.0068, Val Acc: 32.17%

-> 新的最佳验证准确率: 32.17%

Epoch [66/100], Train Loss: 1.9541, Val Loss: 2.0236, Val Acc: 32.21%

-> 新的最佳验证准确率: 32.21%

Epoch [67/100], Train Loss: 1.9310, Val Loss: 1.9914, Val Acc: 32.30%

-> 新的最佳验证准确率: 32.30%

Epoch [68/100], Train Loss: 1.9216, Val Loss: 1.9637, Val Acc: 33.01%

-> 新的最佳验证准确率: 33.01%

Epoch [69/100], Train Loss: 1.8879, Val Loss: 2.0479, Val Acc: 33.36%

-> 新的最佳验证准确率: 33.36%

Epoch [70/100], Train Loss: 1.8780, Val Loss: 1.9131, Val Acc: 34.56%

-> 新的最佳验证准确率: 34.56%

Epoch [71/100], Train Loss: 1.8952, Val Loss: 1.9649, Val Acc: 35.00%

-> 新的最佳验证准确率: 35.00%

Epoch [72/100], Train Loss: 1.8307, Val Loss: 1.8973, Val Acc: 35.04%

-> 新的最佳验证准确率: 35.04%

Epoch [73/100], Train Loss: 1.8079, Val Loss: 1.8552, Val Acc: 36.63%

-> 新的最佳验证准确率: 36.63%

Epoch [74/100], Train Loss: 1.7991, Val Loss: 1.8432, Val Acc: 36.94%

-> 新的最佳验证准确率: 36.94%

Epoch [75/100], Train Loss: 1.7814, Val Loss: 1.8238, Val Acc: 37.47%

-> 新的最佳验证准确率: 37.47%

Epoch [76/100], Train Loss: 1.7588, Val Loss: 1.7813, Val Acc: 38.40%

-> 新的最佳验证准确率: 38.40%

Epoch [77/100], Train Loss: 1.7229, Val Loss: 1.7557, Val Acc: 39.06%

-> 新的最佳验证准确率: 39.06%

Epoch [78/100], Train Loss: 1.7006, Val Loss: 1.7336, Val Acc: 40.30%

-> 新的最佳验证准确率: 40.30%

Epoch [79/100], Train Loss: 1.6861, Val Loss: 1.7128, Val Acc: 39.86%

Epoch [80/100], Train Loss: 1.6666, Val Loss: 1.7038, Val Acc: 40.48%

-> 新的最佳验证准确率: 40.48%

Epoch [81/100], Train Loss: 1.6702, Val Loss: 1.7240, Val Acc: 41.41%

-> 新的最佳验证准确率: 41.41%

Epoch [82/100], Train Loss: 1.6383, Val Loss: 1.6604, Val Acc: 41.45%

-> 新的最佳验证准确率: 41.45%

Epoch [83/100], Train Loss: 1.6507, Val Loss: 1.6693, Val Acc: 41.32%

Epoch [84/100], Train Loss: 1.6065, Val Loss: 1.6255, Val Acc: 41.80%

-> 新的最佳验证准确率: 41.80%

Epoch [85/100], Train Loss: 1.5862, Val Loss: 1.6410, Val Acc: 42.38%

-> 新的最佳验证准确率: 42.38%

Epoch [86/100], Train Loss: 1.5491, Val Loss: 1.5925, Val Acc: 44.41%

-> 新的最佳验证准确率: 44.41%

Epoch [87/100], Train Loss: 1.5447, Val Loss: 1.5986, Val Acc: 43.39%

Epoch [88/100], Train Loss: 1.5309, Val Loss: 1.5973, Val Acc: 44.68%

-> 新的最佳验证准确率: 44.68%

Epoch [89/100], Train Loss: 1.5058, Val Loss: 1.5403, Val Acc: 45.43%

-> 新的最佳验证准确率: 45.43%

Epoch [90/100], Train Loss: 1.5130, Val Loss: 1.5496, Val Acc: 44.59%

Epoch [91/100], Train Loss: 1.4709, Val Loss: 1.5136, Val Acc: 45.38%

Epoch [92/100], Train Loss: 1.4614, Val Loss: 1.5028, Val Acc: 46.49%

-> 新的最佳验证准确率: 46.49%

Epoch [93/100], Train Loss: 1.4471, Val Loss: 1.4874, Val Acc: 47.24%

-> 新的最佳验证准确率: 47.24%

Epoch [94/100], Train Loss: 1.4297, Val Loss: 1.4721, Val Acc: 46.88%

Epoch [95/100], Train Loss: 1.4230, Val Loss: 1.4720, Val Acc: 47.06%

Epoch [96/100], Train Loss: 1.4120, Val Loss: 1.4906, Val Acc: 47.95%

-> 新的最佳验证准确率: 47.95%

Epoch [97/100], Train Loss: 1.4216, Val Loss: 1.4673, Val Acc: 48.30%

-> 新的最佳验证准确率: 48.30%

Epoch [98/100], Train Loss: 1.3817, Val Loss: 1.4378, Val Acc: 48.43%

-> 新的最佳验证准确率: 48.43%

Epoch [99/100], Train Loss: 1.3652, Val Loss: 1.4280, Val Acc: 48.87%

-> 新的最佳验证准确率: 48.87%

Epoch [100/100], Train Loss: 1.3617, Val Loss: 1.4077, Val Acc: 50.33%

-> 新的最佳验证准确率: 50.33%

测试集准确率: 43.75%

推理示例:

文本预览: From: v064mb9k@ubvmsd.cc.buffalo.edu (NEIL B. GANDLER)

Subject: Need info on 88-89 Bonneville

Organization: University at Buffalo

Lines: 10

News-Software: VAX/VMS VNEWS 1.41

Nntp-Posting-Host: ubvmsd....

真实类别: rec.autos

预测类别: rec.sport.baseball


模型训练了100个epoch,验证准确率最终达到50.33%,但测试准确率只有43.75%,且推理示例将汽车类新闻误判为棒球类。这说明:

  • 模型已经学到了一些模式:验证准确率从5%逐渐上升到50%,证明LSTM确实在捕捉文本特征。

  • 泛化仍有差距:验证准确率到第 100 个 epoch 还在上升(最后 10 个 epoch 从约 47% 升到 50%),说明训练其实还没收敛;同时测试准确率(43.75%)明显低于最佳验证准确率(50.33%),说明按验证集挑出来的模型在全新数据上仍要打折扣,存在一定的过拟合/泛化差距。

  • 推理错误:将汽车相关新闻误判为棒球,可能是因为 rec.autos 与 rec.sport.baseball 同属 rec.* 讨论区、用词有一定重合,也可能是模型还没有学到足够有区分度的关键词。


参考老师的代码,并与自己的实现对比了一下:

一、核心差异

1. 正确处理变长序列(pack_padded_sequence)

  • 问题:之前的代码直接将填充后的序列输入 LSTM,没有使用 pack_padded_sequence。这会导致 LSTM 对填充部分也进行计算,污染隐藏状态,尤其在提取最后一个时间步的输出时(lstm_out[:, -1, :]),如果最后一个词是填充,则取到的是无意义的向量。

  • 参考资料方案:通过 collate_fn 返回每个样本的真实长度 lengths,在模型 forward 中使用 pack_padded_sequence 打包,让 LSTM 只对真实序列进行计算,最后从打包后的输出中提取最终隐藏状态(使用 hidden 而非 lstm_out)。这才是 LSTM 的标准用法。

2. 正确提取 LSTM 最终隐藏状态

  • 问题:使用了 lstm_out[:, -1, :] 提取最后一个时间步的输出。对于单向 LSTM,如果序列末尾是填充,这个位置取到的可能是 0 或噪声;对于双向 LSTM,lstm_out[:, -1, :] 的反向部分只是反向 LSTM 刚读入最后一个词时的状态(相当于反向的第一步),并不能代表整个序列的语义。

  • 参考资料方案:使用 LSTM 返回的 hidden 状态:

    • 单向 LSTM:hidden[-1, :, :] 是最后一层最后一个时间步的隐藏状态(真正的序列终点)。

    • 双向 LSTM:拼接 hidden[-2, :, :](正向最后)和 hidden[-1, :, :](反向最后),得到整个序列的完整表示。

3. 正则化策略的具体实现

  • 随机 Token 遮盖:之前的代码没有做数据增强。参考资料给出了一个继承自 Dataset 的子类,在训练时随机将部分词替换为 <UNK>,防止模型过度依赖个别关键词,提升泛化能力。

  • 提前停止:虽然实现了早停,但参考资料中的实现更规范(继承 Trainer,保存最佳模型,监控验证准确率)。

  • Dropout 的合理位置:模型只在最后的隐藏状态后加了 Dropout,但参考资料中在 LSTM 层间也启用了 Dropout(设置 num_layers>1 时通过 LSTM 的 dropout 参数开启),这能更好地防止过拟合。

4. 实验对比的启示

参考资料通过对比实验得出一个重要的结论:对于新闻分类这种关键词驱动的任务,简单的全连接模型(词袋)可能已经足够好,LSTM 的序列建模能力并未带来优势,甚至可能因过拟合而表现更差。这解释了为什么原先的 LSTM 准确率始终无法达到全连接模型的水平:即使正确实现,也可能只是略低于全连接模型(参考资料中正则化后 LSTM 为 84.15%,全连接为 84.69%)。所以,不要对 LSTM 的绝对准确率抱有过高期望,但正确实现是前提。
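针对上面第 1、2 点,可以用一个极小的示例直观感受 pack_padded_sequence 与 hidden 的配合方式(数值随意,仅为示意):

python
import torch
import torch.nn as nn

# 两条变长序列:真实长度分别为 3 和 2,已右侧填充到 3
x = torch.randn(2, 3, 4)                     # (batch=2, seq_len=3, feature=4)
lengths = torch.tensor([3, 2])

lstm = nn.LSTM(input_size=4, hidden_size=5, batch_first=True, bidirectional=True)

packed = nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
packed_out, (hidden, cell) = lstm(packed)    # LSTM 只计算每条序列的真实长度部分

# hidden 形状: (num_layers * num_directions, batch, hidden_dim) = (2, 2, 5)
# hidden[-2] 是正向读完整句后的状态,hidden[-1] 是反向读完整句后的状态,拼接后作为整句表示
sentence_repr = torch.cat([hidden[-2], hidden[-1]], dim=1)
print(sentence_repr.shape)                   # torch.Size([2, 10])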

二、之前代码的主要问题总结

(每项依次为:问题 / 原先的实现 / 正确做法,参考材料)

  • 变长序列处理:原先未使用 pack_padded_sequence,直接输入填充后的序列;正确做法是使用 pack_padded_sequence 打包,避免填充干扰。
  • 最终隐藏状态提取:原先使用 lstm_out[:, -1, :];正确做法是使用 hidden 状态,并根据是否双向进行拼接。
  • 序列长度传递:原先未把 lengths 传给模型;正确做法是 collate_fn 返回 lengths,模型 forward 接收 lengths。
  • 数据增强:原先没有做;正确做法是随机 Token 遮盖。
  • Dropout 位置:原先仅放在最后的全连接层之前;正确做法是在 LSTM 层间也加 Dropout(num_layers>1 时)。
  • 训练/推理时长度处理:原先未传递长度、未使用 pack;正确做法是训练和推理时均传递长度并使用 pack。

正是这些细节的缺失,导致模型只学到了皮毛,验证准确率仅 50% 左右,而参考材料中正确实现的 LSTM 可达 84%。

代码关键点说明

  1. 数据集返回原始序列:TextClassificationDatasetRaw 返回未填充的 ID 序列,由 collate_fn 统一填充并记录长度。

  2. collate_fn:计算批次最大长度,在右侧填充,并返回 lengths 张量(真实长度)。

  3. LSTM 模型:

    • 使用 pack_padded_sequence 打包,避免填充影响。

    • 从 hidden 中提取最终隐藏状态:双向时拼接 hidden[-2](正向最后一层)和 hidden[-1](反向最后一层)。

    • 设置 batch_first=True,输入形状为 (batch, seq, feature)。

  4. 随机 Token 遮盖:仅在训练集启用,随机将部分词(非填充)替换为 <UNK> ID(此处设为 1),防止过拟合。

  5. 梯度裁剪:clip_grad_norm_ 防止梯度爆炸。

  6. 提前停止:监控验证集准确率,连续 patience 轮未提升则停止,并保存最佳模型。

  7. L2 正则化:优化器中使用 weight_decay。
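按上面的关键点,Dataset、DataLoader 与优化器大致可以这样组装(仅为示意,batch_size、weight_decay 等数值是假设的,并假设 model 已按后文第 7 节创建;具体以正文脚本为准):

python
# 训练集启用随机 Token 遮盖;DataLoader 通过 collate_fn 完成按批填充并返回 lengths
train_dataset = TextClassificationDatasetRaw(X_train_texts, y_train, max_len=400, is_train=True, mask_prob=0.1)
val_dataset = TextClassificationDatasetRaw(X_val_texts, y_val, max_len=400, is_train=False)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)

# 第 7 点的 L2 正则化:在优化器里加 weight_decay(数值为假设)
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)

完整代码如下: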

python
"""
lstm_text_classifier_complete.py
完整版 LSTM 文本分类器(20newsgroups)
- 使用 pack_padded_sequence 处理变长序列
- 正确提取 LSTM 最终隐藏状态(支持双向)
- 训练集随机 Token 遮盖(数据增强)
- Dropout 正则化
- 提前停止(Early Stopping)
- 输出验证集最佳模型,并在测试集上评估
"""

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import random
import os

# ==================== 1. 设置随机种子(保证可复现) ====================
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)

# ==================== 2. 加载数据 ====================
print("正在加载 20newsgroups 数据...")
train_data = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)
test_data = fetch_20newsgroups(subset='test', shuffle=True, random_state=42)

X_train_texts_full = train_data.data
y_train_full = train_data.target
X_test_texts = test_data.data
y_test = test_data.target

num_classes = len(train_data.target_names)
print(f"原始训练样本数: {len(X_train_texts_full)}")
print(f"测试样本数: {len(X_test_texts)}")
print(f"类别数: {num_classes}")

# ==================== 3. 划分训练集和验证集 ====================
X_train_texts, X_val_texts, y_train, y_val = train_test_split(
    X_train_texts_full, y_train_full, test_size=0.2, random_state=42
)
print(f"训练集大小: {len(X_train_texts)}")
print(f"验证集大小: {len(X_val_texts)}")

# ==================== 4. 构建词表与文本向量化 ====================
max_features = 20000
vectorizer = CountVectorizer(max_features=max_features, stop_words='english')
vectorizer.fit(X_train_texts_full)   # 使用全部训练文本构建词表

# 词表映射:词 -> 索引(从1开始,0留给填充符/未知词)
word2idx = {word: i+1 for i, word in enumerate(vectorizer.get_feature_names_out())}
vocab_size = len(word2idx) + 1
print(f"词表大小(含填充符0): {vocab_size}")

# 获取分词器(与 CountVectorizer 一致)
tokenizer = vectorizer.build_tokenizer()

def text_to_sequence(text, max_len=400):
    """将文本转为整数ID序列(填充/截断到固定长度)"""
    tokens = tokenizer(text.lower())
    ids = [word2idx.get(token, 0) for token in tokens]   # 未知词用0
    if len(ids) > max_len:
        ids = ids[:max_len]
    else:
        ids = ids + [0] * (max_len - len(ids))
    return ids

# ==================== 5. 定义数据集类(支持随机 Token 遮盖) ====================
# 说明:由于未知词也被映射为 0,无法从「填充后序列末尾的 0」可靠地推断真实长度,
# 因此 Dataset 只负责「分词 -> 映射为 ID -> 截断 ->(训练时)随机遮盖」,返回未填充的变长序列;
# 统一填充和真实长度 lengths 的记录放到 collate_fn 中完成,这样 pack_padded_sequence 才能拿到每个样本的真实长度。
class TextClassificationDatasetRaw(Dataset):
    def __init__(self, texts, labels, max_len=400, is_train=False, mask_prob=0.1):
        self.texts = texts
        self.labels = labels
        self.max_len = max_len          # 用于截断,但不填充
        self.is_train = is_train
        self.mask_prob = mask_prob
        self.unk_token_id = 1

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        tokens = tokenizer(text.lower())
        # 截断到 max_len(不填充)
        token_ids = [word2idx.get(token, 0) for token in tokens][:self.max_len]

        if self.is_train and self.mask_prob > 0:
            # 随机遮盖
            token_ids = token_ids.copy()
            for i, tid in enumerate(token_ids):
                if tid != 0 and random.random() < self.mask_prob:
                    token_ids[i] = self.unk_token_id

        return {
            "token_ids": token_ids,   # 原始长度,未填充
            "label": label
        }

def collate_fn(batch):
    # 找出批次中的最大长度
    max_len_in_batch = max(len(item["token_ids"]) for item in batch)

    token_ids_padded = []
    labels = []
    lengths = []

    for item in batch:
        ids = item["token_ids"]
        lengths.append(len(ids))
        # 右侧填充 0 到 max_len_in_batch
        padding_len = max_len_in_batch - len(ids)
        padded_ids = ids + [0] * padding_len
        token_ids_padded.append(padded_ids)
        labels.append(item["label"])

    return {
        "token_ids": torch.tensor(token_ids_padded, dtype=torch.long),
        "labels": torch.tensor(labels, dtype=torch.long),
        "lengths": torch.tensor(lengths, dtype=torch.long)
    }

# ==================== 7. 定义 LSTM 模型(正确使用 pack_padded_sequence)====================
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes,
                 n_layers=2, dropout=0.3, bidirectional=True):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(
            embed_dim, hidden_dim, n_layers,
            batch_first=True,
            bidirectional=bidirectional,
            dropout=dropout if n_layers > 1 else 0
        )
        self.dropout = nn.Dropout(dropout)
        num_directions = 2 if bidirectional else 1
        self.fc = nn.Linear(hidden_dim * num_directions, num_classes)

    def forward(self, token_ids, lengths):
        # token_ids: (batch, seq_len)
        embedded = self.embedding(token_ids)          # (batch, seq_len, embed_dim)

        # 打包变长序列
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths.cpu(), batch_first=True, enforce_sorted=False
        )

        # LSTM 前向
        packed_output, (hidden, cell) = self.lstm(packed)

        # hidden 形状: (n_layers * num_directions, batch, hidden_dim)
        if self.lstm.bidirectional:
            # 拼接正向最后一层和反向最后一层
            hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)  # (batch, hidden_dim*2)
        else:
            hidden = hidden[-1, :, :]                  # (batch, hidden_dim)

        hidden = self.dropout(hidden)
        logits = self.fc(hidden)                       # (batch, num_classes)
        return logits

# ==================== 8. 定义训练函数(含提前停止)====================
def train_model(model, train_loader, val_loader, criterion, optimizer,
                device, epochs=50, patience=5, output_dir="./output_lstm"):
    os.makedirs(output_dir, exist_ok=True)
    best_val_acc = 0.0
    patience_counter = 0
    best_model_path = os.path.join(output_dir, "best_model.pth")

    for epoch in range(1, epochs+1):
        # 训练阶段
        model.train()
        train_loss = 0.0
        for batch in train_loader:
            token_ids = batch["token_ids"].to(device)
            labels = batch["labels"].to(device)
            lengths = batch["lengths"]                # 保持在 CPU 上(pack_padded_sequence 需要)

            optimizer.zero_grad()
            outputs = model(token_ids, lengths)
            loss = criterion(outputs, labels)
            loss.backward()
            # 梯度裁剪(防止梯度爆炸)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5)
            optimizer.step()
            train_loss += loss.item()

        avg_train_loss = train_loss / len(train_loader)

        # 验证阶段
        model.eval()
        correct = 0
        total = 0
        val_loss = 0.0
        with torch.no_grad():
            for batch in val_loader:
                token_ids = batch["token_ids"].to(device)
                labels = batch["labels"].to(device)
                lengths = batch["lengths"]
                outputs = model(token_ids, lengths)
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                _, predicted = torch.max(outputs, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        val_acc = 100 * correct / total
        avg_val_loss = val_loss / len(val_loader)

        print(f"Epoch {epoch:2d}/{epochs} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | Val Acc: {val_acc:.2f}%")

        # 早停判断
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            patience_counter = 0
            torch.save(model.state_dict(), best_model_path)
            print(f"  -> 新的最佳验证准确率!模型已保存。")
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"验证准确率连续 {patience} 轮未提升,提前停止训练。")
                break

    # 加载最佳模型
    model.load_state_dict(torch.load(best_model_path))
    print(f"训练完成,最佳验证准确率: {best_val_acc:.2f}%")
    return model

# ==================== 9. 主程序 ====================
def main():
    # 超参数
    embed_dim = 128
    hidden_dim = 128
    n_layers = 2
    dropout = 0.3
    bidirectional = True
    batch_size = 64
    max_len = 400
    learning_rate = 0.001
    epochs = 50
    patience = 5
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"使用设备: {device}")

    # 创建数据集和数据加载器
    train_dataset = TextClassificationDatasetRaw(X_train_texts, y_train, max_len, is_train=True, mask_prob=0.1)
    val_dataset = TextClassificationDatasetRaw(X_val_texts, y_val, max_len, is_train=False)
    test_dataset = TextClassificationDatasetRaw(X_test_texts, y_test, max_len, is_train=False)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

    # 初始化模型
    model = LSTMClassifier(
        vocab_size=vocab_size,
        embed_dim=embed_dim,
        hidden_dim=hidden_dim,
        num_classes=num_classes,
        n_layers=n_layers,
        dropout=dropout,
        bidirectional=bidirectional
    ).to(device)

    # 损失函数和优化器
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-4)   # 加入 L2 正则化

    # 训练
    model = train_model(model, train_loader, val_loader, criterion, optimizer,
                        device, epochs, patience, output_dir="./output_lstm_final")

    # 测试评估
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in test_loader:
            token_ids = batch["token_ids"].to(device)
            labels = batch["labels"].to(device)
            lengths = batch["lengths"]
            outputs = model(token_ids, lengths)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    test_acc = 100 * correct / total
    print(f"\n测试集准确率: {test_acc:.2f}%")

    # 推理示例
    def predict(text):
        model.eval()
        # 转为原始序列(不填充)
        tokens = tokenizer(text.lower())
        ids = [word2idx.get(token, 0) for token in tokens][:max_len]
        # 构造批次(batch_size=1)
        input_tensor = torch.tensor([ids], dtype=torch.long).to(device)
        length_tensor = torch.tensor([len(ids)], dtype=torch.long)
        with torch.no_grad():
            logits = model(input_tensor, length_tensor)
            pred_idx = torch.argmax(logits, dim=1).item()
        return train_data.target_names[pred_idx]

    sample_text = test_data.data[0]
    true_label = test_data.target_names[test_data.target[0]]
    pred_label = predict(sample_text)
    print("\n推理示例:")
    print(f"文本预览: {sample_text[:200]}...")
    print(f"真实类别: {true_label}")
    print(f"预测类别: {pred_label}")

if __name__ == "__main__":
    main()

输出:

(base) PS E:\Datawhale 2026\base-llm202602> & D:/Users/app/miniconda3/envs/base-llm/python.exe "e:/Datawhale 2026/base-llm202602/03_6_lstm_text_classifier_complete.py"

正在加载 20newsgroups 数据...

原始训练样本数: 11314

测试样本数: 7532

类别数: 20

训练集大小: 9051

验证集大小: 2263

词表大小(含填充符0): 20001

使用设备: cuda

Epoch 1/50 | Train Loss: 2.8804 | Val Loss: 2.6584 | Val Acc: 19.62%

-> 新的最佳验证准确率!模型已保存。

Epoch 2/50 | Train Loss: 2.2695 | Val Loss: 1.9502 | Val Acc: 41.71%

-> 新的最佳验证准确率!模型已保存。

Epoch 3/50 | Train Loss: 1.6842 | Val Loss: 1.6540 | Val Acc: 51.26%

-> 新的最佳验证准确率!模型已保存。

Epoch 4/50 | Train Loss: 1.3165 | Val Loss: 1.4273 | Val Acc: 57.71%

-> 新的最佳验证准确率!模型已保存。

Epoch 5/50 | Train Loss: 1.0305 | Val Loss: 1.2409 | Val Acc: 63.37%

-> 新的最佳验证准确率!模型已保存。

Epoch 6/50 | Train Loss: 0.8248 | Val Loss: 1.0151 | Val Acc: 70.92%

-> 新的最佳验证准确率!模型已保存。

Epoch 7/50 | Train Loss: 0.6379 | Val Loss: 0.9958 | Val Acc: 72.03%

-> 新的最佳验证准确率!模型已保存。

Epoch 8/50 | Train Loss: 0.5360 | Val Loss: 0.8986 | Val Acc: 74.50%

-> 新的最佳验证准确率!模型已保存。

Epoch 9/50 | Train Loss: 0.4318 | Val Loss: 0.8365 | Val Acc: 77.46%

-> 新的最佳验证准确率!模型已保存。

Epoch 10/50 | Train Loss: 0.3474 | Val Loss: 0.8558 | Val Acc: 77.95%

-> 新的最佳验证准确率!模型已保存。

Epoch 11/50 | Train Loss: 0.3088 | Val Loss: 0.8664 | Val Acc: 77.77%

Epoch 12/50 | Train Loss: 0.2585 | Val Loss: 0.8874 | Val Acc: 77.51%

Epoch 13/50 | Train Loss: 0.2270 | Val Loss: 0.7930 | Val Acc: 79.36%

-> 新的最佳验证准确率!模型已保存。

Epoch 14/50 | Train Loss: 0.1984 | Val Loss: 0.8261 | Val Acc: 80.34%

-> 新的最佳验证准确率!模型已保存。

Epoch 15/50 | Train Loss: 0.1794 | Val Loss: 0.8356 | Val Acc: 79.85%

Epoch 16/50 | Train Loss: 0.1738 | Val Loss: 0.8082 | Val Acc: 81.40%

-> 新的最佳验证准确率!模型已保存。

Epoch 17/50 | Train Loss: 0.1404 | Val Loss: 0.8216 | Val Acc: 81.62%

-> 新的最佳验证准确率!模型已保存。

Epoch 18/50 | Train Loss: 0.1419 | Val Loss: 0.7745 | Val Acc: 82.50%

-> 新的最佳验证准确率!模型已保存。

Epoch 19/50 | Train Loss: 0.1236 | Val Loss: 0.7664 | Val Acc: 83.65%

-> 新的最佳验证准确率!模型已保存。

Epoch 20/50 | Train Loss: 0.1149 | Val Loss: 0.8000 | Val Acc: 82.63%

Epoch 21/50 | Train Loss: 0.1133 | Val Loss: 0.7914 | Val Acc: 83.69%

-> 新的最佳验证准确率!模型已保存。

Epoch 22/50 | Train Loss: 0.1037 | Val Loss: 0.8187 | Val Acc: 81.88%

Epoch 23/50 | Train Loss: 0.1065 | Val Loss: 0.7871 | Val Acc: 82.85%

Epoch 24/50 | Train Loss: 0.0891 | Val Loss: 0.7629 | Val Acc: 84.71%

-> 新的最佳验证准确率!模型已保存。

Epoch 25/50 | Train Loss: 0.0962 | Val Loss: 0.7454 | Val Acc: 83.87%

Epoch 26/50 | Train Loss: 0.0957 | Val Loss: 0.7652 | Val Acc: 83.16%

Epoch 27/50 | Train Loss: 0.0974 | Val Loss: 0.8072 | Val Acc: 83.08%

Epoch 28/50 | Train Loss: 0.0874 | Val Loss: 0.7594 | Val Acc: 83.78%

Epoch 29/50 | Train Loss: 0.0974 | Val Loss: 0.7651 | Val Acc: 83.87%

验证准确率连续 5 轮未提升,提前停止训练。

训练完成,最佳验证准确率: 84.71%

测试集准确率: 72.24%

推理示例:

文本预览: From: v064mb9k@ubvmsd.cc.buffalo.edu (NEIL B. GANDLER)

Subject: Need info on 88-89 Bonneville

Organization: University at Buffalo

Lines: 10

News-Software: VAX/VMS VNEWS 1.41

Nntp-Posting-Host: ubvmsd....

真实类别: rec.autos

预测类别: rec.motorcycles


📊 输出结果解读

1. 训练过程

  • 验证准确率稳步提升 :从第1轮的 19.62% 一直上升到第24轮的 84.71%,说明模型在学习,而且没有出现严重的过拟合(验证损失整体呈下降趋势)。

  • 早停触发:在第29轮因为验证准确率连续5轮未提升而停止,保存的最佳模型是第24轮的 84.71%。这表明早停机制发挥了作用,防止了无效训练。

  • 训练损失:从 2.88 降到 0.09,说明模型在训练集上拟合得很好(甚至可能有点过拟合,但早停控制住了)。

2. 测试结果

  • 测试准确率 72.24%:比最佳验证准确率(84.71%)低约 12 个百分点,这是正常现象,因为测试集是模型从未见过的新数据。72% 的准确率对于 20 类分类任务来说已经是不错的成绩,尤其是在没有使用预训练词向量的情况下。

  • 推理示例

    • 文本是关于 88-89 年庞蒂亚克 Bonneville 汽车的咨询。

    • 真实类别:rec.autos(汽车)

    • 预测类别:rec.motorcycles(摩托车)

    • 这两个类别都属于"交通工具"大类,模型至少判断对了大方向。可能的原因是文本中某些词(如 "Need info" 或年份数字)与摩托车类别中的某些帖子相似,导致误判。

3. 与参考资料对比

  • 参考资料中,经过正则化的 LSTM 验证准确率约为 84.15%;本文的模型达到了 84.71%,甚至略高一点。

  • 测试准确率 72.24% 虽然低于验证准确率,但考虑到没有使用预训练词向量、也没有进行细致的超参数调优,这个结果已经非常令人满意了。


🔍 为什么测试准确率比验证准确率低?

这是很常见的现象,原因可能有:

  1. 验证集是从训练集中划分出来的,与训练集分布更接近;而测试集是独立的数据,可能存在分布差异(新闻组话题随时间变化等)。

  2. 模型对训练集和验证集仍有一定程度的过拟合(尽管有正则化),导致在全新测试集上性能下降。

  3. 某些类别本身就容易混淆,例如 rec.autos 和 rec.motorcycles,或者 sci.med 和 sci.space 等。


笔记:

第一节 循环神经网络

一、为什么要让计算机记住"顺序"?

计算机处理一句话时,不能只看单个的词,还要看词的顺序。比如:

  • "我爱你" 和 "你爱我" 意思完全不一样,但里面的词是一样的。

  • 如果我们只是把每个词的意思简单加起来,这两句话就会变成一模一样,计算机就分不清了。

所以,我们需要一种方法,让计算机一边读句子,一边记住前面读过的内容,这样才能理解整句话的意思。
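下面用一小段示意代码验证"把词向量简单相加会丢掉顺序"这个说法(这里的词和向量都是随手构造的,仅作演示):

import numpy as np

# 随手给三个词各分配一个 2 维向量(并非真实词向量,仅作演示)
vec = {"我": np.array([1.0, 0.0]),
       "爱": np.array([0.0, 1.0]),
       "你": np.array([0.5, 0.5])}

s1 = ["我", "爱", "你"]
s2 = ["你", "爱", "我"]

# 词袋思路:只把词向量简单相加,两句话得到完全相同的表示
print(sum(vec[w] for w in s1))   # [1.5 1.5]
print(sum(vec[w] for w in s2))   # [1.5 1.5],和上一句一模一样,分不清"谁爱谁"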

二、RNN:一个有"记忆"的小助手

RNN(循环神经网络)就是专门干这个的。它像一个认真听讲的小朋友:

  • 每听到一个词,他会把这个词和之前记住的东西结合起来,记在脑子里。

  • 然后接着听下一个词,再用新词和刚才记住的东西一起更新记忆。

  • 听完整个句子,他脑子里的"最终记忆"就是整句话的意思。

举个栗子 🌰

句子:"播放 周杰伦 的 《稻香》"

  • 第1步:听到"播放",记忆里记下"播放"。

  • 第2步:听到"周杰伦",结合之前的"播放",现在记忆里是"播放周杰伦"。

  • 第3步:听到"的",记忆更新为"播放周杰伦的"。

  • 第4步:听到"《稻香》",最终记忆是"播放周杰伦的《稻香》",这就是整个指令的意思!

把这个最终记忆送进分类器,计算机就能知道这是一条"音乐播放"指令了。
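为了更具体一点,下面给出"把最终记忆送进分类器"这一步的极简示意(隐藏维度、类别数都是随意设定的,并假设我们已经拿到了读完整句话后的最终隐藏状态 h_last):

import torch
import torch.nn as nn

hidden_dim, num_intents = 8, 3          # 隐藏维度、意图类别数(随意设定,仅作演示)
h_last = torch.randn(1, hidden_dim)     # 假设这是 RNN 读完整句话后的最终隐藏状态

classifier = nn.Linear(hidden_dim, num_intents)   # 一个简单的线性分类头
logits = classifier(h_last)
intent_id = torch.argmax(logits, dim=1)  # 得分最高的那个类别,比如对应"音乐播放"
print(logits.shape, intent_id)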

三、从"死的"词向量到"活的"词表示

以前的方法(比如 Word2Vec)会给每个词一个固定的向量,不管这个词出现在哪里,向量都一样。这叫"静态词向量"。

但 RNN 不一样!同一个词在不同句子里的"记忆"是不同的。比如:

  • "苹果 真好吃" → "苹果"的记忆里带着"好吃",所以它是水果。

  • "苹果 发布新手机" → "苹果"的记忆里带着"手机",所以它是公司。

这种根据上下文变化的表示,就叫动态表示,让计算机能真正理解词在句子里的含义。
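可以用一小段示意代码感受一下(词 ID、维度都是随手设的,Embedding 和 RNN 也都是随机初始化,仅作演示;为了方便用单向 RNN 演示,这里把上下文放在了"苹果"前面):

import torch
import torch.nn as nn

torch.manual_seed(0)
emb = nn.Embedding(10, 4)     # 假设词表里有 10 个词,词向量维度为 4
rnn = nn.RNN(input_size=4, hidden_size=4, batch_first=True)

# 随手约定几个词的 ID:2="苹果",5="吃",6="发布",仅作演示
s1 = torch.tensor([[5, 2]])   # "吃 苹果"
s2 = torch.tensor([[6, 2]])   # "发布 苹果"

with torch.no_grad():
    out1, _ = rnn(emb(s1))
    out2, _ = rnn(emb(s2))

# 静态词向量:两句话里"苹果"的 Embedding 完全一样
print(torch.allclose(emb(s1)[0, 1], emb(s2)[0, 1]))   # True
# 动态表示:RNN 读到"苹果"时携带了不同的上文,这一步的隐藏状态因此不同
print(torch.allclose(out1[0, 1], out2[0, 1]))          # False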

四、双向 RNN:不但看前面,还看后面

有时候,一个词的意思需要看后面的词才能确定。比如:

  • "味道不错" → 如果先看到"苹果",还不知道是水果还是公司,但看到后面的"味道不错",就知道是水果了。

双向 RNN 有两个小助手:

  • 一个从左到右读句子(记住前面)。

  • 一个从右到左读句子(记住后面)。

  • 最后把两人的记忆合并,就知道这个词的完整上下文了(下面用一小段代码示意)。
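下面是双向 RNN 的一个最小示意(输入维度、隐藏维度、序列长度都是随意设定的):

import torch
import torch.nn as nn

birnn = nn.RNN(input_size=4, hidden_size=3, batch_first=True, bidirectional=True)

x = torch.randn(1, 5, 4)        # 1 个句子、5 个词、每个词 4 维(随机构造的输入)
out, h_n = birnn(x)

print(out.shape)    # torch.Size([1, 5, 6]):每个词的输出 = 正向 3 维 + 反向 3 维拼接
print(h_n.shape)    # torch.Size([2, 1, 3]):两个方向各自的最终隐藏状态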

五、RNN 的小麻烦

  • 长句子会忘事儿:如果句子特别长(比如一个长长的故事),读到后面可能就把开头忘了。这就是"长距离依赖问题"。

  • 有时候会糊涂:计算过程中,如果数字太小(梯度消失)或太大(梯度爆炸),模型就学不好了。

不过别担心,科学家们后来又发明了更厉害的"门控 RNN"(比如 LSTM),专门解决这些问题。
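关于"梯度消失/爆炸",可以用一个很粗糙的小实验感受一下:反向传播时梯度大致要连乘很多次循环权重,这里用一个标量代替权重矩阵(纯属简化示意):

# 小于 1 的数连乘很多次会迅速趋近 0(梯度消失),大于 1 则迅速变大(梯度爆炸)
for w in (0.9, 1.1):
    g = 1.0
    for _ in range(100):       # 假设序列长度为 100
        g *= w
    print(f"w={w}: 连乘 100 次后约为 {g:.3e}")
# 输出大致为:w=0.9 时约 2.7e-05,w=1.1 时约 1.4e+04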

第二节 LSTM 与 GRU

一、RNN 有什么烦恼?

RNN 像一个认真听故事的小朋友,他一边听一边记在脑子里。但是,如果故事特别特别长(比如讲了 100 句话),他可能就忘了开头说了什么。这叫"长句子会忘事儿",科学家叫它"长距离依赖问题"。

为什么会忘呢?因为 RNN 的记忆方式太简单了,它只是把每个新词和旧记忆"混在一起",时间久了,开头的信息就被后来的信息"冲淡"了,就像在黑板上写字,写满之后最早的笔迹就看不见了。

二、LSTM:一个有"门"的聪明笔记本

LSTM(长短期记忆网络)就像一个带三个门的智能笔记本,它能自己决定:哪些旧信息要擦掉、哪些新信息要写下来、哪些信息要读出来给别人看。

LSTM 的"门"

  • 遗忘门(擦掉什么):看到新词时,它先看看旧记忆里哪些是不重要的,打个叉叉准备忘掉。

  • 输入门(记下什么):再决定新来的信息里哪些值得记住,打个勾勾准备写进笔记本。

  • 输出门(说出什么):最后,看看当前最重要的信息是什么,把它作为"输出"告诉别人。

因为笔记本里有一条专门的"高速路"(细胞状态),重要的信息可以沿着这条路一直传下去,不会被新信息冲淡。所以 LSTM 能记住很长很长的句子。
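把上面的"擦掉、写入、读出"写成代码,大概是下面这个样子(单个时间步的极简 NumPy 示意,权重随机、省略偏置,维度也是随意设定的):

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

rng = np.random.default_rng(0)
E, H = 4, 3                                   # 词向量维度、隐藏维度(随意设定)
x_t = rng.standard_normal(E)                  # 当前词向量
h_prev, c_prev = np.zeros(H), np.zeros(H)     # 上一步的隐藏状态和细胞状态

# 每个门各有一组权重(这里把输入权重和循环权重拼在一起,省略偏置)
W_f, W_i, W_o, W_c = (rng.standard_normal((E + H, H)) for _ in range(4))
z = np.concatenate([x_t, h_prev])

f_t = sigmoid(z @ W_f)          # 遗忘门:旧记忆 c_prev 保留多少
i_t = sigmoid(z @ W_i)          # 输入门:新信息写入多少
o_t = sigmoid(z @ W_o)          # 输出门:把多少记忆"读出来"
c_tilde = np.tanh(z @ W_c)      # 候选的新信息

c_t = f_t * c_prev + i_t * c_tilde   # 细胞状态:那条不易被冲淡的"高速路"
h_t = o_t * np.tanh(c_t)             # 当前步的输出(隐藏状态)
print(c_t, h_t)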

举个例子

句子:"小明昨天在公园里看到一只可爱的柯基,那只柯基的名字叫......"

读到后面"那只柯基的名字叫"时,LSTM 能通过遗忘门知道前面说的"柯基"很重要,要留着;通过输入门把新词"名字"加进来;最后通过输出门把"柯基"和"名字"结合起来,知道这里要问的是"狗的名字"。而普通的 RNN 可能已经把"柯基"忘了。

三、GRU:简化版的 LSTM

GRU(门控循环单元)是 LSTM 的"精简版",它只有两个门:

  • 更新门:相当于 LSTM 的遗忘门+输入门,同时决定忘掉多少旧信息、加入多少新信息。

  • 重置门:决定在算新信息时,要不要忽略旧信息。

GRU 比 LSTM 少了一个"细胞状态",只有一个隐藏状态,所以计算更快,效果和 LSTM 差不多。就像小一号的笔记本,轻便好用。
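也可以直接数一数参数量,感受"GRU 更轻便"这一点(这里的输入/隐藏维度是随意设的,数值会随维度变化):

import torch.nn as nn

def count_params(m):
    return sum(p.numel() for p in m.parameters())

lstm = nn.LSTM(input_size=128, hidden_size=128, batch_first=True)
gru = nn.GRU(input_size=128, hidden_size=128, batch_first=True)

# LSTM 每层有 4 组权重(输入/遗忘/输出门 + 候选),GRU 只有 3 组(更新/重置 + 候选),
# 所以在相同维度下,GRU 的参数量约为 LSTM 的 3/4
print("LSTM 参数量:", count_params(lstm))   # 132096
print("GRU  参数量:", count_params(gru))    # 99072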

四、LSTM 和 GRU 的作用

它们都能很好地处理长句子,在语音识别、机器翻译、情感分析等任务中表现优秀。因为它们能"有选择地记住该记的,忘掉该忘的"。
