第三节 循环神经网络
作业:
my_rnn.py
python
import numpy as np
import torch
import torch.nn as nn
# ---------- 数据准备(直接复制)----------
def prepare_inputs():
np.random.seed(42)
vocab = {"播放": 0, "周杰伦": 1, "的": 2, "《稻香》": 3}
tokens = ["播放", "周杰伦", "的", "《稻香》"]
ids = [vocab[t] for t in tokens]
V = len(vocab) # 词表大小
E = 128 # 词向量维度
H = 3 # 隐藏状态维度(为了好算,设小一点)
emb_table = np.random.randn(V, E).astype(np.float32)
# 取出序列词向量,加上 batch 维度 → (B, T, E) B=1
x_np = emb_table[ids][None] # shape: (1, 4, 128)
return tokens, x_np, H, E
# ---------- 手写 RNN(用 NumPy)----------
def manual_rnn_numpy(x_np, U_np, W_np):
B, T, E = x_np.shape
H = W_np.shape[0] # 隐藏维度
h_prev = np.zeros((B, H), dtype=np.float32) # 初始记忆为0
steps = []
for t in range(T):
x_t = x_np[:, t, :] # 当前词向量
h_t = np.tanh(x_t @ U_np + h_prev @ W_np) # 核心公式
steps.append(h_t)
h_prev = h_t # 更新记忆
return np.stack(steps, axis=1), h_prev # 返回所有步的输出和最后记忆
# ---------- PyTorch 官方 RNN ----------
def pytorch_rnn_forward(x_torch, U_torch, W_torch):
E, H = U_torch.shape # U 的尺寸是 (E, H)
rnn = nn.RNN(
input_size=E,
hidden_size=H,
num_layers=1,
nonlinearity='tanh',
bias=False,
batch_first=True,
bidirectional=False,
)
with torch.no_grad():
# PyTorch 内部权重是转置后的,所以这里要转置
rnn.weight_ih_l0.copy_(U_torch.T)
rnn.weight_hh_l0.copy_(W_torch.T)
y, h_n = rnn(x_torch)
return y, h_n.squeeze(0) # h_n 形状是 (1, B, H),去掉第一维
# ---------- 主程序 ----------
if __name__ == "__main__":
# 1. 准备数据
tokens, x_np, H, E = prepare_inputs()
print("句子:", tokens)
print("输入形状 (B,T,E):", x_np.shape)
# 2. 初始化 RNN 权重(随机,但要固定种子)
np.random.seed(42)
U_np = np.random.randn(E, H).astype(np.float32) # 输入权重
W_np = np.random.randn(H, H).astype(np.float32) # 循环权重
# 3. 手写 RNN 计算
out_manual_np, last_manual = manual_rnn_numpy(x_np, U_np, W_np)
print("手写 RNN 输出形状:", out_manual_np.shape) # (1,4,3)
# 4. PyTorch RNN 计算
x_torch = torch.from_numpy(x_np)
U_torch = torch.from_numpy(U_np)
W_torch = torch.from_numpy(W_np)
out_torch, last_torch = pytorch_rnn_forward(x_torch, U_torch, W_torch)
print("PyTorch RNN 输出形状:", out_torch.shape)
# 5. 验证是否一致
out_manual = torch.from_numpy(out_manual_np)
print("两个结果是否一致?", torch.allclose(out_manual, out_torch, atol=1e-6))
输出:
(base) PS E:\Datawhale 2026\base-llm202602> & D:/Users/app/miniconda3/envs/base-llm/python.exe "e:/Datawhale 2026/base-llm202602/03_my_rnn.py"
句子: ['播放', '周杰伦', '的', '《稻香》']
输入形状 (B,T,E): (1, 4, 128)
手写 RNN 输出形状: (1, 4, 3)
PyTorch RNN 输出形状: torch.Size([1, 4, 3])
两个结果是否一致? True
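小结:上面两种实现在每个时间步计算的是同一个递推式(对应代码里的 h_t = np.tanh(x_t @ U_np + h_prev @ W_np)),按本例的维度写出来是:
latex
h_t = \tanh\left(x_t U + h_{t-1} W\right),\qquad x_t \in \mathbb{R}^{1\times 128},\; U \in \mathbb{R}^{128\times 3},\; W \in \mathbb{R}^{3\times 3},\; h_t \in \mathbb{R}^{1\times 3},\; h_0 = \mathbf{0}
PyTorch 的 nn.RNN 把这两个权重分别存为 weight_ih_l0(形状 (H, E))和 weight_hh_l0(形状 (H, H)),即这里 U、W 的转置,这正是代码中 copy_(U_torch.T) 的原因;本例关闭了 bias,若开启,公式里还会多出 b_ih 与 b_hh 两项。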
第二节 LSTM 与 GRU
pip install scikit-learn torch numpy
lstm_text_classifier.py
python
"""
lstm_text_classifier.py
基于 LSTM 的 20newsgroups 文本分类器
(对比之前的全连接模型,LSTM 能利用词语顺序信息)
"""
# ==================== 1. 导入所需库 ====================
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
# ==================== 2. 加载数据 ====================
print("正在加载 20newsgroups 数据...")
# 训练集和测试集
train_data = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)
test_data = fetch_20newsgroups(subset='test', shuffle=True, random_state=42)
X_train_texts = train_data.data # 原始文本列表
y_train = train_data.target # 标签(0~19)
X_test_texts = test_data.data
y_test = test_data.target
num_classes = len(train_data.target_names) # 20
print(f"训练样本数: {len(X_train_texts)}")
print(f"测试样本数: {len(X_test_texts)}")
print(f"类别数: {num_classes}")
print("类别名称:", train_data.target_names)
# ==================== 3. 文本预处理:词表构建与序列化 ====================
# 使用 CountVectorizer 构建词表(只保留最常用的 10000 个词)
# 注意:CountVectorizer 默认会对英文文本进行合理的分词(按单词边界分割,忽略标点)
max_features = 10000
vectorizer = CountVectorizer(max_features=max_features, stop_words='english')
vectorizer.fit(X_train_texts) # 只在训练集上拟合,构建词表
# 获取词表(特征名称)
feature_names = vectorizer.get_feature_names_out()
# 构建单词到索引的映射,索引从1开始,保留0给填充符和未知词
word2idx = {word: i+1 for i, word in enumerate(feature_names)}
vocab_size = len(word2idx) + 1 # 词表大小(含0)
print(f"词表大小(含填充符0): {vocab_size}")
# 定义将文本转为固定长度整数序列的函数
def text_to_sequence(text, max_len=200):
"""
将输入文本转为整数ID序列,并进行截断/填充至固定长度max_len
- 未知词用0表示(填充符)
- 已知词用word2idx中的ID表示
"""
# 简单按空格分词(CountVectorizer内部的分词方式更复杂,但这里为了快速对齐,直接split)
# 注意:实际应用时建议使用与CountVectorizer一致的分词器,但为了简洁我们采用简单方式
tokens = text.lower().split()
ids = [word2idx.get(token, 0) for token in tokens] # 未知词映射为0
# 截断或填充
if len(ids) > max_len:
ids = ids[:max_len]
else:
ids = ids + [0] * (max_len - len(ids))
return ids
# 设置最大序列长度(根据数据分布可调整,这里取200,覆盖大部分文章)
max_len = 200
# 将训练集和测试集全部转换为整数序列
print("正在转换文本为整数序列...")
X_train_seq = np.array([text_to_sequence(text, max_len) for text in X_train_texts])
X_test_seq = np.array([text_to_sequence(text, max_len) for text in X_test_texts])
# ==================== 4. 转换为 PyTorch 张量 ====================
X_train_tensor = torch.tensor(X_train_seq, dtype=torch.long)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test_seq, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)
# 创建数据集和数据加载器
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
# ==================== 5. 定义 LSTM 模型 ====================
class LSTMClassifier(nn.Module):
"""
基于 LSTM 的文本分类器
结构:Embedding -> LSTM -> Dropout -> Linear
"""
def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes, num_layers=1):
super(LSTMClassifier, self).__init__()
# 词嵌入层:将单词ID映射为稠密向量
self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
# LSTM层:batch_first=True 表示输入形状为 (batch, seq_len, embed_dim)
self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, batch_first=True)
# Dropout层,防止过拟合
self.dropout = nn.Dropout(0.5)
# 全连接分类层
self.fc = nn.Linear(hidden_dim, num_classes)
def forward(self, x):
# x 形状: (batch_size, seq_len)
embedded = self.embedding(x) # (batch, seq_len, embed_dim)
lstm_out, (h_n, c_n) = self.lstm(embedded) # lstm_out: (batch, seq_len, hidden_dim)
# 取最后一个时间步的隐藏状态作为整句的表示
last_hidden = lstm_out[:, -1, :] # (batch, hidden_dim)
last_hidden = self.dropout(last_hidden)
logits = self.fc(last_hidden) # (batch, num_classes)
return logits
# 超参数设置
embed_dim = 128 # 词嵌入维度
hidden_dim = 64 # LSTM 隐藏状态维度
num_layers = 1 # LSTM 层数
model = LSTMClassifier(vocab_size, embed_dim, hidden_dim, num_classes, num_layers)
print("模型结构:\n", model)
# ==================== 6. 训练准备 ====================
# 选择设备(GPU if available)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(f"使用设备: {device}")
# 损失函数和优化器
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# ==================== 7. 训练循环 ====================
num_epochs = 10
print("开始训练...")
for epoch in range(num_epochs):
model.train()
total_loss = 0.0
for inputs, labels in train_loader:
inputs, labels = inputs.to(device), labels.to(device)
# 前向传播
outputs = model(inputs)
loss = criterion(outputs, labels)
# 反向传播与优化
optimizer.zero_grad()
loss.backward()
optimizer.step()
total_loss += loss.item()
avg_loss = total_loss / len(train_loader)
print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")
# ==================== 8. 测试评估 ====================
model.eval()
correct = 0
total = 0
with torch.no_grad():
for inputs, labels in test_loader:
inputs, labels = inputs.to(device), labels.to(device)
outputs = model(inputs)
_, predicted = torch.max(outputs.data, 1)
total += labels.size(0)
correct += (predicted == labels).sum().item()
accuracy = 100 * correct / total
print(f"\n测试集准确率: {accuracy:.2f}%")
# ==================== 9. 推理示例 ====================
def predict(text, model, word2idx, max_len=200):
"""对单个文本进行预测,返回类别名称"""
model.eval()
seq = text_to_sequence(text, max_len) # 转换为整数序列
input_tensor = torch.tensor([seq], dtype=torch.long).to(device)
with torch.no_grad():
logits = model(input_tensor)
pred_idx = torch.argmax(logits, dim=1).item()
return train_data.target_names[pred_idx]
# 用测试集中的第一个文本进行演示
sample_text = test_data.data[0]
true_label = test_data.target_names[test_data.target[0]]
pred_label = predict(sample_text, model, word2idx)
print("\n推理示例:")
print(f"文本预览: {sample_text[:200]}...")
print(f"真实类别: {true_label}")
print(f"预测类别: {pred_label}")
# (可选)保存模型
# torch.save(model.state_dict(), "lstm_classifier.pth")
# print("模型已保存为 lstm_classifier.pth")
输出:
(base) PS E:\Datawhale 2026\base-llm202602> & D:/Users/app/miniconda3/envs/base-llm/python.exe "e:/Datawhale 2026/base-llm202602/03_2_lstm_text_classifier.py"
正在加载 20newsgroups 数据...
训练样本数: 11314
测试样本数: 7532
类别数: 20
类别名称: ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos',
'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
词表大小(含填充符0): 10001
正在转换文本为整数序列...
模型结构:
LSTMClassifier(
(embedding): Embedding(10001, 128, padding_idx=0)
(lstm): LSTM(128, 64, batch_first=True)
(dropout): Dropout(p=0.5, inplace=False)
(fc): Linear(in_features=64, out_features=20, bias=True)
)
使用设备: cuda
开始训练...
Epoch [1/10], Loss: 2.9933
Epoch [2/10], Loss: 2.9779
Epoch [3/10], Loss: 2.9484
Epoch [4/10], Loss: 2.8657
Epoch [5/10], Loss: 2.8117
Epoch [6/10], Loss: 2.8019
Epoch [7/10], Loss: 2.7235
Epoch [8/10], Loss: 2.7693
Epoch [9/10], Loss: 2.6299
Epoch [10/10], Loss: 2.6277
测试集准确率: 8.95%
推理示例:
文本预览: From: v064mb9k@ubvmsd.cc.buffalo.edu (NEIL B. GANDLER)
Subject: Need info on 88-89 Bonneville
Organization: University at Buffalo
Lines: 10
News-Software: VAX/VMS VNEWS 1.41
Nntp-Posting-Host: ubvmsd....
真实类别: rec.autos
预测类别: rec.autos
📋 输出解释
1. 数据加载
text
训练样本数: 11314
测试样本数: 7532
类别数: 20
类别名称: ['alt.atheism', 'comp.graphics', ...]
正常,数据加载成功。
2. 词表大小
text
词表大小(含填充符0): 10001
因为设置了 max_features=10000,加上填充符0,词表共10001个词。
3. 模型结构
text
LSTMClassifier(
(embedding): Embedding(10001, 128, padding_idx=0)
(lstm): LSTM(128, 64, batch_first=True)
(dropout): Dropout(p=0.5, inplace=False)
(fc): Linear(in_features=64, out_features=20, bias=True)
)
- 词嵌入维度 128
- LSTM 隐藏层维度 64
- Dropout 0.5
- 全连接输出 20 类

结构正确。
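顺带可以用下面几行示意代码(假设 model 已按上面定义好、仍在内存中)数一数各层参数量,能直观看到参数几乎都集中在 Embedding 层:
python
# 打印每个参数张量的形状和数量,并统计总参数量
for name, p in model.named_parameters():
    print(f"{name:<25s} {str(tuple(p.shape)):<18s} {p.numel()}")
print("总参数量:", sum(p.numel() for p in model.parameters()))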
4. 训练损失
text
Epoch [1/10], Loss: 2.9933
...
Epoch [10/10], Loss: 2.6277
损失从 2.99 降到 2.62,下降非常缓慢,说明模型几乎没有学到有效特征(正常文本分类损失应该能降到 1.0 以下)。
5. 测试准确率
text
测试集准确率: 8.95%
20 个类别,随机猜测的正确率是 5%,8.95% 仅比随机好一点点,基本等于没学会。
6. 推理示例
text
文本预览: From: v064mb9k@ubvmsd.cc.buffalo.edu (NEIL B. GANDLER)...
真实类别: rec.autos
预测类别: rec.autos
这个样本预测正确,但整体准确率极低,说明只是个别样本运气好。
为什么会这样?问题出在分词不一致:构建词表时用的是 CountVectorizer 自己的分词规则,而 text_to_sequence 里却用 text.lower().split() 简单切分(代码注释里也提到了这一点),两者对不上,导致大量词查不到而被映射成 0。修复方法:使用 vectorizer.build_tokenizer() 获取与词表构建一致的分词器,保证训练和推理时文本转序列的一致性。修改为:
python
"""
lstm_classifier_fixed.py
修复版 LSTM 文本分类器(20newsgroups)
主要修复:使用与 CountVectorizer 一致的 tokenizer,确保分词一致性
"""
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
# ==================== 1. 加载数据 ====================
print("正在加载 20newsgroups 数据...")
train_data = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)
test_data = fetch_20newsgroups(subset='test', shuffle=True, random_state=42)
X_train_texts = train_data.data
y_train = train_data.target
X_test_texts = test_data.data
y_test = test_data.target
num_classes = len(train_data.target_names)
print(f"训练样本数: {len(X_train_texts)}")
print(f"测试样本数: {len(X_test_texts)}")
print(f"类别数: {num_classes}")
# ==================== 2. 构建词表与分词器 ====================
max_features = 10000
vectorizer = CountVectorizer(max_features=max_features, stop_words='english')
vectorizer.fit(X_train_texts) # 只拟合训练集
# 获取词表(特征名称)
feature_names = vectorizer.get_feature_names_out()
word2idx = {word: i+1 for i, word in enumerate(feature_names)} # 索引从1开始,0留给填充符/未知词
vocab_size = len(word2idx) + 1
print(f"词表大小(含填充符0): {vocab_size}")
# 获取与 CountVectorizer 一致的分词器
tokenizer = vectorizer.build_tokenizer()
# ==================== 3. 文本转整数序列 ====================
max_len = 200 # 统一序列长度
def text_to_sequence(text, max_len=200):
"""
将文本转为整数ID序列,使用与词表一致的分词器
"""
tokens = tokenizer(text.lower()) # 先小写,再分词(CountVectorizer默认也会小写)
ids = [word2idx.get(token, 0) for token in tokens] # 未知词用0
# 截断或填充
if len(ids) > max_len:
ids = ids[:max_len]
else:
ids = ids + [0] * (max_len - len(ids))
return ids
print("正在转换训练集...")
X_train_seq = np.array([text_to_sequence(text, max_len) for text in X_train_texts])
print("正在转换测试集...")
X_test_seq = np.array([text_to_sequence(text, max_len) for text in X_test_texts])
# 可选:检查一下第一条序列的有效词比例(非零)
nonzero_ratio = np.count_nonzero(X_train_seq[0]) / max_len
print(f"第一条训练样本的有效词比例: {nonzero_ratio:.2f}")
# ==================== 4. 转换为 PyTorch 张量 ====================
X_train_tensor = torch.tensor(X_train_seq, dtype=torch.long)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test_seq, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
# ==================== 5. 定义 LSTM 模型 ====================
class LSTMClassifier(nn.Module):
def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes, num_layers=1):
super(LSTMClassifier, self).__init__()
self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
# 可选用双向LSTM(注释掉其中一行)
# self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, batch_first=True, bidirectional=False)
self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, batch_first=True, bidirectional=True)
self.dropout = nn.Dropout(0.5)
# 如果使用双向LSTM,则隐藏维度为 hidden_dim * 2
lstm_output_dim = hidden_dim * (2 if self.lstm.bidirectional else 1)
self.fc = nn.Linear(lstm_output_dim, num_classes)
def forward(self, x):
embedded = self.embedding(x) # (batch, seq_len, embed_dim)
lstm_out, (h_n, c_n) = self.lstm(embedded) # lstm_out: (batch, seq_len, hidden_dim*num_directions)
        # 取最后一个时间步的输出(注意:对双向 LSTM,lstm_out[:, -1, :] 中反向方向在该位置只"看过"最后一个词,这里先沿用这种简化写法,后文的完整版会改用 hidden 状态)
last_hidden = lstm_out[:, -1, :] # (batch, hidden_dim * num_directions)
last_hidden = self.dropout(last_hidden)
logits = self.fc(last_hidden)
return logits
# 超参数设置
embed_dim = 128
hidden_dim = 64
num_layers = 1
model = LSTMClassifier(vocab_size, embed_dim, hidden_dim, num_classes, num_layers)
print("模型结构:\n", model)
# ==================== 6. 训练准备 ====================
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(f"使用设备: {device}")
criterion = nn.CrossEntropyLoss()
# 可以适当调高学习率,加快收敛
optimizer = optim.Adam(model.parameters(), lr=0.005)
# ==================== 7. 训练循环 ====================
num_epochs = 30 # 增加训练轮数,让损失充分下降
print("开始训练...")
for epoch in range(num_epochs):
model.train()
total_loss = 0.0
for inputs, labels in train_loader:
inputs, labels = inputs.to(device), labels.to(device)
optimizer.zero_grad()
outputs = model(inputs)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()
total_loss += loss.item()
avg_loss = total_loss / len(train_loader)
print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")
# ==================== 8. 测试评估 ====================
model.eval()
correct = 0
total = 0
with torch.no_grad():
for inputs, labels in test_loader:
inputs, labels = inputs.to(device), labels.to(device)
outputs = model(inputs)
_, predicted = torch.max(outputs, 1)
total += labels.size(0)
correct += (predicted == labels).sum().item()
accuracy = 100 * correct / total
print(f"\n测试集准确率: {accuracy:.2f}%")
# ==================== 9. 推理示例 ====================
def predict(text, model, max_len=200):
model.eval()
seq = text_to_sequence(text, max_len)
input_tensor = torch.tensor([seq], dtype=torch.long).to(device)
with torch.no_grad():
logits = model(input_tensor)
pred_idx = torch.argmax(logits, dim=1).item()
return train_data.target_names[pred_idx]
sample_text = test_data.data[0]
true_label = test_data.target_names[test_data.target[0]]
pred_label = predict(sample_text, model)
print("\n推理示例:")
print(f"文本预览: {sample_text[:200]}...")
print(f"真实类别: {true_label}")
print(f"预测类别: {pred_label}")
输出:
(base) PS E:\Datawhale 2026\base-llm202602> & D:/Users/app/miniconda3/envs/base-llm/python.exe "e:/Datawhale 2026/base-llm202602/03_3_lstm_classifier_fixed.py"
正在加载 20newsgroups 数据...
训练样本数: 11314
测试样本数: 7532
类别数: 20
词表大小(含填充符0): 10001
正在转换训练集...
正在转换测试集...
第一条训练样本的有效词比例: 0.28
模型结构:
LSTMClassifier(
(embedding): Embedding(10001, 128, padding_idx=0)
(lstm): LSTM(128, 64, batch_first=True, bidirectional=True)
(dropout): Dropout(p=0.5, inplace=False)
(fc): Linear(in_features=128, out_features=20, bias=True)
)
使用设备: cuda
开始训练...
Epoch [1/30], Loss: 2.9771
Epoch [2/30], Loss: 2.8490
Epoch [3/30], Loss: 2.6860
Epoch [4/30], Loss: 2.6366
Epoch [5/30], Loss: 2.5454
Epoch [6/30], Loss: 2.4195
Epoch [7/30], Loss: 2.3379
Epoch [8/30], Loss: 2.2556
Epoch [9/30], Loss: 2.3179
Epoch [10/30], Loss: 2.3072
Epoch [11/30], Loss: 2.0003
Epoch [12/30], Loss: 1.7037
Epoch [13/30], Loss: 1.3988
Epoch [14/30], Loss: 1.1945
Epoch [15/30], Loss: 1.1195
Epoch [16/30], Loss: 0.9292
Epoch [17/30], Loss: 0.8037
Epoch [18/30], Loss: 0.7597
Epoch [19/30], Loss: 0.6750
Epoch [20/30], Loss: 0.6337
Epoch [21/30], Loss: 0.5436
Epoch [22/30], Loss: 0.4728
Epoch [23/30], Loss: 0.4769
Epoch [24/30], Loss: 0.5828
Epoch [25/30], Loss: 0.5785
Epoch [26/30], Loss: 0.4222
Epoch [27/30], Loss: 0.3580
Epoch [28/30], Loss: 0.3220
Epoch [29/30], Loss: 0.2826
Epoch [30/30], Loss: 0.2464
测试集准确率: 50.57%
推理示例:
文本预览: From: v064mb9k@ubvmsd.cc.buffalo.edu (NEIL B. GANDLER)
Subject: Need info on 88-89 Bonneville
Organization: University at Buffalo
Lines: 10
News-Software: VAX/VMS VNEWS 1.41
Nntp-Posting-Host: ubvmsd....
真实类别: rec.autos
预测类别: comp.sys.mac.hardware
📊 输出结果解释
1. 数据预处理
text
第一条训练样本的有效词比例: 0.28
- 这意味着第一条训练样本的 200 个位置中,有 28%(56 个)是非 0 的有效词 ID(0 同时用作填充符和未知词),其余是填充或查不到的词。这个比例正常,说明文本没有因为分词问题变成全零。
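如果想进一步确认整个训练集都没问题,可以顺手统计一下有效词比例的分布(示意代码,假设 X_train_seq、max_len 仍在内存中):
python
# 统计每条样本中非 0 的比例,检查是否存在几乎全零(分词失败)的样本
ratios = np.count_nonzero(X_train_seq, axis=1) / max_len
print("平均有效词比例:", round(float(ratios.mean()), 3))
print("有效词比例为 0 的样本数:", int((ratios == 0).sum()))
print("有效词比例低于 0.05 的样本数:", int((ratios < 0.05).sum()))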
2. 模型结构
text
LSTMClassifier(
(embedding): Embedding(10001, 128, padding_idx=0)
(lstm): LSTM(128, 64, batch_first=True, bidirectional=True)
(dropout): Dropout(p=0.5, inplace=False)
(fc): Linear(in_features=128, out_features=20, bias=True)
)
- 使用双向 LSTM(bidirectional=True),因此最后全连接层的输入维度是 hidden_dim * 2 = 128,正确。
3. 训练损失
text
Epoch [1/30], Loss: 2.9771
...
Epoch [30/30], Loss: 0.2464
- 损失从 2.98 稳步下降到 0.25,说明模型在学习,并且拟合得不错(训练集上损失很低)。
4. 测试准确率
text
测试集准确率: 50.57%
- 在 20 个类别中,随机猜对的概率是 5%,50.57% 意味着模型确实学到了很多东西,但还有很大提升空间。
5. 推理示例
text
真实类别: rec.autos
预测类别: comp.sys.mac.hardware
- 模型把一篇关于汽车的文章误判为计算机硬件,说明它还没有完全理解这两个领域的细微差别,或者训练数据中这两个类别有相似之处。
🔍 为什么准确率只有 50%?
这是一个典型的过拟合现象:训练损失降到了 0.25(非常低),但测试准确率只有 50%,说明模型在训练集上表现得很好,但在未见过的测试集上泛化能力不足。可能的原因有:
- 正则化不足:双向 LSTM 只在全连接前加了一层 0.5 的 Dropout,可能不足以抑制模型对训练样本的记忆(hidden_dim=64、双向拼接后 128,对这个数据集并不算小)。
- 训练轮数未必充分:30 个 epoch 对 LSTM 来说不算多,有些模型需要 50 个以上的 epoch。
- 超参数未调优:学习率、隐藏层大小、dropout 比例等可以进一步优化。
- 文本表示仍较简单:我们只用了简单的词 ID 序列,没有使用预训练的词向量(如 GloVe 或 fastText),也没有考虑词的重要性(如 TF-IDF 加权)。
- 序列长度截断:max_len=200 可能丢失了部分长文本的尾部信息,但影响不会太大。
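要确认"训练好、测试差"这个判断,可以在训练结束后顺便评估一下训练集准确率(示意代码,复用上面的 model、train_loader、device):
python
# 在训练集上做一次评估,与 50.57% 的测试集准确率对比,量化过拟合的程度
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        preds = model(inputs).argmax(dim=1)
        total += labels.size(0)
        correct += (preds == labels).sum().item()
print(f"训练集准确率: {100 * correct / total:.2f}%")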
🚀 改进建议
1. 增加训练轮数
当前 30 个 epoch 可能还不够,可以尝试 50 或 100 个 epoch,同时观察验证集损失(可以用一部分训练数据作为验证集),防止过拟合。
2. 使用验证集和早停(Early Stopping)
从训练集中分出一部分(比如 20%)作为验证集,在每个 epoch 后计算验证集准确率,当验证集准确率不再提升时停止训练,可以有效防止过拟合。
3. 调整 Dropout 比例
当前 Dropout=0.5,可以尝试增大到 0.6 或 0.7,或者增加 Dropout 的位置(例如在 embedding 后也加 Dropout)。
4. 使用预训练词向量
加载预训练的 GloVe 或 fastText 词向量作为 Embedding 的初始值,可以大大提升模型的泛化能力(尤其对于新闻文本)。
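一个最小的示意(假设本地已下载 glove.6B.100d.txt,文件名和维度都是假设,需与实际文件一致),把预训练向量灌进 nn.Embedding:
python
import numpy as np
import torch
import torch.nn as nn

glove_path = "glove.6B.100d.txt"   # 假设的本地文件路径
embed_dim = 100                    # 必须与所用 GloVe 文件的向量维度一致

# 先用小随机数初始化;未命中的词保留随机向量,填充符 0 保持全零
embedding_matrix = np.random.normal(scale=0.1, size=(vocab_size, embed_dim)).astype("float32")
embedding_matrix[0] = 0.0

with open(glove_path, encoding="utf-8") as f:
    for line in f:
        parts = line.rstrip().split(" ")
        word, vec = parts[0], parts[1:]
        if word in word2idx:                       # 只加载词表里出现的词
            embedding_matrix[word2idx[word]] = np.asarray(vec, dtype="float32")

embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
embedding.weight.data.copy_(torch.from_numpy(embedding_matrix))
# embedding.weight.requires_grad = False   # 可选:先冻结词向量,训练几轮后再放开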
5. 调整 LSTM 参数
- 增加 hidden_dim 到 128 或 256。
- 增加 LSTM 层数(num_layers=2),但要注意过拟合风险。
6. 使用梯度裁剪(Gradient Clipping)
在 loss.backward() 之后、optimizer.step() 之前调用 torch.nn.utils.clip_grad_norm_,防止梯度爆炸。下面的改进版代码综合了以上几条建议(划分验证集 + 早停、梯度裁剪、调整 Dropout 与训练轮数):
python
"""
lstm_classifier_improved.py
改进版 LSTM 文本分类器(20newsgroups)
- 增加了验证集和早停机制
- 增加了梯度裁剪
- 调整了 Dropout 和训练轮数
- 保留了双向 LSTM 结构
"""
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import numpy as np
# ==================== 1. 加载数据 ====================
print("正在加载 20newsgroups 数据...")
train_data = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)
test_data = fetch_20newsgroups(subset='test', shuffle=True, random_state=42)
X_train_texts_full = train_data.data # 原始训练文本(全部)
y_train_full = train_data.target # 原始训练标签
X_test_texts = test_data.data
y_test = test_data.target
num_classes = len(train_data.target_names)
print(f"原始训练样本数: {len(X_train_texts_full)}")
print(f"测试样本数: {len(X_test_texts)}")
print(f"类别数: {num_classes}")
# ==================== 2. 划分训练集和验证集 ====================
# 从原始训练集中分出 20% 作为验证集
X_train_texts, X_val_texts, y_train, y_val = train_test_split(
X_train_texts_full, y_train_full, test_size=0.2, random_state=42
)
print(f"训练集大小: {len(X_train_texts)}")
print(f"验证集大小: {len(X_val_texts)}")
# ==================== 3. 构建词表与分词器 ====================
max_features = 10000
# 词表必须在全部原始训练集上构建,以保证覆盖所有可能出现的词(验证集和测试集可能出现训练集没见过的词,但这是合理的)
vectorizer = CountVectorizer(max_features=max_features, stop_words='english')
vectorizer.fit(X_train_texts_full) # 使用全部原始训练集构建词表
# 获取词表映射
feature_names = vectorizer.get_feature_names_out()
word2idx = {word: i+1 for i, word in enumerate(feature_names)} # 0 留给填充和未知词
vocab_size = len(word2idx) + 1
print(f"词表大小(含填充符0): {vocab_size}")
# 获取与 CountVectorizer 一致的分词器
tokenizer = vectorizer.build_tokenizer()
# ==================== 4. 文本转整数序列 ====================
max_len = 200 # 统一序列长度
def text_to_sequence(text, max_len=200):
"""将文本转为整数ID序列,使用与词表一致的分词器"""
tokens = tokenizer(text.lower())
ids = [word2idx.get(token, 0) for token in tokens] # 未知词用0
# 截断或填充
if len(ids) > max_len:
ids = ids[:max_len]
else:
ids = ids + [0] * (max_len - len(ids))
return ids
print("正在转换训练集...")
X_train_seq = np.array([text_to_sequence(text, max_len) for text in X_train_texts])
print("正在转换验证集...")
X_val_seq = np.array([text_to_sequence(text, max_len) for text in X_val_texts])
print("正在转换测试集...")
X_test_seq = np.array([text_to_sequence(text, max_len) for text in X_test_texts])
# 检查一下有效词比例(可选)
nonzero_ratio = np.count_nonzero(X_train_seq[0]) / max_len
print(f"第一条训练样本的有效词比例: {nonzero_ratio:.2f}")
# ==================== 5. 转换为 PyTorch 张量 ====================
X_train_tensor = torch.tensor(X_train_seq, dtype=torch.long)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_val_tensor = torch.tensor(X_val_seq, dtype=torch.long)
y_val_tensor = torch.tensor(y_val, dtype=torch.long)
X_test_tensor = torch.tensor(X_test_seq, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)
batch_size = 64
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
# ==================== 6. 定义 LSTM 模型 ====================
class LSTMClassifier(nn.Module):
def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes, num_layers=1, dropout=0.5):
super(LSTMClassifier, self).__init__()
self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
# 使用双向 LSTM
self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, batch_first=True, bidirectional=True, dropout=dropout if num_layers>1 else 0)
self.dropout = nn.Dropout(dropout)
# 双向LSTM的输出维度是 hidden_dim * 2
self.fc = nn.Linear(hidden_dim * 2, num_classes)
def forward(self, x):
embedded = self.embedding(x) # (batch, seq_len, embed_dim)
lstm_out, (h_n, c_n) = self.lstm(embedded) # lstm_out: (batch, seq_len, hidden_dim*2)
# 取最后一个时间步的输出作为整句表示
last_hidden = lstm_out[:, -1, :] # (batch, hidden_dim*2)
last_hidden = self.dropout(last_hidden)
logits = self.fc(last_hidden)
return logits
# 超参数设置
embed_dim = 128
hidden_dim = 64
num_layers = 1
dropout = 0.6 # 增加 dropout 防止过拟合
model = LSTMClassifier(vocab_size, embed_dim, hidden_dim, num_classes, num_layers, dropout)
print("模型结构:\n", model)
# ==================== 7. 训练准备 ====================
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(f"使用设备: {device}")
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.005)
# 早停相关参数
best_val_acc = 0.0
patience = 5 # 连续5个epoch验证准确率不提升则停止
patience_counter = 0
num_epochs = 50 # 最大训练轮数
# ==================== 8. 训练循环(含验证和早停) ====================
print("开始训练...")
for epoch in range(num_epochs):
# 训练阶段
model.train()
total_loss = 0.0
for inputs, labels in train_loader:
inputs, labels = inputs.to(device), labels.to(device)
optimizer.zero_grad()
outputs = model(inputs)
loss = criterion(outputs, labels)
loss.backward()
# 梯度裁剪(防止梯度爆炸)
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5)
optimizer.step()
total_loss += loss.item()
avg_train_loss = total_loss / len(train_loader)
# 验证阶段
model.eval()
correct = 0
total = 0
val_loss = 0.0
with torch.no_grad():
for inputs, labels in val_loader:
inputs, labels = inputs.to(device), labels.to(device)
outputs = model(inputs)
loss = criterion(outputs, labels)
val_loss += loss.item()
_, predicted = torch.max(outputs, 1)
total += labels.size(0)
correct += (predicted == labels).sum().item()
val_acc = 100 * correct / total
avg_val_loss = val_loss / len(val_loader)
print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Val Acc: {val_acc:.2f}%")
# 早停判断:保存最佳模型
if val_acc > best_val_acc:
best_val_acc = val_acc
patience_counter = 0
# 保存最佳模型参数(可选)
best_model_state = model.state_dict()
print(f" -> 新的最佳验证准确率: {best_val_acc:.2f}%")
else:
patience_counter += 1
if patience_counter >= patience:
print(f"验证准确率连续 {patience} 个epoch未提升,提前停止训练。")
break
# 加载最佳模型(用于测试)
model.load_state_dict(best_model_state)
# ==================== 9. 测试评估 ====================
model.eval()
correct = 0
total = 0
with torch.no_grad():
for inputs, labels in test_loader:
inputs, labels = inputs.to(device), labels.to(device)
outputs = model(inputs)
_, predicted = torch.max(outputs, 1)
total += labels.size(0)
correct += (predicted == labels).sum().item()
test_acc = 100 * correct / total
print(f"\n测试集准确率: {test_acc:.2f}%")
# ==================== 10. 推理示例 ====================
def predict(text, model, max_len=200):
model.eval()
seq = text_to_sequence(text, max_len)
input_tensor = torch.tensor([seq], dtype=torch.long).to(device)
with torch.no_grad():
logits = model(input_tensor)
pred_idx = torch.argmax(logits, dim=1).item()
return train_data.target_names[pred_idx]
sample_text = test_data.data[0]
true_label = train_data.target_names[test_data.target[0]]
pred_label = predict(sample_text, model)
print("\n推理示例:")
print(f"文本预览: {sample_text[:200]}...")
print(f"真实类别: {true_label}")
print(f"预测类别: {pred_label}")
# (可选)保存最终模型
# torch.save(model.state_dict(), "lstm_classifier_best.pth")
# print("最佳模型已保存为 lstm_classifier_best.pth")
输出:
(base) PS E:\Datawhale 2026\base-llm202602> & D:/Users/app/miniconda3/envs/base-llm/python.exe "e:/Datawhale 2026/base-llm202602/03_4_lstm_classifier_improved.py"
正在加载 20newsgroups 数据...
原始训练样本数: 11314
测试样本数: 7532
类别数: 20
训练集大小: 9051
验证集大小: 2263
词表大小(含填充符0): 10001
正在转换训练集...
正在转换验证集...
正在转换测试集...
第一条训练样本的有效词比例: 0.16
模型结构:
LSTMClassifier(
(embedding): Embedding(10001, 128, padding_idx=0)
(lstm): LSTM(128, 64, batch_first=True, bidirectional=True)
(dropout): Dropout(p=0.6, inplace=False)
(fc): Linear(in_features=128, out_features=20, bias=True)
)
使用设备: cuda
开始训练...
Epoch [1/50], Train Loss: 3.0008, Val Loss: 2.9961, Val Acc: 5.88%
-> 新的最佳验证准确率: 5.88%
Epoch [2/50], Train Loss: 2.9373, Val Loss: 2.9798, Val Acc: 8.13%
-> 新的最佳验证准确率: 8.13%
Epoch [3/50], Train Loss: 2.7867, Val Loss: 2.9330, Val Acc: 9.50%
-> 新的最佳验证准确率: 9.50%
Epoch [4/50], Train Loss: 2.6558, Val Loss: 2.9762, Val Acc: 9.59%
-> 新的最佳验证准确率: 9.59%
Epoch [5/50], Train Loss: 2.4854, Val Loss: 2.9485, Val Acc: 11.27%
-> 新的最佳验证准确率: 11.27%
Epoch [6/50], Train Loss: 2.3580, Val Loss: 2.9656, Val Acc: 12.15%
-> 新的最佳验证准确率: 12.15%
Epoch [7/50], Train Loss: 2.2207, Val Loss: 2.9712, Val Acc: 12.86%
-> 新的最佳验证准确率: 12.86%
Epoch [8/50], Train Loss: 2.1157, Val Loss: 2.8745, Val Acc: 15.73%
-> 新的最佳验证准确率: 15.73%
Epoch [9/50], Train Loss: 2.0042, Val Loss: 2.7749, Val Acc: 18.74%
-> 新的最佳验证准确率: 18.74%
Epoch [10/50], Train Loss: 1.9047, Val Loss: 2.8088, Val Acc: 20.15%
-> 新的最佳验证准确率: 20.15%
Epoch [11/50], Train Loss: 1.8215, Val Loss: 2.7210, Val Acc: 22.85%
-> 新的最佳验证准确率: 22.85%
Epoch [12/50], Train Loss: 1.7812, Val Loss: 2.7424, Val Acc: 21.70%
Epoch [13/50], Train Loss: 1.6840, Val Loss: 2.7344, Val Acc: 23.20%
-> 新的最佳验证准确率: 23.20%
Epoch [14/50], Train Loss: 1.6036, Val Loss: 2.7079, Val Acc: 24.88%
-> 新的最佳验证准确率: 24.88%
Epoch [15/50], Train Loss: 1.5676, Val Loss: 2.7765, Val Acc: 23.42%
Epoch [16/50], Train Loss: 1.5323, Val Loss: 2.7000, Val Acc: 25.89%
-> 新的最佳验证准确率: 25.89%
Epoch [17/50], Train Loss: 1.5011, Val Loss: 2.7656, Val Acc: 25.76%
Epoch [18/50], Train Loss: 1.4730, Val Loss: 2.6730, Val Acc: 27.31%
-> 新的最佳验证准确率: 27.31%
Epoch [19/50], Train Loss: 1.3944, Val Loss: 2.6543, Val Acc: 28.68%
-> 新的最佳验证准确率: 28.68%
Epoch [20/50], Train Loss: 1.3701, Val Loss: 2.7498, Val Acc: 27.04%
Epoch [21/50], Train Loss: 1.3178, Val Loss: 2.6921, Val Acc: 27.00%
Epoch [22/50], Train Loss: 1.3029, Val Loss: 2.7079, Val Acc: 28.37%
Epoch [23/50], Train Loss: 1.2716, Val Loss: 2.7714, Val Acc: 28.68%
Epoch [24/50], Train Loss: 1.2674, Val Loss: 2.7252, Val Acc: 29.03%
-> 新的最佳验证准确率: 29.03%
Epoch [25/50], Train Loss: 1.2675, Val Loss: 2.7603, Val Acc: 28.28%
Epoch [26/50], Train Loss: 1.2475, Val Loss: 2.7091, Val Acc: 29.03%
Epoch [27/50], Train Loss: 1.1894, Val Loss: 2.7516, Val Acc: 31.06%
-> 新的最佳验证准确率: 31.06%
Epoch [28/50], Train Loss: 1.1878, Val Loss: 2.7256, Val Acc: 30.76%
Epoch [29/50], Train Loss: 1.1927, Val Loss: 2.8200, Val Acc: 30.49%
Epoch [30/50], Train Loss: 1.1764, Val Loss: 2.7709, Val Acc: 30.89%
Epoch [31/50], Train Loss: 1.1317, Val Loss: 2.9031, Val Acc: 29.12%
Epoch [32/50], Train Loss: 1.1335, Val Loss: 2.8258, Val Acc: 30.40%
验证准确率连续 5 个epoch未提升,提前停止训练。
测试集准确率: 26.83%
推理示例:
文本预览: From: v064mb9k@ubvmsd.cc.buffalo.edu (NEIL B. GANDLER)
Subject: Need info on 88-89 Bonneville
Organization: University at Buffalo
Lines: 10
News-Software: VAX/VMS VNEWS 1.41
Nntp-Posting-Host: ubvmsd....
真实类别: rec.autos
预测类别: sci.med
从输出结果可以看出,模型仍然处于欠拟合状态(训练损失下降缓慢,提前停止时也只降到 1.13 左右;验证准确率最高只有 31%,测试准确率 26.83%)。这说明模型还没有从数据中学到足够有效的分类特征。我们需要在保持 LSTM 模型结构不变的前提下,调整数据预处理和超参数来提升性能。
python
"""
lstm_classifier_tuned.py
调优版 LSTM 文本分类器(20newsgroups)
- 增大 max_len, max_features, hidden_dim
- 降低学习率,增大 patience
- 保持 LSTM 结构不变,仅优化超参数
"""
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import numpy as np
# ==================== 1. 加载数据 ====================
print("正在加载 20newsgroups 数据...")
train_data = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)
test_data = fetch_20newsgroups(subset='test', shuffle=True, random_state=42)
X_train_texts_full = train_data.data
y_train_full = train_data.target
X_test_texts = test_data.data
y_test = test_data.target
num_classes = len(train_data.target_names)
print(f"原始训练样本数: {len(X_train_texts_full)}")
print(f"测试样本数: {len(X_test_texts)}")
print(f"类别数: {num_classes}")
# ==================== 2. 划分训练集和验证集 ====================
X_train_texts, X_val_texts, y_train, y_val = train_test_split(
X_train_texts_full, y_train_full, test_size=0.2, random_state=42
)
print(f"训练集大小: {len(X_train_texts)}")
print(f"验证集大小: {len(X_val_texts)}")
# ==================== 3. 构建词表与分词器(增大 max_features)====================
max_features = 20000 # 从 10000 增大到 20000
vectorizer = CountVectorizer(max_features=max_features, stop_words='english')
vectorizer.fit(X_train_texts_full) # 使用全部原始训练集构建词表
feature_names = vectorizer.get_feature_names_out()
word2idx = {word: i+1 for i, word in enumerate(feature_names)}
vocab_size = len(word2idx) + 1
print(f"词表大小(含填充符0): {vocab_size}")
tokenizer = vectorizer.build_tokenizer()
# ==================== 4. 文本转整数序列(增大 max_len)====================
max_len = 400 # 从 200 增大到 400
def text_to_sequence(text, max_len=400):
tokens = tokenizer(text.lower())
ids = [word2idx.get(token, 0) for token in tokens]
if len(ids) > max_len:
ids = ids[:max_len]
else:
ids = ids + [0] * (max_len - len(ids))
return ids
print("正在转换训练集...")
X_train_seq = np.array([text_to_sequence(text, max_len) for text in X_train_texts])
print("正在转换验证集...")
X_val_seq = np.array([text_to_sequence(text, max_len) for text in X_val_texts])
print("正在转换测试集...")
X_test_seq = np.array([text_to_sequence(text, max_len) for text in X_test_texts])
# 检查有效词比例
nonzero_ratio = np.count_nonzero(X_train_seq[0]) / max_len
print(f"第一条训练样本的有效词比例: {nonzero_ratio:.2f}")
# ==================== 5. 转换为 PyTorch 张量 ====================
X_train_tensor = torch.tensor(X_train_seq, dtype=torch.long)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_val_tensor = torch.tensor(X_val_seq, dtype=torch.long)
y_val_tensor = torch.tensor(y_val, dtype=torch.long)
X_test_tensor = torch.tensor(X_test_seq, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)
batch_size = 64
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
# ==================== 6. 定义 LSTM 模型(增大 hidden_dim)====================
class LSTMClassifier(nn.Module):
def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes, num_layers=1, dropout=0.5):
super(LSTMClassifier, self).__init__()
self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
# 在 Embedding 后增加 Dropout(可选)
self.embed_dropout = nn.Dropout(dropout)
self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers, batch_first=True,
bidirectional=True, dropout=dropout if num_layers>1 else 0)
self.dropout = nn.Dropout(dropout)
self.fc = nn.Linear(hidden_dim * 2, num_classes)
def forward(self, x):
x = self.embedding(x)
x = self.embed_dropout(x) # 新增:embedding 后 dropout
lstm_out, (h_n, c_n) = self.lstm(x)
last_hidden = lstm_out[:, -1, :]
last_hidden = self.dropout(last_hidden)
logits = self.fc(last_hidden)
return logits
# 超参数设置
embed_dim = 128
hidden_dim = 128 # 从 64 增大到 128
num_layers = 1
dropout = 0.6
model = LSTMClassifier(vocab_size, embed_dim, hidden_dim, num_classes, num_layers, dropout)
print("模型结构:\n", model)
# ==================== 7. 训练准备 ====================
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(f"使用设备: {device}")
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001) # 学习率从 0.005 降到 0.001
# 早停参数
best_val_acc = 0.0
patience = 10 # 从 5 增大到 10
patience_counter = 0
num_epochs = 100 # 最大训练轮数增加到 100
# ==================== 8. 训练循环 ====================
print("开始训练...")
for epoch in range(num_epochs):
# 训练
model.train()
total_loss = 0.0
for inputs, labels in train_loader:
inputs, labels = inputs.to(device), labels.to(device)
optimizer.zero_grad()
outputs = model(inputs)
loss = criterion(outputs, labels)
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5)
optimizer.step()
total_loss += loss.item()
avg_train_loss = total_loss / len(train_loader)
# 验证
model.eval()
correct = 0
total = 0
val_loss = 0.0
with torch.no_grad():
for inputs, labels in val_loader:
inputs, labels = inputs.to(device), labels.to(device)
outputs = model(inputs)
loss = criterion(outputs, labels)
val_loss += loss.item()
_, predicted = torch.max(outputs, 1)
total += labels.size(0)
correct += (predicted == labels).sum().item()
val_acc = 100 * correct / total
avg_val_loss = val_loss / len(val_loader)
print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, Val Acc: {val_acc:.2f}%")
# 早停判断
if val_acc > best_val_acc:
best_val_acc = val_acc
patience_counter = 0
best_model_state = model.state_dict()
print(f" -> 新的最佳验证准确率: {best_val_acc:.2f}%")
else:
patience_counter += 1
if patience_counter >= patience:
print(f"验证准确率连续 {patience} 个epoch未提升,提前停止训练。")
break
# 加载最佳模型
model.load_state_dict(best_model_state)
# ==================== 9. 测试评估 ====================
model.eval()
correct = 0
total = 0
with torch.no_grad():
for inputs, labels in test_loader:
inputs, labels = inputs.to(device), labels.to(device)
outputs = model(inputs)
_, predicted = torch.max(outputs, 1)
total += labels.size(0)
correct += (predicted == labels).sum().item()
test_acc = 100 * correct / total
print(f"\n测试集准确率: {test_acc:.2f}%")
# ==================== 10. 推理示例 ====================
def predict(text, model, max_len=400):
model.eval()
seq = text_to_sequence(text, max_len)
input_tensor = torch.tensor([seq], dtype=torch.long).to(device)
with torch.no_grad():
logits = model(input_tensor)
pred_idx = torch.argmax(logits, dim=1).item()
return train_data.target_names[pred_idx]
sample_text = test_data.data[0]
true_label = train_data.target_names[test_data.target[0]]
pred_label = predict(sample_text, model)
print("\n推理示例:")
print(f"文本预览: {sample_text[:200]}...")
print(f"真实类别: {true_label}")
print(f"预测类别: {pred_label}")
# 可选保存模型
# torch.save(model.state_dict(), "lstm_classifier_tuned.pth")
输出:
(base) PS E:\Datawhale 2026\base-llm202602> & D:/Users/app/miniconda3/envs/base-llm/python.exe "e:/Datawhale 2026/base-llm202602/03_5_lstm_classifier_tuned.py"
正在加载 20newsgroups 数据...
原始训练样本数: 11314
测试样本数: 7532
类别数: 20
训练集大小: 9051
验证集大小: 2263
词表大小(含填充符0): 20001
正在转换训练集...
正在转换验证集...
正在转换测试集...
第一条训练样本的有效词比例: 0.09
模型结构:
LSTMClassifier(
(embedding): Embedding(20001, 128, padding_idx=0)
(embed_dropout): Dropout(p=0.6, inplace=False)
(lstm): LSTM(128, 128, batch_first=True, bidirectional=True)
(dropout): Dropout(p=0.6, inplace=False)
(fc): Linear(in_features=256, out_features=20, bias=True)
)
使用设备: cuda
开始训练...
Epoch [1/100], Train Loss: 2.9934, Val Loss: 2.9925, Val Acc: 5.30%
-> 新的最佳验证准确率: 5.30%
Epoch [2/100], Train Loss: 2.9904, Val Loss: 2.9934, Val Acc: 5.04%
Epoch [3/100], Train Loss: 2.9851, Val Loss: 2.9895, Val Acc: 5.52%
-> 新的最佳验证准确率: 5.52%
Epoch [4/100], Train Loss: 2.9772, Val Loss: 2.9923, Val Acc: 5.92%
-> 新的最佳验证准确率: 5.92%
Epoch [5/100], Train Loss: 2.9688, Val Loss: 2.9891, Val Acc: 5.92%
Epoch [6/100], Train Loss: 2.9609, Val Loss: 3.0199, Val Acc: 5.66%
Epoch [7/100], Train Loss: 2.9602, Val Loss: 2.9890, Val Acc: 5.52%
Epoch [8/100], Train Loss: 2.9398, Val Loss: 2.9826, Val Acc: 5.92%
Epoch [9/100], Train Loss: 2.9257, Val Loss: 2.9847, Val Acc: 6.50%
-> 新的最佳验证准确率: 6.50%
Epoch [10/100], Train Loss: 2.9158, Val Loss: 2.9717, Val Acc: 6.05%
Epoch [11/100], Train Loss: 2.9083, Val Loss: 2.9788, Val Acc: 6.27%
Epoch [12/100], Train Loss: 2.8983, Val Loss: 2.9757, Val Acc: 5.74%
Epoch [13/100], Train Loss: 2.8955, Val Loss: 2.9722, Val Acc: 7.03%
-> 新的最佳验证准确率: 7.03%
Epoch [14/100], Train Loss: 2.8858, Val Loss: 2.9745, Val Acc: 6.54%
Epoch [15/100], Train Loss: 2.8636, Val Loss: 2.9735, Val Acc: 6.50%
Epoch [16/100], Train Loss: 2.8723, Val Loss: 2.9780, Val Acc: 6.45%
Epoch [17/100], Train Loss: 2.8414, Val Loss: 2.9722, Val Acc: 6.76%
Epoch [18/100], Train Loss: 2.8444, Val Loss: 2.9771, Val Acc: 6.72%
Epoch [19/100], Train Loss: 2.8255, Val Loss: 2.9778, Val Acc: 6.63%
Epoch [20/100], Train Loss: 2.8099, Val Loss: 2.9900, Val Acc: 6.89%
Epoch [21/100], Train Loss: 2.8012, Val Loss: 2.9472, Val Acc: 7.56%
-> 新的最佳验证准确率: 7.56%
Epoch [22/100], Train Loss: 2.7923, Val Loss: 2.9121, Val Acc: 8.53%
-> 新的最佳验证准确率: 8.53%
Epoch [23/100], Train Loss: 2.7379, Val Loss: 2.8737, Val Acc: 9.54%
-> 新的最佳验证准确率: 9.54%
Epoch [24/100], Train Loss: 2.7570, Val Loss: 2.8585, Val Acc: 7.78%
Epoch [25/100], Train Loss: 2.7018, Val Loss: 2.7924, Val Acc: 10.34%
-> 新的最佳验证准确率: 10.34%
Epoch [26/100], Train Loss: 2.6723, Val Loss: 2.7729, Val Acc: 10.30%
Epoch [27/100], Train Loss: 2.6525, Val Loss: 2.7687, Val Acc: 10.47%
-> 新的最佳验证准确率: 10.47%
Epoch [28/100], Train Loss: 2.6096, Val Loss: 2.7053, Val Acc: 12.77%
-> 新的最佳验证准确率: 12.77%
Epoch [29/100], Train Loss: 2.5758, Val Loss: 2.6341, Val Acc: 15.33%
-> 新的最佳验证准确率: 15.33%
Epoch [30/100], Train Loss: 2.5617, Val Loss: 2.6345, Val Acc: 14.94%
Epoch [31/100], Train Loss: 2.5537, Val Loss: 2.6234, Val Acc: 14.76%
Epoch [32/100], Train Loss: 2.5496, Val Loss: 2.5999, Val Acc: 15.25%
Epoch [33/100], Train Loss: 2.4813, Val Loss: 2.5597, Val Acc: 17.54%
-> 新的最佳验证准确率: 17.54%
Epoch [34/100], Train Loss: 2.4723, Val Loss: 2.5195, Val Acc: 17.32%
Epoch [35/100], Train Loss: 2.4294, Val Loss: 2.5008, Val Acc: 18.34%
-> 新的最佳验证准确率: 18.34%
Epoch [36/100], Train Loss: 2.3932, Val Loss: 2.4861, Val Acc: 20.19%
-> 新的最佳验证准确率: 20.19%
Epoch [37/100], Train Loss: 2.3806, Val Loss: 2.3994, Val Acc: 21.21%
-> 新的最佳验证准确率: 21.21%
Epoch [38/100], Train Loss: 2.3594, Val Loss: 2.4138, Val Acc: 19.97%
Epoch [39/100], Train Loss: 2.3324, Val Loss: 2.3612, Val Acc: 22.05%
-> 新的最佳验证准确率: 22.05%
Epoch [40/100], Train Loss: 2.3284, Val Loss: 2.3772, Val Acc: 21.87%
Epoch [41/100], Train Loss: 2.3104, Val Loss: 2.4241, Val Acc: 21.08%
Epoch [42/100], Train Loss: 2.2869, Val Loss: 2.3247, Val Acc: 22.62%
-> 新的最佳验证准确率: 22.62%
Epoch [43/100], Train Loss: 2.2690, Val Loss: 2.3552, Val Acc: 21.48%
Epoch [44/100], Train Loss: 2.2540, Val Loss: 2.2830, Val Acc: 22.98%
-> 新的最佳验证准确率: 22.98%
Epoch [45/100], Train Loss: 2.2201, Val Loss: 2.2846, Val Acc: 24.48%
-> 新的最佳验证准确率: 24.48%
Epoch [46/100], Train Loss: 2.2100, Val Loss: 2.2677, Val Acc: 25.01%
-> 新的最佳验证准确率: 25.01%
Epoch [47/100], Train Loss: 2.2050, Val Loss: 2.2163, Val Acc: 25.36%
-> 新的最佳验证准确率: 25.36%
Epoch [48/100], Train Loss: 2.1736, Val Loss: 2.2280, Val Acc: 24.66%
Epoch [49/100], Train Loss: 2.1752, Val Loss: 2.2007, Val Acc: 25.50%
-> 新的最佳验证准确率: 25.50%
Epoch [50/100], Train Loss: 2.1482, Val Loss: 2.1867, Val Acc: 25.01%
Epoch [51/100], Train Loss: 2.1357, Val Loss: 2.1761, Val Acc: 27.35%
-> 新的最佳验证准确率: 27.35%
Epoch [52/100], Train Loss: 2.1297, Val Loss: 2.1515, Val Acc: 27.35%
Epoch [53/100], Train Loss: 2.1202, Val Loss: 2.1627, Val Acc: 27.79%
-> 新的最佳验证准确率: 27.79%
Epoch [54/100], Train Loss: 2.1075, Val Loss: 2.1540, Val Acc: 26.69%
Epoch [55/100], Train Loss: 2.0704, Val Loss: 2.1215, Val Acc: 30.05%
-> 新的最佳验证准确率: 30.05%
Epoch [56/100], Train Loss: 2.0713, Val Loss: 2.1024, Val Acc: 28.24%
Epoch [57/100], Train Loss: 2.0405, Val Loss: 2.1385, Val Acc: 28.06%
Epoch [58/100], Train Loss: 2.0408, Val Loss: 2.1059, Val Acc: 28.02%
Epoch [59/100], Train Loss: 2.0342, Val Loss: 2.0812, Val Acc: 29.43%
Epoch [60/100], Train Loss: 2.0772, Val Loss: 2.0958, Val Acc: 28.46%
Epoch [61/100], Train Loss: 2.0226, Val Loss: 2.0878, Val Acc: 28.81%
Epoch [62/100], Train Loss: 2.0026, Val Loss: 2.0587, Val Acc: 29.47%
Epoch [63/100], Train Loss: 1.9920, Val Loss: 2.0425, Val Acc: 30.53%
-> 新的最佳验证准确率: 30.53%
Epoch [64/100], Train Loss: 1.9639, Val Loss: 2.0443, Val Acc: 31.29%
-> 新的最佳验证准确率: 31.29%
Epoch [65/100], Train Loss: 1.9620, Val Loss: 2.0068, Val Acc: 32.17%
-> 新的最佳验证准确率: 32.17%
Epoch [66/100], Train Loss: 1.9541, Val Loss: 2.0236, Val Acc: 32.21%
-> 新的最佳验证准确率: 32.21%
Epoch [67/100], Train Loss: 1.9310, Val Loss: 1.9914, Val Acc: 32.30%
-> 新的最佳验证准确率: 32.30%
Epoch [68/100], Train Loss: 1.9216, Val Loss: 1.9637, Val Acc: 33.01%
-> 新的最佳验证准确率: 33.01%
Epoch [69/100], Train Loss: 1.8879, Val Loss: 2.0479, Val Acc: 33.36%
-> 新的最佳验证准确率: 33.36%
Epoch [70/100], Train Loss: 1.8780, Val Loss: 1.9131, Val Acc: 34.56%
-> 新的最佳验证准确率: 34.56%
Epoch [71/100], Train Loss: 1.8952, Val Loss: 1.9649, Val Acc: 35.00%
-> 新的最佳验证准确率: 35.00%
Epoch [72/100], Train Loss: 1.8307, Val Loss: 1.8973, Val Acc: 35.04%
-> 新的最佳验证准确率: 35.04%
Epoch [73/100], Train Loss: 1.8079, Val Loss: 1.8552, Val Acc: 36.63%
-> 新的最佳验证准确率: 36.63%
Epoch [74/100], Train Loss: 1.7991, Val Loss: 1.8432, Val Acc: 36.94%
-> 新的最佳验证准确率: 36.94%
Epoch [75/100], Train Loss: 1.7814, Val Loss: 1.8238, Val Acc: 37.47%
-> 新的最佳验证准确率: 37.47%
Epoch [76/100], Train Loss: 1.7588, Val Loss: 1.7813, Val Acc: 38.40%
-> 新的最佳验证准确率: 38.40%
Epoch [77/100], Train Loss: 1.7229, Val Loss: 1.7557, Val Acc: 39.06%
-> 新的最佳验证准确率: 39.06%
Epoch [78/100], Train Loss: 1.7006, Val Loss: 1.7336, Val Acc: 40.30%
-> 新的最佳验证准确率: 40.30%
Epoch [79/100], Train Loss: 1.6861, Val Loss: 1.7128, Val Acc: 39.86%
Epoch [80/100], Train Loss: 1.6666, Val Loss: 1.7038, Val Acc: 40.48%
-> 新的最佳验证准确率: 40.48%
Epoch [81/100], Train Loss: 1.6702, Val Loss: 1.7240, Val Acc: 41.41%
-> 新的最佳验证准确率: 41.41%
Epoch [82/100], Train Loss: 1.6383, Val Loss: 1.6604, Val Acc: 41.45%
-> 新的最佳验证准确率: 41.45%
Epoch [83/100], Train Loss: 1.6507, Val Loss: 1.6693, Val Acc: 41.32%
Epoch [84/100], Train Loss: 1.6065, Val Loss: 1.6255, Val Acc: 41.80%
-> 新的最佳验证准确率: 41.80%
Epoch [85/100], Train Loss: 1.5862, Val Loss: 1.6410, Val Acc: 42.38%
-> 新的最佳验证准确率: 42.38%
Epoch [86/100], Train Loss: 1.5491, Val Loss: 1.5925, Val Acc: 44.41%
-> 新的最佳验证准确率: 44.41%
Epoch [87/100], Train Loss: 1.5447, Val Loss: 1.5986, Val Acc: 43.39%
Epoch [88/100], Train Loss: 1.5309, Val Loss: 1.5973, Val Acc: 44.68%
-> 新的最佳验证准确率: 44.68%
Epoch [89/100], Train Loss: 1.5058, Val Loss: 1.5403, Val Acc: 45.43%
-> 新的最佳验证准确率: 45.43%
Epoch [90/100], Train Loss: 1.5130, Val Loss: 1.5496, Val Acc: 44.59%
Epoch [91/100], Train Loss: 1.4709, Val Loss: 1.5136, Val Acc: 45.38%
Epoch [92/100], Train Loss: 1.4614, Val Loss: 1.5028, Val Acc: 46.49%
-> 新的最佳验证准确率: 46.49%
Epoch [93/100], Train Loss: 1.4471, Val Loss: 1.4874, Val Acc: 47.24%
-> 新的最佳验证准确率: 47.24%
Epoch [94/100], Train Loss: 1.4297, Val Loss: 1.4721, Val Acc: 46.88%
Epoch [95/100], Train Loss: 1.4230, Val Loss: 1.4720, Val Acc: 47.06%
Epoch [96/100], Train Loss: 1.4120, Val Loss: 1.4906, Val Acc: 47.95%
-> 新的最佳验证准确率: 47.95%
Epoch [97/100], Train Loss: 1.4216, Val Loss: 1.4673, Val Acc: 48.30%
-> 新的最佳验证准确率: 48.30%
Epoch [98/100], Train Loss: 1.3817, Val Loss: 1.4378, Val Acc: 48.43%
-> 新的最佳验证准确率: 48.43%
Epoch [99/100], Train Loss: 1.3652, Val Loss: 1.4280, Val Acc: 48.87%
-> 新的最佳验证准确率: 48.87%
Epoch [100/100], Train Loss: 1.3617, Val Loss: 1.4077, Val Acc: 50.33%
-> 新的最佳验证准确率: 50.33%
测试集准确率: 43.75%
推理示例:
文本预览: From: v064mb9k@ubvmsd.cc.buffalo.edu (NEIL B. GANDLER)
Subject: Need info on 88-89 Bonneville
Organization: University at Buffalo
Lines: 10
News-Software: VAX/VMS VNEWS 1.41
Nntp-Posting-Host: ubvmsd....
真实类别: rec.autos
预测类别: rec.sport.baseball
模型训练了100个epoch,验证准确率最终达到50.33%,但测试准确率只有43.75%,且推理示例将汽车类新闻误判为棒球类。这说明:
- 模型已经学到了一些模式:验证准确率从 5% 逐渐上升到 50%,证明 LSTM 确实在捕捉文本特征。
- 存在一定程度的过拟合(泛化差距):验证准确率在后期仍在上升(最后 10 个 epoch 从 47% 升到 50%),但测试准确率明显低于最佳验证准确率,说明模型对训练/验证集的数据特性有所"死记硬背",泛化到新数据的能力不足。
- 推理错误:将汽车相关新闻误判为棒球,可能是因为这两个类别同属 rec.* 板块、共享不少口语化词汇,或者模型还没有学到足够有区分度的关键词。
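想知道 rec.autos 具体和哪些类别混淆,可以在测试集上算一张混淆矩阵(示意代码,复用调优版脚本里的 model、test_loader、device、train_data 等变量):
python
from sklearn.metrics import confusion_matrix

# 收集测试集上的预测与真实标签
all_preds, all_labels = [], []
model.eval()
with torch.no_grad():
    for inputs, labels in test_loader:
        preds = model(inputs.to(device)).argmax(dim=1)
        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(labels.tolist())

cm = confusion_matrix(all_labels, all_preds)
autos_idx = train_data.target_names.index("rec.autos")
# 看看真实类别为 rec.autos 的样本最常被预测成哪些类别
top = cm[autos_idx].argsort()[::-1][:5]
for j in top:
    print(train_data.target_names[j], int(cm[autos_idx, j]))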
参考了老师的代码,并与自己的实现对比了一下,整理出以下几点:
一、核心改进点
1. 正确处理变长序列(pack_padded_sequence)
- 问题:之前的代码直接将填充后的序列输入 LSTM,没有使用 pack_padded_sequence。这会导致 LSTM 对填充部分也进行计算,污染隐藏状态,尤其在提取最后一个时间步的输出时(lstm_out[:, -1, :]),如果最后一个词是填充,则取到的是无意义的向量。
- 参考资料方案:通过 collate_fn 返回每个样本的真实长度 lengths,在模型 forward 中使用 pack_padded_sequence 打包,让 LSTM 只对真实序列进行计算,最后从打包后的输出中提取最终隐藏状态(使用 hidden 而非 lstm_out)。这才是 LSTM 的标准用法。
2. 正确提取 LSTM 最终隐藏状态
- 问题:使用了 lstm_out[:, -1, :] 提取最后一个时间步的输出。对于单向 LSTM,如果句尾是填充,这个位置的输出是模型在若干个填充 0 向量上继续计算后的状态,已经偏离了真实句尾;对于双向 LSTM,lstm_out[:, -1, :] 的反向分量是反向方向在最后一个位置的输出,而反向方向是从句尾往句首读的,到这个位置它只看过最后一个词,并不能代表整个序列的语义。
- 参考资料方案:使用 LSTM 返回的 hidden 状态:
  - 单向 LSTM:hidden[-1, :, :] 是最后一层在真实序列终点的隐藏状态。
  - 双向 LSTM:拼接 hidden[-2, :, :](正向最后)和 hidden[-1, :, :](反向最后),得到整个序列的完整表示。
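用一个玩具例子可以直接验证这两点(示意代码,与上面的分类脚本无关,只演示 API 行为):
python
import torch
import torch.nn as nn

torch.manual_seed(0)
lstm = nn.LSTM(input_size=4, hidden_size=3, batch_first=True)

# 两条序列:真实长度分别为 5 和 2,第二条右侧补零模拟填充
x = torch.randn(2, 5, 4)
x[1, 2:] = 0.0
lengths = torch.tensor([5, 2])

# 做法一:直接喂入填充后的序列,取最后一个时间步
out_pad, _ = lstm(x)
last_by_index = out_pad[:, -1, :]          # 第二条取到的是"又跑了 3 步填充"之后的状态

# 做法二:pack 之后再喂入,取 hidden(每条序列各自真实终点的状态)
packed = nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
_, (h_n, _) = lstm(packed)
last_by_hidden = h_n[-1]                   # (batch, hidden_dim)

print(torch.allclose(last_by_index[0], last_by_hidden[0]))  # 预期 True:第一条没有填充
print(torch.allclose(last_by_index[1], last_by_hidden[1]))  # 预期 False:第二条被填充污染
这也解释了为什么后面的完整版代码在 forward 里改用 hidden 而不是 lstm_out[:, -1, :]。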
3. 正则化策略的具体实现
- 随机 Token 遮盖:之前的代码没有做数据增强。参考资料给出了一个继承自 Dataset 的子类,在训练时随机将部分词替换为 <UNK>,防止模型过度依赖个别关键词,提升泛化能力。
- 提前停止:虽然实现了早停,但参考资料中的实现更规范(继承 Trainer,保存最佳模型,监控验证准确率)。
- Dropout 的合理位置:模型只在最后的隐藏状态后加了 Dropout,但参考资料中在 LSTM 层间也启用了 Dropout(通过设置 num_layers>1 和 dropout 参数),这能更好地防止过拟合。
4. 实验对比的启示
参考资料通过对比实验得出一个重要的结论:对于新闻分类这种关键词驱动的任务,简单的全连接模型(词袋)可能已经足够好,LSTM 的序列建模能力并未带来优势,甚至可能因过拟合而表现更差。这解释了为什么原先的 LSTM 准确率始终无法达到全连接模型的水平:即使正确实现,也可能只是略低于全连接模型(参考资料中正则化后 LSTM 为 84.15%,全连接为 84.69%)。所以,不要对 LSTM 的绝对准确率抱有过高期望,但正确实现是前提。
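如果想自己快速验证这个结论,可以跑一个词袋基线作为参照(示意代码,用 TF-IDF + 逻辑回归,与参考资料中的全连接模型并不完全相同;X_train_texts_full 等变量沿用前面脚本的定义):
python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# 词袋(TF-IDF)表示 + 线性分类器:完全不看词序,只看词频
tfidf = TfidfVectorizer(max_features=20000, stop_words="english")
Xtr = tfidf.fit_transform(X_train_texts_full)
Xte = tfidf.transform(X_test_texts)

clf = LogisticRegression(max_iter=1000)
clf.fit(Xtr, y_train_full)
print("词袋基线测试集准确率:", accuracy_score(y_test, clf.predict(Xte)))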
二、之前代码的主要问题总结
| 问题 | 原先的实现 | 正确做法(参考材料) |
|---|---|---|
| 变长序列处理 | 未使用 pack_padded_sequence,直接输入填充后的序列 | 使用 pack_padded_sequence 打包,避免填充干扰 |
| 最终隐藏状态提取 | 使用 lstm_out[:, -1, :] | 使用 hidden 状态,并根据双向性拼接 |
| 序列长度传递 | 未传递 lengths 给模型 | collate_fn 返回 lengths,模型接收 lengths |
| 数据增强 | 无 | 随机 Token 遮盖 |
| Dropout 位置 | 仅最后全连接前 | LSTM 层间 Dropout(num_layers>1 时) |
| 训练/推理时长度处理 | 未传递长度,未使用 pack | 在训练和推理时均需传递长度并使用 pack |
正是这些细节的缺失,导致模型只学到了皮毛,验证准确率仅 50% 左右,而参考材料中正确实现的 LSTM 可达 84%。
代码关键点说明
- 数据集返回原始序列:TextClassificationDatasetRaw 返回未填充的 ID 序列,由 collate_fn 统一填充并记录长度。
- collate_fn:计算批次最大长度,右侧填充,并返回 lengths 张量(真实长度)。
- LSTM 模型:
  - 使用 pack_padded_sequence 打包,避免填充影响。
  - 从 hidden 中提取最终隐藏状态:双向时拼接 hidden[-2](正向最后一层)和 hidden[-1](反向最后一层)。
  - 设置 batch_first=True,输入形状为 (batch, seq, feature)。
- 随机 Token 遮盖:仅在训练集启用,随机将部分词(非填充)替换为 <UNK> 的 ID(此处设为 1),防止过拟合。
- 梯度裁剪:clip_grad_norm_ 防止梯度爆炸。
- 提前停止:监控验证集准确率,连续 patience 轮未提升则停止,并保存最佳模型。
- L2 正则化:优化器中使用 weight_decay。
python
"""
lstm_text_classifier_complete.py
完整版 LSTM 文本分类器(20newsgroups)
- 使用 pack_padded_sequence 处理变长序列
- 正确提取 LSTM 最终隐藏状态(支持双向)
- 训练集随机 Token 遮盖(数据增强)
- Dropout 正则化
- 提前停止(Early Stopping)
- 输出验证集最佳模型,并在测试集上评估
"""
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import random
import os
# ==================== 1. 设置随机种子(保证可复现) ====================
def set_seed(seed=42):
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
set_seed(42)
# ==================== 2. 加载数据 ====================
print("正在加载 20newsgroups 数据...")
train_data = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)
test_data = fetch_20newsgroups(subset='test', shuffle=True, random_state=42)
X_train_texts_full = train_data.data
y_train_full = train_data.target
X_test_texts = test_data.data
y_test = test_data.target
num_classes = len(train_data.target_names)
print(f"原始训练样本数: {len(X_train_texts_full)}")
print(f"测试样本数: {len(X_test_texts)}")
print(f"类别数: {num_classes}")
# ==================== 3. 划分训练集和验证集 ====================
X_train_texts, X_val_texts, y_train, y_val = train_test_split(
X_train_texts_full, y_train_full, test_size=0.2, random_state=42
)
print(f"训练集大小: {len(X_train_texts)}")
print(f"验证集大小: {len(X_val_texts)}")
# ==================== 4. 构建词表与文本向量化 ====================
max_features = 20000
vectorizer = CountVectorizer(max_features=max_features, stop_words='english')
vectorizer.fit(X_train_texts_full) # 使用全部训练文本构建词表
# 词表映射:词 -> 索引(从1开始,0留给填充符/未知词)
word2idx = {word: i+1 for i, word in enumerate(vectorizer.get_feature_names_out())}
vocab_size = len(word2idx) + 1
print(f"词表大小(含填充符0): {vocab_size}")
# 获取分词器(与 CountVectorizer 一致)
tokenizer = vectorizer.build_tokenizer()
def text_to_sequence(text, max_len=400):
"""将文本转为整数ID序列(填充/截断到固定长度)"""
tokens = tokenizer(text.lower())
ids = [word2idx.get(token, 0) for token in tokens] # 未知词用0
if len(ids) > max_len:
ids = ids[:max_len]
else:
ids = ids + [0] * (max_len - len(ids))
return ids
# ==================== 5. 定义数据集类(支持随机 Token 遮盖) ====================
# 设计说明:Dataset 只做分词、截断和(训练时的)随机遮盖,返回"未填充"的原始 ID 序列。
# 如果在 Dataset 里就统一填充到 max_len,后面将无法区分"填充的 0"和"未知词的 0",
# 也拿不到每条样本的真实长度,pack_padded_sequence 就无从谈起。
# 统一填充和长度统计交给第 6 步的 collate_fn 来做。
class TextClassificationDatasetRaw(Dataset):
def __init__(self, texts, labels, max_len=400, is_train=False, mask_prob=0.1):
self.texts = texts
self.labels = labels
self.max_len = max_len # 用于截断,但不填充
self.is_train = is_train
self.mask_prob = mask_prob
        self.unk_token_id = 1  # 用 1 充当 <UNK>(注意:1 同时也是词表中第一个词的 ID,更严谨的做法是为 <UNK> 预留单独索引)
def __len__(self):
return len(self.texts)
def __getitem__(self, idx):
text = self.texts[idx]
label = self.labels[idx]
tokens = tokenizer(text.lower())
# 截断到 max_len(不填充)
token_ids = [word2idx.get(token, 0) for token in tokens][:self.max_len]
if self.is_train and self.mask_prob > 0:
# 随机遮盖
token_ids = token_ids.copy()
for i, tid in enumerate(token_ids):
if tid != 0 and random.random() < self.mask_prob:
token_ids[i] = self.unk_token_id
return {
"token_ids": token_ids, # 原始长度,未填充
"label": label
}
# ==================== 6. 定义 collate_fn(按批次填充并返回 lengths) ====================
def collate_fn(batch):
# 找出批次中的最大长度
max_len_in_batch = max(len(item["token_ids"]) for item in batch)
token_ids_padded = []
labels = []
lengths = []
for item in batch:
ids = item["token_ids"]
lengths.append(len(ids))
# 右侧填充 0 到 max_len_in_batch
padding_len = max_len_in_batch - len(ids)
padded_ids = ids + [0] * padding_len
token_ids_padded.append(padded_ids)
labels.append(item["label"])
return {
"token_ids": torch.tensor(token_ids_padded, dtype=torch.long),
"labels": torch.tensor(labels, dtype=torch.long),
"lengths": torch.tensor(lengths, dtype=torch.long)
}
# ==================== 7. 定义 LSTM 模型(正确使用 pack_padded_sequence)====================
class LSTMClassifier(nn.Module):
def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes,
n_layers=2, dropout=0.3, bidirectional=True):
super().__init__()
self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
self.lstm = nn.LSTM(
embed_dim, hidden_dim, n_layers,
batch_first=True,
bidirectional=bidirectional,
dropout=dropout if n_layers > 1 else 0
)
self.dropout = nn.Dropout(dropout)
num_directions = 2 if bidirectional else 1
self.fc = nn.Linear(hidden_dim * num_directions, num_classes)
def forward(self, token_ids, lengths):
# token_ids: (batch, seq_len)
embedded = self.embedding(token_ids) # (batch, seq_len, embed_dim)
# 打包变长序列
packed = nn.utils.rnn.pack_padded_sequence(
embedded, lengths.cpu(), batch_first=True, enforce_sorted=False
)
# LSTM 前向
packed_output, (hidden, cell) = self.lstm(packed)
# hidden 形状: (n_layers * num_directions, batch, hidden_dim)
if self.lstm.bidirectional:
# 拼接正向最后一层和反向最后一层
hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1) # (batch, hidden_dim*2)
else:
hidden = hidden[-1, :, :] # (batch, hidden_dim)
hidden = self.dropout(hidden)
logits = self.fc(hidden) # (batch, num_classes)
return logits
# ==================== 8. Define the training function (with early stopping) ====================
def train_model(model, train_loader, val_loader, criterion, optimizer,
                device, epochs=50, patience=5, output_dir="./output_lstm"):
    os.makedirs(output_dir, exist_ok=True)
    best_val_acc = 0.0
    patience_counter = 0
    best_model_path = os.path.join(output_dir, "best_model.pth")
    for epoch in range(1, epochs + 1):
        # training phase
        model.train()
        train_loss = 0.0
        for batch in train_loader:
            token_ids = batch["token_ids"].to(device)
            labels = batch["labels"].to(device)
            lengths = batch["lengths"]  # keep on CPU (required by pack_padded_sequence)
            optimizer.zero_grad()
            outputs = model(token_ids, lengths)
            loss = criterion(outputs, labels)
            loss.backward()
            # gradient clipping (guards against exploding gradients)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5)
            optimizer.step()
            train_loss += loss.item()
        avg_train_loss = train_loss / len(train_loader)
        # validation phase
        model.eval()
        correct = 0
        total = 0
        val_loss = 0.0
        with torch.no_grad():
            for batch in val_loader:
                token_ids = batch["token_ids"].to(device)
                labels = batch["labels"].to(device)
                lengths = batch["lengths"]
                outputs = model(token_ids, lengths)
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                _, predicted = torch.max(outputs, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        val_acc = 100 * correct / total
        avg_val_loss = val_loss / len(val_loader)
        print(f"Epoch {epoch:2d}/{epochs} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | Val Acc: {val_acc:.2f}%")
        # early-stopping check
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            patience_counter = 0
            torch.save(model.state_dict(), best_model_path)
            print(" -> New best validation accuracy! Model saved.")
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"Validation accuracy has not improved for {patience} epochs; stopping early.")
                break
    # reload the best checkpoint
    model.load_state_dict(torch.load(best_model_path))
    print(f"Training finished. Best validation accuracy: {best_val_acc:.2f}%")
    return model
# ==================== 9. Main program ====================
def main():
    # hyperparameters
    embed_dim = 128
    hidden_dim = 128
    n_layers = 2
    dropout = 0.3
    bidirectional = True
    batch_size = 64
    max_len = 400
    learning_rate = 0.001
    epochs = 50
    patience = 5
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Device: {device}")
    # build datasets and data loaders
    train_dataset = TextClassificationDatasetRaw(X_train_texts, y_train, max_len, is_train=True, mask_prob=0.1)
    val_dataset = TextClassificationDatasetRaw(X_val_texts, y_val, max_len, is_train=False)
    test_dataset = TextClassificationDatasetRaw(X_test_texts, y_test, max_len, is_train=False)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
    # initialize the model
    model = LSTMClassifier(
        vocab_size=vocab_size,
        embed_dim=embed_dim,
        hidden_dim=hidden_dim,
        num_classes=num_classes,
        n_layers=n_layers,
        dropout=dropout,
        bidirectional=bidirectional
    ).to(device)
    # loss function and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-4)  # weight decay acts as L2 regularization
    # train
    model = train_model(model, train_loader, val_loader, criterion, optimizer,
                        device, epochs, patience, output_dir="./output_lstm_final")
    # evaluate on the test set
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in test_loader:
            token_ids = batch["token_ids"].to(device)
            labels = batch["labels"].to(device)
            lengths = batch["lengths"]
            outputs = model(token_ids, lengths)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    test_acc = 100 * correct / total
    print(f"\nTest accuracy: {test_acc:.2f}%")

    # inference example
    def predict(text):
        model.eval()
        # convert to a raw (unpadded) id sequence
        tokens = tokenizer(text.lower())
        ids = [word2idx.get(token, 0) for token in tokens][:max_len]
        # build a batch of size 1
        input_tensor = torch.tensor([ids], dtype=torch.long).to(device)
        length_tensor = torch.tensor([len(ids)], dtype=torch.long)
        with torch.no_grad():
            logits = model(input_tensor, length_tensor)
        pred_idx = torch.argmax(logits, dim=1).item()
        return train_data.target_names[pred_idx]

    sample_text = test_data.data[0]
    true_label = test_data.target_names[test_data.target[0]]
    pred_label = predict(sample_text)
    print("\nInference example:")
    print(f"Text preview: {sample_text[:200]}...")
    print(f"True label: {true_label}")
    print(f"Predicted label: {pred_label}")

if __name__ == "__main__":
    main()
Output:
(base) PS E:\Datawhale 2026\base-llm202602> & D:/Users/app/miniconda3/envs/base-llm/python.exe "e:/Datawhale 2026/base-llm202602/03_6_lstm_text_classifier_complete.py"
Loading the 20newsgroups data...
Original training samples: 11314
Test samples: 7532
Number of classes: 20
Training set size: 9051
Validation set size: 2263
Vocabulary size (including padding id 0): 20001
Device: cuda
Epoch 1/50 | Train Loss: 2.8804 | Val Loss: 2.6584 | Val Acc: 19.62%
-> New best validation accuracy! Model saved.
Epoch 2/50 | Train Loss: 2.2695 | Val Loss: 1.9502 | Val Acc: 41.71%
-> New best validation accuracy! Model saved.
Epoch 3/50 | Train Loss: 1.6842 | Val Loss: 1.6540 | Val Acc: 51.26%
-> New best validation accuracy! Model saved.
Epoch 4/50 | Train Loss: 1.3165 | Val Loss: 1.4273 | Val Acc: 57.71%
-> New best validation accuracy! Model saved.
Epoch 5/50 | Train Loss: 1.0305 | Val Loss: 1.2409 | Val Acc: 63.37%
-> New best validation accuracy! Model saved.
Epoch 6/50 | Train Loss: 0.8248 | Val Loss: 1.0151 | Val Acc: 70.92%
-> New best validation accuracy! Model saved.
Epoch 7/50 | Train Loss: 0.6379 | Val Loss: 0.9958 | Val Acc: 72.03%
-> New best validation accuracy! Model saved.
Epoch 8/50 | Train Loss: 0.5360 | Val Loss: 0.8986 | Val Acc: 74.50%
-> New best validation accuracy! Model saved.
Epoch 9/50 | Train Loss: 0.4318 | Val Loss: 0.8365 | Val Acc: 77.46%
-> New best validation accuracy! Model saved.
Epoch 10/50 | Train Loss: 0.3474 | Val Loss: 0.8558 | Val Acc: 77.95%
-> New best validation accuracy! Model saved.
Epoch 11/50 | Train Loss: 0.3088 | Val Loss: 0.8664 | Val Acc: 77.77%
Epoch 12/50 | Train Loss: 0.2585 | Val Loss: 0.8874 | Val Acc: 77.51%
Epoch 13/50 | Train Loss: 0.2270 | Val Loss: 0.7930 | Val Acc: 79.36%
-> New best validation accuracy! Model saved.
Epoch 14/50 | Train Loss: 0.1984 | Val Loss: 0.8261 | Val Acc: 80.34%
-> New best validation accuracy! Model saved.
Epoch 15/50 | Train Loss: 0.1794 | Val Loss: 0.8356 | Val Acc: 79.85%
Epoch 16/50 | Train Loss: 0.1738 | Val Loss: 0.8082 | Val Acc: 81.40%
-> New best validation accuracy! Model saved.
Epoch 17/50 | Train Loss: 0.1404 | Val Loss: 0.8216 | Val Acc: 81.62%
-> New best validation accuracy! Model saved.
Epoch 18/50 | Train Loss: 0.1419 | Val Loss: 0.7745 | Val Acc: 82.50%
-> New best validation accuracy! Model saved.
Epoch 19/50 | Train Loss: 0.1236 | Val Loss: 0.7664 | Val Acc: 83.65%
-> New best validation accuracy! Model saved.
Epoch 20/50 | Train Loss: 0.1149 | Val Loss: 0.8000 | Val Acc: 82.63%
Epoch 21/50 | Train Loss: 0.1133 | Val Loss: 0.7914 | Val Acc: 83.69%
-> New best validation accuracy! Model saved.
Epoch 22/50 | Train Loss: 0.1037 | Val Loss: 0.8187 | Val Acc: 81.88%
Epoch 23/50 | Train Loss: 0.1065 | Val Loss: 0.7871 | Val Acc: 82.85%
Epoch 24/50 | Train Loss: 0.0891 | Val Loss: 0.7629 | Val Acc: 84.71%
-> New best validation accuracy! Model saved.
Epoch 25/50 | Train Loss: 0.0962 | Val Loss: 0.7454 | Val Acc: 83.87%
Epoch 26/50 | Train Loss: 0.0957 | Val Loss: 0.7652 | Val Acc: 83.16%
Epoch 27/50 | Train Loss: 0.0974 | Val Loss: 0.8072 | Val Acc: 83.08%
Epoch 28/50 | Train Loss: 0.0874 | Val Loss: 0.7594 | Val Acc: 83.78%
Epoch 29/50 | Train Loss: 0.0974 | Val Loss: 0.7651 | Val Acc: 83.87%
Validation accuracy has not improved for 5 epochs; stopping early.
Training finished. Best validation accuracy: 84.71%
Test accuracy: 72.24%
Inference example:
Text preview: From: v064mb9k@ubvmsd.cc.buffalo.edu (NEIL B. GANDLER)
Subject: Need info on 88-89 Bonneville
Organization: University at Buffalo
Lines: 10
News-Software: VAX/VMS VNEWS 1.41
Nntp-Posting-Host: ubvmsd....
True label: rec.autos
Predicted label: rec.motorcycles
📊 Interpreting the Output
1. Training process
- Validation accuracy rises steadily: from 19.62% at epoch 1 to 84.71% at epoch 24, so the model is clearly learning and there is no severe overfitting (validation loss trends downward overall).
- Early stopping triggers: training stops at epoch 29 after the validation accuracy fails to improve for 5 consecutive epochs; the saved best model is the epoch-24 checkpoint at 84.71%. Early stopping did its job and cut off unproductive training.
- Training loss: drops from 2.88 to 0.09, so the model fits the training set very well (arguably starting to overfit, but early stopping kept it in check).
2. Test results
- Test accuracy 72.24%: about 12 percentage points below the best validation accuracy. This is normal, since the test set is data the model has never seen. 72% is a respectable result for a 20-class task, especially without pretrained word embeddings.
- Inference example:
  - The text is a query about an 88-89 Pontiac Bonneville car.
  - True label: rec.autos (cars)
  - Predicted label: rec.motorcycles (motorcycles)
  - Both classes belong to the broader "vehicles" group, so the model at least got the general direction right. A likely cause of the error is that some words in the text (such as "Need info" or the year numbers) resemble posts in the motorcycle group.
3. Comparison with the reference material
- In the reference material, the regularized LSTM reaches a validation accuracy of about 84.15%; this model reaches 84.71%, slightly higher.
- The test accuracy of 72.24% is lower than the validation accuracy, but given that no pretrained embeddings and no careful hyperparameter tuning were used, the result is quite satisfactory.
🔍 Why is test accuracy lower than validation accuracy?
This is a very common phenomenon. Possible reasons:
- The validation set is split off from the training set, so its distribution is close to the training data; the test set is independent and may differ in distribution (newsgroup topics shift over time, etc.).
- The model still overfits the training and validation data to some degree (despite regularization), so performance drops on completely new data.
- Some classes are inherently easy to confuse, for example rec.autos vs. rec.motorcycles, or sci.med vs. sci.space.
Notes:
Section 1: Recurrent Neural Networks
1. Why should a computer remember word order?
When a computer processes a sentence, it cannot look only at the individual words; it also has to look at their order. For example:
- "I love you" and "you love me" mean completely different things, yet they contain exactly the same words.
- If we simply added up the meanings of the individual words, the two sentences would become identical and the computer could no longer tell them apart.
So we need a way for the computer to read a sentence while remembering what it has already read; only then can it understand the whole sentence.
2. RNN: a little helper with "memory"
An RNN (recurrent neural network) does exactly this. It is like a child listening attentively in class:
- Every time it hears a word, it combines that word with what it has remembered so far and stores the result in its head.
- Then it listens to the next word and updates its memory again using the new word plus the previous memory.
- After the whole sentence, the "final memory" in its head is the meaning of the entire sentence.
An example 🌰
Sentence: "播放 周杰伦 的 《稻香》" ("Play Jay Chou's 'Daoxiang'")
- Step 1: hear "播放" (play); the memory now holds "play".
- Step 2: hear "周杰伦" (Jay Chou); combined with the earlier "play", the memory is now "play Jay Chou".
- Step 3: hear "的" (of); the memory becomes "play Jay Chou's".
- Step 4: hear "《稻香》" ('Daoxiang'); the final memory is "play Jay Chou's 'Daoxiang'", which is the meaning of the whole command.
Feed this final memory into a classifier and the computer knows this is a "play music" command, as the sketch below shows.
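A minimal sketch of that last step (not part of the homework script): take an RNN's final hidden state and feed it to a linear classifier that labels the whole sentence with an "intent". The intent names, sizes and random weights here are invented purely for illustration.
python
# Sketch: final RNN memory -> linear classifier -> intent label (toy setup).
import torch
import torch.nn as nn

E, H = 128, 3                        # word-vector size and hidden size, as in the homework
intents = ["play_music", "check_weather", "set_alarm"]  # hypothetical label set

rnn = nn.RNN(input_size=E, hidden_size=H, batch_first=True)
classifier = nn.Linear(H, len(intents))

x = torch.randn(1, 4, E)             # stands in for the 4 embedded tokens of the sentence
_, h_n = rnn(x)                       # h_n: (1, batch, H) -- the "final memory"
logits = classifier(h_n.squeeze(0))   # (batch, num_intents)
print(intents[logits.argmax(dim=1).item()])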
3. From "static" word vectors to "dynamic" word representations
Earlier methods (such as Word2Vec) assign each word one fixed vector: no matter where the word appears, its vector is the same. These are "static word vectors".
An RNN is different: the same word ends up with a different "memory" in different sentences. For example:
- "苹果 真好吃" ("The apple tastes great") → the memory around "苹果" (apple) carries "tastes great", so it is the fruit.
- "苹果 发布新手机" ("Apple releases a new phone") → the memory around "苹果" carries "phone", so it is the company.
A representation that changes with context like this is called a dynamic (contextual) representation, and it lets the computer understand what a word actually means inside a sentence. The sketch below makes this concrete.
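A small illustrative sketch (the tiny vocabulary and random embeddings are made up; only the idea matters): the same word id produces different RNN hidden states depending on the words around it.
python
# Sketch: contextual ("dynamic") representations from an RNN.
import torch
import torch.nn as nn

torch.manual_seed(0)
vocab = {"苹果": 0, "真好吃": 1, "发布新手机": 2}
emb = nn.Embedding(len(vocab), 8)
rnn = nn.RNN(input_size=8, hidden_size=4, batch_first=True)

s1 = torch.tensor([[vocab["苹果"], vocab["真好吃"]]])      # "apple tastes great"
s2 = torch.tensor([[vocab["苹果"], vocab["发布新手机"]]])  # "Apple releases a new phone"

out1, _ = rnn(emb(s1))
out2, _ = rnn(emb(s2))
# Position 0 is identical (same word, no context seen yet); position 1 already differs.
# In a bidirectional RNN even position 0 would differ between the two sentences.
print(torch.allclose(out1[:, 0], out2[:, 0]))  # True
print(torch.allclose(out1[:, 1], out2[:, 1]))  # False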
4. Bidirectional RNN: look ahead as well as behind
Sometimes a word's meaning only becomes clear from the words that come after it. For example:
- "味道不错" ("tastes good") → if you have only seen "苹果" (apple) so far, you cannot tell whether it is the fruit or the company; once you see the later "味道不错", you know it is the fruit.
A bidirectional RNN uses two little helpers:
- One reads the sentence from left to right (remembering what comes before).
- One reads it from right to left (remembering what comes after).
- Finally the two memories are combined, so each word gets its full context. A minimal sketch follows this list.
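A minimal sketch (toy sizes, random input) showing that a bidirectional RNN produces, for every position, a forward state and a backward state concatenated together, so the per-position output size doubles.
python
# Sketch: bidirectional RNN output shapes.
import torch
import torch.nn as nn

H = 4
rnn = nn.RNN(input_size=8, hidden_size=H, batch_first=True, bidirectional=True)
x = torch.randn(1, 5, 8)   # one sentence of 5 "words"
out, h_n = rnn(x)
print(out.shape)           # torch.Size([1, 5, 8])  -> 2 * H per position
print(h_n.shape)           # torch.Size([2, 1, 4])  -> one final state per direction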
5. The RNN's weaknesses
- Long sentences get forgotten: if the sequence is very long (say, a long story), by the end the model may have forgotten the beginning. This is the "long-range dependency problem".
- It sometimes gets confused: during training, if the propagated numbers become too small (vanishing gradients) or too large (exploding gradients), the model cannot learn well (the toy computation below shows how fast this happens).
But don't worry: researchers later invented gated RNNs (such as the LSTM) precisely to address these problems.
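A toy illustration (not from the homework): an RNN reuses the same recurrent weight at every step, so the backpropagated signal gets multiplied by roughly that weight again and again. With a factor below 1 it vanishes; above 1 it explodes.
python
# Sketch: repeated multiplication makes a signal vanish or explode.
w_small, w_large = 0.5, 1.5
signal_small, signal_large = 1.0, 1.0
for step in range(30):               # pretend the sentence is 30 words long
    signal_small *= w_small
    signal_large *= w_large
print(f"after 30 steps: {signal_small:.2e} (vanished), {signal_large:.2e} (exploded)")
# after 30 steps: 9.31e-10 (vanished), 1.92e+05 (exploded)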
Section 2: LSTM and GRU
1. What troubles an RNN?
An RNN is like a child listening carefully to a story, memorizing as it goes. But if the story is very, very long (say 100 sentences), the child may well forget how it began. This is "forgetting over long sentences", which researchers call the long-range dependency problem.
Why does it forget? Because the RNN's way of remembering is too simple: it just "mixes" each new word into the old memory, so over time the information from the beginning gets diluted by everything that comes later, like writing on a blackboard until it is full and the earliest writing can no longer be made out.
2. LSTM: a clever notebook with "gates"
The LSTM (long short-term memory network) is like a smart notebook with three gates. It decides for itself which old information to erase, which new information to write down, and which information to read out for others.
The LSTM's gates:
- Forget gate (what to erase): when a new word arrives, it first checks which parts of the old memory are unimportant and marks them to be forgotten.
- Input gate (what to write down): it then decides which parts of the incoming information are worth remembering and marks them to be written into the notebook.
- Output gate (what to say): finally, it picks out what is currently most important and exposes it as the output.
Because the notebook has a dedicated "highway" (the cell state), important information can travel along it without being diluted by new input, which is why an LSTM can remember very long sentences.
An example
Sentence: "小明昨天在公园里看到一只可爱的柯基,那只柯基的名字叫......" ("Xiao Ming saw a cute corgi in the park yesterday; the corgi's name is ...")
When it reaches "那只柯基的名字叫" ("the corgi's name is"), the LSTM knows via the forget gate that the earlier "柯基" (corgi) is important and must be kept, brings in the new word "名字" (name) via the input gate, and finally combines "corgi" and "name" via the output gate to understand that what is being asked for is the dog's name. A plain RNN might already have forgotten "corgi" by then. A small sketch of the gate computations follows this example.
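A minimal one-step LSTM cell in NumPy (a sketch, not PyTorch's exact parameterization; biases are omitted and all sizes and weights are toy values): it shows how the forget, input and output gates update the cell state c and the hidden state h.
python
# Sketch: one LSTM step with forget / input / output gates.
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

rng = np.random.default_rng(0)
E, H = 8, 4                                   # toy input and hidden sizes
x_t = rng.standard_normal(E)                  # current word vector
h_prev = np.zeros(H)                          # previous hidden state
c_prev = np.zeros(H)                          # previous cell state (the "highway")

# one weight matrix per gate plus the candidate, acting on [x_t, h_prev]
W_f, W_i, W_o, W_c = (rng.standard_normal((H, E + H)) for _ in range(4))
z = np.concatenate([x_t, h_prev])

f_t = sigmoid(W_f @ z)                        # forget gate: what to erase from c_prev
i_t = sigmoid(W_i @ z)                        # input gate: how much new info to write
o_t = sigmoid(W_o @ z)                        # output gate: how much of c to expose
c_tilde = np.tanh(W_c @ z)                    # candidate new content
c_t = f_t * c_prev + i_t * c_tilde            # update the cell state
h_t = o_t * np.tanh(c_t)                      # new hidden state
print(h_t.shape, c_t.shape)                   # (4,) (4,)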
3. GRU: a streamlined LSTM
The GRU (gated recurrent unit) is a "slimmed-down" LSTM with only two gates:
- Update gate: roughly the LSTM's forget gate and input gate in one; it decides at the same time how much old information to drop and how much new information to add.
- Reset gate: decides whether to ignore the old information when computing the new candidate content.
The GRU drops the LSTM's separate cell state and keeps only a hidden state, so it is faster to compute while performing about as well as the LSTM: a smaller, lighter notebook. A matching one-step sketch follows.
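A matching one-step GRU sketch in NumPy (again a sketch with toy sizes, random weights and no biases; gate conventions vary slightly across references): only an update gate and a reset gate, and a single hidden state instead of a separate cell state.
python
# Sketch: one GRU step with update and reset gates.
import numpy as np

def sigmoid(v):
    return 1.0 / (1.0 + np.exp(-v))

rng = np.random.default_rng(0)
E, H = 8, 4
x_t = rng.standard_normal(E)
h_prev = np.zeros(H)

W_z, W_r, W_h = (rng.standard_normal((H, E + H)) for _ in range(3))

z_t = sigmoid(W_z @ np.concatenate([x_t, h_prev]))            # update gate
r_t = sigmoid(W_r @ np.concatenate([x_t, h_prev]))            # reset gate
h_tilde = np.tanh(W_h @ np.concatenate([x_t, r_t * h_prev]))  # candidate, old memory optionally reset
h_t = (1 - z_t) * h_prev + z_t * h_tilde                      # blend old and new memory
print(h_t.shape)                                              # (4,)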
4. What LSTM and GRU are for
Both handle long sentences well and perform strongly in tasks such as speech recognition, machine translation and sentiment analysis, precisely because they can selectively remember what matters and forget what does not.