Getting Started: Building an IMDb Movie Review Sentiment Analysis Model with PyTorch
Introduction
In natural language processing, sentiment analysis is one of the most fundamental and widely used applications. Whether analyzing user reviews of a product, gauging public opinion on social media, or monitoring customer feedback, sentiment analysis plays a crucial role. For job seekers, a complete sentiment analysis project is an effective way to demonstrate hands-on NLP skills to interviewers. This article walks you through building a binary sentiment classifier for IMDb movie reviews with PyTorch from scratch, breaking down every key step from data preprocessing to model training.
Project Breakdown: Core Workflow
- Data preparation and preprocessing: text cannot be fed to a model directly, so we convert it to numeric form. This involves tokenization, building a vocabulary, mapping text to sequences of indices, and padding so that all sequences in a batch share the same length (a minimal padding illustration follows this list).
- Model building: we use a classic architecture consisting of an `nn.Embedding` layer and an `nn.LSTM` layer. The embedding layer maps integer indices to dense vectors, while the LSTM (Long Short-Term Memory network) effectively captures long-range dependencies in the text.
- Training loop: define the loss function (`BCEWithLogitsLoss`) and the optimizer (`Adam`), then write a standard training loop: forward pass, loss computation, backpropagation, and parameter updates.
- Evaluation: measure the model's performance on the test set, typically using accuracy as the metric.
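Before diving into the full implementation, here is a tiny, self-contained illustration (separate from the project code below) of how `pad_sequence` right-pads a batch of variable-length index sequences to the length of the longest one:

```python
import torch
from torch.nn.utils.rnn import pad_sequence

# Three "sentences" of different lengths, already converted to indices
seqs = [torch.tensor([4, 7, 2]), torch.tensor([5]), torch.tensor([9, 1])]

padded = pad_sequence(seqs, batch_first=True, padding_value=0)
print(padded)
# tensor([[4, 7, 2],
#         [5, 0, 0],
#         [9, 1, 0]])
```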
PyTorch Implementation
```python
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
import re
from collections import Counter

# --- 1. Data preparation and preprocessing ---
# Simplified IMDb dataset (in a real project, load it from torchtext or Hugging Face).
# Here we hand-craft a tiny dataset for demonstration purposes.
train_data = [
    ("this movie is fantastic", 1),
    ("i loved this film", 1),
    ("a great movie", 1),
    ("terrible acting", 0),
    ("i hate this plot", 0),
    ("what a waste of time", 0),
    ("brilliant screenplay", 1),
    ("the worst movie ever", 0),
    ("so boring and slow", 0),
    ("an absolute masterpiece", 1)
]

test_data = [
    ("this was a good experience", 1),
    ("i would not recommend it", 0),
    ("the acting was superb", 1),
    ("a complete disaster", 0)
]
# Build the vocabulary
def build_vocab(data):
    all_words = " ".join([text for text, label in data]).split()
    word_counts = Counter(all_words)
    # Keep every word here since the demo dataset is tiny; on a real corpus
    # you would raise the threshold (e.g. c >= 2) to drop rare words.
    vocab = sorted([w for w, c in word_counts.items() if c > 0])
    # Prepend the special tokens: <pad> for padding, <unk> for unknown words
    vocab = ['<pad>', '<unk>'] + vocab
    word_to_idx = {word: i for i, word in enumerate(vocab)}
    return word_to_idx

word_to_idx = build_vocab(train_data)
vocab_size = len(word_to_idx)

# Text preprocessing and numericalization
def text_pipeline(text):
    text = re.sub(r'[^\w\s]', '', text.lower())  # lowercase and strip punctuation
    tokens = text.split()
    indices = [word_to_idx.get(token, word_to_idx['<unk>']) for token in tokens]
    return torch.tensor(indices, dtype=torch.long)
# Custom Dataset
class SentimentDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text, label = self.data[idx]
        return text_pipeline(text), torch.tensor(label, dtype=torch.float32)

# collate_fn used by the DataLoader to pad variable-length texts in each batch
def collate_batch(batch):
    label_list, text_list = [], []
    for (_text, _label) in batch:
        label_list.append(_label)
        text_list.append(_text)
    # Pad the text sequences so they all share the same length
    text_padded = pad_sequence(text_list, batch_first=True, padding_value=word_to_idx['<pad>'])
    return torch.stack(label_list), text_padded

train_dataset = SentimentDataset(train_data)
test_dataset = SentimentDataset(test_data)

batch_size = 2
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)
# --- 2. Model building ---
class SentimentLSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super(SentimentLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=1, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        # text shape: [batch_size, seq_len]
        embedded = self.embedding(text)
        # embedded shape: [batch_size, seq_len, embed_dim]
        lstm_out, (hidden, _) = self.lstm(embedded)
        # Use the hidden state of the last time step as the classification feature
        # hidden shape: [1, batch_size, hidden_dim]
        return self.fc(hidden.squeeze(0))
# --- 3. Training loop ---
# Hyperparameters
EMBED_DIM = 64
HIDDEN_DIM = 128
OUTPUT_DIM = 1
N_EPOCHS = 20

# Instantiate the model, loss function, and optimizer
model = SentimentLSTM(vocab_size, EMBED_DIM, HIDDEN_DIM, OUTPUT_DIM)
criterion = nn.BCEWithLogitsLoss()  # for binary classification; applies the sigmoid internally
optimizer = optim.Adam(model.parameters(), lr=0.01)

def train(model, loader, optimizer, criterion):
    model.train()
    epoch_loss = 0
    for labels, texts in loader:
        optimizer.zero_grad()
        predictions = model(texts).squeeze(1)
        loss = criterion(predictions, labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(loader)

def evaluate(model, loader, criterion):
    model.eval()
    epoch_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for labels, texts in loader:
            predictions = model(texts).squeeze(1)
            loss = criterion(predictions, labels)
            epoch_loss += loss.item()
            # Accuracy: threshold the sigmoid output at 0.5
            predicted_labels = torch.round(torch.sigmoid(predictions))
            correct += (predicted_labels == labels).sum().item()
            total += labels.size(0)
    return epoch_loss / len(loader), correct / total

# Train the model
for epoch in range(N_EPOCHS):
    train_loss = train(model, train_loader, optimizer, criterion)
    test_loss, test_acc = evaluate(model, test_loader, criterion)
    print(f'Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')
# --- 4. Model prediction ---
def predict_sentiment(model, sentence):
    model.eval()
    with torch.no_grad():
        processed_text = text_pipeline(sentence).unsqueeze(0)  # add a batch dimension
        # Apply the sigmoid to the logit tensor, then extract the Python float
        prediction = torch.sigmoid(model(processed_text)).item()
    return "Positive" if prediction > 0.5 else "Negative", prediction

print("\n--- Prediction ---")
print(predict_sentiment(model, "this film is a masterpiece"))
print(predict_sentiment(model, "it was a total waste of time"))
```
Summary and Interview Takeaways
This article has walked through a complete sentiment analysis project. In an interview, you should be able to clearly explain:
- Data flow: how raw text passes through `text_pipeline` and `collate_fn` to become a `tensor` the model can consume.
- Model architecture: why `nn.Embedding` can capture word semantics, how `nn.LSTM` processes sequential information, and why the last hidden state is used for classification.
- Technical details: why `BCEWithLogitsLoss` is chosen over `BCELoss` + `Sigmoid` (numerical stability; see the sketch after this list), what `pad_sequence` does, and why `collate_fn` matters in the `DataLoader`.
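To make the numerical-stability argument concrete, here is a minimal sketch (independent of the project code). In float32, the sigmoid saturates to exactly 1.0 for large logits, so a naive `log(1 - p)` blows up; `nn.BCELoss` avoids the infinity only by clamping its log outputs at -100, after the probability has already collapsed, while `BCEWithLogitsLoss` computes the loss from the raw logit using a log-sum-exp formulation and stays well-behaved:

```python
import torch
import torch.nn as nn

logit = torch.tensor([100.0])  # an extreme logit, chosen to force saturation
target = torch.tensor([0.0])

p = torch.sigmoid(logit)       # saturates to exactly 1.0 in float32
naive_bce = -torch.log(1 - p)  # log(0) -> inf: the naive computation blows up
stable = nn.BCEWithLogitsLoss()(logit, target)  # works on the raw logit

print(p)          # tensor([1.])
print(naive_bce)  # tensor([inf])
print(stable)     # tensor(100.) -- finite: log(1 + e^100) ~= 100
```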
Basic as this project is, it covers the core paradigm of NLP tasks and is a worthy line on your resume.