Understanding the Transformer: Building a Text Classifier from Scratch
Introduction
Natural language processing (NLP) is the branch of artificial intelligence devoted to enabling computers to understand, interpret, and generate human language. From the early rule-based approaches, through statistical machine learning, to today's deep learning era, NLP has gone through enormous change.
The arrival of the Transformer architecture reshaped the field completely. Since Google introduced it in the 2017 paper "Attention Is All You Need", the Transformer and its variants (BERT, GPT, T5, and others) have achieved breakthrough results across a wide range of NLP tasks. This article walks through the Transformer's core mechanisms and then builds a Transformer-based text classifier from scratch.

Understanding the Transformer Architecture
The Self-Attention Mechanism
The Transformer's core innovation is the self-attention mechanism. Unlike recurrent neural networks (RNNs) and convolutional neural networks (CNNs), self-attention processes all elements of a sequence in parallel and captures long-range dependencies between them.
Self-attention is built from three key matrices:
- Query (Q): the current token's "query" against the other tokens
- Key (K): the "key" each token exposes to be queried
- Value (V): the "value" each token contributes to the output
The attention weights are computed as follows (a minimal code sketch follows this list):
- Take the dot product of the Query with every Key to obtain raw attention scores
- Scale the scores by the square root of the key dimension and normalize them into weights with softmax
- Use the weights to form a weighted sum of the Values
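To make these steps concrete, here is a minimal sketch of scaled dot-product attention in plain PyTorch. The function name and tensor shapes are illustrative only and are independent of the classifier built later in this article.

```python
import math
import torch

def scaled_dot_product_attention(q, k, v):
    # q, k, v: (batch, seq_len, d_k)
    d_k = q.size(-1)
    # Dot product of every query with every key, scaled by sqrt(d_k)
    scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)
    # Normalize the scores into attention weights
    weights = torch.softmax(scores, dim=-1)
    # Weighted sum of the values
    return torch.matmul(weights, v), weights

# Toy example: one sequence of 4 tokens with dimension 8, attending to itself
x = torch.randn(1, 4, 8)
out, attn = scaled_dot_product_attention(x, x, x)
print(out.shape, attn.shape)  # torch.Size([1, 4, 8]) torch.Size([1, 4, 4])
```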
Multi-Head Attention
A single attention head may not capture every kind of dependency. The Transformer therefore uses multi-head attention: several self-attention computations run in parallel with different learned projection matrices, and their outputs are then combined.
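PyTorch provides this as nn.MultiheadAttention; a minimal self-attention call, with arbitrary example dimensions, looks like this:

```python
import torch
import torch.nn as nn

# 8 heads over a 128-dimensional model; batch_first puts the batch on dim 0
mha = nn.MultiheadAttention(embed_dim=128, num_heads=8, batch_first=True)

x = torch.randn(2, 10, 128)           # (batch, seq_len, d_model)
out, attn_weights = mha(x, x, x)      # self-attention: query = key = value = x
print(out.shape, attn_weights.shape)  # (2, 10, 128), (2, 10, 10)
```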
Positional Encoding
Because the Transformer itself carries no notion of sequence order, positional information must be injected into the input through positional encodings. The original paper uses sine and cosine functions of different frequencies: PE(pos, 2i) = sin(pos / 10000^(2i/d_model)) and PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model)).
The Complete Transformer Architecture
Each layer of a complete Transformer encoder contains (see the snippet after this list for how these map onto PyTorch's built-in modules):
- A multi-head self-attention sublayer
- A residual connection followed by layer normalization
- A position-wise feed-forward network
- A second residual connection and layer normalization
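These components correspond directly to PyTorch's nn.TransformerEncoderLayer, which the classifier later in this article builds on; the hyperparameter values below are placeholders:

```python
import torch.nn as nn

# One encoder block: multi-head self-attention + residual/LayerNorm,
# then a feed-forward network + residual/LayerNorm
encoder_layer = nn.TransformerEncoderLayer(
    d_model=128,          # model (embedding) dimension
    nhead=8,              # number of attention heads
    dim_feedforward=512,  # hidden size of the feed-forward sublayer
    dropout=0.1,
)

# Stack several identical blocks into a full encoder
encoder = nn.TransformerEncoder(encoder_layer, num_layers=2)
```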
Building the Text Classifier
Environment Setup
We will build the classifier with PyTorch (its built-in Transformer modules) together with NumPy and scikit-learn for data handling:
```python
import math
import re
from collections import Counter

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
```
Data Preprocessing
We use a small example dataset (assumed here to be a sentiment-analysis corpus):
```python
# Example data - in a real application this would be loaded from a file
texts = [
    "This movie was fantastic! I loved every minute of it.",
    "The acting was terrible, not worth watching.",
    "Tight plot and stunning effects, highly recommended!",
    "The story was so dull it put me to sleep.",
    "One of the best films I have seen this year.",
    "The director never got a grip on the pacing of the story.",
    # ... more samples
]
labels = [1, 0, 1, 0, 1, 0]  # 1: positive, 0: negative

# Text preprocessing
class TextPreprocessor:
    def __init__(self):
        self.word_to_idx = {"<PAD>": 0, "<UNK>": 1}
        self.idx_to_word = {0: "<PAD>", 1: "<UNK>"}
        self.vocab_size = 2

    def clean_text(self, text):
        # Clean the text: strip punctuation, lowercase, split on whitespace
        text = re.sub(r'[^\w\s]', '', text)
        text = text.lower()
        return text.split()

    def build_vocab(self, texts):
        word_freq = Counter()
        for text in texts:
            words = self.clean_text(text)
            word_freq.update(words)
        # Keep only words at or above the frequency threshold;
        # anything rarer is mapped to <UNK> at encoding time
        for word, freq in word_freq.items():
            if freq >= 2:  # frequency threshold
                self.word_to_idx[word] = self.vocab_size
                self.idx_to_word[self.vocab_size] = word
                self.vocab_size += 1

    def text_to_sequence(self, text):
        words = self.clean_text(text)
        return [self.word_to_idx.get(word, 1) for word in words]

# Create the preprocessor and build the vocabulary
preprocessor = TextPreprocessor()
preprocessor.build_vocab(texts)

# Convert the texts to index sequences
sequences = [preprocessor.text_to_sequence(text) for text in texts]
```
The Dataset Class
```python
class TextDataset(Dataset):
    def __init__(self, texts, labels, preprocessor, max_length=50):
        self.texts = texts
        self.labels = labels
        self.preprocessor = preprocessor
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        # Convert the text to an index sequence and truncate
        sequence = self.preprocessor.text_to_sequence(text)[:self.max_length]
        # Remember the real (unpadded) length for the attention mask
        real_length = len(sequence)
        # Pad up to max_length
        sequence = sequence + [0] * (self.max_length - real_length)
        attention_mask = [1] * real_length + [0] * (self.max_length - real_length)
        return {
            'input_ids': torch.tensor(sequence, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Split the raw texts and labels, then wrap each split in a dataset and loader
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)
train_dataset = TextDataset(train_texts, train_labels, preprocessor)
test_dataset = TextDataset(test_texts, test_labels, preprocessor)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)
```
Implementing Positional Encoding
```python
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_length=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_length, d_model)
        position = torch.arange(0, max_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() *
                             (-math.log(10000.0) / d_model))
        # Sine on even dimensions, cosine on odd dimensions
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        # Shape (max_length, 1, d_model) so it broadcasts over the batch dimension
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x is expected as (seq_len, batch_size, d_model)
        return x + self.pe[:x.size(0), :]
```
Implementing the Transformer Encoder
```python
class TransformerEncoder(nn.Module):
    def __init__(self, vocab_size, d_model, nhead, num_layers, dim_feedforward, dropout=0.1):
        super(TransformerEncoder, self).__init__()
        # Token embedding
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        # Stacked Transformer encoder layers (sequence-first layout)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.d_model = d_model

    def forward(self, src, src_mask=None):
        # Embed the tokens and scale by sqrt(d_model), as in the original paper
        src = self.embedding(src) * math.sqrt(self.d_model)
        # Add positional encodings
        src = self.pos_encoder(src)
        # Run through the Transformer encoder stack
        output = self.transformer_encoder(src, src_mask)
        return output
```
The Text Classifier Model
```python
class TextClassifier(nn.Module):
    def __init__(self, vocab_size, d_model, nhead, num_layers, dim_feedforward, num_classes, dropout=0.1):
        super(TextClassifier, self).__init__()
        self.transformer_encoder = TransformerEncoder(
            vocab_size=vocab_size,
            d_model=d_model,
            nhead=nhead,
            num_layers=num_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout
        )
        # Classification head
        self.classifier = nn.Sequential(
            nn.Linear(d_model, d_model // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(d_model // 2, num_classes)
        )

    def forward(self, input_ids, attention_mask=None):
        # Transformer encoding
        # Reshape (batch_size, seq_len) -> (seq_len, batch_size) for the encoder
        input_ids = input_ids.transpose(0, 1)
        encoded = self.transformer_encoder(input_ids)
        # Back to (batch_size, seq_len, d_model)
        encoded = encoded.transpose(0, 1)
        # Pool the token representations into one vector per sequence
        if attention_mask is not None:
            # Masked mean pooling: ignore padding positions
            mask_expanded = attention_mask.unsqueeze(-1).expand(encoded.size()).float()
            sum_embeddings = torch.sum(encoded * mask_expanded, 1)
            sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9)
            pooled_output = sum_embeddings / sum_mask
        else:
            # Plain mean pooling over all positions
            pooled_output = torch.mean(encoded, dim=1)
        # Classify
        logits = self.classifier(pooled_output)
        return logits
```
The Training Function
```python
def train_model(model, train_loader, test_loader, num_epochs=10, learning_rate=1e-4):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    train_losses = []
    test_losses = []
    train_accuracies = []
    test_accuracies = []

    for epoch in range(num_epochs):
        # Training phase
        model.train()
        total_loss = 0
        correct = 0
        total = 0
        for batch in train_loader:
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            labels = batch['labels']

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

        train_loss = total_loss / len(train_loader)
        train_acc = 100 * correct / total

        # Evaluation phase
        model.eval()
        test_loss = 0
        correct = 0
        total = 0
        with torch.no_grad():
            for batch in test_loader:
                input_ids = batch['input_ids']
                attention_mask = batch['attention_mask']
                labels = batch['labels']

                outputs = model(input_ids, attention_mask)
                loss = criterion(outputs, labels)

                test_loss += loss.item()
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        test_loss = test_loss / len(test_loader)
        test_acc = 100 * correct / total

        train_losses.append(train_loss)
        test_losses.append(test_loss)
        train_accuracies.append(train_acc)
        test_accuracies.append(test_acc)

        print(f'Epoch [{epoch+1}/{num_epochs}]')
        print(f'Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.2f}%')
        print(f'Test Loss: {test_loss:.4f}, Test Acc: {test_acc:.2f}%')
        print('-' * 50)

    return train_losses, test_losses, train_accuracies, test_accuracies
```
Training the Model
```python
# Initialize the model
model = TextClassifier(
    vocab_size=preprocessor.vocab_size,
    d_model=128,
    nhead=8,
    num_layers=2,
    dim_feedforward=512,
    num_classes=2,  # binary classification: positive / negative
    dropout=0.1
)

# Train the model
train_losses, test_losses, train_accuracies, test_accuracies = train_model(
    model, train_loader, test_loader, num_epochs=10
)
```
Visualizing the Results
```python
import matplotlib.pyplot as plt

def plot_training_history(train_losses, test_losses, train_accuracies, test_accuracies):
    plt.figure(figsize=(12, 5))

    plt.subplot(1, 2, 1)
    plt.plot(train_losses, label='Train Loss')
    plt.plot(test_losses, label='Test Loss')
    plt.title('Training and Test Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.plot(train_accuracies, label='Train Accuracy')
    plt.plot(test_accuracies, label='Test Accuracy')
    plt.title('Training and Test Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy (%)')
    plt.legend()

    plt.tight_layout()
    plt.show()

# Plot the training history
plot_training_history(train_losses, test_losses, train_accuracies, test_accuracies)
```
Making Predictions
```python
def predict_text(model, text, preprocessor, max_length=50):
    model.eval()
    # Preprocess the text: encode, truncate, and pad
    sequence = preprocessor.text_to_sequence(text)[:max_length]
    real_length = len(sequence)
    sequence = sequence + [0] * (max_length - real_length)

    # Convert to tensors (a batch of one)
    input_ids = torch.tensor([sequence], dtype=torch.long)
    attention_mask = torch.tensor(
        [[1] * real_length + [0] * (max_length - real_length)],
        dtype=torch.long
    )

    # Predict
    with torch.no_grad():
        outputs = model(input_ids, attention_mask)
        probabilities = torch.softmax(outputs, dim=1)
        predicted_class = torch.argmax(probabilities, dim=1).item()
        confidence = probabilities[0][predicted_class].item()
    return predicted_class, confidence

# Try the model on new reviews
test_texts = [
    "This movie was truly brilliant, I recommend everyone go see it.",
    "A waste of time, not worth the ticket price at all."
]
for text in test_texts:
    pred_class, confidence = predict_text(model, text, preprocessor)
    sentiment = "positive" if pred_class == 1 else "negative"
    print(f"Text: {text}")
    print(f"Prediction: {sentiment} (confidence: {confidence:.2f})")
    print()
```
Advanced Techniques and Optimizations
Pretrained Word Embeddings
Initializing the embedding layer with pretrained word vectors (Word2Vec, GloVe, or FastText) can improve model performance:
```python
# Load pretrained word embeddings (here: 300-dimensional GloVe vectors)
def load_pretrained_embeddings(word_to_idx, embedding_file):
    # Start from small random vectors; the PAD token stays all-zero
    embeddings = np.random.normal(0, 0.1, (len(word_to_idx), 300))
    embeddings[0] = np.zeros(300)  # PAD token
    # Overwrite with pretrained vectors where available
    with open(embedding_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            if word in word_to_idx:
                vector = np.asarray(values[1:], dtype='float32')
                embeddings[word_to_idx[word]] = vector
    return torch.tensor(embeddings, dtype=torch.float)

# Copy the pretrained vectors into the model's embedding layer.
# Note: d_model must match the embedding dimension (300 here),
# so the classifier would need to be built with d_model=300.
pretrained_embeddings = load_pretrained_embeddings(preprocessor.word_to_idx, 'glove.6B.300d.txt')
model.transformer_encoder.embedding.weight.data.copy_(pretrained_embeddings)
```
Learning Rate Scheduling
Learning rate warmup and decay often stabilize Transformer training. PyTorch's cosine annealing with warm restarts handles the decay part (a warmup sketch follows the snippet below):
```python
scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(
    optimizer, T_0=5, T_mult=2, eta_min=1e-6
)

# Inside the training loop
for epoch in range(num_epochs):
    train_one_epoch()   # placeholder for one epoch of training
    scheduler.step()
```
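For the warmup itself, one common option (a sketch, not part of the original snippet) is a LambdaLR schedule that linearly ramps the learning rate over the first few epochs; optimizer, num_epochs, and train_one_epoch are the same placeholders as above, and warmup_epochs is an illustrative value:

```python
warmup_epochs = 3  # illustrative value

# Linearly scale the learning rate up to its full value over the first epochs;
# a decay scheduler such as the one above can take over afterwards
warmup_scheduler = optim.lr_scheduler.LambdaLR(
    optimizer,
    lr_lambda=lambda epoch: min(1.0, (epoch + 1) / warmup_epochs)
)

for epoch in range(num_epochs):
    train_one_epoch()   # placeholder, as above
    warmup_scheduler.step()
```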
Model Ensembling
Averaging the predictions of several trained models can improve results:
```python
class EnsembleClassifier:
    def __init__(self, models):
        self.models = models

    def predict(self, input_ids, attention_mask):
        predictions = []
        for model in self.models:
            model.eval()
            with torch.no_grad():
                output = model(input_ids, attention_mask)
                predictions.append(torch.softmax(output, dim=1))
        # Average the per-model probability distributions
        avg_prediction = torch.mean(torch.stack(predictions), dim=0)
        return avg_prediction
```
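Usage might look like the following, assuming model_a and model_b are two classifiers trained separately (for example, with different random seeds) and test_loader is the loader defined earlier:

```python
ensemble = EnsembleClassifier([model_a, model_b])  # hypothetical trained models

batch = next(iter(test_loader))
probs = ensemble.predict(batch['input_ids'], batch['attention_mask'])
predicted_classes = torch.argmax(probs, dim=1)
print(predicted_classes)
```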
Summary and Outlook
This article has explored the core principles of the Transformer architecture and built a complete text classification system from scratch. After working through the project, you should be able to:
- Understand the Transformer's self-attention mechanism and the idea behind multi-head attention
- Explain why positional encoding matters and how to implement it
- Build a Transformer-based text classification model
- Follow the full pipeline of text preprocessing, model training, and evaluation
Directions for Further Exploration
- Pretrained models: fine-tune BERT, RoBERTa, and other pretrained models
- Multi-task learning: solve several NLP tasks with a single model
- Low-resource learning: study few-shot and zero-shot methods
- Model compression: explore knowledge distillation, quantization, and related techniques
- Multilingual processing: extend the model to handle text in multiple languages
- Interpretability: analyze the attention patterns to understand the model's decisions
The Transformer architecture has not only revolutionized NLP but has also had a profound impact on computer vision, speech processing, and many other fields. As the technology continues to evolve, understanding how Transformers work and how to apply them is becoming an essential skill for AI practitioners. I hope this article gives you a solid foundation for going deeper into natural language processing.