
Basic Concepts of the Encoder-Decoder Architecture
The encoder-decoder architecture is a classic paradigm in deep learning. Long before the Transformer, it was widely used for sequence-to-sequence (Seq2Seq) tasks such as machine translation and text summarization. The Transformer took this paradigm further, using self-attention to model sequences far more effectively.
```python
import math

import torch
import torch.nn as nn
import torch.nn.functional as F


class BaseEncoderDecoder(nn.Module):
    """Base class for the encoder-decoder architecture."""

    def __init__(self, vocab_size, d_model):
        super(BaseEncoderDecoder, self).__init__()
        self.vocab_size = vocab_size
        self.d_model = d_model
        # Shared embedding layer
        self.embedding = nn.Embedding(vocab_size, d_model)
        # Positional encoding
        self.pos_encoding = PositionalEncoding(d_model)

    def encode(self, src):
        """Abstract encoder method."""
        raise NotImplementedError

    def decode(self, tgt, encoder_output):
        """Abstract decoder method."""
        raise NotImplementedError

    def forward(self, src, tgt):
        encoder_output = self.encode(src)
        decoder_output = self.decode(tgt, encoder_output)
        return decoder_output


class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding (sequence-first layout)."""

    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() *
                             (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)  # (max_len, 1, d_model)
        self.register_buffer('pe', pe)

    def forward(self, x):
        # x: (seq_len, batch, d_model)
        return x + self.pe[:x.size(0), :]


# Test the base architecture
base_model = BaseEncoderDecoder(vocab_size=10000, d_model=512)
print("Base encoder-decoder skeleton created")
print("Vocabulary size: 10000, model dimension: 512")
```
The Encoder-Decoder Architecture of the Original Transformer
Full Architecture Implementation
The original Transformer paper proposed a complete encoder-decoder architecture designed specifically for machine translation.
```python
class TransformerEncoderLayer(nn.Module):
    """Transformer encoder layer (post-LN, as in the original paper)."""

    def __init__(self, d_model, nhead, d_ff, dropout=0.1):
        super(TransformerEncoderLayer, self).__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(d_ff, d_model)
        )
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, src_mask=None, src_key_padding_mask=None):
        # Self-attention sub-layer
        src2, _ = self.self_attn(
            src, src, src,
            attn_mask=src_mask,
            key_padding_mask=src_key_padding_mask
        )
        src = src + self.dropout(src2)
        src = self.norm1(src)
        # Feed-forward sub-layer
        src2 = self.ffn(src)
        src = src + self.dropout(src2)
        src = self.norm2(src)
        return src


class TransformerDecoderLayer(nn.Module):
    """Transformer decoder layer."""

    def __init__(self, d_model, nhead, d_ff, dropout=0.1):
        super(TransformerDecoderLayer, self).__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.cross_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(d_ff, d_model)
        )
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, tgt, memory, tgt_mask=None, memory_mask=None,
                tgt_key_padding_mask=None, memory_key_padding_mask=None):
        # Self-attention (with causal mask)
        tgt2, _ = self.self_attn(
            tgt, tgt, tgt,
            attn_mask=tgt_mask,
            key_padding_mask=tgt_key_padding_mask
        )
        tgt = tgt + self.dropout(tgt2)
        tgt = self.norm1(tgt)
        # Cross-attention (encoder-decoder attention)
        tgt2, _ = self.cross_attn(
            tgt, memory, memory,
            attn_mask=memory_mask,
            key_padding_mask=memory_key_padding_mask
        )
        tgt = tgt + self.dropout(tgt2)
        tgt = self.norm2(tgt)
        # Feed-forward network
        tgt2 = self.ffn(tgt)
        tgt = tgt + self.dropout(tgt2)
        tgt = self.norm3(tgt)
        return tgt


class FullTransformer(nn.Module):
    """Complete Transformer encoder-decoder architecture."""

    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, nhead,
                 num_encoder_layers, num_decoder_layers, d_ff, max_seq_length=5000):
        super(FullTransformer, self).__init__()
        self.d_model = d_model
        self.src_embedding = nn.Embedding(src_vocab_size, d_model)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model, max_seq_length)
        # Encoder stack
        self.encoder_layers = nn.ModuleList([
            TransformerEncoderLayer(d_model, nhead, d_ff)
            for _ in range(num_encoder_layers)
        ])
        # Decoder stack
        self.decoder_layers = nn.ModuleList([
            TransformerDecoderLayer(d_model, nhead, d_ff)
            for _ in range(num_decoder_layers)
        ])
        self.output_projection = nn.Linear(d_model, tgt_vocab_size)

    def create_src_mask(self, src):
        """Create the source attention mask (optional; None means full attention)."""
        return None

    def create_tgt_mask(self, tgt):
        """Create the causal mask for the target sequence."""
        seq_len = tgt.size(0)
        mask = torch.triu(torch.ones(seq_len, seq_len, device=tgt.device), diagonal=1).bool()
        return mask

    def encode(self, src):
        """Encoding pass."""
        src_emb = self.src_embedding(src) * math.sqrt(self.d_model)
        src_emb = self.pos_encoding(src_emb)
        src_mask = self.create_src_mask(src)
        encoder_output = src_emb
        for layer in self.encoder_layers:
            encoder_output = layer(encoder_output, src_mask=src_mask)
        return encoder_output

    def decode(self, tgt, encoder_output):
        """Decoding pass."""
        tgt_emb = self.tgt_embedding(tgt) * math.sqrt(self.d_model)
        tgt_emb = self.pos_encoding(tgt_emb)
        tgt_mask = self.create_tgt_mask(tgt)
        decoder_output = tgt_emb
        for layer in self.decoder_layers:
            decoder_output = layer(
                decoder_output, encoder_output,
                tgt_mask=tgt_mask
            )
        return decoder_output

    def forward(self, src, tgt):
        encoder_output = self.encode(src)
        decoder_output = self.decode(tgt, encoder_output)
        output = self.output_projection(decoder_output)
        return output


# Build a full Transformer instance
full_transformer = FullTransformer(
    src_vocab_size=10000,
    tgt_vocab_size=10000,
    d_model=512,
    nhead=8,
    num_encoder_layers=6,
    num_decoder_layers=6,
    d_ff=2048
)
print("Full Transformer encoder-decoder architecture:")
print("Encoder layers: 6, decoder layers: 6")
print("Attention heads: 8, feed-forward dimension: 2048")
```
The Encoder-Only Architecture Pattern
BERT Architecture Analysis
Encoder-only architectures focus on building deep representations of the input sequence, which makes them well suited to classification, tagging, and other understanding tasks.
```python
class EncoderOnlyTransformer(nn.Module):
    """Encoder-only architecture (BERT-style)."""

    def __init__(self, vocab_size, d_model, nhead, num_layers, d_ff, max_seq_length=512):
        super(EncoderOnlyTransformer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model, max_seq_length)
        self.segment_embedding = nn.Embedding(2, d_model)  # for sentence-pair tasks
        self.encoder_layers = nn.ModuleList([
            TransformerEncoderLayer(d_model, nhead, d_ff)
            for _ in range(num_layers)
        ])
        self.pooler = nn.Linear(d_model, d_model)  # applied to the [CLS] token
        self.layer_norm = nn.LayerNorm(d_model)

    def forward(self, input_ids, segment_ids=None, attention_mask=None):
        # Token embeddings
        token_emb = self.embedding(input_ids)
        # Positional encoding
        token_emb = self.pos_encoding(token_emb)
        # Segment embeddings (if provided)
        if segment_ids is not None:
            segment_emb = self.segment_embedding(segment_ids)
            token_emb += segment_emb
        # Pass through the encoder stack.
        # Note: src_key_padding_mask expects True at *padding* positions,
        # the opposite of BERT's attention_mask convention (1 = real token).
        x = token_emb
        for layer in self.encoder_layers:
            x = layer(x, src_key_padding_mask=attention_mask)
        x = self.layer_norm(x)
        # Pooled output: the [CLS] token, assumed to sit at position 0
        pooled_output = torch.tanh(self.pooler(x[0]))
        return {
            'sequence_output': x,
            'pooled_output': pooled_output
        }


class MaskedLanguageModel(nn.Module):
    """Masked language modeling head."""

    def __init__(self, d_model, vocab_size):
        super(MaskedLanguageModel, self).__init__()
        self.dense = nn.Linear(d_model, d_model)
        self.layer_norm = nn.LayerNorm(d_model)
        self.decoder = nn.Linear(d_model, vocab_size)

    def forward(self, sequence_output):
        hidden_states = self.dense(sequence_output)
        hidden_states = F.gelu(hidden_states)
        hidden_states = self.layer_norm(hidden_states)
        logits = self.decoder(hidden_states)
        return logits


# Encoder-only example
encoder_model = EncoderOnlyTransformer(
    vocab_size=30522,
    d_model=768,
    nhead=12,
    num_layers=12,
    d_ff=3072
)
print("\nEncoder-only architecture characteristics:")
print("• Bidirectional context")
print("• Suited to classification and tagging tasks")
print("• Pre-training task: masked language modeling")
print("• Representative models: BERT, RoBERTa, ALBERT")
```
The Decoder-Only Architecture Pattern
GPT Architecture Analysis
Decoder-only architectures focus on autoregressive generation: each position may attend only to itself and the positions before it.
```python
class DecoderOnlyTransformer(nn.Module):
    """Decoder-only architecture (GPT-style)."""

    def __init__(self, vocab_size, d_model, nhead, num_layers, d_ff, max_seq_length=1024):
        super(DecoderOnlyTransformer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model, max_seq_length)
        # Reuse the decoder layer, even though a real GPT block has no cross-attention
        self.decoder_layers = nn.ModuleList([
            TransformerDecoderLayer(d_model, nhead, d_ff)
            for _ in range(num_layers)
        ])
        self.layer_norm = nn.LayerNorm(d_model)
        self.lm_head = nn.Linear(d_model, vocab_size)

    def create_causal_mask(self, seq_len):
        """Create the causal attention mask."""
        mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1).bool()
        return mask

    def forward(self, input_ids, past_key_values=None):
        # input_ids: (seq_len, batch). The KV cache (past_key_values) is not
        # implemented in this simplified version.
        seq_len = input_ids.size(0)
        # Embeddings
        token_emb = self.embedding(input_ids)
        x = self.pos_encoding(token_emb)
        # Causal mask
        causal_mask = self.create_causal_mask(seq_len)
        # Pass through the decoder stack
        for layer in self.decoder_layers:
            # Simplified: the sequence is fed in as its own "memory".
            # A real GPT block is a self-attention-only decoder layer.
            x = layer(
                x, x,
                tgt_mask=causal_mask
            )
        x = self.layer_norm(x)
        logits = self.lm_head(x)
        return {
            'logits': logits,
            'hidden_states': x
        }


class AutoregressiveGenerator:
    """Autoregressive text generator."""

    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def generate(self, prompt, max_length=100, temperature=1.0):
        """Generate text autoregressively."""
        input_ids = self.tokenizer.encode(prompt)
        generated = input_ids.copy()
        for _ in range(max_length):
            # Prepare the input as (seq_len, 1) to match the model's layout
            inputs = torch.tensor(input_ids, dtype=torch.long).unsqueeze(1)
            # Forward pass
            with torch.no_grad():
                outputs = self.model(inputs)
            next_token_logits = outputs['logits'][-1, 0, :]
            # Temperature sampling
            next_token_logits = next_token_logits / temperature
            next_token_probs = F.softmax(next_token_logits, dim=-1)
            next_token = torch.multinomial(next_token_probs, num_samples=1)
            # Extend the sequence
            next_token_id = next_token.item()
            generated.append(next_token_id)
            input_ids.append(next_token_id)
            # Stopping condition (simplified)
            if next_token_id == self.tokenizer.eos_token_id:
                break
        return self.tokenizer.decode(generated)


# Decoder-only example
decoder_model = DecoderOnlyTransformer(
    vocab_size=50257,  # GPT-2 vocabulary size
    d_model=768,
    nhead=12,
    num_layers=12,
    d_ff=3072
)
print("\nDecoder-only architecture characteristics:")
print("• Causal attention mask")
print("• Autoregressive generation")
print("• Suited to text generation tasks")
print("• Pre-training task: language modeling")
print("• Representative models: the GPT family, BLOOM, LLaMA")
```
Comparing the Architecture Patterns
A Detailed Look at Architectural Properties
```python
class ArchitectureComparator:
    """Comparative analysis of the architecture patterns."""

    @staticmethod
    def detailed_comparison():
        comparison_data = {
            'Aspect': ['Attention', 'Context direction', 'Pre-training task',
                       'Typical applications', 'Representative models'],
            'Encoder-only': [
                'Bidirectional self-attention',
                'Full-sequence context',
                'Masked language modeling',
                'Text classification, NER, QA',
                'BERT, RoBERTa'
            ],
            'Decoder-only': [
                'Causal self-attention',
                'Left context only',
                'Language modeling',
                'Text generation, dialogue',
                'GPT family, LLaMA'
            ],
            'Encoder-decoder': [
                'Bidirectional encoder + causal decoder',
                'Full context in, left context out',
                'Sequence-to-sequence objectives',
                'Translation, summarization, QA',
                'T5, BART, mT5'
            ]
        }
        print("\nDetailed comparison of the three architecture patterns:")
        header = f"{'Aspect':<22}{'Encoder-only':<32}{'Decoder-only':<28}{'Encoder-decoder':<36}"
        print(header)
        print("-" * len(header))
        for i in range(len(comparison_data['Aspect'])):
            aspect = comparison_data['Aspect'][i]
            encoder = comparison_data['Encoder-only'][i]
            decoder = comparison_data['Decoder-only'][i]
            encoder_decoder = comparison_data['Encoder-decoder'][i]
            print(f"{aspect:<22}{encoder:<32}{decoder:<28}{encoder_decoder:<36}")

    @staticmethod
    def performance_characteristics():
        """Performance characteristics."""
        characteristics = {
            'Training efficiency': {
                'Encoder-only': 'High (whole sequence in parallel)',
                'Decoder-only': 'Medium (parallel with teacher forcing, but causal)',
                'Encoder-decoder': 'Lower (encoder and decoder stacks)'
            },
            'Inference speed': {
                'Encoder-only': 'Fast (single forward pass)',
                'Decoder-only': 'Slow (token-by-token generation)',
                'Encoder-decoder': 'Medium (fast encode, slow decode)'
            },
            'Memory usage': {
                'Encoder-only': 'Moderate',
                'Decoder-only': 'High (long-sequence generation)',
                'Encoder-decoder': 'High (two stacks)'
            },
            'Task fit': {
                'Encoder-only': 'Understanding tasks',
                'Decoder-only': 'Generation tasks',
                'Encoder-decoder': 'Sequence transduction tasks'
            }
        }
        print("\nPerformance characteristics:")
        print(f"{'Metric':<22}{'Encoder-only':<40}{'Decoder-only':<48}{'Encoder-decoder':<36}")
        print("-" * 120)
        for metric, values in characteristics.items():
            print(f"{metric:<22}{values['Encoder-only']:<40}"
                  f"{values['Decoder-only']:<48}{values['Encoder-decoder']:<36}")


ArchitectureComparator.detailed_comparison()
ArchitectureComparator.performance_characteristics()
```
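One way to ground the memory comparison is to count the trainable parameters of the three toy models instantiated earlier in this section (the counts reflect these illustrative configurations, not the published checkpoints):

```python
# Parameter counts for the toy models built above; illustrative configurations only.
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

for name, model in [('encoder-only (BERT-style)', encoder_model),
                    ('decoder-only (GPT-style)', decoder_model),
                    ('encoder-decoder (full Transformer)', full_transformer)]:
    print(f"{name}: {count_parameters(model) / 1e6:.1f}M parameters")
```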
Hybrid Architectures and Unified Paradigms
T5's Unified Text-to-Text Framework
T5 casts every task as mapping an input text to an output text. The sketch below compresses the idea by reusing a single stack of layers for both encoding and decoding; note that the real T5 keeps separate encoder and decoder stacks, and its unification lies in the task format and pre-training objective rather than in weight sharing.
```python
class UnifiedTextToTextModel(nn.Module):
    """Unified text-to-text model (loosely T5-flavoured, heavily simplified)."""

    def __init__(self, vocab_size, d_model, nhead, num_layers, d_ff):
        super(UnifiedTextToTextModel, self).__init__()
        # A single embedding and a single stack of layers are reused for both
        # encoding and decoding (a simplification; T5 proper has separate stacks).
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model)
        self.transformer_layers = nn.ModuleList([
            T5TransformerLayer(d_model, nhead, d_ff)
            for _ in range(num_layers)
        ])
        self.lm_head = nn.Linear(d_model, vocab_size)

    def forward(self, input_ids, target_ids=None, attention_mask=None):
        # Encoder pass
        encoder_emb = self.embedding(input_ids)
        encoder_output = self.pos_encoding(encoder_emb)
        for layer in self.transformer_layers:
            encoder_output = layer(encoder_output, is_encoder=True)
        # Decoder pass (only when a target sequence is given)
        if target_ids is not None:
            decoder_emb = self.embedding(target_ids)
            decoder_output = self.pos_encoding(decoder_emb)
            for layer in self.transformer_layers:
                decoder_output = layer(
                    decoder_output, encoder_output, is_encoder=False
                )
            logits = self.lm_head(decoder_output)
            return logits
        else:
            # Encode-only mode
            return encoder_output


class T5TransformerLayer(nn.Module):
    """A unified Transformer layer that can run in encoder or decoder mode."""

    def __init__(self, d_model, nhead, d_ff, dropout=0.1):
        super(T5TransformerLayer, self).__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.cross_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(),  # the original T5 uses ReLU in its feed-forward blocks
            nn.Dropout(dropout),
            nn.Linear(d_ff, d_model)
        )
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, encoder_output=None, is_encoder=True):
        if is_encoder:
            # Encoder mode: self-attention only
            attn_output, _ = self.self_attn(x, x, x)
            x = x + self.dropout(attn_output)
            x = self.norm1(x)
        else:
            # Decoder mode: causal self-attention + cross-attention
            seq_len = x.size(0)
            causal_mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1).bool()
            self_attn_output, _ = self.self_attn(x, x, x, attn_mask=causal_mask)
            x = x + self.dropout(self_attn_output)
            x = self.norm1(x)
            # Cross-attention over the encoder output
            cross_attn_output, _ = self.cross_attn(x, encoder_output, encoder_output)
            x = x + self.dropout(cross_attn_output)
            x = self.norm2(x)
        # Feed-forward network (shared between the two modes)
        ff_output = self.ffn(x)
        x = x + self.dropout(ff_output)
        x = self.norm3(x)
        return x


# Unified architecture example
unified_model = UnifiedTextToTextModel(
    vocab_size=32128,  # T5 vocabulary size
    d_model=512,
    nhead=8,
    num_layers=6,
    d_ff=2048
)
print("\nAdvantages of a unified text-to-text framework:")
print("• One model handles many tasks")
print("• A single pre-training objective")
print("• A simpler overall architecture")
print("• Better knowledge sharing across tasks")
```
Practical Application Scenarios
Task-Specific Architecture Selection
```python
class TaskSpecificArchitecture:
    """A rough guide for matching tasks to architectures."""

    @staticmethod
    def recommend_architecture(task_type, requirements):
        """Recommend an architecture based on the task."""
        recommendations = {
            'text classification': {
                'architecture': 'encoder-only',
                'rationale': 'The whole input must be understood; bidirectional context matters',
                'models': 'BERT, RoBERTa',
                'notes': 'Use the [CLS] token or mean pooling'
            },
            'named entity recognition': {
                'architecture': 'encoder-only',
                'rationale': 'Sequence labeling needs per-token contextual representations',
                'models': 'BERT + CRF',
                'notes': 'Use the hidden state of every token'
            },
            'machine translation': {
                'architecture': 'encoder-decoder',
                'rationale': 'A classic seq2seq task: understand the source, generate the target',
                'models': 'The original Transformer, T5, mBART',
                'notes': 'Requires aligned parallel corpora'
            },
            'summarization': {
                'architecture': 'encoder-decoder or decoder-only',
                'rationale': 'A long document must be understood and condensed into a short summary',
                'models': 'BART, PEGASUS, GPT family',
                'notes': 'Abstractive summaries demand stronger generation ability'
            },
            'dialogue': {
                'architecture': 'decoder-only',
                'rationale': 'Autoregressive generation fits multi-turn conversation',
                'models': 'GPT, DialoGPT, BlenderBot',
                'notes': 'Generation quality and consistency need to be controlled'
            },
            'code generation': {
                'architecture': 'decoder-only',
                'rationale': 'Code has strict left-to-right dependencies',
                'models': 'Codex, CodeGen, StarCoder',
                'notes': 'Needs code-aware tokenization and training data'
            }
        }
        if task_type in recommendations:
            rec = recommendations[task_type]
            print(f"\nTask: {task_type}")
            print(f"Recommended architecture: {rec['architecture']}")
            print(f"Rationale: {rec['rationale']}")
            print(f"Representative models: {rec['models']}")
            print(f"Notes: {rec['notes']}")
        else:
            print(f"Unknown task type: {task_type}")


# Usage examples
TaskSpecificArchitecture.recommend_architecture('text classification', {})
TaskSpecificArchitecture.recommend_architecture('machine translation', {})
TaskSpecificArchitecture.recommend_architecture('dialogue', {})
```
Summary and Outlook
The basic patterns derived from the encoder-decoder architecture reflect how the Transformer has been specialized for different kinds of tasks:
Key Insights
- Architectural division of labor: encoders excel at understanding, decoders at generation, and the full encoder-decoder at sequence transduction
- The attention pattern determines the capability: bidirectional attention supports deep understanding, while causal attention enables controllable generation
- A trend toward unification: as T5 shows, unified text-to-text framing is blurring the traditional boundaries between architectures
Future Directions
- More flexible architectures: dynamically switching between encoding and decoding modes
- Multimodal extensions: unified vision-language encoder-decoders
- Efficiency: sparse attention, model compression, and related techniques
- Domain adaptation: architecture variants tuned for specific domains
Understanding these basic patterns and their variants matters for choosing a Transformer architecture suited to a given task, for designing effective training strategies, and for driving further model innovation.