# Transformer Variants and Extensions: BERT, GPT, and Multimodal Models

## Introduction

In deep learning and natural language processing, the Transformer architecture is without question one of the most revolutionary breakthroughs. Since Vaswani et al. introduced the original Transformer in 2017, Transformer-based variants have proliferated and fundamentally reshaped the NLP landscape.

In this chapter we take a close look at three of the most important families of Transformer variants: BERT, the GPT series, and multimodal models. These models have not only driven breakthroughs in academic research but are also widely deployed in industry, where they form core components of modern AI systems.

## BERT: The Bidirectional Encoder Revolution

### The Core Idea of BERT

BERT (Bidirectional Encoder Representations from Transformers), introduced by Google in 2018, is distinguished above all by bidirectional context modeling. Unlike traditional left-to-right or right-to-left language models, BERT conditions on context from both sides of each token simultaneously.
```python
import torch
from transformers import BertForMaskedLM, BertTokenizer


class BERTExplainer:
    def __init__(self, model_name='bert-base-uncased'):
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        # BertForMaskedLM adds the vocabulary prediction head needed for MLM;
        # the bare BertModel only returns hidden states, not token logits
        self.model = BertForMaskedLM.from_pretrained(model_name)
        self.model.eval()

    def demonstrate_masked_language_modeling(self, text):
        """Demonstrate BERT's masked-language-modeling ability."""
        # Replace one word in the text with [MASK]
        masked_text = text.replace("language", "[MASK]")
        # Encode the input
        inputs = self.tokenizer(masked_text, return_tensors="pt")
        with torch.no_grad():
            outputs = self.model(**inputs)
        # Vocabulary logits, shape [batch, seq_len, vocab_size]
        logits = outputs.logits
        mask_token_index = torch.where(inputs["input_ids"][0] == self.tokenizer.mask_token_id)[0]
        # Logits at the [MASK] position
        mask_token_logits = logits[0, mask_token_index, :]
        # Find the most likely predictions
        top_tokens = torch.topk(mask_token_logits, 5, dim=1)
        print(f"Original text: {text}")
        print(f"Masked text: {masked_text}")
        print("Top 5 predictions:")
        for i, (value, index) in enumerate(zip(top_tokens.values[0], top_tokens.indices[0])):
            token = self.tokenizer.decode([index])
            print(f"{i+1}. {token} (score: {value:.4f})")


# Usage example
explainer = BERTExplainer()
text = "Natural language processing is amazing."
explainer.demonstrate_masked_language_modeling(text)
```
### BERT's Pre-training Tasks

BERT learns language representations through two key pre-training tasks:

#### 1. Masked Language Modeling (MLM)
```python
import torch
from transformers import BertForMaskedLM, BertTokenizer


class MLMDemonstration:
    def __init__(self):
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.model = BertForMaskedLM.from_pretrained('bert-base-uncased')
        self.model.eval()

    def mlm_inference(self, text_with_mask):
        """Run MLM inference."""
        inputs = self.tokenizer(text_with_mask, return_tensors="pt")
        with torch.no_grad():
            outputs = self.model(**inputs)
        logits = outputs.logits
        mask_token_index = torch.where(inputs["input_ids"][0] == self.tokenizer.mask_token_id)[0]
        # Convert the full-vocabulary logits at each [MASK] position to probabilities
        mask_probs = torch.softmax(logits[0, mask_token_index, :], dim=-1)
        top_tokens = torch.topk(mask_probs, 3, dim=1)
        print(f"Input: {text_with_mask}")
        for i, (values, indices) in enumerate(zip(top_tokens.values, top_tokens.indices)):
            for j in range(len(values)):
                token = self.tokenizer.decode([indices[j]])
                print(f"Mask {i+1}, prediction {j+1}: {token} (probability: {values[j]:.4f})")


# Demonstrate MLM
mlm_demo = MLMDemonstration()
mlm_demo.mlm_inference("The weather today is [MASK] and sunny.")
```
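During pre-training, BERT does not simply mask every selected token: roughly 15% of tokens are chosen as prediction targets, and of those, 80% are replaced with `[MASK]`, 10% with a random token, and 10% are left unchanged, so the model does not learn to rely on seeing `[MASK]` at inference time. The sketch below illustrates this corruption procedure (a minimal illustration, not the exact Hugging Face data-collator implementation):

```python
import torch

def mask_tokens_for_mlm(input_ids, mask_token_id, vocab_size, mlm_prob=0.15):
    """Apply BERT's 80/10/10 masking rule to a batch of token ids."""
    labels = input_ids.clone()
    # Choose ~15% of positions as prediction targets
    selected = torch.rand(input_ids.shape) < mlm_prob
    labels[~selected] = -100  # ignore non-selected positions in the loss

    corrupted = input_ids.clone()
    # 80% of selected positions become [MASK]
    mask_positions = selected & (torch.rand(input_ids.shape) < 0.8)
    corrupted[mask_positions] = mask_token_id
    # Half of the remaining 20% (i.e. 10% overall) become a random token
    random_positions = selected & ~mask_positions & (torch.rand(input_ids.shape) < 0.5)
    corrupted[random_positions] = torch.randint(vocab_size, input_ids.shape)[random_positions]
    # The final 10% keep their original token
    return corrupted, labels


# Example with toy ids (mask_token_id=103 and vocab_size=30522 match bert-base-uncased)
ids = torch.randint(1000, 2000, (2, 8))
corrupted, labels = mask_tokens_for_mlm(ids, mask_token_id=103, vocab_size=30522)
print(corrupted)
print(labels)
```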
#### 2. Next Sentence Prediction (NSP)

```python
import torch
from transformers import BertForNextSentencePrediction, BertTokenizer


class NSPDemonstration:
    def __init__(self):
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
        self.model.eval()

    def nsp_inference(self, sentence_a, sentence_b):
        """Next-sentence-prediction inference."""
        # Encode the sentence pair
        inputs = self.tokenizer(sentence_a, sentence_b, return_tensors='pt')
        with torch.no_grad():
            outputs = self.model(**inputs)
        logits = outputs.logits
        probabilities = torch.softmax(logits, dim=1)
        # In the Hugging Face head, index 0 = "B follows A", index 1 = "B is random"
        is_next = probabilities[0, 0].item()
        not_next = probabilities[0, 1].item()
        print(f"Sentence A: {sentence_a}")
        print(f"Sentence B: {sentence_b}")
        print(f"P(is next sentence): {is_next:.4f}")
        print(f"P(is not next sentence): {not_next:.4f}")
        print(f"Prediction: {'is next sentence' if is_next > not_next else 'is not next sentence'}")


# Demonstrate NSP
nsp_demo = NSPDemonstration()
nsp_demo.nsp_inference(
    "The company reported strong earnings this quarter.",
    "As a result, the stock price increased significantly."
)
```
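NSP relies on how BERT encodes a sentence pair: the tokenizer concatenates the two sentences as `[CLS] A [SEP] B [SEP]` and marks each token's segment with `token_type_ids`, which index the segment embeddings shown in the architecture code below. A quick look at that encoding:

```python
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
enc = tokenizer("The cat sat on the mat.", "It fell asleep.")
# Token sequence: [CLS] <sentence A> [SEP] <sentence B> [SEP]
print(tokenizer.convert_ids_to_tokens(enc["input_ids"]))
# 0 for segment A (including [CLS] and the first [SEP]), 1 for segment B
print(enc["token_type_ids"])
```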
### BERT Architecture Details

```python
import torch
import torch.nn as nn


class SimplifiedBERT(nn.Module):
    """A simplified BERT implementation for teaching purposes."""

    def __init__(self, vocab_size=30522, hidden_size=768, num_layers=12,
                 num_attention_heads=12, intermediate_size=3072, max_position_embeddings=512):
        super(SimplifiedBERT, self).__init__()
        self.embedding = BERTEmbedding(vocab_size, hidden_size, max_position_embeddings)
        self.encoder_layers = nn.ModuleList([
            TransformerEncoderLayer(hidden_size, num_attention_heads, intermediate_size)
            for _ in range(num_layers)
        ])

    def forward(self, input_ids, token_type_ids=None, attention_mask=None):
        # Embedding layer
        hidden_states = self.embedding(input_ids, token_type_ids)
        # Transformer encoder layers
        for layer in self.encoder_layers:
            hidden_states = layer(hidden_states, attention_mask)
        return hidden_states


class BERTEmbedding(nn.Module):
    def __init__(self, vocab_size, hidden_size, max_position_embeddings):
        super(BERTEmbedding, self).__init__()
        self.word_embeddings = nn.Embedding(vocab_size, hidden_size)
        self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size)
        self.token_type_embeddings = nn.Embedding(2, hidden_size)  # segment embeddings for sentence-pair tasks
        self.LayerNorm = nn.LayerNorm(hidden_size)
        self.dropout = nn.Dropout(0.1)

    def forward(self, input_ids, token_type_ids=None):
        seq_length = input_ids.size(1)
        position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
        position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)
        # BERT sums word, position, and segment embeddings
        embeddings = (self.word_embeddings(input_ids)
                      + self.position_embeddings(position_ids)
                      + self.token_type_embeddings(token_type_ids))
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings


class TransformerEncoderLayer(nn.Module):
    def __init__(self, hidden_size, num_attention_heads, intermediate_size):
        super(TransformerEncoderLayer, self).__init__()
        self.attention = MultiHeadAttention(hidden_size, num_attention_heads)
        self.attention_norm = nn.LayerNorm(hidden_size)
        self.intermediate = IntermediateLayer(hidden_size, intermediate_size)
        self.output = OutputLayer(hidden_size, intermediate_size)

    def forward(self, hidden_states, attention_mask=None):
        # Self-attention sub-layer with residual connection
        attention_output = self.attention(hidden_states, attention_mask)
        attention_output = self.attention_norm(attention_output + hidden_states)
        # Feed-forward sub-layer (its residual is applied inside OutputLayer)
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output)
        return layer_output


class MultiHeadAttention(nn.Module):
    def __init__(self, hidden_size, num_attention_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_attention_heads = num_attention_heads
        self.attention_head_size = hidden_size // num_attention_heads
        self.query = nn.Linear(hidden_size, hidden_size)
        self.key = nn.Linear(hidden_size, hidden_size)
        self.value = nn.Linear(hidden_size, hidden_size)
        self.dense = nn.Linear(hidden_size, hidden_size)

    def _split_heads(self, x):
        # [batch, seq, hidden] -> [batch, heads, seq, head_size]
        batch_size, seq_length, _ = x.size()
        x = x.view(batch_size, seq_length, self.num_attention_heads, self.attention_head_size)
        return x.permute(0, 2, 1, 3)

    def forward(self, hidden_states, attention_mask=None):
        batch_size, seq_length, hidden_size = hidden_states.size()
        # Linear projections, then split into heads
        query_layer = self._split_heads(self.query(hidden_states))
        key_layer = self._split_heads(self.key(hidden_states))
        value_layer = self._split_heads(self.value(hidden_states))
        # Scaled dot-product attention, computed per head
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / (self.attention_head_size ** 0.5)
        if attention_mask is not None:
            attention_scores = attention_scores + attention_mask
        attention_probs = torch.softmax(attention_scores, dim=-1)
        context_layer = torch.matmul(attention_probs, value_layer)
        # Merge heads back: [batch, heads, seq, head_size] -> [batch, seq, hidden]
        context_layer = context_layer.permute(0, 2, 1, 3).reshape(batch_size, seq_length, hidden_size)
        # Output projection
        return self.dense(context_layer)


class IntermediateLayer(nn.Module):
    def __init__(self, hidden_size, intermediate_size):
        super(IntermediateLayer, self).__init__()
        self.dense = nn.Linear(hidden_size, intermediate_size)
        self.intermediate_act_fn = nn.GELU()

    def forward(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


class OutputLayer(nn.Module):
    def __init__(self, hidden_size, intermediate_size):
        super(OutputLayer, self).__init__()
        self.dense = nn.Linear(intermediate_size, hidden_size)
        self.LayerNorm = nn.LayerNorm(hidden_size)
        self.dropout = nn.Dropout(0.1)

    def forward(self, hidden_states, input_tensor):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states
```
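A quick shape check of the simplified model on random token ids (the small hyperparameters here are chosen just to keep the example fast):

```python
model = SimplifiedBERT(vocab_size=1000, hidden_size=64, num_layers=2,
                       num_attention_heads=4, intermediate_size=256)
input_ids = torch.randint(0, 1000, (2, 16))  # batch of 2 sequences, 16 tokens each
hidden = model(input_ids)
print(hidden.shape)  # torch.Size([2, 16, 64])
```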
## The GPT Series: The Evolution of Autoregressive Generation

### GPT Architecture Overview

GPT (Generative Pre-trained Transformer) models use a decoder-only architecture and focus on the autoregressive language-modeling objective: each token is predicted from the tokens that precede it.
```python
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer


class GPTDemonstrator:
    def __init__(self, model_name='gpt2'):
        self.tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        self.model = GPT2LMHeadModel.from_pretrained(model_name)
        self.model.eval()

    def generate_text(self, prompt, max_length=100, temperature=0.7):
        """Generate text with GPT."""
        inputs = self.tokenizer.encode(prompt, return_tensors='pt')
        with torch.no_grad():
            outputs = self.model.generate(
                inputs,
                max_length=max_length,
                temperature=temperature,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id,
                num_return_sequences=1
            )
        generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return generated_text

    def demonstrate_autoregressive_generation(self, prompt):
        """Demonstrate the autoregressive generation loop step by step."""
        print(f"Prompt: {prompt}")
        print("Generation steps:")
        input_ids = self.tokenizer.encode(prompt, return_tensors='pt')
        generated = input_ids.clone()
        with torch.no_grad():
            for step in range(5):  # show only the first 5 steps
                outputs = self.model(generated)
                next_token_logits = outputs.logits[:, -1, :]
                next_token_probs = torch.softmax(next_token_logits, dim=-1)
                # Greedy decoding: pick the most likely next token
                next_token_id = torch.argmax(next_token_probs, dim=-1)
                # Append it to the sequence and repeat
                generated = torch.cat([generated, next_token_id.unsqueeze(-1)], dim=-1)
                next_token = self.tokenizer.decode(next_token_id)
                print(f"Step {step+1}: appended token '{next_token}'")
                if next_token_id.item() == self.tokenizer.eos_token_id:
                    break
        full_text = self.tokenizer.decode(generated[0], skip_special_tokens=True)
        print(f"\nFull generated text: {full_text}")


# Usage example
gpt_demo = GPTDemonstrator()
result = gpt_demo.generate_text("The future of artificial intelligence")
print("GPT output:", result)
# Demonstrate the autoregressive process
gpt_demo.demonstrate_autoregressive_generation("In the world of machine learning")
```
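What makes this loop work is the decoder's causal attention mask: at every position, attention scores toward future positions are set to negative infinity before the softmax, so each token can only attend to itself and earlier tokens. A minimal sketch of how such a mask is built and applied (illustrative only; GPT-2 constructs this mask internally):

```python
import torch

seq_len = 5
scores = torch.randn(seq_len, seq_len)  # raw attention scores for one head

# Upper-triangular mask: True above the diagonal marks "future" positions
causal_mask = torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool), diagonal=1)
scores = scores.masked_fill(causal_mask, float('-inf'))

attn = torch.softmax(scores, dim=-1)
print(attn)  # row i has non-zero weights only for columns 0..i
```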
### Evolution of the GPT Series

```mermaid
graph TD
    A[GPT-1] --> B[GPT-2]
    B --> C[GPT-3]
    C --> D[GPT-3.5]
    D --> E[GPT-4]
    A --> A1[117M parameters]
    B --> B1[1.5B parameters]
    C --> C1[175B parameters]
    D --> D1[undisclosed]
    E --> E1[undisclosed, multimodal]
    A1 --> A2[Transformer decoder]
    B1 --> B2[Zero-shot learning]
    C1 --> C2[In-context learning]
    D --> D2[Instruction tuning]
    E --> E2[Multimodal capabilities]
```
### In-context Learning

```python
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer


class InContextLearningDemo:
    def __init__(self, model_name='gpt2'):
        self.tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        self.model = GPT2LMHeadModel.from_pretrained(model_name)
        self.model.eval()

    def few_shot_learning(self, examples, query):
        """Few-shot learning demonstration."""
        # Build the few-shot prompt
        prompt = self._build_few_shot_prompt(examples, query)
        inputs = self.tokenizer.encode(prompt, return_tensors='pt')
        with torch.no_grad():
            outputs = self.model.generate(
                inputs,
                max_length=len(inputs[0]) + 20,
                temperature=0.7,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id,
                num_return_sequences=1
            )
        response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return response[len(prompt):]  # return only the newly generated part

    def _build_few_shot_prompt(self, examples, query):
        """Build the few-shot prompt."""
        prompt = ""
        for example in examples:
            prompt += f"Input: {example['input']}\nOutput: {example['output']}\n\n"
        prompt += f"Input: {query}\nOutput:"
        return prompt


# Usage example
icl_demo = InContextLearningDemo()
# Define few-shot examples (sentiment classification)
examples = [
    {"input": "That movie was absolutely wonderful", "output": "positive"},
    {"input": "Terrible service, would not recommend", "output": "negative"},
    {"input": "The product is average, nothing special", "output": "neutral"}
]
query = "The food at this restaurant is delicious"
result = icl_demo.few_shot_learning(examples, query)
print(f"Query: {query}")
print(f"Model prediction: {result}")
```
## Multimodal Models: Breakthroughs in Cross-modal Understanding

### CLIP: Connecting Vision and Language

CLIP (Contrastive Language-Image Pre-training) uses contrastive learning to map images and text into a shared semantic space, so that matching image-text pairs end up close together and mismatched pairs far apart.
```python
import torch
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import requests


class CLIPDemonstrator:
    def __init__(self, model_name="openai/clip-vit-base-patch32"):
        self.model = CLIPModel.from_pretrained(model_name)
        self.processor = CLIPProcessor.from_pretrained(model_name)
        self.model.eval()

    def image_text_similarity(self, image, text_candidates):
        """Compute the similarity between one image and several texts."""
        # Preprocess the inputs
        inputs = self.processor(
            text=text_candidates,
            images=image,
            return_tensors="pt",
            padding=True
        )
        with torch.no_grad():
            outputs = self.model(**inputs)
        # Image-to-text similarity scores
        logits_per_image = outputs.logits_per_image
        probs = logits_per_image.softmax(dim=1)
        return probs

    def zero_shot_image_classification(self, image, class_names):
        """Zero-shot image classification."""
        # Build text prompts from the class names
        text_descriptions = [f"a photo of a {class_name}" for class_name in class_names]
        # Compute similarities
        probs = self.image_text_similarity(image, text_descriptions)
        # Collect the results
        results = []
        for i, class_name in enumerate(class_names):
            results.append({
                "class": class_name,
                "probability": probs[0][i].item()
            })
        # Sort by probability
        results.sort(key=lambda x: x["probability"], reverse=True)
        return results


# Usage example
def demo_clip():
    # Load a sample image
    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    image = Image.open(requests.get(url, stream=True).raw)
    clip_demo = CLIPDemonstrator()
    # Candidate classes
    class_names = ["cat", "dog", "car", "person", "building"]
    # Run zero-shot classification
    results = clip_demo.zero_shot_image_classification(image, class_names)
    print("Zero-shot classification results:")
    for result in results:
        print(f"{result['class']}: {result['probability']:.4f}")


# Run the demo
demo_clip()
```
### CLIP Architecture in Detail

```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class SimplifiedCLIP(nn.Module):
    """A simplified CLIP implementation."""

    def __init__(self, embed_dim=512, image_encoder=None, text_encoder=None):
        super(SimplifiedCLIP, self).__init__()
        self.image_encoder = image_encoder or SimpleImageEncoder(embed_dim)
        self.text_encoder = text_encoder or SimpleTextEncoder(embed_dim)
        # Learnable temperature parameter, initialized to log(1/0.07) as in CLIP
        self.logit_scale = nn.Parameter(torch.ones([]) * torch.log(torch.tensor(1 / 0.07)))

    def forward(self, images, texts):
        # Encode images and texts
        image_features = self.image_encoder(images)
        text_features = self.text_encoder(texts)
        # L2-normalize the features
        image_features = F.normalize(image_features, dim=-1)
        text_features = F.normalize(text_features, dim=-1)
        # Similarity matrix scaled by the temperature
        logit_scale = self.logit_scale.exp()
        logits_per_image = logit_scale * image_features @ text_features.t()
        logits_per_text = logits_per_image.t()
        return logits_per_image, logits_per_text


class SimpleImageEncoder(nn.Module):
    """A simplified image encoder (a small CNN)."""

    def __init__(self, embed_dim):
        super(SimpleImageEncoder, self).__init__()
        self.conv_layers = nn.Sequential(
            # Input: 3 x 224 x 224
            nn.Conv2d(3, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),  # 64 x 112 x 112
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),  # 128 x 56 x 56
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),  # 256 x 28 x 28
            nn.Conv2d(256, 512, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d((1, 1))  # 512 x 1 x 1
        )
        self.projection = nn.Linear(512, embed_dim)

    def forward(self, x):
        features = self.conv_layers(x)
        features = features.view(features.size(0), -1)
        return self.projection(features)


class SimpleTextEncoder(nn.Module):
    """A simplified text encoder (a small Transformer)."""

    def __init__(self, embed_dim, vocab_size=10000, max_length=77):
        super(SimpleTextEncoder, self).__init__()
        self.token_embedding = nn.Embedding(vocab_size, embed_dim)
        self.position_embedding = nn.Embedding(max_length, embed_dim)
        self.transformer_blocks = nn.ModuleList([
            TransformerBlock(embed_dim, num_heads=8)
            for _ in range(6)
        ])
        self.ln_final = nn.LayerNorm(embed_dim)

    def forward(self, text):
        batch_size, seq_len = text.shape
        # Position indices
        positions = torch.arange(seq_len, device=text.device).unsqueeze(0)
        # Token + position embeddings
        token_embeddings = self.token_embedding(text)
        position_embeddings = self.position_embedding(positions)
        x = token_embeddings + position_embeddings
        # Pass through the Transformer blocks
        for block in self.transformer_blocks:
            x = block(x)
        # Use the last token's features as the text representation
        # (real CLIP uses the features at the [EOS] token position)
        x = self.ln_final(x)
        return x[:, -1, :]


class TransformerBlock(nn.Module):
    def __init__(self, embed_dim, num_heads):
        super(TransformerBlock, self).__init__()
        # batch_first=True so inputs are [batch, seq, embed]
        self.attention = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)
        self.mlp = nn.Sequential(
            nn.Linear(embed_dim, embed_dim * 4),
            nn.GELU(),
            nn.Linear(embed_dim * 4, embed_dim)
        )
        self.ln1 = nn.LayerNorm(embed_dim)
        self.ln2 = nn.LayerNorm(embed_dim)

    def forward(self, x):
        # Self-attention with residual connection
        attn_output, _ = self.attention(x, x, x)
        x = self.ln1(x + attn_output)
        # Feed-forward network with residual connection
        mlp_output = self.mlp(x)
        x = self.ln2(x + mlp_output)
        return x
```
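What the architecture above leaves out is the training objective. CLIP is trained with a symmetric contrastive (InfoNCE-style) loss over a batch of N image-text pairs: the similarity matrix is treated as N-way classification logits in both directions, with the matching pair as the correct class. A minimal sketch using the SimplifiedCLIP outputs:

```python
def clip_contrastive_loss(logits_per_image, logits_per_text):
    """Symmetric cross-entropy over the image-text similarity matrix."""
    batch_size = logits_per_image.size(0)
    # The i-th image matches the i-th text, so the targets are the diagonal
    targets = torch.arange(batch_size, device=logits_per_image.device)
    loss_i2t = F.cross_entropy(logits_per_image, targets)  # image -> text direction
    loss_t2i = F.cross_entropy(logits_per_text, targets)   # text -> image direction
    return (loss_i2t + loss_t2i) / 2


# Example: a toy batch of 4 matched image-text pairs
model = SimplifiedCLIP(embed_dim=64)
images = torch.randn(4, 3, 224, 224)
texts = torch.randint(0, 10000, (4, 16))
logits_i, logits_t = model(images, texts)
print(clip_contrastive_loss(logits_i, logits_t))
```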
### BLIP: Bootstrapping Language-Image Pre-training

```python
import torch
import requests
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration


class BLIPDemo:
    def __init__(self):
        self.processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
        self.model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

    def generate_image_caption(self, image):
        """Generate a caption for an image."""
        inputs = self.processor(image, return_tensors="pt")
        with torch.no_grad():
            outputs = self.model.generate(**inputs)
        caption = self.processor.decode(outputs[0], skip_special_tokens=True)
        return caption

    def visual_question_answering(self, image, question):
        """Answer a question about an image.

        Note: this reuses the captioning checkpoint and passes the question as
        a text prompt for conditional generation; a dedicated VQA checkpoint
        (BlipForQuestionAnswering with Salesforce/blip-vqa-base, used in the
        multimodal QA system below) generally gives better answers.
        """
        inputs = self.processor(image, question, return_tensors="pt")
        with torch.no_grad():
            outputs = self.model.generate(**inputs)
        answer = self.processor.decode(outputs[0], skip_special_tokens=True)
        return answer


# Usage example
def demo_blip():
    # Load a sample image
    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    image = Image.open(requests.get(url, stream=True).raw)
    blip_demo = BLIPDemo()
    # Generate an image caption
    caption = blip_demo.generate_image_caption(image)
    print(f"Caption: {caption}")
    # Visual question answering
    question = "What animals are in this image?"
    answer = blip_demo.visual_question_answering(image, question)
    print(f"Question: {question}")
    print(f"Answer: {answer}")


# Run the demo
demo_blip()
```
## Mixture of Experts (MoE)

### The Basic Idea of MoE

Mixture-of-experts models grow model capacity by adding multiple "expert" networks while keeping computation efficient: a gating network routes each token to only a few experts, so most parameters stay idle on any given input.
```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class MixtureOfExperts(nn.Module):
    """A mixture-of-experts layer."""

    def __init__(self, input_dim, expert_dim, num_experts=8, capacity_factor=1.0):
        super(MixtureOfExperts, self).__init__()
        self.num_experts = num_experts
        # capacity_factor would bound the tokens per expert in a production
        # router; it is kept for illustration but unused in this simplified version
        self.capacity_factor = capacity_factor
        # Expert networks
        self.experts = nn.ModuleList([
            nn.Sequential(
                nn.Linear(input_dim, expert_dim),
                nn.GELU(),
                nn.Linear(expert_dim, input_dim)
            ) for _ in range(num_experts)
        ])
        # Gating network
        self.gate = nn.Linear(input_dim, num_experts)

    def forward(self, x):
        batch_size, seq_len, hidden_dim = x.shape
        # Flatten tokens
        x_flat = x.reshape(-1, hidden_dim)
        # Gating weights
        gate_logits = self.gate(x_flat)  # [batch*seq_len, num_experts]
        gate_weights = F.softmax(gate_logits, dim=-1)
        # Select the top-k experts (top-2 is a common choice)
        top_k = 2
        top_k_weights, top_k_indices = torch.topk(gate_weights, top_k, dim=-1)
        top_k_weights = top_k_weights / top_k_weights.sum(dim=-1, keepdim=True)
        # Initialize the output
        output = torch.zeros_like(x_flat)
        # Route tokens to experts
        for expert_idx in range(self.num_experts):
            # Tokens for which this expert is among the top-k
            expert_mask = (top_k_indices == expert_idx).any(dim=-1)
            if expert_mask.any():
                expert_input = x_flat[expert_mask]
                expert_output = self.experts[expert_idx](expert_input)
                # Gather the gate weight each routed token assigned to this expert
                weights_for_expert = torch.zeros(int(expert_mask.sum()), device=x.device)
                for k in range(top_k):
                    mask_at_k = top_k_indices[expert_mask, k] == expert_idx
                    weights_for_expert[mask_at_k] = top_k_weights[expert_mask, k][mask_at_k]
                # Weighted sum of expert outputs
                output[expert_mask] += expert_output * weights_for_expert.unsqueeze(-1)
        # Restore the original shape
        output = output.reshape(batch_size, seq_len, hidden_dim)
        return output


class MoETransformerBlock(nn.Module):
    """A Transformer block whose feed-forward layer is an MoE."""

    def __init__(self, hidden_dim, num_heads, num_experts=8):
        super(MoETransformerBlock, self).__init__()
        # batch_first=True so inputs are [batch, seq, hidden]
        self.attention = nn.MultiheadAttention(hidden_dim, num_heads, batch_first=True)
        self.moe = MixtureOfExperts(hidden_dim, hidden_dim * 4, num_experts)
        self.ln1 = nn.LayerNorm(hidden_dim)
        self.ln2 = nn.LayerNorm(hidden_dim)

    def forward(self, x):
        # Self-attention with residual connection
        attn_output, _ = self.attention(x, x, x)
        x = self.ln1(x + attn_output)
        # MoE feed-forward with residual connection
        moe_output = self.moe(x)
        x = self.ln2(x + moe_output)
        return x


# Usage example
def demo_moe():
    batch_size, seq_len, hidden_dim = 2, 10, 512
    num_experts = 4
    # Create an MoE layer
    moe_layer = MixtureOfExperts(hidden_dim, hidden_dim * 4, num_experts)
    # Create an input batch
    x = torch.randn(batch_size, seq_len, hidden_dim)
    # Forward pass
    output = moe_layer(x)
    print(f"Input shape: {x.shape}")
    print(f"Output shape: {output.shape}")
    print(f"MoE parameter count: {sum(p.numel() for p in moe_layer.parameters())}")


demo_moe()
```
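One practical detail the sketch above omits: with a learned gate, training tends to collapse onto a few favored experts. MoE models therefore usually add an auxiliary load-balancing loss that pushes the router toward using all experts evenly. The sketch below follows the Switch-Transformer-style formulation; treat it as an illustrative example rather than the exact loss of any particular system:

```python
def load_balancing_loss(gate_logits, top_k_indices, num_experts):
    """Encourage uniform expert utilization (Switch-Transformer-style aux loss)."""
    gate_probs = F.softmax(gate_logits, dim=-1)          # [tokens, num_experts]
    # Fraction of tokens routed to each expert (based on top-1 assignment)
    top1 = top_k_indices[:, 0]
    routed_fraction = F.one_hot(top1, num_experts).float().mean(dim=0)
    # Mean gate probability assigned to each expert
    mean_gate_prob = gate_probs.mean(dim=0)
    # Minimized when both distributions are uniform across experts
    return num_experts * torch.sum(routed_fraction * mean_gate_prob)


# Example with random gate logits for 20 tokens and 4 experts
logits = torch.randn(20, 4)
_, indices = torch.topk(F.softmax(logits, dim=-1), 2, dim=-1)
print(load_balancing_loss(logits, indices, num_experts=4))
```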
## Practical Application: Building a Multimodal QA System

```python
import torch
import requests
from transformers import (
    CLIPModel, CLIPProcessor,
    BlipForQuestionAnswering, BlipProcessor,
    AutoModelForQuestionAnswering, AutoTokenizer
)
from PIL import Image


class MultimodalQASystem:
    """A multimodal question-answering system."""

    def __init__(self):
        # CLIP for image-text matching (loaded for completeness; the simplified
        # routing below does not use it yet)
        self.clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        self.clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        # BLIP for visual question answering
        self.blip_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
        self.blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
        # An extractive QA model (BERT fine-tuned on SQuAD) for text questions
        qa_checkpoint = "bert-large-uncased-whole-word-masking-finetuned-squad"
        self.text_model = AutoModelForQuestionAnswering.from_pretrained(qa_checkpoint)
        self.text_tokenizer = AutoTokenizer.from_pretrained(qa_checkpoint)

    def answer_question(self, image, question, context=None):
        """Answer a question about an image or a text passage."""
        # Decide what kind of question this is
        question_type = self._classify_question_type(question)
        if question_type == "visual" and image is not None:
            # Visual QA
            return self._visual_qa(image, question)
        elif context is not None:
            # Text QA whenever a passage is provided
            return self._text_qa(context, question)
        else:
            # General QA
            return self._general_qa(question)

    def _classify_question_type(self, question):
        """Classify the question type with simple keyword matching."""
        visual_keywords = ["image", "picture", "photo", "see", "look", "color", "shape"]
        textual_keywords = ["text", "document", "article", "passage", "read"]
        question_lower = question.lower()
        if any(keyword in question_lower for keyword in visual_keywords):
            return "visual"
        elif any(keyword in question_lower for keyword in textual_keywords):
            return "textual"
        else:
            return "general"

    def _visual_qa(self, image, question):
        """Visual question answering with BLIP."""
        inputs = self.blip_processor(image, question, return_tensors="pt")
        with torch.no_grad():
            outputs = self.blip_model.generate(**inputs)
        answer = self.blip_processor.decode(outputs[0], skip_special_tokens=True)
        return answer

    def _text_qa(self, context, question):
        """Extractive reading comprehension with a SQuAD-fine-tuned BERT."""
        inputs = self.text_tokenizer(
            question, context,
            return_tensors="pt",
            max_length=512,
            truncation=True
        )
        with torch.no_grad():
            outputs = self.text_model(**inputs)
        # The QA head predicts the answer span's start and end positions
        start_idx = torch.argmax(outputs.start_logits)
        end_idx = torch.argmax(outputs.end_logits)
        if end_idx < start_idx:  # guard against an inverted span
            end_idx = start_idx
        answer_tokens = inputs["input_ids"][0][start_idx:end_idx + 1]
        answer = self.text_tokenizer.decode(answer_tokens, skip_special_tokens=True)
        return answer

    def _general_qa(self, question):
        """General QA (knowledge-based)."""
        # A knowledge base or external API could be plugged in here.
        # Simplified implementation: canned answers for a few known questions.
        knowledge_base = {
            "what is ai": "Artificial Intelligence is the simulation of human intelligence in machines.",
            "how does machine learning work": "Machine learning uses algorithms to parse data, learn from it, and make predictions.",
            "what is deep learning": "Deep learning is a subset of machine learning using neural networks with multiple layers."
        }
        question_lower = question.lower()
        for key in knowledge_base:
            if key in question_lower:
                return knowledge_base[key]
        return "I'm sorry, I don't have enough information to answer that question."


# Usage example
def demo_multimodal_qa():
    qa_system = MultimodalQASystem()
    # Visual QA example
    try:
        url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        image = Image.open(requests.get(url, stream=True).raw)
        question = "What animals are in this image?"
        answer = qa_system.answer_question(image, question)
        print("Visual QA:")
        print(f"Question: {question}")
        print(f"Answer: {answer}\n")
    except Exception:
        print("Could not load the sample image; skipping the visual QA demo")
    # Text QA example
    context = """
    Artificial intelligence (AI) is intelligence demonstrated by machines, as opposed to
    natural intelligence displayed by animals including humans. Leading AI textbooks define
    the field as the study of intelligent agents: any system that perceives its environment
    and takes actions that maximize its chance of achieving its goals.
    """
    question = "What is artificial intelligence?"
    answer = qa_system.answer_question(None, question, context)
    print("Text QA:")
    print(f"Question: {question}")
    print(f"Answer: {answer}\n")
    # General QA example
    question = "How does machine learning work?"
    answer = qa_system.answer_question(None, question)
    print("General QA:")
    print(f"Question: {question}")
    print(f"Answer: {answer}")


demo_multimodal_qa()
```
## Performance Optimization and Best Practices

### Model Compression Techniques

```python
import torch
import torch.nn.utils.prune as prune
from transformers import AutoModel
from opacus import PrivacyEngine


class ModelOptimizer:
    """Utility class for model optimization."""

    @staticmethod
    def quantize_model(model):
        """Dynamically quantize a model to reduce memory use."""
        # Linear weights are stored as int8 and dequantized on the fly
        quantized_model = torch.quantization.quantize_dynamic(
            model,
            {torch.nn.Linear},
            dtype=torch.qint8
        )
        return quantized_model

    @staticmethod
    def prune_model(model, pruning_rate=0.2):
        """Prune a model to reduce the number of parameters."""
        # Simple global magnitude pruning over all Linear weights
        parameters_to_prune = []
        for name, module in model.named_modules():
            if isinstance(module, torch.nn.Linear):
                parameters_to_prune.append((module, 'weight'))
        prune.global_unstructured(
            parameters_to_prune,
            pruning_method=prune.L1Unstructured,
            amount=pruning_rate,
        )
        return model

    @staticmethod
    def apply_differential_privacy(model, optimizer, train_loader, noise_multiplier, max_grad_norm):
        """Wrap a model, optimizer, and data loader for differentially private training."""
        privacy_engine = PrivacyEngine()
        model, optimizer, train_loader = privacy_engine.make_private(
            module=model,
            optimizer=optimizer,
            data_loader=train_loader,
            noise_multiplier=noise_multiplier,
            max_grad_norm=max_grad_norm,
        )
        return model, optimizer, train_loader


# Optimization example
def optimization_demo():
    # Load the original model
    model = AutoModel.from_pretrained("bert-base-uncased")
    original_size = sum(p.numel() for p in model.parameters())
    print(f"Original parameter count: {original_size}")
    # Quantize the model
    quantized_model = ModelOptimizer.quantize_model(model)
    # Note: dynamic quantization converts Linear weights into packed int8
    # buffers, so they no longer appear in .parameters(); the drop below
    # reflects that repacking, not removed parameters
    quantized_size = sum(p.numel() for p in quantized_model.parameters())
    print(f"Parameter count after quantization: {quantized_size}")
    print(f"Reduction: {(1 - quantized_size / original_size) * 100:.2f}%")
    # Pruning (note: this should be applied after actual training)
    # pruned_model = ModelOptimizer.prune_model(model, pruning_rate=0.2)


optimization_demo()
```
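A more direct way to see the effect of quantization is to compare serialized model sizes rather than parameter counts. A small helper sketch (it writes a temporary file to disk):

```python
import os
import tempfile

def model_size_mb(model):
    """Serialize a model's state_dict and report its size in megabytes."""
    with tempfile.NamedTemporaryFile(delete=False) as f:
        torch.save(model.state_dict(), f.name)
        size = os.path.getsize(f.name) / 1e6
    os.remove(f.name)
    return size

model = AutoModel.from_pretrained("bert-base-uncased")
print(f"FP32 size: {model_size_mb(model):.1f} MB")
quantized = ModelOptimizer.quantize_model(model)
print(f"INT8 size: {model_size_mb(quantized):.1f} MB")
```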
## Summary

In this chapter we examined the most important variants and extensions of the Transformer architecture:

- BERT: achieves deep contextual understanding with a bidirectional encoder and the MLM and NSP pre-training tasks
- The GPT series: uses an autoregressive, decoder-only architecture that excels at text generation
- Multimodal models: CLIP and BLIP bridge vision and language for cross-modal understanding
- Mixture-of-experts models: expand model capacity through expert routing while preserving computational efficiency

These models have not only produced academic breakthroughs but also demonstrated enormous value in industrial applications. Understanding their principles and implementation details is essential for building advanced AI systems.

In the next chapter, we will walk through implementing text-classification tasks with PyTorch and TensorFlow, further consolidating our understanding of these models and how to apply them.