# Transformer Variants and Extensions: BERT, GPT, and Multimodal Models

## Introduction

In deep learning and natural language processing, the Transformer architecture is without question one of the most revolutionary breakthroughs. Since Vaswani et al. introduced the original Transformer in 2017, Transformer-based variants have proliferated and fundamentally reshaped the NLP landscape.

In this chapter we take a close look at three of the most important families of Transformer variants: BERT, the GPT series, and multimodal models. These models have not only driven breakthroughs in academic research but are also widely deployed in industry, where they form core components of modern AI systems.

## BERT: The Bidirectional Encoder Revolution

### The Core Idea of BERT

BERT (Bidirectional Encoder Representations from Transformers), introduced by Google in 2018, is distinguished above all by bidirectional context modeling. Unlike traditional left-to-right or right-to-left language models, BERT conditions on context from both sides of each token simultaneously.
```python
import torch
from transformers import BertForMaskedLM, BertTokenizer


class BERTExplainer:
    def __init__(self, model_name='bert-base-uncased'):
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        # BertForMaskedLM adds the vocabulary prediction head needed for MLM;
        # the bare BertModel only returns hidden states, not token logits
        self.model = BertForMaskedLM.from_pretrained(model_name)
        self.model.eval()

    def demonstrate_masked_language_modeling(self, text):
        """Demonstrate BERT's masked-language-modeling ability."""
        # Replace one word in the text with [MASK]
        masked_text = text.replace("language", "[MASK]")
        # Encode the input
        inputs = self.tokenizer(masked_text, return_tensors="pt")
        with torch.no_grad():
            outputs = self.model(**inputs)
        # Vocabulary logits, shape [batch, seq_len, vocab_size]
        logits = outputs.logits
        mask_token_index = torch.where(inputs["input_ids"][0] == self.tokenizer.mask_token_id)[0]
        # Logits at the [MASK] position
        mask_token_logits = logits[0, mask_token_index, :]
        # Find the most likely predictions
        top_tokens = torch.topk(mask_token_logits, 5, dim=1)
        print(f"Original text: {text}")
        print(f"Masked text: {masked_text}")
        print("Top 5 predictions:")
        for i, (value, index) in enumerate(zip(top_tokens.values[0], top_tokens.indices[0])):
            token = self.tokenizer.decode([index])
            print(f"{i+1}. {token} (score: {value:.4f})")


# Usage example
explainer = BERTExplainer()
text = "Natural language processing is amazing."
explainer.demonstrate_masked_language_modeling(text)
```
### BERT's Pre-training Tasks

BERT learns language representations through two key pre-training tasks:

#### 1. Masked Language Modeling (MLM)
```python
import torch
from transformers import BertForMaskedLM, BertTokenizer


class MLMDemonstration:
    def __init__(self):
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.model = BertForMaskedLM.from_pretrained('bert-base-uncased')
        self.model.eval()

    def mlm_inference(self, text_with_mask):
        """Run MLM inference."""
        inputs = self.tokenizer(text_with_mask, return_tensors="pt")
        with torch.no_grad():
            outputs = self.model(**inputs)
        logits = outputs.logits
        mask_token_index = torch.where(inputs["input_ids"][0] == self.tokenizer.mask_token_id)[0]
        # Convert the full-vocabulary logits at each [MASK] position to probabilities
        mask_probs = torch.softmax(logits[0, mask_token_index, :], dim=-1)
        top_tokens = torch.topk(mask_probs, 3, dim=1)
        print(f"Input: {text_with_mask}")
        for i, (values, indices) in enumerate(zip(top_tokens.values, top_tokens.indices)):
            for j in range(len(values)):
                token = self.tokenizer.decode([indices[j]])
                print(f"Mask {i+1}, prediction {j+1}: {token} (probability: {values[j]:.4f})")


# Demonstrate MLM
mlm_demo = MLMDemonstration()
mlm_demo.mlm_inference("The weather today is [MASK] and sunny.")
```
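During pre-training, BERT does not simply mask every selected token: roughly 15% of tokens are chosen as prediction targets, and of those, 80% are replaced with `[MASK]`, 10% with a random token, and 10% are left unchanged, so the model does not learn to rely on seeing `[MASK]` at inference time. The sketch below illustrates this corruption procedure (a minimal illustration, not the exact Hugging Face data-collator implementation):

```python
import torch

def mask_tokens_for_mlm(input_ids, mask_token_id, vocab_size, mlm_prob=0.15):
    """Apply BERT's 80/10/10 masking rule to a batch of token ids."""
    labels = input_ids.clone()
    # Choose ~15% of positions as prediction targets
    selected = torch.rand(input_ids.shape) < mlm_prob
    labels[~selected] = -100  # ignore non-selected positions in the loss

    corrupted = input_ids.clone()
    # 80% of selected positions become [MASK]
    mask_positions = selected & (torch.rand(input_ids.shape) < 0.8)
    corrupted[mask_positions] = mask_token_id
    # Half of the remaining 20% (i.e. 10% overall) become a random token
    random_positions = selected & ~mask_positions & (torch.rand(input_ids.shape) < 0.5)
    corrupted[random_positions] = torch.randint(vocab_size, input_ids.shape)[random_positions]
    # The final 10% keep their original token
    return corrupted, labels


# Example with toy ids (mask_token_id=103 and vocab_size=30522 match bert-base-uncased)
ids = torch.randint(1000, 2000, (2, 8))
corrupted, labels = mask_tokens_for_mlm(ids, mask_token_id=103, vocab_size=30522)
print(corrupted)
print(labels)
```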
#### 2. Next Sentence Prediction (NSP)

```python
import torch
from transformers import BertForNextSentencePrediction, BertTokenizer


class NSPDemonstration:
    def __init__(self):
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
        self.model.eval()

    def nsp_inference(self, sentence_a, sentence_b):
        """Next-sentence-prediction inference."""
        # Encode the sentence pair
        inputs = self.tokenizer(sentence_a, sentence_b, return_tensors='pt')
        with torch.no_grad():
            outputs = self.model(**inputs)
        logits = outputs.logits
        probabilities = torch.softmax(logits, dim=1)
        # In the Hugging Face head, index 0 = "B follows A", index 1 = "B is random"
        is_next = probabilities[0, 0].item()
        not_next = probabilities[0, 1].item()
        print(f"Sentence A: {sentence_a}")
        print(f"Sentence B: {sentence_b}")
        print(f"P(is next sentence): {is_next:.4f}")
        print(f"P(is not next sentence): {not_next:.4f}")
        print(f"Prediction: {'is next sentence' if is_next > not_next else 'is not next sentence'}")


# Demonstrate NSP
nsp_demo = NSPDemonstration()
nsp_demo.nsp_inference(
    "The company reported strong earnings this quarter.",
    "As a result, the stock price increased significantly."
)
```
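NSP relies on how BERT encodes a sentence pair: the tokenizer concatenates the two sentences as `[CLS] A [SEP] B [SEP]` and marks each token's segment with `token_type_ids`, which index the segment embeddings shown in the architecture code below. A quick look at that encoding:

```python
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
enc = tokenizer("The cat sat on the mat.", "It fell asleep.")
# Token sequence: [CLS] <sentence A> [SEP] <sentence B> [SEP]
print(tokenizer.convert_ids_to_tokens(enc["input_ids"]))
# 0 for segment A (including [CLS] and the first [SEP]), 1 for segment B
print(enc["token_type_ids"])
```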
### BERT Architecture Details

```python
import torch
import torch.nn as nn


class SimplifiedBERT(nn.Module):
    """A simplified BERT implementation for teaching purposes."""

    def __init__(self, vocab_size=30522, hidden_size=768, num_layers=12,
                 num_attention_heads=12, intermediate_size=3072, max_position_embeddings=512):
        super(SimplifiedBERT, self).__init__()
        self.embedding = BERTEmbedding(vocab_size, hidden_size, max_position_embeddings)
        self.encoder_layers = nn.ModuleList([
            TransformerEncoderLayer(hidden_size, num_attention_heads, intermediate_size)
            for _ in range(num_layers)
        ])

    def forward(self, input_ids, token_type_ids=None, attention_mask=None):
        # Embedding layer
        hidden_states = self.embedding(input_ids, token_type_ids)
        # Transformer encoder layers
        for layer in self.encoder_layers:
            hidden_states = layer(hidden_states, attention_mask)
        return hidden_states


class BERTEmbedding(nn.Module):
    def __init__(self, vocab_size, hidden_size, max_position_embeddings):
        super(BERTEmbedding, self).__init__()
        self.word_embeddings = nn.Embedding(vocab_size, hidden_size)
        self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size)
        self.token_type_embeddings = nn.Embedding(2, hidden_size)  # segment embeddings for sentence-pair tasks
        self.LayerNorm = nn.LayerNorm(hidden_size)
        self.dropout = nn.Dropout(0.1)

    def forward(self, input_ids, token_type_ids=None):
        seq_length = input_ids.size(1)
        position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
        position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)
        # BERT sums word, position, and segment embeddings
        embeddings = (self.word_embeddings(input_ids)
                      + self.position_embeddings(position_ids)
                      + self.token_type_embeddings(token_type_ids))
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings


class TransformerEncoderLayer(nn.Module):
    def __init__(self, hidden_size, num_attention_heads, intermediate_size):
        super(TransformerEncoderLayer, self).__init__()
        self.attention = MultiHeadAttention(hidden_size, num_attention_heads)
        self.attention_norm = nn.LayerNorm(hidden_size)
        self.intermediate = IntermediateLayer(hidden_size, intermediate_size)
        self.output = OutputLayer(hidden_size, intermediate_size)

    def forward(self, hidden_states, attention_mask=None):
        # Self-attention sub-layer with residual connection
        attention_output = self.attention(hidden_states, attention_mask)
        attention_output = self.attention_norm(attention_output + hidden_states)
        # Feed-forward sub-layer (its residual is applied inside OutputLayer)
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output)
        return layer_output


class MultiHeadAttention(nn.Module):
    def __init__(self, hidden_size, num_attention_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_attention_heads = num_attention_heads
        self.attention_head_size = hidden_size // num_attention_heads
        self.query = nn.Linear(hidden_size, hidden_size)
        self.key = nn.Linear(hidden_size, hidden_size)
        self.value = nn.Linear(hidden_size, hidden_size)
        self.dense = nn.Linear(hidden_size, hidden_size)

    def _split_heads(self, x):
        # [batch, seq, hidden] -> [batch, heads, seq, head_size]
        batch_size, seq_length, _ = x.size()
        x = x.view(batch_size, seq_length, self.num_attention_heads, self.attention_head_size)
        return x.permute(0, 2, 1, 3)

    def forward(self, hidden_states, attention_mask=None):
        batch_size, seq_length, hidden_size = hidden_states.size()
        # Linear projections, then split into heads
        query_layer = self._split_heads(self.query(hidden_states))
        key_layer = self._split_heads(self.key(hidden_states))
        value_layer = self._split_heads(self.value(hidden_states))
        # Scaled dot-product attention, computed per head
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / (self.attention_head_size ** 0.5)
        if attention_mask is not None:
            attention_scores = attention_scores + attention_mask
        attention_probs = torch.softmax(attention_scores, dim=-1)
        context_layer = torch.matmul(attention_probs, value_layer)
        # Merge heads back: [batch, heads, seq, head_size] -> [batch, seq, hidden]
        context_layer = context_layer.permute(0, 2, 1, 3).reshape(batch_size, seq_length, hidden_size)
        # Output projection
        return self.dense(context_layer)


class IntermediateLayer(nn.Module):
    def __init__(self, hidden_size, intermediate_size):
        super(IntermediateLayer, self).__init__()
        self.dense = nn.Linear(hidden_size, intermediate_size)
        self.intermediate_act_fn = nn.GELU()

    def forward(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


class OutputLayer(nn.Module):
    def __init__(self, hidden_size, intermediate_size):
        super(OutputLayer, self).__init__()
        self.dense = nn.Linear(intermediate_size, hidden_size)
        self.LayerNorm = nn.LayerNorm(hidden_size)
        self.dropout = nn.Dropout(0.1)

    def forward(self, hidden_states, input_tensor):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states
```
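A quick shape check of the simplified model on random token ids (the small hyperparameters here are chosen just to keep the example fast):

```python
model = SimplifiedBERT(vocab_size=1000, hidden_size=64, num_layers=2,
                       num_attention_heads=4, intermediate_size=256)
input_ids = torch.randint(0, 1000, (2, 16))  # batch of 2 sequences, 16 tokens each
hidden = model(input_ids)
print(hidden.shape)  # torch.Size([2, 16, 64])
```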
## The GPT Series: The Evolution of Autoregressive Generation

### GPT Architecture Overview

GPT (Generative Pre-trained Transformer) models use a decoder-only architecture and focus on the autoregressive language-modeling objective: each token is predicted from the tokens that precede it.
```python
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer


class GPTDemonstrator:
    def __init__(self, model_name='gpt2'):
        self.tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        self.model = GPT2LMHeadModel.from_pretrained(model_name)
        self.model.eval()

    def generate_text(self, prompt, max_length=100, temperature=0.7):
        """Generate text with GPT."""
        inputs = self.tokenizer.encode(prompt, return_tensors='pt')
        with torch.no_grad():
            outputs = self.model.generate(
                inputs,
                max_length=max_length,
                temperature=temperature,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id,
                num_return_sequences=1
            )
        generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return generated_text

    def demonstrate_autoregressive_generation(self, prompt):
        """Demonstrate the autoregressive generation loop step by step."""
        print(f"Prompt: {prompt}")
        print("Generation steps:")
        input_ids = self.tokenizer.encode(prompt, return_tensors='pt')
        generated = input_ids.clone()
        with torch.no_grad():
            for step in range(5):  # show only the first 5 steps
                outputs = self.model(generated)
                next_token_logits = outputs.logits[:, -1, :]
                next_token_probs = torch.softmax(next_token_logits, dim=-1)
                # Greedy decoding: pick the most likely next token
                next_token_id = torch.argmax(next_token_probs, dim=-1)
                # Append it to the sequence and repeat
                generated = torch.cat([generated, next_token_id.unsqueeze(-1)], dim=-1)
                next_token = self.tokenizer.decode(next_token_id)
                print(f"Step {step+1}: appended token '{next_token}'")
                if next_token_id.item() == self.tokenizer.eos_token_id:
                    break
        full_text = self.tokenizer.decode(generated[0], skip_special_tokens=True)
        print(f"\nFull generated text: {full_text}")


# Usage example
gpt_demo = GPTDemonstrator()
result = gpt_demo.generate_text("The future of artificial intelligence")
print("GPT output:", result)
# Demonstrate the autoregressive process
gpt_demo.demonstrate_autoregressive_generation("In the world of machine learning")
```
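What makes this loop work is the decoder's causal attention mask: at every position, attention scores toward future positions are set to negative infinity before the softmax, so each token can only attend to itself and earlier tokens. A minimal sketch of how such a mask is built and applied (illustrative only; GPT-2 constructs this mask internally):

```python
import torch

seq_len = 5
scores = torch.randn(seq_len, seq_len)  # raw attention scores for one head

# Upper-triangular mask: True above the diagonal marks "future" positions
causal_mask = torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool), diagonal=1)
scores = scores.masked_fill(causal_mask, float('-inf'))

attn = torch.softmax(scores, dim=-1)
print(attn)  # row i has non-zero weights only for columns 0..i
```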
### Evolution of the GPT Series

```mermaid
graph TD
    A[GPT-1] --> B[GPT-2]
    B --> C[GPT-3]
    C --> D[GPT-3.5]
    D --> E[GPT-4]
    A --> A1[117M parameters]
    B --> B1[1.5B parameters]
    C --> C1[175B parameters]
    D --> D1[undisclosed]
    E --> E1[undisclosed, multimodal]
    A1 --> A2[Transformer decoder]
    B1 --> B2[Zero-shot learning]
    C1 --> C2[In-context learning]
    D --> D2[Instruction tuning]
    E --> E2[Multimodal capabilities]
```
### In-context Learning

```python
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer


class InContextLearningDemo:
    def __init__(self, model_name='gpt2'):
        self.tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        self.model = GPT2LMHeadModel.from_pretrained(model_name)
        self.model.eval()

    def few_shot_learning(self, examples, query):
        """Few-shot learning demonstration."""
        # Build the few-shot prompt
        prompt = self._build_few_shot_prompt(examples, query)
        inputs = self.tokenizer.encode(prompt, return_tensors='pt')
        with torch.no_grad():
            outputs = self.model.generate(
                inputs,
                max_length=len(inputs[0]) + 20,
                temperature=0.7,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id,
                num_return_sequences=1
            )
        response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return response[len(prompt):]  # return only the newly generated part

    def _build_few_shot_prompt(self, examples, query):
        """Build the few-shot prompt."""
        prompt = ""
        for example in examples:
            prompt += f"Input: {example['input']}\nOutput: {example['output']}\n\n"
        prompt += f"Input: {query}\nOutput:"
        return prompt


# Usage example
icl_demo = InContextLearningDemo()
# Define few-shot examples (sentiment classification)
examples = [
    {"input": "That movie was absolutely wonderful", "output": "positive"},
    {"input": "Terrible service, would not recommend", "output": "negative"},
    {"input": "The product is average, nothing special", "output": "neutral"}
]
query = "The food at this restaurant is delicious"
result = icl_demo.few_shot_learning(examples, query)
print(f"Query: {query}")
print(f"Model prediction: {result}")
```
## Multimodal Models: Breakthroughs in Cross-modal Understanding

### CLIP: Connecting Vision and Language

CLIP (Contrastive Language-Image Pre-training) uses contrastive learning to map images and text into a shared semantic space, so that matching image-text pairs end up close together and mismatched pairs far apart.
```python
import torch
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import requests


class CLIPDemonstrator:
    def __init__(self, model_name="openai/clip-vit-base-patch32"):
        self.model = CLIPModel.from_pretrained(model_name)
        self.processor = CLIPProcessor.from_pretrained(model_name)
        self.model.eval()

    def image_text_similarity(self, image, text_candidates):
        """Compute the similarity between one image and several texts."""
        # Preprocess the inputs
        inputs = self.processor(
            text=text_candidates,
            images=image,
            return_tensors="pt",
            padding=True
        )
        with torch.no_grad():
            outputs = self.model(**inputs)
        # Image-to-text similarity scores
        logits_per_image = outputs.logits_per_image
        probs = logits_per_image.softmax(dim=1)
        return probs

    def zero_shot_image_classification(self, image, class_names):
        """Zero-shot image classification."""
        # Build text prompts from the class names
        text_descriptions = [f"a photo of a {class_name}" for class_name in class_names]
        # Compute similarities
        probs = self.image_text_similarity(image, text_descriptions)
        # Collect the results
        results = []
        for i, class_name in enumerate(class_names):
            results.append({
                "class": class_name,
                "probability": probs[0][i].item()
            })
        # Sort by probability
        results.sort(key=lambda x: x["probability"], reverse=True)
        return results


# Usage example
def demo_clip():
    # Load a sample image
    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    image = Image.open(requests.get(url, stream=True).raw)
    clip_demo = CLIPDemonstrator()
    # Candidate classes
    class_names = ["cat", "dog", "car", "person", "building"]
    # Run zero-shot classification
    results = clip_demo.zero_shot_image_classification(image, class_names)
    print("Zero-shot classification results:")
    for result in results:
        print(f"{result['class']}: {result['probability']:.4f}")


# Run the demo
demo_clip()
```
### CLIP Architecture in Detail

```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class SimplifiedCLIP(nn.Module):
    """A simplified CLIP implementation."""

    def __init__(self, embed_dim=512, image_encoder=None, text_encoder=None):
        super(SimplifiedCLIP, self).__init__()
        self.image_encoder = image_encoder or SimpleImageEncoder(embed_dim)
        self.text_encoder = text_encoder or SimpleTextEncoder(embed_dim)
        # Learnable temperature parameter, initialized to log(1/0.07) as in CLIP
        self.logit_scale = nn.Parameter(torch.ones([]) * torch.log(torch.tensor(1 / 0.07)))

    def forward(self, images, texts):
        # Encode images and texts
        image_features = self.image_encoder(images)
        text_features = self.text_encoder(texts)
        # L2-normalize the features
        image_features = F.normalize(image_features, dim=-1)
        text_features = F.normalize(text_features, dim=-1)
        # Similarity matrix scaled by the temperature
        logit_scale = self.logit_scale.exp()
        logits_per_image = logit_scale * image_features @ text_features.t()
        logits_per_text = logits_per_image.t()
        return logits_per_image, logits_per_text


class SimpleImageEncoder(nn.Module):
    """A simplified image encoder (a small CNN)."""

    def __init__(self, embed_dim):
        super(SimpleImageEncoder, self).__init__()
        self.conv_layers = nn.Sequential(
            # Input: 3 x 224 x 224
            nn.Conv2d(3, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),  # 64 x 112 x 112
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),  # 128 x 56 x 56
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),  # 256 x 28 x 28
            nn.Conv2d(256, 512, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d((1, 1))  # 512 x 1 x 1
        )
        self.projection = nn.Linear(512, embed_dim)

    def forward(self, x):
        features = self.conv_layers(x)
        features = features.view(features.size(0), -1)
        return self.projection(features)


class SimpleTextEncoder(nn.Module):
    """A simplified text encoder (a small Transformer)."""

    def __init__(self, embed_dim, vocab_size=10000, max_length=77):
        super(SimpleTextEncoder, self).__init__()
        self.token_embedding = nn.Embedding(vocab_size, embed_dim)
        self.position_embedding = nn.Embedding(max_length, embed_dim)
        self.transformer_blocks = nn.ModuleList([
            TransformerBlock(embed_dim, num_heads=8)
            for _ in range(6)
        ])
        self.ln_final = nn.LayerNorm(embed_dim)

    def forward(self, text):
        batch_size, seq_len = text.shape
        # Position indices
        positions = torch.arange(seq_len, device=text.device).unsqueeze(0)
        # Token + position embeddings
        token_embeddings = self.token_embedding(text)
        position_embeddings = self.position_embedding(positions)
        x = token_embeddings + position_embeddings
        # Pass through the Transformer blocks
        for block in self.transformer_blocks:
            x = block(x)
        # Use the last token's features as the text representation
        # (real CLIP uses the features at the [EOS] token position)
        x = self.ln_final(x)
        return x[:, -1, :]


class TransformerBlock(nn.Module):
    def __init__(self, embed_dim, num_heads):
        super(TransformerBlock, self).__init__()
        # batch_first=True so inputs are [batch, seq, embed]
        self.attention = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)
        self.mlp = nn.Sequential(
            nn.Linear(embed_dim, embed_dim * 4),
            nn.GELU(),
            nn.Linear(embed_dim * 4, embed_dim)
        )
        self.ln1 = nn.LayerNorm(embed_dim)
        self.ln2 = nn.LayerNorm(embed_dim)

    def forward(self, x):
        # Self-attention with residual connection
        attn_output, _ = self.attention(x, x, x)
        x = self.ln1(x + attn_output)
        # Feed-forward network with residual connection
        mlp_output = self.mlp(x)
        x = self.ln2(x + mlp_output)
        return x
```
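What the architecture above leaves out is the training objective. CLIP is trained with a symmetric contrastive (InfoNCE-style) loss over a batch of N image-text pairs: the similarity matrix is treated as N-way classification logits in both directions, with the matching pair as the correct class. A minimal sketch using the SimplifiedCLIP outputs:

```python
def clip_contrastive_loss(logits_per_image, logits_per_text):
    """Symmetric cross-entropy over the image-text similarity matrix."""
    batch_size = logits_per_image.size(0)
    # The i-th image matches the i-th text, so the targets are the diagonal
    targets = torch.arange(batch_size, device=logits_per_image.device)
    loss_i2t = F.cross_entropy(logits_per_image, targets)  # image -> text direction
    loss_t2i = F.cross_entropy(logits_per_text, targets)   # text -> image direction
    return (loss_i2t + loss_t2i) / 2


# Example: a toy batch of 4 matched image-text pairs
model = SimplifiedCLIP(embed_dim=64)
images = torch.randn(4, 3, 224, 224)
texts = torch.randint(0, 10000, (4, 16))
logits_i, logits_t = model(images, texts)
print(clip_contrastive_loss(logits_i, logits_t))
```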
### BLIP: Bootstrapping Language-Image Pre-training

```python
import torch
import requests
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration


class BLIPDemo:
    def __init__(self):
        self.processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
        self.model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

    def generate_image_caption(self, image):
        """Generate a caption for an image."""
        inputs = self.processor(image, return_tensors="pt")
        with torch.no_grad():
            outputs = self.model.generate(**inputs)
        caption = self.processor.decode(outputs[0], skip_special_tokens=True)
        return caption

    def visual_question_answering(self, image, question):
        """Answer a question about an image.

        Note: this reuses the captioning checkpoint and passes the question as
        a text prompt for conditional generation; a dedicated VQA checkpoint
        (BlipForQuestionAnswering with Salesforce/blip-vqa-base, used in the
        multimodal QA system below) generally gives better answers.
        """
        inputs = self.processor(image, question, return_tensors="pt")
        with torch.no_grad():
            outputs = self.model.generate(**inputs)
        answer = self.processor.decode(outputs[0], skip_special_tokens=True)
        return answer


# Usage example
def demo_blip():
    # Load a sample image
    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    image = Image.open(requests.get(url, stream=True).raw)
    blip_demo = BLIPDemo()
    # Generate an image caption
    caption = blip_demo.generate_image_caption(image)
    print(f"Caption: {caption}")
    # Visual question answering
    question = "What animals are in this image?"
    answer = blip_demo.visual_question_answering(image, question)
    print(f"Question: {question}")
    print(f"Answer: {answer}")


# Run the demo
demo_blip()
```
## Mixture of Experts (MoE)

### The Basic Idea of MoE

Mixture-of-experts models grow model capacity by adding multiple "expert" networks while keeping computation efficient: a gating network routes each token to only a few experts, so most parameters stay idle on any given input.
```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class MixtureOfExperts(nn.Module):
    """A mixture-of-experts layer."""

    def __init__(self, input_dim, expert_dim, num_experts=8, capacity_factor=1.0):
        super(MixtureOfExperts, self).__init__()
        self.num_experts = num_experts
        # capacity_factor would bound the tokens per expert in a production
        # router; it is kept for illustration but unused in this simplified version
        self.capacity_factor = capacity_factor
        # Expert networks
        self.experts = nn.ModuleList([
            nn.Sequential(
                nn.Linear(input_dim, expert_dim),
                nn.GELU(),
                nn.Linear(expert_dim, input_dim)
            ) for _ in range(num_experts)
        ])
        # Gating network
        self.gate = nn.Linear(input_dim, num_experts)

    def forward(self, x):
        batch_size, seq_len, hidden_dim = x.shape
        # Flatten tokens
        x_flat = x.reshape(-1, hidden_dim)
        # Gating weights
        gate_logits = self.gate(x_flat)  # [batch*seq_len, num_experts]
        gate_weights = F.softmax(gate_logits, dim=-1)
        # Select the top-k experts (top-2 is a common choice)
        top_k = 2
        top_k_weights, top_k_indices = torch.topk(gate_weights, top_k, dim=-1)
        top_k_weights = top_k_weights / top_k_weights.sum(dim=-1, keepdim=True)
        # Initialize the output
        output = torch.zeros_like(x_flat)
        # Route tokens to experts
        for expert_idx in range(self.num_experts):
            # Tokens for which this expert is among the top-k
            expert_mask = (top_k_indices == expert_idx).any(dim=-1)
            if expert_mask.any():
                expert_input = x_flat[expert_mask]
                expert_output = self.experts[expert_idx](expert_input)
                # Gather the gate weight each routed token assigned to this expert
                weights_for_expert = torch.zeros(int(expert_mask.sum()), device=x.device)
                for k in range(top_k):
                    mask_at_k = top_k_indices[expert_mask, k] == expert_idx
                    weights_for_expert[mask_at_k] = top_k_weights[expert_mask, k][mask_at_k]
                # Weighted sum of expert outputs
                output[expert_mask] += expert_output * weights_for_expert.unsqueeze(-1)
        # Restore the original shape
        output = output.reshape(batch_size, seq_len, hidden_dim)
        return output


class MoETransformerBlock(nn.Module):
    """A Transformer block whose feed-forward layer is an MoE."""

    def __init__(self, hidden_dim, num_heads, num_experts=8):
        super(MoETransformerBlock, self).__init__()
        # batch_first=True so inputs are [batch, seq, hidden]
        self.attention = nn.MultiheadAttention(hidden_dim, num_heads, batch_first=True)
        self.moe = MixtureOfExperts(hidden_dim, hidden_dim * 4, num_experts)
        self.ln1 = nn.LayerNorm(hidden_dim)
        self.ln2 = nn.LayerNorm(hidden_dim)

    def forward(self, x):
        # Self-attention with residual connection
        attn_output, _ = self.attention(x, x, x)
        x = self.ln1(x + attn_output)
        # MoE feed-forward with residual connection
        moe_output = self.moe(x)
        x = self.ln2(x + moe_output)
        return x


# Usage example
def demo_moe():
    batch_size, seq_len, hidden_dim = 2, 10, 512
    num_experts = 4
    # Create an MoE layer
    moe_layer = MixtureOfExperts(hidden_dim, hidden_dim * 4, num_experts)
    # Create an input batch
    x = torch.randn(batch_size, seq_len, hidden_dim)
    # Forward pass
    output = moe_layer(x)
    print(f"Input shape: {x.shape}")
    print(f"Output shape: {output.shape}")
    print(f"MoE parameter count: {sum(p.numel() for p in moe_layer.parameters())}")


demo_moe()
```
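One practical detail the sketch above omits: with a learned gate, training tends to collapse onto a few favored experts. MoE models therefore usually add an auxiliary load-balancing loss that pushes the router toward using all experts evenly. The sketch below follows the Switch-Transformer-style formulation; treat it as an illustrative example rather than the exact loss of any particular system:

```python
def load_balancing_loss(gate_logits, top_k_indices, num_experts):
    """Encourage uniform expert utilization (Switch-Transformer-style aux loss)."""
    gate_probs = F.softmax(gate_logits, dim=-1)          # [tokens, num_experts]
    # Fraction of tokens routed to each expert (based on top-1 assignment)
    top1 = top_k_indices[:, 0]
    routed_fraction = F.one_hot(top1, num_experts).float().mean(dim=0)
    # Mean gate probability assigned to each expert
    mean_gate_prob = gate_probs.mean(dim=0)
    # Minimized when both distributions are uniform across experts
    return num_experts * torch.sum(routed_fraction * mean_gate_prob)


# Example with random gate logits for 20 tokens and 4 experts
logits = torch.randn(20, 4)
_, indices = torch.topk(F.softmax(logits, dim=-1), 2, dim=-1)
print(load_balancing_loss(logits, indices, num_experts=4))
```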
## Practical Application: Building a Multimodal QA System

```python
import torch
import requests
from transformers import (
    CLIPModel, CLIPProcessor,
    BlipForQuestionAnswering, BlipProcessor,
    AutoModelForQuestionAnswering, AutoTokenizer
)
from PIL import Image


class MultimodalQASystem:
    """A multimodal question-answering system."""

    def __init__(self):
        # CLIP for image-text matching (loaded for completeness; the simplified
        # routing below does not use it yet)
        self.clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        self.clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
        # BLIP for visual question answering
        self.blip_model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
        self.blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
        # An extractive QA model (BERT fine-tuned on SQuAD) for text questions
        qa_checkpoint = "bert-large-uncased-whole-word-masking-finetuned-squad"
        self.text_model = AutoModelForQuestionAnswering.from_pretrained(qa_checkpoint)
        self.text_tokenizer = AutoTokenizer.from_pretrained(qa_checkpoint)

    def answer_question(self, image, question, context=None):
        """Answer a question about an image or a text passage."""
        # Decide what kind of question this is
        question_type = self._classify_question_type(question)
        if question_type == "visual" and image is not None:
            # Visual QA
            return self._visual_qa(image, question)
        elif context is not None:
            # Text QA whenever a passage is provided
            return self._text_qa(context, question)
        else:
            # General QA
            return self._general_qa(question)

    def _classify_question_type(self, question):
        """Classify the question type with simple keyword matching."""
        visual_keywords = ["image", "picture", "photo", "see", "look", "color", "shape"]
        textual_keywords = ["text", "document", "article", "passage", "read"]
        question_lower = question.lower()
        if any(keyword in question_lower for keyword in visual_keywords):
            return "visual"
        elif any(keyword in question_lower for keyword in textual_keywords):
            return "textual"
        else:
            return "general"

    def _visual_qa(self, image, question):
        """Visual question answering with BLIP."""
        inputs = self.blip_processor(image, question, return_tensors="pt")
        with torch.no_grad():
            outputs = self.blip_model.generate(**inputs)
        answer = self.blip_processor.decode(outputs[0], skip_special_tokens=True)
        return answer

    def _text_qa(self, context, question):
        """Extractive reading comprehension with a SQuAD-fine-tuned BERT."""
        inputs = self.text_tokenizer(
            question, context,
            return_tensors="pt",
            max_length=512,
            truncation=True
        )
        with torch.no_grad():
            outputs = self.text_model(**inputs)
        # The QA head predicts the answer span's start and end positions
        start_idx = torch.argmax(outputs.start_logits)
        end_idx = torch.argmax(outputs.end_logits)
        if end_idx < start_idx:  # guard against an inverted span
            end_idx = start_idx
        answer_tokens = inputs["input_ids"][0][start_idx:end_idx + 1]
        answer = self.text_tokenizer.decode(answer_tokens, skip_special_tokens=True)
        return answer

    def _general_qa(self, question):
        """General QA (knowledge-based)."""
        # A knowledge base or external API could be plugged in here.
        # Simplified implementation: canned answers for a few known questions.
        knowledge_base = {
            "what is ai": "Artificial Intelligence is the simulation of human intelligence in machines.",
            "how does machine learning work": "Machine learning uses algorithms to parse data, learn from it, and make predictions.",
            "what is deep learning": "Deep learning is a subset of machine learning using neural networks with multiple layers."
        }
        question_lower = question.lower()
        for key in knowledge_base:
            if key in question_lower:
                return knowledge_base[key]
        return "I'm sorry, I don't have enough information to answer that question."


# Usage example
def demo_multimodal_qa():
    qa_system = MultimodalQASystem()
    # Visual QA example
    try:
        url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        image = Image.open(requests.get(url, stream=True).raw)
        question = "What animals are in this image?"
        answer = qa_system.answer_question(image, question)
        print("Visual QA:")
        print(f"Question: {question}")
        print(f"Answer: {answer}\n")
    except Exception:
        print("Could not load the sample image; skipping the visual QA demo")
    # Text QA example
    context = """
    Artificial intelligence (AI) is intelligence demonstrated by machines, as opposed to
    natural intelligence displayed by animals including humans. Leading AI textbooks define
    the field as the study of intelligent agents: any system that perceives its environment
    and takes actions that maximize its chance of achieving its goals.
    """
    question = "What is artificial intelligence?"
    answer = qa_system.answer_question(None, question, context)
    print("Text QA:")
    print(f"Question: {question}")
    print(f"Answer: {answer}\n")
    # General QA example
    question = "How does machine learning work?"
    answer = qa_system.answer_question(None, question)
    print("General QA:")
    print(f"Question: {question}")
    print(f"Answer: {answer}")


demo_multimodal_qa()
```
## Performance Optimization and Best Practices

### Model Compression Techniques

```python
import torch
import torch.nn.utils.prune as prune
from transformers import AutoModel
from opacus import PrivacyEngine


class ModelOptimizer:
    """Utility class for model optimization."""

    @staticmethod
    def quantize_model(model):
        """Dynamically quantize a model to reduce memory use."""
        # Linear weights are stored as int8 and dequantized on the fly
        quantized_model = torch.quantization.quantize_dynamic(
            model,
            {torch.nn.Linear},
            dtype=torch.qint8
        )
        return quantized_model

    @staticmethod
    def prune_model(model, pruning_rate=0.2):
        """Prune a model to reduce the number of parameters."""
        # Simple global magnitude pruning over all Linear weights
        parameters_to_prune = []
        for name, module in model.named_modules():
            if isinstance(module, torch.nn.Linear):
                parameters_to_prune.append((module, 'weight'))
        prune.global_unstructured(
            parameters_to_prune,
            pruning_method=prune.L1Unstructured,
            amount=pruning_rate,
        )
        return model

    @staticmethod
    def apply_differential_privacy(model, optimizer, train_loader, noise_multiplier, max_grad_norm):
        """Wrap a model, optimizer, and data loader for differentially private training."""
        privacy_engine = PrivacyEngine()
        model, optimizer, train_loader = privacy_engine.make_private(
            module=model,
            optimizer=optimizer,
            data_loader=train_loader,
            noise_multiplier=noise_multiplier,
            max_grad_norm=max_grad_norm,
        )
        return model, optimizer, train_loader


# Optimization example
def optimization_demo():
    # Load the original model
    model = AutoModel.from_pretrained("bert-base-uncased")
    original_size = sum(p.numel() for p in model.parameters())
    print(f"Original parameter count: {original_size}")
    # Quantize the model
    quantized_model = ModelOptimizer.quantize_model(model)
    # Note: dynamic quantization converts Linear weights into packed int8
    # buffers, so they no longer appear in .parameters(); the drop below
    # reflects that repacking, not removed parameters
    quantized_size = sum(p.numel() for p in quantized_model.parameters())
    print(f"Parameter count after quantization: {quantized_size}")
    print(f"Reduction: {(1 - quantized_size / original_size) * 100:.2f}%")
    # Pruning (note: this should be applied after actual training)
    # pruned_model = ModelOptimizer.prune_model(model, pruning_rate=0.2)


optimization_demo()
```
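A more direct way to see the effect of quantization is to compare serialized model sizes rather than parameter counts. A small helper sketch (it writes a temporary file to disk):

```python
import os
import tempfile

def model_size_mb(model):
    """Serialize a model's state_dict and report its size in megabytes."""
    with tempfile.NamedTemporaryFile(delete=False) as f:
        torch.save(model.state_dict(), f.name)
        size = os.path.getsize(f.name) / 1e6
    os.remove(f.name)
    return size

model = AutoModel.from_pretrained("bert-base-uncased")
print(f"FP32 size: {model_size_mb(model):.1f} MB")
quantized = ModelOptimizer.quantize_model(model)
print(f"INT8 size: {model_size_mb(quantized):.1f} MB")
```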
## Summary

In this chapter we examined the most important variants and extensions of the Transformer architecture:

- BERT: achieves deep contextual understanding with a bidirectional encoder and the MLM and NSP pre-training tasks
- The GPT series: uses an autoregressive, decoder-only architecture that excels at text generation
- Multimodal models: CLIP and BLIP bridge vision and language for cross-modal understanding
- Mixture-of-experts models: expand model capacity through expert routing while preserving computational efficiency

These models have not only produced academic breakthroughs but also demonstrated enormous value in industrial applications. Understanding their principles and implementation details is essential for building advanced AI systems.

In the next chapter, we will walk through implementing text-classification tasks with PyTorch and TensorFlow, further consolidating our understanding of these models and how to apply them.