Adapter Fine-Tuning

1. Basic Principle

Adapter tuning adds a small number of trainable parameters at specific positions in a pre-trained model, typically after the attention layer and the feed-forward (fully connected) layer of each Transformer block. These parameters form a small neural network module, while the original model weights stay frozen during fine-tuning.
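
Concretely, for a hidden state x of dimension d and a bottleneck dimension m (with m much smaller than d), the adapter implemented below computes a residual bottleneck transformation:

    output = x + W_up · ReLU(W_down · x + b_down) + b_up

where W_down is an m×d down-projection and W_up is a d×m up-projection. Because only these small matrices (plus the task head) are updated, the number of trainable parameters is a tiny fraction of the full model.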

2. Code Example

2.1 Defining the Adapter

python
# -*- coding:utf-8 -*-

import torch
import torch.nn as nn

from transformers import BertModel, BertTokenizer
from torch.utils.data import DataLoader, Dataset


class Adapter(nn.Module):
    """Residual bottleneck adapter: down-project, non-linearity, up-project."""
    def __init__(self, input_dim, bottleneck_dim=64):
        super(Adapter, self).__init__()
        self.adapter_layer = nn.Sequential(
            nn.Linear(input_dim, bottleneck_dim),  # down-projection
            nn.ReLU(),
            nn.Linear(bottleneck_dim, input_dim)   # up-projection
        )
        # Near-zero init so the adapter starts close to an identity mapping
        # and does not disturb the pre-trained representations early on.
        nn.init.normal_(self.adapter_layer[0].weight, std=1e-3)
        nn.init.normal_(self.adapter_layer[2].weight, std=1e-3)

    def forward(self, x):
        # Residual connection: the adapter only learns a small correction to x.
        return x + self.adapter_layer(x)
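
As a quick sanity check on the size of one adapter (a minimal sketch using the dimensions from the code above: input_dim=768 for BERT-base, bottleneck_dim=64):

python
# One adapter adds only ~0.1M parameters, versus ~100M for the frozen backbone.
adapter = Adapter(input_dim=768, bottleneck_dim=64)
num_params = sum(p.numel() for p in adapter.parameters())
print(num_params)  # 768*64 + 64 + 64*768 + 768 = 99,136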

2.2 Inserting Adapters at Specific Positions in BERT

python
class BertWithAdapter(nn.Module):
    def __init__(self, model_name, num_classes=2):
        super(BertWithAdapter, self).__init__()
        self.bert = BertModel.from_pretrained(model_name)
        hidden_size = self.bert.config.hidden_size  # 768 for bert-base

        # Freeze the entire pre-trained backbone; only the modules added below will train.
        for param in self.bert.parameters():
            param.requires_grad = False

        for i in range(len(self.bert.encoder.layer)):
            # Inject an adapter after the attention output and after the FFN
            # output of every encoder layer.
            self.bert.encoder.layer[i].attention.output.add_module("adapter", Adapter(hidden_size))
            self.bert.encoder.layer[i].output.add_module("adapter", Adapter(hidden_size))
            self._inject_forward_logic(self.bert.encoder.layer[i])

        self.classifier = nn.Linear(hidden_size, num_classes)

    def _inject_forward_logic(self, layer):
        # Monkey-patch the forward methods of BertSelfOutput and BertOutput so
        # each adapter runs after the original dense + LayerNorm computation.
        attn_out = layer.attention.output
        ffn_out = layer.output

        original_attn_forward = attn_out.forward
        def new_attn_forward(hidden_states, input_tensor):
            x = original_attn_forward(hidden_states, input_tensor)
            return attn_out.adapter(x)
        attn_out.forward = new_attn_forward

        original_ffn_forward = ffn_out.forward
        def new_ffn_forward(hidden_states, input_tensor):
            x = original_ffn_forward(hidden_states, input_tensor)
            return ffn_out.adapter(x)
        ffn_out.forward = new_ffn_forward

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        pooled_output = outputs[1]  # pooled [CLS] representation
        return self.classifier(pooled_output)
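
After construction, only the 24 injected adapters (two per layer × 12 layers) and the classifier head should require gradients. A quick check that the freezing logic works (the checkpoint name here is an assumption; any BERT-base checkpoint works):

python
model = BertWithAdapter('bert-base-chinese')
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
# Expect roughly 2.4M trainable parameters out of ~105M in total.
print(f"{trainable:,} trainable / {total:,} total")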

2.3 Preparing the Data

python
class SimpleDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=32):
        # Tokenize everything up front; fixed-length padding lets DataLoader
        # stack the tensors into batches directly.
        self.encodings = tokenizer(texts, truncation=True, padding='max_length', max_length=max_len, return_tensors="pt")
        self.labels = torch.tensor(labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)
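
Before wiring up the DataLoader, it can help to inspect a single sample to see which tensors the dataset yields (a small sketch; the tokenizer mirrors the training code in the next section):

python
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
ds = SimpleDataset(["这个电影真好看"], [1], tokenizer)
sample = ds[0]
print(sample.keys())              # input_ids, token_type_ids, attention_mask, labels
print(sample['input_ids'].shape)  # torch.Size([32]) -- padded to max_len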

2.4 Fine-Tuning the BERT Model

python
def train(model_name):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"正在运行设备: {device}")

    texts = ["这个电影真好看", "太难看了,浪费时间", "导演很有才华", "剧情一塌糊涂"]
    labels = [1, 0, 1, 0]

    tokenizer = BertTokenizer.from_pretrained(model_name)
    
    model = BertWithAdapter(model_name).to(device)

    dataset = SimpleDataset(texts, labels, tokenizer)
    loader = DataLoader(dataset, batch_size=2, shuffle=True)
    
    # Only the adapter and classifier parameters (requires_grad=True) are optimized.
    optimizer = torch.optim.AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-4)
    criterion = nn.CrossEntropyLoss()

    model.train()
    print("开始训练...")
    for epoch in range(5):
        total_loss = 0
        for batch in loader:
            optimizer.zero_grad()
            
            input_ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            t_ids = batch['token_type_ids'].to(device)  # move token_type_ids explicitly as well
            targets = batch['labels'].to(device)
            
            logits = model(input_ids, mask, t_ids)
            loss = criterion(logits, targets)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss/len(loader):.4f}")

    model.eval()
    test_text = ["非常喜欢"]  # "really like it" -- expected label: positive
    test_enc = tokenizer(test_text, return_tensors="pt").to(device)
    with torch.no_grad():
        out = model(test_enc['input_ids'], test_enc['attention_mask'], test_enc.get('token_type_ids'))
        pred = torch.argmax(out, dim=1)
        print(f"Prediction: {'positive' if pred.item() == 1 else 'negative'}")

model_name = 'your_bert-base-chinese'  # local checkpoint path (or the hub id 'bert-base-chinese')
train(model_name)
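
A practical payoff of this setup is checkpoint size: since the backbone is frozen, only the adapter and classifier weights need to be stored per task. A minimal sketch, meant to run at the end of train() where model is in scope (the filename is illustrative):

python
# Persist only the trainable weights -- a few MB instead of the full model.
adapter_state = {k: v for k, v in model.state_dict().items()
                 if 'adapter' in k or 'classifier' in k}
torch.save(adapter_state, 'adapter_sentiment.pt')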