Adapter Fine-Tuning

1. Basic Principle

Adapter tuning adds a small number of trainable parameters at specific positions inside a pretrained model, typically right after the attention and feed-forward (fully connected) sublayers. These parameters form a small bottleneck network module, while the original model weights stay frozen.
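
Concretely, matching the Adapter module defined in section 2.1 below: for hidden size $d$ and bottleneck size $r \ll d$, each adapter computes

$$h \leftarrow h + W_{up}\,\sigma(W_{down}\,h), \qquad W_{down} \in \mathbb{R}^{r \times d},\ W_{up} \in \mathbb{R}^{d \times r}$$

Thanks to the residual connection and the near-zero weight initialization, each adapter starts out as (approximately) the identity mapping, so training begins from the unchanged behavior of the pretrained model.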

2. Code Example

2.1 Defining the Adapter

python
# -*- coding:utf-8 -*-

import torch
import torch.nn as nn

from transformers import BertModel, BertTokenizer
from torch.utils.data import DataLoader, Dataset


class Adapter(nn.Module):
    def __init__(self, input_dim, bottleneck_dim=64):
        super(Adapter, self).__init__()
        self.adapter_layer = nn.Sequential(
            nn.Linear(input_dim, bottleneck_dim),   # down-projection
            nn.ReLU(),
            nn.Linear(bottleneck_dim, input_dim)    # up-projection
        )
        # Near-zero init keeps each adapter close to the identity at the start
        nn.init.normal_(self.adapter_layer[0].weight, std=1e-3)
        nn.init.normal_(self.adapter_layer[2].weight, std=1e-3)

    def forward(self, x):
        # Residual connection: output = input + bottleneck transformation
        return x + self.adapter_layer(x)
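
To see how lightweight this is, a quick count of one adapter's parameters (assuming the 768-dimensional hidden size of bert-base):

python

adapter = Adapter(768)  # bottleneck_dim defaults to 64
n_params = sum(p.numel() for p in adapter.parameters())
print(n_params)  # 2*(768*64) weights + 64 + 768 biases = 99,136 per adapter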

2.2 Inserting Adapters at Specific Positions in BERT

python
class BertWithAdapter(nn.Module):
    def __init__(self, model_name, num_classes=2):
        super(BertWithAdapter, self).__init__()
        self.bert = BertModel.from_pretrained(model_name)

        # Freeze the entire backbone; only adapters and the classifier will train
        for param in self.bert.parameters():
            param.requires_grad = False

        for i in range(len(self.bert.encoder.layer)):
            # Inject an adapter after the attention output and after the FFN output
            # (768 = hidden size of bert-base)
            self.bert.encoder.layer[i].attention.output.add_module("adapter", Adapter(768))
            self.bert.encoder.layer[i].output.add_module("adapter", Adapter(768))
            self._inject_forward_logic(self.bert.encoder.layer[i])

        self.classifier = nn.Linear(768, num_classes)

    def _inject_forward_logic(self, layer):
        # Monkey-patch the two output sublayers so each adapter runs right
        # after the sublayer's original forward (i.e., after residual + LayerNorm)
        attn_out = layer.attention.output
        ffn_out = layer.output

        original_attn_forward = attn_out.forward
        def new_attn_forward(hidden_states, input_tensor):
            x = original_attn_forward(hidden_states, input_tensor)
            return attn_out.adapter(x)
        attn_out.forward = new_attn_forward

        original_ffn_forward = ffn_out.forward
        def new_ffn_forward(hidden_states, input_tensor):
            x = original_ffn_forward(hidden_states, input_tensor)
            return ffn_out.adapter(x)
        ffn_out.forward = new_ffn_forward

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        pooled_output = outputs[1]  # pooler output: [CLS] passed through dense + tanh
        return self.classifier(pooled_output)
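
A quick way to verify that only the adapters and the classifier receive gradients (a sketch; 'bert-base-chinese' is just an example checkpoint):

python

model = BertWithAdapter('bert-base-chinese')
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"trainable: {trainable:,} / {total:,} ({100 * trainable / total:.1f}%)")
# Every trainable parameter should belong to an adapter or the classifier
for name, p in model.named_parameters():
    if p.requires_grad:
        assert 'adapter' in name or name.startswith('classifier'), name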

2.3 Preparing the Data

python
class SimpleDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=32):
        # Tokenize everything up front, padding/truncating to a fixed length
        self.encodings = tokenizer(texts, truncation=True, padding='max_length', max_length=max_len, return_tensors="pt")
        self.labels = torch.tensor(labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)
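
A quick sanity check of what the dataset yields (assuming the bert-base-chinese tokenizer; any BERT tokenizer works the same way):

python

tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
ds = SimpleDataset(["这个电影真好看"], [1], tokenizer)
sample = ds[0]
print(sample['input_ids'].shape)  # torch.Size([32]) -- padded to max_len
print(sample['labels'])           # tensor(1)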

2.4 Fine-Tuning the BERT Model

python
def train(model_name):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Running on device: {device}")

    # Toy corpus: Chinese movie-review snippets (the model is a Chinese BERT);
    # 1 = positive, 0 = negative
    texts = ["这个电影真好看", "太难看了,浪费时间", "导演很有才华", "剧情一塌糊涂"]
    labels = [1, 0, 1, 0]

    tokenizer = BertTokenizer.from_pretrained(model_name)

    model = BertWithAdapter(model_name).to(device)

    dataset = SimpleDataset(texts, labels, tokenizer)
    loader = DataLoader(dataset, batch_size=2, shuffle=True)

    # Only the trainable parameters (adapters + classifier) need optimizing
    optimizer = torch.optim.AdamW(
        (p for p in model.parameters() if p.requires_grad), lr=1e-4
    )
    criterion = nn.CrossEntropyLoss()

    model.train()
    print("Starting training...")
    for epoch in range(5):
        total_loss = 0
        for batch in loader:
            optimizer.zero_grad()

            input_ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            t_ids = batch['token_type_ids'].to(device)  # pass token_type_ids explicitly
            targets = batch['labels'].to(device)

            logits = model(input_ids, mask, t_ids)
            loss = criterion(logits, targets)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss/len(loader):.4f}")

    model.eval()
    test_text = ["非常喜欢"]  # "really like it" -- expect a positive prediction
    test_enc = tokenizer(test_text, return_tensors="pt").to(device)
    with torch.no_grad():
        out = model(test_enc['input_ids'], test_enc['attention_mask'], test_enc.get('token_type_ids'))
        pred = torch.argmax(out, dim=1)
        print(f"Prediction: {'positive' if pred.item() == 1 else 'negative'}")

model_name = 'bert-base-chinese'  # Hugging Face hub id; replace with a local path if needed
train(model_name)
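
Since the backbone never changes, a checkpoint only needs the trainable weights. A minimal sketch (the helper names and file name are illustrative, not from the original post):

python

def save_adapters(model, path='adapter_weights.pt'):
    # Persist only adapters + classifier; the frozen BERT can be
    # re-created from its original checkpoint at load time.
    state = {k: v for k, v in model.state_dict().items()
             if 'adapter' in k or k.startswith('classifier')}
    torch.save(state, path)

def load_adapters(model, path='adapter_weights.pt'):
    # strict=False ignores the (frozen) backbone keys missing from the file.
    model.load_state_dict(torch.load(path), strict=False)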