1. LoRA Fine-Tuning

This section walks through a complete parameter-efficient fine-tuning pipeline for a BERT-based news-title classifier, split into five files: the data loader, the model, the evaluator, the training entry point, and the prediction script.
loader:
```python
# -*- coding: utf-8 -*-
import json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer

"""
Data loading
"""

class DataGenerator(Dataset):
    def __init__(self, data_path, config):
        self.config = config
        self.path = data_path
        self.index_to_label = {0: '家居', 1: '房产', 2: '股票', 3: '社会', 4: '文化',
                               5: '国际', 6: '教育', 7: '军事', 8: '彩票', 9: '旅游',
                               10: '体育', 11: '科技', 12: '汽车', 13: '健康',
                               14: '娱乐', 15: '财经', 16: '时尚', 17: '游戏'}
        self.label_to_index = dict((y, x) for x, y in self.index_to_label.items())
        self.config["class_num"] = len(self.index_to_label)
        if self.config["model_type"] == "bert":
            self.tokenizer = BertTokenizer.from_pretrained(config["pretrain_model_path"])
        self.vocab = load_vocab(config["vocab_path"])
        self.config["vocab_size"] = len(self.vocab)
        self.load()

    def load(self):
        self.data = []
        with open(self.path, encoding="utf8") as f:
            for line in f:
                line = json.loads(line)
                tag = line["tag"]
                label = self.label_to_index[tag]
                title = line["title"]
                if self.config["model_type"] == "bert":
                    # pad_to_max_length is deprecated in current transformers;
                    # padding="max_length" plus truncation=True is the replacement
                    input_id = self.tokenizer.encode(title,
                                                     max_length=self.config["max_length"],
                                                     padding="max_length",
                                                     truncation=True)
                else:
                    input_id = self.encode_sentence(title)
                input_id = torch.LongTensor(input_id)
                label_index = torch.LongTensor([label])
                self.data.append([input_id, label_index])
        return

    def encode_sentence(self, text):
        input_id = []
        for char in text:
            input_id.append(self.vocab.get(char, self.vocab["[UNK]"]))
        input_id = self.padding(input_id)
        return input_id

    # Pad or truncate the input sequence so a whole batch can be processed together
    def padding(self, input_id):
        input_id = input_id[:self.config["max_length"]]
        input_id += [0] * (self.config["max_length"] - len(input_id))
        return input_id

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index]


def load_vocab(vocab_path):
    token_dict = {}
    with open(vocab_path, encoding="utf8") as f:
        for index, line in enumerate(f):
            token = line.strip()
            token_dict[token] = index + 1  # index 0 is reserved for padding, so start from 1
    return token_dict


# Wrap the dataset with torch's built-in DataLoader
def load_data(data_path, config, shuffle=True):
    dg = DataGenerator(data_path, config)
    dl = DataLoader(dg, batch_size=config["batch_size"], shuffle=shuffle)
    return dl


if __name__ == "__main__":
    from config import Config
    dg = DataGenerator("valid_tag_news.json", Config)
    print(dg[1])
```
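Every file in this section imports `Config` from a config.py that is not shown. For reference, here is a minimal sketch of the dict it would need, inferred from the keys the scripts actually read; all concrete values (paths, hyperparameters) are illustrative assumptions, not the original settings:

```python
# config.py -- illustrative sketch; every value is an assumption inferred
# from the keys accessed in loader.py, model.py, evaluate.py, and main.py.
Config = {
    "model_path": "output",                     # checkpoint directory
    "train_data_path": "train_tag_news.json",
    "valid_data_path": "valid_tag_news.json",
    "vocab_path": "chars.txt",
    "pretrain_model_path": "bert-base-chinese",
    "model_type": "bert",
    "max_length": 30,
    "batch_size": 64,
    "optimizer": "adam",
    "learning_rate": 1e-3,
    "epoch": 10,
    "seed": 987,
    "tuning_tactics": "lora_tuning",            # or p_tuning / prompt_tuning / prefix_tuning
    # class_num and vocab_size are filled in at runtime by DataGenerator
}
```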
model:
```python
from config import Config
from transformers import AutoModelForSequenceClassification
from torch.optim import Adam, SGD

# Load BERT with a sequence-classification head. num_labels is set explicitly
# to match the 18 news categories defined in loader.py, in case the
# checkpoint's own config defaults to 2.
TorchModel = AutoModelForSequenceClassification.from_pretrained(
    Config["pretrain_model_path"], num_labels=18)

def choose_optimizer(config, model):
    optimizer = config["optimizer"]
    learning_rate = config["learning_rate"]
    if optimizer == "adam":
        return Adam(model.parameters(), lr=learning_rate)
    elif optimizer == "sgd":
        return SGD(model.parameters(), lr=learning_rate)
```
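The LoRA configs later in this section target modules named "query", "key", and "value". A quick optional way to confirm those names exist in the loaded model (a sketch; the printed module paths assume a standard BERT checkpoint):

```python
# Print the attention projections that target_modules=["query", "key", "value"]
# will match by name suffix.
from model import TorchModel

for name, _ in TorchModel.named_modules():
    if name.endswith(("query", "key", "value")):
        print(name)  # e.g. bert.encoder.layer.0.attention.self.query
```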
evaluate:
```python
# -*- coding: utf-8 -*-
import torch
from loader import load_data

"""
Model evaluation
"""

class Evaluator:
    def __init__(self, config, model, logger):
        self.config = config
        self.model = model
        self.logger = logger
        self.valid_data = load_data(config["valid_data_path"], config, shuffle=False)
        self.stats_dict = {"correct": 0, "wrong": 0}  # stores evaluation results

    def eval(self, epoch):
        self.logger.info("Evaluating model after epoch %d:" % epoch)
        self.model.eval()
        self.stats_dict = {"correct": 0, "wrong": 0}  # reset last round's results
        for index, batch_data in enumerate(self.valid_data):
            if torch.cuda.is_available():
                batch_data = [d.cuda() for d in batch_data]
            input_ids, labels = batch_data  # adjust here if the model takes or returns more tensors
            with torch.no_grad():
                pred_results = self.model(input_ids)[0]  # index 0 of the output is the logits
            self.write_stats(labels, pred_results)
        acc = self.show_stats()
        return acc

    def write_stats(self, labels, pred_results):
        for true_label, pred_label in zip(labels, pred_results):
            pred_label = torch.argmax(pred_label)
            if int(true_label) == int(pred_label):
                self.stats_dict["correct"] += 1
            else:
                self.stats_dict["wrong"] += 1
        return

    def show_stats(self):
        correct = self.stats_dict["correct"]
        wrong = self.stats_dict["wrong"]
        self.logger.info("Total evaluation samples: %d" % (correct + wrong))
        self.logger.info("Correct: %d, wrong: %d" % (correct, wrong))
        self.logger.info("Accuracy: %f" % (correct / (correct + wrong)))
        self.logger.info("--------------------")
        return correct / (correct + wrong)
```
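A minimal standalone run of the evaluator, e.g. to get a baseline accuracy for the untuned model (a sketch, assuming the config.py and model.py from this section are importable):

```python
# Sketch: evaluate the base model once, outside the training loop.
import logging
from config import Config
from model import TorchModel
from evaluate import Evaluator

logging.basicConfig(level=logging.INFO)
evaluator = Evaluator(Config, TorchModel, logging.getLogger(__name__))
evaluator.eval(0)
```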
main:
```python
# -*- coding: utf-8 -*-
import torch
import os
import random
import numpy as np
import torch.nn as nn
import logging
from config import Config
from model import TorchModel, choose_optimizer
from evaluate import Evaluator
from loader import load_data
from peft import get_peft_model, LoraConfig, \
    PromptTuningConfig, PrefixTuningConfig, PromptEncoderConfig

# Log levels: [DEBUG, INFO, WARNING, ERROR, CRITICAL]
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

"""
Model training entry point
"""

# Fix random seeds for reproducibility
seed = Config["seed"]
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

def main(config):
    # Create the directory for saving models
    if not os.path.isdir(config["model_path"]):
        os.mkdir(config["model_path"])
    # Load the training data
    train_data = load_data(config["train_data_path"], config)
    # Load the model
    model = TorchModel
    # Parameter-efficient fine-tuning strategy
    tuning_tactics = config["tuning_tactics"]
    if tuning_tactics == "lora_tuning":
        peft_config = LoraConfig(
            r=8,
            lora_alpha=32,
            lora_dropout=0.1,
            target_modules=["query", "key", "value"]
        )
    elif tuning_tactics == "p_tuning":
        peft_config = PromptEncoderConfig(task_type="SEQ_CLS", num_virtual_tokens=10)
    elif tuning_tactics == "prompt_tuning":
        peft_config = PromptTuningConfig(task_type="SEQ_CLS", num_virtual_tokens=10)
    elif tuning_tactics == "prefix_tuning":
        peft_config = PrefixTuningConfig(task_type="SEQ_CLS", num_virtual_tokens=10)
    else:
        raise ValueError("unknown tuning_tactics: %s" % tuning_tactics)
    model = get_peft_model(model, peft_config)
    if tuning_tactics == "lora_tuning":
        # The LoRA config freezes all weights of the original model so they
        # receive no gradients. We still want the final classification layer
        # to train normally (only the BERT body should be frozen), so we
        # re-enable its gradients by hand.
        for param in model.get_submodule("model").get_submodule("classifier").parameters():
            param.requires_grad = True
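    # Optional sanity check (peft's PeftModel API): report trainable vs. total
    # parameter counts, confirming only the LoRA matrices and the classifier
    # head remain trainable.
    model.print_trainable_parameters()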
    # Move the model to GPU if one is available
    cuda_flag = torch.cuda.is_available()
    if cuda_flag:
        logger.info("GPU available, moving model to GPU")
        model = model.cuda()
    # Set up the optimizer
    optimizer = choose_optimizer(config, model)
    # Set up the evaluator
    evaluator = Evaluator(config, model, logger)
    # Training loop
    for epoch in range(config["epoch"]):
        epoch += 1
        model.train()
        logger.info("epoch %d begin" % epoch)
        train_loss = []
        for index, batch_data in enumerate(train_data):
            if cuda_flag:
                batch_data = [d.cuda() for d in batch_data]
            optimizer.zero_grad()
            input_ids, labels = batch_data  # adjust here if the model takes or returns more tensors
            output = model(input_ids)[0]
            loss = nn.CrossEntropyLoss()(output, labels.view(-1))
            loss.backward()
            optimizer.step()
            train_loss.append(loss.item())
            # Log twice per epoch; max(1, ...) guards against tiny datasets
            if index % max(1, len(train_data) // 2) == 0:
                logger.info("batch loss %f" % loss)
        logger.info("epoch average loss: %f" % np.mean(train_loss))
        acc = evaluator.eval(epoch)
    model_path = os.path.join(config["model_path"], "%s.pth" % tuning_tactics)
    save_tunable_parameters(model, model_path)  # save only the trainable weights
    return acc

def save_tunable_parameters(model, path):
    saved_params = {
        k: v.to("cpu")
        for k, v in model.named_parameters()
        if v.requires_grad
    }
    torch.save(saved_params, path)

if __name__ == "__main__":
    main(Config)
```
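For intuition on why the saved checkpoint is tiny: with r=8, each adapted weight matrix gains two low-rank factors A and B instead of a full update. A back-of-envelope count, assuming bert-base dimensions (hidden size 768, 12 layers):

```python
# LoRA trainable-parameter estimate (assumed dims: bert-base, hidden=768, 12 layers)
hidden, r = 768, 8
per_matrix = 2 * hidden * r     # A is r x hidden, B is hidden x r
matrices = 3 * 12               # query/key/value in each of 12 layers
print(per_matrix * matrices)    # 442368 -- vs ~110M parameters in BERT-base
```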
pred:
```python
import torch
import logging
from model import TorchModel
from peft import get_peft_model, LoraConfig, PromptTuningConfig, PrefixTuningConfig, PromptEncoderConfig
from evaluate import Evaluator
from config import Config

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Parameter-efficient fine-tuning strategy
tuning_tactics = Config["tuning_tactics"]
print("Using %s" % tuning_tactics)

if tuning_tactics == "lora_tuning":
    peft_config = LoraConfig(
        r=8,
        lora_alpha=32,
        lora_dropout=0.1,
        target_modules=["query", "key", "value"]
    )
elif tuning_tactics == "p_tuning":
    peft_config = PromptEncoderConfig(task_type="SEQ_CLS", num_virtual_tokens=10)
elif tuning_tactics == "prompt_tuning":
    peft_config = PromptTuningConfig(task_type="SEQ_CLS", num_virtual_tokens=10)
elif tuning_tactics == "prefix_tuning":
    peft_config = PrefixTuningConfig(task_type="SEQ_CLS", num_virtual_tokens=10)
else:
    raise ValueError("unknown tuning_tactics: %s" % tuning_tactics)

# Rebuild the model with the same PEFT wrapper used during training
model = TorchModel
model = get_peft_model(model, peft_config)
state_dict = model.state_dict()

# Load the fine-tuned weights saved by the training script and merge them in
loaded_weight = torch.load('output/%s.pth' % tuning_tactics, map_location="cpu")
print(loaded_weight.keys())
state_dict.update(loaded_weight)

# Reload the updated state dict into the model
model.load_state_dict(state_dict)

# Run one evaluation pass
if torch.cuda.is_available():
    model = model.cuda()
evaluator = Evaluator(Config, model, logger)
evaluator.eval(0)
```
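Run the training script first: it writes output/&lt;tuning_tactics&gt;.pth containing only the tunable parameters. The prediction script then rebuilds the identical PEFT-wrapped model, overlays those saved weights on top of the frozen pretrained ones, and reports validation accuracy.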