Contents
[1 Load the tokenizer for the pretrained model](#1 Load the tokenizer for the pretrained model)
[2 Load the dataset](#2 Load the dataset)
[3 Preprocess the data](#3 Preprocess the data)
[4 Build the DataLoader](#4 Build the DataLoader)
[5 Define the downstream-task model](#5 Define the downstream-task model)
[6 Test code](#6 Test code)
[7 Training code](#7 Training code)
For a (Chinese or English) text classification task, a BERT-style model is a good fit: the [CLS] vector is what gets passed to the downstream head for classification.
A BERT-style model takes either a single sentence or a sentence pair (two sentences) as input; in this task each example is a single sentence.
1 Load the tokenizer for the pretrained model
```python
from transformers import AutoTokenizer

# use_fast=True selects the Rust-based tokenizer, which is faster than the pure-Python one
tokenizer = AutoTokenizer.from_pretrained('../data/model/distilbert-base-uncased/', use_fast=True)
tokenizer
```
```
DistilBertTokenizerFast(name_or_path='../data/model/distilbert-base-uncased/', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False),
```
```python
# trial encoding
tokenizer.batch_encode_plus(['hello, everyone, today is a good day',
                             'how are you, fine thank you, and you?'])
# the encoding returns 'input_ids' and 'attention_mask'
```
```
{'input_ids': [[101, 7592, 1010, 3071, 1010, 2651, 2003, 1037, 2204, 2154, 102], [101, 2129, 2024, 2017, 1010, 2986, 4067, 2017, 1010, 1998, 2017, 1029, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}
```
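To sanity-check what these ids mean, they can be mapped back to tokens; a minimal sketch reusing the tokenizer loaded above (the variable name `encoded` is only for illustration):

```python
# encode one sentence and map the ids back to tokens; [CLS] and [SEP] are added automatically
encoded = tokenizer.batch_encode_plus(['hello, everyone, today is a good day'])
print(tokenizer.convert_ids_to_tokens(encoded['input_ids'][0]))
# ['[CLS]', 'hello', ',', 'everyone', ',', 'today', 'is', 'a', 'good', 'day', '[SEP]']
```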
2 Load the dataset
```python
from datasets import load_dataset

dataset = load_dataset('../data/datasets/cola/', trust_remote_code=True)
dataset
```
```
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8551
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 527
    })
})
```
```python
dataset['train'][0]
```
```
{'text': "Our friends won't buy this analysis, let alone the next one we propose.", 'label': 1}
```
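This is the CoLA (Corpus of Linguistic Acceptability) dataset, so label 1 marks a grammatically acceptable sentence and 0 an unacceptable one. As a quick optional check, the label distribution of the training split can be inspected:

```python
import collections

# count how many acceptable (1) and unacceptable (0) sentences the training split contains
print(collections.Counter(dataset['train']['label']))
```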
3 Preprocess the data
```python
def f(examples, tokenizer):
    """Tokenize only the 'text' field of the incoming batch."""
    return tokenizer.batch_encode_plus(examples['text'], truncation=True)

dataset = dataset.map(f,
                      batched=True,
                      batch_size=1000,          # process 1000 examples per call
                      num_proc=1,               # a single process; with a small dataset, spawning extra worker processes costs more in startup time than it saves
                      remove_columns=['text'],  # drop the original 'text' column; the encoded 'input_ids' etc. replace it
                      fn_kwargs={'tokenizer': tokenizer})
```
```python
print(dataset['train'][0])
```
```
{'label': 1, 'input_ids': [101, 2256, 2814, 2180, 1005, 1056, 4965, 2023, 4106, 1010, 2292, 2894, 1996, 2279, 2028, 2057, 16599, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
```
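Note that the examples are tokenized but not yet padded: sequences still have different lengths, and truncation=True only caps them at the tokenizer's model_max_length (512). A small optional check of the longest training sequence:

```python
# longest tokenized sequence in the training split (padding is deferred to the DataLoader's collate_fn)
print(max(len(ids) for ids in dataset['train']['input_ids']))
```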
4 Build the DataLoader
```python
# within a batch every sequence must have the same length, otherwise they cannot be stacked into a tensor
import torch
# DataCollatorWithPadding pads each batch on the fly so that all sequences in it end up the same length
from transformers.data.data_collator import DataCollatorWithPadding

loader = torch.utils.data.DataLoader(
    dataset=dataset['train'],
    batch_size=8,
    # instantiate a collate_fn that batches the examples and pads them to a common length
    collate_fn=DataCollatorWithPadding(tokenizer),
    shuffle=True,
    drop_last=True)

# grab one batch for inspection, then stop
for data in loader:
    break

# data contains 'input_ids', 'attention_mask' and 'labels'
data
```
{'input_ids': tensor([[ 101, 2043, 3021, 5610, 2015, 1010, 2035, 1996, 2062, 2515, 6294, 5223, 2032, 1012, 102, 0, 0, 0, 0, 0, 0], [ 101, 2057, 4687, 2008, 3021, 2187, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [ 101, 2008, 2008, 2005, 5106, 15721, 2000, 5466, 1037, 4906, 2052, 28679, 1996, 4932, 2001, 5793, 2003, 2025, 2995, 1012, 102], [ 101, 1996, 2214, 3899, 2351, 2035, 1996, 2126, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [ 101, 1045, 2215, 2009, 2000, 3961, 1037, 3595, 2008, 3021, 2187, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0], [ 101, 2027, 2700, 2032, 2637, 1005, 1055, 17089, 2343, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [ 101, 1996, 2795, 2003, 2936, 2084, 1996, 2341, 2003, 2898, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0], [ 101, 6294, 9619, 2098, 2000, 3046, 2000, 4025, 2000, 2031, 2042, 4782, 1012, 102, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]]), 'labels': tensor([1, 0, 1, 0, 1, 1, 1, 1])}
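DataCollatorWithPadding behaves roughly like a collate function built on top of tokenizer.pad; a simplified sketch of the idea (illustrative only, not the library's actual implementation, and `manual_collate` is a hypothetical name):

```python
def manual_collate(features):
    # features: a list of dicts with 'input_ids', 'attention_mask' and 'label'
    labels = torch.tensor([f.pop('label') for f in features])
    # pad every sequence to the length of the longest one in the batch and return tensors
    batch = tokenizer.pad(features, padding=True, return_tensors='pt')
    batch['labels'] = labels
    return batch
```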
```python
len(loader)  # number of batches: 8551 // 8 = 1068 (drop_last=True discards the remainder)
```
```
1068
```
5 Define the downstream-task model
```python
from transformers import AutoModelForSequenceClassification, DistilBertModel
```
```python
# inspect the model's parameters and layer structure
model_pretrained_parameters = AutoModelForSequenceClassification.from_pretrained('../data/model/distilbert-base-uncased/', num_labels=2)
model_pretrained_parameters
```
```
DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_features=768, out_features=3072, bias=True)
            (lin2): Linear(in_features=3072, out_features=768, bias=True)
            (activation): GELUActivation()
          )
          (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        )
      )
    )
  )
  (pre_classifier): Linear(in_features=768, out_features=768, bias=True)
  (classifier): Linear(in_features=768, out_features=2, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)
```
```python
class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # pretrained DistilBERT backbone (without a classification head)
        self.model_pretrained = DistilBertModel.from_pretrained('../data/model/distilbert-base-uncased/')
        # fully connected classification head;
        # the backbone's hidden size is 768, so the first Linear layer must take 768 inputs
        self.fc = torch.nn.Sequential(torch.nn.Linear(768, 768),
                                      torch.nn.ReLU(),
                                      torch.nn.Dropout(p=0.2),
                                      torch.nn.Linear(768, 2))  # binary classification (CoLA: 1 = acceptable, 0 = unacceptable)
        # load the sequence-classification variant of the checkpoint
        model_pretrained_parameters = AutoModelForSequenceClassification.from_pretrained('../data/model/distilbert-base-uncased/',
                                                                                         num_labels=2)  # number of label classes
        # initialise the fully connected layers from that model's pre_classifier and classifier layers
        self.fc[0].load_state_dict(model_pretrained_parameters.pre_classifier.state_dict())
        self.fc[3].load_state_dict(model_pretrained_parameters.classifier.state_dict())
        # loss function
        self.criterion = torch.nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        # run the backbone; last_hidden_state is 3-dimensional: [batch_size, seq_len, 768]
        logits = self.model_pretrained(input_ids=input_ids, attention_mask=attention_mask)
        # keep only position 0, the [CLS] token, which reduces this to 2 dimensions: [batch_size, 768];
        # [CLS] is always the first token of every sequence, so its position is fixed
        logits = logits.last_hidden_state[:, 0]
        # classification head -> [batch_size, 2]
        logits = self.fc(logits)
        # compute the loss only if labels were passed in
        loss = None
        if labels is not None:
            loss = self.criterion(logits, labels)
        return {'loss': loss, 'logits': logits}

model = Model()
# total number of model parameters
print(sum(i.numel() for i in model.parameters()))
```
```
66955010
```
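Optionally, the DistilBERT backbone can be frozen so that only the classification head is trained (cheaper, usually somewhat less accurate). This is not done in the rest of this walkthrough; a minimal sketch of how it would look:

```python
# freeze the backbone so that only self.fc receives gradient updates (optional, not used below)
for param in model.model_pretrained.parameters():
    param.requires_grad = False

# number of parameters that would still be trained
print(sum(p.numel() for p in model.parameters() if p.requires_grad))
```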
```python
# trial run of the downstream-task model on the batch fetched earlier
out = model(**data)  # out is a dict containing the loss and the logits
print(out['loss'], out['logits'], out['logits'].shape)
# out['logits'].shape == torch.Size([8, 2]): 8 examples per batch, 2 class scores each
# (the class with the larger score is the predicted class)
```
```
tensor(0.6448, grad_fn=<NllLossBackward0>) tensor([[-0.0228, 0.0688],
        [-0.1635, -0.0205],
        [-0.1123, 0.0630],
        [-0.0492, 0.0820],
        [-0.1185, 0.1382],
        [-0.1488, 0.1725],
        [-0.0806, 0.0836],
        [-0.0384, 0.0721]], grad_fn=<AddmmBackward0>) torch.Size([8, 2])
```
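To read the logits as probabilities, a softmax over the class dimension can be applied; a small optional sketch reusing `out` from the cell above:

```python
# convert the raw logits into class probabilities and hard predictions
probs = out['logits'].softmax(dim=1)   # shape [8, 2], each row sums to 1
preds = probs.argmax(dim=1)            # predicted class per example
print(preds)
```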
```python
# check that the labels of the test split are valid (i.e. not -1)
dataset['test'][0]
```
```
{'label': 1, 'input_ids': [101, 1996, 11279, 8469, 1996, 9478, 3154, 1997, 1996, 5749, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
```
6 Test code
```python
def test(model):
    model.eval()  # switch to evaluation mode for testing/prediction
    # build a data loader for the test split
    loader_test = torch.utils.data.DataLoader(
        dataset=dataset['test'],
        batch_size=16,  # evaluation runs on the CPU without gradients, so the batch size can be a bit larger
        # DataCollatorWithPadding(tokenizer) must be instantiated here, otherwise an error is raised;
        # it pads each batch so that all sequences have the same length
        collate_fn=DataCollatorWithPadding(tokenizer),
        shuffle=True,
        drop_last=True)

    outs = []    # predicted classes
    labels = []  # ground-truth labels
    for i, data in enumerate(loader_test):
        # no gradients are needed for prediction
        with torch.no_grad():
            # out is a dict containing the loss and the logits
            out = model(**data)
        # out['logits'] has shape (batch_size, number of classes); take the argmax as the prediction
        outs.append(out['logits'].argmax(dim=1))
        labels.append(data['labels'])
        if i % 10 == 0:  # print progress every 10 batches
            print(i)
        if i == 50:      # evaluate at most 50 batches
            break

    # concatenate the per-batch predictions and labels
    outs = torch.cat(outs)
    labels = torch.cat(labels)
    # compute the accuracy
    accuracy = (outs == labels).sum().item() / len(labels)
    print('accuracy:', accuracy)
```
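Accuracy is easy to read, but CoLA results are conventionally reported as the Matthews correlation coefficient (MCC), which is more informative on a skewed label distribution. A sketch of the extra lines one could add at the end of test(), assuming scikit-learn is installed (this is an optional addition, not part of the walkthrough above):

```python
# optional extra metric: Matthews correlation coefficient, the standard CoLA metric;
# place after outs and labels have been concatenated inside test()
from sklearn.metrics import matthews_corrcoef

mcc = matthews_corrcoef(labels.numpy(), outs.numpy())
print('mcc:', mcc)
```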
```python
test(model)
```
```
0
10
20
30
accuracy: 0.693359375
```
7 Training code
```python
from transformers import AdamW  # AdamW optimiser (deprecated in newer transformers releases; torch.optim.AdamW is the recommended replacement)
from transformers.optimization import get_scheduler  # learning-rate decay schedule

# select the device
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device
```
```
device(type='cuda', index=0)
```
```python
# training code
def train():
    # training needs gradient descent to update the model parameters for the best predictions
    # define the optimiser
    optimizer = AdamW(model.parameters(), betas=(0.9, 0.999), eps=1e-8, lr=2e-5)  # standard Adam settings; lr=2e-5 is a typical fine-tuning learning rate
    # learning-rate decay schedule
    scheduler = get_scheduler(name='linear',
                              num_warmup_steps=0,  # no warm-up phase: the learning rate decays from the very first step
                              num_training_steps=len(loader),
                              optimizer=optimizer)

    # move the model to the device
    model.to(device)
    model.train()  # training mode

    for i, data in enumerate(loader):
        # unpack the batch
        input_ids, attention_mask, labels = data['input_ids'], data['attention_mask'], data['labels']
        # move the tensors to the device
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        # forward pass; out is a dict with the loss and the logits (class scores)
        out = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        # get the loss from the output dict
        loss = out['loss']
        # backpropagate the loss
        loss.backward()
        # clip the gradient norm to 1.0 for training stability
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # update the parameters
        optimizer.step()
        scheduler.step()  # advance the learning-rate schedule
        # reset the gradients
        optimizer.zero_grad()
        model.zero_grad()

        if i % 50 == 0:
            lr = optimizer.state_dict()['param_groups'][0]['lr']
            # predicted class = index of the larger logit
            out = out['logits'].argmax(dim=1)
            # accuracy on this batch
            accuracy = (labels == out).sum().item() / 8  # batch_size = 8
            print(i, loss.item(), lr, accuracy)

    print()
```
```python
train()
```
```
0 0.6603636145591736 1.9981273408239703e-05 0.75
50 0.6770923733711243 1.9044943820224723e-05 0.625
100 0.5856966972351074 1.810861423220974e-05 0.75
150 0.5937663316726685 1.7172284644194758e-05 0.75
200 0.5329931974411011 1.6235955056179777e-05 0.75
250 0.47660014033317566 1.5299625468164797e-05 0.875
300 0.22391566634178162 1.4363295880149814e-05 0.875
350 0.2534029185771942 1.3426966292134834e-05 1.0
400 0.5150715112686157 1.2490636704119851e-05 0.75
450 0.5376325845718384 1.155430711610487e-05 0.75
500 0.48840606212615967 1.0617977528089888e-05 0.875
550 0.40059715509414673 9.681647940074908e-06 0.875
600 0.679754376411438 8.745318352059925e-06 0.75
650 0.21557165682315826 7.808988764044945e-06 0.875
700 0.6123908758163452 6.872659176029963e-06 0.75
750 0.4683417081832886 5.936329588014982e-06 0.75
800 0.38990333676338196 5e-06 0.875
850 0.43256130814552307 4.063670411985019e-06 0.75
900 0.32022809982299805 3.1273408239700374e-06 0.875
950 0.9173805713653564 2.1910112359550564e-06 0.625
1000 0.42855364084243774 1.2546816479400751e-06 0.875
1050 0.4637509882450104 3.183520599250937e-07 0.75
```
```python
# after training, evaluate again
test(model.to('cpu'))  # the test data is on the CPU, so the GPU-trained model has to be moved back to the CPU
```
```
0
10
20
30
accuracy: 0.779296875
```
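Finally, it is usually worth persisting the fine-tuned weights so that training does not have to be repeated; a minimal sketch using plain PyTorch state dicts (the file path is only illustrative):

```python
# save the fine-tuned weights (the path is illustrative)
torch.save(model.state_dict(), '../data/model/distilbert_cola_finetuned.pt')

# later: rebuild the model and load the weights back before calling test() again
model_reloaded = Model()
model_reloaded.load_state_dict(torch.load('../data/model/distilbert_cola_finetuned.pt'))
```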