NLP任务之文本分类(情感分析)

[1 加载预训练模型对应的分词器](#1 加载预训练模型对应的分词器)

[2 加载数据集](#2 加载数据集)

[3 数据预处理](#3 数据预处理)

[4 构建数据加载器DataLoader](#4 构建数据加载器DataLoader)

[5 定义下游任务模型](#5 定义下游任务模型)

[6 测试代码](#6 测试代码)

[7 训练代码](#7 训练代码)

#做（中文或英文的）文本分类任务时，BERT 类模型比较合适：取 [CLS] 位置的输出向量，传给下游的分类层完成分类。

#BERT 类模型的输入可以是单个句子，也可以是一个句子对（两句话）；本文的分类任务每条样本只传入单个句子。

1 加载预训练模型对应的分词器

python 复制代码

from transformers import AutoTokenizer


# use_fast=True asks for the Rust-backed "fast" tokenizer, which is quicker than the pure-Python one.
tokenizer = AutoTokenizer.from_pretrained('../data/model/distilbert-base-uncased/', use_fast=True)

# Bare expression: when run as a notebook cell this displays the tokenizer's repr.
tokenizer

复制代码

DistilBertTokenizerFast(name_or_path='../data/model/distilbert-base-uncased/', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False),

python 复制代码

# Trial run: encode two sample sentences in one call.
tokenizer.batch_encode_plus(['hello, everyone, today is a good day', 
                             'how are you, fine thank you, and you?'])
# The returned encoding contains 'input_ids' and 'attention_mask'.

复制代码

{'input_ids': [[101, 7592, 1010, 3071, 1010, 2651, 2003, 1037, 2204, 2154, 102], [101, 2129, 2024, 2017, 1010, 2986, 4067, 2017, 1010, 1998, 2017, 1029, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

2 加载数据集

python 复制代码

from datasets import load_dataset


# Load the dataset from a local directory (presumably the CoLA corpus, judging by the path).
# NOTE(review): trust_remote_code=True lets the dataset's own loading script run; only use it
# with data sources you trust.
dataset = load_dataset('../data/datasets/cola/', trust_remote_code=True)
dataset

复制代码

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8551
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 527
    })
})

python 复制代码

# Look at the first raw training example (a 'text' string and its 'label').
dataset['train'][0]

复制代码

{'text': "Our friends won't buy this analysis, let alone the next one we propose.",
 'label': 1}

3 数据预处理

python 复制代码

def f(examples, tokenizer):
    """Tokenize the 'text' column of a batch of examples.

    Args:
        examples: mapping with a 'text' entry holding the sentences of the batch.
        tokenizer: object exposing ``batch_encode_plus``.

    Returns:
        Whatever ``tokenizer.batch_encode_plus`` returns for the sentences,
        encoded with truncation enabled. Only 'text' is encoded; other
        columns of ``examples`` are ignored.
    """
    sentences = examples['text']
    encoded = tokenizer.batch_encode_plus(sentences, truncation=True)
    return encoded

dataset = dataset.map(f,
                      batched=True,
                      batch_size=1000,  # f receives up to 1000 examples per call
                      # num_proc=1 runs the mapping in a single process. For a small dataset this
                      # is the faster choice because spawning worker processes has its own cost;
                      # raise it (e.g. towards the CPU core count) only for large datasets.
                      num_proc=1,
                      remove_columns=['text'],   # drop the raw 'text' column; the columns returned by f replace it
                      fn_kwargs={'tokenizer': tokenizer})

python 复制代码

# After map(): 'text' is gone and the example now carries 'input_ids' and 'attention_mask'.
print(dataset['train'][0])

复制代码

{'label': 1, 'input_ids': [101, 2256, 2814, 2180, 1005, 1056, 4965, 2023, 4106, 1010, 2292, 2894, 1996, 2279, 2028, 2057, 16599, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

4 构建数据加载器DataLoader

python 复制代码

#一批数据传输时，每句话的长度必须相同， 否则无法参与矩阵运算
import torch
#DataCollatorWithPadding 读取数据时，自动补全padding，使句子长度相同
from transformers.data.data_collator import DataCollatorWithPadding


# Training loader: shuffled batches of 8 examples, incomplete final batch dropped.
# DataCollatorWithPadding(tokenizer) is the collate function; it pads the sequences
# of each batch to a common length so they can be stacked into tensors.
loader = torch.utils.data.DataLoader(
    dataset['train'],
    batch_size=8,
    shuffle=True,
    drop_last=True,
    collate_fn=DataCollatorWithPadding(tokenizer),
)

# Pull one batch for inspection; it holds 'input_ids', 'attention_mask' and 'labels'.
data = next(iter(loader))


data

复制代码

{'input_ids': tensor([[  101,  2043,  3021,  5610,  2015,  1010,  2035,  1996,  2062,  2515,
          6294,  5223,  2032,  1012,   102,     0,     0,     0,     0,     0,
             0],
        [  101,  2057,  4687,  2008,  3021,  2187,  1012,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0],
        [  101,  2008,  2008,  2005,  5106, 15721,  2000,  5466,  1037,  4906,
          2052, 28679,  1996,  4932,  2001,  5793,  2003,  2025,  2995,  1012,
           102],
        [  101,  1996,  2214,  3899,  2351,  2035,  1996,  2126,  1012,   102,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0],
        [  101,  1045,  2215,  2009,  2000,  3961,  1037,  3595,  2008,  3021,
          2187,  1012,   102,     0,     0,     0,     0,     0,     0,     0,
             0],
        [  101,  2027,  2700,  2032,  2637,  1005,  1055, 17089,  2343,  1012,
           102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0],
        [  101,  1996,  2795,  2003,  2936,  2084,  1996,  2341,  2003,  2898,
          1012,   102,     0,     0,     0,     0,     0,     0,     0,     0,
             0],
        [  101,  6294,  9619,  2098,  2000,  3046,  2000,  4025,  2000,  2031,
          2042,  4782,  1012,   102,     0,     0,     0,     0,     0,     0,
             0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]]), 'labels': tensor([1, 0, 1, 0, 1, 1, 1, 1])}

python 复制代码

# Number of batches per pass over the training loader (full batches only, since drop_last=True).
len(loader)

1068

5 定义下游任务模型

python 复制代码

from transformers import AutoModelForSequenceClassification, DistilBertModel

python 复制代码

# Load the stock sequence-classification model (2 labels) to inspect its layer structure.
model_pretrained_parameters = AutoModelForSequenceClassification.from_pretrained('../data/model/distilbert-base-uncased/', num_labels=2) 
model_pretrained_parameters

复制代码

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in_features=768, out_features=3072, bias=True)
            (lin2): Linear(in_features=3072, out_features=768, bias=True)
            (activation): GELUActivation()
          )
          (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        )
      )
    )
  )
  (pre_classifier): Linear(in_features=768, out_features=768, bias=True)
  (classifier): Linear(in_features=768, out_features=2, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

python 复制代码

class Model(torch.nn.Module):
    """DistilBERT encoder followed by a small classification head.

    The head mirrors the ``pre_classifier -> ReLU -> Dropout -> classifier``
    layout of ``DistilBertForSequenceClassification`` and is fed the hidden
    state at position 0 (the [CLS] token) of every sequence.

    Args:
        pretrained_path: directory (or model id) of a DistilBERT checkpoint.
            Defaults to the local path used throughout this tutorial.
        num_labels: number of target classes. Defaults to 2.
    """

    def __init__(self,
                 pretrained_path='../data/model/distilbert-base-uncased/',
                 num_labels=2):
        super().__init__()  # initialise torch.nn.Module bookkeeping
        self.model_pretrained = DistilBertModel.from_pretrained(pretrained_path)

        # Width of the encoder's hidden states (768 for distilbert-base);
        # read from the config instead of hard-coding it so other DistilBERT
        # sizes work too. It is the in_features of the first linear layer.
        hidden_size = self.model_pretrained.config.dim

        # Classification head.
        self.fc = torch.nn.Sequential(
            torch.nn.Linear(hidden_size, hidden_size),
            torch.nn.ReLU(),
            torch.nn.Dropout(p=0.2),
            torch.nn.Linear(hidden_size, num_labels))

        # Copy the head weights from the stock sequence-classification model.
        # NOTE(review): for a base checkpoint that ships without a
        # classification head these layers are presumably freshly initialised
        # by transformers rather than pretrained -- confirm against the
        # loading warnings.
        reference = AutoModelForSequenceClassification.from_pretrained(
            pretrained_path, num_labels=num_labels)
        self.fc[0].load_state_dict(reference.pre_classifier.state_dict())
        self.fc[3].load_state_dict(reference.classifier.state_dict())
        del reference  # only needed for the copy above

        # Loss used when labels are supplied to forward().
        self.criterion = torch.nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the encoder and the head.

        Args:
            input_ids: token ids, one row per sequence.
            attention_mask: mask matching ``input_ids``.
            labels: optional class indices, one per sequence.

        Returns:
            dict with 'logits' (one row of raw class scores per sequence) and
            'loss' (cross-entropy against ``labels``, or None when no labels
            are given).
        """
        encoder_out = self.model_pretrained(input_ids=input_ids,
                                            attention_mask=attention_mask)
        # last_hidden_state has one vector per token; keep only position 0,
        # the [CLS] token, whose position is the same in every sequence.
        cls_vector = encoder_out.last_hidden_state[:, 0]
        logits = self.fc(cls_vector)

        loss = None
        if labels is not None:
            loss = self.criterion(logits, labels)

        return {'loss': loss, 'logits': logits}


model = Model()
# Total parameter count: the number of elements summed over every parameter tensor.
print(sum(i.numel() for i in model.parameters()))

66955010

python 复制代码

# Smoke-test the downstream model on the sample batch.
# The batch's entries are passed to the model as keyword arguments.
out = model(**data)   # out is a dict holding 'loss' and 'logits'
print(out['loss'], out['logits'], out['logits'].shape)
# out['logits'].shape == torch.Size([8, 2]): 8 examples in the batch, 2 raw class scores each.
# These are unnormalised logits, not probabilities; the larger score marks the predicted class.

复制代码

tensor(0.6448, grad_fn=<NllLossBackward0>) tensor([[-0.0228,  0.0688],
        [-0.1635, -0.0205],
        [-0.1123,  0.0630],
        [-0.0492,  0.0820],
        [-0.1185,  0.1382],
        [-0.1488,  0.1725],
        [-0.0806,  0.0836],
        [-0.0384,  0.0721]], grad_fn=<AddmmBackward0>) torch.Size([8, 2])

python 复制代码

# Check that the test split carries usable labels (i.e. not a placeholder such as -1).
dataset['test'][0]

复制代码

{'label': 1,
 'input_ids': [101,
  1996,
  11279,
  8469,
  1996,
  9478,
  3154,
  1997,
  1996,
  5749,
  1012,
  102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

6 测试代码

python 复制代码

def test(model, max_batches=None):
    """Evaluate ``model`` on ``dataset['test']`` and report its accuracy.

    Relies on the module-level ``dataset``, ``tokenizer`` and
    ``DataCollatorWithPadding``. The model is switched to eval mode and is
    left in eval mode afterwards.

    Args:
        model: module called as ``model(**batch)`` that returns a dict with a
            'logits' entry of shape (batch_size, num_classes).
        max_batches: optional cap on the number of batches evaluated; None
            (the default) evaluates the whole test split.

    Returns:
        The accuracy as a float in [0, 1]. It is also printed.

    Raises:
        ValueError: if no batch was evaluated.
    """
    model.eval()

    # Run on whatever device the model lives on, so the function works for
    # both CPU and GPU models without the caller moving anything.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    # Evaluation must see every example exactly once and in a fixed order:
    # no shuffling, and the last (smaller) batch is kept.
    loader_test = torch.utils.data.DataLoader(
        dataset=dataset['test'],
        batch_size=16,
        # pads every batch to a common sequence length
        collate_fn=DataCollatorWithPadding(tokenizer),
        shuffle=False,
        drop_last=False)

    outs = []    # predicted class index per example
    labels = []  # ground-truth class index per example

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for i, data in enumerate(loader_test):
            if max_batches is not None and i >= max_batches:
                break

            batch = data
            if device is not None:
                batch = {key: value.to(device) for key, value in data.items()}

            out = model(**batch)

            # argmax over the class dimension gives the predicted class;
            # bring it back to the CPU to compare with the CPU labels.
            outs.append(out['logits'].argmax(dim=1).cpu())
            labels.append(data['labels'])

            if i % 10 == 0:  # progress indicator
                print(i)

    if not outs:
        raise ValueError('no test batches were evaluated')

    outs = torch.cat(outs)
    labels = torch.cat(labels)

    accuracy = (outs == labels).sum().item() / len(labels)
    print('accuracy:', accuracy)
    return accuracy

python 复制代码

# Baseline: evaluate the model before any fine-tuning.
test(model)

复制代码

0
10
20
30
accuracy: 0.693359375

7 训练代码

python 复制代码

from transformers import AdamW   #AdamW梯度下降的优化算法
from transformers.optimization import get_scheduler  #学习率的衰减计算


# Pick the device: the first CUDA GPU when available, otherwise the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

复制代码

device(type='cuda', index=0)

python 复制代码

#训练代码
def train():
    """Fine-tune the module-level ``model`` for one pass over ``loader``.

    Relies on the module-level ``model``, ``loader``, ``device`` and
    ``get_scheduler``. Every 50 steps it prints the step index, the batch
    loss, the current learning rate and the accuracy on that batch.
    """
    # Move the model first so the optimizer is built over the parameters in
    # their final location.
    model.to(device)
    model.train()

    # torch.optim.AdamW replaces the deprecated transformers.AdamW.
    # weight_decay=0.0 keeps the old implementation's default (PyTorch's own
    # default is 0.01). lr=2e-5 is a typical fine-tuning rate, not a default.
    optimizer = torch.optim.AdamW(model.parameters(),
                                  lr=2e-5,
                                  betas=(0.9, 0.999),
                                  eps=1e-8,
                                  weight_decay=0.0)

    # Linear decay of the learning rate to zero over the whole pass, with no
    # warm-up phase.
    scheduler = get_scheduler(name='linear',
                              num_warmup_steps=0,
                              num_training_steps=len(loader),
                              optimizer=optimizer)

    for i, data in enumerate(loader):
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        labels = data['labels'].to(device)

        # out is a dict with 'loss' and 'logits' (raw class scores).
        out = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = out['loss']

        loss.backward()
        # Clip the global gradient norm to 1.0 to keep updates stable.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        # The optimizer holds every model parameter, so one call clears all
        # gradients.
        optimizer.zero_grad()

        if i % 50 == 0:
            lr = optimizer.param_groups[0]['lr']
            predictions = out['logits'].argmax(dim=1)
            # Divide by the actual batch size instead of a hard-coded 8.
            accuracy = (labels == predictions).sum().item() / labels.size(0)

            print(i, loss.item(), lr, accuracy)
            print()

python 复制代码

# Run one training pass over the training loader.
train()

复制代码

0 0.6603636145591736 1.9981273408239703e-05 0.75

50 0.6770923733711243 1.9044943820224723e-05 0.625

100 0.5856966972351074 1.810861423220974e-05 0.75

150 0.5937663316726685 1.7172284644194758e-05 0.75

200 0.5329931974411011 1.6235955056179777e-05 0.75

250 0.47660014033317566 1.5299625468164797e-05 0.875

300 0.22391566634178162 1.4363295880149814e-05 0.875

350 0.2534029185771942 1.3426966292134834e-05 1.0

400 0.5150715112686157 1.2490636704119851e-05 0.75

450 0.5376325845718384 1.155430711610487e-05 0.75

500 0.48840606212615967 1.0617977528089888e-05 0.875

550 0.40059715509414673 9.681647940074908e-06 0.875

600 0.679754376411438 8.745318352059925e-06 0.75

650 0.21557165682315826 7.808988764044945e-06 0.875

700 0.6123908758163452 6.872659176029963e-06 0.75

750 0.4683417081832886 5.936329588014982e-06 0.75

800 0.38990333676338196 5e-06 0.875

850 0.43256130814552307 4.063670411985019e-06 0.75

900 0.32022809982299805 3.1273408239700374e-06 0.875

950 0.9173805713653564 2.1910112359550564e-06 0.625

1000 0.42855364084243774 1.2546816479400751e-06 0.875

1050 0.4637509882450104 3.183520599250937e-07 0.75

python 复制代码

# Evaluate again after training.
test(model.to('cpu'))  # model.to('cpu') moves the trained model back to the CPU before evaluation

复制代码

0
10
20
30
accuracy: 0.779296875