【NLP练习】中文文本分类-Pytorch实现

中文文本分类-Pytorch实现

一、准备工作

1. 任务说明

本次使用Pytorch实现中文文本分类。主要代码与文本分类代码基本一致,不同的是本次任务使用了本地的中文数据,数据示例如下:

2.加载数据

python 复制代码
import torch
import torch.nn as nn
import torchvision
from torchvision import transforms,datasets
import os,PIL,pathlib,warnings

# Suppress every warning for the rest of the session.
warnings.filterwarnings(action="ignore")

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

输出:

python 复制代码
device(type='cpu')
python 复制代码
import pandas as pd
 # Load the custom Chinese dataset: a tab-separated file read with header=None (no header row)
train_data = pd.read_csv('./train.csv',sep='\t',header = None)
train_data.head()

输出:

python 复制代码
# Build the dataset iterator
def coustom_data_iter(texts, labels):
    """Yield ``(text, label)`` pairs from two parallel sequences.

    Iteration stops at the end of the shorter input (``zip`` semantics).
    The returned generator is single-use: once it has been consumed, call
    this function again to obtain a fresh iterator.

    Args:
        texts: Iterable of raw text samples.
        labels: Iterable of labels aligned with ``texts``.

    Yields:
        One ``(text, label)`` tuple per sample.
    """
    yield from zip(texts, labels)


# Correctly spelled alias: code later in this file calls ``custom_data_iter``,
# which would otherwise raise NameError. The original name is kept so existing
# callers continue to work.
custom_data_iter = coustom_data_iter
 
# Column 0 is passed as the texts and column 1 as the labels.
# NOTE: the result is a one-shot generator; iterating over it once exhausts it.
train_iter =coustom_data_iter(train_data[0].values[:],train_data[1].values[:])

二、数据预处理

python 复制代码
#构建词典
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import jieba

# Chinese word segmentation: jieba.lcut is used as the tokenizer
tokenizer = jieba.lcut


def yield_tokens(data_iter):
    """Yield the tokenized text of every ``(text, label)`` pair in ``data_iter``.

    The label of each pair is ignored; only the text is passed to ``tokenizer``.
    """
    for sample_text, _label in data_iter:
        yield tokenizer(sample_text)


vocab = build_vocab_from_iterator(
    yield_tokens(train_iter),
    specials=["<unk>"],
)
# Tokens that are not in the vocabulary are mapped to the index of "<unk>"
vocab.set_default_index(vocab["<unk>"])
vocab(['我','想','看','和平','精英','上','战神','必备','技巧','的','游戏','视频'])

输出:

python 复制代码
[2, 10, 13, 973, 1079, 146, 7724, 7574, 7793, 1, 186, 28]
python 复制代码
# Collect the distinct labels found in column 1 of the training data.
# NOTE(review): the list order comes from iterating a set, so it is not
# sorted; confirm whether a stable order (e.g. sorted(...)) is needed if the
# label indices must be identical across separate runs.
label_name = list(set(train_data[1].values[:]))
print(label_name)

输出:

python 复制代码
['FilmTele-Play', 'Alarm-Update', 'Weather-Query', 'Audio-Play', 'Radio-Listen', 'Travel-Query', 'Music-Play', 'Video-Play', 'HomeAppliance-Control', 'Calendar-Query', 'TVProgram-Play', 'Other']
python 复制代码
# text_pipeline: raw text -> tokens (tokenizer) -> list of vocabulary indices (vocab)
text_pipeline = lambda x : vocab(tokenizer(x))
# label_pipeline: label string -> its position in the label_name list
label_pipeline = lambda x : label_name.index(x)

# Quick check of both pipelines on one sample
print(text_pipeline('我想看和平精英上战神必备技巧的游戏视频'))
print(label_pipeline('Video-Play'))

输出:

python 复制代码
[2, 10, 13, 973, 1079, 146, 7724, 7574, 7793, 1, 186, 28]
7

lambda表达式的语法为:lambda arguments: expression

其中arguments是函数的参数,可以有多个参数,用逗号分隔。expression是一个表达式,它定义了函数的返回值。

  • text_pipeline函数: 将原始文本数据转换为整数列表,使用了之前构建的vocab词表和tokenizer分词器函数。具体步骤:
  1. 接受一个字符串x作为输入
  2. 使用tokenizer将其分词
  3. 将每个词在vocab词表中的索引放入一个列表返回
  • label_pipeline函数: 将原始标签数据转换为整数,它接受一个字符串x作为输入,并使用 label_name.index(x) 方法获取x在label_name列表中的索引作为输出。

2.生成数据批次和迭代器

python 复制代码
#生成数据批次和迭代器
from torch.utils.data import DataLoader

def collate_batch(batch):
    """Collate ``(text, label)`` samples into flat tensors for the model.

    All token-index sequences are concatenated into one 1-D tensor, and
    ``offsets`` records the position at which each sample starts inside it.

    Args:
        batch: Iterable of ``(text, label)`` pairs.

    Returns:
        Tuple ``(text_list, label_list, offsets)``, each moved to ``device``.
    """
    labels, token_tensors, lengths = [], [], []
    for _text, _label in batch:
        # Label -> integer class index
        labels.append(label_pipeline(_label))
        # Text -> tensor of vocabulary indices
        tokens = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        token_tensors.append(tokens)
        lengths.append(tokens.size(0))

    label_list = torch.tensor(labels, dtype=torch.int64)
    text_list = torch.cat(token_tensors)
    # Start positions are the running sum of the preceding sample lengths:
    # [0, len_0, len_0 + len_1, ...]; the last length is not needed.
    offsets = torch.tensor([0] + lengths[:-1]).cumsum(dim=0)
    return text_list.to(device), label_list.to(device), offsets.to(device)

# Data loader
# A generator (such as ``train_iter``) has neither len() nor indexing, so
# DataLoader cannot iterate over it as a dataset. The (text, label) pairs are
# therefore materialised into a list, which supports both.
dataloader = DataLoader(
    list(coustom_data_iter(train_data[0].values[:], train_data[1].values[:])),
    batch_size=8,
    shuffle=False,
    collate_fn=collate_batch,
)

三、模型构建

1. 搭建模型

python 复制代码
#搭建模型
from torch import nn

class TextClassificationModel(nn.Module):
    """Text classifier made of an ``nn.EmbeddingBag`` followed by a linear layer.

    Args:
        vocab_size: Number of entries in the embedding table.
        embed_dim: Size of each embedding vector.
        num_class: Number of output classes.
    """

    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=False)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        """Fill both weight matrices uniformly between -0.5 and 0.5; zero the linear bias."""
        initrange = 0.5
        for layer in (self.embedding, self.fc):
            layer.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        """Return one row of class scores per bag.

        Args:
            text: Token indices of all samples, passed to the embedding bag.
            offsets: Start position of each sample inside ``text``.
        """
        return self.fc(self.embedding(text, offsets))

2. 初始化模型

python 复制代码
# Instantiate the model
# Output size and embedding-table size follow the label list and the vocabulary.
vocab_size, num_class = len(vocab), len(label_name)
em_size = 64  # embedding dimension
model = TextClassificationModel(vocab_size, em_size, num_class)
model = model.to(device)

3. 定义训练与评估函数

python 复制代码
#定义训练与评估函数
import time

def train(dataloader):
    """Train the module-level ``model`` for one pass over ``dataloader``.

    Relies on names defined at module level: ``model``, ``optimizer``,
    ``criterion`` and ``epoch`` (the latter is only used in the log line).

    Every ``log_interval`` batches the running accuracy and the running loss
    (sum of per-batch loss values divided by the number of samples seen) are
    printed, and the running counters are reset.

    Args:
        dataloader: Iterable yielding ``(text, label, offsets)`` batches.
    """
    model.train()  # switch to training mode
    total_acc, train_loss, total_count = 0, 0, 0
    log_interval = 50
    for idx, (text, label, offsets) in enumerate(dataloader):
        optimizer.zero_grad()                     # clear gradients of the previous step
        predicted_label = model(text, offsets)
        loss = criterion(predicted_label, label)  # distance between prediction and true label
        loss.backward()                           # back-propagation
        # Clip the gradient norm to 0.1 to guard against exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()                          # update the parameters

        # Accumulate accuracy and loss statistics
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        train_loss += loss.item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            print('|epoch{:d}|{:4d}/{:4d} batches|train_acc{:4.3f} train_loss{:4.5f}'.format(
                epoch,
                idx,
                len(dataloader),
                total_acc/total_count,
                train_loss/total_count))
            # Start a fresh logging window
            total_acc, train_loss, total_count = 0, 0, 0

def evaluate(dataloader):
    """Evaluate the module-level ``model`` on ``dataloader`` without gradients.

    Relies on the module-level ``model`` and ``criterion``.

    Args:
        dataloader: Iterable yielding ``(text, label, offsets)`` batches.

    Returns:
        Tuple ``(accuracy, loss)``: correct predictions divided by the number
        of samples, and the sum of per-batch loss values divided by the
        number of samples.
    """
    model.eval()  # switch to evaluation mode
    correct, loss_sum, seen = 0, 0, 0
    with torch.no_grad():
        for text, label, offsets in dataloader:
            logits = model(text, offsets)
            batch_loss = criterion(logits, label)
            correct += (logits.argmax(1) == label).sum().item()
            loss_sum += batch_loss.item()
            seen += label.size(0)

    return correct / seen, loss_sum / seen

四、训练模型

1. 拆分数据集并运行模型

python 复制代码
#拆分数据集并运行模型
from torch.utils.data.dataset   import random_split
from torchtext.data.functional  import to_map_style_dataset

# Hyper-parameters
EPOCHS = 10       # number of training epochs
LR = 5            # learning rate passed to SGD
BATCH_SIZE = 64   # batch size for training

# Loss function, optimizer and learning-rate scheduler
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(params=model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1.0, gamma=0.1)
total_accu = None  # no validation accuracy recorded yet

# Build the dataset
# The generator helper is defined earlier in this file as ``coustom_data_iter``.
train_iter = coustom_data_iter(train_data[0].values[:], train_data[1].values[:])
train_dataset = to_map_style_dataset(train_iter)

# 80/20 train/validation split. The validation size is taken as the remainder
# so that the two lengths always add up to len(train_dataset); truncating both
# 0.8*n and 0.2*n independently can lose a sample and make random_split fail.
num_train = int(len(train_dataset) * 0.8)
num_valid = len(train_dataset) - num_train
split_train_, split_valid_ = random_split(train_dataset, [num_train, num_valid])

train_dataloader = DataLoader(split_train_, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)

for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader)
    val_acc, val_loss = evaluate(valid_dataloader)
    # Learning rate of the first parameter group
    lr = optimizer.state_dict()['param_groups'][0]['lr']
    if total_accu is None or total_accu <= val_acc:
        # First epoch, or validation accuracy did not drop: remember it
        total_accu = val_acc
    else:
        # Validation accuracy fell below the recorded value: advance the scheduler
        scheduler.step()
    print('-' * 69)
    print(
        '| epoch {:d} | time:{:4.2f}s | valid_acc {:4.3f} valid_loss {:4.3f}'.format(
            epoch, time.time() - epoch_start_time, val_acc, val_loss
        )
    )
    print('-' * 69)

输出:

python 复制代码
['还有双鸭山到淮阴的汽车票吗13号的' '从这里怎么回家' '随便播放一首专辑阁楼里的佛里的歌' ...
 '黎耀祥陈豪邓萃雯畲诗曼陈法拉敖嘉年杨怡马浚伟等到场出席' '百事盖世群星星光演唱会有谁' '下周一视频会议的闹钟帮我开开']
|epoch1|  50/ 152 batches|train_acc0.953 train_loss0.00282
|epoch1| 100/ 152 batches|train_acc0.953 train_loss0.00271
|epoch1| 150/ 152 batches|train_acc0.952 train_loss0.00292
---------------------------------------------------------------------
| epoch 1 | time:5.50s | valid_acc 0.949 valid_loss 0.003
---------------------------------------------------------------------
|epoch2|  50/ 152 batches|train_acc0.961 train_loss0.00231
|epoch2| 100/ 152 batches|train_acc0.967 train_loss0.00204
|epoch2| 150/ 152 batches|train_acc0.963 train_loss0.00228
---------------------------------------------------------------------
| epoch 2 | time:5.06s | valid_acc 0.949 valid_loss 0.003
---------------------------------------------------------------------
|epoch3|  50/ 152 batches|train_acc0.975 train_loss0.00173
|epoch3| 100/ 152 batches|train_acc0.973 train_loss0.00177
|epoch3| 150/ 152 batches|train_acc0.972 train_loss0.00166
---------------------------------------------------------------------
| epoch 3 | time:5.07s | valid_acc 0.948 valid_loss 0.003
---------------------------------------------------------------------
|epoch4|  50/ 152 batches|train_acc0.984 train_loss0.00137
|epoch4| 100/ 152 batches|train_acc0.987 train_loss0.00123
|epoch4| 150/ 152 batches|train_acc0.983 train_loss0.00119
---------------------------------------------------------------------
| epoch 4 | time:5.07s | valid_acc 0.950 valid_loss 0.003
---------------------------------------------------------------------
|epoch5|  50/ 152 batches|train_acc0.985 train_loss0.00125
|epoch5| 100/ 152 batches|train_acc0.987 train_loss0.00119
|epoch5| 150/ 152 batches|train_acc0.986 train_loss0.00120
---------------------------------------------------------------------
| epoch 5 | time:5.03s | valid_acc 0.949 valid_loss 0.003
---------------------------------------------------------------------
|epoch6|  50/ 152 batches|train_acc0.985 train_loss0.00118
|epoch6| 100/ 152 batches|train_acc0.989 train_loss0.00114
|epoch6| 150/ 152 batches|train_acc0.985 train_loss0.00120
---------------------------------------------------------------------
| epoch 6 | time:5.40s | valid_acc 0.949 valid_loss 0.003
---------------------------------------------------------------------
|epoch7|  50/ 152 batches|train_acc0.984 train_loss0.00119
|epoch7| 100/ 152 batches|train_acc0.986 train_loss0.00119
|epoch7| 150/ 152 batches|train_acc0.989 train_loss0.00112
---------------------------------------------------------------------
| epoch 7 | time:5.71s | valid_acc 0.949 valid_loss 0.003
---------------------------------------------------------------------
|epoch8|  50/ 152 batches|train_acc0.985 train_loss0.00115
|epoch8| 100/ 152 batches|train_acc0.986 train_loss0.00128
|epoch8| 150/ 152 batches|train_acc0.989 train_loss0.00107
---------------------------------------------------------------------
| epoch 8 | time:5.22s | valid_acc 0.949 valid_loss 0.003
---------------------------------------------------------------------
|epoch9|  50/ 152 batches|train_acc0.988 train_loss0.00114
|epoch9| 100/ 152 batches|train_acc0.983 train_loss0.00127
|epoch9| 150/ 152 batches|train_acc0.989 train_loss0.00109
---------------------------------------------------------------------
| epoch 9 | time:5.28s | valid_acc 0.949 valid_loss 0.003
---------------------------------------------------------------------
|epoch10|  50/ 152 batches|train_acc0.986 train_loss0.00115
|epoch10| 100/ 152 batches|train_acc0.987 train_loss0.00117
|epoch10| 150/ 152 batches|train_acc0.986 train_loss0.00119
---------------------------------------------------------------------
| epoch 10 | time:5.22s | valid_acc 0.949 valid_loss 0.003
---------------------------------------------------------------------
python 复制代码
# Evaluate once more on valid_dataloader; no separate test split is created
# in the visible code, so this figure is the accuracy on the validation split.
test_acc,test_loss = evaluate(valid_dataloader)
print('模型准确率为:{:5.4f}'.format(test_acc))

输出:

python 复制代码
模型准确率为:0.9492

2. 测试指定数据

python 复制代码
# Classify a single piece of text
def predict(text, text_pipeline):
    """Return the predicted class index for one raw text string.

    Uses the module-level ``model``. The input tensors are created on the
    device that holds the model's parameters, so the function works whether
    the model lives on the CPU or on a GPU.

    Args:
        text: Raw text to classify.
        text_pipeline: Callable mapping raw text to a list of vocabulary indices.

    Returns:
        Integer index of the highest-scoring class.
    """
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None
    with torch.no_grad():
        # An explicit integer dtype keeps an empty token list from turning
        # into a float tensor, which cannot be used as embedding indices.
        token_ids = torch.tensor(text_pipeline(text), dtype=torch.int64, device=target_device)
        # A single sample: its tokens start at position 0.
        offsets = torch.tensor([0], dtype=torch.int64, device=target_device)
        output = model(token_ids, offsets)
        return output.argmax(1).item()

# Example sentence to classify
ex_text_str = "还有双鸭山到淮阴的汽车票吗13号的"
# Move the model to the CPU before running the prediction
model = model.to("cpu")

# Map the predicted index back to its label string via label_name
print("该文本的类别是: %s" %label_name[predict(ex_text_str,text_pipeline)])

输出:

python 复制代码
该文本的类别是: Travel-Query

五、总结

训练神经网络时,可使用梯度裁剪的方法来防止梯度爆炸,使得模型训练更加稳定

相关推荐
数据智能老司机3 小时前
PyTorch 深度学习——使用神经网络来拟合数据
pytorch·深度学习
数据智能老司机3 小时前
PyTorch 深度学习——用于图像的扩散模型
pytorch·深度学习
数据智能老司机3 小时前
PyTorch 深度学习——Transformer 是如何工作的
pytorch·深度学习
数据智能老司机1 天前
PyTorch 深度学习——使用张量表示真实世界数据
pytorch·深度学习
数据智能老司机1 天前
PyTorch 深度学习——它始于一个张量
pytorch·深度学习
Narrastory3 天前
明日香 - Pytorch 快速入门保姆级教程(三)
pytorch·深度学习
Narrastory6 天前
明日香 - Pytorch 快速入门保姆级教程(一)
人工智能·pytorch·深度学习
Narrastory6 天前
明日香 - Pytorch 快速入门保姆级教程(二)
人工智能·pytorch·深度学习
盼小辉丶11 天前
PyTorch实战(30)——使用TorchScript和ONNX导出通用PyTorch模型
人工智能·pytorch·深度学习·模型部署
NGBQ1213811 天前
Imgflip社交媒体表情包数据集-202208条多模板meme数据-包含完整图片URL和文本说明-适用于NLP模型训练和社交媒体分析
人工智能·自然语言处理·媒体