NLP Training Key Points - 3

1. Text Matching

Classification: the two sentences are concatenated into one sequence, and a binary classifier predicts whether they match.

loader:

python
# -*- coding: utf-8 -*-

import json
import torch
import random
import logging
from torch.utils.data import DataLoader
from collections import defaultdict
from transformers import BertTokenizer
"""
数据加载
"""

logging.getLogger("transformers").setLevel(logging.ERROR)

class DataGenerator:
    def __init__(self, data_path, config):
        self.config = config
        self.path = data_path
        self.tokenizer = load_vocab(config["vocab_path"])
        self.config["vocab_size"] = len(self.tokenizer.vocab)
        self.schema = load_schema(config["schema_path"])
        self.train_data_size = config["epoch_data_size"]  # random sampling is used, so a per-epoch sample count must be set; otherwise sampling could continue indefinitely
        self.max_length = config["max_length"]
        self.data_type = None  # marks whether the training set or the test set is being loaded: "train" or "test"
        self.load()

    def load(self):
        self.data = []
        self.knwb = defaultdict(list)
        with open(self.path, encoding="utf8") as f:
            for line in f:
                line = json.loads(line)
                # load the training set
                if isinstance(line, dict):
                    self.data_type = "train"
                    questions = line["questions"]
                    label = line["target"]
                    for question in questions:
                        self.knwb[self.schema[label]].append(question)
                # load the test set
                else:
                    self.data_type = "test"
                    assert isinstance(line, list)
                    question, label = line
                    label_index = torch.LongTensor([self.schema[label]])
                    self.data.append([question, label_index])
        return

    # Encodes two texts at a time, returning the encoding of their concatenation
    def encode_sentence(self, text1, text2):
        input_id = self.tokenizer.encode(text1, text2,
                                         truncation='longest_first',
                                         max_length=self.max_length,
                                         padding='max_length',
                                         )
        return input_id

    def __len__(self):
        if self.data_type == "train":
            return self.config["epoch_data_size"]
        else:
            assert self.data_type == "test", self.data_type
            return len(self.data)

    def __getitem__(self, index):
        if self.data_type == "train":
            return self.random_train_sample()  # randomly generate a training sample
        else:
            return self.data[index]

    # Generate a positive or negative sample with a given probability
    # Negative samples: pick one question each from two different standard questions
    # Positive samples: pick two questions under the same standard question
    def random_train_sample(self):
        standard_question_index = list(self.knwb.keys())
        # random positive sample
        if random.random() <= self.config["positive_sample_rate"]:
            p = random.choice(standard_question_index)
            # if the chosen standard question has fewer than two questions, a pair cannot be drawn, so resample
            if len(self.knwb[p]) < 2:
                return self.random_train_sample()
            else:
                s1, s2 = random.sample(self.knwb[p], 2)
                input_ids = self.encode_sentence(s1, s2)
                input_ids = torch.LongTensor(input_ids)
                return [input_ids, torch.LongTensor([1])]
        # random negative sample
        else:
            p, n = random.sample(standard_question_index, 2)
            s1 = random.choice(self.knwb[p])
            s2 = random.choice(self.knwb[n])
            input_ids = self.encode_sentence(s1, s2)
            input_ids = torch.LongTensor(input_ids)
            return [input_ids, torch.LongTensor([0])]



# Load the character or word vocabulary
def load_vocab(vocab_path):
    tokenizer = BertTokenizer(vocab_path)
    return tokenizer

# Load the schema
def load_schema(schema_path):
    with open(schema_path, encoding="utf8") as f:
        return json.loads(f.read())

# Wrap the data with torch's built-in DataLoader class
def load_data(data_path, config, shuffle=True):
    dg = DataGenerator(data_path, config)
    dl = DataLoader(dg, batch_size=config["batch_size"], shuffle=shuffle)
    return dl



if __name__ == "__main__":
    from config import Config
    dg = DataGenerator("../data/valid.json", Config)
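
For reference, a minimal sketch of the file formats that load() above assumes (hypothetical contents inferred from the parsing logic; the real data files are not shown): each line of the training file is a JSON dict with "questions" and "target", each line of the test file is a JSON list of [question, label], and schema.json maps each label to an integer index.

python
# Hypothetical examples of the three input files (one JSON value per line):
# train.json:  {"questions": ["怎么修改密码", "密码忘了怎么办"], "target": "修改密码"}
# valid.json:  ["我想改一下密码", "修改密码"]
# schema.json: {"修改密码": 0, "注销账号": 1}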

model:

python
# -*- coding: utf-8 -*-

import torch
import torch.nn as nn
from torch.optim import Adam, SGD
from transformers import BertModel

"""
建立网络模型结构
"""

class GetFirst(nn.Module):
    def __init__(self):
        super(GetFirst, self).__init__()

    def forward(self, x):
        return x[0]

class SentenceMatchNetwork(nn.Module):
    def __init__(self, config):
        super(SentenceMatchNetwork, self).__init__()
        # A BERT encoder could be used instead; see below:
        # pretrain_model_path = config["pretrain_model_path"]
        # self.bert_encoder = BertModel.from_pretrained(pretrain_model_path)

        # a regular embedding + encoder stack
        hidden_size = config["hidden_size"]
        # 20000 approximates the vocabulary size; BERT's vocab is borrowed, but not its exact size,
        # since it contains many unused tokens; dropping some of them does not affect performance
        self.embedding = nn.Embedding(20000, hidden_size)
        # a sequential, multi-layer way of writing the encoder; the specific layers can be swapped
        # unidirectional: batch_size, max_len, hidden_size
        # bidirectional: batch_size, max_len, hidden_size * 2
        self.encoder = nn.Sequential(nn.LSTM(hidden_size, hidden_size, bidirectional=True, batch_first=True),
                                     GetFirst(),
                                     nn.ReLU(),
                                     nn.Linear(hidden_size * 2, hidden_size), #batch_size, max_len, hidden_size
                                     nn.ReLU(),
                                     )
        self.classify_layer = nn.Linear(hidden_size, 2)
        self.loss = nn.CrossEntropyLoss()

    # Takes the concatenated encoding of the two sentences
    # and outputs a similarity prediction: the probability that they match
    def forward(self, input_ids, target=None):
        # x = self.bert_encoder(input_ids)[1]
        # input_ids: batch_size, max_length
        x = self.embedding(input_ids)  # x: batch_size, max_length, embedding_size
        x = self.encoder(x)  # x: batch_size, max_len, hidden_size
        # max-pool over the sequence dimension
        x = nn.MaxPool1d(x.shape[1])(x.transpose(1, 2)).squeeze(-1)
        #x: batch_size, hidden_size
        x = self.classify_layer(x)
        #x: batch_size, 2
        # if labels are provided, compute the loss
        if target is not None:
            return self.loss(x, target.squeeze(-1))
        # otherwise, predict the similarity
        else:
            return torch.softmax(x, dim=-1)[:, 1]  # x[:, 0] would instead give the probability that the two sentences do not match



def choose_optimizer(config, model):
    optimizer = config["optimizer"]
    learning_rate = config["learning_rate"]
    if optimizer == "adam":
        return Adam(model.parameters(), lr=learning_rate)
    elif optimizer == "sgd":
        return SGD(model.parameters(), lr=learning_rate)


if __name__ == "__main__":
    from config import Config
    Config["vocab_size"] = 10
    Config["max_length"] = 4
    model = SentenceMatchNetwork(Config)
    s1 = torch.LongTensor([[1,2,3,0], [2,2,0,0]])
    l = torch.LongTensor([[1],[0]])
    # y = model(s1, l)
    # print(y)
    # print(model.state_dict())
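
As a quick sanity check, here is a minimal shape-check sketch (the dummy config and tensor sizes below are hypothetical; only "hidden_size" is actually read by the constructor):

python
# Hypothetical standalone shape check for SentenceMatchNetwork
import torch
from model import SentenceMatchNetwork

config = {"hidden_size": 128}
model = SentenceMatchNetwork(config)
input_ids = torch.randint(0, 20000, (4, 32))  # batch_size=4, max_length=32
probs = model(input_ids)                      # no target: match probabilities
print(probs.shape)                            # torch.Size([4])
labels = torch.randint(0, 2, (4, 1))
print(model(input_ids, labels).item())        # with target: scalar cross-entropy loss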

main:

python
# -*- coding: utf-8 -*-

import torch
import os
import numpy as np
import logging
from config import Config
from model import SentenceMatchNetwork, choose_optimizer
from evaluate import Evaluator
from loader import load_data

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

"""
模型训练主程序
"""

def main(config):
    # create the directory for saving models
    if not os.path.isdir(config["model_path"]):
        os.mkdir(config["model_path"])
    # load the training data
    train_data = load_data(config["train_data_path"], config)
    # load the model
    model = SentenceMatchNetwork(config)
    # flag for whether a GPU is available
    cuda_flag = torch.cuda.is_available()
    if cuda_flag:
        logger.info("gpu可以使用,迁移模型至gpu")
        model = model.cuda()
    # load the optimizer
    optimizer = choose_optimizer(config, model)
    # load the evaluator
    evaluator = Evaluator(config, model, logger)
    # training loop
    for epoch in range(config["epoch"]):
        epoch += 1
        model.train()
        logger.info("epoch %d begin" % epoch)
        train_loss = []
        for index, batch_data in enumerate(train_data):
            optimizer.zero_grad()
            if cuda_flag:  # use the GPU for acceleration if available
                batch_data = [d.cuda() for d in batch_data]
            input_ids, labels = batch_data
            loss = model(input_ids, labels)  # compute the loss
            train_loss.append(loss.item())
            # log the loss halfway through each epoch to watch it decrease
            if index % max(1, len(train_data) // 2) == 0:
                logger.info("batch loss %f" % loss)
            loss.backward()  # compute gradients
            optimizer.step() # update parameters
        logger.info("epoch average loss: %f" % np.mean(train_loss))
    evaluator.eval(config["epoch"])
    # model_path = os.path.join(config["model_path"], "epoch_%d.pth" % epoch)
    # torch.save(model.state_dict(), model_path)
    return

if __name__ == "__main__":
    main(Config)
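
config.py is not shown in this post; for completeness, here is a hypothetical Config consistent with the keys referenced in the code above (all values are placeholders):

python
# Hypothetical config.py; keys match those read by loader.py, model.py, and main.py
Config = {
    "model_path": "model_output",
    "train_data_path": "../data/train.json",
    "valid_data_path": "../data/valid.json",
    "schema_path": "../data/schema.json",
    "vocab_path": "../data/vocab.txt",
    "max_length": 20,
    "hidden_size": 128,
    "epoch": 10,
    "batch_size": 32,
    "epoch_data_size": 200,       # number of samples drawn per epoch
    "positive_sample_rate": 0.5,  # probability of drawing a positive pair
    "optimizer": "adam",
    "learning_rate": 1e-3,
}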

evaluate:

python
# -*- coding: utf-8 -*-
import torch
from loader import load_data
import numpy as np

"""
模型效果测试
"""

class Evaluator:
    def __init__(self, config, model, logger):
        self.config = config
        self.model = model
        self.logger = logger
        self.valid_data = load_data(config["valid_data_path"], config, shuffle=False)
        # Evaluation uses the training set as the knowledge base, so it is loaded again here.
        # Passing the already-loaded training set in as an argument would be cleaner, but reloading keeps changes to the main flow small.
        self.train_data = load_data(config["train_data_path"], config)
        self.tokenizer = self.train_data.dataset.tokenizer
        self.stats_dict = {"correct":0, "wrong":0}  # stores evaluation results

    # Build an index over the knowledge-base questions in preparation for matching.
    # In this interaction-based version no vectors are precomputed; each test question is scored
    # against every knowledge-base question with the current model, so the index is rebuilt each round.
    def knwb_to_vector(self):
        self.question_index_to_standard_question_index = {}
        self.questions = []
        for standard_question_index, questions in self.train_data.dataset.knwb.items():
            for question in questions:
                # record the mapping from question index to standard-question index, used to check whether an answer is correct
                self.question_index_to_standard_question_index[len(self.questions)] = standard_question_index
                self.questions.append(question)
        return

    def eval(self, epoch):
        self.logger.info("开始测试第%d轮模型效果:" % epoch)
        self.stats_dict = {"correct":0, "wrong":0}  #清空前一轮的测试结果
        self.model.eval()
        self.knwb_to_vector()
        for index, batch_data in enumerate(self.valid_data):
            test_questions, labels = batch_data
            predicts = []
            for test_question in test_questions:
                input_ids = []
                for question in self.questions:
                    input_ids.append(self.train_data.dataset.encode_sentence(test_question, question))

                with torch.no_grad():
                    input_ids = torch.LongTensor(input_ids)
                    if torch.cuda.is_available():
                        input_ids = input_ids.cuda()
                    scores = self.model(input_ids).detach().cpu().tolist()
                hit_index = np.argmax(scores)
                # print(hit_index)
                predicts.append(hit_index)
            self.write_stats(predicts, labels)
        self.show_stats()
        return

    def write_stats(self, predicts, labels):
        assert len(labels) == len(predicts)
        for hit_index, label in zip(predicts, labels):
            hit_index = self.question_index_to_standard_question_index[hit_index]  # convert to the standard-question index
            if int(hit_index) == int(label):
                self.stats_dict["correct"] += 1
            else:
                self.stats_dict["wrong"] += 1
        return

    def show_stats(self):
        correct = self.stats_dict["correct"]
        wrong = self.stats_dict["wrong"]
        self.logger.info("预测集合条目总量:%d" % (correct +wrong))
        self.logger.info("预测正确条目:%d,预测错误条目:%d" % (correct, wrong))
        self.logger.info("预测准确率:%f" % (correct / (correct + wrong)))
        self.logger.info("--------------------")
        return
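
As a usage sketch, matching a single query against the knowledge base mirrors what Evaluator.eval does per test question (match_one below is a hypothetical helper, not part of the original code; dataset is the DataGenerator behind the training DataLoader):

python
# Hypothetical single-query matcher built on the loaded training set
import numpy as np
import torch

def match_one(query, dataset, model):
    # flatten the knowledge base into (question, standard-question index) pairs
    pairs = [(q, std) for std, qs in dataset.knwb.items() for q in qs]
    input_ids = torch.LongTensor([dataset.encode_sentence(query, q) for q, _ in pairs])
    with torch.no_grad():
        scores = model(input_ids).cpu().numpy()  # match probability per pair
    return pairs[int(np.argmax(scores))][1]      # standard index of the best match

Note that, unlike a representation-based matcher, this interaction-based design must run the model once per knowledge-base question for every query, so inference cost grows linearly with the size of the knowledge base.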