# coding: utf-8
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
# 导入torch工具
import json
import torch
# 导入nn准备构建模型
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
# 导入torch的数据源 数据迭代器工具包
from torch.utils.data import Dataset, DataLoader
# 用于获得常见字母及字符规范化
import string
# 导入时间工具包
import time
# 引入制图工具包
import matplotlib.pyplot as plt
# 从io中导入文件打开方法
from io import open
# 1 Character vocabulary.
# Every character is treated as one token and is one-hot encoded, so the
# "vocabulary" is simply the table of supported characters: the ASCII letters
# followed by a space and four punctuation marks.
all_letters = string.ascii_lowercase + string.ascii_uppercase + " ,.;'"
print(all_letters)
# Size of the vocabulary, which is also the width of each one-hot vector.
n_letter = len(all_letters)
print('字符表的长度:', n_letter)

# 2 Target classes: the category labels a name can be assigned to.
categorys = [
    'Italian', 'English', 'Arabic', 'Spanish', 'Scottish', 'Irish',
    'Chinese', 'Vietnamese', 'Japanese', 'French', 'Greek', 'Dutch',
    'Korean', 'Polish', 'Portuguese', 'Russian', 'Czech', 'German',
]
# Number of classes, i.e. the output width of the model's final linear layer.
categorynum = len(categorys)
print('categorys--->', categorys)
# 3 Read the raw samples
def read_data(filename):
    """Load (name, category) samples from a tab-separated text file.

    Each usable line consists of a name, one TAB character and a category
    label. Anomalous lines are skipped instead of aborting the load:

    * lines whose raw length (line terminator included) is 5 or less;
    * lines that do not split into exactly two TAB-separated fields.

    Args:
        filename: Path of the UTF-8 encoded data file.

    Returns:
        A tuple ``(my_list_x, my_list_y)`` of two equally long lists holding
        the names and their category labels, in file order.
    """
    my_list_x, my_list_y = [], []
    with open(filename, 'r', encoding='utf-8') as fr:
        # Iterate the file lazily rather than materialising it via readlines().
        for line in fr:
            # Too short to hold a valid "<name>\t<category>" sample.
            if len(line) <= 5:
                continue
            fields = line.strip().split('\t')
            # A missing or extra TAB would otherwise raise ValueError on
            # unpacking; treat such a line as one more anomalous sample.
            if len(fields) != 2:
                continue
            x, y = fields
            my_list_x.append(x)
            my_list_y.append(y)
    return my_list_x, my_list_y
# 4 Build the dataset
class NameClsDataset(Dataset):
    """Dataset yielding (one-hot encoded name, category index) pairs.

    Relies on the module-level ``all_letters`` / ``n_letter`` vocabulary and
    the ``categorys`` label list.
    """

    def __init__(self, mylist_x, mylist_y):
        """Store the parallel lists of names and category labels."""
        self.mylist_x = mylist_x
        self.mylist_y = mylist_y

    def __len__(self):
        """Return the number of samples."""
        return len(self.mylist_x)

    def __getitem__(self, item):
        """Return the sample at position ``item``.

        Args:
            item: Integer index. Values outside ``[0, len - 1]`` are clamped
                into that range instead of raising.

        Returns:
            ``(tensor_x, tensor_y)`` where ``tensor_x`` is a float tensor of
            shape ``(len(name), n_letter)`` with one one-hot row per
            character, and ``tensor_y`` is a scalar ``torch.long`` tensor
            holding the label's position in ``categorys``.

        Raises:
            ValueError: If the stored label is not present in ``categorys``.
        """
        # 01 Clamp out-of-range indices.
        index = min(max(item, 0), len(self.mylist_x) - 1)
        # 02 Look up the name and its category label.
        x = self.mylist_x[index]
        y = self.mylist_y[index]
        # 03 One-hot encode the name, one row per character.
        tensor_x = torch.zeros(len(x), n_letter)
        for idx, letter in enumerate(x):
            col = all_letters.find(letter)
            # str.find returns -1 for a character outside the vocabulary;
            # used as an index that would silently mark the LAST vocabulary
            # entry. Leave the row all-zero for unknown characters instead.
            if col >= 0:
                tensor_x[idx, col] = 1
        # 04 Encode the label as its index in the category list.
        tensor_y = torch.tensor(categorys.index(y), dtype=torch.long)
        return tensor_x, tensor_y
# 5 Build the dataloader
def get_dataloader(filename='./data/name_classfication.txt'):
    """Create a shuffled, batch-size-1 DataLoader over the name dataset.

    Args:
        filename: Path of the tab-separated sample file passed to
            ``read_data``. Defaults to the project's data file.

    Returns:
        A ``DataLoader`` wrapping a ``NameClsDataset`` built from the file.
    """
    my_list_x, my_list_y = read_data(filename)
    mydataset = NameClsDataset(my_list_x, my_list_y)
    # batch_size stays 1: each sample tensor has one row per character, so
    # names of different lengths cannot be stacked without a custom
    # collate_fn that pads them.
    return DataLoader(
        mydataset,
        batch_size=1,
        shuffle=True,  # visit the samples in random order
    )
# 6 Define the RNN model
class MyRNN(nn.Module):
    """Sequence classifier: ``nn.RNN`` followed by a linear layer and LogSoftmax."""

    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        """Build the layers.

        Args:
            input_size: Width of each input step (size of the one-hot vector).
            hidden_size: Width of the RNN hidden state.
            output_size: Number of target classes.
            num_layers: Number of stacked RNN layers.
        """
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
        # batch_first=True: inputs are laid out as (batch, seq_len, input_size).
        self.rnn = nn.RNN(self.input_size, self.hidden_size,
                          self.num_layers, batch_first=True)
        self.linear = nn.Linear(self.hidden_size, self.output_size)
        # LogSoftmax output pairs with nn.NLLLoss during training.
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, input):
        """Classify a batch of sequences.

        Args:
            input: Float tensor of shape ``(batch, seq_len, input_size)``.

        Returns:
            ``(log_probs, rnn_hn)`` where ``log_probs`` has shape
            ``(batch, output_size)`` and ``rnn_hn`` is the final hidden state
            of every layer, shape ``(num_layers, batch, hidden_size)``.
        """
        _, rnn_hn = self.rnn(input)
        # Classify from the LAST (top) layer's final hidden state. Index 0
        # would pick the first layer, which is only equivalent when
        # num_layers == 1.
        temp = rnn_hn[-1]
        output = self.linear(temp)
        return self.softmax(output), rnn_hn
# 7 Smoke-test the RNN
def ceshiRNN():
    """Push one batch through a freshly initialised MyRNN and print the shapes."""
    # Data source.
    loader = get_dataloader()
    # Model: input width is the vocabulary size, output width is the number
    # of categories, and the hidden width is a free hyper-parameter.
    model = MyRNN(
        input_size=n_letter,
        hidden_size=128,
        output_size=len(categorys),
    )
    # A single forward pass on the first batch; the label is not needed.
    sample_x, _ = next(iter(loader))
    log_probs, last_hidden = model(sample_x)
    print(log_probs.shape)
    print(last_hidden.shape)
# 8 Train the RNN
def train_my_rnn(epochs=1, my_lr=1e-3):
    """Train ``MyRNN`` on the name-classification data and save its weights.

    Samples are processed one at a time (batch size 1) with Adam and
    ``nn.NLLLoss``. Running averages of loss and accuracy over all samples
    seen so far are recorded every 100 steps and printed every 1000 steps.
    The trained ``state_dict`` is written to ``./model/my_rnn.bin``.

    Args:
        epochs: Number of passes over the dataset.
        my_lr: Learning rate for the Adam optimiser.

    Returns:
        ``(total_loss_list, total_acc_list, all_time)``: the recorded running
        average losses, the recorded running accuracies, and the elapsed
        wall-clock time in seconds (training plus saving).
    """
    model_path = './model/my_rnn.bin'
    # Create the output directory up front, so a missing folder cannot make
    # torch.save fail only after the whole training run has finished.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)

    # 1-3 Data: raw lists -> dataset -> dataloader.
    my_list_x, my_list_y = read_data('./data/name_classfication.txt')
    myDataset = NameClsDataset(my_list_x, my_list_y)
    my_dataloader = DataLoader(myDataset, batch_size=1, shuffle=True)

    # 4 Model; sizes come from the vocabulary and the category list so they
    # cannot drift out of sync with them.
    my_rnn = MyRNN(input_size=n_letter, hidden_size=128,
                   output_size=len(categorys))
    # 5 Loss: NLLLoss, because the model already outputs log-probabilities.
    my_crossentropy = nn.NLLLoss()
    # 6 Optimiser.
    my_optimizer = optim.Adam(my_rnn.parameters(), lr=my_lr)

    # 7 Bookkeeping.
    start_time = time.time()
    total_iter_num = 0    # samples processed so far
    total_loss = 0        # summed loss over all processed samples
    total_loss_list = []  # running average loss, one entry per 100 steps
    total_acc_num = 0     # correctly classified samples so far
    total_acc_list = []   # running accuracy, one entry per 100 steps

    # 8 Training loop.
    for epoch_idx in range(epochs):
        for x, y in my_dataloader:
            output, _ = my_rnn(x)
            my_loss = my_crossentropy(output, y)
            my_optimizer.zero_grad()
            my_loss.backward()
            my_optimizer.step()

            total_iter_num += 1
            total_loss += my_loss.item()
            # With batch size 1 both prediction and label are single values.
            predicted = torch.argmax(output, dim=-1).item()
            total_acc_num += 1 if predicted == y.item() else 0

            # Record the running averages every 100 steps.
            if total_iter_num % 100 == 0:
                total_loss_list.append(total_loss / total_iter_num)
                total_acc_list.append(total_acc_num / total_iter_num)
            # Report progress every 1000 steps.
            if total_iter_num % 1000 == 0:
                loss_avg = total_loss / total_iter_num
                acc_avg = total_acc_num / total_iter_num
                use_time = time.time() - start_time
                print(
                    '当前的训练批次:%d, 平均损失:%.5f, 训练时间:%.3f, 准确率:%.2f' % (
                        epoch_idx + 1,
                        loss_avg,
                        use_time,
                        acc_avg
                    )
                )

    # 9 Save the trained weights.
    torch.save(my_rnn.state_dict(), model_path)
    # 10 Done.
    all_time = time.time() - start_time
    print('总耗时:', all_time)
    return total_loss_list, total_acc_list, all_time
# 9 Persist the training metrics so they can be reloaded later
def save_rnn_res():
    """Run training and write its loss / time / accuracy record to JSON."""
    # Train first; this yields the metric curves and the elapsed time.
    losses, accuracies, elapsed = train_my_rnn()
    # Bundle the results under the keys the plotting code reads back.
    summary = {'loss': losses, 'time': elapsed, 'acc': accuracies}
    # Serialise to a JSON document on disk.
    with open('./data/rnn_result.json', 'w') as fw:
        text = json.dumps(summary)
        fw.write(text)
# 10 Load a saved result file
def read_json(json_path):
    """Return the parsed content of the JSON file at ``json_path``."""
    with open(json_path, 'r') as fr:
        # Read the whole document and decode it in one step.
        return json.loads(fr.read())
# 11 Plot the comparison charts
def plt_RNN():
    """Plot loss, training-time and accuracy comparisons for RNN / LSTM / GRU.

    Reads the three ``*_result-epoch3.json`` files under ``./data`` (each
    must provide the keys ``'loss'``, ``'time'`` and ``'acc'``), then saves
    and shows three figures under ``./picture``: ``loss.png``,
    ``use_time.png`` and ``acc.png``.

    Raises:
        KeyError: If a result file lacks one of the expected keys.
    """
    # (legend label, result file, extra plot options). RNN passes no colour
    # and therefore keeps matplotlib's default line colour.
    models = [
        ('RNN', './data/rnn_result-epoch3.json', {}),
        ('LSTM', './data/lstm_result-epoch3.json', {'color': 'red'}),
        ('GRU', './data/gru_result-epoch3.json', {'color': 'orange'}),
    ]

    # 1 Load every result up front so a bad file fails before any plotting.
    records = []
    for label, path, style in models:
        result = read_json(path)
        records.append(
            (label, result['loss'], result['time'], result['acc'], style)
        )

    # savefig does not create missing directories.
    os.makedirs('./picture', exist_ok=True)

    # 2 Loss curves.
    plt.figure(0)
    for label, loss_curve, _, _, style in records:
        plt.plot(loss_curve, label=label, **style)
    plt.legend(loc='upper right')
    plt.savefig('./picture/loss.png')
    plt.show()

    # 3 Training-time bar chart.
    plt.figure(1)
    x_data = [record[0] for record in records]
    y_data = [record[2] for record in records]
    plt.bar(range(len(x_data)), y_data, tick_label=x_data)
    plt.savefig('./picture/use_time.png')
    plt.show()

    # 4 Accuracy curves.
    plt.figure(2)
    for label, _, _, acc_curve, style in records:
        plt.plot(acc_curve, label=label, **style)
    plt.legend(loc='upper right')
    plt.savefig('./picture/acc.png')
    plt.show()
# 12 Convert a name to be predicted into its one-hot tensor
def line2tensor(x):
    """One-hot encode the string ``x`` against the ``all_letters`` vocabulary.

    Args:
        x: The name to encode.

    Returns:
        A float tensor of shape ``(len(x), n_letter)`` with one row per
        character. A character that is not in ``all_letters`` yields an
        all-zero row.
    """
    tensor_x = torch.zeros(len(x), n_letter)
    for li, letter in enumerate(x):
        col = all_letters.find(letter)
        # str.find returns -1 for an unknown character; indexing with -1
        # would wrongly set the last vocabulary column, so skip it.
        if col >= 0:
            tensor_x[li, col] = 1
    return tensor_x
# 13 Prediction entry point
def rnn_predict(x):
    """Print the three most likely categories for the name ``x``.

    Loads the weights saved at ``./model/my_rnn.bin`` into a fresh ``MyRNN``
    and runs a single forward pass without gradient tracking.

    Args:
        x: The name to classify.
    """
    # 1 Encode the name as a (len(x), n_letter) one-hot tensor.
    tensor_x = line2tensor(x)
    # 2 Rebuild the model with the sizes used for training, derived from the
    # vocabulary and category list rather than hard-coded, then load weights.
    my_rnn = MyRNN(input_size=n_letter, hidden_size=128,
                   output_size=len(categorys))
    # NOTE(review): torch.load unpickles the file; only load checkpoints
    # that come from a trusted source.
    my_rnn.load_state_dict(torch.load('./model/my_rnn.bin'))
    my_rnn.eval()
    # 3 Predict; no gradients are needed at inference time.
    with torch.no_grad():
        # The RNN expects a 3-D (batch, seq_len, input_size) input.
        input0 = tensor_x.unsqueeze(0)
        output, _ = my_rnn(input0)
        _, topi = output.topk(3, 1, True)
    print('人名是', x)
    # 4 Print the top-3 categories, best first.
    for index in topi[0]:
        cate = categorys[int(index)]
        print('国家名是:', cate)
if __name__ == '__main__':
    # Other entry points that can be run from here instead:
    #   read_data('./data/name_classfication.txt'), get_dataloader(),
    #   ceshiRNN(), train_my_rnn(), plt_RNN()
    # Default action: classify one sample name with the saved model.
    rnn_predict('zhang')
人名分类器(nlp)
weixin_431470862024-11-27 12:41
相关推荐
King.6241 小时前
SQLynx 数据库管理平台 3.6.0 全新发布:全面支持华为数据库和ClickHouse,代码提示更智能!Dollhan3 小时前
ARTS-01深圳市青牛科技实业有限公司 小芋圆3 小时前
GC8872 是一款带故障报告功能的刷式直流电机驱动芯片, 适用于打印机、电器、工业设备以及其他小型机器。子午4 小时前
基于Python深度学习【眼疾识别】系统设计与实现+人工智能+机器学习+TensorFlow算法云天徽上5 小时前
【数据可视化-11】全国大学数据可视化分析小馋喵知识杂货铺5 小时前
pytest 截图功能李洋-蛟龙腾飞公司5 小时前
HarmonyOS NEXT 应用开发练习:AI智能语音播报JAMES费7 小时前
《Hands on Large Language Models》(深入浅出大型语言模型)实战书探秘MichaelIp7 小时前
LLM大语言模型中RAG切片阶段改进策略XianxinMao7 小时前
MemGPT:赋能大型语言模型的自我记忆管理