1 Download the data
1.1 Simple data analysis:
2 Code and model
1 Prepare the data. 2 Load the pretrained model. 3 Fine-tune.
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
data = pd.read_csv("./data/ChnSentiCorp_htl_all.csv")
data = data.dropna()
# Plot the number of samples per label
data.groupby('label').count().plot(kind='bar')
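Besides the bar chart, the class balance can also be checked numerically; a minimal sketch (not part of the original analysis):

# Count how many reviews carry each label (1 = positive, 0 = negative)
print(data["label"].value_counts())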
# Convert the data into a format the model can read
from torch.utils.data import Dataset
class MyDataset(Dataset):
    def __init__(self) -> None:
        super().__init__()
        self.data = pd.read_csv("./data/ChnSentiCorp_htl_all.csv")
        self.data = self.data.dropna()

    def __getitem__(self, index):
        return self.data.iloc[index]["review"], self.data.iloc[index]["label"]

    def __len__(self):
        return len(self.data)
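As a quick sanity check (an addition, not in the original), indexing the dataset returns a (review, label) tuple:

ds = MyDataset()
print(len(ds))  # number of rows after dropping missing values
print(ds[0])    # a review string paired with its label, 0 or 1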
# Split into training and validation sets
from torch.utils.data import random_split
dataset = MyDataset()
trainset, validset = random_split(dataset, lengths=[0.9, 0.1])
print(len(trainset), len(validset))
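Note that passing fractional lengths to random_split requires a fairly recent PyTorch; on older versions an equivalent split with integer lengths (and, optionally, a fixed seed for reproducibility, which the original does not set) would look like:

import torch
train_len = int(len(dataset) * 0.9)
valid_len = len(dataset) - train_len
trainset, validset = random_split(
    dataset,
    [train_len, valid_len],
    generator=torch.Generator().manual_seed(42),  # hypothetical seed, not in the original
)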
import torch
tokenizer = AutoTokenizer.from_pretrained("./dianping")
# As mentioned earlier, the data needs to be processed into the format the model accepts: input_ids, token_type_ids, attention_mask
def collate_func(batch):
    texts, labels = [], []
    for item in batch:
        texts.append(item[0])
        labels.append(item[1])
    inputs = tokenizer(texts, max_length=128, padding="max_length", truncation=True, return_tensors="pt")
    inputs["labels"] = torch.tensor(labels)
    return inputs
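For reference (an added check, not in the original), the tokenizer call above returns exactly the fields mentioned earlier; for a single illustrative sentence:

sample = tokenizer("房间很干净,服务也不错", max_length=128, padding="max_length",
                   truncation=True, return_tensors="pt")
print(sample.keys())              # typically input_ids, token_type_ids, attention_mask for a BERT-style tokenizer
print(sample["input_ids"].shape)  # torch.Size([1, 128]) because of max_length padding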
from torch.utils.data import DataLoader
trainloader = DataLoader(trainset, batch_size=32, shuffle=True, collate_fn=collate_func)
validloader = DataLoader(validset, batch_size=64, shuffle=False, collate_fn=collate_func)
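A quick look at one batch (added as a sanity check, not in the original) shows the shapes the model will receive:

batch = next(iter(trainloader))
print({k: v.shape for k, v in batch.items()})
# expected: input_ids / token_type_ids / attention_mask of shape [32, 128] and labels of shape [32]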
# Load the pretrained model, then set up the optimizer
from torch.optim import Adam
model = AutoModelForSequenceClassification.from_pretrained("./dianping")
if torch.cuda.is_available():
    model = model.cuda()
# Two arguments: 1. the model parameters, 2. the learning rate
optimizer = Adam(model.parameters(), lr=2e-5)
def evaluate():
    model.eval()
    acc_num = 0
    with torch.inference_mode():
        for batch in validloader:
            if torch.cuda.is_available():
                batch = {k: v.cuda() for k, v in batch.items()}
            output = model(**batch)
            pred = torch.argmax(output.logits, dim=-1)
            acc_num += (pred.long() == batch["labels"].long()).float().sum()
    return acc_num / len(validset)
def train(epoch=3, log_step=100):
    global_step = 0
    for ep in range(epoch):
        # Switch to training mode
        model.train()
        for batch in trainloader:
            # Move the batch to GPU if one is available
            if torch.cuda.is_available():
                batch = {k: v.cuda() for k, v in batch.items()}
            # Zero out the gradients
            optimizer.zero_grad()
            # Forward pass, which also computes the loss
            output = model(**batch)
            # Backpropagate
            output.loss.backward()
            # Update the parameters
            optimizer.step()
            if global_step % log_step == 0:
                print(f"ep: {ep}, global_step: {global_step}, loss: {output.loss.item()}")
            global_step += 1
        acc = evaluate()
        print(f"ep: {ep}, acc: {acc}")
train()
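The original does not persist the fine-tuned weights; if you want to reuse them later, the standard save_pretrained calls (hypothetical output directory) would be:

model.save_pretrained("./hotel-sentiment")
tokenizer.save_pretrained("./hotel-sentiment")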
sen = "真的不错,推荐你来来试试这个过程!"
id2_label = {0: "差评!", 1: "好评!"}
model.eval()
with torch.inference_mode():
    inputs = tokenizer(sen, return_tensors="pt")
    # Move the inputs to the same device as the model
    if torch.cuda.is_available():
        inputs = {k: v.cuda() for k, v in inputs.items()}
    logits = model(**inputs).logits
    pred = torch.argmax(logits, dim=-1)
    print(f"输入:{sen}\n模型预测结果:{id2_label.get(pred.item())}")
from transformers import pipeline
model.config.id2label = id2_label
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=0)
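The pipeline is built but never called above; a minimal usage check (an addition; note that device=0 assumes a GPU is available) would be:

print(pipe(sen))
# standard text-classification output, e.g. [{'label': '好评!', 'score': ...}]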
Output:
ep: 0, global_step: 0, loss: 0.3093985319137573
ep: 0, global_step: 100, loss: 0.15114116668701172
ep: 0, global_step: 200, loss: 0.18284356594085693
ep: 0, acc: 0.9123711585998535
ep: 1, global_step: 300, loss: 0.06553637981414795
ep: 1, global_step: 400, loss: 0.12580494582653046
ep: 1, acc: 0.907216489315033
ep: 2, global_step: 500, loss: 0.1393854171037674
ep: 2, global_step: 600, loss: 0.024754131212830544
ep: 2, acc: 0.907216489315033
输入:真的不错,推荐你来来试试这个过程!
模型预测结果:好评!
Running this on CPU is very slow...