- Read texts and labels from a CSV file and build a Dataset (the expected CSV layout is sketched below).
- Split it into a training set and a test set with random_split.
- Encode the texts into BERT inputs with the tokenizer; collate_fn batches and pads them.
- A DataLoader yields the training data batch by batch.
- The model sits on top of pretrained BERT; only the classification layer (fc) is trained.
- Training loop: forward → loss → backward → update fc → evaluate.
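The script assumes a two-column CSV at datasets/waimai.csv next to the script: a raw review string in text and a 0/1 sentiment label in label. A minimal sketch of that layout (the example rows are made up; only the column names and path come from the code):

```python
import os
import pandas as pd

# Hypothetical rows illustrating the expected schema of datasets/waimai.csv:
# one review string per row in `text`, a 0/1 sentiment label in `label`.
df = pd.DataFrame(
    {
        "text": ["送餐很快,味道也不错", "等了一个多小时,菜都凉了"],
        "label": [1, 0],
    }
)
os.makedirs("datasets", exist_ok=True)
df.to_csv("datasets/waimai.csv", index=False)
print(df.head())
```

The full training script: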
```python
from transformers import BertTokenizer, BertModel
import os
import torch
import torch.nn as nn
from tqdm import tqdm
import pandas as pd
from torch.utils.data import Dataset, random_split, DataLoader
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
class OutSellDataset(Dataset):
def __init__(self, filepath):
self.dataset = pd.read_csv(filepath)
def __len__(self):
return len(self.dataset)
def __getitem__(self, i):
text = self.dataset.text[i]
label = self.dataset.label[i]
return text, label
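# __getitem__ returns the raw string and its integer label; tokenization and
# padding are deferred to collate_fn so that each batch is encoded in one call.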
# Classification head on top of a (frozen) pretrained BERT encoder
class Model(nn.Module):
    def __init__(self, bert):
        super().__init__()
        self.bert = bert
self.fc = torch.nn.Linear(in_features=768, out_features=2)
        # Freeze all BERT parameters; only the classification layer is trained
for param in self.bert.parameters():
param.requires_grad = False
def forward(self, input_ids, attention_mask, token_type_ids):
out = self.bert(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
)
        # Classify using only the hidden state of the first token ([CLS])
logits = self.fc(out.last_hidden_state[:, 0])
return logits
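# Shape summary for one batch of size B (with max_length=500 used below):
#   input_ids / attention_mask / token_type_ids: (B, 500)
#   out.last_hidden_state: (B, 500, 768); [:, 0] selects the [CLS] vector
#   logits: (B, 2)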
if __name__ == "__main__":
ds_path = os.path.join(CURRENT_DIR, 'datasets', "waimai.csv")
df = pd.read_csv(ds_path)
print(df.head())
print(df.info())
print(df.label.value_counts())
print(df.text.head())
datasets = OutSellDataset(ds_path)
print(datasets[0])
    # Split the dataset: 80% train / 20% test
    train_size = int(len(datasets) * 0.8)
    test_size = len(datasets) - train_size
    # Random split with a fixed seed so the split is reproducible
    generator = torch.Generator().manual_seed(42)
    train_dataset, test_dataset = random_split(
        datasets, [train_size, test_size], generator=generator
    )
print(len(train_dataset), len(test_dataset))
    # Pretrained tokenizer (vocabulary) for bert-base-chinese
tokenizer = BertTokenizer.from_pretrained(
"bert-base-chinese",
cache_dir=os.path.join(CURRENT_DIR, 'chinese'),
do_lower_case=False,
)
    # Custom collate function: tokenize, pad, and batch raw (text, label) pairs
def collate_fn(data):
sents = [i[0] for i in data]
labels = [i[1] for i in data]
        # Encode the whole batch of sentences in one tokenizer call
data = tokenizer(
text=sents,
truncation=True,
padding="max_length",
max_length=500,
return_tensors="pt",
return_length=True,
)
        # input_ids: token ids after encoding
        # attention_mask: 0 at padded positions, 1 elsewhere
input_ids = data["input_ids"]
attention_mask = data["attention_mask"]
token_type_ids = data["token_type_ids"]
labels = torch.LongTensor(labels)
        # (Tensors are moved to the computing device inside the training loop.)
return input_ids, attention_mask, token_type_ids, labels
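    # Each collated batch is three (batch_size, 500) integer tensors plus a
    # (batch_size,) LongTensor of labels.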
    # Build the DataLoaders
train_dl = DataLoader(
dataset=train_dataset,
batch_size=16,
collate_fn=collate_fn,
shuffle=True,
drop_last=True,
)
test_dl = DataLoader(
dataset=test_dataset,
batch_size=16,
collate_fn=collate_fn,
# shuffle=True,
drop_last=True,
)
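    # drop_last=True discards the final incomplete batch, so the metrics below
    # are averaged over full batches of 16 only.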
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Pretrained BERT encoder
bert = BertModel.from_pretrained(
"bert-base-chinese",
cache_dir=os.path.join(CURRENT_DIR, 'model'),
).to(device)
    # Print the total number of BERT parameters
print(sum(p.numel() for p in bert.parameters()))
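    # On the order of 1e8 parameters for bert-base-chinese, all frozen inside
    # Model; only the fc layer's ~1.5k parameters receive gradients.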
    # Grab one batch to sanity-check shapes and the forward pass
input_ids, attention_mask, token_type_ids, labels = next(iter(train_dl))
input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)
token_type_ids = token_type_ids.to(device)
    # Optional sanity check on the raw encoder output
    # (batch size, sequence length, hidden size):
    # out = bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
    # print(out.last_hidden_state.size())
    # Freezing is handled inside Model, so no extra loop over bert.parameters() is needed here.
    # Run the classification model on the sample batch
logmodel = Model(bert).to(device)
output = logmodel(input_ids, attention_mask, token_type_ids)
print(output)
    # Loss function, optimizer, and learning-rate scheduler
loss_fn = nn.CrossEntropyLoss()
    # Only the fc layer's parameters are optimized
optimizer = torch.optim.Adam(logmodel.fc.parameters(), lr=2e-4, eps=1e-8)
    # To fine-tune the whole model instead:
    # optimizer = torch.optim.Adam(logmodel.parameters(), lr=2e-4, eps=1e-8)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.5)
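    # Note: with step_size=10 and only 2 epochs below, StepLR never actually
    # lowers the learning rate; it only takes effect on longer runs.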
    # Training function: one pass over the training DataLoader
def train(dataloader):
logmodel.train()
total_acc, total_count, total_loss = 0, 0, 0
for input_ids, mask, type_ids, label in tqdm(dataloader, desc="Training", leave=False):
            # Move the batch to the computing device
input_ids, mask, type_ids, label = input_ids.to(device), mask.to(device), type_ids.to(device), label.to(
device)
optimizer.zero_grad()
predicted_label = logmodel(input_ids, token_type_ids=type_ids, attention_mask=mask)
loss = loss_fn(predicted_label, label)
loss.backward()
optimizer.step()
total_acc += (predicted_label.argmax(1) == label).sum().item()
total_count += label.size(0)
total_loss += loss.item() * label.size(0)
        # Return average loss and accuracy over the epoch
return total_loss / total_count, total_acc / total_count
def test(dataloader):
logmodel.eval()
total_acc, total_count, total_loss = 0, 0, 0
with torch.no_grad():
for input_ids, mask, type_ids, label in tqdm(dataloader, desc="Testing", leave=False):
input_ids, mask, type_ids, label = input_ids.to(device), mask.to(device), type_ids.to(device), label.to(
device)
predicted_label = logmodel(input_ids, token_type_ids=type_ids, attention_mask=mask)
loss = loss_fn(predicted_label, label)
total_acc += (predicted_label.argmax(1) == label).sum().item()
total_count += label.size(0)
total_loss += loss.item() * label.size(0)
return total_loss / total_count, total_acc / total_count
    # Run training
epochs = 2
    train_loss = []
    train_acc = []
    test_loss = []
    test_acc = []
for epoch in range(epochs):
epoch_loss, epoch_acc = train(train_dl)
epoch_test_loss, epoch_test_acc = test(test_dl)
train_loss.append(epoch_loss)
train_acc.append(epoch_acc)
test_loss.append(epoch_test_loss)
test_acc.append(epoch_test_acc)
scheduler.step()
template=(
"epoch:{:2d},train_loss:{:.5f},train_acc:{:.1f}%,"
"test_loss:{:.5f},test_acc:{:.1f}%"
)
print(
template.format(
epoch+1,
epoch_loss,
epoch_acc*100,
epoch_test_loss,
epoch_test_acc*100,
)
)
print("Done!")