We start from a raw collection of email data and use it to fine-tune a large language model for spam classification. The task is to decide whether a given message is spam, which makes this a binary classification problem. Fine-tuning involves the following steps:
- Data processing: convert the spam data into a format the model can consume and build the dataloaders
- Modify the trainable parameters and selected layers: freeze most of the model, mark the chosen layers as trainable, and replace the output head
- Construct the loss function: design a loss function suited to classification
## Data Processing
This step implements the following:
- Download the dataset
- Process certain values in the dataset (map the spam label to 1)
- Split the dataset into training, validation, and test sets
```python
import os
import urllib.request
import zipfile
from pathlib import Path

url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
zip_path = "./data/classify_finetune/sms_spam_collection.zip"
extracted_path = "./data/classify_finetune/sms_spam_collection"
data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv"

def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
    if data_file_path.exists():
        print(f"{data_file_path} already exists. Skipping download and extraction.")
        return
    # Make sure the target directory exists before downloading
    os.makedirs(os.path.dirname(zip_path), exist_ok=True)
    # Download the file
    with urllib.request.urlopen(url) as response:
        with open(zip_path, "wb") as out_file:
            out_file.write(response.read())
    # Unzip the file
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extracted_path)
    # Add the .tsv file extension
    original_file_path = Path(extracted_path) / "SMSSpamCollection"
    os.rename(original_file_path, data_file_path)
    print(f"File downloaded and saved as {data_file_path}")

download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)

import pandas as pd
df = pd.read_csv(data_file_path, sep="\t", header=None, names=["Label", "Text"])
print(df['Label'].value_counts())

def create_balanced_dataset(df):
    # Count the number of "spam" instances
    num_spam = df[df["Label"] == "spam"].shape[0]
    # Randomly sample "ham" instances to match the number of "spam" instances
    ham_subset = df[df["Label"] == "ham"].sample(num_spam, random_state=123)
    # Combine the "ham" subset with the "spam" instances
    balanced_df = pd.concat([ham_subset, df[df["Label"] == "spam"]])
    return balanced_df

balanced_df = create_balanced_dataset(df)
print(balanced_df["Label"].value_counts())
balanced_df["Label"] = balanced_df["Label"].map({"ham": 0, "spam": 1})

# Dataset split
def random_split(df, train_frac, validation_frac):
    # Shuffle the entire DataFrame
    df = df.sample(frac=1, random_state=123).reset_index(drop=True)
    # Compute the split indices
    train_end = int(len(df) * train_frac)
    validation_end = train_end + int(len(df) * validation_frac)
    # Split the DataFrame
    train_df = df[:train_end]
    validation_df = df[train_end:validation_end]
    test_df = df[validation_end:]
    return train_df, validation_df, test_df

train_df, validation_df, test_df = random_split(balanced_df, 0.7, 0.1)
# The test split is the remaining 0.2 by default
train_df.to_csv("./data/classify_finetune/train.csv", index=None)
validation_df.to_csv("./data/classify_finetune/validation.csv", index=None)
test_df.to_csv("./data/classify_finetune/test.csv", index=None)
```
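A quick check of the split sizes and the label balance helps confirm the processing worked; below is a minimal sketch (the exact counts depend on the balanced dataset):

```python
# Sizes of the three splits and label distribution in the training split
print(len(train_df), len(validation_df), len(test_df))
print(train_df["Label"].value_counts())
```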
## Dataset Construction
Build the dataset class and the data loaders.
Below, the data is wrapped in a `Dataset` and tokenized so that it can be fed to the model. Each text is truncated to `max_length` (if given) and padded with the `<|endoftext|>` token ID (50256) so that all sequences have the same length.
```python
import tiktoken
import torch
from torch.utils.data import Dataset

tokenizer = tiktoken.get_encoding("gpt2")

class SpamDataset(Dataset):
    def __init__(self, csv, tokenizer, max_length=None, pad_id=50256):
        super().__init__()
        self.data = pd.read_csv(csv)
        # Tokenize every text in the CSV
        self.encoded_texts = [
            tokenizer.encode(text) for text in self.data["Text"]
        ]
        if max_length is None:
            # Use the length of the longest encoded text
            max_length = 0
            for text in self.encoded_texts:
                if len(text) > max_length:
                    max_length = len(text)
            self.max_length = max_length
        else:
            # Truncate texts longer than max_length
            self.max_length = max_length
            self.encoded_texts = [
                text[:max_length] for text in self.encoded_texts
            ]
        # Pad every sequence to max_length with the pad token ID
        self.encoded_texts = [
            text + [pad_id] * (self.max_length - len(text)) for text in self.encoded_texts
        ]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return (
            torch.tensor(self.encoded_texts[index]),
            torch.tensor(self.data["Label"][index])
        )
```
The datasets are then wrapped in data loaders that serve the data in batches, which improves training efficiency.
```python
from torch.utils.data import DataLoader

train_dataset_max_length = None

def get_dataloader():
    train_dataset = SpamDataset(
        "./data/classify_finetune/train.csv",
        max_length=None,
        tokenizer=tokenizer
    )
    # Remember the training max_length so it can be reused at inference time
    global train_dataset_max_length
    train_dataset_max_length = train_dataset.max_length
    val_dataset = SpamDataset(
        "./data/classify_finetune/validation.csv",
        max_length=train_dataset.max_length,
        tokenizer=tokenizer
    )
    test_dataset = SpamDataset(
        "./data/classify_finetune/test.csv",
        max_length=train_dataset.max_length,
        tokenizer=tokenizer
    )
    num_workers = 0
    batch_size = 8
    train_loader = DataLoader(
        dataset=train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
        drop_last=True
    )
    val_loader = DataLoader(
        dataset=val_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        drop_last=False
    )
    test_loader = DataLoader(
        dataset=test_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        drop_last=False
    )
    return train_loader, val_loader, test_loader
```
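As a quick sanity check, you can inspect the shape of one batch; below is a minimal sketch (the second dimension equals the training set's `max_length`, e.g. 120 in the run referenced by the comment in the next block):

```python
# Sketch: inspect one batch from the training loader
train_loader, val_loader, test_loader = get_dataloader()
for input_batch, target_batch in train_loader:
    print("Input batch shape:", input_batch.shape)    # e.g. torch.Size([8, 120])
    print("Target batch shape:", target_batch.shape)  # torch.Size([8])
    break
```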
## Model Modification
The original model's output has shape $batch \times N \times vocab\_size$: each token position produces a vector the size of the vocabulary, where each entry is the score for predicting that word as the next token. For spam classification, however, we only need a binary decision, so we replace the output head with a layer that maps to a vector of size 2, representing the scores for "spam" and "not spam".
We also freeze the model's parameters and make only the last transformer block and the final layer normalization trainable.
Note:
- The pretrained-model path passed to transformers can only be searched downward from the working directory; it must not contain a parent-directory component such as `../weights/gpt2-small`.
- `all_code` contains all the code of the earlier GPT-2 pretraining implementation; it is available in my GitHub.
```python
# input_batch: [8, 120], target_batch: [8]
train_loader, val_loader, test_loader = get_dataloader()

import torch
from transformers import GPT2Model
from all_code import GPTModel, BASE_CONFIG, load_weights

# Load the pretrained GPT-2 weights from Hugging Face and copy them into our model
weights_path = "weights/gpt2-small"
gpt_hf = GPT2Model.from_pretrained(weights_path)
gpt_hf.eval()
model = GPTModel(BASE_CONFIG)
load_weights(model, gpt_hf)
model.eval()

# Freeze all parameters
for param in model.parameters():
    param.requires_grad = False

# Replace the output head: map emb_dim to 2 classes (spam / not spam)
num_classes = 2
model.out_head = torch.nn.Linear(BASE_CONFIG["emb_dim"], num_classes)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Unfreeze the last transformer block and the final layer normalization
for param in model.trf_blocks[-1].parameters():
    param.requires_grad = True
for param in model.final_norm.parameters():
    param.requires_grad = True
```
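To confirm the new head is wired up correctly, you can check the output shape on one batch; a minimal sketch is shown below (the last dimension should now be 2 instead of the vocabulary size):

```python
# Sketch: logits for one batch should have shape [batch_size, seq_len, num_classes]
inputs, _ = next(iter(train_loader))
with torch.no_grad():
    outputs = model(inputs.to(device))
print(outputs.shape)  # e.g. torch.Size([8, 120, 2])
```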
## Loss Function
In the earlier GPT-2 model, the output contains a next-word prediction for each of the N input tokens. Because of causal attention, each token can only attend to itself and the tokens before it; only the last token can attend to the entire sequence. So for classification we simply use the prediction at the last token position, which corresponds to the line `logits = model(input_batch)[:, -1, :]`.
We also implement a function that computes prediction accuracy, which is simply the number of correct predictions divided by the total number of predictions.
The loss computation is very similar to the earlier code. Since the loss function needs to be differentiable (accuracy is not), we use the cross-entropy loss.
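For reference, for a single example with class logits $z_0$ (not spam) and $z_1$ (spam) and true label $y \in \{0, 1\}$, the cross-entropy loss computed by `torch.nn.functional.cross_entropy` is

$$\mathcal{L} = -\log \frac{e^{z_y}}{e^{z_0} + e^{z_1}}$$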
```python
def calc_accuracy_loader(loader, model, device, num_batches=None):
    model.eval()
    correct, total = 0, 0
    if num_batches is None:
        num_batches = len(loader)
    else:
        num_batches = min(num_batches, len(loader))
    for i, (input_batch, target_batch) in enumerate(loader):
        if i >= num_batches:
            break
        input_batch, target_batch = input_batch.to(device), target_batch.to(device)
        with torch.no_grad():
            # Logits of the last output token
            logits = model(input_batch)[:, -1, :]
        predicted_labels = torch.argmax(logits, dim=-1)
        total += predicted_labels.shape[0]
        correct += (predicted_labels == target_batch).sum().item()
    return correct / total

def calc_loss_batch(input_batch, target_batch, model, device):
    input_batch, target_batch = input_batch.to(device), target_batch.to(device)
    # Logits of the last output token
    logits = model(input_batch)[:, -1, :]
    loss = torch.nn.functional.cross_entropy(logits, target_batch)
    return loss

def calc_loss_loader(loader, model, device, num_batches=None):
    total_loss = 0.
    if len(loader) == 0:
        return float("nan")
    elif num_batches is None:
        num_batches = len(loader)
    else:
        num_batches = min(num_batches, len(loader))
    for i, (input_batch, target_batch) in enumerate(loader):
        if i >= num_batches:
            break
        loss = calc_loss_batch(input_batch, target_batch, model, device)
        total_loss += loss.item()
    return total_loss / num_batches

def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    model.eval()
    with torch.no_grad():
        train_loss = calc_loss_loader(train_loader, model, device, num_batches=eval_iter)
        val_loss = calc_loss_loader(val_loader, model, device, num_batches=eval_iter)
    model.train()
    return train_loss, val_loss
```
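Before starting training, it can be useful to check the untrained classifier's baseline accuracy with the functions above; a minimal sketch is shown below (with a randomly initialized output head the result should be close to chance level):

```python
# Baseline accuracy before fine-tuning, limited to a few batches for speed
train_acc = calc_accuracy_loader(train_loader, model, device, num_batches=10)
val_acc = calc_accuracy_loader(val_loader, model, device, num_batches=10)
print(f"Training accuracy: {train_acc*100:.2f}%")
print(f"Validation accuracy: {val_acc*100:.2f}%")
```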
## Model Training
During training, we record the loss and the model's prediction accuracy.
```python
def train_classifier(model, train_loader, val_loader, optimizer, device, num_epochs,
                     eval_freq, eval_iter, tokenizer):
    # Initialize lists to track losses and the number of examples seen
    train_losses, val_losses, train_accs, val_accs = [], [], [], []
    examples_seen, global_step = 0, -1
    # Main training loop
    for epoch in range(num_epochs):
        model.train()  # Set the model to training mode
        for input_batch, target_batch in train_loader:
            optimizer.zero_grad()  # Reset loss gradients from the previous batch
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            loss.backward()  # Compute loss gradients
            optimizer.step()  # Update model weights using the gradients
            examples_seen += input_batch.shape[0]  # Track examples instead of tokens
            global_step += 1
            # Optional evaluation step
            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(
                    model, train_loader, val_loader, device, eval_iter)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                print(f"Epoch {epoch+1} (Step {global_step:06d}): "
                      f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")
        # Compute accuracy after each epoch
        train_accuracy = calc_accuracy_loader(train_loader, model, device, num_batches=eval_iter)
        val_accuracy = calc_accuracy_loader(val_loader, model, device, num_batches=eval_iter)
        print(f"Training accuracy: {train_accuracy*100:.2f}% | ", end="")
        print(f"Validation accuracy: {val_accuracy*100:.2f}%")
        train_accs.append(train_accuracy)
        val_accs.append(val_accuracy)
    return train_losses, val_losses, train_accs, val_accs, examples_seen

import time
start_time = time.time()

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=0.1)
num_epochs = 5
train_losses, val_losses, train_accs, val_accs, examples_seen = train_classifier(
    model, train_loader, val_loader, optimizer, device,
    num_epochs=num_epochs, eval_freq=50, eval_iter=5,
    tokenizer=tokenizer
)

end_time = time.time()
execution_time_minutes = (end_time - start_time) / 60
print(f"Training completed in {execution_time_minutes:.2f} minutes.")
```
After training, we can run a few quick tests on the model:
```python
def classify_review(text, model, tokenizer, device, max_length=None, pad_token_id=50256):
    model.eval()
    # Prepare the input to the model
    input_ids = tokenizer.encode(text)
    supported_context_length = model.pos_emb.weight.shape[1]
    if max_length is None:
        # Fall back to the model's context length if no max_length is given
        max_length = supported_context_length
    # Truncate the sequence if it is too long
    input_ids = input_ids[:min(max_length, supported_context_length)]
    # Pad the sequence to the target length
    input_ids += [pad_token_id] * (max_length - len(input_ids))
    input_tensor = torch.tensor(input_ids, device=device).unsqueeze(0)  # Add the batch dimension

    # Model inference
    with torch.no_grad():
        logits = model(input_tensor)[:, -1, :]  # Logits of the last output token
    predicted_label = torch.argmax(logits, dim=-1).item()

    # Return the classification result
    return "spam" if predicted_label == 1 else "not spam"

text_1 = (
    "You are a winner you have been specially"
    " selected to receive $1000 cash or a $2000 award."
)
print(classify_review(
    text_1, model, tokenizer, device, max_length=train_dataset_max_length
))

text_2 = (
    "Hey, just wanted to check if we're still on"
    " for dinner tonight? Let me know!"
)
print(classify_review(
    text_2, model, tokenizer, device, max_length=train_dataset_max_length
))

# Save the model (make sure the target directory exists)
os.makedirs("weights/classify-finetune", exist_ok=True)
torch.save(model.state_dict(), "weights/classify-finetune/classifier.pth")
```
Output:
```
spam
not spam
```
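Beyond these spot checks, the overall accuracy on each split can be computed with the `calc_accuracy_loader` function defined earlier; a minimal sketch using the existing loaders:

```python
# Evaluate accuracy on the full training, validation, and test sets
train_accuracy = calc_accuracy_loader(train_loader, model, device)
val_accuracy = calc_accuracy_loader(val_loader, model, device)
test_accuracy = calc_accuracy_loader(test_loader, model, device)
print(f"Training accuracy: {train_accuracy*100:.2f}%")
print(f"Validation accuracy: {val_accuracy*100:.2f}%")
print(f"Test accuracy: {test_accuracy*100:.2f}%")
```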
To load the trained model later, simply run:
```python
# The model must be rebuilt with the same architecture (including the 2-class
# output head) before loading the weights
model_state_dict = torch.load("weights/classify-finetune/classifier.pth", map_location=device)
model.load_state_dict(model_state_dict)
```
For the GPT-2 model implementation, see: GPT2从零实现 (GPT-2 from scratch).