Text classification code practice: added dropout, random token masking, early stopping, a word-to-ID (w2i) conversion utility in the Tokenizer, and a train/val loss comparison.
Default version: new best model saved! Epoch: 9, val accuracy: 0.8493. Epoch 8/20 | train loss: 0.0036 | val loss: 0.7760 | val accuracy: 0.8457
Mask method: new best model saved! Epoch: 18, val accuracy: 0.8658. Epoch 18/20 | train loss: 0.0106 | val loss: 0.9426 | val accuracy: 0.8658
Mask method + dropout: new best model saved! Epoch: 20, val accuracy: 0.8688. Epoch 20/20 | train loss: 0.0097 | val loss: 1.0001 | val accuracy: 0.8688
# %%
# 03_tokenization
# %%
import torch
torch.cuda.is_available()
# %%
# import torch
# print(torch.__version__)
# %%
import jieba
text = "我在梦里收到清华大学录取通知书"
seg_list = jieba.lcut(text, cut_all=False)  # cut_all=False selects precise mode
print(seg_list)
# %%
# Incorrect segmentation before loading a custom dictionary
text = "九头虫让奔波儿灞把唐僧师徒除掉"
print(f"Precise mode: {jieba.lcut(text, cut_all=False)}")
# Load a custom user dictionary
jieba.load_userdict("./user_dict.txt")
print(f"After loading the dictionary: {jieba.lcut(text, cut_all=False)}")
# %%
text = "我在Boss直聘找工作"
# HMM enabled (default)
seg_list_hmm = jieba.lcut(text, HMM=True)
print(f"HMM on: {seg_list_hmm}")
# HMM disabled
seg_list_no_hmm = jieba.lcut(text, HMM=False)
print(f"HMM off: {seg_list_no_hmm}")
# %%
import jieba.posseg as pseg
text = "九头虫让奔波儿灞把唐僧师徒除掉"
# HMM=False forces segmentation to rely only on the dictionary and dynamic programming
words = pseg.lcut(text, HMM=False)
print(f"Default POS output: {words}")
# %%
jieba.load_userdict("./user_dict2.txt")
dic_words = pseg.lcut(text, HMM=False)
print(f"加载词性词典后: {dic_words}")
# %%
# Tag  Meaning             Tag  Meaning
# n    noun                nr   person name
# ns   place name          nt   organization/group
# nz   other proper noun   v    verb
# a    adjective           d    adverb
# m    numeral             q    measure word
# r    pronoun             p    preposition
# c    conjunction         u    particle
# t    time word           x    non-morpheme character
# w    punctuation         un   unknown word
# Table 2-1: Common part-of-speech tags and their meanings
# %%
# 06_gensim
# %%
import jieba
from gensim import corpora
# Step 1: Prepare the tokenized corpus (news headlines)
raw_headlines = [
"央行降息,刺激股市反弹",
"球队赢得总决赛冠军,球员表现出色"
]
tokenized_headlines = [jieba.lcut(doc) for doc in raw_headlines]
print(f"分词后语料: {tokenized_headlines}")
# Step 2: 创建词典
dictionary = corpora.Dictionary(tokenized_headlines)
print(f"词典: {dictionary.token2id}")
# Step 3: 转换为 BoW 向量语料库
corpus_bow = [dictionary.doc2bow(doc) for doc in tokenized_headlines]
print(f"BoW语料库: {corpus_bow}")
# %%
import jieba
from gensim import corpora, models
# 1. Prepare the corpus (news headlines covering two clear topics: finance and sports)
headlines = [
"央行降息,刺激股市反弹",
"球队赢得总决赛冠军,球员表现出色",
"国家队公布最新一期足球集训名单",
"A股市场持续震荡,投资者需谨慎",
"篮球巨星刷新历史得分记录",
"理财产品收益率创下新高"
]
tokenized_headlines = [jieba.lcut(title) for title in headlines]
# 2. Build the dictionary and BoW corpus
dictionary = corpora.Dictionary(tokenized_headlines)
corpus_bow = [dictionary.doc2bow(doc) for doc in tokenized_headlines]
# 3. Train the TF-IDF model
tfidf_model = models.TfidfModel(corpus_bow)
# 4. Convert the BoW corpus into TF-IDF vector representations
corpus_tfidf = tfidf_model[corpus_bow]
# Helper: map (token_id, weight) pairs to (token, weight) and sort by weight in descending order
def tfidf_with_words(tfidf_vec, id2word):
pairs = [(id2word[token_id], weight) for token_id, weight in tfidf_vec]
return sorted(pairs, key=lambda x: x[1], reverse=True)
# Print the TF-IDF vector of the first headline
first_tfidf = list(corpus_tfidf)[0]
print("TF-IDF vector of the first headline:")
print(first_tfidf)
print("TF-IDF vector of the first headline (with words):")
print(tfidf_with_words(first_tfidf, dictionary))
# 5. Apply the model to a new headline
new_headline = "股市大涨,牛市来了"
new_headline_bow = dictionary.doc2bow(list(jieba.cut(new_headline)))
new_headline_tfidf = tfidf_model[new_headline_bow]
print("\nTF-IDF vector of the new headline:")
print(new_headline_tfidf)
print(tfidf_with_words(new_headline_tfidf, dictionary))
# %%
tokenized_headlines
# %%
dictionary
# %%
corpus_bow
# %%
new_headline_bow
# %%
from gensim import corpora, models
# 1. Prepare the corpus
headlines = [
"央行降息,刺激股市反弹",
"球队赢得总决赛冠军,球员表现出色",
"国家队公布最新一期足球集训名单",
"A股市场持续震荡,投资者需谨慎",
"篮球巨星刷新历史得分记录",
"理财产品收益率创下新高"
]
tokenized_headlines = [jieba.lcut(title) for title in headlines]
# 2. Build the dictionary and BoW corpus
dictionary = corpora.Dictionary(tokenized_headlines)
corpus_bow = [dictionary.doc2bow(doc) for doc in tokenized_headlines]
# 3. Train the LDA model (assume we want to discover 2 topics)
lda_model = models.LdaModel(corpus=corpus_bow, id2word=dictionary, num_topics=2, random_state=100)
# 4. Inspect the topics discovered by the model
print("The 2 topics discovered by the model and their keywords:")
for topic in lda_model.print_topics():
print(topic)
# 5. Infer the topic distribution of a new document
new_headline = "巨星詹姆斯获得常规赛MVP"
new_headline_bow = dictionary.doc2bow(jieba.lcut(new_headline))
topic_distribution = lda_model[new_headline_bow]
print(f"\nTopic distribution of the new headline '{new_headline}':")
print(topic_distribution)
# %%
lda_model[corpus_bow[0]]
# %%
lda_model[corpus_bow[3]]
# %%
# The topic assignments do not look very accurate so far
# %%
from gensim.models import Word2Vec
# 1. Prepare the corpus
headlines = [
# Finance
"央行降息,刺激股市反弹",
"A股市场持续震荡,投资者需谨慎",
"理财产品收益率创下新高",
"证监会发布新规,规范市场交易",
"创业板指数上涨,科技股领涨大盘",
"房价调控政策出台,房地产市场降温",
"全球股市动荡,影响资本市场信心",
"分析师认为,当前股市风险与机遇并存,市场情绪复杂",
# Sports
"球队赢得总决赛冠军,球员表现出色",
"国家队公布最新一期足球集训名单",
"篮球巨星刷新历史得分记录",
"奥运会开幕,中国代表团旗手确定",
"马拉松比赛圆满结束,选手创造佳绩",
"电子竞技联赛吸引大量年轻观众",
"这支球队的每位球员都表现出色",
"球员转会市场活跃,多支球队积极引援"
]
tokenized_headlines = [jieba.lcut(title) for title in headlines]
# 2. Train the Word2Vec model
model = Word2Vec(tokenized_headlines, vector_size=50, window=3, min_count=1, sg=1)
# %%
# 1. Find the most similar words
# On such a small corpus the results will not be perfect, but they do show that the model has learned within-topic associations
similar_to_market = model.wv.most_similar('股市')
print(f"Words most similar to '股市': {similar_to_market}")
# 2. Compute the cosine similarity between two words
similarity = model.wv.similarity('球队', '球员')
print(f"\nSimilarity between '球队' and '球员': {similarity:.4f}")
# 3. Get the vector of a single word
market_vector = model.wv['市场']
print(f"\nVector shape of '市场': {market_vector.shape}")
# %%
from gensim.models import KeyedVectors
# Save the word vectors to a file
model.wv.save("news_vectors.kv")
# Load the word vectors from the file
loaded_wv = KeyedVectors.load("news_vectors.kv")
# The loaded vectors support the same operations
print(f"\nAfter loading, similarity between '球队' and '球员': {loaded_wv.similarity('球队', '球员'):.4f}")
# %%
# Exercise
# Using what has been covered so far, implement training and inference code for a fully connected text classification model on the 20newsgroups data (from sklearn.datasets import fetch_20newsgroups). If implementing it from scratch is difficult, refer to the simple text-classification reference implementation.
from sklearn.datasets import fetch_20newsgroups
# %%
# 1. Load the data
# 2. Build the vocab and tokenizer
# 3. Pad the data
# 4. Dataset and DataLoader
# 5. Training loop: save the checkpoint whenever the model improves, otherwise keep iterating
# 6. Inference
# %%
# Load the data
# %%
import joblib
# joblib.dump(train_dataset_raw, 'train_dataset_raw.joblib')
# joblib.dump(test_dataset_raw, 'test_dataset_raw.joblib')
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
train_dataset_raw = joblib.load('train_dataset_raw.joblib')
test_dataset_raw = joblib.load('test_dataset_raw.joblib')
# %%
# categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
# train_dataset_raw = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)
# test_dataset_raw = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=42)
sample = {
"text_preview": train_dataset_raw.data[0][:200],
"label": train_dataset_raw.target_names[train_dataset_raw.target[0]],
}
sample
# %%
import matplotlib.pyplot as plt
import re
# For exploration, first define a simple tokenization function
def basic_tokenize(text):
text = text.lower()
    # Replace everything except letters, digits, and the listed punctuation with spaces
text = re.sub(r"[^a-z0-9(),.!?\\'`]", " ", text)
    # Add spaces around the listed punctuation so it is split into separate tokens
text = re.sub(r"([,.!?\\'`])", r" \1 ", text)
tokens = text.strip().split()
return tokens
# Count the number of tokens in each document
train_text_lengths = [len(basic_tokenize(text)) for text in train_dataset_raw.data]
plt.figure(figsize=(10, 6))
plt.hist(train_text_lengths, bins=50, alpha=0.7, color='blue')
plt.title('Distribution of Text Lengths in Training Data')
plt.xlabel('Number of Tokens')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()
# %%
# Compute the word frequency distribution
from typing import Any
from collections import Counter
import numpy as np
# Initialize the Counter
word_counts = Counter()
for sent in [basic_tokenize(text) for text in train_dataset_raw.data]:
word_counts.update(sent)
# %%
frequencies = sorted(word_counts.values(), reverse=True)
# Generate ranks
ranks = np.arange(1, len(frequencies) + 1)
# Plot on log-log axes
plt.figure(figsize=(10, 6))
plt.loglog(ranks, frequencies)
plt.title('Rank vs. Frequency (Log-Log Scale)')
plt.xlabel('Rank (Log)')
plt.ylabel('Frequency (Log)')
plt.grid(True)
plt.show()
# The exploration shows two things. The text-length histogram above (Figure 7-1) is concentrated in the shorter
# range, but a few documents are extremely long, so naive truncation could discard too much information.
# In addition, the log-log plot (Figure 7-2) shows the classic Zipf's Law pattern of natural language:
# a small number of high-frequency words account for most occurrences, while a long tail of words appears only rarely.
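# %%
# Quick sanity check of the Zipf's-Law observation above (illustrative only; the cutoffs below are arbitrary):
# what fraction of all token occurrences do the top-N most frequent words cover?
total_occurrences = sum(word_counts.values())
sorted_counts = sorted(word_counts.values(), reverse=True)
for top_n in (100, 1000, 5000):
    coverage = sum(sorted_counts[:top_n]) / total_occurrences
    print(f"Top {top_n} words cover {coverage:.1%} of all token occurrences")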
# %%
sorted(word_counts.items(), key=lambda x: x[1], reverse=True)
# %%
train_dataset_raw.data[0]
# %%
# Tokenizer wrapper
# A Tokenizer class handles everything related to tokenization, vocabulary construction, and ID conversion.
# It encapsulates the same tokenization logic used during data exploration and adds ID-conversion utilities.
# The _tokenize_text method implements a regex-based tokenization strategy: lower-case the text, use re.sub to
# strip everything except letters, digits, and basic punctuation, add spaces around the kept punctuation so each
# mark becomes an independent token, and finally split on whitespace to obtain the token list.
# The build_vocab method then builds the vocabulary from the tokenized corpus, filtering out low-frequency words
# and assigning each remaining word a unique ID.
# Finally, the tokenize method converts input text into the corresponding ID sequence, which is essential for
# training because the model can only consume numeric input, not raw text.
class Tokenizer:
def __init__(self, text, min_freq=5):
self.text = text
self.min_freq = min_freq
        self.vocab = {"<PAD>": 0, "<UNK>": 1}
self.tokenized_sents = [self._tokenize_text(i) for i in text]
self.vocab, self.w2i, self.i2w, self.sorted_freq = self.build_vocab()
self.vocab_size = len(self.vocab)
def _tokenize_text(self, text):
"""基于正则表达式的文本分词"""
text = text.lower()
text = re.sub(r"[^a-z0-9(),.!?\\'`]", " ", text)
text = re.sub(r"([,.!?\\'`])", r" \1 ", text)
return text.strip().split()
def build_vocab(self):
"""根据分词结果建立词汇表,过滤低频词"""
word_counts = Counter()
for sent in self.tokenized_sents:
word_counts.update(sent)
sorted_freq = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)
for w, c in sorted_freq:
if c >= self.min_freq:
self.vocab[w] = len(self.vocab)
self.w2i = {w: i for i, w in enumerate(self.vocab)}
self.i2w = {i: w for w, i in self.w2i.items()}
return self.vocab, self.w2i, self.i2w, sorted_freq
def tokenize(self, text):
"""将文本转换为 ID 序列"""
return [self.w2i[w] for w in self._tokenize_text(text) if w in self.vocab]
def __len__(self):
"""返回词汇表大小"""
return len(self.vocab)
def __contains__(self, token):
"""检查 token 是否在词汇表中"""
return token in self.vocab
def get_id(self, token):
"""获取 token 的 ID 索引"""
return self.w2i.get(token, self.w2i['<UNK>'])
def get_token(self, id):
"""获取 ID 对应的 token"""
return self.i2w.get(id, '<UNK>')
tokenizer = Tokenizer(train_dataset_raw.data)
tokenizer.vocab_size
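# %%
# Minimal sanity check of the Tokenizer (illustrative only): encode a short string and map the IDs back
# to tokens. Words below min_freq or otherwise out of vocabulary are dropped by tokenize(), so the round
# trip is not guaranteed to be lossless.
sample_ids = tokenizer.tokenize("The graphics card renders the image.")
print(sample_ids)
print([tokenizer.get_token(i) for i in sample_ids])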
# %%
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from tqdm import tqdm
import random
# Text classification dataset (supports sliding-window splitting of long texts and random masking during training)
class TextClassificationDataset(Dataset):
def __init__(self, texts, labels, tokenizer, max_len=128, is_training=False, mask_prob=0.15):
"""
初始化文本分类数据集
:param texts: 文本列表,如 ["文本1", "文本2", ...]
:param labels: 标签列表,如 [0, 1, 0, ...]
:param tokenizer: 分词器(需实现tokenize方法和unk_token_id属性)
:param max_len: 文本最大长度(超过则滑窗分割)
:param is_training: 是否为训练模式(训练模式下开启随机Mask)
:param mask_prob: 随机Mask的概率(替换为<UNK>)
"""
self.tokenizer = tokenizer
self.max_len = max_len
self.is_training = is_training
self.mask_prob = mask_prob
        # Look up the <UNK> token id in the tokenizer vocabulary (the original getattr(tokenizer, '<UNK>', 1)
        # always fell back to the default, since '<UNK>' is not an attribute name); default to 1
        self.unk_token_id = tokenizer.vocab.get("<UNK>", 1)
self.processed_data = []
        # Preprocess the data: split long texts with a sliding window
        for text, label in tqdm(zip(texts, labels), total=len(labels), desc="Preprocessing data"):
            # Tokenize (returns a list of token ids)
            token_ids = self.tokenizer.tokenize(text)
            # Sliding-window splitting for over-long texts
if len(token_ids) <= self.max_len:
self.processed_data.append({"token_ids": token_ids, "label": label})
else:
                # Window stride: max_len * 0.8 (keeps overlapping context between chunks)
stride = max(1, int(self.max_len * 0.8))
                # Slide the window across the long text
for i in range(0, len(token_ids) - self.max_len + 1, stride):
chunk = token_ids[i:i+self.max_len]
self.processed_data.append({"token_ids": chunk, "label": label})
def __len__(self):
"""返回数据集总长度"""
return len(self.processed_data)
def __getitem__(self, idx):
"""获取单个样本(训练时随机Mask部分token为<UNK>)"""
item = self.processed_data[idx]
token_ids = item["token_ids"].copy()
        # In training mode, randomly replace some tokens with <UNK> (data augmentation)
if self.is_training:
num_tokens = len(token_ids)
num_to_mask = max(1, int(num_tokens * self.mask_prob))
            # Keep only valid positions (exclude PAD, assuming token id 0 is PAD)
valid_positions = [i for i, tid in enumerate(token_ids) if tid != 0]
if len(valid_positions) > 0:
                # Randomly choose the positions to mask
                mask_positions = random.sample(valid_positions, min(num_to_mask, len(valid_positions)))
                # Replace them with the <UNK> token id
for pos in mask_positions:
token_ids[pos] = self.unk_token_id
return {"token_ids": token_ids, "label": item["label"]}
# %%
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from tqdm import tqdm
import random
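# Baseline dataset variant without the random-mask augmentation; running this cell replaces the mask-enabled class defined above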
class TextClassificationDataset(Dataset):
def __init__(self, texts, labels, tokenizer, max_len=128, is_training=False, mask_prob=0.15):
self.tokenizer = tokenizer
self.max_len = max_len
self.is_training = is_training
self.mask_prob = mask_prob
        # Look up the <UNK> token id in the tokenizer vocabulary (usually 1)
        self.unk_token_id = tokenizer.vocab.get("<UNK>", 1)
self.processed_data = []
for text, label in tqdm(zip(texts, labels), total=len(labels)):
token_ids = self.tokenizer.tokenize(text)
            # Sliding-window splitting logic
if len(token_ids) <= self.max_len:
self.processed_data.append({"token_ids": token_ids, "label": label})
else:
stride = max(1, int(self.max_len * 0.8))
for i in range(0, len(token_ids) - self.max_len + 1, stride):
chunk = token_ids[i:i+self.max_len]
self.processed_data.append({"token_ids": chunk, "label": label})
def __len__(self):
return len(self.processed_data)
def __getitem__(self, idx):
return self.processed_data[idx]
# %%
def collate_fn(batch):
max_batch_len = max(len(item["token_ids"]) for item in batch)
batch_token_ids, batch_labels = [], []
for item in batch:
token_ids = item["token_ids"]
padding_len = max_batch_len - len(token_ids)
padded_ids = token_ids + [0] * padding_len
batch_token_ids.append(padded_ids)
batch_labels.append(item["label"])
return {
"token_ids": torch.tensor(batch_token_ids, dtype=torch.long),
"labels": torch.tensor(batch_labels, dtype=torch.long),
}
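# %%
# Quick illustration of collate_fn on a toy batch (illustrative values, not from the real data):
# shorter sequences are right-padded with 0 (<PAD>) up to the longest sequence in the batch.
_toy_batch = [
    {"token_ids": [5, 6, 7], "label": 0},
    {"token_ids": [8, 9], "label": 1},
]
print(collate_fn(_toy_batch))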
# %%
from torch.utils.data import DataLoader
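# Default version: the training set is built without random masking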
train_dataset = TextClassificationDataset(train_dataset_raw.data, train_dataset_raw.target, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
valid_dataset = TextClassificationDataset(test_dataset_raw.data, test_dataset_raw.target, tokenizer)
valid_loader = DataLoader(valid_dataset, batch_size=32, collate_fn=collate_fn)
{"train_samples": len(train_dataset), "valid_samples": len(valid_dataset), "batch_size": 32}
# %%
from torch.utils.data import DataLoader
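# Mask version: random masking is enabled on the training set (is_training=True, mask_prob=0.15)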
train_dataset = TextClassificationDataset(train_dataset_raw.data, train_dataset_raw.target, tokenizer, is_training=True, mask_prob=0.15)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
valid_dataset = TextClassificationDataset(test_dataset_raw.data, test_dataset_raw.target, tokenizer)
valid_loader = DataLoader(valid_dataset, batch_size=32, collate_fn=collate_fn)
{"train_samples": len(train_dataset), "valid_samples": len(valid_dataset), "batch_size": 32}
# %%
for batch in valid_loader:
print([tokenizer.get_token(i.item()) for i in batch['token_ids'][0]])
break
# %%
test_dataset_raw.data[0]
# %%
# Input:
# token_ids (sequence of token IDs): [batch_size, seq_len]
# |
# V
# nn.Embedding(padding_idx=0)
# |
# V
# embedded: [batch_size, seq_len, embed_dim]
# |
# V
# nn.Linear(embed_dim, hidden_dim*2) -> nn.ReLU -> nn.Linear(hidden_dim*2, hidden_dim*4) -> nn.ReLU
# |
# V
# token_features: [batch_size, seq_len, hidden_dim*4]
# |
# V
# Masked Average Pooling (the key operation)
# |
# V
# pooled_features: [batch_size, hidden_dim*4] <-- the seq_len dimension has been aggregated away
# |
# V
# nn.Linear (classification head)
# |
# V
# Output:
# logits: [batch_size, num_classes]
# %%
# Pooling aggregates a sequence of features ([seq_len, hidden_dim]) into a single vector that represents the
# whole sequence ([hidden_dim]); naive average pooling, however, also averages over <PAD> positions, which biases the representation.
class TextClassifier(nn.Module):
def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes, dropout=0.3):
super(TextClassifier, self).__init__()
self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
self.feature_extractor = nn.Sequential(
nn.Linear(embed_dim, hidden_dim*2),
nn.ReLU(),
nn.Dropout(dropout),
nn.Linear(hidden_dim*2, hidden_dim*4),
nn.ReLU(),
nn.Dropout(dropout)
)
self.classifier = nn.Linear(hidden_dim*4, num_classes)
def forward(self, token_ids):
embedded = self.embedding(token_ids)
token_features = self.feature_extractor(embedded)
# shapes:
# token_ids: [batch_size, seq_len]
# embedded: [batch_size, seq_len, embed_dim]
# token_features: [batch_size, seq_len, hidden_dim * 4]
# padding_mask: [batch_size, seq_len]
# masked_features: [batch_size, seq_len, hidden_dim * 4]
# summed_features: [batch_size, hidden_dim * 4]
# pooled_features: [batch_size, hidden_dim * 4]
# logits: [batch_size, num_classes]
padding_mask = (token_ids != self.embedding.padding_idx).float()
masked_features = token_features * padding_mask.unsqueeze(-1)
summed_features = torch.sum(masked_features, dim=1)
pooled_features = summed_features / torch.clamp(padding_mask.sum(dim=1, keepdim=True), min=1e-9)
logits = self.classifier(pooled_features)
return logits
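# %%
# Toy illustration of masked average pooling (made-up values, independent of the model above): with PAD
# positions excluded, the pooled vector equals the mean over the real tokens only, whereas a naive mean
# is dragged toward the PAD rows.
_ids = torch.tensor([[3, 7, 0, 0]])                            # one sequence: two real tokens + two PADs
_feats = torch.arange(4 * 2, dtype=torch.float).view(1, 4, 2)  # fake per-token features [1, 4, 2]
_mask = (_ids != 0).float()                                    # 1 for real tokens, 0 for PAD
_pooled = (_feats * _mask.unsqueeze(-1)).sum(dim=1) / _mask.sum(dim=1, keepdim=True)
print(_pooled)             # mean over the first two feature rows only
print(_feats.mean(dim=1))  # naive mean includes the PAD rows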
# %%
# import os
# import json
# class Trainer:
# def __init__(self, model, optimizer, criterion, train_loader, valid_loader, device, output_dir=".", seed=42):
# import random
# import numpy as np
# import torch
# random.seed(seed)
# np.random.seed(seed)
# torch.manual_seed(seed)
# if torch.cuda.is_available():
# torch.cuda.manual_seed_all(seed)
# torch.backends.cudnn.deterministic = True
# torch.backends.cudnn.benchmark = False
# self.model = model
# self.optimizer = optimizer
# self.criterion = criterion
# self.train_loader = train_loader
# self.valid_loader = valid_loader
# self.device = device
# self.best_accuracy = 0.0
# self.output_dir = output_dir
# os.makedirs(self.output_dir, exist_ok=True)
# # For recording training history
# self.train_losses = []
# self.val_accuracies = []
# def _run_epoch(self, epoch):
# self.model.train()
# total_loss = 0
# for batch in tqdm(self.train_loader, desc=f"Epoch {epoch+1} [train]"):
# self.optimizer.zero_grad()
# token_ids = batch["token_ids"].to(self.device)
# labels = batch["labels"].to(self.device)
# outputs = self.model(token_ids)
# loss = self.criterion(outputs, labels)
# total_loss += loss.item()
# loss.backward()
# self.optimizer.step()
# return total_loss / len(self.train_loader)
# def _evaluate(self, epoch):
# self.model.eval()
# correct_preds = 0
# total_samples = 0
# with torch.no_grad():
# for batch in tqdm(self.valid_loader, desc=f"Epoch {epoch+1} [eval]"):
# token_ids = batch["token_ids"].to(self.device)
# labels = batch["labels"].to(self.device)
# outputs = self.model(token_ids)
# _, predicted = torch.max(outputs, 1)
# total_samples += labels.size(0)
# correct_preds += (predicted == labels).sum().item()
# return correct_preds / total_samples
# def _save_checkpoint(self, epoch, val_accuracy):
# if val_accuracy > self.best_accuracy:
# self.best_accuracy = val_accuracy
# save_path = os.path.join(self.output_dir, "best_model.pth")
# torch.save(self.model.state_dict(), save_path)
# print(f"新最佳模型已保存! Epoch: {epoch+1}, 验证集准确率: {val_accuracy:.4f}")
# def train(self, epochs, tokenizer, label_map):
# self.train_losses = []
# self.val_accuracies = []
# for epoch in range(epochs):
# avg_loss = self._run_epoch(epoch)
# val_accuracy = self._evaluate(epoch)
# self.train_losses.append(avg_loss)
# self.val_accuracies.append(val_accuracy)
# print(f"Epoch {epoch+1}/{epochs} | 训练损失: {avg_loss:.4f} | 验证集准确率: {val_accuracy:.4f}")
# self._save_checkpoint(epoch, val_accuracy)
# print("训练完成!")
# # 训练结束后,保存最终的词典和标签映射
# vocab_path = os.path.join(self.output_dir, 'vocab.json')
# with open(vocab_path, 'w', encoding='utf-8') as f:
# json.dump(tokenizer.vocab, f, ensure_ascii=False, indent=4)
# labels_path = os.path.join(self.output_dir, 'label_map.json')
# with open(labels_path, 'w', encoding='utf-8') as f:
# json.dump(label_map, f, ensure_ascii=False, indent=4)
# print(f"词典 ({vocab_path}) 和标签映射 ({labels_path}) 已保存。")
# return self.train_losses, self.val_accuracies
# %%
import os
import json
from tqdm import tqdm
class Trainer:
def __init__(self, model, optimizer, criterion, train_loader, valid_loader, device, output_dir=".", seed=42, patience=5):
import random
import numpy as np
import torch
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
self.model = model
self.optimizer = optimizer
self.criterion = criterion
self.train_loader = train_loader
self.valid_loader = valid_loader
self.device = device
self.best_accuracy = 0.0
self.output_dir = output_dir
os.makedirs(self.output_dir, exist_ok=True)
self.train_losses = []
self.val_losses = []
self.val_accuracies = []
        # Early-stopping parameters
self.patience = patience
self.counter = 0
self.early_stop = False
def _run_epoch(self, epoch):
self.model.train()
total_loss = 0
        for batch in tqdm(self.train_loader, desc=f"Epoch {epoch+1} [train]"):
self.optimizer.zero_grad()
token_ids = batch["token_ids"].to(self.device)
labels = batch["labels"].to(self.device)
outputs = self.model(token_ids)
loss = self.criterion(outputs, labels)
total_loss += loss.item()
loss.backward()
self.optimizer.step()
return total_loss / len(self.train_loader)
def _evaluate(self, epoch):
self.model.eval()
total_loss = 0
correct_preds = 0
total_samples = 0
with torch.no_grad():
            for batch in tqdm(self.valid_loader, desc=f"Epoch {epoch+1} [eval]"):
token_ids = batch["token_ids"].to(self.device)
labels = batch["labels"].to(self.device)
outputs = self.model(token_ids)
loss = self.criterion(outputs, labels)
total_loss += loss.item()
_, predicted = torch.max(outputs, 1)
total_samples += labels.size(0)
correct_preds += (predicted == labels).sum().item()
avg_loss = total_loss / len(self.valid_loader)
accuracy = correct_preds / total_samples
return avg_loss, accuracy
def _save_checkpoint(self, epoch, val_accuracy):
if val_accuracy > self.best_accuracy:
self.best_accuracy = val_accuracy
save_path = os.path.join(self.output_dir, "best_model.pth")
torch.save(self.model.state_dict(), save_path)
print(f"新最佳模型已保存! Epoch: {epoch+1}, 验证集准确率: {val_accuracy:.4f}")
# 重置早停计数器
self.counter = 0
else:
self.counter += 1
print(f"验证集准确率未提升,早停计数器: {self.counter}/{self.patience}")
if self.counter >= self.patience:
self.early_stop = True
def train(self, epochs, tokenizer, label_map):
self.train_losses = []
self.val_losses = []
self.val_accuracies = []
for epoch in range(epochs):
avg_loss = self._run_epoch(epoch)
val_loss, val_accuracy = self._evaluate(epoch)
self.train_losses.append(avg_loss)
self.val_losses.append(val_loss)
self.val_accuracies.append(val_accuracy)
print(f"Epoch {epoch+1}/{epochs} | 训练损失: {avg_loss:.4f} | 验证损失: {val_loss:.4f} | 验证集准确率: {val_accuracy:.4f}")
self._save_checkpoint(epoch, val_accuracy)
" \n",
# 检查早停条件
if self.early_stop:
print(f"\n早停触发! 连续{self.patience}个轮次验证集准确率未提升,停止训练。")
break
print("训练完成!")
vocab_path = os.path.join(self.output_dir, 'vocab.json')
with open(vocab_path, 'w', encoding='utf-8') as f:
json.dump(tokenizer.vocab, f, ensure_ascii=False, indent=4)
labels_path = os.path.join(self.output_dir, 'label_map.json')
with open(labels_path, 'w', encoding='utf-8') as f:
json.dump(label_map, f, ensure_ascii=False, indent=4)
print(f"词典 ({vocab_path}) 和标签映射 ({labels_path}) 已保存。")
return self.train_losses, self.val_losses, self.val_accuracies
# %%
# Hyperparameters
hparams = {
"vocab_size": len(tokenizer),
"embed_dim": 128,
"hidden_dim": 256,
"num_classes": len(train_dataset_raw.target_names),
"epochs": 20,
"learning_rate": 0.001,
"device": "cuda" if torch.cuda.is_available() else "cpu",
"output_dir": "output"
}
# Instantiate the model, loss, and optimizer
model = TextClassifier(
hparams["vocab_size"],
hparams["embed_dim"],
hparams["hidden_dim"],
hparams["num_classes"]
).to(hparams["device"])
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=hparams["learning_rate"])
hparams
# %%
trainer = Trainer(
model,
optimizer,
criterion,
train_loader,
valid_loader,
hparams["device"],
output_dir=hparams["output_dir"]
)
# Build the label-name -> ID mapping and pass it to the trainer so it can be saved
label_map = {name: i for i, name in enumerate(train_dataset_raw.target_names)}
# Start training and collect the returned history
train_losses, val_losses, val_accuracies = trainer.train(epochs=hparams["epochs"], tokenizer=tokenizer, label_map=label_map)
# %%
# Default version: new best model saved! Epoch: 9, val accuracy: 0.8493. Epoch 8/20 | train loss: 0.0036 | val loss: 0.7760 | val accuracy: 0.8457
# Mask method: new best model saved! Epoch: 18, val accuracy: 0.8658. Epoch 18/20 | train loss: 0.0106 | val loss: 0.9426 | val accuracy: 0.8658
# %%
import matplotlib.pyplot as plt
def plot_history(train_losses, val_losses, val_accuracies, title_prefix=""):
epochs = range(1, len(train_losses) + 1)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
ax1.plot(epochs, train_losses, 'bo-', label='Train Loss')
ax1.plot(epochs, val_losses, 'ro-', label='Validation Loss')
ax1.set_title(f'{title_prefix} Loss Comparison')
ax1.set_xlabel('Epochs')
ax1.set_ylabel('Loss')
ax1.grid(True)
ax1.legend()
ax2.plot(epochs, val_accuracies, 'go-', label='Validation Accuracy')
ax2.set_title(f'{title_prefix} Validation Accuracy')
ax2.set_xlabel('Epochs')
ax2.set_ylabel('Accuracy')
ax2.grid(True)
ax2.legend()
plt.tight_layout()
plt.show()
# %%
plot_history(train_losses, val_losses, val_accuracies, title_prefix="Feed-Forward Network")
# %%
# def plot_history(train_losses, val_accuracies, title_prefix=""):
# epochs = range(1, len(train_losses) + 1)
# fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
# # Plot the training loss curve
# ax1.plot(epochs, train_losses, 'bo-', label='Training Loss')
# ax1.set_title(f'{title_prefix} Training Loss')
# ax1.set_xlabel('Epochs')
# ax1.set_ylabel('Loss')
# ax1.grid(True)
# ax1.legend()
# # Plot the validation accuracy curve
# ax2.plot(epochs, val_accuracies, 'ro-', label='Validation Accuracy')
# ax2.set_title(f'{title_prefix} Validation Accuracy')
# ax2.set_xlabel('Epochs')
# ax2.set_ylabel('Accuracy')
# ax2.grid(True)
# ax2.legend()
# plt.suptitle(f'{title_prefix} Training and Validation Metrics', fontsize=16)
# plt.show()
# # Call the plotting function
# plot_history(train_losses, val_accuracies, title_prefix="Feed-Forward Network")
# %%
class Predictor:
def __init__(self, model, tokenizer, label_map, device, max_len=128):
self.model = model.to(device)
self.model.eval()
self.tokenizer = tokenizer
self.label_map = label_map
self.id_to_label = {idx: label for label, idx in self.label_map.items()}
self.device = device
self.max_len = max_len
def predict(self, text):
token_ids = self.tokenizer.tokenize(text)
chunks = []
if len(token_ids) <= self.max_len:
chunks.append(token_ids)
else:
stride = max(1, int(self.max_len * 0.8))
for i in range(0, len(token_ids) - self.max_len + 1, stride):
chunks.append(token_ids[i:i + self.max_len])
chunk_tensors = torch.tensor(chunks, dtype=torch.long).to(self.device)
with torch.no_grad():
outputs = self.model(chunk_tensors)
preds = torch.argmax(outputs, dim=1)
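            # Majority vote: bincount tallies the per-chunk predictions and argmax picks the most frequent class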
final_pred_id = torch.bincount(preds).argmax().item()
final_pred_label = self.id_to_label[final_pred_id]
return final_pred_label
# Load the saved resources
vocab_path = os.path.join(hparams["output_dir"], 'vocab.json')
with open(vocab_path, 'r', encoding='utf-8') as f:
loaded_vocab = json.load(f)
labels_path = os.path.join(hparams["output_dir"], 'label_map.json')
with open(labels_path, 'r', encoding='utf-8') as f:
label_map_loaded = json.load(f)
# Instantiate the inference components
inference_tokenizer = Tokenizer(train_dataset_raw.data)
inference_model = TextClassifier(
len(inference_tokenizer),
hparams["embed_dim"],
hparams["hidden_dim"],
len(label_map_loaded)
).to(hparams["device"])
model_path = os.path.join(hparams["output_dir"], "best_model.pth")
inference_model.load_state_dict(torch.load(model_path, map_location=hparams["device"]))
predictor = Predictor(
inference_model,
inference_tokenizer,
label_map_loaded,
hparams["device"]
)
# Run prediction
new_text = "The doctor prescribed a new medicine for the patient's illness, focusing on its gpu accelerated healing properties."
predicted_class = predictor.predict(new_text)
{"text": new_text, "pred": predicted_class}
# %%