Week N8: Text Classification with Word2Vec

  • 🍨 This post is a study-log entry for the 🔗 365-day deep learning training camp
  • 🍖 Original author: K同学啊
    Word2Vec is an efficient tool, open-sourced by Google in 2013, for representing words as real-valued vectors. Put simply, it turns words such as "apple" and "banana" into (here) 100-dimensional numeric vectors, so that a computer can "understand" the semantic relationships between words.
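    As a quick illustration (a minimal sketch with a made-up toy corpus, not the dataset used below), gensim lets you train such vectors and query semantic similarity in a few lines:

from gensim.models import Word2Vec

# Hypothetical toy corpus: each sentence is a list of tokens
sentences = [["apple", "banana", "fruit"], ["car", "train", "travel"]] * 50
toy = Word2Vec(sentences, vector_size=100, min_count=1, epochs=10)

print(toy.wv["apple"].shape)                 # (100,) — each word is a 100-dim vector
print(toy.wv.similarity("apple", "banana"))  # cosine similarity between two words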
    The mind map is as follows: (figure not reproduced in this text version)
Python code:
import torch
import os, PIL, pathlib, warnings
from torch import nn
from torch.utils.data import DataLoader  # needed for the DataLoader calls below
import time
import pandas as pd
import jieba  # imported in the original, though this post ends up using character-level tokens

# 🚫 Silence the noisy warnings (like muting phone notifications to escape the message barrage)
warnings.filterwarnings("ignore")
# 🖥️ Pick the device: GPU if available, otherwise CPU (like choosing between the subway and the high-speed train)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)  # prints: cuda or cpu

# 📥 1.2 Load the custom Chinese dataset
train_data = pd.read_csv('./train.csv', sep='\t', header=None)
print(train_data.head())  # show the first 5 rows (a first glance at the data, as in Excel)

# 🧩 1.3 Build a dataset iterator
def custom_data_iter(texts, labels):
    for x, y in zip(texts, labels):
        yield x, y  # pair each text with its label, like a courier matching parcels to addresses

x = train_data[0].values[:]  # text content (the parcels)
y = train_data[1].values[:]  # class labels (the addresses)
Output:
                       0              1
0      还有双鸭山到淮阴的汽车票吗13号的   Travel-Query
1                从这里怎么回家   Travel-Query
2       随便播放一首专辑阁楼里的佛里的歌     Music-Play
3              给看一下墓王之王嘛  FilmTele-Play
4  我想看挑战两把s686打突变团竞的游戏视频     Video-Play
Python code:
# 🌱 1.4 Build the vocabulary (Word2Vec training)
from gensim.models.word2vec import Word2Vec
import numpy as np

# Like teaching a child to read: a 100-dim vector = 100 feature tags (height / hairstyle / colour).
# Note: each sample in x is a raw string, so gensim treats every character as a token —
# we are learning character-level vectors here.
w2v = Word2Vec(vector_size=100, min_count=3)  # characters appearing fewer than 3 times are dropped

w2v.build_vocab(x)  # build the vocabulary (teach the child the characters)
w2v.train(x, total_examples=w2v.corpus_count, epochs=20)  # train for 20 epochs (like rereading a text 20 times)

Output:

(2732848, 3663560)
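
`w2v.train()` returns a pair (effective words trained, total raw words processed), which is what the tuple above shows. A quick sanity check on the learned vectors (assuming the character 车 occurs often enough in the corpus to survive min_count=3):

print(w2v.wv.most_similar("车", topn=3))  # nearest characters in vector space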
Python code:
# 📌 Turn a text into a single vector (mean-of-word-vectors approach)
def average_vec(text):
    vec = np.zeros(100).reshape((1, 100))  # start from an empty 100-dim vector (an empty box)
    for word in text:
        try:
            vec += w2v.wv[word].reshape((1, 100))  # add each character's vector into the box (like stacking blocks)
        except KeyError:
            continue  # skip characters not in the vocabulary (like skipping unknown words)
    # Note: despite the name, this sums rather than averages — kept as in the original;
    # the linear classifier below can absorb the scale difference.
    return vec

# 📦 Convert every text into a vector and stack them into one matrix
x_vec = np.concatenate([average_vec(z) for z in x])  # shape: (num_samples, 100)
w2v.save('w2v_model.pkl')  # save the Word2Vec model (like keeping the dictionary on the shelf)

# The iterator must yield (raw text, label) pairs — collate_batch below does the vectorizing,
# so we pass x here rather than x_vec (fixing a mismatch in the original)
train_iter = custom_data_iter(x, y)
print(len(x), len(x_vec))  # number of raw samples vs number of vectors (they should match)

label_name = list(set(train_data[1].values[:]))  # class names (like delivery regions: Beijing / Shanghai / Guangzhou)

# 📦 Pipelines used when generating batches
text_pipeline = lambda x: average_vec(x)  # text → 100-dim vector (parcel packing)
label_pipeline = lambda x: label_name.index(x)  # class name → integer index (address → postcode)

print(text_pipeline("你在干嘛"))  # the sentence as 100 numbers
print(label_pipeline("Travel-Query"))  # the integer code for "Travel-Query"

Output:

12100
['Audio-Play', 'Other', 'Calendar-Query', 'Alarm-Update', 'Weather-Query', 'HomeAppliance-Control', 'Travel-Query', 'TVProgram-Play', 'FilmTele-Play', 'Radio-Listen', 'Video-Play', 'Music-Play']
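
One caveat: `set()` has no guaranteed ordering across Python runs, so the label-to-index mapping above can differ between sessions. A small tweak (not in the original) makes it reproducible:

label_name = sorted(set(train_data[1].values[:]))  # stable, reproducible label order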
Python code:
# 📦 Batch collation (like a parcel-sorting conveyor belt)
def collate_batch(batch):
    label_list, text_list = [], []
    for (_text, _label) in batch:
        label_list.append(label_pipeline(_label))  # collect the class indices
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.float32)  # text → tensor of shape (1, 100)
        text_list.append(processed_text)

    label_list = torch.tensor(label_list, dtype=torch.int64)
    text_list = torch.cat(text_list)  # stack into a (batch_size, 100) matrix (boxes into one big crate)

    return text_list.to(device), label_list.to(device)  # move to GPU/CPU

# DataLoader needs an indexable (map-style) dataset, so materialize the generator into a list
dataloader = DataLoader(list(train_iter), batch_size=8, shuffle=False, collate_fn=collate_batch)
# 8 samples per batch (like loading 8 parcels onto one truck)
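
# (Added sanity check, not in the original) Peek at one batch: with batch_size=8
# and 100-dim averaged vectors, we expect shapes [8, 100] and [8]
text_batch, label_batch = next(iter(dataloader))
print(text_batch.shape, label_batch.shape)  # torch.Size([8, 100]) torch.Size([8])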

# 🧱 2.1 Build the model (a text-classification network)
class TextClassificationModel(nn.Module):
    def __init__(self, num_class):
        super(TextClassificationModel, self).__init__()
        # one linear layer: 100-dim averaged vector → one score per class
        self.fc = nn.Linear(100, num_class)

    def forward(self, text):
        return self.fc(text)  # logits from the fully connected layer

# 🧪 2.2 Instantiate the model
num_class = len(label_name)  # number of classes (12 here)
vocab_size = 100000  # vocabulary size — unused leftover: this model has no embedding layer
em_size = 12         # embedding size — unused leftover; the vectors are actually 100-dim, so 12 is likely a typo
model = TextClassificationModel(num_class).to(device)  # move the model onto the device
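
# (Added sanity check, not in the original) A fake batch of 4 averaged vectors
# should map to one logit per class:
dummy = torch.randn(4, 100, device=device)
print(model(dummy).shape)  # torch.Size([4, 12]) when num_class == 12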

# 📈 2.3 Define the training and evaluation functions
# (the original snippet stopped short of the bookkeeping; the accumulation, logging,
# and return values below are filled in so the training loop further down actually runs)
def train(dataloader):
    model.train()  # switch to training mode (learning mode on)
    total_acc, train_loss, total_count = 0, 0, 0
    log_interval = 50
    for idx, (text, label) in enumerate(dataloader):
        predicted_label = model(text)  # forward pass
        optimizer.zero_grad()  # clear old gradients (wipe the scratch paper)
        loss = criterion(predicted_label, label)  # compute the loss
        loss.backward()  # backpropagate (retrace the solution steps)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)  # gradient clipping (guard against exploding gradients)
        optimizer.step()  # update the parameters (adjust the study strategy)
        # accumulate running accuracy and loss so progress can be logged
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        train_loss += loss.item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            print('|epoch{:d}|{:4d}/{:4d} batches|train_acc{:.3f} train_loss{:.5f}'.format(
                epoch, idx, len(dataloader), total_acc / total_count, train_loss / total_count))
            total_acc, train_loss, total_count = 0, 0, 0

def evaluate(dataloader):
    model.eval()  # switch to evaluation mode (exam mode)
    total_acc, total_loss, total_count = 0, 0, 0
    with torch.no_grad():  # no gradients needed during evaluation
        for idx, (text, label) in enumerate(dataloader):
            predicted_label = model(text)
            loss = criterion(predicted_label, label)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_loss += loss.item()
            total_count += label.size(0)
    return total_acc / total_count, total_loss / total_count  # mean accuracy and mean loss

# 📊 Split the dataset (80% training + 20% validation)
from torch.utils.data.dataset import random_split
train_iter = custom_data_iter(train_data[0].values[:], train_data[1].values[:])
train_dataset = list(train_iter)  # materialize into a list (line the waybills up in a queue)

train_size = int(len(train_dataset) * 0.8)  # 80% training set
valid_size = len(train_dataset) - train_size  # 20% validation set

# split the dataset (training area vs validation area)
split_train_, split_valid_ = random_split(train_dataset, [train_size, valid_size])

# create the data loaders (set up the sorting conveyor belts)
train_dataloader = DataLoader(split_train_, batch_size=64, shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(split_valid_, batch_size=64, shuffle=True, collate_fn=collate_batch)
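
# (Optional tweak, not in the original) pass a seeded generator to random_split
# to make the train/validation split reproducible across runs:
#   split_train_, split_valid_ = random_split(
#       train_dataset, [train_size, valid_size],
#       generator=torch.Generator().manual_seed(42))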

# ⚙️ 3.1 Training setup (not shown in the original post; values inferred from the
# logs below: lr starts at 4.0 and decays ×0.1, over 10 epochs)
EPOCHS = 10
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=4.0)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)
total_accu = None

# 🔥 3.2 The actual training run (like running a marathon)
for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader)  # one training epoch
    val_acc, val_loss = evaluate(valid_dataloader)  # validation accuracy and loss
    # learning-rate schedule (like adjusting your pace mid-race):
    # decay whenever validation accuracy stops improving
    if total_accu is not None and total_accu > val_acc:
        scheduler.step()  # slow down (learning-rate decay)
    else:
        total_accu = val_acc
    print('-' * 69)
    print('| epoch {:d} | time:{:.2f}s | valid_acc {:.3f} valid_loss {:.3f} | lr {:.6f}'.format(
        epoch, time.time() - epoch_start_time, val_acc, val_loss,
        optimizer.state_dict()['param_groups'][0]['lr']))
    print('-' * 69)

Output:

|epoch1|  50/ 152 batches|train_acc0.746 train_loss0.02132
|epoch1| 100/ 152 batches|train_acc0.825 train_loss0.01539
|epoch1| 150/ 152 batches|train_acc0.835 train_loss0.01567
---------------------------------------------------------------------
| epoch 1 | time:1.46s | valid_acc 0.726 valid_loss 0.038 | lr 4.000000
---------------------------------------------------------------------
|epoch2|  50/ 152 batches|train_acc0.841 train_loss0.01451
|epoch2| 100/ 152 batches|train_acc0.835 train_loss0.01556
|epoch2| 150/ 152 batches|train_acc0.842 train_loss0.01566
---------------------------------------------------------------------
| epoch 2 | time:1.30s | valid_acc 0.851 valid_loss 0.014 | lr 4.000000
---------------------------------------------------------------------
|epoch3|  50/ 152 batches|train_acc0.843 train_loss0.01494
|epoch3| 100/ 152 batches|train_acc0.851 train_loss0.01482
|epoch3| 150/ 152 batches|train_acc0.850 train_loss0.01337
---------------------------------------------------------------------
| epoch 3 | time:1.46s | valid_acc 0.848 valid_loss 0.015 | lr 4.000000
---------------------------------------------------------------------
|epoch4|  50/ 152 batches|train_acc0.881 train_loss0.00891
|epoch4| 100/ 152 batches|train_acc0.885 train_loss0.00821
|epoch4| 150/ 152 batches|train_acc0.897 train_loss0.00704
---------------------------------------------------------------------
| epoch 4 | time:1.47s | valid_acc 0.895 valid_loss 0.008 | lr 0.400000
---------------------------------------------------------------------
|epoch5|  50/ 152 batches|train_acc0.901 train_loss0.00628
|epoch5| 100/ 152 batches|train_acc0.891 train_loss0.00674
|epoch5| 150/ 152 batches|train_acc0.898 train_loss0.00653
---------------------------------------------------------------------
| epoch 5 | time:1.59s | valid_acc 0.887 valid_loss 0.007 | lr 0.400000
---------------------------------------------------------------------
|epoch6|  50/ 152 batches|train_acc0.900 train_loss0.00581
|epoch6| 100/ 152 batches|train_acc0.904 train_loss0.00586
|epoch6| 150/ 152 batches|train_acc0.904 train_loss0.00558
---------------------------------------------------------------------
| epoch 6 | time:1.64s | valid_acc 0.893 valid_loss 0.007 | lr 0.040000
---------------------------------------------------------------------
|epoch7|  50/ 152 batches|train_acc0.908 train_loss0.00543
|epoch7| 100/ 152 batches|train_acc0.902 train_loss0.00546
|epoch7| 150/ 152 batches|train_acc0.903 train_loss0.00591
---------------------------------------------------------------------
| epoch 7 | time:1.50s | valid_acc 0.894 valid_loss 0.007 | lr 0.004000
---------------------------------------------------------------------
|epoch8|  50/ 152 batches|train_acc0.903 train_loss0.00609
|epoch8| 100/ 152 batches|train_acc0.905 train_loss0.00517
|epoch8| 150/ 152 batches|train_acc0.906 train_loss0.00552
---------------------------------------------------------------------
| epoch 8 | time:1.47s | valid_acc 0.894 valid_loss 0.007 | lr 0.000400
---------------------------------------------------------------------
|epoch9|  50/ 152 batches|train_acc0.897 train_loss0.00593
|epoch9| 100/ 152 batches|train_acc0.907 train_loss0.00526
|epoch9| 150/ 152 batches|train_acc0.910 train_loss0.00556
---------------------------------------------------------------------
| epoch 9 | time:1.53s | valid_acc 0.894 valid_loss 0.007 | lr 0.000040
---------------------------------------------------------------------
|epoch10|  50/ 152 batches|train_acc0.912 train_loss0.00542
|epoch10| 100/ 152 batches|train_acc0.901 train_loss0.00574
|epoch10| 150/ 152 batches|train_acc0.901 train_loss0.00564
---------------------------------------------------------------------
| epoch 10 | time:1.49s | valid_acc 0.894 valid_loss 0.007 | lr 0.000004
---------------------------------------------------------------------
Model accuracy: 0.8938
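
The final accuracy line above is presumably produced by one more pass over the validation set, something like (a sketch, not shown in the original):

test_acc, test_loss = evaluate(valid_dataloader)
print('Model accuracy: {:.4f}'.format(test_acc))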
Python code:
# 🎯 Predict on a chosen text
def predict(text, text_pipeline):
    with torch.no_grad():
        # text → (1, 100) tensor, moved to the model's device (fixes a CPU/GPU mismatch in the original)
        text = torch.tensor(text_pipeline(text), dtype=torch.float32).to(device)
        print(text.shape)  # torch.Size([1, 100])
        output = model(text)  # forward pass
        return output.argmax(1).item()  # index of the highest-scoring class

ex_text_str = "还有双鸭山到淮阴的汽车票吗13号的"
print("The category of this text is: %s" % label_name[predict(ex_text_str, text_pipeline)])

Output:

torch.Size([1, 100])
The category of this text is: Travel-Query
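
For later inference in a fresh session, the saved Word2Vec model can be reloaded (a minimal sketch; the path comes from the save call earlier):

from gensim.models.word2vec import Word2Vec

w2v = Word2Vec.load('w2v_model.pkl')  # restore the character vectors used by text_pipeline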