Week N4: Text Embeddings in NLP

This week's task:

Load the .txt file from Week N1 and complete word embedding with both EmbeddingBag and Embedding.

Embedding

Custom Dataset class

python
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

# Custom Dataset: wraps the list of token-id tensors and their labels
class MyDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        return text, label

Define the padding function

python
# Custom padding (collate) function: pad every sequence in the batch to the length of the longest one
def collate_batch(batch):
    texts, labels = zip(*batch)
    max_len = max(len(text) for text in texts)
    padded_texts = [F.pad(text, (0, max_len - len(text)), value=0) for text in texts]
    padded_texts = torch.stack(padded_texts)
    labels = torch.tensor(labels, dtype=torch.float).unsqueeze(1)
    return padded_texts, labels
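
The workhorse here is F.pad: with a pad of (0, k) it appends k zeros to the right of a 1-D tensor. A quick standalone check (a sketch, not part of the original code), using one of the toy sequences defined below:

python
import torch
import torch.nn.functional as F

x = torch.tensor([2, 2, 2], dtype=torch.long)
print(F.pad(x, (0, 1), value=0))  # tensor([2, 2, 2, 0])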

Prepare the data and DataLoader

python
# Prepare the toy data and the DataLoader
text_data = [
    torch.tensor([1, 1, 1, 1], dtype=torch.long),
    torch.tensor([2, 2, 2], dtype=torch.long),
    torch.tensor([3, 3], dtype=torch.long)
]

labels = torch.tensor([4, 5, 6], dtype=torch.float)

my_dataset = MyDataset(text_data, labels)
data_loader = DataLoader(my_dataset, batch_size=2, shuffle=True, collate_fn=collate_batch)

for batch in data_loader:
    print(batch)

Define the model

python
# Define the model: an embedding lookup, mean pooling over the sequence, then a linear layer
class EmbeddingModel(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super(EmbeddingModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.fc = nn.Linear(embed_dim, 1)

    def forward(self, text):
        print('embedding input text:', text)
        print('embedding input text shape:', text.shape)
        embedding = self.embedding(text)
        embedding_mean = embedding.mean(dim=1)  # average the word vectors along the sequence dimension
        print('embedding output shape:', embedding_mean.shape)
        return self.fc(embedding_mean)
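
A minimal standalone sketch of the shape flow this model relies on (dimensions chosen arbitrarily): token ids of shape (batch, seq_len) become (batch, seq_len, embed_dim) after the lookup, and mean pooling over dim=1 collapses them to (batch, embed_dim):

python
import torch
from torch import nn

emb = nn.Embedding(10, 6)                                            # 10 ids, 6-dimensional vectors
ids = torch.tensor([[1, 2, 3, 0], [4, 5, 0, 0]], dtype=torch.long)   # a padded (2, 4) batch
vectors = emb(ids)                                                   # shape (2, 4, 6)
pooled = vectors.mean(dim=1)                                         # shape (2, 6)
print(vectors.shape, pooled.shape)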

Train the model

python
vocab_size = 10
embed_dim = 6

model = EmbeddingModel(vocab_size, embed_dim)

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

for epoch in range(1):
    for batch in data_loader:
        texts, labels = batch
        outputs = model(texts)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

print(f'Epoch {epoch+1}, Loss: {loss.item()}')
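
Two details worth keeping in mind: strictly speaking, BCEWithLogitsLoss treats its targets as probabilities in [0, 1], so the toy labels 4, 5, 6 only serve to exercise the pipeline; and the model outputs raw logits, so a sigmoid is needed to read a prediction as a score. A minimal inference sketch, reusing the model trained above:

python
model.eval()
with torch.no_grad():
    sample = torch.tensor([[1, 1, 1, 1]], dtype=torch.long)  # one toy sequence as a batch of size 1
    print(torch.sigmoid(model(sample)))                      # logit squashed into a (0, 1) score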

EmbeddingBag

The preceding steps (dataset, padding function, and DataLoader) are the same as above.

Define the model

python
# Define the model: nn.EmbeddingBag fuses the lookup and the mean pooling into a single step
class EmbeddingBagModel(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super(EmbeddingBagModel, self).__init__()
        self.embedding_bag = nn.EmbeddingBag(vocab_size, embed_dim, mode='mean')
        self.fc = nn.Linear(embed_dim, 1)

    def forward(self, text, offsets):
        print('embedding_bag input text:', text)
        print('embedding_bag input text shape:', text.shape)
        embedded = self.embedding_bag(text, offsets)
        print('embedding_bag output shape:', embedded.shape)
        return self.fc(embedded)
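
nn.EmbeddingBag accepts two input layouts: a 1-D tensor of concatenated token ids together with an offsets tensor marking where each bag starts, or a 2-D (batch, seq_len) tensor with no offsets, in which case every row is one bag. A small standalone sketch of both (sizes chosen arbitrarily):

python
import torch
from torch import nn

bag = nn.EmbeddingBag(10, 6, mode='mean')

# 1-D ids + offsets: three bags of lengths 4, 3, 2 starting at positions 0, 4, 7
flat = torch.tensor([1, 1, 1, 1, 2, 2, 2, 3, 3], dtype=torch.long)
offsets = torch.tensor([0, 4, 7])
print(bag(flat, offsets).shape)  # torch.Size([3, 6])

# 2-D ids, no offsets: each row is one bag
rows = torch.tensor([[1, 2, 3], [4, 5, 6]], dtype=torch.long)
print(bag(rows).shape)           # torch.Size([2, 6])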

Train the model

python
vocab_size = 10
embed_dim = 6

model = EmbeddingBagModel(vocab_size, embed_dim)

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

for epoch in range(1):
    for batch in data_loader:
        # unpack the collated batch back into per-sample rows
        texts, labels = zip(*batch)

        # offsets mark where each sample starts in the flattened id tensor;
        # the rows are already padded, so the padding ids end up inside each bag
        offsets = [0] + [len(text) for text in texts[:-1]]
        offsets = torch.tensor(offsets).cumsum(dim=0)
        texts = torch.cat(texts)
        labels = torch.stack(labels)  # each label already has shape (1,), so stacking gives (batch, 1)

        outputs = model(texts, offsets)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch+1}, Loss: {loss.item()}')
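
Because data_loader still uses the padding collate function, the padding ids are averaged into each bag. A padding-free alternative, sketched below as a hypothetical collate function (not part of the original code), flattens the raw sequences and builds the offsets directly, which is where EmbeddingBag actually saves the padding-related work:

python
# Hypothetical collate function for EmbeddingBag: no padding, 1-D ids plus offsets
def collate_for_embedding_bag(batch):
    texts, labels = zip(*batch)
    lengths = torch.tensor([len(t) for t in texts])
    offsets = torch.cat([torch.zeros(1, dtype=torch.long), lengths.cumsum(dim=0)[:-1]])
    flat = torch.cat(texts)                    # all token ids back to back
    labels = torch.stack(labels).unsqueeze(1)  # (batch, 1) targets
    return flat, offsets, labels

bag_loader = DataLoader(my_dataset, batch_size=2, shuffle=True,
                        collate_fn=collate_for_embedding_bag)
# each batch then feeds the model as: outputs = model(flat, offsets)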

Embedding the task file (任务文件.txt)

Embedding

python
import torch
from torch import nn
import torch.nn.functional as F 
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import jieba
import numpy as np

# Custom padding (collate) function
def collate_batch(batch):
    texts, labels = zip(*batch)
    max_len = max(len(text) for text in texts)
    padded_texts = [F.pad(text, (0, max_len - len(text)), value=0) for text in texts]
    padded_texts = torch.stack(padded_texts)
    labels = torch.tensor(labels, dtype=torch.float).unsqueeze(1)
    return padded_texts, labels

# Read the text content from the local .txt file
with open("F:/365data/N1/任务文件.txt", 'r', encoding='utf-8') as file:
    texts1 = [line.strip() for line in file]

# Tokenize with jieba
tokenized_texts = [list(jieba.cut(text)) for text in texts1]

# Build the vocabulary (index 0 is reserved for padding)
word_index = {}
index_word = {}
for i, word in enumerate(set(word for text in tokenized_texts for word in text), start=1):
    word_index[word] = i
    index_word[i] = word

# Vocabulary size
vocab_size = len(word_index) + 1  # +1 accounts for the padding index 0

# Convert the texts into index sequences
texts = [[word_index[word] for word in text] for text in tokenized_texts]

# Manually assign labels
# Assume the first line is labeled 1.0 and the second line 2.0
labels = [1.0, 2.0]

# Custom Dataset class
class MyDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        return torch.tensor(text, dtype=torch.long), torch.tensor(label, dtype=torch.float)

# Create the dataset
my_dataset = MyDataset(texts, labels)

# Create the DataLoader
data_loader = DataLoader(my_dataset, batch_size=2, shuffle=True, collate_fn=collate_batch)

# Print the batches produced by the DataLoader
for batch in data_loader:
    texts, labels = batch
    print("texts:", texts)
    print("Labels:", labels)

# Define the model
class EmbeddingModel(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super(EmbeddingModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.fc = nn.Linear(embed_dim, 1)

    def forward(self, text):
        print('embedding input text:', text)
        print('embedding input text shape:', text.shape)
        embedding = self.embedding(text)
        embedding_mean = embedding.mean(dim=1)
        print('embedding output shape:', embedding_mean.shape)
        return self.fc(embedding_mean)

embed_dim = 6

model = EmbeddingModel(vocab_size, embed_dim)

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

for epoch in range(1):
    for batch in data_loader:
        texts, labels = batch
        outputs = model(texts)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

print(f'Epoch {epoch+1}, Loss: {loss.item()}')
output
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\admin\AppData\Local\Temp\jieba.cache
Loading model cost 0.566 seconds.
Prefix dict has been built successfully.
texts: tensor([[22, 45, 69, 23, 24, 70, 73, 34, 25, 75, 21, 52, 78, 62, 64, 21, 10, 56,
         34, 25, 75, 21,  4, 42, 47, 27, 35, 32, 54, 16, 36,  7, 83, 24, 74, 80,
          7, 81,  4, 15, 51, 17, 24, 67, 81,  4, 15, 51, 46, 56, 79, 24, 32, 54,
         44,  8, 82, 66,  4, 24, 12, 49, 31, 71,  6, 59, 56, 65, 24, 38, 41, 54,
          4, 20, 24, 58, 40, 60,  4, 34, 25, 75, 21, 68],
        [85, 86,  4, 11, 27, 84, 57, 33,  4, 14,  1, 56, 65, 24, 38,  7,  5, 41,
         54,  4, 23, 24, 58,  3, 17,  2, 77, 63, 72, 19, 55, 37, 41, 54, 56, 18,
         24, 69, 11, 49,  7, 23, 24,  8, 28, 30, 76, 48, 50, 61, 39, 54, 44, 49,
          0, 31, 71,  6, 59, 24,  9, 13, 29, 59, 30, 27, 12, 49,  4, 43, 12, 53,
         26,  4, 56,  0,  0,  0,  0,  0,  0,  0,  0,  0]])
Labels: tensor([[2.],
        [1.]])
embedding input text: tensor([[22, 45, 69, 23, 24, 70, 73, 34, 25, 75, 21, 52, 78, 62, 64, 21, 10, 56,
         34, 25, 75, 21,  4, 42, 47, 27, 35, 32, 54, 16, 36,  7, 83, 24, 74, 80,
          7, 81,  4, 15, 51, 17, 24, 67, 81,  4, 15, 51, 46, 56, 79, 24, 32, 54,
         44,  8, 82, 66,  4, 24, 12, 49, 31, 71,  6, 59, 56, 65, 24, 38, 41, 54,
          4, 20, 24, 58, 40, 60,  4, 34, 25, 75, 21, 68],
        [85, 86,  4, 11, 27, 84, 57, 33,  4, 14,  1, 56, 65, 24, 38,  7,  5, 41,
         54,  4, 23, 24, 58,  3, 17,  2, 77, 63, 72, 19, 55, 37, 41, 54, 56, 18,
         24, 69, 11, 49,  7, 23, 24,  8, 28, 30, 76, 48, 50, 61, 39, 54, 44, 49,
          0, 31, 71,  6, 59, 24,  9, 13, 29, 59, 30, 27, 12, 49,  4, 43, 12, 53,
         26,  4, 56,  0,  0,  0,  0,  0,  0,  0,  0,  0]])
embedding input text shape: torch.Size([2, 84])
embedding output shape: torch.Size([2, 6])
Epoch 1, Loss: 0.5101226568222046
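
One detail worth flagging: even with index 0 reserved for padding, nn.Embedding still gives it a trainable random vector, and the padded positions pull each sequence mean toward it. A hedged tweak (not in the original code) is to pass padding_idx=0, which pins that row to zero and excludes it from gradient updates; the mean over dim=1 still divides by the padded length, so it remains an approximation unless the padding is masked out.

python
# Assumed tweak: keep the padding row at zero and frozen
embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
print(embedding(torch.tensor([0])).abs().sum().item())  # 0.0 -- the padding row is all zeros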

EmbeddingBag

python
import torch
from torch import nn
import torch.nn.functional as F 
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import jieba
import numpy as np

# Custom padding (collate) function
def collate_batch(batch):
    texts, labels = zip(*batch)
    max_len = max(len(text) for text in texts)
    padded_texts = [F.pad(text, (0, max_len - len(text)), value=0) for text in texts]
    padded_texts = torch.stack(padded_texts)
    labels = torch.tensor(labels, dtype=torch.float).unsqueeze(1)
    return padded_texts, labels

# Read the text content from the local .txt file
with open("F:/365data/N2/任务文件.txt", 'r', encoding='utf-8') as file:
    texts1 = [line.strip() for line in file]

# Tokenize with jieba
tokenized_texts = [list(jieba.cut(text)) for text in texts1]

# Build the vocabulary (index 0 is reserved for padding)
word_index = {}
index_word = {}
for i, word in enumerate(set(word for text in tokenized_texts for word in text), start=1):
    word_index[word] = i
    index_word[i] = word

# Vocabulary size
vocab_size = len(word_index) + 1  # +1 accounts for the padding index 0

# Convert the texts into index sequences
texts = [[word_index[word] for word in text] for text in tokenized_texts]

# Manually assign labels
# Assume the first line is labeled 1.0 and the second line 2.0
labels = [1.0, 2.0]

# Custom Dataset class
class MyDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        return torch.tensor(text, dtype=torch.long), torch.tensor(label, dtype=torch.float)

# Create the dataset
my_dataset = MyDataset(texts, labels)

# Create the DataLoader
data_loader = DataLoader(my_dataset, batch_size=2, shuffle=True, collate_fn=collate_batch)

# Print the batches produced by the DataLoader
for batch in data_loader:
    texts, labels = batch
    print("texts:", texts)
    print("Labels:", labels)

# Define the model
class EmbeddingBagModel(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super(EmbeddingBagModel, self).__init__()
        self.embedding_bag = nn.EmbeddingBag(vocab_size, embed_dim, mode='mean')
        self.fc = nn.Linear(embed_dim, 1)

    def forward(self, text):
        print('embedding bag input text:', text)
        print('embedding bag input text shape:', text.shape)
        embedding = self.embedding_bag(text)  # 2-D input without offsets: each row is treated as one bag
        print('embedding bag output shape:', embedding.shape)
        return self.fc(embedding)

embed_dim = 6

model = EmbeddingBagModel(vocab_size, embed_dim)

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

for epoch in range(1):
    for batch in data_loader:
        texts, labels = batch
        outputs = model(texts)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

print(f'Epoch {epoch+1}, Loss: {loss.item()}')
output
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\admin\AppData\Local\Temp\jieba.cache
Loading model cost 0.490 seconds.
Prefix dict has been built successfully.
texts: tensor([[64, 17,  6, 48, 78, 52, 56,  8,  4, 49, 35, 19, 43, 41, 30, 35, 59, 34,
          8,  4, 49, 35, 67, 38, 26, 50,  7, 21, 74, 70, 24, 84, 20, 78, 72, 37,
         84, 82, 67, 73, 54, 22, 78, 57, 82, 67, 73, 54,  2, 34,  3, 78, 21, 74,
         44, 66, 16, 11, 67, 78, 23, 63, 36, 85, 12,  0, 34, 79, 78, 13, 76, 74,
         67, 42, 78, 58, 75, 86, 67,  8,  4, 49, 35, 80],
        [61, 28, 67, 83, 50, 15, 25, 60, 67, 77, 18, 34, 79, 78, 13, 84, 81, 76,
         74, 67, 48, 78, 58, 14, 22, 69,  5, 10, 32, 45, 46, 51, 76, 74, 34, 68,
         78,  6, 83, 63, 84, 48, 78, 66,  9, 53, 31, 39, 40, 62, 33, 74, 44, 63,
         47, 36, 85, 12,  0, 78,  1, 65, 55,  0, 53, 50, 23, 63, 67, 27, 23, 71,
         29, 67, 34,  0,  0,  0,  0,  0,  0,  0,  0,  0]])
Labels: tensor([[2.],
        [1.]])
embedding bag input text: tensor([[64, 17,  6, 48, 78, 52, 56,  8,  4, 49, 35, 19, 43, 41, 30, 35, 59, 34,
          8,  4, 49, 35, 67, 38, 26, 50,  7, 21, 74, 70, 24, 84, 20, 78, 72, 37,
         84, 82, 67, 73, 54, 22, 78, 57, 82, 67, 73, 54,  2, 34,  3, 78, 21, 74,
         44, 66, 16, 11, 67, 78, 23, 63, 36, 85, 12,  0, 34, 79, 78, 13, 76, 74,
         67, 42, 78, 58, 75, 86, 67,  8,  4, 49, 35, 80],
        [61, 28, 67, 83, 50, 15, 25, 60, 67, 77, 18, 34, 79, 78, 13, 84, 81, 76,
         74, 67, 48, 78, 58, 14, 22, 69,  5, 10, 32, 45, 46, 51, 76, 74, 34, 68,
         78,  6, 83, 63, 84, 48, 78, 66,  9, 53, 31, 39, 40, 62, 33, 74, 44, 63,
         47, 36, 85, 12,  0, 78,  1, 65, 55,  0, 53, 50, 23, 63, 67, 27, 23, 71,
         29, 67, 34,  0,  0,  0,  0,  0,  0,  0,  0,  0]])
embedding bag input text shape: torch.Size([2, 84])
embedding bag output shape: torch.Size([2, 6])
Epoch 1, Loss: 0.6875726580619812
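
The index_word dictionary built above is never used; it comes in handy as a sanity check, decoding an index sequence back into its jieba tokens. A small sketch reusing the variables from the script above (index 0 is the padding slot and maps to no word):

python
sample_ids, _ = my_dataset[0]                                            # first line of the file
decoded = [index_word[i] for i in sample_ids.tolist() if i in index_word]
print(decoded[:10])                                                      # first ten recovered tokens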

Summary

  • Both Embedding and EmbeddingBag map discrete tokens into a low-dimensional continuous vector space while preserving the semantic relationships between words.
  • EmbeddingBag is an optimized variant of Embedding that fuses the lookup and the pooling, reducing computation and memory overhead (see the sketch after this list).
  • When embedding the .txt file, the text first has to be loaded, tokenized with jieba (as in weeks N1 and N2), and converted into index sequences.
  • The sequences are then wrapped in a Dataset, loaded through a DataLoader, and used to train the model.
  • Note that after loading the text, the labels have to be set manually.
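
To make the first two points concrete, here is a small sketch (vocabulary size and dimension chosen arbitrarily) showing that nn.EmbeddingBag with mode='mean' produces the same result as an nn.Embedding lookup followed by a mean, once the two share weights:

python
import torch
from torch import nn

torch.manual_seed(0)
emb = nn.Embedding(10, 6)
bag = nn.EmbeddingBag(10, 6, mode='mean')
bag.weight = emb.weight                                   # share one weight matrix

tokens = torch.tensor([[1, 2, 3, 4]], dtype=torch.long)   # a single bag of four tokens
manual = emb(tokens).mean(dim=1)                          # lookup, then average: shape (1, 6)
fused = bag(tokens)                                       # fused lookup + average: shape (1, 6)
print(torch.allclose(manual, fused))                      # True (up to floating-point error)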