PyTorch Implementation of the N-gram Algorithm
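
The script below trains a trigram (n = 3) language model: given the previous two words (w_i-2, w_i-1), a small neural network learns to predict the next word w_i, i.e. it approximates P(w_i | w_i-2, w_i-1).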

Code implementation

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

def tri_gramizer(test_sentence):
    # Convert the word sequence into a list of tuples,
    # each of the form ([word_i-2, word_i-1], target word)
    trigrams = [ ([test_sentence[i], test_sentence[i+1]], test_sentence[i+2]) for i in range(len(test_sentence) - 2) ]

    # Build the vocabulary for the sonnet.
    # set() removes duplicate words; sorting makes the word order (and hence the
    # indices) deterministic across runs, so a model saved in one run can be
    # reloaded correctly in another.
    vocab = sorted(set(test_sentence))
    # Build the dictionary, which additionally assigns each word an index
    word_to_ix = { word: i for i, word in enumerate(vocab) }
    
    print('The vocab length:', len(vocab))
    
    return trigrams, vocab, word_to_ix
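
# For example, with the sonnet defined in __main__ below, the first three
# tokens are "When", "forty" and "winters", so one would expect
#   trigrams[0] == (['When', 'forty'], 'winters')
# and word_to_ix to map each unique token to an index in [0, len(vocab)).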

class NGramLanguageModeler(nn.Module):
    # Initialization requires: the vocabulary size, the embedding dimension, and the context length
    def __init__(self, vocab_size, embedding_dim, context_size):
        # Inherits from nn.Module, so call the parent class's __init__ as usual
        super(NGramLanguageModeler, self).__init__()
        # Word embedding module
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # Linear layer 1
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)
        # Linear layer 2; the hidden layer size is 128
        self.linear2 = nn.Linear(128, vocab_size)

    # The overridden forward-pass method.
    # As long as the forward pass is defined correctly,
    # PyTorch can perform backpropagation automatically.
    def forward(self, inputs):
        # Embed the inputs and flatten the result into a single row vector
        embeds = self.embeddings(inputs).view((1, -1))
        # Pass the embeddings through linear layer 1, then apply the ReLU nonlinearity
        out = F.relu(self.linear1(embeds))
        # Pass through linear layer 2
        out = self.linear2(out)
        # Return the raw logits (one score per vocabulary word);
        # the CrossEntropyLoss used in train() applies log_softmax internally.
        return out
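
# Shape sketch for a single (context, target) pair, assuming CONTEXT_SIZE = 2
# and EMBEDDING_DIM = 10 as configured below:
#   inputs:  LongTensor of shape (2,)        -- two word indices
#   embeds:  (1, 20) after .view((1, -1))    -- the two embeddings concatenated
#   linear1 + ReLU: (1, 20) -> (1, 128)
#   linear2: (1, 128) -> (1, vocab_size)     -- raw logits, one per word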

def train(trigrams, vocab, word_to_ix):
    print('Training...')
    
    # Context size,
    # i.e. the previous two words
    CONTEXT_SIZE = 2
    # Embedding dimension
    EMBEDDING_DIM = 10

    # Track the total loss per epoch
    losses = []
    # The loss function is cross-entropy loss (Cross Entropy Loss)
    loss_function = nn.CrossEntropyLoss()  # NLLLoss replaced by CrossEntropyLoss
    # Instantiate our model, passing in:
    # the vocabulary size, the embedding dimension, and the context length
    model = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
    # The optimizer is stochastic gradient descent with a learning rate of 0.001
    optimizer = optim.SGD(model.parameters(), lr=0.001)

    for epoch in range(1000):
        print(f'epoch: {epoch}')
        total_loss = 0
        # Loop over the (context, target) pairs,
        # e.g. context ['When', 'forty'] with target 'winters'
        for context, target in trigrams:

            # Step 1: prepare the data.
            # Convert the context, e.g. "['When', 'forty']",
            # into indices, e.g. [68, 15].
            # Wrapping in a PyTorch Variable is no longer needed; tensors support autograd by default.
            context_idxs = torch.LongTensor([word_to_ix[w] for w in context])

            # Step 2: zero the gradients so the gradients from the previous step don't accumulate
            model.zero_grad()

            # Step 3: run the network's forward pass to obtain the logits
            out = model(context_idxs)

            # Step 4: compute the loss;
            # passing autograd.Variable is no longer needed
            loss = loss_function(out, torch.LongTensor([word_to_ix[target]]))

            # Step 5: backpropagate and update the parameters
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        losses.append(total_loss)

    print('Finished')
    # Save the model's state dict
    torch.save(model.state_dict(), 'model_state_dict.pth')
    return model, losses
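
# Note: nn.CrossEntropyLoss expects raw logits and applies log_softmax internally.
# An equivalent setup (closer to the NLLLoss variant mentioned above) would be,
# roughly:
#   return F.log_softmax(self.linear2(out), dim=1)   # in forward()
#   loss_function = nn.NLLLoss()                      # in train()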

def plot_losses(losses):
    # Plot the total training loss per epoch
    plt.figure()
    plt.plot(losses)
    plt.show()


def predict(input_data, model, vocab, word_to_ix):
    # Predict the next word from the previous two words
    first_word, second_word = input_data
    if first_word not in word_to_ix or second_word not in word_to_ix:
        print('Unknown word')
        return '-1'
    # Convert the two context words to their indices
    input_tensor = torch.LongTensor([word_to_ix[first_word], word_to_ix[second_word]])
    # The vocabulary index with the largest logit is the predicted word
    predict_idx = torch.argmax(model(input_tensor)).item()
    predict_word = vocab[predict_idx]
    print('input words:', first_word, second_word)
    print('predicted word:', predict_word)
    return predict_word
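
# Note: model(input_tensor) returns raw logits; argmax over logits gives the
# same prediction as argmax over softmax probabilities, so no explicit softmax
# is needed at inference time.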

if __name__ == '__main__':
    # The data we use is a Shakespeare sonnet (a 14-line poem)
    test_sentence = """When forty winters shall besiege thy brow,
    And dig deep trenches in thy beauty's field,
    Thy youth's proud livery so gazed on now,
    Will be a totter'd weed of small worth held:
    Then being asked, where all thy beauty lies,
    Where all the treasure of thy lusty days;
    To say, within thine own deep sunken eyes,
    Were an all-eating shame, and thriftless praise.
    How much more praise deserv'd thy beauty's use,
    If thou couldst answer 'This fair child of mine
    Shall sum my count, and make my old excuse,'
    Proving his beauty by succession thine!
    This were to be new made when thou art old,
    And see thy blood warm when thou feel'st it cold.""".split()    # split on whitespace


    trigrams, vocab, word_to_ix = tri_gramizer(test_sentence)

    # model, losses = train(trigrams, vocab, word_to_ix)
    # plot_losses(losses)
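    # Typical workflow: on the first run, uncomment the two lines above to train
    # the model and save model_state_dict.pth; on later runs leave them commented
    # out and simply reload the trained weights below.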
    
    # Context size,
    # i.e. the previous two words
    CONTEXT_SIZE = 2
    # Embedding dimension
    EMBEDDING_DIM = 10
    model = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE)
    model.load_state_dict(torch.load('model_state_dict.pth'))
    
    input_data = ['When', 'forty']
    word = predict(input_data, model, vocab, word_to_ix)

    

Reference article: 深度学习新手必学:使用 Pytorch 搭建一个 N-Gram 模型 (a beginner's tutorial on building an N-Gram model with PyTorch)
