Variants of the Transformer Architecture

| Variant | Core capability | Representative models | Current status |
| --- | --- | --- | --- |
| Encoder-only | Text understanding (classification, NER, ...) | BERT, RoBERTa, ALBERT | Gradually being replaced (now used mainly for pure understanding tasks) |
| Encoder-Decoder (the original Transformer) | Sequence-to-sequence transformation (translation, summarization, ...) | T5, BART | Used in specific scenarios such as translation and summarization |
| Decoder-only | Sequence generation (dialogue, writing, ...) | GPT, Llama, Claude, DeepSeek | The mainstream choice (the default for general-purpose LLMs) |

Structurally, a Decoder-only model is almost identical to an Encoder-only one; the key difference is the causal (triangular) mask. The strengths of the Decoder-only design are its powerful autoregressive generation, excellent scalability, and task generality: it covers general-purpose generation needs, the model is structurally simple and easy to train, and it trains on unlabeled text in a self-supervised way.
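To make that masking difference concrete, here is a minimal standalone sketch of the causal mask a decoder-only model adds to its attention scores (position i may only attend to positions <= i); an encoder-only model simply omits it:

```cpp
#include <torch/torch.h>
#include <iostream>
#include <limits>

int main()
{
    // 4x4 causal mask: 0 on and below the diagonal (visible),
    // -inf above it (future positions are blocked).
    auto mask = torch::triu(torch::ones({ 4, 4 }), /*diagonal=*/1);
    mask = mask.masked_fill(mask == 1, -std::numeric_limits<float>::infinity());
    std::cout << mask << std::endl;
}
```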
Hand-Writing the Decoder-only Architecture
Having hand-written the full Transformer in the previous chapters, the decoder-only architecture is comparatively simple.
1. Code optimization:
- Merge the position encoding and the token embedding into a single class.
```cpp
class EmbeddingWithPositionImpl : public torch::nn::Module
{
public:
EmbeddingWithPositionImpl(int64_t d_model, int64_t vocab_size, int64_t max_len)
{
m_iDmodel = d_model;
m_iMaxlen = max_len;
m_posEncode = torch::zeros({ m_iMaxlen, m_iDmodel }, torch::kFloat32);
Encoding();
emb_dropout = register_module("emb_dropout1", torch::nn::Dropout(dropoutp)); // dropoutp: global dropout probability defined elsewhere in the project
m_embPosition = register_module("m_embPosition", torch::nn::Embedding(torch::nn::EmbeddingOptions(max_len, m_iDmodel)));
m_emb = register_module("m_emb", torch::nn::Embedding(torch::nn::EmbeddingOptions(vocab_size, m_iDmodel)));
register_buffer("posEncode", m_posEncode);
}
torch::Tensor forward(torch::Tensor x)
{
// x: [batch, seq]
x = m_emb->forward(x) * std::sqrt(m_iDmodel);
// x: [batch, seq, dim]
auto seq = x.size(1);
// Add the precomputed sinusoidal encoding for the first `seq` positions.
x = x + m_posEncode.slice(1, 0, seq);
// Alternative: learned positions via m_embPosition:
// auto pos = torch::arange(seq, x.device()).unsqueeze(0).expand({ x.size(0), seq });
// x = x + m_embPosition->forward(pos);
x = emb_dropout->forward(x);
return x;
}
private:
void Encoding()
{
auto pos = torch::arange(0, m_iMaxlen, torch::kFloat32).reshape({ m_iMaxlen, 1 });
auto den_indices = torch::arange(0, m_iDmodel, 2, torch::kFloat32);
auto den = torch::exp(-den_indices * std::log(10000.0f) / m_iDmodel);
m_posEncode.index_put_({ torch::indexing::Slice(), torch::indexing::Slice(0, m_iDmodel, 2) }, torch::sin(pos * den));
m_posEncode.index_put_({ torch::indexing::Slice(), torch::indexing::Slice(1, m_iDmodel, 2) }, torch::cos(pos * den));
m_posEncode.unsqueeze_(0);
// m_posEncode: [1, max_len, dim] after unsqueeze
}
torch::nn::Dropout emb_dropout{ nullptr };
torch::nn::Embedding m_emb{ nullptr };
torch::nn::Embedding m_embPosition{ nullptr };
torch::Tensor m_posEncode;
int64_t m_iDmodel = 0;
int64_t m_iMaxlen = 0;
};
TORCH_MODULE(EmbeddingWithPosition);
```
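The Encoding() helper fills m_posEncode with the standard sinusoidal position encoding from the original Transformer paper:

$$PE_{(pos,\,2i)} = \sin\!\left(\frac{pos}{10000^{2i/d_{model}}}\right), \qquad PE_{(pos,\,2i+1)} = \cos\!\left(\frac{pos}{10000^{2i/d_{model}}}\right)$$

The code evaluates the denominator in log space, torch::exp(-den_indices * log(10000) / d_model), which is numerically more stable than raising 10000 to a power directly.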
2. Code optimization:
- Merge the feed-forward network into the decoder layer. The version below also registers the attention and layer norms and adds the forward pass; XMultiHeadAttention is the hand-written attention from the previous chapter, and its constructor/forward signatures are assumed here.
```cpp
class DeOnlyLayerImpl : public torch::nn::Module
{
public:
DeOnlyLayerImpl(int64_t dmodel, int64_t nheads, int64_t feedforward)
{
// XMultiHeadAttention comes from the previous chapter; its
// constructor and forward(x, mask) signatures are assumed here.
m_attention = register_module("attention", XMultiHeadAttention(dmodel, nheads));
m_fFeedForward = register_module("feedForward", torch::nn::Sequential(torch::nn::Linear(dmodel, feedforward),
torch::nn::GELU(), torch::nn::Linear(feedforward, dmodel)));
m_norm1 = register_module("norm1", torch::nn::LayerNorm(torch::nn::LayerNormOptions({ dmodel })));
m_norm2 = register_module("norm2", torch::nn::LayerNorm(torch::nn::LayerNormOptions({ dmodel })));
}
torch::Tensor forward(torch::Tensor x, const torch::Tensor& mask)
{
// Post-norm residual blocks: masked self-attention, then feed-forward.
x = m_norm1->forward(x + m_attention->forward(x, mask));
x = m_norm2->forward(x + m_fFeedForward->forward(x));
return x;
}
//torch::nn::MultiheadAttention m_attention{ nullptr };
XMultiHeadAttention m_attention{ nullptr };
torch::nn::Sequential m_fFeedForward{ nullptr };
torch::nn::LayerNorm m_norm1{ nullptr }, m_norm2{ nullptr };
};
TORCH_MODULE(DeOnlyLayer);
```
- The decoder-only implementation:
```cpp
class DecodersOnlyImpl : public torch::nn::Module
{
public:
DecodersOnlyImpl(const DeOnlyOptions& inOpt)
{
fc = register_module("fc", torch::nn::Linear(inOpt.dmodel, inOpt.vocab_size));
m_emb = register_module("m_emb", EmbeddingWithPosition(inOpt.dmodel, inOpt.vocab_size, inOpt.max_len));
moduleLayers = register_module("DecoderLayers", torch::nn::ModuleList());
m_option = inOpt;
for (int i = 0; i < inOpt.layers; i++)
{
moduleLayers->push_back(DeOnlyLayer(inOpt.dmodel, inOpt.head, inOpt.ffn));
// The first attempt used the built-in encoder layer instead (see the pitfall below):
// torch::nn::TransformerEncoderLayerOptions opt(inOpt.dmodel, inOpt.head);
// opt.dim_feedforward(inOpt.ffn);
// moduleLayers->push_back(torch::nn::TransformerEncoderLayer(opt));
}
}
auto forward(torch::Tensor x)
{
// x: [batch, seq]
int64_t batch = x.size(0);
int64_t seq = x.size(1);
auto src_mask = generate_square_subsequent_mask(seq).to(x.device());
auto tgt_key_padding_mask = (x == gPad).to(torch::kBool).to(x.device()); // padding mask (consumed by the TransformerEncoderLayer variant)
x = m_emb->forward(x);
x = x.permute({ 1, 0, 2 }); // [seq, batch, dim], seq-first layout
for (auto& item : *moduleLayers)
{
x = item->as<DeOnlyLayer>()->forward(x, src_mask);
//x = item->as<torch::nn::TransformerEncoderLayer>()->forward(x, src_mask, tgt_key_padding_mask);
}
return fc->forward(x); // logits: [seq, batch, vocab]
}
torch::Tensor generate_square_subsequent_mask(int64_t sz)
{
auto mask = torch::triu(torch::ones({ sz, sz }, torch::kFloat32), 1);
mask = mask.masked_fill(mask == 1, -std::numeric_limits<float>::infinity());
return mask;
}
public:
EmbeddingWithPosition m_emb{ nullptr };
torch::nn::ModuleList moduleLayers{ nullptr };
torch::nn::Linear fc{ nullptr };
DeOnlyOptions m_option;
};
TORCH_MODULE(DecodersOnly);
```
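A quick smoke test of the model (a sketch only; the DeOnlyOptions fields are assumed from their use in the constructor above):

```cpp
DeOnlyOptions opt;
opt.dmodel = 128; opt.head = 4; opt.ffn = 512;
opt.layers = 2; opt.vocab_size = 100; opt.max_len = 64;

DecodersOnly model(opt);
auto tokens = torch::randint(0, opt.vocab_size, { 2, 10 }, torch::kLong); // [batch, seq]
auto logits = model->forward(tokens);                                     // [seq, batch, vocab]
```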
- The multi-head attention, the training code, and the triangular mask are the same as in the previous chapter.
3. Inference
Take the poem "春眠不觉晓,处处闻啼鸟。夜来风雨声,花落知多少。" as an example. Once "春眠不觉晓" is fed to the model, generation proceeds token by token, appending each prediction to the input:
1. "<S>春眠不觉晓" -> "春眠不觉晓,"
2. "<S>春眠不觉晓," -> "春眠不觉晓,处"
3. "<S>春眠不觉晓,处" -> "春眠不觉晓,处处"
...
The corresponding code:
```cpp
string predict(string ch, translatDatasetOnly& dataTest)
{
// Greedy decoding demo on the synthetic number data: seed the model with
// <BOS> 5 6 7 and let it continue. (ch and dataTest are unused in this test.)
std::vector<int64_t> tgtpad;
tgtpad.push_back(gBOS);
tgtpad.push_back(5);
tgtpad.push_back(6);
tgtpad.push_back(7);
int start = tgtpad.size() - 1; // index of the last seeded token
cout << "input: ";
for (auto xx : tgtpad)
{
cout << xx<< " , ";
}
cout << endl;
int i = 0;
while (i < 100)
{
torch::Tensor tgt = torch::tensor(tgtpad, torch::kLong).to(gDType); // gDType: global device setting defined elsewhere
auto out = forward(tgt.unsqueeze(0));  // logits: [seq, 1, vocab]
auto next_token = out.argmax(-1).cpu(); // greedy pick at every position
int64_t key = next_token[i + start].item<int64_t>(); // prediction for the current last position
tgtpad.push_back(key);
if (key == gEOS)
{
break;
}
i++;
}
cout << "tgtpad ";
for (auto& key : tgtpad)
{
cout << key << " ";
}
cout << endl << endl<<endl;
return "__TestData__";
踩过一个坑跟大家分享一下,本着最简单的原则 decoder-onl与Encoder-only结构差不多,最要区别在掩码, 用torch::nn::TransformerEncoderLayer作为解码器层再接上全连接层,写完用二条数据测试发现没问题正常,用300一首诗集数据训练实现诗集填写模型,无论怎样模型维度、解码器层数、学习率、批次大小、调整词表大小都无法收敛,为了验证模型架构的正确性,使用一组有规律数据作为数据集
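For reference, a minimal sketch of that first attempt; the built-in layer's forward(src, src_mask, src_key_padding_mask) overload is the one used, with the causal mask passed as src_mask:

```cpp
#include <torch/torch.h>
#include <limits>

int main()
{
    // "Decoder" layer built from the stock encoder layer plus a causal mask.
    torch::nn::TransformerEncoderLayerOptions opt(/*d_model=*/128, /*nhead=*/4);
    opt.dim_feedforward(512);
    torch::nn::TransformerEncoderLayer layer(opt);

    auto x = torch::randn({ 10, 2, 128 }); // [seq, batch, dim], seq-first layout
    auto mask = torch::triu(torch::ones({ 10, 10 }), 1);
    mask = mask.masked_fill(mask == 1, -std::numeric_limits<float>::infinity());

    auto y = layer->forward(x, mask);      // causally-masked self-attention
}
```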
4. Test dataset
The data takes the following form:
- Group 1: the numbers 1 through 50 (50 numbers)
- Group 2: 2 through 50 (49 numbers)
- Group 3: 3 through 50 (48 numbers)
- ...
- Group 40: 40 through 50 (11 numbers)
The data-generation code:
```cpp
vector<vector<int64_t>> MakeTestData(int count)
{
count = min(count, 50);
gVocabCount = 55;
gBOS = 51;
gEOS = 52;
gPad = 53;
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_int_distribution<> dis(0, 49);
vector<vector<int64_t>> data;
for (int i = 0; i < count; i++)
{
vector<int64_t> item;
for (int j = 1; j <= 50 - i; j++) // group i+1 runs from i+1 up to 50
{
int randomNumber = i + j; // deterministic pattern; swap in dis(gen) for random data
item.push_back(randomNumber);
}
data.push_back(item);
}
return data;
}
```
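For example, the first three sequences MakeTestData(3) produces:

```cpp
auto data = MakeTestData(3);
// data[0] = 1 2 3 ... 50  (50 numbers)
// data[1] = 2 3 4 ... 50  (49 numbers)
// data[2] = 3 4 5 ... 50  (48 numbers)
```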
The sequences have different lengths, but every sample within a batch must have the same length, so we override torch::data::samplers::RandomSampler's next function:
```cpp
struct BatchSampler : public torch::data::samplers::RandomSampler
{
BatchSampler(translatDatasetOnly* dataset) : RandomSampler(*dataset->size())
{
m_dataset = dataset;
}
std::optional<std::vector<size_t>> next(size_t batch_size) override
{
auto vlist = RandomSampler::next(batch_size);
if (vlist != std::nullopt)
{
// Record the longest sequence in this batch before get() pads each sample.
m_dataset->UpdateBatchMax(*vlist);
}
return vlist;
}
translatDatasetOnly* m_dataset;
};
```
With the index set for a batch in hand, find the maximum sequence length within that batch:
```cpp
void UpdateBatchMax(std::vector<size_t>& vlist)
{
m_gMaxBatch = 0;
for (size_t i = 0; i < vlist.size(); i++)
{
m_gMaxBatch = max(m_gMaxBatch, m_vTestData.at(vlist.at(i)).size());
}
}
```
All samples in a batch are padded to the same length; padding per batch rather than to a global maximum saves some compute and memory:
```cpp
torch::data::Example<torch::Tensor, torch::Tensor> get(size_t index) override
{
auto item = m_vTestData.at(index);
int len = m_gMaxBatch - item.size();
item.insert(item.begin(), gBOS);
item.push_back(gEOS);
for (int i = 0; i < len; i++)
{
item.push_back(gPad);
}
auto input = torch::tensor(item, torch::kLong);
// Label is the input shifted left by one: drop <BOS>, append one more pad.
item.erase(item.begin());
item.push_back(gPad);
auto label = torch::tensor(item, torch::kLong);
return { input, label };
}
```
The data-loading code:
```cpp
BatchSampler sampler(&dataTrain);
auto datasetTrain = dataTrain.map(torch::data::transforms::Stack<>());
auto train_data_loader = torch::data::make_data_loader(std::move(datasetTrain), std::move(sampler), torch::data::DataLoaderOptions().batch_size(batchsize));
```
Training-time data layout:
Input: `<s> 5 6 ... 50 </s> <p> <p>`
Label: `5 6 ... 50 </s> <p> <p> <p>`
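The training loop itself is the same as in the previous chapter; for completeness, a minimal sketch of one epoch (names such as model, device, and the learning rate are assumptions from the surrounding project):

```cpp
torch::optim::Adam optimizer(model->parameters(), torch::optim::AdamOptions(1e-4));
for (auto& batch : *train_data_loader)
{
    auto input = batch.data.to(device);   // [batch, seq]
    auto label = batch.target.to(device); // [batch, seq]

    auto logits = model->forward(input);  // [seq, batch, vocab]
    auto loss = torch::nn::functional::cross_entropy(
        logits.permute({ 1, 2, 0 }),      // [batch, vocab, seq]
        label,
        torch::nn::functional::CrossEntropyFuncOptions().ignore_index(gPad));

    optimizer.zero_grad();
    loss.backward();
    optimizer.step();
}
```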
5. Training and inference
The 40 groups of data were used to verify that the framework is correct; it runs as expected.

The model is configured with few parameters to keep the computation small. For real data volumes the CPU is far too slow, so a GPU is needed to speed up training, which requires supported hardware and a properly installed environment.
CUDA Setup and Usage
1. Check hardware support
CUDA support depends on whether the GPU is an NVIDIA card built on a CUDA-capable architecture.
Check from the command line with: nvidia-smi

2. Choose a CUDA version to download
Based on the nvidia-smi output above, pick the matching version; here, CUDA 12.8. Download and install it from https://developer.nvidia.com/cuda-toolkit-archive


3. Choose a libtorch CUDA build to download
Download from the PyTorch "Get Started" page.

If CUDA 12.8 is not listed there, it is because the newest release hides some older builds. You can assemble the URL for any version yourself from this template:
```
cuda-debug:
https://download.pytorch.org/libtorch/cuXXX/libtorch-win-shared-with-deps-debug-VERSION%2BcuXXX.zip
cuXXX: cu126, cu128
VERSION: 2.10.0 2.11.0
Example: cu128 + 2.11.0 ->
https://download.pytorch.org/libtorch/cu128/libtorch-win-shared-with-deps-debug-2.11.0%2Bcu128.zip
```
4. Project setup
A VS2022 + CMake project; the CMakeLists.txt:
```cmake
# CMakeList.txt: CMake project for GpuMakeProject. Add sources and
# project-specific logic here.
#
cmake_minimum_required (VERSION 3.8)
# Enable Hot Reload for MSVC compilers if supported.
if (POLICY CMP0141)
cmake_policy(SET CMP0141 NEW)
set(CMAKE_MSVC_DEBUG_INFORMATION_FORMAT "$<IF:$<AND:$<C_COMPILER_ID:MSVC>,$<CXX_COMPILER_ID:MSVC>>,$<$<CONFIG:Debug,RelWithDebInfo>:EditAndContinue>,$<$<CONFIG:Debug,RelWithDebInfo>:ProgramDatabase>>")
endif()
project ("GpuMakeProject")
set(targetExeName "MakeProject")
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
set(torchpath "D:/libtorch_gpu2.11.0/debug")
else()
set(torchpath "D:/libtorch_gpu2.11.0/release") # adjust to your local libtorch install paths
endif()
find_package(Torch REQUIRED PATHS "${torchpath}/share/cmake/Torch/")
include_directories(${TORCH_INCLUDE_DIRS})
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_DEBUG ${CMAKE_SOURCE_DIR}/../Debug/)
else()
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_SOURCE_DIR}/../bin2/)
endif()
file(GLOB SRC_FILES ${PROJECT_SOURCE_DIR}/*.cpp)
add_executable (${targetExeName} ${SRC_FILES})
target_link_libraries(${targetExeName} "${TORCH_LIBRARIES}")
file(GLOB TORCH_DLLS "${TORCH_INSTALL_PREFIX}/lib/*.dll"
)
add_custom_command(TARGET ${targetExeName}
POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy_if_different
${TORCH_DLLS}
$<TARGET_FILE_DIR:${targetExeName}>)
# Add the sources to this project's executable.
##add_executable (GpuMakeProject "GpuMakeProject.cpp" "GpuMakeProject.h")
if (CMAKE_VERSION VERSION_GREATER 3.12)
set_property(TARGET ${targetExeName} PROPERTY CXX_STANDARD 20)
endif()
# TODO: add tests and install targets if needed.
```
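Once the project builds against the CUDA libtorch, a quick runtime check confirms the GPU is visible:

```cpp
#include <torch/torch.h>
#include <iostream>

int main()
{
    std::cout << "CUDA available: " << torch::cuda::is_available() << std::endl;
    std::cout << "Device count:   " << torch::cuda::device_count() << std::endl;

    // Pick the GPU when present, otherwise fall back to CPU.
    torch::Device device = torch::cuda::is_available() ? torch::kCUDA : torch::kCPU;
    auto t = torch::randn({ 2, 2 }, device);
    std::cout << t << std::endl;
}
```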

That completes the decoder-only architecture. In the next chapter we will build a poem-completion example on top of it.
Thanks for your support, everyone.