llama.cpp LLM_ARCH_DEEPSEEK and LLM_ARCH_DEEPSEEK2

llama.cpp LLM_ARCH_DEEPSEEK and LLM_ARCH_DEEPSEEK2

  • [1. `LLM_ARCH_DEEPSEEK` and `LLM_ARCH_DEEPSEEK2`](#1-llm_arch_deepseek-and-llm_arch_deepseek2)
  • [2. `LLM_ARCH_DEEPSEEK` and `LLM_ARCH_DEEPSEEK2`](#2-llm_arch_deepseek-and-llm_arch_deepseek2)
  • [3. `struct ggml_cgraph * build_deepseek()` and `struct ggml_cgraph * build_deepseek2()`](#3-struct-ggml_cgraph--build_deepseek-and-struct-ggml_cgraph--build_deepseek2)
  • References

不宜吹捧中国大语言模型的同时,又去贬低美国大语言模型。

水是人体的主要化学成分,约占体重的 50% 至 70%。大语言模型的含水量也不会太少。

llama.cpp
https://github.com/ggerganov/llama.cpp

1. LLM_ARCH_DEEPSEEK and LLM_ARCH_DEEPSEEK2

/home/yongqiang/llm_work/llama_cpp_25_01_05/llama.cpp/src/llama-arch.h
/home/yongqiang/llm_work/llama_cpp_25_01_05/llama.cpp/src/llama-arch.cpp

  • LLM_ARCH_DEEPSEEK and LLM_ARCH_DEEPSEEK2

    //
    // gguf constants (sync with gguf.py)
    //

    enum llm_arch {
    LLM_ARCH_LLAMA,
    LLM_ARCH_DECI,
    LLM_ARCH_FALCON,
    LLM_ARCH_BAICHUAN,
    LLM_ARCH_GROK,
    LLM_ARCH_GPT2,
    LLM_ARCH_GPTJ,
    LLM_ARCH_GPTNEOX,
    LLM_ARCH_MPT,
    LLM_ARCH_STARCODER,
    LLM_ARCH_REFACT,
    LLM_ARCH_BERT,
    LLM_ARCH_NOMIC_BERT,
    LLM_ARCH_JINA_BERT_V2,
    LLM_ARCH_BLOOM,
    LLM_ARCH_STABLELM,
    LLM_ARCH_QWEN,
    LLM_ARCH_QWEN2,
    LLM_ARCH_QWEN2MOE,
    LLM_ARCH_QWEN2VL,
    LLM_ARCH_PHI2,
    LLM_ARCH_PHI3,
    LLM_ARCH_PHIMOE,
    LLM_ARCH_PLAMO,
    LLM_ARCH_CODESHELL,
    LLM_ARCH_ORION,
    LLM_ARCH_INTERNLM2,
    LLM_ARCH_MINICPM,
    LLM_ARCH_MINICPM3,
    LLM_ARCH_GEMMA,
    LLM_ARCH_GEMMA2,
    LLM_ARCH_STARCODER2,
    LLM_ARCH_MAMBA,
    LLM_ARCH_XVERSE,
    LLM_ARCH_COMMAND_R,
    LLM_ARCH_COHERE2,
    LLM_ARCH_DBRX,
    LLM_ARCH_OLMO,
    LLM_ARCH_OLMO2,
    LLM_ARCH_OLMOE,
    LLM_ARCH_OPENELM,
    LLM_ARCH_ARCTIC,
    LLM_ARCH_DEEPSEEK,
    LLM_ARCH_DEEPSEEK2,
    LLM_ARCH_CHATGLM,
    LLM_ARCH_BITNET,
    LLM_ARCH_T5,
    LLM_ARCH_T5ENCODER,
    LLM_ARCH_JAIS,
    LLM_ARCH_NEMOTRON,
    LLM_ARCH_EXAONE,
    LLM_ARCH_RWKV6,
    LLM_ARCH_RWKV6QWEN2,
    LLM_ARCH_GRANITE,
    LLM_ARCH_GRANITE_MOE,
    LLM_ARCH_CHAMELEON,
    LLM_ARCH_WAVTOKENIZER_DEC,
    LLM_ARCH_UNKNOWN,
    };

  • { LLM_ARCH_DEEPSEEK, "deepseek" } and { LLM_ARCH_DEEPSEEK2, "deepseek2" }

    static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_LLAMA, "llama" },
    { LLM_ARCH_DECI, "deci" },
    { LLM_ARCH_FALCON, "falcon" },
    { LLM_ARCH_GROK, "grok" },
    { LLM_ARCH_GPT2, "gpt2" },
    { LLM_ARCH_GPTJ, "gptj" },
    { LLM_ARCH_GPTNEOX, "gptneox" },
    { LLM_ARCH_MPT, "mpt" },
    { LLM_ARCH_BAICHUAN, "baichuan" },
    { LLM_ARCH_STARCODER, "starcoder" },
    { LLM_ARCH_REFACT, "refact" },
    { LLM_ARCH_BERT, "bert" },
    { LLM_ARCH_NOMIC_BERT, "nomic-bert" },
    { LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" },
    { LLM_ARCH_BLOOM, "bloom" },
    { LLM_ARCH_STABLELM, "stablelm" },
    { LLM_ARCH_QWEN, "qwen" },
    { LLM_ARCH_QWEN2, "qwen2" },
    { LLM_ARCH_QWEN2MOE, "qwen2moe" },
    { LLM_ARCH_QWEN2VL, "qwen2vl" },
    { LLM_ARCH_PHI2, "phi2" },
    { LLM_ARCH_PHI3, "phi3" },
    { LLM_ARCH_PHIMOE, "phimoe" },
    { LLM_ARCH_PLAMO, "plamo" },
    { LLM_ARCH_CODESHELL, "codeshell" },
    { LLM_ARCH_ORION, "orion" },
    { LLM_ARCH_INTERNLM2, "internlm2" },
    { LLM_ARCH_MINICPM, "minicpm" },
    { LLM_ARCH_MINICPM3, "minicpm3" },
    { LLM_ARCH_GEMMA, "gemma" },
    { LLM_ARCH_GEMMA2, "gemma2" },
    { LLM_ARCH_STARCODER2, "starcoder2" },
    { LLM_ARCH_MAMBA, "mamba" },
    { LLM_ARCH_XVERSE, "xverse" },
    { LLM_ARCH_COMMAND_R, "command-r" },
    { LLM_ARCH_COHERE2, "cohere2" },
    { LLM_ARCH_DBRX, "dbrx" },
    { LLM_ARCH_OLMO, "olmo" },
    { LLM_ARCH_OLMO2, "olmo2" },
    { LLM_ARCH_OLMOE, "olmoe" },
    { LLM_ARCH_OPENELM, "openelm" },
    { LLM_ARCH_ARCTIC, "arctic" },
    { LLM_ARCH_DEEPSEEK, "deepseek" },
    { LLM_ARCH_DEEPSEEK2, "deepseek2" },
    { LLM_ARCH_CHATGLM, "chatglm" },
    { LLM_ARCH_BITNET, "bitnet" },
    { LLM_ARCH_T5, "t5" },
    { LLM_ARCH_T5ENCODER, "t5encoder" },
    { LLM_ARCH_JAIS, "jais" },
    { LLM_ARCH_NEMOTRON, "nemotron" },
    { LLM_ARCH_EXAONE, "exaone" },
    { LLM_ARCH_RWKV6, "rwkv6" },
    { LLM_ARCH_RWKV6QWEN2, "rwkv6qwen2" },
    { LLM_ARCH_GRANITE, "granite" },
    { LLM_ARCH_GRANITE_MOE, "granitemoe" },
    { LLM_ARCH_CHAMELEON, "chameleon" },
    { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
    { LLM_ARCH_UNKNOWN, "(unknown)" },
    };

2. LLM_ARCH_DEEPSEEK and LLM_ARCH_DEEPSEEK2

/home/yongqiang/llm_work/llama_cpp_25_01_05/llama.cpp/src/llama-arch.cpp

  • LLM_ARCH_DEEPSEEK and LLM_ARCH_DEEPSEEK2

    static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
    {
    LLM_ARCH_LLAMA,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
    { LLM_TENSOR_OUTPUT, "output" },
    { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
    { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
    { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
    { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
    { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
    { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
    { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
    { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
    { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
    { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
    { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
    { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
    { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
    { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
    { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
    { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
    { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
    { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
    },
    },
    {
    LLM_ARCH_DECI,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
    { LLM_TENSOR_OUTPUT, "output" },
    { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
    { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
    { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
    { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
    { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
    { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
    { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
    { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
    { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
    { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
    { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
    { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
    { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
    { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
    { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
    { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
    { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
    { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
    },
    },
    {
    LLM_ARCH_BAICHUAN,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
    { LLM_TENSOR_OUTPUT, "output" },
    { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
    { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
    { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
    { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
    { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
    { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
    { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
    { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
    { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
    { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
    { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
    },
    },
    {
    LLM_ARCH_FALCON,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
    { LLM_TENSOR_OUTPUT, "output" },
    { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
    { LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" },
    { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
    { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
    { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
    { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
    },
    },
    {
    LLM_ARCH_GROK,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
    { LLM_TENSOR_OUTPUT, "output" },
    { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
    { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
    { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
    { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
    { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
    { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
    { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
    { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
    { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
    { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
    { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
    { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
    { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
    { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
    { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
    { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
    { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
    },
    },
    {
    LLM_ARCH_GPT2,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_POS_EMBD, "position_embd" },
    { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
    { LLM_TENSOR_OUTPUT, "output" },
    { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
    { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
    { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
    { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
    { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
    { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
    },
    },
    {
    LLM_ARCH_GPTJ,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    },
    },
    {
    LLM_ARCH_GPTNEOX,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
    { LLM_TENSOR_OUTPUT, "output" },
    { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
    { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
    { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
    { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
    { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
    { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
    },
    },
    {
    LLM_ARCH_MPT,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
    { LLM_TENSOR_OUTPUT, "output"},
    { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
    { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
    { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
    { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
    { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
    { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
    { LLM_TENSOR_FFN_ACT, "blk.%d.ffn.act" },
    { LLM_TENSOR_POS_EMBD, "position_embd" },
    { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm"},
    { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm"},
    },
    },
    {
    LLM_ARCH_STARCODER,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_POS_EMBD, "position_embd" },
    { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
    { LLM_TENSOR_OUTPUT, "output" },
    { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
    { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
    { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
    { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
    { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
    { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
    },
    },
    {
    LLM_ARCH_REFACT,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
    { LLM_TENSOR_OUTPUT, "output" },
    { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
    { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
    { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
    { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
    { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
    { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
    { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
    { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
    { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
    },
    },
    {
    LLM_ARCH_BERT,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
    { LLM_TENSOR_TOKEN_TYPES, "token_types" },
    { LLM_TENSOR_POS_EMBD, "position_embd" },
    { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
    { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
    { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
    { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
    { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
    { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
    { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
    { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
    { LLM_TENSOR_CLS, "cls" },
    { LLM_TENSOR_CLS_OUT, "cls.output" },
    },
    },
    {
    LLM_ARCH_NOMIC_BERT,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
    { LLM_TENSOR_TOKEN_TYPES, "token_types" },
    { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
    { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
    { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
    { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
    { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
    { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
    { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
    },
    },
    {
    LLM_ARCH_JINA_BERT_V2,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
    { LLM_TENSOR_TOKEN_TYPES, "token_types" },
    { LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" },
    { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
    { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
    { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
    { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
    { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
    { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
    { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
    { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" },
    { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
    { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
    { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
    { LLM_TENSOR_CLS, "cls" },
    },
    },
    {
    LLM_ARCH_BLOOM,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
    { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
    { LLM_TENSOR_OUTPUT, "output" },
    { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
    { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
    { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
    { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
    { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
    { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
    },
    },
    {
    LLM_ARCH_STABLELM,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
    { LLM_TENSOR_OUTPUT, "output" },
    { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
    { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
    { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
    { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
    { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
    { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
    { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
    { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
    { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
    { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
    { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
    { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
    },
    },
    {
    LLM_ARCH_QWEN,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
    { LLM_TENSOR_OUTPUT, "output" },
    { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
    { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
    { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
    { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
    { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
    { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
    { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
    { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
    },
    },
    {
    LLM_ARCH_QWEN2,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
    { LLM_TENSOR_OUTPUT, "output" },
    { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
    { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
    { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
    { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
    { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
    { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
    { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
    { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
    { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
    },
    },
    {
    LLM_ARCH_QWEN2VL,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
    { LLM_TENSOR_OUTPUT, "output" },
    { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
    { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
    { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
    { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
    { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
    { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
    { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
    { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
    { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
    },
    },
    {
    LLM_ARCH_QWEN2MOE,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
    { LLM_TENSOR_OUTPUT, "output" },
    { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
    { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
    { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
    { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
    { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
    { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
    { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
    { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
    { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
    { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
    { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
    { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
    { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
    { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
    },
    },
    {
    LLM_ARCH_PHI2,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
    { LLM_TENSOR_OUTPUT, "output" },
    { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
    { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
    { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
    { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
    { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
    { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
    { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
    { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
    },
    },
    {
    LLM_ARCH_PHI3,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
    { LLM_TENSOR_OUTPUT, "output" },
    { LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" },
    { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" },
    { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
    { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
    { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
    { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
    { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
    { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
    { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
    { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
    { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
    },
    },
    {
    LLM_ARCH_PHIMOE,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
    { LLM_TENSOR_OUTPUT, "output" },
    { LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" },
    { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" },
    { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
    { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
    { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
    { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
    { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
    { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
    { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
    { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
    { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
    { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
    { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
    },
    },
    {
    LLM_ARCH_PLAMO,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
    { LLM_TENSOR_OUTPUT, "output" },
    { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
    { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
    { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
    { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
    { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
    { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
    { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
    { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
    { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
    { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
    },
    },
    {
    LLM_ARCH_CODESHELL,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
    { LLM_TENSOR_OUTPUT, "output" },
    { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
    { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
    { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
    { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
    { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
    { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
    { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
    { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
    { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
    { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
    { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
    { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
    },
    },
    {
    LLM_ARCH_ORION,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
    { LLM_TENSOR_OUTPUT, "output" },
    { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
    { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
    { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
    { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
    { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
    { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
    { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
    { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
    { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
    { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
    { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
    },
    },
    {
    LLM_ARCH_INTERNLM2,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
    { LLM_TENSOR_OUTPUT, "output" },
    { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
    { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
    { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
    { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
    { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
    { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
    { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
    { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
    { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
    },
    },
    {
    LLM_ARCH_MINICPM,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
    { LLM_TENSOR_OUTPUT, "output" },
    { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
    { LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" },
    { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" },
    { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
    { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
    { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
    { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
    { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
    { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
    { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
    { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
    { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
    { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
    { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
    { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" },
    { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" },
    { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" },
    },
    },
    {
    LLM_ARCH_MINICPM3,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
    { LLM_TENSOR_OUTPUT, "output" },
    { LLM_TENSOR_ROPE_FACTORS_LONG, "rope_factors_long" },
    { LLM_TENSOR_ROPE_FACTORS_SHORT, "rope_factors_short" },
    { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
    { LLM_TENSOR_ATTN_Q_A_NORM, "blk.%d.attn_q_a_norm" },
    { LLM_TENSOR_ATTN_KV_A_NORM, "blk.%d.attn_kv_a_norm" },
    { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
    { LLM_TENSOR_ATTN_Q_A, "blk.%d.attn_q_a" },
    { LLM_TENSOR_ATTN_Q_B, "blk.%d.attn_q_b" },
    { LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" },
    { LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" },
    { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
    { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
    { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
    { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
    { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
    },
    },
    {
    LLM_ARCH_GEMMA,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
    { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
    { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
    { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
    { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
    { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
    { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
    { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
    { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
    { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
    },
    },
    {
    LLM_ARCH_GEMMA2,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
    { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
    { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
    { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
    { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
    { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
    { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
    { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
    { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
    { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
    { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
    { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
    },
    },
    {
    LLM_ARCH_STARCODER2,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
    { LLM_TENSOR_OUTPUT, "output" },
    { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
    { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
    { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
    { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
    { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
    { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
    { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
    { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
    { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
    { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
    },
    },
    {
    LLM_ARCH_MAMBA,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
    { LLM_TENSOR_OUTPUT, "output" },
    { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
    { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" },
    { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" },
    { LLM_TENSOR_SSM_X, "blk.%d.ssm_x" },
    { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" },
    { LLM_TENSOR_SSM_A, "blk.%d.ssm_a" },
    { LLM_TENSOR_SSM_D, "blk.%d.ssm_d" },
    { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
    },
    },
    {
    LLM_ARCH_XVERSE,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
    { LLM_TENSOR_OUTPUT, "output" },
    { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
    { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
    { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
    { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
    { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
    { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
    { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
    { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
    { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
    { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
    { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
    },
    },
    {
    LLM_ARCH_COMMAND_R,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
    { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
    { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
    { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
    { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
    { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
    { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
    { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
    { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
    { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
    { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
    },
    },
    {
    LLM_ARCH_COHERE2,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
    { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
    { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
    { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
    { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
    { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
    { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
    { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
    { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
    },
    },
    {
    LLM_ARCH_DBRX,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
    { LLM_TENSOR_OUTPUT, "output" },
    { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
    { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
    { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
    { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" },
    { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
    { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
    { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
    { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
    },
    },
    {
    LLM_ARCH_OLMO,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_OUTPUT, "output" },
    { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
    { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
    { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
    { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
    { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
    { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
    { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
    },
    },
    {
    LLM_ARCH_OLMO2,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
    { LLM_TENSOR_OUTPUT, "output" },
    { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
    { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
    { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
    { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
    { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
    { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
    { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
    { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
    { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
    { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
    { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
    },
    },
    {
    LLM_ARCH_OLMOE,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
    { LLM_TENSOR_OUTPUT, "output" },
    { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
    { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
    { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
    { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
    { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
    { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
    { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
    { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
    { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
    { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
    { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
    { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
    },
    },
    {
    LLM_ARCH_OPENELM,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
    { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
    { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
    { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
    { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
    { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
    { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
    { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
    { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
    { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
    },
    },
    {
    LLM_ARCH_ARCTIC,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
    { LLM_TENSOR_OUTPUT, "output" },
    { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
    { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
    { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
    { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
    { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
    { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
    { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
    { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
    { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
    { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
    { LLM_TENSOR_FFN_NORM_EXPS, "blk.%d.ffn_norm_exps" },
    { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
    { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
    { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
    },
    },
    {
    LLM_ARCH_DEEPSEEK,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
    { LLM_TENSOR_OUTPUT, "output" },
    { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
    { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
    { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
    { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
    { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
    { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
    { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
    { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
    { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
    { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
    { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
    { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
    { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
    { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
    { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
    { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
    { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
    { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
    { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
    },
    },
    {
    LLM_ARCH_DEEPSEEK2,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
    { LLM_TENSOR_OUTPUT, "output" },
    { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
    { LLM_TENSOR_ATTN_Q_A_NORM, "blk.%d.attn_q_a_norm" },
    { LLM_TENSOR_ATTN_KV_A_NORM, "blk.%d.attn_kv_a_norm" },
    { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
    { LLM_TENSOR_ATTN_Q_A, "blk.%d.attn_q_a" },
    { LLM_TENSOR_ATTN_Q_B, "blk.%d.attn_q_b" },
    { LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" },
    { LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" },
    { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
    { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
    { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
    { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
    { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
    { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
    { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
    { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
    { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
    { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
    { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
    { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
    { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
    { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
    },
    },
    {
    LLM_ARCH_CHATGLM,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
    { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
    { LLM_TENSOR_OUTPUT, "output" },
    { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
    { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
    { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
    { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
    { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
    { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
    },
    },
    {
    LLM_ARCH_BITNET,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
    { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
    { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
    { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
    { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
    { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
    { LLM_TENSOR_ATTN_SUB_NORM, "blk.%d.attn_sub_norm" },
    { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
    { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
    { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
    { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
    { LLM_TENSOR_FFN_SUB_NORM, "blk.%d.ffn_sub_norm" },
    },
    },
    {
    LLM_ARCH_T5,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_OUTPUT, "output" },
    { LLM_TENSOR_DEC_OUTPUT_NORM, "dec.output_norm" },
    { LLM_TENSOR_DEC_ATTN_NORM, "dec.blk.%d.attn_norm" },
    { LLM_TENSOR_DEC_ATTN_Q, "dec.blk.%d.attn_q" },
    { LLM_TENSOR_DEC_ATTN_K, "dec.blk.%d.attn_k" },
    { LLM_TENSOR_DEC_ATTN_V, "dec.blk.%d.attn_v" },
    { LLM_TENSOR_DEC_ATTN_OUT, "dec.blk.%d.attn_o" },
    { LLM_TENSOR_DEC_ATTN_REL_B, "dec.blk.%d.attn_rel_b" },
    { LLM_TENSOR_DEC_CROSS_ATTN_NORM, "dec.blk.%d.cross_attn_norm" },
    { LLM_TENSOR_DEC_CROSS_ATTN_Q, "dec.blk.%d.cross_attn_q" },
    { LLM_TENSOR_DEC_CROSS_ATTN_K, "dec.blk.%d.cross_attn_k" },
    { LLM_TENSOR_DEC_CROSS_ATTN_V, "dec.blk.%d.cross_attn_v" },
    { LLM_TENSOR_DEC_CROSS_ATTN_OUT, "dec.blk.%d.cross_attn_o" },
    { LLM_TENSOR_DEC_CROSS_ATTN_REL_B, "dec.blk.%d.cross_attn_rel_b" },
    { LLM_TENSOR_DEC_FFN_NORM, "dec.blk.%d.ffn_norm" },
    { LLM_TENSOR_DEC_FFN_GATE, "dec.blk.%d.ffn_gate" },
    { LLM_TENSOR_DEC_FFN_DOWN, "dec.blk.%d.ffn_down" },
    { LLM_TENSOR_DEC_FFN_UP, "dec.blk.%d.ffn_up" },
    { LLM_TENSOR_ENC_OUTPUT_NORM, "enc.output_norm" },
    { LLM_TENSOR_ENC_ATTN_NORM, "enc.blk.%d.attn_norm" },
    { LLM_TENSOR_ENC_ATTN_Q, "enc.blk.%d.attn_q" },
    { LLM_TENSOR_ENC_ATTN_K, "enc.blk.%d.attn_k" },
    { LLM_TENSOR_ENC_ATTN_V, "enc.blk.%d.attn_v" },
    { LLM_TENSOR_ENC_ATTN_OUT, "enc.blk.%d.attn_o" },
    { LLM_TENSOR_ENC_ATTN_REL_B, "enc.blk.%d.attn_rel_b" },
    { LLM_TENSOR_ENC_FFN_NORM, "enc.blk.%d.ffn_norm" },
    { LLM_TENSOR_ENC_FFN_GATE, "enc.blk.%d.ffn_gate" },
    { LLM_TENSOR_ENC_FFN_DOWN, "enc.blk.%d.ffn_down" },
    { LLM_TENSOR_ENC_FFN_UP, "enc.blk.%d.ffn_up" },
    },
    },
    {
    LLM_ARCH_T5ENCODER,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_OUTPUT, "output" },
    { LLM_TENSOR_ENC_OUTPUT_NORM, "enc.output_norm" },
    { LLM_TENSOR_ENC_ATTN_NORM, "enc.blk.%d.attn_norm" },
    { LLM_TENSOR_ENC_ATTN_Q, "enc.blk.%d.attn_q" },
    { LLM_TENSOR_ENC_ATTN_K, "enc.blk.%d.attn_k" },
    { LLM_TENSOR_ENC_ATTN_V, "enc.blk.%d.attn_v" },
    { LLM_TENSOR_ENC_ATTN_OUT, "enc.blk.%d.attn_o" },
    { LLM_TENSOR_ENC_ATTN_REL_B, "enc.blk.%d.attn_rel_b" },
    { LLM_TENSOR_ENC_FFN_NORM, "enc.blk.%d.ffn_norm" },
    { LLM_TENSOR_ENC_FFN_GATE, "enc.blk.%d.ffn_gate" },
    { LLM_TENSOR_ENC_FFN_DOWN, "enc.blk.%d.ffn_down" },
    { LLM_TENSOR_ENC_FFN_UP, "enc.blk.%d.ffn_up" },
    },
    },
    {
    LLM_ARCH_JAIS,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
    { LLM_TENSOR_OUTPUT, "output" },
    { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
    { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
    { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
    { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
    { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
    { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
    { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
    },
    },
    {
    LLM_ARCH_NEMOTRON,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
    { LLM_TENSOR_OUTPUT, "output" },
    { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
    { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
    { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
    { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
    { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
    { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
    { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
    { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
    { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
    { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
    },
    },
    {
    LLM_ARCH_EXAONE,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
    { LLM_TENSOR_OUTPUT, "output" },
    { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
    { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
    { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
    { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
    { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
    { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
    { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
    { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
    { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
    { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
    { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
    },
    },
    {
    LLM_ARCH_RWKV6,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
    { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
    { LLM_TENSOR_OUTPUT, "output" },
    { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
    { LLM_TENSOR_ATTN_NORM_2, "blk.%d.attn_norm_2" },
    { LLM_TENSOR_TIME_MIX_W1, "blk.%d.time_mix_w1" },
    { LLM_TENSOR_TIME_MIX_W2, "blk.%d.time_mix_w2" },
    { LLM_TENSOR_TIME_MIX_LERP_X, "blk.%d.time_mix_lerp_x" },
    { LLM_TENSOR_TIME_MIX_LERP_W, "blk.%d.time_mix_lerp_w" },
    { LLM_TENSOR_TIME_MIX_LERP_K, "blk.%d.time_mix_lerp_k" },
    { LLM_TENSOR_TIME_MIX_LERP_V, "blk.%d.time_mix_lerp_v" },
    { LLM_TENSOR_TIME_MIX_LERP_R, "blk.%d.time_mix_lerp_r" },
    { LLM_TENSOR_TIME_MIX_LERP_G, "blk.%d.time_mix_lerp_g" },
    { LLM_TENSOR_TIME_MIX_LERP_FUSED, "blk.%d.time_mix_lerp_fused" },
    { LLM_TENSOR_TIME_MIX_FIRST, "blk.%d.time_mix_first" },
    { LLM_TENSOR_TIME_MIX_DECAY, "blk.%d.time_mix_decay" },
    { LLM_TENSOR_TIME_MIX_DECAY_W1, "blk.%d.time_mix_decay_w1" },
    { LLM_TENSOR_TIME_MIX_DECAY_W2, "blk.%d.time_mix_decay_w2" },
    { LLM_TENSOR_TIME_MIX_KEY, "blk.%d.time_mix_key" },
    { LLM_TENSOR_TIME_MIX_VALUE, "blk.%d.time_mix_value" },
    { LLM_TENSOR_TIME_MIX_RECEPTANCE, "blk.%d.time_mix_receptance" },
    { LLM_TENSOR_TIME_MIX_GATE, "blk.%d.time_mix_gate" },
    { LLM_TENSOR_TIME_MIX_LN, "blk.%d.time_mix_ln" },
    { LLM_TENSOR_TIME_MIX_OUTPUT, "blk.%d.time_mix_output" },
    { LLM_TENSOR_CHANNEL_MIX_LERP_K, "blk.%d.channel_mix_lerp_k" },
    { LLM_TENSOR_CHANNEL_MIX_LERP_R, "blk.%d.channel_mix_lerp_r" },
    { LLM_TENSOR_CHANNEL_MIX_KEY, "blk.%d.channel_mix_key" },
    { LLM_TENSOR_CHANNEL_MIX_VALUE, "blk.%d.channel_mix_value" },
    { LLM_TENSOR_CHANNEL_MIX_RECEPTANCE, "blk.%d.channel_mix_receptance" },
    },
    },
    {
    LLM_ARCH_RWKV6QWEN2,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
    { LLM_TENSOR_OUTPUT, "output" },
    { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
    { LLM_TENSOR_TIME_MIX_W1, "blk.%d.time_mix_w1" },
    { LLM_TENSOR_TIME_MIX_W2, "blk.%d.time_mix_w2" },
    { LLM_TENSOR_TIME_MIX_LERP_X, "blk.%d.time_mix_lerp_x" },
    { LLM_TENSOR_TIME_MIX_LERP_FUSED, "blk.%d.time_mix_lerp_fused" },
    { LLM_TENSOR_TIME_MIX_FIRST, "blk.%d.time_mix_first" },
    { LLM_TENSOR_TIME_MIX_DECAY, "blk.%d.time_mix_decay" },
    { LLM_TENSOR_TIME_MIX_DECAY_W1, "blk.%d.time_mix_decay_w1" },
    { LLM_TENSOR_TIME_MIX_DECAY_W2, "blk.%d.time_mix_decay_w2" },
    { LLM_TENSOR_TIME_MIX_KEY, "blk.%d.time_mix_key" },
    { LLM_TENSOR_TIME_MIX_VALUE, "blk.%d.time_mix_value" },
    { LLM_TENSOR_TIME_MIX_RECEPTANCE, "blk.%d.time_mix_receptance" },
    { LLM_TENSOR_TIME_MIX_GATE, "blk.%d.time_mix_gate" },
    { LLM_TENSOR_TIME_MIX_OUTPUT, "blk.%d.time_mix_output" },
    { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
    { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
    { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
    { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
    },
    },
    {
    LLM_ARCH_GRANITE,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
    { LLM_TENSOR_OUTPUT, "output" },
    { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
    { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
    { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
    { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
    { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
    { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
    { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
    { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
    { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
    },
    },
    {
    LLM_ARCH_GRANITE_MOE,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
    { LLM_TENSOR_OUTPUT, "output" },
    { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
    { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
    { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
    { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
    { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
    { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
    { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
    { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
    { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
    { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
    },
    },
    {
    LLM_ARCH_CHAMELEON,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
    { LLM_TENSOR_OUTPUT, "output" },
    { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
    { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
    { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
    { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
    { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
    { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
    { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
    { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
    { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
    { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
    { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
    },
    },
    {
    LLM_ARCH_WAVTOKENIZER_DEC,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
    { LLM_TENSOR_CONV1D, "conv1d" },
    { LLM_TENSOR_CONVNEXT_DW, "convnext.%d.dw" },
    { LLM_TENSOR_CONVNEXT_NORM, "convnext.%d.norm" },
    { LLM_TENSOR_CONVNEXT_PW1, "convnext.%d.pw1" },
    { LLM_TENSOR_CONVNEXT_PW2, "convnext.%d.pw2" },
    { LLM_TENSOR_CONVNEXT_GAMMA, "convnext.%d.gamma" },
    { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
    { LLM_TENSOR_OUTPUT, "output" },
    { LLM_TENSOR_POS_NET_CONV1, "posnet.%d.conv1" },
    { LLM_TENSOR_POS_NET_CONV2, "posnet.%d.conv2" },
    { LLM_TENSOR_POS_NET_NORM, "posnet.%d.norm" },
    { LLM_TENSOR_POS_NET_NORM1, "posnet.%d.norm1" },
    { LLM_TENSOR_POS_NET_NORM2, "posnet.%d.norm2" },
    { LLM_TENSOR_POS_NET_ATTN_NORM, "posnet.%d.attn_norm" },
    { LLM_TENSOR_POS_NET_ATTN_Q, "posnet.%d.attn_q" },
    { LLM_TENSOR_POS_NET_ATTN_K, "posnet.%d.attn_k" },
    { LLM_TENSOR_POS_NET_ATTN_V, "posnet.%d.attn_v" },
    { LLM_TENSOR_POS_NET_ATTN_OUT, "posnet.%d.attn_output" },
    },
    },
    {
    LLM_ARCH_UNKNOWN,
    {
    { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
    },
    },
    };

3. struct ggml_cgraph * build_deepseek() and struct ggml_cgraph * build_deepseek2()

/home/yongqiang/llm_work/llama_cpp_25_01_05/llama.cpp/src/llama.cpp

  • struct ggml_cgraph * build_deepseek()

      // Build the ggml compute graph for one forward pass of a DeepSeek (V1) model:
      // standard multi-head attention with RoPE, a dense FFN for the first
      // hparams.n_layer_dense_lead layers, and a MoE FFN plus a shared expert for
      // the remaining layers. Returns the graph; the caller schedules/executes it.
      struct ggml_cgraph * build_deepseek() {
          struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);

          // mutable variable, needed during the last layer of the computation to skip unused tokens
          int32_t n_tokens = this->n_tokens;

          // DeepSeek (V1) uses the same head size for Q, K and V, and rotates the full head
          const int64_t n_embd_head = hparams.n_embd_head_v;
          GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
          GGML_ASSERT(n_embd_head == hparams.n_rot);

          struct ggml_tensor * cur;
          struct ggml_tensor * inpL;

          // token embeddings: {n_embd, n_tokens}
          inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);

          // inp_pos - contains the positions
          struct ggml_tensor * inp_pos = build_inp_pos();

          // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
          struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
          // fall back to the usual 1/sqrt(head_dim) when no explicit attention scale is configured
          const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
          for (int il = 0; il < n_layer; ++il) {
              // residual input of this layer (pre-norm architecture)
              struct ggml_tensor * inpSA = inpL;

              // norm
              cur = llm_build_norm(ctx0, inpL, hparams,
                      model.layers[il].attn_norm, NULL,
                      LLM_NORM_RMS, cb, il);
              cb(cur, "attn_norm", il);

              // self-attention
              {
                  // rope freq factors for llama3; may return nullptr for llama2 and other models
                  struct ggml_tensor * rope_factors = build_rope_factors(il);

                  // compute Q and K and RoPE them
                  // llm_build_lora_mm applies any active LoRA adapters on top of the base weight
                  struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
                  cb(Qcur, "Qcur", il);
                  if (model.layers[il].bq) {
                      Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
                      cb(Qcur, "Qcur", il);
                  }

                  struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
                  cb(Kcur, "Kcur", il);
                  if (model.layers[il].bk) {
                      Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
                      cb(Kcur, "Kcur", il);
                  }

                  struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
                  cb(Vcur, "Vcur", il);
                  if (model.layers[il].bv) {
                      Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
                      cb(Vcur, "Vcur", il);
                  }

                  // reshape to {n_embd_head, n_head(_kv), n_tokens} before applying rotary embeddings
                  Qcur = ggml_rope_ext(
                      ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors,
                      n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                      ext_factor, attn_factor, beta_fast, beta_slow
                  );
                  cb(Qcur, "Qcur", il);

                  Kcur = ggml_rope_ext(
                      ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors,
                      n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                      ext_factor, attn_factor, beta_fast, beta_slow
                  );
                  cb(Kcur, "Kcur", il);

                  // store K/V into the cache and compute the attention output (incl. output projection wo/bo)
                  cur = llm_build_kv(ctx0, lctx, kv_self, gf,
                          model.layers[il].wo, model.layers[il].bo,
                          Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
              }

              if (il == n_layer - 1) {
                  // skip computing output for unused tokens
                  struct ggml_tensor * inp_out_ids = build_inp_out_ids();
                  n_tokens = n_outputs;
                  cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
                  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
              }


              // first residual connection: attention output + layer input
              struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
              cb(ffn_inp, "ffn_inp", il);

              cur = llm_build_norm(ctx0, ffn_inp, hparams,
                      model.layers[il].ffn_norm, NULL,
                      LLM_NORM_RMS, cb, il);
              cb(cur, "ffn_norm", il);

              // the leading n_layer_dense_lead layers use a dense FFN; the rest are MoE
              if ((uint32_t) il < hparams.n_layer_dense_lead) {
                  cur = llm_build_ffn(ctx0, lctx, cur,
                          model.layers[il].ffn_up,   NULL, NULL,
                          model.layers[il].ffn_gate, NULL, NULL,
                          model.layers[il].ffn_down, NULL, NULL,
                          NULL,
                          LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
                  cb(cur, "ffn_out", il);
              } else {
                  // MoE branch: softmax gating, no expert-weight normalization,
                  // expert_weights_scale applied by the helper
                  ggml_tensor * moe_out =
                          llm_build_moe_ffn(ctx0, lctx, cur,
                              model.layers[il].ffn_gate_inp,
                              model.layers[il].ffn_up_exps,
                              model.layers[il].ffn_gate_exps,
                              model.layers[il].ffn_down_exps,
                              nullptr,
                              n_expert, n_expert_used,
                              LLM_FFN_SILU, false,
                              false, hparams.expert_weights_scale,
                              LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
                              cb, il);
                  cb(moe_out, "ffn_moe_out", il);

                  // FFN shared expert: always active, added on top of the routed experts
                  {
                      ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, cur,
                              model.layers[il].ffn_up_shexp,   NULL, NULL,
                              model.layers[il].ffn_gate_shexp, NULL, NULL,
                              model.layers[il].ffn_down_shexp, NULL, NULL,
                              NULL,
                              LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
                      cb(ffn_shexp, "ffn_shexp", il);

                      cur = ggml_add(ctx0, moe_out, ffn_shexp);
                      cb(cur, "ffn_out", il);
                  }
              }

              // second residual connection, then apply any control vector for this layer
              cur = ggml_add(ctx0, cur, ffn_inp);
              cur = lctx.cvec.apply_to(ctx0, cur, il);
              cb(cur, "l_out", il);

              // input for next layer
              inpL = cur;
          }

          cur = inpL;

          // final RMS norm before the output head
          cur = llm_build_norm(ctx0, cur, hparams,
                  model.output_norm, NULL,
                  LLM_NORM_RMS, cb, -1);
          cb(cur, "result_norm", -1);

          // lm_head
          cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);

          cb(cur, "result_output", -1);

          ggml_build_forward_expand(gf, cur);

          return gf;
      }
    
  • struct ggml_cgraph * build_deepseek2()

      // Build the ggml compute graph for one forward pass of a DeepSeek2 (V2/V3) model.
      // Uses Multi-head Latent Attention (MLA): Q and KV are projected through
      // low-rank "lora" bottlenecks, and each head is split into a RoPE part
      // (n_embd_head_qk_rope dims, position-dependent) and a NoPE part
      // (n_embd_head_qk_nope dims, position-independent). The RoPE key is shared
      // across heads (MQA-style) and broadcast at concat time.
      struct ggml_cgraph * build_deepseek2() {
          struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);

          // mutable variable, needed during the last layer of the computation to skip unused tokens
          int32_t n_tokens = this->n_tokens;

          // DeepSeek-V2-Lite has no Q low-rank projection (wq instead of wq_a/wq_b);
          // it is identified by its layer count — TODO confirm this stays unique
          bool is_lite = (hparams.n_layer == 27);

          // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly.
          // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
          const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
          const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(hparams.n_embd_head_k));
          // NOTE(review): uses the fixed 0.1f YaRN constant rather than
          // hparams.rope_yarn_log_mul — presumably intentional per the discussion above
          const float attn_factor_scaled = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));

          const uint32_t n_embd_head_qk_rope = hparams.n_rot;                          // rotary sub-head size
          const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot;  // non-rotary sub-head size
          const uint32_t kv_lora_rank = hparams.n_lora_kv;                             // KV bottleneck rank

          struct ggml_tensor * cur;
          struct ggml_tensor * inpL;

          // {n_embd, n_tokens}
          inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);

          // inp_pos - contains the positions
          struct ggml_tensor * inp_pos = build_inp_pos();

          // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
          struct ggml_tensor * KQ_mask = build_inp_KQ_mask();

          for (int il = 0; il < n_layer; ++il) {
              // residual input of this layer (pre-norm architecture)
              struct ggml_tensor * inpSA = inpL;

              // norm
              cur = llm_build_norm(ctx0, inpL, hparams,
                      model.layers[il].attn_norm, NULL,
                      LLM_NORM_RMS, cb, il);
              cb(cur, "attn_norm", il);

              // self_attention
              {
                  struct ggml_tensor * q = NULL;
                  if (!is_lite) {
                      // low-rank Q: down-project, RMS-norm, up-project
                      // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens}
                      q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur);
                      cb(q, "q", il);

                      q = llm_build_norm(ctx0, q, hparams,
                              model.layers[il].attn_q_a_norm, NULL,
                              LLM_NORM_RMS, cb, il);
                      cb(q, "q", il);

                      // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens}
                      q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q);
                      cb(q, "q", il);
                  } else {
                      // lite variant: single full-rank Q projection
                      q = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
                      cb(q, "q", il);
                  }

                  // split into {n_head * n_embd_head_qk_nope, n_tokens}
                  struct ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
                          ggml_row_size(q->type, hparams.n_embd_head_k),
                          ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
                          0);
                  cb(q_nope, "q_nope", il);

                  // and {n_head * n_embd_head_qk_rope, n_tokens}
                  struct ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
                          ggml_row_size(q->type, hparams.n_embd_head_k),
                          ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
                          ggml_row_size(q->type, n_embd_head_qk_nope));
                  cb(q_pe, "q_pe", il);

                  // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
                  // (tensor name "kv_pe_compresseed" [sic] kept for debug-name compatibility)
                  struct ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
                  cb(kv_pe_compresseed, "kv_pe_compresseed", il);

                  // split into {kv_lora_rank, n_tokens}
                  struct ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
                          kv_pe_compresseed->nb[1],
                          0);
                  cb(kv_compressed, "kv_compressed", il);

                  // and {n_embd_head_qk_rope, n_tokens} — single shared RoPE key head
                  struct ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
                          kv_pe_compresseed->nb[1],
                          kv_pe_compresseed->nb[1],
                          ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
                  cb(k_pe, "k_pe", il);

                  kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm
                  kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams,
                          model.layers[il].attn_kv_a_norm, NULL,
                          LLM_NORM_RMS, cb, il);
                  cb(kv_compressed, "kv_compressed", il);

                  // up-project the compressed KV latent to per-head K (NoPE part) and V:
                  // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
                  struct ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
                  cb(kv, "kv", il);

                  // split into {n_head * n_embd_head_qk_nope, n_tokens}
                  struct ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
                          ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
                          ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
                          0);
                  cb(k_nope, "k_nope", il);

                  // and {n_head * n_embd_head_v, n_tokens}
                  struct ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
                          ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
                          ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
                          ggml_row_size(kv->type, (n_embd_head_qk_nope)));
                  cb(v_states, "v_states", il);

                  v_states = ggml_cont(ctx0, v_states);
                  cb(v_states, "v_states", il);

                  // flatten V to {n_embd_head_v * n_head, n_tokens} for the KV cache
                  v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
                      ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
                      0);
                  cb(v_states, "v_states", il);

                  q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
                  q_pe = ggml_rope_ext(
                      ctx0, q_pe, inp_pos, nullptr,
                      n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                      ext_factor, attn_factor_scaled, beta_fast, beta_slow
                  );
                  cb(q_pe, "q_pe", il);

                  // shared RoPE key
                  k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this
                  k_pe = ggml_rope_ext(
                      ctx0, k_pe, inp_pos, nullptr,
                      n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                      ext_factor, attn_factor_scaled, beta_fast, beta_slow
                  );
                  cb(k_pe, "k_pe", il);

                  // final Q/K per head = [nope | rope]; k_pe (1 head) is broadcast to all heads
                  struct ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0);
                  cb(q_states, "q_states", il);

                  struct ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0);
                  cb(k_states, "k_states", il);

                  // store K/V into the cache and compute the attention output (output projection wo, no bias)
                  cur = llm_build_kv(ctx0, lctx, kv_self, gf,
                          model.layers[il].wo, NULL,
                          k_states, v_states, q_states, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il);
              }

              if (il == n_layer - 1) {
                  // skip computing output for unused tokens
                  struct ggml_tensor * inp_out_ids = build_inp_out_ids();
                  n_tokens = n_outputs;
                  cur   = ggml_get_rows(ctx0,   cur, inp_out_ids);
                  inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
              }

              // first residual connection: attention output + layer input
              struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
              cb(ffn_inp, "ffn_inp", il);

              cur = llm_build_norm(ctx0, ffn_inp, hparams,
                      model.layers[il].ffn_norm, NULL,
                      LLM_NORM_RMS, cb, il);
              cb(cur, "ffn_norm", il);

              // the leading n_layer_dense_lead layers use a dense FFN; the rest are MoE
              if ((uint32_t) il < hparams.n_layer_dense_lead) {
                  cur = llm_build_ffn(ctx0, lctx, cur,
                          model.layers[il].ffn_up,   NULL, NULL,
                          model.layers[il].ffn_gate, NULL, NULL,
                          model.layers[il].ffn_down, NULL, NULL,
                          NULL,
                          LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
                  cb(cur, "ffn_out", il);
              } else {
                  // MoE branch: gating function and weight normalization come from hparams;
                  // ffn_exp_probs_b is the expert-selection bias (DeepSeek-V3), may be null
                  ggml_tensor * moe_out =
                          llm_build_moe_ffn(ctx0, lctx, cur,
                              model.layers[il].ffn_gate_inp,
                              model.layers[il].ffn_up_exps,
                              model.layers[il].ffn_gate_exps,
                              model.layers[il].ffn_down_exps,
                              model.layers[il].ffn_exp_probs_b,
                              n_expert, n_expert_used,
                              LLM_FFN_SILU, hparams.expert_weights_norm,
                              true, hparams.expert_weights_scale,
                              (enum llama_expert_gating_func_type) hparams.expert_gating_func,
                              cb, il);
                  cb(moe_out, "ffn_moe_out", il);

                  // FFN shared expert: always active, added on top of the routed experts
                  {
                      ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, cur,
                              model.layers[il].ffn_up_shexp,   NULL, NULL,
                              model.layers[il].ffn_gate_shexp, NULL, NULL,
                              model.layers[il].ffn_down_shexp, NULL, NULL,
                              NULL,
                              LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
                      cb(ffn_shexp, "ffn_shexp", il);

                      cur = ggml_add(ctx0, moe_out, ffn_shexp);
                      cb(cur, "ffn_out", il);
                  }
              }

              // second residual connection, then apply any control vector for this layer
              cur = ggml_add(ctx0, cur, ffn_inp);
              cur = lctx.cvec.apply_to(ctx0, cur, il);
              cb(cur, "l_out", il);

              // input for next layer
              inpL = cur;
          }

          cur = inpL;

          // final RMS norm before the output head
          cur = llm_build_norm(ctx0, cur, hparams,
                  model.output_norm, NULL,
                  LLM_NORM_RMS, cb, -1);
          cb(cur, "result_norm", -1);

          // lm_head
          // fix: route through llm_build_lora_mm (was raw ggml_mul_mat) so LoRA
          // adapters on model.output are applied, consistent with build_deepseek()
          cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
          cb(cur, "result_output", -1);

          ggml_build_forward_expand(gf, cur);

          return gf;
      }
    
  • `case LLM_ARCH_DEEPSEEK:` and `case LLM_ARCH_DEEPSEEK2:` — the architecture-dispatch cases that route graph construction to `build_deepseek()` / `build_deepseek2()`

      // Architecture dispatch: select the graph-builder routine that
      // constructs the compute graph for the loaded model architecture.
      // Every branch assigns the finished graph to `result`.
      switch (model.arch) {
          // Several LLaMA-like architectures reuse the same builder.
          case LLM_ARCH_LLAMA:
          case LLM_ARCH_MINICPM:
          case LLM_ARCH_GRANITE:
          case LLM_ARCH_GRANITE_MOE:
              {
                  result = llm.build_llama();
              } break;
          case LLM_ARCH_DECI:
              {
                  result = llm.build_deci();
              } break;
          case LLM_ARCH_BAICHUAN:
              {
                  result = llm.build_baichuan();
              } break;
          case LLM_ARCH_FALCON:
              {
                  result = llm.build_falcon();
              } break;
          case LLM_ARCH_GROK:
              {
                  result = llm.build_grok();
              } break;
          case LLM_ARCH_STARCODER:
              {
                  result = llm.build_starcoder();
              } break;
          case LLM_ARCH_REFACT:
              {
                  result = llm.build_refact();
              } break;
          // BERT-style encoder variants share one builder.
          case LLM_ARCH_BERT:
          case LLM_ARCH_JINA_BERT_V2:
          case LLM_ARCH_NOMIC_BERT:
              {
                  result = llm.build_bert();
              } break;
          case LLM_ARCH_BLOOM:
              {
                  result = llm.build_bloom();
              } break;
          case LLM_ARCH_MPT:
              {
                  result = llm.build_mpt();
              } break;
           case LLM_ARCH_STABLELM:
              {
                  result = llm.build_stablelm();
              } break;
          case LLM_ARCH_QWEN:
              {
                  result = llm.build_qwen();
              } break;
          case LLM_ARCH_QWEN2:
              {
                  result = llm.build_qwen2();
              } break;
          case LLM_ARCH_QWEN2VL:
              {
                  // Qwen2-VL uses 4 position components per token
                  // (presumably for its multi-axis rotary embedding —
                  // confirm against build_qwen2vl()).
                  lctx.n_pos_per_token = 4;
                  result = llm.build_qwen2vl();
              } break;
          case LLM_ARCH_QWEN2MOE:
              {
                  result = llm.build_qwen2moe();
              } break;
          case LLM_ARCH_PHI2:
              {
                  result = llm.build_phi2();
              } break;
          case LLM_ARCH_PHI3:
          case LLM_ARCH_PHIMOE:
              {
                  result = llm.build_phi3();
              } break;
          case LLM_ARCH_PLAMO:
              {
                  result = llm.build_plamo();
              } break;
          case LLM_ARCH_GPT2:
              {
                  result = llm.build_gpt2();
              } break;
          case LLM_ARCH_CODESHELL:
              {
                  result = llm.build_codeshell();
              } break;
          case LLM_ARCH_ORION:
              {
                  result = llm.build_orion();
              } break;
          case LLM_ARCH_INTERNLM2:
              {
                  result = llm.build_internlm2();
              } break;
          case LLM_ARCH_MINICPM3:
              {
                  result = llm.build_minicpm3();
              } break;
          case LLM_ARCH_GEMMA:
              {
                  result = llm.build_gemma();
              } break;
          case LLM_ARCH_GEMMA2:
              {
                  result = llm.build_gemma2();
              } break;
          case LLM_ARCH_STARCODER2:
              {
                  result = llm.build_starcoder2();
              } break;
          case LLM_ARCH_MAMBA:
              {
                  result = llm.build_mamba();
              } break;
          case LLM_ARCH_XVERSE:
              {
                  result = llm.build_xverse();
              } break;
          case LLM_ARCH_COMMAND_R:
              {
                  result = llm.build_command_r();
              } break;
          case LLM_ARCH_COHERE2:
              {
                  result = llm.build_cohere2();
              } break;
          case LLM_ARCH_DBRX:
              {
                  result = llm.build_dbrx();
              } break;
          case LLM_ARCH_OLMO:
              {
                  result = llm.build_olmo();
              } break;
          case LLM_ARCH_OLMO2:
              {
                  result = llm.build_olmo2();
              } break;
          case LLM_ARCH_OLMOE:
              {
                  result = llm.build_olmoe();
              } break;
          case LLM_ARCH_OPENELM:
              {
                  result = llm.build_openelm();
              } break;
          case LLM_ARCH_GPTNEOX:
              {
                  result = llm.build_gptneox();
              } break;
          case LLM_ARCH_ARCTIC:
              {
                  result = llm.build_arctic();
              } break;
          // The two DeepSeek architectures each have a dedicated builder
          // (build_deepseek() vs build_deepseek2()).
          case LLM_ARCH_DEEPSEEK:
              {
                  result = llm.build_deepseek();
              } break;
          case LLM_ARCH_DEEPSEEK2:
              {
                  result = llm.build_deepseek2();
              } break;
          case LLM_ARCH_CHATGLM:
              {
                  result = llm.build_chatglm();
              } break;
          case LLM_ARCH_BITNET:
              {
                  result = llm.build_bitnet();
              } break;
          case LLM_ARCH_T5:
              {
                  // T5 has separate encoder and decoder graphs; choose by
                  // whether the context is currently encoding.
                  if (lctx.is_encoding) {
                      result = llm.build_t5_enc();
                  } else {
                      result = llm.build_t5_dec();
                  }
              } break;
          case LLM_ARCH_T5ENCODER:
              {
                  result = llm.build_t5_enc();
              } break;
          case LLM_ARCH_JAIS:
              {
                  result = llm.build_jais();
              } break;
          case LLM_ARCH_NEMOTRON:
              {
                  result = llm.build_nemotron();
              } break;
          case LLM_ARCH_EXAONE:
              {
                  result = llm.build_exaone();
              } break;
          case LLM_ARCH_RWKV6:
              {
                  result = llm.build_rwkv6();
              } break;
          case LLM_ARCH_RWKV6QWEN2:
              {
                  result = llm.build_rwkv6qwen2();
              } break;
          case LLM_ARCH_CHAMELEON:
              {
                  result = llm.build_chameleon();
              } break;
          case LLM_ARCH_WAVTOKENIZER_DEC:
              {
                  result = llm.build_wavtokenizer_dec();
              } break;
          // No graph builder registered for this architecture: hard abort.
          default:
              GGML_ABORT("fatal error");
      }
    

References

[1] Yongqiang Cheng, https://yongqiang.blog.csdn.net/

[2] huggingface/gguf, https://github.com/huggingface/huggingface.js/tree/main/packages/gguf

相关推荐
武陵悭臾9 小时前
网络爬虫学习:应用selenium获取Edge浏览器版本号,自动下载对应版本msedgedriver,确保Edge浏览器顺利打开。
学习·selenium·edge·deepseek·winreg·zipfile
因_果_律15 小时前
基于 AWS SageMaker 对 DeepSeek-R1-Distilled-Llama-8B 模型的精调与实践
人工智能·云计算·llama·aws·deepseek
振华OPPO1 天前
DeepSeek大模型技术解析:从架构到应用的全面探索
深度学习·神经网络·大模型·deepseek
若年封尘1 天前
OpenAI的真正对手?DeepSeek-R1如何用强化学习重构LLM能力边界——DeepSeek-R1论文精读
开源·openai·强化学习·deepseek·deepseek-r1
L~river1 天前
5分钟带你获取deepseek api并搭建简易问答应用
llm·api·deepseek·deepseek-v3·deepseek-r1
Evenurs1 天前
【deepseek】deepseek-r1本地部署-第一步:下载LM Studio
ai·deepseek
AIQL1 天前
Deepseek的RL算法GRPO解读
人工智能·算法·机器学习·deepseek·grpo算法
Yongqiang Cheng2 天前
llama.cpp LLM_ARCH_LLAMA
llama.cpp·arch_llama
ht巷子4 天前
Zotero中使用Deepseek翻译
ai·zotero·deepseek