Getting Started with llama.cpp: Developing Against the Open-Source LLM Framework's C++ API

llama.cpp is a lightweight, open-source large language model framework written in C++. It can run LLMs locally on ordinary consumer-grade hardware, and it can also be linked into an application as a library to provide GPT-style functionality.

The demo below builds against the llama.cpp source tree (release tag b1547) and uses its C++ API to load a local model file and generate text in a GPT-like fashion.

Project structure

```text
llamacpp_starter
├── llama.cpp-b1547
├── src
│   └── main.cpp
└── CMakeLists.txt
```

CMakeLists.txt

```cmake
cmake_minimum_required(VERSION 3.15)

# note: this starter project has only been tested on unix-like systems; building on Windows is not covered here

project(llamacpp_starter)

set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# build llama.cpp itself plus its 'common' helper library from the bundled source tree
add_subdirectory(llama.cpp-b1547)

include_directories(
    ${CMAKE_CURRENT_SOURCE_DIR}/llama.cpp-b1547
    ${CMAKE_CURRENT_SOURCE_DIR}/llama.cpp-b1547/common
)

file(GLOB SRC
    src/*.h
    src/*.cpp
)

add_executable(${PROJECT_NAME} ${SRC})

target_link_libraries(${PROJECT_NAME}
    common
    llama
)
```

main.cpp

```cpp
#include <iostream>
#include <string>
#include <vector>
#include "common.h"
#include "llama.h"

int main(int argc, char** argv)
{
    bool numa_support = false;                                // whether to enable NUMA optimizations in the backend
    const std::string model_file_path = "./llama-ggml.gguf";
    const std::string prompt = "once upon a time";            // input prompt
    const int n_len = 32;     // total length of the sequence including the prompt

    // set gpt params
    gpt_params params;
    params.model = model_file_path;
    params.prompt = prompt;


    // init LLM
    llama_backend_init(numa_support);

    // load model
    llama_model_params model_params = llama_model_default_params();
    //model_params.n_gpu_layers = 99; // offload all layers to the GPU

    llama_model* model = llama_load_model_from_file(model_file_path.c_str(), model_params);

    if (model == NULL)
    {
        std::cerr << __func__ << " load model file error" << std::endl;
        return 1;
    }

    // init context
    llama_context_params ctx_params = llama_context_default_params();

    ctx_params.seed = 1234;
    ctx_params.n_ctx = 2048;
    ctx_params.n_threads = params.n_threads;
    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;

    llama_context* ctx = llama_new_context_with_model(model, ctx_params);

    if (ctx == NULL)
    {
        std::cerr << __func__ << " failed to create the llama_context" << std::endl;
        return 1;
    }

    // tokenize the prompt
    std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true);
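    // (the third argument 'true' asks llama_tokenize to prepend the BOS token)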

    const int n_ctx = llama_n_ctx(ctx);
    // required KV cache size: the prompt tokens plus the tokens still to be generated (equals n_len here)
    const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size());

    // make sure the KV cache is big enough to hold all the prompt and generated tokens
    if (n_kv_req > n_ctx)
    {
        std::cerr << __func__ << " error: n_kv_req > n_ctx, the required KV cache size is not big enough" << std::endl;
        std::cerr << __func__ << " either reduce n_parallel or increase n_ctx" << std::endl;
        return 1;
    }

    // echo the prompt token by token; each piece already carries its own leading whitespace
    for (auto id : tokens_list)
        std::cout << llama_token_to_piece(ctx, id);
    std::cout << std::endl;

    // create a llama_batch with size 512
    // we use this object to submit token data for decoding
    llama_batch batch = llama_batch_init(512, 0, 1);
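    // each batch entry carries a token id, its position in the sequence, the sequence id(s)
    // it belongs to, and a flag telling llama_decode whether to compute logits for that position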

    // evaluate the initial prompt
    for (size_t i = 0; i < tokens_list.size(); i++)
        llama_batch_add(batch, tokens_list[i], i, { 0 }, false);

    // llama_decode will output logits only for the last token of the prompt
    batch.logits[batch.n_tokens - 1] = true;

    if (llama_decode(ctx, batch) != 0)
    {
        std::cerr << __func__ << " llama_decode failed" << std::endl;
        return 1;
    }

    // main loop to generate words
    int n_cur = batch.n_tokens;
    int n_decode = 0;

    const auto t_main_start = ggml_time_us();

    while (n_cur <= n_len)
    {
        // sample the next token
        auto n_vocab = llama_n_vocab(model);
        auto* logits = llama_get_logits_ith(ctx, batch.n_tokens - 1);
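        // logits points to n_vocab floats: the unnormalized score of every possible next token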

        std::vector<llama_token_data> candidates;
        candidates.reserve(n_vocab);

        for (llama_token token_id = 0; token_id < n_vocab; token_id++)
        {
            candidates.emplace_back(llama_token_data{ token_id, logits[token_id], 0.0f });
        }

        // wrap the candidate buffer; 'sorted' is false because entries are in vocab order, not sorted by logit
        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

        // sample the most likely token
        const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);

        // is it an end of stream?
        if (new_token_id == llama_token_eos(model) || n_cur == n_len)
        {
            std::cout << std::endl;
            break;
        }

        // print the newly generated piece (flush so it appears immediately)
        std::cout << llama_token_to_piece(ctx, new_token_id) << std::flush;

        // prepare the next batch
        llama_batch_clear(batch);

        // push this new token for next evaluation
        llama_batch_add(batch, new_token_id, n_cur, { 0 }, true);

        n_decode += 1;

        n_cur += 1;

        // evaluate the current batch with the transformer model
        if (llama_decode(ctx, batch))
        {
            std::cerr << __func__ << " failed to eval" << std::endl;
            return 1;
        }
    }
    std::cout << std::endl;

    const auto t_main_end = ggml_time_us();

    std::cout << __func__ << " decoded " << n_decode << " tokens in " << (t_main_end - t_main_start) / 1000000.0f << " s, speed: " << n_decode / ((t_main_end - t_main_start) / 1000000.0f) << " t / s" << std::endl;

    llama_print_timings(ctx);

    llama_batch_free(batch);

    // free context
    llama_free(ctx);
    llama_free_model(model);

    // free LLM
    llama_backend_free();

    return 0;
}
```
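The loop above always picks the single most likely token (greedy sampling), which is deterministic but can sound repetitive. llama.h at b1547 also ships the standard top-k / top-p / temperature samplers, so the sampling step can be swapped for a stochastic one. Below is a minimal sketch of that variation, reusing the `ctx` and `candidates_p` built inside the loop; the specific k, p, and temperature values are illustrative, not tuned recommendations:

```cpp
// sketch: replace the llama_sample_token_greedy call in the loop with stochastic sampling
llama_sample_top_k(ctx, &candidates_p, 40, 1);     // keep only the 40 highest-logit candidates
llama_sample_top_p(ctx, &candidates_p, 0.9f, 1);   // then keep the smallest set covering 90% of the probability mass
llama_sample_temp (ctx, &candidates_p, 0.8f);      // rescale logits by 1/temperature
const llama_token new_token_id = llama_sample_token(ctx, &candidates_p); // draw from the remaining candidates
```

Because the context was created with a fixed seed (1234 above), the stochastic variant still produces reproducible output from run to run.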

Source code

llamacpp_starter

