【rk】——rk3588推理获得logits

说明:用python进行rk3588模型RKLLM_INFER_GET_LOGITS模式推理获得logits

代码:

python 复制代码
import ctypes
import sys
import os
import subprocess
import resource
import threading
import time
import argparse
import json
from flask import Flask, request, jsonify, Response
from transformers import AutoTokenizer
import numpy as np
from tqdm import tqdm

# Set the dynamic library path
rkllm_lib = ctypes.CDLL('lib/librkllmrt.so')
# Define the structures from the library
RKLLM_Handle_t = ctypes.c_void_p
userdata = ctypes.c_void_p(None)

LLMCallState = ctypes.c_int
LLMCallState.RKLLM_RUN_NORMAL  = 0
LLMCallState.RKLLM_RUN_WAITING  = 1
LLMCallState.RKLLM_RUN_FINISH  = 2
LLMCallState.RKLLM_RUN_ERROR   = 3

RKLLMInputType = ctypes.c_int
RKLLMInputType.RKLLM_INPUT_PROMPT      = 0
RKLLMInputType.RKLLM_INPUT_TOKEN       = 1
RKLLMInputType.RKLLM_INPUT_EMBED       = 2
RKLLMInputType.RKLLM_INPUT_MULTIMODAL  = 3

RKLLMInferMode = ctypes.c_int
RKLLMInferMode.RKLLM_INFER_GENERATE = 0
RKLLMInferMode.RKLLM_INFER_GET_LAST_HIDDEN_LAYER = 1
RKLLMInferMode.RKLLM_INFER_GET_LOGITS = 2
class RKLLMExtendParam(ctypes.Structure):
    _fields_ = [
        ("base_domain_id", ctypes.c_int32),
        ("embed_flash", ctypes.c_int8),
        ("enabled_cpus_num", ctypes.c_int8),
        ("enabled_cpus_mask", ctypes.c_uint32),
        ("n_batch", ctypes.c_uint8),
        ("use_cross_attn", ctypes.c_int8),
        ("reserved", ctypes.c_uint8 * 104)
    ]

class RKLLMParam(ctypes.Structure):
    _fields_ = [
        ("model_path", ctypes.c_char_p),
        ("max_context_len", ctypes.c_int32),
        ("max_new_tokens", ctypes.c_int32),
        ("top_k", ctypes.c_int32),
        ("n_keep", ctypes.c_int32),
        ("top_p", ctypes.c_float),
        ("temperature", ctypes.c_float),
        ("repeat_penalty", ctypes.c_float),
        ("frequency_penalty", ctypes.c_float),
        ("presence_penalty", ctypes.c_float),
        ("mirostat", ctypes.c_int32),
        ("mirostat_tau", ctypes.c_float),
        ("mirostat_eta", ctypes.c_float),
        ("skip_special_token", ctypes.c_bool),
        ("is_async", ctypes.c_bool),
        ("img_start", ctypes.c_char_p),
        ("img_end", ctypes.c_char_p),
        ("img_content", ctypes.c_char_p),
        ("extend_param", RKLLMExtendParam),
    ]

class RKLLMLoraAdapter(ctypes.Structure):
    _fields_ = [
        ("lora_adapter_path", ctypes.c_char_p),
        ("lora_adapter_name", ctypes.c_char_p),
        ("scale", ctypes.c_float)
    ]

class RKLLMEmbedInput(ctypes.Structure):
    _fields_ = [
        ("embed", ctypes.POINTER(ctypes.c_float)),
        ("n_tokens", ctypes.c_size_t)
    ]

class RKLLMTokenInput(ctypes.Structure):
    _fields_ = [
        ("input_ids", ctypes.POINTER(ctypes.c_int32)),
        ("n_tokens", ctypes.c_size_t)
    ]

class RKLLMMultiModalInput(ctypes.Structure):
    _fields_ = [
        ("prompt", ctypes.c_char_p),
        ("image_embed", ctypes.POINTER(ctypes.c_float)),
        ("n_image_tokens", ctypes.c_size_t),
        ("n_image", ctypes.c_size_t),
        ("image_width", ctypes.c_size_t),
        ("image_height", ctypes.c_size_t)
    ]

class RKLLMInputUnion(ctypes.Union):
    _fields_ = [
        ("prompt_input", ctypes.c_char_p),
        ("embed_input", RKLLMEmbedInput),
        ("token_input", RKLLMTokenInput),
        ("multimodal_input", RKLLMMultiModalInput)
    ]

class RKLLMInput(ctypes.Structure):
    _fields_ = [
        ("role", ctypes.c_char_p),
        ("enable_thinking", ctypes.c_bool),
        ("input_type", RKLLMInputType),
        ("input_data", RKLLMInputUnion)
    ]

class RKLLMLoraParam(ctypes.Structure):
    _fields_ = [
        ("lora_adapter_name", ctypes.c_char_p)
    ]

class RKLLMPromptCacheParam(ctypes.Structure):
    _fields_ = [
        ("save_prompt_cache", ctypes.c_int),
        ("prompt_cache_path", ctypes.c_char_p)
    ]

class RKLLMInferParam(ctypes.Structure):
    _fields_ = [
        ("mode", RKLLMInferMode),
        ("lora_params", ctypes.POINTER(RKLLMLoraParam)),
        ("prompt_cache_params", ctypes.POINTER(RKLLMPromptCacheParam)),
        ("keep_history", ctypes.c_int)
    ]

class RKLLMResultLastHiddenLayer(ctypes.Structure):
    _fields_ = [
        ("hidden_states", ctypes.POINTER(ctypes.c_float)),
        ("embd_size", ctypes.c_int),
        ("num_tokens", ctypes.c_int)
    ]

class RKLLMResultLogits(ctypes.Structure):
    _fields_ = [
        ("logits", ctypes.POINTER(ctypes.c_float)),
        ("vocab_size", ctypes.c_int),
        ("num_tokens", ctypes.c_int)
    ]

class RKLLMPerfStat(ctypes.Structure):
    _fields_ = [
        ("prefill_time_ms", ctypes.c_float),
        ("prefill_tokens", ctypes.c_int),
        ("generate_time_ms", ctypes.c_float),
        ("generate_tokens", ctypes.c_int),
        ("memory_usage_mb", ctypes.c_float)
    ]

class RKLLMResult(ctypes.Structure):
    _fields_ = [
        ("text", ctypes.c_char_p),
        ("token_id", ctypes.c_int),
        ("last_hidden_layer", RKLLMResultLastHiddenLayer),
        ("logits", RKLLMResultLogits),
        ("perf", RKLLMPerfStat)
    ]

# Create a lock to control multi-user access to the server.
lock = threading.Lock()

# Create a global variable to indicate whether the server is currently in a blocked state.
is_blocking = False

# Define global variables to store the callback function output for displaying in the Gradio interface
system_prompt = ''
global_text = []
global_state = -1
split_byte_data = bytes(b"") # Used to store the segmented byte data
global_logits = None
global_input_ids_len = 0

recevied_messages = []

# Define the callback function
def callback_impl(result, userdata, state):
    global global_text, global_state, split_byte_data
    if state == LLMCallState.RKLLM_RUN_FINISH:
        global_state = state
        print("\n")
        sys.stdout.flush()
    elif state == LLMCallState.RKLLM_RUN_ERROR:
        global_state = state
        print("run error")
        sys.stdout.flush()
    elif state == LLMCallState.RKLLM_RUN_NORMAL:
        global_state = state
        global_text += result.contents.text.decode('utf-8')
    return 0


def ppl_callback_impl(result, userdata, state):
    global global_input_ids_len, global_logits
    if state == LLMCallState.RKLLM_RUN_NORMAL:
        if global_input_ids_len != result.contents.logits.num_tokens:
            print(f"input_ids_len:{global_input_ids_len}, num_tokens:{result.contents.logits.num_tokens}")
        num_tokens = result.contents.logits.num_tokens
        vocab_size = result.contents.logits.vocab_size
        global_logits = np.ctypeslib.as_array(result.contents.logits.logits, shape=(num_tokens, vocab_size))
    elif state == LLMCallState.RKLLM_RUN_FINISH:
        pass
    else:
        raise Exception("ppl Call Error")
    return 0
    

# Connect the callback function between the Python side and the C++ side
callback_type = ctypes.CFUNCTYPE(ctypes.c_int, ctypes.POINTER(RKLLMResult), ctypes.c_void_p, ctypes.c_int)
callback = callback_type(ppl_callback_impl)

# Define the RKLLM class, which includes initialization, inference, and release operations for the RKLLM model in the dynamic library
class RKLLM(object):
    def __init__(self, model_path, lora_model_path = None, prompt_cache_path = None, platform = "rk3588"):

        self.rkllm_createDefaultParam = rkllm_lib.rkllm_createDefaultParam
        self.rkllm_createDefaultParam.argtypes = []
        self.rkllm_createDefaultParam.restype = RKLLMParam
        rkllm_param = self.rkllm_createDefaultParam()
        rkllm_param.model_path = bytes(model_path, 'utf-8')
        rkllm_param.top_k = 1
        rkllm_param.top_p = 0.95
        rkllm_param.temperature = 0.8
        rkllm_param.repeat_penalty = 1.1
        rkllm_param.frequency_penalty = 0.0
        rkllm_param.presence_penalty = 0.0

        rkllm_param.max_new_tokens = 1
        rkllm_param.max_context_len = 2048
        rkllm_param.skip_special_token = ctypes.c_bool(True)
        rkllm_param.extend_param.base_domain_id = 0
        rkllm_param.extend_param.embed_flash = 1

        rkllm_param.extend_param.enabled_cpus_num = 4
        if platform.lower() in ["rk3576", "rk3588"]:
            rkllm_param.extend_param.enabled_cpus_mask = (1 << 4)|(1 << 5)|(1 << 6)|(1 << 7)
        else:
            rkllm_param.extend_param.enabled_cpus_mask = (1 << 0)|(1 << 1)|(1 << 2)|(1 << 3)

        self.handle = RKLLM_Handle_t()

        self.rkllm_init = rkllm_lib.rkllm_init
        self.rkllm_init.argtypes = [ctypes.POINTER(RKLLM_Handle_t), ctypes.POINTER(RKLLMParam), callback_type]
        self.rkllm_init.restype = ctypes.c_int
        ret = self.rkllm_init(ctypes.byref(self.handle), ctypes.byref(rkllm_param), callback)
        if (ret != 0):
            print("\nrkllm init failed\n")
            exit(0)
        else:
            print("\nrkllm init success!\n")

        self.rkllm_run = rkllm_lib.rkllm_run
        self.rkllm_run.argtypes = [RKLLM_Handle_t, ctypes.POINTER(RKLLMInput), ctypes.POINTER(RKLLMInferParam), ctypes.c_void_p]
        self.rkllm_run.restype = ctypes.c_int

        self.rkllm_clear_kv_cache = rkllm_lib.rkllm_clear_kv_cache
        self.rkllm_clear_kv_cache.argtypes = [RKLLM_Handle_t, ctypes.c_int, ctypes.POINTER(ctypes.c_int), ctypes.POINTER(ctypes.c_int)]
        self.rkllm_clear_kv_cache.restype = ctypes.c_int
        
        self.rkllm_destroy = rkllm_lib.rkllm_destroy
        self.rkllm_destroy.argtypes = [RKLLM_Handle_t]
        self.rkllm_destroy.restype = ctypes.c_int
        
        self.rkllm_abort = rkllm_lib.rkllm_abort

        self.rkllm_infer_params = RKLLMInferParam()
        ctypes.memset(ctypes.byref(self.rkllm_infer_params), 0, ctypes.sizeof(RKLLMInferParam))
        self.rkllm_infer_params.mode = RKLLMInferMode.RKLLM_INFER_GET_LOGITS
        self.rkllm_infer_params.keep_history = 0


    def run_with_ids(self, *param):
        role, enable_thinking, ids = param
        rkllm_input = RKLLMInput()
        ctypes.memset(ctypes.byref(rkllm_input), 0, ctypes.sizeof(RKLLMInput))

        rkllm_input.input_type = RKLLMInputType.RKLLM_INPUT_TOKEN
        rkllm_input.input_data.token_input.input_ids = ids.ctypes.data_as(ctypes.POINTER(ctypes.c_int32))
        rkllm_input.input_data.token_input.n_tokens = ctypes.c_size_t(ids.shape[0])
        self.rkllm_run(self.handle, ctypes.byref(rkllm_input), ctypes.byref(self.rkllm_infer_params), None)
        return

    def abort(self):
        return self.rkllm_abort(self.handle)
    
    def release(self):
        self.rkllm_destroy(self.handle)


def cross_entropy_loss_numpy(logits, targets):
    shift_logits = logits - np.max(logits, axis=1, keepdims=True)
    log_probs = shift_logits - np.log(np.sum(np.exp(shift_logits), axis=1, keepdims=True))
    n = logits.shape[0]
    loss = -log_probs[np.arange(n), targets]
    return np.mean(loss)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--rkllm_model_path', type=str, default="models/Qwen3-0.6B-rk3588-w8a8.rkllm", help='Absolute path of the converted RKLLM model on the Linux board;')
    parser.add_argument('--target_platform', type=str, default="rk3588", help='Target platform: e.g., rk3588/rk3576;')
    parser.add_argument('--lora_model_path', type=str, help='Absolute path of the lora_model on the Linux board;')
    parser.add_argument('--prompt_cache_path', type=str, help='Absolute path of the prompt_cache file on the Linux board;')
    args = parser.parse_args()

    ## qwen3-1.7b
    args.rkllm_model_path = "models/Qwen3-1.7B-rk3588-w8a8.rkllm"
    json_file = "assert/gsm8k_17d799_qwen3_1.7b_pred.json"

    ## qwen3-0.6b
    args.rkllm_model_path = "models/Qwen3-0.6B-rk3588-w8a8.rkllm"
    json_file = "assert/gsm8k_17d799_qwen3_0.6B_pred.json"

    if not os.path.exists(args.rkllm_model_path):
        print("Error: Please provide the correct rkllm model path, and ensure it is the absolute path on the board.")
        sys.stdout.flush()
        exit()

    if not (args.target_platform in ["rk3588", "rk3576", "rv1126b", "rk3562"]):
        print("Error: Please specify the correct target platform: rk3588/rk3576/rv1126b/rk3562.")
        sys.stdout.flush()
        exit()

    if args.lora_model_path:
        if not os.path.exists(args.lora_model_path):
            print("Error: Please provide the correct lora_model path, and advise it is the absolute path on the board.")
            sys.stdout.flush()
            exit()

    if args.prompt_cache_path:
        if not os.path.exists(args.prompt_cache_path):
            print("Error: Please provide the correct prompt_cache_file path, and advise it is the absolute path on the board.")
            sys.stdout.flush()
            exit()

    # Fix frequency
    command = "sudo bash fix_freq_{}.sh".format(args.target_platform)
    subprocess.run(command, shell=True)

    # Set resource limit
    resource.setrlimit(resource.RLIMIT_NOFILE, (102400, 102400))

    # Initialize RKLLM model
    print("=========init....===========")
    sys.stdout.flush()
    model_path = args.rkllm_model_path
    rkllm_model = RKLLM(model_path, args.lora_model_path, args.prompt_cache_path, args.target_platform)
    print("load rkllm model from : {}".format(model_path))
    print("==============================")
    sys.stdout.flush()

    
    ## json_file
    with open(json_file, "r") as f:
        json_datas = json.load(f)

    loss = 0.0
    num_batches = 0
    for key, item in tqdm(json_datas.items()):
        input_ids = item['input_ids']
        gen_index = item['gen_index']
        input_ids = np.array(input_ids, dtype=np.int32)
        print("input_ids len:", input_ids.shape[0])

        # Reset global variables.
        ret = rkllm_model.rkllm_clear_kv_cache(rkllm_model.handle, 1, None, None)
        if ret != 0:
            print("clear kv cache failed")

        global_input_ids_len = input_ids.shape[0]
        inputs = ["user", False, input_ids]
        rkllm_model.run_with_ids(*inputs)   

        shift_logits = global_logits[gen_index:-1, :]
        shift_labels = input_ids[gen_index+1:]

        ce_loss = cross_entropy_loss_numpy(shift_logits, shift_labels)
        print("ce Loss: {:.4f}, ppl loss: {:.4f}".format(ce_loss, np.exp(ce_loss)))
        loss += ce_loss
        num_batches += 1
        if num_batches >= 100:
                break

        pass

    ppl_loss = np.exp(loss / num_batches)
    print("ppl loss: {:.4f}".format(ppl_loss))
相关推荐
新缸中之脑9 小时前
Graphlit: AI代理的上下文图层
大数据·人工智能
用户986356545709 小时前
肝了一个周末,终于把 n8n 自动化工作流完美部署了(附私有化防坑指南)
人工智能
田里的水稻10 小时前
FA_规划和控制(PC)-瑞德斯.谢普路径规划(RSPP))
人工智能·算法·数学建模·机器人·自动驾驶
初恋叫萱萱10 小时前
基于 Rust 与 DeepSeek V3.2 构建高性能插件化 LLM 应用框架深度解析
网络·人工智能·rust
heimeiyingwang10 小时前
大模型 RAG 技术原理与企业级落地实践
大数据·数据库·人工智能·架构
Yuer202510 小时前
Model Admission White-List Spec高收益任务 · 模型准入白名单规范 v0.1(冻结草案)
人工智能·edca os·可控ai
小殊小殊10 小时前
2025 年人工智能的现状:Agent从黑马到泡沫?
人工智能·chatgpt
Gaosiy10 小时前
脑机接口与神经调控专栏开通
人工智能·脑机接口·神经调控
Faker66363aaa10 小时前
Mask R-CNN实现植物存在性检测与分类详解_基于R50-FPN-GRoIE_1x_COCO模型分析
人工智能·分类·cnn
Hui Baby11 小时前
推理引擎vLLM & TensorRT
人工智能