瑞莎星睿(Radxa Orion O6):基于 Android OS、使用 NPU 的图片模糊搜索 APP 开发

安谋科技、此芯科技与瑞莎计算机共同推出"星睿O6"开发套件,专为AI PC、边缘计算及机器人等应用场景设计。该套件创新性地整合了Arm®v9 CPU核心、Arm Immortalis™ GPU以及安谋科技自主研发的"周易"NPU。

在Android操作系统环境下,开发者可利用这套开发套件,通过原生JNI方式对开源项目进行优化改造,实现基于NPU加速的CLIP技术,从而提升图片模糊搜索的性能表现。

🔍 Search local images with natural language on Android, powered by OpenAI's CLIP model. / 在 Android 上用自然语言搜索本地图片 (基于 OpenAI 的 CLIP 模型)

https://github.com/greyovo/PicQuery

下面我们开始改造之路:

  1. 导入项目到 AndroidStudio
  2. 导入 O6 NPU的 native lib 库
  3. 新增 JNI CPP代码,
  4. 改造项目 kotlin 代码
  5. 编译
  6. 上板执行!

下面给出具体的操作细节:

  • O6 NPU 的 native lib 库是在 Android 镜像的如下位置
bash 复制代码
.
└── vendor
    ├── include
    │   └── npu
    │       ├── kmd
    │       │   ├── armchina_aipu.h
    │       │   └── tcb.h
    │       └── standard_api.h
    └── lib64
        └── libaipudrv.so

这里还需要注意:我们必须把 libc++ 的库一并打包进 APK 的 jniLibs 中。因为系统镜像里提供的 libaipudrv.so 是动态链接的,并不自带这个 C++ 运行库,缺少它在运行时会报错。

bash 复制代码
vendor/lib64/libc++.so
  • 主要用到的 API 可以参考《CIX NPU 开发指导手册》

首先使用方法和 Linux 端的使用基本一致,只是需要根据 JNI 的方式做一些符合 JNI 要求的修改即可。

大家也可以参考我这边的代码来组织自己的代码,这部分实现基本上是通用的。

cpp 复制代码
/**
 * One-time NPU setup called from Kotlin before any inference.
 *
 * Initializes the AIPU context, queries the partition/cluster/core topology,
 * loads the model graph passed in as a byte array, and caches the input/output
 * tensor descriptors (input_desc / output_desc globals) plus per-output
 * scratch buffers (output_data) for later use by processNpuInference().
 *
 * @param model      model binary (aipu graph) as a Java byte[]
 * @param model_size size of the model binary in bytes
 * @return 0 on success, -1 on fatal failure.
 *
 * NOTE(review): ctx/ret/msg/opt/part_cnt/... are file-scope globals declared
 * outside this snippet; the graph stays loaded across calls by design.
 */
extern "C" JNIEXPORT jint JNICALL
Java_me_grey_picquery_NpuInference_preprocessNpuInference(JNIEnv *env, jobject thiz, jbyteArray model, jint model_size)
{
    initTestBench(&opt);

    jbyte* model_bin = env->GetByteArrayElements(model, NULL);
    if (model_bin == NULL) {
        // JVM failed to pin/copy the array; a Java OutOfMemoryError is pending.
        LOGE("[TEST ERROR] GetByteArrayElements(model) returned NULL\n");
        return -1;
    }

    // Keep a private copy: the JNI array is released at the end of this
    // function, but opt.model_bin is consumed by aipu_load_graph_helper below.
    char* buffer1 = new char[model_size];
    memcpy(buffer1, model_bin, model_size);
    opt.model_bin = buffer1;
    opt.bin_size = model_size;

    LOGE("[TEST INFO] preprocessNpuInference\n");

    memset(&sim_glb_config, 0, sizeof(sim_glb_config));
    memset(&sim_job_config, 0, sizeof(sim_job_config));
    memset(&mem_dump_config, 0, sizeof(mem_dump_config));
    mem_dump_config.dump_dir = opt.dump_dir;

    // BUGFIX: the return value was previously discarded, so the check below
    // tested a stale `ret`.
    ret = aipu_init_context(&ctx);
    if (ret != AIPU_STATUS_SUCCESS)
    {
        aipu_get_error_message(ctx, ret, &msg);
        LOGE("[TEST ERROR] AIPU_init_ctx: %s\n", msg);
    }

    ret = aipu_config_global(ctx, AIPU_CONFIG_TYPE_SIMULATION, &sim_glb_config);
    if (ret != AIPU_STATUS_SUCCESS)
    {
        aipu_get_error_message(ctx, ret, &msg);
        LOGE("[TEST ERROR] AIPU_config_simulation: %s\n", msg);
    }

    // Enumerate the NPU topology once (part_cnt is a global, so this block
    // is skipped on subsequent calls).
    if (part_cnt == 0) {
        ret = aipu_get_partition_count(ctx, &part_cnt);
        if (ret != AIPU_STATUS_SUCCESS) {
            aipu_get_error_message(ctx, ret, &msg);
            LOGE("aipu_get_partition_count: %s \n", msg);
        }

        for (uint32_t i = 0; i < part_cnt; i++) {
            ret = aipu_get_cluster_count(ctx, i, &cluster_cnt);
            if (ret != AIPU_STATUS_SUCCESS) {
                aipu_get_error_message(ctx, ret, &msg);
                LOGE("aipu_get_cluster_count: %s \n", msg);
            }

            for (uint32_t j = 0; j < cluster_cnt; j++) {
                ret = aipu_get_core_count(ctx, i, j, &core_cnt);
                if (ret != AIPU_STATUS_SUCCESS) {
                    aipu_get_error_message(ctx, ret, &msg);
                    LOGE("aipu_get_core_count: %s \n", msg);
                }
                LOGE("[TEST INFO] <part_idx, cluster_idx, core_cnt> = <%u, %u, %u>\n", i, j, core_cnt);
            }
        }
    }

    ret = aipu_load_graph_helper(ctx, opt.model_bin, opt.bin_size, &graph_id);
    if (ret != AIPU_STATUS_SUCCESS)
    {
        aipu_get_error_message(ctx, ret, &msg);
        LOGE("[TEST ERROR] AIPU_load_graph_helper: %s\n", msg);
    }
    LOGE("[TEST INFO] AIPU load graph successfully.\n");

    ret = aipu_get_tensor_count(ctx, graph_id, AIPU_TENSOR_TYPE_INPUT, &input_cnt);
    if (ret != AIPU_STATUS_SUCCESS)
    {
        aipu_get_error_message(ctx, ret, &msg);
        LOGE("[TEST ERROR] INPUT: aipu_get_tensor_count: %s\n", msg);
    }
    // %u matches the uint32_t count (was %d).
    LOGE("[TEST INFO] INPUT: aipu_get_tensor_count success: input_cnt = %u\n", input_cnt);

    for (uint32_t i = 0; i < input_cnt; i++)
    {
        aipu_tensor_desc_t desc;
        ret = aipu_get_tensor_descriptor(ctx, graph_id, AIPU_TENSOR_TYPE_INPUT, i, &desc);
        if (ret != AIPU_STATUS_SUCCESS)
        {
            aipu_get_error_message(ctx, ret, &msg);
            LOGE("[TEST ERROR] INPUT: aipu_get_tensor_descriptor: %s\n", msg);
        }

        LOGE("[TEST INFO] INPUT[%u]: desc.size: %u\n", i, desc.size);
        LOGE("[TEST INFO] INPUT[%u]: desc.scale: %f\n", i, desc.scale);
        LOGE("[TEST INFO] INPUT[%u]: desc.zero_point: %f\n", i, desc.zero_point);
        LOGE("[TEST INFO] INPUT[%u]: desc.data_type: %u\n", i, desc.data_type);
        LOGE("[TEST INFO] INPUT[%u]: desc.id: %u\n", i, desc.id);

        input_desc.push_back(desc);
    }

    ret = aipu_get_tensor_count(ctx, graph_id, AIPU_TENSOR_TYPE_OUTPUT, &output_cnt);
    if (ret != AIPU_STATUS_SUCCESS)
    {
        aipu_get_error_message(ctx, ret, &msg);
        // CONSISTENCY: was fprintf(stderr, ...) — stderr is invisible on
        // Android; route through LOGE like every other message here.
        LOGE("[TEST ERROR] OUTPUT: aipu_get_tensor_count: %s\n", msg);
    }
    LOGE("[TEST INFO] OUTPUT: aipu_get_tensor_count success: output_cnt = %u\n", output_cnt);

    for (uint32_t i = 0; i < output_cnt; i++)
    {
        aipu_tensor_desc_t desc;
        // BUGFIX: the return value was previously discarded here too.
        ret = aipu_get_tensor_descriptor(ctx, graph_id, AIPU_TENSOR_TYPE_OUTPUT, i, &desc);
        if (ret != AIPU_STATUS_SUCCESS)
        {
            aipu_get_error_message(ctx, ret, &msg);
            LOGE("[TEST ERROR] aipu_get_tensor_descriptor: %s\n", msg);
        }

        LOGE("[TEST INFO] OUTPUT[%u]: desc.size: %u\n", i, desc.size);
        LOGE("[TEST INFO] OUTPUT[%u]: desc.scale: %f\n", i, desc.scale);
        LOGE("[TEST INFO] OUTPUT[%u]: desc.zero_point: %f\n", i, desc.zero_point);
        LOGE("[TEST INFO] OUTPUT[%u]: desc.data_type: %u\n", i, desc.data_type);
        LOGE("[TEST INFO] OUTPUT[%u]: desc.id: %u\n", i, desc.id);

        output_desc.push_back(desc);
    }

    // Pre-allocate one scratch buffer per output tensor; freed when the
    // module is torn down (see the commented teardown path in process()).
    for (uint32_t i = 0; i < output_cnt; i++)
    {
        char* output = new char[output_desc[i].size];
        output_data.push_back(output);
    }

    // JNI_ABORT: we never modified the pinned model bytes, so skip the
    // copy-back that mode 0 would perform.
    env->ReleaseByteArrayElements(model, model_bin, JNI_ABORT);
    // Safe to free: aipu_load_graph_helper already consumed the copy above.
    delete[] buffer1;

    return 0;
}
cpp 复制代码
/**
 * Runs one NPU inference on the graph loaded by preprocessNpuInference().
 *
 * Copies the int[] input into a native buffer, creates and runs an AIPU job,
 * then copies the first output tensor back into the Java byte[] `output`.
 *
 * @param inputBin    input tensor data as a Java int[]
 * @param inputLength number of ints in inputBin
 * @param output      byte[] receiving output tensor 0 (must be at least
 *                    output_desc[0].size bytes)
 * @return 0 on success, -1 on fatal failure.
 *
 * NOTE(review): the graph and context are deliberately left loaded so
 * repeated inferences skip re-initialization; teardown lives elsewhere.
 */
extern "C" JNIEXPORT jint JNICALL
Java_me_grey_picquery_NpuInference_processNpuInference(JNIEnv *env, jobject thiz,
                                                       jintArray inputBin, jint inputLength,
                                                       jbyteArray output)
{
    jint* inputData = env->GetIntArrayElements(inputBin, NULL);
    jbyte* outputData = env->GetByteArrayElements(output, NULL);
    if (inputData == NULL || outputData == NULL) {
        LOGE("[TEST ERROR] Get*ArrayElements returned NULL\n");
        if (inputData)  env->ReleaseIntArrayElements(inputBin, inputData, JNI_ABORT);
        if (outputData) env->ReleaseByteArrayElements(output, outputData, JNI_ABORT);
        return -1;
    }

    const size_t input_bytes = (size_t)inputLength * sizeof(jint);

    // Native copy of the input; the JNI array may be released before the
    // driver is done with opt.inputs, so we must not hand it the pinned ptr.
    void* voidInputData = malloc(input_bytes);
    if (voidInputData == NULL) {
        LOGE("[TEST ERROR] malloc(%zu) failed\n", input_bytes);
        env->ReleaseIntArrayElements(inputBin, inputData, JNI_ABORT);
        env->ReleaseByteArrayElements(output, outputData, JNI_ABORT);
        return -1;
    }
    memcpy(voidInputData, inputData, input_bytes);
    opt.inputs.push_back(voidInputData);
    opt.inputs_size.push_back(input_bytes);

    // %zu matches size_t (was %lu, a format mismatch on 32-bit ABIs).
    LOGE("[TEST INFO] NpuInference void* type inputLength= %zu \n", input_bytes);

    LOGE("[TEST INFO] do npu inference now\n");

    create_job_cfg.partition_id = 0;
    create_job_cfg.qos_level = AIPU_JOB_QOS_HIGH;
    ret = aipu_create_job(ctx, graph_id, &job_id, &create_job_cfg);
    if (ret != AIPU_STATUS_SUCCESS)
    {
        aipu_get_error_message(ctx, ret, &msg);
        LOGE("[TEST ERROR] aipu_create_job: %s\n", msg);
    }
    LOGE("[TEST INFO] aipu_create_job success\n");

    // BUGFIX: return value was previously discarded, so the check below
    // tested a stale `ret`.
    ret = aipu_config_job(ctx, job_id, AIPU_CONFIG_TYPE_SIMULATION, &sim_job_config);
    if (ret != AIPU_STATUS_SUCCESS)
    {
        aipu_get_error_message(ctx, ret, &msg);
        LOGE("[TEST ERROR] aipu_config_job: %s\n", msg);
    }
    LOGE("[TEST INFO] set job simulation config success\n");

    if (opt.inputs.size() != input_cnt) {
        LOGE("[TEST WARN] input file count (%u) != input tensor count (%u)\n",
             (uint32_t)opt.inputs.size(), input_cnt);
    }

    for (uint32_t i = 0; i < min((uint32_t)opt.inputs.size(), input_cnt); i++) {
        if (input_desc[i].size > opt.inputs_size[i]) {
            LOGE("[TEST INFO] input file %s len 0x%x < input tensor %u size 0x%x\n",
                 opt.input_files[i].c_str(), opt.inputs_size[i], i, input_desc[i].size);
        }

        ret = aipu_load_tensor(ctx, job_id, i, opt.inputs[i]);
        if (ret != AIPU_STATUS_SUCCESS) {
            aipu_get_error_message(ctx, ret, &msg);
            LOGE("[TEST ERROR] aipu_load_tensor: %s\n", msg);
        }
        LOGE("[TEST INFO] load input tensor %u from (%u/%u)\n", i, i + 1, input_cnt);
    }

    gettimeofday(&timestart, NULL);

    // BUGFIX: return value was previously discarded here as well.
    ret = aipu_finish_job(ctx, job_id, -1);  // -1 = block until the job completes
    if (ret != AIPU_STATUS_SUCCESS)
    {
        aipu_get_error_message(ctx, ret, &msg);
        LOGE("[TEST ERROR] aipu_finish_job: %s\n", msg);
        pass = -1;
    }
    LOGE("[TEST INFO] aipu_finish_job success\n");

    gettimeofday(&timeend, NULL);

    // BUGFIX: the old code popped input_cnt entries although only one was
    // pushed per call — undefined behavior on an empty vector when
    // input_cnt > 1. Pop exactly what this call pushed.
    opt.inputs.pop_back();
    opt.inputs_size.pop_back();

    for (uint32_t i = 0; i < output_cnt; i++)
    {
        ret = aipu_get_tensor(ctx, job_id, AIPU_TENSOR_TYPE_OUTPUT, i, output_data[i]);
        if (ret != AIPU_STATUS_SUCCESS)
        {
            aipu_get_error_message(ctx, ret, &msg);
            LOGE("[TEST ERROR] aipu_get_tensor: %s\n", msg);
        }
        LOGE("[TEST INFO] get output tensor %u success (%u/%u)\n", i, i + 1, output_cnt);
    }

    LOGE("[TEST INFO] output_desc[0].size 0x%x\n", output_desc[0].size);

    // Post process: hand output tensor 0 back to Java, guarding against a
    // caller-supplied buffer that is too small (GetArrayLength is authoritative).
    LOGE("[TEST INFO] npu post process\n");
    jsize out_cap = env->GetArrayLength(output);
    if ((uint32_t)out_cap < output_desc[0].size) {
        LOGE("[TEST ERROR] output buffer too small: %d < %u\n",
             (int)out_cap, output_desc[0].size);
        pass = -1;
    } else {
        memcpy(outputData, output_data[0], output_desc[0].size);
    }

    ret = aipu_clean_job(ctx, job_id);
    if (ret != AIPU_STATUS_SUCCESS)
    {
        aipu_get_error_message(ctx, ret, &msg);
        LOGE("[TEST ERROR] AIPU_clean_job: %s\n", msg);
    }
    LOGE("[TEST INFO] aipu_clean_job success\n");

    // JNI_ABORT for the untouched input; mode 0 for the output so the
    // inference result is copied back into the Java array.
    env->ReleaseIntArrayElements(inputBin, inputData, JNI_ABORT);
    env->ReleaseByteArrayElements(output, outputData, 0);

    // BUGFIX: was delete[] on a malloc'ed pointer (undefined behavior);
    // also removed the never-used buffer2 allocation.
    free(voidInputData);
    return 0;
}

CMakeLists.txt 也可以参考我的代码:

bash 复制代码
# Build the JNI wrapper (libnpu_inference.so) around the NPU user-mode driver.
include_directories(${CMAKE_CURRENT_SOURCE_DIR})

add_library(npu_inference SHARED
        ${CMAKE_CURRENT_SOURCE_DIR}/npu_inference.cpp)

# aipudrv: the NPU user-mode driver shipped in the Android image
#          (vendor/lib64/libaipudrv.so)
# android/log: NDK libraries needed for JNI and __android_log_print (LOGE)
target_link_libraries(npu_inference
        aipudrv
        android
        log)
相关推荐
极客学术工坊2 小时前
2023年第二十届五一数学建模竞赛-A题 无人机定点投放问题-基于抛体运动的无人机定点投放问题研究
人工智能·机器学习·数学建模·启发式算法
Theodore_10223 小时前
深度学习(9)导数与计算图
人工智能·深度学习·机器学习·矩阵·线性回归
PPIO派欧云3 小时前
PPIO上新GPU实例模板,一键部署PaddleOCR-VL
人工智能
TGITCIC5 小时前
金融RAG落地之痛:不在模型,而在数据结构
人工智能·ai大模型·ai agent·ai智能体·开源大模型·金融ai·金融rag
六件套是我7 小时前
redission实现延时队列
android·java·servlet
chenzhiyuan20188 小时前
《十五五规划》下的AI边缘计算机遇:算力下沉与工业智能化
人工智能·边缘计算
whaosoft-1438 小时前
51c深度学习~合集11
人工智能
Tiandaren9 小时前
大模型应用03 || 函数调用 Function Calling || 概念、思想、流程
人工智能·算法·microsoft·数据分析
领航猿1号9 小时前
Pytorch 内存布局优化:Contiguous Memory
人工智能·pytorch·深度学习·机器学习
综合热讯9 小时前
宠智灵宠物识别AI:从犬猫到鸟鱼的全生态智能识别
人工智能·宠物