Arm China (安谋科技), Cix Technology (此芯科技), and Radxa (瑞莎计算机) have jointly launched the "星睿O6" development kit, designed for AI PC, edge computing, and robotics workloads. The kit combines Arm®v9 CPU cores, an Arm Immortalis™ GPU, and Arm China's in-house "周易" (Zhouyi) NPU.
On Android, developers can use this kit to retrofit the open-source project below via native JNI, running its CLIP model on the NPU to speed up fuzzy image search.
🔍 Search local images with natural language on Android, powered by OpenAI's CLIP model.
https://github.com/greyovo/PicQuery
Now let's walk through the port:
- Import the project into Android Studio
- Import the O6 NPU native libraries
- Add the JNI C++ code
- Adapt the project's Kotlin code
- Build
- Run it on the board!
Here are the concrete steps:
- The O6 NPU native libraries live at the following paths in the Android image:
```bash
.
└── vendor
├── include
│ └── npu
│ ├── kmd
│ │ ├── armchina_aipu.h
│ │ └── tcb.h
│ └── standard_api.h
└── lib64
└── libaipudrv.so
```
One thing to note: you also need to bundle the libc++ library into the APK's jniLibs, because the libaipudrv.so shipped in the system image appears to be dynamically linked against it; without libc++.so in the APK, the app will fail with a linker error at run time:
```bash
vendor/lib64/libc++.so
```
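For reference, both libraries (and the NPU headers, for compiling the JNI code) can be pulled off the board with adb; the `app/src/main/jniLibs/arm64-v8a` and `app/src/main/cpp` layout below is an assumption for a standard Android Studio project, so adjust the destination paths to your own tree:

```bash
# Copy the NPU user-mode driver, the C++ runtime it depends on, and the
# NPU headers from the O6's Android image into the app source tree.
adb pull /vendor/lib64/libaipudrv.so app/src/main/jniLibs/arm64-v8a/
adb pull /vendor/lib64/libc++.so app/src/main/jniLibs/arm64-v8a/
adb pull /vendor/include/npu app/src/main/cpp/npu
```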
- The main APIs used here are documented in the CIX NPU development guide.
The usage is essentially the same as on Linux; you only need a few JNI-specific adjustments. You can also use my code below as a template for organizing your own; it should be fairly generic.
```cpp
// NOTE: ctx, ret, msg, graph_id, opt, the sim_*/mem_dump_config structs, the
// input_desc/output_desc/output_data containers, and the timestart/timeend
// timevals are file-scope variables shared by both JNI functions; their
// declarations (and the #includes) are omitted here for brevity.
extern "C" JNIEXPORT jint JNICALL
Java_me_grey_picquery_NpuInference_preprocessNpuInference(JNIEnv *env, jobject thiz, jbyteArray model, jint model_size)
{
initTestBench(&opt);
// Copy the model blob out of the JNI byte array; the copy stays valid after
// the array is released at the end of this function.
jbyte* model_bin = env->GetByteArrayElements(model, NULL);
char* buffer1 = new char[model_size];
memcpy(buffer1, model_bin, model_size);
opt.model_bin = buffer1;
opt.bin_size = model_size;
LOGE("[TEST INFO] preprocessNpuInference\n");
memset(&sim_glb_config, 0, sizeof(sim_glb_config));
memset(&sim_job_config, 0, sizeof(sim_job_config));
memset(&mem_dump_config, 0, sizeof(mem_dump_config));
mem_dump_config.dump_dir = opt.dump_dir;
ret = aipu_init_context(&ctx);
if (ret != AIPU_STATUS_SUCCESS)
{
aipu_get_error_message(ctx, ret, &msg);
LOGE("[TEST ERROR] AIPU_init_ctx: %s\n", msg);
// return -1;
}
ret = aipu_config_global(ctx, AIPU_CONFIG_TYPE_SIMULATION, &sim_glb_config);
if (ret != AIPU_STATUS_SUCCESS)
{
aipu_get_error_message(ctx, ret, &msg);
LOGE("[TEST ERROR] AIPU_config_simulation: %s\n", msg);
// goto deinit_ctx;
}
if (part_cnt == 0) {
ret = aipu_get_partition_count(ctx, &part_cnt);
if (ret != AIPU_STATUS_SUCCESS) {
aipu_get_error_message(ctx, ret, &msg);
LOGE("aipu_get_partition_count: %s \n", msg);
// goto unload_graph;
}
for (uint32_t i = 0; i < part_cnt; i++) {
ret = aipu_get_cluster_count(ctx, i, &cluster_cnt);
if (ret != AIPU_STATUS_SUCCESS) {
aipu_get_error_message(ctx, ret, &msg);
LOGE("aipu_get_cluster_count: %s \n", msg);
// goto unload_graph;
}
for (uint32_t j = 0; j < cluster_cnt; j++) {
ret = aipu_get_core_count(ctx, i, j, &core_cnt);
if (ret != AIPU_STATUS_SUCCESS) {
aipu_get_error_message(ctx, ret, &msg);
LOGE("aipu_get_core_count: %s \n", msg);
// goto unload_graph;
}
LOGE("[TEST INFO] <part_idx, cluster_idx, core_cnt> = <%u, %u, %u>\n", i, j, core_cnt);
}
}
}
ret = aipu_load_graph_helper(ctx, opt.model_bin, opt.bin_size, &graph_id);
if (ret != AIPU_STATUS_SUCCESS)
{
aipu_get_error_message(ctx, ret, &msg);
LOGE("[TEST ERROR] AIPU_load_graph_helper: %s\n", msg);
// goto deinit_ctx;
}
LOGE("[TEST INFO] AIPU load graph successfully.\n");
ret = aipu_get_tensor_count(ctx, graph_id, AIPU_TENSOR_TYPE_INPUT, &input_cnt);
if (ret != AIPU_STATUS_SUCCESS)
{
aipu_get_error_message(ctx, ret, &msg);
LOGE("[TEST ERROR] INPUT: aipu_get_tensor_count: %s\n", msg);
// goto unload_graph;
}
LOGE("[TEST INFO] INPUT: aipu_get_tensor_count success: input_cnt = %d\n",input_cnt);
for (uint32_t i = 0; i < input_cnt; i++)
{
aipu_tensor_desc_t desc;
ret = aipu_get_tensor_descriptor(ctx, graph_id, AIPU_TENSOR_TYPE_INPUT, i, &desc);
if (ret != AIPU_STATUS_SUCCESS)
{
aipu_get_error_message(ctx, ret, &msg);
LOGE("[TEST ERROR] INPUT: aipu_get_tensor_descriptor: %s\n", msg);
}
LOGE("[TEST INFO] INPUT[%d]: desc.size: %u\n", i, desc.size);
LOGE("[TEST INFO] INPUT[%d]: desc.scale: %f\n", i, desc.scale);
LOGE("[TEST INFO] INPUT[%d]: desc.zero_point: %f\n", i, desc.zero_point);
LOGE("[TEST INFO] INPUT[%d]: desc.data_type: %u\n", i, desc.data_type);
LOGE("[TEST INFO] INPUT[%d]: desc.id: %u\n", i, desc.id);
input_desc.push_back(desc);
}
ret = aipu_get_tensor_count(ctx, graph_id, AIPU_TENSOR_TYPE_OUTPUT, &output_cnt);
if (ret != AIPU_STATUS_SUCCESS)
{
aipu_get_error_message(ctx, ret, &msg);
LOGE("[TEST ERROR] OUTPUT: aipu_get_tensor_count: %s\n", msg);
// goto unload_graph;
}
LOGE("[TEST INFO] OUTPUT: aipu_get_tensor_count success: output_cnt = %d\n", output_cnt);
for (uint32_t i = 0; i < output_cnt; i++)
{
aipu_tensor_desc_t desc;
ret = aipu_get_tensor_descriptor(ctx, graph_id, AIPU_TENSOR_TYPE_OUTPUT, i, &desc);
if (ret != AIPU_STATUS_SUCCESS)
{
aipu_get_error_message(ctx, ret, &msg);
LOGE("[TEST ERROR] aipu_get_tensor_descriptor: %s\n", msg);
// goto unload_graph;
}
LOGE("[TEST INFO] OUTPUT[%d]: desc.size: %u\n", i, desc.size);
LOGE("[TEST INFO] OUTPUT[%d]: desc.scale: %f\n", i, desc.scale);
LOGE("[TEST INFO] OUTPUT[%d]: desc.zero_point: %f\n", i, desc.zero_point);
LOGE("[TEST INFO] OUTPUT[%d]: desc.data_type: %u\n", i, desc.data_type);
LOGE("[TEST INFO] OUTPUT[%d]: desc.id: %u\n", i, desc.id);
output_desc.push_back(desc);
}
for (uint32_t i = 0; i < output_cnt; i++)
{
char* output = new char[output_desc[i].size];
output_data.push_back(output);
}
env->ReleaseByteArrayElements(model, model_bin, 0);
delete[] buffer1;
return 0;
}
```

The second JNI function runs the actual inference:
```cpp
extern "C" JNIEXPORT jint JNICALL
Java_me_grey_picquery_NpuInference_processNpuInference(JNIEnv *env, jobject thiz,
jintArray inputBin, jint inputLength,
// jbyteArray goldenOutputBin, jint outputLength,
jbyteArray output)
{
jint* inputData = env->GetIntArrayElements(inputBin, NULL);
jbyte* outputData = env->GetByteArrayElements(output, NULL);
void* voidInputData = malloc(inputLength * sizeof(jint));
if (voidInputData != nullptr) {
memcpy(voidInputData, inputData, inputLength * sizeof(jint));
}
opt.inputs.push_back(voidInputData);
opt.inputs_size.push_back(inputLength * sizeof(int));
LOGE("[TEST INFO] NpuInference void* type inputLength= %lu \n", inputLength* sizeof(int));
// char* buffer3 = new char[outputLength];
// jbyte* outputGoldenData = env->GetByteArrayElements(goldenOutputBin, NULL);
// if (outputGoldenData != NULL) {
// memcpy(buffer3, outputGoldenData, outputLength);
// opt.gt = buffer3;
// opt.gt_size = outputLength;
// }
LOGE("[TEST INFO] do npu inference now\n");
create_job_cfg.partition_id = 0;
create_job_cfg.qos_level = AIPU_JOB_QOS_HIGH;
ret = aipu_create_job(ctx, graph_id, &job_id, &create_job_cfg);
if (ret != AIPU_STATUS_SUCCESS)
{
aipu_get_error_message(ctx, ret, &msg);
LOGE("[TEST ERROR] aipu_create_job: %s\n", msg);
// goto unload_graph;
}
LOGE("[TEST INFO] aipu_create_job success\n");
// cfg_types = AIPU_JOB_CONFIG_TYPE_DUMP_INPUT | AIPU_JOB_CONFIG_TYPE_DUMP_OUTPUT;
// ret = aipu_config_job(ctx, job_id, cfg_types, &mem_dump_config);
// if (ret != AIPU_STATUS_SUCCESS) {
// aipu_get_error_message(ctx, ret, &msg);
// LOGE("[TEST ERROR] aipu_config_job: %s\n", msg);
// }
ret = aipu_config_job(ctx, job_id, AIPU_CONFIG_TYPE_SIMULATION, &sim_job_config);
if (ret != AIPU_STATUS_SUCCESS)
{
aipu_get_error_message(ctx, ret, &msg);
LOGE("[TEST ERROR] aipu_config_job: %s\n", msg);
// goto clean_job;
}
LOGE("[TEST INFO] set job simulation config success\n");
if (opt.inputs.size() != input_cnt) {
LOGE("[TEST WARN] input file count (%u) != input tensor count (%u)\n",
(uint32_t)opt.inputs.size(), input_cnt);
}
for (uint32_t i = 0; i < std::min((uint32_t)opt.inputs.size(), input_cnt); i++) {
if (input_desc[i].size > opt.inputs_size[i]) {
LOGE("[TEST INFO] input file %s len 0x%x < input tensor %u size 0x%x\n",
opt.input_files[i].c_str(), opt.inputs_size[i], i, input_desc[i].size);
// goto clean_job;
}
ret = aipu_load_tensor(ctx, job_id, i, opt.inputs[i]);
if (ret != AIPU_STATUS_SUCCESS) {
aipu_get_error_message(ctx, ret, &msg);
LOGE("[TEST ERROR] aipu_load_tensor: %s\n", msg);
// goto clean_job;
}
LOGE("[TEST INFO] load input tensor %d from (%u/%u)\n", i, i+1, input_cnt);
}
gettimeofday(&timestart, NULL);
ret = aipu_finish_job(ctx, job_id, -1);
if (ret != AIPU_STATUS_SUCCESS)
{
aipu_get_error_message(ctx, ret, &msg);
LOGE("[TEST ERROR] aipu_finish_job: %s\n", msg);
pass = -1;
// goto clean_job;
}
LOGE("[TEST INFO] aipu_finish_job success\n");
gettimeofday(&timeend, NULL);
for (uint32_t i = 0; i < input_cnt; i++)
{
opt.inputs.pop_back();
opt.inputs_size.pop_back();
}
for (uint32_t i = 0; i < output_cnt; i++)
{
ret = aipu_get_tensor(ctx, job_id, AIPU_TENSOR_TYPE_OUTPUT, i, output_data[i]);
if (ret != AIPU_STATUS_SUCCESS)
{
aipu_get_error_message(ctx, ret, &msg);
LOGE("[TEST ERROR] aipu_get_tensor: %s\n", msg);
// goto clean_job;
}
LOGE("[TEST INFO] get output tensor %u success (%u/%u)\n", i, i+1, output_cnt);
}
// pass = check_result_helper(output_data, output_desc, opt.gt, opt.gt_size);
LOGE("[TEST INFO] output_desc[0].size 0x%x\n",output_desc[0].size);
// post process
LOGE("[TEST INFO] npu post process\n");
memcpy(outputData, output_data[0], output_desc[0].size);
// input_desc.clear();
// output_desc.clear();
clean_job:
ret = aipu_clean_job(ctx, job_id);
if (ret != AIPU_STATUS_SUCCESS)
{
aipu_get_error_message(ctx, ret, &msg);
LOGE("[TEST ERROR] AIPU_clean_job: %s\n", msg);
// goto unload_graph;
}
LOGE("[TEST INFO] aipu_clean_job success\n");
///////////////////////////////////////////
//unload_graph:
// ret = aipu_unload_graph(ctx, graph_id);
// if (ret != AIPU_STATUS_SUCCESS)
// {
// aipu_get_error_message(ctx, ret, &msg);
// LOGE("[TEST ERROR] aipu_unload_graph: %s\n", msg);
// goto deinit_ctx;
// }
// LOGE("[TEST INFO] aipu_unload_graph success\n");
//
//deinit_ctx:
// ret = aipu_deinit_context(ctx);
// if (ret != AIPU_STATUS_SUCCESS)
// {
// aipu_get_error_message(ctx, ret, &msg);
// LOGE("[TEST ERROR] aipu_deinit_ctx: %s\n", msg);
//// return -1;
// }
// LOGE("[TEST INFO] aipu_deinit_context success\n");
////#endif
//// return 0;
// finish:
// if (AIPU_STATUS_SUCCESS != ret) {
// pass = -1;
// }
// for (uint32_t i = 0; i < output_data.size(); i++) {
// delete[] output_data[i];
// }
//
// output_data.clear();
///////////////////////////////////////////
env->ReleaseIntArrayElements(inputBin, inputData, 0);
env->ReleaseByteArrayElements(output, outputData, 0);
// env->ReleaseByteArrayElements(goldenOutputBin, outputGoldenData, 0);
// delete[] buffer3;
free(voidInputData); // allocated with malloc(), so free() rather than delete[]
return 0;
}
```
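On the Kotlin side, all that is needed is a wrapper class whose package, class, and method names match the JNI symbols above. A minimal sketch (how you wire it into PicQuery's search flow is up to you):

```kotlin
package me.grey.picquery

// Kotlin binding matching the JNI symbols
// Java_me_grey_picquery_NpuInference_* in npu_inference.cpp.
class NpuInference {
    companion object {
        init {
            System.loadLibrary("npu_inference") // libnpu_inference.so from CMake
        }
    }

    // Loads the compiled model blob and sets up tensor descriptors on the NPU.
    external fun preprocessNpuInference(model: ByteArray, modelSize: Int): Int

    // Runs one inference job: uploads the input tensor and copies the raw
    // bytes of output tensor 0 into `output`.
    external fun processNpuInference(inputBin: IntArray, inputLength: Int, output: ByteArray): Int
}
```

The model bytes can come from the APK assets, e.g. `context.assets.open("clip_image_encoder.cix").readBytes()` (that file name is hypothetical), before being handed to `preprocessNpuInference`.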
My CMakeLists.txt can also serve as a reference:
```cmake
include_directories(
${CMAKE_CURRENT_SOURCE_DIR}
)
add_library(npu_inference SHARED ${CMAKE_CURRENT_SOURCE_DIR}/npu_inference.cpp)
# libaipudrv.so must be resolvable at link time. One option (assuming the
# library was pulled into src/main/jniLibs/${ANDROID_ABI} as shown above):
# target_link_directories(npu_inference PRIVATE
#     ${CMAKE_CURRENT_SOURCE_DIR}/../jniLibs/${ANDROID_ABI})
target_link_libraries(npu_inference
aipudrv
android
log
)
```
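Finally, the Gradle module has to know about the CMake build. A minimal `build.gradle.kts` fragment, assuming the default `src/main/cpp` layout (only arm64-v8a makes sense here, since libaipudrv.so is a 64-bit Arm library):

```kotlin
android {
    defaultConfig {
        ndk {
            // The NPU driver ships only for 64-bit Arm.
            abiFilters += "arm64-v8a"
        }
    }
    externalNativeBuild {
        cmake {
            path = file("src/main/cpp/CMakeLists.txt")
        }
    }
}
```

With that in place, the remaining steps are just building the APK and deploying it to the board.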