Arm China (安谋科技), Cix Technology (此芯科技), and Radxa (瑞莎计算机) have jointly launched the "星睿O6" development kit, designed for AI PC, edge computing, and robotics workloads. The kit combines Arm®v9 CPU cores, an Arm Immortalis™ GPU, and Arm China's in-house "周易" (Zhouyi) NPU.
On Android, developers can use this kit to retrofit the open-source project below via native JNI, running its CLIP model on the NPU to speed up fuzzy image search.
🔍 Search local images with natural language on Android, powered by OpenAI's CLIP model.
https://github.com/greyovo/PicQuery
Now let's walk through the port:
- Import the project into Android Studio
- Import the O6 NPU native libraries
- Add the JNI C++ code
- Adapt the project's Kotlin code
- Build
- Run it on the board!
Here are the concrete steps:
- The O6 NPU native libraries live at the following paths in the Android image:
```bash
.
└── vendor
├── include
│ └── npu
│ ├── kmd
│ │ ├── armchina_aipu.h
│ │ └── tcb.h
│ └── standard_api.h
└── lib64
└── libaipudrv.so
```
One thing to note: you also need to bundle the libc++ library into the APK's jniLibs, because the libaipudrv.so shipped in the system image appears to be dynamically linked against it; without libc++.so in the APK, the app will fail with a linker error at run time:
```bash
vendor/lib64/libc++.so
```
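For reference, both libraries (and the NPU headers, for compiling the JNI code) can be pulled off the board with adb; the `app/src/main/jniLibs/arm64-v8a` and `app/src/main/cpp` layout below is an assumption for a standard Android Studio project, so adjust the destination paths to your own tree:

```bash
# Copy the NPU user-mode driver, the C++ runtime it depends on, and the
# NPU headers from the O6's Android image into the app source tree.
adb pull /vendor/lib64/libaipudrv.so app/src/main/jniLibs/arm64-v8a/
adb pull /vendor/lib64/libc++.so app/src/main/jniLibs/arm64-v8a/
adb pull /vendor/include/npu app/src/main/cpp/npu
```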
- The main APIs used here are documented in the CIX NPU development guide.
The usage is essentially the same as on Linux; you only need a few JNI-specific adjustments. You can also use my code below as a template for organizing your own; it should be fairly generic.
```cpp
// NOTE: ctx, ret, msg, graph_id, opt, the sim_*/mem_dump_config structs, the
// input_desc/output_desc/output_data containers, and the timestart/timeend
// timevals are file-scope variables shared by both JNI functions; their
// declarations (and the #includes) are omitted here for brevity.
extern "C" JNIEXPORT jint JNICALL
Java_me_grey_picquery_NpuInference_preprocessNpuInference(JNIEnv *env, jobject thiz, jbyteArray model, jint model_size)
{
initTestBench(&opt);
// Copy the model blob out of the JNI byte array; the copy stays valid after
// the array is released at the end of this function.
jbyte* model_bin = env->GetByteArrayElements(model, NULL);
char* buffer1 = new char[model_size];
memcpy(buffer1, model_bin, model_size);
opt.model_bin = buffer1;
opt.bin_size = model_size;
LOGE("[TEST INFO] preprocessNpuInference\n");
memset(&sim_glb_config, 0, sizeof(sim_glb_config));
memset(&sim_job_config, 0, sizeof(sim_job_config));
memset(&mem_dump_config, 0, sizeof(mem_dump_config));
mem_dump_config.dump_dir = opt.dump_dir;
ret = aipu_init_context(&ctx);
if (ret != AIPU_STATUS_SUCCESS)
{
aipu_get_error_message(ctx, ret, &msg);
LOGE("[TEST ERROR] AIPU_init_ctx: %s\n", msg);
// return -1;
}
ret = aipu_config_global(ctx, AIPU_CONFIG_TYPE_SIMULATION, &sim_glb_config);
if (ret != AIPU_STATUS_SUCCESS)
{
aipu_get_error_message(ctx, ret, &msg);
LOGE("[TEST ERROR] AIPU_config_simulation: %s\n", msg);
// goto deinit_ctx;
}
if (part_cnt == 0) {
ret = aipu_get_partition_count(ctx, &part_cnt);
if (ret != AIPU_STATUS_SUCCESS) {
aipu_get_error_message(ctx, ret, &msg);
LOGE("aipu_get_partition_count: %s \n", msg);
// goto unload_graph;
}
for (uint32_t i = 0; i < part_cnt; i++) {
ret = aipu_get_cluster_count(ctx, i, &cluster_cnt);
if (ret != AIPU_STATUS_SUCCESS) {
aipu_get_error_message(ctx, ret, &msg);
LOGE("aipu_get_cluster_count: %s \n", msg);
// goto unload_graph;
}
for (uint32_t j = 0; j < cluster_cnt; j++) {
ret = aipu_get_core_count(ctx, i, j, &core_cnt);
if (ret != AIPU_STATUS_SUCCESS) {
aipu_get_error_message(ctx, ret, &msg);
LOGE("aipu_get_core_count: %s \n", msg);
// goto unload_graph;
}
LOGE("[TEST INFO] <part_idx, cluster_idx, core_cnt> = <%u, %u, %u>\n", i, j, core_cnt);
}
}
}
ret = aipu_load_graph_helper(ctx, opt.model_bin, opt.bin_size, &graph_id);
if (ret != AIPU_STATUS_SUCCESS)
{
aipu_get_error_message(ctx, ret, &msg);
LOGE("[TEST ERROR] AIPU_load_graph_helper: %s\n", msg);
// goto deinit_ctx;
}
LOGE("[TEST INFO] AIPU load graph successfully.\n");
ret = aipu_get_tensor_count(ctx, graph_id, AIPU_TENSOR_TYPE_INPUT, &input_cnt);
if (ret != AIPU_STATUS_SUCCESS)
{
aipu_get_error_message(ctx, ret, &msg);
LOGE("[TEST ERROR] INPUT: aipu_get_tensor_count: %s\n", msg);
// goto unload_graph;
}
LOGE("[TEST INFO] INPUT: aipu_get_tensor_count success: input_cnt = %d\n",input_cnt);
for (uint32_t i = 0; i < input_cnt; i++)
{
aipu_tensor_desc_t desc;
ret = aipu_get_tensor_descriptor(ctx, graph_id, AIPU_TENSOR_TYPE_INPUT, i, &desc);
if (ret != AIPU_STATUS_SUCCESS)
{
aipu_get_error_message(ctx, ret, &msg);
LOGE("[TEST ERROR] INPUT: aipu_get_tensor_descriptor: %s\n", msg);
}
LOGE("[TEST INFO] INPUT[%d]: desc.size: %u\n", i, desc.size);
LOGE("[TEST INFO] INPUT[%d]: desc.scale: %f\n", i, desc.scale);
LOGE("[TEST INFO] INPUT[%d]: desc.zero_point: %f\n", i, desc.zero_point);
LOGE("[TEST INFO] INPUT[%d]: desc.data_type: %u\n", i, desc.data_type);
LOGE("[TEST INFO] INPUT[%d]: desc.id: %u\n", i, desc.id);
input_desc.push_back(desc);
}
ret = aipu_get_tensor_count(ctx, graph_id, AIPU_TENSOR_TYPE_OUTPUT, &output_cnt);
if (ret != AIPU_STATUS_SUCCESS)
{
aipu_get_error_message(ctx, ret, &msg);
LOGE("[TEST ERROR] OUTPUT: aipu_get_tensor_count: %s\n", msg);
// goto unload_graph;
}
LOGE("[TEST INFO] OUTPUT: aipu_get_tensor_count success: output_cnt = %d\n", output_cnt);
for (uint32_t i = 0; i < output_cnt; i++)
{
aipu_tensor_desc_t desc;
ret = aipu_get_tensor_descriptor(ctx, graph_id, AIPU_TENSOR_TYPE_OUTPUT, i, &desc);
if (ret != AIPU_STATUS_SUCCESS)
{
aipu_get_error_message(ctx, ret, &msg);
LOGE("[TEST ERROR] aipu_get_tensor_descriptor: %s\n", msg);
// goto unload_graph;
}
LOGE("[TEST INFO] OUTPUT[%d]: desc.size: %u\n", i, desc.size);
LOGE("[TEST INFO] OUTPUT[%d]: desc.scale: %f\n", i, desc.scale);
LOGE("[TEST INFO] OUTPUT[%d]: desc.zero_point: %f\n", i, desc.zero_point);
LOGE("[TEST INFO] OUTPUT[%d]: desc.data_type: %u\n", i, desc.data_type);
LOGE("[TEST INFO] OUTPUT[%d]: desc.id: %u\n", i, desc.id);
output_desc.push_back(desc);
}
for (uint32_t i = 0; i < output_cnt; i++)
{
char* output = new char[output_desc[i].size];
output_data.push_back(output);
}
env->ReleaseByteArrayElements(model, model_bin, 0);
delete[] buffer1;
return 0;
}
```

The second JNI function runs the actual inference:
```cpp
extern "C" JNIEXPORT jint JNICALL
Java_me_grey_picquery_NpuInference_processNpuInference(JNIEnv *env, jobject thiz,
jintArray inputBin, jint inputLength,
// jbyteArray goldenOutputBin, jint outputLength,
jbyteArray output)
{
jint* inputData = env->GetIntArrayElements(inputBin, NULL);
jbyte* outputData = env->GetByteArrayElements(output, NULL);
void* voidInputData = malloc(inputLength * sizeof(jint));
if (voidInputData != nullptr) {
memcpy(voidInputData, inputData, inputLength * sizeof(jint));
}
opt.inputs.push_back(voidInputData);
opt.inputs_size.push_back(inputLength * sizeof(int));
LOGE("[TEST INFO] NpuInference void* type inputLength= %lu \n", inputLength* sizeof(int));
// char* buffer3 = new char[outputLength];
// jbyte* outputGoldenData = env->GetByteArrayElements(goldenOutputBin, NULL);
// if (outputGoldenData != NULL) {
// memcpy(buffer3, outputGoldenData, outputLength);
// opt.gt = buffer3;
// opt.gt_size = outputLength;
// }
LOGE("[TEST INFO] do npu inference now\n");
create_job_cfg.partition_id = 0;
create_job_cfg.qos_level = AIPU_JOB_QOS_HIGH;
ret = aipu_create_job(ctx, graph_id, &job_id, &create_job_cfg);
if (ret != AIPU_STATUS_SUCCESS)
{
aipu_get_error_message(ctx, ret, &msg);
LOGE("[TEST ERROR] aipu_create_job: %s\n", msg);
// goto unload_graph;
}
LOGE("[TEST INFO] aipu_create_job success\n");
// cfg_types = AIPU_JOB_CONFIG_TYPE_DUMP_INPUT | AIPU_JOB_CONFIG_TYPE_DUMP_OUTPUT;
// ret = aipu_config_job(ctx, job_id, cfg_types, &mem_dump_config);
// if (ret != AIPU_STATUS_SUCCESS) {
// aipu_get_error_message(ctx, ret, &msg);
// LOGE("[TEST ERROR] aipu_config_job: %s\n", msg);
// }
ret = aipu_config_job(ctx, job_id, AIPU_CONFIG_TYPE_SIMULATION, &sim_job_config);
if (ret != AIPU_STATUS_SUCCESS)
{
aipu_get_error_message(ctx, ret, &msg);
LOGE("[TEST ERROR] aipu_config_job: %s\n", msg);
// goto clean_job;
}
LOGE("[TEST INFO] set job simulation config success\n");
if (opt.inputs.size() != input_cnt) {
LOGE("[TEST WARN] input file count (%u) != input tensor count (%u)\n",
(uint32_t)opt.inputs.size(), input_cnt);
}
for (uint32_t i = 0; i < std::min((uint32_t)opt.inputs.size(), input_cnt); i++) {
if (input_desc[i].size > opt.inputs_size[i]) {
LOGE("[TEST INFO] input file %s len 0x%x < input tensor %u size 0x%x\n",
opt.input_files[i].c_str(), opt.inputs_size[i], i, input_desc[i].size);
// goto clean_job;
}
ret = aipu_load_tensor(ctx, job_id, i, opt.inputs[i]);
if (ret != AIPU_STATUS_SUCCESS) {
aipu_get_error_message(ctx, ret, &msg);
LOGE("[TEST ERROR] aipu_load_tensor: %s\n", msg);
// goto clean_job;
}
LOGE("[TEST INFO] load input tensor %d from (%u/%u)\n", i, i+1, input_cnt);
}
gettimeofday(&timestart, NULL);
ret = aipu_finish_job(ctx, job_id, -1);
if (ret != AIPU_STATUS_SUCCESS)
{
aipu_get_error_message(ctx, ret, &msg);
LOGE("[TEST ERROR] aipu_finish_job: %s\n", msg);
pass = -1;
// goto clean_job;
}
LOGE("[TEST INFO] aipu_finish_job success\n");
gettimeofday(&timeend, NULL);
for (uint32_t i = 0; i < input_cnt; i++)
{
opt.inputs.pop_back();
opt.inputs_size.pop_back();
}
for (uint32_t i = 0; i < output_cnt; i++)
{
ret = aipu_get_tensor(ctx, job_id, AIPU_TENSOR_TYPE_OUTPUT, i, output_data[i]);
if (ret != AIPU_STATUS_SUCCESS)
{
aipu_get_error_message(ctx, ret, &msg);
LOGE("[TEST ERROR] aipu_get_tensor: %s\n", msg);
// goto clean_job;
}
LOGE("[TEST INFO] get output tensor %u success (%u/%u)\n", i, i+1, output_cnt);
}
// pass = check_result_helper(output_data, output_desc, opt.gt, opt.gt_size);
LOGE("[TEST INFO] output_desc[0].size 0x%x\n",output_desc[0].size);
// post process
LOGE("[TEST INFO] npu post process\n");
memcpy(outputData, output_data[0], output_desc[0].size);
// input_desc.clear();
// output_desc.clear();
clean_job:
ret = aipu_clean_job(ctx, job_id);
if (ret != AIPU_STATUS_SUCCESS)
{
aipu_get_error_message(ctx, ret, &msg);
LOGE("[TEST ERROR] AIPU_clean_job: %s\n", msg);
// goto unload_graph;
}
LOGE("[TEST INFO] aipu_clean_job success\n");
///////////////////////////////////////////
//unload_graph:
// ret = aipu_unload_graph(ctx, graph_id);
// if (ret != AIPU_STATUS_SUCCESS)
// {
// aipu_get_error_message(ctx, ret, &msg);
// LOGE("[TEST ERROR] aipu_unload_graph: %s\n", msg);
// goto deinit_ctx;
// }
// LOGE("[TEST INFO] aipu_unload_graph success\n");
//
//deinit_ctx:
// ret = aipu_deinit_context(ctx);
// if (ret != AIPU_STATUS_SUCCESS)
// {
// aipu_get_error_message(ctx, ret, &msg);
// LOGE("[TEST ERROR] aipu_deinit_ctx: %s\n", msg);
//// return -1;
// }
// LOGE("[TEST INFO] aipu_deinit_context success\n");
////#endif
//// return 0;
// finish:
// if (AIPU_STATUS_SUCCESS != ret) {
// pass = -1;
// }
// for (uint32_t i = 0; i < output_data.size(); i++) {
// delete[] output_data[i];
// }
//
// output_data.clear();
///////////////////////////////////////////
env->ReleaseIntArrayElements(inputBin, inputData, 0);
env->ReleaseByteArrayElements(output, outputData, 0);
// env->ReleaseByteArrayElements(goldenOutputBin, outputGoldenData, 0);
// delete[] buffer3;
free(voidInputData); // allocated with malloc(), so free() rather than delete[]
return 0;
}
```
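On the Kotlin side, all that is needed is a wrapper class whose package, class, and method names match the JNI symbols above. A minimal sketch (how you wire it into PicQuery's search flow is up to you):

```kotlin
package me.grey.picquery

// Kotlin binding matching the JNI symbols
// Java_me_grey_picquery_NpuInference_* in npu_inference.cpp.
class NpuInference {
    companion object {
        init {
            System.loadLibrary("npu_inference") // libnpu_inference.so from CMake
        }
    }

    // Loads the compiled model blob and sets up tensor descriptors on the NPU.
    external fun preprocessNpuInference(model: ByteArray, modelSize: Int): Int

    // Runs one inference job: uploads the input tensor and copies the raw
    // bytes of output tensor 0 into `output`.
    external fun processNpuInference(inputBin: IntArray, inputLength: Int, output: ByteArray): Int
}
```

The model bytes can come from the APK assets, e.g. `context.assets.open("clip_image_encoder.cix").readBytes()` (that file name is hypothetical), before being handed to `preprocessNpuInference`.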
My CMakeLists.txt can also serve as a reference:
```cmake
include_directories(
${CMAKE_CURRENT_SOURCE_DIR}
)
add_library(npu_inference SHARED ${CMAKE_CURRENT_SOURCE_DIR}/npu_inference.cpp)
# libaipudrv.so must be resolvable at link time. One option (assuming the
# library was pulled into src/main/jniLibs/${ANDROID_ABI} as shown above):
# target_link_directories(npu_inference PRIVATE
#     ${CMAKE_CURRENT_SOURCE_DIR}/../jniLibs/${ANDROID_ABI})
target_link_libraries(npu_inference
aipudrv
android
log
)
```
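Finally, the Gradle module has to know about the CMake build. A minimal `build.gradle.kts` fragment, assuming the default `src/main/cpp` layout (only arm64-v8a makes sense here, since libaipudrv.so is a 64-bit Arm library):

```kotlin
android {
    defaultConfig {
        ndk {
            // The NPU driver ships only for 64-bit Arm.
            abiFilters += "arm64-v8a"
        }
    }
    externalNativeBuild {
        cmake {
            path = file("src/main/cpp/CMakeLists.txt")
        }
    }
}
```

With that in place, the remaining steps are just building the APK and deploying it to the board.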