【关注我,后续持续新增专题博文,谢谢!!!】
上一篇我们讲了:
这一篇我们开始讲: 高通Camx hal进程CSLAcquireDeviceHW crash问题分析一:CAM-ICP FW response timeout导致(问题单号:9573332)
目录
[2.2 :解析堆栈](#2.2 :解析堆栈)
[2.3 :我们继续看看用户camx日志](#2.3 :我们继续看看用户camx日志)
[2.4 :我们继续看看内核KMD日志](#2.4 :我们继续看看内核KMD日志)
[2.5 :分析cam_icp_mgr_send_config_io代码](#2.5 :分析cam_icp_mgr_send_config_io代码)
[2.6 :cam_icp EREMOTEIO异常方案](#2.6 :cam_icp EREMOTEIO异常方案)
一、问题背景
在老化测试中,低概率出现高通 CamX HAL 进程在 CSLAcquireDeviceHW 处 crash 的问题。
二、问题分析过程
2.1:基于crash堆栈分析
堆栈如下:
```text
Cmdline: /vendor/bin/hw/vendor.qti.camera.provider-service_64
pid: 3607, tid: 3763, name: SoloTMgr_7  >>> /vendor/bin/hw/vendor.qti.camera.provider-service_64 <<<
uid: 1047
tagged_addr_ctrl: 0000000000000001 (PR_TAGGED_ADDR_ENABLE)
signal 6 (SIGABRT), code -6 (SI_TKILL), fault addr --------
    x0  0000000000000000  x1  0000000000000eb3  x2  0000000000000006  x3  00000000063b622c
    x4  0000000000000000  x5  0000000000000000  x6  0000000000000000  x7  0000000000000080
    x8  0000000000000083  x9  20229c56f3ecd134  x10 00000072ca612bc0  x11 0000007317a8bce1
    x12 0000000000000000  x13 000000007fffffff  x14 00000000063b622c  x15 000003af2cc95930
    x16 00000073ac149eb0  x17 00000073ac131080  x18 00000072c91ec000  x19 0000000000000001
    x20 000000000000000a  x21 b400007265f57f6c  x22 000000731a031730  x23 00000072ca63af40
    x24 0000000000000001  x25 000000731935ca28  x26 000000731a032c08  x27 0000007317abc010
    x28 0000000000000000  x29 00000072ca6132e0
    lr  00000073192a6400  sp  00000072ca613040  pc  00000073ac13108c  pst 0000000000001000

9 total frames
backtrace:
      #00 pc 00000000000eb08c  /apex/com.android.runtime/lib64/bionic/libc.so (tgkill+12) (BuildId: cddb4a3e9dd8511821cfbd22aa0235dd)
      #01 pc 00000000016763fc  /vendor/lib64/hw/camera.qcom.so (CSLAcquireDeviceHW(int, int*, int, CSLDeviceResource*, unsigned long, CSLDeviceAttribute*, unsigned long, char const*) (.cfi)+3068) (BuildId: b3c450ebf580515e8ac4e6947d94bed5)
      #02 pc 000000000166c030  /vendor/lib64/hw/camera.qcom.so (CSLAcquireDevice.cfi+288) (BuildId: b3c450ebf580515e8ac4e6947d94bed5)
      #03 pc 0000000000873210  /vendor/lib64/hw/camera.qcom.so (CamX::IPENode::PostPipelineCreate()+25600) (BuildId: b3c450ebf580515e8ac4e6947d94bed5)
      #04 pc 0000000001567268  /vendor/lib64/hw/camera.qcom.so (CamX::Node::DeferPipelineCreate(void*) (.cfi)+120) (BuildId: b3c450ebf580515e8ac4e6947d94bed5)
      #05 pc 000000000002f49c  /vendor/lib64/libcamxcommonutils.so (CamX::ThreadCore::DispatchJob(CamX::RuntimeJob*)+620) (BuildId: bf116e62e768d04d263003927ac4ee67)
      #06 pc 0000000000030334  /vendor/lib64/libcamxcommonutils.so (CamX::ThreadCore::WorkerThreadBody(void*) (.cfi)+3188) (BuildId: bf116e62e768d04d263003927ac4ee67)
      #07 pc 0000000000095e2c  /apex/com.android.runtime/lib64/bionic/libc.so (__pthread_start(void*)+184) (BuildId: cddb4a3e9dd8511821cfbd22aa0235dd)
      #08 pc 0000000000088648  /apex/com.android.runtime/lib64/bionic/libc.so (__start_thread+68) (BuildId: cddb4a3e9dd8511821cfbd22aa0235dd)
```
2.2 :解析堆栈
解析堆栈,crash代码如下:
cppCamxResult CSLAcquireDeviceHW( CSLHandle hCSL, CSLDeviceHandle* phDevice, INT32 deviceIndex, CSLDeviceResource* pDeviceResourceRequest, SIZE_T numDeviceResources, CSLDeviceAttribute* pDeviceAttribute, SIZE_T numDeviceAttributes, const CHAR* pDeviceName) { CamxResult result = CamxResultEFailed; if ((CSLInvalidHandle != hCSL) && (NULL != phDevice) && (0 <= deviceIndex) && ((deviceIndex) < CSLHwMaxKMDNumDevices)) // looks like sensor has no data for now so changing this to allow null and size zero data // && pDeviceResourceRequest && (deviceResourceSize != 0)) { CSLHandle hCSLHwSession = CAM_REQ_MGR_GET_HDL_IDX(hCSL); if (hCSLHwSession < CSLHwMaxNumSessions) { *phDevice = CSLInvalidHandle; if (TRUE == CSLHwInstanceGetRefCount()) { g_CSLHwInstance.lock->Lock(); CSLHwDevice* pHWDevice = &g_CSLHwInstance.CSLInternalKMDDevices[deviceIndex]; CSLHwsession* pSession = g_CSLHwInstance.pSessionList[hCSLHwSession]; g_CSLHwInstance.lock->Unlock(); if (NULL != pSession) { if ((hCSL == pSession->hSession) && (NULL != pHWDevice->deviceOp.Acquire)) { // The Instance refcount is released only in the CSLReleaseDeviceHW // if everything in this API is success. if (TRUE == CSLHwSessionGetRefCount(pSession)) { result = pHWDevice->deviceOp.Acquire(hCSL, phDevice, deviceIndex, pDeviceResourceRequest, numDeviceResources); if (CamxResultSuccess == result) ........................ 
else { CAMX_LOG_ERROR( CamxLogGroupCSL, "Acquire failed for deviceName=%s, %s, index=%d, aState=%s, name:%s", CSLHwInternalDeviceTypeStrings[pHWDevice->deviceType], pDeviceName, deviceIndex, CSLHwInternalDeviceStateStrings[pHWDevice->aState], pHWDevice->devName); } CSLHwSessionPutRefCount(pSession); } } if (CamxResultSuccess != result) { CSLLogAcquiredDevices(pSession); } } else { result = CamxResultEFailed; CAMX_LOG_ERROR(CamxLogGroupCSL, "pSession is NULL handler hCSLHwSession = %d", hCSLHwSession); } CSLHwInstancePutRefCount(); } } else { CAMX_LOG_ERROR(CamxLogGroupCSL, "hCSLHwSession >= CSLHwMaxNumSessions"); result = CamxResultEOutOfBounds; } if (CamxResultSuccess != result) { CSLHwDevice* pHWDevice = &g_CSLHwInstance.CSLInternalKMDDevices[deviceIndex]; CAMX_LOG_ERROR(CamxLogGroupCSL, "Acquire Device failure: hCSL: 0x%x, deviceIndex: %d, phDevice: 0x%x, Name=%s, result=%u", hCSL, deviceIndex, *phDevice, CSLHwInternalDeviceTypeStrings[pHWDevice->deviceType], result); OsUtils::RaiseSignalAbort(); //这里crash } } else { result = CamxResultEInvalidArg; CAMX_LOG_ERROR(CamxLogGroupCSL, "Acquire Device Invalid Arguments: hCSL: %d phDevice: %p deviceIndex: %d", hCSL, phDevice, deviceIndex); } return result; }
从代码可以看出,进程是在 OsUtils::RaiseSignalAbort() 处主动触发 abort(即堆栈中的 SIGABRT)而 crash 的。走到这里之前,必然会先打印 "Acquire Device failure: hCSL" 这条错误日志。
2.3 :我们继续看看用户camx日志
我们在日志中搜索,结果与上面的分析一致,确实打印了:CSLAcquireDeviceHW() Acquire Device failure: hCSL
```text
X:\log\bug\9573332\android_log_20250717_045521.txt (匹配7次)
行 28719: 07-17 04:56:26.249400 3607 3762 E CamX : [ERROR][CSL ] camxcslhwinternal.cpp:3690 CSLHwInternalDefaultIoctl() Ioctl failed for device /dev/v4l-subdev14 (Type:CSLHwICP, FD:27, Index:10) with error reason Remote I/O error
行 28726: 07-17 04:56:26.253388 3607 3762 E CamX : [ERROR][CSL ] camxcslhwinternalicp.cpp:125 CSLHWICPKMDAcquire() ioctl failed for fd=27, index 10
行 28727: 07-17 04:56:26.253448 3607 3762 E CamX : [ERROR][CSL ] camxcslhw.cpp:1230 CSLAcquireDeviceHW() Acquire failed for deviceName=CSLHwICP, RealTimeFeatureZSLPreviewRaw_IPE1_cam0, index=10, aState=CSLHwValidState, name:/dev/v4l-subdev14
行 28730: 07-17 04:56:26.254402 3607 3762 E CamX : [ERROR][CSL ] camxcslhw.cpp:249 CSLLogAcquiredDevices() sessionName RealTimeFeatureZSLPreviewRaw deviceName=CSLHwImageSensor, RealTimeFeatureZSLPreviewRaw_Sensor0_cam0, index=1 hAcquired=13435137 aState=CSLHwValidState refCount=0
行 28731: 07-17 04:56:26.254422 3607 3762 E CamX : [ERROR][CSL ] camxcslhw.cpp:249 CSLLogAcquiredDevices() sessionName RealTimeFeatureZSLPreviewRaw deviceName=CSLHwCSIPHY, RealTimeFeatureZSLPreviewRaw_Sensor0_cam0, index=5 hAcquired=7930117 aState=CSLHwValidState refCount=0
行 28732: 07-17 04:56:26.254427 3607 3762 E CamX : [ERROR][CSL ] camxcslhw.cpp:249 CSLLogAcquiredDevices() sessionName RealTimeFeatureZSLPreviewRaw deviceName=CSLHwLensActuator, RealTimeFeatureZSLPreviewRaw_Sensor0_cam0, index=6 hAcquired=8585478 aState=CSLHwValidState refCount=0
行 28733: 07-17 04:56:26.254441 3607 3762 E CamX : [ERROR][CSL ] camxcslhw.cpp:249 CSLLogAcquiredDevices() sessionName RealTimeFeatureZSLPreviewRaw deviceName=CSLHwFlash, RealTimeFeatureZSLPreviewRaw_Sensor0_cam0, index=8 hAcquired=3277064 aState=CSLHwValidState refCount=0
行 28735: 07-17 04:56:26.249661 3607 3757 E CamX : [ERROR][CSL ] camxcslhwinternal.cpp:3690 CSLHwInternalDefaultIoctl() Ioctl failed for device /dev/v4l-subdev14 (Type:CSLHwICP, FD:27, Index:10) with error reason Remote I/O error
行 28736: 07-17 04:56:26.256607 3607 3757 E CamX : [ERROR][CSL ] camxcslhwinternalicp.cpp:125 CSLHWICPKMDAcquire() ioctl failed for fd=27, index 10
行 28756: 07-17 04:56:26.256978 3607 3762 E CamX : [ERROR][CSL ] camxcslhw.cpp:1264 CSLAcquireDeviceHW() Acquire Device failure: hCSL: 0xfb0200, deviceIndex: 10, phDevice: 0x0, Name=CSLHwICP, result=1
行 28757: 07-17 04:56:26.256985 3607 3757 E CamX : [ERROR][CSL ] camxcslhw.cpp:216 CSLLogAllAcquiredDevices() deviceName=CSLHwTFE, RealTimeFeatureZSLPreviewRaw_TFE0_cam0, index=9 hAcquired=8454409 aState=CSLHwValidState refCount=0
行 28758: 07-17 04:56:26.257070 3607 3757 E CamX : [ERROR][CSL ] camxcslhw.cpp:1264 CSLAcquireDeviceHW() Acquire Device failure: hCSL: 0xfb0200, deviceIndex: 10, phDevice: 0x0, Name=CSLHwICP, result=1
```
如果了解过 CSL,就会知道它最终是通过 ioctl 去调用内核驱动的;既然日志里已经出现了 Ioctl failed(Remote I/O error),那么大概率能在内核驱动(KMD)日志中找到对应的异常。
2.4 :我们继续看看内核KMD日志
发现KMD日志:
- 发现CAM-ICP FW response timed out
- 接着Acquire device failed for node cam-icp
- 接着acquire device failed
```text
行 143950: 07-17 04:56:24.084692 3762 3762 I [210985.956969]CAM_ERR: CAM-ICP: cam_icp_mgr_send_config_io: 4383 FW response timed out -121 ctx id:0 dev hdl:0x0 session hdl:0x0 dev_type:3
行 143955: 07-17 04:56:24.084729 3762 3762 I [210985.957006]CAM_ERR: CAM-ICP: cam_icp_mgr_acquire_hw: 6159 IO Config command failed -121 size:8344
行 143969: 07-17 04:56:25.109094 3762 3762 I [210986.981371]CAM_ERR: CAM-ICP: cam_icp_mgr_destroy_handle: 3716 FW response timeout: -110 for 0
行 143975: 07-17 04:56:25.110046 3762 3762 I [210986.982323]CAM_ERR: CAM-CTXT: cam_context_acquire_dev_to_hw: 705 [cam-icp][3] Acquire device failed
行 143976: 07-17 04:56:25.110053 3762 3762 I [210986.982330]CAM_ERR: CAM-CORE: __cam_node_handle_acquire_dev: 120 Acquire device failed for node cam-icp
行 143977: 07-17 04:56:25.110057 3762 3762 I [210986.982334]CAM_ERR: CAM-CORE: cam_node_handle_ioctl: 806 acquire device failed(rc = -121)
```
我们再看看cam_icp_mgr_send_config_io内核代码
2.5 :分析cam_icp_mgr_send_config_io代码
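分析代码可以看到:下发 IO config 命令后,内核最多等待 5000 ms(timeout = 5000)的固件响应;如果超时仍未等到响应,就把 rc 置为 -EREMOTEIO 并返回。EREMOTEIO 的定义值是 121,含义为 Remote I/O error,这与 KMD 日志中的 -121 以及用户态日志中的 "Remote I/O error" 是一致的。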
cpp#define EREMOTEIO 121 /* Remote I/O error */ cam_icp/icp_hw/icp_hw_mgr/cam_icp_hw_mgr.c 4319 static int cam_icp_mgr_send_config_io(struct cam_icp_hw_ctx_data *ctx_data, 4320 uint32_t io_buf_addr) 4321 { 4322 int rc = 0; 4323 struct hfi_cmd_work_data *task_data; 4324 struct hfi_cmd_ipebps_async ioconfig_cmd; 4325 unsigned long rem_jiffies; 4326 int timeout = 5000; 4327 struct crm_workq_task *task; 4328 uint32_t size_in_words; 4329 4330 task = cam_req_mgr_workq_get_task(icp_hw_mgr.cmd_work); 4331 if (!task) { 4332 CAM_ERR_RATE_LIMIT(CAM_ICP, 4333 "No free task ctx id:%d dev hdl:0x%x session hdl:0x%x dev_type:%d", 4334 ctx_data->ctx_id, ctx_data->acquire_dev_cmd.dev_handle, 4335 ctx_data->acquire_dev_cmd.session_handle, 4336 ctx_data->icp_dev_acquire_info->dev_type); 4337 return -ENOMEM; 4338 } 4339 4340 ioconfig_cmd.size = sizeof(struct hfi_cmd_ipebps_async); 4341 ioconfig_cmd.pkt_type = HFI_CMD_IPEBPS_ASYNC_COMMAND_INDIRECT; 4342 if (ctx_data->icp_dev_acquire_info->dev_type == CAM_ICP_RES_TYPE_BPS) 4343 ioconfig_cmd.opcode = HFI_IPEBPS_CMD_OPCODE_BPS_CONFIG_IO; 4344 else 4345 ioconfig_cmd.opcode = HFI_IPEBPS_CMD_OPCODE_IPE_CONFIG_IO; 4346 4347 reinit_completion(&ctx_data->wait_complete); 4348 4349 ioconfig_cmd.num_fw_handles = 1; 4350 ioconfig_cmd.fw_handles_flex[0] = ctx_data->fw_handle; 4351 ioconfig_cmd.payload.indirect = io_buf_addr; 4352 ioconfig_cmd.user_data1 = PTR_TO_U64(ctx_data); 4353 ioconfig_cmd.user_data2 = (uint64_t)0x0; 4354 task_data = (struct hfi_cmd_work_data *)task->payload; 4355 task_data->data = (void *)&ioconfig_cmd; 4356 task_data->request_id = 0; 4357 task_data->type = ICP_WORKQ_TASK_MSG_TYPE; 4358 task->process_cb = cam_icp_mgr_process_cmd; 4359 size_in_words = (*(uint32_t *)task_data->data) >> 2; 4360 CAM_DBG(CAM_ICP, "size_in_words %u", size_in_words); 4361 rc = cam_req_mgr_workq_enqueue_task(task, &icp_hw_mgr, 4362 CRM_TASK_PRIORITY_0); 4363 if (rc) { 4364 CAM_ERR_RATE_LIMIT(CAM_ICP, 4365 "Enqueue task failed ctx id:%d dev hdl:0x%x 
session hdl:0x%x dev_type:%d", 4366 ctx_data->ctx_id, ctx_data->acquire_dev_cmd.dev_handle, 4367 ctx_data->acquire_dev_cmd.session_handle, 4368 ctx_data->icp_dev_acquire_info->dev_type); 4369 return rc; 4370 } 4371 4372 rem_jiffies = cam_common_wait_for_completion_timeout( 4373 &ctx_data->wait_complete, 4374 msecs_to_jiffies((timeout))); 4375 if (!rem_jiffies) { 4376 /* send specific error for io config failure */ 4377 rc = -EREMOTEIO; 4378 CAM_ERR(CAM_ICP, 4379 "FW response timed out %d ctx id:%d dev hdl:0x%x session hdl:0x%x dev_type:%d", 4380 rc, 4381 ctx_data->ctx_id, ctx_data->acquire_dev_cmd.dev_handle, 4382 ctx_data->acquire_dev_cmd.session_handle, 4383 ctx_data->icp_dev_acquire_info->dev_type); 4384 cam_icp_dump_debug_info(false); 4385 } 4386 4387 return rc; 4388 }
2.6 :cam_icp EREMOTEIO异常方案
cam_icp:即 ICP(Image Control Processor,图像控制处理器)的内核驱动实现。
结合上面的代码和日志可以知道:这次的 Remote I/O error 是内核向 ICP 固件下发 IO config 命令后,在超时时间内没有等到固件响应而返回的错误。如果不是硬件问题,就需要高通平台提供平台 patch,如下:
diffdiff --git a/drivers/cam_icp/icp_hw/icp_hw_mgr/cam_icp_hw_mgr.c b/drivers/cam_icp/icp_hw/icp_hw_mgr/cam_icp_hw_mgr.c index d254865..714bf41 100755 --- a/drivers/cam_icp/icp_hw/icp_hw_mgr/cam_icp_hw_mgr.c +++ b/drivers/cam_icp/icp_hw/icp_hw_mgr/cam_icp_hw_mgr.c @@ -3075,14 +3075,25 @@ uint32_t *msg_ptr) { int rc = 0; + uint32_t ctx_id; + + struct hfi_msg_dev_async_ack *ioconfig_ack = NULL; + struct cam_icp_hw_ctx_data *ctx_data = NULL; + + ioconfig_ack = (struct hfi_msg_dev_async_ack *)msg_ptr; + + ctx_id = (uint32_t)ioconfig_ack->user_data2; + if (!test_bit(ctx_id, hw_mgr->active_ctx_info.active_ctx_bitmap)) { + CAM_WARN(CAM_ICP, "ctx data is released before accessing it, ctx_id: %u", + ctx_id); + rc = -EFAULT; + goto end; + } + + ctx_data = U64_TO_PTR(ioconfig_ack->user_data1); switch (msg_ptr[ICP_PACKET_OPCODE]) { case HFI_OFE_CMD_OPCODE_ABORT: { - struct hfi_msg_dev_async_ack *ioconfig_ack = NULL; - struct cam_icp_hw_ctx_data *ctx_data = NULL; - - ioconfig_ack = (struct hfi_msg_dev_async_ack *)msg_ptr; - ctx_data = U64_TO_PTR(ioconfig_ack->user_data1); if (cam_presil_mode_enabled()) { if (atomic_read(&hw_mgr->frame_in_process)) { if (hw_mgr->frame_in_process_ctx_id == ctx_data->ctx_id) { @@ -3105,11 +3116,6 @@ break; } case HFI_OFE_CMD_OPCODE_DESTROY: { - struct hfi_msg_dev_async_ack *ioconfig_ack = NULL; - struct cam_icp_hw_ctx_data *ctx_data = NULL; - - ioconfig_ack = (struct hfi_msg_dev_async_ack *)msg_ptr; - ctx_data = U64_TO_PTR(ioconfig_ack->user_data1); CAM_DBG(CAM_ICP, "received OFE destroy done msg: %u", ctx_data->state); if ((ctx_data->state == CAM_ICP_CTX_STATE_RELEASE) || (ctx_data->state == CAM_ICP_CTX_STATE_IN_USE)) @@ -3122,6 +3128,7 @@ return -EINVAL; } +end: return rc; } @@ -4815,7 +4822,7 @@ abort_cmd->num_fw_handles = 1; abort_cmd->fw_handles_flex[0] = ctx_data->fw_handle; abort_cmd->user_data1 = PTR_TO_U64(ctx_data); - abort_cmd->user_data2 = (uint64_t)0x0; + abort_cmd->user_data2 = (uint64_t)ctx_data->ctx_id; *abort_cmd_ptr = abort_cmd; 
@@ -4968,7 +4975,7 @@ destroy_cmd->num_fw_handles = 1; destroy_cmd->fw_handles_flex[0] = ctx_data->fw_handle; destroy_cmd->user_data1 = PTR_TO_U64(ctx_data); - destroy_cmd->user_data2 = (uint64_t)0x0; + destroy_cmd->user_data2 = (uint64_t)ctx_data->ctx_id; reinit_completion(&ctx_data->wait_complete);
【关注我,后续持续新增专题博文,谢谢!!!】
下一篇讲解: