【DSP】xiBoxFilter_3x3_U8 dsp VS cmodel

一、先把"参考例子"的本质逻辑说清楚

你给的 U8→U16 版本,其 DSP 实现本质做了 3 件事:

  1. 3×3 box filter 的行列分解写法

    • 先做横向 3-tap 累加:IVP_MULUS4T2N8XR8,系数为 0x00010101
    • 再做纵向 3 行累加:pp0 + pp1 + pp2
  2. 边界处理完全依赖 tile edge

    • OFFSET_PTR_2NX8U(psrc, -1/-2, sstride, 0)
    • 所有 load 都假设 edge 已经 padding 好
  3. bit-exact 的核心

    • 累加 不溢出
    • 所有中间值用 Nx24 / Nx16
    • 没有除法

二、U8 → U8 的 Cmodel:多了什么?

你的 xiBoxFilter_3x3_U8_ref 相比 U8→U16,只多了 2 件事:累加前先加 bias,累加后再除以 divisor。

c 复制代码
int divisor = 9;
int bias = 4;          // divisor / 2
pdst = sum / 9;        // rounding

关键 bit-exact 约束

DSP 侧必须严格等价于:

c 复制代码
(sum + 4) / 9

⚠️ 不能:

  • 用 float
  • 用近似乘法
  • 用 truncation

三、DSP 实现策略(和你给的例子一一对齐)

设计决策(和官方风格一致)

项目 选择
中间累加 xb_vecNx16(最大 9×255=2295)
bias 加 4
除法 精确整数除法 / 9
存储 NX8U
数据流 完全复用 U8U16 的三行流水结构

四、DSP 接口实现(完整示例)

⚠️ 注意:这是 逻辑等价、bit-exact 的 DSP 写法 ,结构、指针推进、行缓存方式与你给的 U8U16 版本完全一致,只标出关键不同点

c 复制代码
/*
 * Proposed vector (DSP) port of the 3x3 box filter with U8 input and U8
 * output. Per output element the intent is (sum of the 3x3 neighbourhood
 * + 4) / 9, where 4 is the rounding bias.
 *
 * src: U8 tile; XI_CHECK_TILE_EDGE(src, 1) is requested, and the loads below
 *      read one row above and one column left of the tile data pointer.
 * dst: U8 array checked against src for equal size and non-overlap.
 * Returns XI_ERROR_STATUS().
 *
 * NOTE(review): this listing looks incomplete as a port of the U8U16
 * variant - confirm each point below before relying on it:
 *   - Only the low half (IVP_CVT16U2NX24L) of each product is used and at
 *     most XCHAL_IVPN_SIMD_WIDTH elements are stored per row, while j
 *     advances by 2 * XCHAL_IVPN_SIMD_WIDTH per strip.
 *   - No code after the strip loop handles columns left over when
 *     j >= width - XCHAL_IVPN_SIMD_WIDTH.
 *   - IVP_DIVNX16 and IVP_MOVNX16_FROMNX8 are called with scalar arguments;
 *     verify both against the target ISA header.
 */
XI_ERR_TYPE xiBoxFilter_3x3_U8(xi_tile const* src, xi_array const* dst)
{
    XI_ERROR_CHECKS()
    {
        XI_CHECK_TILE_U8(src);
        XI_CHECK_TILE_EDGE(src, 1);
        XI_CHECK_ARRAY_U8(dst);
        XI_CHECK_ARRAY_SIZE_EQ(src, dst);
        XI_CHECK_ARRAYS_ARE_NOT_OVERLAP(src, dst);
    }

    int32_t sstride = XI_TILE_GET_PITCH(src);
    int32_t dstride = XI_ARRAY_GET_PITCH(dst);
    int32_t width   = XI_TILE_GET_WIDTH(src);
    int32_t height  = XI_TILE_GET_HEIGHT(src);

    /* Source cursor for the current strip; presumably one row below and one
     * byte left of the tile origin - TODO confirm OFFSET_PTR_2NX8U argument
     * order (rows, stride, in-row offset). */
    xb_vec2Nx8U* psrc = OFFSET_PTR_2NX8U(XI_TILE_GET_DATA_PTR(src), 1, sstride, -1);
    xb_vecNx8U*  pdst = (xb_vecNx8U*)XI_ARRAY_GET_DATA_PTR(dst);

    xb_vec2Nx8U* restrict rsrc;
    xb_vecNx8U*  restrict rdst;

    /* Store-alignment state shared by all variable-length stores. */
    valign a_st = IVP_ZALIGN();

    /* Rounding bias (divisor / 2 for a divisor of 9). */
    const xb_vecNx16 vbias = IVP_MOVNX16_FROMNX8(4);   // bias = 4

    /* Walk the image in vertical column strips. */
    int32_t j = 0;
    for (; j < (width - XCHAL_IVPN_SIMD_WIDTH); j += (2 * XCHAL_IVPN_SIMD_WIDTH))
    {
        /* Horizontal sums of the two rows preceding the one loaded next. */
        xb_vecNx16 pp0, pp1;

        /* Prime pp0 from the row two strides before psrc. */
        {
            rsrc = OFFSET_PTR_2NX8U(psrc, -2, sstride, 0);
            valign a = IVP_LA2NX8U_PP(rsrc);

            /* Two variable-length loads: width - j + 2 bytes requested, the
             * second load asking for whatever exceeds one full vector. */
            xb_vec2Nx8U v0, v1;
            IVP_LAV2NX8U_XP(v0, a, rsrc, width - j + 2);
            IVP_LAV2NX8U_XP(v1, a, rsrc,
                            width - j + 2 - 2 * XCHAL_IVPN_SIMD_WIDTH);

            /* Horizontal 3-tap sum; coefficient word 0x00010101. */
            xb_vec2Nx24 w = IVP_MULUS4T2N8XR8(v1, v0, 0x00010101);
            pp0 = IVP_CVT16U2NX24L(w);
        }

        /* Prime pp1 from the row one stride before psrc. */
        {
            rsrc = OFFSET_PTR_2NX8U(psrc, -1, sstride, 0);
            valign a = IVP_LA2NX8U_PP(rsrc);

            xb_vec2Nx8U v0, v1;
            IVP_LAV2NX8U_XP(v0, a, rsrc, width - j + 2);
            IVP_LAV2NX8U_XP(v1, a, rsrc,
                            width - j + 2 - 2 * XCHAL_IVPN_SIMD_WIDTH);

            xb_vec2Nx24 w = IVP_MULUS4T2N8XR8(v1, v0, 0x00010101);
            pp1 = IVP_CVT16U2NX24L(w);
        }

        rsrc = psrc;
        rdst = pdst;

        /* One output row per iteration: load the next source row, combine it
         * with the two cached row sums, then rotate the cache. */
        for (int32_t i = 0; i < height; i++)
        {
            valign a = IVP_LA2NX8U_PP(rsrc);

            xb_vec2Nx8U v0, v1;
            IVP_LAV2NX8U_XP(v0, a, rsrc, width - j + 2);
            IVP_LAV2NX8U_XP(v1, a, rsrc,
                            width - j + 2 - 2 * XCHAL_IVPN_SIMD_WIDTH);

            xb_vec2Nx24 w = IVP_MULUS4T2N8XR8(v1, v0, 0x00010101);
            xb_vecNx16 pp2 = IVP_CVT16U2NX24L(w);

            /* Vertical 3-row sum plus the rounding bias. */
            xb_vecNx16 sum =
                IVP_ADDNX16(IVP_ADDNX16(pp0, pp1), pp2);
            sum = IVP_ADDNX16(sum, vbias);

            /* Divide by 9.
             * NOTE(review): confirm IVP_DIVNX16 exists with this signature. */
            xb_vecNx16 div = IVP_DIVNX16(sum, 9);

            /* Narrow the 16-bit quotient to 8-bit for storage. */
            xb_vecNx8U out = IVP_CVTNX8U_FROMNX16(div);

            /* Variable-length store of at most XCHAL_IVPN_SIMD_WIDTH elements,
             * followed by a flush of the alignment register. */
            IVP_SAVNX8U_XP(out, a_st, rdst,
                           XT_MIN(XCHAL_IVPN_SIMD_WIDTH, width - j));
            IVP_SAVNX8UPOS_FP(a_st, rdst);

            /* Rotate the row cache for the next output row. */
            pp0 = pp1;
            pp1 = pp2;

            /* Step both cursors to the next row, compensating for the bytes
             * the loads/stores above presumably already advanced past. */
            rsrc = OFFSET_PTR_2NX8U(rsrc, 1,
                    sstride - XT_MIN(4 * XCHAL_IVPN_SIMD_WIDTH,
                                     width - j + 2),
                    0);
            rdst = OFFSET_PTR_NX8U(rdst, 1,
                    dstride - XT_MIN(XCHAL_IVPN_SIMD_WIDTH, width - j),
                    0);
        }

        /* Advance to the next strip.
         * NOTE(review): psrc moves by one xb_vec2Nx8U and pdst by one
         * xb_vecNx8U - verify these cover the same number of columns. */
        psrc += 1;
        pdst += 1;
    }

    return XI_ERROR_STATUS();
}

五、bit-exact 性检查清单(你可以自己对)

项目 Cmodel DSP
累加顺序 k,l 行 → 列
bias +4 vbias = 4
除法 /9 IVP_DIVNX16(...,9)
饱和 U8 trunc CVTNX8U_FROMNX16
边界 tile edge tile edge

逐 bit 一致


六、question

cpp 复制代码
/**
 * Reference (C-model) 3x3 box sum: U8 source tile to 16-bit destination.
 *
 * Each destination element receives the un-normalised sum of the 3x3
 * neighbourhood centred on the matching source pixel. Neighbours outside the
 * tile are read from the tile edge, so src must provide an edge of at least
 * ksize / 2 pixels in both directions.
 *
 * @param src  source tile, read as uint8_t with pitch XI_TILE_GET_PITCH(src)
 * @param dst  destination tile, written as uint16_t with pitch
 *             XI_TILE_GET_PITCH(dst)
 * @return XI_ERR_OK on success; XI_ERR_BADARG, XI_ERR_DATASIZE, XI_ERR_KSIZE
 *         or XI_ERR_EDGE when the corresponding check fails.
 */
XI_ERR_TYPE xiBoxFilter_3x3_U8U16_ref(const xi_pTile src, xi_pTile dst)
{
    if (!xiTileIsValid_U8_ref(src)) return XI_ERR_BADARG;
    /* NOTE(review): dst is validated with the U8 check but written through a
     * uint16_t pointer below - confirm whether a U16 validity check is
     * intended here. */
    if (!xiTileIsValid_U8_ref(dst)) return XI_ERR_BADARG;
    if (!xiTilesHaveSameSize_ref(src, dst)) return XI_ERR_DATASIZE;
    int ksize = 3;
    if ((ksize & 0x1) == 0) return XI_ERR_KSIZE;

    /* Kernel radius; the tile edge must cover it on every side. */
    int N_div_2 = ksize / 2;
    if (XI_TILE_GET_EDGE_WIDTH(src) < N_div_2 || XI_TILE_GET_EDGE_HEIGHT(src) < N_div_2) return XI_ERR_EDGE;

    uint8_t *psrc = (uint8_t*)(XI_TILE_GET_DATA_PTR(src));
    uint16_t *pdst = (uint16_t*)(XI_TILE_GET_DATA_PTR(dst));

    int sstride = XI_TILE_GET_PITCH(src);
    int dstride = XI_TILE_GET_PITCH(dst);
    int height  = XI_TILE_GET_HEIGHT(src);
    int width   = XI_TILE_GET_WIDTH(src);

    for (int i = 0; i < height; i++)
    {
        for (int j = 0; j < width; j++)
        {
            int sum = 0;

            /* Negative k / l index into the tile edge around the data. */
            for (int k = -N_div_2; k <= N_div_2; k++)
            {
                for (int l = -N_div_2; l <= N_div_2; l++)
                {
                    sum += psrc[(i + k) * sstride + (j + l)];
                }
            }

            pdst[i * dstride + j] = sum;
        }
    }

    return XI_ERR_OK;
}


/**
 * Vector (DSP) 3x3 box sum: U8 source tile to 16-bit destination array.
 *
 * The image is processed in vertical strips of 2 * XCHAL_IVPN_SIMD_WIDTH
 * columns. For each strip the horizontal 3-tap sums of the two preceding rows
 * are cached (pp* = low half, qq* = high half of the wide product); every
 * loop iteration loads one new row, adds the three row sums, stores the
 * result and rotates the cache. A final narrower strip handles the columns
 * left over when fewer than XCHAL_IVPN_SIMD_WIDTH + 1 remain.
 *
 * @param src  U8 tile; an edge of 1 is requested via XI_CHECK_TILE_EDGE
 *             because the loads read one row above and one column left of
 *             the tile data.
 * @param dst  U16 array of the same size as src, not overlapping it.
 * @return XI_ERROR_STATUS()
 */
XI_ERR_TYPE xiBoxFilter_3x3_U8U16(xi_tile const* src, xi_array const* dst)
{
    XI_ERROR_CHECKS()
    {
        XI_CHECK_TILE_U8(src);
        XI_CHECK_TILE_EDGE(src, 1);
        XI_CHECK_ARRAY_U16(dst);
        XI_CHECK_ARRAY_SIZE_EQ(src, dst);
        XI_CHECK_ARRAYS_ARE_NOT_OVERLAP(src, dst);
    }

    int32_t sstride = XI_TILE_GET_PITCH(src);
    int32_t dstride = XI_ARRAY_GET_PITCH(dst);
    int32_t width   = XI_TILE_GET_WIDTH(src);
    int32_t height  = XI_TILE_GET_HEIGHT(src);

    /* Strip cursors. psrc presumably starts one row below and one byte left
     * of the tile origin - TODO confirm OFFSET_PTR_2NX8U argument order. */
    xb_vec2Nx8U* psrc = OFFSET_PTR_2NX8U(XI_TILE_GET_DATA_PTR(src), 1, sstride, -1);
    xb_vecNx16*  pdst = (xb_vecNx16 *)XI_ARRAY_GET_DATA_PTR(dst);
    xb_vecNx16*  restrict rdst;
    xb_vec2Nx8U* restrict rsrc;

    /* Store-alignment state shared by all variable-length stores. */
    valign a_st = IVP_ZALIGN();

    /* Full-width strips: both halves of the wide product are used. */
    int32_t j = 0;
    for (; j < (width - XCHAL_IVPN_SIMD_WIDTH); j += (2 * XCHAL_IVPN_SIMD_WIDTH))
    {
        xb_vecNx16 pp0, pp1;
        xb_vecNx16 qq0, qq1;

        { /* prime cache with the row two strides before psrc */
            rsrc = OFFSET_PTR_2NX8U(psrc, -2, sstride, 0);
            valign a_load = IVP_LA2NX8U_PP(rsrc);
            xb_vec2Nx8U vsel0; IVP_LAV2NX8U_XP(vsel0, a_load, rsrc, width - j + 2);
            xb_vec2Nx8U vtail; IVP_LAV2NX8U_XP(vtail, a_load, rsrc, width - j + 2 - 2 * XCHAL_IVPN_SIMD_WIDTH);

            /* Horizontal 3-tap sum; coefficient word 0x00010101. */
            xb_vec2Nx24 w = IVP_MULUS4T2N8XR8(vtail, vsel0, 0x00010101);
            pp0 = IVP_CVT16U2NX24L(w);
            qq0 = IVP_CVT16U2NX24H(w);
        }

        { /* prime cache with the row one stride before psrc */
            rsrc = OFFSET_PTR_2NX8U(psrc, -1, sstride, 0);
            valign a_load = IVP_LA2NX8U_PP(rsrc);
            xb_vec2Nx8U vsel0; IVP_LAV2NX8U_XP(vsel0, a_load, rsrc, width - j + 2);
            xb_vec2Nx8U vtail; IVP_LAV2NX8U_XP(vtail, a_load, rsrc, width - j + 2 - 2 * XCHAL_IVPN_SIMD_WIDTH);

            xb_vec2Nx24 w = IVP_MULUS4T2N8XR8(vtail, vsel0, 0x00010101);
            pp1 = IVP_CVT16U2NX24L(w);
            qq1 = IVP_CVT16U2NX24H(w);
        }

        rsrc = psrc;
        rdst = pdst;

        if ((width == (2 * XCHAL_IVPN_SIMD_WIDTH)) && (sstride == ((2 * XCHAL_IVPN_SIMD_WIDTH) + 2)) && (height == (2 * XCHAL_IVPN_SIMD_WIDTH)))
        {
            /* Fast path for a tile of exactly 2N x 2N with pitch 2N + 2: the
             * two loads request 2N + 2 bytes, i.e. one full pitch, so the
             * load pointer is presumably already on the next row and the
             * alignment register is primed only once. */
            valign a_load = IVP_LA2NX8U_PP(rsrc);
            for (int32_t i = 0; i < (2 * XCHAL_IVPN_SIMD_WIDTH); i++)
            {
                xb_vec2Nx8U vsel0; IVP_LAV2NX8U_XP(vsel0, a_load, rsrc, 2 * XCHAL_IVPN_SIMD_WIDTH);
                xb_vec2Nx8U vtail; IVP_LAV2NX8U_XP(vtail, a_load, rsrc, 2);
                xb_vec2Nx24 w = IVP_MULUS4T2N8XR8(vtail, vsel0, 0x00010101);

                xb_vecNx16 pp2 = IVP_CVT16U2NX24L(w);
                xb_vecNx16 qq2 = IVP_CVT16U2NX24H(w);

                /* Vertical 3-row sums; store byte counts are 2 per element. */
                IVP_SAVNX16_XP(IVP_ADDNX16(IVP_ADDNX16(pp0, pp1), pp2), a_st, rdst, 2 * (width - j));
                IVP_SAVNX16_XP(IVP_ADDNX16(IVP_ADDNX16(qq0, qq1), qq2), a_st, rdst, 2 * (width - j - XCHAL_IVPN_SIMD_WIDTH));

                /* Rotate the row cache. */
                pp0 = pp1;
                qq0 = qq1;
                pp1 = pp2;
                qq1 = qq2;

                IVP_SAVNX16POS_FP(a_st, rdst);
                rdst = OFFSET_PTR_NX16(rdst, 1, dstride - XT_MIN(2 * XCHAL_IVPN_SIMD_WIDTH, width - j), 0);
            }
        }
        else
        {
            /* General path: re-prime the load alignment every row and step
             * both cursors to the start of the next row explicitly. */
            for (int32_t i = 0; i < height; i++)
            {
                valign a_load = IVP_LA2NX8U_PP(rsrc);
                xb_vec2Nx8U vsel0; IVP_LAV2NX8U_XP(vsel0, a_load, rsrc, width - j + 2);
                xb_vec2Nx8U vtail; IVP_LAV2NX8U_XP(vtail, a_load, rsrc, width - j + 2 - 2 * XCHAL_IVPN_SIMD_WIDTH);
                xb_vec2Nx24 w = IVP_MULUS4T2N8XR8(vtail, vsel0, 0x00010101);

                xb_vecNx16 pp2 = IVP_CVT16U2NX24L(w);
                xb_vecNx16 qq2 = IVP_CVT16U2NX24H(w);

                IVP_SAVNX16_XP(IVP_ADDNX16(IVP_ADDNX16(pp0, pp1), pp2), a_st, rdst, 2 * (width - j));
                IVP_SAVNX16_XP(IVP_ADDNX16(IVP_ADDNX16(qq0, qq1), qq2), a_st, rdst, 2 * (width - j - XCHAL_IVPN_SIMD_WIDTH));
                IVP_SAVNX16POS_FP(a_st, rdst);

                /* Rotate the row cache. */
                pp0 = pp1;
                qq0 = qq1;
                pp1 = pp2;
                qq1 = qq2;

                rsrc = OFFSET_PTR_2NX8U(rsrc, 1, sstride - XT_MIN(4 * XCHAL_IVPN_SIMD_WIDTH, width - j + 2), 0);
                rdst = OFFSET_PTR_NX16(rdst, 1, dstride - XT_MIN(2 * XCHAL_IVPN_SIMD_WIDTH, width - j), 0);
            }
        }

        /* Next strip: one source vector and two destination vectors. */
        psrc += 1;
        pdst += 2;
    }

    /* Remainder strip: a single load per row and only the low half of the
     * product is needed. */
    if (j < width)
    {
        xb_vecNx16 pp0, pp1;

        { /* prime cache with the row two strides before psrc */
            rsrc = OFFSET_PTR_2NX8U(psrc, -2, sstride, 0);
            valign a_load = IVP_LA2NX8U_PP(rsrc);
            xb_vec2Nx8U vsel0; IVP_LAV2NX8U_XP(vsel0, a_load, rsrc, width - j + 2);

            xb_vec2Nx24 w = IVP_MULUS4T2N8XR8(0, vsel0, 0x00010101);
            pp0 = IVP_CVT16U2NX24L(w);
        }

        { /* prime cache with the row one stride before psrc */
            rsrc = OFFSET_PTR_2NX8U(psrc, -1, sstride, 0);
            valign a_load = IVP_LA2NX8U_PP(rsrc);
            xb_vec2Nx8U vsel0; IVP_LAV2NX8U_XP(vsel0, a_load, rsrc, width - j + 2);

            xb_vec2Nx24 w = IVP_MULUS4T2N8XR8(0, vsel0, 0x00010101);
            pp1 = IVP_CVT16U2NX24L(w);
        }

        rsrc = psrc;
        rdst = pdst;
        for (int32_t i = 0; i < height; i++)
        {
            /* Next-row addresses are computed up front, before the load and
             * store below update rsrc / rdst. */
            xb_vec2Nx8U* nsrc = OFFSET_PTR_2NX8U(rsrc, 1, sstride, 0);
            xb_vecNx16* ndst = OFFSET_PTR_NX16(rdst, 1, dstride, 0);

            valign a_load = IVP_LA2NX8U_PP(rsrc);
            xb_vec2Nx8U vsel0; IVP_LAV2NX8U_XP(vsel0, a_load, rsrc, width - j + 2);
            rsrc = nsrc;

            xb_vec2Nx24 w = IVP_MULUS4T2N8XR8(0, vsel0, 0x00010101);
            xb_vecNx16 pp2 = IVP_CVT16U2NX24L(w);

            IVP_SAVNX16_XP(IVP_ADDNX16(IVP_ADDNX16(pp0, pp1), pp2), a_st, rdst, 2 * (width - j));
            IVP_SAVNX16POS_FP(a_st, rdst);
            rdst = ndst;

            /* Rotate the row cache. */
            pp0 = pp1;
            pp1 = pp2;
        }
    }

    return XI_ERROR_STATUS();
}


上面是一个 DSP 接口及其对应的 Cmodel 实现。请参考其逻辑和实现方式,将下面这个 Cmodel 转成对应的 DSP 接口,要求转换前后的结果 bit 一致。


/**
 * Reference (C-model) 3x3 box filter: U8 source tile to U8 destination tile.
 *
 * Every destination pixel is (bias + sum of the 3x3 neighbourhood) / divisor
 * with divisor = ksize * ksize and bias = divisor / 2, using integer
 * division. Neighbours outside the tile are read from the tile edge, so src
 * must provide an edge of at least ksize / 2 pixels in both directions.
 *
 * @param src  source tile, read as uint8_t with pitch XI_TILE_GET_PITCH(src)
 * @param dst  destination tile, written as uint8_t with pitch
 *             XI_TILE_GET_PITCH(dst)
 * @return XI_ERR_OK on success; XI_ERR_BADARG, XI_ERR_DATASIZE, XI_ERR_KSIZE
 *         or XI_ERR_EDGE when the corresponding check fails.
 */
XI_ERR_TYPE xiBoxFilter_3x3_U8_ref(const xi_pTile src, xi_pTile dst)
{
    const int ksize = 3;

    if (!xiTileIsValid_U8_ref(src))
    {
        return XI_ERR_BADARG;
    }
    if (!xiTileIsValid_U8_ref(dst))
    {
        return XI_ERR_BADARG;
    }
    if (!xiTilesHaveSameSize_ref(src, dst))
    {
        return XI_ERR_DATASIZE;
    }
    if ((ksize & 0x1) == 0)
    {
        return XI_ERR_KSIZE;
    }

    /* Kernel radius; the tile edge must cover it on every side. */
    const int radius = ksize / 2;
    if (XI_TILE_GET_EDGE_WIDTH(src) < radius || XI_TILE_GET_EDGE_HEIGHT(src) < radius)
    {
        return XI_ERR_EDGE;
    }

    const int divisor = ksize * ksize;
    const int bias = divisor / 2;

    const uint8_t *in = (uint8_t*)(XI_TILE_GET_DATA_PTR(src));
    uint8_t *out = (uint8_t*)(XI_TILE_GET_DATA_PTR(dst));

    const int in_pitch  = XI_TILE_GET_PITCH(src);
    const int out_pitch = XI_TILE_GET_PITCH(dst);
    const int rows      = XI_TILE_GET_HEIGHT(src);
    const int cols      = XI_TILE_GET_WIDTH(src);

    for (int y = 0; y < rows; y++)
    {
        uint8_t *out_row = out + y * out_pitch;

        for (int x = 0; x < cols; x++)
        {
            /* Start from the rounding bias, then add the neighbourhood. */
            int acc = bias;

            for (int dy = -radius; dy <= radius; dy++)
            {
                /* Points at the window's centre column on row y + dy;
                 * negative offsets reach into the tile edge. */
                const uint8_t *in_row = in + (y + dy) * in_pitch + x;

                for (int dx = -radius; dx <= radius; dx++)
                {
                    acc += in_row[dx];
                }
            }

            out_row[x] = acc / divisor;
        }
    }

    return XI_ERR_OK;
}


请参考上面的例子给出 接口 xiBoxFilter_3x3_U8_ref 对应的 DSP 实现 
相关推荐
H__Rick几秒前
自动对焦学习-3
人工智能·学习·计算机视觉
benben0446 分钟前
强化学习之DQN算法族(基于gymnasium开发)
算法
SpaceAIGlobal7 分钟前
AI 生成 PPT 工具深度评测与选型指南
人工智能·powerpoint
移动云开发者联盟9 分钟前
移动云HaishanDB焕新出发!
人工智能
用户16931761726611 分钟前
多端复用一套对话逻辑,我抽了个 useChat hook
人工智能
载数而行52013 分钟前
Linux 11 动态监控指令top
linux
johnny23314 分钟前
开源AI助手项目汇总:OpenHuman、nexu、Moltis、Eclaire、ChatClaw、Frona
人工智能
于先生吖29 分钟前
SpringBoot对接大模型开发AI命理测算系统:八字排盘与AI解析接口源码全解
人工智能·spring boot·后端
AI创界者35 分钟前
PilotTTS 一键整合包(Win/Mac):8G 显存畅跑,实测解锁情绪与副语言的精准控制
人工智能·macos·aigc·音视频