【DSP】xiBoxFilter_3x3_U8 dsp VS cmodel

一、先把"参考例子"的本质逻辑说清楚

你给的 U8→U16 版本,其 DSP 实现本质做了 3 件事:

  1. 3×3 box filter 的行列分解写法

    • 先做横向 3-tap 累加:MULUS4T2N8XR8 + 系数 0x00010101
    • 再做纵向 3 行累加:pp0 + pp1 + pp2
  2. 边界处理完全依赖 tile edge

    • OFFSET_PTR_2NX8U(psrc, -1/-2, sstride, 0)
    • 所有 load 都假设 edge 已经 padding 好
  3. bit-exact 的核心

    • 累加 不溢出
    • 所有中间值用 Nx24 / Nx16
    • 没有除法

二、U8 → U8 的 Cmodel:多了什么?

你的 xiBoxFilter_3x3_U8_ref 相比 U8→U16,只多了 2 件事

c 复制代码
int divisor = 9;
int bias = 4;          // divisor / 2
pdst = sum / 9;        // rounding

关键 bit-exact 约束

DSP 侧必须严格等价于(此处 sum 指 9 个像素的原始累加和,不含 bias;Cmodel 中 sum 的初值即为 bias,因此两种写法等价):

c 复制代码
(sum + 4) / 9

⚠️ 不能:

  • 用 float
  • 用近似乘法
  • 用 truncation(即省略 bias、直接对原始累加和做 /9 截断)

三、DSP 实现策略(和你给的例子一一对齐)

设计决策(和官方风格一致)

项目 选择
中间累加 xb_vecNx16(最大 9×255=2295)
bias 加 4
除法 精确整数除法 / 9
存储 NX8U
数据流 完全复用 U8U16 的三行流水结构

四、DSP 接口实现(完整示例)

⚠️ 注意:这是 逻辑等价、bit-exact 的 DSP 写法 ,结构、指针推进、行缓存方式与你给的 U8U16 版本完全一致,只标出关键不同点

c 复制代码
/*
 * Proposed DSP implementation of the 3x3 box filter, U8 source -> U8
 * destination. Per output pixel it adds three horizontally pre-summed rows,
 * adds a bias of 4, divides by 9 and stores the result as 8-bit.
 *
 * src: U8 tile; must provide an edge of at least 1 (XI_CHECK_TILE_EDGE).
 * dst: U8 array of the same size as src, not overlapping src.
 * Returns XI_ERROR_STATUS().
 *
 * NOTE(review): compared with xiBoxFilter_3x3_U8U16 in this same document,
 * this version
 *   - converts only the L half of each 24-bit accumulator (no
 *     IVP_CVT16U2NX24H / qq* path) while still stepping j by
 *     2 * XCHAL_IVPN_SIMD_WIDTH, so the upper half of every stripe looks
 *     like it is never produced;
 *   - has no "if (j < width)" tail block after the stripe loop.
 * Both points should be confirmed before relying on bit-exactness.
 */
XI_ERR_TYPE xiBoxFilter_3x3_U8(xi_tile const* src, xi_array const* dst)
{
    XI_ERROR_CHECKS()
    {
        XI_CHECK_TILE_U8(src);
        XI_CHECK_TILE_EDGE(src, 1);
        XI_CHECK_ARRAY_U8(dst);
        XI_CHECK_ARRAY_SIZE_EQ(src, dst);
        XI_CHECK_ARRAYS_ARE_NOT_OVERLAP(src, dst);
    }

    int32_t sstride = XI_TILE_GET_PITCH(src);
    int32_t dstride = XI_ARRAY_GET_PITCH(dst);
    int32_t width   = XI_TILE_GET_WIDTH(src);
    int32_t height  = XI_TILE_GET_HEIGHT(src);

    /* Start pointer is offset by (1, sstride, -1) from the tile data pointer;
     * presumably one row down and one pixel left -- TODO confirm against the
     * OFFSET_PTR_2NX8U macro definition. */
    xb_vec2Nx8U* psrc = OFFSET_PTR_2NX8U(XI_TILE_GET_DATA_PTR(src), 1, sstride, -1);
    xb_vecNx8U*  pdst = (xb_vecNx8U*)XI_ARRAY_GET_DATA_PTR(dst);

    xb_vec2Nx8U* restrict rsrc;
    xb_vecNx8U*  restrict rdst;

    valign a_st = IVP_ZALIGN();

    /* Rounding bias (9 / 2 = 4).
     * NOTE(review): IVP_MOVNX16_FROMNX8 looks like a type move rather than a
     * scalar broadcast -- confirm this really yields 4 in every lane. */
    const xb_vecNx16 vbias = IVP_MOVNX16_FROMNX8(4);

    int32_t j = 0;
    /* Stripe loop over columns, stepping by 2 * XCHAL_IVPN_SIMD_WIDTH. */
    for (; j < (width - XCHAL_IVPN_SIMD_WIDTH); j += (2 * XCHAL_IVPN_SIMD_WIDTH))
    {
        /* Horizontal 3-tap sums of the two rows preceding the current one. */
        xb_vecNx16 pp0, pp1;

        /* Preload: row at psrc - 2 rows. */
        {
            rsrc = OFFSET_PTR_2NX8U(psrc, -2, sstride, 0);
            valign a = IVP_LA2NX8U_PP(rsrc);

            xb_vec2Nx8U v0, v1;
            IVP_LAV2NX8U_XP(v0, a, rsrc, width - j + 2);
            IVP_LAV2NX8U_XP(v1, a, rsrc,
                            width - j + 2 - 2 * XCHAL_IVPN_SIMD_WIDTH);

            /* Coefficients 0x00010101: presumably taps 1,1,1,0, i.e. a
             * horizontal 3-pixel sum -- TODO confirm against the ISA manual. */
            xb_vec2Nx24 w = IVP_MULUS4T2N8XR8(v1, v0, 0x00010101);
            pp0 = IVP_CVT16U2NX24L(w);
        }

        /* Preload: row at psrc - 1 row. */
        {
            rsrc = OFFSET_PTR_2NX8U(psrc, -1, sstride, 0);
            valign a = IVP_LA2NX8U_PP(rsrc);

            xb_vec2Nx8U v0, v1;
            IVP_LAV2NX8U_XP(v0, a, rsrc, width - j + 2);
            IVP_LAV2NX8U_XP(v1, a, rsrc,
                            width - j + 2 - 2 * XCHAL_IVPN_SIMD_WIDTH);

            xb_vec2Nx24 w = IVP_MULUS4T2N8XR8(v1, v0, 0x00010101);
            pp1 = IVP_CVT16U2NX24L(w);
        }

        rsrc = psrc;
        rdst = pdst;

        /* One output row per iteration: load the next source row, combine it
         * with the two cached row sums, then rotate the cache. */
        for (int32_t i = 0; i < height; i++)
        {
            valign a = IVP_LA2NX8U_PP(rsrc);

            xb_vec2Nx8U v0, v1;
            IVP_LAV2NX8U_XP(v0, a, rsrc, width - j + 2);
            IVP_LAV2NX8U_XP(v1, a, rsrc,
                            width - j + 2 - 2 * XCHAL_IVPN_SIMD_WIDTH);

            xb_vec2Nx24 w = IVP_MULUS4T2N8XR8(v1, v0, 0x00010101);
            xb_vecNx16 pp2 = IVP_CVT16U2NX24L(w);

            /* Vertical 3-row sum plus rounding bias. */
            xb_vecNx16 sum =
                IVP_ADDNX16(IVP_ADDNX16(pp0, pp1), pp2);
            sum = IVP_ADDNX16(sum, vbias);

            /* Divide by 9.
             * NOTE(review): verify that IVP_DIVNX16 exists with this
             * (vector, scalar) signature on the target core; if not, an exact
             * multiply-and-shift replacement is needed. */
            xb_vecNx16 div = IVP_DIVNX16(sum, 9);

            /* NOTE(review): confirm IVP_CVTNX8U_FROMNX16 is a valid
             * conversion; (9 * 255 + 4) / 9 = 255, so values fit in 8 bits. */
            xb_vecNx8U out = IVP_CVTNX8U_FROMNX16(div);

            /* Variable-length store of at most XCHAL_IVPN_SIMD_WIDTH bytes,
             * followed by a flush of the store alignment register. */
            IVP_SAVNX8U_XP(out, a_st, rdst,
                           XT_MIN(XCHAL_IVPN_SIMD_WIDTH, width - j));
            IVP_SAVNX8UPOS_FP(a_st, rdst);

            /* Slide the 3-row window down by one row. */
            pp0 = pp1;
            pp1 = pp2;

            /* Advance to the next row: the loads/stores above already moved
             * the pointers, so only the remainder of the pitch is added. */
            rsrc = OFFSET_PTR_2NX8U(rsrc, 1,
                    sstride - XT_MIN(4 * XCHAL_IVPN_SIMD_WIDTH,
                                     width - j + 2),
                    0);
            rdst = OFFSET_PTR_NX8U(rdst, 1,
                    dstride - XT_MIN(XCHAL_IVPN_SIMD_WIDTH, width - j),
                    0);
        }

        /* NOTE(review): j advances by 2 * XCHAL_IVPN_SIMD_WIDTH columns per
         * stripe; check that advancing pdst by a single xb_vecNx8U matches. */
        psrc += 1;
        pdst += 1;
    }

    return XI_ERROR_STATUS();
}

五、bit-exact 性检查清单(你可以自己对)

项目 Cmodel DSP
累加顺序 k,l 行 → 列
bias +4 vbias = 4
除法 /9 IVP_DIVNX16(...,9)
饱和 U8 trunc CVTNX8U_FROMNX16
边界 tile edge tile edge

逐 bit 一致


六、question

cpp 复制代码
/*
 * Reference (C model) 3x3 box filter, U8 source -> U16 destination.
 *
 * For every pixel (i, j) of the source tile, writes the un-normalised sum of
 * the 3x3 neighbourhood centred on it (at most 9 * 255 = 2295) to dst.
 * Neighbours outside the tile are read from the tile edge, so the source
 * must provide an edge of at least 1 pixel in both directions.
 *
 * Returns:
 *   XI_ERR_BADARG   - src or dst fails the tile validity check
 *   XI_ERR_DATASIZE - src and dst differ in size
 *   XI_ERR_KSIZE    - kernel size is even (cannot happen with ksize == 3)
 *   XI_ERR_EDGE     - src edge is smaller than ksize / 2
 *   XI_ERR_OK       - success
 */
XI_ERR_TYPE xiBoxFilter_3x3_U8U16_ref(const xi_pTile src, xi_pTile dst)
{
    if (!xiTileIsValid_U8_ref(src)) return XI_ERR_BADARG;
    /* NOTE(review): dst is written through a uint16_t pointer below but is
     * validated as a U8 tile here -- confirm whether a U16 validity check
     * was intended. */
    if (!xiTileIsValid_U8_ref(dst)) return XI_ERR_BADARG;
    if (!xiTilesHaveSameSize_ref(src, dst)) return XI_ERR_DATASIZE;
    int ksize=3;
    if ((ksize & 0x1) == 0) return XI_ERR_KSIZE;

    /* Kernel radius: number of edge pixels needed on each side. */
    int N_div_2 = ksize / 2;
    if (XI_TILE_GET_EDGE_WIDTH(src) < N_div_2 || XI_TILE_GET_EDGE_HEIGHT(src) < N_div_2) return XI_ERR_EDGE;

    uint8_t *psrc = (uint8_t*)(XI_TILE_GET_DATA_PTR(src));
    uint16_t *pdst = (uint16_t*)(XI_TILE_GET_DATA_PTR(dst));

    int sstride = XI_TILE_GET_PITCH(src);
    int dstride = XI_TILE_GET_PITCH(dst);
    int height  = XI_TILE_GET_HEIGHT(src);
    int width   = XI_TILE_GET_WIDTH(src);

    for (int i = 0; i < height; i++)
        for (int j = 0; j < width; j++)
        {
            int sum = 0;

            /* Indices go negative / past the tile on the border; those
             * accesses land in the edge region checked above. */
            for (int k = -N_div_2; k <= N_div_2; k++)
                for (int l = -N_div_2; l <= N_div_2; l++)
                    sum += psrc[(i + k) * sstride + (j + l)];

            pdst[i * dstride + j] = sum;
        }

    return XI_ERR_OK;
}


/*
 * DSP implementation of the 3x3 box filter, U8 source tile -> U16
 * destination array. Each output is the un-normalised sum of the 3x3
 * neighbourhood, computed separably: a horizontal 3-tap sum per source row
 * (IVP_MULUS4T2N8XR8 with coefficients 0x00010101) followed by a vertical
 * sum of three consecutive row results kept in registers.
 *
 * src: U8 tile; must provide an edge of at least 1 (XI_CHECK_TILE_EDGE).
 * dst: U16 array of the same size as src, not overlapping src.
 * Returns XI_ERROR_STATUS().
 *
 * The image is processed in vertical stripes of 2 * XCHAL_IVPN_SIMD_WIDTH
 * columns; a final narrower stripe is handled by the tail block.
 */
XI_ERR_TYPE xiBoxFilter_3x3_U8U16(xi_tile const* src, xi_array const* dst)
{
    XI_ERROR_CHECKS()
    {
        XI_CHECK_TILE_U8(src);
        XI_CHECK_TILE_EDGE(src, 1);
        XI_CHECK_ARRAY_U16(dst);
        XI_CHECK_ARRAY_SIZE_EQ(src, dst);
        XI_CHECK_ARRAYS_ARE_NOT_OVERLAP(src, dst);
    }

    int32_t sstride = XI_TILE_GET_PITCH(src);
    int32_t dstride = XI_ARRAY_GET_PITCH(dst);
    int32_t width   = XI_TILE_GET_WIDTH(src);
    int32_t height  = XI_TILE_GET_HEIGHT(src);

    /* Start pointer is offset by (1, sstride, -1) from the tile data pointer;
     * presumably one row down and one pixel left -- TODO confirm against the
     * OFFSET_PTR_2NX8U macro definition. */
    xb_vec2Nx8U* psrc = OFFSET_PTR_2NX8U(XI_TILE_GET_DATA_PTR(src), 1, sstride, -1);
    xb_vecNx16*  pdst = (xb_vecNx16 *)XI_ARRAY_GET_DATA_PTR(dst);
    xb_vecNx16*  restrict rdst;
    xb_vec2Nx8U* restrict rsrc;

    valign a_st = IVP_ZALIGN();
    int32_t j = 0;

    /* Full-width stripes: 2 * XCHAL_IVPN_SIMD_WIDTH output columns each,
     * held as a low half (pp*) and a high half (qq*). */
    for (; j < (width - XCHAL_IVPN_SIMD_WIDTH); j += (2 * XCHAL_IVPN_SIMD_WIDTH))
    {
        xb_vecNx16 pp0, pp1;
        xb_vecNx16 qq0, qq1;

        { /* preload: row at psrc - 2 rows */
            rsrc = OFFSET_PTR_2NX8U(psrc, -2, sstride, 0);
            valign a_load = IVP_LA2NX8U_PP(rsrc);
            xb_vec2Nx8U vsel0; IVP_LAV2NX8U_XP(vsel0, a_load, rsrc, width - j + 2);
            xb_vec2Nx8U vtail; IVP_LAV2NX8U_XP(vtail, a_load, rsrc, width - j + 2 - 2 * XCHAL_IVPN_SIMD_WIDTH);

            xb_vec2Nx24 w = IVP_MULUS4T2N8XR8(vtail, vsel0, 0x00010101);
            pp0 = IVP_CVT16U2NX24L(w);
            qq0 = IVP_CVT16U2NX24H(w);
        }

        { /* preload: row at psrc - 1 row */
            rsrc = OFFSET_PTR_2NX8U(psrc, -1, sstride, 0);
            valign a_load = IVP_LA2NX8U_PP(rsrc);
            xb_vec2Nx8U vsel0; IVP_LAV2NX8U_XP(vsel0, a_load, rsrc, width - j + 2);
            xb_vec2Nx8U vtail; IVP_LAV2NX8U_XP(vtail, a_load, rsrc, width - j + 2 - 2 * XCHAL_IVPN_SIMD_WIDTH);

            xb_vec2Nx24 w = IVP_MULUS4T2N8XR8(vtail, vsel0, 0x00010101);
            pp1 = IVP_CVT16U2NX24L(w);
            qq1 = IVP_CVT16U2NX24H(w);
        }

        rsrc = psrc;
        rdst = pdst;

        if ((width == (2 * XCHAL_IVPN_SIMD_WIDTH)) && (sstride == ((2 * XCHAL_IVPN_SIMD_WIDTH) + 2)) && (height == (2 * XCHAL_IVPN_SIMD_WIDTH)))
        {
            /* Special case: the two loads consume exactly one pitch
             * (2N + 2 bytes) per row, so the source pointer runs through the
             * tile without per-row re-priming or pointer adjustment. */
            valign a_load = IVP_LA2NX8U_PP(rsrc);
            for (int32_t i = 0; i < (2 * XCHAL_IVPN_SIMD_WIDTH); i++)
            {
                xb_vec2Nx8U vsel0; IVP_LAV2NX8U_XP(vsel0, a_load, rsrc, 2 * XCHAL_IVPN_SIMD_WIDTH);
                xb_vec2Nx8U vtail; IVP_LAV2NX8U_XP(vtail, a_load, rsrc, 2);
                xb_vec2Nx24 w = IVP_MULUS4T2N8XR8(vtail, vsel0, 0x00010101);

                xb_vecNx16 pp2 = IVP_CVT16U2NX24L(w);
                xb_vecNx16 qq2 = IVP_CVT16U2NX24H(w);

                IVP_SAVNX16_XP(IVP_ADDNX16(IVP_ADDNX16(pp0, pp1), pp2), a_st, rdst, 2 * (width - j));
                IVP_SAVNX16_XP(IVP_ADDNX16(IVP_ADDNX16(qq0, qq1), qq2), a_st, rdst, 2 * (width - j - XCHAL_IVPN_SIMD_WIDTH));

                /* Slide the 3-row window down by one row. */
                pp0 = pp1;
                qq0 = qq1;
                pp1 = pp2;
                qq1 = qq2;

                IVP_SAVNX16POS_FP(a_st, rdst);
                rdst = OFFSET_PTR_NX16(rdst, 1, dstride - XT_MIN(2 * XCHAL_IVPN_SIMD_WIDTH, width - j), 0);
            }
        }
        else
        {
            /* General case: re-prime the load alignment every row and step
             * both pointers by the part of the pitch not already consumed. */
            for (int32_t i = 0; i < height; i++)
            {
                valign a_load = IVP_LA2NX8U_PP(rsrc);
                xb_vec2Nx8U vsel0; IVP_LAV2NX8U_XP(vsel0, a_load, rsrc, width - j + 2);
                xb_vec2Nx8U vtail; IVP_LAV2NX8U_XP(vtail, a_load, rsrc, width - j + 2 - 2 * XCHAL_IVPN_SIMD_WIDTH);
                xb_vec2Nx24 w = IVP_MULUS4T2N8XR8(vtail, vsel0, 0x00010101);

                xb_vecNx16 pp2 = IVP_CVT16U2NX24L(w);
                xb_vecNx16 qq2 = IVP_CVT16U2NX24H(w);

                IVP_SAVNX16_XP(IVP_ADDNX16(IVP_ADDNX16(pp0, pp1), pp2), a_st, rdst, 2 * (width - j));
                IVP_SAVNX16_XP(IVP_ADDNX16(IVP_ADDNX16(qq0, qq1), qq2), a_st, rdst, 2 * (width - j - XCHAL_IVPN_SIMD_WIDTH));
                IVP_SAVNX16POS_FP(a_st, rdst);

                /* Slide the 3-row window down by one row. */
                pp0 = pp1;
                qq0 = qq1;
                pp1 = pp2;
                qq1 = qq2;

                rsrc = OFFSET_PTR_2NX8U(rsrc, 1, sstride - XT_MIN(4 * XCHAL_IVPN_SIMD_WIDTH, width - j + 2), 0);
                rdst = OFFSET_PTR_NX16(rdst, 1, dstride - XT_MIN(2 * XCHAL_IVPN_SIMD_WIDTH, width - j), 0);
            }
        }

        /* Next stripe: one 2Nx8 source vector, two Nx16 destination vectors. */
        psrc += 1;
        pdst += 2;
    }

    /* Tail stripe: fewer columns remain than the main loop handles, so a
     * single load per row is used and only the low half is stored. */
    if (j < width)
    {
        xb_vecNx16 pp0, pp1;

        { /* preload: row at psrc - 2 rows */
            rsrc = OFFSET_PTR_2NX8U(psrc, -2, sstride, 0);
            valign a_load = IVP_LA2NX8U_PP(rsrc);
            xb_vec2Nx8U vsel0; IVP_LAV2NX8U_XP(vsel0, a_load, rsrc, width - j + 2);

            xb_vec2Nx24 w = IVP_MULUS4T2N8XR8(0, vsel0, 0x00010101);
            pp0 = IVP_CVT16U2NX24L(w);
        }

        { /* preload: row at psrc - 1 row */
            rsrc = OFFSET_PTR_2NX8U(psrc, -1, sstride, 0);
            valign a_load = IVP_LA2NX8U_PP(rsrc);
            xb_vec2Nx8U vsel0; IVP_LAV2NX8U_XP(vsel0, a_load, rsrc, width - j + 2);

            xb_vec2Nx24 w = IVP_MULUS4T2N8XR8(0, vsel0, 0x00010101);
            pp1 = IVP_CVT16U2NX24L(w);
        }

        rsrc = psrc;
        rdst = pdst;
        for (int32_t i = 0; i < height; i++)
        {
            /* Next-row pointers are computed up front because the load and
             * store below post-increment rsrc / rdst. */
            xb_vec2Nx8U* nsrc = OFFSET_PTR_2NX8U(rsrc, 1, sstride, 0);
            xb_vecNx16* ndst = OFFSET_PTR_NX16(rdst, 1, dstride, 0);

            valign a_load = IVP_LA2NX8U_PP(rsrc);
            xb_vec2Nx8U vsel0; IVP_LAV2NX8U_XP(vsel0, a_load, rsrc, width - j + 2);
            rsrc = nsrc;

            xb_vec2Nx24 w = IVP_MULUS4T2N8XR8(0, vsel0, 0x00010101);
            xb_vecNx16 pp2 = IVP_CVT16U2NX24L(w);

            IVP_SAVNX16_XP(IVP_ADDNX16(IVP_ADDNX16(pp0, pp1), pp2), a_st, rdst, 2 * (width - j));
            IVP_SAVNX16POS_FP(a_st, rdst);
            rdst = ndst;

            /* Slide the 3-row window down by one row. */
            pp0 = pp1;
            pp1 = pp2;
        }
    }

    return XI_ERROR_STATUS();
}


上面是一个 DSP 接口及其对应的 Cmodel 实现。请参考其逻辑和实现方式,将下面这个 Cmodel 转成对应的 DSP 接口,要求转换前后的输出结果逐 bit 一致。


/*
 * Reference (C model) 3x3 box filter, U8 source -> U8 destination.
 *
 * Each output pixel is the rounded mean of the 3x3 neighbourhood centred on
 * the corresponding source pixel: (sum of 9 pixels + 4) / 9, using integer
 * division. Border neighbours are read from the source tile edge, which must
 * be at least 1 pixel wide and high.
 *
 * Returns:
 *   XI_ERR_BADARG   - src or dst fails the U8 tile validity check
 *   XI_ERR_DATASIZE - src and dst differ in size
 *   XI_ERR_KSIZE    - kernel size is even (cannot happen with ksize == 3)
 *   XI_ERR_EDGE     - src edge is smaller than ksize / 2
 *   XI_ERR_OK       - success
 */
XI_ERR_TYPE xiBoxFilter_3x3_U8_ref(const xi_pTile src, xi_pTile dst)
{
    if (!xiTileIsValid_U8_ref(src))
    {
        return XI_ERR_BADARG;
    }
    if (!xiTileIsValid_U8_ref(dst))
    {
        return XI_ERR_BADARG;
    }
    if (!xiTilesHaveSameSize_ref(src, dst))
    {
        return XI_ERR_DATASIZE;
    }

    int ksize = 3;
    if ((ksize & 0x1) == 0)
    {
        return XI_ERR_KSIZE;
    }

    /* Kernel radius: how many edge pixels are needed on every side. */
    int radius = ksize / 2;
    if (XI_TILE_GET_EDGE_WIDTH(src) < radius || XI_TILE_GET_EDGE_HEIGHT(src) < radius)
    {
        return XI_ERR_EDGE;
    }

    int divisor = ksize * ksize;
    /* Seeding the accumulator with divisor / 2 turns the final integer
     * division into round-to-nearest. */
    int rounding = divisor / 2;

    uint8_t *in  = (uint8_t*)(XI_TILE_GET_DATA_PTR(src));
    uint8_t *out = (uint8_t*)(XI_TILE_GET_DATA_PTR(dst));

    int in_pitch  = XI_TILE_GET_PITCH(src);
    int out_pitch = XI_TILE_GET_PITCH(dst);
    int rows      = XI_TILE_GET_HEIGHT(src);
    int cols      = XI_TILE_GET_WIDTH(src);

    for (int y = 0; y < rows; y++)
    {
        for (int x = 0; x < cols; x++)
        {
            int acc = rounding;

            for (int dy = -radius; dy <= radius; dy++)
            {
                /* Centre of the kernel row; dx indexes left/right of it. */
                const uint8_t *centre = in + (y + dy) * in_pitch + x;

                for (int dx = -radius; dx <= radius; dx++)
                {
                    acc += centre[dx];
                }
            }

            out[y * out_pitch + x] = acc / divisor;
        }
    }

    return XI_ERR_OK;
}


请参考上面的例子给出 接口 xiBoxFilter_3x3_U8_ref 对应的 DSP 实现 
相关推荐
超级小龙虾2 小时前
Augment Context Engine MCP
人工智能
行业探路者2 小时前
健康宣教二维码是什么?主要有哪些创新优势?
人工智能·学习·音视频·二维码·产品介绍
灏瀚星空2 小时前
基于 Python 与 GitHub,打造个人专属本地化思维导图工具全流程方案(上)
开发语言·人工智能·经验分享·笔记·python·个人开发·visual studio
xcLeigh2 小时前
AI的提示词专栏:Prompt 与 Python Pandas 的结合使用指南
人工智能·python·ai·prompt·提示词
羽小暮2 小时前
Yolo11环境配置win+Python+Anaconda--小白目标检测学习专用(超详细)
人工智能·yolo·目标检测
草莓熊Lotso2 小时前
Python 入门超详细指南:环境搭建 + 核心优势 + 应用场景(零基础友好)
运维·开发语言·人工智能·python·深度学习·学习·pycharm
zhuqiyua2 小时前
【无标题】
算法
雪寻梅*2 小时前
(深度学习)python+yolov11训练自己的数据集
人工智能·python·深度学习·yolo
tq10862 小时前
AI 重塑三层双链:从金字塔结构到人智协同网络
人工智能