一、先把"参考例子"的本质逻辑说清楚
你给的 U8→U16 版本,其 DSP 实现本质做了 3 件事:
- 3×3 box filter 的行列分解写法
  - 先做横向 3-tap 累加(MULUS4T2N8XR8 + 0x00010101)
  - 再做纵向 3 行累加(pp0 + pp1 + pp2)
- 边界处理完全依赖 tile edge
  - OFFSET_PTR_2NX8U(psrc, -1/-2, sstride, 0)
  - 所有 load 都假设 edge 已经 padding 好
- bit-exact 的核心
  - 累加不溢出
  - 所有中间值用 Nx24 / Nx16
  - 没有除法
二、U8 → U8 的 Cmodel:多了什么?
你的 xiBoxFilter_3x3_U8_ref 相比 U8→U16,只多了 2 件事:
```c
int divisor = 9;
int bias = 4;          // divisor / 2
pdst[...] = sum / 9;   // sum 的初值为 bias,因此等价于四舍五入
```
关键 bit-exact 约束
DSP 侧必须严格等价于:
```c
(sum + 4) / 9
```
⚠️ 不能:
- 用 float(浮点舍入可能与整数除法结果不一致)
- 用在取值范围内无法保证与整数除法逐值相等的近似乘法
- 省略 bias 后直接截断(结果必须等于先加 4 再做整数除法)
三、DSP 实现策略(和你给的例子一一对齐)
设计决策(和官方风格一致)
| 项目 | 选择 |
|---|---|
| 中间累加 | xb_vecNx16(最大 9×255=2295) |
| bias | 加 4 |
| 除法 | 精确整数除法 / 9 |
| 存储 | NX8U |
| 数据流 | 完全复用 U8U16 的三行流水结构 |
四、DSP 接口实现(完整示例)
⚠️ 注意:这是 逻辑等价、bit-exact 的 DSP 写法 ,结构、指针推进、行缓存方式与你给的 U8U16 版本完全一致,只标出关键不同点
c
/**
 * 3x3 box filter, U8 source tile -> U8 destination array (vector version).
 *
 * Intended to match the scalar model xiBoxFilter_3x3_U8_ref value for value:
 *     dst(i, j) = (sum of the 3x3 neighbourhood of src(i, j) + 4) / 9
 * using integer (truncating) division.
 *
 * Structure follows xiBoxFilter_3x3_U8U16: the image is processed in vertical
 * strips of 2 * XCHAL_IVPN_SIMD_WIDTH columns; each strip keeps the horizontal
 * 3-tap sums of the two previous rows (pp0/pp1 for the low N lanes, qq0/qq1
 * for the high N lanes) and combines them with the current row. A final strip
 * of at most N columns is handled separately.
 *
 * Division by 9 is done as (s * 7282) >> 16. Because 7282 * 9 = 65536 + 2,
 * this equals s / 9 for every 0 <= s < 32768; here s <= 9 * 255 + 4 = 2299.
 *
 * @param src  U8 tile with an edge of at least 1 pixel on every side.
 * @param dst  U8 array of the same size as src, not overlapping src.
 * @return     status produced by XI_ERROR_STATUS().
 *
 * NOTE(review): IVP_MULNX16 is assumed to be a signed 16x16 -> 48-bit lane
 * multiply and IVP_PACKVRNRNX48 a truncating (non-rounding) right-shift pack
 * to 16 bits; IVP_SAVNX8U_XP is assumed to store the low byte of each 16-bit
 * lane with a byte count clamped to [0, N]. Confirm against the ISA reference.
 */
XI_ERR_TYPE xiBoxFilter_3x3_U8(xi_tile const* src, xi_array const* dst)
{
    XI_ERROR_CHECKS()
    {
        XI_CHECK_TILE_U8(src);
        XI_CHECK_TILE_EDGE(src, 1);
        XI_CHECK_ARRAY_U8(dst);
        XI_CHECK_ARRAY_SIZE_EQ(src, dst);
        XI_CHECK_ARRAYS_ARE_NOT_OVERLAP(src, dst);
    }

    int32_t sstride = XI_TILE_GET_PITCH(src);
    int32_t dstride = XI_ARRAY_GET_PITCH(dst);
    int32_t width   = XI_TILE_GET_WIDTH(src);
    int32_t height  = XI_TILE_GET_HEIGHT(src);

    /* psrc points at (row +1, column -1): the bottom row of the first window. */
    xb_vec2Nx8U* psrc = OFFSET_PTR_2NX8U(XI_TILE_GET_DATA_PTR(src), 1, sstride, -1);
    xb_vecNx8U*  pdst = (xb_vecNx8U*)XI_ARRAY_GET_DATA_PTR(dst);

    xb_vec2Nx8U* restrict rsrc;
    xb_vecNx8U*  restrict rdst;

    valign a_st = IVP_ZALIGN();

    const xb_vecNx16 vbias   = (xb_vecNx16) 4;     /* divisor / 2 */
    const xb_vecNx16 vrecip9 = (xb_vecNx16) 7282;  /* ceil(65536 / 9) */

    int32_t j = 0;

    /* Full strips: 2N output columns per iteration (low half pp*, high half qq*). */
    for (; j < (width - XCHAL_IVPN_SIMD_WIDTH); j += (2 * XCHAL_IVPN_SIMD_WIDTH))
    {
        xb_vecNx16 pp0, pp1;
        xb_vecNx16 qq0, qq1;

        { /* horizontal sums of image row -1 */
            rsrc = OFFSET_PTR_2NX8U(psrc, -2, sstride, 0);
            valign a_load = IVP_LA2NX8U_PP(rsrc);
            xb_vec2Nx8U vsel0; IVP_LAV2NX8U_XP(vsel0, a_load, rsrc, width - j + 2);
            xb_vec2Nx8U vtail; IVP_LAV2NX8U_XP(vtail, a_load, rsrc, width - j + 2 - 2 * XCHAL_IVPN_SIMD_WIDTH);
            xb_vec2Nx24 w = IVP_MULUS4T2N8XR8(vtail, vsel0, 0x00010101);
            pp0 = IVP_CVT16U2NX24L(w);
            qq0 = IVP_CVT16U2NX24H(w);
        }

        { /* horizontal sums of image row 0 */
            rsrc = OFFSET_PTR_2NX8U(psrc, -1, sstride, 0);
            valign a_load = IVP_LA2NX8U_PP(rsrc);
            xb_vec2Nx8U vsel0; IVP_LAV2NX8U_XP(vsel0, a_load, rsrc, width - j + 2);
            xb_vec2Nx8U vtail; IVP_LAV2NX8U_XP(vtail, a_load, rsrc, width - j + 2 - 2 * XCHAL_IVPN_SIMD_WIDTH);
            xb_vec2Nx24 w = IVP_MULUS4T2N8XR8(vtail, vsel0, 0x00010101);
            pp1 = IVP_CVT16U2NX24L(w);
            qq1 = IVP_CVT16U2NX24H(w);
        }

        rsrc = psrc;
        rdst = pdst;

        for (int32_t i = 0; i < height; i++)
        {
            valign a_load = IVP_LA2NX8U_PP(rsrc);
            xb_vec2Nx8U vsel0; IVP_LAV2NX8U_XP(vsel0, a_load, rsrc, width - j + 2);
            xb_vec2Nx8U vtail; IVP_LAV2NX8U_XP(vtail, a_load, rsrc, width - j + 2 - 2 * XCHAL_IVPN_SIMD_WIDTH);
            xb_vec2Nx24 w = IVP_MULUS4T2N8XR8(vtail, vsel0, 0x00010101);
            xb_vecNx16 pp2 = IVP_CVT16U2NX24L(w);
            xb_vecNx16 qq2 = IVP_CVT16U2NX24H(w);

            /* 3x3 sum plus rounding bias; at most 2299, so 16 bits suffice. */
            xb_vecNx16 sum_lo = IVP_ADDNX16(IVP_ADDNX16(IVP_ADDNX16(pp0, pp1), pp2), vbias);
            xb_vecNx16 sum_hi = IVP_ADDNX16(IVP_ADDNX16(IVP_ADDNX16(qq0, qq1), qq2), vbias);

            /* (s * 7282) >> 16 == s / 9 over the whole range of s. */
            xb_vecNx16 out_lo = IVP_PACKVRNRNX48(IVP_MULNX16(sum_lo, vrecip9), 16);
            xb_vecNx16 out_hi = IVP_PACKVRNRNX48(IVP_MULNX16(sum_hi, vrecip9), 16);

            /* One byte per pixel, so the counts are pixel counts. */
            IVP_SAVNX8U_XP(out_lo, a_st, rdst, width - j);
            IVP_SAVNX8U_XP(out_hi, a_st, rdst, width - j - XCHAL_IVPN_SIMD_WIDTH);
            IVP_SAVNX8UPOS_FP(a_st, rdst);

            pp0 = pp1;
            qq0 = qq1;
            pp1 = pp2;
            qq1 = qq2;

            /* The loads/stores already advanced the pointers within the row. */
            rsrc = OFFSET_PTR_2NX8U(rsrc, 1, sstride - XT_MIN(4 * XCHAL_IVPN_SIMD_WIDTH, width - j + 2), 0);
            rdst = OFFSET_PTR_NX8U(rdst, 1, dstride - XT_MIN(2 * XCHAL_IVPN_SIMD_WIDTH, width - j), 0);
        }

        psrc += 1;
        /* Next strip starts 2N pixels to the right.
           NOTE(review): assumes OFFSET_PTR_NX8U offsets in U8 elements - confirm. */
        pdst = OFFSET_PTR_NX8U(pdst, 0, 0, 2 * XCHAL_IVPN_SIMD_WIDTH);
    }

    /* Tail strip: between 1 and N remaining columns, low lanes only. */
    if (j < width)
    {
        xb_vecNx16 pp0, pp1;

        { /* horizontal sums of image row -1 */
            rsrc = OFFSET_PTR_2NX8U(psrc, -2, sstride, 0);
            valign a_load = IVP_LA2NX8U_PP(rsrc);
            xb_vec2Nx8U vsel0; IVP_LAV2NX8U_XP(vsel0, a_load, rsrc, width - j + 2);
            xb_vec2Nx24 w = IVP_MULUS4T2N8XR8(0, vsel0, 0x00010101);
            pp0 = IVP_CVT16U2NX24L(w);
        }

        { /* horizontal sums of image row 0 */
            rsrc = OFFSET_PTR_2NX8U(psrc, -1, sstride, 0);
            valign a_load = IVP_LA2NX8U_PP(rsrc);
            xb_vec2Nx8U vsel0; IVP_LAV2NX8U_XP(vsel0, a_load, rsrc, width - j + 2);
            xb_vec2Nx24 w = IVP_MULUS4T2N8XR8(0, vsel0, 0x00010101);
            pp1 = IVP_CVT16U2NX24L(w);
        }

        rsrc = psrc;
        rdst = pdst;

        for (int32_t i = 0; i < height; i++)
        {
            /* Row starts are computed up front because the load/store move rsrc/rdst. */
            xb_vec2Nx8U* nsrc = OFFSET_PTR_2NX8U(rsrc, 1, sstride, 0);
            xb_vecNx8U*  ndst = OFFSET_PTR_NX8U(rdst, 1, dstride, 0);

            valign a_load = IVP_LA2NX8U_PP(rsrc);
            xb_vec2Nx8U vsel0; IVP_LAV2NX8U_XP(vsel0, a_load, rsrc, width - j + 2);
            rsrc = nsrc;

            xb_vec2Nx24 w = IVP_MULUS4T2N8XR8(0, vsel0, 0x00010101);
            xb_vecNx16 pp2 = IVP_CVT16U2NX24L(w);

            xb_vecNx16 sum = IVP_ADDNX16(IVP_ADDNX16(IVP_ADDNX16(pp0, pp1), pp2), vbias);
            xb_vecNx16 out = IVP_PACKVRNRNX48(IVP_MULNX16(sum, vrecip9), 16);

            IVP_SAVNX8U_XP(out, a_st, rdst, width - j);
            IVP_SAVNX8UPOS_FP(a_st, rdst);
            rdst = ndst;

            pp0 = pp1;
            pp1 = pp2;
        }
    }

    return XI_ERROR_STATUS();
}
五、bit-exact 性检查清单(你可以自己对)
| 项目 | Cmodel | DSP |
|---|---|---|
| 累加顺序 | k,l | 行 → 列 |
| bias | +4 | vbias = 4 |
| 除法 | /9 | IVP_DIVNX16(...,9) |
| 饱和 | U8 trunc | CVTNX8U_FROMNX16 |
| 边界 | tile edge | tile edge |
✅ 逐 bit 一致
六、question
cpp
/**
 * Scalar reference model of the 3x3 box filter, U8 source -> 16-bit destination.
 *
 * Every output element is the plain sum of the 3x3 source neighbourhood
 * (no bias, no division). Rows and columns outside the tile are read from the
 * tile edge, so an edge of at least ksize / 2 is required.
 *
 * @param src  source tile, read as uint8_t.
 * @param dst  destination tile, written as uint16_t.
 * @return     XI_ERR_BADARG if a tile fails validation, XI_ERR_DATASIZE if the
 *             sizes differ, XI_ERR_KSIZE for an even kernel size, XI_ERR_EDGE
 *             if the source edge is too small, XI_ERR_OK otherwise.
 */
XI_ERR_TYPE xiBoxFilter_3x3_U8U16_ref(const xi_pTile src, xi_pTile dst)
{
    if (!xiTileIsValid_U8_ref(src)) return XI_ERR_BADARG;
    /* NOTE(review): dst is validated as U8 but written through a uint16_t
       pointer below - confirm whether a U16 validity check is intended. */
    if (!xiTileIsValid_U8_ref(dst)) return XI_ERR_BADARG;
    if (!xiTilesHaveSameSize_ref(src, dst)) return XI_ERR_DATASIZE;

    int ksize = 3;
    if ((ksize & 0x1) == 0) return XI_ERR_KSIZE;

    int N_div_2 = ksize / 2;
    if (XI_TILE_GET_EDGE_WIDTH(src) < N_div_2 || XI_TILE_GET_EDGE_HEIGHT(src) < N_div_2) return XI_ERR_EDGE;

    uint8_t  *psrc = (uint8_t*)(XI_TILE_GET_DATA_PTR(src));
    uint16_t *pdst = (uint16_t*)(XI_TILE_GET_DATA_PTR(dst));

    int sstride = XI_TILE_GET_PITCH(src);
    int dstride = XI_TILE_GET_PITCH(dst);
    int height  = XI_TILE_GET_HEIGHT(src);
    int width   = XI_TILE_GET_WIDTH(src);

    for (int i = 0; i < height; i++)
    {
        for (int j = 0; j < width; j++)
        {
            int sum = 0;

            /* Negative k / l reach into the tile edge around the data area. */
            for (int k = -N_div_2; k <= N_div_2; k++)
            {
                for (int l = -N_div_2; l <= N_div_2; l++)
                {
                    sum += psrc[(i + k) * sstride + (j + l)];
                }
            }

            pdst[i * dstride + j] = sum;
        }
    }

    return XI_ERR_OK;
}
/**
 * 3x3 box filter, U8 source tile -> U16 destination array (vector version).
 *
 * Each output element is the sum of the 3x3 source neighbourhood. The sum is
 * separated into a horizontal 3-tap sum per row (IVP_MULUS4T2N8XR8 with
 * coefficients 0x00010101) and a vertical sum of three such rows.
 *
 * The image is walked in vertical strips of 2 * XCHAL_IVPN_SIMD_WIDTH columns.
 * For each strip the horizontal sums of the two previous rows are kept in
 * pp0/pp1 (low N lanes) and qq0/qq1 (high N lanes) and rotated after every
 * output row. A last strip of at most N columns uses the low lanes only.
 *
 * @param src  U8 tile with an edge of at least 1 pixel.
 * @param dst  U16 array of the same size as src, not overlapping src.
 * @return     status produced by XI_ERROR_STATUS().
 */
XI_ERR_TYPE xiBoxFilter_3x3_U8U16(xi_tile const* src, xi_array const* dst)
{
    XI_ERROR_CHECKS()
    {
        XI_CHECK_TILE_U8(src);
        XI_CHECK_TILE_EDGE(src, 1);
        XI_CHECK_ARRAY_U16(dst);
        XI_CHECK_ARRAY_SIZE_EQ(src, dst);
        XI_CHECK_ARRAYS_ARE_NOT_OVERLAP(src, dst);
    }

    int32_t sstride = XI_TILE_GET_PITCH(src);
    int32_t dstride = XI_ARRAY_GET_PITCH(dst);
    int32_t width   = XI_TILE_GET_WIDTH(src);
    int32_t height  = XI_TILE_GET_HEIGHT(src);

    /* psrc points at (row +1, column -1): the bottom row of the first window. */
    xb_vec2Nx8U* psrc = OFFSET_PTR_2NX8U(XI_TILE_GET_DATA_PTR(src), 1, sstride, -1);
    xb_vecNx16*  pdst = (xb_vecNx16 *)XI_ARRAY_GET_DATA_PTR(dst);

    xb_vecNx16*  restrict rdst;
    xb_vec2Nx8U* restrict rsrc;

    valign a_st = IVP_ZALIGN();

    int32_t j = 0;

    /* Full strips: 2N output columns per iteration. */
    for (; j < (width - XCHAL_IVPN_SIMD_WIDTH); j += (2 * XCHAL_IVPN_SIMD_WIDTH))
    {
        xb_vecNx16 pp0, pp1;
        xb_vecNx16 qq0, qq1;

        { // row 0
            rsrc = OFFSET_PTR_2NX8U(psrc, -2, sstride, 0);
            valign a_load = IVP_LA2NX8U_PP (rsrc);
            xb_vec2Nx8U vsel0; IVP_LAV2NX8U_XP(vsel0, a_load, rsrc, width - j + 2);
            xb_vec2Nx8U vtail; IVP_LAV2NX8U_XP(vtail, a_load, rsrc, width - j + 2 - 2 * XCHAL_IVPN_SIMD_WIDTH);
            xb_vec2Nx24 w = IVP_MULUS4T2N8XR8(vtail, vsel0, 0x00010101);
            pp0 = IVP_CVT16U2NX24L(w);
            qq0 = IVP_CVT16U2NX24H(w);
        }

        { // row 1
            rsrc = OFFSET_PTR_2NX8U(psrc, -1, sstride, 0);
            valign a_load = IVP_LA2NX8U_PP (rsrc);
            xb_vec2Nx8U vsel0; IVP_LAV2NX8U_XP(vsel0, a_load, rsrc, width - j + 2);
            xb_vec2Nx8U vtail; IVP_LAV2NX8U_XP(vtail, a_load, rsrc, width - j + 2 - 2 * XCHAL_IVPN_SIMD_WIDTH);
            xb_vec2Nx24 w = IVP_MULUS4T2N8XR8(vtail, vsel0, 0x00010101);
            pp1 = IVP_CVT16U2NX24L(w);
            qq1 = IVP_CVT16U2NX24H(w);
        }

        rsrc = psrc;
        rdst = pdst;

        if((width == (2 * XCHAL_IVPN_SIMD_WIDTH)) && (sstride == ((2 * XCHAL_IVPN_SIMD_WIDTH) + 2)) && (height == (2 * XCHAL_IVPN_SIMD_WIDTH)))
        {
            /* Special case for a 2N x 2N tile whose pitch equals the 2N + 2
               bytes loaded per row: the alignment register is primed once and
               rsrc is not repositioned between rows.
               NOTE(review): relies on the two loads leaving rsrc exactly at
               the start of the next row - confirm. */
            valign a_load = IVP_LA2NX8U_PP (rsrc);
            for (int32_t i = 0; i < (2*XCHAL_IVPN_SIMD_WIDTH); i++)
            {
                xb_vec2Nx8U vsel0; IVP_LAV2NX8U_XP(vsel0, a_load, rsrc, 2* XCHAL_IVPN_SIMD_WIDTH);
                xb_vec2Nx8U vtail; IVP_LAV2NX8U_XP(vtail, a_load, rsrc, 2 );
                xb_vec2Nx24 w = IVP_MULUS4T2N8XR8(vtail, vsel0, 0x00010101);
                xb_vecNx16 pp2 = IVP_CVT16U2NX24L(w);
                xb_vecNx16 qq2 = IVP_CVT16U2NX24H(w);

                IVP_SAVNX16_XP(IVP_ADDNX16(IVP_ADDNX16(pp0, pp1), pp2), a_st, rdst, 2*(width - j));
                IVP_SAVNX16_XP(IVP_ADDNX16(IVP_ADDNX16(qq0, qq1), qq2), a_st, rdst, 2*(width - j - XCHAL_IVPN_SIMD_WIDTH));

                pp0 = pp1;
                qq0 = qq1;
                pp1 = pp2;
                qq1 = qq2;

                IVP_SAVNX16POS_FP(a_st, rdst);
                rdst = OFFSET_PTR_NX16 (rdst, 1, dstride - XT_MIN(2*XCHAL_IVPN_SIMD_WIDTH, width - j), 0);
            }
        }
        else
        {
            /* Generic strip: re-prime alignment and reposition both pointers
               at the start of the next row after every output row. */
            for (int32_t i = 0; i < height; i++)
            {
                valign a_load = IVP_LA2NX8U_PP (rsrc);
                xb_vec2Nx8U vsel0; IVP_LAV2NX8U_XP(vsel0, a_load, rsrc, width - j + 2);
                xb_vec2Nx8U vtail; IVP_LAV2NX8U_XP(vtail, a_load, rsrc, width - j + 2 - 2 * XCHAL_IVPN_SIMD_WIDTH);
                xb_vec2Nx24 w = IVP_MULUS4T2N8XR8(vtail, vsel0, 0x00010101);
                xb_vecNx16 pp2 = IVP_CVT16U2NX24L(w);
                xb_vecNx16 qq2 = IVP_CVT16U2NX24H(w);

                /* Byte counts: two bytes per 16-bit output element. */
                IVP_SAVNX16_XP(IVP_ADDNX16(IVP_ADDNX16(pp0, pp1), pp2), a_st, rdst, 2*(width - j));
                IVP_SAVNX16_XP(IVP_ADDNX16(IVP_ADDNX16(qq0, qq1), qq2), a_st, rdst, 2*(width - j - XCHAL_IVPN_SIMD_WIDTH));
                IVP_SAVNX16POS_FP(a_st, rdst);

                pp0 = pp1;
                qq0 = qq1;
                pp1 = pp2;
                qq1 = qq2;

                rsrc = OFFSET_PTR_2NX8U(rsrc, 1, sstride - XT_MIN(4*XCHAL_IVPN_SIMD_WIDTH, width - j + 2), 0);
                rdst = OFFSET_PTR_NX16 (rdst, 1, dstride - XT_MIN(2*XCHAL_IVPN_SIMD_WIDTH, width - j), 0);
            }
        }

        /* Advance to the next strip: one 2N-byte source vector, two N-lane
           16-bit destination vectors. */
        psrc += 1;
        pdst += 2;
    }

    /* Tail strip: the remaining columns (at most N), low lanes only. */
    if (j < width)
    {
        xb_vecNx16 pp0, pp1;

        { // row 0
            rsrc = OFFSET_PTR_2NX8U(psrc, -2, sstride, 0);
            valign a_load = IVP_LA2NX8U_PP (rsrc);
            xb_vec2Nx8U vsel0; IVP_LAV2NX8U_XP(vsel0, a_load, rsrc, width - j + 2);
            xb_vec2Nx24 w = IVP_MULUS4T2N8XR8(0, vsel0, 0x00010101);
            pp0 = IVP_CVT16U2NX24L(w);
        }

        { // row 1
            rsrc = OFFSET_PTR_2NX8U(psrc, -1, sstride, 0);
            valign a_load = IVP_LA2NX8U_PP (rsrc);
            xb_vec2Nx8U vsel0; IVP_LAV2NX8U_XP(vsel0, a_load, rsrc, width - j + 2);
            xb_vec2Nx24 w = IVP_MULUS4T2N8XR8(0, vsel0, 0x00010101);
            pp1 = IVP_CVT16U2NX24L(w);
        }

        rsrc = psrc;
        rdst = pdst;

        for (int32_t i = 0; i < height; i++)
        {
            /* Next-row pointers are taken before the load/store modify rsrc/rdst. */
            xb_vec2Nx8U* nsrc = OFFSET_PTR_2NX8U(rsrc, 1, sstride, 0);
            xb_vecNx16*  ndst = OFFSET_PTR_NX16(rdst, 1, dstride, 0);

            valign a_load = IVP_LA2NX8U_PP (rsrc);
            xb_vec2Nx8U vsel0; IVP_LAV2NX8U_XP(vsel0, a_load, rsrc, width - j + 2);
            rsrc = nsrc;

            xb_vec2Nx24 w = IVP_MULUS4T2N8XR8(0, vsel0, 0x00010101);
            xb_vecNx16 pp2 = IVP_CVT16U2NX24L(w);

            IVP_SAVNX16_XP(IVP_ADDNX16(IVP_ADDNX16(pp0, pp1), pp2), a_st, rdst, 2*(width - j));
            IVP_SAVNX16POS_FP(a_st, rdst);
            rdst = ndst;

            pp0 = pp1;
            pp1 = pp2;
        }
    }

    return XI_ERROR_STATUS();
}
上面是一个 DSP 接口及其对应的 Cmodel 实现。请参考其逻辑和实现方式,将下面这个 Cmodel 转成对应的 DSP 接口,要求 DSP 实现的输出与 Cmodel 逐 bit 一致。
/**
 * Scalar reference model of the 3x3 box filter, U8 source -> U8 destination.
 *
 * Every output pixel is (sum of the 3x3 neighbourhood + divisor / 2) / divisor
 * with divisor = ksize * ksize and integer division. Neighbours outside the
 * tile are read from the tile edge, so an edge of at least ksize / 2 is needed.
 *
 * @param src  source tile, read as uint8_t.
 * @param dst  destination tile, written as uint8_t.
 * @return     XI_ERR_BADARG if a tile fails validation, XI_ERR_DATASIZE if the
 *             sizes differ, XI_ERR_KSIZE for an even kernel size, XI_ERR_EDGE
 *             if the source edge is too small, XI_ERR_OK otherwise.
 */
XI_ERR_TYPE xiBoxFilter_3x3_U8_ref(const xi_pTile src, xi_pTile dst)
{
    if (!xiTileIsValid_U8_ref(src) || !xiTileIsValid_U8_ref(dst))
    {
        return XI_ERR_BADARG;
    }
    if (!xiTilesHaveSameSize_ref(src, dst))
    {
        return XI_ERR_DATASIZE;
    }

    int ksize = 3;
    if ((ksize & 0x1) == 0)
    {
        return XI_ERR_KSIZE;
    }

    int radius = ksize / 2;
    if (XI_TILE_GET_EDGE_WIDTH(src) < radius || XI_TILE_GET_EDGE_HEIGHT(src) < radius)
    {
        return XI_ERR_EDGE;
    }

    int area     = ksize * ksize;
    int rounding = area / 2;

    uint8_t *in  = (uint8_t*)(XI_TILE_GET_DATA_PTR(src));
    uint8_t *out = (uint8_t*)(XI_TILE_GET_DATA_PTR(dst));

    int in_pitch  = XI_TILE_GET_PITCH(src);
    int out_pitch = XI_TILE_GET_PITCH(dst);
    int rows      = XI_TILE_GET_HEIGHT(src);
    int cols      = XI_TILE_GET_WIDTH(src);

    for (int row = 0; row < rows; row++)
    {
        uint8_t *out_row = out + row * out_pitch;

        for (int col = 0; col < cols; col++)
        {
            /* Start from the rounding term so the final division rounds. */
            int acc = rounding;

            for (int dy = -radius; dy <= radius; dy++)
            {
                /* Centre of the window on row (row + dy); dx indexes around it. */
                const uint8_t *centre = in + (row + dy) * in_pitch + col;

                for (int dx = -radius; dx <= radius; dx++)
                {
                    acc += centre[dx];
                }
            }

            out_row[col] = acc / area;
        }
    }

    return XI_ERR_OK;
}
请参考上面的例子给出 接口 xiBoxFilter_3x3_U8_ref 对应的 DSP 实现