Xfeat部署系列-1-暴力匹配加速!

暴力匹配

总结

当匹配两幅图像的特征点时,SLAM系统一般会采用最近邻搜索等缩小匹配范围的方法;直接暴力匹配通常是全局最优的,但往往伴随着大量的时间消耗。因此这里使用CUDA对暴力匹配做优化,使计算时间可被SLAM系统接受

数据说明

图1中特征点维度为(N, 64)

图2中特征点维度为(M, 64)

表示图1检测出N个特征点,图2中检测出M个特征点,我们知道N与M通常并不相等

Python代码分析

① torch.bmm → 算相似度矩阵 sim = feats1 @ feats2^T [B, N, M]

② argmax(dim=-1) → 正向最近邻 nn12: 对 A 每行找 B 中最大 [B, N]

③ argmax(dim=-2) → 反向最近邻 nn21: 对 B 每行找 A 中最大 [B, M]

④ mutual check → nn21[nn12[i]] == i → 互为最近邻

⑤ threshold filter → cossim_max > min_cossim → 阈值过滤

python 复制代码
@torch.inference_mode()
def batch_match(self, feats1, feats2, min_cossim = -1):
	"""Mutual-nearest-neighbour matching over a batch of descriptor sets.

	Args:
		feats1: (B, N, D) descriptor tensor for the first image set.
		feats2: (B, M, D) descriptor tensor for the second image set.
		min_cossim: when > 0, keep only matches whose best cosine
			similarity strictly exceeds this threshold; disabled otherwise.

	Returns:
		List of B tuples ``(idx0, idx1)`` — index tensors of mutually
		nearest descriptors in feats1 / feats2 respectively.
	"""
	num_batches = len(feats1)
	# Similarity matrix, shape (B, N, M).
	cossim = torch.bmm(feats1, feats2.permute(0, 2, 1))
	# Forward NN: best feats2 column for every feats1 row.
	match12 = torch.argmax(cossim, dim=-1)
	# Backward NN: argmax over dim 1 equals argmax of the transposed matrix.
	match21 = torch.argmax(cossim, dim=1)

	row_ids = torch.arange(match12.shape[1], device=match12.device)

	batched_matches = []
	for batch_idx in range(num_batches):
		# Mutual check: i -> j and j -> i must agree.
		mutual = match21[batch_idx][match12[batch_idx]] == row_ids

		if min_cossim > 0:
			best_sim = cossim[batch_idx].max(dim=1).values
			keep = mutual & (best_sim > min_cossim)
		else:
			keep = mutual

		batched_matches.append((row_ids[keep], match12[batch_idx][keep]))

	return batched_matches

C++暴力实现

暴力实现用for循环找最大值就好了。如果硬说可以优化的话,倒是可以采用缓存中间值的方法加速:正反两个方向的匹配可以共享同一批sim值,这样就不用把它们完全重复地计算两遍

cpp 复制代码
// Cosine similarity between two 64-dimensional descriptors.
// Returns 0 when either vector is numerically zero (avoids division by
// zero); otherwise dot(a, b) / (|a| * |b|).
float BruteForceMatcher::ComputeCosSim(const float* desc0, const float* desc1) {
    float dot_product = 0.0f;
    float sq_len0 = 0.0f;
    float sq_len1 = 0.0f;

    for (int d = 0; d < 64; ++d) {
        const float a = desc0[d];
        const float b = desc1[d];
        dot_product += a * b;
        sq_len0 += a * a;
        sq_len1 += b * b;
    }

    // Guard against (near-)zero-length descriptors.
    if (sq_len0 < 1e-8f || sq_len1 < 1e-8f) {
        return 0.0f;
    }

    return dot_product / (std::sqrt(sq_len0) * std::sqrt(sq_len1));
}

// Mutual-nearest-neighbour brute-force matching between two descriptor sets.
//
// desc0: num0 x 64 row-major descriptors; desc1: num1 x 64.
// A pair (i, j) is kept when j is i's best match AND i is j's best match,
// and (when min_cossim_ > 0) their similarity strictly exceeds min_cossim_.
//
// Fix over the naive version: each ComputeCosSim(i, j) is now evaluated
// exactly once and reused for BOTH matching directions, halving the
// O(num0 * num1 * 64) work (the "memoise the intermediate sims" idea).
void BruteForceMatcher::Match(const float* desc0, int num0,
                               const float* desc1, int num1,
                               MatchResult& result) {
    result.clear();

    if (num0 == 0 || num1 == 0) {
        return;
    }

    // Best match (index + similarity) in each direction; -1 means "none".
    std::vector<int> match12(num0, -1);
    std::vector<float> sim12(num0, -1.0f);
    std::vector<int> match21(num1, -1);
    std::vector<float> sim21(num1, -1.0f);

    // Single pass over all (i, j) pairs: fold each similarity into both
    // the 0->1 and 1->0 nearest-neighbour searches. Scanning i and j in
    // ascending order with a strict '>' keeps the same tie-breaking
    // ("first maximum wins") as the two independent passes it replaces.
    for (int i = 0; i < num0; ++i) {
        for (int j = 0; j < num1; ++j) {
            const float sim = ComputeCosSim(&desc0[i * 64], &desc1[j * 64]);
            if (sim > sim12[i]) {
                sim12[i] = sim;
                match12[i] = j;
            }
            if (sim > sim21[j]) {
                sim21[j] = sim;
                match21[j] = i;
            }
        }
    }

    // Keep mutual nearest neighbours that pass the similarity threshold.
    for (int i = 0; i < num0; ++i) {
        const int j = match12[i];
        if (j >= 0 && match21[j] == i) {
            const float sim = sim12[i];
            if (min_cossim_ <= 0 || sim > min_cossim_) {
                result.matches.push_back({i, j});
                result.scores.push_back(sim);
            }
        }
    }

    std::cout << "暴力匹配完成: " << num0 << " vs " << num1 
              << " -> 找到 " << result.size() << " 对匹配" << std::endl;
}

C++ CUDA

python中的三步可合并成一步

cpp 复制代码
① bmm → sim [N,M]                  ┐
② argmax(sim, dim=-1) → nn12       ├→ find_nn_kernel: 边算内积边找 max
③ argmax(sim, dim=-2) → nn21       ┘   不存 sim 矩阵,直接输出 nn + score

Kernel设计

  • Step 1: 加载 A[row] → shared memory
    64 个 float,被整个 block 反复使用
  • Step 2: 每个线程扫描 ~8 行 B,float4 向量化内积
    边算边比较,维护局部最优 (val, idx)
  • Step 3: Warp 内 shuffle 归约
    32 → 1,5 轮,纯寄存器操作
  • Step 4: 跨 warp 归约
    4 个 warp → 1 个结果,写回 global memory

kernel实现

cpp 复制代码
// Fused "bmm + argmax" nearest-neighbour kernel.
//
// For each row of A (query set, N x D) find the row of B (M x D) with the
// maximum dot product, without materialising the N x M similarity matrix.
// NOTE: this scores by raw dot product — it equals cosine similarity only
// if the descriptors are L2-normalised (assumed; confirm for the caller).
//
// Launch config: grid = (N), block = (BLOCK_T) — one block per query row.
// Preconditions (enforced below): BLOCK_T a multiple of 32 with a
// power-of-two warp count; D a multiple of 4 (float4 loads). A and B must
// be 16-byte aligned (cudaMalloc guarantees this); each B row starts at a
// multiple of D floats, so D % 4 == 0 keeps rows 16-byte aligned too.
//
// Outputs: best_idx[row] = argmax_j dot(A[row], B[j]) (-1 when M == 0);
//          best_sim[row] = the winning dot product (skipped if nullptr).
template<int BLOCK_T, int D>
__global__ void find_nn_kernel(
    const float* __restrict__ A,
    const float* __restrict__ B,
    const int N,
    const int M,
    int*   __restrict__ best_idx,
    float* __restrict__ best_sim
) {
    static_assert(BLOCK_T % 32 == 0, "block must be a whole number of warps");
    static_assert(D % 4 == 0, "D must be divisible by 4 for float4 loads");
    constexpr int NUM_WARPS = BLOCK_T / 32;
    static_assert((NUM_WARPS & (NUM_WARPS - 1)) == 0,
                  "warp count must be a power of two for the tree reduction");
    // Mask covering exactly the NUM_WARPS threads (all in warp 0) that run
    // the final reduction. The original hardcoded 0xf, which is only
    // correct when BLOCK_T == 128 despite BLOCK_T being a template param.
    constexpr unsigned FINAL_MASK =
        (NUM_WARPS >= 32) ? 0xffffffffu : ((1u << NUM_WARPS) - 1u);

    const int row = blockIdx.x;
    if (row >= N) return;
    const int tid = threadIdx.x;

    // --- 1) Stage A[row, :] in shared memory (reused by every thread).
    //        Explicit 16-byte alignment makes the float4 view below safe. ---
    __shared__ alignas(16) float query[D];
    for (int i = tid; i < D; i += blockDim.x) {
        query[i] = A[row * D + i];
    }
    __syncthreads();

    // --- 2) Each thread scans a strided subset of B's rows, keeping a
    //        local best (value, index); float4 widens the global loads. ---
    float local_best_sim = -1e20f;
    int   local_best_idx = -1;
    const float4* query4 = reinterpret_cast<const float4*>(query);
    for (int b_row = tid; b_row < M; b_row += BLOCK_T) {
        const float4* B_row4 = reinterpret_cast<const float4*>(B + b_row * D);
        float dot = 0.0f;
        #pragma unroll
        for (int d = 0; d < D / 4; ++d) {
            float4 q = query4[d];
            float4 b = B_row4[d];
            dot += q.x * b.x + q.y * b.y + q.z * b.z + q.w * b.w;
        }
        if (dot > local_best_sim) {
            local_best_sim = dot;
            local_best_idx = b_row;
        }
    }

    // --- 3) Warp-level shuffle reduction (registers only). Compile-time
    //        start of 16 (warpSize is a runtime value and would block
    //        full unrolling); 32-lane warps asserted above. ---
    #pragma unroll
    for (int offset = 16; offset > 0; offset >>= 1) {
        float other_sim = __shfl_down_sync(0xffffffff, local_best_sim, offset);
        int   other_idx = __shfl_down_sync(0xffffffff, local_best_idx, offset);
        if (other_sim > local_best_sim) {
            local_best_sim = other_sim;
            local_best_idx = other_idx;
        }
    }

    // --- 4) Cross-warp reduction: lane 0 of each warp publishes its
    //        winner, then the first NUM_WARPS threads reduce those. ---
    __shared__ float warp_best_sim[NUM_WARPS];
    __shared__ int   warp_best_idx[NUM_WARPS];
    if ((tid & 31) == 0) {
        warp_best_sim[tid >> 5] = local_best_sim;
        warp_best_idx[tid >> 5] = local_best_idx;
    }
    __syncthreads();
    if (tid < NUM_WARPS) {
        local_best_sim = warp_best_sim[tid];
        local_best_idx = warp_best_idx[tid];
        #pragma unroll
        for (int offset = NUM_WARPS / 2; offset > 0; offset >>= 1) {
            float other_sim = __shfl_down_sync(FINAL_MASK, local_best_sim, offset);
            int   other_idx = __shfl_down_sync(FINAL_MASK, local_best_idx, offset);
            if (other_sim > local_best_sim) {
                local_best_sim = other_sim;
                local_best_idx = other_idx;
            }
        }
        if (tid == 0) {
            best_idx[row] = local_best_idx;
            if (best_sim != nullptr) best_sim[row] = local_best_sim;
        }
    }
}

// Host wrapper: launches find_nn_kernel on `stream` (asynchronous w.r.t.
// the host).
//
// d_A: N x 64 query descriptors (device); d_B: M x 64 reference
// descriptors (device). Writes d_best_idx[N] and, when non-null,
// d_best_sim[N].
void LaunchFindNNKernel(
    const float* d_A, const float* d_B,
    int N, int M,
    int* d_best_idx, float* d_best_sim,
    cudaStream_t stream
) {
    constexpr int BLOCK_T = 128;   // 4 warps per block
    constexpr int D = 64;          // descriptor dimension

    // A zero-sized grid is an invalid launch configuration; nothing to do.
    if (N <= 0) {
        return;
    }

    dim3 grid(N);
    dim3 block(BLOCK_T);
    
    find_nn_kernel<BLOCK_T, D><<<grid, block, 0, stream>>>(
        d_A, d_B, N, M, d_best_idx, d_best_sim
    );

    // Kernel launches do not return errors directly — surface bad launch
    // configurations immediately instead of failing at the next sync.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        std::cerr << "find_nn_kernel launch failed: "
                  << cudaGetErrorString(err) << std::endl;
    }
}

实验

可以看出基本是够用的了,因为普遍稀疏的SLAM系统单帧匹配的特征点个数也就1024个左右,使用入门级显卡P620都能跑到3.958ms

特征点数量 描述子维度 匹配对数 CPU 时间 (ms) GPU 时间 (ms) 加速比
512 × 512 64 246 22.677 1.423 15.94×
1024 × 1024 64 499 89.614 3.958 22.64×
2048 × 2048 64 1029 354.243 17.776 19.93×
4096 × 4096 64 2049 1418.830 76.999 18.43×
相关推荐
m0_475064501 小时前
Spring AI文档切片
java·人工智能·spring
lI-_-Il1 小时前
赤拳配音 v1.0.3 解锁VIP版:自媒体创作者的AI配音利器
人工智能·媒体
蓝耘智算1 小时前
Token经济学:读懂AI时代的“新石油”
大数据·人工智能·ai·token·蓝耘
蔡俊锋1 小时前
AI进化简史:从1956到AGI的奇妙旅程
人工智能·agi·ai进化简史·ai历史·agi历史
东离与糖宝2 小时前
异常检测基础:AI 识别风险与故障的方法
人工智能
xiaotao1312 小时前
阶段零:评估指标
人工智能·机器学习
塔望品牌咨询2 小时前
品牌定位重做的决策模型:3个判断信号
人工智能·消费品·消费战略·塔望消费战略·消费品战略
Dfreedom.2 小时前
异常检测算法详解:从“何为异常”到“如何发现”
人工智能·算法·机器学习·聚类·异常检测
AI效率工坊2 小时前
【Python实战】自动化生成PPT演示文稿:python-pptx+AI内容生成+图表嵌入
人工智能·python·自动化