暴力匹配
- 总结
-
- 数据说明
- Python代码分析
- C++暴力实现
- [C++ CUDA](#c-cuda)
- 实验
总结
当匹配两组特征点时,SLAM系统一般会选择最近邻搜索等缩小匹配范围的方法。直接暴力匹配一般能得到全局最优的结果,但往往伴随着很多时间消耗。因此这里使用CUDA对暴力匹配做优化,使得计算时间可被SLAM系统接受。
数据说明
图1中特征点维度为(M, 64)
图2中特征点维度为(N, 64)
表示图1检测出M个特征点,图2中检测出N个特征点,每个特征点的描述子都是64维向量。
Python代码分析
① torch.bmm → 算相似度矩阵 sim = feats1 @ feats2^T [B, N, M]
② argmax(dim=-1) → 正向最近邻 nn12: 对 A 每行找 B 中最大 [B, N]
③ argmax(dim=-2) → 反向最近邻 nn21: 对 B 每行找 A 中最大 [B, M]
④ mutual check → nn21[nn12[i]] == i → 互为最近邻
⑤ threshold filter → cossim_max > min_cossim → 阈值过滤
python
@torch.inference_mode()
def batch_match(self, feats1, feats2, min_cossim = -1):
    """Mutual-nearest-neighbour matching for a batch of descriptor sets.

    Args:
        feats1: tensor of shape [B, N, D] — descriptors of image 1.
        feats2: tensor of shape [B, M, D] — descriptors of image 2.
        min_cossim: if > 0, drop matches whose best cosine similarity
            does not exceed this threshold.

    Returns:
        List of B tuples (idx0, idx1) of index tensors: idx0[k] in feats1
        matches idx1[k] in feats2, and the two are mutual nearest neighbours.
    """
    batch_size = len(feats1)
    # Full similarity matrix per batch item: [B, N, M].
    cossim = torch.bmm(feats1, feats2.permute(0, 2, 1))
    # Forward NN (per row of feats1) and backward NN (per row of feats2).
    match12 = cossim.argmax(dim=-1)
    match21 = cossim.permute(0, 2, 1).argmax(dim=-1)
    idx0 = torch.arange(match12.shape[1], device=match12.device)
    batched_matches = []
    for b in range(batch_size):
        # Mutual check: the backward NN of my forward NN is me.
        keep = match21[b][match12[b]] == idx0
        if min_cossim > 0:
            # Row-wise max equals the similarity of the forward NN.
            keep = keep & (cossim[b].max(dim=1).values > min_cossim)
        batched_matches.append((idx0[keep], match12[b][keep]))
    return batched_matches
C++暴力实现
暴力实现用for循环找最大值就好了。如果硬要优化,可以采用缓存中间值的方法加速,这样就不用反复计算sim值。
cpp
// Cosine similarity between two 64-dim descriptors.
// Returns 0 when either descriptor has a (near-)zero norm, so degenerate
// descriptors never win a nearest-neighbour comparison.
float BruteForceMatcher::ComputeCosSim(const float* desc0, const float* desc1) {
    float dot = 0.0f, sq0 = 0.0f, sq1 = 0.0f;
    for (int k = 0; k < 64; ++k) {
        const float a = desc0[k];
        const float b = desc1[k];
        dot += a * b;
        sq0 += a * a;
        sq1 += b * b;
    }
    if (sq0 < 1e-8f || sq1 < 1e-8f) {
        return 0.0f;
    }
    return dot / (std::sqrt(sq0) * std::sqrt(sq1));
}
// Brute-force mutual-nearest-neighbour matching between two descriptor sets.
//
// desc0/desc1: row-major [numX x 64] descriptor arrays.
// result: receives matched index pairs (i, j) and their cosine similarities.
//
// A pair (i, j) is kept when i's best match in desc1 is j AND j's best match
// in desc0 is i, and (if min_cossim_ > 0) the similarity exceeds min_cossim_.
void BruteForceMatcher::Match(const float* desc0, int num0,
                              const float* desc1, int num1,
                              MatchResult& result) {
    result.clear();
    if (num0 == 0 || num1 == 0) {
        return;
    }
    std::vector<int> match12(num0, -1);
    std::vector<float> sim12(num0, -1.0f);
    std::vector<int> match21(num1, -1);
    std::vector<float> sim21(num1, -1.0f);
    // Single fused pass over the similarity matrix: each cosine value is
    // computed exactly once and used to update both the forward (0->1) and
    // the backward (1->0) nearest neighbour. The original two-pass version
    // recomputed every similarity twice, doubling the work.
    // Tie-breaking is identical (strict '>' keeps the first maximum, and
    // both i and j are scanned in ascending order).
    for (int i = 0; i < num0; ++i) {
        const float* d0 = &desc0[i * 64];
        for (int j = 0; j < num1; ++j) {
            const float sim = ComputeCosSim(d0, &desc1[j * 64]);
            if (sim > sim12[i]) {
                sim12[i] = sim;
                match12[i] = j;
            }
            if (sim > sim21[j]) {
                sim21[j] = sim;
                match21[j] = i;
            }
        }
    }
    // Keep only mutual nearest neighbours (optionally thresholded).
    for (int i = 0; i < num0; ++i) {
        int j = match12[i];
        if (j >= 0 && match21[j] == i) {
            float sim = sim12[i];
            if (min_cossim_ <= 0 || sim > min_cossim_) {
                result.matches.push_back({i, j});
                result.scores.push_back(sim);
            }
        }
    }
    std::cout << "暴力匹配完成: " << num0 << " vs " << num1
              << " -> 找到 " << result.size() << " 对匹配" << std::endl;
}
C++ CUDA
python中的三步可合并成一步
cpp
① bmm → sim [N,M] ┐
② argmax(sim, dim=-1) → nn12 ├→ find_nn_kernel: 边算内积边找 max
③ argmax(sim, dim=-2) → nn21 ┘ 不存 sim 矩阵,直接输出 nn + score
Kernel设计
- Step 1: 加载 A[row] → shared memory
64 个 float,被整个 block 反复使用 - Step 2: 每个线程扫描 ~8 行 B,float4 向量化内积
边算边比较,维护局部最优 (val, idx) - Step 3: Warp 内 shuffle 归约
32 → 1,5 轮,纯寄存器操作 - Step 4: 跨 warp 归约
4 个 warp → 1 个结果,写回 global memory
kernel实现
cpp
// One block per query row: computes argmax_j dot(A[row], B[j]) and writes the
// winning index (and optionally its score) to global memory.
//
// Launch: grid = (N, 1, 1), block = (BLOCK_T, 1, 1), no dynamic shared memory.
// Preconditions:
//   - BLOCK_T is a multiple of 32 (whole warps), D is a multiple of 4
//     (float4 loads) -- enforced by static_assert below.
//   - B rows must be 16-byte aligned for the float4 reinterpret; with D a
//     multiple of 4 this holds whenever B itself is 16-byte aligned
//     (cudaMalloc guarantees 256-byte alignment).
//   - The dot product equals cosine similarity only if descriptors are
//     L2-normalized -- assumed here, TODO confirm against the producer of A/B.
// Uses __shfl_down_sync: requires SM30+, masks mandatory on Volta+.
template<int BLOCK_T, int D>
__global__ void find_nn_kernel(
    const float* __restrict__ A,
    const float* __restrict__ B,
    const int N,
    const int M,
    int* __restrict__ best_idx,
    float* __restrict__ best_sim
) {
    static_assert(BLOCK_T % 32 == 0, "BLOCK_T must be a whole number of warps");
    static_assert(D % 4 == 0, "D must be a multiple of 4 for float4 loads");
    constexpr int NUM_WARPS = BLOCK_T / 32;

    const int row = blockIdx.x;
    if (row >= N) return;
    const int tid = threadIdx.x;

    // --- 1) Stage A[row, :] in shared memory; it is re-read by every thread
    // of the block. __align__(16) makes the float4 reinterpret below legal
    // (a plain float array is only guaranteed 4-byte alignment).
    __shared__ __align__(16) float query[D];
    for (int i = tid; i < D; i += blockDim.x) {
        query[i] = A[row * D + i];
    }
    __syncthreads();

    // --- 2) Each thread scans a strided subset of B rows with float4
    // vectorized dot products, keeping a local (score, index) best.
    float local_best_sim = -1e20f;
    int local_best_idx = -1;
    const float4* query4 = reinterpret_cast<const float4*>(query);
    for (int b_row = tid; b_row < M; b_row += BLOCK_T) {
        const float4* B_row4 = reinterpret_cast<const float4*>(B + b_row * D);
        float dot = 0.0f;
#pragma unroll
        for (int d = 0; d < D / 4; ++d) {
            float4 q = query4[d];
            float4 b = B_row4[d];
            dot += q.x * b.x + q.y * b.y + q.z * b.z + q.w * b.w;
        }
        if (dot > local_best_sim) {
            local_best_sim = dot;
            local_best_idx = b_row;
        }
    }

    // --- 3) Warp-level reduction: 32 -> 1 in 5 shuffle rounds. All lanes of
    // every warp reach this point, so the full mask is correct.
#pragma unroll
    for (int offset = warpSize / 2; offset > 0; offset /= 2) {
        float other_sim = __shfl_down_sync(0xffffffff, local_best_sim, offset);
        int other_idx = __shfl_down_sync(0xffffffff, local_best_idx, offset);
        if (other_sim > local_best_sim) {
            local_best_sim = other_sim;
            local_best_idx = other_idx;
        }
    }

    // --- 4) Cross-warp reduction: each warp's lane 0 publishes its winner,
    // then the first NUM_WARPS threads reduce them.
    __shared__ float warp_best_sim[NUM_WARPS];
    __shared__ int warp_best_idx[NUM_WARPS];
    if (tid % warpSize == 0) {
        warp_best_sim[tid / warpSize] = local_best_sim;
        warp_best_idx[tid / warpSize] = local_best_idx;
    }
    __syncthreads();
    if (tid < NUM_WARPS) {
        local_best_sim = warp_best_sim[tid];
        local_best_idx = warp_best_idx[tid];
        // Participant mask derived from NUM_WARPS. The original hardcoded
        // 0xf, which is only correct when BLOCK_T == 128 (4 warps) and
        // silently breaks for any other template instantiation.
        constexpr unsigned mask =
            (NUM_WARPS >= 32) ? 0xffffffffu : ((1u << NUM_WARPS) - 1u);
#pragma unroll
        for (int offset = NUM_WARPS / 2; offset > 0; offset /= 2) {
            float other_sim = __shfl_down_sync(mask, local_best_sim, offset);
            int other_idx = __shfl_down_sync(mask, local_best_idx, offset);
            if (other_sim > local_best_sim) {
                local_best_sim = other_sim;
                local_best_idx = other_idx;
            }
        }
        if (tid == 0) {
            best_idx[row] = local_best_idx;
            if (best_sim != nullptr) best_sim[row] = local_best_sim;
        }
    }
}
// 包装函数: 在GPU上执行最近邻搜索
// Host wrapper: launches find_nn_kernel with one block per row of A on the
// given stream (asynchronous; caller synchronizes the stream before reading
// d_best_idx / d_best_sim).
//
// d_A: device ptr, [N x 64] query descriptors.
// d_B: device ptr, [M x 64] candidate descriptors.
// d_best_idx / d_best_sim: device ptrs, N elements each (d_best_sim may be
// null inside the kernel contract).
void LaunchFindNNKernel(
    const float* d_A, const float* d_B,
    int N, int M,
    int* d_best_idx, float* d_best_sim,
    cudaStream_t stream
) {
    // Guard: dim3 grid(0) is an invalid launch configuration, so an empty
    // input must return early instead of launching.
    if (N <= 0 || M <= 0) {
        return;
    }
    const int BLOCK_T = 128;  // 4 warps; matches the kernel's reduction layout
    const int D = 64;         // descriptor dimension baked into the template
    dim3 grid(N);
    dim3 block(BLOCK_T);
    find_nn_kernel<BLOCK_T, D><<<grid, block, 0, stream>>>(
        d_A, d_B, N, M, d_best_idx, d_best_sim
    );
}
实验
可以看出基本是够用的了:普遍的稀疏SLAM系统单帧匹配的特征点数也就1024个左右,即使使用P620这样的入门显卡也能跑到3.958ms。
| 特征点数量 | 描述子维度 | 匹配对数 | CPU 时间 (ms) | GPU 时间 (ms) | 加速比 |
|---|---|---|---|---|---|
| 512 × 512 | 64 | 246 | 22.677 | 1.423 | 15.94× |
| 1024 × 1024 | 64 | 499 | 89.614 | 3.958 | 22.64× |
| 2048 × 2048 | 64 | 1029 | 354.243 | 17.776 | 19.93× |
| 4096 × 4096 | 64 | 2049 | 1418.830 | 76.999 | 18.43× |