双目视觉深度估计与 3D 点云重建

1. 双目视觉原理

复制代码

双目测距原理：
├── 两台相机平行放置，间距为基线 B
├── 同一物体在左右图像中的像素差为视差 d
├── 深度 Z = (B × f) / d
└── f 为焦距（像素单位）

  左相机        右相机
    │            │
    │←── B ────→│
    │            │
    ▼            ▼
  ┌───┐        ┌───┐
  │ L │        │ R │
  └───┘        └───┘
    │            │
    │     Z      │
    │←──────────→│
    ▼            ▼
  ┌─────────────┐
  │    物体      │
  └─────────────┘

2. 双目相机标定

python 复制代码

#!/usr/bin/env python3
"""stereo_calibration.py - 双目标定"""
import cv2
import numpy as np

def stereo_calibrate(left_images, right_images, pattern_size=(9, 6)):
    """双目标定"""
    objp = np.zeros((pattern_size[0] * pattern_size[1], 3), np.float32)
    objp[:, :2] = np.mgrid[0:pattern_size[0], 0:pattern_size[1]].T.reshape(-1, 2)
    objp *= 25  # 25mm
    
    obj_points = []
    img_points_l = []
    img_points_r = []
    
    for left_path, right_path in zip(left_images, right_images):
        img_l = cv2.imread(left_path)
        img_r = cv2.imread(right_path)
        gray_l = cv2.cvtColor(img_l, cv2.COLOR_BGR2GRAY)
        gray_r = cv2.cvtColor(img_r, cv2.COLOR_BGR2GRAY)
        
        ret_l, corners_l = cv2.findChessboardCorners(gray_l, pattern_size)
        ret_r, corners_r = cv2.findChessboardCorners(gray_r, pattern_size)
        
        if ret_l and ret_r:
            corners_l = cv2.cornerSubPix(gray_l, corners_l, (11, 11), (-1, -1),
                (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 30, 0.001))
            corners_r = cv2.cornerSubPix(gray_r, corners_r, (11, 11), (-1, -1),
                (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 30, 0.001))
            
            obj_points.append(objp)
            img_points_l.append(corners_l)
            img_points_r.append(corners_r)
    
    # 单目标定
    ret_l, mtx_l, dist_l, _, _ = cv2.calibrateCamera(
        obj_points, img_points_l, gray_l.shape[::-1], None, None)
    ret_r, mtx_r, dist_r, _, _ = cv2.calibrateCamera(
        obj_points, img_points_r, gray_r.shape[::-1], None, None)
    
    # 双目标定
    flags = cv2.CALIB_FIX_INTRINSIC
    ret, mtx_l, dist_l, mtx_r, dist_r, R, T, E, F = cv2.stereoCalibrate(
        obj_points, img_points_l, img_points_r,
        mtx_l, dist_l, mtx_r, dist_r,
        gray_l.shape[::-1], flags=flags
    )
    
    # 立体校正
    R1, R2, P1, P2, Q, roi1, roi2 = cv2.stereoRectify(
        mtx_l, dist_l, mtx_r, dist_r,
        gray_l.shape[::-1], R, T,
        alpha=0, newImageSize=gray_l.shape[::-1]
    )
    
    # 计算映射表
    map1_l, map2_l = cv2.initUndistortRectifyMap(
        mtx_l, dist_l, R1, P1, gray_l.shape[::-1], cv2.CV_32FC1)
    map1_r, map2_r = cv2.initUndistortRectifyMap(
        mtx_r, dist_r, R2, P2, gray_r.shape[::-1], cv2.CV_32FC1)
    
    print(f"重投影误差: {ret:.4f}")
    print(f"基线距离: {np.linalg.norm(T):.2f} mm")
    
    return {
        'camera_matrix_l': mtx_l, 'dist_l': dist_l,
        'camera_matrix_r': mtx_r, 'dist_r': dist_r,
        'R': R, 'T': T, 'R1': R1, 'R2': R2,
        'P1': P1, 'P2': P2, 'Q': Q,
        'map1_l': map1_l, 'map2_l': map2_l,
        'map1_r': map1_r, 'map2_r': map2_r,
    }

if __name__ == "__main__":
    left_imgs = [f"calib/left/{i}.jpg" for i in range(20)]
    right_imgs = [f"calib/right/{i}.jpg" for i in range(20)]
    params = stereo_calibrate(left_imgs, right_imgs)
    np.savez("stereo_calibration.npz", **params)

3. 立体匹配（SGBM）

python 复制代码

#!/usr/bin/env python3
"""stereo_sgbm.py - SGBM 立体匹配"""
import cv2
import numpy as np

class StereoDepth:
    """双目深度估计"""
    
    def __init__(self, calib_params):
        self.map1_l = calib_params['map1_l']
        self.map2_l = calib_params['map2_l']
        self.map1_r = calib_params['map1_r']
        self.map2_r = calib_params['map2_r']
        self.Q = calib_params['Q']
        self.baseline = np.linalg.norm(calib_params['T'])
        self.fx = calib_params['P1'][0, 0]
        
        # SGBM 参数
        self.stereo = cv2.StereoSGBM_create(
            minDisparity=0,
            numDisparities=128,  # 必须是 16 的倍数
            blockSize=5,
            P1=8 * 3 * 5**2,
            P2=32 * 3 * 5**2,
            disp12MaxDiff=1,
            uniquenessRatio=10,
            speckleWindowSize=100,
            speckleRange=32,
            preFilterCap=63,
            mode=cv2.STEREO_SGBM_MODE_SGBM_3WAY,
        )
    
    def compute_depth(self, left_img, right_img):
        """计算深度图"""
        # 校正
        rect_l = cv2.remap(left_img, self.map1_l, self.map2_l, cv2.INTER_LINEAR)
        rect_r = cv2.remap(right_img, self.map1_r, self.map2_r, cv2.INTER_LINEAR)
        
        # 转灰度
        gray_l = cv2.cvtColor(rect_l, cv2.COLOR_BGR2GRAY)
        gray_r = cv2.cvtColor(rect_r, cv2.COLOR_BGR2GRAY)
        
        # 计算视差
        disparity = self.stereo.compute(gray_l, gray_r).astype(np.float32) / 16.0
        
        # 视差转深度
        depth = np.zeros_like(disparity)
        valid = disparity > 0
        depth[valid] = (self.fx * self.baseline) / disparity[valid]
        
        return disparity, depth
    
    def compute_3d_points(self, disparity):
        """视差图转 3D 点云"""
        points_3d = cv2.reprojectImageTo3D(disparity, self.Q)
        return points_3d

if __name__ == "__main__":
    calib = np.load("stereo_calibration.npz", allow_pickle=True)
    stereo = StereoDepth(calib)
    
    left = cv2.imread("left.jpg")
    right = cv2.imread("right.jpg")
    
    disparity, depth = stereo.compute_depth(left, right)
    
    # 可视化深度图
    depth_vis = cv2.normalize(depth, None, 0, 255, cv2.NORM_MINMAX).astype(np.uint8)
    depth_color = cv2.applyColorMap(depth_vis, cv2.COLORMAP_JET)
    cv2.imwrite("depth.jpg", depth_color)
    
    # 测量特定点距离
    h, w = depth.shape
    center_depth = depth[h//2, w//2]
    print(f"画面中心距离: {center_depth:.2f} mm")

4. 深度学习深度估计

python 复制代码

#!/usr/bin/env python3
"""dl_depth.py - 深度学习单目深度估计"""
import cv2
import numpy as np
import torch

class DNNDepthEstimator:
    """深度学习深度估计（Depth Anything V2）"""
    
    def __init__(self, model_path):
        # 使用 OpenCV DNN 加载
        self.net = cv2.dnn.readNetFromONNX(model_path)
        self.net.setPreferableBackend(cv2.dnn.DNN_BACKEND_CUDA)
        self.net.setPreferableTarget(cv2.dnn.DNN_TARGET_CUDA)
    
    def estimate(self, image, input_size=(518, 518)):
        """估计深度图"""
        h, w = image.shape[:2]
        
        # 预处理
        blob = cv2.dnn.blobFromImage(
            image, 1/255.0, input_size,
            swapRB=True, crop=False
        )
        
        # 推理
        self.net.setInput(blob)
        depth = self.net.forward()
        
        # 后处理
        depth = depth[0, 0]  # [H, W]
        depth = cv2.resize(depth, (w, h))
        
        # 归一化到相对深度
        depth = (depth - depth.min()) / (depth.max() - depth.min())
        
        return depth

if __name__ == "__main__":
    estimator = DNNDepthEstimator("depth_anything_v2_vitb.onnx")
    
    image = cv2.imread("test.jpg")
    depth = estimator.estimate(image)
    
    # 可视化
    depth_vis = (depth * 255).astype(np.uint8)
    depth_color = cv2.applyColorMap(depth_vis, cv2.COLORMAP_INFERNO)
    cv2.imwrite("depth_dl.jpg", depth_color)

5. 3D 点云处理

python 复制代码

#!/usr/bin/env python3
"""pointcloud.py - 3D 点云处理"""
import numpy as np
import open3d as o3d

def depth_to_pointcloud(depth, camera_matrix, rgb=None):
    """深度图转 3D 点云"""
    fx = camera_matrix[0, 0]
    fy = camera_matrix[1, 1]
    cx = camera_matrix[0, 2]
    cy = camera_matrix[1, 2]
    
    h, w = depth.shape
    u, v = np.meshgrid(np.arange(w), np.arange(h))
    
    z = depth
    x = (u - cx) * z / fx
    y = (v - cy) * z / fy
    
    points = np.stack([x, y, z], axis=-1).reshape(-1, 3)
    
    # 过滤无效点
    valid = (z.reshape(-1) > 0) & (z.reshape(-1) < 10000)
    points = points[valid]
    
    # 创建点云
    pcd = o3d.geometry.PointCloud()
    pcd.points = o3d.utility.Vector3dVector(points)
    
    if rgb is not None:
        colors = rgb.reshape(-1, 3)[valid] / 255.0
        pcd.colors = o3d.utility.Vector3dVector(colors)
    
    return pcd

def measure_volume(pcd):
    """测量点云体积"""
    # 计算边界框
    bbox = pcd.get_axis_aligned_bounding_box()
    extent = bbox.get_extent()
    
    volume = extent[0] * extent[1] * extent[2]
    
    return {
        'width': extent[0],
        'height': extent[1],
        'depth': extent[2],
        'volume': volume,
        'bbox': bbox,
    }

总结

方法	精度	实时性	成本	适用场景
双目 SGBM	高	15 FPS	中	室内/室外
深度相机	高	30 FPS	中	室内
深度学习	中	10 FPS	低	通用
LiDAR	极高	10 Hz	高	自动驾驶