3D 视觉前沿:NeRF、3D Gaussian Splatting 与点云处理

3D 视觉前沿:NeRF、3D Gaussian Splatting 与点云处理

1. 引言

3D 视觉是计算机视觉的前沿领域。从 2D 图像重建 3D 场景,传统方法依赖复杂的多视图几何;而 NeRF 和 3D Gaussian Splatting(3DGS)借助可微渲染和基于梯度的优化(NeRF 用神经网络表示场景,3DGS 用显式的高斯基元表示场景),实现了高质量的 3D 重建和新视角合成。

技术演进:

复制代码
多视图立体 (MVS) → NeRF (2020) → Instant-NGP (2022) → 3DGS (2023)
  数学方法           隐式表示        快速训练              显式表示
  慢速               小时级训练       秒~分钟级训练         实时渲染

2. NeRF 原理

2.1 核心思想

复制代码
NeRF 将场景表示为一个连续的隐式函数:
  F_Θ: (x, y, z, θ, φ) → (r, g, b, σ),其中 Θ 为网络参数

输入:3D 坐标 (x,y,z) + 观察方向 (θ,φ)
输出:颜色 (r,g,b) + 密度 (σ)

通过体渲染积分得到像素颜色:
  C(r) = ∫_{t_n}^{t_f} T(t) · σ(r(t)) · c(r(t), d) dt

其中 T(t) = exp(-∫_{t_n}^{t} σ(r(s)) ds) 是从近端 t_n 到 t 的累积透射率,t_n、t_f 分别为射线的近、远边界

2.2 NeRF 实现

python 复制代码
import torch
import torch.nn as nn

class NeRF(nn.Module):
    """Minimal NeRF MLP: encoded position and view direction in, colour and density out.

    Args:
        pos_freq: Number of frequency bands used to encode positions.
        dir_freq: Number of frequency bands used to encode view directions.
        hidden: Width of the hidden layers.
    """

    def __init__(self, pos_freq=10, dir_freq=4, hidden=256):
        super().__init__()
        self.pos_freq = pos_freq
        self.dir_freq = dir_freq

        # Every one of the 3 input coordinates is kept as-is and additionally
        # expanded into a sin/cos pair per band. With the defaults this gives
        # 3 * (1 + 2 * 10) = 63 position features and 3 * (1 + 2 * 4) = 27
        # direction features.
        pos_input = 3 * (1 + 2 * pos_freq)
        dir_input = 3 * (1 + 2 * dir_freq)

        # Trunk: four Linear + ReLU stages fed by the encoded position only.
        trunk = []
        for in_width in (pos_input, hidden, hidden, hidden):
            trunk.append(nn.Linear(in_width, hidden))
            trunk.append(nn.ReLU())
        self.layers = nn.Sequential(*trunk)

        # Density head: Softplus keeps the predicted density non-negative.
        self.density_head = nn.Sequential(
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, 1),
            nn.Softplus(),
        )

        # Colour head: sees trunk features plus the encoded view direction;
        # Sigmoid maps the three outputs into (0, 1).
        half = hidden // 2
        self.color_layers = nn.Sequential(
            nn.Linear(hidden + dir_input, half),
            nn.ReLU(),
            nn.Linear(half, 3),
            nn.Sigmoid(),
        )

    def positional_encoding(self, x, freq):
        """Lift ``x`` to a higher-dimensional sinusoidal feature vector.

        The result concatenates, along the last dimension, ``x`` itself
        followed by ``sin(2**i * pi * x)`` and ``cos(2**i * pi * x)`` for
        ``i = 0 .. freq - 1``.
        """
        bands = [x]
        for octave in range(freq):
            bands.extend(
                (
                    torch.sin(2 ** octave * torch.pi * x),
                    torch.cos(2 ** octave * torch.pi * x),
                )
            )
        return torch.cat(bands, dim=-1)

    def forward(self, positions, directions):
        """Return ``(color, density)`` for the given sample points.

        Args:
            positions: Tensor whose last dimension holds the 3 coordinates.
            directions: Tensor whose last dimension holds the 3 direction
                components; leading dimensions must match ``positions``.

        Returns:
            Tuple ``(color, density)`` with last dimensions 3 and 1.
        """
        encoded_pos = self.positional_encoding(positions, self.pos_freq)
        encoded_dir = self.positional_encoding(directions, self.dir_freq)

        trunk_out = self.layers(encoded_pos)
        sigma = self.density_head(trunk_out)

        # Only the colour depends on the view direction.
        rgb = self.color_layers(torch.cat((trunk_out, encoded_dir), dim=-1))

        return rgb, sigma


def volume_rendering(colors, densities, deltas):
    """Composite per-sample colours along each ray into one colour per ray.

    Implements the discrete volume rendering sum
    ``alpha_i = 1 - exp(-sigma_i * delta_i)``,
    ``T_i = prod_{j < i} (1 - alpha_j)``,
    ``C = sum_i T_i * alpha_i * c_i``.

    Args:
        colors: Tensor of shape ``(num_rays, num_samples, channels)``.
        densities: Tensor of shape ``(num_rays, num_samples)``. A trailing
            singleton dimension, ``(num_rays, num_samples, 1)``, is also
            accepted and squeezed away.
        deltas: Spacing between adjacent samples: a Python number, a 0-dim
            tensor, or a tensor broadcastable to ``(num_rays, num_samples)``
            (a trailing singleton dimension is accepted as for ``densities``).

    Returns:
        Tensor of shape ``(num_rays, channels)`` with the rendered colours.
    """
    # A (rays, samples, 1) density would otherwise turn into (rays, samples, 1, 1)
    # weights below and misbroadcast against (rays, samples, channels) colours.
    if densities.dim() == colors.dim() and densities.shape[-1] == 1:
        densities = densities.squeeze(-1)
    if (
        torch.is_tensor(deltas)
        and deltas.dim() == colors.dim()
        and deltas.shape[-1] == 1
    ):
        deltas = deltas.squeeze(-1)

    # alpha = 1 - exp(-sigma * delta)
    alpha = 1 - torch.exp(-densities * deltas)

    # T_i = prod(1 - alpha_j) for j < i; the leading column of ones makes the
    # cumulative product exclusive, so the first sample sees T = 1.
    survival = torch.cat(
        [torch.ones_like(alpha[:, :1]), 1 - alpha[:, :-1]], dim=1
    )
    transmittance = torch.cumprod(survival, dim=1)

    # Per-sample weight = T_i * alpha_i
    weights = transmittance * alpha

    # Weighted sum over the sample axis.
    rendered_color = (weights.unsqueeze(-1) * colors).sum(dim=1)

    return rendered_color

2.3 NeRF 训练

python 复制代码
def train_nerf(model, dataset, epochs=200000, lr=5e-4, batch_size=4096,
               num_samples=64, near=0.0, far=1.0, log_every=1000):
    """Optimise ``model`` against ray colours drawn from ``dataset``.

    Each "epoch" is a single optimisation step on one freshly sampled batch
    of rays; samples are placed uniformly along every ray.

    Args:
        model: Callable as ``model(points, directions)`` returning
            ``(colors, densities)``; its parameters are optimised with Adam.
        dataset: Object exposing ``sample_rays(batch_size=...)`` that returns
            ``(rays_o, rays_d, target_colors)``. Assumes ``rays_o`` and
            ``rays_d`` are ``(batch, 3)`` tensors -- TODO confirm against the
            dataset implementation.
        epochs: Number of optimisation steps.
        lr: Adam learning rate.
        batch_size: Rays sampled per step.
        num_samples: Sample points per ray (at least 2).
        near: Ray parameter of the first sample.
        far: Ray parameter of the last sample.
        log_every: Print the loss every this many steps; falsy disables it.

    Raises:
        ValueError: If ``num_samples`` is smaller than 2.
    """
    if num_samples < 2:
        # The sample spacing below needs two samples to be defined.
        raise ValueError("num_samples must be at least 2")

    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        # Sample a batch of rays from the dataset.
        rays_o, rays_d, target_colors = dataset.sample_rays(batch_size=batch_size)

        # Sample points along each ray. Build the sample positions on the same
        # device/dtype as the rays so GPU batches do not mix devices.
        t_vals = torch.linspace(
            near, far, num_samples, device=rays_o.device, dtype=rays_o.dtype
        )
        points = rays_o.unsqueeze(1) + t_vals.unsqueeze(0).unsqueeze(-1) * rays_d.unsqueeze(1)

        # Forward pass: every sample on a ray shares that ray's direction.
        colors, densities = model(points, rays_d.unsqueeze(1).expand_as(points))

        # A one-unit density head yields (batch, samples, 1); the renderer
        # works on (batch, samples), so drop the trailing singleton dim.
        if densities.dim() == colors.dim() and densities.shape[-1] == 1:
            densities = densities.squeeze(-1)

        # Volume rendering with a uniform sample spacing.
        deltas = t_vals[1] - t_vals[0]
        rendered = volume_rendering(colors, densities, deltas)

        # Mean squared error against the ground-truth pixel colours.
        loss = ((rendered - target_colors) ** 2).mean()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if log_every and epoch % log_every == 0:
            print(f"Epoch {epoch}, Loss: {loss.item():.6f}")

3. 3D Gaussian Splatting

3.1 核心思想

复制代码
3DGS 用一组 3D 高斯椭球表示场景:
每个高斯有以下属性:
  - 位置 μ ∈ R³
  - 协方差矩阵 Σ ∈ R³ˣ³(控制形状和方向)
  - 不透明度 α ∈ [0,1]
  - 球谐系数(控制颜色)

渲染:将 3D 高斯投影到 2D,进行 alpha blending

3.2 高斯定义

python 复制代码
import torch
import torch.nn as nn

class Gaussian3D(nn.Module):
    """Learnable set of anisotropic 3D Gaussians.

    Subclasses ``nn.Module`` so the ``nn.Parameter`` attributes are registered
    and reachable through ``parameters()``, ``state_dict()`` and ``to()``.

    Args:
        num_points: Number of Gaussians to allocate.
    """

    def __init__(self, num_points=100000):
        super().__init__()
        # Centre of each Gaussian.
        self.means = nn.Parameter(torch.randn(num_points, 3))
        # Orientation as a raw quaternion; normalised in get_covariance().
        self.rotations = nn.Parameter(torch.randn(num_points, 4))
        # Per-axis log scale; exponentiated in get_covariance().
        self.scales = nn.Parameter(torch.randn(num_points, 3))
        # Opacity parameter, one scalar per Gaussian.
        self.opacities = nn.Parameter(torch.zeros(num_points))
        # Spherical-harmonic coefficients: 16 per RGB channel.
        self.sh_coeffs = nn.Parameter(torch.randn(num_points, 3, 16))

    def get_covariance(self):
        """Build the covariance matrices from rotation and scale.

        Returns:
            Tensor of shape ``(num_points, 3, 3)`` holding
            ``R @ S @ S^T @ R^T`` for every Gaussian, which is symmetric
            positive semi-definite by construction.
        """
        # Normalise so the quaternion represents a pure rotation.
        rotations = torch.nn.functional.normalize(self.rotations, dim=-1)

        # Quaternion -> rotation matrix R; the first component is the scalar part.
        r, x, y, z = rotations.unbind(-1)
        R = torch.stack([
            1-2*(y*y+z*z), 2*(x*y-r*z), 2*(x*z+r*y),
            2*(x*y+r*z), 1-2*(x*x+z*z), 2*(y*z-r*x),
            2*(x*z-r*y), 2*(y*z+r*x), 1-2*(x*x+y*y),
        ], dim=-1).reshape(-1, 3, 3)

        # Diagonal scale matrix S; exp keeps every scale strictly positive.
        S = torch.diag_embed(torch.exp(self.scales))

        # Covariance = (R S)(R S)^T = R S S^T R^T
        M = R @ S
        covariance = M @ M.transpose(-1, -2)

        return covariance

3.3 3DGS 渲染

python 复制代码
def render_gaussians(gaussians, camera_matrix, width, height):
    """Project 3D Gaussians to the image plane and composite them front to back.

    Uses the helpers ``project``, ``project_covariance``, ``evaluate_sh`` and
    ``gaussian_2d``, which are not defined in this block.

    Args:
        gaussians: Object exposing ``means``, ``opacities``, ``sh_coeffs`` and
            ``get_covariance()``.
        camera_matrix: Camera parameters forwarded to the projection helpers.
        width: Image width in pixels.
        height: Image height in pixels.

    Returns:
        Tensor of shape ``(height, width, 3)`` with the blended colours.
    """
    means = gaussians.means  # (N, 3)

    # Project the centres to 2D.
    means_2d = project(means, camera_matrix)  # (N, 2)

    # Project the 3D covariances to 2D.
    cov_3d = gaussians.get_covariance()
    cov_2d = project_covariance(cov_3d, camera_matrix)  # (N, 2, 2)

    # Map the raw opacity parameters into (0, 1).
    opacities = torch.sigmoid(gaussians.opacities)

    # Ascending depth = near to far, which is the order front-to-back
    # compositing needs.
    # NOTE(review): means[:, 2] is the z coordinate of the unprojected means;
    # this presumably assumes the camera looks along +z -- confirm, or sort by
    # camera-space depth instead.
    depths = means[:, 2]
    sorted_indices = torch.argsort(depths)

    # Accumulators live on the same device/dtype as the Gaussians. Accumulated
    # alpha is (H, W) so it lines up with the per-pixel influence map.
    # NOTE(review): assumes gaussian_2d returns an (height, width) map -- confirm.
    final_color = torch.zeros(height, width, 3, device=means.device, dtype=means.dtype)
    final_alpha = torch.zeros(height, width, device=means.device, dtype=means.dtype)

    for idx in sorted_indices:
        mu = means_2d[idx]
        sigma = cov_2d[idx]
        alpha = opacities[idx]
        # NOTE(review): the Gaussian centre is passed where spherical harmonics
        # are usually evaluated on a view direction -- confirm evaluate_sh's contract.
        color = evaluate_sh(gaussians.sh_coeffs[idx], means[idx])

        # Footprint of this Gaussian over the pixel grid.
        influence = gaussian_2d(mu, sigma, width, height)

        # Front-to-back blending: remaining transmittance is 1 - accumulated alpha.
        contribution = influence * alpha * (1 - final_alpha)
        final_color += contribution.unsqueeze(-1) * color
        # `contribution` already carries the opacity; multiplying by alpha
        # again would under-count coverage.
        final_alpha += contribution

    return final_color

4. 点云处理

4.1 PointNet 分类

python 复制代码
class PointNet(nn.Module):
    """PointNet-style classifier for point clouds.

    A shared per-point MLP (kernel-size-1 convolutions) is followed by a
    global max pool and a fully connected classification head.

    Args:
        num_classes: Number of output classes.
    """

    def __init__(self, num_classes=40):
        super().__init__()

        def shared_mlp(*widths):
            # Conv1d(k=1) + BatchNorm + ReLU for each consecutive width pair.
            stages = []
            for c_in, c_out in zip(widths, widths[1:]):
                stages += [nn.Conv1d(c_in, c_out, 1), nn.BatchNorm1d(c_out), nn.ReLU()]
            return nn.Sequential(*stages)

        self.mlp1 = shared_mlp(3, 64, 64)
        self.mlp2 = shared_mlp(64, 128, 1024)

        head = []
        for c_in, c_out in ((1024, 512), (512, 256)):
            head += [nn.Linear(c_in, c_out), nn.Dropout(0.3), nn.ReLU()]
        head.append(nn.Linear(256, num_classes))
        self.classifier = nn.Sequential(*head)

    def forward(self, x):
        """Return class logits of shape ``(B, num_classes)`` for ``x`` of shape ``(B, 3, N)``."""
        per_point = self.mlp2(self.mlp1(x))
        # Global max pooling over the point axis.
        pooled = per_point.max(dim=2).values
        return self.classifier(pooled)

4.2 点云分割

python 复制代码
class PointNetSeg(nn.Module):
    """PointNet-style per-point semantic segmentation network.

    Each point's 64-channel local feature is concatenated with the
    1024-channel max-pooled global feature before the per-point head.

    Args:
        num_classes: Number of semantic classes predicted per point.
    """

    def __init__(self, num_classes=13):
        super().__init__()

        def shared_mlp(*widths):
            # Conv1d(k=1) + BatchNorm + ReLU for each consecutive width pair.
            stages = []
            for c_in, c_out in zip(widths, widths[1:]):
                stages += [nn.Conv1d(c_in, c_out, 1), nn.BatchNorm1d(c_out), nn.ReLU()]
            return nn.Sequential(*stages)

        self.mlp1 = shared_mlp(3, 64, 64)
        self.mlp2 = shared_mlp(64, 128, 1024)

        # Input width = 64 local + 1024 global channels; the final layer is a
        # plain convolution that emits the logits.
        head = list(shared_mlp(64 + 1024, 512, 256))
        head.append(nn.Conv1d(256, num_classes, 1))
        self.seg_head = nn.Sequential(*head)

    def forward(self, x):
        """Return per-point logits ``(B, num_classes, N)`` for ``x`` of shape ``(B, 3, N)``."""
        num_points = x.size(2)

        point_feat = self.mlp1(x)  # (B, 64, N)
        scene_feat = self.mlp2(point_feat)  # (B, 1024, N)
        scene_feat = scene_feat.max(dim=2, keepdim=True).values  # (B, 1024, 1)
        scene_feat = scene_feat.expand(-1, -1, num_points)  # (B, 1024, N)

        fused = torch.cat((point_feat, scene_feat), dim=1)  # (B, 1088, N)
        return self.seg_head(fused)  # (B, num_classes, N)

5. 3DGS vs NeRF 对比

特性 NeRF 3DGS
表示方式 隐式(MLP) 显式(高斯点)
训练速度 小时级 分钟级
渲染速度 秒级/帧 实时(100+ FPS)
可编辑性 困难 容易(直接操作点)
内存占用 低(仅存 MLP 权重) 高(需存所有点)
质量 高 高(细节更好)

6. 总结

3D 视觉的核心技术:

  1. NeRF:隐式神经表示,质量高但训练慢
  2. 3DGS:显式高斯表示,训练快、可实时渲染
  3. PointNet:直接处理点云的经典架构
  4. 应用方向:自动驾驶、AR/VR、数字孪生、机器人导航