3D 视觉前沿:NeRF、3D Gaussian Splatting 与点云处理
1. 引言
3D 视觉是计算机视觉的前沿领域。从 2D 图像重建 3D 场景,传统方法依赖复杂的多视图几何;NeRF 和 3D Gaussian Splatting(3DGS)则借助可微渲染和基于梯度的优化(NeRF 用神经网络作隐式表示,3DGS 用显式的 3D 高斯集合),实现了高质量的 3D 重建和新视角合成。
技术演进:
多视图立体 (MVS) → NeRF (2020) → Instant-NGP (2022) → 3DGS (2023)
- MVS:基于多视图几何的数学方法,速度慢
- NeRF:隐式表示,小时级训练
- Instant-NGP:快速训练,秒级训练
- 3DGS:显式表示,实时渲染
2. NeRF 原理
2.1 核心思想
NeRF 将场景表示为一个连续的隐式函数:
F_Θ: (x, y, z, θ, φ) → (r, g, b, σ),其中 Θ 为网络参数
输入:3D 坐标 (x,y,z) + 观察方向 (θ,φ)(实现中通常用 3D 单位方向向量 d 表示)
输出:颜色 (r,g,b) + 密度 (σ)
通过体渲染积分得到像素颜色:
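$$C(\mathbf{r}) = \int_{t_n}^{t_f} T(t)\,\sigma(\mathbf{r}(t))\,\mathbf{c}(\mathbf{r}(t), \mathbf{d})\,dt$$
其中 $\mathbf{r}(t) = \mathbf{o} + t\mathbf{d}$ 是相机射线,$t_n$、$t_f$ 为近、远边界,$T(t) = \exp\left(-\int_{t_n}^{t} \sigma(\mathbf{r}(s))\,ds\right)$ 是累积透射率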
2.2 NeRF 实现
python
import torch
import torch.nn as nn
class NeRF(nn.Module):
    """Minimal NeRF MLP: encoded position and view direction in, colour and density out.

    Args:
        pos_freq: Number of frequency bands used to encode positions.
        dir_freq: Number of frequency bands used to encode view directions.
        hidden: Width of the hidden layers.
    """

    def __init__(self, pos_freq=10, dir_freq=4, hidden=256):
        super().__init__()
        self.pos_freq = pos_freq
        self.dir_freq = dir_freq

        # Every one of the 3 input coordinates contributes itself plus one
        # sin/cos pair per band: 63 features for 10 bands, 27 for 4 bands.
        pos_input = 3 * (1 + 2 * pos_freq)
        dir_input = 3 * (1 + 2 * dir_freq)

        # Trunk: four Linear+ReLU stages that only see the encoded position.
        trunk = []
        in_features = pos_input
        for _ in range(4):
            trunk.append(nn.Linear(in_features, hidden))
            trunk.append(nn.ReLU())
            in_features = hidden
        self.layers = nn.Sequential(*trunk)

        # Density head: Softplus keeps the predicted density positive.
        self.density_head = nn.Sequential(
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, 1),
            nn.Softplus(),
        )

        # Colour head: trunk features joined with the encoded direction;
        # Sigmoid maps RGB into (0, 1).
        half = hidden // 2
        self.color_layers = nn.Sequential(
            nn.Linear(hidden + dir_input, half),
            nn.ReLU(),
            nn.Linear(half, 3),
            nn.Sigmoid(),
        )

    def positional_encoding(self, x, freq):
        """Lift ``x`` to a higher dimension with sin/cos features.

        Returns ``x`` followed by ``sin(2^k * pi * x)`` and ``cos(2^k * pi * x)``
        for ``k = 0 .. freq - 1``, concatenated along the last axis.
        """
        bands = [x]
        for level in range(freq):
            angle = 2 ** level * torch.pi * x
            bands.extend((torch.sin(angle), torch.cos(angle)))
        return torch.cat(bands, dim=-1)

    def forward(self, positions, directions):
        """Return ``(color, density)`` for the given sample points.

        ``color`` has 3 channels and ``density`` has 1 channel on the last axis.
        """
        features = self.layers(self.positional_encoding(positions, self.pos_freq))
        density = self.density_head(features)

        view_enc = self.positional_encoding(directions, self.dir_freq)
        color = self.color_layers(torch.cat([features, view_enc], dim=-1))
        return color, density
def volume_rendering(colors, densities, deltas):
    """Composite per-sample colours along each ray into one pixel colour.

    Args:
        colors: Sample colours, shape ``(..., N, 3)`` (e.g. ``(B, N, 3)``).
        densities: Sample densities, shape ``(..., N)``. A trailing singleton
            axis, ``(..., N, 1)``, is also accepted so the output of a network
            with a 1-channel density head can be passed in directly.
        deltas: Distance between consecutive samples. A Python number, a
            scalar tensor, or a tensor broadcastable to ``(..., N)``
            (``(..., N, 1)`` is accepted as well).

    Returns:
        Rendered colours of shape ``(..., 3)``.
    """
    # Drop a trailing channel axis. Without this, (B, N, 1) weights would be
    # broadcast against (B, N, 3) colours on the wrong axes.
    if densities.dim() == colors.dim() and densities.shape[-1] == 1:
        densities = densities.squeeze(-1)
    if (
        torch.is_tensor(deltas)
        and deltas.dim() == colors.dim()
        and deltas.shape[-1] == 1
    ):
        deltas = deltas.squeeze(-1)

    # alpha_i = 1 - exp(-sigma_i * delta_i)
    alpha = 1.0 - torch.exp(-densities * deltas)

    # T_i = prod_{j < i} (1 - alpha_j); the leading 1 makes the product exclusive.
    survival = torch.cat(
        [torch.ones_like(alpha[..., :1]), 1.0 - alpha[..., :-1]], dim=-1
    )
    transmittance = torch.cumprod(survival, dim=-1)

    # w_i = T_i * alpha_i, then the colour is the weighted sum over samples.
    weights = transmittance * alpha
    return (weights.unsqueeze(-1) * colors).sum(dim=-2)
2.3 NeRF 训练
python
def train_nerf(model, dataset, epochs=200000, lr=5e-4):
    """Optimise ``model`` against ``dataset`` with a photometric MSE loss.

    Each "epoch" is a single optimisation step on one freshly sampled batch
    of 4096 rays, with 64 evenly spaced samples per ray over ``t`` in [0, 1].

    Args:
        model: Callable returning ``(colors, densities)`` for
            ``(points, directions)``; must expose ``parameters()``.
        dataset: Object whose ``sample_rays(batch_size=...)`` returns
            ``(rays_o, rays_d, target_colors)``. The indexing below assumes
            ``rays_o`` and ``rays_d`` are ``(B, 3)`` tensors; confirm
            against the dataset implementation.
        epochs: Number of optimisation steps.
        lr: Adam learning rate.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        rays_o, rays_d, target_colors = dataset.sample_rays(batch_size=4096)

        # Create the sample depths on the rays' device and dtype, so that
        # training on GPU does not mix CPU and CUDA tensors.
        t_vals = torch.linspace(0, 1, 64, device=rays_o.device, dtype=rays_o.dtype)
        # (B, 1, 3) + (1, 64, 1) * (B, 1, 3) -> (B, 64, 3)
        points = rays_o[:, None, :] + t_vals[None, :, None] * rays_d[:, None, :]

        colors, densities = model(points, rays_d[:, None, :].expand_as(points))
        # A 1-channel density head yields (B, 64, 1). The renderer works on
        # (B, 64), so drop the channel axis to avoid mis-broadcasting.
        if densities.dim() == colors.dim() and densities.shape[-1] == 1:
            densities = densities.squeeze(-1)

        # Samples are evenly spaced, so one scalar spacing serves every sample.
        deltas = t_vals[1] - t_vals[0]
        rendered = volume_rendering(colors, densities, deltas)

        loss = ((rendered - target_colors) ** 2).mean()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if epoch % 1000 == 0:
            print(f"Epoch {epoch}, Loss: {loss.item():.6f}")
3. 3D Gaussian Splatting
3.1 核心思想
3DGS 用一组 3D 高斯椭球表示场景:
每个高斯有以下属性:
- 位置 μ ∈ R³
- 协方差矩阵 Σ ∈ R³ˣ³(控制形状和方向)
- 不透明度 α ∈ [0,1]
- 球谐系数(控制颜色)
渲染:将 3D 高斯投影到 2D,进行 alpha blending
3.2 高斯定义
python
import torch
import torch.nn as nn
class Gaussian3D(nn.Module):
    """A set of learnable 3D Gaussians.

    This is an ``nn.Module`` so that its tensors are registered parameters:
    ``parameters()``, ``to(device)`` and ``state_dict()`` all work, which a
    plain class holding ``nn.Parameter`` attributes does not provide.

    Args:
        num_points: Number of Gaussians to allocate.
    """

    def __init__(self, num_points=100000):
        super().__init__()
        # Centres, (N, 3).
        self.means = nn.Parameter(torch.randn(num_points, 3))
        # Rotations as unnormalised quaternions (r, x, y, z), (N, 4).
        self.rotations = nn.Parameter(torch.randn(num_points, 4))
        # Log-scales, (N, 3); exponentiated before use so scales stay positive.
        self.scales = nn.Parameter(torch.randn(num_points, 3))
        # Opacity values, (N,); the renderer in this file passes them through a sigmoid.
        self.opacities = nn.Parameter(torch.zeros(num_points))
        # Spherical-harmonic coefficients: 16 per RGB channel, (N, 3, 16).
        self.sh_coeffs = nn.Parameter(torch.randn(num_points, 3, 16))

    def get_covariance(self):
        """Return the ``(N, 3, 3)`` covariances ``R S S^T R^T``.

        ``R`` is the rotation matrix of the normalised quaternion and ``S`` is
        ``diag(exp(scales))``.
        """
        quats = torch.nn.functional.normalize(self.rotations, dim=-1)
        r, x, y, z = quats.unbind(-1)

        # Quaternion -> rotation matrix, written out row by row.
        R = torch.stack(
            [
                1 - 2 * (y * y + z * z), 2 * (x * y - r * z), 2 * (x * z + r * y),
                2 * (x * y + r * z), 1 - 2 * (x * x + z * z), 2 * (y * z - r * x),
                2 * (x * z - r * y), 2 * (y * z + r * x), 1 - 2 * (x * x + y * y),
            ],
            dim=-1,
        ).reshape(-1, 3, 3)

        # R @ diag(s) only scales the columns of R, so broadcast the scale
        # instead of building N diagonal matrices.
        M = R * torch.exp(self.scales).unsqueeze(-2)
        return M @ M.transpose(-1, -2)
3.3 3DGS 渲染
python
def render_gaussians(gaussians, camera_matrix, width, height):
    """Project the 3D Gaussians to the image plane and alpha-composite them.

    Args:
        gaussians: Object exposing ``means``, ``opacities``, ``sh_coeffs`` and
            ``get_covariance()``.
        camera_matrix: Passed through to ``project`` and ``project_covariance``.
        width: Output image width in pixels.
        height: Output image height in pixels.

    Returns:
        A ``(height, width, 3)`` colour image.

    NOTE(review): ``gaussian_2d`` is assumed to return a ``(height, width)``
    footprint and ``evaluate_sh`` an RGB triple; this is what the
    ``contribution.unsqueeze(-1) * color`` product requires. Confirm against
    the helper implementations.
    """
    means = gaussians.means  # (N, 3)
    means_2d = project(means, camera_matrix)  # (N, 2)
    cov_2d = project_covariance(gaussians.get_covariance(), camera_matrix)  # (N, 2, 2)
    opacities = torch.sigmoid(gaussians.opacities)

    # The blend below is front-to-back, so Gaussians must be visited nearest
    # first, which is what an ascending depth sort gives.
    # NOTE(review): depth is taken from the raw z coordinate of ``means``;
    # this is only a true camera depth if ``means`` are already in camera
    # space. Confirm.
    order = torch.argsort(means[:, 2])

    # Allocate the buffers next to the inputs so CPU and GPU are never mixed.
    final_color = torch.zeros(height, width, 3, device=means.device, dtype=means.dtype)
    # Accumulated coverage per pixel. It is (H, W) so that it broadcasts
    # against the footprint element-wise.
    final_alpha = torch.zeros(height, width, device=means.device, dtype=means.dtype)

    for idx in order:
        color = evaluate_sh(gaussians.sh_coeffs[idx], means[idx])
        influence = gaussian_2d(means_2d[idx], cov_2d[idx], width, height)

        # Blend weight: footprint * opacity * remaining transmittance.
        contribution = influence * opacities[idx] * (1 - final_alpha)
        final_color += contribution.unsqueeze(-1) * color
        # The weight already contains the opacity. Multiplying by it again
        # would under-count coverage, letting occluded Gaussians show through.
        final_alpha += contribution

    return final_color
4. 点云处理
4.1 PointNet 分类
python
class PointNet(nn.Module):
    """PointNet-style point-cloud classifier.

    Args:
        num_classes: Number of output classes.
    """

    def __init__(self, num_classes=40):
        super().__init__()

        def shared_mlp(*widths):
            # Per-point MLP: a kernel-size-1 Conv1d + BatchNorm + ReLU for
            # every consecutive pair of channel widths.
            stages = []
            for c_in, c_out in zip(widths, widths[1:]):
                stages += [nn.Conv1d(c_in, c_out, 1), nn.BatchNorm1d(c_out), nn.ReLU()]
            return nn.Sequential(*stages)

        self.mlp1 = shared_mlp(3, 64, 64)
        self.mlp2 = shared_mlp(64, 128, 1024)

        # Fully connected head on the pooled 1024-d global feature.
        self.classifier = nn.Sequential(
            nn.Linear(1024, 512),
            nn.Dropout(0.3),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.Dropout(0.3),
            nn.ReLU(),
            nn.Linear(256, num_classes),
        )

    def forward(self, x):
        """Map a ``(B, 3, N)`` point cloud to ``(B, num_classes)`` logits."""
        per_point = self.mlp2(self.mlp1(x))
        # Global max pooling over the point axis.
        pooled = per_point.max(dim=2).values
        return self.classifier(pooled)
4.2 点云分割
python
class PointNetSeg(nn.Module):
    """PointNet-style per-point semantic segmentation network.

    Args:
        num_classes: Number of segmentation classes.
    """

    def __init__(self, num_classes=13):
        super().__init__()

        def conv_bn_relu(*widths):
            # One kernel-size-1 Conv1d + BatchNorm + ReLU per consecutive
            # pair of channel widths.
            stages = []
            for c_in, c_out in zip(widths, widths[1:]):
                stages += [nn.Conv1d(c_in, c_out, 1), nn.BatchNorm1d(c_out), nn.ReLU()]
            return stages

        self.mlp1 = nn.Sequential(*conv_bn_relu(3, 64, 64))
        self.mlp2 = nn.Sequential(*conv_bn_relu(64, 128, 1024))
        # The head consumes 64 local + 1024 global channels (1088) and ends
        # with a plain convolution producing per-point class scores.
        self.seg_head = nn.Sequential(
            *conv_bn_relu(1088, 512, 256),
            nn.Conv1d(256, num_classes, 1),
        )

    def forward(self, x):
        """Map a ``(B, 3, N)`` point cloud to ``(B, num_classes, N)`` scores."""
        num_points = x.size(2)
        point_feat = self.mlp1(x)  # (B, 64, N)
        # Max-pool over the points, keeping the axis so it can be tiled back.
        scene_feat = self.mlp2(point_feat).max(dim=2, keepdim=True).values  # (B, 1024, 1)
        tiled = scene_feat.expand(-1, -1, num_points)  # (B, 1024, N)
        return self.seg_head(torch.cat([point_feat, tiled], dim=1))
5. 3DGS vs NeRF 对比
| 特性 | NeRF | 3DGS |
|---|---|---|
| 表示方式 | 隐式(MLP) | 显式(高斯点) |
| 训练速度 | 小时级 | 分钟级 |
| 渲染速度 | 秒级/帧 | 实时(100+ FPS) |
| 可编辑性 | 困难 | 容易(直接操作点) |
| 内存占用 | 低 | 高(需存所有点) |
| 质量 | 高 | 高(细节更好) |
6. 总结
3D 视觉的核心技术:
- NeRF:隐式神经表示,质量高但训练慢
- 3DGS:显式高斯表示,训练快、可实时渲染
- PointNet:直接处理点云的经典架构
- 应用方向:自动驾驶、AR/VR、数字孪生、机器人导航