Table of Contents
1. Introduction and Technical Challenges
1.1 Core Requirements of Robot Vision
Modern robotic systems place demanding requirements on visual perception:
- Ultra-low latency: industrial manipulators must detect and localize targets within 10-20 ms
- High accuracy: 6D pose estimation error must stay at the sub-millimeter level
- Real-time operation: obstacle-avoidance decisions on mobile robots within <100 ms
- Robustness: cope with lighting changes, occlusion, and motion blur
- Resource constraints: run complex vision tasks on limited compute and power budgets
Problems with the traditional cloud-processing model:
- Network latency of 40-200 ms cannot meet real-time requirements
- High bandwidth cost: a single 1080p stream needs 8-12 Mbps
- Dependence on network connectivity; offline scenarios simply do not work
- Privacy risk: sensitive data is uploaded to the cloud
What edge AI offers:
- Local inference with end-to-end latency <50 ms
- 90%+ reduction in data transmission
- Works offline and keeps running when the network drops
- Data is processed locally, protecting privacy
1.2 Evolution of the Technical Architecture
Traditional architecture: camera → cloud inference → results returned (latency 200+ ms)
Edge architecture: camera → edge inference → real-time control (latency <50 ms)
Collaborative architecture: tiered processing across device, edge, and cloud (the best of both)
2. System Architecture Design
2.1 Hardware Platform Selection
2.1.1 Compute Platform Comparison
| Platform | Compute (TOPS) | Power (W) | Memory | Target Scenario | Price |
|---|---|---|---|---|---|
| Jetson Orin Nano | 40 | 7-15 | 4-8GB | Mobile robots | $499 |
| Jetson AGX Orin | 275 | 15-60 | 32/64GB | Industrial robots | $1,999 |
| RK3588 | 6 | 8-12 | 8-16GB | Cost-sensitive designs | $200-300 |
| Intel NUC+GPU | Scalable | 65+ | 16-64GB | Fixed industrial cells | $800+ |
Selection guidelines:
- AMR/AGV → Jetson Orin Nano (balances performance and power)
- Industrial manipulators → Jetson AGX Orin (prioritize inference speed)
- Service robots → Jetson Orin NX (multimodal processing)
- Education/prototyping → Raspberry Pi 4 + Coral TPU
2.1.2 Vision Sensor Selection
Depth camera types:
- Active IR stereo (Intel RealSense D435): high accuracy indoors, 0.3-10 m
- ToF (Azure Kinect): resistant to strong light, longer range
- Passive stereo (ZED 2): usable outdoors, requires GPU stereo matching
Industrial cameras:
- Global shutter: eliminates motion blur, suited to high-speed motion
- GigE interface: long cable runs, suited to fixed installations
- USB3 interface: plug and play, suited to mobile platforms
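For context on how a depth camera feeds this stack, here is a minimal RGB-D capture sketch using pyrealsense2 for a RealSense device; the stream settings are illustrative assumptions, and other sensors expose similar SDKs:

```python
import numpy as np
import pyrealsense2 as rs

# Configure color + depth streams (resolutions/FPS are example values).
pipeline = rs.pipeline()
config = rs.config()
config.enable_stream(rs.stream.depth, 640, 480, rs.format.z16, 30)
config.enable_stream(rs.stream.color, 640, 480, rs.format.bgr8, 30)
pipeline.start(config)

# Align depth to the color frame so pixel (u, v) indexes both images.
align = rs.align(rs.stream.color)
try:
    frames = align.process(pipeline.wait_for_frames())
    depth = np.asanyarray(frames.get_depth_frame().get_data())  # uint16, in depth units (typically mm)
    color = np.asanyarray(frames.get_color_frame().get_data())  # BGR image
finally:
    pipeline.stop()
```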
2.2 Software Stack Design
The complete software stack for a robot vision system:
Application layer: ROS 2 / in-house framework
Middleware: OpenCV | PCL | Eigen
Inference layer: TensorRT | OpenVINO | ONNX Runtime
Model layer: PyTorch | TensorFlow
Driver layer: V4L2 | libRealSense | ZED SDK
System layer: Ubuntu 20.04/22.04 LTS
Key components:
- ROS 2: distributed node communication, rich ecosystem
- TensorRT: NVIDIA inference acceleration, 3-10x speedup
- OpenVINO: Intel heterogeneous compute across CPU/GPU/VPU
- OpenCV: foundational image-processing library with GPU acceleration
- PCL: point cloud processing, essential for 3D perception
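To make the layering concrete, here is a minimal sketch of an application-layer ROS 2 node (rclpy) that receives images from the driver layer and hands them to an inference hook; the topic name and the `detect()` call are placeholders, not part of any particular stack:

```python
import rclpy
from rclpy.node import Node
from sensor_msgs.msg import Image
from cv_bridge import CvBridge

class VisionNode(Node):
    """Application layer: subscribe to the camera driver's image topic
    and forward frames to the inference layer."""
    def __init__(self):
        super().__init__('vision_node')
        self.bridge = CvBridge()
        self.sub = self.create_subscription(
            Image, '/camera/color/image_raw', self.on_image, 10)

    def on_image(self, msg):
        frame = self.bridge.imgmsg_to_cv2(msg, 'bgr8')
        # detections = detect(frame)  # e.g. the TensorRT detector from Section 3.1
        self.get_logger().info(f'Got frame {frame.shape}')

def main():
    rclpy.init()
    rclpy.spin(VisionNode())
    rclpy.shutdown()

if __name__ == '__main__':
    main()
```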
3. Core Algorithm Implementation
3.1 Real-Time Object Detection
3.1.1 Model Selection and Performance Comparison
| Model | Params | mAP@0.5 | FPS (Xavier NX) | Use Case |
|---|---|---|---|---|
| YOLOv8n | 3.2M | 37.3% | 120+ | Latency first |
| YOLOv8s | 11.2M | 44.9% | 60+ | Balanced |
| YOLOv8m | 25.9M | 50.2% | 30+ | Accuracy first |
Recommended setup: YOLOv8s + TensorRT FP16 quantization
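As a lighter-weight alternative to the manual workflow below, Ultralytics can also export a TensorRT engine directly; a minimal sketch (the FP16 flag and image size mirror the recommendation above, and the export should be run on the target Jetson so the engine matches its GPU):

```python
from ultralytics import YOLO

# One-step export: yolov8s.pt -> TensorRT engine with FP16 enabled.
model = YOLO("yolov8s.pt")
model.export(format="engine", half=True, imgsz=640)  # writes yolov8s.engine
```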
3.1.2 End-to-End Deployment Workflow
```python
#!/usr/bin/env python3
"""
Complete YOLO object-detection deployment example.
Covers: model export -> TensorRT conversion -> optimized inference.
"""
import torch
import cv2
import numpy as np
from ultralytics import YOLO

# Step 1: train the model (on a GPU server)
def train_model():
    model = YOLO('yolov8s.pt')
    results = model.train(
        data='robot_objects.yaml',  # custom dataset
        epochs=100,
        imgsz=640,
        batch=16,
        device=0,
        cache=True
    )
    model.export(format='onnx', simplify=True)
    return model

# Step 2: build the TensorRT engine (on the Jetson)
def build_trt_engine():
    """Build an FP16-optimized TensorRT engine."""
    import tensorrt as trt
    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
    builder = trt.Builder(TRT_LOGGER)
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    )
    parser = trt.OnnxParser(network, TRT_LOGGER)

    # Load the ONNX model
    with open('yolov8s.onnx', 'rb') as model:
        parser.parse(model.read())

    # Builder configuration
    config = builder.create_builder_config()
    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30)
    config.set_flag(trt.BuilderFlag.FP16)

    # Build and save
    engine = builder.build_serialized_network(network, config)
    with open('yolov8s_fp16.trt', 'wb') as f:
        f.write(engine)
    print("TensorRT engine built successfully!")

# Step 3: high-performance inference class
class YOLODetector:
    def __init__(self, engine_path, conf_thresh=0.5, iou_thresh=0.45):
        import pycuda.driver as cuda
        import pycuda.autoinit
        import tensorrt as trt

        self.conf_thresh = conf_thresh
        self.iou_thresh = iou_thresh

        # Load the TensorRT engine
        TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
        with open(engine_path, 'rb') as f:
            runtime = trt.Runtime(TRT_LOGGER)
            self.engine = runtime.deserialize_cuda_engine(f.read())
        self.context = self.engine.create_execution_context()

        # Allocate GPU memory
        self.input_shape = (1, 3, 640, 640)
        self.output_shape = (1, 84, 8400)  # YOLOv8 output layout
        self.d_input = cuda.mem_alloc(int(np.prod(self.input_shape)) * 4)
        self.d_output = cuda.mem_alloc(int(np.prod(self.output_shape)) * 4)
        self.stream = cuda.Stream()

    def preprocess(self, image):
        """Efficient preprocessing."""
        # Resize and convert BGR -> RGB
        img = cv2.resize(image, (640, 640))
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        # Normalize to [0, 1]
        img = img.astype(np.float32) / 255.0
        # HWC -> CHW
        img = img.transpose(2, 0, 1)
        # Add batch dimension
        img = np.expand_dims(img, axis=0)
        # Contiguous array for the GPU copy
        return np.ascontiguousarray(img)

    def infer(self, image):
        """Run inference on one frame."""
        import pycuda.driver as cuda

        # Preprocess
        input_data = self.preprocess(image)
        # Upload to the GPU
        cuda.memcpy_htod_async(self.d_input, input_data, self.stream)
        # Inference
        self.context.execute_async_v2(
            bindings=[int(self.d_input), int(self.d_output)],
            stream_handle=self.stream.handle
        )
        # Download the result
        output = np.empty(self.output_shape, dtype=np.float32)
        cuda.memcpy_dtoh_async(output, self.d_output, self.stream)
        self.stream.synchronize()
        # Post-process
        detections = self.postprocess(output, image.shape[:2])
        return detections

    def postprocess(self, output, original_shape):
        """Post-processing: NMS + coordinate rescaling."""
        # Transpose (1, 84, 8400) -> (8400, 84)
        output = output[0].T
        # Filter out low-confidence candidates
        scores = output[:, 4:].max(axis=1)
        mask = scores > self.conf_thresh
        output = output[mask]
        scores = scores[mask]
        # Extract bboxes and classes
        boxes = output[:, :4]
        classes = output[:, 4:].argmax(axis=1)
        # YOLOv8 boxes are (cx, cy, w, h); convert to (x1, y1, x2, y2) for NMS
        boxes_xyxy = np.empty_like(boxes)
        boxes_xyxy[:, 0] = boxes[:, 0] - boxes[:, 2] / 2
        boxes_xyxy[:, 1] = boxes[:, 1] - boxes[:, 3] / 2
        boxes_xyxy[:, 2] = boxes[:, 0] + boxes[:, 2] / 2
        boxes_xyxy[:, 3] = boxes[:, 1] + boxes[:, 3] / 2
        # NMS
        import torchvision
        indices = torchvision.ops.nms(
            torch.from_numpy(boxes_xyxy),
            torch.from_numpy(scores),
            self.iou_thresh
        ).numpy()
        # Rescale coordinates back to the original image
        boxes_xyxy = boxes_xyxy[indices]
        h, w = original_shape
        boxes_xyxy[:, [0, 2]] *= w / 640
        boxes_xyxy[:, [1, 3]] *= h / 640

        results = []
        for box, cls, score in zip(boxes_xyxy, classes[indices], scores[indices]):
            x1, y1, x2, y2 = box
            results.append({
                'bbox': [int(x1), int(y1), int(x2), int(y2)],
                'class': int(cls),
                'confidence': float(score)
            })
        return results

# Usage example
if __name__ == '__main__':
    # Build the engine (first run only)
    build_trt_engine()
    # Create the detector
    detector = YOLODetector('yolov8s_fp16.trt')
    # Test inference
    cap = cv2.VideoCapture(0)
    import time
    fps_list = []
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        # Time the detection
        start = time.time()
        detections = detector.infer(frame)
        fps = 1.0 / (time.time() - start)
        fps_list.append(fps)
        # Visualize
        for det in detections:
            x1, y1, x2, y2 = det['bbox']
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
            label = f"Class {det['class']}: {det['confidence']:.2f}"
            cv2.putText(frame, label, (x1, y1 - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
        # Show the FPS
        avg_fps = np.mean(fps_list[-30:])
        cv2.putText(frame, f"FPS: {avg_fps:.1f}", (10, 30),
                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
        cv2.imshow('Detection', frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
    cap.release()
    cv2.destroyAllWindows()
    print(f"Average FPS: {np.mean(fps_list):.2f}")
```
3.2 6D Pose Estimation
Manipulator grasping requires the object's precise 6-DoF pose (3D position + 3D rotation).
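Formally, that pose is the rigid transform from the object frame to the camera frame, which the code below stores as a 4x4 homogeneous matrix:

$$
T_{\text{cam}\leftarrow\text{obj}} = \begin{bmatrix} R & t \\ \mathbf{0}^{\top} & 1 \end{bmatrix},
\qquad
p_{\text{cam}} = R\,p_{\text{obj}} + t,
$$

where $R \in SO(3)$ is the 3D rotation and $t \in \mathbb{R}^3$ the 3D translation.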
3.2.1 RGB-D-Based Pose Estimation
```python
#!/usr/bin/env python3
"""
6D pose estimation system.
Method: PnP + RANSAC (classic and reliable).
"""
import numpy as np
import cv2

class PoseEstimator6D:
    def __init__(self, camera_matrix, dist_coeffs):
        """
        Args:
            camera_matrix: 3x3 camera intrinsics
            dist_coeffs: distortion coefficients
        """
        self.K = camera_matrix
        self.dist = dist_coeffs
        # 3D model points of known objects (in the object coordinate frame)
        self.object_models = {
            'box': self.create_box_model(0.1, 0.08, 0.05),
            'cylinder': self.create_cylinder_model(0.04, 0.12)
        }

    def create_box_model(self, length, width, height):
        """Create the corner points of a box model."""
        return np.array([
            [0, 0, 0], [length, 0, 0], [length, width, 0], [0, width, 0],
            [0, 0, height], [length, 0, height], [length, width, height], [0, width, height]
        ], dtype=np.float32)

    def create_cylinder_model(self, radius, height, num_points=12):
        """Create sample points on a cylinder model."""
        points = []
        # Bottom circle
        for i in range(num_points):
            angle = i * 2 * np.pi / num_points
            x = radius * np.cos(angle)
            y = radius * np.sin(angle)
            points.append([x, y, 0])
        # Top circle
        for i in range(num_points):
            angle = i * 2 * np.pi / num_points
            x = radius * np.cos(angle)
            y = radius * np.sin(angle)
            points.append([x, y, height])
        return np.array(points, dtype=np.float32)

    def estimate_pose(self, rgb_image, depth_image, bbox, object_type='box'):
        """
        Estimate the object's 6D pose.
        Args:
            rgb_image: RGB image
            depth_image: depth map (meters)
            bbox: [x1, y1, x2, y2] detection box
            object_type: object type
        Returns:
            pose: 4x4 transform (object frame -> camera frame),
            or None if estimation fails.
        """
        # 1. Extract the ROI
        x1, y1, x2, y2 = bbox
        roi_rgb = rgb_image[y1:y2, x1:x2]
        roi_depth = depth_image[y1:y2, x1:x2]

        # 2. Feature detection (ORB keypoints on the grayscale ROI)
        roi_gray = cv2.cvtColor(roi_rgb, cv2.COLOR_BGR2GRAY)
        orb = cv2.ORB_create(nfeatures=500)
        kp, des = orb.detectAndCompute(roi_gray, None)
        if len(kp) < 4:
            return None

        # 3. Build 3D-2D correspondences
        points_3d = []
        points_2d = []
        for keypoint in kp:
            # 2D point coordinates (inside the ROI)
            u, v = keypoint.pt
            u_global = int(u + x1)
            v_global = int(v + y1)
            # Look up the depth
            if v_global >= depth_image.shape[0] or u_global >= depth_image.shape[1]:
                continue
            z = depth_image[v_global, u_global]
            if z <= 0 or z > 5.0:  # discard invalid depths
                continue
            # Back-project to 3D
            x_3d = (u_global - self.K[0, 2]) * z / self.K[0, 0]
            y_3d = (v_global - self.K[1, 2]) * z / self.K[1, 1]
            points_3d.append([x_3d, y_3d, z])
            points_2d.append([u_global, v_global])

        if len(points_3d) < 6:
            return None
        points_3d = np.array(points_3d, dtype=np.float32)
        points_2d = np.array(points_2d, dtype=np.float32)

        # 4. Solve PnP (RANSAC for robustness to outliers)
        success, rvec, tvec, inliers = cv2.solvePnPRansac(
            points_3d,
            points_2d,
            self.K,
            self.dist,
            iterationsCount=1000,
            reprojectionError=8.0,
            confidence=0.99
        )
        if not success or inliers is None or len(inliers) < 6:
            return None

        # 5. Assemble the 4x4 transform
        R, _ = cv2.Rodrigues(rvec)
        pose = np.eye(4)
        pose[:3, :3] = R
        pose[:3, 3] = tvec.flatten()

        # 6. Optional refinement using only the inliers
        if len(inliers) > 10:
            inlier_points_3d = points_3d[inliers.flatten()]
            inlier_points_2d = points_2d[inliers.flatten()]
            _, rvec_refined, tvec_refined = cv2.solvePnP(
                inlier_points_3d,
                inlier_points_2d,
                self.K,
                self.dist,
                rvec,
                tvec,
                useExtrinsicGuess=True,
                flags=cv2.SOLVEPNP_ITERATIVE
            )
            R_refined, _ = cv2.Rodrigues(rvec_refined)
            pose[:3, :3] = R_refined
            pose[:3, 3] = tvec_refined.flatten()
        return pose

    def refine_with_icp(self, pose_init, depth_image, object_type='box'):
        """
        Refine the pose estimate with ICP.
        Args:
            pose_init: initial pose
            depth_image: depth map
            object_type: object type
        """
        # Model point cloud of the object
        model_points = self.object_models[object_type]
        # Project the model points into the image
        rvec, _ = cv2.Rodrigues(pose_init[:3, :3])
        tvec = pose_init[:3, 3]
        projected_points, _ = cv2.projectPoints(
            model_points, rvec, tvec, self.K, self.dist
        )
        # Gather the observed points from the depth map
        observed_points = []
        for pt_2d in projected_points:
            u, v = int(pt_2d[0, 0]), int(pt_2d[0, 1])
            if 0 <= v < depth_image.shape[0] and 0 <= u < depth_image.shape[1]:
                z = depth_image[v, u]
                if z > 0:
                    x = (u - self.K[0, 2]) * z / self.K[0, 0]
                    y = (v - self.K[1, 2]) * z / self.K[1, 1]
                    observed_points.append([x, y, z])
        if len(observed_points) < 4:
            return pose_init

        # ICP registration
        import open3d as o3d
        source = o3d.geometry.PointCloud()
        source.points = o3d.utility.Vector3dVector(model_points.astype(np.float64))
        target = o3d.geometry.PointCloud()
        target.points = o3d.utility.Vector3dVector(np.array(observed_points, dtype=np.float64))
        reg_result = o3d.pipelines.registration.registration_icp(
            source, target, 0.02, pose_init,
            o3d.pipelines.registration.TransformationEstimationPointToPoint()
        )
        return reg_result.transformation

# Usage example
if __name__ == '__main__':
    # Camera intrinsics (example values; calibrate the real camera)
    K = np.array([
        [615.0, 0, 320.0],
        [0, 615.0, 240.0],
        [0, 0, 1]
    ])
    dist = np.zeros(5)  # no distortion
    estimator = PoseEstimator6D(K, dist)

    # Load test data
    rgb = cv2.imread('test_rgb.png')
    depth = cv2.imread('test_depth.png', cv2.IMREAD_UNCHANGED).astype(np.float32) / 1000.0
    bbox = [100, 100, 300, 300]  # example bbox

    # Estimate the pose
    pose = estimator.estimate_pose(rgb, depth, bbox, 'box')
    if pose is not None:
        print("Estimated pose:")
        print(pose)
        # ICP refinement
        pose_refined = estimator.refine_with_icp(pose, depth, 'box')
        print("\nRefined pose:")
        print(pose_refined)
    else:
        print("Pose estimation failed")
```
3.3 Visual SLAM
Mobile robots need simultaneous localization and mapping.
3.3.1 ORB-SLAM3 Integration
```python
#!/usr/bin/env python3
"""
ORB-SLAM3 integration example.
Supports monocular, stereo, and RGB-D sensors.
"""
import numpy as np
import cv2

try:
    from orbslam3 import System, Sensor
    ORBSLAM_AVAILABLE = True
except ImportError:
    ORBSLAM_AVAILABLE = False
    print("Warning: ORB-SLAM3 not installed")

class VisualSLAM:
    def __init__(self, vocab_path, settings_path, sensor_type='RGBD'):
        """
        Initialize the SLAM system.
        Args:
            vocab_path: ORB vocabulary file (ORBvoc.txt)
            settings_path: camera configuration file
            sensor_type: 'MONOCULAR', 'STEREO', or 'RGBD'
        """
        if not ORBSLAM_AVAILABLE:
            raise RuntimeError("ORB-SLAM3 not available")
        sensor_map = {
            'MONOCULAR': Sensor.MONOCULAR,
            'STEREO': Sensor.STEREO,
            'RGBD': Sensor.RGBD
        }
        self.slam = System(
            vocab_path,
            settings_path,
            sensor_map[sensor_type]
        )
        self.slam.initialize()
        self.trajectory = []
        self.map_points = []
        print(f"SLAM initialized with {sensor_type} mode")

    def track_rgbd(self, rgb_image, depth_image, timestamp):
        """
        Process one RGB-D frame.
        Returns:
            pose: 4x4 camera pose (world frame)
            tracking_state: tracking status
        """
        # Convert to grayscale
        if len(rgb_image.shape) == 3:
            gray = cv2.cvtColor(rgb_image, cv2.COLOR_BGR2GRAY)
        else:
            gray = rgb_image
        # SLAM tracking
        pose = self.slam.process_image_rgbd(gray, depth_image, timestamp)
        # Record the trajectory
        if pose is not None:
            self.trajectory.append({
                'timestamp': timestamp,
                'pose': pose.copy()
            })
        tracking_state = self.slam.get_tracking_state()
        return pose, tracking_state

    def get_map_points(self):
        """Return the map points."""
        return self.slam.get_all_map_points()

    def save_trajectory(self, filename='trajectory.txt'):
        """Save the trajectory (TUM format)."""
        with open(filename, 'w') as f:
            for item in self.trajectory:
                t = item['timestamp']
                pose = item['pose']
                # Extract position and quaternion
                tx, ty, tz = pose[:3, 3]
                R = pose[:3, :3]
                q = self.rotation_to_quaternion(R)
                # TUM format: timestamp tx ty tz qx qy qz qw
                f.write(f"{t} {tx} {ty} {tz} {q[0]} {q[1]} {q[2]} {q[3]}\n")
        print(f"Trajectory saved to {filename}")

    def save_map(self, filename='map.ply'):
        """Save the map as a point cloud."""
        map_points = self.get_map_points()
        if len(map_points) == 0:
            print("No map points to save")
            return
        # PLY format
        with open(filename, 'w') as f:
            f.write("ply\n")
            f.write("format ascii 1.0\n")
            f.write(f"element vertex {len(map_points)}\n")
            f.write("property float x\n")
            f.write("property float y\n")
            f.write("property float z\n")
            f.write("end_header\n")
            for pt in map_points:
                f.write(f"{pt[0]} {pt[1]} {pt[2]}\n")
        print(f"Map saved to {filename}")

    @staticmethod
    def rotation_to_quaternion(R):
        """Rotation matrix to quaternion (xyzw order)."""
        from scipy.spatial.transform import Rotation
        r = Rotation.from_matrix(R)
        return r.as_quat()  # [x, y, z, w]

    def shutdown(self):
        """Shut down SLAM."""
        self.slam.shutdown()

# Usage example (ROS integration)
if __name__ == '__main__':
    import rospy
    from sensor_msgs.msg import Image
    from cv_bridge import CvBridge

    rospy.init_node('visual_slam_node')
    # Initialize SLAM
    slam = VisualSLAM(
        '/path/to/ORBvoc.txt',
        '/path/to/camera.yaml',
        'RGBD'
    )
    bridge = CvBridge()
    # Latest frames
    rgb_cache = None
    depth_cache = None

    def rgb_callback(msg):
        global rgb_cache
        rgb_cache = bridge.imgmsg_to_cv2(msg, 'bgr8')

    def depth_callback(msg):
        global depth_cache
        depth_cache = bridge.imgmsg_to_cv2(msg, 'passthrough')

    # Subscribe to the camera topics
    rospy.Subscriber('/camera/color/image_raw', Image, rgb_callback)
    rospy.Subscriber('/camera/depth/image_raw', Image, depth_callback)

    rate = rospy.Rate(30)  # 30 Hz
    try:
        while not rospy.is_shutdown():
            if rgb_cache is not None and depth_cache is not None:
                timestamp = rospy.Time.now().to_sec()
                pose, state = slam.track_rgbd(rgb_cache, depth_cache, timestamp)
                if pose is not None:
                    print(f"Tracking OK: {pose[0,3]:.2f}, {pose[1,3]:.2f}, {pose[2,3]:.2f}")
                else:
                    print("Tracking lost")
            rate.sleep()
    except KeyboardInterrupt:
        print("\nShutting down...")
        slam.save_trajectory('slam_trajectory.txt')
        slam.save_map('slam_map.ply')
        slam.shutdown()
```
3.4 Multi-Sensor Fusion
Fusing vision, lidar, and IMU data improves localization accuracy and robustness.
3.4.1 Extended Kalman Filter Fusion
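The implementation below follows the standard EKF predict/update cycle; for reference, the underlying equations are:

$$
\begin{aligned}
\text{Predict:}\quad & \hat{x}_k = f(x_{k-1}, u_k), \qquad P_k^- = F_k P_{k-1} F_k^{\top} + Q \\
\text{Update:}\quad & K_k = P_k^- H_k^{\top}\,(H_k P_k^- H_k^{\top} + R)^{-1} \\
& x_k = \hat{x}_k + K_k\,\big(z_k - h(\hat{x}_k)\big), \qquad P_k = (I - K_k H_k)\,P_k^-
\end{aligned}
$$

where $f$ is the IMU-driven motion model, $h$ the measurement model (here the visual-odometry pose), $Q$ and $R$ the process and measurement noise covariances, and $K_k$ the Kalman gain.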
```python
#!/usr/bin/env python3
"""
Sensor fusion with an EKF (Extended Kalman Filter).
Fuses IMU, visual odometry, and wheel odometry.
"""
import numpy as np
from scipy.spatial.transform import Rotation

class SensorFusion:
    def __init__(self):
        # State vector: [x, y, z, vx, vy, vz, qw, qx, qy, qz, bwx, bwy, bwz, bax, bay, baz]
        # position (3) + velocity (3) + attitude quaternion (4) + gyro bias (3) + accel bias (3) = 16 dims
        self.state = np.zeros(16)
        self.state[6] = 1.0  # initial attitude is the identity quaternion
        # Covariance matrix
        self.P = np.eye(16) * 0.1
        # Process noise
        self.Q = np.eye(16) * 0.01
        # Gravity vector
        self.gravity = np.array([0, 0, -9.81])

    def predict(self, imu_data, dt):
        """
        Prediction step (IMU data).
        Args:
            imu_data: {'gyro': [wx, wy, wz], 'accel': [ax, ay, az]}
            dt: time step
        """
        # Unpack the state
        pos = self.state[0:3]
        vel = self.state[3:6]
        quat = self.state[6:10]
        gyro_bias = self.state[10:13]
        accel_bias = self.state[13:16]

        # Remove biases
        gyro = np.array(imu_data['gyro']) - gyro_bias
        accel = np.array(imu_data['accel']) - accel_bias

        # Rotation matrix (scipy expects xyzw order)
        R = Rotation.from_quat([quat[1], quat[2], quat[3], quat[0]]).as_matrix()

        # State propagation
        # 1. Position: p = p + v*dt + 0.5*a*dt^2
        accel_world = R @ accel + self.gravity
        pos_new = pos + vel * dt + 0.5 * accel_world * dt**2
        # 2. Velocity: v = v + a*dt
        vel_new = vel + accel_world * dt
        # 3. Attitude: quaternion kinematics q_dot = 0.5 * Omega(w) * q
        omega_matrix = np.array([
            [0,       -gyro[0], -gyro[1], -gyro[2]],
            [gyro[0],  0,        gyro[2], -gyro[1]],
            [gyro[1], -gyro[2],  0,        gyro[0]],
            [gyro[2],  gyro[1], -gyro[0],  0]
        ])
        quat_new = quat + 0.5 * omega_matrix @ quat * dt
        quat_new = quat_new / np.linalg.norm(quat_new)  # normalize

        # Write back the state
        self.state[0:3] = pos_new
        self.state[3:6] = vel_new
        self.state[6:10] = quat_new

        # State-transition Jacobian (simplified)
        F = np.eye(16)
        F[0:3, 3:6] = np.eye(3) * dt
        # Covariance prediction
        self.P = F @ self.P @ F.T + self.Q

    def update_vision(self, visual_pose):
        """
        Update step (visual odometry).
        Args:
            visual_pose: 4x4 pose matrix
        """
        # Extract the measurement
        pos_meas = visual_pose[:3, 3]
        R_meas = visual_pose[:3, :3]
        quat_meas = Rotation.from_matrix(R_meas).as_quat()  # [x, y, z, w]
        quat_meas = np.array([quat_meas[3], quat_meas[0], quat_meas[1], quat_meas[2]])  # to [w, x, y, z]

        # Measurement vector z = [x, y, z, qw, qx, qy, qz]
        z = np.concatenate([pos_meas, quat_meas])
        # Predicted measurement
        h = np.concatenate([self.state[0:3], self.state[6:10]])
        # Innovation (measurement - prediction)
        innovation = z - h
        # Quaternion innovation needs special handling (error quaternion)
        q_meas = quat_meas
        q_pred = self.state[6:10]
        q_error = self.quaternion_multiply(q_meas, self.quaternion_inverse(q_pred))
        innovation[3:7] = q_error

        # Measurement Jacobian
        H = np.zeros((7, 16))
        H[0:3, 0:3] = np.eye(3)   # position measurement
        H[3:7, 6:10] = np.eye(4)  # attitude measurement
        # Measurement noise
        R = np.diag([0.01, 0.01, 0.01, 0.001, 0.001, 0.001, 0.001])

        # Kalman gain
        S = H @ self.P @ H.T + R
        K = self.P @ H.T @ np.linalg.inv(S)
        # State update
        self.state = self.state + K @ innovation
        # Re-normalize the quaternion
        self.state[6:10] = self.state[6:10] / np.linalg.norm(self.state[6:10])
        # Covariance update
        self.P = (np.eye(16) - K @ H) @ self.P

    def get_pose(self):
        """Return the current fused pose."""
        pose = np.eye(4)
        pose[:3, 3] = self.state[0:3]
        quat_wxyz = self.state[6:10]
        quat_xyzw = [quat_wxyz[1], quat_wxyz[2], quat_wxyz[3], quat_wxyz[0]]
        pose[:3, :3] = Rotation.from_quat(quat_xyzw).as_matrix()
        return pose

    @staticmethod
    def quaternion_multiply(q1, q2):
        """Quaternion product (wxyz order)."""
        w1, x1, y1, z1 = q1
        w2, x2, y2, z2 = q2
        return np.array([
            w1*w2 - x1*x2 - y1*y2 - z1*z2,
            w1*x2 + x1*w2 + y1*z2 - z1*y2,
            w1*y2 - x1*z2 + y1*w2 + z1*x2,
            w1*z2 + x1*y2 - y1*x2 + z1*w2
        ])

    @staticmethod
    def quaternion_inverse(q):
        """Quaternion inverse."""
        w, x, y, z = q
        return np.array([w, -x, -y, -z]) / np.dot(q, q)

# Usage example
if __name__ == '__main__':
    fusion = SensorFusion()
    # Simulated data
    import time
    for i in range(100):
        # IMU data (100 Hz)
        imu = {
            'gyro': [0.01, -0.02, 0.05],
            'accel': [0.1, -0.05, 9.8]
        }
        fusion.predict(imu, dt=0.01)
        # Vision update (10 Hz)
        if i % 10 == 0:
            visual_pose = np.eye(4)
            visual_pose[:3, 3] = [i * 0.01, 0, 0]  # simulated forward motion
            fusion.update_vision(visual_pose)
        # Read out the fused pose
        pose = fusion.get_pose()
        if i % 10 == 0:
            print(f"Step {i}: Position = {pose[:3, 3]}")
        time.sleep(0.01)
```
4. Performance Optimization in Practice
4.1 Model Quantization
Converting an FP32 model to INT8 typically yields a 3-4x speedup while shrinking the model by roughly 70%.
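As a rough sanity check using the YOLOv8s figures from Section 3.1: 11.2M parameters × 4 bytes ≈ 44.8 MB of FP32 weights versus ≈ 11.2 MB at 1 byte per weight in INT8, i.e. about a 75% reduction before activations and engine overhead are counted.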
4.1.1 TensorRT INT8 Quantization
```python
#!/usr/bin/env python3
"""
Complete TensorRT INT8 quantization workflow.
Covers calibration dataset preparation, quantization, and accuracy validation.
"""
import numpy as np
import cv2
import tensorrt as trt
import pycuda.driver as cuda
import pycuda.autoinit

class Int8Calibrator(trt.IInt8EntropyCalibrator2):
    """INT8 calibrator."""
    def __init__(self, calibration_images, batch_size=8, cache_file='calibration.cache'):
        trt.IInt8EntropyCalibrator2.__init__(self)
        self.batch_size = batch_size
        self.cache_file = cache_file
        # Prepare the calibration data
        self.calibration_data = []
        for img_path in calibration_images:
            img = cv2.imread(img_path)
            img = self.preprocess(img)
            self.calibration_data.append(img)
        self.calibration_data = np.array(self.calibration_data)
        self.current_index = 0
        # Allocate GPU memory
        self.device_input = cuda.mem_alloc(
            self.calibration_data[0].nbytes * batch_size
        )

    def preprocess(self, image):
        """Preprocessing (must match the inference pipeline)."""
        img = cv2.resize(image, (640, 640))
        img = img.astype(np.float32) / 255.0
        img = img.transpose(2, 0, 1)
        return img

    def get_batch_size(self):
        return self.batch_size

    def get_batch(self, names):
        """Provide one calibration batch."""
        if self.current_index + self.batch_size > len(self.calibration_data):
            return None
        batch = self.calibration_data[
            self.current_index:self.current_index + self.batch_size
        ]
        batch = np.ascontiguousarray(batch)
        cuda.memcpy_htod(self.device_input, batch)
        self.current_index += self.batch_size
        return [int(self.device_input)]

    def read_calibration_cache(self):
        """Read a cached calibration table if one exists."""
        import os
        if os.path.exists(self.cache_file):
            with open(self.cache_file, 'rb') as f:
                return f.read()
        return None

    def write_calibration_cache(self, cache):
        """Write the calibration table cache."""
        with open(self.cache_file, 'wb') as f:
            f.write(cache)

def build_int8_engine(onnx_file, calibration_images, output_file='model_int8.trt'):
    """Build an INT8 engine."""
    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
    # Create the builder
    builder = trt.Builder(TRT_LOGGER)
    network = builder.create_network(
        1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    )
    parser = trt.OnnxParser(network, TRT_LOGGER)
    # Parse the ONNX model
    print(f"Loading ONNX file: {onnx_file}")
    with open(onnx_file, 'rb') as model:
        if not parser.parse(model.read()):
            for error in range(parser.num_errors):
                print(parser.get_error(error))
            return None
    # Builder configuration
    config = builder.create_builder_config()
    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 2 << 30)  # 2 GB
    # Enable INT8
    config.set_flag(trt.BuilderFlag.INT8)
    # Attach the calibrator
    calibrator = Int8Calibrator(calibration_images, batch_size=8)
    config.int8_calibrator = calibrator
    # Build the engine
    print("Building INT8 engine (this may take a while)...")
    engine = builder.build_serialized_network(network, config)
    if engine is None:
        print("Failed to build engine")
        return None
    # Save
    with open(output_file, 'wb') as f:
        f.write(engine)
    print(f"INT8 engine saved to {output_file}")
    return output_file

# Accuracy validation
def validate_quantized_model(fp32_engine, int8_engine, test_images):
    """Compare accuracy before and after quantization."""
    # Load both engines
    # ... (engine loading omitted; run_inference() is assumed to wrap the
    # TensorRT execution shown in Section 3.1)
    # Compare inference outputs
    results_fp32 = []
    results_int8 = []
    for img in test_images:
        # FP32 inference
        out_fp32 = run_inference(fp32_engine, img)
        results_fp32.append(out_fp32)
        # INT8 inference
        out_int8 = run_inference(int8_engine, img)
        results_int8.append(out_int8)
    # Quantify the difference
    mse = np.mean([(r1 - r2)**2 for r1, r2 in zip(results_fp32, results_int8)])
    print(f"MSE between FP32 and INT8: {mse}")
    # A large MSE means the quantization strategy needs adjusting
    if mse > 0.01:
        print("Warning: Large accuracy drop detected!")

# Usage example
if __name__ == '__main__':
    # Prepare the calibration dataset (representative samples)
    calibration_images = [
        'calibration/img_001.jpg',
        'calibration/img_002.jpg',
        # ... add more images (100-500 recommended)
    ]
    # Build the INT8 engine
    build_int8_engine(
        'yolov8s.onnx',
        calibration_images,
        'yolov8s_int8.trt'
    )
```