7.Gemini相机+yoloseg+foundationpose环境搭建及应用

系统:ubuntu22.04 ROS2

NVIDIA-SMI 535.309.01 Driver Version: 535.309.01 CUDA Version: 12.2

由于ros2环境是自带的python10,而foundationpose环境是python11,所以该demo用两个环境实现功能,ros2负责获取相机参数以及分割目标物体,foundationpose接收数据后获取目标物体的位姿态,中间由socket进行发送信息和接收信息。

1.最简单的代码

1.1 ros2获取相机参数,处理数据

代码如下:

复制代码
#!/usr/bin/env python3
import rclpy
from rclpy.node import Node
from sensor_msgs.msg import Image, CameraInfo
from cv_bridge import CvBridge
import cv2
import numpy as np
from ultralytics import YOLO
import socket
import json
import base64

# -------- Socket 配置 --------
HOST = '127.0.0.1'
PORT = 12345

class CameraSenderNode(Node):
    def __init__(self):
        super().__init__('ros2_camera_sender')
        self.bridge = CvBridge()
        self.rgb_img = None
        self.depth_img = None
        self.K = None

        # YOLO
        self.yolo = YOLO("/home/wyq/ros2_ws/best_v3.pt")
        self.yolo.to("cpu")

        # Socket Server
        self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.sock.bind((HOST, PORT))
        self.sock.listen(1)
        self.conn = None
        self.get_logger().info(f"TCP Server listening {HOST}:{PORT}")

        # 订阅
        self.create_subscription(Image, '/camera/color/image_raw', self.rgb_cb, 10)
        self.create_subscription(Image, '/camera/depth/image_raw', self.depth_cb, 10)
        self.create_subscription(CameraInfo, '/camera/color/camera_info', self.caminfo_cb, 10)

        # 后台接受连接
        import threading
        threading.Thread(target=self.accept_conn, daemon=True).start()

    def accept_conn(self):
        while True:
            self.conn, addr = self.sock.accept()
            self.get_logger().info(f"FP Connected: {addr}")

    def caminfo_cb(self, msg):
        self.K = np.array(msg.k).reshape(3,3)

    def rgb_cb(self, msg):
        self.rgb_img = self.bridge.imgmsg_to_cv2(msg, 'bgr8')

    def depth_cb(self, msg):
        # 转cv2
        dtype = np.uint16 if msg.encoding == "16UC1" else np.uint8
        depth_data = np.ndarray(
            shape=(msg.height, msg.width, 1), dtype=dtype, buffer=msg.data
        )
        self.depth_img = depth_data
        self.process()

    def process(self):
        if self.rgb_img is None or self.depth_img is None or self.K is None:
            return
        rgb = self.rgb_img
        depth_raw = self.depth_img
        h, w = rgb.shape[:2]

        # 深度对齐 + 归一化
        depth_aligned = cv2.resize(depth_raw, (w, h), interpolation=cv2.INTER_NEAREST)
        depth = depth_aligned.astype(np.float32) / 1000.0
        depth = np.clip(depth, 0.1, 3.0)
        depth[np.isnan(depth)] = 1.0
        depth[np.isinf(depth)] = 1.0

        # YOLO
        res = self.yolo.predict(rgb, conf=0.6, verbose=False)
        mask = np.zeros((h,w), dtype=np.uint8)
        if len(res)>0 and res[0].masks is not None:
            m = res[0].masks.data[0].cpu().numpy()
            m = cv2.resize(m, (w,h))
            mask = (m>0.5).astype(np.uint8)

        # 掩码简单提纯(保持你逻辑)
        kernel_close = cv2.getStructuringElement(cv2.MORPH_RECT, (7,7))
        mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, kernel_close, iterations=2)

        # 打包发送(json+base64)
        def b64(arr):
            return base64.b64encode(arr.tobytes()).decode('utf-8')

        data = {
            'rgb_b64': b64(rgb),
            'rgb_shape': list(rgb.shape),
            'depth_b64': b64(depth),
            'depth_shape': list(depth.shape),
            'mask_b64': b64(mask),
            'mask_shape': list(mask.shape),
            'K': self.K.tolist()
        }
        if self.conn:
            try:
                self.conn.sendall((json.dumps(data)+'\n').encode('utf-8'))
            except Exception as e:
                self.get_logger().error(f"Send err: {e}")
                self.conn = None

        # 可视化
        overlay = cv2.addWeighted(rgb, 0.7, cv2.cvtColor(mask*255,cv2.COLOR_GRAY2BGR), 0.3,0)
        cv2.imshow("overlay", overlay)
        cv2.waitKey(1)

def main(args=None):
    rclpy.init(args=args)
    node = CameraSenderNode()
    rclpy.spin(node)
    cv2.destroyAllWindows()
    rclpy.shutdown()

if __name__ == '__main__':
    main()

1.2 foundationpose接收数据,获取目标物的位姿态

代码如下:

复制代码
#!/usr/bin/env python3
import socket
import json
import base64
import numpy as np
import cv2
import trimesh
import sys

# -------- 路径 + FoundationPose --------
sys.path.insert(0, "/home/wyq/ros2_ws/FoundationPose")
from estimater import FoundationPose

# -------- Socket --------
HOST = '127.0.0.1'
PORT = 12345

# -------- 初始化FP --------
mesh = trimesh.load("/home/wyq/ros2_ws/phones.obj")
model_pts = mesh.vertices.astype(np.float32)
model_normals = mesh.vertex_normals.astype(np.float32)
fp = FoundationPose(
    model_pts=model_pts,
    model_normals=model_normals,
    mesh=mesh,
    symmetry_tfs=None,
    debug=0
)
initialized = False

def from_b64(s, shape):
    return np.frombuffer(base64.b64decode(s), dtype=np.uint8).reshape(shape)

def main():
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    while True:
        try:
            sock.connect((HOST, PORT))
            print("Connected to ROS2 sender")
            break
        except Exception as e:
            print("Waiting ROS2...", e)
            import time
            time.sleep(2)

    buf = b''
    global initialized
    while True:
        data = sock.recv(65536)
        if not data:
            break
        buf += data
        while b'\n' in buf:
            line, buf = buf.split(b'\n', 1)
            try:
                d = json.loads(line.decode('utf-8'))
                # 解包
                rgb = from_b64(d['rgb_b64'], d['rgb_shape'])
                depth = np.frombuffer(base64.b64decode(d['depth_b64']), dtype=np.float32).reshape(d['depth_shape'])
                mask = from_b64(d['mask_b64'], d['mask_shape'])
                K = np.array(d['K'])

                # FP
                if not initialized:
                    print("⚡ FP init...")
                    pose = fp.register(K=K, rgb=rgb, depth=depth, ob_mask=mask, iteration=2)
                    initialized = True
                    print("✅ Init done, tracking...")
                else:
                    pose = fp.track_one(rgb=rgb, depth=depth, K=K, iteration=2)

                print("\n🎯 Pose:\n", pose)

            except Exception as e:
                print("ERR:", e)

if __name__ == '__main__':
    main()

代码放在foundationpose的根目录。

2.环境搭建

两个代码用了两个环境,可以分别搭建环境,分别验证环境搭建是否成功。

2.1 Ros2及分割算法环境搭建

RTX 4060 支持 CUDA 12.x,最稳组合:

  • Python:3.10(兼容 ROS2 Humble)
  • PyTorch:2.4.x / 2.5.x
  • CUDA:12.1(cu121)

创建 conda 环境

复制代码
conda create -n seg_pose python=3.10 -y
conda activate seg_pose

安装 PyTorchCUDA 12.x(12.0/12.1/12.2)

复制代码
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

验证是否成功(必须做)

复制代码
python -c "import torch; print('torch版本:', torch.__version__)"
python -c "print('CUDA可用:', torch.cuda.is_available())"
python -c "print('GPU:', torch.cuda.get_device_name(0))"
python -c "print('torch使用的CUDA:', torch.version.cuda)"

安装分割

复制代码
# 分割模型(YOLO)
pip install ultralytics opencv-python numpy pillow

# 降级numpy
pip install numpy==1.24.4

#验证ultralytics是否可用
python -c "from ultralytics import YOLO; print('✅ YOLO OK')"

最后下载需要的模型后运行代码即可。

2.2 foundationpose环境安装

这里按照官网在本地安装,也可以用docker安装,可以看官网https://github.com/NVlabs/FoundationPose

创建 conda 环境

复制代码
conda env create -f environment.yml
conda activate foundationpose

安装prtorch

复制代码
python -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124

安装PyTorch3D and NVDiffRast

复制代码
export CUDA_HOME=/usr/local/cuda   # or e.g. /usr/local/cuda-12.8
export PATH="$CUDA_HOME/bin:$PATH"
python -m pip install --no-build-isolation "git+https://github.com/facebookresearch/pytorch3d.git"
python -m pip install --no-build-isolation "git+https://github.com/NVlabs/nvdiffrast.git"

也可以先下载zip文件,再安装。

再安装python依赖的其他库

复制代码
python -m pip install -r requirements.txt
bash build_all_conda.sh
相关推荐
数智工坊13 小时前
《计算机双目立体视觉》高宏伟:第3章-对极几何及其恢复方法
笔记·数码相机
双翌视觉13 小时前
线扫描成像技术,高速运动物体的“无限视野”
人工智能·数码相机·计算机视觉
神仙别闹13 小时前
基于QT(C++)+SQL Server 2008 实现相机租赁系统
开发语言·c++·数码相机
数智工坊14 小时前
《计算机双目立体视觉》高宏伟:第5章-三维重建
笔记·数码相机
yeflx14 小时前
鱼眼相机人体检测
数码相机
2601_9574188015 小时前
相机如何连接手机?通俗易懂的PTP/MTP连接原理解析
android·数码相机·架构
Angelina_Jolie1 天前
基于边缘特征的相机图像-雷达点云多模态高精度配准
数码相机
ZPC82102 天前
双目相机 + 点云 + YOLO 是机械臂抓取最标准、最精准的方案!
数码相机·yolo
sali-tec2 天前
C# 基于OpenCv的视觉工作流-章78-KRT测量
图像处理·人工智能·数码相机·opencv·算法·计算机视觉