手部检测 yolov5 实战笔记

手部检测
yolo-v5
检测保存json:
python 复制代码
# -*- coding:utf-8 -*-
import warnings
warnings.filterwarnings("ignore")
import argparse
import os
import json
import random
import time
import cv2
from utils.datasets import *
from utils.utils import *

def detect(save_img=False):
    # --- 解析参数
    source, weights, half, imgsz = \
        opt.source, opt.weights, opt.half, opt.img_size

    # --- 初始化设备
    device = torch_utils.select_device(opt.device)

    # --- 加载模型
    ckpt = torch.load(weights, weights_only=False, map_location="cpu")
    model = ckpt['model'].float().to(device).eval()
    names = model.names if hasattr(model, 'names') else model.modules.names

    # --- 设置半精度
    half = half and device.type != 'cpu'
    if half:
        model.half()

    save_dir = source[:-4]
    # --- 初始化视频流
    cap = cv2.VideoCapture(source)
    if not cap.isOpened():
        raise Exception(f"无法打开视频: {source}")

    os.makedirs(save_dir, exist_ok=True)

    frame_count = 0
    t0 = time.time()

    while True:
        ret, img0 = cap.read()
        if not ret:
            break

        frame_count += 1
        h0, w0 = img0.shape[:2]

        # --- 预处理输入图像
        img = letterbox(img0, new_shape=imgsz)[0]
        img = img[:, :, ::-1].transpose(2, 0, 1)  # BGR → RGB
        img = np.ascontiguousarray(img)
        img = torch.from_numpy(img).to(device)
        img = img.half() if half else img.float()
        img /= 255.0
        if img.ndimension() == 3:
            img = img.unsqueeze(0)

        # --- 推理
        t1 = torch_utils.time_synchronized()
        pred = model(img, augment=opt.augment)[0]
        t2 = torch_utils.time_synchronized()

        if half:
            pred = pred.float()

        # --- NMS
        pred = non_max_suppression(pred, opt.conf_thres, opt.iou_thres,
                                   classes=opt.classes, agnostic=opt.agnostic_nms)

        # --- 遍历检测结果
        for i, det in enumerate(pred):
            if det is None or len(det) == 0:
                continue

            # --- 转换为原图坐标
            det[:, :4] = scale_coords(img.shape[2:], det[:, :4], img0.shape).round()

            shapes = []
            for *xyxy, conf, cls in det:
                x1, y1, x2, y2 = [float(x) for x in xyxy]
                label = f"{names[int(cls)]}"

                shapes.append({
                    "label": "hand",
                    "points": [[x1, y1], [x2, y2]],
                    "group_id": 0,
                    "description": "",
                    "shape_type": "rectangle",
                    "flags": {}
                })

                # # 画框可视化
                # plot_one_box(xyxy, img0, label=f"{label} {conf:.2f}",
                #              color=(0, 255, 0), line_thickness=2)

            # --- 生成 JSON
            json_dict = {
                "version": "5.3.1",
                "flags": {},
                "imageData": None,
                "imageHeight": h0,
                "imageWidth": w0,
                "imagePath": f"frame_{frame_count:04d}.jpg",
                "shapes": shapes
            }

            json_path = os.path.join(save_dir, f"frame_{frame_count:04d}.json")
            with open(json_path, "w", encoding="utf-8") as f:
                json.dump(json_dict, f, ensure_ascii=False, indent=4)

            img_path=os.path.join(save_dir, f"frame_{frame_count:04d}.jpg")
            cv2.imwrite(img_path, img0)

            print(f"[Frame {frame_count}] 检测到 {len(shapes)} 个目标 → {json_path}")

        # --- 显示结果
        cv2.imshow("YOLOv5 Detect Hands", img0)
        if cv2.waitKey(1) == 27:  # ESC退出
            break

    cap.release()
    cv2.destroyAllWindows()
    print(f"推理完成，共 {frame_count} 帧，耗时 {time.time() - t0:.2f}s")


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--weights', type=str, default='hand_m.pt', help='model.pt path')
    # parser.add_argument('--source', type=str, default=r"D:\data\jiezhi\det_1201\20251201-201906.mp4", help='source')
    parser.add_argument('--source', type=str, default=r"D:\data\jiezhi\det_1201\20251201-201944.mp4", help='source')
    parser.add_argument('--img-size', type=int, default=640, help='inference size (pixels)')
    parser.add_argument('--conf-thres', type=float, default=0.25, help='object confidence threshold')
    parser.add_argument('--iou-thres', type=float, default=0.45, help='IOU threshold for NMS')
    parser.add_argument('--fourcc', type=str, default='mp4v', help='output video codec')
    parser.add_argument('--half', default=False, help='half precision FP16 inference')
    parser.add_argument('--device', default='', help='cuda device, i.e. 0 or cpu')
    parser.add_argument('--classes', nargs='+', type=int, help='filter by class')
    parser.add_argument('--agnostic-nms', action='store_true', help='class-agnostic NMS')
    parser.add_argument('--augment', default=False, help='augmented inference')
    opt = parser.parse_args()
    print(opt)

    with torch.no_grad():
        detect(save_img=True)