Annotating Video Pose Keypoints with MediaPipe

Introduction

Sign-language video recognition falls into two broad categories: feeding the video directly into a network, or extracting keypoints first and feeding those into a network. In this post I will walk through how to annotate sign-language videos with keypoints using MediaPipe.

Code

If you just want the code, here it is. You will need to set up the environment yourself; I no longer remember the exact versions, but judging from the imports it should be essentially pip install mediapipe opencv-python numpy.

python
import os
import cv2
import numpy as np
import mediapipe as mp

# Keypoint filtering settings
filtered_hand = list(range(21))
filtered_pose = [11, 12, 13, 14, 15, 16]  # keep only the shoulder, elbow, and wrist keypoints
HAND_NUM = len(filtered_hand)
POSE_NUM = len(filtered_pose)

# Initialize the MediaPipe models (with tuned detection parameters)
mp_hands = mp.solutions.hands
mp_pose = mp.solutions.pose

hands = mp_hands.Hands(
    static_image_mode=False,
    max_num_hands=2,
    min_detection_confidence=0.1,  # too high and undetected hands are simply skipped; a low value detects more (better when there is little background clutter)
    min_tracking_confidence=0.1  # too high and lost tracks are abandoned; a low value gives better temporal continuity
)

pose = mp_pose.Pose(
    static_image_mode=False,
    model_complexity=1,
    min_detection_confidence=0.7,
    min_tracking_confidence=0.5
)


def get_frame_landmarks(frame):
    """获取单帧关键点(修复线程安全问题)"""
    all_landmarks = np.full((HAND_NUM * 2 + POSE_NUM, 3), np.nan)  # 初始化为NaN

    # 改为顺序执行确保数据可靠性
    # 手部关键点
    results_hands = hands.process(frame)
    if results_hands.multi_hand_landmarks:
        for i, hand_landmarks in enumerate(results_hands.multi_hand_landmarks[:2]):  # at most two hands
            hand_type = results_hands.multi_handedness[i].classification[0].index
            points = np.array([(lm.x, lm.y, lm.z) for lm in hand_landmarks.landmark])
            if hand_type == 0:  # right hand
                all_landmarks[:HAND_NUM] = points
            else:  # left hand
                all_landmarks[HAND_NUM:HAND_NUM * 2] = points

    # Body keypoints
    results_pose = pose.process(frame)
    if results_pose.pose_landmarks:
        pose_points = np.array([(lm.x, lm.y, lm.z) for lm in results_pose.pose_landmarks.landmark])
        all_landmarks[HAND_NUM * 2:HAND_NUM * 2 + POSE_NUM] = pose_points[filtered_pose]

    return all_landmarks


def get_video_landmarks(video_path, start_frame=1, end_frame=-1):
    """获取视频关键点(添加调试信息)"""
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"无法打开视频文件: {video_path}")
        return np.empty((0, HAND_NUM * 2 + POSE_NUM, 3))

    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    if end_frame < 0 or end_frame > total_frames:
        end_frame = total_frames

    valid_frames = []
    frame_index = 0

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret or frame_index > end_frame:
            break

        if frame_index >= start_frame:
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            landmarks = get_frame_landmarks(frame_rgb)

            # Check whether any valid keypoints were detected
            if not np.all(np.isnan(landmarks)):
                valid_frames.append(landmarks)
            else:
                print(f"No keypoints detected in frame {frame_index}")

        frame_index += 1

    cap.release()

    if not valid_frames:
        print("警告:未检测到任何关键点")
        return np.empty((0, HAND_NUM * 2 + POSE_NUM, 3))

    return np.stack(valid_frames)


def draw_landmarks(video_path, output_path, landmarks):
    """绘制关键点到视频"""
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"无法打开视频文件: {video_path}")
        return

    fps = int(cap.get(cv2.CAP_PROP_FPS))
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    landmark_index = 0
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        if landmark_index < len(landmarks):
            # Draw the keypoints
            for i, (x, y, _) in enumerate(landmarks[landmark_index]):
                if not np.isnan(x) and not np.isnan(y):
                    px, py = int(x * width), int(y * height)
                    # right hand green, left hand red, body blue (BGR)
                    color = (0, 255, 0) if i < HAND_NUM else \
                        (0, 0, 255) if i < HAND_NUM * 2 else \
                            (255, 0, 0)
                    cv2.circle(frame, (px, py), 4, color, -1)

            landmark_index += 1

        out.write(frame)

    cap.release()
    out.release()


# Process all videos
video_root = "./doc/补充版/正式数据集/"
output_root = "./doc/save/"

# Create the output directories, including the npy/ and MP4/ subdirectories used below
os.makedirs(os.path.join(output_root, "npy"), exist_ok=True)
os.makedirs(os.path.join(output_root, "MP4"), exist_ok=True)

for video_name in os.listdir(video_root):
    if not video_name.endswith(('.mp4', '.avi', '.mov')):
        continue

    video_path = os.path.join(video_root, video_name)
    print(f"\n处理视频: {video_name}")

    # 获取关键点
    landmarks = get_video_landmarks(video_path)
    print(f"获取到 {len(landmarks)} 帧关键点")

    # Save the .npy file
    base_name = os.path.splitext(video_name)[0]
    np.save(os.path.join(output_root, "npy", f"{base_name}.npy"), landmarks)

    # Render the video with the keypoints drawn on
    output_video = os.path.join(output_root, "MP4", f"{base_name}_landmarks.mp4")
    draw_landmarks(video_path, output_video, landmarks)
print("全部处理完成!")

Usage is simple: set video_root to your video directory and output_root to the output directory, and the script is ready to run.

Pre-processing

python
# Keypoint filtering settings
filtered_hand = list(range(21))
filtered_pose = [11, 12, 13, 14, 15, 16]  # keep only the shoulder, elbow, and wrist keypoints
HAND_NUM = len(filtered_hand)
POSE_NUM = len(filtered_pose)

Here you pick the keypoints you need. Each hand normally has 21 keypoints, and for the pose (and face, if you use it) you can likewise choose which landmarks to keep; the index number of each landmark is easy to look up online.
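As a side note, the pose indices can also be spelled out with the PoseLandmark enum that ships with MediaPipe, which makes the filter self-documenting. A minimal equivalent sketch:

python
import mediapipe as mp

PL = mp.solutions.pose.PoseLandmark
# Same indices as [11, 12, 13, 14, 15, 16], but readable
filtered_pose = [
    PL.LEFT_SHOULDER.value,   # 11
    PL.RIGHT_SHOULDER.value,  # 12
    PL.LEFT_ELBOW.value,      # 13
    PL.RIGHT_ELBOW.value,     # 14
    PL.LEFT_WRIST.value,      # 15
    PL.RIGHT_WRIST.value,     # 16
]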

python
# Initialize the MediaPipe models (with tuned detection parameters)
mp_hands = mp.solutions.hands
mp_pose = mp.solutions.pose

hands = mp_hands.Hands(
    static_image_mode=False,
    max_num_hands=2,
    min_detection_confidence=0.1,  # too high and undetected hands are simply skipped; a low value detects more (better when there is little background clutter)
    min_tracking_confidence=0.1  # too high and lost tracks are abandoned; a low value gives better temporal continuity
)

pose = mp_pose.Pose(
    static_image_mode=False,
    model_complexity=1,
    min_detection_confidence=0.7,
    min_tracking_confidence=0.5
)

Parameter tuning: the hand and pose models can be tuned independently. static_image_mode says whether the input is a standalone image; False means it is not, and here the input is video. With video there is the additional min_tracking_confidence tracking threshold, whereas a still image has no temporal continuity, so that parameter does not apply. max_num_hands is the maximum number of hands to detect, and the comments above explain how to tune the two confidence thresholds. The pose parameters work much the same way, with a few differences you can look up yourself.
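For contrast, here is a minimal sketch of image mode (hand.jpg is a placeholder path): with static_image_mode=True every input is detected from scratch, so no tracking threshold is involved.

python
import cv2
import mediapipe as mp

# Image mode: detection runs on every input, no tracking between frames
hands_static = mp.solutions.hands.Hands(
    static_image_mode=True,
    max_num_hands=2,
    min_detection_confidence=0.5
)

image = cv2.imread("hand.jpg")  # placeholder path
results = hands_static.process(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
print("hands found:", len(results.multi_hand_landmarks or []))
hands_static.close()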

Function Walkthrough

python
def get_frame_landmarks(frame):
    """获取单帧关键点(修复线程安全问题)"""
    all_landmarks = np.full((HAND_NUM * 2 + POSE_NUM, 3), np.nan)  # 初始化为NaN

    # 改为顺序执行确保数据可靠性
    # 手部关键点
    results_hands = hands.process(frame)
    if results_hands.multi_hand_landmarks:
        for i, hand_landmarks in enumerate(results_hands.multi_hand_landmarks[:2]):  # at most two hands
            hand_type = results_hands.multi_handedness[i].classification[0].index
            points = np.array([(lm.x, lm.y, lm.z) for lm in hand_landmarks.landmark])
            if hand_type == 0:  # right hand
                all_landmarks[:HAND_NUM] = points
            else:  # left hand
                all_landmarks[HAND_NUM:HAND_NUM * 2] = points

    # Body keypoints
    results_pose = pose.process(frame)
    if results_pose.pose_landmarks:
        pose_points = np.array([(lm.x, lm.y, lm.z) for lm in results_pose.pose_landmarks.landmark])
        all_landmarks[HAND_NUM * 2:HAND_NUM * 2 + POSE_NUM] = pose_points[filtered_pose]

    return all_landmarks

This processes a single frame: first allocate the full keypoint array and fill it with NaN, then run the hand and body detectors separately and write the detected points into the array, which is returned at the end.
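A quick sanity check as a sketch (test.jpg is a placeholder path): the returned array always has HAND_NUM * 2 + POSE_NUM rows, and NaN rows mark the parts that were not detected.

python
import cv2
import numpy as np

frame = cv2.imread("test.jpg")  # placeholder path
landmarks = get_frame_landmarks(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

print(landmarks.shape)  # (48, 3): 21 + 21 hand points plus 6 pose points
detected = ~np.isnan(landmarks).any(axis=1)
print(f"{detected.sum()} of {len(landmarks)} keypoints detected")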

python
def get_video_landmarks(video_path, start_frame=1, end_frame=-1):
    """获取视频关键点(添加调试信息)"""
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"无法打开视频文件: {video_path}")
        return np.empty((0, HAND_NUM * 2 + POSE_NUM, 3))

    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    if end_frame < 0 or end_frame > total_frames:
        end_frame = total_frames

    valid_frames = []
    frame_index = 0

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret or frame_index > end_frame:
            break

        if frame_index >= start_frame:
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            landmarks = get_frame_landmarks(frame_rgb)

            # Check whether any valid keypoints were detected
            if not np.all(np.isnan(landmarks)):
                valid_frames.append(landmarks)
            else:
                print(f"No keypoints detected in frame {frame_index}")

        frame_index += 1

    cap.release()

    if not valid_frames:
        print("警告:未检测到任何关键点")
        return np.empty((0, HAND_NUM * 2 + POSE_NUM, 3))

    return np.stack(valid_frames)

This handles keypoint extraction for the whole video: each frame is read, converted from BGR to RGB, and passed to the single-frame function, and the per-frame results are stacked and returned. Note that frames where nothing is detected are skipped, so the returned array can be shorter than the video.
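Called on one clip (the path below is a placeholder), the result is a (frames, keypoints, xyz) array; a small sketch:

python
landmarks = get_video_landmarks("./doc/example.mp4")  # placeholder path
print(landmarks.shape)  # (T, 48, 3) for T frames with at least one detection
nan_ratio = np.isnan(landmarks).mean()
print(f"missing coordinates: {nan_ratio:.2%}")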

python
def draw_landmarks(video_path, output_path, landmarks):
    """绘制关键点到视频"""
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"无法打开视频文件: {video_path}")
        return

    fps = int(cap.get(cv2.CAP_PROP_FPS))
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    landmark_index = 0
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        if landmark_index < len(landmarks):
            # Draw the keypoints
            for i, (x, y, _) in enumerate(landmarks[landmark_index]):
                if not np.isnan(x) and not np.isnan(y):
                    px, py = int(x * width), int(y * height)
                    # right hand green, left hand red, body blue (BGR)
                    color = (0, 255, 0) if i < HAND_NUM else \
                        (0, 0, 255) if i < HAND_NUM * 2 else \
                            (255, 0, 0)
                    cv2.circle(frame, (px, py), 4, color, -1)

            landmark_index += 1

        out.write(frame)

    cap.release()
    out.release()

The drawing function takes the video path, the output path, and the keypoint array. It reads the video, draws a filled circle at every detected keypoint in each frame, and writes the frames back out to the output file.
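If you want a skeleton rather than bare dots, MediaPipe ships the hand connection topology as index pairs; a sketch for one hand's 21-point slice (frame, width, and height as in the function above):

python
import cv2
import numpy as np
import mediapipe as mp

def draw_hand_skeleton(frame, hand_points, width, height, color=(0, 255, 0)):
    """Draw lines between connected keypoints; hand_points is a (21, 3) slice."""
    for start, end in mp.solutions.hands.HAND_CONNECTIONS:
        p1, p2 = hand_points[start], hand_points[end]
        if not (np.isnan(p1[:2]).any() or np.isnan(p2[:2]).any()):
            cv2.line(frame,
                     (int(p1[0] * width), int(p1[1] * height)),
                     (int(p2[0] * width), int(p2[1] * height)),
                     color, 2)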

Summary

The overall pipeline is fairly clear. Because my data videos have simple backgrounds and false detections are rare, I set the confidence thresholds very low, yet some frames still end up dropped. One likely cause is that get_video_landmarks skips frames where nothing was detected, so the saved array is shorter than the video and draw_landmarks shifts later annotations onto the wrong frames; an alignment-preserving variant is sketched below, and the remaining drops need further investigation.
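A minimal sketch of that alignment-preserving variant: keep one entry per frame, NaN rows included, so landmark index i always corresponds to frame i.

python
def get_video_landmarks_aligned(video_path):
    """Like get_video_landmarks, but keeps one entry per frame (all-NaN if undetected)."""
    cap = cv2.VideoCapture(video_path)
    frames = []
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frames.append(get_frame_landmarks(frame_rgb))  # keep all-NaN frames too
    cap.release()
    if not frames:
        return np.empty((0, HAND_NUM * 2 + POSE_NUM, 3))
    return np.stack(frames)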
