Hi everyone, I'm 半亩花海. Smart devices are everywhere these days, yet finding more convenient ways to interact with them remains an open problem. This article walks through an OpenCV-based gesture recognition project that controls screen brightness and system volume with hand gestures, offering a new way to interact with your computer.
Table of Contents
[1. Importing the Required Libraries](#1-importing-the-required-libraries)
[2. Initializing the Hand Keypoint Detector](#2-initializing-the-hand-keypoint-detector)
[3. Converting the Data Format](#3-converting-the-data-format)
[4. Drawing the Hand Landmarks](#4-drawing-the-hand-landmarks)
[5. Buffering the Gesture State](#5-buffering-the-gesture-state)
[6. Drawing a Line](#6-drawing-a-line)
[7. Controlling Screen Brightness and Volume](#7-controlling-screen-brightness-and-volume)
[8. Initializing the Camera and Hand Keypoint Detector](#8-initializing-the-camera-and-hand-keypoint-detector)
[9. Pygame Interface Initialization and Event Handling](#9-pygame-interface-initialization-and-event-handling)
[1. Brightness (light)](#1-brightness-light)
[2. Volume (voice)](#2-volume-voice)
[3. Menu (menu)](#3-menu-menu)
I. Code Walkthrough
1. Importing the Required Libraries
Before diving into the implementation details, we first import the libraries the project depends on:
- OpenCV: image and video processing.
- MediaPipe: hand landmark detection and tracking.
- Pygame: the graphical interface that displays the camera feed.
- WMI: adjusting the screen brightness on Windows.
- pycaw: controlling the system volume on Windows.
```python
# import the required libraries
import math
import sys
import numpy as np
import cv2
import pygame
import wmi
import mediapipe as mp
from ctypes import cast, POINTER
from comtypes import CLSCTX_ALL
from pycaw.pycaw import AudioUtilities, IAudioEndpointVolume
import warnings  # suppress warnings

warnings.filterwarnings("ignore")
```
2. Initializing the Hand Keypoint Detector
We start with a HandKeyPoint class that wraps MediaPipe's hand landmark detector and exposes a method for processing a single image; a short usage sketch follows the class definition.
```python
# hand keypoint detector class
class HandKeyPoint:
    def __init__(self,
                 static_image_mode=False,
                 max_num_hands=2,
                 model_complexity=1,
                 min_detection_confidence=0.5,
                 min_tracking_confidence=0.5):
        # MediaPipe hands API
        self.mp_hands = mp.solutions.hands
        # instantiate the hand detector
        self.hands = self.mp_hands.Hands(static_image_mode=static_image_mode,
                                         max_num_hands=max_num_hands,
                                         model_complexity=model_complexity,
                                         min_detection_confidence=min_detection_confidence,
                                         min_tracking_confidence=min_tracking_confidence)

    def process(self, image):
        # convert BGR to RGB
        img = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        # detect hands in the image and return the results
        results = self.hands.process(img)
        # the same data as a numpy array
        np_arr = landmarks_to_numpy(results)
        return results, np_arr
```
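To make the interface concrete, here is a minimal smoke test (my addition, not part of the original project): it grabs a single frame from the default webcam and prints the shape of the NumPy output. It assumes the `landmarks_to_numpy` helper from the next section is already defined.

```python
# Minimal usage sketch for HandKeyPoint (assumes landmarks_to_numpy exists).
import cv2

hand = HandKeyPoint()
cap = cv2.VideoCapture(0)
ok, frame = cap.read()              # one BGR frame from the default webcam
if ok:
    results, arr = hand.process(frame)
    print(arr.shape)                # (2, 21, 3): [left/right, landmark, x/y/z]
cap.release()
```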
3. Converting the Data Format
Next we convert the detection results (MediaPipe's landmarks format) into a NumPy array, which is much easier to slice and analyze in the rest of the pipeline.
```python
# convert landmarks-format data into a numpy array
def landmarks_to_numpy(results):
    """
    Convert landmarks-format data into a numpy array.
    numpy shape: (2, 21, 3)
    :param results: output of MediaPipe Hands
    :return: array of shape (2, 21, 3), ordered [left hand, right hand]
    """
    shape = (2, 21, 3)
    landmarks = results.multi_hand_landmarks
    if landmarks is None:
        # no hand detected
        return np.zeros(shape)
    elif len(landmarks) == 1:
        # one hand detected; check whether it is the left or the right hand
        label = results.multi_handedness[0].classification[0].label
        hand = landmarks[0]
        # print(label)
        if label == "Left":
            return np.array(
                [np.array([[hand.landmark[i].x, hand.landmark[i].y, hand.landmark[i].z] for i in range(21)]),
                 np.zeros((21, 3))])
        else:
            return np.array([np.zeros((21, 3)),
                             np.array(
                                 [[hand.landmark[i].x, hand.landmark[i].y, hand.landmark[i].z] for i in range(21)])])
    elif len(landmarks) == 2:
        # print(results.multi_handedness)
        lh_idx = 0
        rh_idx = 0
        for idx, hand_type in enumerate(results.multi_handedness):
            label = hand_type.classification[0].label
            if label == 'Left':
                lh_idx = idx
            if label == 'Right':
                rh_idx = idx
        lh = np.array(
            [[landmarks[lh_idx].landmark[i].x, landmarks[lh_idx].landmark[i].y, landmarks[lh_idx].landmark[i].z]
             for i in range(21)])
        rh = np.array(
            [[landmarks[rh_idx].landmark[i].x, landmarks[rh_idx].landmark[i].y, landmarks[rh_idx].landmark[i].z]
             for i in range(21)])
        return np.array([lh, rh])
    else:
        return np.zeros((2, 21, 3))
```
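As a quick orientation for the array layout (an illustrative snippet, not project code): axis 0 selects the hand, axis 1 the MediaPipe landmark index, and axis 2 the normalized coordinate.

```python
import numpy as np

arr = np.zeros((2, 21, 3))   # placeholder; normally returned by process()
right_index_tip = arr[1, 8]  # (x, y, z) of the right index fingertip (landmark 8)
left_wrist_y = arr[0, 0, 1]  # normalized y of the left wrist (landmark 0)
```

Coordinates are normalized to [0, 1] relative to the image width and height, so they must be multiplied by the frame size before drawing, as `draw_line` does below.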
4. Drawing the Hand Landmarks
```python
# draw the hand landmarks onto the frame
def draw_landmark(img, results):
    if results.multi_hand_landmarks:
        for hand_landmark in results.multi_hand_landmarks:
            mp.solutions.drawing_utils.draw_landmarks(
                img,
                hand_landmark,
                mp.solutions.hands.HAND_CONNECTIONS,
                mp.solutions.drawing_styles.get_default_hand_landmarks_style(),
                mp.solutions.drawing_styles.get_default_hand_connections_style())
    return img
```
5. Buffering the Gesture State
To smooth out fluctuations in the detected gesture state, we implement a Buffer class that debounces state changes and exposes methods for adding positive and negative samples; a short demo after the class shows the effect.
```python
# buffer (debouncer) for a boolean gesture state
class Buffer:
    def __init__(self, volume=20):
        self.__positive = 0
        self.state = False
        self.__negative = 0
        self.__volume = volume
        self.__count = 0

    def add_positive(self):
        self.__count += 1
        if self.__positive >= self.__volume:
            # enough positive samples: switch the state to True
            self.state = True
            self.__negative = 0
            self.__count = 0
        else:
            self.__positive += 1
            if self.__count > self.__volume:
                # state still undecided after `volume` operations: reset
                self.__positive = 0
                self.__count = 0

    def add_negative(self):
        self.__count += 1
        if self.__negative >= self.__volume:
            # enough negative samples: switch the state to False
            self.state = False
            self.__positive = 0
        else:
            self.__negative += 1
            if self.__count > self.__volume:
                # state still undecided after `volume` operations: reset
                self.__positive = 0
                self.__count = 0
        # print(f"pos:{self.__positive} neg:{self.__negative} count:{self.__count}")

    def clear(self):
        self.__positive = 0
        self.state = False
        self.__negative = 0
        self.__count = 0
```
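The following self-contained demo (my own toy example, assuming only the `Buffer` class above) shows the debouncing effect: the state flips to True only after enough positive samples have accumulated, so a single noisy frame cannot toggle it.

```python
buf = Buffer(volume=3)
for _ in range(3):
    buf.add_positive()
print(buf.state)    # False: still accumulating positive samples
buf.add_positive()
print(buf.state)    # True: the positive count reached the capacity
buf.add_negative()
print(buf.state)    # True: one negative sample is not enough to flip back
```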
6. Drawing a Line
```python
# draw a line segment between two normalized points
def draw_line(frame, p1, p2, color=(255, 127, 0), thickness=3):
    """
    Draw a straight line on the frame.
    :param p1: first point, normalized (x, y) coordinates
    :param p2: second point, normalized (x, y) coordinates
    :return: the frame with the line drawn
    """
    return cv2.line(frame, (int(p1[0] * CAM_W), int(p1[1] * CAM_H)),
                    (int(p2[0] * CAM_W), int(p2[1] * CAM_H)), color, thickness)
```
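A small usage sketch (illustrative values, not project code): since MediaPipe landmarks are normalized, `draw_line` multiplies them by the global `CAM_W` and `CAM_H` to get pixel coordinates.

```python
import numpy as np

CAM_W, CAM_H = 640, 480                              # frame size in pixels
frame = np.zeros((CAM_H, CAM_W, 3), dtype=np.uint8)  # stand-in black frame
p1, p2 = (0.25, 0.5), (0.75, 0.5)                    # normalized endpoints
frame = draw_line(frame, p1, p2)                     # drawn from (160, 240) to (480, 240)
```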
7. Controlling Screen Brightness and Volume
```python
# set the screen brightness
def screen_change(percent):  # `percent` is the brightness percentage (0-100)
    SCREEN = wmi.WMI(namespace='root/WMI')
    a = SCREEN.WmiMonitorBrightnessMethods()[0]
    a.WmiSetBrightness(Brightness=percent, Timeout=500)


# initialize the volume controller
def init_voice():
    devices = AudioUtilities.GetSpeakers()
    interface = devices.Activate(
        IAudioEndpointVolume._iid_, CLSCTX_ALL, None)
    volume = cast(interface, POINTER(IAudioEndpointVolume))
    volume.SetMute(0, None)
    volume_range = volume.GetVolumeRange()
    min_volume = volume_range[0]
    max_volume = volume_range[1]
    return (min_volume, max_volume), volume
```
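For orientation, here is a hedged usage sketch (Windows only; assumes the two functions above and their imports): it maps a 0-100 percentage onto pycaw's volume range and applies both settings. Note that `GetVolumeRange` returns decibels, so a linear mapping is an approximation rather than a perceptually uniform one.

```python
import numpy as np

(min_vol, max_vol), volume = init_voice()
target_db = np.interp(50, [0, 100], [min_vol, max_vol])  # 50% -> dB value
volume.SetMasterVolumeLevel(target_db, None)             # set system volume
screen_change(50)                                        # 50% screen brightness
```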
8. Initializing the Camera and Hand Keypoint Detector
During initialization we create the camera instance and the hand keypoint detector instance, which the main loop uses to recognize and process gestures.
```python
# open the camera
cap = cv2.VideoCapture(0)
CAM_W = 640
CAM_H = 480
CAM_SCALE = CAM_W / CAM_H

# create the hand keypoint detector
hand = HandKeyPoint()
```
9. Pygame Interface Initialization and Event Handling
To visualize the gesture control and provide an interactive interface, we use Pygame. During initialization we create a window and set its caption; we also poll for events so the program can be closed cleanly.
Concretely, the camera frames are drawn into the Pygame window on every iteration, and Pygame's event queue is checked for keyboard input: pressing "q" exits the program. The user can therefore control brightness and volume by gesture while watching the recognition results live.
```python
# initialize pygame
pygame.init()
# create an 800x600 window
screen = pygame.display.set_mode((800, 600))
pygame.display.set_caption("virtual_control_screen")
# current window size
window_size = list(screen.get_size())

# main loop
while True:
    # ... per-frame processing (see the complete code in Part III) ...

    # event handling: quit when 'q' is pressed
    for event in pygame.event.get():
        if event.type == pygame.KEYDOWN:
            if event.key == pygame.K_q:
                sys.exit(0)
```
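For completeness, this is how a processed OpenCV frame ends up on the Pygame window in the full code (Part III): `pygame.surfarray` expects RGB data with the axes transposed relative to OpenCV's row-major layout.

```python
# frame is already RGB here (converted right after cap.read())
frame = cv2.resize(frame, (int(window_size[1] * CAM_SCALE), window_size[1]))
frame = cv2.transpose(frame)                   # swap axes for surfarray
surface = pygame.surfarray.make_surface(frame)
screen.blit(surface, (0, 0))                   # draw at the top-left corner
pygame.display.flip()                          # present the frame
```

The blit offset in the full code centers the image horizontally; `(0, 0)` here is a simplification.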
II. Demonstration
1. Brightness (light)
If **20 < angle < 90**, the program enters the **"light ready"** state and the gesture controls the screen brightness.
2. Volume (voice)
If **-20 > angle > -50**, the program enters the **"voice ready"** state and the gesture controls the volume.
3. Menu (menu)
In every other case the program stays in the **"menu"** state. The snippet below shows how `angle` is computed.
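The `angle` that selects between these states measures the tilt of the left hand; this snippet is lifted from the main loop in Part III, where `arr` is the (2, 21, 3) landmark array.

```python
import numpy as np

# slope between the left wrist (landmark 0) and middle fingertip (landmark 12)
tan = (arr[0, 0, 1] - arr[0, 12, 1]) / (arr[0, 0, 0] - arr[0, 12, 0])
angle = np.arctan(tan) * 180 / np.pi   # tilt of the left hand in degrees
```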
The demo makes the behavior clear: the farther apart the index fingertip and thumb tip appear on screen, the higher the brightness (or volume); the closer together, the lower. This gives smooth gesture control over both settings.
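The mapping behind this behavior is a single `np.interp` call; here is a worked example with illustrative numbers (the `s` factor normalizes for how large the hand appears on camera):

```python
import numpy as np

s = 0.1      # example hand-size scale factor
dis = 175    # example thumb-index distance in pixels
light = np.interp(dis, [int(500 * s), int(3000 * s)], (0, 100))
print(light)  # 50.0 -> brightness set to 50%
```

Distances at or below `500 * s` map to 0 and at or above `3000 * s` map to 100, since `np.interp` clamps outside the input range.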
III. Complete Code
```python
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
@Project : virtual
@File    : virtual_control.py
@IDE     : PyCharm
@Author  : 半亩花海
@Date    : 2024:02:06 18:01
"""
# imports
import math
import sys
import numpy as np
import cv2
import pygame
import wmi
import mediapipe as mp
from ctypes import cast, POINTER
from comtypes import CLSCTX_ALL
from pycaw.pycaw import AudioUtilities, IAudioEndpointVolume
import warnings  # suppress warnings

warnings.filterwarnings("ignore")

# hand keypoint detector class
class HandKeyPoint:
    def __init__(self,
                 static_image_mode=False,
                 max_num_hands=2,
                 model_complexity=1,
                 min_detection_confidence=0.5,
                 min_tracking_confidence=0.5):
        # MediaPipe hands API
        self.mp_hands = mp.solutions.hands
        # instantiate the hand detector
        self.hands = self.mp_hands.Hands(static_image_mode=static_image_mode,
                                         max_num_hands=max_num_hands,
                                         model_complexity=model_complexity,
                                         min_detection_confidence=min_detection_confidence,
                                         min_tracking_confidence=min_tracking_confidence)

    def process(self, image):
        # convert BGR to RGB
        img = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        # detect hands in the image and return the results
        results = self.hands.process(img)
        # the same data as a numpy array
        np_arr = landmarks_to_numpy(results)
        return results, np_arr

# convert landmarks-format data into a numpy array
def landmarks_to_numpy(results):
    """
    Convert landmarks-format data into a numpy array.
    numpy shape: (2, 21, 3)
    :param results: output of MediaPipe Hands
    :return: array of shape (2, 21, 3), ordered [left hand, right hand]
    """
    shape = (2, 21, 3)
    landmarks = results.multi_hand_landmarks
    if landmarks is None:
        # no hand detected
        return np.zeros(shape)
    elif len(landmarks) == 1:
        # one hand detected; check whether it is the left or the right hand
        label = results.multi_handedness[0].classification[0].label
        hand = landmarks[0]
        # print(label)
        if label == "Left":
            return np.array(
                [np.array([[hand.landmark[i].x, hand.landmark[i].y, hand.landmark[i].z] for i in range(21)]),
                 np.zeros((21, 3))])
        else:
            return np.array([np.zeros((21, 3)),
                             np.array(
                                 [[hand.landmark[i].x, hand.landmark[i].y, hand.landmark[i].z] for i in range(21)])])
    elif len(landmarks) == 2:
        # print(results.multi_handedness)
        lh_idx = 0
        rh_idx = 0
        for idx, hand_type in enumerate(results.multi_handedness):
            label = hand_type.classification[0].label
            if label == 'Left':
                lh_idx = idx
            if label == 'Right':
                rh_idx = idx
        lh = np.array(
            [[landmarks[lh_idx].landmark[i].x, landmarks[lh_idx].landmark[i].y, landmarks[lh_idx].landmark[i].z]
             for i in range(21)])
        rh = np.array(
            [[landmarks[rh_idx].landmark[i].x, landmarks[rh_idx].landmark[i].y, landmarks[rh_idx].landmark[i].z]
             for i in range(21)])
        return np.array([lh, rh])
    else:
        return np.zeros((2, 21, 3))

# draw the hand landmarks onto the frame
def draw_landmark(img, results):
    if results.multi_hand_landmarks:
        for hand_landmark in results.multi_hand_landmarks:
            mp.solutions.drawing_utils.draw_landmarks(
                img,
                hand_landmark,
                mp.solutions.hands.HAND_CONNECTIONS,
                mp.solutions.drawing_styles.get_default_hand_landmarks_style(),
                mp.solutions.drawing_styles.get_default_hand_connections_style())
    return img

# buffer (debouncer) for a boolean gesture state
class Buffer:
    def __init__(self, volume=20):
        self.__positive = 0
        self.state = False
        self.__negative = 0
        self.__volume = volume
        self.__count = 0

    def add_positive(self):
        self.__count += 1
        if self.__positive >= self.__volume:
            # enough positive samples: switch the state to True
            self.state = True
            self.__negative = 0
            self.__count = 0
        else:
            self.__positive += 1
            if self.__count > self.__volume:
                # state still undecided after `volume` operations: reset
                self.__positive = 0
                self.__count = 0

    def add_negative(self):
        self.__count += 1
        if self.__negative >= self.__volume:
            # enough negative samples: switch the state to False
            self.state = False
            self.__positive = 0
        else:
            self.__negative += 1
            if self.__count > self.__volume:
                # state still undecided after `volume` operations: reset
                self.__positive = 0
                self.__count = 0
        # print(f"pos:{self.__positive} neg:{self.__negative} count:{self.__count}")

    def clear(self):
        self.__positive = 0
        self.state = False
        self.__negative = 0
        self.__count = 0

# draw a line segment between two normalized points
def draw_line(frame, p1, p2, color=(255, 127, 0), thickness=3):
    """
    Draw a straight line on the frame.
    :param p1: first point, normalized (x, y) coordinates
    :param p2: second point, normalized (x, y) coordinates
    :return: the frame with the line drawn
    """
    return cv2.line(frame, (int(p1[0] * CAM_W), int(p1[1] * CAM_H)),
                    (int(p2[0] * CAM_W), int(p2[1] * CAM_H)), color, thickness)

# set the screen brightness
def screen_change(percent):  # `percent` is the brightness percentage (0-100)
    SCREEN = wmi.WMI(namespace='root/WMI')
    a = SCREEN.WmiMonitorBrightnessMethods()[0]
    a.WmiSetBrightness(Brightness=percent, Timeout=500)


# initialize the volume controller
def init_voice():
    devices = AudioUtilities.GetSpeakers()
    interface = devices.Activate(
        IAudioEndpointVolume._iid_, CLSCTX_ALL, None)
    volume = cast(interface, POINTER(IAudioEndpointVolume))
    volume.SetMute(0, None)
    volume_range = volume.GetVolumeRange()
    min_volume = volume_range[0]
    max_volume = volume_range[1]
    return (min_volume, max_volume), volume

# open the camera
cap = cv2.VideoCapture(0)
CAM_W = 640
CAM_H = 480
CAM_SCALE = CAM_W / CAM_H

# create the hand keypoint detector
hand = HandKeyPoint()

# initialize pygame
pygame.init()
# create an 800x600 window
screen = pygame.display.set_mode((800, 600))
pygame.display.set_caption("virtual_control_screen")
# current window size
window_size = list(screen.get_size())

# state buffers (debouncers)
buffer_light = Buffer(10)
buffer_voice = Buffer(10)
last_y = 0
last_2_y = 1
last_2_x = 0

# initialize volume control
voice_range, volume = init_voice()

# brightness bar parameters
bright_bar_length = 300
bright_bar_height = 20
bright_bar_x = 50
bright_bar_y = 100

# volume bar parameters
vol_bar_length = 300
vol_bar_height = 20
vol_bar_x = 50
vol_bar_y = 50

# main loop: each iteration processes one camera frame
while True:
    img_menu = None
    lh_index = -1
    # read a frame from the camera
    success, frame = cap.read()
    if not success:
        # skip frames that failed to read
        continue
    # convert OpenCV's BGR format to regular RGB
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    # mirror the frame horizontally
    frame = cv2.flip(frame, 1)
    # run hand detection
    res, arr = hand.process(frame)
    frame = draw_landmark(frame, res)
    # apparent size of the left index fingertip segment (landmarks 7-8),
    # used to scale the menu icon
    scale = math.hypot((arr[0, 7, 0] - arr[0, 8, 0]),
                       (arr[0, 7, 1] - arr[0, 8, 1]),
                       (arr[0, 7, 2] - arr[0, 8, 2]))
    # slope between the left wrist (0) and middle fingertip (12)
    tan = (arr[0, 0, 1] - arr[0, 12, 1]) / (arr[0, 0, 0] - arr[0, 12, 0])
    # convert to an angle in degrees
    angle = np.arctan(tan) * 180 / np.pi
    # print(angle)
    if 20 < angle < 90:
        path = 'resources/menu/light.png'
        buffer_light.add_positive()
        buffer_voice.add_negative()
        # show the brightness bar and its reading
        show_brightness = True
        show_volume = False
    elif -20 > angle > -50:
        path = 'resources/menu/voice.png'
        buffer_voice.add_positive()
        buffer_light.add_negative()
        # show the volume bar and its reading
        show_brightness = False
        show_volume = True
    else:
        path = 'resources/menu/menu.png'
        buffer_light.add_negative()
        buffer_voice.add_negative()
        # hide both bars
        show_brightness = False
        show_volume = False
    # pixel distance between the right thumb tip (4) and index fingertip (8)
    dis = math.hypot(int((arr[1, 4, 0] - arr[1, 8, 0]) * CAM_W), int((arr[1, 4, 1] - arr[1, 8, 1]) * CAM_H))
    # hand-size scale factor for the right hand (landmarks 5-9)
    s = math.hypot((arr[1, 5, 0] - arr[1, 9, 0]), (arr[1, 5, 1] - arr[1, 9, 1]), (arr[1, 5, 2] - arr[1, 9, 2]))
    # brightness control
    if buffer_light.state:
        frame = cv2.putText(frame, 'light ready', (10, 35), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 127, 0))
        frame = draw_line(frame, arr[1, 4], arr[1, 8], thickness=5, color=(255, 188, 66))
        if dis != 0:
            # linear interpolation: map a value from one interval onto another
            light = np.interp(dis, [int(500 * s), int(3000 * s)], (0, 100))
            # apply the brightness
            screen_change(light)
    # volume control
    elif buffer_voice.state:
        frame = cv2.putText(frame, 'voice ready', (10, 35), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 127, 0))
        frame = draw_line(frame, arr[1, 4], arr[1, 8], thickness=5, color=(132, 134, 248))
        if dis != 0:
            vol = np.interp(dis, [int(500 * s), int(3000 * s)], voice_range)
            # apply the volume
            volume.SetMasterVolumeLevel(vol, None)
    # resize the frame to fit the window
    frame = cv2.resize(frame, (int(window_size[1] * CAM_SCALE), window_size[1]))
    frame = cv2.transpose(frame)
    # render the frame
    frame = pygame.surfarray.make_surface(frame)
    screen.blit(frame, (int(0.5 * (CAM_W - CAM_H * CAM_SCALE)), 0))
    img_menu = pygame.image.load(path).convert_alpha()
    img_w, img_h = img_menu.get_size()
    img_menu = pygame.transform.scale(img_menu, (int(img_w * scale * 5), int(img_h * scale * 5)))
    x = (arr[0][9][0] + arr[0][13][0] + arr[0][0][0]) / 3
    y = (arr[0][9][1] + arr[0][13][1] + arr[0][0][1]) / 3
    x = int(x * window_size[0] - window_size[0] * scale * 3.5)
    y = int(y * window_size[1] - window_size[1] * scale * 12)
    # print(x, y)
    screen.blit(img_menu, (x, y))
    # draw the outline of the volume / brightness bar
    if show_volume:
        pygame.draw.rect(screen, (255, 255, 255), (vol_bar_x, vol_bar_y, vol_bar_length, vol_bar_height), 3)
    elif show_brightness:
        pygame.draw.rect(screen, (255, 255, 255), (bright_bar_x, bright_bar_y, bright_bar_length, bright_bar_height), 3)
    # compute the filled portion of the bar and draw it
    if show_volume:
        vol = volume.GetMasterVolumeLevel()
        vol_range = voice_range[1] - voice_range[0]
        vol_bar_fill_length = int((vol - voice_range[0]) / vol_range * vol_bar_length)
        pygame.draw.rect(screen, (0, 255, 0), (vol_bar_x, vol_bar_y, vol_bar_fill_length, vol_bar_height))
        # show the current volume as a percentage
        vol_text = f"Volume: {int((vol - voice_range[0]) / vol_range * 100)}%"
        vol_text_surface = pygame.font.SysFont(None, 24).render(vol_text, True, (255, 255, 255))
        screen.blit(vol_text_surface, (vol_bar_x + vol_bar_length + 10, vol_bar_y))
    elif show_brightness:
        brightness = wmi.WMI(namespace='root/WMI').WmiMonitorBrightness()[0].CurrentBrightness
        bright_bar_fill_length = int(brightness / 100 * bright_bar_length)
        pygame.draw.rect(screen, (255, 255, 0), (bright_bar_x, bright_bar_y, bright_bar_fill_length, bright_bar_height))
        # show the current brightness as a percentage
        bright_text = f"Brightness: {brightness}%"
        bright_text_surface = pygame.font.SysFont(None, 24).render(bright_text, True, (255, 255, 255))
        screen.blit(bright_text_surface, (bright_bar_x + bright_bar_length + 10, bright_bar_y))
    pygame.display.flip()
    # event handling: quit when 'q' is pressed
    for event in pygame.event.get():
        if event.type == pygame.KEYDOWN:
            if event.key == pygame.K_q:
                sys.exit(0)
```