【Python脚本系列】PyAudio+librosa+dtw库录制、识别音频并实现点击（四）

PyCaw库可以获取应用程序的峰值音量，但是无法识别不同声音，这时候就需要用到以下几个库：

PyAudio：实时采集音频流（VB-Cable 或麦克风）

Librosa：从音频中提取 MFCC 特征

DTW：将实时音频 MFCC 与模板音频 MFCC 做相似度匹配

pyautogui：实现模拟键盘点击

本例中的程序将综合应用以上几个库，实现音频识别并点击。首先需要下载并安装 VB-Audio Virtual Cable（VB-Cable），安装完成后必须重启系统，否则程序会识别出错；然后将系统声音的输出设备改为 CABLE Input (VB-Audio Virtual Cable)。

具体的功能是识别到A声音时不点击，识别到B或者C时点击，未识别到声音时也不点击

还有附加操作可以参考：

1.输出设备改为 CABLE Input 后，音响将听不到系统声音。这时可以按 Win+R 输入 mmsys.cpl，在"录制"选项卡中选中 CABLE Output，点击"属性"-"侦听"，勾选"侦听此设备"，然后在播放设备中选择音响设备，这样系统声音就能同时从音响播放出来。

2.本例中的程序会识别系统的所有声音。如果只想识别某个应用的声音，可以不修改系统默认输出设备，而是打开音量合成器，找到该应用，只把该应用的输出设备改为 CABLE Input (VB-Audio Virtual Cable)，这样其他应用的声音就不会被识别到。

以下是具体程序：

python 复制代码

import pyaudio
import numpy as np
import librosa
from dtw import dtw
import time
import threading
import keyboard

# 窗口与点击部分 -----------------------------
from pycaw.pycaw import AudioUtilities, IAudioMeterInformation
from comtypes import CLSCTX_ALL
import win32gui
import win32process
import win32con
import pyautogui

# ============================================
#               Configuration
# ============================================
TARGET_PROCESS = "app.exe"           # process name compared against audio-session owners in detect_game_window()
CLICK_X, CLICK_Y = 123, 456          # click offset; added to the window's left/top in click_window_relative()
device_index = 21                    # PyAudio input device index (VB-Cable / WASAPI input)
samplerate = 48000                   # stream sample rate in Hz; also passed to the live MFCC extraction
frames_per_buffer = 1024             # frames requested per stream.read() call

volume_threshold = 0.01              # a chunk counts as "sound" when its peak |amplitude| exceeds this
silence_time_threshold = 0.3         # seconds without sound after which the buffered clip is classified
PEAK_THRESHOLD = 0.01                # NOTE(review): not referenced in the code shown; original note: "activate the game window as soon as it emits sound"

stop_flag = False                    # set to True by listen_for_quit(); polled by the main loop

# ============================================
#       加载模板音频（A / B / C）
# ============================================
# Template clips the live audio is compared against. The order matters:
# the index of the closest template is mapped back to its file name.
files = ["A.wav", "B.wav", "C.wav"]
mfccs = []

for f in files:
    # Resample every template to the capture rate. The live MFCCs are
    # computed at `samplerate`; with sr=None a template recorded at any other
    # rate would use a different frame duration and mel range, so its
    # features would not be comparable with the live ones.
    y, sr = librosa.load(f, sr=samplerate)
    # Transpose to (frames, coefficients) so DTW aligns along time.
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=14).T
    # Per-coefficient standardisation; the epsilon avoids division by zero
    # for a coefficient that is constant over the clip.
    mfcc = (mfcc - mfcc.mean(axis=0)) / (mfcc.std(axis=0) + 1e-6)
    mfccs.append(mfcc)

# ============================================
#     Win32 工具函数（激活窗口和点击）
# ============================================

def get_hwnd_from_pid(pid):
    """Return the first visible top-level window owned by process *pid*.

    Every top-level window is enumerated; handles whose owning process id
    equals *pid* and which are visible are collected in enumeration order.
    Returns the first collected handle, or None when there is none.
    """
    visible_handles = []

    def _collect(handle, _extra):
        owner_pid = win32process.GetWindowThreadProcessId(handle)[1]
        if owner_pid != pid:
            return True
        if win32gui.IsWindowVisible(handle):
            visible_handles.append(handle)
        # Always return True so the enumeration visits every window.
        return True

    win32gui.EnumWindows(_collect, None)
    if not visible_handles:
        return None
    return visible_handles[0]

def activate_window(hwnd):
    """Bring window *hwnd* to the foreground, un-minimizing it if needed.

    The window is restored only when it is minimized. SW_RESTORE applied to
    a maximized window would shrink it back to its normal size, changing the
    window's geometry right before a click is sent to it.
    """
    if win32gui.IsIconic(hwnd):
        win32gui.ShowWindow(hwnd, win32con.SW_RESTORE)
    win32gui.SetForegroundWindow(hwnd)
    # Short pause after the foreground switch before the caller continues.
    time.sleep(0.2)

def click_window_relative(hwnd, rel_x, rel_y):
    """Click at (*rel_x*, *rel_y*) measured from the top-left corner of *hwnd*.

    The window rectangle is queried at call time and the offsets are added
    to its left/top values to form the coordinates passed to pyautogui.
    """
    win_left, win_top, _right, _bottom = win32gui.GetWindowRect(hwnd)
    target_x = win_left + rel_x
    target_y = win_top + rel_y
    pyautogui.click(target_x, target_y)

# ============================================
#            退出
# ============================================

def listen_for_quit():
    """Block until the 'q' key is pressed, then set the module-level stop flag.

    Runs on a background thread; the main loop polls `stop_flag` and exits
    once it becomes True.
    """
    global stop_flag
    keyboard.wait('q')
    print("检测到按下 q，脚本停止。")
    stop_flag = True


# Daemon thread, so it never keeps the interpreter alive on its own.
quit_listener = threading.Thread(target=listen_for_quit)
quit_listener.daemon = True
quit_listener.start()

# ============================================
#                主识别流程
# ============================================

# Open a 2-channel float32 capture stream on the configured input device.
p = pyaudio.PyAudio()
stream_options = {
    "format": pyaudio.paFloat32,
    "channels": 2,
    "rate": samplerate,
    "input": True,
    "input_device_index": device_index,
    "frames_per_buffer": frames_per_buffer,
}
stream = p.open(**stream_options)

print("识别中... 按 q 退出")

# State shared with the main loop: when sound was last heard, and the
# samples collected for the clip currently being recorded.
last_sound_time = time.time()
audio_buffer = []

def detect_game_window():
    """Return the PID of the first audio session owned by TARGET_PROCESS.

    Walks the audio sessions reported by pycaw and compares each session's
    process name with TARGET_PROCESS. Returns None when no session matches.
    This function only looks up the PID; it does not touch any window.
    """
    matching_pids = (
        session.Process.pid
        for session in AudioUtilities.GetAllSessions()
        if session.Process and session.Process.name() == TARGET_PROCESS
    )
    return next(matching_pids, None)

# ============================================
#              主循环：识别 + 点击
# ============================================
# Main loop: record a clip while sound is present, classify it once silence
# has lasted longer than `silence_time_threshold`, and click the target
# window unless the clip matched template A.
try:
    while not stop_flag:
        # One chunk of interleaved stereo float32 samples. Overflow errors
        # are suppressed because classification and clicking below can take
        # longer than one buffer period.
        data = stream.read(frames_per_buffer, exception_on_overflow=False)
        audio = np.frombuffer(data, dtype=np.float32)

        # Down-mix interleaved stereo (L, R, L, R, ...) to mono.
        if len(audio) % 2 == 0:
            audio = audio.reshape(-1, 2).mean(axis=1)

        audio = np.nan_to_num(audio)
        max_vol = np.max(np.abs(audio))

        if max_vol > volume_threshold:
            # Sound present: keep the whole chunk as one array. Extending a
            # Python list sample by sample would box every float individually.
            audio_buffer.append(audio)
            last_sound_time = time.time()

        elif audio_buffer and time.time() - last_sound_time > silence_time_threshold:
            # Silence lasted long enough: the clip is complete.
            buf = np.concatenate(audio_buffer)
            audio_buffer = []

            # Same feature pipeline as the templates: 14 MFCCs, transposed to
            # (frames, coefficients), standardised per coefficient.
            mfcc_live = librosa.feature.mfcc(y=buf, sr=samplerate, n_mfcc=14).T
            mfcc_live = (mfcc_live - mfcc_live.mean(axis=0)) / (mfcc_live.std(axis=0) + 1e-6)

            # Rank templates by the length-normalised DTW distance. The raw
            # accumulated distance grows with the alignment path length, so
            # comparing it across templates of different durations favours
            # the shortest template regardless of actual similarity.
            distances = [
                dtw(mfcc_live, ref, dist_method='euclidean').normalizedDistance
                for ref in mfccs
            ]
            idx = int(np.argmin(distances))
            detected_file = files[idx]

            print(f"识别到声音: {detected_file} (距离 {distances[idx]:.2f})")

            # Click logic: A means "do nothing"; any other template triggers
            # a click in the target process's window.
            if detected_file == "A.wav":
                print("识别到 A - 不点击")
            else:
                pid = detect_game_window()
                if pid:
                    hwnd = get_hwnd_from_pid(pid)
                    if hwnd:
                        activate_window(hwnd)
                        click_window_relative(hwnd, CLICK_X, CLICK_Y)
                        print(f"识别到 {detected_file} → 已点击！")

        time.sleep(0.01)

except KeyboardInterrupt:
    # Ctrl+C is a normal way to stop; fall through to the cleanup below.
    pass
finally:
    stream.stop_stream()
    stream.close()
    p.terminate()
    print("已退出")

注意：

1.需要提前录制好 "A.wav"、"B.wav"、"C.wav" 三个模板音频，并放在脚本所在的同一目录下。

2.device_index = 21 和 samplerate = 48000 只是示例值，需要按本机的实际设备填写，可以用以下程序查找：

python 复制代码

import pyaudio

p = pyaudio.PyAudio()

try:
    # List every device with its channel counts and default sample rate.
    devices = [p.get_device_info_by_index(i) for i in range(p.get_device_count())]
    for i, dev in enumerate(devices):
        print(f"Index {i}: {dev['name']}, "
              f"max input channels: {dev['maxInputChannels']}, "
              f"max output channels: {dev['maxOutputChannels']}, "
              f"default samplerate: {dev['defaultSampleRate']}")

    # Resolve the WASAPI host-API index at runtime instead of assuming it is
    # 2: host-API numbering depends on how PortAudio was built and on the
    # machine, so a hard-coded index can silently select another API.
    try:
        wasapi_index = p.get_host_api_info_by_type(pyaudio.paWASAPI)['index']
    except OSError:
        # WASAPI is not available in this PortAudio build / on this system.
        wasapi_index = None

    # Find the VB-Cable capture endpoint ("CABLE Output") exposed through
    # WASAPI and read its default sample rate.
    device_index = None
    device_samplerate = None
    for i, dev in enumerate(devices):
        if ("CABLE Output" in dev['name']
                and dev['hostApi'] == wasapi_index
                and dev['maxInputChannels'] > 0):
            device_index = i
            device_samplerate = int(dev['defaultSampleRate'])
            print(f"找到 VB-Cable WASAPI 输出设备: {dev['name']} "
                  f"(Index {i}, 默认采样率: {device_samplerate})")
            break

    if device_index is None:
        print("未找到 VB-Cable WASAPI 输出设备")
finally:
    # Release PortAudio even if a device query raised.
    p.terminate()

目前程序仅供参考，程序实测可能出现识别不准确的问题，后续还需对算法进行改进