PyCaw库可以获取应用程序的峰值音量,但是无法识别不同声音,这时候就需要用到以下几个库:
PyAudio:实时采集音频流(VB-Cable 或麦克风)
Librosa:从音频中提取 MFCC 特征
DTW:将实时音频 MFCC 与模板音频 MFCC 做相似度匹配
pyautogui:实现模拟键盘点击
本例中的程序将应用以上几个库实现音频识别并点击。首先需要下载安装VB-Audio Virtual Cable(即VB-Cable),安装后需要重启系统,否则程序会识别出错;然后将系统音量的输出设备变为cable input(VB-Audio Virtual Cable)。
具体的功能是识别到A声音时不点击,识别到B或者C时点击,未识别到声音时也不点击
还有附加操作可以参考:
1.输出设备变为cable input后,音响会听不到系统的声音。这时候可以按win+r输入mmsys.cpl,在"录制"选项卡中选中CABLE Output,点击属性-侦听,勾选"侦听此设备",然后在"通过此设备播放"中选择音响设备,这样系统的声音在音响上也能听到了。
2.本例中的程序会识别系统的所有声音,所以如果只想识别某个应用的声音,可以把"系统音量的输出设备变为cable input(VB-Audio Virtual Cable)"的操作改为:打开音量合成器,找到该应用,然后只把该应用的输出设备变为cable input(VB-Audio Virtual Cable),这样其他应用的声音就不会被识别到。
以下是具体程序:
python
import pyaudio
import numpy as np
import librosa
from dtw import dtw
import time
import threading
import keyboard
# 窗口与点击部分 -----------------------------
from pycaw.pycaw import AudioUtilities, IAudioMeterInformation
from comtypes import CLSCTX_ALL
import win32gui
import win32process
import win32con
import pyautogui
# ============================================
# Configuration
# ============================================
# Executable name of the process whose window receives the click.
TARGET_PROCESS = "app.exe"

# Click position; handed to click_window_relative() as an offset from the
# window's top-left corner.
CLICK_X = 123
CLICK_Y = 456

# Audio capture settings passed to PyAudio.
device_index = 21         # input device ID (VB-Cable / WASAPI)
samplerate = 48000        # capture sample rate in Hz
frames_per_buffer = 1024  # frames fetched by each stream.read() call

# Detection thresholds.
volume_threshold = 0.01       # a chunk whose peak exceeds this counts as sound
silence_time_threshold = 0.3  # seconds of quiet that end the current clip
# Peak level at which the game window counts as "making sound".
# NOTE(review): not referenced anywhere in the code shown here.
PEAK_THRESHOLD = 0.01

# Set to True by the quit listener to make the main loop exit.
stop_flag = False
# ============================================
# Load the template clips (A / B / C)
# ============================================
# `files` and `mfccs` are index-aligned: mfccs[i] holds the normalised MFCC
# sequence (frames x 14 coefficients) computed from files[i].
files = ["A.wav", "B.wav", "C.wav"]
mfccs = []
for f in files:
    # Resample each template to the capture rate. The live audio is analysed
    # at `samplerate`, and MFCC frames computed at a different rate cover a
    # different time span and frequency range, so they would not be comparable.
    y, sr = librosa.load(f, sr=samplerate)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=14).T
    # Per-coefficient zero-mean / unit-variance scaling; the epsilon avoids a
    # division by zero when a coefficient is constant over the clip.
    mfcc = (mfcc - mfcc.mean(axis=0)) / (mfcc.std(axis=0) + 1e-6)
    mfccs.append(mfcc)
# ============================================
# Win32 helpers (window activation and clicking)
# ============================================
def get_hwnd_from_pid(pid):
    """Return the first visible window that belongs to process *pid*.

    Windows are examined in the order ``win32gui.EnumWindows`` reports them.
    Returns ``None`` when the process owns no visible window.
    """
    matches = []

    def _collect(handle, _extra):
        _thread_id, owner_pid = win32process.GetWindowThreadProcessId(handle)
        if owner_pid == pid and win32gui.IsWindowVisible(handle):
            matches.append(handle)
        # Always ask EnumWindows to continue with the next window.
        return True

    win32gui.EnumWindows(_collect, None)
    if not matches:
        return None
    return matches[0]
def activate_window(hwnd):
    """Bring window *hwnd* to the foreground and wait briefly for it to settle.

    A minimized window is restored first. A window that is already shown
    (normal or maximized) keeps its current size and position.
    """
    # SW_RESTORE also un-maximizes a maximized window, so applying it
    # unconditionally would resize the target on every click. Restore only
    # when the window is actually minimized.
    if win32gui.IsIconic(hwnd):
        win32gui.ShowWindow(hwnd, win32con.SW_RESTORE)
    win32gui.SetForegroundWindow(hwnd)
    # Give the window manager a moment before the caller sends the click.
    time.sleep(0.2)
def click_window_relative(hwnd, rel_x, rel_y):
    """Click the point offset by (*rel_x*, *rel_y*) from the window's top-left corner.

    The offset is added to the left/top values of ``win32gui.GetWindowRect``
    and the resulting coordinates are passed to ``pyautogui.click``.
    """
    rect = win32gui.GetWindowRect(hwnd)
    origin_x, origin_y = rect[0], rect[1]
    pyautogui.click(origin_x + rel_x, origin_y + rel_y)
# ============================================
# Quit handling
# ============================================
def listen_for_quit():
    """Block until the ``q`` key is pressed, then set the global stop flag."""
    global stop_flag
    keyboard.wait('q')
    print("检测到按下 q,脚本停止。")
    stop_flag = True


# Run the listener on a daemon thread so it never keeps the process alive.
_quit_listener = threading.Thread(target=listen_for_quit, daemon=True)
_quit_listener.start()
# ============================================
# Main recognition flow
# ============================================
# Open a float32, two-channel capture stream on the configured input device.
p = pyaudio.PyAudio()
_stream_settings = {
    "format": pyaudio.paFloat32,
    "channels": 2,
    "rate": samplerate,
    "input": True,
    "input_device_index": device_index,
    "frames_per_buffer": frames_per_buffer,
}
stream = p.open(**_stream_settings)
print("识别中... 按 q 退出")

# Samples collected for the clip currently being recorded.
audio_buffer = []
# Wall-clock time at which a chunk last exceeded the volume threshold.
last_sound_time = time.time()
def detect_game_window():
    """Return the PID of the first audio session owned by ``TARGET_PROCESS``.

    Scans the sessions reported by ``AudioUtilities.GetAllSessions()`` and
    returns the PID of the first one whose process name equals
    ``TARGET_PROCESS``, or ``None`` when no session matches. Despite its
    name, this function does not look up or activate any window.
    """
    matching_pids = (
        session.Process.pid
        for session in AudioUtilities.GetAllSessions()
        if session.Process and session.Process.name() == TARGET_PROCESS
    )
    return next(matching_pids, None)
# ============================================
# Main loop: recognise + click
# ============================================
def _classify_clip(clip):
    """Return ``(index, distances)`` of the template closest to *clip*.

    *clip* is a 1-D mono sample array captured at ``samplerate``.
    ``distances[i]`` is the length-normalised DTW distance between the clip's
    MFCC sequence and ``mfccs[i]``; ``index`` is the position of the smallest.
    """
    feats = librosa.feature.mfcc(y=clip, sr=samplerate, n_mfcc=14).T
    feats = (feats - feats.mean(axis=0)) / (feats.std(axis=0) + 1e-6)
    # The raw cumulative DTW distance grows with the length of the sequences,
    # so comparing it across templates of different durations favours the
    # shortest template. The path-length-normalised distance is comparable.
    distances = [
        dtw(feats, ref, dist_method='euclidean').normalizedDistance
        for ref in mfccs
    ]
    return int(np.argmin(distances)), distances


try:
    while not stop_flag:
        data = stream.read(frames_per_buffer, exception_on_overflow=False)
        audio = np.frombuffer(data, dtype=np.float32)
        # Down-mix the interleaved two-channel samples to mono.
        if len(audio) % 2 == 0:
            audio = audio.reshape(-1, 2).mean(axis=1)
        audio = np.nan_to_num(audio)
        max_vol = np.max(np.abs(audio))

        if max_vol > volume_threshold:
            # Sound present: keep the chunk. Chunks are stored as arrays and
            # joined once per clip instead of boxing every sample in a list.
            audio_buffer.append(audio)
            last_sound_time = time.time()
        elif audio_buffer and time.time() - last_sound_time > silence_time_threshold:
            # Quiet for long enough: the clip is complete, classify it.
            buf = np.concatenate(audio_buffer)
            audio_buffer = []

            idx, distances = _classify_clip(buf)
            detected_file = files[idx]
            print(f"识别到声音: {detected_file} (距离 {distances[idx]:.2f})")

            # ============================
            # Click logic: A -> no click, anything else -> click
            # ============================
            if detected_file == "A.wav":
                print("识别到 A - 不点击")
            else:
                pid = detect_game_window()
                hwnd = get_hwnd_from_pid(pid) if pid else None
                if hwnd:
                    activate_window(hwnd)
                    click_window_relative(hwnd, CLICK_X, CLICK_Y)
                    print(f"识别到 {detected_file} → 已点击!")
                else:
                    # Report the miss instead of silently doing nothing.
                    print(f"识别到 {detected_file},但未找到 {TARGET_PROCESS} 的窗口,未点击")

        # NOTE(review): kept from the original at loop level (the pasted
        # source lost its indentation, so the original nesting is uncertain).
        time.sleep(0.01)
except KeyboardInterrupt:
    # Ctrl+C is an accepted way to stop; fall through to the cleanup below.
    pass
finally:
    stream.stop_stream()
    stream.close()
    p.terminate()
    print("已退出")
注意:
1.需要提前录制好"A.wav"、"B.wav"、"C.wav"三个模板音频,并放在脚本同一目录下
2.device_index = 21 和 samplerate = 48000 这两个值因电脑而异,可以根据以下程序查找:
python
import pyaudio
p = pyaudio.PyAudio()
try:
    # Query every device once and reuse the results for both passes below.
    device_infos = [
        p.get_device_info_by_index(i) for i in range(p.get_device_count())
    ]

    # List all devices together with their default sample rate.
    for i, dev in enumerate(device_infos):
        print(f"Index {i}: {dev['name']}, "
              f"max input channels: {dev['maxInputChannels']}, "
              f"max output channels: {dev['maxOutputChannels']}, "
              f"default samplerate: {dev['defaultSampleRate']}")

    # Host API indices are assigned per machine, so look up the index of
    # WASAPI instead of assuming a fixed number.
    try:
        wasapi_index = p.get_host_api_info_by_type(pyaudio.paWASAPI)['index']
    except OSError:
        # WASAPI is not available here; no device can match below.
        wasapi_index = None

    # Find the VB-Cable "CABLE Output" capture device on WASAPI and read its
    # default sample rate.
    device_index = None
    device_samplerate = None
    for i, dev in enumerate(device_infos):
        if ("CABLE Output" in dev['name']
                and dev['hostApi'] == wasapi_index
                and dev['maxInputChannels'] > 0):
            device_index = i
            device_samplerate = int(dev['defaultSampleRate'])
            print(f"找到 VB-Cable WASAPI 输出设备: {dev['name']} "
                  f"(Index {i}, 默认采样率: {device_samplerate})")
            break
    if device_index is None:
        print("未找到 VB-Cable WASAPI 输出设备")
finally:
    # Release PortAudio even if a device query raised.
    p.terminate()
目前程序仅供参考,程序实测可能出现识别不准确的问题,后续还需对算法进行改进