Deploying SenseVoice with Docker on Ubuntu (China mainland)


I got SenseVoice running with Docker while everything was still fresh in my mind, so here is a record of the relevant files.

SenseVoice is a large-model speech recognition library. It supports multiple languages, is fast, and is highly accurate; see the GitHub repository for details:
https://github.com/FunAudioLLM/SenseVoice

This note mainly records the files used for the Docker-based deployment; the contents of the Docker-related files are attached at the end.

Deployment steps
1. Download the required models

model_download.py

import os
import argparse


parser = argparse.ArgumentParser(description='Download a model from ModelScope')

parser.add_argument('--model_name', type=str, help='the model name on ModelScope, e.g. AI-ModelScope/stable-diffusion-2-1', required=True)
parser.add_argument('--local_dir', type=str, help='the local directory to download the model into', default=os.getcwd())


if __name__ == '__main__':
    args = parser.parse_args()

    print(f"current workspace is {os.getcwd()}")
    print(f"the model_name is {args.local_dir}/{args.model_name}")
    print(f"the local_dir is {args.local_dir}")

    try:
        from modelscope import snapshot_download
    except ImportError:
        # install modelscope on the fly, then retry the import
        print("modelscope is not installed, trying to install it...")
        os.system("pip install modelscope")
        from modelscope import snapshot_download

    try:
        model_dir = snapshot_download(args.model_name, local_dir=args.local_dir)
        print(f"the model was downloaded to {model_dir}")
    except Exception as e:
        print(f"An error occurred: {e}")

Create a model_download.py file in the root directory of the SenseVoice project and paste the content above into it.

Run the following commands to download the SenseVoiceSmall and speech_fsmn_vad_zh-cn-16k-common-pytorch models, respectively.

python3 model_download.py --model_name=iic/SenseVoiceSmall --local_dir=models/iic/SenseVoiceSmall

python3 model_download.py --model_name=iic/speech_fsmn_vad_zh-cn-16k-common-pytorch --local_dir=models/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch
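
After both commands finish, it is worth a quick check that the two model directories are actually populated before moving on. A minimal sketch (the paths simply mirror the --local_dir values used above):

import os

# The paths below mirror the --local_dir arguments passed to model_download.py.
for model_dir in ("models/iic/SenseVoiceSmall",
                  "models/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch"):
    files = os.listdir(model_dir) if os.path.isdir(model_dir) else []
    status = f"{len(files)} files" if files else "missing or empty!"
    print(f"{model_dir}: {status}")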

2. Docker deployment

Create a docker folder in the root directory of the SenseVoice project and place the Docker-related files (Dockerfile, compose.yaml, requirements.txt and start.sh, whose contents are attached at the end of this post) inside it.

In webui.py, change the model variable on line 18 to models/iic/SenseVoiceSmall (the local path used for the download in step 1), and change the vad_model argument on line 20 to models/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch.

webui.py

# coding=utf-8

import os
import librosa
import base64
import io
import gradio as gr
import re

import numpy as np
import torch
import torchaudio
from argparse import ArgumentParser


from funasr import AutoModel

model = "models/iic/SenseVoiceSmall"
model = AutoModel(model=model,
				  vad_model="models/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
				  vad_kwargs={"max_single_segment_time": 30000},
				  trust_remote_code=True,
				  )

emo_dict = {
	"<|HAPPY|>": "😊",
	"<|SAD|>": "😔",
	"<|ANGRY|>": "😡",
	"<|NEUTRAL|>": "",
	"<|FEARFUL|>": "😰",
	"<|DISGUSTED|>": "🤢",
	"<|SURPRISED|>": "😮",
}

event_dict = {
	"<|BGM|>": "🎼",
	"<|Speech|>": "",
	"<|Applause|>": "👏",
	"<|Laughter|>": "😀",
	"<|Cry|>": "😭",
	"<|Sneeze|>": "🤧",
	"<|Breath|>": "",
	"<|Cough|>": "🤧",
}

emoji_dict = {
	"<|nospeech|><|Event_UNK|>": "❓",
	"<|zh|>": "",
	"<|en|>": "",
	"<|yue|>": "",
	"<|ja|>": "",
	"<|ko|>": "",
	"<|nospeech|>": "",
	"<|HAPPY|>": "😊",
	"<|SAD|>": "😔",
	"<|ANGRY|>": "😡",
	"<|NEUTRAL|>": "",
	"<|BGM|>": "🎼",
	"<|Speech|>": "",
	"<|Applause|>": "👏",
	"<|Laughter|>": "😀",
	"<|FEARFUL|>": "😰",
	"<|DISGUSTED|>": "🤢",
	"<|SURPRISED|>": "😮",
	"<|Cry|>": "😭",
	"<|EMO_UNKNOWN|>": "",
	"<|Sneeze|>": "🤧",
	"<|Breath|>": "",
	"<|Cough|>": "😷",
	"<|Sing|>": "",
	"<|Speech_Noise|>": "",
	"<|withitn|>": "",
	"<|woitn|>": "",
	"<|GBG|>": "",
	"<|Event_UNK|>": "",
}

lang_dict =  {
    "<|zh|>": "<|lang|>",
    "<|en|>": "<|lang|>",
    "<|yue|>": "<|lang|>",
    "<|ja|>": "<|lang|>",
    "<|ko|>": "<|lang|>",
    "<|nospeech|>": "<|lang|>",
}

emo_set = {"😊", "😔", "😡", "😰", "🤢", "😮"}
event_set = {"🎼", "👏", "😀", "😭", "🤧", "😷",}

def format_str(s):
	for sptk in emoji_dict:
		s = s.replace(sptk, emoji_dict[sptk])
	return s


def format_str_v2(s):
	sptk_dict = {}
	for sptk in emoji_dict:
		sptk_dict[sptk] = s.count(sptk)
		s = s.replace(sptk, "")
	emo = "<|NEUTRAL|>"
	for e in emo_dict:
		if sptk_dict[e] > sptk_dict[emo]:
			emo = e
	for e in event_dict:
		if sptk_dict[e] > 0:
			s = event_dict[e] + s
	s = s + emo_dict[emo]

	for emoji in emo_set.union(event_set):
		s = s.replace(" " + emoji, emoji)
		s = s.replace(emoji + " ", emoji)
	return s.strip()

def format_str_v3(s):
	def get_emo(s):
		return s[-1] if s[-1] in emo_set else None
	def get_event(s):
		return s[0] if s[0] in event_set else None

	s = s.replace("<|nospeech|><|Event_UNK|>", "❓")
	for lang in lang_dict:
		s = s.replace(lang, "<|lang|>")
	s_list = [format_str_v2(s_i).strip(" ") for s_i in s.split("<|lang|>")]
	new_s = " " + s_list[0]
	cur_ent_event = get_event(new_s)
	for i in range(1, len(s_list)):
		if len(s_list[i]) == 0:
			continue
		if get_event(s_list[i]) == cur_ent_event and get_event(s_list[i]) != None:
			s_list[i] = s_list[i][1:]
		#else:
		cur_ent_event = get_event(s_list[i])
		if get_emo(s_list[i]) != None and get_emo(s_list[i]) == get_emo(new_s):
			new_s = new_s[:-1]
		new_s += s_list[i].strip().lstrip()
	new_s = new_s.replace("The.", " ")
	return new_s.strip()

def model_inference(input_wav, language, fs=16000):
	# task_abbr = {"Speech Recognition": "ASR", "Rich Text Transcription": ("ASR", "AED", "SER")}
	language_abbr = {"auto": "auto", "zh": "zh", "en": "en", "yue": "yue", "ja": "ja", "ko": "ko",
					 "nospeech": "nospeech"}
	
	# task = "Speech Recognition" if task is None else task
	language = "auto" if len(language) < 1 else language
	selected_language = language_abbr[language]
	# selected_task = task_abbr.get(task)
	
	# print(f"input_wav: {type(input_wav)}, {input_wav[1].shape}, {input_wav}")
	
	if isinstance(input_wav, tuple):
		fs, input_wav = input_wav
		input_wav = input_wav.astype(np.float32) / np.iinfo(np.int16).max
		if len(input_wav.shape) > 1:
			input_wav = input_wav.mean(-1)
		if fs != 16000:
			print(f"audio_fs: {fs}")
			resampler = torchaudio.transforms.Resample(fs, 16000)
			input_wav_t = torch.from_numpy(input_wav).to(torch.float32)
			input_wav = resampler(input_wav_t[None, :])[0, :].numpy()
	
	
	merge_vad = True #False if selected_task == "ASR" else True
	print(f"language: {language}, merge_vad: {merge_vad}")
	text = model.generate(input=input_wav,
						  cache={},
						  language=language,
						  use_itn=True,
						  batch_size_s=60, merge_vad=merge_vad)
	
	print(text)
	text = text[0]["text"]
	text = format_str_v3(text)
	
	print(text)
	
	return text


audio_examples = [
    ["example/zh.mp3", "zh"],
    ["example/yue.mp3", "yue"],
    ["example/en.mp3", "en"],
    ["example/ja.mp3", "ja"],
    ["example/ko.mp3", "ko"],
    ["example/emo_1.wav", "auto"],
    ["example/emo_2.wav", "auto"],
    ["example/emo_3.wav", "auto"],
    #["example/emo_4.wav", "auto"],
    #["example/event_1.wav", "auto"],
    #["example/event_2.wav", "auto"],
    #["example/event_3.wav", "auto"],
    ["example/rich_1.wav", "auto"],
    ["example/rich_2.wav", "auto"],
    #["example/rich_3.wav", "auto"],
    ["example/longwav_1.wav", "auto"],
    ["example/longwav_2.wav", "auto"],
    ["example/longwav_3.wav", "auto"],
    #["example/longwav_4.wav", "auto"],
]



html_content = """
<div>
    <h2 style="font-size: 22px;margin-left: 0px;">Voice Understanding Model: SenseVoice-Small</h2>
    <p style="font-size: 18px;margin-left: 20px;">SenseVoice-Small is an encoder-only speech foundation model designed for rapid voice understanding. It encompasses a variety of features including automatic speech recognition (ASR), spoken language identification (LID), speech emotion recognition (SER), and acoustic event detection (AED). SenseVoice-Small supports multilingual recognition for Chinese, English, Cantonese, Japanese, and Korean. Additionally, it offers exceptionally low inference latency, performing 7 times faster than Whisper-small and 17 times faster than Whisper-large.</p>
    <h2 style="font-size: 22px;margin-left: 0px;">Usage</h2> <p style="font-size: 18px;margin-left: 20px;">Upload an audio file or input through a microphone, then select the task and language. the audio is transcribed into corresponding text along with associated emotions (😊 happy, 😡 angry/exicting, 😔 sad) and types of sound events (😀 laughter, 🎼 music, 👏 applause, 🤧 cough&sneeze, 😭 cry). The event labels are placed in the front of the text and the emotion are in the back of the text.</p>
	<p style="font-size: 18px;margin-left: 20px;">Recommended audio input duration is below 30 seconds. For audio longer than 30 seconds, local deployment is recommended.</p>
	<h2 style="font-size: 22px;margin-left: 0px;">Repo</h2>
	<p style="font-size: 18px;margin-left: 20px;"><a href="https://github.com/FunAudioLLM/SenseVoice" target="_blank">SenseVoice</a>: multilingual speech understanding model</p>
	<p style="font-size: 18px;margin-left: 20px;"><a href="https://github.com/modelscope/FunASR" target="_blank">FunASR</a>: fundamental speech recognition toolkit</p>
	<p style="font-size: 18px;margin-left: 20px;"><a href="https://github.com/FunAudioLLM/CosyVoice" target="_blank">CosyVoice</a>: high-quality multilingual TTS model</p>
</div>
"""


def launch(host, port):
	with gr.Blocks(theme=gr.themes.Soft()) as demo:
		# gr.Markdown(description)
		gr.HTML(html_content)
		with gr.Row():
			with gr.Column():
				audio_inputs = gr.Audio(label="Upload audio or use the microphone")
				
				with gr.Accordion("Configuration"):
					language_inputs = gr.Dropdown(choices=["auto", "zh", "en", "yue", "ja", "ko", "nospeech"],
												  value="auto",
												  label="Language")
				fn_button = gr.Button("Start", variant="primary")
				text_outputs = gr.Textbox(label="Results")
			gr.Examples(examples=audio_examples, inputs=[audio_inputs, language_inputs], examples_per_page=20)
		
		fn_button.click(model_inference, inputs=[audio_inputs, language_inputs], outputs=text_outputs)

	# demo.launch()
	demo.launch(server_name=host, server_port=port)


if __name__ == "__main__":
	# iface.launch()
	parser = ArgumentParser()
	parser.add_argument('--host', default="0.0.0.0", type=str, help='Server bound address')
	parser.add_argument('--port', default=5306, type=int, help='Port number')
	args = parser.parse_args()
	launch(args.host, args.port)
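
Before building the image, you can sanity-check the local model paths without launching Gradio by calling the model directly (either with the Python dependencies installed on the host, or from inside the container). A minimal sketch based on the AutoModel and generate calls in webui.py above; funasr also accepts a plain audio file path as input, so one of the repo's bundled example clips is used here:

# Quick sanity check: load the local models and transcribe one bundled example
# clip, mirroring the AutoModel/generate usage in webui.py.
from funasr import AutoModel

model = AutoModel(model="models/iic/SenseVoiceSmall",
                  vad_model="models/iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
                  vad_kwargs={"max_single_segment_time": 30000},
                  trust_remote_code=True)

res = model.generate(input="example/zh.mp3",   # sample clip shipped with the repo
                     cache={},
                     language="auto",
                     use_itn=True,
                     batch_size_s=60,
                     merge_vad=True)
print(res[0]["text"])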

Then run cd docker && docker compose -f compose.yaml up. Open port 5306 in a browser; once the SenseVoice web UI loads, the deployment has succeeded.
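
Note that compose.yaml references the image sense-voice:1.0 but does not contain a build section, so the image has to exist before docker compose up can start the container. Assuming the docker folder (which holds the Dockerfile and requirements.txt) is used as the build context, something like cd docker && docker build -t sense-voice:1.0 . should produce it.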

Finally, here are the contents of the Docker-related files:

Dockerfile

FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04

ENV LANG=C.UTF-8 LC_ALL=C.UTF-8

ENV DEBIAN_FRONTEND=noninteractive
SHELL ["/bin/bash", "-c"]

RUN apt-get update -y
RUN apt-get install -y libgl1-mesa-glx libglib2.0-0 gcc g++
RUN apt-get install -y net-tools wget curl git

RUN apt-get install -y make build-essential libssl-dev zlib1g-dev libbz2-dev libreadline-dev libsqlite3-dev libffi-dev liblzma-dev

# Download and build Python from a China mirror
# wget https://www.python.org/ftp/python/3.10.13/Python-3.10.13.tar.xz && tar Jxf Python-3.10.13.tar.xz 
RUN wget https://mirrors.huaweicloud.com/python/3.10.13/Python-3.10.13.tar.xz && tar Jxf Python-3.10.13.tar.xz
RUN cd Python-3.10.13 && ./configure --with-system-ffi --enable-shared --enable-optimizations && make && make install && echo "/usr/local/lib" | tee /etc/ld.so.conf.d/python3.conf && ldconfig
RUN python3 -V && pip3 -V

# Use a China PyPI mirror
RUN pip3 config set global.index-url https://mirrors.aliyun.com/pypi/simple/ && pip3 config set install.trusted-host mirrors.aliyun.com

WORKDIR /workspace
COPY ./requirements.txt ./

RUN pip3 install -r requirements.txt
RUN apt-get install -y ffmpeg
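
Two things worth noting about this Dockerfile: Python 3.10.13 is built from source via the Huawei Cloud mirror and pip is pointed at the Aliyun mirror to avoid slow downloads from inside mainland China, and the SenseVoice source itself is never copied into the image, because compose.yaml below mounts the project directory as a volume at /workspace/SenseVoice.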

compose.yaml

services:
  sense-voice:
    container_name: sense-voice
    image: sense-voice:1.0
    restart: always
    ports:
      - 5306:5306
    environment:
      - TZ=Asia/Tokyo
      - NVIDIA_VISIBLE_DEVICES=all
    volumes:
      - ../../SenseVoice:/workspace/SenseVoice
    # command: tail -f /dev/null
    command: sh -c "sh /workspace/SenseVoice/docker/start.sh"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              capabilities: [gpu]
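
A couple of notes on this compose file: the deploy.resources.reservations.devices entry with driver: nvidia only takes effect if the NVIDIA Container Toolkit is installed on the host, and the relative volume path ../../SenseVoice is resolved against the location of the compose file, so it assumes the file lives at SenseVoice/docker/compose.yaml and the project directory itself is named SenseVoice.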

requirements.txt

--extra-index-url https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud/pytorch/wheel/cu121/
# torch<=2.3
# torchaudio
torch==2.1.2
torchaudio==2.1.2
torchvision==0.16.2
modelscope
huggingface
huggingface_hub
funasr>=1.1.3
numpy<=1.26.4
gradio
fastapi>=0.111.1

start.sh

#! /bin/bash
cd SenseVoice && python3 webui.py --port=5306

That's it. I hope this helps anyone who finds their way here.
