🔥 实战 | 基于SoulX-FlashHead构建25FPS实时唇形同步FLV直播服务
大家好,今天给大家分享一个实战项目:基于SoulX-FlashHead模型搭建高帧率(25FPS)的实时唇形同步FLV直播服务。通过这个项目,我们可以实现输入文字自动生成语音,并同步生成对应唇形的视频流,最终以FLV格式推流,实现低延迟、高流畅度的直播体验。

📋 项目背景
SoulX-FlashHead是一款轻量级的唇形同步模型,能够根据音频生成逼真的人脸唇形视频。默认配置下的帧率和码率往往无法满足直播级别的流畅度要求,本文将重点讲解如何将模型推理、TTS语音合成、FFmpeg编码整合,打造25FPS高流畅度的FLV直播服务。
🎯 核心功能
- ✅ 文字转语音(TTS):基于edge-tts实现中文语音合成
- ✅ 25FPS高帧率推理:优化模型推理参数,提升视频流畅度
- ✅ FLV实时推流:通过FFmpeg编码生成低延迟FLV视频流
- ✅ FastAPI服务封装:提供Web界面和HTTP接口,开箱即用
- ✅ 模型预热机制:减少首次推理延迟
- ✅ 完整的异常处理:保证服务稳定性
🛠️ 环境准备
1. 基础依赖安装
bash
# 基础Python依赖
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
pip install fastapi uvicorn loguru edge-tts librosa soundfile opencv-python numpy
pip install simplejpeg # 可选,加速JPEG处理
pip install flash_head # SoulX-FlashHead模型
pip install ffmpeg-python
# 系统依赖(FFmpeg必须)
# Ubuntu/Debian
sudo apt update && sudo apt install -y ffmpeg
# CentOS
sudo yum install -y ffmpeg
# MacOS
brew install ffmpeg
2. 模型下载
需要提前下载以下模型文件,并放到指定目录:
- SoulX-FlashHead-1_3B:唇形同步主模型(放到 models/SoulX-FlashHead-1_3B 目录)
- wav2vec2-base-960h:音频特征提取模型(放到 models/wav2vec2-base-960h 目录)
- 参考人脸图片:放到 examples/girl.png(可替换为自己的图片)
🚀 核心代码实现
完整代码(server.py)
python
import os
import cv2
import torch
import numpy as np
import threading
import time
import asyncio
import uvicorn
import edge_tts
import io
import librosa
import soundfile as sf
import subprocess
import queue
from loguru import logger
from fastapi import FastAPI
from fastapi.responses import HTMLResponse, StreamingResponse
from flash_head.inference import get_pipeline, get_base_data, get_infer_params, get_audio_embedding, run_pipeline
import torch._dynamo
torch._dynamo.config.suppress_errors = True
# Optional dependency: simplejpeg accelerates JPEG encode/decode.
# Fall back to the default (slower) path when it is not installed.
try:
    import simplejpeg  # noqa: F401
    USE_FAST_JPEG = True
except ImportError:  # narrowed from a bare except: only absence of the package is expected
    USE_FAST_JPEG = False
# ==============================
# Configuration (tuned for 25 FPS)
# ==============================
CKPT_DIR = "models/SoulX-FlashHead-1_3B"    # lip-sync main model checkpoint directory
WAV2VEC_DIR = "models/wav2vec2-base-960h"   # audio feature extractor directory
MODEL_TYPE = "lite"                          # lightweight model variant
COND_IMAGE_PATH = "examples/girl.png"        # reference face image
TTS_VOICE = "zh-CN-XiaoxiaoNeural"           # edge-tts voice id
SAMPLE_RATE = 16000                          # audio sample rate in Hz
# [Changed] target frame rate raised to 25 for smoother playback
TARGET_FPS = 25
# [Changed] video bitrate raised to 600k to preserve picture quality
VIDEO_BITRATE = "600k"
AUDIO_BITRATE = "32k"
app = FastAPI()
pipeline = None  # global inference pipeline, populated by load_model() at startup
# ==============================
# Model loading & warmup (3 s)
# ==============================
def load_model():
    """Initialize the global FlashHead pipeline and precompute reference-face data."""
    global pipeline
    logger.info("Loading Model...")
    pipeline = get_pipeline(world_size=1, ckpt_dir=CKPT_DIR, model_type=MODEL_TYPE, wav2vec_dir=WAV2VEC_DIR)
    # Precompute conditioning data from the reference face image (with face cropping enabled).
    get_base_data(pipeline, cond_image_path_or_dir=COND_IMAGE_PATH, base_seed=9999, use_face_crop=True)
    logger.info("Model Loaded.")
def warm_up():
    """Run one dummy inference pass to prime GPU kernels and model caches.

    Failures are logged and swallowed: warmup is an optimization, not a
    prerequisite for serving requests.
    """
    logger.info("Warming up (Duration: 3s, FPS: 25)...")
    try:
        # 3 seconds' worth of samples carrying a quiet sine tone.
        duration = 3
        timeline = np.linspace(0, 1, SAMPLE_RATE * duration)
        probe_audio = (np.sin(2 * np.pi * 440 * timeline) * 0.5).astype(np.float32)
        with torch.no_grad():
            probe_emb = get_audio_embedding(pipeline, probe_audio)
            infer_cfg = get_infer_params()
            infer_cfg['tgt_fps'] = TARGET_FPS
            run_pipeline(pipeline, probe_emb[:, :infer_cfg['frame_num']])
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        logger.info("Warmup Complete.")
    except Exception as e:
        logger.warning(f"Warmup skip: {e}")
# ==============================
# FFmpeg wrapper
# ==============================
class FLVStreamer:
    """Wraps an FFmpeg subprocess that muxes raw BGR frames plus a WAV file into FLV.

    Lifecycle: construct -> set_audio() (writes the WAV and spawns FFmpeg) ->
    feed_video() once per frame -> read_stream() to consume encoded FLV bytes ->
    stop() (also invoked automatically when read_stream finishes).
    """

    def __init__(self):
        self.process = None       # FFmpeg Popen handle, created in set_audio()
        self.is_running = False
        os.makedirs("temp", exist_ok=True)
        # Nanosecond timestamp keeps concurrent streams from clobbering each other's audio.
        self.temp_audio_path = os.path.join("temp", f"temp_audio_{time.time_ns()}.wav")

    def set_audio(self, audio_data):
        """Write *audio_data* to a temp WAV and start FFmpeg reading raw video on stdin."""
        logger.info(f"Writing audio to {self.temp_audio_path}...")
        sf.write(self.temp_audio_path, audio_data, SAMPLE_RATE)
        command = [
            'ffmpeg',
            '-y',
            # Video input: force -r 25 so FFmpeg timestamps frames at the target FPS
            '-f', 'rawvideo',
            '-vcodec', 'rawvideo',
            '-s', '512x512',
            '-pix_fmt', 'bgr24',
            '-r', str(TARGET_FPS),
            '-i', '-',
            # Audio input
            '-i', self.temp_audio_path,
            # Encoding configuration
            '-c:v', 'libx264',
            '-preset', 'ultrafast',
            '-tune', 'zerolatency',
            '-b:v', VIDEO_BITRATE,
            '-pix_fmt', 'yuv420p',
            '-g', str(TARGET_FPS * 2),  # keyframe interval (2 seconds)
            '-vsync', 'cfr',
            '-c:a', 'aac',
            '-b:a', AUDIO_BITRATE,
            '-ar', '16000',
            '-f', 'flv',
            '-flvflags', 'no_duration_filesize',
            'pipe:1'
        ]
        logger.info(f"Starting FFmpeg (FPS: {TARGET_FPS})...")
        self.process = subprocess.Popen(
            command,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            bufsize=0
        )
        self.is_running = True

        def log_stderr():
            # Drain FFmpeg's stderr so the pipe never fills up and stalls encoding.
            while self.is_running:
                line = self.process.stderr.readline()
                if not line:
                    break
                # errors='ignore' guarantees decode cannot raise here.
                msg = line.decode('utf-8', errors='ignore').strip()
                if msg:
                    logger.debug(f"[FFmpeg] {msg}")

        threading.Thread(target=log_stderr, daemon=True).start()

    def feed_video(self, frame_bgr):
        """Push one raw BGR frame into FFmpeg's stdin; disable the stream on pipe failure."""
        if not self.process or not self.is_running:
            return
        try:
            self.process.stdin.write(frame_bgr.tobytes())
        except BrokenPipeError:
            logger.warning("FFmpeg stdin broken")
            self.is_running = False
        except Exception as e:
            logger.error(f"Feed video error: {e}")
            self.is_running = False

    def read_stream(self):
        """Yield encoded FLV chunks from FFmpeg's stdout; always clean up on exit."""
        if not self.process:
            return
        try:
            while self.is_running:
                chunk = self.process.stdout.read(4096)
                if not chunk:
                    break
                yield chunk
        finally:
            self.stop()

    def stop(self):
        """Terminate FFmpeg and remove the temp WAV file (safe to call repeatedly)."""
        self.is_running = False
        if self.process:
            try:
                self.process.stdin.close()
                self.process.terminate()
                try:
                    self.process.wait(timeout=2)
                except subprocess.TimeoutExpired:
                    # terminate() was ignored — escalate so we never leak a process.
                    self.process.kill()
            except OSError:
                pass  # best-effort shutdown; pipe/process may already be gone
        if os.path.exists(self.temp_audio_path):
            try:
                os.remove(self.temp_audio_path)
            except OSError:
                pass
# ==============================
# Inference thread
# ==============================
def _synthesize_speech(text):
    """Run edge-tts on *text* and return mono float32 audio at SAMPLE_RATE."""
    comm = edge_tts.Communicate(text, TTS_VOICE)
    audio_buffer = io.BytesIO()

    async def _collect():
        async for chunk in comm.stream():
            if chunk["type"] == "audio":
                audio_buffer.write(chunk["data"])

    # asyncio.run creates AND tears down the loop even when _collect raises,
    # unlike the previous manual new_event_loop()/close() pairing which leaked
    # the loop on error.
    asyncio.run(_collect())
    audio_buffer.seek(0)
    audio_data, sr = sf.read(audio_buffer)
    if sr != SAMPLE_RATE:
        audio_data = librosa.resample(audio_data, orig_sr=sr, target_sr=SAMPLE_RATE)
    return audio_data.astype(np.float32)


def inference_worker(text, streamer):
    """Generate lip-synced video for *text* and push frames through *streamer*.

    Runs in a background thread: synthesizes speech, aligns the audio to the
    model's chunking scheme, runs chunked inference, and feeds BGR frames to
    the FFmpeg encoder until the audio duration is covered. Any failure is
    logged and the streamer is stopped so the HTTP response terminates.
    """
    try:
        logger.info(f"[Worker] 开始生成: {text[:20]}...")
        # 1. TTS
        audio_data = _synthesize_speech(text)
        # 2. Hand the audio track to FFmpeg as 16-bit PCM.
        audio_int16 = (audio_data * 32767).astype(np.int16)
        streamer.set_audio(audio_int16)
        # 3. Inference parameters
        params = get_infer_params()
        params['tgt_fps'] = TARGET_FPS
        tgt_fps = params['tgt_fps']
        f_num = params['frame_num']            # frames per inference chunk
        m_num = params['motion_frames_num']    # leading motion-context frames per chunk
        s_len = f_num - m_num                  # net new frames contributed per chunk
        s_a = s_len * SAMPLE_RATE // tgt_fps   # audio samples consumed per chunk step
        f_a = f_num * SAMPLE_RATE // tgt_fps   # audio samples spanned by one full chunk
        # Zero-pad the audio so its length lands exactly on a chunk boundary.
        rem = (len(audio_data) - f_a) % s_a
        if rem > 0:
            audio_data = np.concatenate([audio_data, np.zeros(s_a - rem, dtype=np.float32)])
        with torch.no_grad():
            emb = get_audio_embedding(pipeline, audio_data)
        chunks = (emb.shape[1] - f_num) // s_len
        # Cap output at the number of frames the audio actually covers.
        audio_duration = len(audio_data) / SAMPLE_RATE
        target_total_frames = int(audio_duration * tgt_fps)
        logger.info(f"Audio Duration: {audio_duration:.2f}s | Target Frames: {target_total_frames}")
        generated_frames = 0
        for i in range(chunks):
            if generated_frames >= target_total_frames:
                break
            s = i * s_len
            e = s + f_num
            c_emb = emb[:, s:e].contiguous()
            with torch.no_grad():
                vid = run_pipeline(pipeline, c_emb)
            if i != 0:
                # Later chunks re-render the motion-context frames; drop the overlap.
                vid = vid[m_num:]
            frames_np = vid.cpu().numpy().astype(np.uint8)
            for k in range(frames_np.shape[0]):
                if generated_frames >= target_total_frames:
                    break
                # Model emits RGB; FFmpeg was configured for bgr24 input.
                f_bgr = cv2.cvtColor(frames_np[k], cv2.COLOR_RGB2BGR)
                streamer.feed_video(f_bgr)
                generated_frames += 1
        logger.info(f"Finished. Generated {generated_frames} frames.")
        # Give FFmpeg a moment to flush buffered output before closing stdin.
        time.sleep(1)
        streamer.stop()
    except Exception as e:
        logger.error(f"Worker error: {e}")
        import traceback
        traceback.print_exc()
        streamer.stop()
# ==============================
# API routes
# ==============================
@app.get("/")
async def index():
    """Serve the single-page player UI."""
    return HTMLResponse(html_content)
@app.get("/live.flv")
async def live_stream(text: str = "大家好,帧率已调整为25帧,画面更流畅。"):
    """Start a generation worker for *text* and stream the resulting FLV bytes.

    Waits (up to 20 s) for the worker thread to spawn FFmpeg before attaching
    the streaming response; fails fast if the worker dies before that.
    """
    streamer = FLVStreamer()
    worker = threading.Thread(target=inference_worker, args=(text, streamer), daemon=True)
    worker.start()
    # FFmpeg is spawned inside the worker's set_audio(); poll until it exists.
    start_wait = time.time()
    while streamer.process is None:
        await asyncio.sleep(0.1)
        if not worker.is_alive():
            # Worker crashed before starting FFmpeg (e.g. TTS failure) — report
            # immediately instead of waiting out the full timeout.
            return {"error": "generation worker failed to start"}
        if time.time() - start_wait > 20:
            return {"error": "FFmpeg start timeout"}
    return StreamingResponse(
        streamer.read_stream(),
        media_type="video/x-flv",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
        }
    )
# ==============================
# Front-end page
# ==============================
# Single-page player UI: uses flv.js to pull /live.flv and render the stream.
# NOTE: this is a runtime string served verbatim to browsers — do not edit
# the markup casually; it is user-facing content.
html_content = """
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>AI实时唇形同步 (25FPS版)</title>
<style>
body{background:#111;color:#eee;font-family:sans-serif;text-align:center}
.container{width:600px;margin:20px auto;}
video{width:100%;background:#000;border:2px solid #4CAF50;}
textarea{width:100%;height:60px;background:#222;color:#eee;border:1px solid #444;padding:10px;box-sizing:border-box;margin-bottom:10px;}
button{padding:12px 30px;background:#4CAF50;border:none;color:#fff;cursor:pointer;font-size:16px;border-radius:4px}
.log{color:#aaa;font-size:12px;height:100px;overflow-y:scroll;background:#1a1a1a;padding:10px;text-align:left;border:1px solid #333;margin-top:10px;font-family:monospace;}
</style>
<script src="https://cdn.jsdelivr.net/npm/flv.js@latest/dist/flv.min.js"></script>
</head>
<body>
<h2>⚡ SoulX 实时唇形同步 (25FPS版)</h2>
<div class="container">
<video id="videoPlayer" controls autoplay>Your browser does not support HTML5 video.</video>
<textarea id="t">帧率提升至 25FPS,码率 600kbps,提供更流畅的直播体验。</textarea><br>
<button onclick="start()">▶ 开始直播</button>
<div id="log" class="log">系统就绪...</div>
</div>
<script>
const videoElement = document.getElementById('videoPlayer');
const logEl = document.getElementById('log');
let flvPlayer = null;
function l(m){
const now = new Date();
const ts = `${now.getHours().toString().padStart(2,'0')}:${now.getMinutes().toString().padStart(2,'0')}:${now.getSeconds().toString().padStart(2,'0')}`;
logEl.innerHTML = `[${ts}] ${m}<br>` + logEl.innerHTML;
}
function start() {
const text = document.getElementById('t').value;
if(!text) return;
l("正在启动...");
if (flvPlayer) {
flvPlayer.pause();
flvPlayer.unload();
flvPlayer.detachMediaElement();
flvPlayer.destroy();
flvPlayer = null;
}
if (flvjs.isSupported()) {
flvPlayer = flvjs.createPlayer({
type: 'flv',
url: '/live.flv?text=' + encodeURIComponent(text),
isLive: true,
hasAudio: true,
hasVideo: true,
cors: true
}, {
enableStashBuffer: false,
stashInitialSize: 128,
lazyLoad: false
});
flvPlayer.attachMediaElement(videoElement);
flvPlayer.load();
let playPromise = videoElement.play();
if (playPromise !== undefined) {
playPromise.then(_ => {
l("播放已启动");
}).catch(error => {
l("自动播放被阻止,请点击视频区域");
});
}
}
}
</script>
</body>
</html>
"""
@app.on_event("startup")
async def startup_event():
    """Load and warm up the model once when the server starts."""
    # NOTE(review): @app.on_event is deprecated in recent FastAPI versions in
    # favor of lifespan handlers — confirm the installed version before migrating.
    load_model()
    warm_up()
if __name__ == "__main__":
    # Serve on all interfaces at port 8383.
    uvicorn.run(app, host="0.0.0.0", port=8383)
🔍 核心模块解析
1. 模型加载与预热
python
def load_model():
    """Initialize the global FlashHead pipeline and precompute reference-face data."""
    global pipeline
    logger.info("Loading Model...")
    pipeline = get_pipeline(world_size=1, ckpt_dir=CKPT_DIR, model_type=MODEL_TYPE, wav2vec_dir=WAV2VEC_DIR)
    # Precompute conditioning data from the reference face image (face crop enabled).
    get_base_data(pipeline, cond_image_path_or_dir=COND_IMAGE_PATH, base_seed=9999, use_face_crop=True)
    logger.info("Model Loaded.")
def warm_up():
    """Run one dummy inference pass to prime GPU kernels and model caches."""
    logger.info("Warming up (Duration: 3s, FPS: 25)...")
    try:
        # 3 seconds' worth of samples carrying a quiet sine tone.
        duration = 3
        t = np.linspace(0, 1, SAMPLE_RATE * duration)
        dummy_audio = (np.sin(2 * np.pi * 440 * t) * 0.5).astype(np.float32)
        with torch.no_grad():
            emb = get_audio_embedding(pipeline, dummy_audio)
            params = get_infer_params()
            params['tgt_fps'] = TARGET_FPS
            _ = run_pipeline(pipeline, emb[:, :params['frame_num']])
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        logger.info("Warmup Complete.")
    except Exception as e:
        logger.warning(f"Warmup skip: {e}")
- 模型加载:初始化FlashHead管道,加载参考人脸图片
- 预热机制:使用3秒的虚拟音频进行一次推理,预热GPU/模型缓存,减少首次请求延迟
- 关键参数:设置目标帧率为25FPS
2. FFmpeg FLV编码封装
python
command = [
    'ffmpeg',
    '-y',
    # Video input: force -r 25 so FFmpeg timestamps frames at the target FPS
    '-f', 'rawvideo',
    '-vcodec', 'rawvideo',
    '-s', '512x512',
    '-pix_fmt', 'bgr24',
    '-r', str(TARGET_FPS),
    '-i', '-',
    # Audio input
    '-i', self.temp_audio_path,
    # Encoding configuration
    '-c:v', 'libx264',
    '-preset', 'ultrafast',
    '-tune', 'zerolatency',
    '-b:v', VIDEO_BITRATE,
    '-pix_fmt', 'yuv420p',
    '-g', str(TARGET_FPS * 2),  # keyframe interval (2 seconds)
    '-vsync', 'cfr',
    '-c:a', 'aac',
    '-b:a', AUDIO_BITRATE,
    '-ar', '16000',
    '-f', 'flv',
    '-flvflags', 'no_duration_filesize',
    'pipe:1'
]
- 编码优化:
  - 使用 ultrafast 预设 + zerolatency 调优,优先保证低延迟
  - 关键帧间隔设置为2倍帧率(50帧),平衡延迟和容错性
  - 固定25FPS输入,保证视频流畅度
  - 600k视频码率,保证画质的同时控制带宽
- FLV格式:设置 no_duration_filesize 参数,支持实时流输出
3. 推理与流推送
python
def inference_worker(text, streamer):
    try:
        # 1. TTS speech synthesis
        comm = edge_tts.Communicate(text, TTS_VOICE)
        # 2. Audio processing and alignment
        # 3. Model inference to generate video frames
        # 4. Push each frame into the FFmpeg encoding pipe
        # 5. Exception handling and resource cleanup
    except Exception as e:
        logger.error(f"Worker error: {e}")
        import traceback
        traceback.print_exc()
        streamer.stop()
- TTS合成:使用edge-tts将文字转为语音
- 音频对齐:调整音频长度,保证与视频帧同步
- 分块推理:将长音频分块处理,避免内存溢出
- 逐帧推送:将生成的视频帧实时推送到FFmpeg编码进程
4. Web服务与前端
- FastAPI路由:提供根页面和FLV流接口
- 前端页面:使用flv.js播放FLV流,支持自定义文字输入
- 低延迟配置:关闭缓存、最小化缓冲区,保证实时性
🎬 运行与测试
1. 启动服务
bash
python server.py
2. 访问服务
打开浏览器访问 http://localhost:8383,即可看到Web界面:
- 在文本框中输入想要合成的文字
- 点击"开始直播"按钮
- 等待几秒后即可看到实时生成的唇形同步视频流
🎨 效果优化建议
- GPU加速:确保使用CUDA版本的PyTorch,大幅提升推理速度
- 模型优化:可尝试使用TensorRT加速模型推理
- 码率调整:根据网络情况调整 VIDEO_BITRATE(建议400k-1000k)
- 语音优化:更换TTS音色(修改 TTS_VOICE 参数)
- 参考图片:替换 COND_IMAGE_PATH 为自定义人脸图片
❌ 常见问题解决
1. FFmpeg启动超时
- 检查FFmpeg是否正确安装:ffmpeg -version
- 确保临时目录有写入权限:mkdir -p temp && chmod 777 temp
2. 视频卡顿
- 检查GPU是否足够:使用 nvidia-smi 查看显存使用
- 降低码率或使用更小的模型(MODEL_TYPE="lite")
- 确认TARGET_FPS设置正确(25)
3. 音频不同步
- 检查音频采样率是否为16000Hz
- 调整 -vsync cfr 参数,强制恒定帧率
### 总结
- 本项目基于SoulX-FlashHead实现了25FPS高帧率的实时唇形同步FLV直播服务,核心整合了TTS语音合成、模型推理、FFmpeg编码和FastAPI Web服务;
- 关键优化点包括:25FPS帧率设置、600k视频码率、FFmpeg低延迟编码配置、模型预热机制;
- 项目提供完整的Web界面,支持自定义文字输入,开箱即用,可根据实际需求调整模型、码率、帧率等参数优化体验。