大家好,今天给大家带来一篇实战向技术博客 :基于 SoulX-FlashHead 实现一套文字→语音→实时唇形同步视频流的 Web 直播系统。
我会直接把原理 + 完整代码 + 部署流程一次性给全,你复制即可运行,开箱即用。
一、项目介绍
这是一个端到端实时 AI 数字人唇形同步系统,核心能力:
- 输入任意中文文本
- 自动生成语音(TTS)
- 音频驱动人脸图像生成唇形动画
- WebSocket 实时推流到浏览器
- 音视频精准毫秒级同步(核心亮点)
整套系统 = 后端推理服务 + Web 前端 + 实时推流,全部整合在一个 Python 文件里。
二、核心技术栈
- 推理框架:PyTorch + SoulX-FlashHead
- Web 服务:FastAPI + Uvicorn
- 实时通信:WebSocket
- 语音合成:edge-tts(微软免费TTS)
- 音视频处理:librosa、OpenCV、simplejpeg、soundfile
- 前端:原生 HTML/JS(无框架依赖)
环境部署参考:5090显卡一键部署 SoulX-FlashHead:AI 多模态交互模型极速上线指南
三、系统设计思路
- 模型加载 & 预热:启动时加载 FlashHead 人脸驱动模型
- 文本转语音:使用 edge-tts 流式生成音频
- 音频编码:统一重采样到 16kHz
- 音频特征提取:wav2vec2 提取音频 embedding
- 唇形推理:分块推理,避免 OOM,逐帧生成视频
- 帧编码:JPEG + Base64 便于网络传输
- WebSocket 广播:同时推音频 + 视频帧
- 前端精准同步 :根据音频时间戳自动匹配视频帧,不飘、不卡、不拉伸
四、完整可运行脚本(直接复制)
我把完整代码整理成可直接运行的脚本版,保存为 server.py 即可运行:
bash
cat << 'EOF' > server.py
import os
import cv2
import torch
import numpy as np
import threading
import time
import base64
import asyncio
import uvicorn
import edge_tts
import io
import librosa
import soundfile as sf
import json
import queue
# simplejpeg is an optional, faster JPEG encoder; without it we fall back
# to cv2.imencode. Catch ImportError specifically — a bare except would also
# swallow KeyboardInterrupt/SystemExit during import.
try:
    import simplejpeg
    USE_FAST_JPEG = True
except ImportError:
    USE_FAST_JPEG = False
    print("建议安装: pip install simplejpeg")
from fastapi import FastAPI, WebSocket, WebSocketDisconnect
from fastapi.responses import HTMLResponse
from loguru import logger
import torch._dynamo
# Suppress torch.compile/dynamo graph-capture errors: fall back to eager
# execution instead of crashing the service.
torch._dynamo.config.suppress_errors = True
from flash_head.inference import get_pipeline, get_base_data, get_infer_params, get_audio_embedding, run_pipeline
# ==============================
# Configuration (edit as needed)
# ==============================
CKPT_DIR = "models/SoulX-FlashHead-1_3B"   # FlashHead checkpoint directory
WAV2VEC_DIR = "models/wav2vec2-base-960h"  # wav2vec2 model used for audio embeddings
MODEL_TYPE = "lite"                        # passed through to get_pipeline()
COND_IMAGE_PATH = "examples/girl.png"      # conditioning face image to animate
TTS_VOICE = "zh-CN-XiaoxiaoNeural"         # edge-tts voice name
SAMPLE_RATE = 16000                        # Hz; audio is resampled to this before inference
OUTPUT_SIZE = (512, 512)                   # NOTE(review): defined but never referenced in this file
app = FastAPI()
pipeline = None  # set once by load_model() at startup
# ==============================
# 全局状态管理
# ==============================
class LiveState:
    """Shared mutable state bridging the inference worker thread and the
    async WebSocket broadcast loop."""

    def __init__(self):
        self.active_connections = set()               # currently connected WebSocket clients
        self.video_buffer = queue.Queue(maxsize=300)  # (timestamp_sec, base64-JPEG) frames
        self.audio_data = None                        # most recent decoded audio samples
        self.is_running = False                       # cooperative cancel flag for the worker
        self.lock = threading.Lock()                  # guards the state reset in inference_worker
        self.buffer_ready = False                     # True once pre-buffering finished


state = LiveState()
# ==============================
# 模型加载 & 预热
# ==============================
def load_model():
    """Load the FlashHead pipeline and prepare the conditioning face image."""
    global pipeline
    logger.info("Loading Model...")
    pipeline = get_pipeline(
        world_size=1,
        ckpt_dir=CKPT_DIR,
        model_type=MODEL_TYPE,
        wav2vec_dir=WAV2VEC_DIR,
    )
    # Pre-compute the face/base data for the conditioning image once at startup.
    get_base_data(
        pipeline,
        cond_image_path_or_dir=COND_IMAGE_PATH,
        base_seed=9999,
        use_face_crop=True,
    )
    logger.info("Model Loaded.")
def warm_up():
    """Run one dummy inference pass so the first real request does not pay
    lazy initialization / compilation cost.

    Best-effort: failures are logged but never abort startup.
    """
    logger.info("Warming up...")
    try:
        # One second of a 440 Hz sine tone as stand-in audio.
        t = np.linspace(0, 1, SAMPLE_RATE)
        dummy = np.sin(2 * np.pi * 440 * t).astype(np.float32)
        with torch.no_grad():
            emb = get_audio_embedding(pipeline, dummy)
            _ = run_pipeline(pipeline, emb[:, :get_infer_params()['frame_num']])
    except Exception as e:
        # Original code used a bare `except: pass`, which also swallowed
        # KeyboardInterrupt and hid the failure reason entirely.
        logger.warning(f"Warm-up failed (continuing): {e}")
# ==============================
# 推理工作线程
# ==============================
def inference_worker(text):
    """Background thread: text -> TTS audio -> chunked lip-sync inference,
    pushing (timestamp, base64-JPEG) frames into state.video_buffer and
    broadcasting the full audio once a pre-buffer of frames exists.

    Runs in a plain thread started by the /start endpoint; cooperates with
    cancellation via state.is_running.
    """
    global state
    try:
        logger.info(f"[Worker] 开始生成: {text[:20]}...")
        # 1. TTS: collect the edge-tts audio stream into an in-memory buffer.
        comm = edge_tts.Communicate(text, TTS_VOICE)
        audio_buffer = io.BytesIO()
        # A fresh event loop is needed because this runs in a worker thread.
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        async def get_audio():
            async for c in comm.stream():
                if c["type"] == "audio":
                    audio_buffer.write(c["data"])
        loop.run_until_complete(get_audio())
        loop.close()
        audio_buffer.seek(0)
        # NOTE(review): edge-tts emits MP3; soundfile only decodes MP3 with
        # libsndfile >= 1.1 — confirm the deployment environment supports it.
        audio_data, sr = sf.read(audio_buffer)
        if sr != SAMPLE_RATE:
            # assumes mono audio — sf.read returns a 2-D array for stereo,
            # which librosa.resample would mishandle here; TODO confirm.
            audio_data = librosa.resample(audio_data, orig_sr=sr, target_sr=SAMPLE_RATE)
        # Reset shared playback state and drain any frames from a previous run.
        with state.lock:
            state.audio_data = audio_data
            state.is_running = True
            state.buffer_ready = False
            while not state.video_buffer.empty():
                state.video_buffer.get()
        # 2. Inference parameters: sliding windows of f_num frames that
        # overlap by m_num motion frames, i.e. advance by s_len per chunk.
        params = get_infer_params()
        tgt_fps = params['tgt_fps']
        f_num = params['frame_num']
        m_num = params['motion_frames_num']
        s_len = f_num - m_num
        a_f = audio_data.astype(np.float32)
        s_a = s_len * SAMPLE_RATE // tgt_fps   # audio samples per chunk step
        f_a = f_num * SAMPLE_RATE // tgt_fps   # audio samples per full window
        # Zero-pad the tail so the audio length aligns to whole chunk steps.
        rem = (len(a_f) - f_a) % s_a
        if rem > 0:
            a_f = np.concatenate([a_f, np.zeros(s_a - rem, dtype=np.float32)])
        with torch.no_grad():
            emb = get_audio_embedding(pipeline, a_f)
        # NOTE(review): the usual sliding-window count is
        # (T - f_num) // s_len + 1; this may drop the final window — verify.
        chunks = (emb.shape[1] - f_num) // s_len
        PRE_BUFFER_FRAMES = tgt_fps  # ~1 second of frames before audio starts
        frame_count = 0
        for i in range(chunks):
            if not state.is_running:
                break
            s = i * s_len
            e = s + f_num
            c_emb = emb[:, s:e].contiguous()
            with torch.no_grad():
                vid = run_pipeline(pipeline, c_emb)
            # Overlapping motion frames are re-generated each chunk; drop
            # them after the first chunk to avoid duplicates.
            if i != 0:
                vid = vid[m_num:]
            frames_np = vid.cpu().numpy().astype(np.uint8)
            for k in range(frames_np.shape[0]):
                if not state.is_running:
                    break
                f = frames_np[k]
                f_bgr = cv2.cvtColor(f, cv2.COLOR_RGB2BGR)
                if USE_FAST_JPEG:
                    # NOTE(review): RGB->BGR->RGB round-trip looks redundant
                    # on this branch — simplejpeg could take `f` directly.
                    jpeg = simplejpeg.encode_jpeg(cv2.cvtColor(f_bgr, cv2.COLOR_BGR2RGB), quality=90, colorspace='RGB')
                else:
                    ret, jpeg = cv2.imencode('.jpg', f_bgr, [cv2.IMWRITE_JPEG_QUALITY, 90])
                    jpeg = jpeg.tobytes()
                b64 = base64.b64encode(jpeg).decode('utf-8')
                # Frame timestamp in seconds, used by the frontend to sync
                # against audio.currentTime.
                timestamp = frame_count / tgt_fps
                state.video_buffer.put((timestamp, b64))
                frame_count += 1
                # Pre-buffer complete: send the whole audio clip so playback
                # starts while frames keep streaming in.
                if frame_count == PRE_BUFFER_FRAMES:
                    logger.info("预缓冲完成,开始播放")
                    audio_bytes = io.BytesIO()
                    sf.write(audio_bytes, audio_data, SAMPLE_RATE, format='WAV')
                    audio_bytes.seek(0)
                    audio_b64 = base64.b64encode(audio_bytes.read()).decode('utf-8')
                    # NOTE(review): asyncio.run here spins up a throwaway loop
                    # in this worker thread and sends on WebSockets owned by
                    # the server's main loop — cross-loop use of Starlette
                    # WebSocket objects; safer would be
                    # asyncio.run_coroutine_threadsafe onto the main loop.
                    asyncio.run(broadcast_audio(audio_b64))
                    state.buffer_ready = True
        logger.info("[Worker] 推理完成")
    except Exception as e:
        logger.error(f"Worker 异常: {e}")
        import traceback
        traceback.print_exc()
    finally:
        # Always clear the running flag so /start can launch a new job.
        state.is_running = False
# ==============================
# WebSocket 广播
# ==============================
async def broadcast_audio(audio_b64):
    """Send one base64-encoded WAV payload to every connected client.

    Best-effort: a dead connection must not abort the broadcast; its removal
    from the set happens in the WebSocket handler's cleanup path.
    """
    msg = json.dumps({"type": "audio", "data": audio_b64})
    for conn in list(state.active_connections):
        try:
            await conn.send_text(msg)
        except Exception:
            # Narrowed from a bare `except:` which would also swallow
            # CancelledError/KeyboardInterrupt.
            pass
async def broadcast_loop():
    """Forever drain the video frame buffer and fan each frame out to all
    connected clients; idle with a short sleep when there is nothing to send.

    Frames are only sent once the worker has signalled buffer_ready, so the
    client receives audio before (or with) its matching frames.
    """
    while True:
        if state.buffer_ready and not state.video_buffer.empty() and state.active_connections:
            try:
                ts, b64 = state.video_buffer.get_nowait()
            except queue.Empty:
                # Raced with the empty() check above; just retry.
                continue
            msg = json.dumps({"type": "video", "ts": ts, "data": b64})
            for conn in list(state.active_connections):
                try:
                    await conn.send_text(msg)
                except Exception:
                    # Drop clients whose socket errored; narrowed from the
                    # original bare `except:` clauses.
                    state.active_connections.discard(conn)
        else:
            # Yield to the event loop without burning a full core.
            await asyncio.sleep(0.001)
# NOTE(review): on_event is deprecated in newer FastAPI; migrate to a
# lifespan handler when convenient.
@app.on_event("startup")
async def startup_event():
    """Load and warm the model at startup, then launch the broadcast loop."""
    load_model()
    warm_up()
    # Keep a reference to the task: the event loop holds only weak references
    # to tasks, so an unreferenced background task may be garbage-collected.
    app.state.broadcast_task = asyncio.create_task(broadcast_loop())
# ==============================
# API 路由
# ==============================
@app.get("/")
async def index():
    """Serve the embedded single-page frontend."""
    page = HTMLResponse(content=html_content)
    return page
@app.post("/start")
async def start(req: dict):
    """Stop any in-flight generation and launch a new TTS + lip-sync job.

    Expects a JSON body {"text": "..."}; returns {"status": "started"} or
    {"status": "error"} when text is missing/empty.
    """
    text = req.get("text")
    if not text:
        return {"status": "error"}
    # Signal the previous worker (if any) to exit its loops.
    state.is_running = False
    state.buffer_ready = False
    # time.sleep() here would block the whole event loop; yield instead.
    await asyncio.sleep(0.1)
    threading.Thread(target=inference_worker, args=(text,), daemon=True).start()
    return {"status": "started"}
@app.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket):
    """Register a client connection and answer "ping" keepalives until it
    disconnects; always deregister on exit."""
    await websocket.accept()
    state.active_connections.add(websocket)
    try:
        while True:
            data = await websocket.receive_text()
            if data == "ping":
                await websocket.send_text("pong")
    except WebSocketDisconnect:
        pass
    finally:
        # Single cleanup path replaces the original duplicated discard in
        # `except WebSocketDisconnect` plus a bare `except:`; unexpected
        # errors now propagate to the framework (which logs them) after
        # the connection is deregistered.
        state.active_connections.discard(websocket)
# ==============================
# 前端页面(内置)
# ==============================
# Embedded frontend page. Fix: the original `<audio id="a" autoplay>` tag was
# never closed, so the HTML parser treated the textarea, button and log div as
# <audio> fallback content — which supporting browsers do not render, hiding
# the entire control UI. The tag is now closed immediately.
html_content = """
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>AI实时唇形同步直播</title>
<style>
body{background:#111;color:#eee;font-family:sans-serif;text-align:center}
#v-container {
width: 512px; height: 512px; margin: 0 auto;
background: #000; overflow: hidden;
display: flex; justify-content: center; align-items: center;
border: 2px solid #4CAF50;
}
#v { width:100%; height:100%; object-fit: contain; }
#a{display:none}
textarea{width:400px;height:60px;background:#333;color:#fff;border:1px solid #555}
button{padding:10px 20px;background:#4CAF50;border:none;color:#fff;cursor:pointer;margin:5px}
.log{color:#888;font-size:12px;height:60px;overflow-y:scroll;background:#222;padding:5px;text-align:left;width:500px;margin:10px auto;}
</style>
</head>
<body>
<h1>⚡ SoulX 实时唇形同步直播</h1>
<div id="v-container">
<img id="v">
</div>
<audio id="a" autoplay></audio>
<textarea id="t">大家好,这是精准同步版。视频会根据音频播放进度自动对齐嘴型。</textarea><br>
<button onclick="start()">▶ 开始直播</button>
<div id="log" class="log"></div>
<script>
const v=document.getElementById('v');
const a=document.getElementById('a');
const log=document.getElementById('log');
let ws;
let frameBuffer = [];
let isPlaying = false;
function l(m){ console.log(m); log.innerHTML=m+"<br>"+log.innerHTML; }
function start() {
l("正在启动...");
frameBuffer = [];
isPlaying = false;
fetch('/start', {
method:'POST',
body: JSON.stringify({text: document.getElementById('t').value}),
headers:{'Content-Type':'application/json'}
});
}
function connect() {
const proto = location.protocol === 'https:' ? 'wss://' : 'ws://';
ws = new WebSocket(proto + location.host + '/ws');
ws.onopen = () => l("WebSocket 已连接");
ws.onclose = () => { l("断开,重连中..."); setTimeout(connect, 1000); };
ws.onmessage = (evt) => {
try {
const msg = JSON.parse(evt.data);
if (msg.type === 'audio') {
l("音频已加载,开始播放");
a.src = "data:audio/wav;base64," + msg.data;
a.play().catch(e=>l("浏览器禁止自动播放,请点击页面后重试"));
isPlaying = true;
} else if (msg.type === 'video') {
frameBuffer.push({ts: msg.ts, data: msg.data});
}
} catch(e) {}
};
}
// 音视频精准同步核心(50FPS)
setInterval(() => {
if (!isPlaying || !a.src) return;
const currentTime = a.currentTime;
// 清理过期帧
while (frameBuffer.length > 0 && frameBuffer[0].ts < currentTime - 0.1) {
frameBuffer.shift();
}
let bestFrame = null;
if (frameBuffer.length > 0) {
let target = frameBuffer[0];
for(let i=0; i<frameBuffer.length; i++){
if(frameBuffer[i].ts <= currentTime){
target = frameBuffer[i];
} else break;
}
bestFrame = target;
}
if (bestFrame) {
v.src = "data:image/jpeg;base64," + bestFrame.data;
}
}, 20);
connect();
</script>
</body>
</html>
"""
if __name__ == "__main__":
    # Listen on all interfaces; open http://<host>:8383/ in a browser.
    uvicorn.run(app, host="0.0.0.0", port=8383)
EOF
启动命令:
bash
python server.py
五、环境安装命令
bash
pip install fastapi "uvicorn[standard]" edge-tts opencv-python torch numpy librosa soundfile loguru simplejpeg
六、核心亮点(可直接写进博客)
- 单文件完整服务:模型推理 + API + 前端全部在一个文件
- 毫秒级音视频同步:基于时间戳匹配,不飘、不延迟
- 实时流式推流:边推理边播放,无需等待全片生成
- 防拉伸画面:严格保持 512x512 比例,人脸不变形
- 自动重连 + 异常保护:生产级稳定性
- 免费TTS + 本地推理:无第三方云服务依赖
七、适用场景
- AI 虚拟主播
- 数字人客服
- 短视频自动配音生成
- 实时直播唇形同步
- 轻量化数字人演示系统