audio2face via Docker

Contents

Full open-source code (contact required):

UE digital human setup:

Test file:

Parameters:

Saving parameters to JSON

Simple JSON visualization

Saving parameters to JSON gives wrong results:


Full open-source code (contact required):

https://www.bilibili.com/video/BV1kPeUzCEDX/?spm_id_from=333.337.search-card.all.click&vd_source=d4dc8f82f62c00f6ff1db7a1047e538f

UE digital human setup:

https://www.bilibili.com/video/BV1r6WvzLE1N/?spm_id_from=333.337.search-card.all.click&vd_source=d4dc8f82f62c00f6ff1db7a1047e538f

Git code and version:

ACE-92a0ac1af1644429e9b3639985ebcdb9a824d2a6

Receiving-side (client) code:

https://github.com/NVIDIA/ACE/blob/92a0ac1af1644429e9b3639985ebcdb9a824d2a6/microservices/audio_2_face_microservice/1.2/scripts/audio2face_microservices_interaction_app/a2f.py

Check that the service port is responding:

bash
curl http://localhost:8000/v1/health/ready
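If the pipeline is up, this should return HTTP 200 from the readiness endpoint; if it hangs or errors, check the container state and logs (docker ps, docker logs) before continuing.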

Test file:

bash
microservices/audio_2_face_microservice/1.2/scripts/audio2face_microservices_interaction_app/a2f.py

Parameters:

python
parser.add_argument("--file",default=r"D:\data\audios\post_res1.wav", help="PCM-16 bits mono Audio file to send to the pipeline")
parser.add_argument("--config",default="config_claire_v1.3.yml", help="Configuration file")
parser.add_argument("--url",default="127.0.0.1:52000" , help="URL of the A2F controller")

Saving parameters to JSON

https://github.com/NVIDIA/ACE/tree/92a0ac1af1644429e9b3639985ebcdb9a824d2a6

Simple JSON visualization

python
import json

JSON_PATH = r"20251226_204317\animation_frames.json"

# Blendshape keys to inspect
KEYS = ["JawOpen", "EyeBlinkLeft", "EyeWideRight"]

with open(JSON_PATH, "r", encoding="utf-8") as f:
    frames = json.load(f)

def bar(value, width=50):
    """Render a value in [0, 1] as an ASCII bar."""
    filled = int(value * width)
    return "[" + "#" * filled + "-" * (width - filled) + "]"

for i, frame in enumerate(frames[:300]):  # show only the first 300 frames
    print(f"Frame {i:03d}: ", end="")
    for k in KEYS:
        v = frame["blendShapes"].get(k, -1)
        print(f"{k}: {int(v * 100)},{bar(v)} ", end="")
    print()
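
Each printed line shows the selected blendshape values scaled to 0-100 next to an ASCII bar, which is enough to sanity-check that, for example, JawOpen rises and falls with the speech.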

animation_frames.json: blendshape (facial expression) data

emotion_data.json: emotion data
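
The rough shape of the two files, as written by read_from_stream in the script below; the field names come from that code, while the concrete values, the variable names and the "joy" emotion name here are only placeholders:

python
# animation_frames.json: a list of keyframes, one entry per time code (placeholder values)
animation_frames_example = [
    {"timeCode": 0.033, "blendShapes": {"JawOpen": 0.12, "EyeBlinkLeft": 0.0}},
]

# emotion_data.json: three timelines of emotion keyframes (placeholder values)
emotion_data_example = {
    "input": [{"time_code": 0.0, "emotion_values": {"joy": 1.0}}],
    "a2e_output": [{"time_code": 0.1, "emotion_values": {"joy": 0.6}}],
    "a2f_smoothed_output": [{"time_code": 0.1, "emotion_values": {"joy": 0.55}}],
}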

microservices/audio_2_face_microservice/1.2/scripts/audio2face_microservices_interaction_app/a2f_json.py

python
#!/usr/bin/env python3

import argparse
import asyncio
import json
from sys import stderr
from datetime import datetime
import os

import numpy as np
import grpc
import scipy.io.wavfile
import yaml
import pandas as pd

from nvidia_ace.animation_data.v1_pb2 import AnimationData, AnimationDataStreamHeader
from nvidia_ace.a2f.v1_pb2 import AudioWithEmotion, EmotionPostProcessingParameters, \
    FaceParameters, BlendShapeParameters
from nvidia_ace.audio.v1_pb2 import AudioHeader
from nvidia_ace.services.a2f_controller.v1_pb2_grpc import A2FControllerServiceStub
from nvidia_ace.controller.v1_pb2 import AudioStream, AudioStreamHeader
from nvidia_ace.emotion_with_timecode.v1_pb2 import EmotionWithTimeCode
from nvidia_ace.emotion_aggregate.v1_pb2 import EmotionAggregate


# Bit depth of the audio file, only 16 bit PCM audio is currently supported.
BITS_PER_SAMPLE = 16
# Channel count, only mono audio is currently supported.
CHANNEL_COUNT = 1
# Audio format, only PCM is supported.
AUDIO_FORMAT = AudioHeader.AUDIO_FORMAT_PCM


def get_audio_bit_format(audio_header: AudioHeader):
    """
    Reads the audio_header parameters and returns the right dtype to interpret
    the audio data sent back by the server.
    """
    if audio_header.audio_format == AudioHeader.AUDIO_FORMAT_PCM:
        # We only support 16 bits PCM.
        if audio_header.bits_per_sample == 16:
            return np.int16
    return None


def save_audio_data_to_file(outdir: str, audio_header: AudioHeader, audio_buffer: bytes):
    """
    Reads the AudioHeader and writes the content of the audio buffer into a wav
    file.
    """
    # Type of the audio data to output.
    dtype = get_audio_bit_format(audio_header)
    if dtype is None:
        print("Error while downloading data, unknown format for audio output", file=stderr)
        return

    audio_data_to_save = np.frombuffer(audio_buffer, dtype=dtype)
    # Write the audio data output as a wav file.
    scipy.io.wavfile.write(f"{outdir}/out.wav", audio_header.samples_per_second, audio_data_to_save)


def parse_emotion_data(animation_data, emotion_key_frames):

    emotion_aggregate: EmotionAggregate = EmotionAggregate()
    # Metadata is an Any type, try to unpack it into an EmotionAggregate object
    if (animation_data.metadata["emotion_aggregate"] and
        animation_data.metadata["emotion_aggregate"].Unpack(emotion_aggregate)):
        for emotion_with_timecode in emotion_aggregate.a2e_output:
            emotion_key_frames["a2e_output"].append({
                "time_code": emotion_with_timecode.time_code,
                "emotion_values": dict(emotion_with_timecode.emotion),
            })
        for emotion_with_timecode in emotion_aggregate.input_emotions:
            emotion_key_frames["input"].append({
                "time_code": emotion_with_timecode.time_code,
                "emotion_values": dict(emotion_with_timecode.emotion),
            })
        for emotion_with_timecode in emotion_aggregate.a2f_smoothed_output:
            emotion_key_frames["a2f_smoothed_output"].append({
                "time_code": emotion_with_timecode.time_code,
                "emotion_values": dict(emotion_with_timecode.emotion),
            })


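# CSV-saving reader; kept here for reference, but not called from main() below, which uses the JSON variant instead.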
async def read_from_stream_csv(stream):
    # List of blendshapes names recovered from the model data in the AnimationDataStreamHeader
    bs_names = []
    # List of animation key frames, meaning a time code and the values of the blendshapes
    animation_key_frames = []
    # Audio buffer that contains the result
    audio_buffer = b''
    # Audio header to store metadata for audio saving
    audio_header: AudioHeader = None
    # Emotions 'key frames' data from input, a2e output and final a2f smoothed output.
    emotion_key_frames = {
        "input": [],
        "a2e_output": [],
        "a2f_smoothed_output": []
    }
    # Reads the content of the stream using the read() method of the StreamStreamCall object.
    while True:
        # Read an incoming packet.
        message = await stream.read()
        if message == grpc.aio.EOF:
            # Create directory with current date and time
            timestamp = datetime.now()
            dir_name = timestamp.strftime("%Y%m%d_%H%M%S")
            os.makedirs(dir_name, exist_ok=True)
            # End of File signals that the stream has been read completely.
            # Not to be confused with the Status Message that contains the response of the RPC call.
            save_audio_data_to_file(dir_name, audio_header, audio_buffer)

            # Normalize the dictionary data to output in JSON.
            df_animation = pd.json_normalize(animation_key_frames)
            df_a2e_output = pd.json_normalize(emotion_key_frames["a2e_output"])
            df_smoothed_output = pd.json_normalize(emotion_key_frames["a2f_smoothed_output"])
            df_input = pd.json_normalize(emotion_key_frames["input"])

            # Save data to csv.
            df_animation.to_csv(f"{dir_name}/animation_frames.csv")
            df_a2e_output.to_csv(f"{dir_name}/a2e_emotion_output.csv")
            df_smoothed_output.to_csv(f"{dir_name}/a2f_smoothed_emotion_output.csv")
            df_input.to_csv(f"{dir_name}/a2f_input_emotions.csv")
            return

        if message.HasField("animation_data_stream_header"):
            # Message is a header
            print("Receiveing data from server...")
            animation_data_stream_header: AnimationDataStreamHeader = message.animation_data_stream_header
            # Save blendshapes names for later use
            bs_names = animation_data_stream_header.skel_animation_header.blend_shapes
            # Save audio header for later use
            audio_header = animation_data_stream_header.audio_header
        elif message.HasField("animation_data"):
            print(".", end="", flush=True)
            # Message is animation data.
            animation_data: AnimationData = message.animation_data
            parse_emotion_data(animation_data, emotion_key_frames)
            blendshape_list = animation_data.skel_animation.blend_shape_weights
            for blendshapes in blendshape_list:
                # We assign each blendshape name to its corresponding weight.
                bs_values_dict = dict(zip(bs_names, blendshapes.values))
                time_code = blendshapes.time_code
                # Append an object to the list of animation key frames
                animation_key_frames.append({
                    "timeCode": time_code,
                    "blendShapes": bs_values_dict
                })
            # Append audio data to the final audio buffer.
            audio_buffer += animation_data.audio.audio_buffer
        elif message.HasField("status"):
            # Message is status
            print()
            status = message.status
            print(f"Received status message with value: '{status.message}'")
            print(f"Status code: '{status.code}'")

async def read_from_stream(stream):
    bs_names = []
    animation_key_frames = []
    audio_buffer = b''
    audio_header: AudioHeader = None
    emotion_key_frames = {"input": [], "a2e_output": [], "a2f_smoothed_output": []}

    while True:
        message = await stream.read()
        if message == grpc.aio.EOF:
            timestamp = datetime.now()
            dir_name = timestamp.strftime("%Y%m%d_%H%M%S")
            os.makedirs(dir_name, exist_ok=True)

            # Save the audio
            save_audio_data_to_file(dir_name, audio_header, audio_buffer)

            # Save the animation keyframes as JSON (UE5 can read this directly)
            with open(f"{dir_name}/animation_frames.json", "w", encoding="utf-8") as f:
                json.dump(animation_key_frames, f, indent=2, ensure_ascii=False)

            # Optional: also save the emotion data as JSON
            with open(f"{dir_name}/emotion_data.json", "w", encoding="utf-8") as f:
                json.dump(emotion_key_frames, f, indent=2, ensure_ascii=False)

            print(f"保存完成: {dir_name}")
            return

        if message.HasField("animation_data_stream_header"):
            print("Receiving data from server...")
            animation_data_stream_header: AnimationDataStreamHeader = message.animation_data_stream_header
            bs_names = animation_data_stream_header.skel_animation_header.blend_shapes
            audio_header = animation_data_stream_header.audio_header

        elif message.HasField("animation_data"):
            print(".", end="", flush=True)
            animation_data: AnimationData = message.animation_data
            parse_emotion_data(animation_data, emotion_key_frames)
            blendshape_list = animation_data.skel_animation.blend_shape_weights
            for blendshapes in blendshape_list:
                bs_values_dict = dict(zip(bs_names, blendshapes.values))
                time_code = blendshapes.time_code
                animation_key_frames.append({
                    "timeCode": time_code,
                    "blendShapes": bs_values_dict
                })
            audio_buffer += animation_data.audio.audio_buffer

        elif message.HasField("status"):
            print()
            status = message.status
            print(f"Received status message with value: '{status.message}'")
            print(f"Status code: '{status.code}'")


async def write_to_stream(stream, config_path, audio_file_path):
    # Read the content of the audio file, extracting sample rate and data.
    samplerate, data = scipy.io.wavfile.read(audio_file_path)
    config = None
    with open(config_path, "r") as f:
        config = yaml.safe_load(f)
    # Each message in the Stream should be an AudioStream message.
    # An AudioStream message can be composed of the following messages:
    # - AudioStreamHeader: must be the first message to be sent,
    #       contains metadata about the audio file.
    # - AudioWithEmotion: audio bytes as well as emotions to apply.
    # - EndOfAudio: final message to signal audio sending termination.
    audio_stream_header = AudioStream(
        audio_stream_header=AudioStreamHeader(
            audio_header=AudioHeader(
                samples_per_second=samplerate,
                bits_per_sample=BITS_PER_SAMPLE,
                channel_count=CHANNEL_COUNT,
                audio_format=AUDIO_FORMAT,
            ),
            emotion_post_processing_params=EmotionPostProcessingParameters(
                **config["post_processing_parameters"]
            ),
            face_params=FaceParameters(float_params=config["face_parameters"]),
            blendshape_params=BlendShapeParameters(
                bs_weight_multipliers=config["blendshape_parameters"]["multipliers"],
                bs_weight_offsets=config["blendshape_parameters"]["offsets"],
            )
        )
    )

    # Sending the AudioStreamHeader message encapsulated into an AudioStream object.
    await stream.write(audio_stream_header)

    for i in range(len(data) // samplerate + 1):
        # Cutting the audio into arbitrary chunks, here we use sample rate to send exactly one
        # second of audio per packet but the size does not matter.
        chunk = data[i * samplerate : i * samplerate + samplerate]
        # Send audio buffer to A2F.
        # Packet 0 contains the emotion with timecode list
        # Here we send all the emotion with timecode alongside the first audio buffer
        # as they are available. In a streaming scenario if you don't have access
        # to some emotions right away you can send them in the next audio buffers.
        if i == 0:
            list_emotion_tc = [
                EmotionWithTimeCode(emotion={**v["emotions"]}, time_code=v["time_code"])
                for v in config["emotion_with_timecode_list"].values()
            ]
            await stream.write(
                AudioStream(
                    audio_with_emotion=AudioWithEmotion(
                        audio_buffer=chunk.astype(np.int16).tobytes(),
                        emotions=list_emotion_tc
                    )
                )
            )
        else:
            # Send only the audio buffer
            await stream.write(
                AudioStream(
                    audio_with_emotion=AudioWithEmotion(
                        audio_buffer=chunk.astype(np.int16).tobytes()
                    )
                )
            )
    # Sending the EndOfAudio message to signal end of sending.
    # This is necessary to obtain the status code at the end of the generation of
    # blendshapes. This status code tells you about the end of animation data stream.
    await stream.write(AudioStream(end_of_audio=AudioStream.EndOfAudio()))


parser = argparse.ArgumentParser(
    description=(
        "Sample python3 application to send audio and receive animation data and emotion "
        "data through the A2F pipeline."
    ),
    epilog="NVIDIA CORPORATION.  All rights reserved.",
)

parser.add_argument("--file",default=r"D:\data\audios\post_res1.wav", help="PCM-16 bits mono Audio file to send to the pipeline")
parser.add_argument("--config",default="config_claire_v1.3.yml", help="Configuration file")
parser.add_argument("--url",default="127.0.0.1:52000" , help="URL of the A2F controller")


async def main():
    args = parser.parse_args()

    # Creating an insecure channel to connect to the A2F controller.
    # If behind HTTPS proxy or using HTTPS, please refer to
    # https://grpc.github.io/grpc/python/grpc_asyncio.html#grpc.aio.secure_channel
    async with grpc.aio.insecure_channel(args.url) as c:
        # Creating a stub for the service. This allows us to use the remote channel to communicate
        # via RPC to the controller.
        stub = A2FControllerServiceStub(c)

        # ProcessAudioStream is a bidirectional stream, or StreamStreamCall object
        # It exposes a read and write interface as shown here:
        # https://grpc.github.io/grpc/python/grpc_asyncio.html#grpc.aio.StreamStreamCall
        stream = stub.ProcessAudioStream()
        # We create an asyncio task for reading the content of the stream, in an async function
        # called read_from_stream.
        read = asyncio.create_task(read_from_stream(stream))
        # We create another asyncio task for writing into the stream. This allows us to run them
        # both in parallel instead of sequentially.
        write = asyncio.create_task(write_to_stream(stream, args.config, args.file))
        # Await both tasks termination.
        await write
        await read


if __name__ == "__main__":
    asyncio.run(main())
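
Running this against a local A2F controller produces a timestamped output directory (for example 20251226_204317) containing out.wav, animation_frames.json and emotion_data.json, which the visualization snippet above reads.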

Saving parameters to JSON gives wrong results:

Code source:

Audio2Face-3D-Samples

D:\project\audio2face\Audio2Face-3D-Samples-main\demo_ok.py

python
import asyncio
import json
import a2f_3d.client.auth
import a2f_3d.client.service
from nvidia_ace.services.a2f_controller.v1_pb2_grpc import A2FControllerServiceStub

# ===================== Configuration =====================
A2F_GRPC_ADDR = "127.0.0.1:52000"       # gRPC server address
CONFIG_FILE = "config.yaml"              # Audio2Face deployment configuration file
AUDIO_FILE = r"D:\data\audios\post_res1.wav"  # input audio
OUTPUT_JSON = r"output.json"             # output JSON file

async def process_audio_to_json():
    # Create the gRPC channel
    channel = a2f_3d.client.auth.create_channel(uri=A2F_GRPC_ADDR, use_ssl=False)
    stub = A2FControllerServiceStub(channel)

    # Create the bidirectional stream
    stream = stub.ProcessAudioStream()

    frames = []

    # Read the output stream asynchronously
    async def read_stream():
        frame = {}

        async for msg in stream:

            animation_data_stream_header = msg.animation_data_stream_header
            anim = msg.animation_data  # 单帧 AnimationData
            skel = anim.skel_animation
            bs_names = animation_data_stream_header.skel_animation_header.blend_shapes
            blendshape_list = anim.skel_animation.blend_shape_weights
            for blendshapes in blendshape_list:
                # We assign each blendshape name to its corresponding weight.
                bs_values_dict = dict(zip(bs_names, blendshapes.values))
                time_code = blendshapes.time_code
                # Append an object to the list of animation key frames
                frames.append({
                    "timeCode": time_code,
                    "blendShapes": bs_values_dict
                })

    # Write the audio and finish the write side
    await a2f_3d.client.service.write_to_stream(stream, CONFIG_FILE, AUDIO_FILE)

    # Wait for reading to complete
    await read_stream()

    # Save JSON
    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
        json.dump(frames, f, indent=2, ensure_ascii=False)

    print(f"输出已保存到 {OUTPUT_JSON}, 总帧数: {len(frames)}")

# ===================== Run =====================
if __name__ == "__main__":
    asyncio.run(process_audio_to_json())
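
A plausible cause, based on comparing this with the working a2f_json.py above rather than on re-debugging the script: read_stream reads animation_data_stream_header and animation_data from every incoming message without HasField checks, so bs_names stays empty for the animation frames and the blendShapes dicts come out empty; in addition, read_stream is only awaited after write_to_stream has fully returned instead of running concurrently with it, as the reference client does. A minimal corrected sketch under those assumptions (process_audio_to_json_fixed is a hypothetical name; it reuses the same constants and a2f_3d client helpers as the block above and has not been re-tested here):

python
# A corrected version of process_audio_to_json (a sketch):
async def process_audio_to_json_fixed():
    channel = a2f_3d.client.auth.create_channel(uri=A2F_GRPC_ADDR, use_ssl=False)
    stub = A2FControllerServiceStub(channel)
    stream = stub.ProcessAudioStream()

    frames = []

    async def read_stream():
        bs_names = []
        async for msg in stream:
            if msg.HasField("animation_data_stream_header"):
                # The header arrives once and carries the blendshape names.
                bs_names = msg.animation_data_stream_header.skel_animation_header.blend_shapes
            elif msg.HasField("animation_data"):
                for bs in msg.animation_data.skel_animation.blend_shape_weights:
                    frames.append({
                        "timeCode": bs.time_code,
                        "blendShapes": dict(zip(bs_names, bs.values)),
                    })

    # Start reading before (not after) writing, as main() does in a2f_json.py.
    read_task = asyncio.create_task(read_stream())
    await a2f_3d.client.service.write_to_stream(stream, CONFIG_FILE, AUDIO_FILE)
    await read_task

    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
        json.dump(frames, f, indent=2, ensure_ascii=False)
    print(f"Output saved to {OUTPUT_JSON}, total frames: {len(frames)}")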