OmniVoice

https://github.com/k2-fsa/OmniVoice.git
OmniVoice is a state-of-the-art massively multilingual zero-shot text-to-speech (TTS) model supporting over 600 languages. Built on a novel diffusion language model-style architecture, it generates high-quality speech with superior inference speed, supporting voice cloning and voice design.
Omnivoice-onnx
https://github.com/AFun9/Omnivoice-onnx.git
把 k2-fsa/OmniVoice(PyTorch)转成 ONNX,覆盖 导出 → 量化(INT8 / INT8-HQ / INT4 / FP16)→ 数值校验 → 端到端推理 → 性能基准 全流程。

在Unity中实现功能
主要代码
csharp
// ============================================================
// OmniVoiceLM.cs --- 性能优化版
//
// 优化项(相对原版):
// 1. CUDA / DML EP 正确配置:禁用 EnableMemoryPattern(动态 shape 场景)
// CUDA 额外选项:arena_extend_strategy、cudnn_conv_algo_search
// 2. 预分配复用缓冲区:消除内循环 new float[VOCAB_SIZE] 引起的 GC 压力
// 3. LogSoftmax 改为原地写入,避免每次分配返回数组
// 4. logits 拷贝:优先 ToArray(),fallback foreach
// 5. FinalUnmaskAll 快速路径:mask 为 0 时直接 return,省第 N+1 次 forward
// 6. BuildFullMask / BuildPositionIds 改为原地填充,减少堆分配
// 7. CFG 内循环:消除临时 condLogits/uncondLogits 数组,直接读 rawLogits
// 8. DiffusionStep:scores 计算与 top-k 合并,减少两次遍历
// 9. bool 扁平化 FlattenBool2D/4D 使用 Buffer.BlockCopy(bool=1byte)
// 10. CFG batch 构建:用 Array.Copy / Buffer.BlockCopy 替代逐元素循环
// ============================================================
using Microsoft.ML.OnnxRuntime;
using Microsoft.ML.OnnxRuntime.Tensors;
using System;
using System.Collections.Generic;
using System.Linq;
using UnityEngine;
public class OmniVoiceLM : IDisposable
{
public const int NUM_CODEBOOKS = 8;
public const int VOCAB_SIZE = 1025; // 1024 audio codes + 1 mask token
public const int MASK_TOKEN = 1024;
public const int PAD_TOKEN = 0;
InferenceSession _session;
System.Random _rng;
// ─── 生成参数 ────────────────────────────────────────────────────
public int NumStep = 32;
public float GuidanceScale = 2.0f;
public float TShift = 0.1f;
public float PositionTemperature = 5.0f;
public float ClassTemperature = 0.0f;
public float LayerPenaltyFactor = 5.0f;
// ─── 复用缓冲区(Generate 首次调用时按实际 S 分配)──────────────
float[] _rawLogitsBuf; // [2 * NUM_CODEBOOKS * S * VOCAB_SIZE]
float[] _resultBuf; // [NUM_CODEBOOKS * S * VOCAB_SIZE]
// 单 token LogSoftmax 工作区(VOCAB_SIZE,固定大小,构造时分配)
readonly float[] _lsmWork = new float[VOCAB_SIZE];
// ─── EP 配置参数(由 Runner 通过构造函数传入)──────────────────
public enum ExecutionProviderType { CPU, CUDA, DML }
// ════════════════════════════════════════════════════════════════
// 构造函数
// ════════════════════════════════════════════════════════════════
/// <summary>
/// Creates the ONNX Runtime session for the language model. The requested execution
/// provider is appended first; if appending it throws, the session falls back to the
/// CPU provider with a larger intra-op thread count.
/// </summary>
/// <param name="modelPath">Path to the ONNX model file.</param>
/// <param name="ep">Preferred execution provider (CUDA, DML or CPU).</param>
/// <param name="deviceId">Device index handed to the CUDA / DirectML provider.</param>
/// <param name="seed">Seed of the random generator used for Gumbel noise and token sampling.</param>
/// <exception cref="ArgumentException"><paramref name="modelPath"/> is null or empty.</exception>
public OmniVoiceLM(string modelPath,
    ExecutionProviderType ep = ExecutionProviderType.CUDA,
    int deviceId = 0,
    int seed = 42)
{
    if (string.IsNullOrEmpty(modelPath))
        throw new ArgumentException("Model path must not be null or empty.", nameof(modelPath));

    _rng = new System.Random(seed);

    // SessionOptions wraps a native handle and is only needed while the session is
    // being created, so it is released when the constructor exits.
    using var opts = new SessionOptions();
    opts.GraphOptimizationLevel = GraphOptimizationLevel.ORT_ENABLE_ALL;
    // The batch dimension switches between 1 (no CFG) and 2 (CFG); memory patterns are
    // disabled so the runtime does not re-plan its arena when the shape changes.
    opts.EnableMemoryPattern = false;
    opts.ExecutionMode = ExecutionMode.ORT_SEQUENTIAL;
    // When a GPU provider does the heavy lifting, a few CPU threads are enough.
    opts.InterOpNumThreads = 1;
    opts.IntraOpNumThreads = 4;

    bool epLoaded = false;
    if (ep == ExecutionProviderType.CUDA)
    {
        try
        {
            // The provider options are copied into the session options on append,
            // so the wrapper can be disposed at the end of this scope.
            using var cudaOpts = new OrtCUDAProviderOptions();
            cudaOpts.UpdateOptions(new Dictionary<string, string>
            {
                // Invariant culture keeps the numeric string stable on every locale.
                { "device_id", deviceId.ToString(System.Globalization.CultureInfo.InvariantCulture) },
                // Grow the arena by exactly the requested amount instead of over-reserving.
                { "arena_extend_strategy", "kSameAsRequested" },
                // Heuristic algorithm search starts faster than the exhaustive one.
                { "cudnn_conv_algo_search", "HEURISTIC" },
                // Perform host/device copies on the default stream.
                { "do_copy_in_default_stream", "1" },
            });
            opts.AppendExecutionProvider_CUDA(cudaOpts);
            epLoaded = true;
            Debug.Log($"[OmniVoiceLM] CUDA EP 已加载 (device={deviceId})");
        }
        catch (Exception ex)
        {
            Debug.LogWarning($"[OmniVoiceLM] CUDA EP 不可用: {ex.Message},回退 CPU");
        }
    }
    else if (ep == ExecutionProviderType.DML)
    {
        try
        {
            // NOTE(review): an earlier comment here said masks are converted to float32
            // for DML inside LMForward, but LMForward in this file still feeds bool
            // tensors. Verify that the exported model runs under DirectML as-is.
            opts.AppendExecutionProvider_DML(deviceId);
            epLoaded = true;
            Debug.Log($"[OmniVoiceLM] DirectML EP 已加载 (device={deviceId})");
        }
        catch (Exception ex)
        {
            Debug.LogWarning($"[OmniVoiceLM] DML EP 不可用: {ex.Message},回退 CPU");
        }
    }

    if (!epLoaded)
    {
        // CPU fallback: use every available core, but never fewer than four threads.
        opts.IntraOpNumThreads = Math.Max(4, Environment.ProcessorCount);
        Debug.Log($"[OmniVoiceLM] 使用 CPU EP (threads={opts.IntraOpNumThreads})");
    }

    _session = new InferenceSession(modelPath, opts);
    Debug.Log($"[OmniVoiceLM] 已加载: {modelPath}");
}
// ════════════════════════════════════════════════════════════════
// Generate --- 主入口
// ════════════════════════════════════════════════════════════════
/// <summary>
/// Runs the iterative masked-diffusion decoding loop and returns the generated audio codes.
/// The sequence is laid out as [text tokens | reference audio codes | masked generation region].
/// </summary>
/// <param name="textTokenIds">Text prompt token ids; may be null or empty.</param>
/// <param name="refCodes">Reference audio codes indexed [codebook, frame]; may be null.</param>
/// <param name="targetLen">Number of frames to generate; when &lt;= 0 a default is derived from the reference length.</param>
/// <returns>Generated codes as a [NUM_CODEBOOKS, targetLen] array with values in [0, MASK_TOKEN - 1].</returns>
/// <exception cref="ArgumentException"><paramref name="refCodes"/> has fewer than NUM_CODEBOOKS rows.</exception>
public long[,] Generate(int[] textTokenIds, long[,] refCodes, int targetLen)
{
    int T_text = textTokenIds != null ? textTokenIds.Length : 0;
    int T_ref = refCodes != null ? refCodes.GetLength(1) : 0;
    // Fail up front instead of with an IndexOutOfRangeException half-way through the fill.
    if (T_ref > 0 && refCodes.GetLength(0) < NUM_CODEBOOKS)
        throw new ArgumentException(
            $"refCodes must have at least {NUM_CODEBOOKS} codebook rows, got {refCodes.GetLength(0)}.",
            nameof(refCodes));
    if (targetLen <= 0) targetLen = Mathf.Max(50, T_ref > 0 ? T_ref : 100);

    // A negative step count would make the schedule allocation throw; treat it as zero
    // so the final forced unmask still produces a complete result.
    int steps = Math.Max(0, NumStep);
    int genStart = T_text + T_ref;
    int S = genStart + targetLen;

    // Grow the reusable logits buffers to fit this sequence length.
    EnsureBuffers(S);
    Debug.Log($"[OmniVoiceLM] 开始扩散: T_text={T_text} T_ref={T_ref} T_gen={targetLen} " +
              $"S={S} steps={NumStep} GS={GuidanceScale} LayerPenalty={LayerPenaltyFactor} " +
              $"ClassTemp={ClassTemperature} PosTemp={PositionTemperature}");

    // ── Build inputIds and audioMask ──
    var inputIds = new long[1, NUM_CODEBOOKS, S];
    var audioMask = new bool[1, S];

    // Text region: the same token id is replicated across every codebook row.
    for (int s = 0; s < T_text; s++)
    {
        long tid = textTokenIds[s];
        for (int cb = 0; cb < NUM_CODEBOOKS; cb++)
            inputIds[0, cb, s] = tid;
        audioMask[0, s] = false;
    }

    // Reference audio region: codes are clamped into the valid (non-mask) range.
    for (int t = 0; t < T_ref; t++)
    {
        int s = T_text + t;
        for (int cb = 0; cb < NUM_CODEBOOKS; cb++)
            inputIds[0, cb, s] = Math.Clamp(refCodes[cb, t], 0, MASK_TOKEN - 1);
        audioMask[0, s] = true;
    }

    // Generation region: starts fully masked.
    for (int t = 0; t < targetLen; t++)
    {
        int s = genStart + t;
        for (int cb = 0; cb < NUM_CODEBOOKS; cb++)
            inputIds[0, cb, s] = MASK_TOKEN;
        audioMask[0, s] = true;
    }

    // ── Time-shifted schedule: r[n] = tau*u / (1 + (tau - 1)*u), u = n / steps ──
    double tau = TShift;
    double N = steps;
    var r = new double[steps + 1];
    for (int n = 0; n <= steps; n++)
    {
        double u = (double)n / N;
        r[n] = tau * u / (1.0 + (tau - 1.0) * u);
    }

    int totalMasks = targetLen * NUM_CODEBOOKS;
    int remMasks = totalMasks;

    // The NaN recovery below lowers PositionTemperature for the rest of this run.
    // Remember the caller's value so the change does not leak into later Generate calls.
    float savedPositionTemperature = PositionTemperature;
    try
    {
        // ── Main diffusion loop ──
        for (int step = 0; step < steps; step++)
        {
            int kNew;
            if (step == steps - 1)
            {
                // Last step: unmask everything that is left.
                kNew = remMasks;
            }
            else
            {
                double kRatio = r[step + 1] - r[step];
                // NOTE(review): the previous comment cited Python's math.ceil yet the code
                // deliberately rounds to nearest; confirm against the reference implementation.
                kNew = (int)Math.Round(kRatio * totalMasks);
                kNew = Math.Min(kNew, remMasks);
            }
            if (kNew <= 0) continue;

            if (step % 8 == 0)
                Debug.Log($"[OmniVoiceLM] step {step}/{NumStep} kNew={kNew} rem={remMasks}");

            float[] logProbs = LMForwardWithCFG(
                inputIds, audioMask, S, genStart, T_ref, T_text, targetLen);
            if (IsCorrupted(logProbs))
            {
                Debug.LogError($"[OmniVoiceLM] 步 {step} 检测到 NaN/Inf,尝试恢复...");
                PositionTemperature = Mathf.Max(0.1f, PositionTemperature * 0.5f);
                logProbs = LMForwardWithCFG(
                    inputIds, audioMask, S, genStart, T_ref, T_text, targetLen);
                if (IsCorrupted(logProbs))
                {
                    Debug.LogError("[OmniVoiceLM] 恢复失败,终止生成");
                    break;
                }
            }

            int unmasked = DiffusionStep(inputIds, logProbs, genStart, targetLen, S, kNew);
            remMasks -= unmasked;
        }
    }
    finally
    {
        PositionTemperature = savedPositionTemperature;
    }

    // ── Force-unmask whatever is still masked (no-op when nothing is left) ──
    FinalUnmaskAll(inputIds, audioMask, S, genStart, targetLen, T_ref, T_text);

    // ── Extract the generation region ──
    var result = new long[NUM_CODEBOOKS, targetLen];
    for (int t = 0; t < targetLen; t++)
    {
        int s = genStart + t;
        for (int cb = 0; cb < NUM_CODEBOOKS; cb++)
        {
            long v = inputIds[0, cb, s];
            // A leftover mask token is mapped to code 0; everything else is clamped.
            result[cb, t] = (v == MASK_TOKEN) ? 0L : Math.Clamp(v, 0L, MASK_TOKEN - 1L);
        }
    }

    float durSec = targetLen * 960f / 24000f;
    Debug.Log($"[OmniVoiceLM] 完成: {targetLen} 帧 = {durSec:F1}s");
    return result;
}
// ════════════════════════════════════════════════════════════════
// CFG 前向传播
// cond (b=0): [text | ref_audio | gen_tokens]
// uncond (b=1): [gen_tokens | PAD(MASK)]
// ════════════════════════════════════════════════════════════════
/// <summary>
/// Runs one forward pass and returns per-token log-probabilities in the shared result
/// buffer, laid out as [codebook, position, vocab] with the MASK token forced to -inf.
/// With GuidanceScale &gt; 0 a two-row batch (conditional + unconditional) is evaluated and
/// combined by classifier-free guidance; only the generation region is written in that case.
/// Without guidance every position of the single-row batch is written.
/// </summary>
/// <returns>The reused <c>_resultBuf</c> array (not a copy); it is overwritten by the next call.</returns>
float[] LMForwardWithCFG(
    long[,,] inputIds, bool[,] audioMask,
    int S, int genStart, int T_ref, int T_text, int targetLen)
{
    if (GuidanceScale > 0f)
    {
        var (batchIds, batchAudio, batchAttn) = BuildCFGBatch(
            inputIds, audioMask, genStart, S, T_text, T_ref, targetLen);
        long[,] posIds = BuildPositionIds(2, S);

        // Raw logits for both batch rows land in the reusable _rawLogitsBuf.
        LMForward(batchIds, batchAudio, batchAttn, posIds, batchSize: 2, S: S,
            outBuf: _rawLogitsBuf);

        int strideB = NUM_CODEBOOKS * S * VOCAB_SIZE;
        int strideCB = S * VOCAB_SIZE;
        float scale = GuidanceScale;

        // One scratch row per call instead of one per (codebook, frame): the conditional
        // log-softmax must survive while _lsmWork is reused for the unconditional one.
        var condLSM = new float[VOCAB_SIZE];

        for (int cb = 0; cb < NUM_CODEBOOKS; cb++)
        {
            for (int t = 0; t < targetLen; t++)
            {
                // The conditional row keeps the frame at genStart + t, while the
                // unconditional row holds the generation region at the sequence start.
                int sCond = genStart + t;
                int sUncond = t;
                int condOff = cb * strideCB + sCond * VOCAB_SIZE;
                int uncondOff = strideB + cb * strideCB + sUncond * VOCAB_SIZE;

                LogSoftmaxSlice(_rawLogitsBuf, condOff, VOCAB_SIZE, condLSM);
                LogSoftmaxSlice(_rawLogitsBuf, uncondOff, VOCAB_SIZE, _lsmWork);

                // cfg[v] = cond[v] + scale * (cond[v] - uncond[v]), written over _lsmWork.
                for (int v = 0; v < VOCAB_SIZE; v++)
                    _lsmWork[v] = condLSM[v] + scale * (condLSM[v] - _lsmWork[v]);

                // Re-normalise the guided scores and store them at the conditional position.
                int resultOff = cb * strideCB + sCond * VOCAB_SIZE;
                LogSoftmaxSliceSelf(_lsmWork, VOCAB_SIZE, _resultBuf, resultOff);

                // The mask token must never be predicted.
                _resultBuf[resultOff + MASK_TOKEN] = float.NegativeInfinity;
            }
        }
        return _resultBuf;
    }
    else
    {
        // No guidance: single-row batch with a fully open attention mask.
        bool[,,,] attnMask = BuildFullMask(1, S);
        long[,] posIds = BuildPositionIds(1, S);
        LMForward(inputIds, audioMask, attnMask, posIds, batchSize: 1, S: S,
            outBuf: _rawLogitsBuf);

        int strideCB = S * VOCAB_SIZE;
        for (int cb = 0; cb < NUM_CODEBOOKS; cb++)
        {
            for (int s = 0; s < S; s++)
            {
                int baseOff = cb * strideCB + s * VOCAB_SIZE;
                LogSoftmaxSlice(_rawLogitsBuf, baseOff, VOCAB_SIZE, _resultBuf, baseOff);
                _resultBuf[baseOff + MASK_TOKEN] = float.NegativeInfinity;
            }
        }
        return _resultBuf;
    }
}
// ════════════════════════════════════════════════════════════════
// CFG Batch 构建
// cond (b=0): 原样复制
// uncond (b=1): 生成区放前段,其余填 MASK
// ════════════════════════════════════════════════════════════════
/// <summary>
/// Assembles the two-row batch used for classifier-free guidance.
/// Row 0 (conditional) is a verbatim copy of the source sequence with full attention.
/// Row 1 (unconditional) holds only the generation region, moved to the start of the
/// sequence; the remaining positions are filled with MASK_TOKEN and attend only to themselves.
/// </summary>
/// <returns>Token ids [2, NUM_CODEBOOKS, S], audio mask [2, S] and attention mask [2, 1, S, S].</returns>
static (long[,,] ids, bool[,] audio, bool[,,,] attn) BuildCFGBatch(
    long[,,] srcIds, bool[,] srcAudio,
    int genStart, int S, int T_text, int T_ref, int targetLen)
{
    var batchIds = new long[2, NUM_CODEBOOKS, S];
    var batchAudio = new bool[2, S];
    var batchAttn = new bool[2, 1, S, S];

    // ── Row 0: conditional ──
    // All codebook rows of batch 0 are contiguous in both arrays, so one block copy
    // moves the whole [NUM_CODEBOOKS, S] plane.
    Buffer.BlockCopy(srcIds, 0, batchIds, 0, NUM_CODEBOOKS * S * sizeof(long));
    // bool occupies one byte, so S bytes equal the first S flags of the source row.
    Buffer.BlockCopy(srcAudio, 0, batchAudio, 0, S);

    // ── Row 1: unconditional ids and audio flags ──
    int genBytes = targetLen * sizeof(long);
    for (int book = 0; book < NUM_CODEBOOKS; book++)
    {
        int fromByte = (book * S + genStart) * sizeof(long);
        int toByte = (NUM_CODEBOOKS * S + book * S) * sizeof(long);
        Buffer.BlockCopy(srcIds, fromByte, batchIds, toByte, genBytes);

        for (int pad = targetLen; pad < S; pad++)
            batchIds[1, book, pad] = MASK_TOKEN;
    }
    // Only the relocated generation region counts as audio; the rest stays false.
    for (int frame = 0; frame < targetLen; frame++)
        batchAudio[1, frame] = true;

    // ── Attention masks for both rows, filled row by row ──
    for (int query = 0; query < S; query++)
    {
        // Conditional: every position sees every position.
        for (int key = 0; key < S; key++)
            batchAttn[0, 0, query, key] = true;

        if (query < targetLen)
        {
            // Unconditional generation region: dense attention inside the region.
            for (int key = 0; key < targetLen; key++)
                batchAttn[1, 0, query, key] = true;
        }
        else
        {
            // Unconditional padding: diagonal only.
            batchAttn[1, 0, query, query] = true;
        }
    }

    return (batchIds, batchAudio, batchAttn);
}
// ════════════════════════════════════════════════════════════════
// DiffusionStep --- 预测 token + 位置选择
// ════════════════════════════════════════════════════════════════
/// <summary>
/// Performs one unmasking step: predicts a token for every generation position, scores
/// each position by confidence, and commits the predictions of the <paramref name="kNew"/>
/// best-scoring positions that are still masked.
/// </summary>
/// <param name="inputIds">Token grid [1, NUM_CODEBOOKS, S]; updated in place.</param>
/// <param name="logProbs">Log-probabilities laid out as [codebook, position, vocab].</param>
/// <param name="genStart">Sequence index of the first generation frame.</param>
/// <param name="targetLen">Number of generation frames.</param>
/// <param name="S">Total sequence length (used for the logProbs stride).</param>
/// <param name="kNew">Maximum number of positions to unmask in this step.</param>
/// <returns>The number of positions that were actually unmasked.</returns>
int DiffusionStep(long[,,] inputIds, float[] logProbs,
    int genStart, int targetLen, int S, int kNew)
{
    int strideCB = S * VOCAB_SIZE;
    var predTokens = new long[targetLen, NUM_CODEBOOKS];
    var scores = new float[targetLen, NUM_CODEBOOKS];
    bool sampleClass = ClassTemperature > 0f;

    // 1. Predicted token + confidence (max log-prob). A single vocabulary scan yields
    //    both the greedy token and the confidence; the mask token is already -inf.
    for (int t = 0; t < targetLen; t++)
    {
        int s = genStart + t;
        for (int cb = 0; cb < NUM_CODEBOOKS; cb++)
        {
            int baseOff = cb * strideCB + s * VOCAB_SIZE;
            float best = float.NegativeInfinity;
            long greedy = 0;
            for (int v = 0; v < VOCAB_SIZE; v++)
            {
                float lp = logProbs[baseOff + v];
                if (lp > best) { best = lp; greedy = v; }
            }
            predTokens[t, cb] = sampleClass
                ? SampleTokenTopKRatio(logProbs, baseOff, 0.1f, ClassTemperature)
                : greedy;
            scores[t, cb] = best;
        }
    }

    // 2. Layer penalty: higher codebooks are pushed back so lower ones unmask first.
    for (int t = 0; t < targetLen; t++)
        for (int cb = 0; cb < NUM_CODEBOOKS; cb++)
            scores[t, cb] -= cb * LayerPenaltyFactor;

    // 3. Gumbel noise scaled by the position temperature. Noise is drawn for every
    //    position (masked or not) so the random sequence stays the same across steps.
    if (PositionTemperature > 0f)
    {
        float invTemp = 1f / PositionTemperature;
        for (int t = 0; t < targetLen; t++)
            for (int cb = 0; cb < NUM_CODEBOOKS; cb++)
            {
                double u = Math.Max(1e-10, _rng.NextDouble());
                double gumbel = -Math.Log(-Math.Log(u));
                scores[t, cb] = (float)(scores[t, cb] * invTemp + gumbel);
            }
    }

    // 4. Count the positions that are still masked; only those are candidates.
    int totalMasked = 0;
    for (int t = 0; t < targetLen; t++)
        for (int cb = 0; cb < NUM_CODEBOOKS; cb++)
            if (inputIds[0, cb, genStart + t] == MASK_TOKEN)
                totalMasked++;
    if (totalMasked == 0) return 0;
    kNew = Math.Min(kNew, totalMasked);

    // 5. Rank the candidates by score, best first (full sort of the masked positions).
    var allScores = new (int t, int cb, float score)[totalMasked];
    int idx = 0;
    for (int t = 0; t < targetLen; t++)
        for (int cb = 0; cb < NUM_CODEBOOKS; cb++)
            if (inputIds[0, cb, genStart + t] == MASK_TOKEN)
                allScores[idx++] = (t, cb, scores[t, cb]);
    Array.Sort(allScores, (a, b) => b.score.CompareTo(a.score));

    // 6. Commit the predicted tokens of the top kNew candidates.
    int unmasked = 0;
    for (int i = 0; i < kNew; i++)
    {
        var (t, cb, _) = allScores[i];
        inputIds[0, cb, genStart + t] = predTokens[t, cb];
        unmasked++;
    }
    return unmasked;
}
// ════════════════════════════════════════════════════════════════
// FinalUnmaskAll --- 快速路径 + 最终强制解 mask
// ════════════════════════════════════════════════════════════════
/// <summary>
/// Replaces every mask token still present in the generation region with the argmax
/// prediction of one extra forward pass. Returns immediately, without running the
/// model, when no mask token is left.
/// </summary>
void FinalUnmaskAll(long[,,] inputIds, bool[,] audioMask,
    int S, int genStart, int targetLen,
    int T_ref, int T_text)
{
    // Collect the coordinates of the leftover mask tokens in one sweep.
    var pending = new List<(int book, int pos)>();
    for (int frame = 0; frame < targetLen; frame++)
    {
        int pos = genStart + frame;
        for (int book = 0; book < NUM_CODEBOOKS; book++)
        {
            if (inputIds[0, book, pos] == MASK_TOKEN)
                pending.Add((book, pos));
        }
    }

    // Fast path: the last diffusion step normally clears everything.
    if (pending.Count == 0)
        return;

    Debug.Log($"[OmniVoiceLM] 最终强制解 mask: 残余 {pending.Count} 个位置");

    float[] logProbs = LMForwardWithCFG(
        inputIds, audioMask, S, genStart, T_ref, T_text, targetLen);

    int bookStride = S * VOCAB_SIZE;
    foreach (var (book, pos) in pending)
    {
        int offset = book * bookStride + pos * VOCAB_SIZE;
        inputIds[0, book, pos] = ArgmaxToken(logProbs, offset);
    }
}
// ════════════════════════════════════════════════════════════════
// LMForward --- ORT 推理,输出写入 outBuf
// ════════════════════════════════════════════════════════════════
/// <summary>
/// Executes the ONNX session once and copies the first output (the logits) into
/// <paramref name="outBuf"/>, starting at index 0.
/// </summary>
/// <param name="inputIds">Token ids shaped [batchSize, NUM_CODEBOOKS, S].</param>
/// <param name="audioMask">Audio flags shaped [batchSize, S].</param>
/// <param name="attnMask">Attention mask shaped [batchSize, 1, S, S].</param>
/// <param name="posIds">Position ids shaped [batchSize, S].</param>
/// <param name="batchSize">Number of batch rows.</param>
/// <param name="S">Sequence length.</param>
/// <param name="outBuf">Destination buffer; must hold at least as many floats as the model emits.</param>
/// <exception cref="ArgumentNullException"><paramref name="outBuf"/> is null.</exception>
/// <exception cref="InvalidOperationException">
/// <paramref name="outBuf"/> is smaller than the model output. Raised instead of returning
/// silently, because the caller would otherwise decode stale logits from a previous step.
/// </exception>
void LMForward(long[,,] inputIds, bool[,] audioMask,
    bool[,,,] attnMask, long[,] posIds,
    int batchSize, int S,
    float[] outBuf)
{
    if (outBuf == null) throw new ArgumentNullException(nameof(outBuf));

    var tIds = new DenseTensor<long>(Flatten3D(inputIds), new[] { batchSize, NUM_CODEBOOKS, S });
    var tAudio = new DenseTensor<bool>(FlattenBool2D(audioMask), new[] { batchSize, S });
    var tAttn = new DenseTensor<bool>(FlattenBool4D(attnMask), new[] { batchSize, 1, S, S });
    var tPos = new DenseTensor<long>(Flatten2D(posIds), new[] { batchSize, S });
    var inputs = new List<NamedOnnxValue>
    {
        NamedOnnxValue.CreateFromTensor("input_ids", tIds),
        NamedOnnxValue.CreateFromTensor("audio_mask", tAudio),
        NamedOnnxValue.CreateFromTensor("attention_mask", tAttn),
        NamedOnnxValue.CreateFromTensor("position_ids", tPos),
    };

    using var results = _session.Run(inputs);
    var logitsTensor = results[0].AsTensor<float>();

    long needed = logitsTensor.Length;
    if (needed > outBuf.Length)
        throw new InvalidOperationException(
            $"[OmniVoiceLM] outBuf 太小: {outBuf.Length} < {needed},请检查 EnsureBuffers");
    int len = (int)needed;

    if (logitsTensor is DenseTensor<float> dense)
    {
        // Copy straight from the tensor's backing memory; avoids allocating a second
        // full-size float array on every diffusion step.
        dense.Buffer.Span.Slice(0, len).CopyTo(outBuf);
    }
    else
    {
        // Generic fallback for tensor implementations without a contiguous buffer.
        Array.Copy(logitsTensor.ToArray(), outBuf, len);
    }
}
// ════════════════════════════════════════════════════════════════
// Token 采样
// ════════════════════════════════════════════════════════════════
/// <summary>
/// Returns the vocabulary index with the largest value in
/// <c>logProbs[baseOff .. baseOff + VOCAB_SIZE)</c>. The first maximum wins on ties;
/// index 0 is returned when no entry is greater than negative infinity.
/// </summary>
long ArgmaxToken(float[] logProbs, int baseOff)
{
    int winner = 0;
    float top = float.NegativeInfinity;
    int end = baseOff + VOCAB_SIZE;
    for (int cursor = baseOff; cursor < end; cursor++)
    {
        float candidate = logProbs[cursor];
        // Strict comparison: NaN and ties never replace the current winner.
        if (!(candidate > top))
            continue;
        top = candidate;
        winner = cursor - baseOff;
    }
    return winner;
}
/// <summary>
/// Samples a token with the Gumbel-max trick restricted to the highest-scoring
/// <c>ceil(ratio * VOCAB_SIZE)</c> entries of the slice starting at <paramref name="baseOff"/>.
/// Each kept score is divided by <paramref name="temperature"/> and perturbed with Gumbel
/// noise; the index of the largest perturbed score is returned (0 when nothing qualifies).
/// </summary>
long SampleTokenTopKRatio(float[] logProbs, int baseOff, float ratio, float temperature)
{
    int keep = (int)Math.Ceiling(ratio * VOCAB_SIZE);

    // Rank the whole vocabulary, best score first.
    var ranked = new (float score, int idx)[VOCAB_SIZE];
    for (int token = 0; token < VOCAB_SIZE; token++)
        ranked[token] = (logProbs[baseOff + token], token);
    Array.Sort(ranked, (x, y) => y.score.CompareTo(x.score));

    // Flag the tokens that made it into the top-k set.
    var selected = new bool[VOCAB_SIZE];
    int limit = Math.Min(keep, ranked.Length);
    for (int rank = 0; rank < limit; rank++)
        selected[ranked[rank].idx] = true;

    // Walk the vocabulary in index order so random draws happen in a fixed sequence,
    // perturbing and tracking the running maximum in the same pass.
    long chosen = 0;
    float top = float.NegativeInfinity;
    for (int token = 0; token < VOCAB_SIZE; token++)
    {
        if (!selected[token])
            continue;
        float raw = logProbs[baseOff + token];
        // Entries at -inf (such as the mask token) are excluded and consume no noise.
        if (float.IsNegativeInfinity(raw))
            continue;

        double uniform = Math.Max(1e-10, _rng.NextDouble());
        float perturbed = (float)(raw / temperature - Math.Log(-Math.Log(uniform)));
        if (perturbed > top)
        {
            top = perturbed;
            chosen = token;
        }
    }
    return chosen;
}
// ════════════════════════════════════════════════════════════════
// LogSoftmax --- 原地写入,避免分配返回数组
// ════════════════════════════════════════════════════════════════
/// <summary>
/// Computes a numerically stabilised log-softmax of <c>src[srcOff .. srcOff + len)</c> and
/// writes it to <c>dst[dstOff .. dstOff + len)</c>. When the slice maximum is infinite
/// the whole destination slice is set to negative infinity.
/// </summary>
static void LogSoftmaxSlice(float[] src, int srcOff, int len,
    float[] dst, int dstOff)
{
    // Pass 1: slice maximum, used to keep the exponentials in range.
    float peak = float.NegativeInfinity;
    for (int k = 0; k < len; k++)
    {
        float x = src[srcOff + k];
        if (x > peak)
            peak = x;
    }

    bool degenerate = float.IsInfinity(peak) || float.IsNaN(peak);
    if (degenerate)
    {
        for (int k = 0; k < len; k++)
            dst[dstOff + k] = float.NegativeInfinity;
        return;
    }

    // Pass 2: log of the partition function.
    float total = 0f;
    for (int k = 0; k < len; k++)
        total += MathF.Exp(src[srcOff + k] - peak);
    float logPartition = peak + MathF.Log(total);

    // Pass 3: normalise.
    for (int k = 0; k < len; k++)
        dst[dstOff + k] = src[srcOff + k] - logPartition;
}
/// <summary>
/// Convenience overload: log-softmax of <c>src[srcOff .. srcOff + len)</c> written to the
/// start of <paramref name="dst"/> (destination offset 0).
/// </summary>
static void LogSoftmaxSlice(float[] src, int srcOff, int len, float[] dst)
{
    const int destinationStart = 0;
    LogSoftmaxSlice(src, srcOff, len, dst, destinationStart);
}
/// <summary>
/// Computes the log-softmax of <c>work[0 .. len)</c> and stores it in
/// <c>dst[dstOff .. dstOff + len)</c>. <paramref name="work"/> itself is only read.
/// When the maximum is infinite the destination slice is filled with negative infinity.
/// </summary>
static void LogSoftmaxSliceSelf(float[] work, int len, float[] dst, int dstOff)
{
    float peak = float.NegativeInfinity;
    for (int k = 0; k < len; k++)
    {
        float x = work[k];
        if (x > peak)
            peak = x;
    }

    if (float.IsInfinity(peak) || float.IsNaN(peak))
    {
        int stop = dstOff + len;
        for (int o = dstOff; o < stop; o++)
            dst[o] = float.NegativeInfinity;
        return;
    }

    float total = 0f;
    for (int k = 0; k < len; k++)
        total += MathF.Exp(work[k] - peak);

    float logPartition = peak + MathF.Log(total);
    for (int k = 0; k < len; k++)
        dst[dstOff + k] = work[k] - logPartition;
}
// ════════════════════════════════════════════════════════════════
// 辅助方法
// ════════════════════════════════════════════════════════════════
/// <summary>
/// Makes sure the reusable logits buffers can hold a sequence of length <paramref name="S"/>:
/// the raw buffer is sized for a two-row batch, the result buffer for a single row.
/// Buffers only ever grow; an existing buffer that is large enough is kept.
/// </summary>
void EnsureBuffers(int S)
{
    int perRow = NUM_CODEBOOKS * S * VOCAB_SIZE;
    int twoRows = 2 * perRow;

    bool rawTooSmall = _rawLogitsBuf is null || _rawLogitsBuf.Length < twoRows;
    if (rawTooSmall)
    {
        _rawLogitsBuf = new float[twoRows];
    }

    bool resultTooSmall = _resultBuf is null || _resultBuf.Length < perRow;
    if (resultTooSmall)
    {
        _resultBuf = new float[perRow];
    }
}
/// <summary>
/// Reports whether more than 1% (integer division of the length by 100) of the entries
/// in <paramref name="arr"/> are NaN or infinite.
/// </summary>
bool IsCorrupted(float[] arr)
{
    int tolerated = arr.Length / 100;
    int invalid = 0;
    foreach (float value in arr)
    {
        if (!float.IsNaN(value) && !float.IsInfinity(value))
            continue;
        // The count only grows, so the answer is known once the limit is exceeded.
        if (++invalid > tolerated)
            return true;
    }
    return false;
}
/// <summary>
/// Creates an attention mask shaped [B, 1, S, S] in which every entry is <c>true</c>.
/// </summary>
static bool[,,,] BuildFullMask(int B, int S)
{
    var mask = new bool[B, 1, S, S];
    for (int batch = 0; batch < B; batch++)
    {
        for (int query = 0; query < S; query++)
        {
            for (int key = 0; key < S; key++)
            {
                mask[batch, 0, query, key] = true;
            }
        }
    }
    return mask;
}
/// <summary>
/// Creates position ids shaped [B, S] where every row counts 0, 1, ..., S - 1.
/// </summary>
static long[,] BuildPositionIds(int B, int S)
{
    var positions = new long[B, S];
    for (int row = 0; row < B; row++)
    {
        for (int col = 0; col < S; col++)
        {
            positions[row, col] = col;
        }
    }
    return positions;
}
// ── Flatten 工具 ─────────────────────────────────────────────
/// <summary>
/// Copies a 3-D long array into a new 1-D array, preserving row-major element order.
/// </summary>
static long[] Flatten3D(long[,,] a)
{
    int count = a.Length;
    var flat = new long[count];
    // Multi-dimensional arrays are stored contiguously, so one raw copy is enough.
    Buffer.BlockCopy(a, 0, flat, 0, count * sizeof(long));
    return flat;
}
/// <summary>
/// Copies a 2-D long array into a new 1-D array, preserving row-major element order.
/// </summary>
static long[] Flatten2D(long[,] a)
{
    int count = a.Length;
    var flat = new long[count];
    Buffer.BlockCopy(a, 0, flat, 0, count * sizeof(long));
    return flat;
}
/// <summary>
/// Copies a 2-D bool array into a new 1-D array, preserving row-major element order.
/// The byte count passed to the block copy equals the element count because the
/// runtime stores one byte per bool array element.
/// </summary>
static bool[] FlattenBool2D(bool[,] a)
{
    int count = a.Length;
    var flat = new bool[count];
    Buffer.BlockCopy(a, 0, flat, 0, count);
    return flat;
}
/// <summary>
/// Copies a 4-D bool array into a new 1-D array, preserving row-major element order
/// (one byte per element, so the byte count equals the element count).
/// </summary>
static bool[] FlattenBool4D(bool[,,,] a)
{
    int count = a.Length;
    var flat = new bool[count];
    Buffer.BlockCopy(a, 0, flat, 0, count);
    return flat;
}
/// <summary>Releases the underlying inference session, if one was created.</summary>
public void Dispose()
{
    if (_session != null)
    {
        _session.Dispose();
    }
}
}
csharp
using System;
using System.IO;
using System.Collections;
using UnityEngine;
public class OmniVoiceRunner : MonoBehaviour
{
[Header("音频设置")]
public AudioClip referenceAudio;
[Tooltip("参考音频的文字内容(语音克隆模式需要)。留空则不合并到文本段。")]
public string referenceText = "";
[TextArea] public string targetText = "你好,这是使用语音克隆生成的音频。";
public string targetLanguage = "Chinese";
public AudioSource outputAudioSource;
[Header("模型路径(相对 StreamingAssets)")]
public string lmModelRelPath = "OmniVoice/omnivoice_lm_int8_hq/model.onnx";
public string encModelRelPath = "OmniVoice/audio_tokenizer_encoder_int8/model.onnx";
public string decModelRelPath = "OmniVoice/audio_tokenizer_decoder_int8/model.onnx";
public string tokenizerJsonRelPath = "OmniVoice/tokenizer.json";
[Header("推理加速(EP 选择)")]
[Tooltip("CUDA = NVIDIA GPU;DML = DirectML (Windows/AMD/Intel);CPU = 纯 CPU 多线程")]
public OmniVoiceLM.ExecutionProviderType executionProvider = OmniVoiceLM.ExecutionProviderType.CUDA;
[Tooltip("GPU device index,多卡环境可指定")]
public int deviceId = 0;
[Header("生成参数(与原版 Python 对齐)")]
[Tooltip("扩散步数,原版默认 32;速度优先可降至 16")]
public int numStep = 32;
[Tooltip("CFG 引导强度,原版默认 2.0;若输出异常可尝试 0(关闭 CFG)")]
public float guidanceScale = 2.0f;
[Tooltip("调度时移 τ,原版默认 0.1")]
public float tShift = 0.1f;
[Tooltip("position_temperature: 位置选择温度,原版默认 5.0")]
public float positionTemperature = 5.0f;
[Tooltip("class_temperature: token 采样温度,原版默认 0.0(greedy argmax);>0 时使用 top-k ratio + Gumbel")]
public float classTemperature = 0.0f;
[Tooltip("层惩罚系数,原版默认 5.0;控制 codebook 从低到高逐层解 mask")]
public float layerPenaltyFactor = 5.0f;
[Tooltip("目标生成时长(秒)。0 = 按文字长度自动估算")]
public float targetDurSec = 0f;
OmniVoiceLM _lm;
AudioTokenizer _tokenizer;
Qwen2Tokenizer _textTok;
bool _isGenerating;
/// <summary>
/// Unity start-up hook: caps the frame rate, resolves the model paths under
/// StreamingAssets, and creates the language model, the audio tokenizer and, when its
/// JSON file exists, the text tokenizer.
/// </summary>
void Start()
{
    Application.targetFrameRate = 60;

    string assetsRoot = Application.streamingAssetsPath;
    string lmPath = Path.Combine(assetsRoot, lmModelRelPath);
    string encPath = Path.Combine(assetsRoot, encModelRelPath);
    string decPath = Path.Combine(assetsRoot, decModelRelPath);
    string tokPath = Path.Combine(assetsRoot, tokenizerJsonRelPath);

    // Forward the provider choice and device index, then mirror the inspector values.
    var model = new OmniVoiceLM(lmPath, executionProvider, deviceId);
    model.NumStep = numStep;
    model.GuidanceScale = guidanceScale;
    model.TShift = tShift;
    model.PositionTemperature = positionTemperature;
    model.ClassTemperature = classTemperature;
    model.LayerPenaltyFactor = layerPenaltyFactor;
    _lm = model;

    _tokenizer = new AudioTokenizer(encPath, decPath);

    if (!File.Exists(tokPath))
    {
        Debug.LogWarning($"[OmniVoiceRunner] 未找到 tokenizer.json ({tokPath})");
    }
    else
    {
        _textTok = Qwen2Tokenizer.Load(tokPath);
        if (_textTok != null)
        {
            Debug.Log("[OmniVoiceRunner] 文本 Tokenizer 已加载");
        }
    }

    Debug.Log($"[OmniVoiceRunner] 初始化完成 (EP={executionProvider}, device={deviceId})");
}
/// <summary>Starts the voice-cloning pipeline as a coroutine on this component.</summary>
public void CloneVoice()
{
    IEnumerator pipeline = CloneVoiceCoroutine();
    StartCoroutine(pipeline);
}
/// <summary>
/// Unity teardown hook: disposes the language model and the audio tokenizer when they exist.
/// </summary>
void OnDestroy()
{
    // NOTE(review): nothing here waits for a generation that may still be running on a
    // worker thread — confirm that disposing mid-run is acceptable.
    if (_lm != null)
    {
        _lm.Dispose();
    }
    if (_tokenizer != null)
    {
        _tokenizer.Dispose();
    }
}
/// <summary>
/// Full voice-cloning pipeline: encodes the optional reference clip, builds the text
/// prompt, estimates the output length, runs the language model on a worker thread while
/// yielding every frame, decodes the codes to PCM, post-processes, plays and saves the result.
/// Only one run may be active at a time; a second request is rejected with a warning.
/// </summary>
IEnumerator CloneVoiceCoroutine()
{
    if (_isGenerating) { Debug.LogWarning("上一次生成仍在进行"); yield break; }
    _isGenerating = true;
    try
    {
        float t0 = Time.realtimeSinceStartup;

        // 1. Encode the reference audio (optional).
        long[,] refCodes = null;
        int T_ref = 0;
        float refRms = -1f; // -1 marks "no reference clip"
        if (referenceAudio != null)
        {
            float[] refPCM = AudioUtils.AudioClipToPCM(referenceAudio);
            refRms = 0f;
            foreach (float s in refPCM) refRms += s * s;
            refRms = Mathf.Sqrt(refRms / refPCM.Length);
            // Quiet references are boosted to an RMS of 0.1 before encoding.
            if (refRms > 0f && refRms < 0.1f)
            {
                float scale = 0.1f / refRms;
                for (int i = 0; i < refPCM.Length; i++) refPCM[i] *= scale;
                Debug.Log($"[OmniVoiceRunner] 参考音频 RMS 归一化: {refRms:F4} → 0.1 (×{scale:F2})");
            }
            refCodes = _tokenizer.Encode(refPCM);
            T_ref = refCodes.GetLength(1);
            float refDur = T_ref * 960f / 24000f;
            Debug.Log($"[OmniVoiceRunner] 参考音频: {refDur:F1}s ({T_ref} 帧) RMS={refRms:F4}");

            // Cap the reference at 500 frames (20 s at 960 samples per frame, 24 kHz).
            const int MAX_REF_FRAMES = 500;
            if (T_ref > MAX_REF_FRAMES)
            {
                Debug.LogWarning($"[OmniVoiceRunner] 参考音频过长,截断至 {MAX_REF_FRAMES} 帧 (20s)");
                var truncated = new long[OmniVoiceLM.NUM_CODEBOOKS, MAX_REF_FRAMES];
                for (int cb = 0; cb < OmniVoiceLM.NUM_CODEBOOKS; cb++)
                    for (int t = 0; t < MAX_REF_FRAMES; t++)
                        truncated[cb, t] = refCodes[cb, t];
                refCodes = truncated;
                T_ref = MAX_REF_FRAMES;
            }
            if (refDur < 2f) Debug.LogWarning("参考音频过短(< 2s),克隆质量可能较差");
        }

        // 2. Build the text prompt.
        int[] textTokenIds;
        bool hasRefAudio = referenceAudio != null;
        string refTextStr = hasRefAudio && !string.IsNullOrEmpty(referenceText) ? referenceText : null;
        string normalizedTarget = TextNormalizer.Normalize(targetText);
        Debug.Log("normalizedTarget:"+ normalizedTarget);
        if (_textTok != null && !string.IsNullOrEmpty(normalizedTarget))
        {
            textTokenIds = _textTok.BuildPrompt(normalizedTarget, targetLanguage, instruct: null,
                refText: refTextStr, hasRefAudio: hasRefAudio);
            Debug.Log($"[OmniVoiceRunner] 文本 prompt: {textTokenIds.Length} tokens " +
                      $"(hasRefAudio={hasRefAudio}, refText={refTextStr != null})");
        }
        else
        {
            textTokenIds = Array.Empty<int>();
        }

        // 3. Estimate the number of frames to generate.
        int targetLen = EstimateTargetLen(normalizedTarget, targetLanguage, T_ref);
        Debug.Log($"[OmniVoiceRunner] 目标帧数: {targetLen} ({targetLen * 960f / 24000f:F1}s)");

        // 4. Run inference on the thread pool. A Task carries the result and any
        //    exception back to the main thread with proper synchronisation.
        var generation = System.Threading.Tasks.Task.Run(
            () => _lm.Generate(textTokenIds, refCodes, targetLen));
        while (!generation.IsCompleted) yield return null;

        if (generation.IsFaulted || generation.IsCanceled)
        {
            Exception err = generation.Exception?.InnerException;
            Debug.LogError($"[OmniVoiceLM] 生成异常:\n{err}");
            yield break;
        }

        long[,] generatedCodes = generation.Result;
        if (generatedCodes == null || generatedCodes.GetLength(1) == 0)
        {
            Debug.LogError("[OmniVoiceRunner] 生成结果为空");
            yield break;
        }

        // 5. Decode the codes to PCM.
        float[] pcm = _tokenizer.Decode(generatedCodes);

        // 6. Post-processing.
        if (refRms > 0f && refRms < 0.1f)
        {
            // Undo the reference boost so the output matches the reference loudness.
            // The condition mirrors the normalisation above: a silent reference
            // (RMS == 0) was never boosted and must not zero out the output.
            float restoreScale = refRms / 0.1f;
            for (int i = 0; i < pcm.Length; i++) pcm[i] *= restoreScale;
        }
        else if (refRms < 0f)
        {
            // No reference clip: peak-normalise to 0.5.
            float peak = 0f;
            foreach (float s in pcm) { float abs = Mathf.Abs(s); if (abs > peak) peak = abs; }
            if (peak > 1e-6f)
                for (int i = 0; i < pcm.Length; i++) pcm[i] = pcm[i] / peak * 0.5f;
        }

        // Only a fade-out is applied; a fade-in would attenuate the start of the speech.
        AudioUtils.ApplyFadeOut(pcm);

        float elapsed = Time.realtimeSinceStartup - t0;
        float audioDur = pcm.Length / 24000f;
        Debug.Log($"[OmniVoiceRunner] ✅ 完成: 音频={audioDur:F1}s 耗时={elapsed:F1}s RTF={elapsed / audioDur:F2}");

        var clip = AudioUtils.PCMToAudioClip(pcm, "omnivoice_output");
        if (outputAudioSource != null) { outputAudioSource.clip = clip; outputAudioSource.Play(); }

        // NOTE(review): Application.dataPath may not be writable in player builds —
        // consider Application.persistentDataPath; path kept unchanged here.
        string savePath = Path.Combine(Application.dataPath, "omnivoice_output.wav");
        AudioUtils.SaveWav(savePath, pcm);
        Debug.Log($"[OmniVoiceRunner] 已保存至: {savePath}");
    }
    finally
    {
        // Runs on every exit path, including exceptions thrown by encode / decode / save,
        // so a failed run can no longer block all later requests.
        _isGenerating = false;
    }
}
/// <summary>
/// Estimates how many frames to generate (25 frames per second: 24000 Hz / 960 samples).
/// An explicit <c>targetDurSec</c> takes precedence. Otherwise the duration is derived from
/// the text: 0.22 s per non-punctuation, non-whitespace character for Chinese-family
/// language codes (zh / yue / wuu / nan), 0.4 s per whitespace-separated word for
/// everything else, clamped to 1–30 s and never fewer than 25 frames.
/// </summary>
/// <param name="text">Normalised target text; when empty the reference length (or 100) is used.</param>
/// <param name="language">Language name or code; may be null.</param>
/// <param name="T_ref">Number of reference frames (0 when there is no reference).</param>
/// <returns>The estimated number of frames.</returns>
int EstimateTargetLen(string text, string language, int T_ref)
{
    if (targetDurSec > 0f)
        return Mathf.RoundToInt(targetDurSec * 24000f / 960f);
    if (string.IsNullOrEmpty(text))
        return T_ref > 0 ? T_ref : 100;

    string resolvedLang = _textTok != null
        ? Qwen2Tokenizer.ResolveLang(language)
        : language;
    // A null language (or a null result from the resolver) falls through to the
    // word-based estimate instead of throwing on StartsWith.
    resolvedLang = resolvedLang ?? string.Empty;

    bool isChinese = resolvedLang.StartsWith("zh", StringComparison.OrdinalIgnoreCase)
                  || resolvedLang.StartsWith("yue", StringComparison.OrdinalIgnoreCase)
                  || resolvedLang.StartsWith("wuu", StringComparison.OrdinalIgnoreCase)
                  || resolvedLang.StartsWith("nan", StringComparison.OrdinalIgnoreCase);

    float durSec;
    if (isChinese)
    {
        // Character-based estimate: punctuation and whitespace do not count.
        int charCount = 0;
        foreach (char c in text)
            if (!char.IsPunctuation(c) && !char.IsWhiteSpace(c)) charCount++;
        durSec = charCount * 0.22f;
    }
    else
    {
        // Word-based estimate on whitespace-separated tokens.
        int wordCount = text.Split(new[] { ' ', '\t', '\n' },
            StringSplitOptions.RemoveEmptyEntries).Length;
        durSec = wordCount * 0.4f;
    }

    durSec = Mathf.Clamp(durSec, 1.0f, 30.0f);
    int frames = Mathf.RoundToInt(durSec * 24000f / 960f);
    return Mathf.Max(frames, 25);
}
运行截图


GTX 1070 显卡上的测试结果:[OmniVoiceRunner] ✅ 完成: 音频=22.6s 耗时=50.6s RTF=2.24(RTF = 耗时 / 音频时长;大于 1 表示生成速度慢于实时)