Unity使用Spleeter分离人声和伴奏

Spleeter 工程地址

Deezer source separation library including pretrained models.
https://research.deezer.com/projects/spleeter.html

在Unity中的实现

csharp 复制代码

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using Microsoft.ML.OnnxRuntime;
using Microsoft.ML.OnnxRuntime.Tensors;
using UnityEngine;

/// <summary>
/// Thin wrapper around an ONNX Runtime <see cref="InferenceSession"/>. It feeds a 4-D jagged
/// float array to the model's first input and returns the model's first output as a 4-D
/// jagged array.
/// </summary>
public class OnnxModel : IDisposable
{
    private InferenceSession _session;
    private string _modelPath;

    /// <summary>
    /// Loads the model and logs the name and shape of every input and output.
    /// </summary>
    /// <param name="modelPath">Path of the .onnx file to load.</param>
    /// <remarks>Any failure is logged with Debug.LogError and rethrown unchanged.</remarks>
    public OnnxModel(string modelPath)
    {
        try
        {
            _modelPath = modelPath;

            // The options are only needed while the session is created, so they are disposed
            // right after (also when the InferenceSession constructor throws).
            using (var sessionOptions = new SessionOptions
            {
                InterOpNumThreads = 4,
                IntraOpNumThreads = 4,
                GraphOptimizationLevel = GraphOptimizationLevel.ORT_ENABLE_ALL
            })
            {
                _session = new InferenceSession(modelPath, sessionOptions);
            }

            Debug.Log($"---------- 模型加载: {modelPath} ----------");
            foreach (var input in _session.InputMetadata)
            {
                Debug.Log($"输入: {input.Key}, 形状: [{string.Join(", ", input.Value.Dimensions)}]");
            }
            foreach (var output in _session.OutputMetadata)
            {
                Debug.Log($"输出: {output.Key}, 形状: [{string.Join(", ", output.Value.Dimensions)}]");
            }
            Debug.Log("--------------------");
        }
        catch (Exception ex)
        {
            Debug.LogError($"模型加载失败: {ex.Message}");
            throw;
        }
    }

    /// <summary>
    /// Runs inference on a rectangular 4-D jagged array.
    /// The callers in this file pass (2, num_splits, 512, 1024); the tensor shape is now taken
    /// from the array itself instead of being hard-coded.
    /// </summary>
    /// <param name="input">Jagged array whose sub-arrays all share the lengths of input[0], input[0][0] and input[0][0][0].</param>
    /// <returns>The first model output, converted to a jagged array with the output tensor's dimensions.</returns>
    /// <remarks>Any failure is logged with Debug.LogError and rethrown unchanged.</remarks>
    public float[][][][] Run(float[][][][] input)
    {
        try
        {
            if (_session == null)
            {
                throw new ObjectDisposedException(nameof(OnnxModel));
            }

            int[] shape;
            float[] flat = Flatten4DArray(input, out shape);

            var inputTensor = new DenseTensor<float>(flat, shape);
            var inputs = new List<NamedOnnxValue>
            {
                NamedOnnxValue.CreateFromTensor(_session.InputMetadata.Keys.First(), inputTensor)
            };

            using (var results = _session.Run(inputs))
            {
                var outputName = _session.OutputMetadata.Keys.First();
                var outputTensor = results.First(r => r.Name == outputName).AsTensor<float>();

                // The data is copied into managed arrays before `results` is disposed.
                return Tensor4DToJagged(outputTensor);
            }
        }
        catch (Exception ex)
        {
            Debug.LogError($"推理失败: {ex.Message}");
            throw;
        }
    }

    /// <summary>
    /// Copies a rectangular 4-D jagged array into one row-major float array.
    /// </summary>
    /// <param name="input">Array to flatten; must be non-empty in every dimension.</param>
    /// <param name="shape">Receives the four dimension lengths read from the first element of each level.</param>
    private static float[] Flatten4DArray(float[][][][] input, out int[] shape)
    {
        if (input == null)
        {
            throw new ArgumentNullException(nameof(input));
        }
        if (input.Length == 0 || input[0].Length == 0 || input[0][0].Length == 0 || input[0][0][0].Length == 0)
        {
            throw new ArgumentException("Input must be non-empty in all four dimensions.", nameof(input));
        }

        int dim0 = input.Length;
        int dim1 = input[0].Length;
        int dim2 = input[0][0].Length;
        int dim3 = input[0][0][0].Length;
        shape = new[] { dim0, dim1, dim2, dim3 };

        float[] flattened = new float[dim0 * dim1 * dim2 * dim3];
        int index = 0;

        for (int i = 0; i < dim0; i++)
        {
            for (int j = 0; j < dim1; j++)
            {
                for (int k = 0; k < dim2; k++)
                {
                    // One block copy per innermost row instead of an element-by-element loop.
                    Array.Copy(input[i][j][k], 0, flattened, index, dim3);
                    index += dim3;
                }
            }
        }

        return flattened;
    }

    /// <summary>
    /// Converts a rank-4 tensor into a jagged array with the same dimensions.
    /// </summary>
    private static float[][][][] Tensor4DToJagged(Tensor<float> tensor)
    {
        if (tensor.Rank != 4)
        {
            throw new InvalidOperationException($"Expected a rank-4 tensor but got rank {tensor.Rank}.");
        }

        var dims = tensor.Dimensions;
        int d0 = dims[0];
        int d1 = dims[1];
        int d2 = dims[2];
        int d3 = dims[3];

        // Fast path: a row-major dense tensor exposes its backing memory, so each innermost
        // row can be copied as a block. The per-element indexer takes a params int[] and would
        // otherwise be called once per value.
        var dense = tensor as DenseTensor<float>;
        bool rowMajorDense = dense != null && !dense.IsReversedStride;
        Span<float> data = rowMajorDense ? dense.Buffer.Span : Span<float>.Empty;

        float[][][][] result = new float[d0][][][];
        int offset = 0;

        for (int i = 0; i < d0; i++)
        {
            result[i] = new float[d1][][];
            for (int j = 0; j < d1; j++)
            {
                result[i][j] = new float[d2][];
                for (int k = 0; k < d2; k++)
                {
                    float[] row = new float[d3];
                    if (rowMajorDense)
                    {
                        data.Slice(offset, d3).CopyTo(row);
                    }
                    else
                    {
                        for (int l = 0; l < d3; l++)
                        {
                            row[l] = tensor[i, j, k, l];
                        }
                    }
                    result[i][j][k] = row;
                    offset += d3;
                }
            }
        }
        return result;
    }

    /// <summary>
    /// Releases the native session. Safe to call more than once.
    /// </summary>
    public void Dispose()
    {
        _session?.Dispose();
        _session = null;
    }
}

/// <summary>
/// Result of a short-time Fourier transform for one audio channel.
/// As filled by AudioSeparator.ComputeStftOptimized, Real and Imag are flat arrays in which
/// frame f, bin k is stored at index f * (N_FFT / 2 + 1) + k.
/// </summary>
public struct StftResult
{
    // Real parts of the spectrum, all frames concatenated.
    public float[] Real;
    // Imaginary parts of the spectrum, same layout as Real.
    public float[] Imag;
    // Number of frames stored in Real / Imag.
    public int NumFrames;
}
 
/// <summary>
/// Minimal single-precision complex number with the arithmetic operators used by the FFT code.
/// </summary>
public struct Complex
{
    public float Real;
    public float Imag;

    /// <summary>Creates a complex number from its real and imaginary parts.</summary>
    public Complex(float real, float imag)
    {
        Real = real;
        Imag = imag;
    }

    /// <summary>The value 0 + 0i.</summary>
    public static Complex Zero => default(Complex);

    /// <summary>Component-wise addition.</summary>
    public static Complex operator +(Complex a, Complex b)
        => new Complex(a.Real + b.Real, a.Imag + b.Imag);

    /// <summary>Component-wise subtraction.</summary>
    public static Complex operator -(Complex a, Complex b)
        => new Complex(a.Real - b.Real, a.Imag - b.Imag);

    /// <summary>Scales a complex number by a real factor (scalar on the left).</summary>
    public static Complex operator *(float scalar, Complex c)
        => new Complex(scalar * c.Real, scalar * c.Imag);

    /// <summary>Scales a complex number by a real factor (scalar on the right).</summary>
    public static Complex operator *(Complex c, float scalar)
        => new Complex(c.Real * scalar, c.Imag * scalar);

    /// <summary>Complex product: (a + bi)(c + di) = (ac - bd) + (ad + bc)i.</summary>
    public static Complex operator *(Complex a, Complex b)
    {
        float re = a.Real * b.Real - a.Imag * b.Imag;
        float im = a.Real * b.Imag + a.Imag * b.Real;
        return new Complex(re, im);
    }
}

csharp 复制代码

using System;
using System.Collections.Generic;
using System.IO;
using UnityEngine;

/// <summary>
/// Unity component that splits interleaved stereo audio into "vocals" and "accompaniment"
/// using two ONNX models (one per stem).
/// Pipeline as implemented in this class: STFT per channel, magnitude extraction, model
/// inference, ratio mask computation, masked inverse STFT.
/// The inverse transform reads its cosine/sine values from tables that are filled once in
/// Initialize instead of calling trigonometric functions inside the inner loop.
/// </summary>
public class AudioSeparator : MonoBehaviour
{
    private OnnxModel _vocalsModel;
    private OnnxModel _accompanimentModel;

    // STFT frame length in samples.
    private const int N_FFT = 4096;
    // Step between consecutive STFT frames in samples.
    private const int HOP_LENGTH = 1024;
    // Number of low-frequency bins per frame that are passed to the models.
    private const int N_BINS = 1024;
    // Frames per model input segment.
    private const int STFT_HEIGHT = 512;
    // Bins per frame in the model input.
    private const int STFT_WIDTH = 1024;
    // Small constant that keeps the mask denominator away from zero.
    private const float EPSILON = 1e-10f;
    // Defaults to 44100 and is overwritten by LoadWavFile with the rate read from the file.
    private int _sampleRate = 44100;

    // Performance: buffers allocated once in Initialize and reused across frames.
    private float[] _windowBuffer;
    // NOTE(review): _fftBuffer and _ifftImagBuffer are allocated in Initialize but are not
    // read anywhere else in the code shown here.
    private Complex[] _fftBuffer;
    private float[] _ifftRealBuffer;
    private float[] _ifftImagBuffer;
    private float[] _frameBuffer;

    // Precomputed cosine / sine tables used by the inverse STFT.
    private float[] _cosTable;
    private float[] _sinTable;

    /// <summary>
    /// Loads both ONNX models and allocates the working buffers and trigonometric tables.
    /// May be called again; sessions from an earlier call are disposed once the new ones
    /// have loaded successfully.
    /// </summary>
    /// <param name="vocalsModelPath">Path of the vocals model.</param>
    /// <param name="accompanimentModelPath">Path of the accompaniment model.</param>
    /// <remarks>
    /// On failure the error is logged and rethrown. Models loaded during the failed call are
    /// disposed, and the previously initialised models (if any) stay in place.
    /// </remarks>
    public void Initialize(string vocalsModelPath, string accompanimentModelPath)
    {
        OnnxModel vocals = null;
        OnnxModel accompaniment = null;

        try
        {
            vocals = new OnnxModel(vocalsModelPath);
            accompaniment = new OnnxModel(accompanimentModelPath);

            // Preallocate the buffers reused by the STFT / inverse STFT.
            _windowBuffer = CreateHannWindow(N_FFT);
            _fftBuffer = new Complex[N_FFT];
            _ifftRealBuffer = new float[N_FFT];
            _ifftImagBuffer = new float[N_FFT];
            _frameBuffer = new float[N_FFT];

            PrecomputeTrigonometricTables();

            // Everything succeeded: release sessions from an earlier call, then publish the new ones.
            _vocalsModel?.Dispose();
            _accompanimentModel?.Dispose();
            _vocalsModel = vocals;
            _accompanimentModel = accompaniment;

            // Ownership has moved to the fields; the catch block must not dispose them.
            vocals = null;
            accompaniment = null;

            Debug.Log("分离器初始化成功");
        }
        catch (Exception ex)
        {
            // Avoid leaking a native session when the second model (or a later step) fails.
            vocals?.Dispose();
            accompaniment?.Dispose();

            Debug.LogError($"初始化失败: {ex.Message}");
            throw;
        }
    }

    /// <summary>
    /// Fills _cosTable and _sinTable with cos/sin of 2*pi*k*n/N_FFT for every bin k in
    /// [0, N_FFT/2] and every sample index n in [0, N_FFT). Entry (k, n) is stored at index
    /// k * N_FFT + n, which is the layout ComputeIstftSuperOptimized reads.
    /// </summary>
    /// <remarks>
    /// Each table holds N_FFT * (N_FFT / 2 + 1) floats (about 8.4 million for N_FFT = 4096).
    /// The angle only depends on (k * n) mod N_FFT, so the N_FFT distinct values are computed
    /// once in double precision and then copied into place. Evaluating the unreduced product
    /// in single precision loses accuracy for large k * n, and calling cos/sin per entry
    /// repeats the same N_FFT values millions of times.
    /// </remarks>
    private void PrecomputeTrigonometricTables()
    {
        Debug.Log("预计算三角函数表...");

        const int numBins = N_FFT / 2 + 1;

        // One period of the unit circle, sampled at N_FFT points.
        float[] unitCos = new float[N_FFT];
        float[] unitSin = new float[N_FFT];
        double step = 2.0 * Math.PI / N_FFT;
        for (int p = 0; p < N_FFT; p++)
        {
            unitCos[p] = (float)Math.Cos(step * p);
            unitSin[p] = (float)Math.Sin(step * p);
        }

        _cosTable = new float[N_FFT * numBins];
        _sinTable = new float[N_FFT * numBins];

        int idx = 0;
        for (int k = 0; k < numBins; k++)
        {
            for (int n = 0; n < N_FFT; n++)
            {
                // k * n stays well inside the int range (at most (N_FFT / 2) * (N_FFT - 1)).
                int phase = (k * n) % N_FFT;
                _cosTable[idx] = unitCos[phase];
                _sinTable[idx] = unitSin[phase];
                idx++;
            }
        }

        Debug.Log($"预计算完成: {_cosTable.Length} 个三角函数值");
    }

    /// <summary>
    /// Loads a WAV file with LoadWavFile and runs Separate on its samples.
    /// </summary>
    /// <param name="audioPath">Path of the WAV file.</param>
    /// <returns>The dictionary produced by Separate.</returns>
    /// <remarks>Failures are logged together with their stack trace and rethrown.</remarks>
    public Dictionary<string, float[]> SeparateFromFile(string audioPath)
    {
        try
        {
            return Separate(LoadWavFile(audioPath));
        }
        catch (Exception error)
        {
            Debug.LogError($"文件分离失败: {error.Message}\n{error.StackTrace}");
            throw;
        }
    }

    /// <summary>
    /// Separates interleaved stereo audio into two stems.
    /// </summary>
    /// <param name="waveform">
    /// Interleaved samples, read as left at even indices and right at odd indices. The sample
    /// count per channel is waveform.Length / 2, so a trailing unpaired sample is ignored.
    /// </param>
    /// <returns>
    /// Dictionary with the keys "vocals" and "accompaniment", each holding the array returned
    /// by ReconstructAudioOptimized (interleaved stereo).
    /// </returns>
    /// <exception cref="InvalidOperationException">Thrown when either model has not been loaded.</exception>
    /// <remarks>Other failures are logged together with their stack trace and rethrown.</remarks>
    public Dictionary<string, float[]> Separate(float[] waveform)
    {
        if (_vocalsModel == null || _accompanimentModel == null)
        {
            throw new InvalidOperationException("分离器未初始化");
        }

        try
        {
            var stopwatch = System.Diagnostics.Stopwatch.StartNew();

            // De-interleave into one array per channel.
            int numSamples = waveform.Length / 2;
            float[][] waveformStereo = new float[2][];
            waveformStereo[0] = new float[numSamples];
            waveformStereo[1] = new float[numSamples];

            for (int i = 0; i < numSamples; i++)
            {
                waveformStereo[0][i] = waveform[i * 2];
                waveformStereo[1][i] = waveform[i * 2 + 1];
            }

            Debug.Log($"[1] 立体声分离完成");

            // STFT of each channel.
            StftResult[] stftResults = new StftResult[2];
            stftResults[0] = ComputeStftOptimized(waveformStereo[0]);
            stftResults[1] = ComputeStftOptimized(waveformStereo[1]);

            Debug.Log($"[2] STFT计算完成 - {stftResults[0].NumFrames} 帧");

            // Magnitude spectrogram of the bins that are fed to the models.
            float[][][] stftData = ExtractStftMagnitude(stftResults);

            // Pad the frame count up to a multiple of 512 (the segment height used by ReshapeForModel).
            int numFrames = stftData[0].Length;
            int padding = (512 - (numFrames % 512)) % 512;

            if (padding > 0)
            {
                stftData = PadStftData(stftData, padding);
            }

            Debug.Log($"[3] 幅度谱提取完成，填充 {padding} 帧, 总帧数: {stftData[0].Length}");

            // Split the frames into fixed-height segments for the models.
            float[][][][] modelInput = ReshapeForModel(stftData);
            Debug.Log($"[4] 模型输入转换完成 - 形状: (2, {modelInput[0].Length}, {STFT_HEIGHT}, {STFT_WIDTH})");

            // Run both models on the same input.
            var vocalsSpec = _vocalsModel.Run(modelInput);
            var accompanimentSpec = _accompanimentModel.Run(modelInput);

            Debug.Log($"[5] 模型推理完成 - 输出形状: (2, {vocalsSpec[0].Length}, {vocalsSpec[0][0].Length}, {vocalsSpec[0][0][0].Length})");

            // Ratio masks: each stem's squared output relative to the sum of both.
            float[][][][] vocalsRatio = ComputeMask(vocalsSpec, accompanimentSpec);
            float[][][][] accompanimentRatio = ComputeMask(accompanimentSpec, vocalsSpec);

            Debug.Log($"[6] 掩码计算完成");

            // Rebuild audio using the original (unpadded) frame count.
            var results = new Dictionary<string, float[]>();

            Debug.Log($"[7] 开始重构音频...");
            var reconstructStopwatch = System.Diagnostics.Stopwatch.StartNew();

            results["vocals"] = ReconstructAudioOptimized(vocalsRatio, stftResults, numFrames);

            // Note: this timer covers the vocals reconstruction only.
            reconstructStopwatch.Stop();
            Debug.Log($"[7] 音频重构完成，耗时: {reconstructStopwatch.ElapsedMilliseconds}ms");

            results["accompaniment"] = ReconstructAudioOptimized(accompanimentRatio, stftResults, numFrames);

            stopwatch.Stop();
            // Real-time factor: processing time divided by audio duration.
            float audioDuration = numSamples / (float)_sampleRate;
            float rtf = stopwatch.ElapsedMilliseconds / 1000f / audioDuration;

            Debug.Log($"✓ 分离完成！");
            Debug.Log($"  耗时: {stopwatch.ElapsedMilliseconds}ms");
            Debug.Log($"  RTF: {rtf:F3} (越小越好)");

            return results;
        }
        catch (Exception ex)
        {
            Debug.LogError($"分离过程错误: {ex.Message}\n{ex.StackTrace}");
            throw;
        }
    }

    /// <summary>
    /// Computes the STFT of one channel: frames of N_FFT samples taken every HOP_LENGTH
    /// samples, multiplied by the window in _windowBuffer and transformed with FastFFT.
    /// </summary>
    /// <param name="signal">Samples of a single channel.</param>
    /// <returns>
    /// Real and imaginary parts of bins 0..N_FFT/2 for every frame, stored at
    /// frame * (N_FFT / 2 + 1) + bin.
    /// </returns>
    /// <remarks>
    /// The last frame is zero-padded so that every input sample is covered. Previously the
    /// frame count was rounded down, which silently dropped the tail of the signal, and an
    /// input shorter than one frame produced a zero or negative frame count (the latter made
    /// the array allocation throw). An empty signal yields zero frames.
    /// </remarks>
    /// <exception cref="ArgumentNullException">Thrown when <paramref name="signal"/> is null.</exception>
    private StftResult ComputeStftOptimized(float[] signal)
    {
        if (signal == null)
        {
            throw new ArgumentNullException(nameof(signal));
        }

        int numFrames;
        if (signal.Length == 0)
        {
            numFrames = 0;
        }
        else if (signal.Length <= N_FFT)
        {
            numFrames = 1;
        }
        else
        {
            // Ceiling division: enough hops for the last frame to reach the final sample.
            numFrames = (signal.Length - N_FFT + HOP_LENGTH - 1) / HOP_LENGTH + 1;
        }

        int numBins = N_FFT / 2 + 1;

        float[] realPart = new float[numFrames * numBins];
        float[] imagPart = new float[numFrames * numBins];

        for (int frameIdx = 0; frameIdx < numFrames; frameIdx++)
        {
            int offset = frameIdx * HOP_LENGTH;

            // Window the samples that exist and zero-fill the rest of the frame.
            int valid = Math.Max(0, Math.Min(N_FFT, signal.Length - offset));
            for (int i = 0; i < valid; i++)
            {
                _frameBuffer[i] = signal[offset + i] * _windowBuffer[i];
            }
            for (int i = valid; i < N_FFT; i++)
            {
                _frameBuffer[i] = 0;
            }

            Complex[] fftResult = FastFFT(_frameBuffer);

            // Keep only the non-redundant half of the spectrum (bins 0..N_FFT/2).
            int baseIdx = frameIdx * numBins;
            for (int k = 0; k < numBins; k++)
            {
                realPart[baseIdx + k] = fftResult[k].Real;
                imagPart[baseIdx + k] = fftResult[k].Imag;
            }
        }

        return new StftResult
        {
            Real = realPart,
            Imag = imagPart,
            NumFrames = numFrames
        };
    }

    /// <summary>
    /// Forward transform dispatcher: inputs of up to 256 samples go to the direct DFT in
    /// SimpleFFT, longer inputs go to CooleyTukeyFFT.
    /// </summary>
    /// <param name="input">Real-valued samples.</param>
    /// <returns>Spectrum with one complex value per input sample.</returns>
    private Complex[] FastFFT(float[] input)
    {
        return input.Length <= 256
            ? SimpleFFT(input)
            : CooleyTukeyFFT(input);
    }

    /// <summary>
    /// Recursive radix-2 decimation-in-time FFT of a real-valued input.
    /// Lengths that are not a power of two are handed to SimpleFFT.
    /// </summary>
    /// <param name="input">Real-valued samples.</param>
    /// <returns>Spectrum with one complex value per input sample.</returns>
    private Complex[] CooleyTukeyFFT(float[] input)
    {
        int size = input.Length;

        // Radix-2 splitting only works for power-of-two lengths.
        bool isPowerOfTwo = (size & (size - 1)) == 0;
        if (!isPowerOfTwo)
        {
            return SimpleFFT(input);
        }

        // Recursion base case: the transform of one sample is the sample itself.
        if (size == 1)
        {
            return new[] { new Complex(input[0], 0) };
        }

        int half = size / 2;
        var evenSamples = new float[half];
        var oddSamples = new float[half];

        for (int i = 0, src = 0; i < half; i++, src += 2)
        {
            evenSamples[i] = input[src];
            oddSamples[i] = input[src + 1];
        }

        Complex[] evenSpectrum = CooleyTukeyFFT(evenSamples);
        Complex[] oddSpectrum = CooleyTukeyFFT(oddSamples);

        var spectrum = new Complex[size];
        for (int bin = 0; bin < half; bin++)
        {
            // Butterfly: combine the two half-size spectra using the twiddle factor e^(-2*pi*i*bin/size).
            float theta = -2f * Mathf.PI * bin / size;
            Complex rotated = new Complex(Mathf.Cos(theta), Mathf.Sin(theta)) * oddSpectrum[bin];

            spectrum[bin] = evenSpectrum[bin] + rotated;
            spectrum[bin + half] = evenSpectrum[bin] - rotated;
        }

        return spectrum;
    }

    /// <summary>
    /// Direct discrete Fourier transform of a real-valued input (two nested loops over the
    /// input length). Despite the name this is not a fast transform.
    /// </summary>
    /// <param name="input">Real-valued samples.</param>
    /// <returns>Spectrum with one complex value per input sample.</returns>
    private Complex[] SimpleFFT(float[] input)
    {
        int length = input.Length;
        var spectrum = new Complex[length];
        float step = 2f * Mathf.PI / length;

        for (int bin = 0; bin < length; bin++)
        {
            Complex sum = Complex.Zero;

            for (int t = 0; t < length; t++)
            {
                float phase = -step * bin * t;
                var basis = new Complex(Mathf.Cos(phase), Mathf.Sin(phase));
                sum = sum + new Complex(input[t], 0) * basis;
            }

            spectrum[bin] = sum;
        }

        return spectrum;
    }

    /// <summary>
    /// Builds a Hann window, w[i] = 0.5 * (1 - cos(2*pi*i / (size - 1))).
    /// </summary>
    /// <param name="size">Number of window samples.</param>
    /// <returns>Array of <paramref name="size"/> window coefficients.</returns>
    private float[] CreateHannWindow(int size)
    {
        float angularStep = 2f * Mathf.PI / (size - 1);
        var coefficients = new float[size];

        int i = 0;
        while (i < size)
        {
            coefficients[i] = 0.5f * (1 - Mathf.Cos(angularStep * i));
            i++;
        }

        return coefficients;
    }

    /// <summary>
    /// Computes the magnitude sqrt(re^2 + im^2) of the first N_BINS bins of every frame,
    /// for both channels.
    /// </summary>
    /// <param name="stftResults">STFT of channel 0 and channel 1.</param>
    /// <returns>Jagged array indexed as [channel][frame][bin].</returns>
    private float[][][] ExtractStftMagnitude(StftResult[] stftResults)
    {
        // Each stored frame holds N_FFT / 2 + 1 bins; only the first N_BINS are read.
        int storedBinsPerFrame = N_FFT / 2 + 1;
        var magnitudes = new float[2][][];

        for (int channel = 0; channel < 2; channel++)
        {
            StftResult spectrum = stftResults[channel];
            var frames = new float[spectrum.NumFrames][];

            for (int frame = 0; frame < frames.Length; frame++)
            {
                var row = new float[N_BINS];
                int frameStart = frame * storedBinsPerFrame;

                for (int bin = 0; bin < N_BINS; bin++)
                {
                    float re = spectrum.Real[frameStart + bin];
                    float im = spectrum.Imag[frameStart + bin];
                    row[bin] = Mathf.Sqrt(re * re + im * im);
                }

                frames[frame] = row;
            }

            magnitudes[channel] = frames;
        }

        return magnitudes;
    }

    /// <summary>
    /// Appends <paramref name="padding"/> all-zero frames to both channels.
    /// The existing frame arrays are reused (not cloned) in the result.
    /// </summary>
    /// <param name="data">Spectrogram indexed as [channel][frame][bin].</param>
    /// <param name="padding">Number of zero frames to append.</param>
    /// <returns>New outer arrays containing the original frames followed by the zero frames.</returns>
    private float[][][] PadStftData(float[][][] data, int padding)
    {
        int existing = data[0].Length;
        int total = existing + padding;
        var output = new float[2][][];

        for (int channel = 0; channel < 2; channel++)
        {
            var frames = new float[total][];
            System.Array.Copy(data[channel], frames, existing);

            int next = existing;
            while (next < total)
            {
                frames[next++] = new float[N_BINS];
            }

            output[channel] = frames;
        }

        return output;
    }

    /// <summary>
    /// Cuts the frame sequence of each channel into consecutive segments of STFT_HEIGHT
    /// frames and copies the first N_BINS bins of every frame into rows of STFT_WIDTH floats.
    /// Frames beyond the last complete segment are not included.
    /// </summary>
    /// <param name="data">Spectrogram indexed as [channel][frame][bin].</param>
    /// <returns>Jagged array indexed as [channel][segment][frameInSegment][bin].</returns>
    private float[][][][] ReshapeForModel(float[][][] data)
    {
        int segmentCount = data[0].Length / STFT_HEIGHT;
        var output = new float[2][][][];

        for (int channel = 0; channel < 2; channel++)
        {
            float[][] sourceFrames = data[channel];
            var segments = new float[segmentCount][][];
            output[channel] = segments;

            for (int segment = 0; segment < segmentCount; segment++)
            {
                var rows = new float[STFT_HEIGHT][];
                segments[segment] = rows;
                int firstFrame = segment * STFT_HEIGHT;

                for (int row = 0; row < STFT_HEIGHT; row++)
                {
                    // A freshly allocated array is already zero-filled, so any columns past
                    // N_BINS need no explicit clearing.
                    var target = new float[STFT_WIDTH];
                    rows[row] = target;
                    System.Array.Copy(sourceFrames[firstFrame + row], target, N_BINS);
                }
            }
        }

        return output;
    }

    /// <summary>
    /// Computes a ratio mask per element:
    /// (source^2 + EPSILON / 2) / (source^2 + other^2 + EPSILON).
    /// </summary>
    /// <param name="source">Model output of the stem the mask is for, indexed [channel][segment][frame][bin].</param>
    /// <param name="other">Model output of the other stem, same indexing.</param>
    /// <returns>Mask with the same indexing; rows are STFT_HEIGHT x STFT_WIDTH per segment.</returns>
    private float[][][][] ComputeMask(float[][][][] source, float[][][][] other)
    {
        var result = new float[2][][][];

        for (int channel = 0; channel < 2; channel++)
        {
            int segmentCount = source[channel].Length;
            var segments = new float[segmentCount][][];
            result[channel] = segments;

            for (int segment = 0; segment < segmentCount; segment++)
            {
                var rows = new float[STFT_HEIGHT][];
                segments[segment] = rows;

                for (int row = 0; row < STFT_HEIGHT; row++)
                {
                    var ratios = new float[STFT_WIDTH];
                    rows[row] = ratios;

                    for (int bin = 0; bin < STFT_WIDTH; bin++)
                    {
                        float own = source[channel][segment][row][bin];
                        float rival = other[channel][segment][row][bin];
                        float ownPower = own * own;
                        float rivalPower = rival * rival;
                        float total = ownPower + rivalPower + EPSILON;
                        ratios[bin] = (ownPower + EPSILON / 2f) / total;
                    }
                }
            }
        }

        return result;
    }

    /// <summary>
    /// Applies a mask to the complex STFT of both channels, converts each channel back to
    /// the time domain with ComputeIstftSuperOptimized and interleaves the two channels.
    /// </summary>
    /// <param name="mask">Mask indexed as [channel][segment][frameInSegment][bin].</param>
    /// <param name="stftResults">Unmasked STFT of channel 0 and channel 1.</param>
    /// <param name="originalNumFrames">Frame count before padding; only these frames are rebuilt.</param>
    /// <returns>Interleaved stereo samples (channel 0 at even indices, channel 1 at odd indices).</returns>
    /// <remarks>
    /// Only bins below N_BINS are multiplied by the mask. The remaining bins of each frame
    /// are copied from the input spectrum unchanged.
    /// </remarks>
    private float[] ReconstructAudioOptimized(float[][][][] mask, StftResult[] stftResults, int originalNumFrames)
    {
        var channels = new float[2][];

        for (int ch = 0; ch < 2; ch++)
        {
            int binsPerFrame = N_FFT / 2 + 1;
            var maskedReal = new float[originalNumFrames * binsPerFrame];
            var maskedImag = new float[originalNumFrames * binsPerFrame];

            float[][][] channelMask = mask[ch];
            int frameLimit = Mathf.Min(originalNumFrames, channelMask.Length * STFT_HEIGHT);

            for (int frame = 0; frame < frameLimit; frame++)
            {
                int segment = frame / STFT_HEIGHT;
                int row = frame % STFT_HEIGHT;

                // Stop at the first frame the mask does not cover.
                if (segment >= channelMask.Length || row >= channelMask[segment].Length)
                {
                    break;
                }

                float[] maskRow = channelMask[segment][row];
                int frameStart = frame * binsPerFrame;
                int maskedBins = Mathf.Min(N_BINS, maskRow.Length);

                for (int bin = 0; bin < maskedBins; bin++)
                {
                    float weight = maskRow[bin];
                    maskedReal[frameStart + bin] = weight * stftResults[ch].Real[frameStart + bin];
                    maskedImag[frameStart + bin] = weight * stftResults[ch].Imag[frameStart + bin];
                }

                // Bins the mask has no values for are passed through unmodified.
                if (binsPerFrame > N_BINS)
                {
                    int passThrough = binsPerFrame - N_BINS;
                    int from = frameStart + N_BINS;
                    System.Array.Copy(stftResults[ch].Real, from, maskedReal, from, passThrough);
                    System.Array.Copy(stftResults[ch].Imag, from, maskedImag, from, passThrough);
                }
            }

            var maskedSpectrum = new StftResult
            {
                Real = maskedReal,
                Imag = maskedImag,
                NumFrames = originalNumFrames
            };

            channels[ch] = ComputeIstftSuperOptimized(maskedSpectrum);
        }

        // Interleave the two channels into one stereo buffer.
        int samplesPerChannel = channels[0].Length;
        var interleaved = new float[samplesPerChannel * 2];
        for (int s = 0, dst = 0; s < samplesPerChannel; s++, dst += 2)
        {
            interleaved[dst] = channels[0][s];
            interleaved[dst + 1] = channels[1][s];
        }

        return interleaved;
    }

    /// <summary>
    /// Inverse STFT of a one-sided spectrum (bins 0..N_FFT/2 per frame) by windowed
    /// overlap-add. Cosine and sine values come from the tables filled by
    /// PrecomputeTrigonometricTables (entry for bin k, sample n at index k * N_FFT + n).
    /// </summary>
    /// <param name="stftResult">Spectrum with NumFrames frames of N_FFT / 2 + 1 bins each.</param>
    /// <returns>
    /// Time-domain signal of (NumFrames - 1) * HOP_LENGTH + N_FFT samples, or an empty array
    /// when there are no frames.
    /// </returns>
    /// <remarks>
    /// Two corrections compared with a plain sum over the stored bins:
    /// 1. A real signal's full spectrum contains every interior bin twice (bin k and its
    ///    conjugate at N_FFT - k). Only DC and the Nyquist bin occur once, so interior bins
    ///    are weighted by 2. Without this the output is about half amplitude with DC/Nyquist
    ///    over-represented.
    /// 2. The window is applied at analysis and again at synthesis, so overlapping frames add
    ///    up to the sum of squared window values rather than 1. Each synthesis window sample
    ///    is divided by that sum for its position within the hop.
    /// NOTE: this is a direct inverse DFT and costs (N_FFT / 2 + 1) * N_FFT multiply-adds per frame.
    /// </remarks>
    private float[] ComputeIstftSuperOptimized(StftResult stftResult)
    {
        int numFrames = stftResult.NumFrames;
        if (numFrames <= 0)
        {
            return new float[0];
        }

        int signalLength = (numFrames - 1) * HOP_LENGTH + N_FFT;
        float[] signal = new float[signalLength];

        float invN = 1f / N_FFT;
        int numBins = N_FFT / 2 + 1;
        int nyquistBin = N_FFT / 2;

        // Sum of squared window values over all frames that overlap a given hop position.
        float[] overlapEnergy = new float[HOP_LENGTH];
        for (int i = 0; i < N_FFT; i++)
        {
            overlapEnergy[i % HOP_LENGTH] += _windowBuffer[i] * _windowBuffer[i];
        }

        // Synthesis window with the 1/N_FFT scale and the overlap normalisation folded in.
        float[] synthesisWindow = new float[N_FFT];
        for (int i = 0; i < N_FFT; i++)
        {
            float energy = Math.Max(overlapEnergy[i % HOP_LENGTH], EPSILON);
            synthesisWindow[i] = invN * _windowBuffer[i] / energy;
        }

        for (int frameIdx = 0; frameIdx < numFrames; frameIdx++)
        {
            int offset = frameIdx * HOP_LENGTH;

            Array.Clear(_ifftRealBuffer, 0, N_FFT);

            for (int k = 0; k < numBins; k++)
            {
                // DC and Nyquist appear once in the full spectrum, every other bin twice.
                float weight = (k == 0 || k == nyquistBin) ? 1f : 2f;

                int specIdx = frameIdx * numBins + k;
                float real = weight * stftResult.Real[specIdx];
                float imag = weight * stftResult.Imag[specIdx];

                int trigIdx = k * N_FFT;  // start of bin k's row in the tables

                for (int n = 0; n < N_FFT; n++)
                {
                    // Real part of X[k] * e^(i*2*pi*k*n/N_FFT).
                    _ifftRealBuffer[n] += real * _cosTable[trigIdx + n] - imag * _sinTable[trigIdx + n];
                }
            }

            // Window, normalise and overlap-add. offset + i never exceeds signalLength - 1
            // because signalLength is derived from the last frame's end.
            for (int i = 0; i < N_FFT; i++)
            {
                signal[offset + i] += _ifftRealBuffer[i] * synthesisWindow[i];
            }
        }

        return signal;
    }

    /// <summary>
    /// Reads a 16-bit PCM WAV file and returns its samples as interleaved stereo floats in
    /// [-1, 1). Also stores the file's sample rate in _sampleRate.
    /// </summary>
    /// <param name="path">Path of the WAV file.</param>
    /// <returns>
    /// Interleaved stereo samples. A mono file is returned with its single channel duplicated
    /// into both output channels, because Separate interprets its input as interleaved stereo.
    /// </returns>
    /// <remarks>
    /// The RIFF chunk list is walked to locate "fmt " and "data" instead of assuming a fixed
    /// 44-byte header, so files with extra chunks before the audio data (or a longer "fmt "
    /// chunk) load correctly. A declared data size larger than the file is clamped to the
    /// bytes actually present.
    /// </remarks>
    /// <exception cref="InvalidDataException">The file is not a RIFF/WAVE file or lacks a "fmt " or "data" chunk.</exception>
    /// <exception cref="NotSupportedException">The audio is not 16-bit PCM, or has a channel count other than 1 or 2.</exception>
    private float[] LoadWavFile(string path)
    {
        byte[] fileBytes = File.ReadAllBytes(path);

        if (fileBytes.Length < 12 || !HasChunkTag(fileBytes, 0, "RIFF") || !HasChunkTag(fileBytes, 8, "WAVE"))
        {
            throw new InvalidDataException($"Not a RIFF/WAVE file: {path}");
        }

        bool formatFound = false;
        int audioFormat = 0;
        int channels = 0;
        int sampleRate = 0;
        int bitsPerSample = 0;
        int dataOffset = -1;
        int dataSize = 0;

        // Chunks follow the 12-byte RIFF header: 4-byte tag, 4-byte little-endian size, body.
        long position = 12;
        while (position + 8 <= fileBytes.Length)
        {
            int header = (int)position;
            long chunkSize = BitConverter.ToUInt32(fileBytes, header + 4);
            long bodyStart = position + 8;
            long available = fileBytes.Length - bodyStart;

            if (HasChunkTag(fileBytes, header, "fmt ") && chunkSize >= 16 && available >= 16)
            {
                int body = (int)bodyStart;
                audioFormat = BitConverter.ToUInt16(fileBytes, body);
                channels = BitConverter.ToUInt16(fileBytes, body + 2);
                sampleRate = BitConverter.ToInt32(fileBytes, body + 4);
                bitsPerSample = BitConverter.ToUInt16(fileBytes, body + 14);
                formatFound = true;
            }
            else if (HasChunkTag(fileBytes, header, "data"))
            {
                dataOffset = (int)bodyStart;
                dataSize = (int)Math.Min(chunkSize, available);
                break;
            }

            // Chunk bodies are padded to an even number of bytes.
            position = bodyStart + chunkSize + (chunkSize & 1);
        }

        if (!formatFound || dataOffset < 0)
        {
            throw new InvalidDataException($"WAV file is missing its 'fmt ' or 'data' chunk: {path}");
        }

        // 1 = PCM, 0xFFFE = WAVE_FORMAT_EXTENSIBLE (accepted here when the samples are 16 bits wide).
        bool isPcm = audioFormat == 1 || audioFormat == 0xFFFE;
        if (!isPcm || bitsPerSample != 16)
        {
            throw new NotSupportedException(
                $"Only 16-bit PCM WAV files are supported (format {audioFormat}, {bitsPerSample} bits): {path}");
        }
        if (channels != 1 && channels != 2)
        {
            throw new NotSupportedException($"Only mono or stereo WAV files are supported ({channels} channels): {path}");
        }

        _sampleRate = sampleRate;

        int bytesPerFrame = channels * sizeof(short);
        int frameCount = dataSize / bytesPerFrame;
        float[] samples = new float[frameCount * 2];

        for (int frame = 0; frame < frameCount; frame++)
        {
            int source = dataOffset + frame * bytesPerFrame;
            short left = BitConverter.ToInt16(fileBytes, source);
            short right = channels == 2 ? BitConverter.ToInt16(fileBytes, source + 2) : left;

            samples[frame * 2] = left / 32768f;
            samples[frame * 2 + 1] = right / 32768f;
        }

        return samples;
    }

    /// <summary>
    /// Returns true when the bytes starting at <paramref name="offset"/> spell the ASCII chunk tag.
    /// </summary>
    private static bool HasChunkTag(byte[] bytes, int offset, string tag)
    {
        if (offset < 0 || offset + tag.Length > bytes.Length)
        {
            return false;
        }

        for (int i = 0; i < tag.Length; i++)
        {
            if (bytes[offset + i] != (byte)tag[i])
            {
                return false;
            }
        }

        return true;
    }

    /// <summary>
    /// Writes every entry of <paramref name="sources"/> to "&lt;key&gt;.wav" inside
    /// <paramref name="outputDir"/>, creating the directory when it does not exist.
    /// </summary>
    /// <param name="sources">Stem name mapped to interleaved stereo samples.</param>
    /// <param name="outputDir">Target directory.</param>
    /// <remarks>Failures are logged and rethrown.</remarks>
    public void SaveToFile(Dictionary<string, float[]> sources, string outputDir)
    {
        try
        {
            // CreateDirectory is a no-op for a directory that already exists.
            Directory.CreateDirectory(outputDir);

            foreach (KeyValuePair<string, float[]> stem in sources)
            {
                string target = Path.Combine(outputDir, $"{stem.Key}.wav");
                SaveWavFile(target, stem.Value, _sampleRate);
                Debug.Log($"已保存: {target}");
            }
        }
        catch (Exception error)
        {
            Debug.LogError($"保存失败: {error.Message}");
            throw;
        }
    }

    /// <summary>
    /// Writes interleaved stereo samples as a 16-bit PCM WAV file with a 44-byte header.
    /// </summary>
    /// <param name="path">File to create (overwritten when it exists).</param>
    /// <param name="samples">Interleaved stereo samples; values are scaled by 32767 and clamped to the 16-bit range.</param>
    /// <param name="sampleRate">Sample rate written to the header.</param>
    private void SaveWavFile(string path, float[] samples, int sampleRate)
    {
        int channelCount = 2;
        int frameCount = samples.Length / channelCount;
        int dataBytes = frameCount * channelCount * 2;
        int bytesPerSecond = sampleRate * channelCount * 2;

        using (var output = new BinaryWriter(File.Create(path)))
        {
            // RIFF container header.
            output.Write("RIFF".ToCharArray());
            output.Write(36 + dataBytes);
            output.Write("WAVE".ToCharArray());

            // "fmt " chunk: 16-byte body, format tag 1 (PCM), 16 bits per sample.
            output.Write("fmt ".ToCharArray());
            output.Write(16);
            output.Write((short)1);
            output.Write((short)channelCount);
            output.Write(sampleRate);
            output.Write(bytesPerSecond);
            output.Write((short)(channelCount * 2));
            output.Write((short)16);

            // "data" chunk.
            output.Write("data".ToCharArray());
            output.Write(dataBytes);

            for (int i = 0; i < samples.Length; i++)
            {
                short pcm = (short)Mathf.Clamp(samples[i] * 32767f, -32768, 32767);
                output.Write(pcm);
            }
        }
    }

    /// <summary>
    /// Releases both ONNX models. Safe to call more than once.
    /// The fields are cleared so that a later call to Separate fails its "not initialised"
    /// check instead of running inference on a disposed session.
    /// </summary>
    public void Dispose()
    {
        _vocalsModel?.Dispose();
        _vocalsModel = null;

        _accompanimentModel?.Dispose();
        _accompanimentModel = null;
    }

    /// <summary>
    /// Unity callback: releases the models when the component is destroyed.
    /// </summary>
    private void OnDestroy()
    {
        Dispose();
    }
}

csharp 复制代码

using UnityEngine;
using System.Collections;

/// <summary>
/// Example component: loads the two 2-stem models on a background thread, separates one
/// WAV file on a background task and saves the resulting stems.
/// </summary>
public class SeparatorExample : MonoBehaviour
{
    private AudioSeparator separator;
    private bool isProcessing = false;

    /// <summary>
    /// Unity callback: verifies that both model files exist, then schedules model loading
    /// through Loom.
    /// </summary>
    public void Start()
    {
        // Make sure the Loom thread helper exists before any work is queued on it.
        if (Loom.Current == null)
        {
            Loom.Initialize();
        }

        separator = gameObject.AddComponent<AudioSeparator>();
        string vocalsPath = Application.streamingAssetsPath + "/2stems/vocals.onnx";
        string accompanimentPath = Application.streamingAssetsPath + "/2stems/accompaniment.onnx";

        Debug.Log("=== 开始初始化分离器 ===");
        Debug.Log($"模型路径1: {vocalsPath}");
        Debug.Log($"模型路径2: {accompanimentPath}");

        // Abort at the first model file that is missing.
        foreach (string candidate in new[] { vocalsPath, accompanimentPath })
        {
            if (!System.IO.File.Exists(candidate))
            {
                Debug.LogError($"❌ 模型文件不存在: {candidate}");
                return;
            }
        }

        Debug.Log("✓ 模型文件存在");

        Loom.RunAsync(() => LoadModelsInBackground(vocalsPath, accompanimentPath));
    }

    /// <summary>
    /// Body of the work item passed to Loom.RunAsync: loads the models and, on success,
    /// queues the separation coroutine through Loom.QueueOnMainThread.
    /// </summary>
    private void LoadModelsInBackground(string vocalsPath, string accompanimentPath)
    {
        try
        {
            Debug.Log(">> [后台线程] 开始加载模型...");
            separator.Initialize(vocalsPath, accompanimentPath);
            Debug.Log(">> [后台线程] 模型加载完成");

            Loom.QueueOnMainThread(() =>
            {
                Debug.Log("<< [主线程] 准备执行音频分离");
                StartCoroutine(PerformSeparation());
            });
        }
        catch (System.Exception error)
        {
            Debug.LogError($"❌ [后台线程] 初始化失败: {error.Message}\n{error.StackTrace}");
            Loom.QueueOnMainThread(() =>
            {
                Debug.LogError("分离器初始化失败，请检查模型文件和ONNX运行时");
            });
        }
    }

    /// <summary>
    /// Coroutine that runs the separation on a background task, polls once per second until
    /// it finishes and then saves the stems.
    /// </summary>
    private IEnumerator PerformSeparation()
    {
        if (isProcessing)
        {
            Debug.LogWarning("⚠ 正在处理中，请等待...");
            yield break;
        }

        isProcessing = true;

        string inputFile = Application.dataPath + "/qi-feng-le-zh.wav";
        string outputFolder = Application.dataPath + "/SeparatedAudio/";

        Debug.Log($"\n=== 开始音频分离 ===");
        Debug.Log($"输入文件: {inputFile}");
        Debug.Log($"输出目录: {outputFolder}");

        if (!System.IO.File.Exists(inputFile))
        {
            Debug.LogError($"❌ 音频文件不存在: {inputFile}");
            isProcessing = false;
            yield break;
        }

        Debug.Log("✓ 音频文件存在");

        // The separation itself is slow, so it runs off the main thread. Failures are
        // reported by returning null from the task.
        var worker = System.Threading.Tasks.Task.Run(() =>
        {
            try
            {
                Debug.Log(">> [后台线程] 开始加载音频文件...");
                var stems = separator.SeparateFromFile(inputFile);
                Debug.Log($">> [后台线程] 分离完成，获得 {stems.Count} 个音频源");

                return stems;
            }
            catch (System.Exception error)
            {
                Debug.LogError($"❌ [后台线程] 分离失败: {error.Message}\n{error.StackTrace}");
                return null;
            }
        });

        // Poll until the background task has finished.
        while (!worker.IsCompleted)
        {
            yield return new WaitForSeconds(1f);
            Debug.Log("⏳ 正在处理音频... (这可能需要几分钟)");
        }

        var separated = worker.Result;
        if (separated == null)
        {
            isProcessing = false;
            yield break;
        }

        try
        {
            Debug.Log("<< [主线程] 开始保存文件");
            separator.SaveToFile(separated, outputFolder);
            Debug.Log($"✓ 分离完成！文件已保存至: {outputFolder}");
        }
        catch (System.Exception error)
        {
            Debug.LogError($"❌ [主线程] 保存失败: {error.Message}\n{error.StackTrace}");
        }

        isProcessing = false;
    }
}

模型文件

https://github.com/deezer/spleeter/issues/937

效果

工程中已经放置了2个示例音频文件，可以分离完之后听一下

最后是工程地址

https://github.com/xue-fei/spleeter-unity.git

2 stems and 4 stems models have high performances on the musdb dataset. Spleeter is also very fast as it can perform separation of audio files to 4 stems 100x faster than real-time when run on a GPU.

当前仅跑在CPU，所以极慢......