C++中使用Essentia实现STFT/ISTFT

最近在做一个项目，需要将音频送入 AI 模型进行处理。整个流程包括：

从 .wav 文件中加载音频；
进行 短时傅里叶变换（STFT）；
将变换结果输入模型；
使用 逆STFT（ISTFT） 重建回时域信号。

作为一个长期从事图像方向的 CVer，我对音频领域相对陌生。在调研后发现，和计算机视觉相比，优质的音频处理库屈指可数。而手撸 STFT / ISTFT 又繁琐、容易出错。

最终，我选择了开源音频分析库 Essentia，其功能强大、API 结构清晰，非常适合科研和快速原型验证。本文将分享如何在 C++ 中基于 Essentia 实现 STFT / ISTFT，并完成音频的重建。

🔍 什么是 Essentia？

Essentia 是由巴塞罗那庞培法布拉大学音乐技术小组（MTG）开发的开源 C++ 音频分析库，发布于 AGPLv3 许可下。

Essentia 提供了一套丰富的算法模块，覆盖以下能力：

音频 I/O 和预处理；
数字信号处理（DSP）基础块；
声学和音乐特征提取（如频谱、音调、节奏、情绪等）；
Python 包装与 Vamp 插件支持，便于快速原型与可视化。

虽然资料较少、使用门槛稍高，但其结构良好，非常适合做科研实验或工业应用中的音频前处理部分。

🛠 编译 Essentia（推荐 Docker 环境）

手动编译步骤如下：

bash 复制代码

# Fetch the Essentia sources and enter the checkout.
git clone https://github.com/MTG/essentia.git
cd essentia
# Run the repository's third-party dependency script (static, Debian variant, going by its name).
packaging/build_3rdparty_static_debian.sh

⚠️ 如果第三方依赖下载失败，可以手动修改 packaging/debian_3rdparty/ 下的对应脚本，将源码下载到指定目录。

接着执行：

bash 复制代码

# Configure the build with the --with-static-examples option, then build.
# NOTE(review): the exact effect of --with-static-examples is defined by Essentia's wscript — confirm with ./waf --help.
./waf configure --with-static-examples
./waf

如果你希望跳过编译，可以从我上传的打包资源中下载使用。

📦 在 C++ 中使用 STFT

cpp 复制代码

// Factory used to create Essentia algorithms by name.
AlgorithmFactory& factory = standard::AlgorithmFactory::instance();

// 1. Load the audio: create a "MonoLoader" configured with the file name and
//    a sample rate of 16000, bind its "audio" output to `audio`, run it once.
Algorithm* loader = factory.create("MonoLoader", 
                                   "filename", "test.wav", 
                                   "sampleRate", 16000);
vector<Real> audio;
loader->output("audio").set(audio);
loader->compute();
// The loader is not used again after this point, so release it here.
delete loader;

设置参数：

cpp 复制代码

// Frame length and hop size passed to FrameCutter / FFT / IFFT in the snippets below.
const int frameSize = 320; 
const int hopSize = 80;    // hop is 25% of the frame, i.e. consecutive frames overlap by 75%

STFT 流程：

cpp 复制代码

// Build the three processing stages: frame slicing, Hann windowing, FFT.
// NOTE(review): FrameCutter is created without "startFromZero"; per the Essentia
// reference its default centers the first frame on sample 0 — confirm, because it
// decides where each frame belongs when the signal is rebuilt later.
Algorithm* frameCutter = factory.create("FrameCutter",
                                        "frameSize", frameSize,
                                        "hopSize", hopSize);
Algorithm* windowing = factory.create("Windowing",
                                      "type", "hann",
                                      "normalized", false);
Algorithm* fft = factory.create("FFT", "size", frameSize);

// Chain the stages through shared buffers: audio -> frame -> windowedFrame -> fftFrame.
// The buffers are bound once; every compute() call below overwrites them in place.
frameCutter->input("signal").set(audio);
vector<Real> frame, windowedFrame;
vector<complex<Real>> fftFrame;

frameCutter->output("frame").set(frame);
windowing->input("frame").set(frame);
windowing->output("frame").set(windowedFrame);
fft->input("frame").set(windowedFrame);
fft->output("fft").set(fftFrame);

// One spectrum (vector of complex bins) per frame.
vector<vector<complex<Real>>> stftResult;

while (true) {
    frameCutter->compute();
    // An empty frame is treated as "no more frames" and ends the loop.
    if (frame.empty()) break;

    windowing->compute();
    fft->compute();
    // Copy the spectrum, because fftFrame is overwritten by the next iteration.
    stftResult.push_back(fftFrame);
}

🔁 实现 ISTFT

由于 Essentia 并未提供完整的 Overlap-Add 封装，我们需要仿照 librosa 逻辑手动实现。

cpp 复制代码

Algorithm* ifft = factory.create("IFFT", "size", frameSize);

vector<Real> window = computeLibrosaHann(frameSize);
Real windowSum = 0.0;
for (int i = 0; i < hopSize; ++i) {
    windowSum += window[i] * window[i];
}
const Real compensation = 1.0 / windowSum;

vector<Real> reconstructedAudio(originalLength, 0.0);
vector<Real> ifftOutputFrame(frameSize);

for (int i = 0; i < stftResult.size(); ++i) {
    ifft->input("fft").set(stftResult[i]);
    ifft->output("frame").set(ifftOutputFrame);
    ifft->compute();

    int pos = i * hopSize;
    for (int n = 0; n < frameSize && pos + n < reconstructedAudio.size(); ++n) {
        reconstructedAudio[pos + n] += ifftOutputFrame[n] * window[n] * compensation;
    }
}

🧹 后处理与音频保存

cpp 复制代码

// Post-processing: subtract the mean, then rescale so the peak magnitude is 0.99.
removeDCOffset(reconstructedAudio);
conservativeNormalize(reconstructedAudio, 0.99);

// Save the audio: "MonoWriter" is configured with the output path and a
// sample rate of 16000, matching the value given to the loader.
Algorithm* writer = factory.create("MonoWriter",
                                   "filename", "./reconstructed.wav",
                                   "sampleRate", 16000);
writer->input("audio").set(reconstructedAudio);
writer->compute();
delete writer;

🧪 代码完整示例

cpp 复制代码

#include <algorithm>
#include <cmath>
#include <complex>
#include <exception>
#include <iostream>
#include <memory>
#include <numeric>
#include <string>
#include <vector>

#include <Eigen/Dense>
#include <unsupported/Eigen/CXX11/Tensor>
#include <essentia/algorithmfactory.h>
#include <essentia/essentia.h>
#include <essentia/pool.h>

using namespace std;
using namespace essentia;
using namespace essentia::standard;

// Builds a Hann window of `size` samples: w[i] = sin^2(pi * i / (size - 1)).
//
// This is the symmetric form (the denominator is size - 1), so for size >= 2
// the first and last samples are 0.
// NOTE(review): librosa's default "hann" is the periodic form (denominator
// `size`); the symmetric form is kept here because callers pair this window
// with Essentia's Windowing output — confirm which one is wanted.
//
// @param size  Window length in samples.
// @return      The window; empty when size <= 0, and {1} when size == 1
//              (the formula would otherwise evaluate 0/0 and yield NaN).
vector<Real> computeLibrosaHann(int size) {
    if (size <= 0) {
        return {};
    }

    vector<Real> window(size);
    if (size == 1) {
        window[0] = 1;
        return window;
    }

    // Spelled out instead of M_PI, which is not part of standard C++.
    const double kPi = 3.14159265358979323846;
    for (int i = 0; i < size; ++i) {
        const double s = std::sin(kPi * i / (size - 1));
        window[i] = static_cast<Real>(s * s);
    }
    return window;
}

// Removes the DC offset in place: the arithmetic mean of the buffer is
// computed (summing in double precision) and subtracted from every sample.
void removeDCOffset(vector<Real>& audio) {
    double total = 0.0;
    for (const Real value : audio) {
        total += value;
    }

    const Real mean = total / audio.size();
    for (size_t idx = 0; idx < audio.size(); ++idx) {
        audio[idx] -= mean;
    }
}

// Peak-normalizes the buffer in place so its largest absolute sample equals
// `targetPeak`.
//
// @param audio       Samples to rescale; an empty buffer is left untouched
//                    (taking the maximum of an empty range would be undefined).
// @param targetPeak  Desired peak magnitude after scaling (default 0.99).
//
// Buffers whose peak magnitude is not above 1e-6 are treated as silence and
// left unchanged, which also avoids dividing by zero.
void conservativeNormalize(vector<Real>& audio, Real targetPeak = 0.99) {
    if (audio.empty()) {
        return;
    }

    Real peak = 0;
    for (const Real sample : audio) {
        peak = std::max(peak, std::abs(sample));
    }

    if (peak > 1e-6) {
        // The factor is identical for every sample, so compute it once.
        const Real scale = targetPeak / peak;
        for (auto& sample : audio) {
            sample *= scale;
        }
    }
}

int main() {

    // 初始化Essentia
    essentia::init();
    AlgorithmFactory& factory = standard::AlgorithmFactory::instance();
    
    // 1. 加载音频
    Algorithm* loader = factory.create("MonoLoader",
                                     "filename", "/work/000002.wav",
                                     "sampleRate", 16000);
    vector<Real> audio;
    loader->output("audio").set(audio);
    loader->compute();
    delete loader;
    cout << "Loaded audio with " << audio.size() << " samples" << endl;

    // 2. STFT参数设置
    const int frameSize = 320;      // 与librosa默认值一致
    const int hopSize = 80;        // 25%重叠
    const int originalLength = audio.size();
    const string windowType = "hann";

    // 3. STFT处理
    Algorithm* frameCutter = factory.create("FrameCutter",
                                          "frameSize", frameSize,
                                          "hopSize", hopSize);
    Algorithm* windowing = factory.create("Windowing",
                                        "type", windowType,
                                        "normalized", false,
                                        "zeroPhase", false);
    Algorithm* fft = factory.create("FFT", "size", frameSize);

    frameCutter->input("signal").set(audio);
    vector<Real> frame, windowedFrame;
    frameCutter->output("frame").set(frame);
    windowing->input("frame").set(frame);
    windowing->output("frame").set(windowedFrame);
    
    vector<complex<Real>> fftFrame;
    fft->input("frame").set(windowedFrame);
    fft->output("fft").set(fftFrame);

    vector<vector<complex<Real>>> stftResult;

    // STFT处理循环
    while (true) {
        frameCutter->compute();
        if (frame.empty()) break;

        windowing->compute();
        fft->compute();
        stftResult.push_back(fftFrame);
    }

    delete frameCutter;
    delete windowing;
    delete fft;

    cout << "STFT completed. Frames: " << stftResult.size() 
         << ", Bins: " << (stftResult.empty() ? 0 : stftResult[0].size()) << endl;

    // 4. ISTFT处理（librosa风格）
    Algorithm* ifft = factory.create("IFFT", "size", frameSize);
    
    // 计算窗函数和补偿因子
    vector<Real> window = computeLibrosaHann(frameSize);
    Real windowSum = 0.0;
    for (int i = 0; i < hopSize; ++i) {
        windowSum += window[i] * window[i];
    }
    const Real compensation = 1.0 / windowSum;

    // 重建音频初始化
    vector<Real> reconstructedAudio(originalLength, 0.0);
    vector<Real> ifftOutputFrame(frameSize);

    // 处理每一帧
    for (int i = 0; i < stftResult.size(); ++i) {
        // IFFT变换
        ifft->input("fft").set(stftResult[i]);
        ifft->output("frame").set(ifftOutputFrame);
        ifft->compute();

        // 重叠相加（librosa风格）
        int pos = i * hopSize;
        for (int n = 0; n < frameSize && pos + n < reconstructedAudio.size(); ++n) {
            reconstructedAudio[pos + n] += ifftOutputFrame[n] * window[n] * compensation;
        }
    }

    delete ifft;

    // 5. 后处理
    removeDCOffset(reconstructedAudio);
    conservativeNormalize(reconstructedAudio, 0.99);

    // 6. 结果验证
    // 计算RMS能量比
    auto computeRMS = [](const vector<Real>& x) {
        return sqrt(accumulate(x.begin(), x.end(), 0.0, 
                  [](Real sum, Real val) { return sum + val*val; }) / x.size());
    };
    
    Real originalRMS = computeRMS(audio);
    Real reconstructedRMS = computeRMS(reconstructedAudio);
    cout << "Volume ratio (reconstructed/original): " 
         << reconstructedRMS / originalRMS << endl;

    // 7. 保存结果
    Algorithm* writer = factory.create("MonoWriter",
                                     "filename", "./reconstructed.wav",
                                     "sampleRate", 16000);
    writer->input("audio").set(reconstructedAudio);
    writer->compute();
    delete writer;

    essentia::shutdown();
    return 0;
}

🧩 CMake 配置示例

Essentia 依赖众多第三方库，下面是一个完整的 CMakeLists.txt 配置参考：

cmake 复制代码

# Build script for the gcrn_cpp executable, which links against Essentia and
# the third-party libraries located through pkg-config.
cmake_minimum_required(VERSION 3.10)
project(gcrn_cpp)

set(CMAKE_CXX_STANDARD 17)

# Project sources, bundled headers and prebuilt libraries.
file(GLOB_RECURSE CORE_SOURCE_FILES ${CMAKE_CURRENT_LIST_DIR}/source/*.cpp)
include_directories(${CMAKE_CURRENT_LIST_DIR}/3rdparty)
link_directories(${CMAKE_CURRENT_LIST_DIR}/libs/)

# Prepend the Essentia third-party pkg-config directory to PKG_CONFIG_PATH.
set(ENV{PKG_CONFIG_PATH} "/essentia-master/packaging/debian_3rdparty/lib/pkgconfig:$ENV{PKG_CONFIG_PATH}")
find_package(PkgConfig REQUIRED)

# Locate the dependencies.
pkg_check_modules(SWRESAMPLE REQUIRED libswresample)
pkg_check_modules(AVCODEC REQUIRED libavcodec)
pkg_check_modules(AVFORMAT REQUIRED libavformat)
# AVUTIL_LIBRARIES is linked below, so libavutil has to be looked up as well;
# without this call the variable expands to nothing.
pkg_check_modules(AVUTIL REQUIRED libavutil)
pkg_check_modules(SAMPLERATE REQUIRED samplerate)
pkg_check_modules(FFTW3 REQUIRED fftw3f)
pkg_check_modules(CHROMAPRINT REQUIRED libchromaprint)
pkg_check_modules(TAGLIB REQUIRED taglib)
pkg_check_modules(YAML REQUIRED yaml-0.1)
pkg_check_modules(EIGEN3 REQUIRED eigen3)

include_directories(${EIGEN3_INCLUDE_DIRS})
link_directories(${SAMPLERATE_LIBRARY_DIRS})

add_executable(gcrn_cpp main.cpp ${CORE_SOURCE_FILES})
target_link_libraries(gcrn_cpp PRIVATE
    essentia
    ${CHROMAPRINT_LIBRARIES}
    ${SWRESAMPLE_LIBRARIES}
    ${SAMPLERATE_LIBRARIES}
    ${AVFORMAT_LIBRARIES}
    ${AVCODEC_LIBRARIES}
    ${AVUTIL_LIBRARIES}
    ${FFTW3_LIBRARIES}
    ${TAGLIB_LIBRARIES}
    ${YAML_LIBRARIES}
    pthread dl m z
)

📚 参考资料

官方仓库：https://github.com/MTG/essentia
FAQ 页面：https://essentia.upf.edu/FAQ.html （Frequently Asked Questions — Essentia 2.1-beta6-dev documentation）
Librosa STFT 文档：https://librosa.org/doc/main/generated/librosa.stft.html