基于微软认知服务的文本转语音解决方案：JavaScript实现

引言和选型

最近我一直专注于大型模型相关的项目。有一天使用小度音响时，我突然产生了一个想法：能不能将大模型的输出结果也通过语音播报出来？这促使我开始探索各种文本转语音（TTS）服务，包括Google Text-to-Speech、IBM Watson和Amazon Polly等。经过一番比较，我发现微软认知服务提供的TTS功能给我留下了最舒适的听觉体验。除了出色的音质，该服务还支持多种语言选项，提供了丰富的定制能力，并且与Azure云服务有优秀的集成性。因此，我最终选择了微软认知服务作为我的TTS解决方案。

TextToSpeech类：核心实现

下面是我们用于实现文本转语音功能的JavaScript类TextToSpeech。

类结构和构造函数

javascript 复制代码

/**
 * @file TextToSpeech.js
 * @description 该文件包含 TextToSpeech 类，用于将文本转换为语音。
 * @version 1.0.0
 */

import * as sdk from 'microsoft-cognitiveservices-speech-sdk';
import axios from 'axios';

// 选项映射表，用于将选项名称映射到 Speech SDK 属性 ID 或 speechSynthesis 属性名称
// Lookup table from user-facing option names to their configuration target.
// A string `prop` is assigned directly as a property on the SpeechConfig
// object; any other `prop` is passed to SpeechConfig.setProperty() as a
// Speech SDK property ID.
// NOTE(review): `rate` is bound to ..._SsmlMaxRate and `pitch` to
// ..._SsmlMinRate; those identifiers look mismatched for their keys —
// confirm all three PropertyId members exist in the SDK version in use.
const OPTION_MAP = {
  language: { prop: 'speechSynthesisLanguage' }, // synthesis language
  voiceName: { prop: 'speechSynthesisVoiceName' }, // voice name
  outputFormat: { prop: 'speechSynthesisOutputFormat' }, // audio output format
  rate: { prop: sdk.PropertyId.SpeechServiceConnection_SynthSpeak_SsmlMaxRate }, // speaking rate
  pitch: { prop: sdk.PropertyId.SpeechServiceConnection_SynthSpeak_SsmlMinRate }, // pitch
  volume: { prop: sdk.PropertyId.SpeechServiceConnection_SynthVolume } // volume
};

/**
 * TextToSpeech 类，用于将文本转换为语音。
 */
export default class TextToSpeech {
  /**
   * Creates a TextToSpeech instance.
   * @param {string} subscriptionKey - Azure Cognitive Services subscription key.
   * @param {string} serviceRegion - Azure Cognitive Services region.
   * @param {number} bufferSize - Number of buffered characters that triggers
   *   synthesis in streaming mode. Small values call the TTS service more often.
   * @param {Object} options - Optional settings, applied through configure().
   */
  constructor(subscriptionKey, serviceRegion, bufferSize = 10, options = {}) {
    this.subscriptionKey = subscriptionKey;
    this.serviceRegion = serviceRegion;
    this.speechConfig = sdk.SpeechConfig.fromSubscription(subscriptionKey, serviceRegion);
    // rate / pitch / volume values that generateSsml() turns into <prosody>.
    this.prosody = {};
    this.configure(options);
    this.voices = [];
    // Voices matching the configured language; filled in by init().
    this.languageVoices = [];
    this.bufferSize = bufferSize;
    // Retry bookkeeping used by speak() / retryOrReject().
    this.retryCount = 0;
    this.maxRetries = 3;
    // Tail of the promise chain that serialises buffered speak() calls.
    this.speakQueue = Promise.resolve();

    // The streaming buffer lives in an object so a Proxy can watch writes to
    // its `text` property and trigger synthesis once it is long enough.
    this.bufferObj = new Proxy({ text: '' }, {
      set: (target, property, value) => {
        if (property === 'text') {
          target[property] = value;
          if (value.length > 0 && value.length >= this.bufferSize) {
            // Empty the buffer BEFORE speaking: text appended while synthesis
            // is running must start a new buffer instead of being discarded
            // (or re-spoken) when the request completes.
            target[property] = '';
            this.enqueueSpeak(value);
          }
        }
        return true;
      }
    });
  }

  /**
   * Appends text to the streaming buffer. Synthesis starts automatically when
   * the buffer reaches bufferSize characters.
   * @param {string} text - Text to append; null/undefined is ignored.
   */
  addToBuffer(text) {
    if (text === undefined || text === null) {
      return;
    }
    this.bufferObj.text += String(text); // goes through the Proxy setter
  }

  /**
   * Speaks whatever is left in the streaming buffer, even if it is shorter
   * than bufferSize. Call this when the text stream has ended.
   * @returns {Promise<void>} Settles once every queued buffer has been processed.
   */
  flush() {
    const pending = this.bufferObj.text;
    if (pending.length > 0) {
      this.bufferObj.text = '';
      this.enqueueSpeak(pending);
    }
    return this.speakQueue;
  }

  /**
   * Queues text for synthesis after all previously queued text.
   * Failures are logged rather than thrown so one failed chunk does not block
   * the chunks queued behind it.
   * @param {string} text - Text to speak.
   * @returns {Promise<void>} The updated queue tail.
   */
  enqueueSpeak(text) {
    this.speakQueue = this.speakQueue
      .then(() => this.speak(text))
      .catch((error) => {
        console.error('Failed to speak buffered text:', error);
      });
    return this.speakQueue;
  }

  /**
   * Creates and initialises a TextToSpeech instance.
   * @param {string} subscriptionKey - Azure Cognitive Services subscription key.
   * @param {string} serviceRegion - Azure Cognitive Services region.
   * @param {number} bufferSize - Streaming buffer threshold; the constructor
   *   default applies when undefined.
   * @param {Object} options - Optional settings, applied through configure().
   * @returns {Promise<TextToSpeech>} Resolves with the initialised instance.
   */
  static async build(subscriptionKey, serviceRegion, bufferSize, options = {}) {
    const instance = new TextToSpeech(subscriptionKey, serviceRegion, bufferSize, options);
    await instance.init();
    return instance;
  }

  /**
   * Validates the credentials, downloads the voice list and stores the voices
   * matching the configured language in `languageVoices`.
   * @returns {Promise<void>}
   * @throws {Error} When the key or region is missing, or the voice list
   *   cannot be fetched.
   */
  async init() {
    if (!this.subscriptionKey || !this.serviceRegion) {
      console.error('Invalid configuration: subscriptionKey and serviceRegion are required.');
      throw new Error('Invalid configuration');
    }

    try {
      await this.fetchVoices();
      this.languageVoices = this.filterVoicesByLanguage(this.speechConfig.speechSynthesisLanguage);
    } catch (error) {
      console.error('Failed to initialize TextToSpeech:', error);
      throw error;
    }
  }

  /**
   * Releases resources held by this instance.
   */
  dispose() {
    if (this.speechConfig && typeof this.speechConfig.close === 'function') {
      this.speechConfig.close();
    }
    console.log('Resources are released.');
  }

  /**
   * Downloads the list of voices available in the configured region.
   * @returns {Promise<Array>} Resolves with the voice list (also stored in `voices`).
   * @throws {Error} Re-throws any request error after logging it.
   */
  async fetchVoices() {
    try {
      const url = `https://${this.serviceRegion}.tts.speech.microsoft.com/cognitiveservices/voices/list`;
      const headers = { 'Ocp-Apim-Subscription-Key': this.subscriptionKey };
      const response = await axios.get(url, { headers });
      this.voices = Array.isArray(response.data) ? response.data : [];
      return this.voices;
    } catch (error) {
      console.error('Failed to fetch voices:', error);
      throw error;
    }
  }

  /**
   * Returns the voices whose Locale starts with the given language code.
   * @param {string} language - Language code or prefix, e.g. "zh" or "zh-CN".
   *   When empty or undefined, a copy of the full list is returned.
   * @returns {Array} The matching voices.
   */
  filterVoicesByLanguage(language) {
    if (!language) {
      return [...this.voices];
    }
    return this.voices.filter(
      voice => typeof voice.Locale === 'string' && voice.Locale.startsWith(language)
    );
  }

  /**
   * Applies configuration options. Keys are resolved through OPTION_MAP;
   * unknown keys and null/undefined values are skipped with a warning.
   * rate, pitch and volume are additionally remembered so generateSsml() can
   * emit them as a <prosody> element, because speak() always sends explicit SSML.
   * @param {Object} options - Option name/value pairs.
   * @throws {Error} When options is not an object.
   */
  configure(options = {}) {
    if (!options || typeof options !== 'object') {
      console.error('Invalid options argument:', options);
      throw new Error('Invalid options argument');
    }
    if (!this.prosody) {
      this.prosody = {};
    }
    Object.keys(options).forEach((key) => {
      const value = options[key];
      // Own-property check so keys like "constructor" are not resolved
      // through Object.prototype.
      const setting = Object.prototype.hasOwnProperty.call(OPTION_MAP, key)
        ? OPTION_MAP[key]
        : undefined;
      if (!setting) {
        console.warn(`Unknown configuration key: ${key}`);
        return;
      }
      if (value === undefined || value === null) {
        console.warn(`Ignoring empty value for configuration key: ${key}`);
        return;
      }
      if (typeof setting.prop === 'string') {
        this.speechConfig[setting.prop] = value;
      } else if (setting.prop !== undefined) {
        // Skipped when the mapped property ID is undefined, so no property
        // is stored under an undefined key.
        this.speechConfig.setProperty(setting.prop, String(value));
      }
      if (key === 'rate' || key === 'pitch' || key === 'volume') {
        this.prosody[key] = String(value);
      }
      console.log(`Configured ${key} as ${value}`);
    });
  }

  /**
   * Escapes a value for use in XML text content or attribute values.
   * @param {*} value - Value to escape; converted with String().
   * @returns {string} The escaped string.
   */
  escapeXml(value) {
    return String(value)
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;')
      .replace(/"/g, '&quot;')
      .replace(/'/g, '&apos;');
  }

  /**
   * Builds the SSML (Speech Synthesis Markup Language) document for a text.
   * The text is treated as plain text and XML-escaped, so characters such as
   * "<" and "&" cannot break the document.
   * @param {string} text - Plain text to synthesise.
   * @param {string} style - Optional speaking style for mstts:express-as.
   * @returns {string} The SSML document.
   */
  generateSsml(text, style = null) {
    // Fall back to en-US rather than emitting xml:lang='undefined'.
    const language = this.escapeXml(this.speechConfig.speechSynthesisLanguage || 'en-US');
    const voiceName = this.escapeXml(this.speechConfig.speechSynthesisVoiceName);
    const prosody = this.prosody || {};
    const prosodyAttributes = ['rate', 'pitch', 'volume']
      .filter(name => prosody[name] !== undefined && prosody[name] !== '')
      .map(name => ` ${name}='${this.escapeXml(prosody[name])}'`)
      .join('');

    // The mstts namespace must be declared for <mstts:express-as> to be valid.
    let ssml = `<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xmlns:mstts='https://www.w3.org/2001/mstts' xml:lang='${language}'>`;
    ssml += `<voice name='${voiceName}'>`;

    if (style) {
      ssml += `<mstts:express-as style="${this.escapeXml(style)}">`;
    }
    if (prosodyAttributes) {
      ssml += `<prosody${prosodyAttributes}>`;
    }

    ssml += this.escapeXml(text);

    if (prosodyAttributes) {
      ssml += `</prosody>`;
    }
    if (style) {
      ssml += `</mstts:express-as>`;
    }

    ssml += `</voice></speak>`;
    return ssml;
  }

  /**
   * Retries a failed synthesis, or rejects once maxRetries is exhausted.
   * The outcome of the retry is forwarded to resolve/reject so the promise of
   * the original speak() call always settles.
   * @param {Error|string} error - Failure reported by the last attempt.
   * @param {string} text - Text being synthesised.
   * @param {string} style - Speaking style passed to speak().
   * @param {function} reject - reject function of the pending promise.
   * @param {function} resolve - Optional resolve function of the pending
   *   promise; defaults to a no-op for callers using the 4-argument form.
   * @returns {Promise<void>}
   */
  async retryOrReject(error, text, style, reject, resolve = () => {}) {
    if (this.retryCount < this.maxRetries) {
      this.retryCount++;
      console.warn(`Synthesis failed, retrying... (${this.retryCount}/${this.maxRetries})`);
      try {
        resolve(await this.speak(text, style)); // retry
      } catch (retryError) {
        reject(retryError);
      }
    } else {
      this.retryCount = 0; // reset for the next speak() call
      reject(new Error(`Synthesis failed after ${this.maxRetries} retries. ${error}`));
    }
  }

  /**
   * Synthesises text to the default speaker output, retrying on failure.
   * @param {string} text - Plain text to synthesise.
   * @param {string} style - Optional speaking style.
   * @returns {Promise<string>} Resolves with "Synthesis succeeded."; rejects
   *   with an Error once all retries have failed.
   */
  async speak(text, style = null) {
    const ssml = this.generateSsml(text, style);
    const audioConfig = sdk.AudioConfig.fromDefaultSpeakerOutput();
    const synthesizer = new sdk.SpeechSynthesizer(this.speechConfig, audioConfig);
    return new Promise((resolve, reject) => {
      try {
        synthesizer.speakSsmlAsync(ssml,
          result => {
            // Close this attempt's synthesizer before a retry creates a new one.
            synthesizer.close();
            if (result.reason === sdk.ResultReason.SynthesizingAudioCompleted) {
              this.retryCount = 0; // reset for the next speak() call
              resolve("Synthesis succeeded.");
            } else {
              this.retryOrReject(result.errorDetails, text, style, reject, resolve);
            }
          },
          error => {
            synthesizer.close();
            this.retryOrReject(error, text, style, reject, resolve);
          });
      } catch (error) {
        synthesizer.close();
        reject(error);
      }
    });
  }

  /**
   * Splits text into chunks of at most maxChunkSize characters, preferring to
   * cut right after the last sentence-ending punctuation mark in each window.
   * Chunks are trimmed and whitespace-only chunks are dropped.
   * @param {string} text - Text to split.
   * @param {number} maxChunkSize - Maximum chunk length, a positive integer.
   * @returns {string[]} The chunks in order.
   */
  splitTextIntoChunks(text, maxChunkSize) {
    const sentenceEnders = '.?!。？！';
    const chunks = [];
    let offset = 0;

    while (offset < text.length) {
      let end = Math.min(offset + maxChunkSize, text.length);

      if (end < text.length) {
        let cut = -1;
        for (let i = end - 1; i >= offset; i--) {
          if (sentenceEnders.includes(text[i])) {
            cut = i + 1;
            break;
          }
        }
        if (cut !== -1) {
          end = cut;
        } else if (end - 1 > offset) {
          // Hard split: do not separate a UTF-16 surrogate pair.
          const lastUnit = text.charCodeAt(end - 1);
          if (lastUnit >= 0xD800 && lastUnit <= 0xDBFF) {
            end -= 1;
          }
        }
      }

      const chunk = text.substring(offset, end).trim();
      if (chunk) {
        chunks.push(chunk);
      }
      offset = end;
    }
    return chunks;
  }

  /**
   * Speaks a large text chunk by chunk, so no single request exceeds
   * maxChunkSize characters. Chunks are spoken sequentially.
   * @param {string} text - Text to synthesise; empty or non-string input is a no-op.
   * @param {number} maxChunkSize - Maximum characters per chunk.
   * @returns {Promise<void>} Resolves after the last chunk has been spoken.
   * @throws {RangeError} When maxChunkSize is not a positive integer.
   */
  async processLargeText(text, maxChunkSize = 5000) {
    if (!Number.isInteger(maxChunkSize) || maxChunkSize <= 0) {
      throw new RangeError('maxChunkSize must be a positive integer');
    }
    if (typeof text !== 'string' || text.length === 0) {
      return;
    }
    for (const chunk of this.splitTextIntoChunks(text, maxChunkSize)) {
      await this.speak(chunk);
    }
  }
}

在构造函数中，我们接收Azure认知服务的subscriptionKey和serviceRegion、用于控制实时转换缓冲区长度的bufferSize（默认值为10），以及一个可选的options对象用于进一步的定制。

初始化和配置

javascript 复制代码

javascriptCopy code
// Simplified excerpt of the init method; the body is elided here.
async init() {
  // Initialize the SDK and the list of available voices
  // ...
}

// Simplified excerpt of the configure method; the rest of the body is elided.
configure(newOptions) {
  // Merge the new options over the stored ones; keys in newOptions win.
  this.options = {...this.options, ...newOptions};
  // Re-configure the SDK
  // ...
}

init方法用于异步初始化服务：校验订阅密钥和区域配置，并获取可用的语音列表。而configure方法则用于在运行时改变TTS的配置。

文本合成和输出

javascript 复制代码

javascriptCopy code
// Simplified excerpt of the speak method; the body is elided here.
async speak(text) {
  // Synthesize the text with the SDK's SpeechSynthesizer
  // ...
}

speak方法是类的核心，用于接收一个文本字符串并调用SDK的SpeechSynthesizer进行语音合成。

优点和可配置项

优点：
1. 高度可定制和扩展。
2. 与Azure云服务天然集成。
3. 支持多种语言和方言。
可配置项：
- voiceName: 选择不同的语音。
- rate: 调整语速。
- pitch: 调整语调。

结论

在这篇文章中，我们详细介绍了如何用JavaScript和微软认知服务实现一个高度可定制和灵活的文本转语音解决方案。该解决方案不仅适用于多种应用场景，而且具有很高的性能和可靠性。选择微软认知服务作为底层平台，进一步确保了该解决方案的长期可维护和扩展性。希望这个实现能为你在开发相关应用时提供有用的参考和启示。