采用sherpa-onnx 实现 ios语音唤起的调研

背景

项目中需要实现一个语音唤起的功能,选择sherpa-onnx进行调研,要求各端都要验证没有问题。个人负责ios这部分的调研。查询官方文档发现没有直接针对ios语音唤起的说明,主流技术社区也没有相关实现可以借鉴。经过调研之后补充了一个ios端的实现。

一、下载 sherpa-onnx

https://github.com/k2-fsa/sherpa-onnx

二、构建 iOS 版 Sherpa-onnx

根据官方说明文档,先把项目构建编译起来。地址:Build sherpa-onnx for iOS 。根据文档要求安装环境,大约需要两个小时左右。最终的效果是代码可以编译起来并且可以正常运行demo。

demo构建起来只是实现了语音转文字功能,还没有语音唤起能力。

三、补充语音唤起能力

目前官方文档只有安卓语音唤起的说明文档,但是总文档中说支持ios系统。那么原理应该是相通的,在ios的demo中搜索关键词 Keyword 发现,已经实现相关api。那么只要找到语音唤起模型并实现调用即可。

以下是构建成功之后的demo源码地址:

1、sherpa-onnx-ios-demo 。构建后的 ios-swift demo。构建后的全部内容有好几个G,因此只提交了部分内容,该demo需要在第二步运行成功之后,把该demo与源目录替换即可编译运行。

2、ios的语音模型、自定义关键词模型(用于语音唤起能力)

四、demo介绍

1、目录结构

2、sdk调用

Model.swift

swift 复制代码
// 在Model.swift中添加
/// Builds the online transducer (zipformer2) model configuration used by the
/// keyword spotter. All resources must be bundled with the app.
func getKwsConfig() -> SherpaOnnxOnlineModelConfig {
    // Encoder/decoder/joiner of the KWS zipformer2 transducer model.
    let transducer = sherpaOnnxOnlineTransducerModelConfig(
        encoder: getResource("encoder-epoch-12-avg-2-chunk-16-left-64", "onnx"),
        decoder: getResource("decoder-epoch-12-avg-2-chunk-16-left-64", "onnx"),
        joiner: getResource("joiner-epoch-12-avg-2-chunk-16-left-64", "onnx")
    )

    return sherpaOnnxOnlineModelConfig(
        tokens: getResource("kws_tokens", "txt"),
        transducer: transducer,
        numThreads: 2,
        modelType: "zipformer2"
    )
}

/// Path to the keywords file consumed by the keyword spotter.
/// The bundled file contains the wake phrase "小苗小苗".
func getKeywordsFilePath() -> String {
    let path = getResource("keywords", "txt")
    return path
}

/// Please refer to
/// https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
/// to download pre-trained models

/// sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20 (Bilingual, Chinese + English)
/// https://k2-fsa.github.io/sherpa/onnx/pretrained_models/zipformer-transducer-models.html
func getBilingualStreamZhEnZipformer20230220() -> SherpaOnnxOnlineModelConfig {
  // Streaming zipformer transducer components bundled with the app.
  let transducer = sherpaOnnxOnlineTransducerModelConfig(
    encoder: getResource("encoder-epoch-99-avg-1", "onnx"),
    decoder: getResource("decoder-epoch-99-avg-1", "onnx"),
    joiner: getResource("joiner-epoch-99-avg-1", "onnx")
  )

  return sherpaOnnxOnlineModelConfig(
    tokens: getResource("tokens", "txt"),
    transducer: transducer,
    numThreads: 1,
    modelType: "zipformer"
  )
}

ViewController.swift

swift 复制代码
//
//  ViewController.swift
//  SherpaOnnx
//
//  Created by fangjun on 2023/1/28.
//

import AVFoundation
import UIKit

extension AudioBuffer {
    /// Copies the buffer's float samples into a Swift array.
    func array() -> [Float] {
        let samples = UnsafeBufferPointer<Float>(self)
        return [Float](samples)
    }
}

extension AVAudioPCMBuffer {
    /// Flattens the first (mono) channel buffer into a float array.
    func array() -> [Float] {
        let monoBuffer = self.audioBufferList.pointee.mBuffers
        return monoBuffer.array()
    }
}

/// Demo view controller that feeds microphone audio into two sherpa-onnx
/// pipelines at once: a keyword spotter (wake word "小苗小苗") and a streaming
/// speech recognizer (ASR). ASR output is only processed after wake-up or
/// after the user presses Start.
class ViewController: UIViewController {
    @IBOutlet weak var resultLabel: UILabel!
    @IBOutlet weak var recordBtn: UIButton!

    var audioEngine: AVAudioEngine? = nil
    var recognizer: SherpaOnnxRecognizer! = nil
    // Keyword-wakeup (KWS) related properties.
    var kwsSpotter: SherpaOnnxKeywordSpotterWrapper!
    var isAwakened = false

    /// It saves the decoded results so far
    var sentences: [String] = [] {
        didSet {
            updateLabel()
        }
    }
    // The sentence currently being decoded (not yet finalized by an endpoint).
    var lastSentence: String = ""
    // Only the most recent `maxSentence` finalized sentences are rendered.
    let maxSentence: Int = 20
    // Renders the finalized sentences plus the in-progress one as numbered lines.
    var results: String {
        if sentences.isEmpty && lastSentence.isEmpty {
            return ""
        }
        if sentences.isEmpty {
            return "0: \(lastSentence.lowercased())"
        }

        let start = max(sentences.count - maxSentence, 0)
        if lastSentence.isEmpty {
            return sentences.enumerated().map { (index, s) in "\(index): \(s.lowercased())" }[start...]
                .joined(separator: "\n")
        } else {
            return sentences.enumerated().map { (index, s) in "\(index): \(s.lowercased())" }[start...]
                .joined(separator: "\n") + "\n\(sentences.count): \(lastSentence.lowercased())"
        }
    }

    // Pushes the current transcript to the label on the main thread.
    func updateLabel() {
        DispatchQueue.main.async {
            self.resultLabel.text = self.results
        }
    }

    override func viewDidLoad() {
        super.viewDidLoad()
        // Do any additional setup after loading the view.

        resultLabel.text = "ASR with Next-gen Kaldi\n\nSee https://github.com/k2-fsa/sherpa-onnx\n\nPress the Start button to run!"
        recordBtn.setTitle("Start", for: .normal)
        initRecognizer()
        initKeywordSpotter()
        initRecorder()
        // Start capturing immediately so the keyword spotter can listen for the
        // wake word before the user presses Start.
        startRecorder()
    }
    
    // Initialize the keyword spotter (wake-word detection).
    func initKeywordSpotter() {
        let modelConfig = getKwsConfig()
        let featConfig = sherpaOnnxFeatureConfig()

        // NOTE(review): the paths/blanks/score/threshold values below look
        // empirically tuned — confirm against the sherpa-onnx KWS docs.
        var config = sherpaOnnxKeywordSpotterConfig(
            featConfig: featConfig,
            modelConfig: modelConfig,
            keywordsFile: getKeywordsFilePath(),
            maxActivePaths: 4,
            numTrailingBlanks: 2,
            keywordsScore: 1.5,
            keywordsThreshold: 0.25
        )
        
        kwsSpotter = SherpaOnnxKeywordSpotterWrapper(config: &config)
    }
    
    // Whether the audio engine has been started (by button press or wake-up).
    private var isRecording = false
    @IBAction func onRecordBtnClick(_ sender: UIButton) {
        if recordBtn.currentTitle == "Start" {
            isAwakened = true // force the awakened state so ASR runs immediately
            startRecorder()
            recordBtn.setTitle("Stop", for: .normal)
        } else {
            isAwakened = false
            stopRecorder()
            recordBtn.setTitle("Start", for: .normal)
        }
    }
    
    // Initialize the streaming speech-to-text (ASR) model.
    func initRecognizer() {
        // Please select one model that is best suitable for you.
        //
        // You can also modify Model.swift to add new pre-trained models from
        // https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html

         let modelConfig = getBilingualStreamZhEnZipformer20230220()
        // let modelConfig = getZhZipformer20230615()
        // let modelConfig = getEnZipformer20230626()
//        let modelConfig = getBilingualStreamingZhEnParaformer()

        let featConfig = sherpaOnnxFeatureConfig(
            sampleRate: 16000,
            featureDim: 80)

        var config = sherpaOnnxOnlineRecognizerConfig(
            featConfig: featConfig,
            modelConfig: modelConfig,
            enableEndpoint: true,
            rule1MinTrailingSilence: 2.4,
            rule2MinTrailingSilence: 0.8,
            rule3MinUtteranceLength: 30,
            decodingMethod: "greedy_search",
            maxActivePaths: 4
        )
        recognizer = SherpaOnnxRecognizer(config: &config)
    }

    // Installs a tap on the microphone input, converts each captured buffer to
    // 16 kHz mono Float32 (the format both models are configured for), and
    // feeds the samples to the keyword spotter and — once awakened — the ASR.
    func initRecorder() {
        print("init recorder")
        audioEngine = AVAudioEngine()
        let inputNode = self.audioEngine?.inputNode
        let bus = 0
        let inputFormat = inputNode?.outputFormat(forBus: bus)
        let outputFormat = AVAudioFormat(
            commonFormat: .pcmFormatFloat32,
            sampleRate: 16000, channels: 1,
            interleaved: false)!

        let converter = AVAudioConverter(from: inputFormat!, to: outputFormat)!

        inputNode!.installTap(
            onBus: bus,
            bufferSize: 1024,
            format: inputFormat
        ) {
            (buffer: AVAudioPCMBuffer, when: AVAudioTime) in
            var newBufferAvailable = true

            // Hands the captured buffer to the converter exactly once per tap.
            let inputCallback: AVAudioConverterInputBlock = {
                inNumPackets, outStatus in
                if newBufferAvailable {
                    outStatus.pointee = .haveData
                    newBufferAvailable = false

                    return buffer
                } else {
                    outStatus.pointee = .noDataNow
                    return nil
                }
            }

            // Output capacity scaled by the sample-rate ratio so the
            // resampled frames fit.
            let convertedBuffer = AVAudioPCMBuffer(
                pcmFormat: outputFormat,
                frameCapacity:
                    AVAudioFrameCount(outputFormat.sampleRate)
                * buffer.frameLength
                / AVAudioFrameCount(buffer.format.sampleRate))!

            var error: NSError?
            let _ = converter.convert(
                to: convertedBuffer,
                error: &error, withInputFrom: inputCallback)

            // TODO(fangjun): Handle status != haveData
            let array = convertedBuffer.array()
            // Process keyword detection and ASR in parallel on the same samples.
            // NOTE(review): this closure runs on an audio thread; it mutates
            // state (isAwakened, wakeupConfidence, lastSentence) that is also
            // touched from the main thread — confirm thread-safety.
            self.processKeywordDetection(array)
            
            if self.isAwakened || self.isRecording {
                self.processASR(array)
            }
        }
    }
    
    // Debounce counter: the wake word must win `requiredConfidence` decode
    // results (net of misses) before the wake-up event fires.
    var wakeupConfidence = 0
    let requiredConfidence = 2
    // Feeds samples to the keyword spotter and fires the wake-up event once
    // the confidence counter reaches the threshold.
    private func processKeywordDetection(_ samples: [Float]) {
        kwsSpotter.acceptWaveform(samples: samples)
        while kwsSpotter.isReady() {
            kwsSpotter.decode()
            let result = kwsSpotter.getResult()
            let detected = result.keyword.lowercased().contains("小苗小苗")
            print("demo----> result.keyword:\(result.keyword)    detected:\(detected)")
            // Hits increment the counter (capped); misses decrement (floored at 0).
            wakeupConfidence = detected ?
            min(wakeupConfidence + 1, requiredConfidence) :
            max(wakeupConfidence - 1, 0)
            
            if wakeupConfidence >= requiredConfidence && !isAwakened {
                handleWakeupEvent()
                wakeupConfidence = 0
            }
        }
    }
    
    // Feeds samples to the recognizer, updates the in-progress sentence, and
    // finalizes it into `sentences` when the endpoint detector fires.
    private func processASR(_ samples: [Float]) {
        if !samples.isEmpty {
            recognizer.acceptWaveform(samples: samples)
            while (recognizer.isReady()){
                recognizer.decode()
            }
            let isEndpoint = recognizer.isEndpoint()
            let text = recognizer.getResult().text
            
            if !text.isEmpty && lastSentence != text {
                lastSentence = text
                updateLabel()
                print(text)
            }
            
            if isEndpoint {
                if !text.isEmpty {
                    let tmp = self.lastSentence
                    lastSentence = ""
                    sentences.append(tmp)
                }
                recognizer.reset()
            }
        }
    }
    
    // Called once the wake word has been confirmed: flips the awakened flag,
    // mirrors a Start-button press on the main thread, and clears ASR state.
    private func handleWakeupEvent() {
        isAwakened = true
        DispatchQueue.main.async {
//            self.resultLabel.text = "已唤醒,请开始说话..."
            // Optionally auto-start recording (mirrors the Start button).
            if self.recordBtn.currentTitle == "Start" {
                self.startRecorder()
                self.recordBtn.setTitle("Stop", for: .normal)
            }
        }
        // Reset the ASR state so recognition starts fresh after wake-up.
        recognizer.reset()
    }

    // Clears the transcript and starts the audio engine.
    func startRecorder() {
        lastSentence = ""
        sentences = []

        do {
            try self.audioEngine?.start()
            isRecording = true
        } catch let error as NSError {
            print("Got an error starting audioEngine: \(error.domain), \(error)")
        }
        print("started")
    }

    // Stops the audio engine and resets both decoding pipelines.
    func stopRecorder() {
        audioEngine?.stop()
        kwsSpotter.reset()
        recognizer.reset()
        isRecording = false
        print("stopped")
    }
}
相关推荐
吴佳浩1 天前
OpenClaw macOS 完整安装与本地模型配置教程(实战版)
人工智能·macos·agent
开心就好20252 天前
iOS App 安全加固流程记录,代码、资源与安装包保护
后端·ios
开心就好20252 天前
iOS App 性能测试工具怎么选?使用克魔助手(Keymob)结合 Instruments 完成
后端·ios
zhongjiahao3 天前
面试常问的 RunLoop,到底在Loop什么?
ios
wvy4 天前
iOS 26手势返回到根页面时TabBar的动效问题
ios
RickeyBoy4 天前
iOS 图片取色完全指南:从像素格式到工程实践
ios
aiopencode4 天前
使用 Ipa Guard 命令行版本将 IPA 混淆接入自动化流程
后端·ios
二流小码农5 天前
鸿蒙开发:路由组件升级,支持页面一键创建
android·ios·harmonyos
vi_h5 天前
在 macOS 上通过 Docker 安装并运行 Ollama(详细可执行教程)
macos·docker·ollama
iceiceiceice5 天前
iOS PDF阅读器段评实现:如何从 PDFSelection 精准还原一个自然段
前端·人工智能·ios