iOS 实现语音实时转文字功能

一、项目 Info.plist 配置

objectivec 复制代码
<key>NSSpeechRecognitionUsageDescription</key>
<string>我们需要使用语音识别功能将您的语音转换为文字,以便进行实时记录。</string>
<key>NSMicrophoneUsageDescription</key>
<string>我们需要访问麦克风以录制您的语音用于识别。</string>

在Objective-C中实现语音实时转文字可依托苹果原生框架完成,核心方案如下:

1. 核心依赖框架

使用 iOS 系统自带的 Speech 框架,无需额外引入第三方 SDK,通过 SFSpeechRecognizer、SFSpeechAudioBufferRecognitionRequest 等核心类即可实现实时语音转写,适配 iOS 10 及以上系统;离线(设备端)识别需要 iOS 13 及以上,且仅在 supportsOnDeviceRecognition 为 YES 的语言和设备上可用。

2. 关键实现步骤

  • 提前申请麦克风权限与语音识别权限,避免权限缺失导致功能异常
  • 初始化SFSpeechRecognizer实例,指定目标识别语言(如简体中文)
  • 配置音频捕获会话,通过SFSpeechAudioBufferRecognitionRequest持续接收音频流
  • 实时回调获取SFSpeechRecognitionResult,提取bestTranscription字段得到实时转写文本

3. 原生方案优势

相比第三方工具,该方案在启用设备端识别(requiresOnDeviceRecognition,iOS 13+)时无网络依赖、延迟更低,可实现系统级的实时字幕、会议实时记录等功能,同时能更好地保障用户语音数据隐私;未启用设备端识别时,音频默认会发送到苹果服务器进行识别,需要网络连接。

二、逻辑实现

1.Speech2TextManager.h

objectivec 复制代码
//
//  Speech2TextManager.h
//  QGB_IM2
//
//  Created by carbonzhao on 2026/7/1.
//

#import <Foundation/Foundation.h>
#import <Speech/Speech.h>
#import <AVFoundation/AVFoundation.h>

NS_ASSUME_NONNULL_BEGIN


/// Singleton that captures microphone audio and streams it to the Speech
/// framework, delivering partial and final transcriptions through blocks.
@interface Speech2TextManager : NSObject

/// Recognizer created for the zh-CN locale. May be nil, because
/// -[SFSpeechRecognizer initWithLocale:] returns nil for unsupported locales.
@property (nonatomic, strong, readonly, nullable) SFSpeechRecognizer *speechRecognizer;

/// YES while the underlying audio engine is running.
@property (nonatomic, assign, readonly) BOOL isRecording;

/// Called for every transcription update. `isFinal` is NO for intermediate
/// hypotheses and YES for the final result of a recognition task.
/// Optional: nil until the client assigns a handler.
@property (nonatomic, copy, nullable) void (^Speech2TextManagerDidReceiveTranscriptionBlock)(Speech2TextManager *manager,NSString *text,BOOL isFinal);

/// Called when recognition cannot start, the recognition task fails, or the
/// recognizer becomes unavailable.
/// Optional: nil until the client assigns a handler.
@property (nonatomic, copy, nullable) void (^Speech2TextManagerDidFailWithErrorBlock)(Speech2TextManager *manager,NSError * _Nonnull error);

/// Returns the process-wide shared manager.
+ (instancetype)sharedInstance;

/// Requests speech-recognition authorization and then microphone permission.
/// @param completion Receives YES only when both were granted. May be nil.
- (void)requestPermissionsWithCompletion:(nullable void(^)(BOOL granted))completion;

/// Starts capturing audio and recognizing speech. Does nothing when a
/// recording is already in progress.
- (void)startRecording;

/// Stops capturing audio and signals the end of audio to the recognizer.
- (void)stopRecording;

@end
NS_ASSUME_NONNULL_END

2.Speech2TextManager.m

objectivec 复制代码
#import "Speech2TextManager.h"

/// Domain used for errors generated by this class (as opposed to errors
/// forwarded from AVFoundation or the Speech framework).
static NSString * const kSpeech2TextManagerErrorDomain = @"Speech2TextManagerError";

@interface Speech2TextManager () <SFSpeechRecognitionTaskDelegate,SFSpeechRecognizerDelegate>

/// Engine whose input node supplies the microphone buffers.
@property (nonatomic, strong) AVAudioEngine *audioEngine;
/// Request currently being fed with audio; nil when idle.
@property (nonatomic, strong) SFSpeechAudioBufferRecognitionRequest *recognitionRequest;
/// Task for the current request; nil when idle.
@property (nonatomic, strong) SFSpeechRecognitionTask *recognitionTask;
/// Serial queue created in -init. It is not referenced anywhere else in this class.
@property (nonatomic, strong) NSOperationQueue *operationQueue;

@end

@implementation Speech2TextManager

/// Returns the lazily created, process-wide shared instance.
+ (instancetype)sharedInstance {
    static Speech2TextManager *instance = nil;
    static dispatch_once_t onceToken;
    dispatch_once(&onceToken,^{
        instance = [[self alloc] init];
    });
    return instance;
}

- (instancetype)init {
    self = [super init];
    if (self) {
        // Recognizer for Simplified Chinese. initWithLocale: returns nil when the
        // locale is unsupported; -startRecording then reports "unavailable".
        NSLocale *locale = [[NSLocale alloc] initWithLocaleIdentifier:@"zh-CN"];
        _speechRecognizer = [[SFSpeechRecognizer alloc] initWithLocale:locale];
        _speechRecognizer.delegate = self;
        
        _audioEngine = [[AVAudioEngine alloc] init];
        _operationQueue = [[NSOperationQueue alloc] init];
        _operationQueue.maxConcurrentOperationCount = 1;
    }
    return self;
}

#pragma mark - Permissions

/// Requests speech-recognition authorization first and, only if that was
/// granted, microphone permission.
/// @param completion Called on the main queue with YES only when both were
///        granted. May be nil.
- (void)requestPermissionsWithCompletion:(void(^)(BOOL granted))completion
{
    [SFSpeechRecognizer requestAuthorization:^(SFSpeechRecognizerAuthorizationStatus status)
    {
        dispatch_async(dispatch_get_main_queue(),^{
            if (status != SFSpeechRecognizerAuthorizationStatusAuthorized)
            {
                if (completion)
                {
                    completion(NO);
                }
                return;
            }
            
            [[AVAudioSession sharedInstance] requestRecordPermission:^(BOOL granted) {
                // The system may invoke this handler on an arbitrary queue; hop to
                // the main queue so both outcomes are delivered consistently.
                dispatch_async(dispatch_get_main_queue(),^{
                    if (completion)
                    {
                        completion(granted);
                    }
                });
            }];
        });
    }];
}

#pragma mark - Recording Control

/// Configures the audio session, creates a recognition request/task and starts
/// feeding microphone buffers into it. Failures are reported through
/// Speech2TextManagerDidFailWithErrorBlock. No-op when already recording.
- (void)startRecording {
    if (self.isRecording) return;
    
    // Also covers a nil recognizer: messaging nil yields NO.
    if (!self.speechRecognizer.isAvailable) {
        [self reportError:[self errorWithCode:-1 description:@"语音识别服务不可用"]];
        return;
    }
    
    // Configure the audio session. Success is determined by the BOOL return
    // values; the NSError out-parameter is only meaningful after a failure.
    AVAudioSession *session = [AVAudioSession sharedInstance];
    NSError *sessionError = nil;
    BOOL sessionReady = [session setCategory:AVAudioSessionCategoryRecord
                                        mode:AVAudioSessionModeMeasurement
                                     options:AVAudioSessionCategoryOptionDuckOthers
                                       error:&sessionError]
                     && [session setActive:YES error:&sessionError];
    if (!sessionReady) {
        [self reportError:sessionError];
        return;
    }
    
    // installTapOnBus: raises an exception for a format without a sample rate or
    // channels (e.g. no usable input device), so reject it up front.
    AVAudioInputNode *inputNode = self.audioEngine.inputNode;
    AVAudioFormat *recordingFormat = [inputNode outputFormatForBus:0];
    if (recordingFormat.sampleRate <= 0 || recordingFormat.channelCount == 0) {
        [self tearDownRecognition];
        [self reportError:[self errorWithCode:-3 description:@"音频输入不可用"]];
        return;
    }
    
    // Create the request with partial results enabled for live transcription.
    SFSpeechAudioBufferRecognitionRequest *request = [[SFSpeechAudioBufferRecognitionRequest alloc] init];
    request.shouldReportPartialResults = YES;
    self.recognitionRequest = request;
    
    // Start the recognition task; results arrive through the task delegate methods.
    self.recognitionTask = [self.speechRecognizer recognitionTaskWithRequest:request delegate:self];
    
    // Only one tap may exist per bus; drop any leftover before installing.
    [inputNode removeTapOnBus:0];
    // Capture the request rather than self: the engine owns the tap block, so
    // capturing self would create a self -> engine -> tap -> self retain cycle.
    [inputNode installTapOnBus:0 bufferSize:1024 format:recordingFormat block:^(AVAudioPCMBuffer * _Nonnull buffer, AVAudioTime * _Nonnull when) {
        [request appendAudioPCMBuffer:buffer];
    }];
    
    [self.audioEngine prepare];
    NSError *engineError = nil;
    if (![self.audioEngine startAndReturnError:&engineError]) {
        // The engine never started, so isRecording is NO and -stopRecording could
        // skip the cleanup; tear down unconditionally instead.
        [self tearDownRecognition];
        [self reportError:engineError];
    }
}

/// Stops audio capture and ends the current recognition request. No-op when
/// nothing is recording and no request is pending.
- (void)stopRecording {
    // Also clean up when the engine stopped by itself but a request is still
    // held, otherwise the task would wait for audio indefinitely.
    if (!self.isRecording && self.recognitionRequest == nil) return;
    
    [self tearDownRecognition];
}

/// YES while the audio engine is running.
- (BOOL)isRecording {
    return self.audioEngine.isRunning;
}

#pragma mark - Private helpers

/// Removes the tap, stops the engine, ends the request's audio and releases the
/// request/task. Safe to call regardless of the engine's running state.
- (void)tearDownRecognition {
    [self.audioEngine.inputNode removeTapOnBus:0];
    [self.audioEngine stop];
    [self.recognitionRequest endAudio];
    
    self.recognitionTask = nil;
    self.recognitionRequest = nil;
    
    // Best effort: release the session so audio from other apps (ducked via
    // DuckOthers) can resume. A failure here does not affect recognition, so the
    // result is intentionally ignored.
    [[AVAudioSession sharedInstance] setActive:NO
                                   withOptions:AVAudioSessionSetActiveOptionNotifyOthersOnDeactivation
                                         error:nil];
}

/// Builds an error in this class's domain with a localized description.
- (NSError *)errorWithCode:(NSInteger)code description:(NSString *)description {
    return [NSError errorWithDomain:kSpeech2TextManagerErrorDomain
                               code:code
                           userInfo:@{NSLocalizedDescriptionKey: description}];
}

/// Forwards an error to the client's failure block, if one is set.
/// Calling a nil block crashes, hence the explicit check.
- (void)reportError:(NSError *)error {
    void (^handler)(Speech2TextManager *, NSError *) = self.Speech2TextManagerDidFailWithErrorBlock;
    if (handler && error) {
        handler(self,error);
    }
}

#pragma mark - SFSpeechRecognitionTaskDelegate

- (void)speechRecognitionTask:(SFSpeechRecognitionTask *)task didHypothesizeTranscription:(SFTranscription *)transcription {
    // Intermediate (partial) result.
    NSString *text = transcription.formattedString;
    if (self.Speech2TextManagerDidReceiveTranscriptionBlock) {
        self.Speech2TextManagerDidReceiveTranscriptionBlock(self,text,NO);
    }
}

- (void)speechRecognitionTask:(SFSpeechRecognitionTask *)task didFinishRecognition:(SFSpeechRecognitionResult *)recognitionResult {
    // Final result.
    NSString *text = recognitionResult.bestTranscription.formattedString;
    if (self.Speech2TextManagerDidReceiveTranscriptionBlock) {
        self.Speech2TextManagerDidReceiveTranscriptionBlock(self,text,YES);
    }
}

- (void)speechRecognitionTaskFinishedReadingAudio:(SFSpeechRecognitionTask *)task {
    // All audio has been read; the final result may still be in progress.
}

- (void)speechRecognitionTask:(SFSpeechRecognitionTask *)task didFinishSuccessfully:(BOOL)successfully {
    // If this is still the active task it ended on its own (error, service
    // limit, ...) while capture was running; stop feeding a finished request.
    // After -stopRecording the property is already nil, so this is skipped.
    if (task == self.recognitionTask) {
        [self tearDownRecognition];
    }
    
    if (!successfully && task.error) {
        [self reportError:task.error];
    }
}

#pragma mark - SFSpeechRecognizerDelegate

- (void)speechRecognizer:(SFSpeechRecognizer *)speechRecognizer availabilityDidChange:(BOOL)available {
    if (!available) {
        [self reportError:[self errorWithCode:-2 description:@"语音识别服务暂时不可用"]];
    }
}

@end

交互 UI 部分在此不再贴出,请读者自行实现。