如何将大模型(Gemini)集成到 Android 语音助手中

项目关键设计点说明:

1. 流式处理架构

  • 使用 Kotlin Flow 实现音频流和文本流的处理

  • 支持边生成边播放,减少延迟感知

2. 意图识别管道

  • 主分类器:Gemini 流式意图识别

  • 后备分类器:用于低置信度情况

  • 多级分类:意图 + 复杂度 + 置信度

3. 对话生成策略

  • 单次生成模式:一次完成,无二次调用

  • 主动澄清:当输入不明确时主动反问

  • 上下文感知:支持历史对话

4. TTS集成

  • Google Cloud TTS 服务

  • gRPC 调用,支持超时控制

  • 任务队列管理

5. 性能优化

  • 异步协程处理

  • 并行 TTS 合成

  • 实时回调机制

    直接上代码:

    // ==================== 核心模型类 ====================

    /**
     * Abstraction over a large language model used by the assistant.
     *
     * Implementations provide streamed text generation and intent classification.
     */
    interface LargeLanguageModel {

        /**
         * Produces the model's reply to [input] as a stream of [TextChunk]s.
         *
         * @param input the user utterance to answer
         * @param context optional conversation context; may be `null`
         * @return a flow of reply chunks
         */
        suspend fun generateStreamingResponse(input: String, context: ConversationContext?): Flow<TextChunk>

        /**
         * Classifies the intent of an utterance supplied as audio and/or text.
         *
         * @param audioStream optional stream of audio chunks; defaults to `null`
         * @param text optional utterance text; defaults to `null`
         * @return the classification outcome
         */
        suspend fun classifyIntent(audioStream: Flow<AudioChunk>? = null, text: String? = null): IntentResult
    }

    /**
     * [LargeLanguageModel] implementation that calls Google Gemini over gRPC.
     *
     * The gRPC channel and both service stubs are created lazily on first use.
     *
     * NOTE(review): `apiKey` and `config` are stored but never read in this class; the generation
     * parameters below are hard-coded. Confirm whether they should be wired in.
     * NOTE(review): the channel is never shut down in this class; confirm who owns its lifecycle.
     */
    class GeminiModel(
        private val apiKey: String,
        private val config: ModelConfig
    ) : LargeLanguageModel {

        // Channel shared by both stubs below.
        private val grpcChannel: ManagedChannel by lazy {
            ManagedChannelBuilder
                .forAddress("generativelanguage.googleapis.com", 443)
                .build()
        }

        // Stub used for text generation.
        private val textService: TextService by lazy {
            TextService.newBlockingStub(grpcChannel)
                .withCallCredentials(GoogleCredentialsProvider())
        }

        // Stub used for intent classification.
        private val streamingService: StreamingService by lazy {
            StreamingService.newStub(grpcChannel)
        }

        /**
         * Streams the reply to [input] from the "gemini-pro" model.
         *
         * Every part of every candidate in each streamed response is emitted as its own
         * [TextChunk]. The first emitted part carries `isFirst = true`; all later parts carry
         * `isFirst = false`. Once the upstream completes, a final empty chunk with
         * `isComplete = true` is emitted as an end-of-stream marker.
         *
         * @param input the user utterance, sent as a single "user" content
         * @param context optional conversation context
         * @return a cold flow of reply chunks followed by the completion marker
         */
        override suspend fun generateStreamingResponse(
            input: String,
            context: ConversationContext?
        ): Flow<TextChunk> = flow {
            // Build the request.
            val request = GenerateContentRequest.newBuilder()
                .setModel("gemini-pro")
                .addContents(content {
                    role = "user"
                    parts { text = input }
                    if (context != null) {
                        context.history.forEach { history ->
                            // TODO: append conversation history; currently a no-op placeholder.
                        }
                    }
                })
                .setGenerationConfig(generationConfig {
                    temperature = 0.7f
                    topP = 0.95f
                    maxOutputTokens = 1000
                })
                .build()

            // Tracks whether any part has been emitted yet, so exactly one chunk is flagged first.
            var isFirstChunk = true

            // Streaming call.
            textService.generateContentStream(request).collect { response ->
                response.candidatesList.forEach { candidate ->
                    candidate.content.partsList.forEach { part ->
                        emit(
                            TextChunk(
                                text = part.text,
                                isFirst = isFirstChunk,
                                isComplete = false
                            )
                        )
                        isFirstChunk = false
                    }
                }
            }

            // End-of-stream marker.
            emit(TextChunk(text = "", isFirst = false, isComplete = true))
        }

        /**
         * Classifies the intent of the utterance using the "gemini-intent-classifier" model.
         *
         * When [audioStream] is non-null it is transcribed and [text] is ignored; otherwise
         * [text] is used. Only the first element of the classification stream is consumed.
         * The work runs on [Dispatchers.IO].
         *
         * @param audioStream optional audio input
         * @param text optional text input, used only when [audioStream] is `null`
         * @return the mapped intent, confidence, complexity and elapsed time in milliseconds
         * @throws IllegalArgumentException if both [audioStream] and [text] are `null`
         */
        override suspend fun classifyIntent(
            audioStream: Flow<AudioChunk>?,
            text: String?
        ): IntentResult = withContext(Dispatchers.IO) {
            val startTime = System.currentTimeMillis()

            // Audio takes precedence: transcribe it first when present.
            val inputText = if (audioStream != null) {
                transcribeAudio(audioStream)
            } else {
                requireNotNull(text) { "需要输入文本或音频" }
            }

            // Ask Gemini to classify the intent.
            val classificationRequest = ClassifyIntentRequest.newBuilder()
                .setModel("gemini-intent-classifier")
                .setInputText(inputText)
                .build()

            val response = streamingService.classifyIntentStream(classificationRequest)
                .first() // take the first result only

            IntentResult(
                intent = mapToIntent(response.intentLabel),
                confidence = response.confidence,
                complexity = mapToComplexity(response.complexityScore),
                processingTime = System.currentTimeMillis() - startTime
            )
        }

        /**
         * Speech-to-text placeholder (simplified).
         *
         * Collects the whole [audioStream] and then returns a fixed sample string; no ASR
         * service is called yet.
         */
        private suspend fun transcribeAudio(audioStream: Flow<AudioChunk>): String {
            val audioData = audioStream.toList()
            // TODO: call the ASR service with audioData.
            return "温度" // sample return value
        }
    }

    // ==================== 意图处理管道 ====================

    /**
     * Intent classification pipeline.
     *
     * Runs the streaming classifier first and, when its confidence is below 0.7 and a fallback
     * classifier was supplied, replaces the result with the fallback's classification.
     *
     * @param streamingClassifier primary classifier
     * @param fallbackClassifier optional classifier consulted on low confidence
     */
    class IntentClassifierPipeline(
        private val streamingClassifier: LargeLanguageModel,
        private val fallbackClassifier: IntentClassifier? = null
    ) {
        private val logger = LoggerFactory.getLogger(IntentClassifierPipeline::class.java)

        /**
         * Classifies the given input and never throws for ordinary failures.
         *
         * Any non-cancellation exception is logged and converted into a result with
         * [Intent.UNKNOWN], confidence `0.0`, [Complexity.SIMPLE] and the exception attached.
         * Coroutine cancellation is rethrown so callers can still cancel the pipeline.
         *
         * NOTE(review): the fallback classifier receives `text ?: ""`, so audio-only input is
         * classified as an empty string on the fallback path. Confirm whether the transcript
         * should be passed instead.
         *
         * @param audioStream optional audio input forwarded to the streaming classifier
         * @param text optional text input; also recorded as `rawInput` in the result
         * @return the classification, including total elapsed time in milliseconds
         */
        suspend fun process(
            audioStream: Flow<AudioChunk>? = null,
            text: String? = null
        ): ClassificationResult {
            val startTime = System.currentTimeMillis()

            return try {
                // 1. Streaming intent recognition.
                logger.debug("开始流式意图识别")
                val streamingResult = streamingClassifier.classifyIntent(audioStream, text)
                // Templates need the `$` prefix; without it the braces were logged literally.
                logger.debug("流式分类器完成,结果: ${streamingResult.intent}, 耗时: ${streamingResult.processingTime}ms")

                // 2. Fall back when confidence is low and a fallback classifier exists.
                val finalResult = if (streamingResult.confidence < 0.7 && fallbackClassifier != null) {
                    logger.debug("置信度低(${streamingResult.confidence}),使用后备分类器")
                    fallbackClassifier.classify(text ?: "")
                } else {
                    streamingResult
                }

                // 3. Record timing and build the result.
                val totalTime = System.currentTimeMillis() - startTime
                logger.info("Pipeline 完成,总耗时: ${totalTime}ms")

                ClassificationResult(
                    intent = finalResult.intent,
                    confidence = finalResult.confidence,
                    complexity = finalResult.complexity,
                    rawInput = text,
                    processingTime = totalTime
                )
            } catch (e: kotlin.coroutines.cancellation.CancellationException) {
                // Cancellation is control flow, not a classification failure: propagate it.
                throw e
            } catch (e: Exception) {
                logger.error("意图分类失败", e)
                ClassificationResult(
                    intent = Intent.UNKNOWN,
                    confidence = 0.0,
                    complexity = Complexity.SIMPLE,
                    rawInput = text,
                    processingTime = System.currentTimeMillis() - startTime,
                    error = e
                )
            }
        }
    }

    // ==================== 对话处理器 ====================

    /**
     * Handler for conversational intents.
     *
     * Streams a reply from the language model and submits each non-empty chunk to the TTS
     * service while generation is still in progress.
     *
     * @param llm model used to generate the reply
     * @param ttsService service used to synthesize each reply chunk
     */
    class ConversationalIntentHandler(
        private val llm: LargeLanguageModel,
        private val ttsService: TTSService
    ) {
        private val logger = LoggerFactory.getLogger(ConversationalIntentHandler::class.java)

        /**
         * Generates a reply to [input] and synthesizes it chunk by chunk.
         *
         * Synthesis jobs are children of this call (structured concurrency): cancelling the
         * caller cancels them, and this function returns only after all of them have finished.
         * The returned response is the concatenation of all non-empty chunk texts.
         *
         * NOTE(review): the synthesized audio bytes are awaited but not used here; confirm where
         * playback is expected to happen.
         *
         * @param input the user utterance
         * @param context conversation context; its `listener`, if set, is notified per chunk
         * @return a successful result carrying the full reply text and elapsed milliseconds
         */
        suspend fun handleConversation(
            input: String,
            context: ConversationContext
        ): ConversationResult {
            val startTime = System.currentTimeMillis()
            logger.info("处理对话意图")

            // 1. Generate the reply as a stream.
            val responseFlow = llm.generateStreamingResponse(input, context)
            val fullResponse = StringBuilder()

            // 2. Synthesize while generating. coroutineScope ties the TTS jobs to this call
            //    instead of launching them in a detached scope.
            kotlinx.coroutines.coroutineScope {
                val ttsTasks = mutableListOf<Deferred<ByteArray>>()

                responseFlow.collect { chunk ->
                    if (chunk.text.isNotEmpty()) {
                        fullResponse.append(chunk.text)

                        // Submit the TTS job on the IO dispatcher.
                        ttsTasks.add(async(Dispatchers.IO) { ttsService.synthesize(chunk.text) })

                        // Real-time callback, if a listener is registered.
                        context.listener?.onTextChunk(chunk)
                    }
                }

                // 3. Wait for every TTS job to finish.
                ttsTasks.awaitAll()
            }

            val totalTime = System.currentTimeMillis() - startTime
            logger.info("流式处理完成,总耗时 ${totalTime}ms")

            return ConversationResult(
                success = true,
                response = fullResponse.toString(),
                processingTime = totalTime
            )
        }
    }

    // ==================== TTS 服务 ====================

    /**
     * [TTSService] backed by Google Cloud Text-to-Speech.
     *
     * The client is created lazily on first use with the supplied credentials. A counter of
     * in-flight synthesis calls is kept for logging.
     *
     * NOTE(review): `config` is stored but never read; language, voice and sample rate are
     * hard-coded in [synthesize]. Confirm whether they should come from `config`.
     * NOTE(review): the client is never closed in this class; confirm who owns its lifecycle.
     */
    class GoogleCloudTTSService(
        private val credentials: GoogleCredentials,
        private val config: TTSConfig
    ) : TTSService {

        private val logger = LoggerFactory.getLogger(GoogleCloudTTSService::class.java)

        // Number of synthesize() calls currently in progress.
        private val pendingTasks = AtomicInteger(0)

        private val speechClient: TextToSpeechClient by lazy {
            TextToSpeechClient.create(
                TextToSpeechSettings.newBuilder()
                    .setCredentialsProvider(FixedCredentialsProvider.create(credentials))
                    .build()
            )
        }

        /**
         * Synthesizes [text] to LINEAR16 audio at 16000 Hz with the "cmn-CN-Standard-A" voice.
         *
         * The blocking client call runs on [Dispatchers.IO] inside `runInterruptible`, so the
         * 20-second timeout can interrupt it instead of waiting for the call to return on its own.
         *
         * @param text the text to synthesize
         * @return the audio content bytes from the service response
         * @throws kotlinx.coroutines.TimeoutCancellationException if the call exceeds 20 seconds
         */
        override suspend fun synthesize(text: String): ByteArray {
            logger.debug("开始合成语音,文本: $text")
            pendingTasks.incrementAndGet()

            return try {
                val synthesisInput = SynthesisInput.newBuilder()
                    .setText(text)
                    .build()

                val voiceSelection = VoiceSelectionParams.newBuilder()
                    .setLanguageCode("cmn-CN")
                    .setName("cmn-CN-Standard-A")
                    .build()

                val audioConfig = AudioConfig.newBuilder()
                    .setAudioEncoding(AudioEncoding.LINEAR16)
                    .setSampleRateHertz(16000)
                    .build()

                logger.debug("调用 gRPC synthesizeSpeech (timeout=20s)...")
                logger.debug("请求详情: language=cmn-CN, voice=cmn-CN-Standard-A, " +
                    "sampleRate=16000, text=${text.take(20)}...")

                val response = withTimeout(20000) {
                    // Blocking call: move it off the caller's thread and make it interruptible.
                    kotlinx.coroutines.runInterruptible(Dispatchers.IO) {
                        speechClient.synthesizeSpeech(
                            synthesisInput,
                            voiceSelection,
                            audioConfig
                        )
                    }
                }

                response.audioContent.toByteArray()
            } finally {
                // Always decrement, whether the call succeeded, failed or timed out.
                val remaining = pendingTasks.decrementAndGet()
                logger.debug("任务完成,待处理任务: $remaining")
            }
        }
    }

    // ==================== 主控制器 ====================

    /**
     * Main controller of the assistant.
     *
     * Classifies the user input, routes it to the handler registered for the detected intent,
     * and optionally synthesizes the handler's reply.
     *
     * @param intentPipeline pipeline used to classify the input
     * @param intentHandlers handlers keyed by intent; [Intent.UNKNOWN] serves as the fallback
     * @param ttsService service used to synthesize the final reply
     */
    class AssistantController(
        private val intentPipeline: IntentClassifierPipeline,
        private val intentHandlers: Map<Intent, IntentHandler>,
        private val ttsService: TTSService
    ) {
        private val logger = LoggerFactory.getLogger(AssistantController::class.java)

        /**
         * Processes one user input end to end.
         *
         * A fresh [ConversationContext] with empty history and a newly generated session id is
         * created for every call.
         *
         * NOTE(review): the bytes returned by `ttsService.synthesize` are discarded here;
         * confirm where playback is expected to happen.
         *
         * @param audioStream optional audio input
         * @param textInput optional text input
         * @return the detected intent, the reply, the speak flag, and the sum of classification
         *   and handler processing times in milliseconds
         * @throws IllegalStateException if neither the detected intent nor [Intent.UNKNOWN] has
         *   a registered handler
         */
        suspend fun processInput(
            audioStream: Flow<AudioChunk>? = null,
            textInput: String? = null
        ): ProcessResult {
            logger.info("========== 开始处理用户输入 ==========")

            // 1. Intent recognition.
            val classification = intentPipeline.process(audioStream, textInput)

            logger.info("""

                ========== 意图识别详情 ==========

                原始输入: ${classification.rawInput}

                识别意图: ${classification.intent}

                意图类别: ${classification.intent.category}

                复杂度: ${classification.complexity}

                置信度: ${classification.confidence}

                是否有回复: ${classification.intent.hasResponse}

                """.trimIndent())

            // 2. Route to the matching handler, falling back to the UNKNOWN handler.
            val handler = intentHandlers[classification.intent]
                ?: intentHandlers[Intent.UNKNOWN]
                ?: error("No handler registered for ${classification.intent} or Intent.UNKNOWN")

            logger.info("路由: ${handler.description}")

            // 3. Handle the input and produce a reply.
            val result = handler.handle(
                input = classification.rawInput ?: "",
                context = ConversationContext(
                    history = emptyList(),
                    sessionId = generateSessionId()
                )
            )

            // 4. TTS synthesis, only when there is a reply and it should be spoken.
            if (result.response.isNotEmpty() && result.shouldSpeak) {
                ttsService.synthesize(result.response)
            }

            val totalTime = classification.processingTime + result.processingTime
            logger.info("处理完成,总耗时: ${totalTime}ms")

            return ProcessResult(
                intent = classification.intent,
                response = result.response,
                shouldSpeak = result.shouldSpeak,
                totalProcessingTime = totalTime
            )
        }

        /**
         * Logs a text chunk that is being handed to TTS.
         *
         * @param chunk the chunk being spoken
         * @param isFirst whether this is the first chunk of the reply
         */
        fun onTTSChunk(chunk: TextChunk, isFirst: Boolean) {
            // Templates need the `$` prefix; without it the placeholders were logged literally.
            logger.trace("LLM tts chunk (isFirst=$isFirst): ${chunk.text}")
        }
    }

    // ==================== 数据模型 ====================

    /**
     * Intents the assistant can recognise and route on.
     *
     * @property category coarse grouping this intent belongs to
     * @property hasResponse flag carried by the intent; `true` unless an entry overrides it
     */
    enum class Intent(
        val category: IntentCategory,
        val hasResponse: Boolean = true
    ) {
        CHITCHAT(category = IntentCategory.CONVERSATIONAL),
        WEATHER_QUERY(category = IntentCategory.INFORMATIONAL),
        DEVICE_CONTROL(category = IntentCategory.ACTION),
        UNKNOWN(category = IntentCategory.OTHER, hasResponse = false);

        /** Coarse grouping of intents. */
        enum class IntentCategory {
            CONVERSATIONAL,
            INFORMATIONAL,
            ACTION,
            OTHER
        }
    }

    /**
     * Complexity level attached to a classified input.
     */
    enum class Complexity {
        SIMPLE,
        CONVERSATIONAL,
        COMPLEX
    }

    /**
     * One piece of streamed text output.
     *
     * @property text the text carried by this chunk
     * @property isFirst whether this chunk is flagged as the first of its stream
     * @property isComplete whether this chunk marks the end of its stream
     */
    data class TextChunk(val text: String, val isFirst: Boolean, val isComplete: Boolean)

    /**
     * One piece of streamed audio input.
     *
     * `equals` and `hashCode` are overridden because the generated data-class versions compare
     * a [ByteArray] by reference, which would make two chunks with identical bytes unequal.
     *
     * @property data the audio bytes carried by this chunk
     * @property timestamp timestamp associated with this chunk
     */
    data class AudioChunk(
        val data: ByteArray,
        val timestamp: Long
    ) {
        /** Two chunks are equal when their timestamps match and their bytes are identical. */
        override fun equals(other: Any?): Boolean =
            this === other ||
                (other is AudioChunk && timestamp == other.timestamp && data.contentEquals(other.data))

        /** Hash consistent with [equals]: derived from the byte content and the timestamp. */
        override fun hashCode(): Int = 31 * data.contentHashCode() + timestamp.hashCode()
    }

    /**
     * Outcome of an intent classification.
     *
     * @property intent the recognised intent
     * @property confidence confidence reported for the classification
     * @property complexity complexity level assigned to the input
     * @property processingTime time spent classifying, in milliseconds
     */
    data class IntentResult(
        val intent: Intent,
        val confidence: Double,
        val complexity: Complexity,
        val processingTime: Long,
    )

    // ==================== 使用示例 ====================

    fun main() = runBlocking {
        // Back ends shared by the pipeline and the handlers.
        val llm = GeminiModel(
            apiKey = "your-api-key",
            config = ModelConfig(temperature = 0.7, maxTokens = 1000)
        )
        val speech = GoogleCloudTTSService(
            credentials = GoogleCredentials.getApplicationDefault(),
            config = TTSConfig(
                languageCode = "cmn-CN",
                voiceName = "cmn-CN-Standard-A",
                sampleRate = 16000
            )
        )

        // Controller wired with the classification pipeline and one handler per intent.
        val assistant = AssistantController(
            intentPipeline = IntentClassifierPipeline(streamingClassifier = llm),
            intentHandlers = mapOf(
                Intent.CHITCHAT to ConversationalIntentHandler(llm, speech),
                Intent.WEATHER_QUERY to WeatherIntentHandler(),
                Intent.DEVICE_CONTROL to DeviceControlHandler(),
                Intent.UNKNOWN to FallbackIntentHandler()
            ),
            ttsService = speech
        )

        // Run a single text query through the assistant and print the outcome.
        val outcome = assistant.processInput(textInput = "温度")
        println("回复: ${outcome.response}")
        println("处理时间: ${outcome.totalProcessingTime}ms")
    }

相关推荐
Rainman博6 小时前
WMS-窗口relayout&FinishDrawing
android
baidu_247438618 小时前
Android ViewModel定时任务
android·开发语言·javascript
有位神秘人9 小时前
Android中Notification的使用详解
android·java·javascript
·云扬·9 小时前
MySQL Binlog落盘机制深度解析:性能与安全性的平衡艺术
android·mysql·adb
独自破碎E10 小时前
【BISHI9】田忌赛马
android·java·开发语言
代码s贝多芬的音符12 小时前
android 两个人脸对比 mlkit
android
darkb1rd13 小时前
五、PHP类型转换与类型安全
android·安全·php
gjxDaniel14 小时前
Kotlin编程语言入门与常见问题
android·开发语言·kotlin
csj5014 小时前
安卓基础之《(22)—高级控件(4)碎片Fragment》
android
峥嵘life15 小时前
Android16 【CTS】CtsMediaCodecTestCases等一些列Media测试存在Failed项
android·linux·学习