CANN Multimodal Model Deployment Guide: A Complete Solution for Joint Image, Text, and Audio Inference

This article presents application case studies based on several repositories from the CANN open-source community.

CANN organization: https://atomgit.com/cann

ops-cv repository: https://atomgit.com/cann/ops-cv

ops-nn repository: https://atomgit.com/cann/ops-nn

Introduction

Multimodal models can process several data types at once, such as images, text, and audio, and they are an important direction in AI development. CLIP, DALL-E, and Whisper are all multimodal models.

This article shows how to deploy a range of multimodal models on the NPU, covering applications such as image-text matching, image generation, and speech recognition.

CLIP Image-Text Matching

1. Model Loading and Inference

python
import torch
import torch.nn as nn
import torch_npu
from transformers import CLIPModel, CLIPProcessor

class CLIPInference:
    def __init__(self, model_name='openai/clip-vit-base-patch32'):
        # Load the model and processor
        self.model = CLIPModel.from_pretrained(model_name)
        self.processor = CLIPProcessor.from_pretrained(model_name)

        # Move to the NPU
        self.model.eval()
        self.model = self.model.npu()

    def encode_image(self, image):
        """Encode an image."""
        # Preprocess
        inputs = self.processor(images=image, return_tensors='pt')
        pixel_values = inputs['pixel_values'].npu()

        # Encode
        with torch.no_grad():
            image_features = self.model.get_image_features(pixel_values)

        # L2-normalize
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)

        return image_features

    def encode_text(self, text):
        """Encode text."""
        # Preprocess
        inputs = self.processor(text=text, return_tensors='pt', padding=True)
        input_ids = inputs['input_ids'].npu()
        attention_mask = inputs['attention_mask'].npu()

        # Encode
        with torch.no_grad():
            text_features = self.model.get_text_features(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

        # L2-normalize
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)

        return text_features

    def compute_similarity(self, image_features, text_features):
        """Compute similarity."""
        # Cosine similarity, scaled by 100 (roughly CLIP's learned logit scale)
        similarity = (image_features @ text_features.T) * 100
        return similarity

    def image_text_matching(self, image, texts):
        """Image-text matching."""
        # Encode the image
        image_features = self.encode_image(image)

        # Encode the candidate texts
        text_features = self.encode_text(texts)

        # Compute similarity
        similarity = self.compute_similarity(image_features, text_features)

        # Turn the logits into probabilities over the candidate texts
        probs = torch.softmax(similarity, dim=-1)

        results = []
        for i, text in enumerate(texts):
            results.append({
                'text': text,
                'probability': probs[0, i].item()
            })

        return sorted(results, key=lambda x: x['probability'], reverse=True)

# Usage example
from PIL import Image

clip = CLIPInference()

# Load the image (convert to RGB so grayscale/RGBA files also work)
image = Image.open('cat.jpg').convert('RGB')

# Candidate texts
texts = [
    "a photo of a cat",
    "a photo of a dog",
    "a photo of a bird"
]

# Match
results = clip.image_text_matching(image, texts)

print("Image-text matching results:")
for result in results:
    print(f"  {result['text']}: {result['probability']:.4f}")

2. Image Retrieval

python
class ImageRetrieval:
    def __init__(self, clip_model):
        self.clip = clip_model
        self.image_database = []
        self.image_features = None

    def build_index(self, image_paths):
        """Build the image index."""
        print(f"Building index for {len(image_paths)} images...")

        features_list = []

        for i, image_path in enumerate(image_paths):
            # Load the image
            image = Image.open(image_path).convert('RGB')

            # Encode
            features = self.clip.encode_image(image)
            features_list.append(features)

            # Remember the path
            self.image_database.append(image_path)

            if (i + 1) % 100 == 0:
                print(f"  Processed {i + 1} images")

        # Concatenate the features
        self.image_features = torch.cat(features_list, dim=0)

        print("Index built")

    def search(self, query_text, top_k=5):
        """Search images by text."""
        # Encode the query text
        text_features = self.clip.encode_text([query_text])

        # Compute similarity against all indexed images
        similarity = self.clip.compute_similarity(
            self.image_features,
            text_features
        )

        # Get the top-k results (clamped to the database size)
        top_k = min(top_k, len(self.image_database))
        top_k_values, top_k_indices = torch.topk(similarity[:, 0], top_k)

        # Build the result list
        results = []
        for i in range(top_k):
            idx = top_k_indices[i].item()
            score = top_k_values[i].item()

            results.append({
                'image_path': self.image_database[idx],
                'score': score
            })

        return results

# Usage example
retrieval = ImageRetrieval(clip)

# Build the index
image_paths = ['image1.jpg', 'image2.jpg', 'image3.jpg']  # a real application would index many more images
retrieval.build_index(image_paths)

# Search
query = "a cute cat playing with a ball"
results = retrieval.search(query, top_k=5)

print(f"\nQuery: {query}")
print("Results:")
for i, result in enumerate(results):
    print(f"  {i+1}. {result['image_path']} (score: {result['score']:.2f})")

Stable Diffusion Image Generation

1. Text-to-Image

python
from diffusers import StableDiffusionPipeline
import torch
import torch_npu

class StableDiffusionInference:
    def __init__(self, model_id='stabilityai/stable-diffusion-2-1'):
        # Load the pipeline
        self.pipe = StableDiffusionPipeline.from_pretrained(
            model_id,
            torch_dtype=torch.float16
        )

        # Move to the NPU
        self.pipe = self.pipe.to('npu')

        # Enable attention slicing to reduce peak memory
        self.pipe.enable_attention_slicing()

    def generate(self, prompt, negative_prompt="", num_images=1,
                 num_inference_steps=50, guidance_scale=7.5):
        """Generate images."""
        # Run the diffusion pipeline
        with torch.no_grad():
            images = self.pipe(
                prompt=prompt,
                negative_prompt=negative_prompt,
                num_images_per_prompt=num_images,
                num_inference_steps=num_inference_steps,
                guidance_scale=guidance_scale
            ).images

        return images

    def generate_batch(self, prompts, batch_size=4):
        """Generate images for a list of prompts in batches."""
        all_images = []

        for i in range(0, len(prompts), batch_size):
            batch_prompts = prompts[i:i+batch_size]

            with torch.no_grad():
                images = self.pipe(
                    prompt=batch_prompts,
                    num_inference_steps=50
                ).images

            all_images.extend(images)

        return all_images

# Usage example
sd = StableDiffusionInference()

# Generate images for a single prompt
prompt = "a beautiful landscape with mountains and a lake, sunset, highly detailed"
negative_prompt = "blurry, low quality, distorted"

images = sd.generate(
    prompt=prompt,
    negative_prompt=negative_prompt,
    num_images=4,
    num_inference_steps=50,
    guidance_scale=7.5
)

# Save the images
for i, image in enumerate(images):
    image.save(f'generated_{i}.png')

print(f"Generated {len(images)} images")

2. Image-to-Image

python
from diffusers import StableDiffusionImg2ImgPipeline

class Img2ImgInference:
    def __init__(self, model_id='stabilityai/stable-diffusion-2-1'):
        self.pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
            model_id,
            torch_dtype=torch.float16
        )
        self.pipe = self.pipe.to('npu')

    def transform(self, image, prompt, strength=0.75):
        """Transform an image guided by a prompt."""
        # Resize to the model's expected resolution
        image = image.resize((512, 512))

        # Generate
        with torch.no_grad():
            result = self.pipe(
                prompt=prompt,
                image=image,
                strength=strength,  # 0-1; higher values allow larger changes
                num_inference_steps=50
            ).images[0]

        return result

# Usage example
img2img = Img2ImgInference()

# Load the source image
original_image = Image.open('photo.jpg').convert('RGB')

# Transform
prompt = "turn this photo into an oil painting"
result = img2img.transform(original_image, prompt, strength=0.75)

result.save('transformed.png')
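
The strength parameter controls how far the output may drift from the source image, and the best value depends on the use case. A quick way to choose one is to sweep a few values and compare:

python
# Compare several strength settings side by side
for strength in [0.3, 0.5, 0.75]:
    variant = img2img.transform(original_image, prompt, strength=strength)
    variant.save(f'transformed_{strength}.png')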

Whisper Speech Recognition

1. Basic Transcription

python
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import torch
import torch_npu
import librosa

class WhisperInference:
    def __init__(self, model_name='openai/whisper-base'):
        # Load the model and processor
        self.processor = WhisperProcessor.from_pretrained(model_name)
        self.model = WhisperForConditionalGeneration.from_pretrained(model_name)

        # Move to the NPU
        self.model.eval()
        self.model = self.model.npu()

    def transcribe(self, audio_path, language='zh'):
        """Speech to text."""
        # Load the audio (Whisper expects 16 kHz mono)
        audio, sr = librosa.load(audio_path, sr=16000)

        # Preprocess
        inputs = self.processor(
            audio,
            sampling_rate=16000,
            return_tensors='pt'
        )
        input_features = inputs.input_features.npu()

        # Generate
        with torch.no_grad():
            predicted_ids = self.model.generate(
                input_features,
                language=language
            )

        # Decode
        transcription = self.processor.batch_decode(
            predicted_ids,
            skip_special_tokens=True
        )[0]

        return transcription

    def transcribe_with_timestamps(self, audio_path):
        """Transcription with timestamps."""
        # Load the audio
        audio, sr = librosa.load(audio_path, sr=16000)

        # Preprocess
        inputs = self.processor(
            audio,
            sampling_rate=16000,
            return_tensors='pt'
        )
        input_features = inputs.input_features.npu()

        # Generate (with timestamp tokens)
        with torch.no_grad():
            predicted_ids = self.model.generate(
                input_features,
                return_timestamps=True
            )

        # Decode, keeping the timestamp tokens in the output
        result = self.processor.batch_decode(
            predicted_ids,
            skip_special_tokens=False
        )[0]

        return result

# Usage example
whisper = WhisperInference()

# Transcribe
audio_file = 'speech.wav'
text = whisper.transcribe(audio_file, language='zh')

print(f"Transcription: {text}")

# Transcription with timestamps
result_with_time = whisper.transcribe_with_timestamps(audio_file)
print(f"With timestamps: {result_with_time}")

2. Real-Time Speech Recognition

python
import pyaudio
import numpy as np
import torch
from collections import deque

class RealtimeWhisper:
    def __init__(self, whisper_model):
        self.whisper = whisper_model
        self.audio_buffer = deque(maxlen=16000 * 30)  # 30-second buffer

        # Audio parameters
        self.sample_rate = 16000
        self.chunk_size = 1024

    def start_recording(self):
        """Start recording from the microphone."""
        audio = pyaudio.PyAudio()

        stream = audio.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=self.sample_rate,
            input=True,
            frames_per_buffer=self.chunk_size
        )

        print("Recording...")

        try:
            while True:
                # Read audio data
                data = stream.read(self.chunk_size)
                audio_data = np.frombuffer(data, dtype=np.int16)

                # Append to the buffer
                self.audio_buffer.extend(audio_data)

                # Transcribe once 5 seconds of audio have accumulated
                if len(self.audio_buffer) >= self.sample_rate * 5:
                    self.transcribe_buffer()

        except KeyboardInterrupt:
            print("Recording stopped")

        finally:
            stream.stop_stream()
            stream.close()
            audio.terminate()

    def transcribe_buffer(self):
        """Transcribe the buffered audio."""
        # Fetch the audio data
        audio_data = np.array(list(self.audio_buffer), dtype=np.float32)
        audio_data = audio_data / 32768.0  # normalize int16 to [-1, 1]

        # Clear the buffer so the same audio is not transcribed again
        self.audio_buffer.clear()

        # Transcribe (transcribe_array is defined on this class)
        text = self.transcribe_array(audio_data)

        if text.strip():
            print(f"Recognized: {text}")

    def transcribe_array(self, audio_array):
        """Transcribe a numpy array."""
        # Preprocess
        inputs = self.whisper.processor(
            audio_array,
            sampling_rate=16000,
            return_tensors='pt'
        )
        input_features = inputs.input_features.npu()

        # Generate
        with torch.no_grad():
            predicted_ids = self.whisper.model.generate(input_features)

        # Decode
        text = self.whisper.processor.batch_decode(
            predicted_ids,
            skip_special_tokens=True
        )[0]

        return text

# Usage (blocks until interrupted with Ctrl+C)
realtime = RealtimeWhisper(whisper)
realtime.start_recording()
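
Transcribing silence wastes NPU time. A simple energy gate, checked before calling transcribe_buffer, skips quiet segments (the 0.01 RMS threshold is an assumption that needs tuning per microphone):

python
def is_speech(audio_array, threshold=0.01):
    # Root-mean-square energy of the normalized float signal
    rms = np.sqrt(np.mean(audio_array ** 2))
    return rms > threshold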

BLIP Image Captioning

1. Captioning and Visual QA

python
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch
import torch_npu

class BLIPInference:
    def __init__(self, model_name='Salesforce/blip-image-captioning-base'):
        # Load the model and processor
        self.processor = BlipProcessor.from_pretrained(model_name)
        self.model = BlipForConditionalGeneration.from_pretrained(model_name)

        # Move to the NPU
        self.model.eval()
        self.model = self.model.npu()

    def caption(self, image, max_length=50):
        """Generate an image caption."""
        # Preprocess
        inputs = self.processor(images=image, return_tensors='pt')
        pixel_values = inputs.pixel_values.npu()

        # Generate
        with torch.no_grad():
            generated_ids = self.model.generate(
                pixel_values,
                max_length=max_length
            )

        # Decode
        caption = self.processor.decode(generated_ids[0], skip_special_tokens=True)

        return caption

    def caption_batch(self, images, batch_size=8):
        """Generate captions in batches."""
        captions = []

        for i in range(0, len(images), batch_size):
            batch_images = images[i:i+batch_size]

            # Preprocess
            inputs = self.processor(images=batch_images, return_tensors='pt')
            pixel_values = inputs.pixel_values.npu()

            # Generate
            with torch.no_grad():
                generated_ids = self.model.generate(pixel_values)

            # Decode
            batch_captions = self.processor.batch_decode(
                generated_ids,
                skip_special_tokens=True
            )

            captions.extend(batch_captions)

        return captions

    def visual_question_answering(self, image, question):
        """Visual question answering.

        Note: BlipForConditionalGeneration treats the text as a caption
        prefix; for dedicated VQA, the Salesforce/blip-vqa-base checkpoint
        with BlipForQuestionAnswering is the usual choice.
        """
        # Preprocess
        inputs = self.processor(
            images=image,
            text=question,
            return_tensors='pt'
        )

        pixel_values = inputs.pixel_values.npu()
        input_ids = inputs.input_ids.npu()

        # Generate the answer
        with torch.no_grad():
            generated_ids = self.model.generate(
                pixel_values=pixel_values,
                input_ids=input_ids
            )

        # Decode
        answer = self.processor.decode(generated_ids[0], skip_special_tokens=True)

        return answer

# Usage example
blip = BLIPInference()

# Image captioning
image = Image.open('photo.jpg').convert('RGB')
caption = blip.caption(image)
print(f"Caption: {caption}")

# Visual question answering
question = "What is in the image?"
answer = blip.visual_question_answering(image, question)
print(f"Question: {question}")
print(f"Answer: {answer}")

Multimodal Fusion Service

1. Unified Inference Service

python
class MultiModalService:
    def __init__(self):
        # Load every model up front (a lazy-loading alternative is
        # sketched after the usage example below)
        self.clip = CLIPInference()
        self.sd = StableDiffusionInference()
        self.whisper = WhisperInference()
        self.blip = BLIPInference()

    def image_to_text(self, image):
        """Image to text."""
        return self.blip.caption(image)

    def text_to_image(self, text):
        """Text to image."""
        return self.sd.generate(text)[0]

    def audio_to_text(self, audio_path):
        """Audio to text."""
        return self.whisper.transcribe(audio_path)

    def image_text_similarity(self, image, text):
        """Image-text similarity."""
        image_features = self.clip.encode_image(image)
        text_features = self.clip.encode_text([text])
        similarity = self.clip.compute_similarity(image_features, text_features)
        return similarity[0, 0].item()

    def audio_to_image(self, audio_path):
        """Audio to image (via text)."""
        # Audio -> text
        text = self.audio_to_text(audio_path)

        # Text -> image
        image = self.text_to_image(text)

        return image, text

# Usage example
service = MultiModalService()

# Image to text
image = Image.open('photo.jpg').convert('RGB')
caption = service.image_to_text(image)
print(f"Caption: {caption}")

# Text to image
prompt = "a beautiful sunset over the ocean"
generated_image = service.text_to_image(prompt)
generated_image.save('generated.png')

# Audio to image
audio_file = 'description.wav'
image, text = service.audio_to_image(audio_file)
print(f"Recognized text: {text}")
image.save('from_audio.png')
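
Loading all four models eagerly can exceed device memory on smaller NPUs. A hedged alternative is to instantiate each model on first use; a sketch reusing the classes defined above:

python
class LazyMultiModalService:
    """Instantiate each model only when it is first needed."""

    def __init__(self):
        self._models = {}

    def _get(self, name, factory):
        # Build the model on first access, then cache it
        if name not in self._models:
            self._models[name] = factory()
        return self._models[name]

    @property
    def clip(self):
        return self._get('clip', CLIPInference)

    @property
    def blip(self):
        return self._get('blip', BLIPInference)

    @property
    def whisper(self):
        return self._get('whisper', WhisperInference)

    @property
    def sd(self):
        return self._get('sd', StableDiffusionInference)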

2. REST API Service

python
from flask import Flask, request, jsonify, send_file
import io
import base64

app = Flask(__name__)
service = MultiModalService()

@app.route('/caption', methods=['POST'])
def caption():
    """Image captioning endpoint."""
    if 'image' not in request.files:
        return jsonify({'error': 'No image provided'}), 400

    image_file = request.files['image']
    image = Image.open(image_file).convert('RGB')

    caption_text = service.image_to_text(image)

    return jsonify({'caption': caption_text})

@app.route('/generate', methods=['POST'])
def generate():
    """Text-to-image endpoint."""
    data = request.json
    prompt = data.get('prompt')

    if not prompt:
        return jsonify({'error': 'No prompt provided'}), 400

    image = service.text_to_image(prompt)

    # Encode the image as base64
    buffer = io.BytesIO()
    image.save(buffer, format='PNG')
    image_base64 = base64.b64encode(buffer.getvalue()).decode()

    return jsonify({'image': image_base64})

@app.route('/transcribe', methods=['POST'])
def transcribe():
    """Speech recognition endpoint."""
    if 'audio' not in request.files:
        return jsonify({'error': 'No audio provided'}), 400

    audio_file = request.files['audio']
    # NOTE: fixed temp path; use unique file names under concurrent requests
    audio_path = '/tmp/audio.wav'
    audio_file.save(audio_path)

    text = service.audio_to_text(audio_path)

    return jsonify({'text': text})

@app.route('/similarity', methods=['POST'])
def similarity():
    """Image-text similarity endpoint."""
    if 'image' not in request.files:
        return jsonify({'error': 'No image provided'}), 400

    data = request.form
    text = data.get('text')

    if not text:
        return jsonify({'error': 'No text provided'}), 400

    image_file = request.files['image']
    image = Image.open(image_file).convert('RGB')

    score = service.image_text_similarity(image, text)

    return jsonify({'similarity': score})

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=8080)
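
A quick client-side check of the endpoints with the requests library (host, port, and file names are placeholders):

python
import requests

BASE = 'http://localhost:8080'

# Image captioning
with open('photo.jpg', 'rb') as f:
    r = requests.post(f'{BASE}/caption', files={'image': f})
print(r.json())

# Text-to-image (returns the PNG as base64)
r = requests.post(f'{BASE}/generate', json={'prompt': 'a cat on a sofa'})
print(len(r.json()['image']), 'base64 characters')

# Image-text similarity
with open('photo.jpg', 'rb') as f:
    r = requests.post(f'{BASE}/similarity', files={'image': f},
                      data={'text': 'a photo of a cat'})
print(r.json())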

Performance Optimization

1. Half-Precision Inference (FP16)

python
# Use FP16 to cut memory usage and speed up inference
model = model.half()
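
A model converted with .half() expects FP16 inputs as well; a minimal, schematic sketch of the matching input conversion:

python
# Inputs must match the model's dtype after conversion
model = model.half()
pixel_values = pixel_values.npu().half()

with torch.no_grad():
    output = model(pixel_values)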

2. Batch Processing

python
# Batch processing improves throughput
images = [Image.open(f'image_{i}.jpg') for i in range(32)]
captions = blip.caption_batch(images, batch_size=8)

3. Caching Encoded Features

python
# Cache text-encoding results for repeated queries
from functools import lru_cache

@lru_cache(maxsize=1000)
def encode_text_cached(text):
    return clip.encode_text([text])

Summary

Key points for deploying multimodal models with CANN:

  • CLIP: image-text matching and retrieval
  • Stable Diffusion: text-to-image and image-to-image
  • Whisper: speech recognition
  • BLIP: image captioning and visual question answering
  • Unified service: multimodal fusion
  • Performance optimization: half precision, batching, caching

With CANN's ops-cv and ops-nn operator libraries, a wide range of multimodal models can be deployed efficiently.

Related Links

ops-cv repository: https://atomgit.com/cann/ops-cv

ops-nn repository: https://atomgit.com/cann/ops-nn

CANN organization: https://atomgit.com/cann
