This article walks through application examples based on several repositories from the CANN open-source community.
CANN organization: https://atomgit.com/cann
ops-cv repository: https://atomgit.com/cann/ops-cv
ops-nn repository: https://atomgit.com/cann/ops-nn
Introduction
Multimodal models can process several data types at once, such as images, text, and audio, and are an important direction in AI. CLIP, DALL-E, and Whisper are all multimodal models.
This article shows how to deploy a range of multimodal models on an NPU, covering image-text matching, image generation, speech recognition, and related applications.
CLIP Image-Text Matching
1. Model Architecture
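Before loading any of the models below, it is worth confirming that the NPU environment is usable. The following is a minimal sketch, assuming torch and the torch_npu plugin are installed and that torch_npu exposes the torch.npu namespace as in recent releases; the device index npu:0 is an assumption for a single-card setup.
python
import torch
import torch_npu  # Ascend NPU plugin; patches torch with the torch.npu namespace

# Check that at least one NPU is visible before loading any model
if torch.npu.is_available():
    print(f"NPU devices available: {torch.npu.device_count()}")
    device = torch.device('npu:0')  # assumed single-card setup
    # Tiny smoke test: run a matmul on the NPU
    x = torch.randn(2, 3, device=device)
    y = torch.randn(3, 4, device=device)
    print((x @ y).shape)
else:
    print("No NPU detected; the examples below will not run as written")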
python
import torch
import torch_npu
from transformers import CLIPModel, CLIPProcessor

class CLIPInference:
    def __init__(self, model_name='openai/clip-vit-base-patch32'):
        # Load the model and processor
        self.model = CLIPModel.from_pretrained(model_name)
        self.processor = CLIPProcessor.from_pretrained(model_name)
        # Move to the NPU
        self.model.eval()
        self.model = self.model.npu()

    def encode_image(self, image):
        """Encode an image"""
        # Preprocess
        inputs = self.processor(images=image, return_tensors='pt')
        pixel_values = inputs['pixel_values'].npu()
        # Encode
        with torch.no_grad():
            image_features = self.model.get_image_features(pixel_values)
        # Normalize
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        return image_features

    def encode_text(self, text):
        """Encode text"""
        # Preprocess
        inputs = self.processor(text=text, return_tensors='pt', padding=True)
        input_ids = inputs['input_ids'].npu()
        attention_mask = inputs['attention_mask'].npu()
        # Encode
        with torch.no_grad():
            text_features = self.model.get_text_features(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
        # Normalize
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)
        return text_features

    def compute_similarity(self, image_features, text_features):
        """Compute similarity"""
        # Cosine similarity (features are already normalized), scaled by 100
        similarity = (image_features @ text_features.T) * 100
        return similarity

    def image_text_matching(self, image, texts):
        """Image-text matching"""
        # Encode the image
        image_features = self.encode_image(image)
        # Encode the candidate texts
        text_features = self.encode_text(texts)
        # Compute similarity
        similarity = self.compute_similarity(image_features, text_features)
        # Turn scores into probabilities over the candidates
        probs = torch.softmax(similarity, dim=-1)
        results = []
        for i, text in enumerate(texts):
            results.append({
                'text': text,
                'probability': probs[0, i].item()
            })
        return sorted(results, key=lambda x: x['probability'], reverse=True)

# Usage example
from PIL import Image

clip = CLIPInference()
# Load an image
image = Image.open('cat.jpg')
# Candidate texts
texts = [
    "a photo of a cat",
    "a photo of a dog",
    "a photo of a bird"
]
# Match
results = clip.image_text_matching(image, texts)
print("Image-text matching results:")
for result in results:
    print(f"  {result['text']}: {result['probability']:.4f}")
2. Image Retrieval
python
class ImageRetrieval:
    def __init__(self, clip_model):
        self.clip = clip_model
        self.image_database = []
        self.image_features = None

    def build_index(self, image_paths):
        """Build the image index"""
        print(f"Building index over {len(image_paths)} images...")
        features_list = []
        for i, image_path in enumerate(image_paths):
            # Load the image
            image = Image.open(image_path)
            # Encode it with CLIP
            features = self.clip.encode_image(image)
            features_list.append(features)
            # Remember the path for lookups
            self.image_database.append(image_path)
            if (i + 1) % 100 == 0:
                print(f"  Processed {i + 1} images")
        # Concatenate the per-image features into one matrix
        self.image_features = torch.cat(features_list, dim=0)
        print("Index built")

    def search(self, query_text, top_k=5):
        """Search images by text"""
        # Encode the query text
        text_features = self.clip.encode_text([query_text])
        # Similarity between every indexed image and the query
        similarity = self.clip.compute_similarity(
            self.image_features,
            text_features
        )
        # Don't request more results than there are indexed images
        top_k = min(top_k, len(self.image_database))
        top_k_values, top_k_indices = torch.topk(similarity[:, 0], top_k)
        # Assemble the results
        results = []
        for i in range(top_k):
            idx = top_k_indices[i].item()
            score = top_k_values[i].item()
            results.append({
                'image_path': self.image_database[idx],
                'score': score
            })
        return results

# Usage example
retrieval = ImageRetrieval(clip)
# Build the index
image_paths = ['image1.jpg', 'image2.jpg', 'image3.jpg']  # a real deployment would index many more images
retrieval.build_index(image_paths)
# Search
query = "a cute cat playing with a ball"
results = retrieval.search(query, top_k=5)
print(f"\nQuery: {query}")
print("Results:")
for i, result in enumerate(results):
    print(f"  {i+1}. {result['image_path']} (score: {result['score']:.2f})")
Stable Diffusion Image Generation
1. Text-to-Image
python
from diffusers import StableDiffusionPipeline
import torch
import torch_npu

class StableDiffusionInference:
    def __init__(self, model_id='stabilityai/stable-diffusion-2-1'):
        # Load the pipeline
        self.pipe = StableDiffusionPipeline.from_pretrained(
            model_id,
            torch_dtype=torch.float16
        )
        # Move to the NPU
        self.pipe = self.pipe.to('npu')
        # Enable attention slicing to reduce peak memory
        self.pipe.enable_attention_slicing()

    def generate(self, prompt, negative_prompt="", num_images=1,
                 num_inference_steps=50, guidance_scale=7.5):
        """Generate images from a text prompt"""
        with torch.no_grad():
            images = self.pipe(
                prompt=prompt,
                negative_prompt=negative_prompt,
                num_images_per_prompt=num_images,
                num_inference_steps=num_inference_steps,
                guidance_scale=guidance_scale
            ).images
        return images

    def generate_batch(self, prompts, batch_size=4):
        """Generate images for a list of prompts in batches"""
        all_images = []
        for i in range(0, len(prompts), batch_size):
            batch_prompts = prompts[i:i+batch_size]
            with torch.no_grad():
                images = self.pipe(
                    prompt=batch_prompts,
                    num_inference_steps=50
                ).images
            all_images.extend(images)
        return all_images

# Usage example
sd = StableDiffusionInference()
# Generate images for a single prompt
prompt = "a beautiful landscape with mountains and a lake, sunset, highly detailed"
negative_prompt = "blurry, low quality, distorted"
images = sd.generate(
    prompt=prompt,
    negative_prompt=negative_prompt,
    num_images=4,
    num_inference_steps=50,
    guidance_scale=7.5
)
# Save the images
for i, image in enumerate(images):
    image.save(f'generated_{i}.png')
print(f"Generated {len(images)} images")
2. Image-to-Image
python
from diffusers import StableDiffusionImg2ImgPipeline

class Img2ImgInference:
    def __init__(self, model_id='stabilityai/stable-diffusion-2-1'):
        self.pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
            model_id,
            torch_dtype=torch.float16
        )
        self.pipe = self.pipe.to('npu')

    def transform(self, image, prompt, strength=0.75):
        """Transform an image according to a prompt"""
        # Resize the input image
        image = image.resize((512, 512))
        # Generate
        with torch.no_grad():
            result = self.pipe(
                prompt=prompt,
                image=image,
                strength=strength,  # 0-1; larger values depart further from the input
                num_inference_steps=50
            ).images[0]
        return result

# Usage example
img2img = Img2ImgInference()
# Load the source image
original_image = Image.open('photo.jpg')
# Transform it
prompt = "turn this photo into an oil painting"
result = img2img.transform(original_image, prompt, strength=0.75)
result.save('transformed.png')
Whisper Speech Recognition
1. Basic Transcription
python
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import torch
import torch_npu
import librosa

class WhisperInference:
    def __init__(self, model_name='openai/whisper-base'):
        # Load the processor and model
        self.processor = WhisperProcessor.from_pretrained(model_name)
        self.model = WhisperForConditionalGeneration.from_pretrained(model_name)
        # Move to the NPU
        self.model.eval()
        self.model = self.model.npu()

    def transcribe(self, audio_path, language='zh'):
        """Transcribe speech to text"""
        # Load the audio at Whisper's expected 16 kHz sampling rate
        audio, sr = librosa.load(audio_path, sr=16000)
        # Preprocess
        inputs = self.processor(
            audio,
            sampling_rate=16000,
            return_tensors='pt'
        )
        input_features = inputs.input_features.npu()
        # Generate
        with torch.no_grad():
            predicted_ids = self.model.generate(
                input_features,
                language=language
            )
        # Decode
        transcription = self.processor.batch_decode(
            predicted_ids,
            skip_special_tokens=True
        )[0]
        return transcription

    def transcribe_with_timestamps(self, audio_path):
        """Transcribe with timestamps"""
        # Load the audio
        audio, sr = librosa.load(audio_path, sr=16000)
        # Preprocess
        inputs = self.processor(
            audio,
            sampling_rate=16000,
            return_tensors='pt'
        )
        input_features = inputs.input_features.npu()
        # Generate with timestamp tokens
        with torch.no_grad():
            predicted_ids = self.model.generate(
                input_features,
                return_timestamps=True
            )
        # Decode, keeping the timestamp tokens
        result = self.processor.batch_decode(
            predicted_ids,
            skip_special_tokens=False
        )[0]
        return result

# Usage example
whisper = WhisperInference()
# Transcribe
audio_file = 'speech.wav'
text = whisper.transcribe(audio_file, language='zh')
print(f"Transcription: {text}")
# Transcription with timestamps
result_with_time = whisper.transcribe_with_timestamps(audio_file)
print(f"With timestamps: {result_with_time}")
2. Real-Time Speech Recognition
python
import pyaudio
import numpy as np
from collections import deque

class RealtimeWhisper:
    def __init__(self, whisper_model):
        self.whisper = whisper_model
        self.audio_buffer = deque(maxlen=16000 * 30)  # 30-second buffer
        # Audio parameters
        self.sample_rate = 16000
        self.chunk_size = 1024

    def start_recording(self):
        """Start recording from the microphone"""
        audio = pyaudio.PyAudio()
        stream = audio.open(
            format=pyaudio.paInt16,
            channels=1,
            rate=self.sample_rate,
            input=True,
            frames_per_buffer=self.chunk_size
        )
        print("Recording...")
        try:
            while True:
                # Read a chunk of audio
                data = stream.read(self.chunk_size)
                audio_data = np.frombuffer(data, dtype=np.int16)
                # Append it to the buffer
                self.audio_buffer.extend(audio_data)
                # Transcribe roughly every 5 seconds of audio
                if len(self.audio_buffer) >= self.sample_rate * 5:
                    self.transcribe_buffer()
        except KeyboardInterrupt:
            print("Recording stopped")
        finally:
            stream.stop_stream()
            stream.close()
            audio.terminate()

    def transcribe_buffer(self):
        """Transcribe the buffered audio"""
        # Convert the buffered int16 samples to normalized float32
        audio_data = np.array(list(self.audio_buffer), dtype=np.float32)
        audio_data = audio_data / 32768.0  # normalize to [-1, 1]
        # Clear the buffer so the same audio is not transcribed again
        self.audio_buffer.clear()
        # Transcribe
        text = self.transcribe_array(audio_data)
        if text.strip():
            print(f"Recognized: {text}")

    def transcribe_array(self, audio_array):
        """Transcribe a numpy array of audio samples"""
        # Preprocess
        inputs = self.whisper.processor(
            audio_array,
            sampling_rate=16000,
            return_tensors='pt'
        )
        input_features = inputs.input_features.npu()
        # Generate
        with torch.no_grad():
            predicted_ids = self.whisper.model.generate(input_features)
        # Decode
        text = self.whisper.processor.batch_decode(
            predicted_ids,
            skip_special_tokens=True
        )[0]
        return text

# Usage
realtime = RealtimeWhisper(whisper)
realtime.start_recording()
BLIP Image Captioning
1. Image Annotation
python
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch
import torch_npu

class BLIPInference:
    def __init__(self, model_name='Salesforce/blip-image-captioning-base'):
        # Load the processor and model
        self.processor = BlipProcessor.from_pretrained(model_name)
        self.model = BlipForConditionalGeneration.from_pretrained(model_name)
        # Move to the NPU
        self.model.eval()
        self.model = self.model.npu()

    def caption(self, image, max_length=50):
        """Generate a caption for an image"""
        # Preprocess
        inputs = self.processor(images=image, return_tensors='pt')
        pixel_values = inputs.pixel_values.npu()
        # Generate
        with torch.no_grad():
            generated_ids = self.model.generate(
                pixel_values,
                max_length=max_length
            )
        # Decode
        caption = self.processor.decode(generated_ids[0], skip_special_tokens=True)
        return caption

    def caption_batch(self, images, batch_size=8):
        """Generate captions in batches"""
        captions = []
        for i in range(0, len(images), batch_size):
            batch_images = images[i:i+batch_size]
            # Preprocess
            inputs = self.processor(images=batch_images, return_tensors='pt')
            pixel_values = inputs.pixel_values.npu()
            # Generate
            with torch.no_grad():
                generated_ids = self.model.generate(pixel_values)
            # Decode
            batch_captions = self.processor.batch_decode(
                generated_ids,
                skip_special_tokens=True
            )
            captions.extend(batch_captions)
        return captions

    def visual_question_answering(self, image, question):
        """Visual question answering"""
        # Note: the captioning checkpoint treats the question as a text prompt;
        # a dedicated VQA checkpoint usually answers better (see the sketch after this example).
        # Preprocess
        inputs = self.processor(
            images=image,
            text=question,
            return_tensors='pt'
        )
        pixel_values = inputs.pixel_values.npu()
        input_ids = inputs.input_ids.npu()
        # Generate the answer
        with torch.no_grad():
            generated_ids = self.model.generate(
                pixel_values=pixel_values,
                input_ids=input_ids
            )
        # Decode
        answer = self.processor.decode(generated_ids[0], skip_special_tokens=True)
        return answer

# Usage example
blip = BLIPInference()
# Image captioning
image = Image.open('photo.jpg')
caption = blip.caption(image)
print(f"Caption: {caption}")
# Visual question answering
question = "What is in the image?"
answer = blip.visual_question_answering(image, question)
print(f"Question: {question}")
print(f"Answer: {answer}")
Multimodal Fusion Service
1. Unified Inference Service
python
class MultiModalService:
    def __init__(self):
        # Load each model
        self.clip = CLIPInference()
        self.sd = StableDiffusionInference()
        self.whisper = WhisperInference()
        self.blip = BLIPInference()

    def image_to_text(self, image):
        """Image to text"""
        return self.blip.caption(image)

    def text_to_image(self, text):
        """Text to image"""
        return self.sd.generate(text)[0]

    def audio_to_text(self, audio_path):
        """Audio to text"""
        return self.whisper.transcribe(audio_path)

    def image_text_similarity(self, image, text):
        """Image-text similarity"""
        image_features = self.clip.encode_image(image)
        text_features = self.clip.encode_text([text])
        similarity = self.clip.compute_similarity(image_features, text_features)
        return similarity[0, 0].item()

    def audio_to_image(self, audio_path):
        """Audio to image (via text)"""
        # Audio -> text
        text = self.audio_to_text(audio_path)
        # Text -> image
        image = self.text_to_image(text)
        return image, text

# Usage example
service = MultiModalService()
# Image to text
image = Image.open('photo.jpg')
caption = service.image_to_text(image)
print(f"Caption: {caption}")
# Text to image
prompt = "a beautiful sunset over the ocean"
generated_image = service.text_to_image(prompt)
generated_image.save('generated.png')
# Audio to image
audio_file = 'description.wav'
image, text = service.audio_to_image(audio_file)
print(f"Recognized text: {text}")
image.save('from_audio.png')
2. REST API Service
python
from flask import Flask, request, jsonify
import io
import base64

app = Flask(__name__)
service = MultiModalService()

@app.route('/caption', methods=['POST'])
def caption():
    """Image captioning endpoint"""
    if 'image' not in request.files:
        return jsonify({'error': 'No image provided'}), 400
    image_file = request.files['image']
    image = Image.open(image_file)
    caption_text = service.image_to_text(image)
    return jsonify({'caption': caption_text})

@app.route('/generate', methods=['POST'])
def generate():
    """Text-to-image endpoint"""
    data = request.json
    prompt = data.get('prompt')
    if not prompt:
        return jsonify({'error': 'No prompt provided'}), 400
    image = service.text_to_image(prompt)
    # Encode the image as base64 for the JSON response
    buffer = io.BytesIO()
    image.save(buffer, format='PNG')
    image_base64 = base64.b64encode(buffer.getvalue()).decode()
    return jsonify({'image': image_base64})

@app.route('/transcribe', methods=['POST'])
def transcribe():
    """Speech recognition endpoint"""
    if 'audio' not in request.files:
        return jsonify({'error': 'No audio provided'}), 400
    audio_file = request.files['audio']
    audio_path = '/tmp/audio.wav'
    audio_file.save(audio_path)
    text = service.audio_to_text(audio_path)
    return jsonify({'text': text})

@app.route('/similarity', methods=['POST'])
def similarity():
    """Image-text similarity endpoint"""
    if 'image' not in request.files:
        return jsonify({'error': 'No image provided'}), 400
    data = request.form
    text = data.get('text')
    if not text:
        return jsonify({'error': 'No text provided'}), 400
    image_file = request.files['image']
    image = Image.open(image_file)
    score = service.image_text_similarity(image, text)
    return jsonify({'similarity': score})

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=8080)
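The endpoints above can be exercised from any HTTP client. Below is a short client-side sketch using the requests library, assuming the service is running locally on port 8080 and the sample files exist.
python
import base64
import requests

BASE = 'http://127.0.0.1:8080'  # assumed local deployment

# Image captioning
with open('photo.jpg', 'rb') as f:
    resp = requests.post(f'{BASE}/caption', files={'image': f})
print(resp.json()['caption'])

# Text-to-image: the endpoint returns the PNG as base64
resp = requests.post(f'{BASE}/generate', json={'prompt': 'a beautiful sunset over the ocean'})
with open('client_generated.png', 'wb') as f:
    f.write(base64.b64decode(resp.json()['image']))

# Speech recognition
with open('speech.wav', 'rb') as f:
    resp = requests.post(f'{BASE}/transcribe', files={'audio': f})
print(resp.json()['text'])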
Performance Optimization
1. Half-Precision Inference (FP16)
python
# Use FP16 to cut memory use and speed up inference
model = model.half()
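Applied to the CLIPInference class above, this might look like the sketch below. Inputs must be cast to the same dtype as the weights; this wiring is an illustrative assumption, not code from the repositories.
python
# Convert the CLIP weights to FP16 on the NPU
clip.model = clip.model.half()

# Inputs must match the weight dtype, so cast pixel values before encoding
inputs = clip.processor(images=Image.open('cat.jpg'), return_tensors='pt')
pixel_values = inputs['pixel_values'].npu().half()
with torch.no_grad():
    feats = clip.model.get_image_features(pixel_values)
print(feats.dtype)  # torch.float16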
2. Batch Processing
python
# Batch processing improves throughput
images = [Image.open(f'image_{i}.jpg') for i in range(32)]
captions = blip.caption_batch(images, batch_size=8)
3. Caching Encoded Features
python
# Cache text-encoding results for repeated queries
from functools import lru_cache

@lru_cache(maxsize=1000)
def encode_text_cached(text):
    return clip.encode_text([text])
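Caching pays off when the same texts recur, for example a fixed label set used for zero-shot classification. The following is a small sketch built on the CLIPInference class above; the label list is made up for illustration.
python
import torch

# Hypothetical fixed label set whose embeddings are computed once and reused
labels = ["a photo of a cat", "a photo of a dog", "a photo of a bird"]
label_features = clip.encode_text(labels)  # encoded once, kept on the NPU

def classify(image):
    # Reuse the precomputed label embeddings for every incoming image
    image_features = clip.encode_image(image)
    sims = clip.compute_similarity(image_features, label_features)
    probs = torch.softmax(sims, dim=-1)[0]
    best = int(probs.argmax())
    return labels[best], probs[best].item()

print(classify(Image.open('cat.jpg')))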
Summary
Key points for deploying multimodal models with CANN:
- CLIP: image-text matching and retrieval
- Stable Diffusion: text-to-image and image-to-image generation
- Whisper: speech recognition
- BLIP: image captioning and visual question answering
- Unified service: multimodal fusion
- Performance optimization: lower precision, batching, caching
With CANN's ops-cv and ops-nn operator libraries, these multimodal models can be deployed efficiently on the NPU.
Related Links
ops-cv repository: https://atomgit.com/cann/ops-cv
ops-nn repository: https://atomgit.com/cann/ops-nn
CANN organization: https://atomgit.com/cann