一、简介
多模态(Multimodal)指的是系统能够处理和整合多种不同形式的数据。LangChain 中的多模态能力,核心是让 RAG/Agent 能处理文本、图片、音频、视频、表格等多种类型数据,而非仅局限于纯文本——这是实现 "图文问答、多模态知识库、视觉 Agent" 的关键。在AI领域,常见的模态包括:
| 模态 | 示例 | 应用场景 |
|---|---|---|
| 文本 | 文字、文档 | 问答、写作、翻译 |
| 图像 | 照片、图表、截图 | 图像识别、描述生成 |
| 音频 | 语音、音乐、环境音 | 语音识别、音乐分析 |
| 视频 | 录像、动画 | 视频理解、摘要生成 |
| 文档 | PDF、PPT、Word | 文档理解、信息提取 |
二、多模态架构
LangChain 多模态架构采用分层设计:
bash
应用层 ──┬── 多模态Chain
├── 多模态Agent
└── 多模态RAG
│
中间层 ──┬── 多模态Document Loaders
├── 多模态Transformers
├── 多模态Embeddings
└── 多模态Output Parsers
│
基础层 ──┬── 多模态LLM (GPT-4V, Claude-3, Gemini)
├── 多模态Vector Stores
└── 多模态Tools
其核心逻辑:LangChain 多模态通过 "多模态加载器 + 多模态嵌入 + 多模态 LLM",实现图文 / 音视频的统一处理和检索。
三、基础层
基础层是整个多模态架构的基石,提供了最核心的底层能力。
3.1 多模态LLM
作用:提供多模态理解和生成能力,支持图像+文本的联合处理,是连接视觉和语言的桥梁。
-
OpenAI GPT-4V(视觉版)
from langchain_openai import ChatOpenAI
from langchain.schema.messages import HumanMessage, SystemMessage
import base64

# Vision-capable chat model.
model = ChatOpenAI(
    model="gpt-4o",  # or gpt-4-turbo (vision-capable)
    max_tokens=1024
)

def encode_image(image_path):
    """Read a local image and return its base64-encoded string."""
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")

image_data = encode_image("cat.jpg")

# Multimodal message: one text part + one inline base64 image part.
messages = [
    SystemMessage(content="你是一个图像分析助手,请详细描述图片内容。"),
    HumanMessage(content=[
        {"type": "text", "text": "这张图片里有什么?"},
        {"type": "image_url", "image_url": {
            "url": f"data:image/jpeg;base64,{image_data}"
        }}
    ])
]

response = model.invoke(messages)
print(response.content)
Google Gemini(原生多模态)
from langchain_google_genai import ChatGoogleGenerativeAI

# Gemini is natively multimodal (images, video, audio).
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-pro-vision",
    temperature=0.7
)

from langchain_core.messages import HumanMessage

# Image parts may be given as a local path or a URL.
message = HumanMessage(
    content=[
        {"type": "text", "text": "这张图表展示了什么趋势?"},
        {"type": "image_url", "image_url": "chart.png"},
        {"type": "image_url", "image_url": "https://example.com/image.jpg"}
    ]
)
response = llm.invoke([message])
Anthropic Claude-3(支持图像)
from langchain_anthropic import ChatAnthropic

model = ChatAnthropic(model="claude-3-sonnet-20240229")

# Claude-3 accepts image parts referenced by URL.
messages = [
    HumanMessage(content=[
        {"type": "text", "text": "这张图表说明了什么?"},
        {"type": "image_url", "image_url": {
            "url": "https://example.com/chart.png"
        }}
    ])
]
response = model.invoke(messages)
国内多模态 LLM(通义千问 / 文心一言)
# Alibaba Cloud Tongyi Qianwen (DashScope) multimodal chat model.
# BUGFIX: langchain_community has no `ChatDashScope`; the DashScope/Qwen
# integration class is `ChatTongyi`.
from langchain_community.chat_models import ChatTongyi
from langchain_core.messages import HumanMessage

llm = ChatTongyi(
    model="qwen-vl-plus",  # Tongyi Qianwen vision-language model
    dashscope_api_key="YOUR_DASHSCOPE_KEY"
)

# Local images must be base64-encoded before sending.
import base64

def image_to_base64(image_path):
    """Return the base64 string of a local image file."""
    with open(image_path, "rb") as f:
        return base64.b64encode(f.read()).decode("utf-8")

message = HumanMessage(
    content=[
        {"type": "text", "text": "分析这张PDF里的表格数据"},
        {"type": "image", "image": image_to_base64("表格截图.png")}
    ]
)
response = llm.invoke([message])
print(response.content)
3.2 多模态Vector Stores
作用:存储和索引多模态数据的向量表示,支持跨模态相似性搜索(文本搜图像、图像搜图像等)。
-
Chroma多模态存储
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import CLIPEmbeddings

# Shared text/image embedding space (CLIP).
embeddings = CLIPEmbeddings()

# Persistent multimodal collection.
vectorstore = Chroma(
    collection_name="multimodal_collection",
    embedding_function=embeddings,
    persist_directory="./multimodal_db"
)

# Mixed records: some carry an image path, some are pure text.
multimodal_docs = [
    {
        "page_content": "这张图片展示了一只猫",
        "metadata": {"type": "text", "source": "description"},
        "image_path": "cat.jpg"  # triggers image-embedding ingestion
    },
    {
        "page_content": "产品使用说明书",
        "metadata": {"type": "document"},
        "images": ["manual_page1.png", "diagram.png"]  # multiple images
    }
]

# Route each record to the image or text ingestion API.
for doc in multimodal_docs:
    if "image_path" in doc:
        vectorstore.add_images(
            texts=[doc["page_content"]],
            images=[doc["image_path"]],
            metadatas=[doc["metadata"]]
        )
    else:
        vectorstore.add_texts(
            texts=[doc["page_content"]],
            metadatas=[doc["metadata"]]
        )

vectorstore.persist()
多模态检索
# Text query restricted to image-type entries.
text_results = vectorstore.similarity_search(
    "动物照片",
    k=5,
    filter={"type": "image"}
)

# Image-to-image search.
image_results = vectorstore.similarity_search_by_image(
    "query_cat.jpg",
    k=3
)

# Hybrid query mixing a text query and an image query.
hybrid_results = vectorstore.similarity_search_multimodal(
    text_query="宠物",
    image_query="animal.jpg",
    weights=[0.5, 0.5]  # text / image weights
)

# Inspect the merged hits.
for doc in hybrid_results:
    print(f"内容:{doc.page_content}")
    print(f"类型:{doc.metadata.get('type')}")
    print(f"相似度:{doc.metadata.get('score')}")
    print("---")
3.3 多模态Tools
作用:提供专门处理多模态任务的工具,如OCR、物体检测、图像描述、音频转录等。
-
内置多模态工具
from langchain.tools import BaseTool
from langchain_community.tools import (
    ImageCaptionTool,
    ObjectDetectionTool,
    OCRTool,
    AudioTranscriptionTool
)
from langchain.agents import initialize_agent, AgentType

# Assemble the multimodal tool belt.
tools = [
    ImageCaptionTool(),                        # image captioning
    ObjectDetectionTool(model="yolov8"),       # object detection
    OCRTool(language="chinese"),               # text recognition
    AudioTranscriptionTool(model="whisper-1")  # audio transcription
]

# Function-calling agent over the multimodal tools.
agent = initialize_agent(
    tools=tools,
    llm=llm,  # multimodal LLM
    agent=AgentType.OPENAI_FUNCTIONS,
    verbose=True
)

response = agent.run(
    "分析这张照片(image.jpg)中的内容,识别所有文字,并描述听到的音频(audio.mp3)"
)
自定义多模态工具
from langchain.tools import BaseTool
from typing import Type, Optional
from pydantic import BaseModel, Field

class ImageAnalysisInput(BaseModel):
    """Input schema for the image-analysis tool."""
    image_path: str = Field(description="图像文件路径")
    analysis_type: str = Field(description="分析类型:caption/detect/ocr/color")

class CustomImageAnalyzer(BaseTool):
    """Custom tool: caption / object detection / OCR / color analysis."""
    name = "image_analyzer"
    description = "分析图像内容,支持描述、物体检测、OCR和颜色分析"
    args_schema: Type[BaseModel] = ImageAnalysisInput

    def _run(self, image_path: str, analysis_type: str) -> str:
        """Dispatch the requested analysis on the image."""
        from PIL import Image  # removed unused `import requests`

        image = Image.open(image_path)
        if analysis_type == "caption":
            # Delegate captioning to the (module-level) multimodal LLM.
            response = llm.invoke([
                HumanMessage(content=[
                    {"type": "text", "text": "用一句话描述"},
                    {"type": "image_url", "image_url": image_path}
                ])
            ])
            return response.content
        elif analysis_type == "detect":
            # Placeholder object-detection logic.
            return "检测到:人、车、树"
        elif analysis_type == "ocr":
            # Placeholder OCR logic.
            return "识别的文字:..."
        elif analysis_type in ("color", "colors"):
            # BUGFIX: the schema advertises "color" but only "colors" was
            # matched before — accept both for backward compatibility.
            return "主要颜色:蓝色、绿色"
        return "未知分析类型"

    def _arun(self, *args, **kwargs):
        raise NotImplementedError("不支持异步")

# Example invocation of the custom tool.
custom_tool = CustomImageAnalyzer()
result = custom_tool.run({"image_path": "scene.jpg", "analysis_type": "caption"})
四、中间层
中间层负责数据的加载、转换、嵌入和解析,是连接基础层和应用层的桥梁。
4.1 多模态Document Loaders
作用:加载各种格式的文件(图像、PDF、音频、视频),并提取其中的多模态内容。
-
图像加载器
from langchain_community.document_loaders import ImageLoader
from langchain_community.document_loaders import UnstructuredImageLoader

# Option 1: plain image loader over a directory.
loader = ImageLoader("images/")
docs = loader.load()

# Option 2: Unstructured with OCR-based text extraction.
loader = UnstructuredImageLoader(
    "receipt.jpg",
    mode="elements",      # split output by element
    strategy="ocr_only"   # extract text via OCR
)
elements = loader.load()

# Keep only the image-typed elements.
for element in elements:
    if element.metadata["category"] == "Image":
        print(f"图像描述:{element.page_content}")
PDF加载器(支持图像提取)
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders.parsers import (
    OpenAIWhisperParser,
    AzureAIDocumentIntelligenceParser
)

# Azure Document Intelligence extracts layout, including embedded images.
loader = PyPDFLoader(
    "document.pdf",
    parser=AzureAIDocumentIntelligenceParser(
        api_endpoint="your_endpoint",
        api_key="your_key",
        model="prebuilt-layout"  # layout extraction, incl. images
    )
)
docs = loader.load()

# Walk any images surfaced in the metadata.
for doc in docs:
    if "images" in doc.metadata:
        for image in doc.metadata["images"]:
            print(f"图像位置:{image['page']}")
            print(f"图像描述:{image['caption']}")
音频/视频加载器
from langchain_community.document_loaders import (
    YoutubeAudioLoader,
    AssemblyAIAudioLoader
)

# Download YouTube audio and transcribe it.
loader = YoutubeAudioLoader(
    ["https://youtube.com/watch?v=..."],
    save_dir="./audio/"
)
docs = loader.load()

# AssemblyAI adds diarization / chapters / sentiment on top of transcription.
from langchain_community.document_loaders import AssemblyAIAudioLoader
loader = AssemblyAIAudioLoader(
    file_path="meeting.mp3",
    api_key="your_api_key",
    params={
        "speaker_labels": True,      # identify speakers
        "auto_chapters": True,       # automatic chaptering
        "sentiment_analysis": True   # sentiment analysis
    }
)
transcript = loader.load()
多模态目录加载器
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import (
    TextLoader,
    ImageLoader,
    UnstructuredPDFLoader
)

# Map file extensions to dedicated loaders.
loaders = {
    ".txt": TextLoader,
    ".jpg": ImageLoader,
    ".png": ImageLoader,
    ".pdf": UnstructuredPDFLoader
}

# NOTE(review): the stock DirectoryLoader API takes glob/loader_cls rather
# than a loader_mapping keyword — confirm against the installed version.
loader = DirectoryLoader(
    "./data/",
    loader_mapping=loaders,  # pick a loader per extension
    recursive=True,
    show_progress=True
)
documents = loader.load()
print(f"加载了 {len(documents)} 个文档")
4.2 多模态Transformers
作用:对加载的多模态数据进行预处理、增强、格式转换等操作。
-
图像转换器
class ImageTransformer:
    """Image pre-processing helpers (PIL / OpenCV).

    Relies on module-level `Image` (PIL), `cv2`, `np` and `List` being in
    scope — they are not imported in this snippet.
    """

    @staticmethod
    def resize_image(image_path: str, max_size: int = 1024) -> str:
        """Downscale so the longest side is <= max_size; return the new path.

        Shrinking the image reduces vision-token consumption downstream.
        """
        import os
        img = Image.open(image_path)
        if max(img.size) > max_size:
            ratio = max_size / max(img.size)
            new_size = tuple(int(dim * ratio) for dim in img.size)
            img = img.resize(new_size, Image.Resampling.LANCZOS)
        # BUGFIX: prefix only the file name — prefixing the whole path turned
        # "images/cat.jpg" into the nonexistent "resized_images/cat.jpg".
        directory, filename = os.path.split(image_path)
        output_path = os.path.join(directory, f"resized_{filename}")
        img.save(output_path, optimize=True, quality=85)
        return output_path

    @staticmethod
    def enhance_image(image_path: str) -> np.ndarray:
        """Enhance contrast and sharpness; returns a grayscale ndarray."""
        img = cv2.imread(image_path)
        # Grayscale conversion.
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        # Adaptive histogram equalization (CLAHE).
        clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
        enhanced = clahe.apply(gray)
        # Sharpening kernel.
        kernel = np.array([[-1, -1, -1], [-1, 9, -1], [-1, -1, -1]])
        sharpened = cv2.filter2D(enhanced, -1, kernel)
        return sharpened

    @staticmethod
    def extract_frames(video_path: str, interval_sec: int = 5) -> List[str]:
        """Sample one frame every `interval_sec` seconds; return frame paths."""
        cap = cv2.VideoCapture(video_path)
        fps = cap.get(cv2.CAP_PROP_FPS)
        frame_interval = int(fps * interval_sec)
        frames = []
        frame_count = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            if frame_count % frame_interval == 0:
                frame_path = f"frame_{frame_count}.jpg"
                cv2.imwrite(frame_path, frame)
                frames.append(frame_path)
            frame_count += 1
        cap.release()
        return frames
音频转换器
class AudioTransformer:
    """Audio pre-processing helpers (ffmpeg / pydub)."""

    @staticmethod
    def convert_to_wav(audio_path: str) -> str:
        """Re-encode any audio file to 16 kHz mono PCM WAV (Whisper-friendly)."""
        import subprocess
        output_path = audio_path.rsplit('.', 1)[0] + '.wav'
        cmd = [
            'ffmpeg', '-i', audio_path,
            '-ar', '16000',      # 16 kHz sample rate
            '-ac', '1',          # mono
            '-c:a', 'pcm_s16le',
            output_path
        ]
        subprocess.run(cmd, check=True)
        return output_path

    @staticmethod
    def split_audio(audio_path: str, segment_duration: int = 60) -> List[str]:
        """Cut a long recording into fixed-length WAV segments."""
        from pydub import AudioSegment
        audio = AudioSegment.from_file(audio_path)
        duration_ms = len(audio)
        segment_ms = segment_duration * 1000
        segments = []
        for i, start in enumerate(range(0, duration_ms, segment_ms)):
            end = min(start + segment_ms, duration_ms)
            piece = audio[start:end]
            piece_path = f"segment_{i}.wav"
            piece.export(piece_path, format="wav")
            segments.append(piece_path)
        return segments
文本转换器
class TextTransformer:
    """Text cleaning and chunking helpers."""

    @staticmethod
    def clean_text(text: str) -> str:
        """Collapse whitespace and drop unusual symbols (keeps CJK + basic punctuation)."""
        import re
        # Collapse runs of whitespace into a single space.
        text = re.sub(r'\s+', ' ', text)
        # Keep word chars, whitespace, CJK range and basic punctuation only.
        text = re.sub(r'[^\w\s\u4e00-\u9fff\.\,\!\?\-\:]', '', text)
        return text.strip()

    @staticmethod
    def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> List[str]:
        """Split text into overlapping chunks, preferring sentence boundaries.

        BUGFIX: the original loop never terminated — after the final chunk it
        kept recomputing start = end - overlap forever, and a sentence split
        close to `start` could even move `start` backwards. The loop now
        breaks once the end of the text is reached and always advances by at
        least one character per iteration.
        """
        chunks = []
        start = 0
        text_len = len(text)
        while start < text_len:
            end = min(start + chunk_size, text_len)
            if end < text_len:
                # Prefer to cut right after a sentence-ending period.
                last_period = text.rfind('。', start, end)
                last_dot = text.rfind('.', start, end)
                split_point = max(last_period, last_dot)
                if split_point > start:
                    end = split_point + 1
            chunks.append(text[start:end])
            if end >= text_len:
                break  # reached the end — don't loop on the tail
            start = max(end - overlap, start + 1)  # guarantee forward progress
        return chunks
多模态转换管道
pythonclass MultimodalTransformPipeline: """多模态转换管道""" def __init__(self): self.image_transformer = ImageTransformer() self.audio_transformer = AudioTransformer() self.text_transformer = TextTransformer() async def process_document(self, doc: Document) -> Document: """处理单个文档""" file_path = doc.metadata.get("source", "") file_ext = file_path.split('.')[-1].lower() # 根据文件类型应用不同转换 if file_ext in ['jpg', 'jpeg', 'png']: # 图像处理 enhanced = self.image_transformer.enhance_image(file_path) doc.page_content = f"图像已增强,原始描述:{doc.page_content}" elif file_ext in ['mp3', 'wav', 'm4a']: # 音频处理 wav_path = self.audio_transformer.convert_to_wav(file_path) doc.metadata['processed_audio'] = wav_path elif file_ext in ['txt', 'md']: # 文本处理 doc.page_content = self.text_transformer.clean_text(doc.page_content) return doc async def process_batch(self, docs: List[Document]) -> List[Document]: """批量处理""" tasks = [self.process_document(doc) for doc in docs] return await asyncio.gather(*tasks) # 使用转换管道 transformer = MultimodalTransformPipeline() processed_docs = await transformer.process_batch(all_docs)
4.3 多模态Embeddings
作用:将不同模态的数据(文本、图像、音频)映射到同一个向量空间,使它们可以相互比较和检索。
-
CLIP嵌入(图像+文本对齐)
from langchain_community.embeddings import CLIPEmbeddings

# CLIP maps text and images into a single shared vector space.
embedding_model = CLIPEmbeddings(
    model_name="openai/clip-vit-base-patch32",
    device="cpu"  # or "cuda"
)

# Text side.
text_embeddings = embedding_model.embed_documents([
    "一只可爱的猫",
    "美丽的日落风景"
])

# Image side.
image_paths = ["cat.jpg", "sunset.png"]
image_embeddings = embedding_model.embed_images(image_paths)

# Cross-modal similarity = dot product between the two vectors.
import numpy as np
similarity = np.dot(text_embeddings[0], image_embeddings[0])
print(f"文本与图像的相似度:{similarity}")
BridgeTower(视觉语言联合嵌入)
from langchain_experimental.embeddings import BridgeTowerEmbeddings

# Joint vision-language encoder.
embeddings = BridgeTowerEmbeddings(
    model_name="BridgeTower/bridgetower-large",
    batch_size=16
)

# Paired text + image inputs.
pairs = [
    {"text": "一只狗在草地上奔跑", "image": "dog.jpg"},
    {"text": "汽车在高速公路上", "image": "car.jpg"}
]

# One joint embedding per pair.
joint_embeddings = embeddings.embed_image_text_pairs(pairs)
ImageBind(六模态对齐)
from langchain_experimental.embeddings import ImageBindEmbeddings

# ImageBind aligns six modalities: vision, text, audio, depth, thermal, IMU.
embeddings = ImageBindEmbeddings(
    model_name="imagebind-huge",
    modalities=["vision", "text", "audio"]
)

# Inputs grouped by modality.
modality_inputs = {
    "vision": ["photo.jpg", "drawing.png"],
    "text": ["海浪的声音", "鸟鸣"],
    "audio": ["wave.mp3", "bird.wav"]
}
all_embeddings = embeddings.embed_modalities(modality_inputs)
4.4 多模态Output Parsers
作用:将多模态LLM的输出解析为结构化数据,便于后续处理和使用。
-
多模态Pydantic解析器
from langchain_core.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field
from typing import Optional, List
from langchain.output_parsers import OutputFixingParser

# Output schema for a single analyzed image.
# (Docstrings/Field descriptions are kept verbatim — they are emitted into
# the format instructions the LLM sees.)
class ImageAnalysis(BaseModel):
    """图像分析结果"""
    description: str = Field(description="图像描述")
    objects: List[str] = Field(description="检测到的物体")
    colors: List[str] = Field(description="主要颜色")
    text_detected: Optional[str] = Field(description="检测到的文字")
    sentiment: str = Field(description="图像情感")
    confidence: float = Field(description="分析置信度")

# Output schema for a whole multimodal document.
class MultimodalDocument(BaseModel):
    """多模态文档"""
    title: str
    document_type: str = Field(description="文档类型:invoice/report/letter")
    images: List[ImageAnalysis] = Field(description="图像分析列表")
    tables: Optional[List[dict]] = Field(description="提取的表格")
    summary: str
    key_points: List[str]

parser = PydanticOutputParser(pydantic_object=MultimodalDocument)

# Wrap with a fixing parser so malformed JSON is auto-repaired by an LLM.
fixing_parser = OutputFixingParser.from_llm(
    parser=parser,
    llm=ChatOpenAI(model="gpt-4")
)

from langchain_core.prompts import PromptTemplate

prompt = PromptTemplate(
    template="分析以下文档并输出JSON格式:\n{document}\n{format_instructions}",
    input_variables=["document"],
    partial_variables={"format_instructions": parser.get_format_instructions()}
)

chain = prompt | llm | fixing_parser
result = chain.invoke({"document": "发票扫描件..."})
print(result.images[0].description)
多模态数据提取器
from langchain.output_parsers import StructuredOutputParser
from langchain.output_parsers import ResponseSchema

# Declare the structured fields the LLM must emit.
response_schemas = [
    ResponseSchema(
        name="text_content",
        description="提取的文本内容"
    ),
    ResponseSchema(
        name="image_analysis",
        description="图像分析结果列表",
        type="list"
    ),
    ResponseSchema(
        name="audio_transcript",
        description="音频转录结果",
        type="string"
    ),
    ResponseSchema(
        name="metadata",
        description="文档元数据",
        type="dict"
    )
]

parser = StructuredOutputParser.from_response_schemas(response_schemas)

# Inject these instructions into the prompt so the LLM follows the schema.
format_instructions = parser.get_format_instructions()
五、应用层
应用层基于基础层和中间层提供的能力,构建具体的多模态应用。
5.1 多模态Chain
作用:将多个多模态组件组合成可执行的链式流程,实现复杂的多模态处理逻辑。
python
from langchain.chains import (
LLMChain,
SequentialChain,
TransformChain,
SimpleSequentialChain
)
from langchain.chains.base import Chain
from langchain_core.prompts import ChatPromptTemplate
from typing import Dict, Any, List
# ========== 基础多模态链 ==========
class MultimodalBaseChain(LLMChain):
    """Base multimodal chain: wraps the LLM with a fixed system prompt and
    normalizes image inputs before prompt formatting.

    NOTE(review): prep_inputs overrides LLMChain.prep_inputs without calling
    super(), which skips the base class's input validation / memory merging —
    confirm that is intended.
    """
    def __init__(self, llm, prompt_template: str, **kwargs):
        prompt = ChatPromptTemplate.from_messages([
            ("system", "你是一个多模态AI助手"),
            ("user", prompt_template)
        ])
        super().__init__(llm=llm, prompt=prompt, **kwargs)
    def prep_inputs(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
        """Pre-process inputs: inline local image files as base64 data URLs.

        Mutates and returns the same `inputs` dict.
        """
        # Handle an optional "image" entry.
        if "image" in inputs:
            if isinstance(inputs["image"], str):
                # A string with a known image extension is treated as a local
                # file path and replaced by an image_url content part.
                # (`base64` must be imported at module level.)
                if inputs["image"].endswith(('.jpg', '.png', '.jpeg')):
                    with open(inputs["image"], "rb") as f:
                        img_base64 = base64.b64encode(f.read()).decode()
                    inputs["image"] = {
                        "type": "image_url",
                        # NOTE(review): media type is hard-coded to jpeg even
                        # for .png inputs — confirm downstream models accept it.
                        "image_url": f"data:image/jpeg;base64,{img_base64}"
                    }
        return inputs
# ========== 图像分析链 ==========
class ImageAnalysisChain(SequentialChain):
    """Image-analysis pipeline: caption -> object detection -> scene analysis.

    Each stage is an LLMChain; SequentialChain wires stage outputs into the
    next stage's prompt via the {description} / {objects} variables.
    """
    def __init__(self, llm, **kwargs):
        # Stage 1: caption the image.
        describe_prompt = ChatPromptTemplate.from_messages([
            ("system", "详细描述图像内容"),
            ("user", [
                {"type": "text", "text": "描述这张图像"},
                {"type": "image_url", "image_url": "{image}"}
            ])
        ])
        describe_chain = LLMChain(
            llm=llm,
            prompt=describe_prompt,
            output_key="description"
        )
        # Stage 2: list objects, grounded on both the caption and the image.
        detect_prompt = ChatPromptTemplate.from_messages([
            ("system", "列出图像中的所有物体"),
            ("user", [
                {"type": "text", "text": "基于描述:{description},列出所有物体"},
                {"type": "image_url", "image_url": "{image}"}
            ])
        ])
        detect_chain = LLMChain(
            llm=llm,
            prompt=detect_prompt,
            output_key="objects"
        )
        # Stage 3: text-only scene/mood analysis over the previous outputs.
        scene_prompt = ChatPromptTemplate.from_messages([
            ("system", "分析图像场景和情感"),
            ("user", "图像描述:{description}\n物体:{objects}\n请分析:场景类型、时间、情感氛围")
        ])
        scene_chain = LLMChain(
            llm=llm,
            prompt=scene_prompt,
            output_key="scene_analysis"
        )
        # Wire all three stages; expose every intermediate output to callers.
        super().__init__(
            chains=[describe_chain, detect_chain, scene_chain],
            input_variables=["image"],
            output_variables=["description", "objects", "scene_analysis"],
            **kwargs
        )
# ========== 多模态RAG链 ==========
class MultimodalRAGChain(Chain):
    """Multimodal RAG chain: retrieve by text or by image, then generate.

    BUGFIX: `Chain` is a pydantic model, so instance attributes must be
    declared as class-level fields — assigning undeclared attributes inside
    __init__ raises a validation error. The field declarations below fix that
    while keeping the original constructor signature.
    """

    retriever: Any = None  # vector store exposing similarity_search[_by_image]
    llm: Any = None        # multimodal chat model
    prompt: Any = None     # built in __init__

    def __init__(self, retriever, llm, **kwargs):
        super().__init__(**kwargs)
        self.retriever = retriever
        self.llm = llm
        self.prompt = ChatPromptTemplate.from_messages([
            ("system", "基于以下多模态上下文回答问题"),
            ("user", "上下文:{context}\n\n问题:{question}")
        ])

    @property
    def input_keys(self) -> List[str]:
        return ["question", "modality"]

    @property
    def output_keys(self) -> List[str]:
        return ["answer", "sources"]

    def _call(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
        """Retrieve modality-appropriate context and generate an answer."""
        if inputs.get("modality") == "image":
            # Image query path.
            docs = self.retriever.similarity_search_by_image(
                inputs["question"],
                k=3
            )
        else:
            # Text query path.
            docs = self.retriever.similarity_search(
                inputs["question"],
                k=5
            )
        # Tag each context line with its modality for the LLM.
        context = "\n".join([
            f"[{doc.metadata.get('type', 'text')}] {doc.page_content}"
            for doc in docs
        ])
        # Generate the answer from the assembled context.
        response = self.llm.invoke(
            self.prompt.format_messages(
                context=context,
                question=inputs["question"]
            )
        )
        return {
            "answer": response.content,
            "sources": [doc.metadata for doc in docs]
        }
# ========== 自定义多模态转换链 ==========
class MultimodalTransformChain(TransformChain):
    """Deterministic (no-LLM) transform chain that maps a dict of multimodal
    inputs to lightweight per-file metadata descriptions."""
    def __init__(self, **kwargs):
        transform_func = self._transform_multimodal
        super().__init__(
            input_variables=["input_data"],
            output_variables=["processed_data"],
            transform=transform_func,
            **kwargs
        )
    def _transform_multimodal(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
        """Route each entry of inputs["input_data"] by its modality key."""
        data = inputs["input_data"]
        result = {"processed_data": {}}
        if isinstance(data, dict):
            for key, value in data.items():
                if key == "image" and isinstance(value, str):
                    # Image path -> basic PIL metadata.
                    # NOTE(review): an "image" value with any other extension
                    # is silently dropped from the result — confirm intended.
                    if value.endswith(('.jpg', '.png')):
                        result["processed_data"][key] = self._process_image(value)
                elif key == "audio" and isinstance(value, str):
                    # Audio path -> file-size metadata.
                    result["processed_data"][key] = self._process_audio(value)
                else:
                    # Anything else passes through unchanged.
                    result["processed_data"][key] = value
        else:
            # Non-dict payloads are coerced to text.
            result["processed_data"]["text"] = str(data)
        return result
    def _process_image(self, image_path: str) -> Dict:
        """Describe an image file (size/format/mode) without decoding pixels."""
        from PIL import Image
        img = Image.open(image_path)
        return {
            "path": image_path,
            "size": img.size,
            "format": img.format,
            "mode": img.mode
        }
    def _process_audio(self, audio_path: str) -> Dict:
        """Describe an audio file by path and byte size."""
        import os
        stat = os.stat(audio_path)
        return {
            "path": audio_path,
            "size": stat.st_size,
            "duration": "unknown"  # a real implementation could probe with an audio library
        }
# ========== 使用示例 ==========
def multimodal_chain_example():
    """Demo: run the image-analysis chain, then the multimodal RAG chain."""
    # Vision-capable model shared by both chains.
    vision_llm = ChatOpenAI(model="gpt-4-vision-preview")

    # --- image analysis ---
    analysis_chain = ImageAnalysisChain(llm=vision_llm)
    image_result = analysis_chain({
        "image": "complex_scene.jpg"
    })
    print("=== 图像分析结果 ===")
    print(f"描述:{image_result['description']}")
    print(f"物体:{image_result['objects']}")
    print(f"场景分析:{image_result['scene_analysis']}")

    # --- multimodal RAG (assumes an already-populated vector store) ---
    store = Chroma(embedding_function=CLIPEmbeddings())
    rag_chain = MultimodalRAGChain(
        retriever=store.as_retriever(),
        llm=vision_llm
    )
    rag_result = rag_chain({
        "question": "找到的图片中有什么共同特点?",
        "modality": "image"
    })
    print("\n=== RAG结果 ===")
    print(f"答案:{rag_result['answer']}")
    print(f"来源:{rag_result['sources']}")

    return image_result, rag_result
5.2 多模态Agent
作用:智能调度多模态工具和模型,自主完成复杂的多模态任务。
python
from langchain.agents import (
AgentExecutor,
create_openai_tools_agent,
create_react_agent
)
from langchain.agents.format_scratchpad import format_to_openai_function_messages
from langchain.tools import BaseTool
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from typing import Optional, Type
import asyncio
# ========== 多模态工具集 ==========
class MultimodalToolkit:
    """Bundles the built-in multimodal tools plus two ad-hoc inline tools.

    NOTE(review): the inner MultimodalQATool reads self.llm, but BaseTool is
    a pydantic model — passing llm=... requires `llm` to be a declared field;
    confirm against the installed langchain version.
    """
    def __init__(self, llm):
        self.llm = llm
        self.tools = self._create_tools()
    def _create_tools(self) -> List[BaseTool]:
        """Assemble the complete tool list."""
        return [
            ImageCaptionTool(),
            ObjectDetectionTool(),
            OCRTool(),
            AudioTranscriptionTool(),
            self._create_image_search_tool(),
            self._create_multimodal_qa_tool()
        ]
    def _create_image_search_tool(self):
        """Build a stub image-search tool (placeholder for a real search API)."""
        class ImageSearchTool(BaseTool):
            name = "image_search"
            description = "搜索相关图像"
            def _run(self, query: str) -> str:
                # Canned response; a real implementation would call a search API.
                return f"找到与'{query}'相关的图像:image1.jpg, image2.jpg"
            def _arun(self, query: str):
                raise NotImplementedError
        return ImageSearchTool()
    def _create_multimodal_qa_tool(self):
        """Build a QA tool that routes by file extension (image vs audio)."""
        class MultimodalQATool(BaseTool):
            name = "multimodal_qa"
            description = "回答关于图像、音频等多模态内容的问题"
            def _run(self, query: str, file_path: Optional[str] = None) -> str:
                if file_path and file_path.endswith(('.jpg', '.png')):
                    # Image question answering.
                    return self._answer_image_question(query, file_path)
                elif file_path and file_path.endswith(('.mp3', '.wav')):
                    # Audio question answering.
                    return self._answer_audio_question(query, file_path)
                else:
                    return "请提供有效的文件路径"
            def _answer_image_question(self, question: str, image_path: str) -> str:
                """Ask the multimodal LLM about the image directly."""
                message = HumanMessage(content=[
                    {"type": "text", "text": question},
                    {"type": "image_url", "image_url": image_path}
                ])
                response = self.llm.invoke([message])
                return response.content
            def _answer_audio_question(self, question: str, audio_path: str) -> str:
                """Transcribe the audio first, then ask the LLM about the text."""
                transcript = AudioTranscriptionTool().run(audio_path)
                response = self.llm.invoke(f"基于以下转录:{transcript}\n问题:{question}")
                return response.content
            def _arun(self, *args, **kwargs):
                raise NotImplementedError
        return MultimodalQATool(llm=self.llm)
# ========== 多模态Agent ==========
class MultimodalAgent:
    """General-purpose multimodal agent over an arbitrary tool list.

    Wraps an OpenAI-tools agent in an AgentExecutor with parsing-error
    handling and a bounded number of iterations.
    """
    def __init__(self, llm, tools: List[BaseTool]):
        self.llm = llm
        self.tools = tools
        self.agent = self._create_agent()
        self.agent_executor = AgentExecutor(
            agent=self.agent,
            tools=tools,
            verbose=True,
            handle_parsing_errors=True,
            max_iterations=5
        )

    def _create_agent(self):
        """Build the OpenAI-tools agent with a system prompt listing the tools."""
        prompt = ChatPromptTemplate.from_messages([
            ("system", """你是一个多模态AI助手,可以处理文本、图像、音频等多种数据。
你可以使用以下工具:
1. image_caption: 生成图像描述
2. object_detection: 检测图像中的物体
3. ocr: 识别图像中的文字
4. audio_transcription: 转录音频
5. image_search: 搜索相关图像
6. multimodal_qa: 回答多模态问题
请根据用户需求选择合适的工具。"""),
            ("user", "{input}"),
            MessagesPlaceholder(variable_name="agent_scratchpad")
        ])
        return create_openai_tools_agent(
            llm=self.llm,
            tools=self.tools,
            prompt=prompt
        )

    @staticmethod
    def _build_input(input_text: str, files: Optional[List[str]] = None) -> str:
        """Append the attached-file list to the request.

        Shared by invoke/ainvoke (the original duplicated this logic).
        """
        if files:
            return input_text + f"\n附带的文件:{', '.join(files)}"
        return input_text

    async def ainvoke(self, input_text: str, files: Optional[List[str]] = None):
        """Async entry point: run the executor on the prepared input."""
        response = await self.agent_executor.ainvoke({
            "input": self._build_input(input_text, files)
        })
        return response

    def invoke(self, input_text: str, files: Optional[List[str]] = None):
        """Sync entry point: run the executor on the prepared input."""
        response = self.agent_executor.invoke({
            "input": self._build_input(input_text, files)
        })
        return response
# ========== 专用多模态Agent ==========
class ImageAnalysisAgent(MultimodalAgent):
    """Agent specialized for image tasks (caption / detect / OCR / enhance)."""
    def __init__(self, llm):
        tools = [
            ImageCaptionTool(),
            ObjectDetectionTool(),
            OCRTool(),
            self._create_image_enhance_tool()
        ]
        super().__init__(llm, tools)

    def _create_image_enhance_tool(self):
        """Build an ad-hoc tool that brightens / boosts contrast with PIL."""
        class ImageEnhanceTool(BaseTool):
            name = "image_enhance"
            description = "增强图像质量(调整亮度、对比度等)"

            def _run(self, image_path: str, operation: str = "enhance") -> str:
                import os
                from PIL import Image, ImageEnhance
                img = Image.open(image_path)
                if operation == "brightness":
                    enhanced = ImageEnhance.Brightness(img).enhance(1.5)
                elif operation == "contrast":
                    enhanced = ImageEnhance.Contrast(img).enhance(1.5)
                else:
                    # Unknown operation: pass the image through unchanged.
                    enhanced = img
                # BUGFIX: prefix only the file name — prefixing the whole path
                # turned "images/x.jpg" into the nonexistent
                # "enhanced_images/x.jpg".
                directory, filename = os.path.split(image_path)
                output_path = os.path.join(directory, f"enhanced_{filename}")
                enhanced.save(output_path)
                return f"图像已增强,保存为:{output_path}"

            def _arun(self, *args, **kwargs):
                raise NotImplementedError

        return ImageEnhanceTool()
class AudioAnalysisAgent(MultimodalAgent):
    """Agent specialized for audio: transcription, diarization, sentiment."""

    def __init__(self, llm):
        audio_tools = [
            AudioTranscriptionTool(),
            self._create_speaker_diarization_tool(),
            self._create_sentiment_analysis_tool()
        ]
        super().__init__(llm, audio_tools)

    def _create_speaker_diarization_tool(self):
        """Return a stub diarization tool (canned output; swap in pyannote etc.)."""
        class SpeakerDiarizationTool(BaseTool):
            name = "speaker_diarization"
            description = "识别音频中的不同说话人"

            def _run(self, audio_path: str) -> str:
                # Fixed demo response — a real implementation would analyze
                # the audio file.
                return "检测到2个说话人:Speaker A(男性,0-30秒),Speaker B(女性,30-60秒)"

            def _arun(self, *args, **kwargs):
                raise NotImplementedError

        return SpeakerDiarizationTool()

    def _create_sentiment_analysis_tool(self):
        """Return a stub audio-sentiment tool with a fixed demo response."""
        class SentimentAnalysisTool(BaseTool):
            name = "sentiment_analysis"
            description = "分析音频中的情感"

            def _run(self, audio_path: str) -> str:
                return "整体情感:积极(80%),情感变化:从平静到兴奋"

            def _arun(self, *args, **kwargs):
                raise NotImplementedError

        return SentimentAnalysisTool()
# ========== 使用示例 ==========
async def multimodal_agent_example():
    """Demo: the generic agent plus the two specialized agents."""
    chat_model = ChatOpenAI(model="gpt-4-turbo-preview", temperature=0)

    # Generic multimodal agent over the full toolkit.
    toolkit = MultimodalToolkit(chat_model)
    general_agent = MultimodalAgent(chat_model, toolkit.tools)
    response = await general_agent.ainvoke(
        "分析这张图片(photo.jpg)中的内容,然后搜索类似的图片,并告诉我搜索结果",
        files=["photo.jpg"]
    )
    print("=== Agent响应 ===")
    print(response['output'])

    # Image-focused agent.
    image_agent = ImageAnalysisAgent(chat_model)
    image_response = image_agent.invoke(
        "增强这张图片的亮度,然后描述内容",
        files=["dark_photo.jpg"]
    )
    print("\n=== 图像Agent响应 ===")
    print(image_response['output'])

    # Audio-focused agent.
    audio_agent = AudioAnalysisAgent(chat_model)
    audio_response = audio_agent.invoke(
        "转录这段音频,识别说话人,并分析情感",
        files=["meeting.mp3"]
    )
    print("\n=== 音频Agent响应 ===")
    print(audio_response['output'])

    return response, image_response, audio_response
# 运行
# asyncio.run(multimodal_agent_example())
5.3 多模态RAG
作用:实现多模态检索增强生成,从多模态知识库中检索相关信息并生成答案。
python
from langchain.schema import Document
from langchain.retrievers import (
EnsembleRetriever,
ContextualCompressionRetriever
)
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryStore
import uuid
# ========== 多模态文档 ==========
class MultimodalDocument(Document):
    """Document subclass carrying modality metadata (text/image/audio/video).

    NOTE(review): this name collides with the pydantic MultimodalDocument in
    the output-parser section — whichever is defined last wins at import time.
    NOTE(review): langchain's Document is a pydantic model; forwarding the
    extra keys through **kwargs to super().__init__ and setting the extra
    attributes below may raise unless the model permits extras — confirm.
    """
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Content modality; defaults to plain text.
        self.modality = kwargs.get("modality", "text")
        self.file_path = kwargs.get("file_path")
        self.thumbnail = kwargs.get("thumbnail")
        self.duration = kwargs.get("duration")  # audio/video duration
    def to_text(self) -> str:
        """Render the document as text, prefixed with a modality tag."""
        if self.modality == "image":
            return f"[图像] {self.page_content}"
        elif self.modality == "audio":
            return f"[音频] {self.page_content}"
        elif self.modality == "video":
            return f"[视频] {self.page_content}"
        else:
            return self.page_content
# ========== 多模态检索器 ==========
class MultimodalRetriever:
    """Weighted per-modality retriever over a multimodal vector store."""

    def __init__(
        self,
        vectorstore,
        text_weight: float = 0.5,
        image_weight: float = 0.3,
        audio_weight: float = 0.2
    ):
        self.vectorstore = vectorstore
        # Per-modality multipliers used to blend the three result lists.
        self.weights = {
            "text": text_weight,
            "image": image_weight,
            "audio": audio_weight
        }

    def get_relevant_documents(self, query: str, **kwargs) -> List[Document]:
        """Search each modality separately, weight the scores, merge and rank."""
        per_modality_k = kwargs.get("k_per_modality", 3)
        scored_docs = []
        for modality, weight in self.weights.items():
            hits = self.vectorstore.similarity_search(
                query,
                k=per_modality_k,
                filter={"modality": modality}
            )
            # Attach a weighted score to every hit for global ranking.
            for hit in hits:
                base_score = hit.metadata.get("score", 0)
                hit.metadata["retrieval_score"] = base_score * weight
                hit.metadata["modality_weight"] = weight
                scored_docs.append(hit)
        # Highest weighted score first.
        scored_docs.sort(
            key=lambda d: d.metadata.get("retrieval_score", 0),
            reverse=True
        )
        return scored_docs[:kwargs.get("k", 5)]

    async def aget_relevant_documents(self, query: str, **kwargs):
        """Async façade — delegates to the synchronous implementation."""
        return self.get_relevant_documents(query, **kwargs)
# ========== 多模态RAG系统 ==========
class MultimodalRAG:
    """Multimodal RAG system: ingest text/image/audio documents into a shared
    vector store and answer questions over the blended context."""
    def __init__(
        self,
        llm,
        vectorstore,
        embeddings,
        retriever: Optional[MultimodalRetriever] = None
    ):
        self.llm = llm
        self.vectorstore = vectorstore
        self.embeddings = embeddings
        # Default to the weighted per-modality retriever.
        if retriever is None:
            self.retriever = MultimodalRetriever(vectorstore)
        else:
            self.retriever = retriever
        self.setup_chains()
    def setup_chains(self):
        """Build the three LLM chains used by the system."""
        # Image captioning chain.
        self.caption_chain = LLMChain(
            llm=self.llm,
            prompt=ChatPromptTemplate.from_messages([
                ("user", [
                    {"type": "text", "text": "用一句话描述这张图像"},
                    {"type": "image_url", "image_url": "{image}"}
                ])
            ])
        )
        # Audio "transcription" chain — text prompt only.
        # NOTE(review): this only asks the LLM about an {audio} placeholder;
        # confirm the intended transcription mechanism.
        self.transcribe_chain = LLMChain(
            llm=self.llm,
            prompt=ChatPromptTemplate.from_template(
                "转录这段音频的内容:{audio}"
            )
        )
        # Final answer-generation chain over the retrieved context.
        self.answer_chain = LLMChain(
            llm=self.llm,
            prompt=ChatPromptTemplate.from_messages([
                ("system", "你是一个多模态AI助手。基于以下多模态上下文回答问题。"),
                ("user", "上下文:\n{context}\n\n问题:{question}")
            ])
        )
    def add_documents(self, documents: List[MultimodalDocument]):
        """Index documents, storing modality-specific metadata per entry."""
        texts = []
        metadatas = []
        ids = []
        for doc in documents:
            # One UUID per document.
            doc_id = str(uuid.uuid4())
            ids.append(doc_id)
            # Branch on the document's modality.
            if doc.modality == "image":
                # Image: index the tagged caption text; keep the file path.
                texts.append(doc.to_text())
                metadatas.append({
                    "modality": "image",
                    "file_path": doc.file_path,
                    "description": doc.page_content,
                    "id": doc_id
                })
                # Additionally store a raw image vector when image_path is set.
                # NOTE(review): reaches into Chroma's private _collection API.
                if hasattr(doc, "image_path") and doc.image_path:
                    image_embedding = self.embeddings.embed_images([doc.image_path])[0]
                    self.vectorstore._collection.add(
                        embeddings=[image_embedding],
                        documents=[doc.page_content],
                        metadatas=[{"modality": "image_vector", "doc_id": doc_id}],
                        ids=[f"img_vec_{doc_id}"]
                    )
            elif doc.modality == "audio":
                # Audio: index the tagged transcript text.
                texts.append(doc.to_text())
                metadatas.append({
                    "modality": "audio",
                    "file_path": doc.file_path,
                    "transcript": doc.page_content,
                    "duration": doc.duration,
                    "id": doc_id
                })
            else:
                # Plain text: index as-is and merge the original metadata.
                texts.append(doc.page_content)
                metadatas.append({
                    "modality": "text",
                    "id": doc_id,
                    **doc.metadata
                })
        # Bulk-add all prepared entries to the vector store.
        self.vectorstore.add_texts(
            texts=texts,
            metadatas=metadatas,
            ids=ids
        )
    def query(self, question: str, k: int = 5) -> Dict[str, Any]:
        """Answer a question: retrieve top-k docs, build tagged context, generate."""
        # Retrieve the most relevant documents across modalities.
        docs = self.retriever.get_relevant_documents(question, k=k)
        # Build the multimodal context; collect image paths to surface to the UI.
        context_parts = []
        images_to_show = []
        for doc in docs:
            if doc.metadata.get("modality") == "image":
                # Images contribute their caption/description.
                context_parts.append(f"[图像描述] {doc.page_content}")
                if doc.metadata.get("file_path"):
                    images_to_show.append(doc.metadata["file_path"])
            elif doc.metadata.get("modality") == "audio":
                # Audio contributes its transcript.
                context_parts.append(f"[音频转录] {doc.page_content}")
            else:
                # Text contributes its raw content.
                context_parts.append(doc.page_content)
        context = "\n\n".join(context_parts)
        # Generate the final answer from the assembled context.
        response = self.answer_chain.run(
            context=context,
            question=question
        )
        return {
            "answer": response,
            "source_documents": docs,
            "images": images_to_show,
            "context": context
        }
    async def aquery(self, question: str, k: int = 5):
        """Async façade over query (runs synchronously)."""
        return self.query(question, k)
# ========== 高级多模态RAG ==========
class AdvancedMultimodalRAG(MultimodalRAG):
    """Advanced multimodal RAG: ensemble (hybrid) retrieval + LLM compression."""
    def __init__(self, llm, vectorstore, embeddings):
        super().__init__(llm, vectorstore, embeddings)
        # Build the hybrid retrieval stack on top of the base setup.
        self.setup_hybrid_retriever()
    def setup_hybrid_retriever(self):
        """Wire vector retrieval -> ensemble -> contextual compression."""
        # Dense vector retriever.
        vector_retriever = self.vectorstore.as_retriever(
            search_kwargs={"k": 10}
        )
        # Ensemble retriever (single member for now; add BM25 etc. here).
        self.hybrid_retriever = EnsembleRetriever(
            retrievers=[vector_retriever],
            weights=[1.0]
        )
        # LLM-based extractor trims irrelevant passages from retrieved docs.
        compressor = LLMChainExtractor.from_llm(self.llm)
        self.compression_retriever = ContextualCompressionRetriever(
            base_compressor=compressor,
            base_retriever=self.hybrid_retriever
        )
    def query_with_compression(self, question: str, k: int = 5):
        """Answer a question using the compression retriever's top-k docs."""
        compressed_docs = self.compression_retriever.get_relevant_documents(question)
        # Concatenate the compressed passages into one context string.
        context = "\n\n".join([doc.page_content for doc in compressed_docs[:k]])
        response = self.answer_chain.run(
            context=context,
            question=question
        )
        return {
            "answer": response,
            "compressed_docs": compressed_docs[:k],
            "all_docs": compressed_docs
        }
# ========== 使用示例 ==========
def multimodal_rag_example():
    """End-to-end demo: build the store, ingest mixed documents, query twice."""
    vision_llm = ChatOpenAI(model="gpt-4-vision-preview")
    clip = CLIPEmbeddings()
    store = Chroma(
        embedding_function=clip,
        persist_directory="./multimodal_rag_db"
    )

    rag = MultimodalRAG(vision_llm, store, clip)

    # One document per modality.
    documents = [
        MultimodalDocument(
            page_content="一只猫在沙发上睡觉",
            modality="text",
            metadata={"source": "description"}
        ),
        MultimodalDocument(
            page_content="夕阳下的海滩,有两个人在散步",
            modality="image",
            file_path="beach.jpg",
            metadata={"location": "海边"}
        ),
        MultimodalDocument(
            page_content="会议讨论产品发布计划,决定下个月发布",
            modality="audio",
            file_path="meeting.mp3",
            duration=120.5,
            metadata={"speakers": 3}
        )
    ]
    rag.add_documents(documents)

    # Basic query.
    result = rag.query("会议讨论了什么内容?有没有相关的图像?", k=3)
    print("=== 查询结果 ===")
    print(f"答案:{result['answer']}")
    print(f"相关图像:{result['images']}")
    print("\n来源文档:")
    for doc in result['source_documents']:
        print(f"  - [{doc.metadata.get('modality')}] {doc.page_content[:50]}...")

    # Compression-retrieval query via the advanced system.
    advanced_rag = AdvancedMultimodalRAG(vision_llm, store, clip)
    compressed_result = advanced_rag.query_with_compression(
        "总结所有关于海滩的内容"
    )
    print("\n=== 压缩检索结果 ===")
    print(f"答案:{compressed_result['answer']}")

    return result, compressed_result
# 运行示例
# result = multimodal_rag_example()