环境是 PyTorch + transformers + open_clip。
一、环境准备
bash
pip install torch transformers open_clip_torch pillow
# Optional: FAISS for large-scale retrieval
pip install faiss-cpu # or faiss-gpu
模型选型:
| 用途 | 推荐模型 | 加载方式 |
|---|---|---|
| 通用,平衡 | google/siglip2-so400m-patch14-384 | transformers |
| 长文本/多语言 | google/siglip2-so400m-patch16-naflex | transformers |
| 极致精度 | BAAI/EVA-CLIP-18B | open_clip |
| 中等规模 EVA | EVA02-CLIP-L-14-336 | open_clip |
二、Zero-Shot 图像分类
最基础的应用:给定一组类别名,判断图像属于哪一类。
SigLIP 2 版本
python
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModel
# Zero-shot classification with SigLIP 2: score one image against a list of
# candidate captions and print an independent probability for each caption.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "google/siglip2-so400m-patch14-384"
model = AutoModel.from_pretrained(model_id, torch_dtype=torch.bfloat16).to(device).eval()
processor = AutoProcessor.from_pretrained(model_id)

image = Image.open("cat.jpg").convert("RGB")
candidate_labels = ["a photo of a cat", "a photo of a dog", "a photo of a car"]

# Pad every caption to a fixed length. The length is given explicitly
# (max_length=64) instead of relying on the tokenizer's default, and
# truncation keeps over-long captions within that same length.
inputs = processor(
    text=candidate_labels,
    images=image,
    padding="max_length",
    max_length=64,
    truncation=True,
    return_tensors="pt",
).to(device)

with torch.no_grad():
    outputs = model(**inputs)

# Key point: SigLIP scores are passed through a sigmoid, not a softmax, so each
# caption gets its own probability and the values do not sum to 1.
logits_per_image = outputs.logits_per_image
probs = torch.sigmoid(logits_per_image)

for label, prob in zip(candidate_labels, probs[0]):
    print(f"{label}: {prob.item():.4f}")
几个容易踩的坑:
- SigLIP 用 sigmoid 不是 softmax。如果你照搬 CLIP 代码用 softmax,结果会看起来"对",但概率解释其实是错的。
- `padding="max_length"` 是 SigLIP 推荐的填充方式,因为它训练时就是固定长度的。用 CLIP 风格的 `padding=True` 在某些情况下会掉点。
- bf16 推理:SigLIP 训练用 bf16,推理也推荐 bf16,fp16 偶尔会数值不稳。
EVA-CLIP 版本(用 open_clip)
python
import torch
from PIL import Image
import open_clip
# Zero-shot classification with EVA-CLIP through open_clip: encode one image
# and a list of captions, then softmax the scaled cosine similarities.
device = "cuda" if torch.cuda.is_available() else "cpu"
model, _, preprocess = open_clip.create_model_and_transforms(
    "EVA02-L-14-336",
    pretrained="merged2b_s6b_b61k",  # standard weights for EVA02-CLIP-L
    device=device,
)
model.eval()
tokenizer = open_clip.get_tokenizer("EVA02-L-14-336")

image = preprocess(Image.open("cat.jpg").convert("RGB")).unsqueeze(0).to(device)
candidate_labels = ["a photo of a cat", "a photo of a dog", "a photo of a car"]
text = tokenizer(candidate_labels).to(device)

# torch.autocast replaces the deprecated torch.cuda.amp.autocast; mixed
# precision is only enabled when actually running on CUDA.
with torch.no_grad(), torch.autocast(device_type=device, enabled=(device == "cuda")):
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)
    # L2-normalize so the dot product below is a cosine similarity.
    image_features = image_features / image_features.norm(dim=-1, keepdim=True)
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)
    # EVA-CLIP follows the standard CLIP recipe: softmax over the captions.
    probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)

for label, prob in zip(candidate_labels, probs[0]):
    print(f"{label}: {prob.item():.4f}")
三、图文检索(含批量优化)
实际项目里很少做单图分类,更常见的是大规模图文检索。这里给一个工程化版本:
python
import torch
import torch.nn.functional as F
from PIL import Image
from transformers import AutoProcessor, AutoModel
from torch.utils.data import Dataset, DataLoader
class ImageDataset(Dataset):
    """Dataset that loads one image file per index and preprocesses it.

    Only the image branch of the processor is used here; text is processed
    separately by the caller.
    """

    def __init__(self, image_paths, processor):
        # image_paths: indexable, sized collection of image file paths.
        # processor: callable taking images=... and return_tensors="pt" and
        #   returning an object that exposes `pixel_values`.
        self.paths = image_paths
        self.processor = processor

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, idx):
        """Return (pixel_values, path) for the image at position idx."""
        path = self.paths[idx]
        # The context manager guarantees the file handle is released even for
        # formats that keep it open after decoding; convert() returns a copy
        # that stays valid after the file is closed.
        with Image.open(path) as handle:
            img = handle.convert("RGB")
        # The processor returns a batch of one; drop the batch dimension so the
        # DataLoader can stack samples itself.
        pixel_values = self.processor(images=img, return_tensors="pt").pixel_values[0]
        return pixel_values, path
@torch.no_grad()
def encode_images(model, processor, image_paths, batch_size=64, device="cuda"):
    """Encode images in batches and return L2-normalized features.

    Args:
        model: model exposing `get_image_features(pixel_values=...)`.
        processor: image processor handed to `ImageDataset`.
        image_paths: collection of image file paths.
        batch_size: number of images per forward pass.
        device: device the pixel batches are moved to.

    Returns:
        A `(features, paths)` pair: a float32 CPU tensor with one unit-norm row
        per image, and the list of paths in the same order.

    Note:
        `image_paths` is expected to be non-empty; `torch.cat` cannot
        concatenate an empty list of batches.
    """
    dataset = ImageDataset(image_paths, processor)
    # Pinned host memory only helps host-to-GPU copies.
    on_cuda = str(device).startswith("cuda")
    loader = DataLoader(dataset, batch_size=batch_size, num_workers=4, pin_memory=on_cuda)
    # Follow the model's own dtype instead of assuming bf16, so the same code
    # works for fp16/fp32 checkpoints; bf16 stays the fallback.
    model_dtype = getattr(model, "dtype", torch.bfloat16)

    all_features = []
    all_paths = []
    for pixel_values, paths in loader:
        pixel_values = pixel_values.to(device, dtype=model_dtype, non_blocking=on_cuda)
        features = model.get_image_features(pixel_values=pixel_values)
        # Normalize in fp32 so the stored vectors are unit-length to full
        # precision rather than to the model's reduced precision.
        features = F.normalize(features.float(), dim=-1)
        all_features.append(features.cpu())
        all_paths.extend(paths)
    return torch.cat(all_features), all_paths
@torch.no_grad()
def encode_texts(model, processor, texts, batch_size=128, device="cuda", max_length=64):
    """Encode texts in batches and return L2-normalized features.

    Args:
        model: model exposing `get_text_features(**inputs)`.
        processor: callable tokenizing `text=...` and returning an object with
            `.to(device)` that can be unpacked as keyword arguments.
        texts: sliceable, sized sequence of strings.
        batch_size: number of texts per forward pass.
        device: device the tokenized batch is moved to.
        max_length: fixed token length every text is padded/truncated to.

    Returns:
        A float32 CPU tensor with one unit-norm row per input text.

    Note:
        `texts` is expected to be non-empty; `torch.cat` cannot concatenate an
        empty list of batches.
    """
    all_features = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        # Pass the target length explicitly so padding/truncation does not
        # depend on whatever default the tokenizer happens to carry.
        inputs = processor(
            text=batch,
            padding="max_length",
            max_length=max_length,
            truncation=True,
            return_tensors="pt",
        ).to(device)
        features = model.get_text_features(**inputs)
        # Normalize in fp32 so the stored vectors are unit-length to full precision.
        features = F.normalize(features.float(), dim=-1)
        all_features.append(features.cpu())
    return torch.cat(all_features)
# Usage: encode a gallery of images and a list of text queries, then rank the
# images for every query by cosine similarity.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModel.from_pretrained(
    "google/siglip2-so400m-patch14-384",
    torch_dtype=torch.bfloat16,
).to(device).eval()
processor = AutoProcessor.from_pretrained("google/siglip2-so400m-patch14-384")

# Placeholder data: replace with your own lists (e.g. ~100k image paths).
# The lists must contain only real entries; a literal `...` would be passed to
# the encoders as an item and fail.
image_paths = ["img1.jpg", "img2.jpg"]
queries = ["a red sports car", "a sleeping cat"]

# Pass the chosen device through so the encoders do not fall back to their
# own "cuda" default on a CPU-only machine.
image_feats, paths = encode_images(model, processor, image_paths, device=device)
text_feats = encode_texts(model, processor, queries, device=device)

# Retrieval, text -> image: rows are queries, columns are images.
similarity = text_feats @ image_feats.T  # [num_queries, num_images]
# topk raises if k exceeds the number of images, so clamp it.
top_k = min(5, similarity.size(-1))
top_values, top_indices = similarity.topk(top_k, dim=-1)

for q_idx, query in enumerate(queries):
    print(f"\nQuery: {query}")
    ranked = zip(top_values[q_idx].tolist(), top_indices[q_idx].tolist())
    for rank, (score, idx) in enumerate(ranked):
        print(f" #{rank+1}: {paths[idx]} (score={score:.4f})")
工程要点:
- 特征缓存:图像特征算一次就存盘,下次直接 load。`torch.save(image_feats, "feats.pt")` 即可。
- 检索后端:超过 100 万规模就别用矩阵乘了,上 FAISS:

  ```python
  import faiss

  index = faiss.IndexFlatIP(image_feats.shape[1])  # 内积,要求已归一化
  index.add(image_feats.numpy())
  D, I = index.search(text_feats.numpy(), k=5)
  ```
- 避免 OOM:大规模 image features 用 `torch.float16` 存储够用,节省一半显存/内存。
四、提取视觉特征作为下游模型输入(MLLM 视觉前端)
这是当前最主流的用法——把 SigLIP / EVA-CLIP 当做特征提取器接到 LLM 上。关键是怎么取特征:取 CLS、pooled、还是 patch tokens?
python
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModel
# Run only the vision tower of SigLIP 2 on one image and pull out the three
# feature tensors that downstream models usually consume.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModel.from_pretrained(
    "google/siglip2-so400m-patch14-384",
    torch_dtype=torch.bfloat16,
).to(device).eval()
processor = AutoProcessor.from_pretrained("google/siglip2-so400m-patch14-384")

image = Image.open("scene.jpg").convert("RGB")
inputs = processor(images=image, return_tensors="pt").to(device)

with torch.no_grad():
    vision_outputs = model.vision_model(
        # Match the dtype the model was loaded with instead of repeating it here.
        pixel_values=inputs.pixel_values.to(model.dtype),
        output_hidden_states=True,
    )

# Three commonly used features
pooled = vision_outputs.pooler_output  # [B, D] global feature, for classification/retrieval
patch_tokens = vision_outputs.last_hidden_state  # [B, N, D] all patches, for MLLMs
penultimate = vision_outputs.hidden_states[-2]  # [B, N, D] second-to-last entry of hidden_states

print(f"Pooled: {pooled.shape}")
print(f"Patch tokens: {patch_tokens.shape}")  # 384 // 14 = 27 per side -> 27 * 27 = 729 patches
print(f"Penultimate: {penultimate.shape}")
实践经验:
- MLLM 普遍用倒数第二层 (penultimate) 的 patch tokens,而不是最后一层。LLaVA、Qwen-VL 等都是这么做的。原因是最后一层经过对比学习压缩到全局表示,丢失了局部细节
- 不要用 pooled output 喂给 LLM——会丢失空间信息
- 如果用 SigLIP 2 NaFlex 变体,处理时要传入原始宽高比信息,否则会回退到 fixed-resolution 模式
简化的"接 LLM"代码骨架:
python
import torch.nn as nn
class VisionProjector(nn.Module):
    """Project vision patch features into the LLM embedding space."""

    def __init__(self, vision_dim=1152, llm_dim=4096):
        super().__init__()
        # Two-layer MLP in the LLaVA-1.5 style: Linear -> GELU -> Linear.
        layers = [
            nn.Linear(in_features=vision_dim, out_features=llm_dim),
            nn.GELU(),
            nn.Linear(in_features=llm_dim, out_features=llm_dim),
        ]
        self.proj = nn.Sequential(*layers)

    def forward(self, patch_features):
        # [B, N, vision_dim] -> [B, N, llm_dim]
        projected = self.proj(patch_features)
        return projected
# Full pipeline
def _param_dtype(module, default):
    """Return the dtype of module's first parameter, or default if it has none."""
    try:
        return next(module.parameters()).dtype
    except (AttributeError, StopIteration, TypeError):
        return default


def encode_image_for_llm(image, vision_model, projector, processor, device):
    """Turn one image into visual embeddings ready to be fed to an LLM.

    Args:
        image: image accepted by `processor(images=...)`.
        vision_model: model whose `.vision_model(...)` returns an output with
            `hidden_states`.
        projector: callable mapping patch features to the LLM embedding size.
        processor: image processor returning an object with `.to(device)` and
            `.pixel_values`.
        device: device the processed inputs are moved to.

    Returns:
        The projector output for the penultimate hidden state; it is meant to
        be concatenated with text embeddings before the LLM.
    """
    inputs = processor(images=image, return_tensors="pt").to(device)
    # Cast pixels to the vision tower's own dtype; bf16 remains the fallback
    # when the dtype cannot be read from its parameters.
    vision_dtype = _param_dtype(vision_model, torch.bfloat16)
    with torch.no_grad():
        vision_out = vision_model.vision_model(
            pixel_values=inputs.pixel_values.to(vision_dtype),
            output_hidden_states=True,
        )
    patch_tokens = vision_out.hidden_states[-2]  # second-to-last hidden state
    # The projector may live in a different precision than the vision tower
    # (a freshly built nn.Module is fp32); align dtypes to avoid a matmul
    # dtype mismatch.
    patch_tokens = patch_tokens.to(_param_dtype(projector, patch_tokens.dtype))
    # The projection runs outside no_grad so the projector can be trained.
    visual_embeds = projector(patch_tokens)
    return visual_embeds
五、Linear Probing(验证表征质量)
如果想评估某个 CLIP 变体的视觉表征质量,最简单的方法是 linear probing:冻结主干,只在 pooled 特征上训一个线性分类器。
python
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
# Fall back to CPU when no GPU is present instead of failing on .to("cuda").
device = "cuda" if torch.cuda.is_available() else "cpu"

# 1. Load SigLIP and freeze it
model = AutoModel.from_pretrained(
    "google/siglip2-so400m-patch14-384",
    torch_dtype=torch.float32,  # fp32 is the more stable choice for linear probing
).to(device).eval()
# The backbone is only a feature extractor here, so no parameter needs gradients.
for p in model.parameters():
    p.requires_grad = False
processor = AutoProcessor.from_pretrained("google/siglip2-so400m-patch14-384")
# 2. Extract features for a whole dataset (compute once, then cache)
def _collate_keep_items(batch):
    """Split (image, label) samples into two parallel tuples without stacking."""
    images, labels = zip(*batch)
    return images, labels


@torch.no_grad()
def extract_features(dataset, batch_size=128):
    """Run the frozen model over dataset and collect features and labels.

    Uses the module-level `processor`, `model` and `device`.

    Args:
        dataset: dataset yielding `(image, label)` samples.
        batch_size: number of samples per forward pass.

    Returns:
        A `(features, labels)` pair of CPU tensors, concatenated over batches.

    Note:
        `dataset` is expected to be non-empty; `torch.cat` cannot concatenate
        an empty list of batches.
    """
    feats, labels = [], []
    # The default collate function cannot batch PIL images, which is what
    # image datasets return when no tensor transform is set. Keeping samples
    # as plain tuples lets the processor do all resizing and normalization.
    # A named function (not a lambda) keeps the loader usable with worker
    # processes.
    loader = DataLoader(
        dataset,
        batch_size=batch_size,
        num_workers=4,
        collate_fn=_collate_keep_items,
    )
    for imgs, lbls in loader:
        # NOTE(review): if the dataset already yields rescaled tensors, the
        # processor's own rescaling may be applied on top — confirm.
        inputs = processor(images=list(imgs), return_tensors="pt").to(device)
        f = model.get_image_features(**inputs)
        feats.append(f.cpu())
        labels.append(torch.as_tensor(lbls))
    return torch.cat(feats), torch.cat(labels)
# 3. Train a linear head
class LinearProbe(nn.Module):
    """Single linear layer mapping features to class scores."""

    def __init__(self, dim, num_classes):
        super().__init__()
        self.fc = nn.Linear(in_features=dim, out_features=num_classes)

    def forward(self, x):
        scores = self.fc(x)
        return scores
# scikit-learn's LogisticRegression is the more convenient route: one call, no
# hand-written training loop.
from sklearn.linear_model import LogisticRegression

train_feats, train_labels = extract_features(train_dataset)
test_feats, test_labels = extract_features(test_dataset)

# Fit on the cached training features, then score on the held-out features.
clf = LogisticRegression(C=1.0, max_iter=1000, n_jobs=-1)
clf.fit(train_feats.numpy(), train_labels.numpy())

acc = clf.score(test_feats.numpy(), test_labels.numpy())
print(f"Linear probe accuracy: {acc:.4f}")
LogisticRegression 比手写训练循环简单得多,CLIP 原论文用的也是这个。
六、对比微调(domain adaptation)
如果你有领域内的图文对数据(如医学影像 + 报告),可以在 SigLIP 上微调。这里给一个最简实现:
python
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
class SigLIPLoss(nn.Module):
    """Pairwise sigmoid contrastive loss in the SigLIP style.

    Every (image, text) pair in the batch is treated as an independent binary
    classification: pairs on the diagonal are positives, all others negatives.
    """

    def __init__(self, temperature=10.0, bias=-10.0):
        super().__init__()
        # Both are learnable. The temperature is stored as log(t) and
        # exponentiated in forward, so the effective scale stays positive.
        self.t = nn.Parameter(torch.tensor(temperature).log())
        self.b = nn.Parameter(torch.tensor(bias))

    def forward(self, img_feats, txt_feats):
        """Return the scalar loss for a batch of paired embeddings.

        Args:
            img_feats: [n, d] image embeddings; row i pairs with text row i.
            txt_feats: [n, d] text embeddings.

        Inputs do not have to be normalized; they are L2-normalized here
        (a no-op for inputs that already are).
        """
        # Compute in fp32 so the loss stays stable for half-precision inputs.
        img_feats = F.normalize(img_feats.float(), dim=-1)
        txt_feats = F.normalize(txt_feats.float(), dim=-1)
        logits = img_feats @ txt_feats.T * self.t.exp() + self.b

        # +1 on the diagonal (positive pairs), -1 elsewhere (negative pairs).
        n = logits.size(0)
        labels = 2 * torch.eye(n, device=logits.device) - 1

        # Sum over all pairs and divide by the batch size n (not n * n).
        # Averaging over every pair would shrink the loss, and hence the
        # gradients, by another factor of n as the batch grows.
        loss = -F.logsigmoid(labels * logits).sum(dim=-1).mean()
        return loss
class ImageTextDataset(Dataset):
    """Dataset of (image, caption) pairs; images are loaded lazily from disk."""

    def __init__(self, pairs, processor):
        self.pairs = pairs  # [(image_path, caption), ...]
        # Stored but not used by this class; samples are returned unprocessed.
        self.processor = processor

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        """Return (RGB image, caption) for the pair at position idx."""
        path, caption = self.pairs[idx]
        # The context manager guarantees the file handle is released even for
        # formats that keep it open after decoding; convert() returns a copy
        # that stays valid after the file is closed.
        with Image.open(path) as handle:
            img = handle.convert("RGB")
        return img, caption
def collate_fn(batch, processor, max_length=64):
    """Turn a list of (image, caption) samples into one processed batch.

    Args:
        batch: non-empty list of `(image, caption)` samples.
        processor: callable taking `text=` and `images=` lists.
        max_length: fixed token length every caption is padded/truncated to.

    Returns:
        Whatever `processor` returns for the batch.
    """
    images, captions = zip(*batch)
    # Pass the target length explicitly so padding/truncation does not depend
    # on whatever default the tokenizer happens to carry.
    inputs = processor(
        text=list(captions),
        images=list(images),
        padding="max_length",
        max_length=max_length,
        truncation=True,
        return_tensors="pt",
    )
    return inputs
# Training loop
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModel.from_pretrained("google/siglip2-base-patch16-224").to(device)
processor = AutoProcessor.from_pretrained("google/siglip2-base-patch16-224")

# The loss below is computed from the embeddings with loss_fn's own scale and
# bias, so the model's logit_scale / logit_bias never receive gradients.
# Start loss_fn from the checkpoint's values (logit_scale is exponentiated to
# get the temperature) rather than resetting them to 10 / -10, and train the
# loss_fn copies.
loss_fn = SigLIPLoss(
    temperature=model.logit_scale.exp().item(),
    bias=model.logit_bias.item(),
).to(device)

# Fine-tuning usually unfreezes only the last few vision encoder layers.
trainable_tags = ("vision_model.encoder.layers.11", "vision_model.encoder.layers.10")
for name, p in model.named_parameters():
    p.requires_grad = any(tag in name for tag in trainable_tags)

optimizer = torch.optim.AdamW(
    [p for p in model.parameters() if p.requires_grad] + list(loss_fn.parameters()),
    lr=1e-5,
)

# `pairs` must be provided by the caller: [(image_path, caption), ...].
loader = DataLoader(
    ImageTextDataset(pairs, processor),
    batch_size=32,
    collate_fn=lambda b: collate_fn(b, processor),
    shuffle=True,
)

model.train()
for epoch in range(3):
    for batch in loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = loss_fn(outputs.image_embeds, outputs.text_embeds)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
微调几个关键经验:
- 小学习率:1e-5 到 5e-6,比从头训低一个数量级
- 不要全参微调:除非数据量 >100k,否则只调最后几层 + projection head
- 保留对比损失:领域微调时如果只用单任务(如下游分类),表征会快速塌缩。最好保留 sigmoid 对比作为正则
- 小心 catastrophic forgetting:可以加入一部分通用图文对作为 replay buffer
- bias 项一定要可训练:负对/正对比例在你的数据集上和预训练数据可能差很多
七、几个常见的"坑"汇总
| 问题 | 现象 | 原因 | 解决 |
|---|---|---|---|
| SigLIP 概率全都很低 | 所有类别 sigmoid 输出都 <0.1 | 用了 softmax 思维 | 这是正常的,sigmoid 不归一化,用相对大小排序就行 |
| EVA-CLIP 加载失败 | open_clip 找不到 pretrained | 权重名拼错 | 用 open_clip.list_pretrained() 查准确名字 |
| MLLM 用 SigLIP 后效果差 | 接入 LLM 性能不如预期 | 用了最后一层 | 改用倒数第二层 patch tokens |
| 检索 batch 算不动 | OOM | 完整相似度矩阵 N×M 太大 | 分块算 + FAISS |
| 微调后泛化变差 | 域内好域外崩 | 全参微调 + 数据少 | 只调最后几层,或加 LoRA |
| SigLIP 2 长文本截断 | text 超过 64 token 被切 | 默认 max_length 是 64 | 用 NaFlex 变体,或手动设 max_length |