1.手工提示词
CLIP 和 ActionCLIP 使用的是手工设计的提示词(prompt),相比直接使用类别标签是一个进步:把标签填入不同的模板句子中,组成一句完整的话,以增强文本端的语义理解。
python
def text_prompt(data):
    """Tokenize every class name under each of the 16 hand-written templates.

    Args:
        data: Object whose ``classes`` attribute iterates as 2-item entries;
            the second item is substituted into each template (presumably
            ``(index, class_name)`` pairs -- confirm against the dataset).

    Returns:
        A 3-tuple ``(classes, num_text_aug, text_dict)``:

        * ``classes`` -- all per-template token tensors concatenated along
          dim 0, in template order.
        * ``num_text_aug`` -- the number of templates.
        * ``text_dict`` -- maps a template index to the concatenated token
          tensor of all classes rendered with that template.
    """
    templates = [
        f"a photo of action {{}}",
        f"a picture of action {{}}",
        f"Human action of {{}}",
        f"{{}}, an action",
        f"{{}} this is an action",
        f"{{}}, a video of action",
        f"Playing action of {{}}",
        f"{{}}",
        f"Playing a kind of action, {{}}",
        f"Doing a kind of action, {{}}",
        f"Look, the human is {{}}",
        f"Can you recognize the action of {{}}?",
        f"Video classification of {{}}",
        f"A video of {{}}",
        f"The man is {{}}",
        f"The woman is {{}}",
    ]

    # One entry per template: every class name rendered with that template,
    # tokenized, and concatenated along dim 0.
    text_dict = {
        template_idx: torch.cat(
            [clip.tokenize(template.format(name)) for _, name in data.classes]
        )
        for template_idx, template in enumerate(templates)
    }

    classes = torch.cat(list(text_dict.values()))
    num_text_aug = len(templates)
    return classes, num_text_aug, text_dict
这就是手工提示词,是针对特定任务人工设计出来的。text_dict 以模板编号为键,值是所有类别按该模板填充并经 CLIP 分词后拼接成的张量(每一行对应一个类别的一句话)。
应用为:
python
# Draw one template index per entry of list_id; num_text_aug is the
# exclusive upper bound of the draw.
text_id = numpy.random.randint(num_text_aug, size=len(list_id))
# text_dict[template_idx] is the token tensor for one template; indexing it
# with the label from list_id picks a single row. The rows are then stacked.
texts = torch.stack(
    [
        text_dict[template_idx][label_idx, :]
        for label_idx, template_idx in zip(list_id, text_id)
    ]
)
即:为每个样本的标签随机选一个模板生成对应的文本,再与视频特征进行对比学习!
2.自动生成提示
(1) 视觉特征的提取
首先,从视频中提取视觉特征。这些特征可以是视频帧的特征,也可以是视频的整体特征。视觉特征的提取通常使用预训练的视觉模型(如 CLIP 的视觉编码器)来完成。
(2) 文本特征的初始化
文本特征可以是类别标签的嵌入,也可以是其他与任务相关的文本信息。这些文本特征将作为提示的初始输入。
(3)跨模态注意力机制
python
class MulitHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention with separate q/k/v inputs.

    Queries, keys and values each get their own linear projection, so the
    query sequence may come from a different source (and have a different
    length) than the key/value sequence.

    Args:
        dim: Channel size of the inputs and of the output.
        num_heads: Number of attention heads; each head gets
            ``dim // num_heads`` channels.
        qkv_bias: Whether the q/k/v projections carry a bias term.
        qk_scale: Scale applied to the attention logits. Any falsy value
            (``None``, ``0``) falls back to ``head_dim ** -0.5``.
        attn_drop: Dropout probability applied to the attention weights.
        proj_drop: Dropout probability applied after the output projection.
    """

    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
        super().__init__()
        self.num_heads = num_heads
        self.scale = qk_scale or (dim // num_heads) ** -0.5

        # Creation order is kept (q, k, v, output) so that seeded default
        # initialization draws the same random numbers for the same layers.
        self.q_proj = nn.Linear(dim, dim, bias=qkv_bias)
        self.k_proj = nn.Linear(dim, dim, bias=qkv_bias)
        self.v_proj = nn.Linear(dim, dim, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, q, k, v):
        """Attend from ``q`` over ``k``/``v``.

        All three inputs are unpacked/reshaped as 3-D tensors; ``v`` is
        reshaped with the sequence length of ``k``. The result has the
        sequence length of ``q`` and the batch/channel sizes read from ``k``.
        """
        _, query_len, _ = q.shape
        # Batch and channel sizes are taken from k and reused for q and v.
        batch, kv_len, channels = k.shape
        heads = self.num_heads
        head_dim = channels // heads

        def split_heads(tensor, length):
            # (batch, length, channels) -> (batch, heads, length, head_dim)
            return tensor.reshape(batch, length, heads, head_dim).transpose(1, 2)

        q = split_heads(self.q_proj(q), query_len)
        k = split_heads(self.k_proj(k), kv_len)
        v = split_heads(self.v_proj(v), kv_len)

        logits = (q @ k.transpose(-2, -1)) * self.scale
        weights = self.attn_drop(logits.softmax(dim=-1))

        # Merge the heads back into the channel dimension.
        merged = (weights @ v).transpose(1, 2).reshape(batch, query_len, channels)
        return self.proj_drop(self.proj(merged))
(4) PromptGeneratorLayer(本质上是一个 Transformer 层:交叉注意力 + 前馈网络,文本作为 query,视觉特征作为 key/value)
python
class PromptGeneratorLayer(nn.Module):
    """Pre-norm residual layer: cross-attention followed by an MLP.

    The first input ``x`` is normalized and used as the attention query; the
    second input ``visual`` serves as both key and value.

    Args:
        d_model: Channel size of both inputs.
        nhead: Number of attention heads.
        dropout: Dropout probability used after the attention output
            projection, inside the MLP, and on the MLP output.
    """

    def __init__(self, d_model, nhead, dropout=0.):
        super().__init__()
        hidden_dim = d_model * 4

        self.cross_attn = MulitHeadAttention(d_model, nhead, proj_drop=dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        self.mlp = nn.Sequential(
            nn.Linear(d_model, hidden_dim),
            QuickGELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, d_model),
        )

    def forward(self, x, visual):
        """Return ``x`` updated by attending over ``visual``."""
        # Only the query side is normalized here; `visual` is passed through
        # to the attention module unchanged.
        query = self.norm1(x)
        x = x + self.cross_attn(query, visual, visual)

        feed_forward = self.mlp(self.norm3(x))
        return x + self.dropout(feed_forward)
(5)融合
python
class VideoSpecificPrompt(nn.Module):
    """Refine text embeddings by letting them attend over visual features.

    The text embeddings pass through a stack of ``PromptGeneratorLayer``
    modules that cross-attend to the (layer-normalized) visual features. The
    result is scaled channel-wise by the learnable vector ``alpha``.

    Args:
        layers: Number of stacked ``PromptGeneratorLayer`` modules.
        embed_dim: Channel size of the text and visual features. Each layer
            uses ``embed_dim // 64`` attention heads.
        alpha: Initial value for every entry of the learnable output scale.
    """

    def __init__(self, layers=2, embed_dim=512, alpha=0.1,):
        super().__init__()
        self.norm = nn.LayerNorm(embed_dim)
        self.decoder = nn.ModuleList([PromptGeneratorLayer(embed_dim, embed_dim//64) for _ in range(layers)])
        # Learnable per-channel scale applied to the generated prompt.
        self.alpha = nn.Parameter(torch.ones(embed_dim) * alpha)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Initialize Linear layers (truncated normal) and LayerNorm (identity)."""
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    def forward(self, text, visual):
        """Return the video-conditioned prompt, ``alpha * refined_text``.

        Args:
            text: Text embeddings used as attention queries.
            visual: 3-D visual feature tensor used as keys/values.

        Returns:
            A tensor shaped like ``text``: the output of the layer stack
            scaled by ``self.alpha``.
        """
        # Unpacking is kept from the original; it rejects inputs that are not
        # 3-D before any computation happens.
        B, N, C = visual.shape
        visual = self.norm(visual)
        for layer in self.decoder:
            text = layer(text, visual)
        # The original fell off the end here and returned None, leaving
        # `alpha` unused; scale the refined text and return it.
        return self.alpha * text
应用:
python
def generate_text(data):
    """Tokenize the bare class names, with no surrounding template text.

    Args:
        data: Object whose ``classes`` attribute iterates as 2-item entries;
            the second item is the text that gets tokenized (presumably the
            class name -- confirm against the dataset).

    Returns:
        The per-class token tensors concatenated along dim 0.
    """
    template = f"{{}}"
    token_rows = [
        clip.tokenize(template.format(name), context_length=77)
        for _, name in data.classes
    ]
    return torch.cat(token_rows)
python
def classes(self):
    """Read ``self.labels_file`` as CSV and return its rows as nested lists.

    The header line is consumed by ``pandas.read_csv``; each remaining row
    becomes one inner list of cell values.
    """
    label_table = pd.read_csv(self.labels_file)
    rows = label_table.values.tolist()
    return rows
generate_text 返回的 classes 是一个 torch.Tensor 类型的对象,其形状为 (n, 77),其中 n 是类别名称的数量,每一行代表一个类别名称经过 CLIP 分词后的结果。