Qwen3-VL:视觉理解模型
本文记录如何利用Qwen3-VL对视频内容进行批量标注,即对文件夹中的视频进行打标。
目录
一、权重下载
下载地址:https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct
该权重对应30B参数规模的模型,推理时需要单张A100(80G);如果需要输出长文本,则建议使用两张A100。请先将权重下载并保存到本地,加载模型时直接从本地路径加载。
二、模型加载
python
from transformers import AutoModelForImageTextToText, AutoProcessor
import torch
def load_models(model_path="your_local_model_pth/Qwen3-VL-30B-A3B-Instruct"):
    """Load the Qwen3-VL model and its processor from a local checkpoint.

    Args:
        model_path: Directory that holds the downloaded weights. The default
            is the placeholder path used in this article; replace it with the
            real location or pass the path explicitly.

    Returns:
        A ``(model, processor)`` tuple.
    """
    # The same path feeds both loaders so the model and its processor can
    # never point at different checkpoints.
    model = AutoModelForImageTextToText.from_pretrained(
        model_path,
        dtype=torch.bfloat16,
        attn_implementation="flash_attention_2",
        device_map="auto",
    )
    processor = AutoProcessor.from_pretrained(model_path)
    return model, processor
三、从视频文件夹或从记录了视频路径的txt文件中读取视频的绝对路径
python
def get_inference_videos(txt_path=None, video_dir=None, input_type="txt"):
    """Collect the paths of the videos to run inference on.

    Args:
        txt_path: Text file listing one video path per line. Used when
            ``input_type`` is ``"txt"``.
        video_dir: Directory whose entries are treated as videos. Used when
            ``input_type`` is ``"dir"``.
        input_type: Which source to read from, ``"txt"`` or ``"dir"``.

    Returns:
        A list of video paths. Paths read from a txt file keep the file's
        order; paths read from a directory are sorted.

    Raises:
        ValueError: If ``input_type`` is not ``"txt"``/``"dir"`` or the
            argument matching ``input_type`` was not provided.
    """
    import os  # local import so this snippet also runs when copied on its own

    if input_type == "txt" and txt_path is not None:
        with open(txt_path, "r", encoding="utf-8") as f:
            # Drop blank lines (e.g. a trailing empty line) so they do not
            # turn into bogus "" video paths.
            return [line.strip() for line in f if line.strip()]

    if input_type == "dir" and video_dir is not None:
        return sorted(
            os.path.join(video_dir, name) for name in os.listdir(video_dir)
        )

    # Fail loudly instead of hitting an unbound local variable further down.
    raise ValueError(
        f"Cannot collect videos: input_type={input_type!r}, "
        f"txt_path={txt_path!r}, video_dir={video_dir!r}"
    )
四、批量推理并写入txt
python
# Captions are appended to a single file, one "<video>#####<caption>" record
# per line. The directory is created once up front so that a bad output
# location fails immediately instead of being reported as a video error.
output_dir = "select_save_dir"
os.makedirs(output_dir, exist_ok=True)
caption_file = f"{output_dir}/dataset_captions.txt"

for video in tqdm(videos):
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "video",
                    "video": video,
                },
                {"type": "text", "text": "Describe the video and tell me what happened in the video?"},
            ],
        }
    ]
    try:
        inputs = processor.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt",
        )
        inputs = inputs.to(model.device)
        with torch.inference_mode():
            generated_ids = model.generate(**inputs, max_new_tokens=128)
        # Slice off the prompt tokens so only the newly generated ones are decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )
        # Keep each caption on one line; joining on a space avoids gluing
        # together the words on either side of a line break.
        caption = " ".join(output_text[0].split())
        with open(caption_file, "a", encoding="utf-8") as f:
            f.write(f"{video}#####{caption}\n")
    except Exception as e:
        # Best effort: report the failing video with its cause and move on.
        # Ctrl-C (KeyboardInterrupt) is deliberately not caught here.
        print(f"Error video: {video} ({e!r})")
提问内容(prompt)可以根据标注需求自行替换,上面代码中使用的提问为:Describe the video and tell me what happened in the video?
五、完整代码
python
import os
import torch
from tqdm import tqdm
from transformers import AutoModelForImageTextToText, AutoProcessor
def load_models(model_path="your_local_model_pth/Qwen3-VL-30B-A3B-Instruct"):
    """Load the Qwen3-VL model and its processor from a local checkpoint.

    Args:
        model_path: Directory that holds the downloaded weights. The default
            is the placeholder path used in this article; replace it with the
            real location or pass the path explicitly.

    Returns:
        A ``(model, processor)`` tuple.
    """
    # The same path feeds both loaders so the model and its processor can
    # never point at different checkpoints.
    model = AutoModelForImageTextToText.from_pretrained(
        model_path,
        dtype=torch.bfloat16,
        attn_implementation="flash_attention_2",
        device_map="auto",
    )
    processor = AutoProcessor.from_pretrained(model_path)
    return model, processor
def get_inference_videos(txt_path=None, video_dir=None, input_type="txt"):
    """Collect the paths of the videos to run inference on.

    Args:
        txt_path: Text file listing one video path per line. Used when
            ``input_type`` is ``"txt"``.
        video_dir: Directory whose entries are treated as videos. Used when
            ``input_type`` is ``"dir"``.
        input_type: Which source to read from, ``"txt"`` or ``"dir"``.

    Returns:
        A list of video paths. Paths read from a txt file keep the file's
        order; paths read from a directory are sorted.

    Raises:
        ValueError: If ``input_type`` is not ``"txt"``/``"dir"`` or the
            argument matching ``input_type`` was not provided.
    """
    if input_type == "txt" and txt_path is not None:
        with open(txt_path, "r", encoding="utf-8") as f:
            # Drop blank lines (e.g. a trailing empty line) so they do not
            # turn into bogus "" video paths.
            return [line.strip() for line in f if line.strip()]

    if input_type == "dir" and video_dir is not None:
        return sorted(
            os.path.join(video_dir, name) for name in os.listdir(video_dir)
        )

    # Fail loudly instead of hitting an unbound local variable further down.
    raise ValueError(
        f"Cannot collect videos: input_type={input_type!r}, "
        f"txt_path={txt_path!r}, video_dir={video_dir!r}"
    )
if __name__ == "__main__":
    # Script entry point: caption every video in `video_dir` and append the
    # results to a text file, one "<video>#####<caption>" record per line.
    model, processor = load_models()
    model.eval()

    video_dir = "your_local_video_saved_pth/video"
    videos = get_inference_videos(video_dir=video_dir, input_type="dir")

    # Create the output directory once up front so that a bad output location
    # fails immediately instead of being reported as a video error.
    output_dir = "select_save_dir"
    os.makedirs(output_dir, exist_ok=True)
    caption_file = f"{output_dir}/dataset_captions.txt"

    for video in tqdm(videos):
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "video",
                        "video": video,
                    },
                    {"type": "text", "text": "Describe the video and tell me what happened in the video?"},
                ],
            }
        ]
        try:
            inputs = processor.apply_chat_template(
                messages,
                tokenize=True,
                add_generation_prompt=True,
                return_dict=True,
                return_tensors="pt",
            )
            inputs = inputs.to(model.device)
            with torch.inference_mode():
                generated_ids = model.generate(**inputs, max_new_tokens=128)
            # Slice off the prompt tokens so only the newly generated ones are decoded.
            generated_ids_trimmed = [
                out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
            ]
            output_text = processor.batch_decode(
                generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
            )
            # Keep each caption on one line; joining on a space avoids gluing
            # together the words on either side of a line break.
            caption = " ".join(output_text[0].split())
            with open(caption_file, "a", encoding="utf-8") as f:
                f.write(f"{video}#####{caption}\n")
        except Exception as e:
            # Best effort: report the failing video with its cause and move
            # on. Ctrl-C (KeyboardInterrupt) is deliberately not caught here.
            print(f"Error video: {video} ({e!r})")