使用Qwen3-VL模型批量标注视频内容(视频理解)

Qwen3-VL:视觉理解模型

本文记录如何利用Qwen3-VL对视频内容进行批量标注,即对文件夹中的视频进行打标。

目录

一、权重下载

二、模型加载

三、从文件夹路径或者从写入txt的路径中读取视频的绝对地址

四、批量推理并写入txt

五、完整代码


一、权重下载

下载地址:https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct

该权重对应 30B 参数规模的模型,推理至少需要单张 A100(80G);如果需要输出长文本,建议使用两张 A100。请先将权重下载并保存到本地,加载模型时直接从本地路径加载。

二、模型加载

python 复制代码
from transformers import AutoModelForImageTextToText, AutoProcessor
import torch

def load_models(model_path="your_local_model_pth/Qwen3-VL-30B-A3B-Instruct"):
    """Load the Qwen3-VL model and its processor from a local checkpoint.

    Args:
        model_path: Directory containing the downloaded weights. The default
            is the placeholder path used in this article; replace it with
            the real location on disk.

    Returns:
        A ``(model, processor)`` tuple.
    """
    model = AutoModelForImageTextToText.from_pretrained(
        model_path,
        dtype=torch.bfloat16,
        attn_implementation="flash_attention_2",
        device_map="auto",
    )
    # Use the same path for both so the processor always matches the weights.
    processor = AutoProcessor.from_pretrained(model_path)

    return model, processor

三、从文件夹路径或者从写入txt的路径中读取视频的绝对地址

python 复制代码
def get_inference_videos(txt_path=None, video_dir=None, input_type="txt"):
    """Collect the paths of the videos to run inference on.

    Args:
        txt_path: Text file with one video path per line. Used when
            ``input_type == "txt"``.
        video_dir: Directory whose entries are the videos. Used when
            ``input_type == "dir"``.
        input_type: ``"txt"`` or ``"dir"``; selects which source is read.

    Returns:
        A list of video paths. In ``"dir"`` mode every entry of
        ``video_dir`` is joined to ``video_dir`` and the list is sorted.

    Raises:
        ValueError: If ``input_type`` is not ``"txt"``/``"dir"`` or the
            argument matching ``input_type`` was not provided.
    """
    if input_type == "txt" and txt_path is not None:
        with open(txt_path, "r") as f:
            lines = [line.strip("\n") for line in f]
        # Skip blank lines (e.g. a trailing empty line) so they do not end
        # up as empty "video" paths downstream.
        return [line for line in lines if line.strip()]

    if input_type == "dir" and video_dir is not None:
        return sorted(os.path.join(video_dir, name) for name in os.listdir(video_dir))

    # Previously this fell through to `return videos` with `videos` unbound.
    raise ValueError(
        f"No video source for input_type={input_type!r}: pass txt_path with "
        f'input_type="txt" or video_dir with input_type="dir".'
    )

四、批量推理并写入txt

python 复制代码
    # Prepare the output location once instead of on every iteration.
    output_dir = "select_save_dir"
    os.makedirs(output_dir, exist_ok=True)
    caption_file = os.path.join(output_dir, "dataset_captions.txt")

    for video in tqdm(videos):
        # Single-turn chat: one video plus the question to ask about it.
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "video",
                        "video": video,
                    },
                    {"type": "text", "text": "Describe the video and tell me what happened in the video?"},
                ],
            }
        ]

        try:
            inputs = processor.apply_chat_template(
                messages,
                tokenize=True,
                add_generation_prompt=True,
                return_dict=True,
                return_tensors="pt"
            )
            inputs = inputs.to(model.device)

            with torch.inference_mode():
                generated_ids = model.generate(**inputs, max_new_tokens=128)
                # Drop the first len(in_ids) tokens of each output (the
                # prompt part) so only the newly generated text is decoded.
                generated_ids_trimmed = [
                    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
                ]
                output_text = processor.batch_decode(
                    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
                )
            # Keep each record on one line; join with spaces so words on
            # either side of a line break are not fused together.
            caption = " ".join(
                part.strip() for part in output_text[0].splitlines() if part.strip()
            )

            # One "<video path>#####<caption>" record per line.
            with open(caption_file, "a", encoding="utf-8") as f:
                f.write(f"{video}#####{caption}\n")
        except Exception as exc:
            # Best effort: report the failing video and its cause, then move
            # on. Ctrl-C (KeyboardInterrupt) is no longer swallowed.
            print(f"Error video: {video} ({exc!r})")

提示词可按需替换:上面代码中使用的提问是 "Describe the video and tell me what happened in the video?",可以根据标注需求改成其他问题。

五、完整代码

python 复制代码
import os
import torch
from tqdm import tqdm

from transformers import AutoModelForImageTextToText, AutoProcessor

def load_models(model_path="your_local_model_pth/Qwen3-VL-30B-A3B-Instruct"):
    """Load the Qwen3-VL model and its processor from a local checkpoint.

    Args:
        model_path: Directory containing the downloaded weights. The default
            is the placeholder path used in this article; replace it with
            the real location on disk.

    Returns:
        A ``(model, processor)`` tuple.
    """
    model = AutoModelForImageTextToText.from_pretrained(
        model_path,
        dtype=torch.bfloat16,
        attn_implementation="flash_attention_2",
        device_map="auto",
    )
    # Use the same path for both so the processor always matches the weights.
    processor = AutoProcessor.from_pretrained(model_path)

    return model, processor

def get_inference_videos(txt_path=None, video_dir=None, input_type="txt"):
    """Collect the paths of the videos to run inference on.

    Args:
        txt_path: Text file with one video path per line. Used when
            ``input_type == "txt"``.
        video_dir: Directory whose entries are the videos. Used when
            ``input_type == "dir"``.
        input_type: ``"txt"`` or ``"dir"``; selects which source is read.

    Returns:
        A list of video paths. In ``"dir"`` mode every entry of
        ``video_dir`` is joined to ``video_dir`` and the list is sorted.

    Raises:
        ValueError: If ``input_type`` is not ``"txt"``/``"dir"`` or the
            argument matching ``input_type`` was not provided.
    """
    if input_type == "txt" and txt_path is not None:
        with open(txt_path, "r") as f:
            lines = [line.strip("\n") for line in f]
        # Skip blank lines (e.g. a trailing empty line) so they do not end
        # up as empty "video" paths downstream.
        return [line for line in lines if line.strip()]

    if input_type == "dir" and video_dir is not None:
        return sorted(os.path.join(video_dir, name) for name in os.listdir(video_dir))

    # Previously this fell through to `return videos` with `videos` unbound.
    raise ValueError(
        f"No video source for input_type={input_type!r}: pass txt_path with "
        f'input_type="txt" or video_dir with input_type="dir".'
    )

if __name__ == "__main__":
    # Script entry point: caption every video in `video_dir` and append one
    # "<video path>#####<caption>" line per video to the output file.
    model, processor = load_models()
    model.eval()

    video_dir = "your_local_video_saved_pth/video"
    videos = get_inference_videos(video_dir=video_dir, input_type="dir")

    # Prepare the output location once instead of on every iteration.
    output_dir = "select_save_dir"
    os.makedirs(output_dir, exist_ok=True)
    caption_file = os.path.join(output_dir, "dataset_captions.txt")

    for video in tqdm(videos):
        # Single-turn chat: one video plus the question to ask about it.
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "video",
                        "video": video,
                    },
                    {"type": "text", "text": "Describe the video and tell me what happened in the video?"},
                ],
            }
        ]

        try:
            inputs = processor.apply_chat_template(
                messages,
                tokenize=True,
                add_generation_prompt=True,
                return_dict=True,
                return_tensors="pt"
            )
            inputs = inputs.to(model.device)

            with torch.inference_mode():
                generated_ids = model.generate(**inputs, max_new_tokens=128)
                # Drop the first len(in_ids) tokens of each output (the
                # prompt part) so only the newly generated text is decoded.
                generated_ids_trimmed = [
                    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
                ]
                output_text = processor.batch_decode(
                    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
                )
            # Keep each record on one line; join with spaces so words on
            # either side of a line break are not fused together.
            caption = " ".join(
                part.strip() for part in output_text[0].splitlines() if part.strip()
            )

            with open(caption_file, "a", encoding="utf-8") as f:
                f.write(f"{video}#####{caption}\n")
        except Exception as exc:
            # Best effort: report the failing video and its cause, then move
            # on. Ctrl-C (KeyboardInterrupt) is no longer swallowed.
            print(f"Error video: {video} ({exc!r})")
相关推荐
tokepson2 小时前
反向传播
深度学习·算法·ai·反向传播
禾高网络2 小时前
互联网医院定制|互联网医院|禾高互联网医院搭建
java·大数据·人工智能·小程序
鲨莎分不晴2 小时前
通信学习 (Learning to Communicate):从“心电感应”到“语言涌现”
人工智能·学习·机器学习
道法自然04022 小时前
[CARLA系列--05]如何在Carla中去调用传感器模型--Radar篇
人工智能·自动驾驶·ue4
roamingcode2 小时前
2025年技术变革浪潮:从 AI Agent 标准化到人类认知重构
人工智能·ai·重构·agent·skill·mcp
智驱力人工智能2 小时前
森林防火无人机火焰监测系统 构建“天空地”一体化智能防火体系 无人机火焰检测,支持红色火焰检测 城市高层建筑无人机火焰识别
人工智能·深度学习·opencv·算法·目标检测·无人机·边缘计算
Coovally AI模型快速验证2 小时前
无人机低空视觉数据集全景解读:从单机感知到具身智能的跨
人工智能·深度学习·目标检测·机器学习·自动驾驶·无人机
小北方城市网2 小时前
第 5 课:Vue 3 HTTP 请求与 UI 库实战 —— 从本地数据到前后端交互应用
大数据·前端·人工智能·ai·自然语言处理
CES_Asia2 小时前
机器人“奥运会”登陆!CES Asia 2026角逐消费级机器人王座
大数据·人工智能·科技·机器人