使用Qwen3-VL模型批量标注视频内容(视频理解)

Qwen3-VL:视觉理解模型

本文记录如何利用Qwen3-VL对视频内容进行批量标注,即对文件夹中的视频进行打标。

目录

一、权重下载

二、模型加载

三、从文件夹路径或者从写入txt的路径中读取视频的绝对地址

四、批量推理并写入txt

五、完整代码


一、权重下载

下载地址:https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct

该权重对应30B参数量的模型,推理至少需要单张A100(80G);如果需要输出长文本,则需要两张A100。请先将权重下载并保存到本地,加载模型时直接从本地路径加载。

二、模型加载

python 复制代码
from transformers import AutoModelForImageTextToText, AutoProcessor
import torch

def load_models(model_path="your_local_model_pth/Qwen3-VL-30B-A3B-Instruct"):
    """Load the Qwen3-VL model and its processor from a local checkpoint.

    Args:
        model_path: Directory that holds the downloaded weights. The default
            is the placeholder path used in this article; replace it with the
            real location of the checkpoint.

    Returns:
        A ``(model, processor)`` tuple.
    """
    model = AutoModelForImageTextToText.from_pretrained(
        model_path,
        dtype=torch.bfloat16,
        attn_implementation="flash_attention_2",
        # "auto" delegates device placement of the weights to the library.
        device_map="auto",
    )
    # The processor is read from the same directory so that it always
    # matches the weights that were just loaded.
    processor = AutoProcessor.from_pretrained(model_path)

    return model, processor

三、从文件夹路径或者从写入txt的路径中读取视频的绝对地址

python 复制代码
def get_inference_videos(txt_path=None, video_dir=None, input_type="txt"):
    """Collect the paths of the videos to run inference on.

    Args:
        txt_path: Text file listing one video path per line. Read when
            ``input_type == "txt"``.
        video_dir: Directory containing the videos. Read when
            ``input_type == "dir"``.
        input_type: ``"txt"`` or ``"dir"``; selects which source is used.

    Returns:
        A list of video paths. For ``"txt"`` these are the non-blank lines of
        the file, in file order, with surrounding whitespace removed. For
        ``"dir"`` these are the regular files in ``video_dir`` (each joined
        with ``video_dir``), sorted.

    Raises:
        ValueError: If ``input_type`` and the supplied arguments do not
            select a usable source.
    """
    # Imported locally so this snippet also works when copied on its own.
    import os

    if input_type == "txt" and txt_path is not None:
        with open(txt_path, "r") as f:
            stripped = (line.strip() for line in f)
            # Blank lines are not paths; skipping them avoids feeding ""
            # to the model later on.
            return [path for path in stripped if path]

    if input_type == "dir" and video_dir is not None:
        candidates = (os.path.join(video_dir, name) for name in os.listdir(video_dir))
        # Sub-directories cannot be decoded as videos, so keep files only.
        return sorted(path for path in candidates if os.path.isfile(path))

    # Previously this case fell through to an unbound local variable.
    raise ValueError(
        f"No video source selected: input_type={input_type!r}, "
        f"txt_path={txt_path!r}, video_dir={video_dir!r}"
    )

四、批量推理并写入txt

python 复制代码
    # Question sent to the model for every video; swap it for another prompt
    # if a different kind of caption is wanted.
    prompt = "Describe the video and tell me what happened in the video?"

    # The output location does not change between videos, so resolve it once
    # instead of on every iteration.
    output_dir = "select_save_dir"
    os.makedirs(output_dir, exist_ok=True)
    caption_file = os.path.join(output_dir, "dataset_captions.txt")

    for video in tqdm(videos):
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "video", "video": video},
                    {"type": "text", "text": prompt},
                ],
            }
        ]

        try:
            inputs = processor.apply_chat_template(
                messages,
                tokenize=True,
                add_generation_prompt=True,
                return_dict=True,
                return_tensors="pt",
            )
            inputs = inputs.to(model.device)

            with torch.inference_mode():
                generated_ids = model.generate(**inputs, max_new_tokens=128)
            # Drop the prompt tokens so only the newly generated answer is decoded.
            generated_ids_trimmed = [
                out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
            ]
            output_text = processor.batch_decode(
                generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
            )
        except Exception as exc:
            # One unreadable video must not abort the batch, but Ctrl+C still
            # has to work, hence Exception rather than a bare except.
            print(f"Error video: {video} ({type(exc).__name__}: {exc})")
            continue

        # Each record occupies one line, so collapse line breaks into single
        # spaces (plain removal would fuse neighbouring sentences).
        caption = " ".join(output_text[0].split())

        # Append after every video so finished captions survive an interruption.
        with open(caption_file, "a", encoding="utf-8") as f:
            f.write(f"{video}#####{caption}\n")

上述代码中的提问(prompt)可以按需替换,默认提问为:Describe the video and tell me what happened in the video?

五、完整代码

python 复制代码
import os
import torch
from tqdm import tqdm

from transformers import AutoModelForImageTextToText, AutoProcessor

def load_models(model_path="your_local_model_pth/Qwen3-VL-30B-A3B-Instruct"):
    """Load the Qwen3-VL model and its processor from a local checkpoint.

    Args:
        model_path: Directory that holds the downloaded weights. The default
            is the placeholder path used in this article; replace it with the
            real location of the checkpoint.

    Returns:
        A ``(model, processor)`` tuple.
    """
    model = AutoModelForImageTextToText.from_pretrained(
        model_path,
        dtype=torch.bfloat16,
        attn_implementation="flash_attention_2",
        # "auto" delegates device placement of the weights to the library.
        device_map="auto",
    )
    # The processor is read from the same directory so that it always
    # matches the weights that were just loaded.
    processor = AutoProcessor.from_pretrained(model_path)

    return model, processor

def get_inference_videos(txt_path=None, video_dir=None, input_type="txt"):
    """Collect the paths of the videos to run inference on.

    Args:
        txt_path: Text file listing one video path per line. Read when
            ``input_type == "txt"``.
        video_dir: Directory containing the videos. Read when
            ``input_type == "dir"``.
        input_type: ``"txt"`` or ``"dir"``; selects which source is used.

    Returns:
        A list of video paths. For ``"txt"`` these are the non-blank lines of
        the file, in file order, with surrounding whitespace removed. For
        ``"dir"`` these are the regular files in ``video_dir`` (each joined
        with ``video_dir``), sorted.

    Raises:
        ValueError: If ``input_type`` and the supplied arguments do not
            select a usable source.
    """
    if input_type == "txt" and txt_path is not None:
        with open(txt_path, "r") as f:
            stripped = (line.strip() for line in f)
            # Blank lines are not paths; skipping them avoids feeding ""
            # to the model later on.
            return [path for path in stripped if path]

    if input_type == "dir" and video_dir is not None:
        candidates = (os.path.join(video_dir, name) for name in os.listdir(video_dir))
        # Sub-directories cannot be decoded as videos, so keep files only.
        return sorted(path for path in candidates if os.path.isfile(path))

    # Previously this case fell through to an unbound local variable.
    raise ValueError(
        f"No video source selected: input_type={input_type!r}, "
        f"txt_path={txt_path!r}, video_dir={video_dir!r}"
    )

if __name__ == "__main__":
    # Script entry point: caption every video in `video_dir` and append one
    # "<video path>#####<caption>" line per video to the captions file.
    model, processor = load_models()
    model.eval()

    video_dir = "your_local_video_saved_pth/video"
    videos = get_inference_videos(video_dir=video_dir, input_type="dir")

    # Question sent to the model for every video; swap it for another prompt
    # if a different kind of caption is wanted.
    prompt = "Describe the video and tell me what happened in the video?"

    # The output location does not change between videos, so resolve it once
    # instead of on every iteration.
    output_dir = "select_save_dir"
    os.makedirs(output_dir, exist_ok=True)
    caption_file = os.path.join(output_dir, "dataset_captions.txt")

    for video in tqdm(videos):
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "video", "video": video},
                    {"type": "text", "text": prompt},
                ],
            }
        ]

        try:
            inputs = processor.apply_chat_template(
                messages,
                tokenize=True,
                add_generation_prompt=True,
                return_dict=True,
                return_tensors="pt",
            )
            inputs = inputs.to(model.device)

            with torch.inference_mode():
                generated_ids = model.generate(**inputs, max_new_tokens=128)
            # Drop the prompt tokens so only the newly generated answer is decoded.
            generated_ids_trimmed = [
                out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
            ]
            output_text = processor.batch_decode(
                generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
            )
        except Exception as exc:
            # One unreadable video must not abort the batch, but Ctrl+C still
            # has to work, hence Exception rather than a bare except.
            print(f"Error video: {video} ({type(exc).__name__}: {exc})")
            continue

        # Each record occupies one line, so collapse line breaks into single
        # spaces (plain removal would fuse neighbouring sentences).
        caption = " ".join(output_text[0].split())

        # Append after every video so finished captions survive an interruption.
        with open(caption_file, "a", encoding="utf-8") as f:
            f.write(f"{video}#####{caption}\n")
相关推荐
潜创微科技3 分钟前
IT6520:USB‑C DP Alt Mode 到 MIPI 单芯片转换方案
嵌入式硬件·音视频
麦哲思科技任甲林5 分钟前
全变更蒸馏:让AI编程成为一个可进化的系统
人工智能·ai编程·蒸馏·skills·harness工程·回顾
Raink老师8 分钟前
【AI面试临阵磨枪-086】什么是 AI Agent Skill?与传统 Function Calling、Tool 的区别?
人工智能·面试·职场和发展
实在智能RPA10 分钟前
AI Agent是否能处理医药研发数据中多种格式的文档?深度解析2026年智能体在生物医药领域的应用边界
人工智能·ai
Tiansan666613 分钟前
郑州AI问答服务商崛起:专业团队如何重塑企业客服
人工智能·郑州ai问答服务商崛
DeniuHe16 分钟前
sklearn 中所有交叉验证数据集划分方式完整总结
人工智能·python·sklearn
DeniuHe20 分钟前
sklearn中不同交叉验证方法的场景适配
人工智能·python·sklearn
小新同学^O^21 分钟前
简单学习 --> 指令微调
人工智能·学习·llm·指令微调
知识浅谈26 分钟前
Transformer 中的 Q、K、V 到底是什么?怎么理解 Query、Key、Value?
人工智能·深度学习·transformer
名不经传的养虾人26 分钟前
从0到1:企业级AI项目迭代日记 Vol.36|临时方案下线,网关区分负载,用量穿透链路——这一周全是“归位”
人工智能·ai编程·ai工作流·企业ai·多agent协作