使用Qwen3-VL模型批量标注视频内容(视频理解)

Qwen3-VL:视觉理解模型

本文记录如何利用Qwen3-VL对视频内容进行批量标注,即对文件夹中的视频进行打标。

目录

一、权重下载

二、模型加载

三、从文件夹路径或者从写入txt的路径中读取视频的绝对地址

四、批量推理并写入txt

五、完整代码


一、权重下载

下载地址:https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct

该权重对应30B参数量的模型,推理时需要一张A100(80G);如果需要输出长文本,则建议使用两张A100。请先将权重下载并保存到本地,加载模型时直接从本地路径加载。

二、模型加载

Python 代码:
from transformers import AutoModelForImageTextToText, AutoProcessor
import torch

def load_models(
    model_path="your_local_model_pth/Qwen3-VL-30B-A3B-Instruct",
    attn_implementation="flash_attention_2",
):
    """Load the Qwen3-VL model and its processor from a local checkpoint.

    Args:
        model_path: Directory holding the downloaded weights. The default is
            a placeholder and must be replaced with the real local path. The
            same path is used for the model and for the processor.
        attn_implementation: Attention backend forwarded to
            ``from_pretrained``. The default ``"flash_attention_2"`` needs the
            flash-attn package; pass e.g. ``"sdpa"`` when it is not installed.

    Returns:
        A ``(model, processor)`` tuple.
    """
    model = AutoModelForImageTextToText.from_pretrained(
        model_path,
        dtype=torch.bfloat16,
        attn_implementation=attn_implementation,
        # Let the library decide where to place the weights.
        device_map="auto",
    )
    processor = AutoProcessor.from_pretrained(model_path)

    return model, processor

三、从文件夹路径或者从写入txt的路径中读取视频的绝对地址

Python 代码:
def get_inference_videos(txt_path=None, video_dir=None, input_type="txt"):
    """Collect the paths of the videos to run inference on.

    Args:
        txt_path: Text file listing one video path per line. Only used when
            ``input_type == "txt"``.
        video_dir: Directory whose entries are the videos. Only used when
            ``input_type == "dir"``.
        input_type: ``"txt"`` to read ``txt_path``, ``"dir"`` to list
            ``video_dir``.

    Returns:
        A list of path strings. In ``"txt"`` mode the lines are returned in
        file order with blank lines skipped. In ``"dir"`` mode every entry of
        ``video_dir`` (no filtering by file type) is joined with the directory
        and the result is sorted.

    Raises:
        ValueError: If ``input_type`` is not ``"txt"``/``"dir"`` or the path
            argument that mode needs is ``None``.
    """
    if input_type == "txt" and txt_path is not None:
        with open(txt_path, "r", encoding="utf-8") as f:
            lines = (line.strip("\n") for line in f)
            # A blank line would otherwise become an empty "video" path.
            return [line for line in lines if line.strip()]

    if input_type == "dir" and video_dir is not None:
        return sorted(os.path.join(video_dir, name) for name in os.listdir(video_dir))

    # Previously this fell through to `return videos` with the name unbound.
    raise ValueError(
        f"Cannot collect videos: input_type={input_type!r} requires "
        "txt_path (for 'txt') or video_dir (for 'dir') to be set"
    )

四、批量推理并写入txt

Python 代码:
    # Prepare the output location once: it does not depend on the video, and
    # failing here is cheaper than discovering it after running inference.
    output_dir = "select_save_dir"
    os.makedirs(output_dir, exist_ok=True)
    caption_file = os.path.join(output_dir, "dataset_captions.txt")

    for video in tqdm(videos):
        # Single-turn chat: one video plus the captioning question.
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "video",
                        "video": video,
                    },
                    {"type": "text", "text": "Describe the video and tell me what happened in the video?"},
                ],
            }
        ]

        try:
            inputs = processor.apply_chat_template(
                messages,
                tokenize=True,
                add_generation_prompt=True,
                return_dict=True,
                return_tensors="pt",
            )
            inputs = inputs.to(model.device)

            with torch.inference_mode():
                generated_ids = model.generate(**inputs, max_new_tokens=128)
                # Drop the leading prompt tokens so only the answer is decoded.
                generated_ids_trimmed = [
                    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
                ]
                output_text = processor.batch_decode(
                    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
                )
            # Keep one record per line. Newlines become spaces so that words
            # on adjacent lines are not glued together.
            caption = output_text[0].replace("\n", " ")

            with open(caption_file, "a", encoding="utf-8") as f:
                f.write(f"{video}#####{caption}\n")
        except Exception as exc:
            # Best effort: report the failing video with its cause and go on.
            # Catching Exception (not a bare except) keeps Ctrl+C working.
            print(f"Error video: {video} ({exc!r})")

代码中的提问内容(prompt)可以根据标注需求自行替换,当前使用的提问为:Describe the video and tell me what happened in the video?

五、完整代码

Python 代码:
import os
import torch
from tqdm import tqdm

from transformers import AutoModelForImageTextToText, AutoProcessor

def load_models(
    model_path="your_local_model_pth/Qwen3-VL-30B-A3B-Instruct",
    attn_implementation="flash_attention_2",
):
    """Load the Qwen3-VL model and its processor from a local checkpoint.

    Args:
        model_path: Directory holding the downloaded weights. The default is
            a placeholder and must be replaced with the real local path. The
            same path is used for the model and for the processor.
        attn_implementation: Attention backend forwarded to
            ``from_pretrained``. The default ``"flash_attention_2"`` needs the
            flash-attn package; pass e.g. ``"sdpa"`` when it is not installed.

    Returns:
        A ``(model, processor)`` tuple.
    """
    model = AutoModelForImageTextToText.from_pretrained(
        model_path,
        dtype=torch.bfloat16,
        attn_implementation=attn_implementation,
        # Let the library decide where to place the weights.
        device_map="auto",
    )
    processor = AutoProcessor.from_pretrained(model_path)

    return model, processor

def get_inference_videos(txt_path=None, video_dir=None, input_type="txt"):
    """Collect the paths of the videos to run inference on.

    Args:
        txt_path: Text file listing one video path per line. Only used when
            ``input_type == "txt"``.
        video_dir: Directory whose entries are the videos. Only used when
            ``input_type == "dir"``.
        input_type: ``"txt"`` to read ``txt_path``, ``"dir"`` to list
            ``video_dir``.

    Returns:
        A list of path strings. In ``"txt"`` mode the lines are returned in
        file order with blank lines skipped. In ``"dir"`` mode every entry of
        ``video_dir`` (no filtering by file type) is joined with the directory
        and the result is sorted.

    Raises:
        ValueError: If ``input_type`` is not ``"txt"``/``"dir"`` or the path
            argument that mode needs is ``None``.
    """
    if input_type == "txt" and txt_path is not None:
        with open(txt_path, "r", encoding="utf-8") as f:
            lines = (line.strip("\n") for line in f)
            # A blank line would otherwise become an empty "video" path.
            return [line for line in lines if line.strip()]

    if input_type == "dir" and video_dir is not None:
        return sorted(os.path.join(video_dir, name) for name in os.listdir(video_dir))

    # Previously this fell through to `return videos` with the name unbound.
    raise ValueError(
        f"Cannot collect videos: input_type={input_type!r} requires "
        "txt_path (for 'txt') or video_dir (for 'dir') to be set"
    )

if __name__ == "__main__":
    # Script entry point: caption every video found in `video_dir` and append
    # one "<video path>#####<caption>" line per video to the captions file.
    model, processor = load_models()
    model.eval()

    video_dir = "your_local_video_saved_pth/video"
    videos = get_inference_videos(video_dir=video_dir, input_type="dir")

    # Prepare the output location once: it does not depend on the video, and
    # failing here is cheaper than discovering it after running inference.
    output_dir = "select_save_dir"
    os.makedirs(output_dir, exist_ok=True)
    caption_file = os.path.join(output_dir, "dataset_captions.txt")

    for video in tqdm(videos):
        # Single-turn chat: one video plus the captioning question.
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "video",
                        "video": video,
                    },
                    {"type": "text", "text": "Describe the video and tell me what happened in the video?"},
                ],
            }
        ]

        try:
            inputs = processor.apply_chat_template(
                messages,
                tokenize=True,
                add_generation_prompt=True,
                return_dict=True,
                return_tensors="pt",
            )
            inputs = inputs.to(model.device)

            with torch.inference_mode():
                generated_ids = model.generate(**inputs, max_new_tokens=128)
                # Drop the leading prompt tokens so only the answer is decoded.
                generated_ids_trimmed = [
                    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
                ]
                output_text = processor.batch_decode(
                    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
                )
            # Keep one record per line. Newlines become spaces so that words
            # on adjacent lines are not glued together.
            caption = output_text[0].replace("\n", " ")

            with open(caption_file, "a", encoding="utf-8") as f:
                f.write(f"{video}#####{caption}\n")
        except Exception as exc:
            # Best effort: report the failing video with its cause and go on.
            # Catching Exception (not a bare except) keeps Ctrl+C working.
            print(f"Error video: {video} ({exc!r})")
相关推荐
自可乐14 小时前
n8n全面学习教程:从入门到精通的自动化工作流引擎实践指南
运维·人工智能·学习·自动化
king of code porter14 小时前
百宝箱企业版搭建智能体应用-创建应用
人工智能·大模型·智能体
HDO清风14 小时前
CASIA-HWDB2.x 数据集DGRL文件解析(python)
开发语言·人工智能·pytorch·python·目标检测·计算机视觉·restful
策知道14 小时前
依托政府工作报告准备省考【经验贴】
大数据·数据库·人工智能·搜索引擎·政务
小Tomkk14 小时前
PyTorch +YOLO + Label Studio + 图像识别 深度学习项目实战 (二)
pytorch·深度学习·yolo
工程师老罗14 小时前
Pytorch如何加载和读取VOC数据集用来做目标检测?
人工智能·pytorch·目标检测
测试_AI_一辰14 小时前
Agent & RAG 测试工程05:把 RAG 的检索过程跑清楚:chunk 是什么、怎么来的、怎么被命中的
开发语言·人工智能·功能测试·自动化·ai编程
Henry-SAP14 小时前
SAP(ERP) 组织结构业务视角解析
大数据·人工智能·sap·erp·sap pp
龙腾亚太14 小时前
航空零部件加工变形难题破解:数字孪生 + 深度学习的精度控制实战
人工智能·深度学习·数字孪生·ai工程师·ai证书·转型ai
Coding茶水间14 小时前
基于深度学习的输电电力设备检测系统演示与介绍(YOLOv12/v11/v8/v5模型+Pyqt5界面+训练代码+数据集)
开发语言·人工智能·深度学习·yolo·目标检测·机器学习