Datawhale AI夏令营第五期CV方向-城市管理违规行为智能识别-Task1

赛题解析

城市管理违规行为智能识别

初赛任务是根据给定的城管视频监控数据集，进行城市违规行为的检测。违规行为主要包括垃圾桶满溢、机动车违停、非机动车违停等。

选手需要能够从视频中分析并标记出违规行为，提供违规行为发生的时间和位置信息。

数据可视化

首先对现有数据可视化，直观地了解数据集。

可视化代码如下：

import json
import os

import cv2
import numpy as np
from PIL import Image, ImageDraw, ImageFont

# Render every annotated training video with its bounding boxes and category
# labels burned in, so the annotation quality can be checked by eye.

# Folder paths: source videos, their JSON annotations, and the output folder
# for the annotated copies.
video_folder = r"D:\Illegal_behavior_detection\训练集(有标注第一批)\视频"
annotation_folder = r"D:\Illegal_behavior_detection\训练集(有标注第一批)\标注"
output_folder = r"D:\Illegal_behavior_detection\训练集(有标注第一批)\标注视频"

# Create the output folder if it does not exist yet.
os.makedirs(output_folder, exist_ok=True)

# Input videos, and the videos already rendered by a previous run (a set,
# because it is only used for membership tests).
video_files = [f for f in os.listdir(video_folder) if f.endswith('.mp4')]
labeled_video_files = {f for f in os.listdir(output_folder) if f.endswith('.mp4')}

# Per-category colours. Boxes and labels are drawn with PIL on an RGB image,
# so these tuples are (R, G, B), not OpenCV's (B, G, R).
category_colors = {
    "非机动车违停": (255, 0, 0),    # red
    "机动车违停": (255, 255, 0),    # yellow
    "垃圾桶满溢": (0, 0, 255),      # blue
    "违法经营": (0, 255, 0)         # green
}

# Load a font that contains Chinese glyphs; the category names are Chinese.
font = ImageFont.truetype("simhei.ttf", 24)

for video_file in video_files:
    # Skip videos whose annotated copy already exists.
    if video_file in labeled_video_files:
        continue

    video_path = os.path.join(video_folder, video_file)
    # Swap only the extension; str.replace would rewrite every '.mp4' in the name.
    annotation_path = os.path.join(
        annotation_folder, os.path.splitext(video_file)[0] + '.json'
    )

    # Load the annotations before any output file is created: a missing JSON
    # must not leave an empty output video behind, because the skip check
    # above would then treat it as finished on the next run.
    if not os.path.isfile(annotation_path):
        print(f"找不到标注文件: {annotation_path}")
        continue
    with open(annotation_path, 'r', encoding='utf-8') as f:
        annotations = json.load(f)

    # Group the annotation records by the frame they belong to.
    frame_id_to_annotations = {}
    for annotation in annotations:
        frame_id_to_annotations.setdefault(annotation['frame_id'], []).append(annotation)

    # Open the source video.
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"无法打开视频文件: {video_file}")
        cap.release()
        continue

    out = None
    try:
        # Reuse the source width, height and frame rate for the output.
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = cap.get(cv2.CAP_PROP_FPS)

        output_path = os.path.join(output_folder, video_file)
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

        # frame_id counts decoded frames from 0 and is matched against the
        # 'frame_id' field of the annotation records.
        frame_id = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            if frame_id in frame_id_to_annotations:
                # OpenCV frame (BGR) -> PIL image (RGB); PIL is used because
                # it can render Chinese text with the TrueType font.
                frame_pil = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                draw = ImageDraw.Draw(frame_pil)

                for ann in frame_id_to_annotations[frame_id]:
                    bbox = ann['bbox']
                    category = ann['category']

                    # Unknown categories fall back to white.
                    color = category_colors.get(category, (255, 255, 255))

                    # Order the corners so a mislabelled box (max < min) can
                    # still be drawn; recent Pillow releases reject inverted
                    # boxes with ValueError.
                    x0, x1 = sorted((bbox[0], bbox[2]))
                    y0, y1 = sorted((bbox[1], bbox[3]))
                    draw.rectangle([x0, y0, x1, y1], outline=color, width=2)

                    # Put the label just above the box, clamped so it stays
                    # inside the frame for boxes that touch the top edge.
                    draw.text((x0, max(y0 - 25, 0)), category, font=font, fill=color)

                # PIL image (RGB) -> OpenCV frame (BGR) for the writer.
                frame = cv2.cvtColor(np.array(frame_pil), cv2.COLOR_RGB2BGR)

            # Write every frame, annotated or not, so the timing is preserved.
            out.write(frame)
            frame_id += 1
    finally:
        # Always release the handles so the output file is finalised and the
        # capture is closed even if a frame fails to process.
        cap.release()
        if out is not None:
            out.release()

print("标注视频已成功生成并保存到输出文件夹。")

在观看标注后的视频时，发现部分数据的标注存在严重错误，需要用标注软件进行修正。

标注格式转换

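由于视频内容趋于同质化，且相邻帧之间差别不大，将所有帧(60000+)都用于训练意义不大，故每隔10帧抽取一帧，将数据集大小减小至约6000张图片。同时，将标注框从 [x_min, y_min, x_max, y_max] 像素坐标转换为 YOLO 所需的归一化 (x_center, y_center, width, height) 格式。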

抽帧与标注格式转换代码如下：

# Sample every 10th frame of each video, save it as a JPEG and, when that frame
# has annotations, write a matching YOLO-format label file
# (`class x_center y_center width height`, all normalised to [0, 1]).
# NOTE(review): `annos`, `videos`, `img_path`, `label_path` and
# `category_labels` are defined outside this snippet; `img_path` and
# `label_path` are used as plain string prefixes, so they presumably end with
# a path separator. Confirm against the setup code.
for anno_path, video_path in zip(annos, videos):
    print(video_path)
    anno_df = pd.read_json(anno_path)
    # An empty annotation file yields a DataFrame without a 'frame_id' column;
    # treat that as "no labelled frames" instead of failing with KeyError.
    has_annotations = 'frame_id' in anno_df.columns
    # File-name stem shared by every image/label written for this video.
    sample_stem = os.path.basename(anno_path).split('.')[0]

    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"无法打开视频文件: {video_path}")
        cap.release()
        continue

    try:
        frame_idx = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # Keep one frame in ten.
            if frame_idx % 10 == 0:
                img_height, img_width = frame.shape[:2]
                sample_name = sample_stem + '_' + str(frame_idx)

                # The image is written even when the frame has no annotation.
                cv2.imwrite(img_path + sample_name + '.jpg', frame)

                if has_annotations:
                    frame_anno = anno_df[anno_df['frame_id'] == frame_idx]
                else:
                    frame_anno = anno_df

                if len(frame_anno) != 0:
                    with open(label_path + sample_name + '.txt', 'w') as up:
                        for category, bbox in zip(frame_anno['category'].values, frame_anno['bbox'].values):
                            # Raises ValueError for a category that is not in
                            # category_labels.
                            category_idx = category_labels.index(category)

                            # Corner pixel coordinates -> normalised centre/size.
                            x_min, y_min, x_max, y_max = bbox
                            x_center = (x_min + x_max) / 2 / img_width
                            y_center = (y_min + y_max) / 2 / img_height
                            width = (x_max - x_min) / img_width
                            height = (y_max - y_min) / img_height

                            # Print boxes whose centre falls outside the image
                            # so they can be inspected; they are still written.
                            if x_center > 1 or y_center > 1:
                                print(bbox)
                            up.write(f'{category_idx} {x_center} {y_center} {width} {height}\n')
            frame_idx += 1
    finally:
        # Release the capture for every video; otherwise one handle per video
        # stays open until the process exits.
        cap.release()

标注清洗

针对标注严重错误的视频，使用 X-AnyLabeling 重新进行标注，并导出 YOLO 格式的标注文件。

模型训练

最后遵循教程进行训练并提交结果。

个人思考

标注有待优化。
训练集的拍摄位置单一，而测试集场景与训练集完全不同，预计需要补充更多场景的数据用于训练。
如何区分正常停放车辆和违规停放车辆是个需要思考的问题。