来自 豆包
一、数据增强的核心原则
数据增强需遵循**"图像与标签同步变换"**:
- 几何变换(缩放、裁剪、翻转)会改变目标框的像素位置,必须按相同变换规则重新计算目标框坐标;
- 色彩变换(亮度、对比度、饱和度)仅改变像素值,不影响目标框位置,无需调整坐标;
- 增强流程:先做几何变换(保证坐标同步)→ 再做色彩变换(不影响坐标)→ 最后resize到448×448(统一尺寸)→ 转换为YOLOv1标签张量。
二、各增强操作的详细做法(附坐标调整示例)
以下所有示例均基于原始图像448×448 ,原始目标框:xmin=128, ymin=192, xmax=256, ymax=320(dog,中心(192,256))。
1. 随机水平翻转(概率0.5)
操作逻辑
- 图像:左右镜像翻转;
- 坐标调整规则:
翻转后目标框的xmin = 图像宽度 - 原始xmax;
翻转后目标框的xmax = 图像宽度 - 原始xmin;
ymin/ymax保持不变。
数值示例
原始框:xmin=128, ymin=192, xmax=256, ymax=320(图像宽448);
翻转后:xmin=448-256=192,xmax=448-128=320,ymin=192,ymax=320;
目标中心从(192,256)变为(304,256),对应网格从(4,3)变为(4,4)(网格大小64,304/64=4.75→grid_x=4)。
2. 随机缩放(缩放比例0.5~1.5倍)
操作逻辑
- 图像:按随机比例 scale(0.5~1.5)缩放整体尺寸;
- 坐标调整规则:
缩放后xmin = 原始xmin × scale;
缩放后xmax = 原始xmax × scale;
缩放后ymin = 原始ymin × scale;
缩放后ymax = 原始ymax × scale;
- 缩放后需将图像resize回448×448(或裁剪/补边),最终坐标需再按resize比例调整。
数值示例
取缩放比例scale=1.2:
原始框:xmin=128, ymin=192, xmax=256, ymax=320;
缩放后框:xmin=128×1.2=153.6,xmax=256×1.2=307.2,ymin=192×1.2=230.4,ymax=320×1.2=384;
缩放后图像尺寸为 (448×1.2)×(448×1.2) = 537.6×537.6(取整后为537×537),再resize回448×448,框坐标需除以1.2,最终回到原始值(仅增强随机性,不改变最终尺寸)。
3. 随机裁剪(裁剪区域占原图0.2~1.0倍)
操作逻辑
- 图像:随机选择裁剪区域(宽度/高度为原图的0.2~1.0倍),裁剪后resize回448×448;
- 坐标调整规则:
- 计算裁剪区域的左上角坐标 (crop_x, crop_y) 和裁剪尺寸 (crop_w, crop_h);
- 判断目标框与裁剪区域的交集:
  - 若目标框完全在裁剪区域外:丢弃该目标;
  - 若目标框部分/完全在裁剪区域内:裁剪后目标框的坐标 = 原始坐标 - 裁剪区域左上角坐标;
- 裁剪后resize回448×448,目标框坐标需乘以 448/crop_w(宽)和 448/crop_h(高)。
数值示例
原始框:xmin=128, ymin=192, xmax=256, ymax=320;
随机裁剪区域:crop_x=50, crop_y=100, crop_w=300, crop_h=300;
裁剪后框:xmin=128-50=78,xmax=256-50=206,ymin=192-100=92,ymax=320-100=220;
resize回448×448后:xmin=78×448/300≈116.48,xmax=206×448/300≈307.63,ymin=92×448/300≈137.39,ymax=220×448/300≈328.53。
4. 随机调整亮度/对比度/饱和度(色彩变换)
操作逻辑
- 图像:随机调整亮度、对比度、饱和度(下文代码用PIL的ImageEnhance在RGB空间实现;也可在HSV色彩空间调整,亮度对应V通道、饱和度对应S通道);
- 坐标:无需调整,仅改变图像像素值,不影响目标框位置。
三、完整数据增强代码(附坐标同步调整)
以下代码基于PIL和OpenCV实现,包含所有增强操作,且同步调整目标框坐标,最终输出增强后的图像和对应的YOLOv1标签张量:
python
import numpy as np
import cv2
import xml.etree.ElementTree as ET
from PIL import Image, ImageEnhance
import random
# VOC类别映射
VOC_CLASSES = [
'aeroplane', 'bicycle', 'bird', 'boat', 'bottle',
'bus', 'car', 'cat', 'chair', 'cow',
'diningtable', 'dog', 'horse', 'motorbike', 'person',
'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor'
]
VOC_CLASS_TO_ID = {name: idx for idx, name in enumerate(VOC_CLASSES)}
def load_voc_annotation(xml_path):
"""加载VOC XML标注,返回原始目标框和类别"""
tree = ET.parse(xml_path)
root = tree.getroot()
annotations = []
img_w = int(root.find('size/width').text)
img_h = int(root.find('size/height').text)
for obj in root.iter('object'):
class_name = obj.find('name').text
class_id = VOC_CLASS_TO_ID[class_name]
bndbox = obj.find('bndbox')
xmin = float(bndbox.find('xmin').text)
ymin = float(bndbox.find('ymin').text)
xmax = float(bndbox.find('xmax').text)
ymax = float(bndbox.find('ymax').text)
annotations.append({
'bbox': [xmin, ymin, xmax, ymax],
'class_id': class_id
})
return annotations, img_w, img_h
def random_horizontal_flip(image, bboxes):
"""随机水平翻转(概率0.5),同步调整bbox"""
if random.random() < 0.5:
img_w = image.shape[1]
# 图像翻转
image = cv2.flip(image, 1)
# 调整bbox:xmin = img_w - xmax, xmax = img_w - xmin
new_bboxes = []
for bbox in bboxes:
xmin, ymin, xmax, ymax = bbox
new_xmin = img_w - xmax
new_xmax = img_w - xmin
new_bboxes.append([new_xmin, ymin, new_xmax, ymax])
return image, new_bboxes
return image, bboxes
def random_scale(image, bboxes, scale_range=(0.5, 1.5)):
"""随机缩放,同步调整bbox"""
scale = random.uniform(*scale_range)
img_h, img_w = image.shape[:2]
# 缩放图像
new_w = int(img_w * scale)
new_h = int(img_h * scale)
image = cv2.resize(image, (new_w, new_h))
# 调整bbox
new_bboxes = []
for bbox in bboxes:
xmin, ymin, xmax, ymax = bbox
new_xmin = xmin * scale
new_ymin = ymin * scale
new_xmax = xmax * scale
new_ymax = ymax * scale
new_bboxes.append([new_xmin, new_ymin, new_xmax, new_ymax])
return image, new_bboxes
def random_crop(image, bboxes, crop_ratio_range=(0.2, 1.0)):
"""随机裁剪,同步调整bbox"""
img_h, img_w = image.shape[:2]
# 随机选择裁剪比例和位置
crop_w_ratio = random.uniform(*crop_ratio_range)
crop_h_ratio = random.uniform(*crop_ratio_range)
crop_w = int(img_w * crop_w_ratio)
crop_h = int(img_h * crop_h_ratio)
crop_x = random.randint(0, img_w - crop_w)
crop_y = random.randint(0, img_h - crop_h)
# 裁剪图像
cropped_image = image[crop_y:crop_y+crop_h, crop_x:crop_x+crop_w]
# 调整bbox:仅保留在裁剪区域内的目标
new_bboxes = []
for bbox in bboxes:
xmin, ymin, xmax, ymax = bbox
# 计算裁剪后bbox的坐标
new_xmin = max(0, xmin - crop_x)
new_ymin = max(0, ymin - crop_y)
new_xmax = min(crop_w, xmax - crop_x)
new_ymax = min(crop_h, ymax - crop_y)
# 若裁剪后bbox有效(宽高>0),则保留
if new_xmax > new_xmin and new_ymax > new_ymin:
new_bboxes.append([new_xmin, new_ymin, new_xmax, new_ymax])
return cropped_image, new_bboxes
def adjust_color(image):
"""随机调整亮度、对比度、饱和度"""
# 转换为PIL图像便于调整
img_pil = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
# 调整亮度(0.5~1.5倍)
enhancer = ImageEnhance.Brightness(img_pil)
img_pil = enhancer.enhance(random.uniform(0.5, 1.5))
# 调整对比度(0.5~1.5倍)
enhancer = ImageEnhance.Contrast(img_pil)
img_pil = enhancer.enhance(random.uniform(0.5, 1.5))
# 调整饱和度(0.5~1.5倍)
enhancer = ImageEnhance.Color(img_pil)
img_pil = enhancer.enhance(random.uniform(0.5, 1.5))
# 转回OpenCV格式
image = cv2.cvtColor(np.array(img_pil), cv2.COLOR_RGB2BGR)
return image
def voc_to_yolov1_after_aug(image_path, xml_path, S=7, C=20, target_size=448):
"""
完整流程:加载图像+标注 → 数据增强 → resize → 转换为YOLOv1标签
"""
# 1. 加载原始图像和标注
image = cv2.imread(image_path)
annotations, img_w, img_h = load_voc_annotation(xml_path)
bboxes = [ann['bbox'] for ann in annotations]
class_ids = [ann['class_id'] for ann in annotations]
# 2. 数据增强(几何变换:缩放→裁剪→翻转,同步调整bbox)
image, bboxes = random_scale(image, bboxes)
image, bboxes = random_crop(image, bboxes)
image, bboxes = random_horizontal_flip(image, bboxes)
# 3. 色彩变换(不调整bbox)
image = adjust_color(image)
# 4. resize到目标尺寸(448×448),调整bbox
img_h_aug, img_w_aug = image.shape[:2]
image = cv2.resize(image, (target_size, target_size))
# 调整bbox到resize后的坐标
new_bboxes = []
for bbox in bboxes:
xmin, ymin, xmax, ymax = bbox
# 按比例缩放至target_size
xmin = xmin * target_size / img_w_aug
ymin = ymin * target_size / img_h_aug
xmax = xmax * target_size / img_w_aug
ymax = ymax * target_size / img_h_aug
new_bboxes.append([xmin, ymin, xmax, ymax])
# 5. 转换为YOLOv1标签张量
yolov1_label = np.zeros((S, S, 5 + C))
grid_size = target_size / S
for i, bbox in enumerate(new_bboxes):
xmin, ymin, xmax, ymax = bbox
class_id = class_ids[i]
# 计算目标中心和宽高
cx = (xmin + xmax) / 2
cy = (ymin + ymax) / 2
w_abs = xmax - xmin
h_abs = ymax - ymin
# 确定所在网格
grid_x = int(cx / grid_size)
grid_y = int(cy / grid_size)
# 防止网格索引越界
grid_x = min(max(grid_x, 0), S-1)
grid_y = min(max(grid_y, 0), S-1)
# 转换为YOLOv1相对坐标
x = (cx % grid_size) / grid_size
y = (cy % grid_size) / grid_size
w = w_abs / target_size
h = h_abs / target_size
# 填充标签
yolov1_label[grid_y, grid_x, 0] = x
yolov1_label[grid_y, grid_x, 1] = y
yolov1_label[grid_y, grid_x, 2] = w
yolov1_label[grid_y, grid_x, 3] = h
yolov1_label[grid_y, grid_x, 4] = 1.0
yolov1_label[grid_y, grid_x, 5 + class_id] = 1.0
return image, yolov1_label
# 测试代码
if __name__ == "__main__":
# 替换为你的图像和XML路径
image_path = "dog.jpg"
xml_path = "dog.xml"
# 执行增强和转换
aug_image, yolov1_label = voc_to_yolov1_after_aug(image_path, xml_path)
# 保存增强后的图像
cv2.imwrite("dog_aug.jpg", aug_image)
# 打印结果
print("增强后图像尺寸:", aug_image.shape)
print("YOLOv1标签张量形状:", yolov1_label.shape)
# 打印有目标的网格
for grid_y in range(7):
for grid_x in range(7):
if yolov1_label[grid_y, grid_x, 4] == 1.0:
print(f"有目标的网格:({grid_y}, {grid_x})")
print(f" x: {yolov1_label[grid_y, grid_x, 0]:.4f}, y: {yolov1_label[grid_y, grid_x, 1]:.4f}")
print(f" w: {yolov1_label[grid_y, grid_x, 2]:.4f}, h: {yolov1_label[grid_y, grid_x, 3]:.4f}")
print(f" 类别id: {np.argmax(yolov1_label[grid_y, grid_x, 5:])}")
四、代码关键说明
- 增强顺序:先做缩放→裁剪→翻转(几何变换,同步调整坐标),再做色彩变换(不影响坐标),最后resize到448×448;
- 坐标校验:裁剪后会过滤掉无效目标框(宽高≤0),resize后按比例调整坐标,防止越界;
- 多目标处理:遍历所有目标框,逐个调整坐标并填充到YOLOv1标签张量中。
总结
YOLOv1数据增强的核心要点:
- 几何变换必同步调整坐标:翻转、缩放、裁剪后,目标框坐标需按相同变换规则重新计算;
- 色彩变换不调整坐标:仅改变图像像素值,不影响目标框位置;
- 最终统一resize:所有增强后图像需resize到448×448,目标框坐标同步按比例缩放;
- 标签生成:增强后的目标框需重新转换为YOLOv1的网格化相对坐标,填充到标签张量中。
该流程保证了增强后的图像与标签完全匹配,是YOLOv1训练中数据增强的标准实现方式。