OpenCV Vision Study Notes V: Data Augmentation (2)

Preface:

Picking up where we left off: the previous chapter covered data preprocessing for vision tasks and the conversions between the corresponding label formats, then summarized a first batch of data augmentation methods. This chapter continues that survey with further augmentation techniques.

OpenCV Vision Study Notes V: Data Augmentation (1) - CSDN blog

1. Pixel-Transform Data Augmentation Collection

⭐1. BGR <-> RGB / BGR -> Grayscale

Core logic:

BGR -> RGB -> grayscale

Python:
import cv2
import os
import numpy as np

# ====================== Core function: BGR to grayscale ======================
def bgr2gray(img_bgr: np.ndarray, to_3ch: bool = True) -> np.ndarray:
    """
    Minimal core: convert BGR to grayscale (optionally 3-channel for model input)
    :param img_bgr: BGR image loaded by OpenCV (H, W, C)
    :param to_3ch: convert to 3 channels (True = model-compatible, False = single channel)
    :return: grayscale image (1 or 3 channels)
    """
    img_gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    # 3-channel conversion (replicates the single channel)
    if to_3ch:
        return cv2.cvtColor(img_gray, cv2.COLOR_GRAY2BGR)
    return img_gray

# ====================== Usage example ======================
if __name__ == "__main__":
    # 1. Configure paths (only edit here)
    IMG_PATH = r"C:\Users\Excub\workspace\Pytorch\12.08_cv\perspective_data\ok.jpg"
    SAVE_DIR = r"C:\Users\Excub\workspace\Pytorch\12.08_cv\perspective_data\pixel_aug_results"
    
    # 2. Load the source image
    img_bgr = cv2.imread(IMG_PATH)
    if img_bgr is None:
        raise FileNotFoundError(f"Cannot read image: {IMG_PATH}")
    
    # 3. Core call: convert to grayscale (3 channels, model-compatible)
    img_gray_3ch = bgr2gray(img_bgr, to_3ch=True)
    # Optional: single-channel grayscale
    # img_gray_1ch = bgr2gray(img_bgr, to_3ch=False)
    
    # 4. Save the result
    os.makedirs(SAVE_DIR, exist_ok=True)
    img_basename = os.path.splitext(os.path.basename(IMG_PATH))[0]
    cv2.imwrite(f"{SAVE_DIR}/{img_basename}_gray_3ch.jpg", img_gray_3ch)
    # cv2.imwrite(f"{SAVE_DIR}/{img_basename}_gray_1ch.jpg", img_gray_1ch)  # single-channel save
    
    print(f"Grayscale conversion done! Saved to: {SAVE_DIR}/{img_basename}_gray_3ch.jpg")
    print("Key points:")
    print("1. to_3ch=True: the grayscale image is replicated to 3 channels, directly usable by models expecting 3-channel input;")
    print("2. to_3ch=False: single-channel grayscale, keeping only luminance;")
    print("3. No color distortion: only chroma is discarded; luminance features are fully preserved.")


2. Global Histogram Equalization

Python:
cv2.equalizeHist(gray_img)  # single-channel input only (grayscale / Y channel)

Core code:

Python:
def global_histogram_equalization(img_bgr: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    """
    Global histogram equalization (handles grayscale and color separately)
    Args:
        img_bgr: source image in BGR format
    Returns:
        img_gray_eq: equalized grayscale result (3 channels)
        img_color_eq: equalized color result (only the luminance channel is equalized, colors preserved)
    """
    # ========== 1. Grayscale histogram equalization ==========
    img_gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    img_gray_eq = cv2.equalizeHist(img_gray)  # core API: global histogram equalization
    img_gray_eq_3ch = cv2.cvtColor(img_gray_eq, cv2.COLOR_GRAY2BGR)  # to 3 channels
    print("Grayscale global histogram equalization done (boosts overall contrast)")

    # ========== 2. Color histogram equalization (key: luminance channel only) ==========
    # Equalizing the BGR channels directly would distort colors, so convert to
    # YCrCb (Y = luminance, Cr/Cb = chroma) and equalize only Y
    img_ycrcb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2YCrCb)
    # Equalize only the luminance channel (Y)
    img_ycrcb[:, :, 0] = cv2.equalizeHist(img_ycrcb[:, :, 0])
    # Convert back to BGR
    img_color_eq = cv2.cvtColor(img_ycrcb, cv2.COLOR_YCrCb2BGR)
    print("Color global histogram equalization done (luminance only, no color distortion)")

    return img_gray_eq_3ch, img_color_eq
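
The other sections each end with a runnable entry point; a minimal usage sketch for the function above, assuming the same placeholder paths as the grayscale example:

Python:
if __name__ == "__main__":
    IMG_PATH = r"C:\Users\Excub\workspace\Pytorch\12.08_cv\perspective_data\ok.jpg"  # placeholder path
    img_bgr = cv2.imread(IMG_PATH)
    if img_bgr is None:
        raise FileNotFoundError(f"Cannot read image: {IMG_PATH}")
    img_gray_eq, img_color_eq = global_histogram_equalization(img_bgr)
    cv2.imwrite("ok_gray_hist_eq.jpg", img_gray_eq)    # grayscale result
    cv2.imwrite("ok_color_hist_eq.jpg", img_color_eq)  # color result, colors preserved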

3. Adaptive Histogram Equalization (CLAHE)

cv2.createCLAHE() (as opposed to the global cv2.equalizeHist())

Python:
import cv2
import numpy as np

def adaptive_histogram_equalization(img_bgr: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    """
    Adaptive local histogram equalization (core method)
    Key parameters:
    - clipLimit=2.0: contrast limit; prevents over-enhancement from amplifying noise
    - tileGridSize=(8,8): tile size; the image is split into 8x8 regions equalized separately
    """
    # ========== 1. Grayscale adaptive equalization ==========
    img_gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY)
    # Create the CLAHE object (core API)
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    img_gray_clahe = clahe.apply(img_gray)  # apply adaptive equalization
    img_gray_clahe_3ch = cv2.cvtColor(img_gray_clahe, cv2.COLOR_GRAY2BGR)

    # ========== 2. Color adaptive equalization (luminance channel only) ==========
    img_ycrcb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2YCrCb)
    img_ycrcb[:, :, 0] = clahe.apply(img_ycrcb[:, :, 0])  # process only the Y channel
    img_color_clahe = cv2.cvtColor(img_ycrcb, cv2.COLOR_YCrCb2BGR)

    return img_gray_clahe_3ch, img_color_clahe

# ------------------- Usage example -------------------
if __name__ == "__main__":
    # 1. Load the source image
    IMG_PATH = r"C:\Users\Excub\workspace\Pytorch\12.08_cv\perspective_data\ok.jpg"
    img_origin = cv2.imread(IMG_PATH, cv2.IMREAD_COLOR)
    
    # 2. Core call: adaptive histogram equalization
    img_gray_clahe, img_color_clahe = adaptive_histogram_equalization(img_origin)
    
    # 3. Save results (optional, for comparison)
    SAVE_DIR = r"C:\Users\Excub\workspace\Pytorch\12.08_cv\perspective_data\pixel_aug_results"
    cv2.imwrite(f"{SAVE_DIR}/ok_gray_adaptive_hist_eq.jpg", img_gray_clahe)
    cv2.imwrite(f"{SAVE_DIR}/ok_color_adaptive_hist_eq.jpg", img_color_clahe)
    
    print("Adaptive histogram equalization done!")
    print("Key differences:")
    print("1. Vs. global equalization: locally uneven regions (e.g. partial backlight) look more natural, with no blowout or amplified noise;")
    print("2. Larger clipLimit: stronger contrast but more amplified noise; 2.0 is a common industrial default;")
    print("3. Smaller tileGridSize: finer tiles adapt better locally at slightly higher cost; 8x8 is a good balance.")

4. Random Brightness Adjustment

random_brightness (randomly adjusts brightness)

Python:
import cv2
import numpy as np
import random

def random_brightness(img_bgr: np.ndarray, brightness_range=(0.5, 1.5)) -> np.ndarray:
    """
    Randomly adjust brightness (core method)
    Key constraints:
    - brightness factor limited to 0.5~1.5 (avoids over-dark / blown-out images)
    - pixel values are clipped to 0~255 to prevent invalid values
    """
    # Draw a random brightness factor (0.5~1.5x)
    brightness = random.uniform(*brightness_range)
    # Compute in float to avoid uint8 overflow
    img_float = img_bgr.astype(np.float32)
    # Adjust brightness (all channels scaled equally, preserving color)
    img_bright = img_float * brightness
    # Clip pixel values into 0~255 (the key constraint)
    img_bright = np.clip(img_bright, 0, 255).astype(np.uint8)
    return img_bright

# ------------------- Usage example -------------------
if __name__ == "__main__":
    # 1. Load the source image
    IMG_PATH = r"C:\Users\Excub\workspace\Pytorch\12.08_cv\perspective_data\ok.jpg"
    img_origin = cv2.imread(IMG_PATH, cv2.IMREAD_COLOR)
    
    # 2. Core call: random brightness (0.5~1.5x)
    img_bright = random_brightness(img_origin, brightness_range=(0.5, 1.5))
    
    # 3. Save the result (for comparison)
    SAVE_DIR = r"C:\Users\Excub\workspace\Pytorch\12.08_cv\perspective_data\pixel_aug_results"
    cv2.imwrite(f"{SAVE_DIR}/ok_random_brightness.jpg", img_bright)
    
    print("Random brightness adjustment done!")
    print("Key points:")
    print("1. Keep the factor in 0.5~1.5: <0.3 loses detail in darkness, >1.8 blows out highlights;")
    print("2. Scale all channels equally: avoids color shifts (adjusting only R would tint the image);")
    print("3. Always clip: uint8 ranges 0~255; overflow causes false colors / black blocks;")
    print("4. Use case: simulating strong/weak lighting on industrial targets (e.g. workshop lighting changes).")

5. Pixel Inversion

Core usage:

Python:
# Pixel inversion only (direct use)
img_bgr = cv2.imread(IMG_PATH)
img_inverted = img_bitwise_not(img_bgr)
cv2.imwrite("ok_bitwise_not.jpg", img_inverted)
Python:
import cv2
import os
import numpy as np

# ====================== Core function: pixel inversion (negative) ======================
def img_bitwise_not(img_bgr: np.ndarray) -> np.ndarray:
    """
    Minimal core: invert the pixels of an OpenCV BGR image (negative)
    :param img_bgr: BGR image loaded by OpenCV (H, W, C)
    :return: inverted BGR image
    """
    # Core API: cv2.bitwise_not inverts each pixel (for uint8 in 0~255 this equals 255 - pixel)
    img_inverted = cv2.bitwise_not(img_bgr)
    return img_inverted

# ====================== Minimal label handling (inversion needs no label change, just copy) ======================
def copy_yolo_label(src_label_path: str, dst_label_path: str):
    """Copy the original label (inversion does not move target boxes)"""
    if os.path.exists(src_label_path):
        import shutil
        shutil.copy(src_label_path, dst_label_path)
        print(f"Label copied: {dst_label_path}")

# ====================== Usage example ======================
if __name__ == "__main__":
    # 1. Configure paths (only edit here)
    IMG_PATH = r"C:\Users\Excub\workspace\Pytorch\12.08_cv\perspective_data\ok.jpg"
    LABEL_PATH = r"C:\Users\Excub\workspace\Pytorch\12.08_cv\perspective_data\ok.txt"  # optional: original YOLO label
    SAVE_DIR = r"C:\Users\Excub\workspace\Pytorch\12.08_cv\perspective_data\pixel_aug_results"
    
    # 2. Load the source image
    img_bgr = cv2.imread(IMG_PATH)
    if img_bgr is None:
        raise FileNotFoundError(f"Cannot read image: {IMG_PATH}")
    
    # 3. Core call: invert pixels
    img_inverted = img_bitwise_not(img_bgr)
    
    # 4. Save the result
    os.makedirs(SAVE_DIR, exist_ok=True)
    img_basename = os.path.splitext(os.path.basename(IMG_PATH))[0]
    cv2.imwrite(f"{SAVE_DIR}/{img_basename}_bitwise_not.jpg", img_inverted)
    
    # 5. Copy the label (inversion does not move boxes; reuse the original label)
    if os.path.exists(LABEL_PATH):
        copy_yolo_label(LABEL_PATH, f"{SAVE_DIR}/{img_basename}_bitwise_not.txt")
    
    print(f"Pixel inversion done! Saved to: {SAVE_DIR}/{img_basename}_bitwise_not.jpg")
    print("Key points:")
    print("1. Effect: pixel values are inverted (black->white, red->cyan), boosting target/background contrast;")
    print("2. Labels: no YOLO box correction needed (only pixel values change, not position/size);")
    print("3. Use cases: light targets on dark backgrounds, high-contrast industrial scenes (e.g. printed characters);")
    print("4. No parameters: a pure value flip, nothing to tune.")

6. Subtract Pixel Mean (subtract_mean)

Core usage:

Python:
# Compute the global mean automatically and subtract it
img_bgr = cv2.imread(IMG_PATH)
img_sub_mean = subtract_mean(img_bgr, mean=None)
cv2.imwrite("ok_sub_mean.jpg", img_sub_mean)

Purpose: a pixel-level normalization step that

1. removes the influence of global illumination offsets;

2. reduces distribution differences across samples, improving model generalization.

Scenarios:

1. Industrial / outdoor vision with unstable lighting (the core scenario)

  • Industrial inspection: workshop parts / packaging shots (lamp brightness and angles cause uneven lighting);
  • Outdoor vision: traffic cameras, drone imagery (large lighting differences between morning/evening, overcast and sunny days);
  • Security monitoring: day/night pedestrian and vehicle detection (large global brightness swings).

2. Transfer learning on pretrained models (mandatory)

Whenever you reuse ImageNet-pretrained weights for classification, detection, or segmentation, you must subtract the fixed mean used during pretraining (e.g. the ImageNet mean); otherwise the low-level feature extraction (edges, textures) deviates completely from what the model expects. Example: for industrial part-defect classification with ResNet50, the first preprocessing step is subtract_mean(img, mean=[103.939, 116.779, 123.68]).

Things you must do when using it:

  1. Always clip: subtracting the mean can produce negative pixels (a dark pixel of 5 minus a mean of 10 gives -5); constrain with np.clip(img_sub_mean, 0, 255) or the image shows false colors and black blocks;
  2. Match the mean to the scenario (see the dataset-mean sketch after this list)
    • transfer learning: use the pretrained model's fixed mean (e.g. ImageNet);
    • training on your own dataset: compute the dataset's global mean (the per-channel mean over all images), not a single image's mean (np.mean(img_float, axis=(0,1)) in the code below is per-image; batch training needs the full-dataset mean first);
  3. Only pixel values change, labels need no edits: like inversion and brightness adjustment, mean subtraction does not move or resize targets, so the YOLO label can be copied as-is.
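
For point 2, a minimal sketch of computing a dataset-wide per-channel mean (the directory path and the *.jpg pattern are placeholders):

Python:
import cv2
import glob
import numpy as np

def dataset_mean(img_dir: str) -> np.ndarray:
    """Accumulate per-channel pixel sums over every image, then divide by the total pixel count."""
    channel_sum = np.zeros(3, dtype=np.float64)
    pixel_count = 0
    for path in glob.glob(f"{img_dir}/*.jpg"):
        img = cv2.imread(path)
        if img is None:
            continue
        channel_sum += img.reshape(-1, 3).sum(axis=0)
        pixel_count += img.shape[0] * img.shape[1]
    return channel_sum / max(pixel_count, 1)  # B/G/R mean over the whole dataset

# mean_bgr = dataset_mean(r"path\to\train\images")  # hypothetical directory
# img_sub = subtract_mean(img_bgr, mean=list(mean_bgr))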
Python:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@Project :Pytorch
@File :subtract_mean.py
@IDE :PyCharm
@Author :wjj
@Date :2025/12/14 18:56
@Description:
"""
import cv2
import os
import numpy as np


# ====================== Core function: subtract pixel mean ======================
def subtract_mean(img_bgr: np.ndarray, mean: list = None) -> np.ndarray:
    """
    Minimal core: subtract a pixel mean (global/custom) and clip to keep values valid
    :param img_bgr: BGR image loaded by OpenCV (H, W, C)
    :param mean: custom mean [B, G, R]; if None, the image's own global mean is used
    :return: BGR image after mean subtraction (uint8)
    """
    # Compute in float to avoid uint8 overflow (negatives / out-of-range)
    img_float = img_bgr.astype(np.float32)

    # Compute / use the mean
    if mean is None:
        mean = np.mean(img_float, axis=(0, 1))  # per-channel global mean of this image
        print(f"Auto-computed global image mean (B/G/R): {mean.round(2)}")
    else:
        mean = np.array(mean, dtype=np.float32)

    # Core operation: subtract the mean + clip pixel values to 0~255
    img_sub_mean = img_float - mean
    img_sub_mean = np.clip(img_sub_mean, 0, 255)  # prevents negatives / >255 artifacts

    # Back to uint8 (required for OpenCV saving / model input)
    return img_sub_mean.astype(np.uint8)


# ====================== Minimal label handling (plain copy) ======================
def copy_yolo_label(src_label_path: str, dst_label_path: str):
    """Copy the original label (mean subtraction does not move target boxes)"""
    if os.path.exists(src_label_path):
        import shutil
        shutil.copy(src_label_path, dst_label_path)


# ====================== Usage example ======================
if __name__ == "__main__":
    # 1. Configure paths
    IMG_PATH = r"C:\Users\Excub\workspace\Pytorch\12.08_cv\perspective_data\ok.jpg"
    LABEL_PATH = r"C:\Users\Excub\workspace\Pytorch\12.08_cv\perspective_data\ok.txt"  # optional
    SAVE_DIR = r"C:\Users\Excub\workspace\Pytorch\12.08_cv\perspective_data\pixel_aug_results"

    # 2. Load the source image
    img_bgr = cv2.imread(IMG_PATH)
    if img_bgr is None:
        raise FileNotFoundError(f"Cannot read image: {IMG_PATH}")

    # 3. Core call (two options)
    # Option 1: auto-computed global mean
    img_sub_global_mean = subtract_mean(img_bgr, mean=None)
    # Option 2: custom mean (e.g. ImageNet mean [103.939, 116.779, 123.68] for pretrained models)
    # img_sub_custom_mean = subtract_mean(img_bgr, mean=[103.939, 116.779, 123.68])

    # 4. Save results
    os.makedirs(SAVE_DIR, exist_ok=True)
    img_basename = os.path.splitext(os.path.basename(IMG_PATH))[0]
    cv2.imwrite(f"{SAVE_DIR}/{img_basename}_sub_global_mean.jpg", img_sub_global_mean)
    # cv2.imwrite(f"{SAVE_DIR}/{img_basename}_sub_custom_mean.jpg", img_sub_custom_mean)

    # 5. Copy the label (no correction needed)
    if os.path.exists(LABEL_PATH):
        copy_yolo_label(LABEL_PATH, f"{SAVE_DIR}/{img_basename}_sub_global_mean.txt")

    print(f"Mean subtraction done! Saved to: {SAVE_DIR}/{img_basename}_sub_global_mean.jpg")
    print("Key points:")
    print("1. Purpose: reduces global illumination offsets and improves generalization;")
    print("2. Constraint: always clip to 0~255, or negative pixels cause color casts / artifacts;")
    print("3. Mean choice: a fixed mean (e.g. ImageNet) suits pretrained models; the dataset mean suits your own data;")
    print("4. Labels: only pixel values shift, boxes are unchanged; copy the original label.")

7. Image Sharpening (Laplacian / Gaussian / Mean)

Core usage:

Python:
# Gaussian sharpening only (most common, softest effect)
img_bgr = cv2.imread(IMG_PATH)
img_sharpen = image_sharpen(img_bgr, sharpen_type="gaussian", alpha=0.5)
cv2.imwrite("ok_gaussian_sharpen.jpg", img_sharpen)

Its essence: use the result of blurring / edge extraction to reinforce detail in the reverse direction. It is the exact opposite of the three blur-based denoising effects.

Think of the image as a sheet of textured paper:

  • blur (mean / Gaussian): smooth the paper flat by hand, so the texture (detail) fades (weakened);
  • sharpening (mean / Gaussian): compare the smoothed sheet with the original, find the "faded texture difference", and trace those textures darker with a pen (reinforced);
  • Laplacian sharpening: find the paper's "contour lines" (edges) directly and trace them thicker (strongly reinforced).

Scenario | Choose blur (mean / Gaussian) | Choose sharpening (mean / Gaussian / Laplacian)
Grainy noise, needs denoising | ✅ mean blur | ❌ sharpening amplifies noise
Low-resolution image, needs smooth contours | ✅ Gaussian blur | ❌ sharpening amplifies pixel blocks
Blurry part scratches, need clarity | ❌ blur makes it worse | ✅ Laplacian sharpening (strong edges)
Soft printed-text edges, gentle boost | ❌ blur makes it worse | ✅ Gaussian sharpening (natural, artifact-free)
Low texture contrast, needs reinforcement | ❌ blur lowers contrast | ✅ mean sharpening (boosts texture contrast)
Python:
import cv2
import os
import numpy as np


# ====================== Core function: multi-type image sharpening ======================
def image_sharpen(img_bgr: np.ndarray, sharpen_type: str = "laplacian", alpha: float = 0.5) -> np.ndarray:
    """
    Minimal core: Laplacian / Gaussian / mean sharpening (reinforces edges/texture)
    :param img_bgr: BGR image loaded by OpenCV (H, W, C)
    :param sharpen_type: sharpening type - laplacian/gaussian/mean
    :param alpha: sharpening strength (0.3~0.7 recommended; too strong causes artifacts)
    :return: sharpened BGR image (uint8)
    """
    # Compute in float to avoid uint8 overflow
    img_float = img_bgr.astype(np.float32)

    # 1. Build the blur/edge image (the core difference between the types)
    if sharpen_type == "laplacian":
        # Laplacian sharpening: extract edges, then add them onto the source
        laplacian = cv2.Laplacian(img_float, cv2.CV_32F, ksize=3)  # 3x3 edge kernel
        img_sharpen = img_float + alpha * laplacian
    elif sharpen_type == "gaussian":
        # Gaussian sharpening: source - Gaussian blur -> reinforced detail
        img_blur = cv2.GaussianBlur(img_float, (3, 3), 1.0)  # 3x3 Gaussian kernel, sigma=1
        img_sharpen = img_float + alpha * (img_float - img_blur)
    elif sharpen_type == "mean":
        # Mean sharpening: source - mean blur -> reinforced contrast
        img_blur = cv2.blur(img_float, (3, 3))  # 3x3 mean kernel
        img_sharpen = img_float + alpha * (img_float - img_blur)
    else:
        raise ValueError(f"Invalid sharpen type: {sharpen_type}, choose laplacian/gaussian/mean")

    # 2. Clip pixel values to 0~255 to avoid artifacts/overflow
    img_sharpen = np.clip(img_sharpen, 0, 255).astype(np.uint8)
    return img_sharpen


# ====================== Minimal label handling (plain copy) ======================
def copy_yolo_label(src_label_path: str, dst_label_path: str):
    """Copy the original label (sharpening does not move target boxes)"""
    if os.path.exists(src_label_path):
        import shutil
        shutil.copy(src_label_path, dst_label_path)


# ====================== Usage example ======================
if __name__ == "__main__":
    # 1. Configure paths
    IMG_PATH = r"C:\Users\Excub\workspace\Pytorch\12.08_cv\jjy.jpg"
    LABEL_PATH = r"C:\Users\Excub\workspace\Pytorch\12.08_cv\perspective_data\ok.txt"  # optional
    SAVE_DIR = r"C:\Users\Excub\workspace\Pytorch\12.08_cv\out"

    # 2. Load the source image
    img_bgr = cv2.imread(IMG_PATH)
    if img_bgr is None:
        raise FileNotFoundError(f"Cannot read image: {IMG_PATH}")

    # 3. Core call: the three sharpen types (pick one as needed)
    img_laplacian = image_sharpen(img_bgr, sharpen_type="laplacian", alpha=0.5)  # Laplacian (strongest edges)
    img_gaussian = image_sharpen(img_bgr, sharpen_type="gaussian", alpha=0.5)  # Gaussian (softest)
    img_mean = image_sharpen(img_bgr, sharpen_type="mean", alpha=0.5)  # mean (contrast boost)

    # 4. Save results
    os.makedirs(SAVE_DIR, exist_ok=True)
    img_basename = os.path.splitext(os.path.basename(IMG_PATH))[0]
    cv2.imwrite(f"{SAVE_DIR}/{img_basename}_laplacian_sharpen.jpg", img_laplacian)
    cv2.imwrite(f"{SAVE_DIR}/{img_basename}_gaussian_sharpen.jpg", img_gaussian)
    cv2.imwrite(f"{SAVE_DIR}/{img_basename}_mean_sharpen.jpg", img_mean)

    # 5. Copy the label (no correction needed)
    if os.path.exists(LABEL_PATH):
        copy_yolo_label(LABEL_PATH, f"{SAVE_DIR}/{img_basename}_laplacian_sharpen.txt")

    print("All three sharpen types done! Saved to:", SAVE_DIR)
    print("Key points:")
    print("1. Strength alpha: 0.3~0.7 recommended; >0.8 causes jaggies/artifacts, <0.2 barely shows;")
    print("2. Type choice:")
    print("   - laplacian: strongest edge boost (blurry target edges, e.g. distant small targets);")
    print("   - gaussian: softest (low-resolution / motion-blurred images, no visible artifacts);")
    print("   - mean: contrast boost (texture-blurred targets, e.g. printed characters / part textures);")
    print("3. Use cases: low-resolution, motion-blurred, or edge-blurred industrial targets (box text, part scratches);")
    print("4. Labels: only texture is enhanced, boxes unchanged; copy the original label.")

8. Sobel Edge-Fusion Enhancement

Essence:

Sobel edge extraction + fusing the edge map back into the source image (core logic: reinforce target edge features)

Python:
# Both-direction Sobel edge fusion only (most common)
img_bgr = cv2.imread(IMG_PATH)
img_sobel = sobel_edge_to_image(img_bgr, edge_dir="both", alpha=0.5)
cv2.imwrite("ok_sobel_both.jpg", img_sobel)
  • The Sobel operator is the "edge-carving knife" and the Gaussian operator the "denoising wash basin": both are low-level tools;
  • Sobel edge-fusion enhancement is "using the knife (and, optionally, the basin) to cook a dish with crisper edges": an application-level pipeline built on those tools;
Python:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@Project :Pytorch
@File :sobel_.py
@IDE :PyCharm
@Author :wjj
@Date :2025/12/14 19:23
@Description:
"""
import cv2
import os
import numpy as np


# ====================== Core function: Sobel edge extraction + fusion into the source ======================
def sobel_edge_to_image(img_bgr: np.ndarray, edge_dir: str = "both", alpha: float = 0.5) -> np.ndarray:
    """
    Minimal core: Sobel edge extraction + blend the edge map into the source (reinforces edges)
    :param img_bgr: BGR image loaded by OpenCV (H, W, C)
    :param edge_dir: edge direction - x (vertical edges) / y (horizontal edges) / both
    :param alpha: fusion strength (0.3~0.8 recommended; too strong hides source detail)
    :return: edge-fused BGR image (uint8)
    """
    # 1. Convert to grayscale (Sobel is applied to a single channel here)
    img_gray = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2GRAY).astype(np.float32)

    # 2. Sobel edge extraction (core API: cv2.Sobel, first-derivative edge detection)
    if edge_dir == "x":
        # x direction: detects vertical edges (vertical text strokes, vertical part edges)
        sobel_edge = cv2.Sobel(img_gray, cv2.CV_32F, dx=1, dy=0, ksize=3)
    elif edge_dir == "y":
        # y direction: detects horizontal edges (horizontal strokes, horizontal part edges)
        sobel_edge = cv2.Sobel(img_gray, cv2.CV_32F, dx=0, dy=1, ksize=3)
    elif edge_dir == "both":
        # Both directions: combine x/y edges (most common, fullest edges)
        sobel_x = cv2.Sobel(img_gray, cv2.CV_32F, dx=1, dy=0, ksize=3)
        sobel_y = cv2.Sobel(img_gray, cv2.CV_32F, dx=0, dy=1, ksize=3)
        sobel_edge = cv2.addWeighted(sobel_x, 0.5, sobel_y, 0.5, 0)
    else:
        raise ValueError(f"Invalid edge direction: {edge_dir}, choose x/y/both")

    # 3. Normalize the edge map to 0~255 + expand to 3 channels (to match the source)
    sobel_edge = np.clip(cv2.normalize(sobel_edge, None, 0, 255, cv2.NORM_MINMAX), 0, 255)
    sobel_edge_3ch = cv2.cvtColor(sobel_edge.astype(np.uint8), cv2.COLOR_GRAY2BGR)

    # 4. Blend the edge map into the source (the source dominates, edges assist)
    img_float = img_bgr.astype(np.float32)
    edge_float = sobel_edge_3ch.astype(np.float32)
    img_fused = (1 - alpha) * img_float + alpha * edge_float  # weighted blend
    img_fused = np.clip(img_fused, 0, 255).astype(np.uint8)

    return img_fused


# ====================== Minimal label handling (plain copy) ======================
def copy_yolo_label(src_label_path: str, dst_label_path: str):
    """Copy the original label (edge fusion does not move target boxes)"""
    if os.path.exists(src_label_path):
        import shutil
        shutil.copy(src_label_path, dst_label_path)


# ====================== Usage example ======================
if __name__ == "__main__":
    # 1. Configure paths
    IMG_PATH = r"C:\Users\Excub\workspace\Pytorch\12.08_cv\jjy.jpg"
    LABEL_PATH = r"C:\Users\Excub\workspace\Pytorch\12.08_cv\perspective_data\ok.txt"  # optional
    SAVE_DIR = r"C:\Users\Excub\workspace\Pytorch\12.08_cv\out\sobel"

    # 2. Load the source image
    img_bgr = cv2.imread(IMG_PATH)
    if img_bgr is None:
        raise FileNotFoundError(f"Cannot read image: {IMG_PATH}")

    # 3. Core call (pick a direction)
    img_sobel_x = sobel_edge_to_image(img_bgr, edge_dir="x", alpha=0.5)  # vertical edges
    img_sobel_y = sobel_edge_to_image(img_bgr, edge_dir="y", alpha=0.5)  # horizontal edges
    img_sobel_both = sobel_edge_to_image(img_bgr, edge_dir="both", alpha=0.5)  # both (most common)

    # 4. Save results
    os.makedirs(SAVE_DIR, exist_ok=True)
    img_basename = os.path.splitext(os.path.basename(IMG_PATH))[0]
    cv2.imwrite(f"{SAVE_DIR}/{img_basename}_sobel_x.jpg", img_sobel_x)
    cv2.imwrite(f"{SAVE_DIR}/{img_basename}_sobel_y.jpg", img_sobel_y)
    cv2.imwrite(f"{SAVE_DIR}/{img_basename}_sobel_both.jpg", img_sobel_both)

    # 5. Copy the label (no correction needed)
    if os.path.exists(LABEL_PATH):
        copy_yolo_label(LABEL_PATH, f"{SAVE_DIR}/{img_basename}_sobel_both.txt")

    print("Sobel edge fusion done! Saved to:", SAVE_DIR)
    print("Key points:")
    print("1. Direction choice:")
    print("   - x: reinforces vertical edges (box sides, vertical text strokes);")
    print("   - y: reinforces horizontal edges (box tops, horizontal strokes);")
    print("   - both: reinforces all edges (first choice in industrial scenes);")
    print("2. Strength alpha: 0.3~0.8; >0.8 hides the source texture, <0.3 barely shows;")
    print("3. Use cases: edge-blurred industrial targets (low-res part edges, printed character outlines);")
    print("4. Key difference: Sobel gives first-derivative (softer) edges, Laplacian second-derivative (sharper); Sobel suits blurry industrial images better.")

2. Image Fusion / Stitching Data Augmentation Collection

1. MixUp Data Augmentation

Core idea: blend two images into a single image with a weighting factor.

Core code:

Python:
# MixUp fusion only (λ=0.5)
img1 = cv2.imread(IMG1_PATH)
img2 = cv2.imread(IMG2_PATH)
bboxes1 = load_yolo_bboxes(LABEL1_PATH)
bboxes2 = load_yolo_bboxes(LABEL2_PATH)
img_mix, bboxes_mix = mix_up_images(img1, img2, bboxes1, bboxes2, lambda_=0.5)
Python:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@Project :Pytorch
@File :mixup.py
@IDE :PyCharm
@Author :wjj
@Date :2025/12/14 19:41
@Description: MixUp (with added box visualization)
"""
import cv2
import os
import numpy as np
import random


# ====================== Core function: MixUp image + label fusion ======================
def mix_up_images(img1: np.ndarray, img2: np.ndarray, bboxes1: np.ndarray, bboxes2: np.ndarray,
                  lambda_: float = None) -> tuple[np.ndarray, np.ndarray]:
    """
    Minimal core: MixUp fusion of two images + merged YOLO labels (detection)
    :param img1: first BGR image (H, W, C)
    :param img2: second BGR image (resized to match img1 if needed)
    :param bboxes1: YOLO normalized boxes of image 1, [x,y,w,h], shape=(n,4)
    :param bboxes2: YOLO normalized boxes of image 2, [x,y,w,h], shape=(m,4)
    :param lambda_: blend factor (0~1); if None, drawn randomly from 0.3~0.7
    :return: mixed image, merged YOLO boxes (n+m, 4)
    """
    # 1. Ensure both images share the same size (a MixUp precondition)
    if img1.shape != img2.shape:
        img2 = cv2.resize(img2, (img1.shape[1], img1.shape[0]))

    # 2. Draw the blend factor λ (0.3~0.7 keeps either image from dominating)
    if lambda_ is None:
        lambda_ = random.uniform(0.3, 0.7)

    # 3. Weighted image blend (core: img1*λ + img2*(1-λ))
    img1_float = img1.astype(np.float32)
    img2_float = img2.astype(np.float32)
    img_mix = lambda_ * img1_float + (1 - lambda_) * img2_float
    img_mix = np.clip(img_mix, 0, 255).astype(np.uint8)

    # 4. Merge labels (detection: keep every box from both images, no weighting)
    # Classification needs weighted class probabilities; detection simply concatenates boxes
    bboxes_mix = np.vstack([bboxes1, bboxes2]) if len(bboxes1) > 0 and len(bboxes2) > 0 else (
        bboxes1 if len(bboxes1) > 0 else bboxes2)

    return img_mix, bboxes_mix


# ====================== New: visualize the MixUp result (with boxes) ======================
def visualize_mix_up(img_mix: np.ndarray, bboxes_mix: np.ndarray, class_ids: list, save_path: str):
    """
    Draw the merged YOLO boxes onto the MixUp image and save the visualization
    :param img_mix: blended BGR image
    :param bboxes_mix: merged YOLO normalized boxes [x,y,w,h]
    :param class_ids: class ID for each box
    :param save_path: where to save the visualization
    """
    img_vis = img_mix.copy()
    h, w = img_vis.shape[:2]

    # Box colors per class (distinguishes img1's boxes from img2's)
    class_colors = {0: (0, 255, 0), 1: (0, 0, 255)}  # class0 green, class1 red

    # Draw every box onto the image
    for cls, box in zip(class_ids, bboxes_mix):
        # YOLO normalized box -> pixel coords
        x_center, y_center, box_w, box_h = box
        x_center_pix = x_center * w
        y_center_pix = y_center * h
        box_w_pix = box_w * w
        box_h_pix = box_h * h

        # Top-left / bottom-right corners of the box
        x1 = int(x_center_pix - box_w_pix / 2)
        y1 = int(y_center_pix - box_h_pix / 2)
        x2 = int(x_center_pix + box_w_pix / 2)
        y2 = int(y_center_pix + box_h_pix / 2)

        # Draw the rectangle + class caption
        color = class_colors.get(cls, (255, 0, 0))  # default blue
        cv2.rectangle(img_vis, (x1, y1), (x2, y2), color, 2)
        cv2.putText(img_vis, f"cls{cls}", (x1, y1 - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

    # Save the visualization
    cv2.imwrite(save_path, img_vis)
    print(f"MixUp visualization with boxes saved: {save_path}")


# ====================== Minimal YOLO label load/save ======================
def load_yolo_bboxes(label_path: str) -> np.ndarray:
    """Load only the YOLO normalized boxes"""
    bboxes = []
    if os.path.exists(label_path):
        with open(label_path) as f:
            for line in f:
                parts = line.strip().split()
                if len(parts) >= 5:
                    bboxes.append([float(x) for x in parts[1:5]])
    return np.array(bboxes, dtype=np.float32)


def save_yolo_bboxes(label_path: str, class_ids: list, bboxes: np.ndarray):
    """Save the MixUp labels (classes + boxes merged from both images)"""
    # class_ids must match bboxes in length (e.g. img1's boxes -> class0, img2's -> class1)
    with open(label_path, "w") as f:
        for cls, box in zip(class_ids, bboxes):
            f.write(f"{cls} {box[0]:.6f} {box[1]:.6f} {box[2]:.6f} {box[3]:.6f}\n")


# ====================== Usage example ======================
if __name__ == "__main__":
    # 1. Configure two image/label paths (MixUp needs at least two images)
    IMG1_PATH = r"C:\Users\Excub\workspace\Pytorch\12.08_cv\jjy.jpg"
    LABEL1_PATH = r"C:\Users\Excub\workspace\Pytorch\12.08_cv\out\jjy\jjy.txt"  # class 0
    IMG2_PATH = r"C:\Users\Excub\workspace\Pytorch\12.08_cv\jjy1.jpg"
    LABEL2_PATH = r"C:\Users\Excub\workspace\Pytorch\12.08_cv\out\jjy\jjy1.txt"  # class 1
    SAVE_DIR = r"C:\Users\Excub\workspace\Pytorch\12.08_cv\out"

    # 2. Load both images + labels
    img1 = cv2.imread(IMG1_PATH)
    img2 = cv2.imread(IMG2_PATH)
    bboxes1 = load_yolo_bboxes(LABEL1_PATH)
    bboxes2 = load_yolo_bboxes(LABEL2_PATH)

    # 3. Core call: MixUp fusion
    img_mix, bboxes_mix = mix_up_images(img1, img2, bboxes1, bboxes2, lambda_=0.5)

    # 4. Build the merged class IDs (img1's boxes -> class0, img2's -> class1)
    class_ids = [0] * len(bboxes1) + [1] * len(bboxes2)

    # 5. Save results (plain blend + visualization with boxes)
    os.makedirs(SAVE_DIR, exist_ok=True)
    # Plain blended image
    mix_img_path = f"{SAVE_DIR}/mix_up_result.jpg"
    cv2.imwrite(mix_img_path, img_mix)
    # Visualization with boxes
    vis_img_path = f"{SAVE_DIR}/mix_up_result_with_box.jpg"
    visualize_mix_up(img_mix, bboxes_mix, class_ids, vis_img_path)

    # 6. Save the label
    save_yolo_bboxes(f"{SAVE_DIR}/mix_up_result.txt", class_ids, bboxes_mix)

    print(f"\nMixUp done! Blend factor λ=0.5, merged box count: {len(bboxes_mix)}")
    print("Generated files:")
    print(f"1. Blended image: {mix_img_path}")
    print(f"2. Visualization with boxes: {vis_img_path}")
    print(f"3. Merged label file: {SAVE_DIR}/mix_up_result.txt")
    print("\nKey points:")
    print("1. Purpose: better generalization, less overfitting (especially small industrial datasets);")
    print("2. Constraints: both images must share a size (auto-resize handles it); λ in 0.3~0.7 works best;")
    print("3. Labels: detection concatenates boxes; classification must weight class probabilities;")
    print("4. Visualization: class0 (img1) boxes are green, class1 (img2) red, to tell the sources apart;")
    print("5. Vs. plain blending: MixUp uses a random λ and fuses labels too; it is not simple stitching.")


2. Mosaic Data Augmentation

Essence: combine four images into one new image under "random sampling + policy constraints": a minimum retained-crop-area requirement keeps the effective box-retention rate high. In practice, randomly pick four images from the dataset and stitch them together, which raises the box density of each training sample. (The fixed-position variant below never truncates boxes; a retention-rate sketch for random crops follows its code.)

Python:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@Project :Pytorch
@File :mosaic.py
@IDE :PyCharm
@Author :wjj
@Date :2025/12/14 22:00
@Description: Mosaic augmentation: 4 images at fixed positions + all boxes kept (fixes a NameError)
"""
import cv2
import os
import numpy as np


# ====================== Per-image preprocessing: resize + box coordinate conversion ======================
def process_single_image(img_path, label_path, target_region_size):
    """
    Process one image:
    1. resize to the target region size (1/4 of the final image)
    2. convert its boxes to pixel coordinates within that region
    3. return: processed image, region-local box pixel coords, class IDs
    """
    region_w, region_h = target_region_size
    # Load the image and resize to the region size
    img = cv2.imread(img_path)
    if img is None:
        img = np.ones((region_h, region_w, 3), dtype=np.uint8) * 128  # gray filler
    img = cv2.resize(img, (region_w, region_h))

    # Load the label and convert to region-local pixel coordinates
    box_pix = []  # pixel coords within the region: [x1, y1, x2, y2]
    class_ids = []
    if os.path.exists(label_path):
        with open(label_path, "r") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                parts = line.split()
                try:
                    if len(parts) >= 5:
                        cls_id = int(parts[0])
                        # normalized coords -> region-local pixel coords
                        xc, yc, bw, bh = [float(p) for p in parts[1:5]]
                        x1 = int((xc - bw / 2) * region_w)
                        y1 = int((yc - bh / 2) * region_h)
                        x2 = int((xc + bw / 2) * region_w)
                        y2 = int((yc + bh / 2) * region_h)
                        # keep the box inside the region
                        x1 = max(0, x1)
                        y1 = max(0, y1)
                        x2 = min(region_w - 1, x2)
                        y2 = min(region_h - 1, y2)
                        box_pix.append([x1, y1, x2, y2])
                        class_ids.append(cls_id)
                except (ValueError, IndexError):
                    print(f"Warning: invalid line in label file {label_path}: {line}")
                    continue
    return img, box_pix, class_ids


# ====================== Core Mosaic stitching: 4 fixed positions + all boxes kept ======================
def fixed_position_mosaic(img_paths, label_paths, target_size=(640, 640)):
    """
    Fixed-position Mosaic:
    - image 0 -> top-left region     (0~w/2, 0~h/2)
    - image 1 -> top-right region    (w/2~w, 0~h/2)
    - image 2 -> bottom-left region  (0~w/2, h/2~h)
    - image 3 -> bottom-right region (w/2~w, h/2~h)
    Every image's boxes are kept and converted to final-image normalized coords
    """
    assert len(img_paths) == 4 and len(label_paths) == 4, "Exactly 4 images/labels are required"
    final_w, final_h = target_size
    region_w = final_w // 2
    region_h = final_h // 2

    # Step 1: create the final Mosaic canvas
    mosaic_img = np.zeros((final_h, final_w, 3), dtype=np.uint8)

    # Step 2: process each image, paste it, and convert its boxes to the final image
    all_final_boxes = []  # normalized boxes in the final image
    all_class_ids = []  # class IDs in the final image
    box_counts_per_img = [0, 0, 0, 0]  # box count per source image

    # Image 0 -> top-left region
    img0, boxes0, cls0 = process_single_image(img_paths[0], label_paths[0], (region_w, region_h))
    mosaic_img[0:region_h, 0:region_w] = img0
    box_counts_per_img[0] = len(boxes0)
    # Convert its boxes to final-image coordinates
    for (x1, y1, x2, y2), cls in zip(boxes0, cls0):
        xc = (x1 + x2) / 2 / final_w
        yc = (y1 + y2) / 2 / final_h
        bw = (x2 - x1) / final_w
        bh = (y2 - y1) / final_h
        all_final_boxes.append([xc, yc, bw, bh])
        all_class_ids.append(cls)

    # Image 1 -> top-right region
    img1, boxes1, cls1 = process_single_image(img_paths[1], label_paths[1], (region_w, region_h))
    mosaic_img[0:region_h, region_w:final_w] = img1
    box_counts_per_img[1] = len(boxes1)
    # Convert its boxes (shift x by region_w)
    for (x1, y1, x2, y2), cls in zip(boxes1, cls1):
        x1_final = x1 + region_w
        x2_final = x2 + region_w
        xc = (x1_final + x2_final) / 2 / final_w
        yc = (y1 + y2) / 2 / final_h
        bw = (x2_final - x1_final) / final_w
        bh = (y2 - y1) / final_h
        all_final_boxes.append([xc, yc, bw, bh])
        all_class_ids.append(cls)

    # Image 2 -> bottom-left region
    img2, boxes2, cls2 = process_single_image(img_paths[2], label_paths[2], (region_w, region_h))
    mosaic_img[region_h:final_h, 0:region_w] = img2
    box_counts_per_img[2] = len(boxes2)
    # Convert its boxes (shift y by region_h)
    for (x1, y1, x2, y2), cls in zip(boxes2, cls2):
        y1_final = y1 + region_h
        y2_final = y2 + region_h
        xc = (x1 + x2) / 2 / final_w
        yc = (y1_final + y2_final) / 2 / final_h
        bw = (x2 - x1) / final_w
        bh = (y2_final - y1_final) / final_h
        all_final_boxes.append([xc, yc, bw, bh])
        all_class_ids.append(cls)

    # Image 3 -> bottom-right region
    img3, boxes3, cls3 = process_single_image(img_paths[3], label_paths[3], (region_w, region_h))
    mosaic_img[region_h:final_h, region_w:final_w] = img3
    box_counts_per_img[3] = len(boxes3)
    # Convert its boxes (shift x by region_w and y by region_h)
    for (x1, y1, x2, y2), cls in zip(boxes3, cls3):
        x1_final = x1 + region_w
        x2_final = x2 + region_w
        y1_final = y1 + region_h
        y2_final = y2 + region_h
        xc = (x1_final + x2_final) / 2 / final_w
        yc = (y1_final + y2_final) / 2 / final_h
        bw = (x2_final - x1_final) / final_w
        bh = (y2_final - y1_final) / final_h
        all_final_boxes.append([xc, yc, bw, bh])
        all_class_ids.append(cls)

    # Convert to a numpy array
    all_final_boxes = np.array(all_final_boxes, dtype=np.float32)
    return mosaic_img, all_final_boxes, all_class_ids, box_counts_per_img


# ====================== Visualization: draw all boxes ======================
def visualize_mosaic_with_all_boxes(mosaic_img, boxes, class_ids, box_counts_per_img, save_path):
    """
    Fixes the NameError and tallies each source image's boxes correctly
    :param mosaic_img: final stitched image
    :param boxes: normalized coords of all boxes
    :param class_ids: class IDs of all boxes
    :param box_counts_per_img: box count per source image [img0, img1, img2, img3]
    :param save_path: where to save the visualization
    """
    img_vis = mosaic_img.copy()
    final_h, final_w = img_vis.shape[:2]
    # A different box color per source image
    colors = [
        (0, 255, 0),  # image 0 (top-left) -> green
        (0, 0, 255),  # image 1 (top-right) -> red
        (255, 0, 0),  # image 2 (bottom-left) -> blue
        (255, 255, 0)  # image 3 (bottom-right) -> yellow
    ]

    # Iterate boxes grouped by source image
    box_idx = 0
    for img_idx in range(4):
        color = colors[img_idx]
        # Box count of the current source image
        curr_box_num = box_counts_per_img[img_idx]
        if curr_box_num == 0:
            continue
        # Draw each of its boxes
        for _ in range(curr_box_num):
            if box_idx >= len(boxes):
                break
            box = boxes[box_idx]
            cls = class_ids[box_idx]
            # Normalized -> pixel coords
            xc, yc, bw, bh = box
            x1 = int((xc - bw / 2) * final_w)
            y1 = int((yc - bh / 2) * final_h)
            x2 = int((xc + bw / 2) * final_w)
            y2 = int((yc + bh / 2) * final_h)
            # Draw the box and its caption
            cv2.rectangle(img_vis, (x1, y1), (x2, y2), color, 2)
            cv2.putText(img_vis, f"img{img_idx}_cls{cls}", (x1, y1 - 5),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
            box_idx += 1

    # Save the visualization
    cv2.imwrite(save_path, img_vis)
    print(f"Mosaic image with all boxes saved: {save_path}")
    print(
        f"Boxes per image: img0={box_counts_per_img[0]}, img1={box_counts_per_img[1]}, img2={box_counts_per_img[2]}, img3={box_counts_per_img[3]}")


# ====================== Save YOLO labels ======================
def save_yolo_labels(label_path, class_ids, boxes):
    with open(label_path, "w") as f:
        for cls, box in zip(class_ids, boxes):
            xc, yc, bw, bh = box
            f.write(f"{cls} {xc:.6f} {yc:.6f} {bw:.6f} {bh:.6f}\n")
    print(f"YOLO label with all boxes saved: {label_path}")


# ====================== Main ======================
if __name__ == "__main__":
    # 1. Configure the 4 image/label paths (replace with your own)
    IMG_PATHS = [
        r"C:\Users\Excub\workspace\Pytorch\12.08_cv\jjy.jpg",
        r"C:\Users\Excub\workspace\Pytorch\12.08_cv\jjy1.jpg",
        r"C:\Users\Excub\workspace\Pytorch\12.08_cv\jjy2.jpg",
        r"C:\Users\Excub\workspace\Pytorch\12.08_cv\jjy3.jpg",
    ]
    LABEL_PATHS = [
        r"C:\Users\Excub\workspace\Pytorch\12.08_cv\out\jjy\jjy.txt",
        r"C:\Users\Excub\workspace\Pytorch\12.08_cv\out\jjy\jjy1.txt",
        r"C:\Users\Excub\workspace\Pytorch\12.08_cv\out\jjy\jjy2.txt",
        r"C:\Users\Excub\workspace\Pytorch\12.08_cv\out\jjy\jjy3.txt"
    ]
    SAVE_DIR = r"C:\Users\Excub\workspace\Pytorch\12.08_cv\out"
    FINAL_SIZE = (640, 640)  # final Mosaic image size

    # 2. Create the save directory
    os.makedirs(SAVE_DIR, exist_ok=True)

    # 3. Run the fixed-position Mosaic (keeps every image and every box)
    mosaic_img, all_boxes, all_classes, box_counts = fixed_position_mosaic(IMG_PATHS, LABEL_PATHS, FINAL_SIZE)

    # 4. Save the plain Mosaic image
    mosaic_img_path = os.path.join(SAVE_DIR, "mosaic_fixed_position1.jpg")
    cv2.imwrite(mosaic_img_path, mosaic_img)
    print(f"Plain Mosaic image saved: {mosaic_img_path}")

    # 5. Save the visualization with all boxes (fixed argument passing)
    vis_img_path = os.path.join(SAVE_DIR, "mosaic_with_all_boxes1.jpg")
    visualize_mosaic_with_all_boxes(mosaic_img, all_boxes, all_classes, box_counts, vis_img_path)

    # 6. Save the YOLO label containing all boxes
    label_path = os.path.join(SAVE_DIR, "mosaic_all_boxes1.txt")
    save_yolo_labels(label_path, all_classes, all_boxes)

    # 7. Final stats
    print(f"\n=== Fixed-position Mosaic augmentation done ===")
    print(f"Final image size: {FINAL_SIZE}")
    print(f"Total boxes: {len(all_boxes)} (all boxes from the 4 images kept)")
    print(f"Save directory: {SAVE_DIR}")
    print(f"Core properties:")
    print(f"1. Image 0 -> top-left, 1 -> top-right, 2 -> bottom-left, 3 -> bottom-right; positions fixed;")
    print(f"2. Every image's boxes are converted to final-image coordinates, none dropped;")
    print(f"3. Each source image's boxes get their own color in the visualization, for easy checking;")
    print(f"4. The label file contains all boxes and is directly usable for YOLO training.")

3. paste_img1box_to_img2 Data Augmentation

Essence: crop the pixel region under one of img1's boxes, paste it at a chosen position in img2, and convert that box from img1's normalized coordinates to img2's, producing a new label.

Step | Core operation | Formula / logic
1. Parse box | normalized → pixel coords | x1 = (xc - bw/2)*w1, y1 = (yc - bh/2)*h1
2. Crop region | crop the box region from img1 | crop_img = img1[y1:y2, x1:x2]
3. Paste position | avoid going out of bounds | paste_x ∈ [0, w2-crop_w], paste_y ∈ [0, h2-crop_h]
4. Paste image | overwrite the region in img2 | img2[paste_y:paste_y+crop_h, paste_x:paste_x+crop_w] = crop_img
5. Convert coords | pixel → normalized (img2) | new_xc = (paste_x + paste_x + crop_w) / 2 / w2
Python:
import cv2
import numpy as np

def paste_img1box_to_img2(
    img1_path: str,    # source image (the one to crop a box from)
    img2_path: str,    # target image (the one to paste onto)
    label1_path: str,  # source label path (contains the box info)
    box_idx: int = 0,  # which box of the source image to use (default: the first)
    paste_pos: tuple = None,  # paste position (x, y); None = random
    save_path: str = "paste_result.jpg",  # result image path
    label_save_path: str = "paste_result.txt"  # new label path
):
    """
    Paste the chosen box region of img1 onto img2 and write a new label
    (the box is converted to img2's normalized coordinates)
    :return: pasted image, new normalized box, class ID
    """
    # ========== Step 1: read and preprocess the images ==========
    # Read source img1 and target img2
    img1 = cv2.imread(img1_path)
    img2 = cv2.imread(img2_path)
    if img1 is None:
        raise ValueError(f"Failed to read source image {img1_path}!")
    if img2 is None:
        raise ValueError(f"Failed to read target image {img2_path}!")
    h1, w1 = img1.shape[:2]  # source size
    h2, w2 = img2.shape[:2]  # target size

    # ========== Step 2: parse img1's box (normalized -> pixel coords) ==========
    box1_norm = None  # source box, normalized [xc, yc, bw, bh]
    box1_pix = None   # source box, pixels [x1, y1, x2, y2]
    cls1 = None       # source box class ID

    with open(label1_path, "r") as f:
        lines = [line.strip() for line in f if line.strip()]
        if box_idx >= len(lines):
            raise IndexError(f"Source label has only {len(lines)} boxes, cannot pick index {box_idx}!")
        # Parse the chosen box
        parts = lines[box_idx].split()
        if len(parts) < 5:
            raise ValueError(f"Malformed source label line: {lines[box_idx]}")
        cls1 = int(parts[0])
        xc, yc, bw, bh = [float(p) for p in parts[1:5]]
        # Normalized -> pixel coords (x1,y1 = top-left, x2,y2 = bottom-right)
        x1 = int((xc - bw/2) * w1)
        y1 = int((yc - bh/2) * h1)
        x2 = int((xc + bw/2) * w1)
        y2 = int((yc + bh/2) * h1)
        # Keep the box inside img1
        x1 = max(0, x1)
        y1 = max(0, y1)
        x2 = min(w1-1, x2)
        y2 = min(h1-1, y2)
        box1_norm = [xc, yc, bw, bh]
        box1_pix = [x1, y1, x2, y2]

    # ========== Step 3: crop the box region from img1 ==========
    crop_img = img1[y1:y2, x1:x2]  # pixel region under the box
    crop_h, crop_w = crop_img.shape[:2]
    if crop_h == 0 or crop_w == 0:
        raise ValueError(f"Empty crop from source box! Pixel coords: {box1_pix}")

    # ========== Step 4: choose the paste position on img2 (stay in bounds) ==========
    if paste_pos is None:
        # Random position (the crop must fit entirely inside img2)
        paste_x = np.random.randint(0, w2 - crop_w)
        paste_y = np.random.randint(0, h2 - crop_h)
    else:
        paste_x, paste_y = paste_pos
        # Clamp out-of-bounds positions
        paste_x = max(0, min(paste_x, w2 - crop_w))
        paste_y = max(0, min(paste_y, h2 - crop_h))

    # ========== Step 5: paste the crop onto img2 ==========
    img2_pasted = img2.copy()
    img2_pasted[paste_y:paste_y+crop_h, paste_x:paste_x+crop_w] = crop_img

    # ========== Step 6: convert the box to img2's normalized coords ==========
    # Pasted box in img2 pixel coords
    new_x1 = paste_x
    new_y1 = paste_y
    new_x2 = paste_x + crop_w
    new_y2 = paste_y + crop_h
    # Pixel -> normalized coords
    new_xc = (new_x1 + new_x2) / 2 / w2
    new_yc = (new_y1 + new_y2) / 2 / h2
    new_bw = (new_x2 - new_x1) / w2
    new_bh = (new_y2 - new_y1) / h2
    new_box_norm = [new_xc, new_yc, new_bw, new_bh]

    # ========== Step 7: save the results ==========
    # Save the pasted image
    cv2.imwrite(save_path, img2_pasted)
    print(f"Pasted image saved: {save_path}")
    # Save the new label (format: cls xc yc bw bh)
    with open(label_save_path, "w") as f:
        f.write(f"{cls1} {new_xc:.6f} {new_yc:.6f} {new_bw:.6f} {new_bh:.6f}\n")
    print(f"New label saved: {label_save_path}")

    return img2_pasted, new_box_norm, cls1

# ------------------- Usage example -------------------
if __name__ == "__main__":
    # Configure paths
    IMG1_PATH = r"C:\Users\Excub\workspace\Pytorch\12.08_cv\jjy2.jpg"    # source image (to crop from)
    IMG2_PATH = r"C:\Users\Excub\workspace\Pytorch\12.08_cv\jjy1.jpg"     # target image (background to paste onto)
    LABEL1_PATH = r"C:\Users\Excub\workspace\Pytorch\12.08_cv\out\jjy\jjy2.txt"  # source label
    SAVE_PATH = r"C:\Users\Excub\workspace\Pytorch\12.08_cv\out\paste_result.jpg"
    LABEL_SAVE_PATH = r"C:\Users\Excub\workspace\Pytorch\12.08_cv\out\paste_label.txt"

    # Paste img1's box #0 onto img2 at position (100, 100)
    try:
        pasted_img, new_box, cls = paste_img1box_to_img2(
            img1_path=IMG1_PATH,
            img2_path=IMG2_PATH,
            label1_path=LABEL1_PATH,
            box_idx=0,          # use img1's box #0
            paste_pos=(100, 100),  # paste at (100, 100) in img2
            save_path=SAVE_PATH,
            label_save_path=LABEL_SAVE_PATH
        )
        print(f"Paste done!")
        print(f"New normalized box: xc={new_box[0]:.6f}, yc={new_box[1]:.6f}, bw={new_box[2]:.6f}, bh={new_box[3]:.6f}")
        print(f"Box class: {cls}")
    except Exception as e:
        print(f"Execution failed: {e}")

3. Local Enhancement / Cropping Data Augmentation Collection

1. sliding_crop

Essence: slide a fixed-size window across a large image at a fixed stride to crop multiple sub-images, converting the box coordinates into each sub-image; a common way to expand detection data and handle oversized images.

"Traverse the large image with a fixed window + stride → crop sub-images → convert the original boxes to sub-image coordinates"

Python:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@Project :Pytorch
@File :sliding_crop.py
@IDE :PyCharm
@Author :wjj
@Date :2025/12/15 16:43
@Description: Sliding Crop (Fix encoding & window calculation)
"""
import cv2
import numpy as np
import os
from typing import List, Tuple, Dict


def sliding_crop(
        img_path: str,
        label_path: str,
        crop_size: Tuple[int, int] = (640, 640),  # Crop window size (w, h)
        stride: Tuple[int, int] = (320, 320),  # Sliding stride (x, y)
        save_dir: str = "sliding_crop_album",  # Album save dir
        keep_partial_box: bool = True,  # Keep box if center in window
        filter_empty: bool = True,  # Filter sub-img without box
        class_names: List[str] = None  # Class names (optional)
) -> List[Dict]:
    """
    Sliding crop large image to sub-images, convert box coords to sub-img normalized coords
    :return: Crop results list (sub-img path, label path, box num, etc.)
    """
    # ========== Initialization ==========
    os.makedirs(save_dir, exist_ok=True)
    crop_w, crop_h = crop_size
    stride_x, stride_y = stride
    result_list = []

    # ========== Read image & label ==========
    # Read large image
    img = cv2.imread(img_path)
    if img is None:
        raise ValueError(f"Failed to read image: {img_path}!")
    img_h, img_w = img.shape[:2]

    # Auto adjust oversize crop window
    if crop_w >= img_w or crop_h >= img_h:
        print(
            f"Warning: Crop window ({crop_w}x{crop_h}) ≥ image size ({img_w}x{img_h}), auto adjust to 1/2 of image size")
        crop_w = img_w // 2
        crop_h = img_h // 2
        stride_x = crop_w // 2
        stride_y = crop_h // 2
        print(f"Adjusted: Crop window ({crop_w}x{crop_h}), stride ({stride_x}x{stride_y})")

    # Parse label (normalized → pixel coords, GBK encoding for Windows)
    original_boxes = []
    try:
        with open(label_path, "r", encoding='gbk') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                parts = line.split()
                if len(parts) < 5:
                    print(f"Warning: Invalid label line {line}, skip")
                    continue
                try:
                    cls_id = int(parts[0])
                    xc, yc, bw, bh = [float(p) for p in parts[1:5]]
                    # Normalized → pixel coords
                    x1 = int((xc - bw / 2) * img_w)
                    y1 = int((yc - bh / 2) * img_h)
                    x2 = int((xc + bw / 2) * img_w)
                    y2 = int((yc + bh / 2) * img_h)
                    # Fix out-of-bounds
                    x1 = max(0, x1)
                    y1 = max(0, y1)
                    x2 = min(img_w - 1, x2)
                    y2 = min(img_h - 1, y2)
                    original_boxes.append((x1, y1, x2, y2, cls_id))
                except Exception as e:
                    print(f"Failed to parse label line {line}: {e}")
                    continue
    except FileNotFoundError:
        raise FileNotFoundError(f"Label file {label_path} not found!")

    # ========== Calculate sliding window positions ==========
    # X direction starts
    x_starts = []
    x = 0
    while x + crop_w <= img_w:
        x_starts.append(x)
        x += stride_x
    if x_starts and x_starts[-1] + crop_w < img_w:
        x_starts.append(img_w - crop_w)
    if not x_starts:
        x_starts = [0]

    # Y direction starts
    y_starts = []
    y = 0
    while y + crop_h <= img_h:
        y_starts.append(y)
        y += stride_y
    if y_starts and y_starts[-1] + crop_h < img_h:
        y_starts.append(img_h - crop_h)
    if not y_starts:
        y_starts = [0]

    # ========== Crop sub-images ==========
    crop_idx = 0
    for y_start in y_starts:
        for x_start in x_starts:
            x_end = x_start + crop_w
            y_end = y_start + crop_h

            # Crop sub-image
            sub_img = img[y_start:y_end, x_start:x_end]
            sub_img_h, sub_img_w = sub_img.shape[:2]
            if sub_img_h != crop_h or sub_img_w != crop_w:
                # Pad black border to keep size
                sub_img = cv2.copyMakeBorder(
                    sub_img, 0, crop_h - sub_img_h, 0, crop_w - sub_img_w,
                    cv2.BORDER_CONSTANT, value=(0, 0, 0)
                )

            # Filter boxes in current window
            sub_boxes = []
            for (bx1, by1, bx2, by2, cls_id) in original_boxes:
                box_cx = (bx1 + bx2) / 2
                box_cy = (by1 + by2) / 2
                if keep_partial_box:
                    in_window = (x_start <= box_cx <= x_end) and (y_start <= box_cy <= y_end)
                else:
                    in_window = (x_start <= bx1) and (bx2 <= x_end) and (y_start <= by1) and (by2 <= y_end)
                if not in_window:
                    continue

                # Convert to sub-image normalized coords
                sub_bx1 = bx1 - x_start
                sub_by1 = by1 - y_start
                sub_bx2 = bx2 - x_start
                sub_by2 = by2 - y_start
                # Fix out-of-bounds in sub-image
                sub_bx1 = max(0, sub_bx1)
                sub_by1 = max(0, sub_by1)
                sub_bx2 = min(crop_w - 1, sub_bx2)
                sub_by2 = min(crop_h - 1, sub_by2)
                # Pixel → normalized
                sub_xc = (sub_bx1 + sub_bx2) / 2 / crop_w
                sub_yc = (sub_by1 + sub_by2) / 2 / crop_h
                sub_bw = (sub_bx2 - sub_bx1) / crop_w
                sub_bh = (sub_by2 - sub_by1) / crop_h
                sub_boxes.append((sub_xc, sub_yc, sub_bw, sub_bh, cls_id))

            # Filter empty sub-images
            if filter_empty and len(sub_boxes) == 0:
                continue

            # Save sub-image and label
            sub_img_name = f"crop_{crop_idx:04d}_x{x_start}_y{y_start}.jpg"
            sub_label_name = f"crop_{crop_idx:04d}_x{x_start}_y{y_start}.txt"
            sub_img_path = os.path.join(save_dir, sub_img_name)
            sub_label_path = os.path.join(save_dir, sub_label_name)

            cv2.imwrite(sub_img_path, sub_img)
            # Save label with GBK encoding
            with open(sub_label_path, "w", encoding='gbk') as f:
                for (xc, yc, bw, bh, cls_id) in sub_boxes:
                    f.write(f"{cls_id} {xc:.6f} {yc:.6f} {bw:.6f} {bh:.6f}\n")

            # Record result
            result_info = {
                "crop_idx": crop_idx,
                "window_pos": (x_start, y_start, x_end, y_end),
                "sub_img_path": sub_img_path,
                "sub_label_path": sub_label_path,
                "box_num": len(sub_boxes)
            }
            result_list.append(result_info)
            print(f"Crop completed {sub_img_path} | Box count: {len(sub_boxes)}")
            crop_idx += 1

    # ========== Generate summary file (GBK encoding) ==========
    album_summary_path = os.path.join(save_dir, "sliding_crop_summary.txt")
    with open(album_summary_path, "w", encoding='gbk') as f:
        f.write("Sliding Crop Album Summary\n")
        f.write(f"Original image path: {img_path}\n")
        f.write(f"Original image size: {img_w}x{img_h}\n")
        f.write(f"Crop window size: {crop_w}x{crop_h}\n")
        f.write(f"Sliding stride: {stride_x}x{stride_y}\n")
        f.write(f"Generated sub-images: {len(result_list)}\n")
        f.write("-" * 50 + "\n")
        for info in result_list:
            f.write(
                f"Index {info['crop_idx']} | Window pos {info['window_pos']} | Sub-img {info['sub_img_path']} | Box count {info['box_num']}\n")

    print(f"\nSliding crop completed!")
    print(f"Album save directory: {save_dir}")
    print(f"Generated sub-images count: {len(result_list)}")
    print(f"Summary file: {album_summary_path}")

    return result_list


# ------------------- Test Example -------------------
if __name__ == "__main__":
    # Config
    IMG_PATH = r"C:\Users\Excub\workspace\Pytorch\12.08_cv\jjy1.jpg"
    LABEL_PATH = r"C:\Users\Excub\workspace\Pytorch\12.08_cv\out\jjy\jjy1.txt"
    SAVE_DIR = r"C:\Users\Excub\workspace\Pytorch\12.08_cv\out\sliding_crop_album"
    CROP_SIZE = (640, 640)  # Reasonable size for model input
    STRIDE = (320, 320)  # 50% overlap

    # Run sliding crop
    try:
        crop_results = sliding_crop(
            img_path=IMG_PATH,
            label_path=LABEL_PATH,
            crop_size=CROP_SIZE,
            stride=STRIDE,
            save_dir=SAVE_DIR,
            keep_partial_box=True,
            filter_empty=True
        )
    except Exception as e:
        print(f"Sliding crop failed: {e}")

Core effect of turning 1 original image into 4 sub-images:

The cropped sub-images deliver real augmentation (a window-count sketch follows this list):

  1. Sample expansion: 1 original image → 4 sub-images directly enlarges the training set and lowers the overfitting risk;
  2. Positional diversity: a target's relative position differs between the sub-images and the original (center of the original vs. top-left / bottom-right of a sub-image), so the model learns that targets can appear anywhere and stops overfitting to position;
  3. Target focus: when a target is small in the original, it fills a larger share of the cropped sub-image, making its detail features easier to capture;
  4. Truncation robustness: with 50% overlap, a target straddling window borders is partially kept in several sub-images (half in sub-image A, half in B), so the model learns truncated-target features and detection becomes more robust.
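
A quick worked example of the window arithmetic used by sliding_crop. The helper mirrors the x_starts/y_starts loops above; the 1280x960 image size is a hypothetical example:

Python:
def window_starts(length, win, stride):
    starts, pos = [], 0
    while pos + win <= length:       # same rule as the x_starts/y_starts loops above
        starts.append(pos)
        pos += stride
    if starts and starts[-1] + win < length:
        starts.append(length - win)  # extra window flush with the border
    return starts or [0]

# e.g. a hypothetical 1280x960 image, 640x640 window, 320 stride:
# x: [0, 320, 640], y: [0, 320]  ->  3 x 2 = 6 sub-images
print(window_starts(1280, 640, 320), window_starts(960, 640, 320))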

2. copy_paste

Essence: copy targets of a specified class within the same image and paste them into randomly chosen empty regions.

Supports flip / rotation / scaling of each copy, and a mask keeps pasted targets from overlapping existing ones.

Python:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@Project :Pytorch
@File :copy_paste.py
@IDE :PyCharm
@Author :wjj
@Date :2025/12/15 17:12
@Description:
"""
import cv2
import numpy as np
import os
import random
from typing import List, Tuple


def copy_paste(
        img_path: str,
        label_path: str,
        target_cls: int,  # class ID of the target to copy (e.g. 0)
        paste_num: int = 3,  # number of copies to paste (default 3)
        save_path: str = "copy_paste_aug.jpg",
        new_label_path: str = "copy_paste_aug.txt",
        aug_prob: float = 0.5,  # probability of augmenting a copy (flip/rotate/scale)
        scale_range: Tuple = (0.8, 1.2),  # scaling range
        rotate_range: Tuple = (-15, 15)  # rotation range (±15°)
):
    """
    Copy targets of one class within the same image and paste them into empty
    regions (no overlap), with optional per-copy augmentation
    :param target_cls: class ID of the rare target to augment
    :param paste_num: how many copies to paste
    """
    # ========== 1. Read the image and label ==========
    # Read the source image
    img = cv2.imread(img_path)
    if img is None:
        raise ValueError(f"Failed to read image: {img_path}")
    img_h, img_w = img.shape[:2]
    img_copy = img.copy()  # canvas to paste onto

    # Parse the label: separate classes + convert to pixel coords
    original_boxes = []  # all targets: [(x1,y1,x2,y2,cls_id), ...]
    target_boxes = []  # targets of the chosen class: [(x1,y1,x2,y2), ...]

    with open(label_path, "r", encoding='gbk') as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) < 5:
                continue
            cls_id = int(parts[0])
            xc, yc, bw, bh = [float(p) for p in parts[1:5]]
            # Normalized -> pixel coords
            x1 = int((xc - bw / 2) * img_w)
            y1 = int((yc - bh / 2) * img_h)
            x2 = int((xc + bw / 2) * img_w)
            y2 = int((yc + bh / 2) * img_h)
            original_boxes.append((x1, y1, x2, y2, cls_id))
            # Keep only the chosen class
            if cls_id == target_cls:
                target_boxes.append((x1, y1, x2, y2))

    # Check: does the chosen class have any targets?
    if len(target_boxes) == 0:
        raise ValueError(f"No target with class ID {target_cls} in the image!")

    # ========== 2. Build the empty-region mask (prevents overlapping pastes) ==========
    # Mask init: 0 = empty, 1 = existing target
    mask = np.zeros((img_h, img_w), dtype=np.uint8)
    for (x1, y1, x2, y2, _) in original_boxes:
        mask[y1:y2, x1:x2] = 1  # mark existing target regions

    # ========== 3. Copy a target of the chosen class and augment it ==========
    # Pick a random template among the chosen class's targets (if there are several)
    src_x1, src_y1, src_x2, src_y2 = random.choice(target_boxes)
    src_crop = img[src_y1:src_y2, src_x1:src_x2]  # pixel region to copy
    src_h, src_w = src_crop.shape[:2]

    # New pasted-target coordinates (for the new label)
    new_boxes = []

    # ========== 4. Paste at random empty positions ==========
    paste_count = 0
    max_attempts = paste_num * 10  # attempt cap (prevents infinite loops)
    attempts = 0

    while paste_count < paste_num and attempts < max_attempts:
        attempts += 1

        # -------- 4.1 Augment the copy (flip/rotate/scale) --------
        aug_crop = src_crop.copy()
        # Random horizontal flip
        if random.random() < aug_prob:
            aug_crop = cv2.flip(aug_crop, 1)
        # Random scaling
        scale = random.uniform(*scale_range)
        aug_w = int(src_w * scale)
        aug_h = int(src_h * scale)
        aug_crop = cv2.resize(aug_crop, (aug_w, aug_h))
        # Random rotation (black border, avoids cropping)
        if random.random() < aug_prob:
            angle = random.uniform(*rotate_range)
            M = cv2.getRotationMatrix2D((aug_w / 2, aug_h / 2), angle, 1)
            aug_crop = cv2.warpAffine(aug_crop, M, (aug_w, aug_h), borderValue=(0, 0, 0))

        # -------- 4.2 Pick a random empty paste position --------
        # The copy must fit entirely inside the image
        paste_x = random.randint(0, img_w - aug_w)
        paste_y = random.randint(0, img_h - aug_h)
        # Check the paste region is empty (mask all zeros)
        paste_mask = mask[paste_y:paste_y + aug_h, paste_x:paste_x + aug_w]
        if np.sum(paste_mask) == 0:  # no overlap, paste allowed
            # Paste the target onto the canvas
            img_copy[paste_y:paste_y + aug_h, paste_x:paste_x + aug_w] = aug_crop
            # Update the mask (now occupied)
            mask[paste_y:paste_y + aug_h, paste_x:paste_x + aug_w] = 1
            # Record the new box (pixel -> normalized)
            new_xc = (paste_x + aug_w / 2) / img_w
            new_yc = (paste_y + aug_h / 2) / img_h
            new_bw = aug_w / img_w
            new_bh = aug_h / img_h
            new_boxes.append((target_cls, new_xc, new_yc, new_bw, new_bh))
            paste_count += 1
            print(f"Pasted copy {paste_count} -> position: ({paste_x}, {paste_y})")

    if paste_count < paste_num:
        print(f"Warning: only {paste_count} copies pasted (remaining positions overlap)")

    # ========== 5. Write the new label (originals + pasted copies) ==========
    with open(new_label_path, "w", encoding='gbk') as f:
        # Original targets
        for (x1, y1, x2, y2, cls_id) in original_boxes:
            xc = (x1 + x2) / 2 / img_w
            yc = (y1 + y2) / 2 / img_h
            bw = (x2 - x1) / img_w
            bh = (y2 - y1) / img_h
            f.write(f"{cls_id} {xc:.6f} {yc:.6f} {bw:.6f} {bh:.6f}\n")
        # Newly pasted targets
        for (cls_id, xc, yc, bw, bh) in new_boxes:
            f.write(f"{cls_id} {xc:.6f} {yc:.6f} {bw:.6f} {bh:.6f}\n")

    # ========== 6. Save the augmented image ==========
    cv2.imwrite(save_path, img_copy)
    print(f"\nCopy-Paste augmentation done!")
    print(f"Augmented image saved to: {save_path}")
    print(f"New label saved to: {new_label_path}")
    print(f"Original targets: {len(original_boxes)} | New targets: {len(new_boxes)}")

    return img_copy, new_boxes


# ------------------- Test example -------------------
if __name__ == "__main__":
    # Configuration (only these 5 items need editing)
    IMG_PATH = r"C:\Users\Excub\workspace\Pytorch\12.08_cv\jjy1.jpg"
    LABEL_PATH = r"C:\Users\Excub\workspace\Pytorch\12.08_cv\out\jjy\jjy1.txt"
    TARGET_CLS = 0  # class ID to augment (adjust to your labels)
    SAVE_PATH = r"C:\Users\Excub\workspace\Pytorch\12.08_cv\out\copy_paste_aug.jpg"
    NEW_LABEL_PATH = r"C:\Users\Excub\workspace\Pytorch\12.08_cv\out\copy_paste_aug.txt"

    # Core call
    try:
        copy_paste(
            img_path=IMG_PATH,
            label_path=LABEL_PATH,
            target_cls=TARGET_CLS,
            paste_num=3,  # paste 3 copies
            save_path=SAVE_PATH,
            new_label_path=NEW_LABEL_PATH
        )
    except Exception as e:
        print(f"Execution failed: {e}")

Implementation breakdown:

Module | Logic
Target filtering | After parsing the labels, keep only boxes of the given target_cls; only that class gets copied (augments rare targets)
Overlap avoidance | Build a mask matrix marking existing target regions; paste only where the mask is all zeros
Target augmentation | Each copy is randomly flipped horizontally (50% probability), scaled (0.8~1.2x), and rotated by a small angle
Coordinate conversion | Pasted pixel coordinates are automatically converted to YOLO normalized coordinates and written to the new label
Robustness | A maximum-attempt cap prevents infinite loops when empty regions run out, and the actual paste count is reported

3. pyr_up / pyr_down

Essence: very simple; scale the original image up or down to create additional samples.

  1. Upsampling (small-target enhancement)

    • Use only 1~2 passes: one pass doubles the target size and sharpens small-target detail; a second pass may blur, so judge by the actual image quality;
    • Scenario: small targets take < 5% of the original image and the detector struggles to capture them.
  2. Downsampling (large-target adaptation)

    • No hard limit (1~3 passes suggested): each pass halves the image, adapting it to the model input size (e.g. 640x640);
    • Scenario: large targets take > 80% of the original image and exceed the model's detection range.
  3. Verification

    • After running the code, open the scaled image and check the drawn boxes still match the targets;
    • Check the new label's normalized coordinates all stay within 0~1 (no out-of-range values); a small checker sketch follows the code below.
Python:
import cv2
import numpy as np

def pyr_scale(
    img_path: str,
    label_path: str,
    scale_type: str = "up",  # "up" enlarge / "down" shrink
    scale_times: int = 1,    # number of passes (1 recommended for "up")
    save_path: str = "pyr_scaled.jpg",
    new_label_path: str = "pyr_scaled.txt"
):
    # 1. Read the source image
    img = cv2.imread(img_path)
    if img is None:
        raise ValueError(f"Failed to read image: {img_path}")
    orig_h, orig_w = img.shape[:2]
    scaled_img = img.copy()

    # 2. Parse the original label (normalized -> pixel coords)
    orig_boxes = []
    with open(label_path, "r", encoding='gbk') as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) < 5: continue
            cls_id = int(parts[0])
            xc, yc, bw, bh = [float(p) for p in parts[1:5]]
            x1 = int((xc - bw/2) * orig_w)
            y1 = int((yc - bh/2) * orig_h)
            x2 = int((xc + bw/2) * orig_w)
            y2 = int((yc + bh/2) * orig_h)
            orig_boxes.append((x1, y1, x2, y2, cls_id))

    # 3. Scale the image (pyrUp/pyrDown)
    scale_factor = 1.0
    for _ in range(scale_times):
        if scale_type == "up":
            scaled_img = cv2.pyrUp(scaled_img)
            scale_factor *= 2
        else:
            scaled_img = cv2.pyrDown(scaled_img)
            scale_factor /= 2
    scaled_h, scaled_w = scaled_img.shape[:2]

    # 4. Scale the boxes and write the new label
    with open(new_label_path, "w", encoding='gbk') as f:
        for (x1, y1, x2, y2, cls_id) in orig_boxes:
            # Scale pixel coords + clamp out-of-bounds values
            new_x1 = max(0, int(x1 * scale_factor))
            new_y1 = max(0, int(y1 * scale_factor))
            new_x2 = min(scaled_w-1, int(x2 * scale_factor))
            new_y2 = min(scaled_h-1, int(y2 * scale_factor))
            # Pixel -> normalized coords
            new_xc = (new_x1 + new_x2)/2 / scaled_w
            new_yc = (new_y1 + new_y2)/2 / scaled_h
            new_bw = (new_x2 - new_x1)/scaled_w
            new_bh = (new_y2 - new_y1)/scaled_h
            f.write(f"{cls_id} {new_xc:.6f} {new_yc:.6f} {new_bw:.6f} {new_bh:.6f}\n")

    # 5. Save the result
    cv2.imwrite(save_path, scaled_img)
    print(f"Done! Original size: {orig_w}x{orig_h} -> scaled: {scaled_w}x{scaled_h}")
    print(f"Scaled image: {save_path} | New label: {new_label_path}")

# ------------------- Minimal verification call -------------------
if __name__ == "__main__":
    # Only these 6 paths/parameters need editing
    IMG_PATH = r"C:\Users\Excub\workspace\Pytorch\12.08_cv\jjy1.jpg"
    LABEL_PATH = r"C:\Users\Excub\workspace\Pytorch\12.08_cv\out\jjy\jjy1.txt"
    SCALE_TYPE = "up"  # test enlarging (use "down" for shrinking)
    SCALE_TIMES = 1    # one pass only (avoids blur)
    SAVE_PATH = r"C:\Users\Excub\workspace\Pytorch\12.08_cv\out\pyr_up.jpg"
    NEW_LABEL_PATH = r"C:\Users\Excub\workspace\Pytorch\12.08_cv\out\pyr_up.txt"

    pyr_scale(
        img_path=IMG_PATH,
        label_path=LABEL_PATH,
        scale_type=SCALE_TYPE,
        scale_times=SCALE_TIMES,
        save_path=SAVE_PATH,
        new_label_path=NEW_LABEL_PATH
    )

The image content matches the original; it is simply enlarged or shrunk.
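
Verification point 3 above suggests checking that every normalized value in the new label stays within 0~1. A small checker sketch (the GBK encoding matches the labels written by the code above):

Python:
def validate_yolo_label(label_path: str) -> bool:
    """Return True if every normalized value in the label file lies in [0, 1]."""
    ok = True
    with open(label_path, "r", encoding="gbk") as f:
        for i, line in enumerate(f, 1):
            parts = line.strip().split()
            if len(parts) < 5:
                continue
            vals = [float(p) for p in parts[1:5]]
            if any(v < 0 or v > 1 for v in vals):
                print(f"Line {i} out of range: {vals}")
                ok = False
    return ok

# validate_yolo_label(NEW_LABEL_PATH)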


4. cut_box

Essence:

"Crop the region around a small target + enlarge it"

  1. Image change

    • Original image → crop the "target + 100 surrounding pixels" region → enlarge 2x; the small target's features are reinforced (much clearer);
    • Compare IMG_PATH (original) with SAVE_PATH (cropped and enlarged) to watch the small target turn from a blurry dot into a clear region.
  2. Label change

    • Old label: coordinates normalized relative to the whole original image;
    • New label: coordinates normalized relative to the cropped-and-enlarged local image; the box's share grows noticeably (the image shrank, so the target's proportion rises).
Python:
import cv2
import numpy as np

def cut_box(
    img_path: str,
    label_path: str,
    target_cls: int = 0,      # class ID of the small target to enhance
    expand_pixel: int = 100,  # pixels to keep around the target (default 100)
    scale_factor: int = 2,    # enlargement factor (reinforces small-target features)
    save_path: str = "cut_box_aug.jpg",
    new_label_path: str = "cut_box_aug.txt"
):
    # 1. Read the source image
    img = cv2.imread(img_path)
    if img is None:
        raise ValueError(f"Failed to read image: {img_path}")
    img_h, img_w = img.shape[:2]

    # 2. Parse the label (normalized -> pixel coords), keep the chosen small-target class
    target_box = None
    with open(label_path, "r", encoding='gbk') as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) < 5: continue
            cls_id = int(parts[0])
            if cls_id != target_cls: continue  # only the chosen class
            xc, yc, bw, bh = [float(p) for p in parts[1:5]]
            # Normalized -> pixel coords
            x1 = int((xc - bw/2) * img_w)
            y1 = int((yc - bh/2) * img_h)
            x2 = int((xc + bw/2) * img_w)
            y2 = int((yc + bh/2) * img_h)
            target_box = (x1, y1, x2, y2)
            break  # only the first match (enough for verification)

    if target_box is None:
        raise ValueError(f"No target with class ID {target_cls} found!")
    x1, y1, x2, y2 = target_box

    # 3. Crop the region around the target (keep expand_pixel pixels)
    crop_x1 = max(0, x1 - expand_pixel)
    crop_y1 = max(0, y1 - expand_pixel)
    crop_x2 = min(img_w - 1, x2 + expand_pixel)
    crop_y2 = min(img_h - 1, y2 + expand_pixel)
    cut_img = img[crop_y1:crop_y2, crop_x1:crop_x2]  # cropped region

    # 4. Enlarge the crop (reinforces small-target features)
    cut_h, cut_w = cut_img.shape[:2]
    scaled_cut = cv2.resize(cut_img, (cut_w * scale_factor, cut_h * scale_factor))
    scaled_h, scaled_w = scaled_cut.shape[:2]

    # 5. Recompute the box coordinates (relative to the enlarged crop)
    rel_x1 = x1 - crop_x1
    rel_y1 = y1 - crop_y1
    rel_x2 = x2 - crop_x1
    rel_y2 = y2 - crop_y1
    # Enlarged pixel coords (for drawing the box)
    new_x1 = int(rel_x1 * scale_factor)
    new_y1 = int(rel_y1 * scale_factor)
    new_x2 = int(rel_x2 * scale_factor)
    new_y2 = int(rel_y2 * scale_factor)
    # Normalized coords (written to the label)
    new_xc = (new_x1 + new_x2) / 2 / scaled_w
    new_yc = (new_y1 + new_y2) / 2 / scaled_h
    new_bw = (new_x2 - new_x1) / scaled_w
    new_bh = (new_y2 - new_y1) / scaled_h

    # ========== New: draw the label box on the enlarged image ==========
    # Red rectangle (line width 2, easy to spot)
    cv2.rectangle(scaled_cut, (new_x1, new_y1), (new_x2, new_y2), (0, 0, 255), 2)
    # Optional: class caption
    cv2.putText(scaled_cut, f"cls_{target_cls}", (new_x1, new_y1-10),
                cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 255), 2)

    # 6. Save the results (image with box + new label)
    cv2.imwrite(save_path, scaled_cut)
    with open(new_label_path, "w", encoding='gbk') as f:
        f.write(f"{target_cls} {new_xc:.6f} {new_yc:.6f} {new_bw:.6f} {new_bh:.6f}\n")

    # Print key info (for comparison)
    print(f"Cropped region: ({crop_x1},{crop_y1}) -> ({crop_x2},{crop_y2})")
    print(f"Crop size: {cut_w}x{cut_h} -> enlarged: {scaled_w}x{scaled_h}")
    print(f"Drawn box: ({new_x1},{new_y1}) -> ({new_x2},{new_y2})")
    print(f"Done! Image with box: {save_path} | New label: {new_label_path}")

# ------------------- Minimal verification call -------------------
if __name__ == "__main__":
    # Only these 6 parameters need editing
    IMG_PATH = r"C:\Users\Excub\workspace\Pytorch\12.08_cv\jjy3.jpg"
    LABEL_PATH = r"C:\Users\Excub\workspace\Pytorch\12.08_cv\out\jjy\jjy3.txt"
    TARGET_CLS = 0  # class ID of the small target to enhance
    EXPAND_PIXEL = 100  # keep 100 pixels around the target
    SAVE_PATH = r"C:\Users\Excub\workspace\Pytorch\12.08_cv\out\cut_box_aug.jpg"
    NEW_LABEL_PATH = r"C:\Users\Excub\workspace\Pytorch\12.08_cv\out\cut_box_aug.txt"

    cut_box(
        img_path=IMG_PATH,
        label_path=LABEL_PATH,
        target_cls=TARGET_CLS,
        expand_pixel=EXPAND_PIXEL,
        save_path=SAVE_PATH,
        new_label_path=NEW_LABEL_PATH
    )

Result image:


Summary:

This chapter follows on from the previous article and summarizes more of the common data augmentation methods, each with a matching example for study and reference. If anything here is wrong, please point it out. Thanks!
