Continuation
With the model itself reproduced in the previous article, what remains is the data preprocessing, training, and inference pipeline; only then is the reproduction truly complete. The project layout is as follows:
CornerNet-Reproduction/
├── models/
│   ├── __init__.py
│   ├── backbone.py          # Hourglass network
│   ├── corner_pool.py       # Corner Pooling implementation
│   ├── cornernet.py         # main network
│   └── loss.py              # loss functions
├── utils/
│   ├── __init__.py
│   ├── image.py             # image processing
│   ├── decode.py            # inference decoding
│   └── visualization.py     # visualization tools
├── data/
│   ├── __init__.py
│   ├── coco.py              # COCO dataset
│   └── transforms.py        # data augmentation
├── configs/
│   └── cornernet_config.py  # configuration file
├── train.py                 # training script
├── test.py                  # testing script
├── demo.py                  # demo script
└── requirements.txt         # dependencies
Data Augmentation
data/transforms.py
(1) Imports, plus the image-normalization mean and std computed on the COCO dataset:
python
import cv2
import numpy as np
import random
import torch
CORNERNET_MEAN = (0.40789654, 0.44719302, 0.47026115) # RGB
CORNERNET_STD = (0.28863828, 0.27408164, 0.27809835)
In fact, torchvision's transforms v2 can already transform the image and its labels synchronously, so writing complex joint image transforms by hand is not strictly necessary. To exercise my own understanding of data augmentation, though, I implemented them from scratch here.
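For reference, a minimal sketch of what the built-in joint transform looks like (assuming torchvision ≥ 0.16; the tv_tensors wrapper is what tells v2 which tensor holds boxes):
python
import torch
from torchvision import tv_tensors
from torchvision.transforms import v2

# wrap the boxes so v2 transforms them together with the image
boxes = tv_tensors.BoundingBoxes(
    torch.tensor([[10.0, 20.0, 50.0, 80.0]]),  # one XYXY box
    format="XYXY",
    canvas_size=(100, 100),  # (height, width) of the image
)
image = torch.randint(0, 256, (3, 100, 100), dtype=torch.uint8)
transform = v2.Compose([v2.RandomHorizontalFlip(p=1.0)])
image, boxes = transform(image, boxes)
print(boxes)  # x-coordinates become [50, 90]: flipped in sync with the image
The hand-rolled equivalents follow, starting with the composition class.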
python
# Compose: chains transforms, applying each to (image, target) in order
class Compose:
    def __init__(self, transforms, strict=True):
        self.transforms = transforms
        self.strict = strict  # if True, re-raise transform errors instead of silently skipping the transform
def __call__(self, image, target):
for t in self.transforms:
try:
image, target = t(image, target)
except Exception:
if self.strict:
raise
return image, target
Random horizontal flip; the boxes are flipped in sync with the image:
python
class RandomHorizontalFlip:
def __init__(self, prob=0.5):
self.prob = prob
def __call__(self, image, target):
if random.random() >= self.prob:
return image, target
image = np.ascontiguousarray(np.fliplr(image))
w = image.shape[1]
boxes = target["boxes"].copy()
        # after a flip: new_x1 = w - old_x2, new_x2 = w - old_x1
        boxes[:, [0, 2]] = w - boxes[:, [2, 0]]
target["boxes"] = boxes
return image, target
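A quick sanity check of the coordinate math (hypothetical numbers; prob=1.0 forces the flip): in a 100-pixel-wide image, the box [10, 20, 50, 80] should become [50, 20, 90, 80].
python
import numpy as np

image = np.zeros((100, 100, 3), dtype=np.uint8)
target = {"boxes": np.array([[10.0, 20.0, 50.0, 80.0]], dtype=np.float32)}
flip = RandomHorizontalFlip(prob=1.0)  # always flips
_, target = flip(image, target)
print(target["boxes"])  # [[50. 20. 90. 80.]] -> x1 = 100-50, x2 = 100-10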
Fixed-size random crop:
python
class RandomCropFixSize:
"""
把图放进 size×size 画布中(不足则 padding),然后随机裁剪 size×size
这比"先强行 resize 再 crop"更贴近 CornerNet/CenterNet 的做法(带 border 概念)
"""
def __init__(self, size=511, pad_value=0):
self.size = int(size)
self.pad_value = int(pad_value)
    def __call__(self, image, target):
        h, w = image.shape[:2]
        size = self.size
        # pad bottom/right so the canvas is at least size×size
        out = np.full((max(h, size), max(w, size), 3), self.pad_value, dtype=image.dtype)
        out[:h, :w] = image
        new_h, new_w = out.shape[:2]
        # pick a random size×size window
        x0 = random.randint(0, new_w - size)
        y0 = random.randint(0, new_h - size)
        image = out[y0:y0 + size, x0:x0 + size]
        # shift boxes into the crop's coordinate frame, then clip to the window
        boxes = target["boxes"].copy()
        boxes[:, [0, 2]] -= x0
        boxes[:, [1, 3]] -= y0
        boxes[:, [0, 2]] = np.clip(boxes[:, [0, 2]], 0, size)
        boxes[:, [1, 3]] = np.clip(boxes[:, [1, 3]], 0, size)
        # drop boxes that became degenerate after clipping
        bw = boxes[:, 2] - boxes[:, 0]
        bh = boxes[:, 3] - boxes[:, 1]
        keep = (bw > 1) & (bh > 1)
        target["boxes"] = boxes[keep]
        target["labels"] = target["labels"][keep]
        return image, target
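A quick check of the crop bookkeeping (hypothetical numbers; seeding random makes the run reproducible):
python
import random
import numpy as np

random.seed(0)
image = np.zeros((600, 800, 3), dtype=np.uint8)
target = {
    "boxes": np.array([[100.0, 100.0, 400.0, 400.0]], dtype=np.float32),
    "labels": np.array([0], dtype=np.int64),
}
crop = RandomCropFixSize(size=511)
image, target = crop(image, target)
print(image.shape)      # (511, 511, 3)
print(target["boxes"])  # the box shifted by (-x0, -y0) and clipped to the window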
For validation and inference we resize and center-pad instead:
python
class CenterPadResize:
"""
验证/推理:按比例缩放,使最长边= size,然后居中 pad 到 size×size
记录 meta:scale + border(用于还原到原图坐标)
"""
def __init__(self, size=511, pad_value=0):
self.size = int(size)
self.pad_value = int(pad_value)
def __call__(self, image, target):
orig_h, orig_w = image.shape[:2]
size = self.size
        # scale so the longer side equals size
        scale = size / max(orig_h, orig_w)
        new_w = int(orig_w * scale + 0.5)
        new_h = int(orig_h * scale + 0.5)
        resized = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
        # center the resized image on a size×size canvas
        out = np.full((size, size, 3), self.pad_value, dtype=resized.dtype)
        x0 = (size - new_w) // 2
        y0 = (size - new_h) // 2
        out[y0:y0 + new_h, x0:x0 + new_w] = resized
if len(target.get("boxes", [])) > 0:
boxes = target["boxes"].copy() * scale
boxes[:, [0, 2]] += x0
boxes[:, [1, 3]] += y0
boxes[:, [0, 2]] = np.clip(boxes[:, [0, 2]], 0, size)
boxes[:, [1, 3]] = np.clip(boxes[:, [1, 3]], 0, size)
target["boxes"] = boxes
target.setdefault("meta", {})
target["meta"].update({
"orig_size": (int(orig_h), int(orig_w)),
"input_size": int(size),
"scale": float(scale),
"border": (float(x0), float(y0)),
"resized_size": (int(new_h), int(new_w)),
})
return out, target
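The meta dict exists precisely so predictions can be mapped back: subtract the border, then divide by the scale. A minimal helper sketch (the name restore_boxes is mine, not part of the original repo):
python
import numpy as np

def restore_boxes(boxes, meta):
    """Map boxes from network-input coordinates back to the original image."""
    x0, y0 = meta["border"]
    scale = meta["scale"]
    boxes = boxes.copy()
    boxes[:, [0, 2]] = (boxes[:, [0, 2]] - x0) / scale  # undo the pad, then the resize
    boxes[:, [1, 3]] = (boxes[:, [1, 3]] - y0) / scale
    orig_h, orig_w = meta["orig_size"]
    boxes[:, [0, 2]] = np.clip(boxes[:, [0, 2]], 0, orig_w)  # clamp to the original image
    boxes[:, [1, 3]] = np.clip(boxes[:, [1, 3]], 0, orig_h)
    return boxes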
Normalize the image:
python
class Normalize:
def __init__(self, mean=CORNERNET_MEAN, std=CORNERNET_STD):
self.mean = np.array(mean, dtype=np.float32)
self.std = np.array(std, dtype=np.float32)
def __call__(self, image, target):
        # scale to [0, 1], then standardize with the dataset statistics
        image = image.astype(np.float32) / 255.0
image = (image - self.mean) / self.std
return image, target
Convert to a tensor:
python
class ToTensor:
def __call__(self, image, target):
        # HWC numpy array -> CHW torch tensor
        x = torch.from_numpy(image).permute(2, 0, 1).contiguous()
return x, target
Then we just split the pipeline into training and validation variants. The complete file is shown below:
python
import cv2
import numpy as np
import random
import torch
CORNERNET_MEAN = (0.40789654, 0.44719302, 0.47026115) # RGB
CORNERNET_STD = (0.28863828, 0.27408164, 0.27809835)
class Compose:
def __init__(self, transforms, strict=True):
self.transforms = transforms
self.strict = strict
def __call__(self, image, target):
for t in self.transforms:
try:
image, target = t(image, target)
except Exception:
if self.strict:
raise
return image, target
class RandomHorizontalFlip:
def __init__(self, prob=0.5):
self.prob = prob
def __call__(self, image, target):
if random.random() >= self.prob:
return image, target
image = np.ascontiguousarray(np.fliplr(image))
w = image.shape[1]
boxes = target["boxes"].copy()
boxes[:, [0, 2]] = w - boxes[:, [2, 0]]
target["boxes"] = boxes
return image, target
class RandomScale:
"""
CornerNet 常见:离散尺度集合随机选
"""
def __init__(self, scales=(0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4)):
self.scales = scales
def __call__(self, image, target):
s = random.choice(self.scales)
h, w = image.shape[:2]
new_w = max(1, int(w * s + 0.5))
new_h = max(1, int(h * s + 0.5))
image = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
if len(target["boxes"]) > 0:
target["boxes"] = target["boxes"] * s
return image, target
class RandomCropFixSize:
"""
把图放进 size×size 画布中(不足则 padding),然后随机裁剪 size×size
这比"先强行 resize 再 crop"更贴近 CornerNet/CenterNet 的做法(带 border 概念)
"""
def __init__(self, size=511, pad_value=0):
self.size = int(size)
self.pad_value = int(pad_value)
def __call__(self, image, target):
h, w = image.shape[:2]
size = self.size
out = np.full((max(h, size), max(w, size), 3), self.pad_value, dtype=image.dtype)
out[:h, :w] = image
new_h, new_w = out.shape[:2]
x0 = random.randint(0, new_w - size)
y0 = random.randint(0, new_h - size)
image = out[y0:y0 + size, x0:x0 + size]
boxes = target["boxes"].copy()
boxes[:, [0, 2]] -= x0
boxes[:, [1, 3]] -= y0
boxes[:, [0, 2]] = np.clip(boxes[:, [0, 2]], 0, size)
boxes[:, [1, 3]] = np.clip(boxes[:, [1, 3]], 0, size)
bw = boxes[:, 2] - boxes[:, 0]
bh = boxes[:, 3] - boxes[:, 1]
keep = (bw > 1) & (bh > 1)
target["boxes"] = boxes[keep]
target["labels"] = target["labels"][keep]
return image, target
class CenterPadResize:
"""
验证/推理:按比例缩放,使最长边= size,然后居中 pad 到 size×size
记录 meta:scale + border(用于还原到原图坐标)
"""
def __init__(self, size=511, pad_value=0):
self.size = int(size)
self.pad_value = int(pad_value)
def __call__(self, image, target):
orig_h, orig_w = image.shape[:2]
size = self.size
scale = size / max(orig_h, orig_w)
new_w = int(orig_w * scale + 0.5)
new_h = int(orig_h * scale + 0.5)
resized = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
out = np.full((size, size, 3), self.pad_value, dtype=resized.dtype)
x0 = (size - new_w) // 2
y0 = (size - new_h) // 2
out[y0:y0 + new_h, x0:x0 + new_w] = resized
if len(target.get("boxes", [])) > 0:
boxes = target["boxes"].copy() * scale
boxes[:, [0, 2]] += x0
boxes[:, [1, 3]] += y0
boxes[:, [0, 2]] = np.clip(boxes[:, [0, 2]], 0, size)
boxes[:, [1, 3]] = np.clip(boxes[:, [1, 3]], 0, size)
target["boxes"] = boxes
target.setdefault("meta", {})
target["meta"].update({
"orig_size": (int(orig_h), int(orig_w)),
"input_size": int(size),
"scale": float(scale),
"border": (float(x0), float(y0)),
"resized_size": (int(new_h), int(new_w)),
})
return out, target
class Normalize:
def __init__(self, mean=CORNERNET_MEAN, std=CORNERNET_STD):
self.mean = np.array(mean, dtype=np.float32)
self.std = np.array(std, dtype=np.float32)
def __call__(self, image, target):
image = image.astype(np.float32) / 255.0
image = (image - self.mean) / self.std
return image, target
class ToTensor:
def __call__(self, image, target):
x = torch.from_numpy(image).permute(2, 0, 1).contiguous()
return x, target
def get_train_transforms(input_size=511):
return Compose([
RandomScale(),
RandomHorizontalFlip(0.5),
RandomCropFixSize(size=input_size, pad_value=0),
Normalize(),
ToTensor(),
], strict=True)
def get_val_transforms(input_size=511):
return Compose([
CenterPadResize(size=input_size, pad_value=0),
Normalize(),
ToTensor(),
], strict=True)
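A quick smoke test of the pipeline on a dummy image (hypothetical shapes; note the random crop may drop boxes it cuts off):
python
import numpy as np

if __name__ == "__main__":
    image = (np.random.rand(480, 640, 3) * 255).astype(np.uint8)
    target = {
        "boxes": np.array([[50.0, 60.0, 200.0, 220.0]], dtype=np.float32),
        "labels": np.array([3], dtype=np.int64),
    }
    tfm = get_train_transforms(input_size=511)
    x, target = tfm(image, target)
    print(x.shape, x.dtype)       # torch.Size([3, 511, 511]) torch.float32
    print(target["boxes"].shape)  # (k, 4): boxes that survived the crop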
Dataset Loading
Before augmentation can run, we also need to read the dataset, which mainly means loading images and labels in sync.
Gaussian kernel generation: in corner detection, treating only the single corner pixel as a positive sample is too strict a criterion, and training is hard to keep stable. Instead, we set the Gaussian radius dynamically according to the object size, so that locations farther from the corner get smaller values and the supervision decays smoothly.
python
import os
import cv2
import math
import numpy as np
import torch
from torch.utils.data import Dataset
from pycocotools.coco import COCO
def gaussian_radius(det_size, min_overlap=0.7):
    # Largest radius r such that a box whose corners are each perturbed by up
    # to r still overlaps the ground-truth box with IoU >= min_overlap.
    # Three quadratic cases (both corners shrink, both grow, one of each);
    # the minimum of the three roots is taken, following the original CornerNet code.
    height, width = det_size
    height = float(height)
    width = float(width)
if height <= 0 or width <= 0:
return 0.0
a1 = 1
b1 = height + width
c1 = width * height * (1 - min_overlap) / (1 + min_overlap)
sq1 = max(0.0, b1 ** 2 - 4 * a1 * c1)
r1 = (b1 + math.sqrt(sq1)) / 2
a2 = 4
b2 = 2 * (height + width)
c2 = (1 - min_overlap) * width * height
sq2 = max(0.0, b2 ** 2 - 4 * a2 * c2)
r2 = (b2 + math.sqrt(sq2)) / 2
a3 = 4 * min_overlap
b3 = -2 * min_overlap * (height + width)
c3 = (min_overlap - 1) * width * height
sq3 = max(0.0, b3 ** 2 - 4 * a3 * c3)
r3 = (b3 + math.sqrt(sq3)) / 2
return min(r1, r2, r3)
def gaussian2D(shape, sigma=1):
    # 2D Gaussian kernel of the given shape, with peak value 1 at the center
    m, n = [(ss - 1.0) / 2.0 for ss in shape]
y, x = np.ogrid[-m:m + 1, -n:n + 1]
h = np.exp(-(x * x + y * y) / (2 * sigma * sigma))
h[h < np.finfo(h.dtype).eps * h.max()] = 0
return h
def draw_gaussian(heatmap, center, radius):
    radius = int(radius)
    x, y = int(center[0]), int(center[1])
    height, width = heatmap.shape[:2]
    if radius <= 0:
        # degenerate radius: mark only the corner pixel itself
        if 0 <= x < width and 0 <= y < height:
            heatmap[y, x] = 1.0
        return heatmap
    diameter = 2 * radius + 1
    gaussian = gaussian2D((diameter, diameter), sigma=diameter / 6)
    # clip the Gaussian window where it would run past the heatmap borders
    left = min(x, radius)
    right = min(width - x, radius + 1)
    top = min(y, radius)
    bottom = min(height - y, radius + 1)
    masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right]
    masked_gaussian = gaussian[radius - top:radius + bottom, radius - left:radius + right]
    if masked_gaussian.shape == masked_heatmap.shape:
        # element-wise max, so overlapping objects don't erase each other
        np.maximum(masked_heatmap, masked_gaussian, out=masked_heatmap)
    # the corner location itself is always an exact positive
    masked_heatmap[top, left] = 1.0
    return heatmap
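A tiny check of the behavior described above (hypothetical numbers): larger boxes get larger radii, and values decay smoothly away from the corner while the corner itself stays exactly 1:
python
import numpy as np

hm = np.zeros((32, 32), dtype=np.float32)
r_small = gaussian_radius((8, 8))    # small object -> small radius
r_large = gaussian_radius((24, 24))  # large object -> larger radius
print(round(r_small, 2), round(r_large, 2))  # roughly 2.19 and 6.56
draw_gaussian(hm, center=(16, 16), radius=int(r_large))
print(hm[16, 16], round(float(hm[16, 18]), 3))  # 1.0 at the corner, smaller nearby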
Putting these helpers together with the dataset class and target generation, the full code is as follows:
python
import os
import cv2
import math
import numpy as np
import torch
from torch.utils.data import Dataset
from pycocotools.coco import COCO
def gaussian_radius(det_size, min_overlap=0.7):
height, width = det_size
height = float(height)
width = float(width)
if height <= 0 or width <= 0:
return 0.0
a1 = 1
b1 = height + width
c1 = width * height * (1 - min_overlap) / (1 + min_overlap)
sq1 = max(0.0, b1 ** 2 - 4 * a1 * c1)
r1 = (b1 + math.sqrt(sq1)) / 2
a2 = 4
b2 = 2 * (height + width)
c2 = (1 - min_overlap) * width * height
sq2 = max(0.0, b2 ** 2 - 4 * a2 * c2)
r2 = (b2 + math.sqrt(sq2)) / 2
a3 = 4 * min_overlap
b3 = -2 * min_overlap * (height + width)
c3 = (min_overlap - 1) * width * height
sq3 = max(0.0, b3 ** 2 - 4 * a3 * c3)
r3 = (b3 + math.sqrt(sq3)) / 2
return min(r1, r2, r3)
def gaussian2D(shape, sigma=1):
m, n = [(ss - 1.0) / 2.0 for ss in shape]
y, x = np.ogrid[-m:m + 1, -n:n + 1]
h = np.exp(-(x * x + y * y) / (2 * sigma * sigma))
h[h < np.finfo(h.dtype).eps * h.max()] = 0
return h
def draw_gaussian(heatmap, center, radius):
radius = int(radius)
x, y = int(center[0]), int(center[1])
height, width = heatmap.shape[:2]
if radius <= 0:
if 0 <= x < width and 0 <= y < height:
heatmap[y, x] = 1.0
return heatmap
diameter = 2 * radius + 1
gaussian = gaussian2D((diameter, diameter), sigma=diameter / 6)
left = min(x, radius)
right = min(width - x, radius + 1)
top = min(y, radius)
bottom = min(height - y, radius + 1)
masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right]
masked_gaussian = gaussian[radius - top:radius + bottom, radius - left:radius + right]
if masked_gaussian.shape == masked_heatmap.shape:
np.maximum(masked_heatmap, masked_gaussian, out=masked_heatmap)
masked_heatmap[top, left] = 1.0
return heatmap
class COCODatasetCornerNet(Dataset):
"""
训练/验证都使用 COCO instances_*.json(这点对"严格对齐"很关键)
需要 data_dir:
- annotations/instances_train2017.json
- train2017/*.jpg
"""
    def __init__(
        self,
        data_dir,
        split="train2017",
        transforms=None,
        input_size=511,
        output_size=128,
        down_ratio=4,   # downsampling ratio from input image to feature map
        max_objs=128,   # maximum number of objects per image
    ):
        self.data_dir = data_dir
        self.split = split
        self.transforms = transforms  # image/target transform pipeline
        self.input_size = int(input_size)
        self.output_size = int(output_size)
        self.down_ratio = int(down_ratio)
        self.max_objs = int(max_objs)
        ann_file = os.path.join(data_dir, "annotations", f"instances_{split}.json")  # annotation file for this split
        if not os.path.exists(ann_file):
            raise FileNotFoundError(f"COCO annotation not found: {ann_file}")
        self.coco = COCO(ann_file)  # parsed with pycocotools
        self.img_ids = sorted(self.coco.getImgIds())  # all image IDs, e.g. 0-127 for COCO128
        self.cat_ids = sorted(self.coco.getCatIds())  # all COCO category IDs, sorted; 1-90 with gaps, 80 classes in total
        self.cat_to_idx = {cat_id: i for i, cat_id in enumerate(self.cat_ids)}  # map the non-contiguous cat_ids to contiguous indices: {1: 0, 2: 1, ..., 90: 79}
        self.idx_to_cat = {i: cat_id for cat_id, i in self.cat_to_idx.items()}  # reverse mapping: {0: 1, 1: 2, ..., 79: 90}
        self.num_classes = len(self.cat_ids)
        # Filter out images without annotations (steadier training; for validation you may choose not to filter)
        valid = []
        for img_id in self.img_ids:  # loop over all image IDs
            ann_ids = self.coco.getAnnIds(imgIds=img_id)  # every annotation ID on this image
            if len(ann_ids) > 0:  # non-empty means the image has at least one labeled object
                valid.append(img_id)  # keep it; unannotated images are dropped from img_ids
        self.img_ids = valid
def __len__(self):
return len(self.img_ids)
    def __getitem__(self, idx):
        img_id = self.img_ids[idx]  # image ID
        img_info = self.coco.loadImgs(img_id)[0]  # image metadata
        img_path = os.path.join(self.data_dir, self.split, img_info["file_name"])  # full path to the image
        image = cv2.imread(img_path)  # read the image
        if image is None:
            raise FileNotFoundError(f"Failed to read image: {img_path}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # channel conversion, BGR to RGB
        ann_ids = self.coco.getAnnIds(imgIds=img_id)  # annotation IDs for this image
        anns = self.coco.loadAnns(ann_ids)  # load all annotations
        h_img, w_img = image.shape[:2]  # image height and width
        boxes = []
        labels = []
        for ann in anns:
            if ann.get("iscrowd", 0) == 1:  # skip crowd annotations
                continue
            x, y, w, h = ann["bbox"]  # COCO bbox is [x, y, w, h]
            if w < 1 or h < 1:
                continue
            # convert [x, y, w, h] to [x1, y1, x2, y2], clamped to the image
            x1 = max(0.0, float(x))
            y1 = max(0.0, float(y))
            x2 = min(float(w_img), float(x + w))
            y2 = min(float(h_img), float(y + h))
            # drop invalid boxes
            if x2 <= x1 or y2 <= y1:
                continue
            # store every valid box and its label
            boxes.append([x1, y1, x2, y2])
            labels.append(self.cat_to_idx[int(ann["category_id"])])
        # reshape(-1, 4) keeps the shape (0, 4) even when no box survives,
        # so the transforms can still index boxes[:, [0, 2]] safely
        boxes = np.asarray(boxes, dtype=np.float32).reshape(-1, 4)
        labels = np.asarray(labels, dtype=np.int64)
        target = {"boxes": boxes, "labels": labels, "image_id": int(img_id)}  # build the target dict
        if self.transforms is not None:
            image, target = self.transforms(image, target)  # e.g. scaling, flipping
        # generate the final training targets (strict version)
        gt = self._gen_targets(target)
        return image, gt  # image tensor and the ground-truth dict
    # The heart of CornerNet: turning boxes into corner heatmaps, offsets, and indices
    def _gen_targets(self, target):
        boxes = target["boxes"]
        labels = target["labels"]
        H = self.output_size
        W = self.output_size
        # one heatmap per class for each corner type
        tl_hm = np.zeros((self.num_classes, H, W), dtype=np.float32)
        br_hm = np.zeros((self.num_classes, H, W), dtype=np.float32)
        # flattened TL/BR corner indices, used to gather predictions later
        tl_inds = np.zeros((self.max_objs,), dtype=np.int64)
        br_inds = np.zeros((self.max_objs,), dtype=np.int64)
        # sub-pixel offsets
        tl_regs = np.zeros((self.max_objs, 2), dtype=np.float32)
        br_regs = np.zeros((self.max_objs, 2), dtype=np.float32)
        # 0/1 mask marking which slots hold a valid object
        reg_mask = np.zeros((self.max_objs,), dtype=np.float32)
        # fill the buffers above, one object at a time
        num = min(len(boxes), self.max_objs)  # object count, capped at max_objs
        for i in range(num):
            # class of the current object
            cls = int(labels[i])
            x1, y1, x2, y2 = [float(v) for v in boxes[i]]  # box coordinates
            if x2 <= x1 + 1 or y2 <= y1 + 1:  # skip boxes that are too small
                continue
            # CornerNet convention: BR uses (x2 - 1, y2 - 1)
            br_x_pix = x2 - 1.0
            br_y_pix = y2 - 1.0
            # map image coordinates to feature-map coordinates (still floats here)
            tl_x = x1 / self.down_ratio
            tl_y = y1 / self.down_ratio
            br_x = br_x_pix / self.down_ratio
            br_y = br_y_pix / self.down_ratio
            # clip to [0, size - 1); the small eps keeps coordinates strictly below the edge
            eps = 1e-4
            tl_x = float(np.clip(tl_x, 0.0, W - 1 - eps))
            tl_y = float(np.clip(tl_y, 0.0, H - 1 - eps))
            br_x = float(np.clip(br_x, 0.0, W - 1 - eps))
            br_y = float(np.clip(br_y, 0.0, H - 1 - eps))
            # integer pixel positions on the feature map
            tl_xi, tl_yi = int(tl_x), int(tl_y)
            br_xi, br_yi = int(br_x), int(br_y)
            # box size on the feature map
            box_w = (x2 - x1) / self.down_ratio
            box_h = (y2 - y1) / self.down_ratio
            # Gaussian radius derived from the box size
            radius = gaussian_radius((math.ceil(box_h), math.ceil(box_w)))
            radius = int(max(0, radius))
            # splat the Gaussians onto the heatmaps
            draw_gaussian(tl_hm[cls], (tl_xi, tl_yi), radius)
            draw_gaussian(br_hm[cls], (br_xi, br_yi), radius)
            # flattened corner index: (y, x) -> y * W + x, e.g. (y=12, x=25) on a 128×128 map -> 12*128+25 = 1561
            tl_inds[i] = tl_yi * W + tl_xi
            br_inds[i] = br_yi * W + br_xi
            # sub-pixel offset = float coordinate - integer coordinate
            tl_regs[i, 0] = tl_x - tl_xi
            tl_regs[i, 1] = tl_y - tl_yi
            br_regs[i, 0] = br_x - br_xi
            br_regs[i, 1] = br_y - br_yi
            # mark this slot as a valid object
            reg_mask[i] = 1.0
return {
"tl_heatmaps": torch.from_numpy(tl_hm),
"br_heatmaps": torch.from_numpy(br_hm),
"tl_inds": torch.from_numpy(tl_inds),
"br_inds": torch.from_numpy(br_inds),
"tl_regs": torch.from_numpy(tl_regs),
"br_regs": torch.from_numpy(br_regs),
"reg_mask": torch.from_numpy(reg_mask),
"image_id": torch.tensor(int(target.get("image_id", -1)), dtype=torch.int64),
"meta": target.get("meta", {}),
}
# Collate function: stacks each batch into tensors for the DataLoader.
def collate_fn(batch):
    # batch looks like: [(image1, target1), (image2, target2), (image3, target3)]
    images = torch.stack([b[0] for b in batch], dim=0)  # stack images; all shapes match (the transforms guarantee it), otherwise stack would fail
    keys = ["tl_heatmaps", "br_heatmaps", "tl_inds", "br_inds", "tl_regs", "br_regs", "reg_mask", "image_id"]
    targets = {k: torch.stack([b[1][k] for b in batch], dim=0) for k in keys}
    targets["meta"] = [b[1].get("meta", {}) for b in batch]  # kept as a list, not stacked
    return images, targets
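Wiring it into a DataLoader is then straightforward. A sketch, assuming the transforms and dataset modules are importable and with a placeholder data_dir:
python
from torch.utils.data import DataLoader

dataset = COCODatasetCornerNet(
    data_dir="/path/to/coco",  # placeholder: must contain annotations/ and train2017/
    split="train2017",
    transforms=get_train_transforms(input_size=511),
)
loader = DataLoader(
    dataset,
    batch_size=4,
    shuffle=True,
    num_workers=2,
    collate_fn=collate_fn,  # stacks tensors, keeps meta as a list
)
images, targets = next(iter(loader))
print(images.shape)                  # torch.Size([4, 3, 511, 511])
print(targets["tl_heatmaps"].shape)  # torch.Size([4, 80, 128, 128]) for 80 classes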
Heatmap visualization:
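To eyeball the generated targets, a minimal matplotlib sketch (matplotlib is an extra dependency; this reuses the dataset object from the sketch above):
python
import matplotlib.pyplot as plt

image, gt = dataset[0]
tl = gt["tl_heatmaps"].numpy().max(axis=0)  # collapse the class dimension
br = gt["br_heatmaps"].numpy().max(axis=0)
fig, axes = plt.subplots(1, 2, figsize=(8, 4))
axes[0].imshow(tl, cmap="hot")
axes[0].set_title("top-left corners")
axes[1].imshow(br, cmap="hot")
axes[1].set_title("bottom-right corners")
plt.tight_layout()
plt.show()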


