目录
-
- 引言
- 计算机视觉基础
- 卷积神经网络(CNN)基础
- 构建完整的CNN模型
- 目标检测基础
- 实战项目:简单的目标检测器
- 高级目标检测算法简介
-
- R-CNN系列
- SSD(Single Shot MultiBox Detector)
- 实际应用示例
- 性能评估
-
- mAP(Mean Average Precision)计算
- 总结
- 学习建议
引言
计算机视觉(Computer Vision, CV)是人工智能领域的一个重要分支,致力于让计算机能够理解和解释视觉信息。从图像分类到目标检测,从图像分割到场景理解,计算机视觉技术已经广泛应用于自动驾驶、医疗诊断、安防监控等多个领域。本文将深入探讨卷积神经网络(CNN)的原理及其在目标检测中的应用。

计算机视觉基础
图像的数字化表示
计算机中的图像由像素组成,每个像素具有特定的颜色值。对于灰度图像,每个像素只有一个值(0-255)表示亮度;对于彩色图像,通常使用RGB三个通道表示颜色。
python
import numpy as np
import matplotlib.pyplot as plt
# 创建示例图像
def create_sample_image():
    """Build a 100x100 RGB test image containing four solid colour squares.

    Returns:
        A ``(100, 100, 3)`` ``uint8`` array: black background with one red,
        one green, one blue and one yellow 20x20 square.
    """
    canvas = np.zeros((100, 100, 3), dtype=np.uint8)
    # (row slice, column slice, RGB colour) for each square
    squares = (
        (slice(20, 40), slice(20, 40), [255, 0, 0]),    # red
        (slice(60, 80), slice(20, 40), [0, 255, 0]),    # green
        (slice(40, 60), slice(60, 80), [0, 0, 255]),    # blue
        (slice(60, 80), slice(60, 80), [255, 255, 0]),  # yellow
    )
    for rows, cols, colour in squares:
        canvas[rows, cols] = colour
    return canvas
# Create the sample image and draw it in two side-by-side panels
sample_image = create_sample_image()
plt.figure(figsize=(8, 4))
for panel_index, panel_title in enumerate(("原始图像", "RGB彩色图像"), start=1):
    plt.subplot(1, 2, panel_index)
    plt.imshow(sample_image)
    plt.title(panel_title)
    plt.axis('off')
plt.show()

# Report the basic properties of the image array
print(f"图像尺寸: {sample_image.shape}")
print(f"图像数据类型: {sample_image.dtype}")
print(f"像素值范围: {sample_image.min()} - {sample_image.max()}")
图像预处理
python
class ImagePreprocessor:
    """Small collection of image preprocessing and augmentation helpers.

    All helpers take NumPy image arrays whose first two axes are height
    and width.
    """

    def resize(self, image, new_size):
        """Resize ``image`` to ``new_size`` with anti-aliasing (scikit-image)."""
        # Imported lazily so the other helpers work without scikit-image.
        from skimage.transform import resize
        return resize(image, new_size, anti_aliasing=True)

    def normalize(self, image):
        """Divide pixel values by 255, mapping the 0-255 range onto 0-1."""
        return image / 255.0

    def center_crop(self, image, crop_size):
        """Cut a ``crop_size = (height, width)`` window from the image centre.

        Raises:
            ValueError: if the requested crop is larger than the image.
                Without this check the start index would go negative and
                the slice would silently return a wrong-sized region.
        """
        h, w = image.shape[:2]
        crop_h, crop_w = crop_size[0], crop_size[1]
        if crop_h > h or crop_w > w:
            raise ValueError(
                f"crop_size {(crop_h, crop_w)} exceeds image size {(h, w)}"
            )
        start_h = (h - crop_h) // 2
        start_w = (w - crop_w) // 2
        return image[start_h:start_h + crop_h, start_w:start_w + crop_w]

    def augment(self, image):
        """Return four variants of ``image`` for data augmentation.

        The list holds, in order: the original, a horizontal flip, a
        rotation by a random angle in [-30, 30] degrees, and a
        brightness-scaled copy (factor in [0.8, 1.2]).  The brightness
        variant is clipped to [0, 1], so the input is expected to be
        normalised already.
        """
        from skimage.transform import rotate

        augmented = [image, np.fliplr(image)]

        # Random rotation; pixels outside the frame are filled by reflection
        angle = np.random.uniform(-30, 30)
        augmented.append(rotate(image, angle, mode='reflect'))

        # Random brightness change
        brightness_factor = np.random.uniform(0.8, 1.2)
        augmented.append(np.clip(image * brightness_factor, 0, 1))
        return augmented
# Usage example
preprocessor = ImagePreprocessor()
# Random colour image with pixel values in the 0-255 range, so that the
# normalize() step below is the one that maps it into [0, 1].  (Starting
# from values already in [0, 1] would divide by 255 twice.)
# NOTE(review): assumes skimage's resize keeps the value range of float
# input unchanged — confirm against the scikit-image documentation.
image = np.random.rand(256, 256, 3) * 255
# Preprocessing pipeline: resize -> normalise -> centre crop -> augment
resized = preprocessor.resize(image, (224, 224))
normalized = preprocessor.normalize(resized)
cropped = preprocessor.center_crop(normalized, (200, 200))
augmented = preprocessor.augment(cropped)
print(f"原始图像大小: {image.shape}")
# Report the final pipeline output (after the crop), not an intermediate
print(f"预处理后大小: {cropped.shape}")
print(f"增强后的图像数量: {len(augmented)}")
卷积神经网络(CNN)基础
卷积操作
卷积是CNN的核心操作,通过卷积核(滤波器)在图像上滑动来提取特征。
python
import numpy as np
class Conv2D:
    """Naive (loop-based) 2-D convolution layer for ``(N, C, H, W)`` arrays.

    Args:
        in_channels: number of input channels.
        out_channels: number of kernels / output channels.
        kernel_size: int (square kernel) or ``(kh, kw)`` tuple.
        stride: step of the sliding window, shared by both axes.
        padding: number of zero rows/columns added on every side.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0):
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size if isinstance(kernel_size, tuple) else (kernel_size, kernel_size)
        self.stride = stride
        self.padding = padding
        # Scale the random weights by sqrt(2 / fan_in).  The fan-in is
        # taken from the normalised (kh, kw) pair so tuple kernel sizes work.
        kh, kw = self.kernel_size
        scale = np.sqrt(2.0 / (in_channels * kh * kw))
        self.weights = np.random.randn(out_channels, in_channels, kh, kw) * scale
        self.bias = np.zeros(out_channels)

    def _pad(self, x):
        """Zero-pad the two spatial axes of ``x`` by ``self.padding``."""
        if self.padding > 0:
            p = self.padding
            return np.pad(x, ((0, 0), (0, 0), (p, p), (p, p)), mode='constant')
        return x

    def forward(self, x):
        """Convolve ``x`` of shape ``(N, C, H, W)``.

        Returns:
            Array of shape ``(N, out_channels, out_h, out_w)``.
        """
        batch_size, _, height, width = x.shape
        kh, kw = self.kernel_size
        out_height = (height + 2 * self.padding - kh) // self.stride + 1
        out_width = (width + 2 * self.padding - kw) // self.stride + 1
        x_padded = self._pad(x)
        output = np.zeros((batch_size, self.out_channels, out_height, out_width))
        for b in range(batch_size):
            for oc in range(self.out_channels):
                kernel = self.weights[oc]
                offset = self.bias[oc]
                for oh in range(out_height):
                    h_start = oh * self.stride
                    for ow in range(out_width):
                        w_start = ow * self.stride
                        window = x_padded[b, :, h_start:h_start + kh, w_start:w_start + kw]
                        output[b, oc, oh, ow] = np.sum(window * kernel) + offset
        return output

    def backward(self, x, grad_output, learning_rate):
        """Back-propagate ``grad_output`` and apply one gradient-descent step.

        Args:
            x: the input that was passed to ``forward``.
            grad_output: gradient w.r.t. the forward output; must have the
                shape ``forward(x)`` produced.
            learning_rate: step size for the weight and bias update.

        Returns:
            Gradient w.r.t. ``x`` (same shape as ``x``), computed with the
            weights as they were before the update.
        """
        batch_size, _, height, width = x.shape
        kh, kw = self.kernel_size
        _, _, out_height, out_width = grad_output.shape

        # Window positions refer to the padded input, so gradients are
        # accumulated on a padded buffer and cropped at the end.
        x_padded = self._pad(x)
        grad_padded = np.zeros_like(x_padded)
        grad_weights = np.zeros_like(self.weights)
        grad_bias = np.zeros_like(self.bias)

        for b in range(batch_size):
            for oc in range(self.out_channels):
                for oh in range(out_height):
                    h_start = oh * self.stride
                    h_end = h_start + kh
                    for ow in range(out_width):
                        w_start = ow * self.stride
                        w_end = w_start + kw
                        upstream = grad_output[b, oc, oh, ow]
                        grad_padded[b, :, h_start:h_end, w_start:w_end] += upstream * self.weights[oc]
                        grad_weights[oc] += upstream * x_padded[b, :, h_start:h_end, w_start:w_end]
                grad_bias[oc] += np.sum(grad_output[b, oc])

        # Parameter update
        self.weights -= learning_rate * grad_weights
        self.bias -= learning_rate * grad_bias

        if self.padding > 0:
            p = self.padding
            return grad_padded[:, :, p:p + height, p:p + width]
        return grad_padded
# Example usage
# Input batch: batch_size=2, channels=3, height=32, width=32
input_data = np.random.randn(2, 3, 32, 32)
# Convolution layer
conv = Conv2D(in_channels=3, out_channels=16, kernel_size=3, stride=1, padding=1)
# Forward pass
output = conv.forward(input_data)
print(f"输入形状: {input_data.shape}")
print(f"输出形状: {output.shape}")
# Visualise the kernels: at most 16 fit into the 4x4 grid, and never more
# than the layer actually has.
plt.figure(figsize=(12, 8))
for i in range(min(16, conv.out_channels)):
    plt.subplot(4, 4, i + 1)
    # Kernel slice that applies to the first input channel
    kernel = conv.weights[i, 0]
    plt.imshow(kernel, cmap='gray')
    plt.title(f'Kernel {i+1}')
    plt.axis('off')
plt.suptitle('卷积核可视化')
plt.show()
池化层
池化层用于降低特征图的维度,减少计算量并提取主要特征。
python
class MaxPool2D:
    """Max pooling over the spatial axes of an ``(N, C, H, W)`` array.

    Args:
        kernel_size: int (square window) or ``(kh, kw)`` tuple.
        stride: int or ``(stride_h, stride_w)`` tuple; defaults to
            ``kernel_size`` (non-overlapping windows).
    """

    def __init__(self, kernel_size, stride=None):
        self.kernel_size = kernel_size if isinstance(kernel_size, tuple) else (kernel_size, kernel_size)
        self.stride = stride if stride is not None else kernel_size

    def forward(self, x):
        """Return the per-window maximum, shape ``(N, C, out_h, out_w)``."""
        batch_size, channels, height, width = x.shape
        kh, kw = self.kernel_size
        # The stride may be a tuple (e.g. inherited from a tuple
        # kernel_size); split it into per-axis steps.
        if isinstance(self.stride, tuple):
            stride_h, stride_w = self.stride
        else:
            stride_h = stride_w = self.stride
        out_height = (height - kh) // stride_h + 1
        out_width = (width - kw) // stride_w + 1
        output = np.zeros((batch_size, channels, out_height, out_width))
        for oh in range(out_height):
            h_start = oh * stride_h
            for ow in range(out_width):
                w_start = ow * stride_w
                # Reduce every batch item and channel at once
                window = x[:, :, h_start:h_start + kh, w_start:w_start + kw]
                output[:, :, oh, ow] = window.max(axis=(2, 3))
        return output
class AvgPool2D:
    """Average pooling over the spatial axes of an ``(N, C, H, W)`` array.

    Args:
        kernel_size: int (square window) or ``(kh, kw)`` tuple.
        stride: int or ``(stride_h, stride_w)`` tuple; defaults to
            ``kernel_size`` (non-overlapping windows).
    """

    def __init__(self, kernel_size, stride=None):
        self.kernel_size = kernel_size if isinstance(kernel_size, tuple) else (kernel_size, kernel_size)
        self.stride = stride if stride is not None else kernel_size

    def forward(self, x):
        """Return the per-window mean, shape ``(N, C, out_h, out_w)``."""
        batch_size, channels, height, width = x.shape
        kh, kw = self.kernel_size
        # The stride may be a tuple (e.g. inherited from a tuple
        # kernel_size); split it into per-axis steps.
        if isinstance(self.stride, tuple):
            stride_h, stride_w = self.stride
        else:
            stride_h = stride_w = self.stride
        out_height = (height - kh) // stride_h + 1
        out_width = (width - kw) // stride_w + 1
        output = np.zeros((batch_size, channels, out_height, out_width))
        for oh in range(out_height):
            h_start = oh * stride_h
            for ow in range(out_width):
                w_start = ow * stride_w
                # Reduce every batch item and channel at once
                window = x[:, :, h_start:h_start + kh, w_start:w_start + kw]
                output[:, :, oh, ow] = window.mean(axis=(2, 3))
        return output
# Example usage
# A single-channel 8x8 feature map
feature_map = np.random.randn(1, 1, 8, 8)
# Pooling layers
max_pool = MaxPool2D(kernel_size=2, stride=2)
avg_pool = AvgPool2D(kernel_size=2, stride=2)
# Apply both pooling variants
max_pooled = max_pool.forward(feature_map)
avg_pooled = avg_pool.forward(feature_map)
print(f"原始特征图大小: {feature_map.shape}")
print(f"最大池化后大小: {max_pooled.shape}")
print(f"平均池化后大小: {avg_pooled.shape}")
# Show the input and both pooled maps next to each other
plt.figure(figsize=(12, 4))
panels = (
    (feature_map, '原始特征图'),
    (max_pooled, '最大池化'),
    (avg_pooled, '平均池化'),
)
for position, (data, heading) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    plt.imshow(data[0, 0], cmap='gray')
    plt.title(heading)
    plt.colorbar()
plt.show()
激活函数
python
def relu(x):
    """Rectified linear unit: element-wise maximum of 0 and ``x``."""
    rectified = np.maximum(0, x)
    return rectified
def relu_derivative(x):
    """Derivative of ReLU: 1.0 where ``x`` is positive, 0.0 elsewhere."""
    is_positive = x > 0
    return is_positive.astype(float)
def leaky_relu(x, alpha=0.01):
    """Leaky ReLU: ``x`` where positive, ``alpha * x`` elsewhere."""
    leaked = alpha * x
    return np.where(x > 0, x, leaked)
def leaky_relu_derivative(x, alpha=0.01):
    """Derivative of leaky ReLU: 1 where ``x`` is positive, ``alpha`` elsewhere."""
    is_positive = x > 0
    return np.where(is_positive, 1, alpha)
# Plot the activation functions over [-5, 5]
x = np.linspace(-5, 5, 100)
plt.figure(figsize=(12, 4))
curves = (
    (relu, 'ReLU'),
    (relu_derivative, "ReLU导数"),
    (leaky_relu, 'Leaky ReLU'),
)
for slot, (activation_fn, caption) in enumerate(curves, start=1):
    plt.subplot(1, 3, slot)
    plt.plot(x, activation_fn(x))
    plt.title(caption)
    plt.grid(True)
plt.show()
构建完整的CNN模型
python
class CNN:
    """Sequential container of convolution, pooling and activation layers.

    Layers are stored in ``self.layers`` as dicts with a ``'type'`` key
    (``'conv'``, ``'pool'`` or ``'activation'``) and are applied in the
    order they were added.
    """

    _POOL_TYPES = ('max', 'avg')
    _ACTIVATION_TYPES = ('relu', 'leaky_relu')

    def __init__(self):
        self.layers = []

    def add_conv_layer(self, in_channels, out_channels, kernel_size, stride=1, padding=0):
        """Append a ``Conv2D`` layer."""
        self.layers.append({
            'type': 'conv',
            'layer': Conv2D(in_channels, out_channels, kernel_size, stride, padding),
        })

    def add_pool_layer(self, pool_type='max', kernel_size=2, stride=None):
        """Append a pooling layer (``'max'`` or ``'avg'``).

        Raises:
            ValueError: for any other ``pool_type`` — previously such a
                call added nothing and failed silently.
        """
        if pool_type not in self._POOL_TYPES:
            raise ValueError(
                f"Unsupported pool_type {pool_type!r}; expected one of {self._POOL_TYPES}"
            )
        pool_cls = MaxPool2D if pool_type == 'max' else AvgPool2D
        self.layers.append({
            'type': 'pool',
            'layer': pool_cls(kernel_size, stride),
        })

    def add_activation(self, activation_type='relu'):
        """Append an activation (``'relu'`` or ``'leaky_relu'``).

        Raises:
            ValueError: for any other ``activation_type`` — previously it
                was recorded and then acted as the identity in ``forward``.
        """
        if activation_type not in self._ACTIVATION_TYPES:
            raise ValueError(
                f"Unsupported activation_type {activation_type!r}; "
                f"expected one of {self._ACTIVATION_TYPES}"
            )
        self.layers.append({
            'type': 'activation',
            'activation': activation_type,
        })

    def forward(self, x):
        """Run ``x`` through every layer.

        Returns:
            A list whose first element is the input and whose following
            elements are the outputs of each layer, in order.
        """
        activations = [x]
        for layer_info in self.layers:
            kind = layer_info['type']
            if kind in ('conv', 'pool'):
                x = layer_info['layer'].forward(x)
            elif kind == 'activation':
                if layer_info['activation'] == 'relu':
                    x = relu(x)
                elif layer_info['activation'] == 'leaky_relu':
                    x = leaky_relu(x)
            activations.append(x)
        return activations

    def predict(self, x):
        """Return only the output of the last layer."""
        return self.forward(x)[-1]
# Build an example CNN
model = CNN()
# Architecture: input -> (Conv -> ReLU -> MaxPool) repeated twice
for channels_in, channels_out in ((3, 16), (16, 32)):
    model.add_conv_layer(in_channels=channels_in, out_channels=channels_out,
                         kernel_size=3, padding=1)
    model.add_activation('relu')
    model.add_pool_layer('max', 2)
# Test input
test_input = np.random.randn(1, 3, 32, 32)
# Forward pass, keeping every intermediate output
activations = model.forward(test_input)
print("各层输出形状:")
for i, activation in enumerate(activations):
    print(f"Layer {i}: {activation.shape}")
目标检测基础
目标检测不仅要识别图像中的物体类别,还要定位物体的位置。
边界框表示
python
class BoundingBox:
    """Axis-aligned box stored as corner coordinates.

    Args:
        x_min, y_min: top-left corner.
        x_max, y_max: bottom-right corner.
        label: optional class label.
        confidence: optional detection score.
    """

    def __init__(self, x_min, y_min, x_max, y_max, label=None, confidence=None):
        self.x_min, self.y_min = x_min, y_min
        self.x_max, self.y_max = x_max, y_max
        self.label = label
        self.confidence = confidence

    @property
    def width(self):
        """Horizontal extent of the box."""
        return self.x_max - self.x_min

    @property
    def height(self):
        """Vertical extent of the box."""
        return self.y_max - self.y_min

    @property
    def area(self):
        """Width times height."""
        return self.width * self.height

    @property
    def center(self):
        """``(x, y)`` coordinates of the box centre."""
        mid_x = (self.x_min + self.x_max) / 2
        mid_y = (self.y_min + self.y_max) / 2
        return (mid_x, mid_y)

    def to_coco_format(self):
        """Return ``[x, y, width, height]`` (COCO convention)."""
        return [self.x_min, self.y_min, self.width, self.height]

    def to_yolo_format(self, image_width, image_height):
        """Return ``[x_center, y_center, width, height]`` normalised by the image size."""
        mid_x, mid_y = self.center
        return [
            mid_x / image_width,
            mid_y / image_height,
            self.width / image_width,
            self.height / image_height,
        ]
def calculate_iou(box1, box2):
    """Return the intersection-over-union of two boxes.

    Both arguments must expose ``x_min``, ``y_min``, ``x_max``, ``y_max``
    and ``area``.  Boxes that do not overlap (or only touch) yield 0.
    """
    # Corners of the overlap rectangle
    left = max(box1.x_min, box2.x_min)
    top = max(box1.y_min, box2.y_min)
    right = min(box1.x_max, box2.x_max)
    bottom = min(box1.y_max, box2.y_max)
    if right <= left or bottom <= top:
        return 0
    overlap_area = (right - left) * (bottom - top)
    combined_area = box1.area + box2.area - overlap_area
    return overlap_area / combined_area
# Example usage
box1 = BoundingBox(10, 10, 50, 50, label="cat", confidence=0.9)
box2 = BoundingBox(30, 30, 70, 70, label="cat", confidence=0.85)
print(f"边界框1: 中心={box1.center}, 面积={box1.area}")
print(f"边界框2: 中心={box2.center}, 面积={box2.area}")
print(f"IoU: {calculate_iou(box1, box2):.4f}")
print(f"COCO格式: {box1.to_coco_format()}")
print(f"YOLO格式(假设图像100x100): {box1.to_yolo_format(100, 100)}")
非极大值抑制(NMS)
python
def non_max_suppression(boxes, iou_threshold=0.5):
    """Greedy non-maximum suppression.

    Boxes are visited from highest to lowest confidence; a box is kept
    only if its IoU with every box kept so far is below ``iou_threshold``.
    Labels are not consulted, so suppression also happens across classes.

    Returns:
        The kept boxes, ordered by descending confidence.
    """
    if not boxes:
        return []
    kept = []
    for candidate in sorted(boxes, key=lambda item: item.confidence, reverse=True):
        if all(calculate_iou(winner, candidate) < iou_threshold for winner in kept):
            kept.append(candidate)
    return kept
# Example usage
# Several overlapping detections of two objects
detections = [
    BoundingBox(10, 10, 50, 50, label="cat", confidence=0.95),
    BoundingBox(12, 12, 52, 52, label="cat", confidence=0.90),
    BoundingBox(15, 15, 55, 55, label="cat", confidence=0.85),
    BoundingBox(100, 100, 150, 150, label="dog", confidence=0.88),
    BoundingBox(102, 102, 152, 152, label="dog", confidence=0.82),
]
# Apply NMS
selected = non_max_suppression(detections, iou_threshold=0.5)
print(f"原始检测框数量: {len(detections)}")
print(f"NMS后保留的框数量: {len(selected)}")
for rank, kept_box in enumerate(selected, start=1):
    print(f"框 {rank}: {kept_box.label}, 置信度={kept_box.confidence:.4f}")
实战项目:简单的目标检测器
数据准备
python
import numpy as np
import cv2
class ObjectDetectionDataset:
    """Generator of synthetic shape images with YOLO-style grid labels.

    Each grid cell holds ``5 * num_boxes + num_classes`` values: for every
    box slot ``[confidence, x_offset, y_offset, width, height]``, followed
    by a one-hot class vector.  Only the first box slot is filled.
    """

    def __init__(self, image_size=224, grid_size=7):
        self.image_size = image_size
        self.grid_size = grid_size
        self.cell_size = image_size // grid_size
        self.num_classes = 3  # circle, rectangle, triangle
        self.num_boxes = 2    # box slots per grid cell

    def generate_synthetic_data(self, num_samples=1000):
        """Create ``num_samples`` random images and their label grids.

        Every image contains 1-3 filled shapes drawn with OpenCV.

        Returns:
            ``(images, labels)`` as arrays of shape
            ``(num_samples, image_size, image_size, 3)`` (uint8) and
            ``(num_samples, grid_size, grid_size, 5 * num_boxes + num_classes)``.
        """
        images = []
        labels = []
        class_names = ['circle', 'rectangle', 'triangle']
        cell_depth = 5 * self.num_boxes + self.num_classes
        for _ in range(num_samples):
            # Blank canvas and empty label grid
            image = np.zeros((self.image_size, self.image_size, 3), dtype=np.uint8)
            label_grid = np.zeros((self.grid_size, self.grid_size, cell_depth))
            num_objects = np.random.randint(1, 4)
            for _obj in range(num_objects):
                class_idx = np.random.randint(self.num_classes)
                class_name = class_names[class_idx]
                # Random size, and a centre that keeps the shape inside the image
                size = np.random.randint(20, 60)
                x = np.random.randint(size, self.image_size - size)
                y = np.random.randint(size, self.image_size - size)
                half = size // 2
                if class_name == 'circle':
                    cv2.circle(image, (x, y), half, (255, 0, 0), -1)
                elif class_name == 'rectangle':
                    cv2.rectangle(image, (x - half, y - half),
                                  (x + half, y + half), (0, 255, 0), -1)
                else:  # triangle
                    points = np.array([
                        [x, y - half],
                        [x - half, y + half],
                        [x + half, y + half],
                    ], np.int32)
                    cv2.fillPoly(image, [points], (0, 0, 255))
                box = BoundingBox(x - half, y - half, x + half, y + half,
                                  class_name, 1.0)
                self._place_box_in_grid(box, class_idx, label_grid)
            images.append(image)
            labels.append(label_grid)
        return np.array(images), np.array(labels)

    def _place_box_in_grid(self, box, class_idx, label_grid):
        """Write ``box`` into the grid cell that contains its centre.

        ``label_grid`` is modified in place.  Boxes whose centre falls
        outside the grid are ignored.  If the cell already holds an
        object, the new one replaces it completely (box and class), so a
        cell never ends up with two classes set.
        """
        center_x, center_y = box.center
        grid_x = int(center_x / self.cell_size)
        grid_y = int(center_y / self.cell_size)
        if not (0 <= grid_x < self.grid_size and 0 <= grid_y < self.grid_size):
            return
        cell = label_grid[grid_y, grid_x]
        cell[0] = 1  # objectness / confidence
        # Centre offset relative to the cell, in cell units
        cell[1] = (center_x / self.cell_size) - grid_x
        cell[2] = (center_y / self.cell_size) - grid_y
        # Size relative to the whole image
        cell[3] = box.width / self.image_size
        cell[4] = box.height / self.image_size
        # One-hot class: clear first so an overwritten cell stays one-hot
        class_start = 5 * self.num_boxes
        cell[class_start:class_start + self.num_classes] = 0
        cell[class_start + class_idx] = 1
# Generate a small synthetic dataset
dataset = ObjectDetectionDataset()
images, labels = dataset.generate_synthetic_data(100)
print(f"图像数据形状: {images.shape}")
print(f"标签数据形状: {labels.shape}")
# Show up to six samples in a 2x3 grid
plt.figure(figsize=(12, 8))
for slot, sample in enumerate(images[:6], start=1):
    plt.subplot(2, 3, slot)
    plt.imshow(sample)
    plt.title(f'Sample {slot}')
    plt.axis('off')
plt.show()
简化的YOLO风格检测器
python
class YOLODetector:
    """Simplified YOLO-style detector: CNN backbone plus two dense layers.

    The network output has shape
    ``(batch, grid_size, grid_size, 5 * num_boxes + num_classes)``.
    """

    def __init__(self, input_size=224, grid_size=7, num_classes=3):
        self.input_size = input_size
        self.grid_size = grid_size
        self.num_classes = num_classes
        self.num_boxes = 2
        # Feature extractor followed by the detection head
        self.backbone = CNN()
        self._build_backbone()
        self._build_detection_head()

    def _build_backbone(self):
        """Stack five Conv(3x3, pad 1) -> ReLU -> MaxPool(2) stages.

        Each pooling step halves the resolution, so a 224x224 input ends
        as a 512-channel 7x7 map.
        """
        channel_plan = (3, 32, 64, 128, 256, 512)
        for n_in, n_out in zip(channel_plan, channel_plan[1:]):
            self.backbone.add_conv_layer(n_in, n_out, 3, padding=1)
            self.backbone.add_activation('relu')
            self.backbone.add_pool_layer('max', 2)

    def _build_detection_head(self):
        """Create the weights of the two fully connected layers."""
        hidden_units = 1024
        # Flattened backbone output: 512 channels per grid cell
        flattened_size = 512 * self.grid_size * self.grid_size
        self.fc1 = np.random.randn(flattened_size, hidden_units) * 0.01
        self.fc1_bias = np.zeros(hidden_units)
        # One (5 * num_boxes + num_classes) vector per grid cell
        output_size = self.grid_size * self.grid_size * (5 * self.num_boxes + self.num_classes)
        self.fc2 = np.random.randn(hidden_units, output_size) * 0.01
        self.fc2_bias = np.zeros(output_size)

    def forward(self, x):
        """Run the backbone and head; return the raw prediction grid."""
        features = self.backbone.forward(x)[-1]
        batch_size = features.shape[0]
        flattened = features.reshape(batch_size, -1)
        hidden = relu(np.dot(flattened, self.fc1) + self.fc1_bias)
        raw = np.dot(hidden, self.fc2) + self.fc2_bias
        # Last axis: 5 * num_boxes + num_classes
        return raw.reshape(batch_size, self.grid_size, self.grid_size, -1)

    def decode_predictions(self, predictions, conf_threshold=0.5):
        """Turn a raw prediction grid into per-image lists of ``BoundingBox``.

        Only the first box slot of every cell is decoded.  Cells whose
        sigmoid confidence exceeds ``conf_threshold`` produce a box, and
        the boxes of each image are then filtered with
        ``non_max_suppression``.

        Returns:
            A list with one list of boxes per batch item.
        """
        cell_size = self.input_size / self.grid_size
        class_start = 5 * self.num_boxes
        all_detections = []
        for b in range(predictions.shape[0]):
            detections = []
            for row in range(self.grid_size):
                for col in range(self.grid_size):
                    cell_pred = predictions[b, row, col]
                    confidence = sigmoid(cell_pred[0])
                    if not (confidence > conf_threshold):
                        continue
                    # Centre: cell index plus sigmoid offset, in pixels
                    center_x = (col + sigmoid(cell_pred[1])) * cell_size
                    center_y = (row + sigmoid(cell_pred[2])) * cell_size
                    # Size: exponential of the raw value, in cell units
                    half_w = np.exp(cell_pred[3]) * cell_size / 2
                    half_h = np.exp(cell_pred[4]) * cell_size / 2
                    # Most probable class
                    class_idx = np.argmax(cell_pred[class_start:])
                    detections.append(BoundingBox(
                        center_x - half_w, center_y - half_h,
                        center_x + half_w, center_y + half_h,
                        label=str(class_idx),
                        confidence=confidence,
                    ))
            if detections:
                detections = non_max_suppression(detections)
            all_detections.append(detections)
        return all_detections
def sigmoid(x):
    """Numerically stable logistic function ``1 / (1 + exp(-x))``.

    Works on scalars and arrays.  The exponential is only ever taken of a
    non-positive number, so large negative inputs no longer overflow
    ``exp`` (the naive formula does and emits a RuntimeWarning).
    """
    x = np.asarray(x)
    if not np.issubdtype(x.dtype, np.floating):
        x = x.astype(float)
    decay = np.exp(-np.abs(x))  # always in (0, 1]
    result = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () turns a 0-d result back into a scalar and leaves
    # real arrays untouched.
    return result[()]
# Create the detector
detector = YOLODetector()
# Forward pass on a random batch of two 224x224 RGB images
batch_shape = (2, 3, 224, 224)
test_input = np.random.randn(*batch_shape)
predictions = detector.forward(test_input)
print(f"输入形状: {test_input.shape}")
print(f"预测输出形状: {predictions.shape}")
# Decode the raw grid into bounding boxes
detections = detector.decode_predictions(predictions)
print(f"检测到的框数量: {len(detections[0])}")
训练目标检测器
python
class ObjectDetectionTrainer:
    """Demonstration training loop for the detector.

    Only the forward pass and the loss are computed; no parameters are
    updated, so the reported loss does not decrease.

    Args:
        model: object with a ``forward(images)`` method taking
            channels-first ``(N, C, H, W)`` batches.
        learning_rate: stored for a future optimiser step (unused here).
    """

    def __init__(self, model, learning_rate=0.001):
        self.model = model
        self.learning_rate = learning_rate
        self.dataset = ObjectDetectionDataset()

    def compute_loss(self, predictions, targets):
        """Return the mean squared error between predictions and targets."""
        return np.mean((predictions - targets) ** 2)

    def train_step(self, images, targets):
        """Run one forward pass and return its loss (no weight update)."""
        predictions = self.model.forward(images)
        return self.compute_loss(predictions, targets)

    def train(self, epochs=10, batch_size=8):
        """Iterate over synthetic data for ``epochs`` epochs.

        Returns:
            List with the average loss of every epoch.
        """
        train_images, train_labels = self.dataset.generate_synthetic_data(1000)
        print("开始训练...")
        history = []
        for epoch in range(epochs):
            total_loss = 0.0
            num_batches = 0
            for start in range(0, len(train_images), batch_size):
                batch_images = train_images[start:start + batch_size]
                batch_targets = train_labels[start:start + batch_size]
                # The dataset yields channels-last (N, H, W, C) images while
                # the model convolves channels-first (N, C, H, W) arrays.
                # Convert and scale per batch to keep memory use small.
                batch_images = batch_images.transpose(0, 3, 1, 2) / 255.0
                total_loss += self.train_step(batch_images, batch_targets)
                # Count the batches actually processed, including a
                # trailing partial one.
                num_batches += 1
            avg_loss = total_loss / num_batches if num_batches else 0.0
            print(f"Epoch {epoch+1}/{epochs}, Average Loss: {avg_loss:.4f}")
            history.append(avg_loss)
        return history
# Train the detector
trainer = ObjectDetectionTrainer(model=detector)
trainer.train(epochs=5)
可视化检测结果
python
def visualize_detections(image, detections, class_names=None):
    """Draw ``detections`` on top of ``image`` with matplotlib.

    Args:
        image: array accepted by ``plt.imshow``.
        detections: iterable of boxes exposing ``x_min``, ``y_min``,
            ``width``, ``height``, ``label`` and ``confidence``.
        class_names: names used for numeric labels; defaults to
            ``['circle', 'rectangle', 'triangle']``.

    A label that parses as an integer is treated as an index into
    ``class_names`` (falling back to ``"Class <idx>"`` when out of range);
    any other label is shown as text.  The confidence is appended only
    when it is not ``None``.
    """
    if class_names is None:
        class_names = ['circle', 'rectangle', 'triangle']
    plt.figure(figsize=(10, 10))
    plt.imshow(image)
    axes = plt.gca()
    for box in detections:
        # Box outline
        rect = plt.Rectangle((box.x_min, box.y_min),
                             box.width, box.height,
                             fill=False, color='red', linewidth=2)
        axes.add_patch(rect)
        if box.label is None:
            continue
        try:
            class_idx = int(box.label)
        except (TypeError, ValueError):
            # Textual label such as "cat": show it as is
            name = str(box.label)
        else:
            # Reject negative indices explicitly; they would otherwise
            # wrap around to the end of class_names.
            if 0 <= class_idx < len(class_names):
                name = class_names[class_idx]
            else:
                name = f"Class {class_idx}"
        if box.confidence is None:
            label_text = name
        else:
            label_text = f"{name}: {box.confidence:.2f}"
        plt.text(box.x_min, box.y_min - 5, label_text,
                 color='red', fontsize=12,
                 bbox=dict(facecolor='white', alpha=0.7))
    plt.axis('off')
    plt.show()
# Build a test image with one circle and one rectangle
test_image = np.zeros((224, 224, 3), dtype=np.uint8)
cv2.circle(test_image, (100, 100), 30, (255, 0, 0), -1)
cv2.rectangle(test_image, (150, 50), (200, 100), (0, 255, 0), -1)
# Simulated detections (label is the class index as a string)
simulated_detections = [
    BoundingBox(70, 70, 130, 130, label="0", confidence=0.95),   # circle
    BoundingBox(150, 50, 200, 100, label="1", confidence=0.88),  # rectangle
]
# Draw them
visualize_detections(test_image, simulated_detections)
高级目标检测算法简介
R-CNN系列
python
class RCNNConcept:
    """Conceptual sketch of the R-CNN pipeline (proposals + per-region classifier)."""

    def __init__(self):
        self.regions = []  # region proposals (not filled by the methods below)

    def selective_search(self, image):
        """Generate region proposals with a multi-scale sliding window.

        This is a stand-in for real selective search: for every scale and
        aspect ratio a window is slid over the image with a step of a
        quarter of its size, and clipped to the image border.

        Returns:
            List of ``BoundingBox`` proposals.
        """
        h, w = image.shape[:2]
        proposals = []
        scales = [0.5, 0.75, 1.0, 1.25, 1.5]
        aspect_ratios = [0.5, 1.0, 2.0]
        for scale in scales:
            for ratio in aspect_ratios:
                width = int(w * scale)
                height = int(width * ratio)
                if width < 1 or height < 1:
                    # Image too small for this window: nothing to propose
                    continue
                # A quarter-window step rounds down to 0 for windows
                # narrower than 4 px, which range() rejects; use at least 1.
                step_x = max(1, width // 4)
                step_y = max(1, height // 4)
                for x in range(0, w, step_x):
                    for y in range(0, h, step_y):
                        x_end = min(x + width, w)
                        y_end = min(y + height, h)
                        proposals.append(BoundingBox(x, y, x_end, y_end))
        return proposals

    def classify_regions(self, image, regions):
        """Assign a (random, demonstration-only) class to every region.

        Returns:
            List of dicts with keys ``'box'``, ``'class'`` (class index as
            a string) and ``'confidence'``.
        """
        classifications = []
        for region in regions:
            # Slice bounds must be integers even if the box holds floats
            y0, y1 = int(region.y_min), int(region.y_max)
            x0, x1 = int(region.x_min), int(region.x_max)
            roi = image[y0:y1, x0:x1]
            # Resized crop as a real classifier would receive it; unused
            # here because the classification below is random.
            roi_resized = cv2.resize(roi, (224, 224))
            class_probs = np.random.rand(3)
            class_idx = np.argmax(class_probs)
            classifications.append({
                'box': region,
                'class': str(class_idx),
                'confidence': class_probs[class_idx],
            })
        return classifications
# Demonstrate the R-CNN idea
rcnn = RCNNConcept()
regions = rcnn.selective_search(test_image)
print(f"生成了 {len(regions)} 个区域提议")
# Classify only the first few proposals
sample_regions = regions[:10]
classifications = rcnn.classify_regions(test_image, sample_regions)
print("\n分类结果示例:")
for position, result in enumerate(classifications[:5], start=1):
    print(f"区域 {position}: 类别={result['class']}, 置信度={result['confidence']:.4f}")
SSD(Single Shot MultiBox Detector)
python
class SSDConcept:
    """Conceptual sketch of SSD default boxes and box decoding.

    Args:
        image_size: side length (pixels) used to scale decoded boxes.
    """

    def __init__(self, image_size=300):
        self.image_size = image_size
        # (height, width) of each feature map that predicts boxes
        self.feature_maps = [
            (38, 38),  # Conv4_3
            (19, 19),  # Conv7
            (10, 10),  # Conv8_2
            (5, 5),    # Conv9_2
            (3, 3),    # Conv10_2
            (1, 1),    # Conv11_2
        ]
        self.default_boxes = self._generate_default_boxes()

    def _generate_default_boxes(self):
        """Return all default boxes as an ``(N, 4)`` array of ``[cx, cy, w, h]``.

        Coordinates are relative to the image (0-1).  Every feature-map
        cell gets one box per aspect ratio of its level.
        """
        default_boxes = []
        # Box scale and aspect ratios for each feature map level
        scales = [0.1, 0.2, 0.37, 0.54, 0.71, 0.88]
        aspect_ratios = [[1, 2, 1/2], [1, 2, 3, 1/2, 1/3], [1, 2, 3, 1/2, 1/3],
                         [1, 2, 3, 1/2, 1/3], [1, 2, 1/2], [1, 2, 1/2]]
        for k, (h, w) in enumerate(self.feature_maps):
            for i in range(h):
                cy = (i + 0.5) / h
                for j in range(w):
                    cx = (j + 0.5) / w
                    for ratio in aspect_ratios[k]:
                        # width / height == ratio at constant area scale**2
                        w_ratio = np.sqrt(ratio)
                        h_ratio = 1 / w_ratio
                        default_boxes.append(
                            [cx, cy, scales[k] * w_ratio, scales[k] * h_ratio]
                        )
        return np.array(default_boxes)

    def decode_boxes(self, loc_preds, variances=(0.1, 0.2)):
        """Convert predicted offsets into pixel corner coordinates.

        Args:
            loc_preds: array of shape ``(N, num_default_boxes, 4)`` holding
                ``(cx, cy, w, h)`` offsets relative to the default boxes.
            variances: ``(centre, size)`` scaling factors.  The default is
                a tuple so it cannot be mutated between calls.

        Returns:
            List with one ``(num_default_boxes, 4)`` array of
            ``[x_min, y_min, x_max, y_max]`` per batch item.
        """
        anchors = self.default_boxes
        decoded_boxes = []
        for pred in loc_preds:
            cx = pred[:, 0] * anchors[:, 2] * variances[0] + anchors[:, 0]
            cy = pred[:, 1] * anchors[:, 3] * variances[0] + anchors[:, 1]
            w = anchors[:, 2] * np.exp(pred[:, 2] * variances[1])
            h = anchors[:, 3] * np.exp(pred[:, 3] * variances[1])
            # Centre/size -> corners, scaled to pixels
            x_min = (cx - w / 2) * self.image_size
            y_min = (cy - h / 2) * self.image_size
            x_max = (cx + w / 2) * self.image_size
            y_max = (cy + h / 2) * self.image_size
            decoded_boxes.append(np.stack([x_min, y_min, x_max, y_max], axis=1))
        return decoded_boxes
# Demonstrate the SSD idea
ssd = SSDConcept()
print(f"生成的默认框数量: {len(ssd.default_boxes)}")
# Simulated location predictions for a batch of one image
num_default_boxes = len(ssd.default_boxes)
loc_preds = np.random.randn(1, num_default_boxes, 4) * 0.1
decoded = ssd.decode_boxes(loc_preds)
print(f"解码后的边界框形状: {decoded[0].shape}")
实际应用示例
视频中的目标检测
python
def detect_objects_in_video(video_path, output_path, detector):
    """Run ``detector`` on every frame of a video and write an annotated copy.

    Args:
        video_path: path of the input video.
        output_path: path of the annotated output video (mp4v codec).
        detector: object exposing ``input_size``, ``forward`` and
            ``decode_predictions``.

    Raises:
        IOError: if the input video cannot be opened.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        cap.release()
        raise IOError(f"Cannot open video: {video_path}")

    out = None
    try:
        # Keep the frame rate as reported: truncating e.g. 29.97 to 29
        # would change the playback speed of the output.
        fps = cap.get(cv2.CAP_PROP_FPS)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

        # Factors mapping detector coordinates back to the original frame
        scale_x = width / detector.input_size
        scale_y = height / detector.input_size

        frame_count = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            # Resize, convert BGR -> RGB and scale to [0, 1]
            frame_resized = cv2.resize(frame, (detector.input_size, detector.input_size))
            frame_rgb = cv2.cvtColor(frame_resized, cv2.COLOR_BGR2RGB)
            frame_normalized = frame_rgb / 255.0
            # (H, W, C) -> (1, C, H, W)
            input_tensor = frame_normalized[np.newaxis, ...].transpose(0, 3, 1, 2)

            predictions = detector.forward(input_tensor)
            detections = detector.decode_predictions(predictions)[0]

            # Draw the detections on the original-size frame
            for box in detections:
                x_min = int(box.x_min * scale_x)
                y_min = int(box.y_min * scale_y)
                x_max = int(box.x_max * scale_x)
                y_max = int(box.y_max * scale_y)
                cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
                label = f"Class {box.label}: {box.confidence:.2f}"
                cv2.putText(frame, label, (x_min, y_min - 10),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

            out.write(frame)
            frame_count += 1
            if frame_count % 30 == 0:
                print(f"已处理 {frame_count} 帧")
    finally:
        # Release the capture and writer even if detection fails mid-way
        cap.release()
        if out is not None:
            out.release()
    print("视频处理完成!")
# 使用示例(需要实际的视频文件)
# detect_objects_in_video('input_video.mp4', 'output_video.mp4', detector)
性能评估
mAP(Mean Average Precision)计算
python
class DetectionEvaluator:
    """Computes AP per class and mAP for detection results.

    Args:
        iou_threshold: minimum IoU for a prediction to match a ground truth.
    """

    def __init__(self, iou_threshold=0.5):
        self.iou_threshold = iou_threshold

    def calculate_ap(self, predictions, ground_truths, class_name):
        """Return the average precision of one class.

        Predictions are processed in order of descending confidence.
        Each one is matched against the ground truth of ``class_name``
        with the highest IoU; it is a true positive if that IoU reaches
        the threshold and the ground truth has not been claimed yet,
        otherwise a false positive.

        The matching state is kept locally, so the ground-truth objects
        are not modified and repeated evaluations give the same result.
        """
        ranked = sorted(predictions, key=lambda x: x.confidence, reverse=True)
        # Only ground truths of the requested class can be matched or
        # count towards recall.
        candidates = [gt for gt in ground_truths if gt.label == class_name]
        num_gt = len(candidates)
        claimed = set()  # indices into candidates that are already matched

        tp = np.zeros(len(ranked))
        fp = np.zeros(len(ranked))
        for i, pred in enumerate(ranked):
            max_iou = 0
            best_gt_idx = -1
            for j, gt in enumerate(candidates):
                iou = calculate_iou(pred, gt)
                if iou > max_iou:
                    max_iou = iou
                    best_gt_idx = j
            matched = (
                best_gt_idx >= 0
                and max_iou >= self.iou_threshold
                and best_gt_idx not in claimed
            )
            if matched:
                tp[i] = 1
                claimed.add(best_gt_idx)
            else:
                fp[i] = 1

        # Cumulative precision / recall (epsilon avoids division by zero)
        fp_cumsum = np.cumsum(fp)
        tp_cumsum = np.cumsum(tp)
        precisions = tp_cumsum / (tp_cumsum + fp_cumsum + 1e-10)
        recalls = tp_cumsum / (num_gt + 1e-10)
        return self._calculate_ap_voc(precisions, recalls)

    def _calculate_ap_voc(self, precisions, recalls):
        """Return the 11-point interpolated AP (VOC 2007 style).

        For each recall level 0, 0.1, ..., 1 the maximum precision among
        points with at least that recall is taken (0 if there is none),
        and the eleven values are averaged.
        """
        ap = 0
        for level in np.linspace(0, 1, 11):
            mask = recalls >= level
            if np.any(mask):
                ap += np.max(precisions[mask]) / 11
        return ap

    def calculate_map(self, predictions_list, ground_truths_list, class_names):
        """Return the mean AP over all classes that have ground truth.

        Args:
            predictions_list: one list of predicted boxes per image.
            ground_truths_list: one list of ground-truth boxes per image.
            class_names: classes to evaluate.

        Per-class AP values and the final mAP are printed.  Boxes of all
        images are pooled per class before matching.
        """
        aps = []
        for class_name in class_names:
            class_predictions = []
            class_ground_truths = []
            for predictions, ground_truths in zip(predictions_list, ground_truths_list):
                class_predictions.extend(p for p in predictions if p.label == class_name)
                class_ground_truths.extend(g for g in ground_truths if g.label == class_name)
            # Classes without any ground truth are skipped
            if class_ground_truths:
                ap = self.calculate_ap(class_predictions, class_ground_truths, class_name)
                aps.append(ap)
                print(f"类别 {class_name}: AP = {ap:.4f}")
        map_score = np.mean(aps) if aps else 0
        print(f"mAP: {map_score:.4f}")
        return map_score
# Simulated evaluation
evaluator = DetectionEvaluator()
# Simulated predictions and ground-truth annotations for one image
predictions = [
    BoundingBox(10, 10, 50, 50, label="cat", confidence=0.9),
    BoundingBox(100, 100, 150, 150, label="dog", confidence=0.8),
    BoundingBox(200, 200, 250, 250, label="cat", confidence=0.7),
]
ground_truths = [
    BoundingBox(12, 12, 52, 52, label="cat"),
    BoundingBox(102, 102, 152, 152, label="dog"),
    BoundingBox(200, 200, 250, 250, label="cat"),
    BoundingBox(300, 300, 350, 350, label="bird"),
]
# Compute mAP over the three classes
class_names = ["cat", "dog", "bird"]
map_score = evaluator.calculate_map(
    predictions_list=[predictions],
    ground_truths_list=[ground_truths],
    class_names=class_names,
)
总结
本文深入探讨了计算机视觉中的卷积神经网络和目标检测技术,涵盖了:
- 基础概念:图像表示、预处理等基础知识
- CNN核心组件:卷积、池化、激活函数的原理和实现
- 目标检测基础:边界框表示、IoU计算、NMS等关键技术
- 实战项目:从零实现一个简化的YOLO风格检测器
- 高级算法:R-CNN系列、SSD等先进算法的概念
- 实际应用:视频目标检测、性能评估等
计算机视觉是一个快速发展的领域,新的算法和技术不断涌现。从传统的手工特征到深度学习的端到端学习,目标检测技术已经取得了巨大进步。掌握CNN的原理和目标检测的核心技术,对于深入理解计算机视觉至关重要。
学习建议
- 深入研究ResNet、DenseNet等经典架构
- 学习实例分割(Mask R-CNN)技术
- 探索目标跟踪算法
- 了解3D目标检测和点云处理
- 实践实际项目,参与Kaggle等竞赛