引言
计算机视觉是人工智能领域最具活力的方向之一,从图像分类、目标检测到语义分割,各种视觉任务都需要大量高效的图像处理算子支持。CANN开源社区推出的ops-cv是一个专门面向计算机视觉应用的高性能算子库,提供了丰富的图像处理、目标检测相关算子实现,为CV应用在NPU上的加速提供了坚实基础。
相关链接:
- CANN组织链接: https://atomgit.com/cann
- ops-cv仓库链接: https://atomgit.com/cann/ops-cv
一、ops-cv项目概述
1.1 项目简介
ops-cv是CANN开源社区提供的图像处理和目标检测相关算子库,实现了网络在NPU上的高效加速计算。该算子库涵盖了从基础的图像变换到复杂的目标检测后处理等各类计算机视觉算子,是构建高性能CV应用的重要组件。
1.2 核心特性
| 特性 | 说明 |
|---|---|
| 图像处理算子 | 提供缩放、裁剪、翻转、旋转等基础图像变换 |
| 目标检测算子 | 包含NMS、IoU计算、Anchor生成等检测相关算子 |
| 高性能优化 | 针对NPU硬件特性进行深度优化 |
| 多格式支持 | 支持常见的图像格式和数据布局 |
| 后处理优化 | 提供检测框解码、结果聚合等后处理算子 |
1.3 算子分类
ops-cv/
├── 图像变换 (Image Transform)
│ ├── resize # 图像缩放
│ ├── crop # 图像裁剪
│ ├── flip # 图像翻转
│ ├── rotate # 图像旋转
│ ├── warp_affine # 仿射变换
│ └── warp_perspective # 透视变换
├── 颜色空间转换 (Color Space)
│ ├── rgb_to_grayscale # RGB转灰度图
│ ├── bgr_to_rgb # BGR转RGB
│ ├── rgb_to_hsv # RGB转HSV
│ └── rgb_to_yuv # RGB转YUV
├── 目标检测 (Object Detection)
│ ├── nms # 非极大值抑制
│ ├── iou # IoU计算
│ ├── giou # GIoU计算
│ ├── diou # DIoU计算
│ ├── ciou # CIoU计算
│ ├── roi_align # RoI对齐
│ ├── roi_pool # RoI池化
│ └── generate_anchors # Anchor生成
├── 图像增强 (Image Enhancement)
│ ├── brightness_adjust # 亮度调整
│ ├── contrast_adjust # 对比度调整
│ ├── saturation_adjust # 饱和度调整
│ └── histogram_equalization # 直方图均衡化
└── 后处理 (Post Processing)
  ├── decode_boxes # 检测框解码
  ├── top_k # Top-K筛选
  ├── gather # 数据聚合
  └── transpose # 数据转置
二、基础图像变换算子
2.1 图像缩放(Resize)
图像缩放是计算机视觉中最基础的操作之一,ops-cv提供了多种高质量缩放算法。
cpp
#include "ops_cv/image_transform.hpp"
using namespace ops::cv;
// 图像缩放示例
void resize_example() {
// 输入图像: [batch, channels, height, width]
Tensor input = Tensor({1, 3, 480, 640});
// 缩放参数
ResizeParams params;
params.output_size = {224, 224}; // 目标尺寸 [height, width]
params.mode = ResizeMode::BILINEAR; // 双线性插值
params.align_corners = false; // 不对齐角点
params.half_pixel_centers = true; // 使用半像素中心
// 执行缩放
Tensor output = resize(input, params);
// 输出形状: [1, 3, 224, 224]
std::cout << "缩放后图像形状: " << output.shape() << std::endl;
}
// Interpolation modes that can be selected for resize.
enum class ResizeMode {
NEAREST, // nearest-neighbor interpolation
BILINEAR, // bilinear interpolation
BICUBIC, // bicubic interpolation
AREA // area interpolation (suited to downscaling)
};
// 自定义缩放比例
void resize_with_scale_example() {
Tensor input = Tensor({1, 3, 480, 640});
ResizeParams params;
params.scale_factor = {0.5f, 0.5f}; // 高度和宽度都缩放到50%
params.mode = ResizeMode::BILINEAR;
Tensor output = resize(input, params);
// 输出形状: [1, 3, 240, 320]
std::cout << "按比例缩放后形状: " << output.shape() << std::endl;
}
2.2 图像翻转与旋转
cpp
// 图像翻转示例
void flip_example() {
Tensor input = Tensor({1, 3, 224, 224});
// 水平翻转
FlipParams flip_params;
flip_params.flip_horizontal = true;
flip_params.flip_vertical = false;
Tensor h_flipped = flip(input, flip_params);
// 垂直翻转
flip_params.flip_horizontal = false;
flip_params.flip_vertical = true;
Tensor v_flipped = flip(input, flip_params);
// 同时水平和垂直翻转
flip_params.flip_horizontal = true;
flip_params.flip_vertical = true;
Tensor hv_flipped = flip(input, flip_params);
}
// 图像旋转示例
void rotate_example() {
Tensor input = Tensor({1, 3, 224, 224});
RotateParams params;
params.angle = 45.0f; // 旋转角度(度)
params.expand = false; // 不扩展图像尺寸
params.center = {112, 112}; // 旋转中心
params.fill_value = 0.0f; // 填充值
Tensor output = rotate(input, params);
}
2.3 仿射变换
cpp
// 仿射变换示例
void warp_affine_example() {
Tensor input = Tensor({1, 3, 480, 640});
// 定义仿射变换矩阵 [2x3]
// | a b tx |
// | c d ty |
std::vector<float> transform_matrix = {
0.8f, 0.0f, 10.0f, // 水平缩放到80%,右移10像素
0.0f, 0.8f, 20.0f // 垂直缩放到80%,下移20像素
};
WarpAffineParams params;
params.matrix = transform_matrix;
params.dsize = {400, 400}; // 输出尺寸
params.flags = InterpolationMode::LINEAR;
params.border_mode = BorderMode::CONSTANT;
params.border_value = 0.0f;
Tensor output = warp_affine(input, params);
std::cout << "仿射变换后形状: " << output.shape() << std::endl;
}
三、目标检测算子
3.1 非极大值抑制(NMS)
NMS是目标检测中的关键算子:按置信度从高到低依次保留检测框,并抑制与已保留框的IoU超过阈值的冗余检测框。
cpp
#include "ops_cv/object_detection.hpp"
// NMS实现
std::vector<int> nms(
const std::vector<Box>& boxes,
const std::vector<float>& scores,
float iou_threshold
) {
// 按分数降序排序
std::vector<int> indices(boxes.size());
std::iota(indices.begin(), indices.end(), 0);
std::sort(indices.begin(), indices.end(),
[&scores](int a, int b) { return scores[a] > scores[b]; });
std::vector<bool> suppressed(boxes.size(), false);
std::vector<int> keep;
for (size_t i = 0; i < indices.size(); ++i) {
int idx = indices[i];
if (suppressed[idx]) continue;
keep.push_back(idx);
const Box& current_box = boxes[idx];
// 计算与剩余框的IoU,抑制高IoU的框
for (size_t j = i + 1; j < indices.size(); ++j) {
int other_idx = indices[j];
if (suppressed[other_idx]) continue;
float iou = compute_iou(current_box, boxes[other_idx]);
if (iou > iou_threshold) {
suppressed[other_idx] = true;
}
}
}
return keep;
}
// Usage example: run NMS over 100 randomly generated boxes and print how many survive.
void nms_example() {
    constexpr int kNumBoxes = 100;
    std::vector<Box> boxes(kNumBoxes);
    std::vector<float> scores(kNumBoxes);

    for (int i = 0; i < kNumBoxes; ++i) {
        // Box stores corner coordinates (x1, y1, x2, y2), so the bottom-right
        // corner is derived from a random origin plus a random extent. Passing
        // the extent directly as x2/y2 would make most boxes degenerate.
        // The explicit casts avoid int -> float narrowing in the braced init.
        const float x1 = static_cast<float>(rand() % 400);
        const float y1 = static_cast<float>(rand() % 400);
        const float box_w = static_cast<float>(rand() % 100 + 50);
        const float box_h = static_cast<float>(rand() % 100 + 50);
        boxes[i] = Box{x1, y1, x1 + box_w, y1 + box_h};
        scores[i] = static_cast<float>(rand()) / RAND_MAX;
    }

    const float iou_threshold = 0.5f;
    std::vector<int> keep = nms(boxes, scores, iou_threshold);
    std::cout << "NMS后保留的框数量: " << keep.size() << std::endl;
}
3.2 IoU及其变体计算
cpp
// Axis-aligned box described by its top-left (x1, y1) and bottom-right (x2, y2)
// corners, with the helpers needed for IoU-style computations.
struct Box {
    float x1, y1, x2, y2;

    // Area of the box; a negative width or height is treated as zero.
    float area() const {
        const float width = std::max(0.0f, x2 - x1);
        const float height = std::max(0.0f, y2 - y1);
        return width * height;
    }

    // Overlap region of this box and `other`. If the boxes do not overlap the
    // result has x2 < x1 or y2 < y1, which area() reports as zero.
    Box intersection(const Box& other) const {
        Box overlap;
        overlap.x1 = std::max(x1, other.x1);
        overlap.y1 = std::max(y1, other.y1);
        overlap.x2 = std::min(x2, other.x2);
        overlap.y2 = std::min(y2, other.y2);
        return overlap;
    }

    // Smallest box that encloses both this box and `other`.
    Box union_box(const Box& other) const {
        Box hull;
        hull.x1 = std::min(x1, other.x1);
        hull.y1 = std::min(y1, other.y1);
        hull.x2 = std::max(x2, other.x2);
        hull.y2 = std::max(y2, other.y2);
        return hull;
    }
};
float compute_iou(const Box& a, const Box& b) {
Box inter = a.intersection(b);
float inter_area = inter.area();
Box u = a.union_box(b);
float union_area = a.area() + b.area() - inter_area;
return union_area > 0 ? inter_area / union_area : 0.0f;
}
// GIoU(Generalized IoU)计算
float compute_giou(const Box& a, const Box& b) {
float iou = compute_iou(a, b);
// 计算最小包围框
Box enclose = a.union_box(b);
float enclose_area = enclose.area();
// GIoU = IoU - (enclose_area - union_area) / enclose_area
float union_area = a.area() + b.area() - a.intersection(b).area();
float giou = iou - (enclose_area - union_area) / enclose_area;
return giou;
}
// DIoU(Distance IoU)计算
float compute_diou(const Box& a, const Box& b) {
float iou = compute_iou(a, b);
// 计算中心点距离
float a_cx = (a.x1 + a.x2) / 2.0f;
float a_cy = (a.y1 + a.y2) / 2.0f;
float b_cx = (b.x1 + b.x2) / 2.0f;
float b_cy = (b.y1 + b.y2) / 2.0f;
float center_dist_sq = (a_cx - b_cx) * (a_cx - b_cx) +
(a_cy - b_cy) * (a_cy - b_cy);
// 计算对角线距离
Box enclose = a.union_box(b);
float enclose_cx = (enclose.x1 + enclose.x2) / 2.0f;
float enclose_cy = (enclose.y1 + enclose.y2) / 2.0f;
float enclose_diag_sq = (enclose_cx - a_cx) * (enclose_cx - a_cx) * 4 +
(enclose_cy - a_cy) * (enclose_cy - a_cy) * 4;
// DIoU = IoU - center_dist_sq / enclose_diag_sq
float diou = iou - center_dist_sq / (enclose_diag_sq + 1e-7f);
return diou;
}
// Complete IoU:
//   CIoU = DIoU - alpha * v
//   v     = (4 / pi^2) * (atan(w_a / h_a) - atan(w_b / h_b))^2
//   alpha = v / (1 - IoU + v)
// v measures how much the two aspect ratios differ.
float compute_ciou(const Box& a, const Box& b) {
    // Local constant instead of M_PI, which is not part of standard C++.
    constexpr float kPi = 3.14159265358979323846f;

    const float iou = compute_iou(a, b);
    const float diou = compute_diou(a, b);

    const float a_w = a.x2 - a.x1;
    const float a_h = a.y2 - a.y1;
    const float b_w = b.x2 - b.x1;
    const float b_h = b.y2 - b.y1;

    // atan2(w, h) equals atan(w / h) for positive heights and stays finite
    // when a height is zero, where w / h would be inf or NaN.
    const float atan_diff = std::atan2(a_w, a_h) - std::atan2(b_w, b_h);
    const float v = (4.0f / (kPi * kPi)) * atan_diff * atan_diff;

    // Epsilon guards the denominator when IoU is 1 and v is tiny.
    const float alpha = v > 0.0f ? v / (1.0f - iou + v + 1e-7f) : 0.0f;

    return diou - alpha * v;
}
3.3 RoI对齐(RoI Align)
cpp
// RoI Align: pools each region of interest into an output_size x output_size
// grid by averaging bilinearly interpolated samples taken inside every bin.
//
// feature_map    input features, indexed as [batch, channels, height, width]
// rois           regions of interest in input-image coordinates
// output_size    side length of the pooled output per RoI
// spatial_scale  factor mapping RoI coordinates onto the feature map
// sampling_ratio samples per bin along each axis; when <= 0 an adaptive count
//                of ceil(bin size) per axis (at least 1) is used
//
// Returns a tensor of shape [num_rois, channels, output_size, output_size].
//
// NOTE(review): Box carries no batch index, so every RoI reads from batch
// element 0 of feature_map. Confirm callers only pass single-image batches.
Tensor roi_align(
    const Tensor& feature_map,
    const std::vector<Box>& rois,
    int output_size,
    float spatial_scale,
    int sampling_ratio
) {
    const int channels = feature_map.shape(1);
    const int height = feature_map.shape(2);
    const int width = feature_map.shape(3);
    const int num_rois = static_cast<int>(rois.size());

    Tensor output = Tensor({num_rois, channels, output_size, output_size});

    // Bilinear interpolation of channel c at the (already clamped) point (y, x).
    const auto bilinear = [&](int c, float y, float x) -> float {
        const int y0 = static_cast<int>(std::floor(y));
        const int y1 = std::min(y0 + 1, height - 1);
        const int x0 = static_cast<int>(std::floor(x));
        const int x1 = std::min(x0 + 1, width - 1);
        const float y_lerp = y - y0;
        const float x_lerp = x - x0;

        const float v00 = feature_map[{0, c, y0, x0}];
        const float v01 = feature_map[{0, c, y0, x1}];
        const float v10 = feature_map[{0, c, y1, x0}];
        const float v11 = feature_map[{0, c, y1, x1}];

        const float top = v00 * (1 - x_lerp) + v01 * x_lerp;
        const float bottom = v10 * (1 - x_lerp) + v11 * x_lerp;
        return top * (1 - y_lerp) + bottom * y_lerp;
    };

    std::vector<float> sample_ys;
    std::vector<float> sample_xs;

    for (int roi_idx = 0; roi_idx < num_rois; ++roi_idx) {
        const Box& roi = rois[roi_idx];

        // RoI extent on the feature map, forced to be at least 1x1.
        const float roi_start_w = roi.x1 * spatial_scale;
        const float roi_start_h = roi.y1 * spatial_scale;
        const float roi_end_w = roi.x2 * spatial_scale;
        const float roi_end_h = roi.y2 * spatial_scale;
        const float roi_width = std::max(roi_end_w - roi_start_w, 1.0f);
        const float roi_height = std::max(roi_end_h - roi_start_h, 1.0f);
        const float bin_size_h = roi_height / output_size;
        const float bin_size_w = roi_width / output_size;

        // Samples per bin along each axis. A non-positive sampling_ratio
        // selects an adaptive count so the average below never divides by zero.
        const int grid_h = sampling_ratio > 0
            ? sampling_ratio
            : std::max(1, static_cast<int>(std::ceil(bin_size_h)));
        const int grid_w = sampling_ratio > 0
            ? sampling_ratio
            : std::max(1, static_cast<int>(std::ceil(bin_size_w)));
        const float sample_count = static_cast<float>(grid_h) * static_cast<float>(grid_w);

        for (int ph = 0; ph < output_size; ++ph) {
            // Sample rows for this bin, clamped to the feature map.
            sample_ys.assign(grid_h, 0.0f);
            for (int iy = 0; iy < grid_h; ++iy) {
                const float y = roi_start_h + ph * bin_size_h +
                                (iy + 0.5f) * bin_size_h / grid_h;
                sample_ys[iy] = std::min(std::max(y, 0.0f), height - 1.0f);
            }

            for (int pw = 0; pw < output_size; ++pw) {
                // Sample columns for this bin, clamped to the feature map.
                sample_xs.assign(grid_w, 0.0f);
                for (int ix = 0; ix < grid_w; ++ix) {
                    const float x = roi_start_w + pw * bin_size_w +
                                    (ix + 0.5f) * bin_size_w / grid_w;
                    sample_xs[ix] = std::min(std::max(x, 0.0f), width - 1.0f);
                }

                // Average over the full grid_h x grid_w sample grid, not just
                // the points on the bin diagonal.
                for (int c = 0; c < channels; ++c) {
                    float bin_sum = 0.0f;
                    for (const float y : sample_ys) {
                        for (const float x : sample_xs) {
                            bin_sum += bilinear(c, y, x);
                        }
                    }
                    output[{roi_idx, c, ph, pw}] = bin_sum / sample_count;
                }
            }
        }
    }
    return output;
}
四、图像增强算子
4.1 颜色抖动(Color Jitter)
cpp
// Strength settings for color-jitter augmentation. Every field defaults to 0.
struct ColorJitterParams {
    float brightness{0.0f};  // brightness adjustment range [-brightness, brightness]
    float contrast{0.0f};    // contrast adjustment range [-contrast, contrast]
    float saturation{0.0f};  // saturation adjustment range [-saturation, saturation]
    float hue{0.0f};         // hue adjustment range [-hue, hue]
};
// Applies random brightness, contrast, saturation and hue jitter to an RGB
// tensor indexed as [batch, channel, height, width], with channels 0/1/2 read
// as R/G/B. One set of random factors is drawn per batch element with rand().
// Results are clamped to [0, 1].
//
// Each step is an identity when its factor is neutral, so default-constructed
// params only clamp the input:
//   brightness  c *= f
//   contrast    c = (c - 0.5) * f + 0.5
//   saturation  c = gray + (c - gray) * f, gray being the Rec.601 luma
//   hue         rotation by hue_factor * pi around the gray axis (YIQ-based)
//
// An input with fewer than 3 channels is returned as an unmodified copy.
Tensor color_jitter(const Tensor& input, const ColorJitterParams& params) {
    // Local constant instead of M_PI, which is not part of standard C++.
    constexpr float kPi = 3.14159265358979323846f;

    Tensor output = input.clone();
    const int batch = input.shape(0);
    const int channels = input.shape(1);
    const int height = input.shape(2);
    const int width = input.shape(3);

    // The per-pixel code reads channels 0..2; bail out rather than index past
    // the channel dimension.
    if (channels < 3) {
        return output;
    }

    for (int b = 0; b < batch; ++b) {
        // Random factors for this image. (rand() % 200 - 100) / 100 lies in
        // [-1.0, 0.99].
        const float brightness_factor = 1.0f + (rand() % 200 - 100) / 100.0f * params.brightness;
        const float contrast_factor = 1.0f + (rand() % 200 - 100) / 100.0f * params.contrast;
        const float saturation_factor = 1.0f + (rand() % 200 - 100) / 100.0f * params.saturation;
        const float hue_factor = (rand() % 200 - 100) / 100.0f * params.hue;

        // The hue rotation terms depend only on the per-image factor.
        const bool rotate_hue = hue_factor != 0;
        const float cos_h = rotate_hue ? std::cos(hue_factor * kPi) : 1.0f;
        const float sin_h = rotate_hue ? std::sin(hue_factor * kPi) : 0.0f;

        for (int h = 0; h < height; ++h) {
            for (int w = 0; w < width; ++w) {
                float r = input[{b, 0, h, w}];
                float g = input[{b, 1, h, w}];
                float b_val = input[{b, 2, h, w}];

                // Brightness.
                r *= brightness_factor;
                g *= brightness_factor;
                b_val *= brightness_factor;

                // Contrast around mid-gray.
                r = (r - 0.5f) * contrast_factor + 0.5f;
                g = (g - 0.5f) * contrast_factor + 0.5f;
                b_val = (b_val - 0.5f) * contrast_factor + 0.5f;

                // Saturation: blend each channel with the pixel's luma. This
                // has no division, so it cannot blow up for fully saturated
                // pixels, and a factor of 1 leaves the pixel unchanged.
                const float gray = 0.299f * r + 0.587f * g + 0.114f * b_val;
                r = gray + (r - gray) * saturation_factor;
                g = gray + (g - gray) * saturation_factor;
                b_val = gray + (b_val - gray) * saturation_factor;

                // Hue rotation. With cos_h = 1 and sin_h = 0 every row reduces
                // to (approximately) the identity.
                if (rotate_hue) {
                    const float r_new =
                        (0.299f + 0.701f * cos_h + 0.168f * sin_h) * r +
                        (0.587f - 0.587f * cos_h + 0.330f * sin_h) * g +
                        (0.114f - 0.114f * cos_h - 0.497f * sin_h) * b_val;
                    const float g_new =
                        (0.299f - 0.299f * cos_h - 0.328f * sin_h) * r +
                        (0.587f + 0.413f * cos_h + 0.035f * sin_h) * g +
                        (0.114f - 0.114f * cos_h + 0.292f * sin_h) * b_val;
                    const float b_new =
                        (0.299f - 0.300f * cos_h + 1.250f * sin_h) * r +
                        (0.587f - 0.588f * cos_h - 1.050f * sin_h) * g +
                        (0.114f + 0.886f * cos_h - 0.203f * sin_h) * b_val;
                    r = r_new;
                    g = g_new;
                    b_val = b_new;
                }

                // Clamp to [0, 1].
                output[{b, 0, h, w}] = std::max(0.0f, std::min(1.0f, r));
                output[{b, 1, h, w}] = std::max(0.0f, std::min(1.0f, g));
                output[{b, 2, h, w}] = std::max(0.0f, std::min(1.0f, b_val));
            }
        }
    }
    return output;
}
4.2 随机裁剪(Random Crop)
cpp
// Random crop: copies a randomly positioned crop_height x crop_width window
// out of every image/channel of `input`, indexed as [batch, channels, height, width].
// The requested size is capped at the input size, and the same window position
// (drawn with rand()) is used for the whole batch.
Tensor random_crop(
    const Tensor& input,
    int crop_height,
    int crop_width
) {
    int n_batch = input.shape(0);
    int n_channels = input.shape(1);
    int src_height = input.shape(2);
    int src_width = input.shape(3);

    // Cap the crop at the source dimensions.
    if (src_height < crop_height) {
        crop_height = src_height;
    }
    if (src_width < crop_width) {
        crop_width = src_width;
    }

    // Top-left corner of the window; row offset is drawn first, then column.
    int top = rand() % (src_height - crop_height + 1);
    int left = rand() % (src_width - crop_width + 1);

    Tensor output = Tensor({n_batch, n_channels, crop_height, crop_width});

    for (int n = 0; n < n_batch; ++n) {
        for (int ch = 0; ch < n_channels; ++ch) {
            for (int row = 0; row < crop_height; ++row) {
                int src_row = row + top;
                for (int col = 0; col < crop_width; ++col) {
                    output[{n, ch, row, col}] = input[{n, ch, src_row, col + left}];
                }
            }
        }
    }
    return output;
}
五、完整目标检测流程示例
5.1 YOLO检测框解码
cpp
// Parameters for decoding anchor-based YOLO predictions.
// The anchors are listed scale by scale, three (width, height) pairs per
// scale, and strides holds one entry per scale, so anchors.size() is a
// multiple of strides.size().
struct YOLODecodeParams {
    int num_classes = 80;
    std::vector<std::pair<int, int>> anchors = {
        {10, 13}, {16, 30}, {33, 23},       // P3/8
        {30, 61}, {62, 45}, {59, 119},      // P4/16
        {116, 90}, {156, 198}, {373, 326}   // P5/32
    };
    std::vector<int> strides = {8, 16, 32};
};
// One decoded detection: a box plus its best class and that class's score.
struct Detection {
Box box; // bounding box as corner coordinates (x1, y1, x2, y2)
int class_id; // index of the highest-scoring class
float score; // score of that class
};
// Decodes one anchor-based YOLO prediction map into detections.
//
// predictions   indexed as [1, num_anchors * (5 + num_classes), grid_h, grid_w];
//               per anchor the channels are x, y, w, h, objectness, then one
//               logit per class
// input_height  height of the image the boxes should be expressed in
// input_width   width of the image the boxes should be expressed in
// params        class count, anchors and strides
//
// For each anchor and grid cell the box is decoded in network-input pixels
// (grid offset * stride, anchor size * exp), rescaled to the given image size,
// and kept when objectness * best class probability exceeds 0.25.
// Returns an empty list when params has no anchors or no strides.
std::vector<Detection> yolox_decode(
    const Tensor& predictions, // [1, num_anchors * (5 + num_classes), grid_h, grid_w]
    int input_height,
    int input_width,
    const YOLODecodeParams& params
) {
    constexpr float kScoreThreshold = 0.25f;

    std::vector<Detection> detections;
    // Without anchors or strides nothing can be decoded, and the index
    // arithmetic below would divide by zero.
    if (params.anchors.empty() || params.strides.empty()) {
        return detections;
    }

    const int num_classes = params.num_classes;
    const int channels_per_anchor = 5 + num_classes;
    const int grid_h = predictions.shape(2);
    const int grid_w = predictions.shape(3);
    const int num_anchors = predictions.shape(1) / channels_per_anchor;

    const size_t anchor_count = params.anchors.size();
    const size_t stride_count = params.strides.size();
    // Anchors are grouped per stride; never let the group size drop to zero.
    const size_t anchors_per_stride = std::max<size_t>(1, anchor_count / stride_count);

    for (int a = 0; a < num_anchors; ++a) {
        const size_t anchor_idx = static_cast<size_t>(a) % anchor_count;
        // Clamp so a leftover anchor group can never index past strides.
        const size_t stride_idx = std::min(anchor_idx / anchors_per_stride, stride_count - 1);
        const int stride = params.strides[stride_idx];
        const auto [anchor_w, anchor_h] = params.anchors[anchor_idx];
        const int base = a * channels_per_anchor;

        // Decoded boxes are in network-input pixels, where the network input
        // spans grid * stride pixels per axis. Rescale them once, in floating
        // point, to the requested image size.
        const float scale_x = static_cast<float>(input_width) / (grid_w * stride);
        const float scale_y = static_cast<float>(input_height) / (grid_h * stride);

        for (int gh = 0; gh < grid_h; ++gh) {
            for (int gw = 0; gw < grid_w; ++gw) {
                const float x = predictions[{0, base + 0, gh, gw}];
                const float y = predictions[{0, base + 1, gh, gw}];
                const float w = predictions[{0, base + 2, gh, gw}];
                const float h = predictions[{0, base + 3, gh, gw}];
                // Objectness is a logit like the class scores, so it goes
                // through the same sigmoid before the two are multiplied.
                const float objectness = sigmoid(predictions[{0, base + 4, gh, gw}]);

                // Box center and size in network-input pixels.
                // NOTE(review): the extra 0.5 cell offset is kept from the
                // original; confirm it matches the model's box encoding.
                const float cx = (gw + 0.5f + sigmoid(x)) * stride;
                const float cy = (gh + 0.5f + sigmoid(y)) * stride;
                const float bw = std::exp(w) * anchor_w;
                const float bh = std::exp(h) * anchor_h;

                const float x1 = (cx - bw / 2) * scale_x;
                const float y1 = (cy - bh / 2) * scale_y;
                const float x2 = (cx + bw / 2) * scale_x;
                const float y2 = (cy + bh / 2) * scale_y;

                // Best class by objectness-weighted score.
                float max_score = 0;
                int max_class = 0;
                for (int c = 0; c < num_classes; ++c) {
                    const float class_score = sigmoid(predictions[{0, base + 5 + c, gh, gw}]);
                    const float score = class_score * objectness;
                    if (score > max_score) {
                        max_score = score;
                        max_class = c;
                    }
                }

                // Drop low-confidence detections.
                if (max_score > kScoreThreshold) {
                    detections.push_back({
                        {x1, y1, x2, y2},
                        max_class,
                        max_score
                    });
                }
            }
        }
    }
    return detections;
}
5.2 完整检测流程
cpp
// End-to-end detection pipeline: preprocess -> inference -> box decoding ->
// per-class NMS.
class ObjectDetectionPipeline {
public:
    ObjectDetectionPipeline(const YOLODecodeParams& decode_params)
        : decode_params_(decode_params) {}

    // Runs the full pipeline on `input` and returns the detections that
    // survive per-class NMS. Boxes are decoded against input.shape(2) (height)
    // and input.shape(3) (width).
    std::vector<Detection> detect(const Tensor& input) {
        // Step 1: image preprocessing.
        Tensor preprocessed = preprocess(input);
        // Step 2: model inference.
        Tensor predictions = model_inference(preprocessed);
        // Step 3: box decoding.
        std::vector<Detection> detections = yolox_decode(
            predictions,
            input.shape(2),
            input.shape(3),
            decode_params_
        );
        // Step 4: NMS post-processing.
        return nms_postprocess(detections);
    }

private:
    // IoU above which a lower-scoring box of the same class is suppressed.
    static constexpr float kNmsIouThreshold = 0.45f;

    // Resizes to 640x640 and divides by 255.
    // NOTE(review): the other resize examples pass a ResizeParams object;
    // confirm that a {height, width} overload of resize exists.
    Tensor preprocess(const Tensor& input) {
        Tensor resized = resize(input, {640, 640});
        return resized / 255.0f;
    }

    // Placeholder for the real model call. It returns an empty tensor shaped
    // like one detection head, [1, anchors_per_scale * (5 + num_classes), 80, 80],
    // so the channel count matches what the decoder expects.
    Tensor model_inference(const Tensor& /*input*/) {
        const int stride_count = static_cast<int>(decode_params_.strides.size());
        const int anchor_count = static_cast<int>(decode_params_.anchors.size());
        const int anchors_per_scale = stride_count > 0 ? anchor_count / stride_count : anchor_count;
        const int head_channels = anchors_per_scale * (5 + decode_params_.num_classes);
        return Tensor({1, head_channels, 80, 80}); // example output
    }

    // Runs NMS independently for every class and concatenates the survivors,
    // visiting classes in ascending class-id order.
    std::vector<Detection> nms_postprocess(const std::vector<Detection>& detections) {
        if (detections.empty()) return {};

        // Group detections by class.
        std::map<int, std::vector<Detection>> detections_by_class;
        for (const auto& det : detections) {
            detections_by_class[det.class_id].push_back(det);
        }

        std::vector<Detection> final_detections;
        for (const auto& [class_id, class_detections] : detections_by_class) {
            std::vector<Box> boxes;
            std::vector<float> scores;
            boxes.reserve(class_detections.size());
            scores.reserve(class_detections.size());
            for (const auto& det : class_detections) {
                boxes.push_back(det.box);
                scores.push_back(det.score);
            }

            const std::vector<int> keep = nms(boxes, scores, kNmsIouThreshold);
            for (const int idx : keep) {
                final_detections.push_back(class_detections[idx]);
            }
        }
        return final_detections;
    }

    YOLODecodeParams decode_params_;
};
六、总结
ops-cv作为CANN开源社区的计算机视觉算子库,为CV应用提供了丰富的算子支持和高性能实现。其主要优势包括:
- 算子丰富:涵盖图像变换、颜色空间转换、目标检测、图像增强等各类CV算子
- 检测优化:专门针对目标检测场景优化,提供NMS、IoU、RoI等核心算子
- 性能优异:针对NPU硬件特性优化,充分利用硬件加速能力
- 易于集成:提供简洁的API,便于集成到各类视觉框架
随着计算机视觉应用的不断普及,高效的图像处理算子变得越来越重要。ops-cv为开发者提供了一个强大的基础算子库,助力构建高性能的CV应用。
参考资料: