一、为什么在 NPU 上做数据增强
1.1 CPU 数据增强的瓶颈
复制代码
传统 CPU 数据增强:
数据读取 → CPU 增强 → 内存拷贝 → NPU 训练
↑
瓶颈: CPU 计算慢,频繁内存拷贝
典型延迟 (ImageNet, 224x224):
随机裁剪: 2ms
水平翻转: 1ms
颜色抖动: 3ms
归一化: 1ms
─────────────
总计: 7ms/样本
NPU 训练一步: ~50ms
数据增强占比: 14% (7ms ÷ 50ms, 按每步处理 1 个样本估算; batch 越大, CPU 增强耗时占比越高)
1.2 NPU 数据增强优势
复制代码
NPU 数据增强:
数据读取 → NPU 增强 → NPU 训练 (无拷贝)
优势:
1. 零拷贝: 增强后的数据直接留在 NPU 设备内存中, 无需再经主机内存拷贝
2. 高吞吐: NPU 并行计算,远快于 CPU
3. 流水线: 增强与训练重叠执行
4. 一致性: 所有设备执行相同增强逻辑
加速效果:
CPU 增强: 7ms/样本
NPU 增强: 0.5ms/样本
加速比: 14x
二、NPU 图像增强算子
2.1 基础增强算子
python
复制代码
import torch
import torch.npu
class NpuImageAugmentation:
    """Collection of image augmentation operators for device-resident tensors.

    Each operator takes a single image tensor laid out as ``(C, H, W)`` and
    returns the augmented image. No operator writes into its input tensor:
    the result is either a new tensor, a view (``random_crop``), or the
    untouched input itself when the random draw decides to skip the
    operation. Random parameters are drawn on the host with ``torch.rand`` /
    ``torch.randint``; pixel arithmetic runs on whatever device holds
    ``image``.
    """

    def __init__(self):
        # Name -> bound method table so pipelines can be assembled by name.
        self.augmentations = {
            'random_crop': self.random_crop,
            'random_flip': self.random_flip,
            'color_jitter': self.color_jitter,
            'normalize': self.normalize,
            'random_rotation': self.random_rotation,
            'random_erasing': self.random_erasing,
        }

    @staticmethod
    def _jitter_factor(strength):
        """Draw a multiplicative factor uniformly from [1 - strength, 1 + strength)."""
        return 1.0 + (torch.rand(1).item() * 2 - 1) * strength

    @staticmethod
    def _channel_vector(values, image):
        """Return per-channel ``values`` as a (len, 1, 1) tensor on the image's device."""
        dtype = image.dtype if image.is_floating_point() else torch.float32
        return torch.as_tensor(values, dtype=dtype, device=image.device).view(-1, 1, 1)

    def random_crop(self, image, output_size):
        """Return a randomly positioned crop (a view) of the image.

        Args:
            image: Tensor whose last two dimensions are height and width.
            output_size: ``(crop_height, crop_width)``.

        Raises:
            ValueError: If the requested crop is larger than the image.
        """
        h, w = image.shape[-2:]
        crop_h, crop_w = output_size
        if crop_h > h or crop_w > w:
            raise ValueError(
                f"crop size {(crop_h, crop_w)} exceeds image size {(h, w)}"
            )
        top = torch.randint(0, h - crop_h + 1, (1,)).item()
        left = torch.randint(0, w - crop_w + 1, (1,)).item()
        return image[..., top:top + crop_h, left:left + crop_w]

    def random_flip(self, image, p=0.5):
        """Flip the image horizontally (along its last dimension) with probability ``p``."""
        if torch.rand(1).item() < p:
            return torch.flip(image, dims=[-1])
        return image

    def color_jitter(self, image, brightness=0.2, contrast=0.2, saturation=0.2):
        """Randomly perturb brightness, contrast and saturation.

        Each strength ``s`` draws a factor from ``[1 - s, 1 + s)``; a strength
        of 0 disables that step. The result is clamped to ``[0, 1]``.
        Saturation is only applied to 3-channel images, which are assumed to
        be RGB; other channel counts skip that step.
        """
        # Brightness: scale every pixel.
        if brightness > 0:
            image = image * self._jitter_factor(brightness)
        # Contrast: scale the distance of every pixel from the global mean.
        if contrast > 0:
            factor = self._jitter_factor(contrast)
            mean = image.mean()
            image = (image - mean) * factor + mean
        # Saturation: scale the distance of every pixel from its own luma, so
        # gray pixels stay gray (scaling one channel alone would shift hue).
        if saturation > 0:
            factor = self._jitter_factor(saturation)
            if image.dim() >= 3 and image.shape[-3] == 3:
                luma_weights = self._channel_vector([0.299, 0.587, 0.114], image)
                gray = (image * luma_weights).sum(dim=-3, keepdim=True)
                image = (image - gray) * factor + gray
        return image.clamp(0, 1)

    def normalize(self, image, mean, std):
        """Return ``(image - mean) / std`` with per-channel ``mean`` and ``std``.

        ``mean`` and ``std`` are sequences with one entry per channel; they are
        broadcast over height and width. The input tensor is left unchanged.
        """
        mean_t = self._channel_vector(mean, image)
        std_t = self._channel_vector(std, image)
        return (image - mean_t) / std_t

    def random_rotation(self, image, max_angle=15):
        """Rotate a ``(C, H, W)`` image about its center by a random angle.

        The angle is drawn uniformly from ``[-max_angle, max_angle)`` degrees.
        The output keeps the input shape; pixels that fall outside the source
        image are filled with zeros and values are bilinearly interpolated.
        """
        import math

        angle = (torch.rand(1).item() * 2 - 1) * max_angle
        rad = math.radians(angle)
        cos_a, sin_a = math.cos(rad), math.sin(rad)
        h, w = image.shape[-2:]
        work_dtype = image.dtype if image.is_floating_point() else torch.float32
        # affine_grid works in normalized [-1, 1] coordinates, so the
        # off-diagonal terms carry the aspect ratio to keep the rotation rigid
        # in pixel space for non-square images.
        theta = torch.tensor(
            [[cos_a, -sin_a * h / w, 0.0],
             [sin_a * w / h, cos_a, 0.0]],
            dtype=work_dtype,
            device=image.device,
        ).unsqueeze(0)
        batch = image.unsqueeze(0).to(work_dtype)
        grid = torch.nn.functional.affine_grid(
            theta, list(batch.shape), align_corners=False
        )
        rotated = torch.nn.functional.grid_sample(
            batch, grid, mode='bilinear', padding_mode='zeros', align_corners=False
        )
        return rotated.squeeze(0).to(image.dtype)

    def random_erasing(self, image, p=0.5, scale=(0.02, 0.33)):
        """With probability ``p``, overwrite a random rectangle with uniform noise.

        Args:
            image: ``(C, H, W)`` tensor.
            p: Probability of erasing.
            scale: ``(min, max)`` fraction of the image area to erase.

        Returns:
            A copy with the rectangle replaced, or ``image`` itself when
            nothing is erased.
        """
        if torch.rand(1).item() >= p:
            return image
        c, h, w = image.shape
        # Interpolate the area *fraction* first, then convert to pixels.
        area_fraction = scale[0] + torch.rand(1).item() * (scale[1] - scale[0])
        area = h * w * area_fraction
        aspect_ratio = torch.rand(1).item() * 0.5 + 0.5
        # Clamp so the patch always fits, even for strongly non-square images.
        eh = min(h, int((area * aspect_ratio) ** 0.5))
        ew = min(w, int((area / aspect_ratio) ** 0.5))
        if eh == 0 or ew == 0:
            return image
        top = torch.randint(0, h - eh + 1, (1,)).item()
        left = torch.randint(0, w - ew + 1, (1,)).item()
        erased = image.clone()
        erased[:, top:top + eh, left:left + ew] = torch.rand(
            c, eh, ew, device=image.device
        )
        return erased
# Usage example: build the operator set and one random image that lives on the NPU.
augmenter = NpuImageAugmentation()
image = torch.rand(3, 224, 224).npu()

# Apply the augmentations as one chain: flip -> color jitter -> per-channel
# normalization (mean list first, std list second).
image = augmenter.normalize(
    augmenter.color_jitter(augmenter.random_flip(image)),
    [0.485, 0.456, 0.406],
    [0.229, 0.224, 0.225],
)
2.2 批量增强
python
复制代码
class NpuBatchAugmentation:
    """Apply an augmentation pipeline to every sample of a batch.

    ``augmentation_pipeline`` is a sequence of ``(callable, kwargs)`` pairs.
    Each callable is invoked as ``callable(sample, **kwargs)`` and must return
    the transformed sample, which is then fed to the next step.
    """

    def __init__(self, augmentation_pipeline):
        self.pipeline = augmentation_pipeline

    def _augment_sample(self, sample):
        """Run one sample through every pipeline step, in order."""
        for aug_func, kwargs in self.pipeline:
            sample = aug_func(sample, **kwargs)
        return sample

    def augment_batch(self, images):
        """Augment a batch sample by sample and stack the results.

        Samples are processed sequentially along dimension 0, and every
        sample goes through its own pipeline invocation, so random steps
        draw independent parameters per sample. All augmented samples must
        end up with the same shape so they can be stacked.

        Args:
            images: Tensor whose first dimension indexes the samples.

        Returns:
            The stacked augmented samples. An empty batch is returned
            unchanged, because there is nothing to stack.
        """
        if images.shape[0] == 0:
            return images
        return torch.stack([self._augment_sample(sample) for sample in images])
# Define the augmentation pipeline. One operator instance is shared by all
# steps instead of constructing a fresh NpuImageAugmentation for every step
# of every sample, and per-step arguments travel through the kwargs dict.
_shared_augmenter = NpuImageAugmentation()
pipeline = [
    (lambda img: img.npu(), {}),  # move the sample to the NPU first
    (_shared_augmenter.random_flip, {}),
    (_shared_augmenter.color_jitter, {}),
    (
        _shared_augmenter.normalize,
        {'mean': [0.485, 0.456, 0.406], 'std': [0.229, 0.224, 0.225]},
    ),
]
batch_augmenter = NpuBatchAugmentation(pipeline)

# Usage example
images = torch.rand(32, 3, 224, 224)  # batch of 32
augmented_images = batch_augmenter.augment_batch(images)
三、与训练流水线集成
3.1 NPU 数据增强 + 训练
python
复制代码
class NpuTrainingPipeline:
    """Run augmentation and training on two separate NPU streams.

    ``augmentation`` must be a callable that maps a batch tensor on the NPU
    to the augmented batch. ``optimizer`` is expected to have been built over
    ``model``'s parameters.

    NOTE(review): the stream handling assumes ``torch.npu`` streams follow
    the same semantics as ``torch.cuda`` streams -- confirm against the
    torch_npu version in use.
    """

    def __init__(self, model, optimizer, augmentation):
        self.model = model.npu()
        self.optimizer = optimizer
        self.augmentation = augmentation
        # One stream per phase so augmentation and training are issued independently.
        self.aug_stream = torch.npu.Stream()    # augmentation stream
        self.train_stream = torch.npu.Stream()  # training stream

    def train_step(self, images, labels):
        """Augment one batch, run forward/backward/step, and return the loss.

        Within a single step the training work is ordered after the
        augmentation work via ``wait_stream``.

        Returns:
            The batch loss as a Python float.
        """
        # 1. Issue the augmentation on aug_stream.
        with torch.npu.stream(self.aug_stream):
            aug_images = self.augmentation(images.npu())
        # 2. Issue the training work on train_stream, ordered after the augmentation.
        with torch.npu.stream(self.train_stream):
            self.train_stream.wait_stream(self.aug_stream)
            self.optimizer.zero_grad()
            output = self.model(aug_images)
            loss = torch.nn.functional.cross_entropy(output, labels.npu())
            loss.backward()
            self.optimizer.step()
        # The loss was produced on train_stream; make sure that work has
        # finished before reading the value back on the host.
        self.train_stream.synchronize()
        return loss.item()

    def train_epoch(self, dataloader, epoch):
        """Train on every batch of ``dataloader`` and return the mean batch loss.

        Batches are counted while iterating, so ``dataloader`` may be any
        iterable of ``(images, labels)`` pairs, including ones without
        ``__len__``. Progress is printed every 100 batches.

        Returns:
            The average of the per-batch losses, or ``0.0`` when the
            dataloader yields no batches.
        """
        self.model.train()
        total_loss = 0.0
        num_batches = 0
        for batch_idx, (images, labels) in enumerate(dataloader):
            loss = self.train_step(images, labels)
            total_loss += loss
            num_batches += 1
            if batch_idx % 100 == 0:
                print(f"Epoch {epoch}, Batch {batch_idx}: Loss={loss:.4f}")
        if num_batches == 0:
            return 0.0
        return total_loss / num_batches
# Usage example
model = torch.hub.load('pytorch/vision', 'mobilenet_v2', num_classes=10)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
augmenter = NpuImageAugmentation()


def augmentation(batch):
    """Augment a (N, C, H, W) batch and return the stacked result.

    NpuTrainingPipeline calls its augmentation as a function, while
    NpuImageAugmentation only exposes per-image methods and is not callable
    itself, so this wrapper applies flip -> color jitter -> normalize to each
    sample in turn.
    """
    return torch.stack([
        augmenter.normalize(
            augmenter.color_jitter(augmenter.random_flip(sample)),
            [0.485, 0.456, 0.406],
            [0.229, 0.224, 0.225],
        )
        for sample in batch
    ])


pipeline = NpuTrainingPipeline(model, optimizer, augmentation)

# Train. NOTE: train_loader is not defined in the visible code; supply an
# iterable of (images, labels) batches before running this.
for epoch in range(10):
    avg_loss = pipeline.train_epoch(train_loader, epoch)
    print(f"Epoch {epoch}: Avg Loss={avg_loss:.4f}")
3.2 预取与流水线
python
复制代码
class NpuPrefetchPipeline:
    """Prefetch and augment batches ahead of the training step.

    Augmented batches are queued in ``buffer`` (at most ``buffer_size``
    entries) by ``prefetch`` and consumed in FIFO order by ``train_step``.

    Args:
        model: Module to train; it is moved to the NPU.
        augmentation: Callable mapping an NPU batch tensor to the augmented batch.
        buffer_size: Maximum number of prefetched batches held at once.
        optimizer: Optimizer over ``model``'s parameters. Required by
            ``train_step``; it may also be assigned to ``self.optimizer``
            after construction.

    NOTE(review): the stream handling assumes ``torch.npu`` streams follow
    the same semantics as ``torch.cuda`` streams -- confirm against the
    torch_npu version in use.
    """

    def __init__(self, model, augmentation, buffer_size=3, optimizer=None):
        self.model = model.npu()
        self.augmentation = augmentation
        self.optimizer = optimizer
        self.buffer = []
        self.buffer_size = buffer_size
        self.aug_stream = torch.npu.Stream()
        self.train_stream = torch.npu.Stream()
        # Iteration state, kept across prefetch() calls so that each call
        # continues where the previous one stopped.
        self._source = None
        self._iterator = None

    def prefetch(self, dataloader, restart=False):
        """Top the buffer up with augmented batches from ``dataloader``.

        The iterator over ``dataloader`` is remembered between calls, so
        repeated calls with the same object walk through it once instead of
        re-reading its first batches. Passing a different object, or
        ``restart=True``, starts a fresh pass.

        Returns:
            The number of batches added by this call (0 when the buffer is
            already full or the dataloader is exhausted).
        """
        if restart or dataloader is not self._source:
            self._source = dataloader
            self._iterator = iter(dataloader)
        added = 0
        while len(self.buffer) < self.buffer_size:
            try:
                images, labels = next(self._iterator)
            except StopIteration:
                # Park on an empty iterator so later calls stay exhausted
                # until the caller explicitly restarts.
                self._iterator = iter(())
                break
            # Issue the transfer and augmentation on aug_stream.
            with torch.npu.stream(self.aug_stream):
                aug_images = self.augmentation(images.npu())
                device_labels = labels.npu()
            self.buffer.append((aug_images, device_labels))
            added += 1
        return added

    def get_batch(self):
        """Pop and return the oldest prefetched batch, or ``None`` if the buffer is empty."""
        if self.buffer:
            return self.buffer.pop(0)
        return None

    def train_step(self):
        """Train on the next prefetched batch.

        Returns:
            The batch loss as a Python float, or ``None`` when no batch is
            buffered.

        Raises:
            RuntimeError: If a batch is available but no optimizer was provided.
        """
        if not self.buffer:
            return None
        if self.optimizer is None:
            # Checked before popping so the batch is not lost.
            raise RuntimeError(
                "NpuPrefetchPipeline.train_step requires an optimizer; "
                "pass optimizer=... to the constructor"
            )
        aug_images, labels = self.get_batch()
        with torch.npu.stream(self.train_stream):
            # The batch was produced on aug_stream; order training after it.
            self.train_stream.wait_stream(self.aug_stream)
            self.optimizer.zero_grad()
            output = self.model(aug_images)
            loss = torch.nn.functional.cross_entropy(output, labels)
            loss.backward()
            self.optimizer.step()
        # Wait for the training work before reading the loss on the host.
        self.train_stream.synchronize()
        return loss.item()
四、增强效果验证
4.1 增强统计
python
复制代码
def validate_augmentation(original_images, augmented_images, threshold=0.01):
    """Print and return summary statistics comparing a batch before and after augmentation.

    Args:
        original_images: Tensor of images before augmentation.
        augmented_images: Tensor of images after augmentation; it must be
            subtractable from ``original_images`` (same or broadcastable
            shape, same device).
        threshold: Mean absolute difference above which the augmentation is
            reported as effective.

    Returns:
        A dict with the mean, standard deviation and (min, max) range of both
        tensors, plus ``"mean_abs_diff"`` and the boolean ``"effective"``.
    """
    stats = {
        "original_mean": original_images.mean().item(),
        "augmented_mean": augmented_images.mean().item(),
        "original_std": original_images.std().item(),
        "augmented_std": augmented_images.std().item(),
        "value_range_original": (original_images.min().item(), original_images.max().item()),
        "value_range_augmented": (augmented_images.min().item(), augmented_images.max().item()),
    }
    print("增强效果验证:")
    for key, value in stats.items():
        print(f" {key}: {value}")
    # Check whether the augmentation actually changed the data.
    diff = (original_images - augmented_images).abs().mean().item()
    effective = diff > threshold
    print(f" 平均差异: {diff:.6f}")
    if effective:
        print(" ✓ 增强有效")
    else:
        print(" ⚠ 增强可能无效")
    # Added after printing so the printed report keeps its original fields.
    stats["mean_abs_diff"] = diff
    stats["effective"] = effective
    return stats
# Usage example: augment a random NPU batch, then report how much it changed.
original = torch.rand(32, 3, 224, 224).npu()
augmented = batch_augmenter.augment_batch(original)
validate_augmentation(original_images=original, augmented_images=augmented)
4.2 训练效果对比
python
复制代码
def compare_training_with_augmentation(num_epochs=5):
    """Train two identical models, one with and one without augmentation.

    Both models start from the same initial weights, so differences in loss
    and accuracy are not confounded by different random initializations.

    NOTE(review): relies on ``train_epoch(model, optimizer, loader,
    augmentation)``, ``evaluate(model, loader)``, ``train_loader`` and
    ``test_loader`` being defined at module level; none of them is visible
    in this snippet -- confirm their signatures.

    Args:
        num_epochs: Number of epochs to train each model.

    Returns:
        A dict with keys ``"no_augmentation"`` and ``"with_augmentation"``,
        each a list of per-epoch ``{"loss": ..., "acc": ...}`` records.
    """
    # Baseline run: no augmentation.
    model_no_aug = torch.hub.load('pytorch/vision', 'mobilenet_v2', num_classes=10).npu()
    optimizer_no_aug = torch.optim.SGD(model_no_aug.parameters(), lr=0.01)
    # Augmented run: same architecture, same starting weights.
    model_with_aug = torch.hub.load('pytorch/vision', 'mobilenet_v2', num_classes=10).npu()
    model_with_aug.load_state_dict(model_no_aug.state_dict())
    optimizer_with_aug = torch.optim.SGD(model_with_aug.parameters(), lr=0.01)
    augmentation = NpuImageAugmentation()
    results = {
        "no_augmentation": [],
        "with_augmentation": []
    }
    for epoch in range(num_epochs):
        # Without augmentation
        loss_no_aug = train_epoch(model_no_aug, optimizer_no_aug, train_loader, None)
        acc_no_aug = evaluate(model_no_aug, test_loader)
        results["no_augmentation"].append({"loss": loss_no_aug, "acc": acc_no_aug})
        # With augmentation
        loss_with_aug = train_epoch(model_with_aug, optimizer_with_aug, train_loader, augmentation)
        acc_with_aug = evaluate(model_with_aug, test_loader)
        results["with_augmentation"].append({"loss": loss_with_aug, "acc": acc_with_aug})
        print(f"Epoch {epoch}:")
        print(f" 无增强: Loss={loss_no_aug:.4f}, Acc={acc_no_aug:.2f}%")
        print(f" 有增强: Loss={loss_with_aug:.4f}, Acc={acc_with_aug:.2f}%")
    return results
五、常见问题
| 问题 | 原因 | 解决方案 |
| --- | --- | --- |
| 增强后数据异常 | 颜色空间转换错误 | 检查归一化参数 |
| 训练不稳定 | 增强太强 | 降低增强强度 |
| 内存不足 | 批量增强占用显存 | 减小 batch size |
| 增强不生效 | 随机种子问题 | 检查随机数生成 |
| NPU 报错 | 不支持的操作 | 使用 CPU fallback |
相关仓库