torch_pruning库介绍
在结构修剪中,**Group被定义为深度网络中最小的可移除单元。**每个组由多个相互依赖的层组成,需要同时修剪这些层以保持最终结构的完整性。然而,深度网络通常表现出层与层之间错综复杂的依赖关系,这对结构修剪提出了重大挑战。这项研究通过引入DepGraph轻松实现参数分组,并有助于修剪各种深度网络。
如修剪图中高亮的神经元,我们需要对所有有连线的层都进行修剪。(a) W_l、W_l+1 (b) W_l、W_l+1、W_l+2 © W_l、W_l+1、W_l+2 (d) W_l
一个简单的例子
我们修剪resnet18,把其第一个卷积的输出通道维度减少3。
python
import torch
from torchvision.models import resnet18
import torch_pruning as tp
model = resnet18(pretrained=True).eval()
print(model)
# 1. build dependency graph for resnet18
DG = tp.DependencyGraph().build_dependency(model, example_inputs=torch.randn(1,3,224,224))
# 2. Specify the to-be-pruned channels. Here we prune those channels indexed by [2, 6, 9].
group = DG.get_pruning_group( model.conv1, tp.prune_conv_out_channels, idxs=[2, 6, 9] )
print(group)
# 3. prune all grouped layers that are coupled with model.conv1 (included).
if DG.check_pruning_group(group): # avoid full pruning, i.e., len(channels)=0.
group.prune()
print(model)
跟conv1相关的层都被修剪了,剪的是第2,6,9维,但这种方式不给灵活,只能修剪固定的索引,下面我实现更灵活的方式。
下面我们对整个resnet18剪枝,使其通道数减半,ResNet18 = {64, 128, 256, 512} => ResNet18_Half = {32, 64, 128, 256}
这里需要我们做两件事,第一,重要性函数,抽象的说就是评估每个组的重要性,不重要的就可以修剪掉,可以使用内置重要性函数,也可以自定义。第二,配置剪枝器,如使用剪枝多少步,最后的稀疏性是多少,这里我们设置稀疏性是0.5。
python
import torch
from torchvision.models import resnet18
import torch_pruning as tp
model = resnet18(pretrained=True)
# Importance criteria
example_inputs = torch.randn(1, 3, 224, 224)
imp = tp.importance.TaylorImportance()
ignored_layers = []
for m in model.modules():
if isinstance(m, torch.nn.Linear) and m.out_features == 1000:
ignored_layers.append(m) # DO NOT prune the final classifier!
iterative_steps = 5 # progressive pruning
pruner = tp.pruner.MagnitudePruner(
model,
example_inputs,
importance=imp,
iterative_steps=iterative_steps,
ch_sparsity=0.5, # remove 50% channels, ResNet18 = {64, 128, 256, 512} => ResNet18_Half = {32, 64, 128, 256}
ignored_layers=ignored_layers,
)
base_macs, base_nparams = tp.utils.count_ops_and_params(model, example_inputs)
print(f"Before Pruning: MACs={base_macs / 1e9: .5f} G, #Params={base_nparams / 1e6: .5f} M")
for i in range(iterative_steps):
if isinstance(imp, tp.importance.TaylorImportance):
# Taylor expansion requires gradients for importance estimation
loss = model(example_inputs).sum() # a dummy loss for TaylorImportance
loss.backward() # before pruner.step()
pruner.step()
macs, nparams = tp.utils.count_ops_and_params(model, example_inputs)
print(f"{i+1}/{iterative_steps} Pruning: MACs={macs / 1e9: .5f} G, #Params={nparams / 1e6: .5f} M")
# finetune your model here
# finetune(model),model.train()
# ...
在YOLOv8中剪枝
通过上面的步骤,发现剪枝好像并不困难,主要与ultralytics框架集成会麻烦点,核心剪枝代码在下面,有注释,完整代码在这里:yolov8_pruning.py
python
def prune(args):
# 加载模型,yaml,pt
model = YOLO(args.model)
# 加入train_v2方法在YOLO对象中,主要是重写train方法,为了不让model.train()每次新建一个模型来训练,这样会导致剪枝失败。
# 解决方法就是在训练完后重新加载训练完的权重到YOLO.model中。
model.__setattr__("train_v2", train_v2.__get__(model))
pruning_cfg = yaml_load(check_yaml(args.cfg))
batch_size = pruning_cfg['batch']
model.model.train()
# split操作不支持剪枝,使用两个卷积替换split操作
replace_c2f_with_c2f_v2(model.model)
initialize_weights(model.model) #设置 BN.eps, momentum, ReLU.inplace
# 开启梯度训练
for name, param in model.model.named_parameters():
param.requires_grad = True
example_inputs = torch.randn(1, 3, pruning_cfg["imgsz"], pruning_cfg["imgsz"]).to(model.device)
# 保存浮点数、参数量、mAP和剪枝mAP的记录
macs_list, nparams_list, map_list, pruned_map_list = [], [], [], []
# 计算浮点数和参数量
base_macs, base_nparams = tp.utils.count_ops_and_params(model.model, example_inputs)
# 在剪枝操作之前先评估一次模型
pruning_cfg['name'] = f"baseline_val"
validation_model = deepcopy(model)
metric = validation_model.val(**pruning_cfg)
init_map = metric.box.map
# 保存浮点数、参数量、mAP和剪枝mAP的记录
macs_list.append(base_macs)
nparams_list.append(base_nparams)
map_list.append(init_map)
pruned_map_list.append(init_map)
print(f"Before Pruning: MACs={base_macs / 1e9: .5f} G, #Params={base_nparams / 1e6: .5f} M, mAP={init_map: .5f}")
# 每一步的剪枝率
pruning_ratio = 1 - math.pow((1 - args.target_prune_rate), 1 / args.iterative_steps)
print(pruning_ratio)
# 这里可以发现剪枝器可以在循环里或者循环外,虽然最终模型的稀疏性是没变化的,但每剪一次微调一次的效果会更好。
for i in range(args.iterative_steps):
model.model.train()
for name, param in model.model.named_parameters():
param.requires_grad = True
ignored_layers = []
unwrapped_parameters = []
# 忽略的层,一般都对头部网络进行忽略,如果是目标检测就换成Detect,记得先引入,这是个类。
for m in model.model.modules():
if isinstance(m, (Segment,)):
ignored_layers.append(m)
example_inputs = example_inputs.to(model.device)
pruner = tp.pruner.GroupNormPruner(
model.model,
example_inputs,
importance=tp.importance.GroupNormImportance(), # L2 norm pruning,
iterative_steps=1,
pruning_ratio=pruning_ratio,
ignored_layers=ignored_layers,
unwrapped_parameters=unwrapped_parameters
)
pruner.step()
# 剪枝完后先评估一遍模型
pruning_cfg['name'] = f"step_{i}_pre_val"
validation_model.model = deepcopy(model.model)
metric = validation_model.val(**pruning_cfg)
pruned_map = metric.box.map
pruned_macs, pruned_nparams = tp.utils.count_ops_and_params(pruner.model, example_inputs.to(model.device))
current_speed_up = float(macs_list[0]) / pruned_macs
print(f"After pruning iter {i + 1}: MACs={pruned_macs / 1e9} G, #Params={pruned_nparams / 1e6} M, "
f"mAP={pruned_map}, speed up={current_speed_up}")
# 微调模型,重新训练,一般10-50epochs?
for name, param in model.model.named_parameters():
param.requires_grad = True
pruning_cfg['name'] = f"step_{i}_finetune"
pruning_cfg['batch'] = batch_size # restore batch size
model.train_v2(pruning=True, **pruning_cfg)
# 微调完后再评估一遍模型
pruning_cfg['name'] = f"step_{i}_post_val"
validation_model = YOLO(model.trainer.best)
metric = validation_model.val(**pruning_cfg)
current_map = metric.box.map
print(f"After fine tuning mAP={current_map}")
macs_list.append(pruned_macs)
nparams_list.append(pruned_nparams / base_nparams * 100)
pruned_map_list.append(pruned_map)
map_list.append(current_map)
# 移除剪枝器
del pruner
save_pruning_performance_graph(nparams_list, map_list, macs_list, pruned_map_list)
if init_map - current_map > args.max_map_drop and current_speed_up>=1.2:
print("Pruning early stop")
break
model.export(format='onnx')