问题:pytorch->onnx(sim)->rknn转换,rknn跑在RK3588 Ubuntu Arm Npu架构上,精度断崖式下降,20%~30%甚至更大。
调试排查思路:
- 检查Windows上使用的测试集数据与RK3588上使用的测试集是否一致;
- 检查pytorch->onnx转换过程,比对pytorch和onnx测试结果是否存在误差;
- 仿真测试onnx->rknn的转换,在虚拟机WSL Ubuntu上测试转换误差;
- 如果以上都没有问题,则需要真机逐层测试,按从后往前的顺序,输出onnx和rknn每层的结果,比对每层的误差,精确定位到误差的源头。
具体的:
1、测试集检查
重点检查数据集的划分方式,在训练生成pytorch过程中,保存好对应的测试集,然后拷贝到真机上进行测试。
2、pytorch->onnx检查
a.调用pytorch
python
def _run_torch(weight_path: Path, x: np.ndarray) -> np.ndarray:
    """Run the PyTorch reference model on ``x`` and return its output.

    The model is created by ``get_model()``, its weights are loaded from
    ``weight_path`` onto the CPU, and inference runs in eval mode with
    gradient tracking disabled.

    Args:
        weight_path: Path to the checkpoint passed to ``torch.load``; the
            loaded object is handed to ``load_state_dict``.
        x: Input as a NumPy array; it is converted to a float32 tensor.

    Returns:
        The model output converted to a NumPy array.
    """
    import warnings

    model = get_model()
    state_dict = torch.load(weight_path, map_location="cpu")
    # strict=False is kept for compatibility, but a silent key mismatch would
    # leave layers at their initial weights and invalidate every later
    # PyTorch-vs-ONNX comparison, so surface it instead of ignoring it.
    load_result = model.load_state_dict(state_dict, strict=False)
    if load_result.missing_keys or load_result.unexpected_keys:
        warnings.warn(
            "state_dict does not fully match the model: "
            f"missing_keys={list(load_result.missing_keys)}, "
            f"unexpected_keys={list(load_result.unexpected_keys)}"
        )
    model.eval()
    with torch.no_grad():
        torch_out = model(torch.from_numpy(x).float())
    return torch_out.cpu().numpy()
b.调用onnx
python
def _run_onnx(onnx_path: Path, x: np.ndarray) -> np.ndarray:
    """Run the ONNX model at ``onnx_path`` on ``x`` and return its first output.

    Args:
        onnx_path: Location of the ``.onnx`` file.
        x: Input array; it is cast to float32 and fed to the graph input
            named ``"input"``.

    Returns:
        The first element of the list returned by the session run.

    Raises:
        RuntimeError: If the module-level ``ort`` is ``None``.
    """
    if ort is None:
        message = "onnxruntime is not installed: %s" % ONNXRUNTIME_IMPORT_ERROR
        raise RuntimeError(message)

    session = ort.InferenceSession(str(onnx_path))
    feed = {"input": x.astype(np.float32)}
    # Passing None as the output list requests every graph output.
    outputs = session.run(None, feed)
    return outputs[0]
c.结果比对
python
# Element-wise absolute deviation between the two runtimes' raw outputs.
abs_diff = np.absolute(torch_out - onnx_out)

# Index of the largest value along axis 1 for each runtime
# (presumably the class axis -- confirm against the model head).
torch_pred, onnx_pred = (
    np.argmax(out, axis=1) for out in (torch_out, onnx_out)
)

# Percentage of positions where both runtimes pick the same index, and the
# first-axis indices where they disagree.
_same_pred = torch_pred == onnx_pred
agreement = float(np.mean(_same_pred) * 100.0)
mismatch_indices = np.nonzero(~_same_pred)[0]
正常这里不会出现异常,两者的结果应该是一致的。
3、onnx->rknn仿真测试
对比ONNX输出 VS RKNN Simulator输出
导出:error_analysis.txt、map_name_to_file.txt用于定位哪一层开始出现误差。
python
import sys

import numpy as np
from rknn.api import RKNN

# NOTE(review): `mode` is not defined in this snippet; it must be set (model
# name prefix) before these lines run.
ONNX_MODEL = './sim_onnx/' + mode + '_sim.onnx'
# Not used below; kept for a later export step.
RKNN_MODEL = './sim_rknn/' + mode + '_sim.rknn'

rknn = RKNN(verbose=True)
try:
    # Stable configuration.
    rknn.config(
        target_platform='rk3588',
        optimization_level=3,
    )

    print('--> Loading ONNX')
    ret = rknn.load_onnx(
        model=ONNX_MODEL,
        inputs=['input'],
        input_size_list=[[1, 1, 8, 750]],
    )
    if ret != 0:
        sys.exit(ret)

    print('--> Building')
    # No quantization: the converted model is analysed without an int8 step.
    ret = rknn.build(do_quantization=False)
    if ret != 0:
        sys.exit(ret)

    # target=None: no connected device is specified for the analysis.
    ret = rknn.accuracy_analysis(
        inputs=['./npy/1.npy'],
        output_dir='./fp16_accuracy',
        target=None,
    )
    if ret != 0:
        print('Accuracy analysis failed')
        sys.exit(ret)
finally:
    # sys.exit raises SystemExit, so this also runs on every failure path
    # and the RKNN context is always released.
    rknn.release()
4、rknn真机逐层测试
从输出层开始依次往前,逐层比对onnx的输出 VS rknn的输出。
(1)首先,正常输出只有最后一层,需增加输出节点:
python
import onnx
from onnx import helper
model = onnx.load("./exports/sim_onnx/model_sim.onnx")

# Print every node (index, op type, output tensor names) so the names of the
# intermediate tensors to expose can be looked up.
for i, node in enumerate(model.graph.node):
    print(i, node.op_type, node.output)

# Intermediate tensors to expose as extra graph outputs. They are appended
# after the existing outputs in this order.
extra_output_names = [
    "onnx::Pad_90",
    "onnx::Clip_93",
    "input.48",
]

# Fail early on a mistyped tensor name: an output that no node produces gives
# a model that only breaks later, when the runtime loads it.
produced_names = {
    out_name for node in model.graph.node for out_name in node.output
}
unknown_names = [n for n in extra_output_names if n not in produced_names]
if unknown_names:
    raise ValueError(
        f"tensors not produced by any node in the graph: {unknown_names}"
    )

for name in extra_output_names:
    # Name-only ValueInfoProto; no type or shape information is attached.
    model.graph.output.append(helper.make_empty_tensor_value_info(name))

onnx.save(
    model,
    "./exports/model_debug.onnx"
)
(2)保存onnx输出的节点值:
python
import os

import numpy as np
import onnxruntime as ort

datas = np.load("./test_data.npy")
datas = datas.astype(np.float32)

# One list per graph output, filled in the order the session returns them.
# `onnx_outs` was appended to without ever being created, which raised a
# NameError on the first sample.
onnx_outs = []
pad90_list = []
clip93_list = []
input48_list = []

# Load the model that carries the extra debug outputs.
session = ort.InferenceSession("./exports/model_debug.onnx")

for x in datas:
    # Print one element of the sample so it can be checked against the value
    # read on the RK3588 side.
    print("数据比对:", x[0][7])

    # Prepend two axes of length 1 in front of each sample.
    x = np.expand_dims(x, axis=(0, 1))

    out = session.run(
        None,
        {"input": x}
    )

    onnx_outs.append(out[0])
    pad90_list.append(out[1])
    clip93_list.append(out[2])
    input48_list.append(out[3])

pad90_list = np.array(pad90_list)
clip93_list = np.array(clip93_list)
input48_list = np.array(input48_list)

# np.save does not create missing directories.
os.makedirs("./feature_out", exist_ok=True)
np.save("./feature_out/onnx_pad90.npy", pad90_list)
np.save("./feature_out/onnx_clip93.npy", clip93_list)
np.save("./feature_out/onnx_input48.npy", input48_list)
(3)保存RK3588上rknn输出的节点值:
python
import os
import sys

import numpy as np
from rknnlite.api import RKNNLite

datas = np.load("./test_data.npy")
datas = datas.astype(np.float32)

# One list per model output, filled in the order the runtime returns them.
# The original appended to an undefined `onnx_outs`; the final-output list is
# now created here under a name that matches this runtime.
rknn_outs = []
pad90_list = []
clip93_list = []
input48_list = []

# Load the model and initialise the runtime once instead of once per sample,
# and always release it when done.
rknn = RKNNLite()
try:
    ret = rknn.load_rknn("./model_debug.rknn")
    if ret != 0:
        sys.exit(ret)
    ret = rknn.init_runtime()
    if ret != 0:
        sys.exit(ret)

    for x in datas:
        # Print one element of the sample so it can be checked against the
        # value read on the ONNX side.
        print("数据比对:", x[0][7])

        # Prepend two axes of length 1 in front of each sample.
        x = np.expand_dims(x, axis=(0, 1))

        rknn_out = rknn.inference(inputs=[x], data_format='nchw')
        if rknn_out is None:
            raise RuntimeError("RKNNLite inference returned no outputs")

        rknn_outs.append(rknn_out[0])
        pad90_list.append(rknn_out[1])
        clip93_list.append(rknn_out[2])
        input48_list.append(rknn_out[3])
finally:
    rknn.release()

pad90_list = np.array(pad90_list)
clip93_list = np.array(clip93_list)
input48_list = np.array(input48_list)

# np.save does not create missing directories.
os.makedirs("./feature_rknnout", exist_ok=True)
np.save("./feature_rknnout/rknn_pad90.npy", pad90_list)
np.save("./feature_rknnout/rknn_clip93.npy", clip93_list)
np.save("./feature_rknnout/rknn_input48.npy", input48_list)
(4)比较onnx和rknn节点输出的误差:
python
import numpy as np
def compare_feature(name):
    """Print an error report for one feature dumped by ONNX and by RKNN.

    Loads ``./feature_out/onnx_{name}.npy`` and
    ``./feature_rknnout/rknn_{name}.npy`` and prints both shapes, the global
    max/mean absolute error, the max absolute error of every entry along the
    first axis, and the first 20 flattened values of the worst entry from
    both arrays.

    Args:
        name: Feature tag used in both file names, e.g. ``"pad90"``.

    Raises:
        ValueError: If the two arrays differ in shape or hold no samples.
    """
    onnx_feat = np.load(f"./feature_out/onnx_{name}.npy")
    rknn_feat = np.load(f"./feature_rknnout/rknn_{name}.npy")

    print("\n" + "=" * 60)
    print(name)
    print("=" * 60)
    print("onnx shape:", onnx_feat.shape)
    print("rknn shape:", rknn_feat.shape)

    # Without this check NumPy may broadcast two differently shaped dumps
    # and report errors between unrelated elements.
    if onnx_feat.shape != rknn_feat.shape:
        raise ValueError(
            f"shape mismatch for '{name}': "
            f"onnx {onnx_feat.shape} vs rknn {rknn_feat.shape}"
        )
    if onnx_feat.ndim == 0 or onnx_feat.size == 0:
        raise ValueError(f"feature '{name}' contains no samples")

    diff = np.abs(onnx_feat - rknn_feat)
    print("global max :", diff.max())
    print("global mean:", diff.mean())

    print("\nper sample error:")
    # Max absolute error of each entry along the first axis.
    sample_max = diff.reshape(len(diff), -1).max(axis=1)
    for i, err in enumerate(sample_max):
        print(
            f"sample {i:02d}: "
            f"{err:.6f}"
        )

    worst_idx = int(np.argmax(sample_max))
    print("\nWorst sample:")
    print(
        f"idx={worst_idx}, "
        f"error={sample_max[worst_idx]:.6f}"
    )

    print("\nFirst 20 values comparison:")
    print("ONNX:")
    print(
        onnx_feat[worst_idx]
        .flatten()[:20]
    )
    print("RKNN:")
    print(
        rknn_feat[worst_idx]
        .flatten()[:20]
    )
# Report the three dumped features in the same order as before.
for feature_name in ("pad90", "clip93", "input48"):
    compare_feature(feature_name)
通过逐层分析,博主发现误差源头在pad90,对应的模型层为pooling_layer((1, 75), stride=25),起初博主认为是池化层核大小及步长太大,更改为pooling_layer((1, 25), stride=25),pooling_layer((1, 3), stride=1)后,误差源头变为pooling_layer((1, 25), stride=25),由于硬件编码限制问题,这里怀疑池化层核大小和步长不能使用奇数,这里改为pooling_layer((1, 24), stride=24)后,误差消失,模型性能转换前后几乎一致,误差小于0.5%。
提示:当池化层的核大小/步长为奇数(如25)时,在进行onnx到rknn的转换时,会出现报错信息 E RKNN: [08:53:58.638] REGTASK: The bit width of field value exceeds the limit, target: v2, offset: 0x1014, shift = 0, limit: 0x7, value: 0x19(其中value: 0x19即十进制25)。根据这个报错,博主判断是池化层步长导致的,并推测步长应为{1,2,4,8,16,32...}这类取值;但上文实际生效的步长24并不在该集合内,因此这一推测仍有待验证。
注:博主这里的结论并不严谨,并没有更进一步深入分析和测试。
另外,还应:①检查输入维度变换(NCHW/NHWC)是否正确;②检查数据类型是否为float32;③检查torch.squeeze()、nn.LogSoftmax(dim=1)等操作。