1. LLVM中的循环自动向量化
Clang编译器提供循环自动向量化的能力,可以帮助开发者提高代码性能,clang在O2以上优化级别默认开启该优化。对LLVM自动向量化不了解的读者,请先阅读Auto-Vectorization in LLVM,官方文档中给出了可以进行向量化的常见代码类型,以及向量化优化后的部分性能数据。
LLVM中循环的自动向量化在LoopVectorizePass中实现,可以将这个Pass拆分为3个主要过程:
(1)合法性检查
(2)代价模型分析
(3)向量代码生成
第1步主要是检查能不能做;第2步检查是否值得做;第3步执行变换。下面给出一个简单的循环向量化的例子:
cpp
// Sums the elements of arr[0..n) whose value is greater than or equal to n.
// arr: input array, only read here; n: iteration count, also used as the threshold.
// Returns the accumulated sum; 0 when no element qualifies or the loop never runs.
int loop_vectorize_test(int* arr , int n) {
int res = 0;
// Disabled loop hint; when uncommented it requests vector width 4 and interleave count 2.
// #pragma clang loop vectorize_width(4) interleave_count(2)
for (int i = 0; i < n; i++) {
// Only elements with arr[i] >= n contribute to the sum.
if (n <= arr[i]) {
res += arr[i];
}
}
return res;
}
执行clang++ -S -emit-llvm -O1 vec_test.cc -o vec_test.ll
,获取标量版本的IR:
ll
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: read) uwtable
define dso_local noundef i32 @_Z19loop_vectorize_testPii(ptr nocapture noundef readonly %0, i32 noundef %1) local_unnamed_addr #0 {
%3 = icmp sgt i32 %1, 0
br i1 %3, label %4, label %6
4: ; preds = %2
%5 = zext i32 %1 to i64
br label %8
6: ; preds = %8, %2
%7 = phi i32 [ 0, %2 ], [ %15, %8 ]
ret i32 %7
8: ; preds = %4, %8
%9 = phi i64 [ 0, %4 ], [ %16, %8 ]
%10 = phi i32 [ 0, %4 ], [ %15, %8 ]
%11 = getelementptr inbounds i32, ptr %0, i64 %9
%12 = load i32, ptr %11, align 4, !tbaa !5
%13 = icmp slt i32 %12, %1
%14 = select i1 %13, i32 0, i32 %12
%15 = add nsw i32 %14, %10
%16 = add nuw nsw i64 %9, 1
%17 = icmp eq i64 %16, %5
br i1 %17, label %6, label %8, !llvm.loop !9
}
在循环基本块8中可以看到,这里的逻辑和C++代码逻辑基本一致,执行的是标量运算。现在我们解除上面C++代码中对于#pragma行的注释,并指定向量宽度vectorize_width为4,展开因子interleave_count为2。vectorize_width主要控制向量的宽度,在本例子中如果设置为4就是load <4 x i32> 向量到寄存器,设置为2就是load <2 x i32> 的数据到寄存器。(注意ARM64架构下,如果是load <4 x i16>,由于总宽度只有64位,可能最后直接就加载数据到普通的64位寄存器中)。interleave_count控制展开的次数,在本例子中count为2,也就是在一次循环迭代中,需要执行2次load <4 x i32>。
cpp
#pragma clang loop vectorize_width(4) interleave_count(2)
for (int i = 0; i < n; i++) {
...
再次执行前面的编译命令,现在可以看到IR已经进行了向量化优化:
ll
; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: read) uwtable
define dso_local noundef i32 @_Z19loop_vectorize_testPii(ptr nocapture noundef readonly %0, i32 noundef %1) local_unnamed_addr #0 {
%3 = icmp sgt i32 %1, 0
br i1 %3, label %4, label %36
4: ; preds = %2
%5 = zext i32 %1 to i64
%6 = icmp ult i32 %1, 8
br i1 %6, label %33, label %7
7: ; preds = %4
%8 = and i64 %5, 4294967288
%9 = insertelement <4 x i32> poison, i32 %1, i64 0
%10 = shufflevector <4 x i32> %9, <4 x i32> poison, <4 x i32> zeroinitializer
%11 = insertelement <4 x i32> poison, i32 %1, i64 0
%12 = shufflevector <4 x i32> %11, <4 x i32> poison, <4 x i32> zeroinitializer
br label %13
13: ; preds = %13, %7
%14 = phi i64 [ 0, %7 ], [ %27, %13 ]
%15 = phi <4 x i32> [ zeroinitializer, %7 ], [ %25, %13 ]
%16 = phi <4 x i32> [ zeroinitializer, %7 ], [ %26, %13 ]
%17 = getelementptr inbounds i32, ptr %0, i64 %14
%18 = load <4 x i32>, ptr %17, align 4, !tbaa !5
%19 = getelementptr inbounds i32, ptr %17, i64 4
%20 = load <4 x i32>, ptr %19, align 4, !tbaa !5
%21 = icmp slt <4 x i32> %18, %10
%22 = icmp slt <4 x i32> %20, %12
%23 = select <4 x i1> %21, <4 x i32> zeroinitializer, <4 x i32> %18
%24 = select <4 x i1> %22, <4 x i32> zeroinitializer, <4 x i32> %20
%25 = add <4 x i32> %23, %15
%26 = add <4 x i32> %24, %16
%27 = add nuw i64 %14, 8
%28 = icmp eq i64 %27, %8
br i1 %28, label %29, label %13, !llvm.loop !9
29: ; preds = %13
%30 = add <4 x i32> %26, %25
%31 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %30)
%32 = icmp eq i64 %8, %5
br i1 %32, label %36, label %33
33: ; preds = %4, %29
%34 = phi i64 [ 0, %4 ], [ %8, %29 ]
%35 = phi i32 [ 0, %4 ], [ %31, %29 ]
br label %38
36: ; preds = %38, %29, %2
%37 = phi i32 [ 0, %2 ], [ %31, %29 ], [ %45, %38 ]
ret i32 %37
38: ; preds = %33, %38
%39 = phi i64 [ %46, %38 ], [ %34, %33 ]
%40 = phi i32 [ %45, %38 ], [ %35, %33 ]
%41 = getelementptr inbounds i32, ptr %0, i64 %39
%42 = load i32, ptr %41, align 4, !tbaa !5
%43 = icmp slt i32 %42, %1
%44 = select i1 %43, i32 0, i32 %42
%45 = add nsw i32 %44, %40
%46 = add nuw nsw i64 %39, 1
%47 = icmp eq i64 %46, %5
br i1 %47, label %36, label %38, !llvm.loop !14
}
在循环块13中,可以看到LLVM已经按照我们指定的展开因子和向量宽度进行了自动向量化优化。然后再执行llc -march=aarch64 -filetype=obj vec_test.ll -o vec_test.o
,llvm-objdump -d vec_test.o
,可以看到arm64的汇编代码中将数组的数据加载到了128bit寄存器q3,q4(对arm64汇编不熟悉的读者可以先参考Armv8/armv9架构入门指南):
asm
0000000000000000 <_Z19loop_vectorize_testPii>:
0: 7100043f cmp w1, #0x1
4: 540000eb b.lt 0x20 <_Z19loop_vectorize_testPii+0x20>
8: 71001c3f cmp w1, #0x7
c: 2a0103e9 mov w9, w1
10: 540000e8 b.hi 0x2c <_Z19loop_vectorize_testPii+0x2c>
14: aa1f03ea mov x10, xzr
18: 2a1f03e8 mov w8, wzr
1c: 14000019 b 0x80 <_Z19loop_vectorize_testPii+0x80>
20: 2a1f03e8 mov w8, wzr
24: 2a0803e0 mov w0, w8
28: d65f03c0 ret
2c: 6f00e400 movi v0.2d, #0000000000000000
30: 6f00e401 movi v1.2d, #0000000000000000
34: 927d712a and x10, x9, #0xfffffff8
38: 4e040c22 dup v2.4s, w1
3c: 91004008 add x8, x0, #0x10
40: aa0a03eb mov x11, x10
44: ad7f9103 ldp q3, q4, [x8, #-0x10]
48: f100216b subs x11, x11, #0x8
4c: 91008108 add x8, x8, #0x20
50: 4ea33445 cmgt v5.4s, v2.4s, v3.4s
54: 4ea43446 cmgt v6.4s, v2.4s, v4.4s
58: 4e651c63 bic v3.16b, v3.16b, v5.16b
5c: 4e661c84 bic v4.16b, v4.16b, v6.16b
60: 4ea08460 add v0.4s, v3.4s, v0.4s
64: 4ea18481 add v1.4s, v4.4s, v1.4s
68: 54fffee1 b.ne 0x44 <_Z19loop_vectorize_testPii+0x44>
6c: 4ea08420 add v0.4s, v1.4s, v0.4s
70: eb09015f cmp x10, x9
74: 4eb1b800 addv s0, v0.4s
78: 1e260008 fmov w8, s0
7c: 54000120 b.eq 0xa0 <_Z19loop_vectorize_testPii+0xa0>
80: 8b0a080b add x11, x0, x10, lsl #2
84: cb0a0129 sub x9, x9, x10
88: b840456a ldr w10, [x11], #0x4
8c: 6b01015f cmp w10, w1
90: 1a8ab3ea csel w10, wzr, w10, lt
94: f1000529 subs x9, x9, #0x1
98: 0b080148 add w8, w10, w8
9c: 54ffff61 b.ne 0x88 <_Z19loop_vectorize_testPii+0x88>
a0: 2a0803e0 mov w0, w8
a4: d65f03c0 ret
2. 哪些常见的循环结构会被Loop Vectorize合法性检查拒绝
循环自动向量化虽然很方便,但是由于自动向量化代码生成的复杂性较高,同时需要确保生成代码的安全性和逻辑一致性,所以即使是很多常见的循环结构,也无法进行自动的向量化,下面是几种较为常见的无法自动向量化的类型。
2.1 循环中途退出,导致循环向量化器没法计算循环次数
cpp
// Scans arr[0..n) for the first element equal to n and returns that element's value.
// Returns 0 if no element matches or the loop never runs.
int loop_vectorize_test(int* arr , int n) {
int res = 0;
// Explicitly requests vectorization of the following loop.
#pragma clang loop vectorize(enable)
for (int i = 0; i < n; i++) {
if (arr[i] == n) {
res = arr[i];
// Early exit: the loop may terminate before i reaches n.
break;
}
}
return res;
}
使用命令编译clang++ -Rpass-analysis=loop-vectorize -O3 -S -emit-llvm vec_test.cc -o vec_test.ll
,可以看到
cmd
vec_test.cc:5:5: remark: loop not vectorized: could not determine number of loop iterations [-Rpass-analysis]
for (int i = 0; i < n; i++) {
2.2 循环中有较复杂的switch语句
cpp
// Rewrites arr[0..n) in place: elements equal to 0 are left as is, elements equal
// to 1 are replaced by their index i, and every other element is set to 0.
// Always returns 0, because res is never modified.
int loop_vectorize_test(int* arr, int n) {
int res = 0;
// Explicitly requests vectorization of the following loop.
#pragma clang loop vectorize(enable)
for (int i = 0; i < n; i++) {
// Three-way branch on the loaded element value.
switch(arr[i]) {
case 0: break;
case 1: arr[i] = i; break;
default: arr[i] = 0;
}
}
return res;
}
2.3 循环中有数据依赖
cpp
// Walks arr with a stride of 3 starting at index 1. Each iteration copies
// arr[i-1] into arr[i] and arr[i+3] into arr[i+1].
// Always returns 0, because res is never modified.
// NOTE(review): the largest index read is i + 3, which equals n when i == n - 3;
// if n is the element count of arr this is one past the end - confirm with the caller.
int loop_vectorize_test(int* arr, int n) {
int res = 0;
// Explicitly requests vectorization of the following loop.
#pragma clang loop vectorize(enable)
for (int i = 1; i <= n - 3; i += 3) {
arr[i] = arr[i-1];
// arr[i+3] is the slot that the next iteration (i + 3) writes as its arr[i],
// so consecutive iterations touch the same memory location.
arr[i+1] = arr[i+3];
}
return res;
}
3. 如何通过手动向量化优化程序
下面就一个在数组中循环查找第一个目标值的简单程序,讲解手动向量化的2种常见方案
cpp
// Scalar linear search.
// Returns the index of the first element of arr[0..n) equal to target,
// or -1 if no element matches.
int loop_vectorize_test(const int* arr, int n, int target) {
for (int i = 0; i < n; i++) {
if (arr[i] == target) {
// Return from inside the loop as soon as a match is found.
return i;
}
}
return -1;
}
(1)使用GCC/Clang编译器的向量类型拓展实现向量化搜索
cpp
// Use the compiler vector extension to define a vector of four 4-byte ints (16 bytes).
typedef int v4si __attribute__ ((vector_size (16)));
// Searches arr[0..n) for target, comparing four elements per iteration with the
// v4si vector type and then scanning the leftover tail one element at a time.
// Returns the index of the first matching element, or -1 if there is none.
int find_target_vectorize(const int* arr, int n, int target) {
int i = 0;
// Main loop: runs only while a full group of four elements remains.
for (; i + 3 < n; i += 4) {
v4si src = {arr[i], arr[i + 1], arr[i + 2], arr[i + 3]};
v4si tgt = {target, target, target, target};
// Element-wise comparison; the checks below treat a lane value of -1 as "matched".
v4si res = (src == tgt);
// If the sum of the lanes is non-zero, the target was found in this group.
// The compiler automatically optimizes this into a vector comparison.
if (res[0] + res[1] + res[2] + res[3] != 0) {
// Test lanes in ascending order so the lowest matching index is returned.
if (res[0] == -1) {
return i;
}
if (res[1] == -1) {
return i + 1;
}
if (res[2] == -1) {
return i + 2;
}
if (res[3] == -1) {
return i + 3;
}
}
}
// Search the remaining tail elements.
for (; i < n; i++) {
if (arr[i] == target) {
return i;
}
}
return -1;
}
通过clang++ -O3 -S -emit-llvm vec_test.cc -o vec_test.ll
编译可以看到ll文件中进行了向量的加载和比较。
ll
......
15: ; preds = %5, %38
%16 = phi i64 [ 0, %5 ], [ %39, %38 ]
%17 = phi i64 [ 3, %5 ], [ %40, %38 ]
%18 = getelementptr inbounds i32, ptr %0, i64 %16
%19 = load <4 x i32>, ptr %18, align 4, !tbaa !6
%20 = icmp eq <4 x i32> %19, %7
%21 = sext <4 x i1> %20 to <4 x i32>
%22 = extractelement <4 x i32> %21, i64 0
%23 = extractelement <4 x i32> %21, i64 1
%24 = add nsw i32 %22, %23
%25 = extractelement <4 x i32> %21, i64 2
%26 = add nsw i32 %24, %25
%27 = extractelement <4 x i32> %21, i64 3
%28 = sub nsw i32 0, %27
%29 = icmp eq i32 %26, %28
br i1 %29, label %38, label %30
30: ; preds = %15
%31 = icmp eq i32 %22, -1
br i1 %31, label %55, label %32
32: ; preds = %30
%33 = icmp eq i32 %23, -1
br i1 %33, label %57, label %34
......
(2)手动循环拆分后提示编译器进行向量化
对于有些循环中有退出循环的指令,如break, return等,我们可以将其改写为2个循环,外循环执行退出循环逻辑,内循环不退出,便于编译器进行循环自动向量化。
cpp
// Searches arr[0..n) for target using a split loop: the inner loop has a fixed
// trip count of four and no early exit, while all returns happen in the outer loop.
// Returns the index of the first matching element, or -1 if there is none.
int find_target_vectorize_nested(const int* arr, int n, int target) {
int i = 0;
// Main loop: process one group of four elements per outer iteration.
for (; i + 3 < n; i += 4) {
bool mask = 0;
// Request a vector width of 4 for the inner loop.
#pragma clang loop vectorize_width(4)
for (int j = i; j < i + 4; j++) {
// Accumulate whether any element of the group equals target.
mask |= (arr[j] == target);
}
if (mask != false) {
// A match exists in this group; test in ascending order to return the lowest index.
if (arr[i] == target) {
return i;
}
if (arr[i + 1] == target) {
return i + 1;
}
if (arr[i + 2] == target) {
return i + 2;
}
if (arr[i + 3] == target) {
return i + 3;
}
}
}
// Search the remaining tail elements.
for (; i < n; i++) {
if (arr[i] == target) {
return i;
}
}
return -1; // target value not found
}
编译后的ll中,可以看到从数组中加载向量和比较的过程
ll
......
5: ; preds = %3
%6 = zext i32 %2 to i64
%7 = insertelement <4 x i32> poison, i32 %1, i64 0
%8 = shufflevector <4 x i32> %7, <4 x i32> poison, <4 x i32> zeroinitializer
br label %9
9: ; preds = %5, %42
%10 = phi i64 [ 3, %5 ], [ %44, %42 ]
%11 = phi i64 [ 0, %5 ], [ %17, %42 ]
%12 = getelementptr inbounds i32, ptr %0, i64 %11
%13 = load <4 x i32>, ptr %12, align 4, !tbaa !6
%14 = icmp eq <4 x i32> %13, %8
%15 = bitcast <4 x i1> %14 to i4
%16 = icmp eq i4 %15, 0
%17 = add nuw nsw i64 %11, 4
br i1 %16, label %42, label %26
......
(3)各个平台提供了向量化相关的intrinsics库,如Arm的NEON、Intel的SSE/AVX等,通过调用库函数的方式执行向量运算。读者可以根据需求去阅读相关的文档并尝试实现,本文不再对该方法进行展开。