写在前面

在使用现代编译器时,对于计算密集型循环,我们可以使用 #pragma simd 或 #pragma ivdep 来获得“免费”的性能提升。

其本质是为编译器提供了更多的先验知识,使其能够更有效地进行自动向量化、指令集并行,甚至使用 SIMD 指令。

但是,嵌入式的 MCU 往往:

  • 没有 SIMD 指令集
  • 单发射,一次只能执行一条指令
  • 流水线深度较浅

在这种情况下,编译器的自动向量化是否还能优化性能呢?本文将在各种 MCU 平台上对比手动循环展开和编译器自动优化的效果,来探讨这一问题。

需要说明的是,这并非严谨的性能测试,各位图一乐就好(

测试代码

测试代码为简单的前缀和加法:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
/* Reference implementation: plain sequential accumulation, one element
 * per iteration. All unrolled variants must match its result. */
int baseline(int *restrict arr, size_t len) {
    int total = 0;
    size_t idx = 0;
    while (idx < len) {
        total += arr[idx];
        idx++;
    }
    return total;
}

/* 1x2 unroll: two elements folded into one accumulator per iteration.
 *
 * Fix: the original main-loop guard was `i < len`, which reads
 * arr[i + 1] one past the end when len is odd (undefined behavior).
 * Guarding with `i + 1 < len` leaves any odd tail element to the
 * scalar cleanup loop. */
int unroll_1x2(int *restrict arr, size_t len) {
    int sum = 0;
    size_t i;
    for (i = 0; i + 1 < len; i += 2) {
        sum += arr[i] + arr[i + 1];
    }
    for (; i < len; i++) {
        sum += arr[i];
    }
    return sum;
}

/* 1x4 unroll: four elements folded into one accumulator per iteration.
 *
 * Fix: the original guard `i < len` reads up to arr[i + 3] past the end
 * whenever len is not a multiple of 4 (undefined behavior). Guard with
 * `i + 3 < len` and let the cleanup loop handle the 0-3 tail elements. */
int unroll_1x4(int *restrict arr, size_t len) {
    int sum = 0;
    size_t i;
    for (i = 0; i + 3 < len; i += 4) {
        sum += arr[i] + arr[i + 1] + arr[i + 2] + arr[i + 3];
    }
    for (; i < len; i++) {
        sum += arr[i];
    }
    return sum;
}

/* 4x1 unroll: four independent accumulators, one element each per
 * iteration, to break the sequential dependency on a single sum.
 *
 * Fixes: the original had no remainder loop and used the guard
 * `i < len`, so for len not a multiple of 4 it both read out of
 * bounds (UB) and returned the wrong sum. */
int unroll_4x1(int *restrict arr, size_t len) {
    int sum0 = 0, sum1 = 0, sum2 = 0, sum3 = 0;
    size_t i;
    for (i = 0; i + 3 < len; i += 4) {
        sum0 += arr[i];
        sum1 += arr[i + 1];
        sum2 += arr[i + 2];
        sum3 += arr[i + 3];
    }
    for (; i < len; i++) {
        sum0 += arr[i];
    }
    return sum0 + sum1 + sum2 + sum3;
}

/* 2x2 unroll: two accumulators, two elements each per iteration.
 *
 * Fixes: the original guard `i < len` read up to arr[i + 3] out of
 * bounds for len not a multiple of 4 (UB); also removed the unused
 * locals sum2/sum3. */
int unroll_2x2(int *restrict arr, size_t len) {
    int sum0 = 0, sum1 = 0;
    size_t i;
    for (i = 0; i + 3 < len; i += 4) {
        sum0 += arr[i] + arr[i + 1];
        sum1 += arr[i + 2] + arr[i + 3];
    }
    for (; i < len; i++) {
        sum0 += arr[i];
    }
    return sum0 + sum1;
}

/* 4x4 unroll: four accumulators, four elements each — 16 elements per
 * iteration.
 *
 * Fix: the original guard `i < len` read up to arr[i + 15] out of
 * bounds whenever len is not a multiple of 16 (UB). Guard with
 * `i + 15 < len`; the cleanup loop handles the 0-15 tail elements. */
int unroll_4x4(int *restrict arr, size_t len) {
    int sum0 = 0, sum1 = 0, sum2 = 0, sum3 = 0;
    size_t i;
    for (i = 0; i + 15 < len; i += 16) {
        sum0 += arr[i] + arr[i + 1] + arr[i + 2] + arr[i + 3];
        sum1 += arr[i + 4] + arr[i + 5] + arr[i + 6] + arr[i + 7];
        sum2 += arr[i + 8] + arr[i + 9] + arr[i + 10] + arr[i + 11];
        sum3 += arr[i + 12] + arr[i + 13] + arr[i + 14] + arr[i + 15];
    }
    for (; i < len; i++) {
        sum0 += arr[i];
    }
    return sum0 + sum1 + sum2 + sum3;
}

/* Baseline loop annotated with `#pragma unroll`, asking the compiler to
 * unroll it on its own.
 * NOTE(review): `#pragma unroll` without a count is a Clang/ICC
 * extension; GCC ignores unknown pragmas — consistent with the measured
 * symbol sizes below, where pragma_unroll matches baseline exactly. */
int pragma_unroll(int *restrict arr, size_t len) {
int sum = 0;
#pragma unroll
for (size_t i = 0; i < len; i++) {
sum += arr[i];
}
return sum;
}

/* Baseline loop annotated with `#pragma simd`, hinting that the loop is
 * safe to vectorize.
 * NOTE(review): `#pragma simd` is an ICC-era pragma; GCC/Clang ignore
 * it (the OpenMP spelling would be `#pragma omp simd`), which matches
 * the unchanged symbol sizes in the tables below. */
int pragma_simd(int *restrict arr, size_t len) {
int sum = 0;
#pragma simd
for (size_t i = 0; i < len; i++) {
sum += arr[i];
}
return sum;
}

/* Baseline loop annotated with `#pragma ivdep`, asserting no
 * loop-carried memory dependencies.
 * NOTE(review): the GCC spelling is `#pragma GCC ivdep`; bare
 * `#pragma ivdep` is ICC syntax and is ignored by GCC — consistent
 * with the identical baseline/pragma_ivdep results measured below. */
int pragma_ivdep(int *restrict arr, size_t len) {
int sum = 0;
#pragma ivdep
for (size_t i = 0; i < len; i++) {
sum += arr[i];
}
return sum;
}

/* 1x2 unroll annotated with `#pragma vector aligned` (ICC syntax,
 * ignored by GCC/Clang), which asserts the accesses are aligned.
 *
 * Fixes: the original guard `i < len` read arr[i + 1] one past the end
 * for odd len (UB), and there was no cleanup loop, so an odd tail
 * element was never summed. */
int pragma_align(int *restrict arr, size_t len) {
    int sum = 0;
    size_t i;
#pragma vector aligned
    for (i = 0; i + 1 < len; i += 2) {
        sum += arr[i] + arr[i + 1];
    }
    for (; i < len; i++) {
        sum += arr[i];
    }
    return sum;
}

在 x86 平台上使用 Time Stamp Counter来获得时间:

1
2
3
/* Read the x86 Time Stamp Counter via the compiler intrinsic.
 * NOTE(review): __rdtsc() is not a serializing instruction, so
 * out-of-order execution can skew very short measurements — the
 * harness below mitigates this with repeated runs, but confirm if
 * finer precision is ever needed (lfence/rdtscp). */
uint64_t rdtsc() {
return __rdtsc();
}

在嵌入式平台上则使用 SysTick或其他高精度定时器:

1
2
3
4
5
6
7
/* Current SysTick counter value on Cortex-M.
 * NOTE(review): SysTick->VAL counts DOWN from the reload value and
 * wraps — elapsed-time math at the call site must account for the
 * direction and the reload period; confirm against callers. */
uint32_t get_tick_count(void) {
return SysTick->VAL;
}

/* CPU cycle counter on ESP32 via the ESP-IDF helper; wraps at 2^32. */
uint32_t get_cycle_count() {
return esp_cpu_get_cycle_count();
}

在有 cache 的平台上,预热 cache 确保结果准确:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
/* Benchmark harness: runs `func(arr, len)` WARMUP_COUNT times to warm
 * caches/branch predictors, then TEST_COUNT timed runs, and prints
 * min / median / max / trimmed mean / spread / CPE on one table row.
 * Every run is checked against the warm-up result to catch miscompiled
 * or incorrect variants.
 *
 * Fixes vs. the original:
 *  - `func` is now fully prototyped instead of the unprototyped
 *    `int (*func)()` (which is (void) in C23 and hides arity errors);
 *  - printf used %lu for uint64_t, which is UB on ABIs where uint64_t
 *    is unsigned long long — values are now cast and printed with %llu;
 *  - removed the unused local `std_dev`;
 *  - loop indices over TEST_COUNT are consistently size_t. */
__always_inline void run_test(int (*func)(int *restrict, size_t),
                              int *restrict arr, size_t len, const char *name) {
    uint64_t time_usage[TEST_COUNT];

    /* warm up; `sum` is the reference result for all later runs */
    int sum = func(arr, len);
    for (int i = 0; i < WARMUP_COUNT; i++) {
        int res = func(arr, len);
        assert(sum == res);
    }

    for (size_t i = 0; i < TEST_COUNT; i++) {
        /* compiler barrier: keep the timed call from being hoisted/sunk */
        __asm__ volatile("" ::: "memory");

        uint64_t start = rdtsc();
        int res = func(arr, len);
        uint64_t end = rdtsc();

        __asm__ volatile("" ::: "memory");

        time_usage[i] = end - start;
        assert(sum == res);
    }

    /* sort samples ascending (bubble sort is fine for a small TEST_COUNT) */
    for (size_t i = 0; i + 1 < TEST_COUNT; i++) {
        for (size_t j = 0; j + 1 < TEST_COUNT - i; j++) {
            if (time_usage[j] > time_usage[j + 1]) {
                uint64_t temp = time_usage[j];
                time_usage[j] = time_usage[j + 1];
                time_usage[j + 1] = temp;
            }
        }
    }

    uint64_t min_time = time_usage[0];
    uint64_t max_time = time_usage[TEST_COUNT - 1];
    uint64_t median;

    if (TEST_COUNT % 2 == 0) {
        median = (time_usage[TEST_COUNT / 2 - 1] + time_usage[TEST_COUNT / 2]) / 2;
    } else {
        median = time_usage[TEST_COUNT / 2];
    }

    /* trimmed mean: drop the single fastest and slowest sample to damp
       scheduler/interrupt outliers */
    uint64_t total_time = 0;
    for (size_t i = 1; i + 1 < TEST_COUNT; i++) {
        total_time += time_usage[i];
    }
    double avg_time = (double)total_time / (TEST_COUNT - 2);

    /* cycles per element */
    double cpe = avg_time / (double)len;

    printf("%-15s | %10llu | %10llu | %10llu | %10.2f | %9.2f%% | %10.4f\n",
           name,
           (unsigned long long)min_time,
           (unsigned long long)median,
           (unsigned long long)max_time,
           avg_time,
           (double)(max_time - min_time) * 100 / avg_time, cpe);
}

测试结果与分析

x86_64

Ryzen 5800U,WSL 环境,样本大小为 1M:

编译器版本 编译器选项 测试方法 耗时 (ticks) CPE
gcc 14.2.1 -O0 baseline 2028703 2.0287
gcc 14.2.1 -O0 unroll_1x2 1343064 1.3431
gcc 14.2.1 -O0 unroll_1x4 955068 0.9551
gcc 14.2.1 -O0 unroll_4x1 1095649 1.0956
gcc 14.2.1 -O0 unroll_2x2 1036262 1.0363
gcc 14.2.1 -O0 unroll_4x4 778363 0.7784
gcc 14.2.1 -O0 pragma_unroll 2193519 2.1935
gcc 14.2.1 -O0 pragma_simd 2156972 2.1570
gcc 14.2.1 -O0 pragma_ivdep 2036412 2.0364
gcc 14.2.1 -O0 pragma_align 1339773 1.3398
gcc 14.2.1 -Ofast -funroll-loops baseline 130114 0.1241
gcc 14.2.1 -Ofast -funroll-loops unroll_1x2 109789 0.1047
gcc 14.2.1 -Ofast -funroll-loops unroll_1x4 135755 0.1295
gcc 14.2.1 -Ofast -funroll-loops unroll_4x1 130874 0.1248
gcc 14.2.1 -Ofast -funroll-loops unroll_2x2 107680 0.1027
gcc 14.2.1 -Ofast -funroll-loops unroll_4x4 268581 0.2561
gcc 14.2.1 -Ofast -funroll-loops pragma_unroll 130929 0.1249
gcc 14.2.1 -Ofast -funroll-loops pragma_simd 130634 0.1246
gcc 14.2.1 -Ofast -funroll-loops pragma_ivdep 130245 0.1242
gcc 14.2.1 -Ofast -funroll-loops pragma_align 103944 0.0991
gcc 14.2.1 -Ofast -march=native -funroll-loops baseline 76857 0.0733
gcc 14.2.1 -Ofast -march=native -funroll-loops unroll_1x2 202782 0.1934
gcc 14.2.1 -Ofast -march=native -funroll-loops unroll_1x4 381189 0.3635
gcc 14.2.1 -Ofast -march=native -funroll-loops unroll_4x1 79439 0.0758
gcc 14.2.1 -Ofast -march=native -funroll-loops unroll_2x2 202879 0.1935
gcc 14.2.1 -Ofast -march=native -funroll-loops unroll_4x4 799904 0.7628
gcc 14.2.1 -Ofast -march=native -funroll-loops pragma_unroll 80628 0.0769
gcc 14.2.1 -Ofast -march=native -funroll-loops pragma_simd 79828 0.0761
gcc 14.2.1 -Ofast -march=native -funroll-loops pragma_ivdep 80134 0.0764
gcc 14.2.1 -Ofast -march=native -funroll-loops pragma_align 204036 0.1946

在现代处理器的 SIMD 和超标量执行技术的优化下,CPE(cycle per element)甚至可以远低于 1。

在优化等级为 O0时,手动循环展开相较于 baseline 均获得了一定程度的提升。

然而,当开启 Ofast优化并指定 -funroll-loops选项后,手动循环展开带来的性能提升变得非常微弱。特别地,unroll_4x4 的性能甚至不如 baseline,这可能是由于单次迭代的指令过多,造成流水线前端解码或后端执行资源饱和。

在指定 -march=native后,手动循环展开的性能普遍低于编译器自动优化生成的代码。不过,unroll_4x1 由于其同时操作四个元素的特性,恰好符合 SSE 的工作模式,因此性能未受到显著影响(?

RV32-IMC

ESP32-C3,样本大小为 8K,开启 32KB XIP cache:

编译器版本 编译器选项 测试方法 耗时 (ticks) CPE 符号大小 (bytes)^1
riscv32-esp-elf-gcc 12.2.0 -O0 baseline 173104 21.1309 26
riscv32-esp-elf-gcc 12.2.0 -O0 unroll_1x2 131940 16.1060 56
riscv32-esp-elf-gcc 12.2.0 -O0 unroll_1x4 107204 13.0865 80
riscv32-esp-elf-gcc 12.2.0 -O0 unroll_4x1 119566 14.5955 74
riscv32-esp-elf-gcc 12.2.0 -O0 unroll_2x2 109268 13.3384 84
riscv32-esp-elf-gcc 12.2.0 -O0 unroll_4x4 91746 11.1995 236
riscv32-esp-elf-gcc 12.2.0 -O0 pragma_unroll 173132 21.1344 26
riscv32-esp-elf-gcc 12.2.0 -O0 pragma_simd 181386 22.1419 26
riscv32-esp-elf-gcc 12.2.0 -O0 pragma_ivdep 173135 21.1347 26
riscv32-esp-elf-gcc 12.2.0 -O0 pragma_align 131924 16.1041 38
riscv32-esp-elf-gcc 12.2.0 -Ofast baseline 49460 6.0377 26
riscv32-esp-elf-gcc 12.2.0 -Ofast unroll_1x2 32988 4.0269 56
riscv32-esp-elf-gcc 12.2.0 -Ofast unroll_1x4 24736 3.0196 80
riscv32-esp-elf-gcc 12.2.0 -Ofast unroll_4x1 24713 3.0168 74
riscv32-esp-elf-gcc 12.2.0 -Ofast unroll_2x2 24698 3.0149 84
riscv32-esp-elf-gcc 12.2.0 -Ofast unroll_4x4 18544 2.2637 236
riscv32-esp-elf-gcc 12.2.0 -Ofast pragma_unroll 49451 6.0365 26
riscv32-esp-elf-gcc 12.2.0 -Ofast pragma_simd 49430 6.0340 26
riscv32-esp-elf-gcc 12.2.0 -Ofast pragma_ivdep 49449 6.0363 26
riscv32-esp-elf-gcc 12.2.0 -Ofast pragma_align 32976 4.0254 38
riscv32-esp-elf-gcc 12.2.0 -Ofast -funroll-loops baseline 21641 2.6418 328
riscv32-esp-elf-gcc 12.2.0 -Ofast -funroll-loops unroll_1x2 18559 2.2656 660
riscv32-esp-elf-gcc 12.2.0 -Ofast -funroll-loops unroll_1x4 17535 2.1406 606
riscv32-esp-elf-gcc 12.2.0 -Ofast -funroll-loops unroll_4x1 17785 2.1711 568
riscv32-esp-elf-gcc 12.2.0 -Ofast -funroll-loops unroll_2x2 17536 2.1407 602
riscv32-esp-elf-gcc 12.2.0 -Ofast -funroll-loops unroll_4x4 18060 2.2046 362
riscv32-esp-elf-gcc 12.2.0 -Ofast -funroll-loops pragma_unroll 21654 2.6434 328
riscv32-esp-elf-gcc 12.2.0 -Ofast -funroll-loops pragma_simd 20638 2.5193 328
riscv32-esp-elf-gcc 12.2.0 -Ofast -funroll-loops pragma_ivdep 20597 2.5143 328
riscv32-esp-elf-gcc 12.2.0 -Ofast -funroll-loops pragma_align 18562 2.2659 640
riscv32-esp-elf-gcc 12.2.0 -Oz baseline 90668 11.0679 25
riscv32-esp-elf-gcc 12.2.0 -Oz unroll_1x2 49469 6.0388 56
riscv32-esp-elf-gcc 12.2.0 -Oz unroll_1x4 32987 4.0268 80
riscv32-esp-elf-gcc 12.2.0 -Oz unroll_4x1 37081 4.5266 74
riscv32-esp-elf-gcc 12.2.0 -Oz unroll_2x2 32988 4.0269 84
riscv32-esp-elf-gcc 12.2.0 -Oz unroll_4x4 25765 3.1451 236
riscv32-esp-elf-gcc 12.2.0 -Oz pragma_unroll 90696 11.0714 26
riscv32-esp-elf-gcc 12.2.0 -Oz pragma_simd 90686 11.0702 26
riscv32-esp-elf-gcc 12.2.0 -Oz pragma_ivdep 90686 11.0702 26
riscv32-esp-elf-gcc 12.2.0 -Oz pragma_align 49430 6.0340 38

可以看到,测试结果和 x86 上的结果几乎完全相反——在任何情况下,手动循环展开的性能都优于编译器自动优化。

当开启 Ofast优化时,和 O0相比,符号大小并未发生变化,说明编译器并没有尝试循环展开。即使是 pragma_unroll 版本,编译器也没有进行循环展开。

在指定 -funroll-loops选项后,CPE 下降到接近 2,此时的瓶颈主要在于访存和加法(各占用一个时钟周期)。在这种情况下,进一步手动循环展开的收益非常有限,反而会增加二进制的体积。

然而,这种性能提升并非是“免费”的——手动循环展开的符号大小远高于 baseline。特别是指定 -funroll-loops后,相较于 baseline 符号大小增加了十几倍。在 x86 计算机上,这种空间换时间的策略没有什么太大的问题,但在 FLASH 资源紧张的嵌入式环境,必须在性能和代码大小之间做出权衡。

Cortex-M0+

STM32F030F4P6,样本大小为 384,FLASH Latency 1 WS:

编译器版本 编译器选项 测试方法 耗时 (ticks) CPE
armclang 6.19 -O0 baseline 16969 44.1901
armclang 6.19 -O0 unroll_1x2 9881 25.7318
armclang 6.19 -O0 unroll_1x4 5753 14.9818
armclang 6.19 -O0 unroll_4x1 9213 23.9922
armclang 6.19 -O0 unroll_2x2 6916 18.0104
armclang 6.19 -O0 unroll_4x4 3613 9.4089
armclang 6.19 -O0 pragma_unroll 16969 44.1901
armclang 6.19 -O0 pragma_simd 17736 46.1875
armclang 6.19 -O0 pragma_ivdep 16969 44.1901
armclang 6.19 -O0 pragma_align 9865 25.6901
armclang 6.19 -Ofast baseline 3081 8.0234
armclang 6.19 -Ofast unroll_1x2 2800 7.2917
armclang 6.19 -Ofast unroll_1x4 2776 7.2292
armclang 6.19 -Ofast unroll_4x1 5112 13.3125
armclang 6.19 -Ofast unroll_2x2 2790 7.2656
armclang 6.19 -Ofast unroll_4x4 3237 8.4297
armclang 6.19 -Ofast pragma_unroll 3273 8.5234
armclang 6.19 -Ofast pragma_simd 3080 8.0208
armclang 6.19 -Ofast pragma_ivdep 3080 8.0208
armclang 6.19 -Ofast pragma_align 2785 7.2526
armclang 6.19 -Oz baseline 5404 14.0729
armclang 6.19 -Oz unroll_1x2 3498 9.1094
armclang 6.19 -Oz unroll_1x4 2252 5.8646
armclang 6.19 -Oz unroll_4x1 4671 12.1641
armclang 6.19 -Oz unroll_2x2 2930 7.6302
armclang 6.19 -Oz unroll_4x4 3137 8.1693
armclang 6.19 -Oz pragma_unroll 5404 14.0729
armclang 6.19 -Oz pragma_simd 5019 13.0703
armclang 6.19 -Oz pragma_ivdep 5404 14.0729
armclang 6.19 -Oz pragma_align 3496 9.1042
arm-none-eabi-gcc 14.2.1 -O0 baseline 5427 14.1328
arm-none-eabi-gcc 14.2.1 -O0 unroll_1x2 4088 10.6458
arm-none-eabi-gcc 14.2.1 -O0 unroll_1x4 3320 8.6458
arm-none-eabi-gcc 14.2.1 -O0 unroll_4x1 3425 8.9193
arm-none-eabi-gcc 14.2.1 -O0 unroll_2x2 3323 8.6536
arm-none-eabi-gcc 14.2.1 -O0 unroll_4x4 4086 10.6406
arm-none-eabi-gcc 14.2.1 -O0 pragma_unroll 5427 14.1328
arm-none-eabi-gcc 14.2.1 -O0 pragma_simd 5427 14.1328
arm-none-eabi-gcc 14.2.1 -O0 pragma_ivdep 5427 14.1328
arm-none-eabi-gcc 14.2.1 -O0 pragma_align 4085 10.6380
arm-none-eabi-gcc 14.2.1 -Ofast baseline 3844 10.0104
arm-none-eabi-gcc 14.2.1 -Ofast unroll_1x2 3071 7.9974
arm-none-eabi-gcc 14.2.1 -Ofast unroll_1x4 2303 5.9974
arm-none-eabi-gcc 14.2.1 -Ofast unroll_4x1 2321 6.0443
arm-none-eabi-gcc 14.2.1 -Ofast unroll_2x2 2309 6.0130
arm-none-eabi-gcc 14.2.1 -Ofast unroll_4x4 2171 5.6536
arm-none-eabi-gcc 14.2.1 -Ofast pragma_unroll 4222 10.9948
arm-none-eabi-gcc 14.2.1 -Ofast pragma_simd 3840 10.0000
arm-none-eabi-gcc 14.2.1 -Ofast pragma_ivdep 4222 10.9948
arm-none-eabi-gcc 14.2.1 -Ofast pragma_align 3071 7.9974
arm-none-eabi-gcc 14.2.1 -Oz baseline 6183 16.1016
arm-none-eabi-gcc 14.2.1 -Oz unroll_1x2 4462 11.6198
arm-none-eabi-gcc 14.2.1 -Oz unroll_1x4 2908 7.5729
arm-none-eabi-gcc 14.2.1 -Oz unroll_4x1 2919 7.6016
arm-none-eabi-gcc 14.2.1 -Oz unroll_2x2 3008 7.8333
arm-none-eabi-gcc 14.2.1 -Oz unroll_4x4 2426 6.3177
arm-none-eabi-gcc 14.2.1 -Oz pragma_unroll 6183 16.1016
arm-none-eabi-gcc 14.2.1 -Oz pragma_simd 6183 16.1016
arm-none-eabi-gcc 14.2.1 -Oz pragma_ivdep 6165 16.0547
arm-none-eabi-gcc 14.2.1 -Oz pragma_align 4444 11.5729

(由于 STM32F030F4P6 的 FLASH 实在太小,就不测 -funroll-loops啦 qwq)

整体上看和 ESP32 的状况差不多——大部分情况下,手工展开循环占有一定优势,但在少部分情况下会出现反例(如 armclang unroll_4x1)。

而 armclang 作为商业编译器,在各个方面上表现均优于 gcc。

总结

在现代高性能处理器上编程时,通常无需操心“手动循环展开”这种脏活累活——即使真的遇到了性能瓶颈,也可以使用多线程甚至 CUDA 进行加速。

但是,为 MCU 设计程序时,由于工具链的古老以及各种客观条件的限制,我们不得不深入分析和研究这些问题,在性能、资源占用和开发效率之间做出合理的权衡。

拓展阅读

【深缓中字】您可能浪费了大部分的 CPU 性能
《深入理解计算机系统》第 5 章 优化程序性能