Make MaxDepth in value tracking configurable.
A small MaxDepth can cause low-performance code to be generated. An example is shown below:
// b.cc #include <cstddef> #include <stdint.h> typedef long long index; extern "C" index g_tid; extern "C" index g_num; void add3(float* __restrict__ a, float* __restrict__ b, float* __restrict__ c) { index n = 64*1024; index m = 16*1024; index k = 4*1024; index tid = g_tid; index num = g_num; __builtin_assume_aligned(a, 32); __builtin_assume_aligned(b, 32); __builtin_assume_aligned(c, 32); for (index i0=tid*k; i0<m; i0+=num*k) for (index i1=0; i1<n*m; i1+=m) for (index i2=0; i2<k; i2++) c[i1+i0+i2] = b[i0+i2] + a[i1+i0+i2]; }
Compile with clang ./b.cc -Ofast -march=native -std=c++14 -S -o b.s (on an Intel i7-7500U),
which yields:
// b.s ...... vmovaps -224(%rdi,%rbx,4), %ymm0 vmovups -192(%rdi,%rbx,4), %ymm1 vmovups -160(%rdi,%rbx,4), %ymm2 vmovups -128(%rdi,%rbx,4), %ymm3 vaddps -224(%rsi,%rbx,4), %ymm0, %ymm0 vaddps -192(%rsi,%rbx,4), %ymm1, %ymm1 vaddps -160(%rsi,%rbx,4), %ymm2, %ymm2 vaddps -128(%rsi,%rbx,4), %ymm3, %ymm3 vmovaps %ymm0, -224(%rdx,%rbx,4) vmovups %ymm1, -192(%rdx,%rbx,4) vmovups %ymm2, -160(%rdx,%rbx,4) vmovups %ymm3, -128(%rdx,%rbx,4) ......
Expected (every load and store uses the aligned vmovaps instead of the unaligned vmovups):
// b.s ...... vmovaps -224(%rdi,%rbx,4), %ymm0 vmovaps -192(%rdi,%rbx,4), %ymm1 vmovaps -160(%rdi,%rbx,4), %ymm2 vmovaps -128(%rdi,%rbx,4), %ymm3 vaddps -224(%rsi,%rbx,4), %ymm0, %ymm0 vaddps -192(%rsi,%rbx,4), %ymm1, %ymm1 vaddps -160(%rsi,%rbx,4), %ymm2, %ymm2 vaddps -128(%rsi,%rbx,4), %ymm3, %ymm3 vmovaps %ymm0, -224(%rdx,%rbx,4) vmovaps %ymm1, -192(%rdx,%rbx,4) vmovaps %ymm2, -160(%rdx,%rbx,4) vmovaps %ymm3, -128(%rdx,%rbx,4) ......
This happens because MaxDepth is too small, so LLVM is unable to compute the alignment information. Compiling with clang ./b.cc -Ofast -march=native -std=c++14 -mllvm -value-tracking-max-depth=10 -S -o b.s produces the expected assembly code.