This patch extends widening of induction variables to cover "sub nsw" and "mul nsw" instructions. Currently, only "add nsw" instructions are widened.
This patch eliminates a large number of "sext" instructions in 64-bit code (and the corresponding target code) in cases like:
int N = 100;
float **A;
void foo(int x0, int x1) {
  float * A_cur = &A[0][0];
  float * A_next = &A[1][0];
  for(int x = x0; x < x1; ++x) {
    // Currently only the [x+N] case is widened. The other 2 cases lead to sext.
    // This patch fixes it, so all 3 cases do not need sext.
    const float div = A_cur[x + N] + A_cur[x - N] + A_cur[x * N];
    A_next[x] = div;
  }
}
...
> clang++ test.cpp -march=core-avx2 -Ofast -fno-unroll-loops -fno-tree-vectorize -S -o -
(with my patch)
.LBB0_2:                                # %for.body
                                        # =>This Inner Loop Header: Depth=1
    vmovss  (%rdi,%rcx,4), %xmm0
    vaddss  (%rdx,%rcx,4), %xmm0, %xmm0
    vaddss  (%rax), %xmm0, %xmm0
    vmovss  %xmm0, (%r8,%rcx,4)
    incq    %rcx
    addq    %r9, %rax
    cmpl    %esi, %ecx
    jl      .LBB0_2
vs trunk:
.LBB0_2:                                # %for.body
                                        # =>This Inner Loop Header: Depth=1
    vmovss  (%r10,%rcx,4), %xmm0
    leal    (%r11,%rcx), %edx
    movslq  %edx, %rdx
    vaddss  (%rax,%rdx,4), %xmm0, %xmm0
    movslq  %edi, %rdi
    vaddss  (%rax,%rdi,4), %xmm0, %xmm0
    vmovss  %xmm0, (%r8,%rcx,4)
    incq    %rcx
    addl    %r9d, %edi
    cmpl    %esi, %ecx
    jl      .LBB0_2