This patch extends induction variable widening to handle "sub nsw" and "mul nsw" instructions. Currently only "add nsw" instructions are widened.
This patch eliminates many "sext" instructions in 64-bit code (and the corresponding target code) in cases like:
int N = 100;
float **A;
void foo(int x0, int x1)
{
float * A_cur = &A[0][0];
float * A_next = &A[1][0];
for(int x = x0; x < x1; ++x)
{
// Currently only the [x + N] case is widened. The other 2 cases lead to sext.
// This patch fixes that, so none of the 3 cases needs a sext.
const float div = A_cur[x + N] + A_cur[x - N] + A_cur[x * N];
A_next[x] = div;
}
}
...
> clang++ test.cpp -march=core-avx2 -Ofast -fno-unroll-loops -fno-tree-vectorize -S -o -
(with my patch)
.LBB0_2: # %for.body
# =>This Inner Loop Header: Depth=1
vmovss (%rdi,%rcx,4), %xmm0
vaddss (%rdx,%rcx,4), %xmm0, %xmm0
vaddss (%rax), %xmm0, %xmm0
vmovss %xmm0, (%r8,%rcx,4)
incq %rcx
addq %r9, %rax
cmpl %esi, %ecx
jl .LBB0_2
vs trunk:
.LBB0_2: # %for.body
# =>This Inner Loop Header: Depth=1
vmovss (%r10,%rcx,4), %xmm0
leal (%r11,%rcx), %edx
movslq %edx, %rdx
vaddss (%rax,%rdx,4), %xmm0, %xmm0
movslq %edi, %rdi
vaddss (%rax,%rdi,4), %xmm0, %xmm0
vmovss %xmm0, (%r8,%rcx,4)
incq %rcx
addl %r9d, %edi
cmpl %esi, %ecx
jl .LBB0_2