Index: llvm/trunk/lib/Target/X86/X86TargetTransformInfo.h
===================================================================
--- llvm/trunk/lib/Target/X86/X86TargetTransformInfo.h
+++ llvm/trunk/lib/Target/X86/X86TargetTransformInfo.h
@@ -101,6 +101,8 @@
   int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty);
   int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
                     Type *Ty);
+  bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
+                     TargetTransformInfo::LSRCost &C2);
   bool isLegalMaskedLoad(Type *DataType);
   bool isLegalMaskedStore(Type *DataType);
   bool isLegalMaskedGather(Type *DataType);
Index: llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp
+++ llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2178,6 +2178,17 @@
   return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
 }
 
+bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
+                               TargetTransformInfo::LSRCost &C2) {
+  // X86 specific here are "instruction number 1st priority".
+  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
+                  C1.NumIVMuls, C1.NumBaseAdds,
+                  C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
+         std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
+                  C2.NumIVMuls, C2.NumBaseAdds,
+                  C2.ScaleCost, C2.ImmCost, C2.SetupCost);
+}
+
 bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
   Type *ScalarTy = DataTy->getScalarType();
   int DataWidth = isa<PointerType>(ScalarTy) ?
Index: llvm/trunk/lib/Transforms/Scalar/LoopStrengthReduce.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ llvm/trunk/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -131,7 +131,7 @@
 
 // The flag adds instruction count to solutions cost comparision.
 static cl::opt<bool> InsnsCost(
-  "lsr-insns-cost", cl::Hidden, cl::init(false),
+  "lsr-insns-cost", cl::Hidden, cl::init(true),
   cl::desc("Add instruction count to a LSR cost model"));
 
 // Flag to choose how to narrow complex lsr solution
Index: llvm/trunk/test/CodeGen/X86/2006-05-11-InstrSched.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/2006-05-11-InstrSched.ll
+++ llvm/trunk/test/CodeGen/X86/2006-05-11-InstrSched.ll
@@ -1,6 +1,6 @@
 ; REQUIRES: asserts
 ; RUN: llc < %s -march=x86 -mtriple=i386-linux-gnu -mcpu=penryn -mattr=+sse2 -stats 2>&1 | \
-; RUN:     grep "asm-printer" | grep 35
+; RUN:     grep "asm-printer" | grep 33
 target datalayout = "e-p:32:32"
 
 define void @foo(i32* %mc, i32* %bp, i32* %ms, i32* %xmb, i32* %mpp, i32* %tpmm, i32* %ip, i32* %tpim, i32* %dpp, i32* %tpdm, i32* %bpi, i32 %M) nounwind {
Index: llvm/trunk/test/CodeGen/X86/atom-fixup-lea3.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/atom-fixup-lea3.ll
+++ llvm/trunk/test/CodeGen/X86/atom-fixup-lea3.ll
@@ -1,6 +1,8 @@
 ; RUN: llc < %s -mcpu=atom -mtriple=i686-linux | FileCheck %s
-; CHECK: addl ([[reg:%[a-z]+]])
-; CHECK-NEXT: addl $4, [[reg]]
+; CHECK: addl ({{%[a-z]+}},[[reg:%[a-z]+]],4)
+; CHECK-NEXT: movl
+; CHECK-NEXT: addl 4({{%[a-z]+}},[[reg:%[a-z]+]],4)
+; CHECK-NEXT: incl
 
 ; Test for the FixupLEAs pre-emit pass.
 ; An LEA should NOT be substituted for the ADD instruction
@@ -20,7 +22,7 @@
 ; return sum;
 ;}
 
-define i32 @test(i32 %n, i32* nocapture %array, i32* nocapture %m, i32* nocapture %array2) #0 {
+define i32 @test(i32 %n, i32* nocapture %array, i32* nocapture %k, i32* nocapture %l, i32* nocapture %m, i32* nocapture %array2) #0 {
 entry:
   %cmp7 = icmp sgt i32 %n, 0
   br i1 %cmp7, label %for.body.lr.ph, label %for.end
@@ -35,6 +37,9 @@
   %j.09 = phi i32 [ 0, %for.body.lr.ph ], [ %inc1, %for.body ]
   %inc1 = add nsw i32 %j.09, 1
   %arrayidx = getelementptr inbounds i32, i32* %array2, i32 %j.09
+  store i32 %0, i32* %m, align 4
+  store i32 %sum.010, i32* %m, align 4
+  store i32 %0, i32* %m, align 4
   %1 = load i32, i32* %arrayidx, align 4
   %add = add nsw i32 %0, %1
   store i32 %add, i32* %m, align 4
Index: llvm/trunk/test/CodeGen/X86/full-lsr.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/full-lsr.ll
+++ llvm/trunk/test/CodeGen/X86/full-lsr.ll
@@ -1,16 +1,10 @@
 ; RUN: llc < %s -march=x86 -mcpu=generic | FileCheck %s
-; RUN: llc < %s -march=x86 -mcpu=atom | FileCheck -check-prefix=ATOM %s
+; RUN: llc < %s -march=x86 -mcpu=atom | FileCheck %s
 
 define void @foo(float* nocapture %A, float* nocapture %B, float* nocapture %C, i32 %N) nounwind {
-; ATOM: foo
-; ATOM: addl
-; ATOM: addl
-; ATOM: leal
-
 ; CHECK: foo
-; CHECK: addl
-; CHECK: addl
-; CHECK: addl
+; CHECK: incl
 
 entry:
   %0 = icmp sgt i32 %N, 0          ; <i1> [#uses=1]
Index: llvm/trunk/test/CodeGen/X86/hoist-spill.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/hoist-spill.ll
+++ llvm/trunk/test/CodeGen/X86/hoist-spill.ll
@@ -3,10 +3,8 @@
 ; Check no spills to the same stack slot after hoisting.
 ; CHECK: mov{{.}} %{{.*}}, [[SPOFFSET1:-?[0-9]*]](%rsp)
 ; CHECK: mov{{.}} %{{.*}}, [[SPOFFSET2:-?[0-9]*]](%rsp)
-; CHECK: mov{{.}} %{{.*}}, [[SPOFFSET3:-?[0-9]*]](%rsp)
 ; CHECK-NOT: mov{{.}} %{{.*}}, [[SPOFFSET1]](%rsp)
 ; CHECK-NOT: mov{{.}} %{{.*}}, [[SPOFFSET2]](%rsp)
-; CHECK-NOT: mov{{.}} %{{.*}}, [[SPOFFSET3]](%rsp)
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
Index: llvm/trunk/test/CodeGen/X86/loop-strength-reduce4.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/loop-strength-reduce4.ll
+++ llvm/trunk/test/CodeGen/X86/loop-strength-reduce4.ll
@@ -4,16 +4,19 @@
 ; By starting the IV at -64 instead of 0, a cmp is eliminated,
 ; as the flags from the add can be used directly.
 
-; STATIC: movl $-64, [[ECX:%e..]]
+; STATIC: movl $-64, [[EAX:%e..]]
 
-; STATIC: movl [[EAX:%e..]], _state+76([[ECX]])
-; STATIC: addl $16, [[ECX]]
+; STATIC: movl %{{.+}}, _state+76([[EAX]])
+; STATIC: addl $16, [[EAX]]
 ; STATIC: jne
 
-; In PIC mode the symbol can't be folded, so the change-compare-stride
-; trick applies.
+; The same for PIC mode.
 
-; PIC: cmpl $64
+; PIC: movl $-64, [[EAX:%e..]]
+
+; PIC: movl %{{.+}}, 76(%{{.+}},[[EAX]])
+; PIC: addl $16, [[EAX]]
+; PIC: jne
 
 @state = external global [0 x i32]               ; <[0 x i32]*> [#uses=4]
 @S = external global [0 x i32]                   ; <[0 x i32]*> [#uses=4]
Index: llvm/trunk/test/CodeGen/X86/madd.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/madd.ll
+++ llvm/trunk/test/CodeGen/X86/madd.ll
@@ -9,17 +9,17 @@
 ; SSE2: # BB#0: # %entry
 ; SSE2-NEXT: movl %edx, %eax
 ; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: xorl %ecx, %ecx
 ; SSE2-NEXT: pxor %xmm1, %xmm1
 ; SSE2-NEXT: .p2align 4, 0x90
 ; SSE2-NEXT: .LBB0_1: # %vector.body
 ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
-; SSE2-NEXT: movdqu (%rdi), %xmm2
-; SSE2-NEXT: movdqu (%rsi), %xmm3
+; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm2
+; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm3
 ; SSE2-NEXT: pmaddwd %xmm2, %xmm3
 ; SSE2-NEXT: paddd %xmm3, %xmm1
-; SSE2-NEXT: addq $16, %rsi
-; SSE2-NEXT: addq $16, %rdi
-; SSE2-NEXT: addq $-8, %rax
+; SSE2-NEXT: addq $8, %rcx
+; SSE2-NEXT: cmpq %rcx, %rax
 ; SSE2-NEXT: jne .LBB0_1
 ; SSE2-NEXT: # BB#2: # %middle.block
 ; SSE2-NEXT: paddd %xmm0, %xmm1
@@ -34,17 +34,17 @@
 ; AVX2: # BB#0: # %entry
 ; AVX2-NEXT: movl %edx, %eax
 ; AVX2-NEXT: vpxor %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: xorl %ecx, %ecx
 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT: .p2align 4, 0x90
 ; AVX2-NEXT: .LBB0_1: # %vector.body
 ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX2-NEXT: vmovdqu (%rsi), %xmm2
-; AVX2-NEXT: vpmaddwd (%rdi), %xmm2, %xmm2
+; AVX2-NEXT: vmovdqu (%rsi,%rcx,2), %xmm2
+; AVX2-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm2, %xmm2
 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm2
 ; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: addq $16, %rsi
-; AVX2-NEXT: addq $16, %rdi
-; AVX2-NEXT: addq $-8, %rax
+; AVX2-NEXT: addq $8, %rcx
+; AVX2-NEXT: cmpq %rcx, %rax
 ; AVX2-NEXT: jne .LBB0_1
 ; AVX2-NEXT: # BB#2: # %middle.block
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
@@ -60,17 +60,17 @@
 ; AVX512: # BB#0: # %entry
 ; AVX512-NEXT: movl %edx, %eax
 ; AVX512-NEXT: vpxor %ymm0, %ymm0, %ymm0
+; AVX512-NEXT: xorl %ecx, %ecx
 ; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
 ; AVX512-NEXT: .p2align 4, 0x90
 ; AVX512-NEXT: .LBB0_1: # %vector.body
 ; AVX512-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX512-NEXT: vmovdqu (%rsi), %xmm2
-; AVX512-NEXT: vpmaddwd (%rdi), %xmm2, %xmm2
+; AVX512-NEXT: vmovdqu (%rsi,%rcx,2), %xmm2
+; AVX512-NEXT: vpmaddwd (%rdi,%rcx,2), %xmm2, %xmm2
 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm2
 ; AVX512-NEXT: vpaddd %ymm0, %ymm2, %ymm0
-; AVX512-NEXT: addq $16, %rsi
-; AVX512-NEXT: addq $16, %rdi
-; AVX512-NEXT: addq $-8, %rax
+; AVX512-NEXT: addq $8, %rcx
+; AVX512-NEXT: cmpq %rcx, %rax
 ; AVX512-NEXT: jne .LBB0_1
 ; AVX512-NEXT: # BB#2: # %middle.block
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
@@ -118,12 +118,13 @@
 ; SSE2: # BB#0: # %entry
 ; SSE2-NEXT: movl %edx, %eax
 ; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: xorl %ecx, %ecx
 ; SSE2-NEXT: pxor %xmm1, %xmm1
 ; SSE2-NEXT: .p2align 4, 0x90
 ; SSE2-NEXT: .LBB1_1: # %vector.body
 ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
-; SSE2-NEXT: movdqu (%rdi), %xmm2
-; SSE2-NEXT: movdqu (%rsi), %xmm3
+; SSE2-NEXT: movdqu (%rdi,%rcx,2), %xmm2
+; SSE2-NEXT: movdqu (%rsi,%rcx,2), %xmm3
 ; SSE2-NEXT: movdqa %xmm3, %xmm4
 ; SSE2-NEXT: pmulhuw %xmm2, %xmm4
 ; SSE2-NEXT: pmullw %xmm2, %xmm3
@@ -132,9 +133,8 @@
 ; SSE2-NEXT: paddd %xmm2, %xmm0
 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
 ; SSE2-NEXT: paddd %xmm3, %xmm1
-; SSE2-NEXT: addq $16, %rsi
-; SSE2-NEXT: addq $16, %rdi
-; SSE2-NEXT: addq $-8, %rax
+; SSE2-NEXT: addq $8, %rcx
+; SSE2-NEXT: cmpq %rcx, %rax
 ; SSE2-NEXT: jne .LBB1_1
 ; SSE2-NEXT: # BB#2: # %middle.block
 ; SSE2-NEXT: paddd %xmm1, %xmm0
@@ -149,6 +149,7 @@
 ; AVX2: # BB#0: # %entry
 ; AVX2-NEXT: movl %edx, %eax
 ; AVX2-NEXT: vpxor %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: xorl %ecx, %ecx
 ; AVX2-NEXT: .p2align 4, 0x90
 ; AVX2-NEXT: .LBB1_1: # %vector.body
 ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
@@ -156,9 +157,8 @@
 ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
 ; AVX2-NEXT: vpmulld %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: addq $16, %rsi
-; AVX2-NEXT: addq $16, %rdi
-; AVX2-NEXT: addq $-8, %rax
+; AVX2-NEXT: addq $8, %rcx
+; AVX2-NEXT: cmpq %rcx, %rax
 ; AVX2-NEXT: jne .LBB1_1
 ; AVX2-NEXT: # BB#2: # %middle.block
 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
@@ -174,6 +174,7 @@
 ; AVX512: # BB#0: # %entry
 ; AVX512-NEXT: movl %edx, %eax
 ; AVX512-NEXT: vpxor %ymm0, %ymm0, %ymm0
+; AVX512-NEXT: xorl %ecx, %ecx
 ; AVX512-NEXT: .p2align 4, 0x90
 ; AVX512-NEXT: .LBB1_1: # %vector.body
 ; AVX512-NEXT: # =>This Inner Loop Header: Depth=1
@@ -181,9 +182,8 @@
 ; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
 ; AVX512-NEXT: vpmulld %ymm1, %ymm2, %ymm1
 ; AVX512-NEXT: vpaddd %ymm0, %ymm1, %ymm0
-; AVX512-NEXT: addq $16, %rsi
-; AVX512-NEXT: addq $16, %rdi
-; AVX512-NEXT: addq $-8, %rax
+; AVX512-NEXT: addq $8, %rcx
+; AVX512-NEXT: cmpq %rcx, %rax
 ; AVX512-NEXT: jne .LBB1_1
 ; AVX512-NEXT: # BB#2: # %middle.block
 ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
@@ -231,6 +231,7 @@
 ; SSE2: # BB#0: # %entry
 ; SSE2-NEXT: movl %edx, %eax
 ; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: xorl %ecx, %ecx
 ; SSE2-NEXT: pxor %xmm1, %xmm1
 ; SSE2-NEXT: pxor %xmm3, %xmm3
 ; SSE2-NEXT: pxor %xmm2, %xmm2
@@ -263,9 +264,8 @@
 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
 ; SSE2-NEXT: psrad $16, %xmm4
 ; SSE2-NEXT: paddd %xmm4, %xmm2
-; SSE2-NEXT: addq $16, %rsi
-; SSE2-NEXT: addq $16, %rdi
-; SSE2-NEXT: addq $-16, %rax
+; SSE2-NEXT: addq $16, %rcx
+; SSE2-NEXT: cmpq %rcx, %rax
 ; SSE2-NEXT: jne .LBB2_1
 ; SSE2-NEXT: # BB#2: # %middle.block
 ; SSE2-NEXT: paddd %xmm3, %xmm0
@@ -282,17 +282,17 @@
 ; AVX2: # BB#0: # %entry
 ; AVX2-NEXT: movl %edx, %eax
 ; AVX2-NEXT: vpxor %ymm0, %ymm0, %ymm0
+; AVX2-NEXT: xorl %ecx, %ecx
 ; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
 ; AVX2-NEXT: .p2align 4, 0x90
 ; AVX2-NEXT: .LBB2_1: # %vector.body
 ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX2-NEXT: vpmovsxbw (%rdi), %ymm2
-; AVX2-NEXT: vpmovsxbw (%rsi), %ymm3
+; AVX2-NEXT: vpmovsxbw (%rdi,%rcx), %ymm2
+; AVX2-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3
 ; AVX2-NEXT: vpmaddwd %ymm2, %ymm3, %ymm2
 ; AVX2-NEXT: vpaddd %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: addq $16, %rsi
-; AVX2-NEXT: addq $16, %rdi
-; AVX2-NEXT: addq $-16, %rax
+; AVX2-NEXT: addq $16, %rcx
+; AVX2-NEXT: cmpq %rcx, %rax
 ; AVX2-NEXT: jne .LBB2_1
 ; AVX2-NEXT: # BB#2: # %middle.block
 ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0
@@ -309,18 +309,18 @@
 ; AVX512: # BB#0: # %entry
 ; AVX512-NEXT: movl %edx, %eax
 ; AVX512-NEXT: vpxord %zmm0, %zmm0, %zmm0
+; AVX512-NEXT: xorl %ecx, %ecx
 ; AVX512-NEXT: vpxor %ymm1, %ymm1, %ymm1
 ; AVX512-NEXT: .p2align 4, 0x90
 ; AVX512-NEXT: .LBB2_1: # %vector.body
 ; AVX512-NEXT: # =>This Inner Loop Header: Depth=1
-; AVX512-NEXT: vpmovsxbw (%rdi), %ymm2
-; AVX512-NEXT: vpmovsxbw (%rsi), %ymm3
+; AVX512-NEXT: vpmovsxbw (%rdi,%rcx), %ymm2
+; AVX512-NEXT: vpmovsxbw (%rsi,%rcx), %ymm3
 ; AVX512-NEXT: vpmaddwd %ymm2, %ymm3, %ymm2
 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm2
 ; AVX512-NEXT: vpaddd %zmm0, %zmm2, %zmm0
-; AVX512-NEXT: addq $16, %rsi
-; AVX512-NEXT: addq $16, %rdi
-; AVX512-NEXT: addq $-16, %rax
+; AVX512-NEXT: addq $16, %rcx
+; AVX512-NEXT: cmpq %rcx, %rax
 ; AVX512-NEXT: jne .LBB2_1
 ; AVX512-NEXT: # BB#2: # %middle.block
 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,6,7,0,1,0,1]
Index: llvm/trunk/test/CodeGen/X86/masked-iv-safe.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/masked-iv-safe.ll
+++ llvm/trunk/test/CodeGen/X86/masked-iv-safe.ll
@@ -5,7 +5,7 @@
 
 ; CHECK-LABEL: count_up
 ; CHECK-NOT: {{and|movz|sar|shl}}
-; CHECK: incq
+; CHECK: addq $8
 ; CHECK-NOT: {{and|movz|sar|shl}}
 ; CHECK: jne
 define void @count_up(double* %d, i64 %n) nounwind {
@@ -38,7 +38,7 @@
 
 ; CHECK-LABEL: count_down
 ; CHECK-NOT: {{and|movz|sar|shl}}
-; CHECK: addq
+; CHECK: addq $-8
 ; CHECK-NOT: {{and|movz|sar|shl}}
 ; CHECK: jne
 define void @count_down(double* %d, i64 %n) nounwind {
@@ -71,7 +71,7 @@
 
 ; CHECK-LABEL: count_up_signed
 ; CHECK-NOT: {{and|movz|sar|shl}}
-; CHECK: incq
+; CHECK: addq $8
 ; CHECK-NOT: {{and|movz|sar|shl}}
 ; CHECK: jne
 define void @count_up_signed(double* %d, i64 %n) nounwind {
@@ -106,7 +106,7 @@
 
 ; CHECK-LABEL: count_down_signed
 ; CHECK-NOT: {{and|movz|sar|shl}}
-; CHECK: addq
+; CHECK: addq $-8
 ; CHECK-NOT: {{and|movz|sar|shl}}
 ; CHECK: jne
 define void @count_down_signed(double* %d, i64 %n) nounwind {
@@ -141,7 +141,7 @@
 
 ; CHECK-LABEL: another_count_up
 ; CHECK-NOT: {{and|movz|sar|shl}}
-; CHECK: addq
+; CHECK: addq $8
 ; CHECK-NOT: {{and|movz|sar|shl}}
 ; CHECK: jne
 define void @another_count_up(double* %d, i64 %n) nounwind {
@@ -174,7 +174,7 @@
 
 ; CHECK-LABEL: another_count_down
 ; CHECK-NOT: {{and|movz|sar|shl}}
-; CHECK: addq $-8,
+; CHECK: addq $-8
 ; CHECK-NOT: {{and|movz|sar|shl}}
 ; CHECK: jne
 define void @another_count_down(double* %d, i64 %n) nounwind {
@@ -207,7 +207,7 @@
 
 ; CHECK-LABEL: another_count_up_signed
 ; CHECK-NOT: {{and|movz|sar|shl}}
-; CHECK: addq
+; CHECK: addq $8
 ; CHECK-NOT: {{and|movz|sar|shl}}
 ; CHECK: jne
 define void @another_count_up_signed(double* %d, i64 %n) nounwind {
@@ -242,7 +242,7 @@
 
 ; CHECK-LABEL: another_count_down_signed
 ; CHECK-NOT: {{and|movz|sar|shl}}
-; CHECK: decq
+; CHECK: addq $-8
 ; CHECK-NOT: {{and|movz|sar|shl}}
 ; CHECK: jne
 define void @another_count_down_signed(double* %d, i64 %n) nounwind {
Index: llvm/trunk/test/Transforms/LoopStrengthReduce/X86/canonical.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopStrengthReduce/X86/canonical.ll
+++ llvm/trunk/test/Transforms/LoopStrengthReduce/X86/canonical.ll
@@ -1,4 +1,4 @@
-; RUN: opt -mtriple=x86_64-unknown-linux-gnu -loop-reduce -S < %s | FileCheck %s
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -loop-reduce -lsr-insns-cost=false -S < %s | FileCheck %s
 ; Check LSR formula canonicalization will put loop invariant regs before
 ; induction variable of current loop, so exprs involving loop invariant regs
 ; can be promoted outside of current loop.
Index: llvm/trunk/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
+++ llvm/trunk/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
@@ -163,7 +163,7 @@
 ; X64: movzbl -3(
 ;
 ; X32: foldedidx:
-; X32: movzbl -3(
+; X32: movzbl 400(
 define void @foldedidx(i8* nocapture %a, i8* nocapture %b, i8* nocapture %c) nounwind ssp {
 entry:
   br label %for.body
@@ -275,7 +275,7 @@
 ;
 ; X32: @testCmpZero
 ; X32: %for.body82.us
-; X32: dec
+; X32: cmp
 ; X32: jne
 define void @testCmpZero(i8* %src, i8* %dst, i32 %srcidx, i32 %dstidx, i32 %len) nounwind ssp {
 entry:
Index: llvm/trunk/test/Transforms/LoopStrengthReduce/X86/lsr-expand-quadratic.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopStrengthReduce/X86/lsr-expand-quadratic.ll
+++ llvm/trunk/test/Transforms/LoopStrengthReduce/X86/lsr-expand-quadratic.ll
@@ -22,16 +22,16 @@
 ; CHECK-LABEL: @test2
 ; CHECK-LABEL: test2.loop:
 ; CHECK: %lsr.iv1 = phi i32 [ %lsr.iv.next2, %test2.loop ], [ -16777216, %entry ]
-; CHECK: %lsr.iv = phi i32 [ %lsr.iv.next, %test2.loop ], [ -1, %entry ]
-; CHECK: %lsr.iv.next = add nsw i32 %lsr.iv, 1
+; CHECK: %lsr.iv = phi i32 [ %lsr.iv.next, %test2.loop ], [ 1, %entry ]
+; CHECK: %lsr.iv.next = add nsw i32 %lsr.iv, -1
 ; CHECK: %lsr.iv.next2 = add nsw i32 %lsr.iv1, 16777216
 ;
 ; CHECK-LABEL: for.end:
-; CHECK: %tobool.us = icmp eq i32 %lsr.iv.next2, 0
+; CHECK: %tobool.us = icmp eq i32 %lsr.iv.next, 0
 ; CHECK: %sub.us = select i1 %tobool.us, i32 0, i32 0
-; CHECK: %1 = sub i32 0, %sub.us
-; CHECK: %2 = add i32 %1, %lsr.iv.next
-; CHECK: %sext.us = mul i32 %lsr.iv.next2, %2
+; CHECK: %0 = sub i32 0, %sub.us
+; CHECK: %1 = sub i32 %0, %lsr.iv.next
+; CHECK: %sext.us = mul i32 %lsr.iv.next2, %1
 ; CHECK: %f = ashr i32 %sext.us, 24
 ; CHECK: ret i32 %f
 define i32 @test2() {
Index: llvm/trunk/test/Transforms/LoopStrengthReduce/X86/lsr-insns-1.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopStrengthReduce/X86/lsr-insns-1.ll
+++ llvm/trunk/test/Transforms/LoopStrengthReduce/X86/lsr-insns-1.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s -loop-reduce -mtriple=x86_64 -lsr-insns-cost -S | FileCheck %s -check-prefix=BOTH -check-prefix=INSN
-; RUN: opt < %s -loop-reduce -mtriple=x86_64 -S | FileCheck %s -check-prefix=BOTH -check-prefix=REGS
+; RUN: opt < %s -loop-reduce -mtriple=x86_64 -S | FileCheck %s -check-prefix=BOTH -check-prefix=INSN
+; RUN: opt < %s -loop-reduce -mtriple=x86_64 -lsr-insns-cost=false -S | FileCheck %s -check-prefix=BOTH -check-prefix=REGS
 ; RUN: llc < %s -O2 -march=x86-64 -lsr-insns-cost -asm-verbose=0 | FileCheck %s
 
 ; OPT test checks that LSR optimize compare for static counter to compare with 0.
Index: llvm/trunk/test/Transforms/LoopStrengthReduce/X86/lsr-insns-2.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopStrengthReduce/X86/lsr-insns-2.ll
+++ llvm/trunk/test/Transforms/LoopStrengthReduce/X86/lsr-insns-2.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s -loop-reduce -mtriple=x86_64 -lsr-insns-cost -S | FileCheck %s -check-prefix=BOTH -check-prefix=INSN
-; RUN: opt < %s -loop-reduce -mtriple=x86_64 -S | FileCheck %s -check-prefix=BOTH -check-prefix=REGS
+; RUN: opt < %s -loop-reduce -mtriple=x86_64 -S | FileCheck %s -check-prefix=BOTH -check-prefix=INSN
+; RUN: opt < %s -loop-reduce -mtriple=x86_64 -lsr-insns-cost=false -S | FileCheck %s -check-prefix=BOTH -check-prefix=REGS
 ; RUN: llc < %s -O2 -march=x86-64 -lsr-insns-cost -asm-verbose=0 | FileCheck %s
 
 ; OPT checks that LSR prefers less instructions to less registers.
Index: llvm/trunk/test/Transforms/LoopStrengthReduce/X86/nested-loop.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopStrengthReduce/X86/nested-loop.ll
+++ llvm/trunk/test/Transforms/LoopStrengthReduce/X86/nested-loop.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -loop-reduce -S < %s | FileCheck %s
 ; Check when we use an outerloop induction variable inside of an innerloop
 ; induction value expr, LSR can still choose to use single induction variable
@@ -22,18 +23,21 @@
 for.body2.preheader:                              ; preds = %for.body
   br label %for.body2
 
-; Check LSR only generates one induction variable for for.body2 and the induction
-; variable will be shared by multiple array accesses.
+; Check LSR only generates two induction variables for for.body2 one for compare and
+; one to shared by multiple array accesses.
 ; CHECK: for.body2:
-; CHECK-NEXT: [[LSR:%[^,]+]] = phi i64 [ %lsr.iv.next, %for.body2 ], [ 0, %for.body2.preheader ]
+; CHECK-NEXT: [[LSRAR:%[^,]+]] = phi i8* [ %scevgep, %for.body2 ], [ %maxarray, %for.body2.preheader ]
+; CHECK-NEXT: [[LSR:%[^,]+]] = phi i64 [ %lsr.iv.next, %for.body2 ], [ %0, %for.body2.preheader ]
 ; CHECK-NOT: = phi i64 [ {{.*}}, %for.body2 ], [ {{.*}}, %for.body2.preheader ]
-; CHECK: [[SCEVGEP1:%[^,]+]] = getelementptr i8, i8* %maxarray, i64 [[LSR]]
-; CHECK: [[SCEVGEP2:%[^,]+]] = getelementptr i8, i8* [[SCEVGEP1]], i64 1
+; CHECK: [[LSRINT:%[^,]+]] = ptrtoint i8* [[LSRAR]] to i64
+; CHECK: [[SCEVGEP1:%[^,]+]] = getelementptr i8, i8* [[LSRAR]], i64 1
+; CHECK: {{.*}} = load i8, i8* [[SCEVGEP1]], align 1
+; CHECK: [[SCEVGEP2:%[^,]+]] = getelementptr i8, i8* %1, i64 [[LSRINT]]
 ; CHECK: {{.*}} = load i8, i8* [[SCEVGEP2]], align 1
-; CHECK: [[SCEVGEP3:%[^,]+]] = getelementptr i8, i8* {{.*}}, i64 [[LSR]]
-; CHECK: {{.*}} = load i8, i8* [[SCEVGEP3]], align 1
-; CHECK: [[SCEVGEP4:%[^,]+]] = getelementptr i8, i8* {{.*}}, i64 [[LSR]]
-; CHECK: store i8 {{.*}}, i8* [[SCEVGEP4]], align 1
+; CHECK: [[SCEVGEP3:%[^,]+]] = getelementptr i8, i8* {{.*}}, i64 [[LSRINT]]
+; CHECK: store i8 {{.*}}, i8* [[SCEVGEP3]], align 1
+; CHECK: [[LSRNEXT:%[^,]+]] = add i64 [[LSR]], -1
+; CHECK: %exitcond = icmp ne i64 [[LSRNEXT]], 0
 ; CHECK: br i1 %exitcond, label %for.body2, label %for.inc.loopexit
 
 for.body2:                                        ; preds = %for.body2.preheader, %for.body2
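
Note on the cost-model change above: X86TTIImpl::isLSRCostLess now compares candidate LSR solutions lexicographically with the loop instruction count (Insns) as the highest-priority key, ahead of the register count, and -lsr-insns-cost now defaults to true; that is why the test updates above trade extra pointer increments for a single reused induction variable plus a cmp/jne exit. The stand-alone sketch below is illustrative only and is not LLVM code: the LSRCost struct, the lessByRegs/lessByInsns helpers, and the sample cost values are hypothetical, chosen to show how moving Insns to the front of the std::tie tuple can flip which candidate wins (lessByRegs models a register-count-first ordering in the spirit of the target-independent default, not quoted from it).

// Illustrative sketch of the two orderings; the struct mirrors the field
// names of TargetTransformInfo::LSRCost but is not the LLVM type.
#include <cstdio>
#include <tuple>

struct LSRCost {
  unsigned Insns, NumRegs, AddRecCost, NumIVMuls, NumBaseAdds, ScaleCost,
      ImmCost, SetupCost;
};

// Register-count-first comparison (hypothetical stand-in for the
// target-independent behaviour without the instruction-count key).
static bool lessByRegs(const LSRCost &C1, const LSRCost &C2) {
  return std::tie(C1.NumRegs, C1.AddRecCost, C1.NumIVMuls, C1.NumBaseAdds,
                  C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
         std::tie(C2.NumRegs, C2.AddRecCost, C2.NumIVMuls, C2.NumBaseAdds,
                  C2.ScaleCost, C2.ImmCost, C2.SetupCost);
}

// Instruction-count-first comparison, matching the key order used by
// X86TTIImpl::isLSRCostLess in the patch above.
static bool lessByInsns(const LSRCost &C1, const LSRCost &C2) {
  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost, C1.NumIVMuls,
                  C1.NumBaseAdds, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
         std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost, C2.NumIVMuls,
                  C2.NumBaseAdds, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
}

int main() {
  // Hypothetical candidate solutions: A spends one more register but one
  // fewer instruction inside the loop than B.
  LSRCost A{/*Insns=*/3, /*NumRegs=*/4, 1, 0, 1, 0, 0, 0};
  LSRCost B{/*Insns=*/4, /*NumRegs=*/3, 1, 0, 1, 0, 0, 0};

  std::printf("regs-first prefers A over B:  %d\n", lessByRegs(A, B));  // prints 0
  std::printf("insns-first prefers A over B: %d\n", lessByInsns(A, B)); // prints 1
  return 0;
}

Running the sketch prints 0 for the register-first comparison and 1 for the instruction-first one: only the new ordering prefers the candidate that saves a loop instruction at the cost of one extra register, which is the trade-off the updated CHECK lines in the tests above reflect.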