Index: lib/Target/X86/X86TargetTransformInfo.h
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.h
+++ lib/Target/X86/X86TargetTransformInfo.h
@@ -94,6 +94,8 @@
 
   int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty);
   int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty);
+  bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
+                     TargetTransformInfo::LSRCost &C2);
   bool isLegalMaskedLoad(Type *DataType);
   bool isLegalMaskedStore(Type *DataType);
   bool isLegalMaskedGather(Type *DataType);
Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2088,6 +2088,17 @@
   return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
 }
 
+bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
+                               TargetTransformInfo::LSRCost &C2) {
+  // X86 specific here are "instruction number 1st priority".
+  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
+                  C1.NumIVMuls, C1.NumBaseAdds,
+                  C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
+         std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
+                  C2.NumIVMuls, C2.NumBaseAdds,
+                  C2.ScaleCost, C2.ImmCost, C2.SetupCost);
+}
+
 bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
   Type *ScalarTy = DataTy->getScalarType();
   int DataWidth = isa<PointerType>(ScalarTy) ?
Index: lib/Transforms/Scalar/LoopStrengthReduce.cpp
===================================================================
--- lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -924,6 +924,15 @@
   return Changed;
 }
 
+/// Returns true if A and B have the same constant value.
+///
+static bool hasSameConstValue(const SCEV *A, const SCEV *B) {
+  if (const SCEVConstant *AC = dyn_cast<SCEVConstant>(A))
+    if (const SCEVConstant *BC = dyn_cast<SCEVConstant>(B))
+      return APInt::isSameValue(AC->getAPInt(), BC->getAPInt());
+  return false;
+}
+
 namespace {
 
 class LSRUse;
@@ -1891,6 +1900,7 @@
   void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base);
   void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base);
   void GenerateCrossUseConstantOffsets();
+  void GenerateCrossUseICmpZero();
   void GenerateAllReuseFormulae();
 
   void FilterOutUndesirableDedicatedRegisters();
@@ -3845,6 +3855,85 @@
 }
 #endif
 
+/// Look for ICmp AddRecExprs that end with zero and try to reuse them in
+/// other formulas.
+/// For the following:
+///   ICmpZero {-40,+,4}
+///   Address  {%a,+,4}
+/// The algorithm will add one Address Formula:
+///   ICmpZero {-40,+,4}
+///   Address  {%a} + {0,+,4}
+///            40 + {%a} + {-40,+,4}
+///
+void LSRInstance::GenerateCrossUseICmpZero() {
+  SmallVector<const SCEV *, 4> Sequence;
+  // Get all ICmpZero registers that end with zero.
+  for (LSRUse &LU : Uses) {
+    if (LU.Kind != LSRUse::ICmpZero)
+      continue;
+    for (const Formula &F : LU.Formulae) {
+      if (!F.hasZeroEnd())
+        continue;
+      const SCEVAddRecExpr *Reg = dyn_cast<SCEVAddRecExpr>(F.BaseRegs[0]);
+      if (!Reg || !isa<SCEVConstant>(Reg->getStart()))
+        continue;
+      Sequence.push_back(F.BaseRegs[0]);
+    }
+  }
+  if (Sequence.empty())
+    return;
+  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+    LSRUse &LU = Uses[LUIdx];
+    if (LU.Kind == LSRUse::ICmpZero)
+      continue;
+    // If we find an AddRecExpr register in the LSR use with the same step,
+    // try to make it match by shifting its constant start.
+    for (const SCEV *CmpReg : Sequence) {
+      const SCEVAddRecExpr *RegAR = cast<SCEVAddRecExpr>(CmpReg);
+      const SCEVConstant *RegStart = cast<SCEVConstant>(RegAR->getStart());
+      for (size_t K = 0, KE = LU.Formulae.size(); K != KE; ++K) {
+        Formula F = LU.Formulae[K];
+        F.unscale();
+        Formula NewF = F;
+        bool Changed = false;
+        for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) {
+          const SCEVAddRecExpr *BaseRegAR =
+              dyn_cast<SCEVAddRecExpr>(F.BaseRegs[N]);
+          if (!BaseRegAR)
+            continue;
+          if (!hasSameConstValue(BaseRegAR->getStepRecurrence(SE),
+                                 RegAR->getStepRecurrence(SE)))
+            continue;
+          const SCEVConstant *BaseRegStart =
+              dyn_cast<SCEVConstant>(BaseRegAR->getStart());
+          if (!BaseRegStart)
+            continue;
+          int64_t RegDiff = BaseRegStart->getAPInt().getSExtValue() -
+                            RegStart->getAPInt().getSExtValue();
+          Type *IntTy = SE.getEffectiveSCEVType(F.BaseRegs[N]->getType());
+          const SCEV *NegRegDiff =
+              SE.getSCEV(ConstantInt::get(IntTy, -RegDiff));
+          NewF.BaseOffset += RegDiff;
+          if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset,
+                          LU.Kind, LU.AccessTy, NewF)) {
+            if (!TTI.isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset +
+                                         RegDiff))
+              continue;
+            NewF = F;
+            NewF.UnfoldedOffset = (uint64_t)NewF.UnfoldedOffset + RegDiff;
+          }
+          NewF.BaseRegs[N] = SE.getAddExpr(NegRegDiff, F.BaseRegs[N]);
+          Changed = true;
+        }
+        if (!Changed)
+          continue;
+        NewF.canonicalize(*L);
+        (void)InsertFormula(LU, LUIdx, NewF);
+      }
+    }
+  }
+}
+
 /// Look for registers which are a constant distance apart and try to form reuse
 /// opportunities between them.
 void LSRInstance::GenerateCrossUseConstantOffsets() {
@@ -4034,7 +4123,7 @@
     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
       GenerateTruncates(LU, LUIdx, LU.Formulae[i]);
   }
-
+  GenerateCrossUseICmpZero();
   GenerateCrossUseConstantOffsets();
 
   DEBUG(dbgs() << "\n"
Index: test/CodeGen/X86/2006-05-11-InstrSched.ll
===================================================================
--- test/CodeGen/X86/2006-05-11-InstrSched.ll
+++ test/CodeGen/X86/2006-05-11-InstrSched.ll
@@ -1,6 +1,6 @@
 ; REQUIRES: asserts
 ; RUN: llc < %s -march=x86 -mtriple=i386-linux-gnu -mcpu=penryn -mattr=+sse2 -stats 2>&1 | \
-; RUN:   grep "asm-printer" | grep 35
+; RUN:   grep "asm-printer" | grep 33
 target datalayout = "e-p:32:32"
 
 define void @foo(i32* %mc, i32* %bp, i32* %ms, i32* %xmb, i32* %mpp, i32* %tpmm, i32* %ip, i32* %tpim, i32* %dpp, i32* %tpdm, i32* %bpi, i32 %M) nounwind {
Index: test/CodeGen/X86/atom-fixup-lea3.ll
===================================================================
--- test/CodeGen/X86/atom-fixup-lea3.ll
+++ test/CodeGen/X86/atom-fixup-lea3.ll
@@ -1,6 +1,8 @@
 ; RUN: llc < %s -mcpu=atom -mtriple=i686-linux | FileCheck %s
-; CHECK: addl ([[reg:%[a-z]+]])
-; CHECK-NEXT: addl $4, [[reg]]
+; CHECK: addl ({{%[a-z]+}},[[reg:%[a-z]+]],4)
+; CHECK-NEXT: movl
+; CHECK-NEXT: addl 4({{%[a-z]+}},[[reg:%[a-z]+]],4)
+; CHECK-NEXT: incl
 
 ; Test for the FixupLEAs pre-emit pass.
 ; An LEA should NOT be substituted for the ADD instruction
@@ -20,7 +22,7 @@
 ;  return sum;
 ;}
 
-define i32 @test(i32 %n, i32* nocapture %array, i32* nocapture %m, i32* nocapture %array2) #0 {
+define i32 @test(i32 %n, i32* nocapture %array, i32* nocapture %k, i32* nocapture %l, i32* nocapture %m, i32* nocapture %array2) #0 {
 entry:
   %cmp7 = icmp sgt i32 %n, 0
   br i1 %cmp7, label %for.body.lr.ph, label %for.end
@@ -35,6 +37,9 @@
   %j.09 = phi i32 [ 0, %for.body.lr.ph ], [ %inc1, %for.body ]
   %inc1 = add nsw i32 %j.09, 1
   %arrayidx = getelementptr inbounds i32, i32* %array2, i32 %j.09
+  store i32 %0, i32* %m, align 4
+  store i32 %sum.010, i32* %m, align 4
+  store i32 %0, i32* %m, align 4
   %1 = load i32, i32* %arrayidx, align 4
   %add = add nsw i32 %0, %1
   store i32 %add, i32* %m, align 4
Index: test/CodeGen/X86/avoid_complex_am.ll
===================================================================
--- test/CodeGen/X86/avoid_complex_am.ll
+++ test/CodeGen/X86/avoid_complex_am.ll
@@ -8,7 +8,7 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx"
 
-define void @mulDouble(double* nocapture %a, double* nocapture %b, double* nocapture %c) {
+define void @mulDouble(double* nocapture %a, double* nocapture %b, double* nocapture %c, i32 %n) {
 ; CHECK: @mulDouble
 entry:
   br label %for.body
@@ -30,9 +30,7 @@
   %arrayidx4 = getelementptr inbounds double, double* %a, i64 %indvars.iv
   store double %mul, double* %arrayidx4, align 8
   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-; Comparison should be 19 * 1 = 19.
-; CHECK: icmp eq i32 {{%[^,]+}}, 19
-  %exitcond = icmp eq i32 %lftr.wideiv, 20
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
   br i1 %exitcond, label %for.end, label %for.body
 
 for.end:                                          ; preds = %for.body
Index: test/CodeGen/X86/compact-unwind.ll
===================================================================
--- test/CodeGen/X86/compact-unwind.ll
+++ test/CodeGen/X86/compact-unwind.ll
@@ -66,12 +66,12 @@
 
 ; NOFP-CU: Entry at offset 0x20:
 ; NOFP-CU-NEXT: start: 0x1d _test1
-; NOFP-CU-NEXT: length: 0x42
+; NOFP-CU-NEXT: length: 0x4b
 ; NOFP-CU-NEXT: compact encoding: 0x02040c0a
 
 ; NOFP-FROM-ASM: Entry at offset 0x20:
 ; NOFP-FROM-ASM-NEXT: start: 0x1d _test1
-; NOFP-FROM-ASM-NEXT: length: 0x42
+; NOFP-FROM-ASM-NEXT: length: 0x4b
 ; NOFP-FROM-ASM-NEXT: compact encoding: 0x02040c0a
 
 define void @test1(%class.ImageLoader* %image) optsize ssp uwtable {
Index: test/CodeGen/X86/full-lsr.ll
===================================================================
--- test/CodeGen/X86/full-lsr.ll
+++ test/CodeGen/X86/full-lsr.ll
@@ -1,16 +1,10 @@
 ; RUN: llc < %s -march=x86 -mcpu=generic | FileCheck %s
-; RUN: llc < %s -march=x86 -mcpu=atom | FileCheck -check-prefix=ATOM %s
+; RUN: llc < %s -march=x86 -mcpu=atom | FileCheck %s
 
 define void @foo(float* nocapture %A, float* nocapture %B, float* nocapture %C, i32 %N) nounwind {
-; ATOM: foo
-; ATOM: addl
-; ATOM: addl
-; ATOM: leal
 
 ; CHECK: foo
-; CHECK: addl
-; CHECK: addl
-; CHECK: addl
+; CHECK: incl
 
 entry:
 	%0 = icmp sgt i32 %N, 0		; <i1> [#uses=1]
Index: test/CodeGen/X86/loop-strength-reduce4.ll
===================================================================
--- test/CodeGen/X86/loop-strength-reduce4.ll
+++ test/CodeGen/X86/loop-strength-reduce4.ll
@@ -4,16 +4,19 @@
 ; By starting the IV at -64 instead of 0, a cmp is eliminated,
 ; as the flags from the add can be used directly.
 
-; STATIC: movl $-64, [[ECX:%e..]]
+; STATIC: movl $-64, [[EAX:%e..]]
 
-; STATIC: movl [[EAX:%e..]], _state+76([[ECX]])
-; STATIC: addl $16, [[ECX]]
+; STATIC: movl %{{.+}}, _state+76([[EAX]])
+; STATIC: addl $16, [[EAX]]
 ; STATIC: jne
 
-; In PIC mode the symbol can't be folded, so the change-compare-stride
-; trick applies.
+; The same for PIC mode.
 
-; PIC: cmpl $64
+; PIC: movl $-64, [[EAX:%e..]]
+
+; PIC: movl %{{.+}}, 76(%{{.+}},[[EAX]])
+; PIC: addl $16, [[EAX]]
+; PIC: jne
 
 @state = external global [0 x i32]		; <[0 x i32]*> [#uses=4]
 @S = external global [0 x i32]		; <[0 x i32]*> [#uses=4]
Index: test/CodeGen/X86/masked-iv-safe.ll
===================================================================
--- test/CodeGen/X86/masked-iv-safe.ll
+++ test/CodeGen/X86/masked-iv-safe.ll
@@ -5,7 +5,7 @@
 
 ; CHECK-LABEL: count_up
 ; CHECK-NOT: {{and|movz|sar|shl}}
-; CHECK: incq
+; CHECK: addq $8
 ; CHECK-NOT: {{and|movz|sar|shl}}
 ; CHECK: jne
 define void @count_up(double* %d, i64 %n) nounwind {
@@ -38,7 +38,7 @@
 
 ; CHECK-LABEL: count_down
 ; CHECK-NOT: {{and|movz|sar|shl}}
-; CHECK: addq
+; CHECK: addq $-8
 ; CHECK-NOT: {{and|movz|sar|shl}}
 ; CHECK: jne
 define void @count_down(double* %d, i64 %n) nounwind {
@@ -71,7 +71,7 @@
 
 ; CHECK-LABEL: count_up_signed
 ; CHECK-NOT: {{and|movz|sar|shl}}
-; CHECK: incq
+; CHECK: addq $8
 ; CHECK-NOT: {{and|movz|sar|shl}}
 ; CHECK: jne
 define void @count_up_signed(double* %d, i64 %n) nounwind {
@@ -106,7 +106,7 @@
 
 ; CHECK-LABEL: count_down_signed
 ; CHECK-NOT: {{and|movz|sar|shl}}
-; CHECK: addq
+; CHECK: addq $-8
 ; CHECK-NOT: {{and|movz|sar|shl}}
 ; CHECK: jne
 define void @count_down_signed(double* %d, i64 %n) nounwind {
@@ -141,7 +141,7 @@
 
 ; CHECK-LABEL: another_count_up
 ; CHECK-NOT: {{and|movz|sar|shl}}
-; CHECK: addq
+; CHECK: addq $8
 ; CHECK-NOT: {{and|movz|sar|shl}}
 ; CHECK: jne
 define void @another_count_up(double* %d, i64 %n) nounwind {
@@ -174,7 +174,7 @@
 
 ; CHECK-LABEL: another_count_down
 ; CHECK-NOT: {{and|movz|sar|shl}}
-; CHECK: addq $-8,
+; CHECK: addq $-8
 ; CHECK-NOT: {{and|movz|sar|shl}}
 ; CHECK: jne
 define void @another_count_down(double* %d, i64 %n) nounwind {
@@ -207,7 +207,7 @@
 
 ; CHECK-LABEL: another_count_up_signed
 ; CHECK-NOT: {{and|movz|sar|shl}}
-; CHECK: addq
+; CHECK: addq $8
 ; CHECK-NOT: {{and|movz|sar|shl}}
 ; CHECK: jne
 define void @another_count_up_signed(double* %d, i64 %n) nounwind {
@@ -242,7 +242,7 @@
 
 ; CHECK-LABEL: another_count_down_signed
 ; CHECK-NOT: {{and|movz|sar|shl}}
-; CHECK: decq
+; CHECK: addq $-8
 ; CHECK-NOT: {{and|movz|sar|shl}}
 ; CHECK: jne
 define void @another_count_down_signed(double* %d, i64 %n) nounwind {
Index: test/CodeGen/X86/misched-matrix.ll
===================================================================
--- test/CodeGen/X86/misched-matrix.ll
+++ test/CodeGen/X86/misched-matrix.ll
@@ -16,19 +16,19 @@
 ; alias analysis ability (that doesn't require any AliasAnalysis pass).
 ;
 ; TOPDOWN-LABEL: %for.body
-; TOPDOWN: movl %{{.*}}, (
+; TOPDOWN: movl %{{.*}}, 64(
 ; TOPDOWN: imull {{[0-9]*}}(
-; TOPDOWN: movl %{{.*}}, 4(
+; TOPDOWN: movl %{{.*}}, 68(
 ; TOPDOWN: imull {{[0-9]*}}(
-; TOPDOWN: movl %{{.*}}, 8(
-; TOPDOWN: movl %{{.*}}, 12(
+; TOPDOWN: movl %{{.*}}, 72(
+; TOPDOWN: movl %{{.*}}, 76(
 ; TOPDOWN-LABEL: %for.end
 ;
 ; For -misched=ilpmin, verify that each expression subtree is
 ; scheduled independently, and that the imull/adds are interleaved.
 ;
 ; ILPMIN-LABEL: %for.body
-; ILPMIN: movl %{{.*}}, (
+; ILPMIN: movl %{{.*}}, 64(
 ; ILPMIN: imull
 ; ILPMIN: imull
 ; ILPMIN: addl
@@ -36,7 +36,7 @@
 ; ILPMIN: addl
 ; ILPMIN: imull
 ; ILPMIN: addl
-; ILPMIN: movl %{{.*}}, 4(
+; ILPMIN: movl %{{.*}}, 68(
 ; ILPMIN: imull
 ; ILPMIN: imull
 ; ILPMIN: addl
@@ -44,7 +44,7 @@
 ; ILPMIN: addl
 ; ILPMIN: imull
 ; ILPMIN: addl
-; ILPMIN: movl %{{.*}}, 8(
+; ILPMIN: movl %{{.*}}, 72(
 ; ILPMIN: imull
 ; ILPMIN: imull
 ; ILPMIN: addl
@@ -52,14 +52,14 @@
 ; ILPMIN: addl
 ; ILPMIN: imull
 ; ILPMIN: addl
-; ILPMIN: movl %{{.*}}, 12(
+; ILPMIN: movl %{{.*}}, 76(
 ; ILPMIN-LABEL: %for.end
 ;
 ; For -misched=ilpmax, verify that each expression subtree is
 ; scheduled independently, and that the imull/adds are clustered.
 ;
 ; ILPMAX-LABEL: %for.body
-; ILPMAX: movl %{{.*}}, (
+; ILPMAX: movl %{{.*}}, 64(
 ; ILPMAX: imull
 ; ILPMAX: imull
 ; ILPMAX: imull
@@ -67,7 +67,7 @@
 ; ILPMAX: addl
 ; ILPMAX: addl
 ; ILPMAX: addl
-; ILPMAX: movl %{{.*}}, 4(
+; ILPMAX: movl %{{.*}}, 68(
 ; ILPMAX: imull
 ; ILPMAX: imull
 ; ILPMAX: imull
@@ -75,7 +75,7 @@
 ; ILPMAX: addl
 ; ILPMAX: addl
 ; ILPMAX: addl
-; ILPMAX: movl %{{.*}}, 8(
+; ILPMAX: movl %{{.*}}, 72(
 ; ILPMAX: imull
 ; ILPMAX: imull
 ; ILPMAX: imull
@@ -83,7 +83,7 @@
 ; ILPMAX: addl
 ; ILPMAX: addl
 ; ILPMAX: addl
-; ILPMAX: movl %{{.*}}, 12(
+; ILPMAX: movl %{{.*}}, 76(
 ; ILPMAX-LABEL: %for.end
 
 define void @mmult([4 x i32]* noalias nocapture %m1, [4 x i32]* noalias nocapture %m2,
Index: test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
===================================================================
--- test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
+++ test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
@@ -162,10 +162,10 @@
 ; Consequently, we should *not* form any chains.
 ;
 ; X64: foldedidx:
-; X64: movzbl -3(
+; X64: movzbl 400(
 ;
 ; X32: foldedidx:
-; X32: movzbl -3(
+; X32: movzbl 400(
 define void @foldedidx(i8* nocapture %a, i8* nocapture %b, i8* nocapture %c) nounwind ssp {
 entry:
   br label %for.body
@@ -277,7 +277,7 @@
 ;
 ; X32: @testCmpZero
 ; X32: %for.body82.us
-; X32: dec
+; X32: cmp
 ; X32: jne
 define void @testCmpZero(i8* %src, i8* %dst, i32 %srcidx, i32 %dstidx, i32 %len) nounwind ssp {
 entry:
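
Note (illustrative, not part of the patch): below is a minimal IR sketch, with made-up
function and value names, of the kind of loop the new GenerateCrossUseICmpZero reuse
targets. With ten i32 elements, LSR can express the exit test as an ICmpZero use of
{-40,+,4} and the store address as an Address use of {%a,+,4}, matching the example in
the doc comment above; the new code lets the Address use share the compare's register
as 40 + {%a} + {-40,+,4} instead of keeping a separate induction variable.

define void @icmp_zero_reuse(i32* %a) {
entry:
  br label %loop

loop:
  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
  ; Address use: strided i32 stores through %a.
  %addr = getelementptr inbounds i32, i32* %a, i32 %iv
  store i32 0, i32* %addr, align 4
  %iv.next = add nuw nsw i32 %iv, 1
  ; Exit test that LSR can turn into an ICmpZero use.
  %exitcond = icmp eq i32 %iv.next, 10
  br i1 %exitcond, label %exit, label %loop

exit:
  ret void
}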