Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -233,6 +233,17 @@
   /// incurs significant execution cost.
   bool isLoweredToCall(const Function *F) const;

+  struct LSRCost {
+    unsigned Insns;
+    unsigned NumRegs;
+    unsigned AddRecCost;
+    unsigned NumIVMuls;
+    unsigned NumBaseAdds;
+    unsigned ImmCost;
+    unsigned SetupCost;
+    unsigned ScaleCost;
+  };
+
   /// Parameters that control the generic loop unrolling transformation.
   struct UnrollingPreferences {
     /// The cost threshold for the unrolled loop. Should be relative to the
@@ -347,6 +358,10 @@
                              bool HasBaseReg, int64_t Scale,
                              unsigned AddrSpace = 0) const;

+  /// \brief Return true if LSR cost C1 is lower than LSR cost C2.
+  bool isLSRCostLower(TargetTransformInfo::LSRCost &C1,
+                      TargetTransformInfo::LSRCost &C2) const;
+
   /// \brief Return true if the target supports masked load/store
   /// AVX2 and AVX-512 targets allow masks for consecutive load and store
   bool isLegalMaskedStore(Type *DataType) const;
@@ -708,6 +723,8 @@
                                      int64_t BaseOffset, bool HasBaseReg,
                                      int64_t Scale,
                                      unsigned AddrSpace) = 0;
+  virtual bool isLSRCostLower(TargetTransformInfo::LSRCost &C1,
+                              TargetTransformInfo::LSRCost &C2) = 0;
   virtual bool isLegalMaskedStore(Type *DataType) = 0;
   virtual bool isLegalMaskedLoad(Type *DataType) = 0;
   virtual bool isLegalMaskedScatter(Type *DataType) = 0;
@@ -875,6 +892,10 @@
     return Impl.isLegalAddressingMode(Ty, BaseGV, BaseOffset, HasBaseReg,
                                       Scale, AddrSpace);
   }
+  bool isLSRCostLower(TargetTransformInfo::LSRCost &C1,
+                      TargetTransformInfo::LSRCost &C2) override {
+    return Impl.isLSRCostLower(C1, C2);
+  }
   bool isLegalMaskedStore(Type *DataType) override {
     return Impl.isLegalMaskedStore(DataType);
   }
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -218,6 +218,13 @@
     return !BaseGV && BaseOffset == 0 && (Scale == 0 || Scale == 1);
   }

+  bool isLSRCostLower(TTI::LSRCost &C1, TTI::LSRCost &C2) {
+    return std::tie(C1.NumRegs, C1.AddRecCost, C1.NumIVMuls, C1.NumBaseAdds,
+                    C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
+           std::tie(C2.NumRegs, C2.AddRecCost, C2.NumIVMuls, C2.NumBaseAdds,
+                    C2.ScaleCost, C2.ImmCost, C2.SetupCost);
+  }
+
   bool isLegalMaskedStore(Type *DataType) { return false; }

   bool isLegalMaskedLoad(Type *DataType) { return false; }
Index: include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- include/llvm/CodeGen/BasicTTIImpl.h
+++ include/llvm/CodeGen/BasicTTIImpl.h
@@ -129,6 +129,10 @@
     return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace);
   }

+  bool isLSRCostLower(TTI::LSRCost C1, TTI::LSRCost C2) {
+    return TargetTransformInfoImplBase::isLSRCostLower(C1, C2);
+  }
+
   int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                            bool HasBaseReg, int64_t Scale, unsigned AddrSpace) {
     TargetLoweringBase::AddrMode AM;
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -123,6 +123,10 @@
                                         Scale, AddrSpace);
 }

+bool TargetTransformInfo::isLSRCostLower(LSRCost &C1, LSRCost &C2) const {
+  return TTIImpl->isLSRCostLower(C1, C2);
+}
+
 bool TargetTransformInfo::isLegalMaskedStore(Type *DataType) const {
   return TTIImpl->isLegalMaskedStore(DataType);
 }
Index: lib/Target/X86/X86TargetTransformInfo.h
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.h
+++ lib/Target/X86/X86TargetTransformInfo.h
@@ -87,6 +87,8 @@
   int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty);
   int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
                     Type *Ty);
+  bool isLSRCostLower(TargetTransformInfo::LSRCost &C1,
+                      TargetTransformInfo::LSRCost &C2);
   bool isLegalMaskedLoad(Type *DataType);
   bool isLegalMaskedStore(Type *DataType);
   bool isLegalMaskedGather(Type *DataType);
Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1863,6 +1863,17 @@
   return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
 }

+bool X86TTIImpl::isLSRCostLower(TargetTransformInfo::LSRCost &C1,
+                                TargetTransformInfo::LSRCost &C2) {
+  // X86-specific ordering: the instruction count has first priority.
+  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
+                  C1.NumIVMuls, C1.NumBaseAdds,
+                  C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
+         std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
+                  C2.NumIVMuls, C2.NumBaseAdds,
+                  C2.ScaleCost, C2.ImmCost, C2.SetupCost);
+}
+
 bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
   Type *ScalarTy = DataTy->getScalarType();
   int DataWidth = isa<PointerType>(ScalarTy) ?
Index: lib/Transforms/Scalar/LoopStrengthReduce.cpp
===================================================================
--- lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -323,6 +323,8 @@

   bool unscale();

+  bool hasZeroEnd() const;
+
   size_t getNumRegs() const;
   Type *getType() const;

@@ -457,6 +459,14 @@
   return true;
 }

+bool Formula::hasZeroEnd() const {
+  if (UnfoldedOffset || BaseOffset)
+    return false;
+  if (BaseRegs.size() != 1 || ScaledReg)
+    return false;
+  return true;
+}
+
 /// Return the total number of register operands used by this formula. This does
 /// not include register uses implied by non-constant addrec strides.
 size_t Formula::getNumRegs() const {
@@ -855,6 +865,15 @@
   return Changed;
 }

+/// Returns true if A and B have the same constant value.
+///
+static bool hasSameConstValue(const SCEV *A, const SCEV *B) {
+  if (const SCEVConstant *AC = dyn_cast<SCEVConstant>(A))
+    if (const SCEVConstant *BC = dyn_cast<SCEVConstant>(B))
+      return APInt::isSameValue(AC->getAPInt(), BC->getAPInt());
+  return false;
+}
+
 namespace {

 class LSRUse;
@@ -882,36 +901,39 @@
 class Cost {
   /// TODO: Some of these could be merged. Also, a lexical ordering
   /// isn't always optimal.
-  unsigned NumRegs;
-  unsigned AddRecCost;
-  unsigned NumIVMuls;
-  unsigned NumBaseAdds;
-  unsigned ImmCost;
-  unsigned SetupCost;
-  unsigned ScaleCost;
+  TargetTransformInfo::LSRCost C;

 public:
-  Cost()
-    : NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0), ImmCost(0),
-      SetupCost(0), ScaleCost(0) {}
+  Cost() {
+    C.Insns = 0;
+    C.NumRegs = 0;
+    C.AddRecCost = 0;
+    C.NumIVMuls = 0;
+    C.NumBaseAdds = 0;
+    C.ImmCost = 0;
+    C.SetupCost = 0;
+    C.ScaleCost = 0;
+  }

-  bool operator<(const Cost &Other) const;
+  bool isLower(Cost &Other, const TargetTransformInfo &TTI);

   void Lose();

 #ifndef NDEBUG
   // Once any of the metrics loses, they must all remain losers.
   bool isValid() {
-    return ((NumRegs | AddRecCost | NumIVMuls | NumBaseAdds
-             | ImmCost | SetupCost | ScaleCost) != ~0u)
-      || ((NumRegs & AddRecCost & NumIVMuls & NumBaseAdds
-           & ImmCost & SetupCost & ScaleCost) == ~0u);
+    return ((C.Insns | C.NumRegs | C.AddRecCost | C.NumIVMuls
+             | C.NumBaseAdds | C.ImmCost | C.SetupCost
+             | C.ScaleCost) != ~0u)
+      || ((C.Insns & C.NumRegs & C.AddRecCost & C.NumIVMuls
+           & C.NumBaseAdds & C.ImmCost & C.SetupCost
+           & C.ScaleCost) == ~0u);
   }
 #endif

   bool isLoser() {
     assert(isValid() && "invalid cost");
-    return NumRegs == ~0u;
+    return C.NumRegs == ~0u;
   }

   void RateFormula(const TargetTransformInfo &TTI,
@@ -1095,8 +1117,7 @@
         Lose();
         return;
       }
-      AddRecCost += 1; /// TODO: This should be a function of the stride.
-
+      C.AddRecCost += 1; /// TODO: This should be a function of the stride.
       // Add the step value register, if it needs one.
       // TODO: The non-affine case isn't precisely modeled here.
       if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
@@ -1107,7 +1128,7 @@
       }
     }
   }
-  ++NumRegs;
+  ++C.NumRegs;

   // Rough heuristic; favor registers which don't require extra setup
   // instructions in the preheader.
@@ -1116,9 +1137,9 @@
       !(isa<SCEVAddRecExpr>(Reg) &&
        (isa<SCEVUnknown>(cast<SCEVAddRecExpr>(Reg)->getStart()) ||
         isa<SCEVConstant>(cast<SCEVAddRecExpr>(Reg)->getStart()))))
-    ++SetupCost;
+    ++C.SetupCost;

-  NumIVMuls += isa<SCEVMulExpr>(Reg) &&
+  C.NumIVMuls += isa<SCEVMulExpr>(Reg) &&
                SE.hasComputableLoopEvolution(Reg, L);
 }

@@ -1151,6 +1172,9 @@
                        SmallPtrSetImpl<const SCEV *> *LoserRegs) {
   assert(F.isCanonical() && "Cost is accurate only for canonical formula");
   // Tally up the registers.
+  unsigned AddRecCost = C.AddRecCost;
+  unsigned NumRegs = C.NumRegs;
+  unsigned NumBaseAdds = C.NumBaseAdds;
   if (const SCEV *ScaledReg = F.ScaledReg) {
     if (VisitedRegs.count(ScaledReg)) {
       Lose();
@@ -1170,72 +1194,90 @@
       return;
   }

+  // Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as
+  // an additional instruction.
+  unsigned TTIRegNum = TTI.getNumberOfRegisters(false) - 1;
+  if (C.NumRegs > TTIRegNum) {
+    // The cost already exceeded TTIRegNum, so only newly added registers can
+    // add new instructions.
+    if (NumRegs > TTIRegNum)
+      C.Insns += (C.NumRegs - NumRegs);
+    else
+      C.Insns += (C.NumRegs - TTIRegNum);
+  }
+
   // Determine how many (unfolded) adds we'll need inside the loop.
   size_t NumBaseParts = F.getNumRegs();
   if (NumBaseParts > 1)
     // Do not count the base and a possible second register if the target
     // allows to fold 2 registers.
-    NumBaseAdds +=
+    C.NumBaseAdds +=
       NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(TTI, LU, F)));
-  NumBaseAdds += (F.UnfoldedOffset != 0);
-
+  C.NumBaseAdds += (F.UnfoldedOffset != 0);
   // Accumulate non-free scaling amounts.
-  ScaleCost += getScalingFactorCost(TTI, LU, F);
-
+  C.ScaleCost += getScalingFactorCost(TTI, LU, F);
   // Tally up the non-zero immediates.
   for (const LSRFixup &Fixup : LU.Fixups) {
     int64_t O = Fixup.Offset;
     int64_t Offset = (uint64_t)O + F.BaseOffset;
     if (F.BaseGV)
-      ImmCost += 64; // Handle symbolic values conservatively.
+      C.ImmCost += 64; // Handle symbolic values conservatively.
                      // TODO: This should probably be the pointer size.
     else if (Offset != 0)
-      ImmCost += APInt(64, Offset, true).getMinSignedBits();
+      C.ImmCost += APInt(64, Offset, true).getMinSignedBits();

     // Check with target if this offset with this instruction is
     // specifically not supported.
     if ((isa<LoadInst>(Fixup.UserInst) || isa<StoreInst>(Fixup.UserInst)) &&
         !TTI.isFoldableMemAccessOffset(Fixup.UserInst, Offset))
-      NumBaseAdds++;
+      C.NumBaseAdds++;
   }
+
+  // Each new AddRec adds 1 instruction to the calculation.
+  C.Insns += (C.AddRecCost - AddRecCost);
+  // An ICmpZero use adds no instruction if the formula ends with zero.
+  if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd())
+    C.Insns++;
+  // BaseAdds adds instructions for unfolded registers.
+  if (LU.Kind != LSRUse::ICmpZero)
+    C.Insns += C.NumBaseAdds - NumBaseAdds;
   assert(isValid() && "invalid cost");
 }

 /// Set this cost to a losing value.
 void Cost::Lose() {
-  NumRegs = ~0u;
-  AddRecCost = ~0u;
-  NumIVMuls = ~0u;
-  NumBaseAdds = ~0u;
-  ImmCost = ~0u;
-  SetupCost = ~0u;
-  ScaleCost = ~0u;
+  C.Insns = ~0u;
+  C.NumRegs = ~0u;
+  C.AddRecCost = ~0u;
+  C.NumIVMuls = ~0u;
+  C.NumBaseAdds = ~0u;
+  C.ImmCost = ~0u;
+  C.SetupCost = ~0u;
+  C.ScaleCost = ~0u;
 }

 /// Choose the lower cost.
-bool Cost::operator<(const Cost &Other) const {
-  return std::tie(NumRegs, AddRecCost, NumIVMuls, NumBaseAdds, ScaleCost,
-                  ImmCost, SetupCost) <
-         std::tie(Other.NumRegs, Other.AddRecCost, Other.NumIVMuls,
-                  Other.NumBaseAdds, Other.ScaleCost, Other.ImmCost,
-                  Other.SetupCost);
+bool Cost::isLower(Cost &Other, const TargetTransformInfo &TTI) {
+  return TTI.isLSRCostLower(C, Other.C);
 }

 void Cost::print(raw_ostream &OS) const {
-  OS << NumRegs << " reg" << (NumRegs == 1 ? "" : "s");
-  if (AddRecCost != 0)
-    OS << ", with addrec cost " << AddRecCost;
-  if (NumIVMuls != 0)
-    OS << ", plus " << NumIVMuls << " IV mul" << (NumIVMuls == 1 ? "" : "s");
-  if (NumBaseAdds != 0)
-    OS << ", plus " << NumBaseAdds << " base add"
-       << (NumBaseAdds == 1 ? "" : "s");
-  if (ScaleCost != 0)
-    OS << ", plus " << ScaleCost << " scale cost";
-  if (ImmCost != 0)
-    OS << ", plus " << ImmCost << " imm cost";
-  if (SetupCost != 0)
-    OS << ", plus " << SetupCost << " setup cost";
+  OS << C.Insns << " instruction" << (C.Insns == 1 ? " " : "s ");
+  OS << C.NumRegs << " reg" << (C.NumRegs == 1 ? "" : "s");
+  if (C.AddRecCost != 0)
+    OS << ", with addrec cost " << C.AddRecCost;
+  if (C.NumIVMuls != 0)
+    OS << ", plus " << C.NumIVMuls << " IV mul"
+       << (C.NumIVMuls == 1 ? "" : "s");
+  if (C.NumBaseAdds != 0)
+    OS << ", plus " << C.NumBaseAdds << " base add"
+       << (C.NumBaseAdds == 1 ? "" : "s");
+  if (C.ScaleCost != 0)
+    OS << ", plus " << C.ScaleCost << " scale cost";
+  if (C.ImmCost != 0)
+    OS << ", plus " << C.ImmCost << " imm cost";
+  if (C.SetupCost != 0)
+    OS << ", plus " << C.SetupCost << " setup cost";
 }

 LLVM_DUMP_METHOD
@@ -1763,6 +1805,7 @@
   void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base);
   void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base);
   void GenerateCrossUseConstantOffsets();
+  void GenerateCrossUseICmpZero();
   void GenerateAllReuseFormulae();

   void FilterOutUndesirableDedicatedRegisters();
@@ -3696,6 +3739,85 @@
   print(errs()); errs() << '\n';
 }

+/// Look for ICmpZero AddRecExprs that end with zero and try to reuse them in
+/// other formulas.
+/// For the following:
+///   ICmpZero {-40,+,4}
+///   Address  {%a,+,4}
+/// the algorithm will add one Address formula:
+///   ICmpZero {-40,+,4}
+///   Address  {%a} + {0,+,4}
+///            40 + {%a} + {-40,+,4}
+///
+void LSRInstance::GenerateCrossUseICmpZero() {
+  SmallVector<const SCEV *, 4> Sequence;
+  // Get all ICmpZero registers that end with zero.
+  for (LSRUse &LU : Uses) {
+    if (LU.Kind != LSRUse::ICmpZero)
+      continue;
+    for (const Formula &F : LU.Formulae) {
+      if (!F.hasZeroEnd())
+        continue;
+      const SCEVAddRecExpr *Reg = dyn_cast<SCEVAddRecExpr>(F.BaseRegs[0]);
+      if (!Reg || !isa<SCEVConstant>(Reg->getStart()))
+        continue;
+      Sequence.push_back(F.BaseRegs[0]);
+    }
+  }
+  if (Sequence.empty())
+    return;
+  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+    LSRUse &LU = Uses[LUIdx];
+    if (LU.Kind == LSRUse::ICmpZero)
+      continue;
+    // If we find an AddRecExpr register in the LSR use that has the same step,
+    // try to make it match by shifting its constant start.
+    for (const SCEV *CmpReg : Sequence) {
+      const SCEVAddRecExpr *RegAR = cast<SCEVAddRecExpr>(CmpReg);
+      const SCEVConstant *RegStart = cast<SCEVConstant>(RegAR->getStart());
+      for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) {
+        Formula F = LU.Formulae[L];
+        F.unscale();
+        Formula NewF = F;
+        bool Changed = false;
+        for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) {
+          const SCEVAddRecExpr *BaseRegAR =
+              dyn_cast<SCEVAddRecExpr>(F.BaseRegs[N]);
+          if (!BaseRegAR)
+            continue;
+          if (!hasSameConstValue(BaseRegAR->getStepRecurrence(SE),
+                                 RegAR->getStepRecurrence(SE)))
+            continue;
+          const SCEVConstant *BaseRegStart =
+              dyn_cast<SCEVConstant>(BaseRegAR->getStart());
+          if (!BaseRegStart)
+            continue;
+          int64_t RegDiff = BaseRegStart->getAPInt().getSExtValue() -
+                            RegStart->getAPInt().getSExtValue();
+          Type *IntTy = SE.getEffectiveSCEVType(F.BaseRegs[N]->getType());
+          const SCEV *NegRegDiff =
+              SE.getSCEV(ConstantInt::get(IntTy, -RegDiff));
+          NewF.BaseOffset += RegDiff;
+          if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset,
+                          LU.Kind, LU.AccessTy, NewF)) {
+            if (!TTI.isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset +
+                                         RegDiff))
+              continue;
+            NewF = F;
+            NewF.UnfoldedOffset = (uint64_t)NewF.UnfoldedOffset + RegDiff;
+          }
+          NewF.BaseRegs[N] = SE.getAddExpr(NegRegDiff, F.BaseRegs[N]);
+          Changed = true;
+        }
+        if (!Changed)
+          continue;
+        NewF.canonicalize();
+        (void)InsertFormula(LU, LUIdx, NewF);
+      }
+    }
+  }
+}
+
 /// Look for registers which are a constant distance apart and try to form reuse
 /// opportunities between them.
 void LSRInstance::GenerateCrossUseConstantOffsets() {
@@ -3885,7 +4007,7 @@
     for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
       GenerateTruncates(LU, LUIdx, LU.Formulae[i]);
   }
-
+  GenerateCrossUseICmpZero();
   GenerateCrossUseConstantOffsets();

   DEBUG(dbgs() << "\n"
@@ -3961,7 +4083,7 @@
       Cost CostBest;
       Regs.clear();
       CostBest.RateFormula(TTI, Best, Regs, VisitedRegs, L, SE, DT, LU);
-      if (CostF < CostBest)
+      if (CostF.isLower(CostBest, TTI))
        std::swap(F, Best);
       DEBUG(dbgs() << "  Filtering out formula "; F.print(dbgs());
             dbgs() << "\n"
@@ -4288,7 +4410,7 @@
     NewCost = CurCost;
     NewRegs = CurRegs;
     NewCost.RateFormula(TTI, F, NewRegs, VisitedRegs, L, SE, DT, LU);
-    if (NewCost < SolutionCost) {
+    if (NewCost.isLower(SolutionCost, TTI)) {
       Workspace.push_back(&F);
       if (Workspace.size() != Uses.size()) {
         SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
Index: test/CodeGen/X86/2006-05-11-InstrSched.ll
===================================================================
--- test/CodeGen/X86/2006-05-11-InstrSched.ll
+++ test/CodeGen/X86/2006-05-11-InstrSched.ll
@@ -1,6 +1,6 @@
 ; REQUIRES: asserts
 ; RUN: llc < %s -march=x86 -mtriple=i386-linux-gnu -mcpu=penryn -mattr=+sse2 -stats 2>&1 | \
-; RUN:     grep "asm-printer" | grep 35
+; RUN:     grep "asm-printer" | grep 33
 target datalayout = "e-p:32:32"

 define void @foo(i32* %mc, i32* %bp, i32* %ms, i32* %xmb, i32* %mpp, i32* %tpmm, i32* %ip, i32* %tpim, i32* %dpp, i32* %tpdm, i32* %bpi, i32 %M) nounwind {
Index: test/CodeGen/X86/atom-fixup-lea3.ll
===================================================================
--- test/CodeGen/X86/atom-fixup-lea3.ll
+++ test/CodeGen/X86/atom-fixup-lea3.ll
@@ -1,6 +1,8 @@
 ; RUN: llc < %s -mcpu=atom -mtriple=i686-linux | FileCheck %s
-; CHECK: addl ([[reg:%[a-z]+]])
-; CHECK-NEXT: addl $4, [[reg]]
+; CHECK: addl ({{%[a-z]+}},[[reg:%[a-z]+]],4)
+; CHECK-NEXT: movl
+; CHECK-NEXT: addl 4({{%[a-z]+}},[[reg:%[a-z]+]],4)
+; CHECK-NEXT: incl

 ; Test for the FixupLEAs pre-emit pass.
 ; An LEA should NOT be substituted for the ADD instruction
@@ -20,7 +22,7 @@
 ;  return sum;
 ;}

-define i32 @test(i32 %n, i32* nocapture %array, i32* nocapture %m, i32* nocapture %array2) #0 {
+define i32 @test(i32 %n, i32* nocapture %array, i32* nocapture %k, i32* nocapture %l, i32* nocapture %m, i32* nocapture %array2) #0 {
 entry:
   %cmp7 = icmp sgt i32 %n, 0
   br i1 %cmp7, label %for.body.lr.ph, label %for.end
@@ -35,6 +37,9 @@
   %j.09 = phi i32 [ 0, %for.body.lr.ph ], [ %inc1, %for.body ]
   %inc1 = add nsw i32 %j.09, 1
   %arrayidx = getelementptr inbounds i32, i32* %array2, i32 %j.09
+  store i32 %0, i32* %m, align 4
+  store i32 %sum.010, i32* %m, align 4
+  store i32 %0, i32* %m, align 4
   %1 = load i32, i32* %arrayidx, align 4
   %add = add nsw i32 %0, %1
   store i32 %add, i32* %m, align 4
Index: test/CodeGen/X86/avoid_complex_am.ll
===================================================================
--- test/CodeGen/X86/avoid_complex_am.ll
+++ test/CodeGen/X86/avoid_complex_am.ll
@@ -8,7 +8,7 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx"

-define void @mulDouble(double* nocapture %a, double* nocapture %b, double* nocapture %c) {
+define void @mulDouble(double* nocapture %a, double* nocapture %b, double* nocapture %c, i32 %n) {
 ; CHECK: @mulDouble
 entry:
   br label %for.body
@@ -30,9 +30,7 @@
   %arrayidx4 = getelementptr inbounds double, double* %a, i64 %indvars.iv
   store double %mul, double* %arrayidx4, align 8
   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-; Comparison should be 19 * 1 = 19.
-; CHECK: icmp eq i32 {{%[^,]+}}, 19
-  %exitcond = icmp eq i32 %lftr.wideiv, 20
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
   br i1 %exitcond, label %for.end, label %for.body

 for.end:                                          ; preds = %for.body
Index: test/CodeGen/X86/compact-unwind.ll
===================================================================
--- test/CodeGen/X86/compact-unwind.ll
+++ test/CodeGen/X86/compact-unwind.ll
@@ -66,12 +66,12 @@

 ; NOFP-CU:      Entry at offset 0x20:
 ; NOFP-CU-NEXT:   start:                0x1d _test1
-; NOFP-CU-NEXT:   length:               0x42
+; NOFP-CU-NEXT:   length:               0x4b
 ; NOFP-CU-NEXT:   compact encoding:     0x02040c0a

 ; NOFP-FROM-ASM:      Entry at offset 0x20:
 ; NOFP-FROM-ASM-NEXT:   start:                0x1d _test1
-; NOFP-FROM-ASM-NEXT:   length:               0x42
+; NOFP-FROM-ASM-NEXT:   length:               0x4b
 ; NOFP-FROM-ASM-NEXT:   compact encoding:     0x02040c0a

 define void @test1(%class.ImageLoader* %image) optsize ssp uwtable {
Index: test/CodeGen/X86/full-lsr.ll
===================================================================
--- test/CodeGen/X86/full-lsr.ll
+++ test/CodeGen/X86/full-lsr.ll
@@ -1,16 +1,10 @@
 ; RUN: llc < %s -march=x86 -mcpu=generic | FileCheck %s
-; RUN: llc < %s -march=x86 -mcpu=atom | FileCheck -check-prefix=ATOM %s
+; RUN: llc < %s -march=x86 -mcpu=atom | FileCheck %s

 define void @foo(float* nocapture %A, float* nocapture %B, float* nocapture %C, i32 %N) nounwind {
-; ATOM: foo
-; ATOM: addl
-; ATOM: addl
-; ATOM: leal

 ; CHECK: foo
-; CHECK: addl
-; CHECK: addl
-; CHECK: addl
+; CHECK: incl

 entry:
 	%0 = icmp sgt i32 %N, 0		; <i1> [#uses=1]
Index: test/CodeGen/X86/loop-strength-reduce4.ll
===================================================================
--- test/CodeGen/X86/loop-strength-reduce4.ll
+++ test/CodeGen/X86/loop-strength-reduce4.ll
@@ -4,16 +4,19 @@
 ; By starting the IV at -64 instead of 0, a cmp is eliminated,
 ; as the flags from the add can be used directly.

-; STATIC: movl    $-64, [[ECX:%e..]]
+; STATIC: movl    $-64, [[EAX:%e..]]

-; STATIC: movl    [[EAX:%e..]], _state+76([[ECX]])
-; STATIC: addl    $16, [[ECX]]
+; STATIC: movl    %{{.+}}, _state+76([[EAX]])
+; STATIC: addl    $16, [[EAX]]
 ; STATIC: jne

-; In PIC mode the symbol can't be folded, so the change-compare-stride
-; trick applies.
+; The same applies in PIC mode.

-; PIC: cmpl $64
+; PIC: movl    $-64, [[EAX:%e..]]
+
+; PIC: movl    %{{.+}}, 76(%{{.+}},[[EAX]])
+; PIC: addl    $16, [[EAX]]
+; PIC: jne

 @state = external global [0 x i32]              ; <[0 x i32]*> [#uses=4]
 @S = external global [0 x i32]          ; <[0 x i32]*> [#uses=4]
Index: test/CodeGen/X86/masked-iv-safe.ll
===================================================================
--- test/CodeGen/X86/masked-iv-safe.ll
+++ test/CodeGen/X86/masked-iv-safe.ll
@@ -5,7 +5,7 @@

 ; CHECK-LABEL: count_up
 ; CHECK-NOT: {{and|movz|sar|shl}}
-; CHECK: incq
+; CHECK: addq $8
 ; CHECK-NOT: {{and|movz|sar|shl}}
 ; CHECK: jne
 define void @count_up(double* %d, i64 %n) nounwind {
@@ -38,7 +38,7 @@

 ; CHECK-LABEL: count_down
 ; CHECK-NOT: {{and|movz|sar|shl}}
-; CHECK: addq
+; CHECK: addq $-8
 ; CHECK-NOT: {{and|movz|sar|shl}}
 ; CHECK: jne
 define void @count_down(double* %d, i64 %n) nounwind {
@@ -71,7 +71,7 @@

 ; CHECK-LABEL: count_up_signed
 ; CHECK-NOT: {{and|movz|sar|shl}}
-; CHECK: incq
+; CHECK: addq $8
 ; CHECK-NOT: {{and|movz|sar|shl}}
 ; CHECK: jne
 define void @count_up_signed(double* %d, i64 %n) nounwind {
@@ -106,7 +106,7 @@

 ; CHECK-LABEL: count_down_signed
 ; CHECK-NOT: {{and|movz|sar|shl}}
-; CHECK: addq
+; CHECK: addq $-8
 ; CHECK-NOT: {{and|movz|sar|shl}}
 ; CHECK: jne
 define void @count_down_signed(double* %d, i64 %n) nounwind {
@@ -141,7 +141,7 @@

 ; CHECK-LABEL: another_count_up
 ; CHECK-NOT: {{and|movz|sar|shl}}
-; CHECK: addq
+; CHECK: addq $8
 ; CHECK-NOT: {{and|movz|sar|shl}}
 ; CHECK: jne
 define void @another_count_up(double* %d, i64 %n) nounwind {
@@ -174,7 +174,7 @@

 ; CHECK-LABEL: another_count_down
 ; CHECK-NOT: {{and|movz|sar|shl}}
-; CHECK: addq $-8,
+; CHECK: addq $-8
 ; CHECK-NOT: {{and|movz|sar|shl}}
 ; CHECK: jne
 define void @another_count_down(double* %d, i64 %n) nounwind {
@@ -207,7 +207,7 @@

 ; CHECK-LABEL: another_count_up_signed
 ; CHECK-NOT: {{and|movz|sar|shl}}
-; CHECK: addq
+; CHECK: addq $8
 ; CHECK-NOT: {{and|movz|sar|shl}}
 ; CHECK: jne
 define void @another_count_up_signed(double* %d, i64 %n) nounwind {
@@ -242,7 +242,7 @@

 ; CHECK-LABEL: another_count_down_signed
 ; CHECK-NOT: {{and|movz|sar|shl}}
-; CHECK: decq
+; CHECK: addq $-8
 ; CHECK-NOT: {{and|movz|sar|shl}}
 ; CHECK: jne
 define void @another_count_down_signed(double* %d, i64 %n) nounwind {
Index: test/CodeGen/X86/misched-matrix.ll
===================================================================
--- test/CodeGen/X86/misched-matrix.ll
+++ test/CodeGen/X86/misched-matrix.ll
@@ -16,19 +16,19 @@
 ; alias analysis ability (that doesn't require any AliasAnalysis pass).
 ;
 ; TOPDOWN-LABEL: %for.body
-; TOPDOWN: movl %{{.*}}, (
+; TOPDOWN: movl %{{.*}}, 64(
 ; TOPDOWN: imull {{[0-9]*}}(
-; TOPDOWN: movl %{{.*}}, 4(
+; TOPDOWN: movl %{{.*}}, 68(
 ; TOPDOWN: imull {{[0-9]*}}(
-; TOPDOWN: movl %{{.*}}, 8(
-; TOPDOWN: movl %{{.*}}, 12(
+; TOPDOWN: movl %{{.*}}, 72(
+; TOPDOWN: movl %{{.*}}, 76(
 ; TOPDOWN-LABEL: %for.end
 ;
 ; For -misched=ilpmin, verify that each expression subtree is
 ; scheduled independently, and that the imull/adds are interleaved.
 ;
 ; ILPMIN-LABEL: %for.body
-; ILPMIN: movl %{{.*}}, (
+; ILPMIN: movl %{{.*}}, 64(
 ; ILPMIN: imull
 ; ILPMIN: imull
 ; ILPMIN: addl
@@ -36,7 +36,7 @@
 ; ILPMIN: addl
 ; ILPMIN: imull
 ; ILPMIN: addl
-; ILPMIN: movl %{{.*}}, 4(
+; ILPMIN: movl %{{.*}}, 68(
 ; ILPMIN: imull
 ; ILPMIN: imull
 ; ILPMIN: addl
@@ -44,7 +44,7 @@
 ; ILPMIN: addl
 ; ILPMIN: imull
 ; ILPMIN: addl
-; ILPMIN: movl %{{.*}}, 8(
+; ILPMIN: movl %{{.*}}, 72(
 ; ILPMIN: imull
 ; ILPMIN: imull
 ; ILPMIN: addl
@@ -52,14 +52,14 @@
 ; ILPMIN: addl
 ; ILPMIN: imull
 ; ILPMIN: addl
-; ILPMIN: movl %{{.*}}, 12(
+; ILPMIN: movl %{{.*}}, 76(
 ; ILPMIN-LABEL: %for.end
 ;
 ; For -misched=ilpmax, verify that each expression subtree is
 ; scheduled independently, and that the imull/adds are clustered.
 ;
 ; ILPMAX-LABEL: %for.body
-; ILPMAX: movl %{{.*}}, (
+; ILPMAX: movl %{{.*}}, 64(
 ; ILPMAX: imull
 ; ILPMAX: imull
 ; ILPMAX: imull
@@ -67,7 +67,7 @@
 ; ILPMAX: addl
 ; ILPMAX: addl
 ; ILPMAX: addl
-; ILPMAX: movl %{{.*}}, 4(
+; ILPMAX: movl %{{.*}}, 68(
 ; ILPMAX: imull
 ; ILPMAX: imull
 ; ILPMAX: imull
@@ -75,7 +75,7 @@
 ; ILPMAX: addl
 ; ILPMAX: addl
 ; ILPMAX: addl
-; ILPMAX: movl %{{.*}}, 8(
+; ILPMAX: movl %{{.*}}, 72(
 ; ILPMAX: imull
 ; ILPMAX: imull
 ; ILPMAX: imull
@@ -83,7 +83,7 @@
 ; ILPMAX: addl
 ; ILPMAX: addl
 ; ILPMAX: addl
-; ILPMAX: movl %{{.*}}, 12(
+; ILPMAX: movl %{{.*}}, 76(
 ; ILPMAX-LABEL: %for.end

 define void @mmult([4 x i32]* noalias nocapture %m1, [4 x i32]* noalias nocapture %m2,
Index: test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
===================================================================
--- test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
+++ test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
@@ -162,10 +162,10 @@
 ; Consequently, we should *not* form any chains.
 ;
 ; X64: foldedidx:
-; X64: movzbl -3(
+; X64: movzbl 400(
 ;
 ; X32: foldedidx:
-; X32: movzbl -3(
+; X32: movzbl 400(
 define void @foldedidx(i8* nocapture %a, i8* nocapture %b, i8* nocapture %c) nounwind ssp {
 entry:
   br label %for.body
@@ -277,7 +277,7 @@
 ;
 ; X32: @testCmpZero
 ; X32: %for.body82.us
-; X32: dec
+; X32: cmp
 ; X32: jne
 define void @testCmpZero(i8* %src, i8* %dst, i32 %srcidx, i32 %dstidx, i32 %len) nounwind ssp {
 entry:
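
The new isLSRCostLower hook turns the LSR cost comparison into a target decision: the default implementation keeps the previous lexicographic order (NumRegs first, Insns ignored), while the X86 override above compares Insns first. As an illustration only, not part of the patch above, a target that cares mostly about register pressure could override the hook as sketched below; the class name HypotheticalTTIImpl and the chosen field order are assumptions made for this sketch.

    // Sketch of a hypothetical target override of TTI::isLSRCostLower.
    // HypotheticalTTIImpl is assumed to derive from BasicTTIImplBase<...>
    // the same way X86TTIImpl does; <tuple> must be available for std::tie.
    bool HypotheticalTTIImpl::isLSRCostLower(TargetTransformInfo::LSRCost &C1,
                                             TargetTransformInfo::LSRCost &C2) {
      // Rank register pressure above everything else, then fall back to the
      // remaining components in the default order.
      return std::tie(C1.NumRegs, C1.Insns, C1.AddRecCost, C1.NumIVMuls,
                      C1.NumBaseAdds, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
             std::tie(C2.NumRegs, C2.Insns, C2.AddRecCost, C2.NumIVMuls,
                      C2.NumBaseAdds, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
    }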