Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -293,6 +293,11 @@
   bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                              bool HasBaseReg, int64_t Scale) const;
 
+  /// \brief Return true when the AddRec can be folded into another
+  /// instruction. This is possible when the target supports pre/post-increment
+  /// addressing and the stride of the AddRec is an acceptable constant.
+  bool isAddRecFoldable(int64_t stride) const;
+
   /// \brief Return true if the target works with masked instruction
   /// AVX2 allows masks for consecutive load and store for i32 and i64 elements.
   /// AVX-512 architecture will also allow masks for non-consecutive memory
@@ -524,6 +529,7 @@
   virtual void getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) = 0;
   virtual bool isLegalAddImmediate(int64_t Imm) = 0;
   virtual bool isLegalICmpImmediate(int64_t Imm) = 0;
+  virtual bool isAddRecFoldable(int64_t stride) = 0;
   virtual bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
                                      int64_t BaseOffset, bool HasBaseReg,
                                      int64_t Scale) = 0;
@@ -631,6 +637,9 @@
   bool isLegalICmpImmediate(int64_t Imm) override {
     return Impl.isLegalICmpImmediate(Imm);
   }
+  bool isAddRecFoldable(int64_t stride) override {
+    return Impl.isAddRecFoldable(stride);
+  }
   bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                              bool HasBaseReg, int64_t Scale) override {
     return Impl.isLegalAddressingMode(Ty, BaseGV, BaseOffset, HasBaseReg,
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -204,6 +204,8 @@
 
   bool isLegalICmpImmediate(int64_t Imm) { return false; }
 
+  bool isAddRecFoldable(int64_t stride) { return false; }
+
   bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                              bool HasBaseReg, int64_t Scale) {
     // Guess that reg+reg addressing is allowed. This heuristic is taken from
Index: include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- include/llvm/CodeGen/BasicTTIImpl.h
+++ include/llvm/CodeGen/BasicTTIImpl.h
@@ -122,6 +122,10 @@
     return getTLI()->isLegalICmpImmediate(imm);
   }
 
+  bool isAddRecFoldable(int64_t stride) {
+    return getTLI()->isAddRecFoldable(stride);
+  }
+
   bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                              bool HasBaseReg, int64_t Scale) {
     TargetLoweringBase::AddrMode AM;
Index: include/llvm/Target/TargetLowering.h
===================================================================
--- include/llvm/Target/TargetLowering.h
+++ include/llvm/Target/TargetLowering.h
@@ -1461,6 +1461,11 @@
     return true;
   }
 
+  /// \brief Return true when the AddRec can be folded into another
+  /// instruction. This is possible when the target supports pre/post-increment
+  /// addressing and the stride of the AddRec is an acceptable constant.
+  virtual bool isAddRecFoldable(int64_t stride) const { return false; }
+
   /// Return true if the specified immediate is legal add immediate, that is the
   /// target has add instructions which can add a register with the immediate
   /// without having to materialize the immediate into a register.
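Note: the AArch64 and ARM overrides later in this patch return true for every
stride. A target whose pre/post-increment forms only accept a narrow write-back
immediate would presumably gate on the stride instead; a minimal sketch,
assuming a hypothetical MyTargetLowering and an illustrative signed 9-bit
offset range (neither is part of this patch):

  // Hypothetical override: report the AddRec as foldable only when the stride
  // fits the assumed pre/post-increment immediate field of the target.
  bool MyTargetLowering::isAddRecFoldable(int64_t stride) const {
    // Signed 9-bit write-back offset range, chosen for illustration only.
    return stride != 0 && stride >= -256 && stride <= 255;
  }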
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -93,6 +93,10 @@
   return TTIImpl->isLegalICmpImmediate(Imm);
 }
 
+bool TargetTransformInfo::isAddRecFoldable(int64_t stride) const {
+  return TTIImpl->isAddRecFoldable(stride);
+}
+
 bool TargetTransformInfo::isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
                                                 int64_t BaseOffset,
                                                 bool HasBaseReg,
Index: lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.h
+++ lib/Target/AArch64/AArch64ISelLowering.h
@@ -307,6 +307,7 @@
 
   bool isLegalAddImmediate(int64_t) const override;
   bool isLegalICmpImmediate(int64_t) const override;
+  bool isAddRecFoldable(int64_t stride) const override;
 
   EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
                           bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -6682,6 +6682,12 @@
   return isLegalAddImmediate(Immed);
 }
 
+/// AArch64 supports pre/post-increment addressing, so the stride of an AddRec
+/// may be folded into a load/store instruction.
+bool AArch64TargetLowering::isAddRecFoldable(int64_t stride) const {
+  return true;
+}
+
 /// isLegalAddressingMode - Return true if the addressing mode represented
 /// by AM is legal for this target, for a load/store of the specified type.
 bool AArch64TargetLowering::isLegalAddressingMode(const AddrMode &AM,
Index: lib/Target/ARM/ARMISelLowering.h
===================================================================
--- lib/Target/ARM/ARMISelLowering.h
+++ lib/Target/ARM/ARMISelLowering.h
@@ -294,6 +294,10 @@
     /// the immediate into a register.
     bool isLegalICmpImmediate(int64_t Imm) const override;
 
    /// isAddRecFoldable - Return true if the constant stride of an AddRec can
    /// be folded into a prior load/store via pre/post-increment addressing.
+    bool isAddRecFoldable(int64_t stride) const override;
+
    /// isLegalAddImmediate - Return true if the specified immediate is legal
    /// add immediate, that is the target has add instructions which can
    /// add a register and the immediate without having to materialize
Index: lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- lib/Target/ARM/ARMISelLowering.cpp
+++ lib/Target/ARM/ARMISelLowering.cpp
@@ -10188,6 +10188,10 @@
   return Imm >= 0 && Imm <= 255;
 }
 
+/// ARM supports pre/post-increment addressing, so the stride of an AddRec may
+/// be folded into a load/store instruction.
+bool ARMTargetLowering::isAddRecFoldable(int64_t stride) const { return true; }
+
 /// isLegalAddImmediate - Return true if the specified immediate is a legal add
 /// *or sub* immediate, that is the target has add or sub instructions which can
 /// add a register with the immediate without having to materialize the
Index: lib/Transforms/Scalar/LoopStrengthReduce.cpp
===================================================================
--- lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -851,7 +851,9 @@
                                  const LSRUse &LU, const Formula &F);
 
 // Get the cost of the scaling factor used in F for LU.
 static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
-                                     const LSRUse &LU, const Formula &F);
+                                     const LSRUse &LU, const Formula &F,
+                                     unsigned &InstNumCost);
+static bool isAddressType(const LSRUse &LU);
 
 namespace {
@@ -859,18 +861,23 @@
 class Cost {
   /// TODO: Some of these could be merged. Also, a lexical ordering
   /// isn't always optimal.
+
+  /// Number of registers used so far by the candidate formula or solution.
   unsigned NumRegs;
-  unsigned AddRecCost;
-  unsigned NumIVMuls;
-  unsigned NumBaseAdds;
-  unsigned ImmCost;
+
+  /// Cost in number of instructions.
+  unsigned InstNumCost;
+  /// Cost of registers that are likely to spill.
+  unsigned SpillRegsCost;
+  /// Instruction complexity cost.
+  unsigned InstCmplxCost;
+  /// Cost of instructions inserted in the loop preheader.
   unsigned SetupCost;
-  unsigned ScaleCost;
 
 public:
   Cost()
-    : NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0), ImmCost(0),
-      SetupCost(0), ScaleCost(0) {}
+    : NumRegs(0), InstNumCost(0), SpillRegsCost(0), InstCmplxCost(0),
+      SetupCost(0) {}
 
   bool operator<(const Cost &Other) const;
 
@@ -879,16 +886,15 @@
 #ifndef NDEBUG
   // Once any of the metrics loses, they must all remain losers.
   bool isValid() {
-    return ((NumRegs | AddRecCost | NumIVMuls | NumBaseAdds
-             | ImmCost | SetupCost | ScaleCost) != ~0u)
-      || ((NumRegs & AddRecCost & NumIVMuls & NumBaseAdds
-           & ImmCost & SetupCost & ScaleCost) == ~0u);
+    return (
+        ((InstNumCost | SpillRegsCost | InstCmplxCost | SetupCost) != ~0u) ||
+        ((InstNumCost & SpillRegsCost & InstCmplxCost & SetupCost) == ~0u));
   }
 #endif
 
   bool isLoser() {
     assert(isValid() && "invalid cost");
-    return NumRegs == ~0u;
+    return InstNumCost == ~0u;
   }
 
   void RateFormula(const TargetTransformInfo &TTI,
@@ -905,24 +911,25 @@
   void dump() const;
 
 private:
-  void RateRegister(const SCEV *Reg,
-                    SmallPtrSetImpl<const SCEV *> &Regs,
-                    const Loop *L,
-                    ScalarEvolution &SE, DominatorTree &DT);
-  void RatePrimaryRegister(const SCEV *Reg,
-                           SmallPtrSetImpl<const SCEV *> &Regs,
-                           const Loop *L,
+  void RateRegister(const TargetTransformInfo &TTI, const SCEV *Reg,
+                    SmallPtrSetImpl<const SCEV *> &Regs, const Loop *L,
+                    ScalarEvolution &SE, DominatorTree &DT, const LSRUse &LU,
+                    bool &IsIndexedLoadStore);
+  void RatePrimaryRegister(const TargetTransformInfo &TTI, const SCEV *Reg,
+                           SmallPtrSetImpl<const SCEV *> &Regs, const Loop *L,
                            ScalarEvolution &SE, DominatorTree &DT,
-                           SmallPtrSetImpl<const SCEV *> *LoserRegs);
+                           SmallPtrSetImpl<const SCEV *> *LoserRegs,
+                           const LSRUse &LU, bool &IsIndexedLoadStore);
+  void UpdateSpillRegsCost(const TargetTransformInfo &TTI);
 };
 
 }
 
 /// RateRegister - Tally up interesting quantities from the given register.
-void Cost::RateRegister(const SCEV *Reg,
-                        SmallPtrSetImpl<const SCEV *> &Regs,
-                        const Loop *L,
-                        ScalarEvolution &SE, DominatorTree &DT) {
+void Cost::RateRegister(const TargetTransformInfo &TTI, const SCEV *Reg,
+                        SmallPtrSetImpl<const SCEV *> &Regs, const Loop *L,
+                        ScalarEvolution &SE, DominatorTree &DT,
+                        const LSRUse &LU, bool &IsIndexedLoadStore) {
   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
     // If this is an addrec for another loop, don't second-guess its addrec phi
     // nodes. LSR isn't currently smart enough to reason about more than one
@@ -937,13 +944,50 @@
       Lose();
       return;
     }
-    AddRecCost += 1; /// TODO: This should be a function of the stride.
+
+    // Usually we need a separate add/sub instruction for every AddRec. But
+    // if the target supports pre/post-increment addressing, the following
+    // pattern (using AArch64 as an example):
+    //   adrp  x10, a
+    //   add   x10, x10, :lo12:a
+    // .LBB0_2:
+    //   add   x12, x10, x9, lsl #2   // add for the AddRec used by the store
+    //   add   x9, x9, #1
+    //   str   w11, [x12, #48]
+    //   cmp   x9, x8
+    //   b.lt  .LBB0_2
+    // can be optimized to this form using post-increment addressing:
+    //   adrp  x10, a
+    //   add   x10, x10, :lo12:a
+    //   add   x10, x10, #48
+    // .LBB0_2:
+    //   str   w11, [x10], #4
+    //   add   x9, x9, #1
+    //   cmp   x9, x8
+    //   b.lt  .LBB0_2
+    // and no separate add instruction is required for the AddRec. If a
+    // pre/post-increment addressing mode is used, we record it in
+    // IsIndexedLoadStore because it usually means the load/store cannot fold
+    // other registers or offsets into the same instruction.
+    if (AR->isAffine() && isa<SCEVConstant>(AR->getOperand(1))) {
+      int64_t stride = dyn_cast_or_null<SCEVConstant>(AR->getOperand(1))
+                           ->getValue()
+                           ->getValue()
+                           .getSExtValue();
+      if (stride != 0 && TTI.isAddRecFoldable(stride) && isAddressType(LU))
+        IsIndexedLoadStore = true;
+      else
+        InstNumCost += 1;
+    } else {
+      InstNumCost += 1;
+    }
 
     // Add the step value register, if it needs one.
     // TODO: The non-affine case isn't precisely modeled here.
     if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
       if (!Regs.count(AR->getOperand(1))) {
-        RateRegister(AR->getOperand(1), Regs, L, SE, DT);
+        RateRegister(TTI, AR->getOperand(1), Regs, L, SE, DT, LU,
+                     IsIndexedLoadStore);
         if (isLoser())
           return;
       }
@@ -960,24 +1004,24 @@
          isa<SCEVConstant>(cast<SCEVAddRecExpr>(Reg)->getStart()))))
     ++SetupCost;
 
-  NumIVMuls += isa<SCEVMulExpr>(Reg) &&
-               SE.hasComputableLoopEvolution(Reg, L);
+  InstNumCost += isa<SCEVMulExpr>(Reg) && SE.hasComputableLoopEvolution(Reg, L);
 }
 
 /// RatePrimaryRegister - Record this register in the set. If we haven't seen it
 /// before, rate it. Optional LoserRegs provides a way to declare any formula
 /// that refers to one of those regs an instant loser.
-void Cost::RatePrimaryRegister(const SCEV *Reg,
+void Cost::RatePrimaryRegister(const TargetTransformInfo &TTI, const SCEV *Reg,
                                SmallPtrSetImpl<const SCEV *> &Regs,
-                               const Loop *L,
-                               ScalarEvolution &SE, DominatorTree &DT,
-                               SmallPtrSetImpl<const SCEV *> *LoserRegs) {
+                               const Loop *L, ScalarEvolution &SE,
+                               DominatorTree &DT,
+                               SmallPtrSetImpl<const SCEV *> *LoserRegs,
+                               const LSRUse &LU, bool &IsIndexedLoadStore) {
   if (LoserRegs && LoserRegs->count(Reg)) {
     Lose();
     return;
   }
   if (Regs.insert(Reg).second) {
-    RateRegister(Reg, Regs, L, SE, DT);
+    RateRegister(TTI, Reg, Regs, L, SE, DT, LU, IsIndexedLoadStore);
     if (LoserRegs && isLoser())
       LoserRegs->insert(Reg);
   }
@@ -992,6 +1036,8 @@
                        ScalarEvolution &SE, DominatorTree &DT,
                        const LSRUse &LU,
                        SmallPtrSetImpl<const SCEV *> *LoserRegs) {
+  // Show whether a pre/post-increment addressing mode is used for the formula.
+  bool IsIndexedLoadStore = false;
   assert(F.isCanonical() && "Cost is accurate only for canonical formula");
   // Tally up the registers.
   if (const SCEV *ScaledReg = F.ScaledReg) {
@@ -999,7 +1045,8 @@
       Lose();
       return;
     }
-    RatePrimaryRegister(ScaledReg, Regs, L, SE, DT, LoserRegs);
+    RatePrimaryRegister(TTI, ScaledReg, Regs, L, SE, DT, LoserRegs, LU,
+                        IsIndexedLoadStore);
     if (isLoser())
       return;
   }
@@ -1010,71 +1057,76 @@
       Lose();
       return;
     }
-    RatePrimaryRegister(BaseReg, Regs, L, SE, DT, LoserRegs);
+    RatePrimaryRegister(TTI, BaseReg, Regs, L, SE, DT, LoserRegs, LU,
+                        IsIndexedLoadStore);
     if (isLoser())
       return;
   }
 
   // Determine how many (unfolded) adds we'll need inside the loop.
   size_t NumBaseParts = F.getNumRegs();
+  unsigned NumBaseAdds = 0;
   if (NumBaseParts > 1)
     // Do not count the base and a possible second register if the target
     // allows to fold 2 registers.
     NumBaseAdds +=
-        NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(TTI, LU, F)));
+        NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(TTI, LU, F) &&
+                             !IsIndexedLoadStore));
   NumBaseAdds += (F.UnfoldedOffset != 0);
+  InstNumCost += NumBaseAdds;
 
   // Accumulate non-free scaling amounts.
-  ScaleCost += getScalingFactorCost(TTI, LU, F);
+  InstCmplxCost += getScalingFactorCost(TTI, LU, F, InstNumCost);
 
   // Tally up the non-zero immediates.
+  unsigned ImmCost = 0;
   for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(),
        E = Offsets.end(); I != E; ++I) {
     int64_t Offset = (uint64_t)*I + F.BaseOffset;
-    if (F.BaseGV)
-      ImmCost += 64; // Handle symbolic values conservatively.
-                     // TODO: This should probably be the pointer size.
-    else if (Offset != 0)
-      ImmCost += APInt(64, Offset, true).getMinSignedBits();
+    if (Offset != 0 && IsIndexedLoadStore)
+      ImmCost += 1;
   }
+  InstNumCost += ImmCost;
+
+  UpdateSpillRegsCost(TTI);
   assert(isValid() && "invalid cost");
 }
 
 /// Lose - Set this cost to a losing value.
 void Cost::Lose() {
   NumRegs = ~0u;
-  AddRecCost = ~0u;
-  NumIVMuls = ~0u;
-  NumBaseAdds = ~0u;
-  ImmCost = ~0u;
+  InstNumCost = ~0u;
+  SpillRegsCost = ~0u;
+  InstCmplxCost = ~0u;
   SetupCost = ~0u;
-  ScaleCost = ~0u;
+}
+
+/// UpdateSpillRegsCost - Update the SpillRegsCost according to register
+/// pressure.
+void Cost::UpdateSpillRegsCost(const TargetTransformInfo &TTI) {
+  SpillRegsCost = (NumRegs > TTI.getNumberOfRegisters(false))
+                      ? NumRegs - TTI.getNumberOfRegisters(false)
+                      : 0;
 }
 
 /// operator< - Choose the lower cost.
 bool Cost::operator<(const Cost &Other) const {
-  return std::tie(NumRegs, AddRecCost, NumIVMuls, NumBaseAdds, ScaleCost,
-                  ImmCost, SetupCost) <
-         std::tie(Other.NumRegs, Other.AddRecCost, Other.NumIVMuls,
-                  Other.NumBaseAdds, Other.ScaleCost, Other.ImmCost,
-                  Other.SetupCost);
+  unsigned InstCost = InstNumCost + SpillRegsCost;
+  unsigned OtherInstCost = Other.InstNumCost + Other.SpillRegsCost;
+  return std::tie(InstCost, InstCmplxCost, SetupCost) <
+         std::tie(OtherInstCost, Other.InstCmplxCost, Other.SetupCost);
 }
 
 void Cost::print(raw_ostream &OS) const {
-  OS << NumRegs << " reg" << (NumRegs == 1 ? "" : "s");
-  if (AddRecCost != 0)
-    OS << ", with addrec cost " << AddRecCost;
-  if (NumIVMuls != 0)
-    OS << ", plus " << NumIVMuls << " IV mul" << (NumIVMuls == 1 ? "" : "s");
-  if (NumBaseAdds != 0)
-    OS << ", plus " << NumBaseAdds << " base add"
-       << (NumBaseAdds == 1 ? "" : "s");
-  if (ScaleCost != 0)
-    OS << ", plus " << ScaleCost << " scale cost";
-  if (ImmCost != 0)
-    OS << ", plus " << ImmCost << " imm cost";
+  OS << NumRegs << " Reg" << (NumRegs == 1 ? "" : "s");
+  if (InstNumCost != 0)
+    OS << ", plus " << InstNumCost << " InstNumCost";
+  if (SpillRegsCost != 0)
+    OS << ", plus " << SpillRegsCost << " SpillRegsCost";
+  if (InstCmplxCost != 0)
+    OS << ", plus " << InstCmplxCost << " InstCmplxCost";
   if (SetupCost != 0)
-    OS << ", plus " << SetupCost << " setup cost";
+    OS << ", plus " << SetupCost << " SetupCost";
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -1497,16 +1549,23 @@
                               F.Scale);
 }
 
+static bool isAddressType(const LSRUse &LU) {
+  return LU.Kind == LSRUse::Address;
+}
+
 static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
-                                     const LSRUse &LU, const Formula &F) {
+                                     const LSRUse &LU, const Formula &F,
+                                     unsigned &InstNumCost) {
   if (!F.Scale)
     return 0;
 
   // If the use is not completely folded in that instruction, we will have to
   // pay an extra cost only for scale != 1.
   if (!isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
-                            LU.AccessTy, F))
-    return F.Scale != 1;
+                            LU.AccessTy, F)) {
+    InstNumCost += 1;
+    return 0;
+  }
 
   switch (LU.Kind) {
   case LSRUse::Address: {
@@ -1527,8 +1586,8 @@
   case LSRUse::ICmpZero:
   case LSRUse::Basic:
   case LSRUse::Special:
-    // The use is completely folded, i.e., everything is folded into the
-    // instruction.
+    if (!(F.getNumRegs() == 1 && F.BaseOffset == 0 && F.UnfoldedOffset == 0))
+      InstNumCost += 1;
     return 0;
   }
 
@@ -1773,6 +1832,8 @@
                     DenseSet<const SCEV *> &VisitedRegs) const;
   void Solve(SmallVectorImpl<const Formula *> &Solution) const;
+  void DumpSolution(SmallVectorImpl<const Formula *> &Solution,
+                    Cost &SolutionCost, StringRef HeadStr) const;
 
   BasicBlock::iterator
     HoistInsertPosition(BasicBlock::iterator IP,
                         const SmallVectorImpl<Instruction *> &Inputs) const;
@@ -4296,42 +4357,12 @@
 
   const LSRUse &LU = Uses[Workspace.size()];
 
-  // If this use references any register that's already a part of the
-  // in-progress solution, consider it a requirement that a formula must
-  // reference that register in order to be considered. This prunes out
-  // unprofitable searching.
-  SmallSetVector<const SCEV *, 4> ReqRegs;
-  for (const SCEV *S : CurRegs)
-    if (LU.Regs.count(S))
-      ReqRegs.insert(S);
-
   SmallPtrSet<const SCEV *, 16> NewRegs;
   Cost NewCost;
   for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(),
        E = LU.Formulae.end(); I != E; ++I) {
     const Formula &F = *I;
 
-    // Ignore formulae which may not be ideal in terms of register reuse of
-    // ReqRegs. The formula should use all required registers before
-    // introducing new ones.
-    int NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size());
-    for (SmallSetVector<const SCEV *, 4>::const_iterator J = ReqRegs.begin(),
-         JE = ReqRegs.end(); J != JE; ++J) {
-      const SCEV *Reg = *J;
-      if ((F.ScaledReg && F.ScaledReg == Reg) ||
-          std::find(F.BaseRegs.begin(), F.BaseRegs.end(), Reg) !=
-          F.BaseRegs.end()) {
-        --NumReqRegsToFind;
-        if (NumReqRegsToFind == 0)
-          break;
-      }
-    }
-    if (NumReqRegsToFind != 0) {
-      // If none of the formulae satisfied the required registers, then we could
-      // clear ReqRegs and try again. Currently, we simply give up in this case.
-      continue;
-    }
-
     // Evaluate the cost of the current formula. If it's already worse than
     // the current best, prune the search at that point.
     NewCost = CurCost;
@@ -4346,14 +4377,9 @@
         if (F.getNumRegs() == 1 && Workspace.size() == 1)
           VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
       } else {
-        DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
-              dbgs() << ".\n Regs:";
-              for (const SCEV *S : NewRegs)
-                dbgs() << ' ' << *S;
-              dbgs() << '\n');
-
         SolutionCost = NewCost;
         Solution = Workspace;
+        DumpSolution(Solution, SolutionCost, "New best at ");
       }
       Workspace.pop_back();
     }
@@ -4380,9 +4406,15 @@
   }
 
   // Ok, we've now made all our decisions.
-  DEBUG(dbgs() << "\n"
-                  "The chosen solution requires "; SolutionCost.print(dbgs());
-        dbgs() << ":\n";
+  DumpSolution(Solution, SolutionCost, "The chosen solution requires ");
+
+  assert(Solution.size() == Uses.size() && "Malformed solution!");
+}
+
+/// DumpSolution - Dump the given solution.
+void LSRInstance::DumpSolution(SmallVectorImpl<const Formula *> &Solution,
+                               Cost &SolutionCost, StringRef HeadStr) const {
+  DEBUG(dbgs() << "\n" << HeadStr; SolutionCost.print(dbgs()); dbgs() << ":\n";
         for (size_t i = 0, e = Uses.size(); i != e; ++i) {
           dbgs() << "  ";
           Uses[i].print(dbgs());
@@ -4391,8 +4423,6 @@
           Solution[i]->print(dbgs());
          dbgs() << '\n';
         });
-
-  assert(Solution.size() == Uses.size() && "Malformed solution!");
 }
 
 /// HoistInsertPosition - Helper for AdjustInsertPositionForExpand. Climb up
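For reference, the kind of input the cost-model change targets is a simple
strided store loop. The sketch below is illustrative only (the array, names,
and offset are not taken from the patch or its tests); it matches the assembly
example in Cost::RateRegister: once the stride is reported as foldable, the
store can become a single post-incremented str on AArch64 instead of needing a
separate add for the AddRec each iteration.

  // Illustrative input only, not part of the patch.
  int a[1024];

  void fill(int n, int c) {
    // The store address is the affine AddRec {a+48,+,4}; when the target
    // reports the 4-byte stride as foldable, LSR can keep it in a
    // post-incremented store rather than materializing a separate add.
    for (int i = 0; i < n; ++i)
      a[i + 12] = c;
  }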