Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -305,6 +305,11 @@
   bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                              bool HasBaseReg, int64_t Scale) const;
 
+  /// \brief Return true when the AddRec can be folded into another insn. This
+  /// is possible when the target supports pre/postinc and the stride of the
+  /// AddRec is an acceptable constant.
+  bool isAddRecFoldable(int64_t stride) const;
+
   /// \brief Return true if the target works with masked instruction
   /// AVX2 allows masks for consecutive load and store for i32 and i64 elements.
   /// AVX-512 architecture will also allow masks for non-consecutive memory
@@ -537,6 +542,7 @@
   virtual void getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) = 0;
   virtual bool isLegalAddImmediate(int64_t Imm) = 0;
   virtual bool isLegalICmpImmediate(int64_t Imm) = 0;
+  virtual bool isAddRecFoldable(int64_t stride) = 0;
   virtual bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
                                      int64_t BaseOffset, bool HasBaseReg,
                                      int64_t Scale) = 0;
@@ -647,6 +653,9 @@
   bool isLegalICmpImmediate(int64_t Imm) override {
     return Impl.isLegalICmpImmediate(Imm);
   }
+  bool isAddRecFoldable(int64_t stride) override {
+    return Impl.isAddRecFoldable(stride);
+  }
   bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                              bool HasBaseReg, int64_t Scale) override {
     return Impl.isLegalAddressingMode(Ty, BaseGV, BaseOffset, HasBaseReg,
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -206,6 +206,8 @@
   bool isLegalICmpImmediate(int64_t Imm) { return false; }
 
+  bool isAddRecFoldable(int64_t stride) { return false; }
+
   bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                              bool HasBaseReg, int64_t Scale) {
     // Guess that reg+reg addressing is allowed. This heuristic is taken from
Index: include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- include/llvm/CodeGen/BasicTTIImpl.h
+++ include/llvm/CodeGen/BasicTTIImpl.h
@@ -124,6 +124,10 @@
     return getTLI()->isLegalICmpImmediate(imm);
   }
 
+  bool isAddRecFoldable(int64_t stride) {
+    return getTLI()->isAddRecFoldable(stride);
+  }
+
   bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                              bool HasBaseReg, int64_t Scale) {
     TargetLoweringBase::AddrMode AM;
Index: include/llvm/Target/TargetLowering.h
===================================================================
--- include/llvm/Target/TargetLowering.h
+++ include/llvm/Target/TargetLowering.h
@@ -1462,6 +1462,11 @@
     return true;
   }
 
+  /// \brief Return true when the AddRec can be folded into another insn. This
+  /// is possible when the target supports pre/postinc and the stride of the
+  /// AddRec is an acceptable constant.
+  virtual bool isAddRecFoldable(int64_t stride) const { return false; }
+
   /// Return true if the specified immediate is legal add immediate, that is the
   /// target has add instructions which can add a register with the immediate
   /// without having to materialize the immediate into a register.
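
Note (not part of the patch): the AArch64 and ARM overrides further down simply return true for every stride. A target whose pre/postinc forms only accept a limited offset range could instead use the stride argument to filter. A minimal sketch of such an override; the MyTargetLowering name and the +/-255 window are hypothetical, for illustration only:

// Sketch only: accept strides that fit a hypothetical post-increment
// immediate field; the exact range is an assumption, not part of this patch.
bool MyTargetLowering::isAddRecFoldable(int64_t stride) const {
  return stride != 0 && stride >= -255 && stride <= 255;
}
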
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -97,6 +97,10 @@
   return TTIImpl->isLegalICmpImmediate(Imm);
 }
 
+bool TargetTransformInfo::isAddRecFoldable(int64_t stride) const {
+  return TTIImpl->isAddRecFoldable(stride);
+}
+
 bool TargetTransformInfo::isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
                                                 int64_t BaseOffset,
                                                 bool HasBaseReg,
Index: lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.h
+++ lib/Target/AArch64/AArch64ISelLowering.h
@@ -307,6 +307,7 @@
   bool isLegalAddImmediate(int64_t) const override;
   bool isLegalICmpImmediate(int64_t) const override;
+  bool isAddRecFoldable(int64_t stride) const override;
 
   EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
                           bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -6766,6 +6766,12 @@
   return isLegalAddImmediate(Immed);
 }
 
+/// AArch64 supports pre/postinc, so the stride of the AddRec may be folded
+/// into another load/store insn.
+bool AArch64TargetLowering::isAddRecFoldable(int64_t stride) const {
+  return true;
+}
+
 /// isLegalAddressingMode - Return true if the addressing mode represented
 /// by AM is legal for this target, for a load/store of the specified type.
 bool AArch64TargetLowering::isLegalAddressingMode(const AddrMode &AM,
Index: lib/Target/ARM/ARMISelLowering.h
===================================================================
--- lib/Target/ARM/ARMISelLowering.h
+++ lib/Target/ARM/ARMISelLowering.h
@@ -294,6 +294,10 @@
     /// the immediate into a register.
     bool isLegalICmpImmediate(int64_t Imm) const override;
 
+    /// isAddRecFoldable - Return true if the constant stride of an AddRec can
+    /// be folded into a preceding load/store insn via a pre/postinc addressing mode.
+    bool isAddRecFoldable(int64_t stride) const override;
+
    /// isLegalAddImmediate - Return true if the specified immediate is legal
     /// add immediate, that is the target has add instructions which can
     /// add a register and the immediate without having to materialize
Index: lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- lib/Target/ARM/ARMISelLowering.cpp
+++ lib/Target/ARM/ARMISelLowering.cpp
@@ -10299,6 +10299,10 @@
   return Imm >= 0 && Imm <= 255;
 }
 
+/// ARM supports pre/postinc, so the stride of the AddRec may be folded
+/// into another load/store insn.
+bool ARMTargetLowering::isAddRecFoldable(int64_t stride) const { return true; }
+
 /// isLegalAddImmediate - Return true if the specified immediate is a legal add
 /// *or sub* immediate, that is the target has add or sub instructions which can
 /// add a register with the immediate without having to materialize the
Index: lib/Transforms/Scalar/LoopStrengthReduce.cpp
===================================================================
--- lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -103,6 +103,23 @@
 static bool StressIVChain = false;
 #endif
 
+enum LSRModel { BasicMod, AggrModUnlimitedReg, AggrModEstReg, AggrModFewReg };
+
+static cl::opt<LSRModel> LSRCostModel(
+    "lsrmodel", cl::Hidden, cl::init(BasicMod),
+    cl::desc("Choose the LSR cost model:"),
+    cl::values(
+        clEnumValN(BasicMod, "basic", "enable basic cost model"),
+        clEnumValN(
+            AggrModUnlimitedReg, "aggr-unlimited",
+            "enable aggressive cost model and assume unlimited hard regs"),
+        clEnumValN(AggrModEstReg, "aggr-estimate",
+                   "enable aggressive cost model and estimate reg pressure"),
+        clEnumValN(
+            AggrModFewReg, "aggr-few",
+            "enable aggressive cost model and assume very few hard regs"),
+        clEnumValEnd));
+
 namespace {
 
 /// RegSortData - This class holds data which is used to order reuse candidates.
@@ -847,36 +864,72 @@
 /// accurate cost model.
 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
                                  const LSRUse &LU, const Formula &F);
-// Get the cost of the scaling factor used in F for LU.
-static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
-                                     const LSRUse &LU, const Formula &F);
 
 namespace {
 
+/// Types of the cost models.
+enum CostTypes { BasicCostType, AggressiveCostType };
+
 /// Cost - This class is used to measure and compare candidate formulae.
 class Cost {
+protected:
+  unsigned NumRegs;
+  unsigned SetupCost;
+
+  /// Identifies the concrete cost model class of this object.
+  unsigned CostType;
+
+public:
+  static Cost &Create();
+  static void Delete(Cost &);
+
+  virtual ~Cost() {}
+  virtual bool operator<(const Cost &Other) const = 0;
+  virtual void operator=(const Cost &Other) = 0;
+  virtual void Lose() = 0;
+#ifndef NDEBUG
+  virtual bool isValid() = 0;
+#endif
+  virtual void clear() = 0;
+  virtual bool isLoser() = 0;
+  virtual void
+  RateFormula(const TargetTransformInfo &TTI, const Formula &F,
+              SmallPtrSetImpl<const SCEV *> &Regs,
+              const DenseSet<const SCEV *> &VisitedRegs, const Loop *L,
+              const SmallVectorImpl<int64_t> &Offsets, ScalarEvolution &SE,
+              DominatorTree &DT, const LSRUse &LU,
+              SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr) = 0;
+
+  virtual void print(raw_ostream &OS) const = 0;
+  void dump();
+  unsigned getCostType() const { return CostType; }
+};
+
+class BasicCost : public Cost {
   /// TODO: Some of these could be merged. Also, a lexical ordering
   /// isn't always optimal.
-  unsigned NumRegs;
   unsigned AddRecCost;
   unsigned NumIVMuls;
   unsigned NumBaseAdds;
   unsigned ImmCost;
-  unsigned SetupCost;
   unsigned ScaleCost;
 
 public:
-  Cost()
-    : NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0), ImmCost(0),
-      SetupCost(0), ScaleCost(0) {}
+  void clear();
+
+  BasicCost() {
+    CostType = BasicCostType;
+    clear();
+  }
 
-  bool operator<(const Cost &Other) const;
+  bool operator<(const Cost &Other) const override;
+  void operator=(const Cost &Other) override;
 
   void Lose();
 
 #ifndef NDEBUG
   // Once any of the metrics loses, they must all remain losers.
-  bool isValid() {
+  virtual bool isValid() {
     return ((NumRegs | AddRecCost | NumIVMuls | NumBaseAdds
              | ImmCost | SetupCost | ScaleCost) != ~0u)
       || ((NumRegs & AddRecCost & NumIVMuls & NumBaseAdds
@@ -889,38 +942,124 @@
     return NumRegs == ~0u;
   }
 
-  void RateFormula(const TargetTransformInfo &TTI,
-                   const Formula &F,
-                   SmallPtrSetImpl<const SCEV *> &Regs,
-                   const DenseSet<const SCEV *> &VisitedRegs,
-                   const Loop *L,
-                   const SmallVectorImpl<int64_t> &Offsets,
-                   ScalarEvolution &SE, DominatorTree &DT,
-                   const LSRUse &LU,
-                   SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);
+  virtual void RateFormula(const TargetTransformInfo &TTI, const Formula &F,
+                           SmallPtrSetImpl<const SCEV *> &Regs,
+                           const DenseSet<const SCEV *> &VisitedRegs,
+                           const Loop *L,
+                           const SmallVectorImpl<int64_t> &Offsets,
+                           ScalarEvolution &SE, DominatorTree &DT,
+                           const LSRUse &LU,
+                           SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);
+
+  virtual void print(raw_ostream &OS) const;
 
-  void print(raw_ostream &OS) const;
-  void dump() const;
+  static inline bool classof(const Cost *CT) {
+    return CT->getCostType() == BasicCostType;
+  }
 
 private:
-  void RateRegister(const SCEV *Reg,
-                    SmallPtrSetImpl<const SCEV *> &Regs,
-                    const Loop *L,
-                    ScalarEvolution &SE, DominatorTree &DT);
-  void RatePrimaryRegister(const SCEV *Reg,
-                           SmallPtrSetImpl<const SCEV *> &Regs,
-                           const Loop *L,
-                           ScalarEvolution &SE, DominatorTree &DT,
-                           SmallPtrSetImpl<const SCEV *> *LoserRegs);
+  void RateRegister(const SCEV *Reg, SmallPtrSetImpl<const SCEV *> &Regs,
+                    const Loop *L, ScalarEvolution &SE, DominatorTree &DT);
+  void RatePrimaryRegister(const SCEV *Reg, SmallPtrSetImpl<const SCEV *> &Regs,
+                           const Loop *L, ScalarEvolution &SE,
+                           DominatorTree &DT,
+                           SmallPtrSetImpl<const SCEV *> *LoserRegs);
+  unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
+                                const LSRUse &LU, const Formula &F);
+};
+
+class AggressiveCost : public Cost {
+  /// Cost in number of instructions.
+  unsigned InstNumCost;
+  /// Cost of registers which may spill.
+  unsigned SpillRegsCost;
+  /// Instruction complexity cost.
+  unsigned InstCmplxCost;
+
+public:
+  void clear();
+
+  AggressiveCost() {
+    CostType = AggressiveCostType;
+    clear();
+  }
+
+  bool operator<(const Cost &Other) const override;
+  void operator=(const Cost &Other) override;
+
+  void Lose();
+
+#ifndef NDEBUG
+  // Once any of the metrics loses, they must all remain losers.
+  bool isValid() {
+    return (
+        ((InstNumCost | SpillRegsCost | InstCmplxCost | SetupCost) != ~0u) ||
+        ((InstNumCost & SpillRegsCost & InstCmplxCost & SetupCost) == ~0u));
+  }
+#endif
+
+  bool isLoser() {
+    assert(isValid() && "invalid cost");
+    return InstNumCost == ~0u;
+  }
+
+  virtual void RateFormula(const TargetTransformInfo &TTI, const Formula &F,
+                           SmallPtrSetImpl<const SCEV *> &Regs,
+                           const DenseSet<const SCEV *> &VisitedRegs,
+                           const Loop *L,
+                           const SmallVectorImpl<int64_t> &Offsets,
+                           ScalarEvolution &SE, DominatorTree &DT,
+                           const LSRUse &LU,
+                           SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);
+
+  virtual void print(raw_ostream &OS) const;
+
+  static inline bool classof(const Cost *CT) {
+    return CT->getCostType() == AggressiveCostType;
+  }
+
+private:
+  void RateRegister(const TargetTransformInfo &TTI, const SCEV *Reg,
+                    SmallPtrSetImpl<const SCEV *> &Regs, const Loop *L,
+                    ScalarEvolution &SE, DominatorTree &DT, const LSRUse &LU,
+                    bool &IsIndexedLoadStore);
+  void RatePrimaryRegister(const TargetTransformInfo &TTI, const SCEV *Reg,
+                           SmallPtrSetImpl<const SCEV *> &Regs, const Loop *L,
+                           ScalarEvolution &SE, DominatorTree &DT,
+                           SmallPtrSetImpl<const SCEV *> *LoserRegs,
+                           const LSRUse &LU, bool &IsIndexedLoadStore);
+  unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
+                                const LSRUse &LU, const Formula &F);
+  void UpdateSpillRegsCost(const TargetTransformInfo &TTI);
+  unsigned EvaluateBaseParts(const Formula &F, ScalarEvolution &SE,
+                             const Loop *L);
 };
 
 }
 
+Cost &Cost::Create() {
+  if (LSRCostModel == BasicMod)
+    return *(new BasicCost());
+  else
+    return *(new AggressiveCost());
+}
+
+void Cost::Delete(Cost &cost) { delete &cost; }
+
+void BasicCost::clear() {
+  NumRegs = 0;
+  AddRecCost = 0;
+  NumIVMuls = 0;
+  NumBaseAdds = 0;
+  ImmCost = 0;
+  SetupCost = 0;
+  ScaleCost = 0;
+}
+
 /// RateRegister - Tally up interesting quantities from the given register.
-void Cost::RateRegister(const SCEV *Reg,
-                        SmallPtrSetImpl<const SCEV *> &Regs,
-                        const Loop *L,
-                        ScalarEvolution &SE, DominatorTree &DT) {
+void BasicCost::RateRegister(const SCEV *Reg,
+                             SmallPtrSetImpl<const SCEV *> &Regs, const Loop *L,
+                             ScalarEvolution &SE, DominatorTree &DT) {
   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
     // If this is an addrec for another loop, don't second-guess its addrec phi
     // nodes. LSR isn't currently smart enough to reason about more than one
@@ -965,11 +1104,11 @@
 /// RatePrimaryRegister - Record this register in the set. If we haven't seen it
 /// before, rate it. Optional LoserRegs provides a way to declare any formula
 /// that refers to one of those regs an instant loser.
-void Cost::RatePrimaryRegister(const SCEV *Reg,
-                               SmallPtrSetImpl<const SCEV *> &Regs,
-                               const Loop *L,
-                               ScalarEvolution &SE, DominatorTree &DT,
-                               SmallPtrSetImpl<const SCEV *> *LoserRegs) {
+void BasicCost::RatePrimaryRegister(const SCEV *Reg,
+                                    SmallPtrSetImpl<const SCEV *> &Regs,
+                                    const Loop *L, ScalarEvolution &SE,
+                                    DominatorTree &DT,
+                                    SmallPtrSetImpl<const SCEV *> *LoserRegs) {
   if (LoserRegs && LoserRegs->count(Reg)) {
     Lose();
     return;
@@ -981,15 +1120,14 @@
   }
 }
 
-void Cost::RateFormula(const TargetTransformInfo &TTI,
-                       const Formula &F,
-                       SmallPtrSetImpl<const SCEV *> &Regs,
-                       const DenseSet<const SCEV *> &VisitedRegs,
-                       const Loop *L,
-                       const SmallVectorImpl<int64_t> &Offsets,
-                       ScalarEvolution &SE, DominatorTree &DT,
-                       const LSRUse &LU,
-                       SmallPtrSetImpl<const SCEV *> *LoserRegs) {
+void BasicCost::RateFormula(const TargetTransformInfo &TTI, const Formula &F,
+                            SmallPtrSetImpl<const SCEV *> &Regs,
+                            const DenseSet<const SCEV *> &VisitedRegs,
+                            const Loop *L,
+                            const SmallVectorImpl<int64_t> &Offsets,
+                            ScalarEvolution &SE, DominatorTree &DT,
+                            const LSRUse &LU,
+                            SmallPtrSetImpl<const SCEV *> *LoserRegs) {
   assert(F.isCanonical() && "Cost is accurate only for canonical formula");
   // Tally up the registers.
   if (const SCEV *ScaledReg = F.ScaledReg) {
@@ -1039,7 +1177,7 @@
 }
 
 /// Lose - Set this cost to a losing value.
-void Cost::Lose() {
+void BasicCost::Lose() {
   NumRegs = ~0u;
   AddRecCost = ~0u;
   NumIVMuls = ~0u;
@@ -1050,7 +1188,9 @@
 }
 
 /// operator< - Choose the lower cost.
-bool Cost::operator<(const Cost &Other) const {
+bool BasicCost::operator<(const Cost &CT) const {
+  assert(isa<BasicCost>(CT) && "Expected a BasicCost object");
+  const BasicCost &Other = static_cast<const BasicCost &>(CT);
   return std::tie(NumRegs, AddRecCost, NumIVMuls, NumBaseAdds, ScaleCost,
                   ImmCost, SetupCost) <
          std::tie(Other.NumRegs, Other.AddRecCost, Other.NumIVMuls,
@@ -1058,7 +1198,20 @@
                   Other.SetupCost);
 }
 
-void Cost::print(raw_ostream &OS) const {
+void BasicCost::operator=(const Cost &CT) {
+  assert(isa<BasicCost>(CT) && "Expected a BasicCost object");
+  const BasicCost &Other = static_cast<const BasicCost &>(CT);
+
+  NumRegs = Other.NumRegs;
+  AddRecCost = Other.AddRecCost;
+  NumIVMuls = Other.NumIVMuls;
+  NumBaseAdds = Other.NumBaseAdds;
+  ImmCost = Other.ImmCost;
+  SetupCost = Other.SetupCost;
+  ScaleCost = Other.ScaleCost;
+}
+
+void BasicCost::print(raw_ostream &OS) const {
   OS << NumRegs << " reg" << (NumRegs == 1 ? "" : "s");
   if (AddRecCost != 0)
     OS << ", with addrec cost " << AddRecCost;
@@ -1075,11 +1228,174 @@
     OS << ", plus " << SetupCost << " setup cost";
 }
 
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void Cost::dump() const {
-  print(errs()); errs() << '\n';
-}
-#endif
+void AggressiveCost::clear() {
+  NumRegs = 0;
+  SetupCost = 0;
+  InstNumCost = 0;
+  SpillRegsCost = 0;
+  InstCmplxCost = 0;
+}
+
+/// RatePrimaryRegister - Record this register in the set. If we haven't seen it
+/// before, rate it. Optional LoserRegs provides a way to declare any formula
+/// that refers to one of those regs an instant loser.
+void AggressiveCost::RatePrimaryRegister(
+    const TargetTransformInfo &TTI, const SCEV *Reg,
+    SmallPtrSetImpl<const SCEV *> &Regs, const Loop *L, ScalarEvolution &SE,
+    DominatorTree &DT, SmallPtrSetImpl<const SCEV *> *LoserRegs,
+    const LSRUse &LU, bool &IsIndexedLoadStore) {
+  if (LoserRegs && LoserRegs->count(Reg)) {
+    Lose();
+    return;
+  }
+  if (Regs.insert(Reg).second) {
+    RateRegister(TTI, Reg, Regs, L, SE, DT, LU, IsIndexedLoadStore);
+    if (LoserRegs && isLoser())
+      LoserRegs->insert(Reg);
+  }
+}
+
+unsigned AggressiveCost::EvaluateBaseParts(const Formula &F,
+                                           ScalarEvolution &SE, const Loop *L) {
+  unsigned variant = 0;
+  unsigned invariant = 0;
+  for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
+                                                     E = F.BaseRegs.end();
+       I != E; ++I) {
+    if (SE.isLoopInvariant(*I, L))
+      invariant++;
+    else
+      variant++;
+  }
+  if (F.BaseGV || F.BaseOffset)
+    invariant++;
+  if (F.ScaledReg)
+    variant++;
+  return variant + (invariant ? 1 : 0);
+}
+
+void AggressiveCost::RateFormula(const TargetTransformInfo &TTI,
+                                 const Formula &F,
+                                 SmallPtrSetImpl<const SCEV *> &Regs,
+                                 const DenseSet<const SCEV *> &VisitedRegs,
+                                 const Loop *L,
+                                 const SmallVectorImpl<int64_t> &Offsets,
+                                 ScalarEvolution &SE, DominatorTree &DT,
+                                 const LSRUse &LU,
+                                 SmallPtrSetImpl<const SCEV *> *LoserRegs) {
+  // Records whether a pre/postinc addressing mode is used for the formula.
+  bool IsIndexedLoadStore = false;
+  assert(F.isCanonical() && "Cost is accurate only for canonical formula");
+  // Tally up the registers.
+  if (const SCEV *ScaledReg = F.ScaledReg) {
+    if (VisitedRegs.count(ScaledReg)) {
+      Lose();
+      return;
+    }
+    RatePrimaryRegister(TTI, ScaledReg, Regs, L, SE, DT, LoserRegs, LU,
+                        IsIndexedLoadStore);
+    if (isLoser())
+      return;
+  }
+  for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(),
+                                                     E = F.BaseRegs.end();
+       I != E; ++I) {
+    const SCEV *BaseReg = *I;
+    if (VisitedRegs.count(BaseReg)) {
+      Lose();
+      return;
+    }
+    RatePrimaryRegister(TTI, BaseReg, Regs, L, SE, DT, LoserRegs, LU,
+                        IsIndexedLoadStore);
+    if (isLoser())
+      return;
+  }
+
+  // Determine how many (unfolded) adds we'll need inside the loop.
+  size_t NumBaseParts = EvaluateBaseParts(F, SE, L);
+  unsigned NumBaseAdds = 0;
+  if (NumBaseParts > 1)
+    // Do not count the base and a possible second register if the target
+    // allows to fold 2 registers.
+    NumBaseAdds +=
+        NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(TTI, LU, F) &&
+                             !IsIndexedLoadStore));
+  NumBaseAdds += (F.UnfoldedOffset != 0);
+  InstNumCost += NumBaseAdds;
+
+  // Accumulate non-free scaling amounts.
+  InstCmplxCost += getScalingFactorCost(TTI, LU, F);
+
+  // Tally up the non-zero immediates.
+  unsigned ImmCost = 0;
+  for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(),
+                                                E = Offsets.end();
+       I != E; ++I) {
+    int64_t Offset = (uint64_t)*I + F.BaseOffset;
+    if (Offset != 0 && IsIndexedLoadStore)
+      ImmCost += 1;
+  }
+  InstNumCost += ImmCost;
+
+  UpdateSpillRegsCost(TTI);
+  assert(isValid() && "invalid cost");
+}
+
+/// Lose - Set this cost to a losing value.
+void AggressiveCost::Lose() {
+  NumRegs = ~0u;
+  InstNumCost = ~0u;
+  SpillRegsCost = ~0u;
+  InstCmplxCost = ~0u;
+  SetupCost = ~0u;
+}
+
+/// UpdateSpillRegsCost - Update the SpillRegsCost according to the
+/// LSRCostModel and the register pressure.
+void AggressiveCost::UpdateSpillRegsCost(const TargetTransformInfo &TTI) {
+  if (LSRCostModel == AggrModEstReg)
+    SpillRegsCost = (NumRegs > TTI.getNumberOfRegisters(false))
+                        ? NumRegs - TTI.getNumberOfRegisters(false)
+                        : 0;
+  else if (LSRCostModel == AggrModUnlimitedReg)
+    SpillRegsCost = 0;
+  else
+    SpillRegsCost = NumRegs;
+}
+
+/// operator< - Choose the lower cost.
+bool AggressiveCost::operator<(const Cost &CT) const {
+  assert(isa<AggressiveCost>(CT) && "Expected an AggressiveCost object");
+  const AggressiveCost &Other = static_cast<const AggressiveCost &>(CT);
+  unsigned InstCost = InstNumCost + SpillRegsCost;
+  unsigned OtherInstCost = Other.InstNumCost + Other.SpillRegsCost;
+  return std::tie(InstCost, InstCmplxCost, SetupCost) <
+         std::tie(OtherInstCost, Other.InstCmplxCost, Other.SetupCost);
+}
+
+void AggressiveCost::operator=(const Cost &CT) {
+  assert(isa<AggressiveCost>(CT) && "Expected an AggressiveCost object");
+  const AggressiveCost &Other = static_cast<const AggressiveCost &>(CT);
+
+  NumRegs = Other.NumRegs;
+  InstNumCost = Other.InstNumCost;
+  SpillRegsCost = Other.SpillRegsCost;
+  InstCmplxCost = Other.InstCmplxCost;
+  SetupCost = Other.SetupCost;
+}
+
+void AggressiveCost::print(raw_ostream &OS) const {
+  OS << NumRegs << " Reg" << (NumRegs == 1 ? "" : "s");
+  if (InstNumCost != 0)
+    OS << ", plus " << InstNumCost << " InstNumCost";
+  if (SpillRegsCost != 0)
+    OS << ", plus " << SpillRegsCost << " SpillRegsCost";
+  if (InstCmplxCost != 0)
+    OS << ", plus " << InstCmplxCost << " InstCmplxCost";
+  if (SetupCost != 0)
+    OS << ", plus " << SetupCost << " SetupCost";
+}
 
 namespace {
 
@@ -1269,6 +1585,90 @@
 }
 
+/// RateRegister - Tally up interesting quantities from the given register.
+void AggressiveCost::RateRegister(const TargetTransformInfo &TTI,
+                                  const SCEV *Reg,
+                                  SmallPtrSetImpl<const SCEV *> &Regs,
+                                  const Loop *L, ScalarEvolution &SE,
+                                  DominatorTree &DT, const LSRUse &LU,
+                                  bool &IsIndexedLoadStore) {
+  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
+    // If this is an addrec for another loop, don't second-guess its addrec phi
+    // nodes. LSR isn't currently smart enough to reason about more than one
+    // loop at a time. LSR has already run on inner loops, will not run on outer
+    // loops, and cannot be expected to change sibling loops.
+    if (AR->getLoop() != L) {
+      // If the AddRec exists, consider its register free and leave it alone.
+      if (isExistingPhi(AR, SE))
+        return;
+
+      // Otherwise, do not consider this formula at all.
+      Lose();
+      return;
+    }
+
+    // Usually we need a separate add/sub insn for every AddRec. But
+    // if the target supports a pre/postinc addressing mode, the following
+    // pattern (using AArch64 as an example):
+    //   adrp x10, a
+    //   add x10, x10, :lo12:a
+    // .LBB0_2:
+    //   add x12, x10, x9, lsl #2   // add insn for AddRec used by store
+    //   add x9, x9, #1
+    //   str w11, [x12, #48]
+    //   cmp x9, x8
+    //   b.lt .LBB0_2
+    // can be optimized to this form using the postinc addressing mode:
+    //   adrp x10, a
+    //   add x10, x10, :lo12:a
+    //   add x10, x10, #48
+    // .LBB0_2:
+    //   str w11, [x10], #4
+    //   add x9, x9, #1
+    //   cmp x9, x8
+    //   b.lt .LBB0_2
+    // and no separate add insn is required for the AddRec. If a pre/postinc
+    // addressing mode is used, we record it in IsIndexedLoadStore because
+    // it usually means the load/store cannot fold other regs or offsets
+    // into the same insn.
+    if (AR->isAffine() && isa<SCEVConstant>(AR->getOperand(1))) {
+      int64_t stride = dyn_cast_or_null<SCEVConstant>(AR->getOperand(1))
+                           ->getValue()
+                           ->getValue()
+                           .getSExtValue();
+      if (stride != 0 && TTI.isAddRecFoldable(stride) &&
+          (LU.Kind == LSRUse::Address))
+        IsIndexedLoadStore = true;
+      else
+        InstNumCost += 1;
+    } else {
+      InstNumCost += 1;
+    }
+
+    // Add the step value register, if it needs one.
+    // TODO: The non-affine case isn't precisely modeled here.
+    if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
+      if (!Regs.count(AR->getOperand(1))) {
+        RateRegister(TTI, AR->getOperand(1), Regs, L, SE, DT, LU,
+                     IsIndexedLoadStore);
+        if (isLoser())
+          return;
+      }
+    }
+  }
+  ++NumRegs;
+
+  // Rough heuristic; favor registers which don't require extra setup
+  // instructions in the preheader.
+  if (!isa<SCEVUnknown>(Reg) && !isa<SCEVConstant>(Reg) &&
+      !(isa<SCEVAddRecExpr>(Reg) &&
+        (isa<SCEVUnknown>(cast<SCEVAddRecExpr>(Reg)->getStart()) ||
+         isa<SCEVConstant>(cast<SCEVAddRecExpr>(Reg)->getStart()))))
+    ++SetupCost;
+
+  InstNumCost += isa<SCEVMulExpr>(Reg) && SE.hasComputableLoopEvolution(Reg, L);
+}
+
 /// HasFormula - Test whether this use as a formula which has the same
 /// registers as the given formula.
 bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
@@ -1492,8 +1892,9 @@
                                      F.Scale);
 }
 
-static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
-                                     const LSRUse &LU, const Formula &F) {
+/// Get the cost of the scaling factor used in F for LU.
+unsigned BasicCost::getScalingFactorCost(const TargetTransformInfo &TTI,
+                                         const LSRUse &LU, const Formula &F) {
   if (!F.Scale)
     return 0;
 
@@ -1530,6 +1931,51 @@
   llvm_unreachable("Invalid LSRUse Kind!");
 }
 
+unsigned AggressiveCost::getScalingFactorCost(const TargetTransformInfo &TTI,
+                                              const LSRUse &LU,
+                                              const Formula &F) {
+  if (!F.Scale)
+    return 0;
+
+  // If the use is not completely folded in that instruction, we will have to
+  // pay an extra cost only for scale != 1.
+  if (!isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
+                            LU.AccessTy, F)) {
+    InstNumCost += 1;
+    return 0;
+  }
+
+  switch (LU.Kind) {
+  case LSRUse::Address: {
+    // Check the scaling factor cost with both the min and max offsets.
+    int ScaleCostMinOffset = TTI.getScalingFactorCost(
+        LU.AccessTy, F.BaseGV, F.BaseOffset + LU.MinOffset, F.HasBaseReg,
+        F.Scale);
+    int ScaleCostMaxOffset = TTI.getScalingFactorCost(
+        LU.AccessTy, F.BaseGV, F.BaseOffset + LU.MaxOffset, F.HasBaseReg,
+        F.Scale);
+
+    assert(ScaleCostMinOffset >= 0 && ScaleCostMaxOffset >= 0 &&
+           "Legal addressing mode has an illegal cost!");
+    return std::max(ScaleCostMinOffset, ScaleCostMaxOffset);
+  }
+  case LSRUse::ICmpZero:
+  case LSRUse::Basic:
+  case LSRUse::Special:
+    // For ICmpZero, the loop can be kept as a count-down loop only when
+    // F.getNumRegs() == 1 and there is no other offset, or else we
+    // will need a separate cmp/test instruction.
+    // For Basic and Special uses, no complex addressing mode is allowed, so
+    // we need an extra instruction when F contains more than one reg or
+    // F contains another offset.
+    if (!(F.getNumRegs() == 1 && F.BaseOffset == 0 && F.UnfoldedOffset == 0))
+      InstNumCost += 1;
+    return 0;
+  }
+
+  llvm_unreachable("Invalid LSRUse Kind!");
+}
+
 static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
                              LSRUse::KindType Kind, Type *AccessTy,
                              GlobalValue *BaseGV, int64_t BaseOffset,
@@ -1768,6 +2214,9 @@
                     DenseSet<const SCEV *> &VisitedRegs) const;
   void Solve(SmallVectorImpl<const Formula *> &Solution) const;
 
+  void DumpSolution(SmallVectorImpl<const Formula *> &Solution,
+                    Cost &SolutionCost, std::string HeadStr) const;
+
   BasicBlock::iterator
     HoistInsertPosition(BasicBlock::iterator IP,
                         const SmallVectorImpl<Instruction *> &Inputs) const;
@@ -3921,6 +4370,8 @@
     BestFormulaeTy;
   BestFormulaeTy BestFormulae;
 
+  Cost &CostF = Cost::Create();
+  Cost &CostBest = Cost::Create();
   for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
     LSRUse &LU = Uses[LUIdx];
 
     DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs()); dbgs() << '\n');
@@ -3937,8 +4388,8 @@
       // avoids the need to recompute this information across formulae using the
       // same bad AddRec. Passing LoserRegs is also essential unless we remove
      // the corresponding bad register from the Regs set.
-      Cost CostF;
       Regs.clear();
+      CostF.clear();
       CostF.RateFormula(TTI, F, Regs, VisitedRegs, L, LU.Offsets, SE, DT, LU,
                         &LoserRegs);
       if (CostF.isLoser()) {
@@ -3973,8 +4424,8 @@
         Formula &Best = LU.Formulae[P.first->second];
 
-        Cost CostBest;
         Regs.clear();
+        CostBest.clear();
        CostBest.RateFormula(TTI, Best, Regs, VisitedRegs, L, LU.Offsets, SE,
                              DT, LU);
         if (CostF < CostBest)
@@ -4001,6 +4452,10 @@
     BestFormulae.clear();
   }
 
+  // Release the temporary cost objects.
+  Cost::Delete(CostF);
+  Cost::Delete(CostBest);
+
   DEBUG(if (ChangedFormulae) {
           dbgs() << "\n"
                     "After filtering out undesirable candidates:\n";
@@ -4296,12 +4751,14 @@
   // reference that register in order to be considered. This prunes out
   // unprofitable searching.
   SmallSetVector<const SCEV *, 4> ReqRegs;
-  for (const SCEV *S : CurRegs)
-    if (LU.Regs.count(S))
-      ReqRegs.insert(S);
+  if (LSRCostModel == BasicMod) {
+    for (const SCEV *S : CurRegs)
+      if (LU.Regs.count(S))
+        ReqRegs.insert(S);
+  }
 
   SmallPtrSet<const SCEV *, 16> NewRegs;
-  Cost NewCost;
+  Cost &NewCost = Cost::Create();
   for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(),
        E = LU.Formulae.end(); I != E; ++I) {
     const Formula &F = *I;
@@ -4341,27 +4798,23 @@
       if (F.getNumRegs() == 1 && Workspace.size() == 1)
         VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
     } else {
-      DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
-            dbgs() << ".\n Regs:";
-            for (const SCEV *S : NewRegs)
-              dbgs() << ' ' << *S;
-            dbgs() << '\n');
-
       SolutionCost = NewCost;
       Solution = Workspace;
+      DumpSolution(Solution, SolutionCost, "New best at ");
     }
     Workspace.pop_back();
    }
  }
+  Cost::Delete(NewCost);
 }
 
 /// Solve - Choose one formula from each use. Return the results in the given
 /// Solution vector.
 void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
   SmallVector<const Formula *, 8> Workspace;
-  Cost SolutionCost;
+  Cost &SolutionCost = Cost::Create();
   SolutionCost.Lose();
-  Cost CurCost;
+  Cost &CurCost = Cost::Create();
   SmallPtrSet<const SCEV *, 16> CurRegs;
   DenseSet<const SCEV *> VisitedRegs;
   Workspace.reserve(Uses.size());
@@ -4371,13 +4824,23 @@
                CurRegs, VisitedRegs);
   if (Solution.empty()) {
     DEBUG(dbgs() << "\nNo Satisfactory Solution\n");
-    return;
+    goto ret;
   }
 
   // Ok, we've now made all our decisions.
-  DEBUG(dbgs() << "\n"
-                  "The chosen solution requires "; SolutionCost.print(dbgs());
-        dbgs() << ":\n";
+  DumpSolution(Solution, SolutionCost, "The chosen solution requires ");
+
+  assert(Solution.size() == Uses.size() && "Malformed solution!");
+
+ret:
+  Cost::Delete(SolutionCost);
+  Cost::Delete(CurCost);
+}
+
+/// DumpSolution - Dump the existing solution.
+void LSRInstance::DumpSolution(SmallVectorImpl<const Formula *> &Solution,
+                               Cost &SolutionCost, std::string HeadStr) const {
+  DEBUG(dbgs() << "\n" << HeadStr; SolutionCost.print(dbgs()); dbgs() << ":\n";
         for (size_t i = 0, e = Uses.size(); i != e; ++i) {
           dbgs() << "  ";
           Uses[i].print(dbgs());
@@ -4386,8 +4849,6 @@
           Solution[i]->print(dbgs());
           dbgs() << '\n';
         });
-
-  assert(Solution.size() == Uses.size() && "Malformed solution!");
 }
 
 /// HoistInsertPosition - Helper for AdjustInsertPositionForExpand. Climb up
Index: test/Transforms/LoopStrengthReduce/new_cost_1.ll
===================================================================
--- test/Transforms/LoopStrengthReduce/new_cost_1.ll
+++ test/Transforms/LoopStrengthReduce/new_cost_1.ll
@@ -0,0 +1,45 @@
+; This test makes sure that the loop uses a post-increment addressing mode to reduce insns.
+; RUN: opt < %s -loop-reduce -lsrmodel=aggr-estimate -S | FileCheck %s
+
+; Make sure that two new loop induction variables are used.
+; CHECK-LABEL: @foo(
+; CHECK: getelementptr{{.*}}lsr.iv, i64 0, i64 1
+; CHECK: getelementptr{{.*}}lsr.iv2, i64 0, i64 1
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-linux-gnu"
+
+@N = common global i32 0, align 4
+@a = common global [1000 x i32] zeroinitializer, align 4
+@b = common global [1000 x i32] zeroinitializer, align 4
+@j = common global i32 0, align 4
+
+; Function Attrs: nounwind
+define void @foo() #0 {
+entry:
+  %0 = load i32, i32* @N, align 4
+  %cmp9 = icmp sgt i32 %0, 0
+  br i1 %cmp9, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:                                   ; preds = %entry
+  %conv = sext i32 %0 to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.body
+  %i.010 = phi i64 [ 0, %for.body.lr.ph ], [ %add3, %for.body ]
+  %add = add nuw nsw i64 %i.010, 12
+  %arrayidx = getelementptr inbounds [1000 x i32], [1000 x i32]* @a, i64 0, i64 %add
+  store i32 15, i32* %arrayidx, align 4
+  %sub = add nsw i64 %i.010, -2
+  %arrayidx2 = getelementptr inbounds [1000 x i32], [1000 x i32]* @b, i64 0, i64 %sub
+  store i32 15, i32* %arrayidx2, align 4
+  %add3 = add nuw nsw i64 %i.010, 1
+  %cmp = icmp slt i64 %add3, %conv
+  br i1 %cmp, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}
Index: test/Transforms/LoopStrengthReduce/new_cost_2.ll
===================================================================
--- test/Transforms/LoopStrengthReduce/new_cost_2.ll
+++ test/Transforms/LoopStrengthReduce/new_cost_2.ll
@@ -0,0 +1,111 @@
+; This test makes sure that the loop uses a post-increment addressing mode to reduce insns.
+; RUN: opt < %s -loop-reduce -lsrmodel=aggr-estimate -S | FileCheck %s
+
+; Make sure no lsr variable is generated. All the address references will share the
+; basic induction variable.
+; CHECK-NOT: lsr
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%class.G = type { i8 }
+%"class.G::H" = type { i8 }
+%class.C = type { %class.tuple }
+%class.tuple = type { i8 }
+%"class.internal::MatrixNaive" = type { i32, %class.J, %class.J.2, %class.J.2 }
+%class.J = type { %struct.F }
+%struct.F = type { %struct.anon }
+%struct.anon = type { %class.J.2* }
+%class.J.2 = type { %struct.F.3 }
+%struct.F.3 = type { %struct.anon.4 }
+%struct.anon.4 = type { float* }
+
+$_Z6MatrixI2S1IfLi27ELi51EEEvv = comdat any
+
+$_ZN6Runner15BenchmarkRunnerIN8internal11MatrixNaiveIfLi27ELi51EEEEEvi = comdat any
+
+@Benchmark = global %class.G zeroinitializer, align 1
+
+declare i32* @_Z8CallbackIPFvvEEPiT_(void ()*) #0
+
+; Function Attrs: noreturn uwtable
+define linkonce_odr void @_Z6MatrixI2S1IfLi27ELi51EEEvv() #1 comdat {
+entry:
+  tail call void @_ZN6Runner15BenchmarkRunnerIN8internal11MatrixNaiveIfLi27ELi51EEEEEvi(i32 undef)
+  unreachable
+}
+
+declare void @_ZN1G1HC1EPi(%"class.G::H"*, i32*) #0
+
+declare void @_ZN1GC1ENS_1HE(%class.G*) #0
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #2
+
+; Function Attrs: noreturn uwtable
+define linkonce_odr void @_ZN6Runner15BenchmarkRunnerIN8internal11MatrixNaiveIfLi27ELi51EEEEEvi(i32) #1 comdat align 2 {
+entry:
+  %operation = alloca %class.C, align 1
+  %1 = getelementptr inbounds %class.C, %class.C* %operation, i64 0, i32 0, i32 0
+  call void @llvm.lifetime.start(i64 1, i8* %1) #2
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.cond, %entry
+  %call1 = call %"class.internal::MatrixNaive"* @_ZN1CIN8internal11MatrixNaiveIfLi27ELi51EEEEptEv(%class.C* %operation)
+  %matrix_.i = getelementptr inbounds %"class.internal::MatrixNaive", %"class.internal::MatrixNaive"* %call1, i64 0, i32 1
+  %call.i = call i64 @_ZN1JIS_If1AIfEES0_IS2_EE4sizeEv(%class.J* %matrix_.i)
+  %Execute_num_rows.i = getelementptr inbounds %"class.internal::MatrixNaive", %"class.internal::MatrixNaive"* %call1, i64 0, i32 0
+  %2 = load i32, i32* %Execute_num_rows.i, align 4
+  %tobool32.i = icmp eq i32 %2, 0
+  br i1 %tobool32.i, label %for.cond, label %for.body.lr.ph.i
+
+for.body.lr.ph.i:                                 ; preds = %for.cond
+  %conv.i.le = trunc i64 %call.i to i32
+  %3 = bitcast %class.J* %matrix_.i to i64**
+  %cmp29.i = icmp sgt i32 %conv.i.le, 0
+  %_M_start.i27.i = getelementptr inbounds %"class.internal::MatrixNaive", %"class.internal::MatrixNaive"* %call1, i64 0, i32 3, i32 0, i32 0, i32 0
+  %_M_start.i24.i = getelementptr inbounds %"class.internal::MatrixNaive", %"class.internal::MatrixNaive"* %call1, i64 0, i32 2, i32 0, i32 0, i32 0
+  br label %for.body.i
+
+for.body.i:                                       ; preds = %for.cond.cleanup6.i, %for.body.lr.ph.i
+  %sum.033.i = phi float [ undef, %for.body.lr.ph.i ], [ %sum.1.lcssa.i, %for.cond.cleanup6.i ]
+  br i1 %cmp29.i, label %for.body7.lr.ph.i, label %for.cond.cleanup6.i
+
+for.body7.lr.ph.i:                                ; preds = %for.body.i
+  %4 = load i64*, i64** %3, align 8
+  %5 = load i64, i64* %4, align 8
+  %6 = inttoptr i64 %5 to float*
+  %7 = load float*, float** %_M_start.i24.i, align 8
+  br label %for.body7.i
+
+for.cond.cleanup6.i.loopexit:                     ; preds = %for.body7.i
+  br label %for.cond.cleanup6.i
+
+for.cond.cleanup6.i:                              ; preds = %for.cond.cleanup6.i.loopexit, %for.body.i
+  %sum.1.lcssa.i = phi float [ %sum.033.i, %for.body.i ], [ %add.i, %for.cond.cleanup6.i.loopexit ]
+  %8 = load float*, float** %_M_start.i27.i, align 8
+  store float %sum.1.lcssa.i, float* %8, align 4
+  br label %for.body.i
+
+for.body7.i:                                      ; preds = %for.body7.i, %for.body7.lr.ph.i
+  %indvars.iv.i = phi i64 [ 0, %for.body7.lr.ph.i ], [ %indvars.iv.next.i, %for.body7.i ]
+  %sum.130.i = phi float [ %sum.033.i, %for.body7.lr.ph.i ], [ %add.i, %for.body7.i ]
+  %add.ptr.i26.i = getelementptr inbounds float, float* %6, i64 %indvars.iv.i
+  %9 = load float, float* %add.ptr.i26.i, align 4
+  %add.ptr.i.i = getelementptr inbounds float, float* %7, i64 %indvars.iv.i
+  %10 = load float, float* %add.ptr.i.i, align 4
+  %mul.i = fmul float %9, %10
+  %add.i = fadd float %sum.130.i, %mul.i
+  %indvars.iv.next.i = add nuw nsw i64 %indvars.iv.i, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next.i to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %conv.i.le
+  br i1 %exitcond, label %for.cond.cleanup6.i.loopexit, label %for.body7.i
+}
+
+declare void @llvm.lifetime.end(i64, i8* nocapture) #2
+
+declare %"class.internal::MatrixNaive"* @_ZN1CIN8internal11MatrixNaiveIfLi27ELi51EEEEptEv(%class.C*) #0
+
+declare i64 @_ZN1JIS_If1AIfEES0_IS2_EE4sizeEv(%class.J*) #0
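
Usage note (not part of the patch): the cost model is selected with the -lsrmodel option introduced in LoopStrengthReduce.cpp above, with the values basic, aggr-unlimited, aggr-estimate, and aggr-few. Assuming opt is built with this patch applied, the tests can be reproduced with an invocation mirroring their RUN lines:

  opt < new_cost_1.ll -loop-reduce -lsrmodel=aggr-estimate -S | FileCheck new_cost_1.ll

The option defaults to basic (cl::init(BasicMod)), so the existing LSR cost model remains in effect unless the flag is passed explicitly.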