Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -346,6 +346,8 @@
                              bool HasBaseReg, int64_t Scale,
                              unsigned AddrSpace = 0) const;
 
+  unsigned increasedLSRFormulaCost(Instruction *I, int64_t Offset) const;
+
   /// \brief Return true if it's free to truncate a value of type Ty1 to type
   /// Ty2. e.g. On x86 it's free to truncate a i32 value in register EAX to i16
   /// by referencing its sub-register AX.
@@ -635,6 +637,7 @@
   virtual int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
                                    int64_t BaseOffset, bool HasBaseReg,
                                    int64_t Scale, unsigned AddrSpace) = 0;
+  virtual unsigned increasedLSRFormulaCost(Instruction *I, int64_t Offset) = 0;
   virtual bool isTruncateFree(Type *Ty1, Type *Ty2) = 0;
   virtual bool isProfitableToHoist(Instruction *I) = 0;
   virtual bool isTypeLegal(Type *Ty) = 0;
@@ -789,6 +792,9 @@
     return Impl.getScalingFactorCost(Ty, BaseGV, BaseOffset, HasBaseReg,
                                      Scale, AddrSpace);
   }
+  unsigned increasedLSRFormulaCost(Instruction *I, int64_t Offset) override {
+    return Impl.increasedLSRFormulaCost(I, Offset);
+  }
   bool isTruncateFree(Type *Ty1, Type *Ty2) override {
     return Impl.isTruncateFree(Ty1, Ty2);
   }
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -226,6 +226,8 @@
     return -1;
   }
 
+  unsigned increasedLSRFormulaCost(Instruction *I, int64_t Offset) { return 0; }
+
   bool isTruncateFree(Type *Ty1, Type *Ty2) { return false; }
 
   bool isProfitableToHoist(Instruction *I) { return true; }
Index: include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- include/llvm/CodeGen/BasicTTIImpl.h
+++ include/llvm/CodeGen/BasicTTIImpl.h
@@ -139,6 +139,10 @@
     return getTLI()->getScalingFactorCost(DL, AM, Ty, AddrSpace);
   }
 
+  unsigned increasedLSRFormulaCost(Instruction *I, int64_t Offset) {
+    return getTLI()->increasedLSRFormulaCost(I, Offset);
+  }
+
   bool isTruncateFree(Type *Ty1, Type *Ty2) {
     return getTLI()->isTruncateFree(Ty1, Ty2);
   }
Index: include/llvm/Target/TargetLowering.h
===================================================================
--- include/llvm/Target/TargetLowering.h
+++ include/llvm/Target/TargetLowering.h
@@ -1598,6 +1598,10 @@
     return -1;
   }
 
+  virtual unsigned increasedLSRFormulaCost(Instruction *I, int64_t Offset) const {
+    return 0;
+  }
+
   /// Return true if the specified immediate is legal icmp immediate, that is
   /// the target has icmp instructions which can compare a register against the
   /// immediate without having to materialize the immediate into a register.
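The hunks above thread the hook through the usual TTI layering: the TargetTransformInfo facade dispatches to the polymorphic model, TargetTransformInfoImpl supplies the zero-cost default, and BasicTTIImpl forwards to TargetLowering, so a target only needs to override the single virtual method. A minimal sketch of the intended query pattern from pass code, assuming only the API added above (the helper name is illustrative, not part of this patch):

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

// Hypothetical helper: ask the target how many extra instructions it
// expects when 'UserInst' addresses memory at the given immediate offset.
// A result of 0 means the offset folds into the access for free.
static unsigned fixupPenalty(const TargetTransformInfo &TTI,
                             Instruction *UserInst, int64_t Offset) {
  return TTI.increasedLSRFormulaCost(UserInst, Offset);
}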
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -144,6 +144,11 @@
   return Cost;
 }
 
+unsigned TargetTransformInfo::increasedLSRFormulaCost(Instruction *I,
+                                                      int64_t Offset) const {
+  return TTIImpl->increasedLSRFormulaCost(I, Offset);
+}
+
 bool TargetTransformInfo::isTruncateFree(Type *Ty1, Type *Ty2) const {
   return TTIImpl->isTruncateFree(Ty1, Ty2);
 }
Index: lib/Target/SystemZ/SystemZISelLowering.h
===================================================================
--- lib/Target/SystemZ/SystemZISelLowering.h
+++ lib/Target/SystemZ/SystemZISelLowering.h
@@ -388,6 +388,7 @@
   bool isLegalAddImmediate(int64_t Imm) const override;
   bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
                              Type *Ty, unsigned AS) const override;
+  unsigned increasedLSRFormulaCost(Instruction *I, int64_t Offset) const override;
   bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS,
                                       unsigned Align,
                                       bool *Fast) const override;
Index: lib/Target/SystemZ/SystemZISelLowering.cpp
===================================================================
--- lib/Target/SystemZ/SystemZISelLowering.cpp
+++ lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -20,6 +20,7 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/IR/Intrinsics.h"
 
 #include <cctype>
@@ -524,6 +525,23 @@
   return AM.Scale == 0 || AM.Scale == 1;
 }
 
+unsigned SystemZTargetLowering::increasedLSRFormulaCost(Instruction *I,
+                                                        int64_t Offset) const {
+  if (const LoadInst *LI = dyn_cast<LoadInst>(I)) {
+    // There is little support for big offsets with FP / vector types.
+    if (!isUInt<12>(Offset) &&
+        (LI->getType()->isFloatingPointTy() || LI->getType()->isVectorTy()))
+      return 1;
+  }
+  else if (const StoreInst *StI = dyn_cast<StoreInst>(I)) {
+    // There is no vector store with big offset.
+    if (!isUInt<12>(Offset) && StI->getOperand(0)->getType()->isVectorTy())
+      return 1;
+  }
+
+  return 0;
+}
+
 bool SystemZTargetLowering::isTruncateFree(Type *FromType, Type *ToType) const {
   if (!FromType->isIntegerTy() || !ToType->isIntegerTy())
     return false;
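The SystemZ override above treats any offset outside the unsigned 12-bit displacement range [0, 4095] on an FP or vector load, or on a vector store, as costing one extra instruction. A standalone model of that heuristic, in plain C++ rather than LLVM code, purely for illustration:

#include <cstdint>

// Standalone model of the SystemZ heuristic. Offsets that fit an unsigned
// 12-bit displacement fold into the memory instruction; outside that range
// an FP/vector load or a vector store is charged one extra instruction.
static bool fitsInUInt12(int64_t Offset) {
  return Offset >= 0 && Offset < (int64_t(1) << 12);
}

enum class Access { IntLoad, FPLoad, VecLoad, VecStore, IntStore };

static unsigned increasedCost(Access Kind, int64_t Offset) {
  if (fitsInUInt12(Offset))
    return 0;
  switch (Kind) {
  case Access::FPLoad:   // little support for big offsets with FP types
  case Access::VecLoad:  // ... or with vector types
  case Access::VecStore: // no vector store with a big offset
    return 1;
  default:
    return 0;
  }
}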
Index: lib/Transforms/Scalar/LoopStrengthReduce.cpp
===================================================================
--- lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -858,6 +858,38 @@
 
 namespace {
 
+/// An operand value in an instruction which is to be replaced with some
+/// equivalent, possibly strength-reduced, replacement.
+struct LSRFixup {
+  /// The instruction which will be updated.
+  Instruction *UserInst;
+
+  /// The operand of the instruction which will be replaced. The operand may be
+  /// used more than once; every instance will be replaced.
+  Value *OperandValToReplace;
+
+  /// If this user is to use the post-incremented value of an induction
+  /// variable, this variable is non-null and holds the loop associated with
+  /// the induction variable.
+  PostIncLoopSet PostIncLoops;
+
+  /// The index of the LSRUse describing the expression which this fixup needs,
+  /// minus an offset (below).
+  size_t LUIdx;
+
+  /// A constant offset to be added to the LSRUse expression. This allows
+  /// multiple fixups to share the same LSRUse with different offsets, for
+  /// example in an unrolled loop.
+  int64_t Offset;
+
+  bool isUseFullyOutsideLoop(const Loop *L) const;
+
+  LSRFixup();
+
+  void print(raw_ostream &OS) const;
+  void dump() const;
+};
+
 /// This class is used to measure and compare candidate formulae.
 class Cost {
   /// TODO: Some of these could be merged. Also, a lexical ordering
@@ -902,6 +934,7 @@
                    const SmallVectorImpl<int64_t> &Offsets,
                    ScalarEvolution &SE, DominatorTree &DT,
                    const LSRUse &LU,
+                   const SmallVector<LSRFixup, 16> &Fixups,
                    SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);
 
   void print(raw_ostream &OS) const;
@@ -986,60 +1019,6 @@
   }
 }
 
-void Cost::RateFormula(const TargetTransformInfo &TTI,
-                       const Formula &F,
-                       SmallPtrSetImpl<const SCEV *> &Regs,
-                       const DenseSet<const SCEV *> &VisitedRegs,
-                       const Loop *L,
-                       const SmallVectorImpl<int64_t> &Offsets,
-                       ScalarEvolution &SE, DominatorTree &DT,
-                       const LSRUse &LU,
-                       SmallPtrSetImpl<const SCEV *> *LoserRegs) {
-  assert(F.isCanonical() && "Cost is accurate only for canonical formula");
-  // Tally up the registers.
-  if (const SCEV *ScaledReg = F.ScaledReg) {
-    if (VisitedRegs.count(ScaledReg)) {
-      Lose();
-      return;
-    }
-    RatePrimaryRegister(ScaledReg, Regs, L, SE, DT, LoserRegs);
-    if (isLoser())
-      return;
-  }
-  for (const SCEV *BaseReg : F.BaseRegs) {
-    if (VisitedRegs.count(BaseReg)) {
-      Lose();
-      return;
-    }
-    RatePrimaryRegister(BaseReg, Regs, L, SE, DT, LoserRegs);
-    if (isLoser())
-      return;
-  }
-
-  // Determine how many (unfolded) adds we'll need inside the loop.
-  size_t NumBaseParts = F.getNumRegs();
-  if (NumBaseParts > 1)
-    // Do not count the base and a possible second register if the target
-    // allows to fold 2 registers.
-    NumBaseAdds +=
-        NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(TTI, LU, F)));
-  NumBaseAdds += (F.UnfoldedOffset != 0);
-
-  // Accumulate non-free scaling amounts.
-  ScaleCost += getScalingFactorCost(TTI, LU, F);
-
-  // Tally up the non-zero immediates.
-  for (int64_t O : Offsets) {
-    int64_t Offset = (uint64_t)O + F.BaseOffset;
-    if (F.BaseGV)
-      ImmCost += 64; // Handle symbolic values conservatively.
-                     // TODO: This should probably be the pointer size.
-    else if (Offset != 0)
-      ImmCost += APInt(64, Offset, true).getMinSignedBits();
-  }
-  assert(isValid() && "invalid cost");
-}
-
 /// Set this cost to a losing value.
 void Cost::Lose() {
   NumRegs = ~0u;
@@ -1082,42 +1061,6 @@
   print(errs()); errs() << '\n';
 }
 
-namespace {
-
-/// An operand value in an instruction which is to be replaced with some
-/// equivalent, possibly strength-reduced, replacement.
-struct LSRFixup {
-  /// The instruction which will be updated.
-  Instruction *UserInst;
-
-  /// The operand of the instruction which will be replaced. The operand may be
-  /// used more than once; every instance will be replaced.
-  Value *OperandValToReplace;
-
-  /// If this user is to use the post-incremented value of an induction
-  /// variable, this variable is non-null and holds the loop associated with
-  /// the induction variable.
-  PostIncLoopSet PostIncLoops;
-
-  /// The index of the LSRUse describing the expression which this fixup needs,
-  /// minus an offset (below).
-  size_t LUIdx;
-
-  /// A constant offset to be added to the LSRUse expression. This allows
-  /// multiple fixups to share the same LSRUse with different offsets, for
-  /// example in an unrolled loop.
-  int64_t Offset;
-
-  bool isUseFullyOutsideLoop(const Loop *L) const;
-
-  LSRFixup();
-
-  void print(raw_ostream &OS) const;
-  void dump() const;
-};
-
-}
-
 LSRFixup::LSRFixup()
   : UserInst(nullptr), OperandValToReplace(nullptr), LUIdx(~size_t(0)),
     Offset(0) {}
@@ -1222,6 +1165,8 @@
   int64_t MinOffset;
   int64_t MaxOffset;
 
+  SmallVector<size_t, 4> FixupIndexes;
+
   /// This records whether all of the fixups using this LSRUse are outside of
   /// the loop, in which case some special-case heuristics may be used.
   bool AllFixupsOutsideLoop;
@@ -1263,6 +1208,68 @@
   }
 }
 
+void Cost::RateFormula(const TargetTransformInfo &TTI,
+                       const Formula &F,
+                       SmallPtrSetImpl<const SCEV *> &Regs,
+                       const DenseSet<const SCEV *> &VisitedRegs,
+                       const Loop *L,
+                       const SmallVectorImpl<int64_t> &Offsets,
+                       ScalarEvolution &SE, DominatorTree &DT,
+                       const LSRUse &LU,
+                       const SmallVector<LSRFixup, 16> &Fixups,
+                       SmallPtrSetImpl<const SCEV *> *LoserRegs) {
+  assert(F.isCanonical() && "Cost is accurate only for canonical formula");
+  // Tally up the registers.
+  if (const SCEV *ScaledReg = F.ScaledReg) {
+    if (VisitedRegs.count(ScaledReg)) {
+      Lose();
+      return;
+    }
+    RatePrimaryRegister(ScaledReg, Regs, L, SE, DT, LoserRegs);
+    if (isLoser())
+      return;
+  }
+  for (const SCEV *BaseReg : F.BaseRegs) {
+    if (VisitedRegs.count(BaseReg)) {
+      Lose();
+      return;
+    }
+    RatePrimaryRegister(BaseReg, Regs, L, SE, DT, LoserRegs);
+    if (isLoser())
+      return;
+  }
+
+  // Determine how many (unfolded) adds we'll need inside the loop.
+  size_t NumBaseParts = F.getNumRegs();
+  if (NumBaseParts > 1)
+    // Do not count the base and a possible second register if the target
+    // allows to fold 2 registers.
+    NumBaseAdds +=
+        NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(TTI, LU, F)));
+  NumBaseAdds += (F.UnfoldedOffset != 0);
+
+  // Accumulate non-free scaling amounts.
+  ScaleCost += getScalingFactorCost(TTI, LU, F);
+
+  // Tally up the non-zero immediates.
+  for (int64_t O : Offsets) {
+    int64_t Offset = (uint64_t)O + F.BaseOffset;
+    if (F.BaseGV)
+      ImmCost += 64; // Handle symbolic values conservatively.
+                     // TODO: This should probably be the pointer size.
+    else if (Offset != 0)
+      ImmCost += APInt(64, Offset, true).getMinSignedBits();
+  }
+  assert(isValid() && "invalid cost");
+
+  // Allow target to penalize any Instruction+Offset combination.
+  for (auto &Idx : LU.FixupIndexes) {
+    const LSRFixup &Fixup = Fixups[Idx];
+    int64_t Offset = F.BaseOffset + Fixup.Offset;
+    NumBaseAdds += TTI.increasedLSRFormulaCost(Fixup.UserInst, Offset);
+  }
+}
+
 /// Test whether this use as a formula which has the same registers as the
 /// given formula.
 bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
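Note that the definition of RateFormula is moved below LSRUse (and LSRFixup above it) so the new loop can walk LU.FixupIndexes. The penalty is charged in the same units as NumBaseAdds, so a formula whose effective per-fixup offset (F.BaseOffset + Fixup.Offset) trips the target hook competes as if it needed an extra add inside the loop. A small worked example of the accounting under the SystemZ rule, with hypothetical numbers in plain C++:

#include <cassert>
#include <cstdint>

// Worked example: base offset 4000 with FP-load fixups at +0 and +200.
// 4000 fits the 12-bit range [0, 4095]; 4200 does not, so this formula
// is charged exactly one extra base add.
int main() {
  const int64_t BaseOffset = 4000;
  const int64_t FixupOffsets[] = {0, 200};
  unsigned NumBaseAdds = 0;
  for (int64_t FO : FixupOffsets) {
    int64_t Offset = BaseOffset + FO;
    if (!(Offset >= 0 && Offset < 4096)) // mimics !isUInt<12>(Offset)
      NumBaseAdds += 1;                  // FP load with a big offset
  }
  assert(NumBaseAdds == 1);
  return 0;
}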
@@ -3014,6 +3021,7 @@
     LF.LUIdx = P.first;
     LF.Offset = P.second;
     LSRUse &LU = Uses[LF.LUIdx];
+    LU.FixupIndexes.push_back(Fixups.size() - 1);
     LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
     if (!LU.WidestFixupType ||
         SE.getTypeSizeInBits(LU.WidestFixupType) <
@@ -3157,6 +3165,7 @@
     LF.LUIdx = P.first;
     LF.Offset = P.second;
     LSRUse &LU = Uses[LF.LUIdx];
+    LU.FixupIndexes.push_back(Fixups.size() - 1);
     LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
     if (!LU.WidestFixupType ||
         SE.getTypeSizeInBits(LU.WidestFixupType) <
@@ -3892,7 +3901,7 @@
       Cost CostF;
       Regs.clear();
       CostF.RateFormula(TTI, F, Regs, VisitedRegs, L, LU.Offsets, SE, DT, LU,
-                        &LoserRegs);
+                        Fixups, &LoserRegs);
       if (CostF.isLoser()) {
         // During initial formula generation, undesirable formulae are generated
         // by uses within other loops that have some non-trivial address mode or
@@ -3926,7 +3935,7 @@
         Cost CostBest;
         Regs.clear();
         CostBest.RateFormula(TTI, Best, Regs, VisitedRegs, L, LU.Offsets, SE,
-                             DT, LU);
+                             DT, LU, Fixups);
         if (CostF < CostBest)
           std::swap(F, Best);
         DEBUG(dbgs() << "  Filtering out formula "; F.print(dbgs());
@@ -4266,7 +4275,7 @@
     NewCost = CurCost;
     NewRegs = CurRegs;
     NewCost.RateFormula(TTI, F, NewRegs, VisitedRegs, L, LU.Offsets, SE, DT,
-                        LU);
+                        LU, Fixups);
     if (NewCost < SolutionCost) {
       Workspace.push_back(&F);
       if (Workspace.size() != Uses.size()) {
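Other targets can opt into the same mechanism by overriding the TargetLowering hook. A hedged sketch of what such an override could contain for a hypothetical target whose memory instructions encode only a signed 9-bit displacement; the target, threshold, and free-function framing are all invented for illustration:

#include "llvm/IR/Instructions.h"
#include "llvm/Support/MathExtras.h"

using namespace llvm;

// Hypothetical override body (not part of this patch). In a real target it
// would be declared in the TargetLowering subclass as
//   unsigned increasedLSRFormulaCost(Instruction *I,
//                                    int64_t Offset) const override;
static unsigned myTargetIncreasedLSRFormulaCost(Instruction *I,
                                                int64_t Offset) {
  // Invented rule: stores only fold a signed 9-bit displacement, so any
  // store outside that range needs one extra address computation.
  if (isa<StoreInst>(I) && !isInt<9>(Offset))
    return 1;
  return 0;
}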