Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -351,6 +351,12 @@
                                 bool HasBaseReg, int64_t Scale,
                                 unsigned AddrSpace = 0) const;
 
+  /// \brief Return true if the target supports the load / store
+  /// instruction with the given Offset in the form reg + Offset. The
+  /// Offset may be too big to be folded for a certain type (register
+  /// class).
+  bool isFoldableMemAccessOffset(Instruction *I, int64_t Offset) const;
+
   /// \brief Return true if it's free to truncate a value of type Ty1 to type
   /// Ty2. e.g. On x86 it's free to truncate a i32 value in register EAX to i16
   /// by referencing its sub-register AX.
@@ -640,6 +646,7 @@
   virtual int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
                                    int64_t BaseOffset, bool HasBaseReg,
                                    int64_t Scale, unsigned AddrSpace) = 0;
+  virtual bool isFoldableMemAccessOffset(Instruction *I, int64_t Offset) = 0;
   virtual bool isTruncateFree(Type *Ty1, Type *Ty2) = 0;
   virtual bool isProfitableToHoist(Instruction *I) = 0;
   virtual bool isTypeLegal(Type *Ty) = 0;
@@ -794,6 +801,9 @@
     return Impl.getScalingFactorCost(Ty, BaseGV, BaseOffset, HasBaseReg,
                                      Scale, AddrSpace);
   }
+  bool isFoldableMemAccessOffset(Instruction *I, int64_t Offset) override {
+    return Impl.isFoldableMemAccessOffset(I, Offset);
+  }
   bool isTruncateFree(Type *Ty1, Type *Ty2) override {
     return Impl.isTruncateFree(Ty1, Ty2);
   }
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -226,6 +226,8 @@
     return -1;
   }
 
+  bool isFoldableMemAccessOffset(Instruction *I, int64_t Offset) { return true; }
+
   bool isTruncateFree(Type *Ty1, Type *Ty2) { return false; }
 
   bool isProfitableToHoist(Instruction *I) { return true; }
Index: include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- include/llvm/CodeGen/BasicTTIImpl.h
+++ include/llvm/CodeGen/BasicTTIImpl.h
@@ -139,6 +139,10 @@
     return getTLI()->getScalingFactorCost(DL, AM, Ty, AddrSpace);
   }
 
+  bool isFoldableMemAccessOffset(Instruction *I, int64_t Offset) {
+    return getTLI()->isFoldableMemAccessOffset(I, Offset);
+  }
+
   bool isTruncateFree(Type *Ty1, Type *Ty2) {
     return getTLI()->isTruncateFree(Ty1, Ty2);
   }
Index: include/llvm/Target/TargetLowering.h
===================================================================
--- include/llvm/Target/TargetLowering.h
+++ include/llvm/Target/TargetLowering.h
@@ -1606,6 +1606,10 @@
     return -1;
   }
 
+  virtual bool isFoldableMemAccessOffset(Instruction *I, int64_t Offset) const {
+    return true;
+  }
+
   /// Return true if the specified immediate is legal icmp immediate, that is
   /// the target has icmp instructions which can compare a register against the
   /// immediate without having to materialize the immediate into a register.
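[Reviewer note] A backend opts in simply by overriding the TargetLowering default above. The sketch below shows the shape of such an override; it is hypothetical (MyTargetLowering and the 9-bit displacement bound are invented for illustration; the SystemZ override later in this patch is the real use):

  #include "llvm/IR/Instructions.h"
  #include "llvm/Support/MathExtras.h"

  using namespace llvm;

  // Imaginary target whose FP loads can only fold a 9-bit unsigned
  // displacement; larger reg + Offset forms would need a separate add,
  // which LSR's cost model can now account for.
  bool MyTargetLowering::isFoldableMemAccessOffset(Instruction *I,
                                                   int64_t Offset) const {
    if (auto *LI = dyn_cast<LoadInst>(I))
      if (LI->getType()->isFloatingPointTy() && !isUInt<9>(Offset))
        return false;
    return true;
  }
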
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -145,6 +145,11 @@
   return Cost;
 }
 
+bool TargetTransformInfo::isFoldableMemAccessOffset(Instruction *I,
+                                                    int64_t Offset) const {
+  return TTIImpl->isFoldableMemAccessOffset(I, Offset);
+}
+
 bool TargetTransformInfo::isTruncateFree(Type *Ty1, Type *Ty2) const {
   return TTIImpl->isTruncateFree(Ty1, Ty2);
 }
Index: lib/Target/SystemZ/SystemZISelLowering.h
===================================================================
--- lib/Target/SystemZ/SystemZISelLowering.h
+++ lib/Target/SystemZ/SystemZISelLowering.h
@@ -388,6 +388,7 @@
   bool isLegalAddImmediate(int64_t Imm) const override;
   bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
                              unsigned AS) const override;
+  bool isFoldableMemAccessOffset(Instruction *I, int64_t Offset) const override;
   bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS,
                                       unsigned Align,
                                       bool *Fast) const override;
Index: lib/Target/SystemZ/SystemZISelLowering.cpp
===================================================================
--- lib/Target/SystemZ/SystemZISelLowering.cpp
+++ lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -20,6 +20,7 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/IR/Intrinsics.h"
 #include <cctype>
 
@@ -524,6 +525,23 @@
   return AM.Scale == 0 || AM.Scale == 1;
 }
 
+bool SystemZTargetLowering::isFoldableMemAccessOffset(Instruction *I,
+                                                      int64_t Offset) const {
+  if (const LoadInst *LI = dyn_cast<LoadInst>(I)) {
+    // There is little support for big offsets with FP / vector types.
+    if (!isUInt<12>(Offset) &&
+        (LI->getType()->isFloatingPointTy() || LI->getType()->isVectorTy()))
+      return false;
+  } else if (const StoreInst *StI = dyn_cast<StoreInst>(I)) {
+    // There is no vector store with a big offset.
+    if (!isUInt<12>(Offset) && StI->getOperand(0)->getType()->isVectorTy())
+      return false;
+  }
+
+  return true;
+}
+
 bool SystemZTargetLowering::isTruncateFree(Type *FromType, Type *ToType) const {
   if (!FromType->isIntegerTy() || !ToType->isIntegerTy())
     return false;
Index: lib/Transforms/Scalar/LoopStrengthReduce.cpp
===================================================================
--- lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -883,7 +883,6 @@
                    SmallPtrSetImpl<const SCEV *> &Regs,
                    const DenseSet<const SCEV *> &VisitedRegs,
                    const Loop *L,
-                   const SmallVectorImpl<int64_t> &Offsets,
                    ScalarEvolution &SE, DominatorTree &DT,
                    const LSRUse &LU,
                    SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);
@@ -902,6 +901,143 @@
                        ScalarEvolution &SE, DominatorTree &DT,
                        SmallPtrSetImpl<const SCEV *> *LoserRegs);
 };
+
+/// An operand value in an instruction which is to be replaced with some
+/// equivalent, possibly strength-reduced, replacement.
+struct LSRFixup {
+  /// The instruction which will be updated.
+  Instruction *UserInst;
+
+  /// The operand of the instruction which will be replaced. The operand may be
+  /// used more than once; every instance will be replaced.
+  Value *OperandValToReplace;
+
+  /// If this user is to use the post-incremented value of an induction
+  /// variable, this variable is non-null and holds the loop associated with
+  /// the induction variable.
+  PostIncLoopSet PostIncLoops;
+
+  /// A constant offset to be added to the LSRUse expression.
+  /// This allows multiple fixups to share the same LSRUse with different
+  /// offsets, for example in an unrolled loop.
+  int64_t Offset;
+
+  bool isUseFullyOutsideLoop(const Loop *L) const;
+
+  LSRFixup();
+
+  void print(raw_ostream &OS) const;
+  void dump() const;
+};
+
+/// A DenseMapInfo implementation for holding DenseMaps and DenseSets of sorted
+/// SmallVectors of const SCEV*.
+struct UniquifierDenseMapInfo {
+  static SmallVector<const SCEV *, 4> getEmptyKey() {
+    SmallVector<const SCEV *, 4> V;
+    V.push_back(reinterpret_cast<const SCEV *>(-1));
+    return V;
+  }
+
+  static SmallVector<const SCEV *, 4> getTombstoneKey() {
+    SmallVector<const SCEV *, 4> V;
+    V.push_back(reinterpret_cast<const SCEV *>(-2));
+    return V;
+  }
+
+  static unsigned getHashValue(const SmallVector<const SCEV *, 4> &V) {
+    return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
+  }
+
+  static bool isEqual(const SmallVector<const SCEV *, 4> &LHS,
+                      const SmallVector<const SCEV *, 4> &RHS) {
+    return LHS == RHS;
+  }
+};
+
+/// This class holds the state that LSR keeps for each use in IVUsers, as well
+/// as uses invented by LSR itself. It includes information about what kinds of
+/// things can be folded into the user, information about the user itself, and
+/// information about how the use may be satisfied. TODO: Represent multiple
+/// users of the same expression in common?
+class LSRUse {
+  DenseSet<SmallVector<const SCEV *, 4>, UniquifierDenseMapInfo> Uniquifier;
+
+public:
+  /// An enum for a kind of use, indicating what types of scaled and immediate
+  /// operands it might support.
+  enum KindType {
+    Basic,   ///< A normal use, with no folding.
+    Special, ///< A special case of basic, allowing -1 scales.
+    Address, ///< An address use; folding according to TargetLowering
+    ICmpZero ///< An equality icmp with both operands folded into one.
+    // TODO: Add a generic icmp too?
+  };
+
+  typedef PointerIntPair<const SCEV *, 2, KindType> SCEVUseKindPair;
+
+  KindType Kind;
+  MemAccessTy AccessTy;
+
+  /// The list of operands which are to be replaced.
+  SmallVector<LSRFixup, 8> Fixups;
+
+  /// Keep track of the min and max offsets of the fixups.
+  int64_t MinOffset;
+  int64_t MaxOffset;
+
+  /// This records whether all of the fixups using this LSRUse are outside of
+  /// the loop, in which case some special-case heuristics may be used.
+  bool AllFixupsOutsideLoop;
+
+  /// RigidFormula is set to true to guarantee that this use will be associated
+  /// with a single formula--the one that initially matched. Some SCEV
+  /// expressions cannot be expanded. This allows LSR to consider the registers
+  /// used by those expressions without the need to expand them later after
+  /// changing the formula.
+  bool RigidFormula;
+
+  /// This records the widest use type for any fixup using this
+  /// LSRUse. FindUseWithSimilarFormula can't consider uses with different max
+  /// fixup widths to be equivalent, because the narrower one may be relying on
+  /// the implicit truncation to truncate away bogus bits.
+  Type *WidestFixupType;
+
+  /// A list of ways to build a value that can satisfy this user. After the
+  /// list is populated, one of these is selected heuristically and used to
+  /// formulate a replacement for OperandValToReplace in UserInst.
+  SmallVector<Formula, 12> Formulae;
+
+  /// The set of register candidates used by all formulae in this LSRUse.
+  SmallPtrSet<const SCEV *, 4> Regs;
+
+  LSRUse(KindType K, MemAccessTy AT)
+      : Kind(K), AccessTy(AT), MinOffset(INT64_MAX), MaxOffset(INT64_MIN),
+        AllFixupsOutsideLoop(true), RigidFormula(false),
+        WidestFixupType(nullptr) {}
+
+  LSRFixup &getNewFixup() {
+    Fixups.push_back(LSRFixup());
+    return Fixups.back();
+  }
+
+  void pushFixup(LSRFixup &f) {
+    Fixups.push_back(f);
+    if (f.Offset > MaxOffset)
+      MaxOffset = f.Offset;
+    if (f.Offset < MinOffset)
+      MinOffset = f.Offset;
+  }
+
+  bool HasFormulaWithSameRegs(const Formula &F) const;
+  bool InsertFormula(const Formula &F);
+  void DeleteFormula(Formula &F);
+  void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);
+
+  void print(raw_ostream &OS) const;
+  void dump() const;
+};
 
 }
@@ -975,7 +1111,6 @@
                        SmallPtrSetImpl<const SCEV *> &Regs,
                        const DenseSet<const SCEV *> &VisitedRegs,
                        const Loop *L,
-                       const SmallVectorImpl<int64_t> &Offsets,
                        ScalarEvolution &SE, DominatorTree &DT,
                        const LSRUse &LU,
                        SmallPtrSetImpl<const SCEV *> *LoserRegs) {
@@ -1013,13 +1148,19 @@
   ScaleCost += getScalingFactorCost(TTI, LU, F);
 
   // Tally up the non-zero immediates.
-  for (int64_t O : Offsets) {
+  for (const LSRFixup &Fixup : LU.Fixups) {
+    int64_t O = Fixup.Offset;
     int64_t Offset = (uint64_t)O + F.BaseOffset;
     if (F.BaseGV)
       ImmCost += 64; // Handle symbolic values conservatively.
                      // TODO: This should probably be the pointer size.
     else if (Offset != 0)
       ImmCost += APInt(64, Offset, true).getMinSignedBits();
+
+    // Check with the target if this offset with this instruction is
+    // specifically not supported.
+    if (!TTI.isFoldableMemAccessOffset(Fixup.UserInst, Offset))
+      NumBaseAdds++;
   }
   assert(isValid() && "invalid cost");
 }
@@ -1066,44 +1207,8 @@
   print(errs()); errs() << '\n';
 }
 
-namespace {
-
-/// An operand value in an instruction which is to be replaced with some
-/// equivalent, possibly strength-reduced, replacement.
-struct LSRFixup {
-  /// The instruction which will be updated.
-  Instruction *UserInst;
-
-  /// The operand of the instruction which will be replaced. The operand may be
-  /// used more than once; every instance will be replaced.
-  Value *OperandValToReplace;
-
-  /// If this user is to use the post-incremented value of an induction
-  /// variable, this variable is non-null and holds the loop associated with
-  /// the induction variable.
-  PostIncLoopSet PostIncLoops;
-
-  /// The index of the LSRUse describing the expression which this fixup needs,
-  /// minus an offset (below).
-  size_t LUIdx;
-
-  /// A constant offset to be added to the LSRUse expression. This allows
-  /// multiple fixups to share the same LSRUse with different offsets, for
-  /// example in an unrolled loop.
-  int64_t Offset;
-
-  bool isUseFullyOutsideLoop(const Loop *L) const;
-
-  LSRFixup();
-
-  void print(raw_ostream &OS) const;
-  void dump() const;
-};
-
-}
-
 LSRFixup::LSRFixup()
-    : UserInst(nullptr), OperandValToReplace(nullptr), LUIdx(~size_t(0)),
+    : UserInst(nullptr), OperandValToReplace(nullptr),
       Offset(0) {}
 
 /// Test whether this fixup always uses its value outside of the given loop.
@@ -1139,9 +1244,6 @@
     PIL->getHeader()->printAsOperand(OS, /*PrintType=*/false);
   }
 
-  if (LUIdx != ~size_t(0))
-    OS << ", LUIdx=" << LUIdx;
-
   if (Offset != 0)
     OS << ", Offset=" << Offset;
 }
@@ -1151,102 +1253,6 @@
   print(errs()); errs() << '\n';
 }
 
-namespace {
-
-/// A DenseMapInfo implementation for holding DenseMaps and DenseSets of sorted
-/// SmallVectors of const SCEV*.
-struct UniquifierDenseMapInfo {
-  static SmallVector<const SCEV *, 4> getEmptyKey() {
-    SmallVector<const SCEV *, 4> V;
-    V.push_back(reinterpret_cast<const SCEV *>(-1));
-    return V;
-  }
-
-  static SmallVector<const SCEV *, 4> getTombstoneKey() {
-    SmallVector<const SCEV *, 4> V;
-    V.push_back(reinterpret_cast<const SCEV *>(-2));
-    return V;
-  }
-
-  static unsigned getHashValue(const SmallVector<const SCEV *, 4> &V) {
-    return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
-  }
-
-  static bool isEqual(const SmallVector<const SCEV *, 4> &LHS,
-                      const SmallVector<const SCEV *, 4> &RHS) {
-    return LHS == RHS;
-  }
-};
-
-/// This class holds the state that LSR keeps for each use in IVUsers, as well
-/// as uses invented by LSR itself. It includes information about what kinds of
-/// things can be folded into the user, information about the user itself, and
-/// information about how the use may be satisfied. TODO: Represent multiple
-/// users of the same expression in common?
-class LSRUse {
-  DenseSet<SmallVector<const SCEV *, 4>, UniquifierDenseMapInfo> Uniquifier;
-
-public:
-  /// An enum for a kind of use, indicating what types of scaled and immediate
-  /// operands it might support.
-  enum KindType {
-    Basic,   ///< A normal use, with no folding.
-    Special, ///< A special case of basic, allowing -1 scales.
-    Address, ///< An address use; folding according to TargetLowering
-    ICmpZero ///< An equality icmp with both operands folded into one.
-    // TODO: Add a generic icmp too?
-  };
-
-  typedef PointerIntPair<const SCEV *, 2, KindType> SCEVUseKindPair;
-
-  KindType Kind;
-  MemAccessTy AccessTy;
-
-  SmallVector<int64_t, 8> Offsets;
-  int64_t MinOffset;
-  int64_t MaxOffset;
-
-  /// This records whether all of the fixups using this LSRUse are outside of
-  /// the loop, in which case some special-case heuristics may be used.
-  bool AllFixupsOutsideLoop;
-
-  /// RigidFormula is set to true to guarantee that this use will be associated
-  /// with a single formula--the one that initially matched. Some SCEV
-  /// expressions cannot be expanded. This allows LSR to consider the registers
-  /// used by those expressions without the need to expand them later after
-  /// changing the formula.
-  bool RigidFormula;
-
-  /// This records the widest use type for any fixup using this
-  /// LSRUse. FindUseWithSimilarFormula can't consider uses with different max
-  /// fixup widths to be equivalent, because the narrower one may be relying on
-  /// the implicit truncation to truncate away bogus bits.
-  Type *WidestFixupType;
-
-  /// A list of ways to build a value that can satisfy this user. After the
-  /// list is populated, one of these is selected heuristically and used to
-  /// formulate a replacement for OperandValToReplace in UserInst.
-  SmallVector<Formula, 12> Formulae;
-
-  /// The set of register candidates used by all formulae in this LSRUse.
-  SmallPtrSet<const SCEV *, 4> Regs;
-
-  LSRUse(KindType K, MemAccessTy AT)
-      : Kind(K), AccessTy(AT), MinOffset(INT64_MAX), MaxOffset(INT64_MIN),
-        AllFixupsOutsideLoop(true), RigidFormula(false),
-        WidestFixupType(nullptr) {}
-
-  bool HasFormulaWithSameRegs(const Formula &F) const;
-  bool InsertFormula(const Formula &F);
-  void DeleteFormula(Formula &F);
-  void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);
-
-  void print(raw_ostream &OS) const;
-  void dump() const;
-};
-
-}
-
 /// Test whether this use as a formula which has the same registers as the given
 /// formula.
 bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
@@ -1334,9 +1340,9 @@
 
   OS << ", Offsets={";
   bool NeedComma = false;
-  for (int64_t O : Offsets) {
+  for (const LSRFixup &Fixup : Fixups) {
     if (NeedComma) OS << ',';
-    OS << O;
+    OS << Fixup.Offset;
     NeedComma = true;
   }
   OS << '}';
@@ -1643,9 +1649,6 @@
   /// Interesting use types, to facilitate truncation reuse.
   SmallSetVector<Type *, 4> Types;
 
-  /// The list of operands which are to be replaced.
-  SmallVector<LSRFixup, 64> Fixups;
-
   /// The list of interesting uses.
   SmallVector<LSRUse, 16> Uses;
@@ -1678,11 +1681,6 @@
   void CollectInterestingTypesAndFactors();
   void CollectFixupsAndInitialFormulae();
 
-  LSRFixup &getNewFixup() {
-    Fixups.push_back(LSRFixup());
-    return Fixups.back();
-  }
-
   // Support for sharing of LSRUses between LSRFixups.
   typedef DenseMap<LSRUse::SCEVUseKindPair, size_t> UseMapTy;
   UseMapTy UseMap;
@@ -1752,16 +1750,16 @@
                          const LSRUse &LU,
                          SCEVExpander &Rewriter) const;
 
-  Value *Expand(const LSRFixup &LF,
+  Value *Expand(const LSRUse &LU, const LSRFixup &LF,
                 const Formula &F,
                 BasicBlock::iterator IP,
                 SCEVExpander &Rewriter,
                 SmallVectorImpl<WeakVH> &DeadInsts) const;
-  void RewriteForPHI(PHINode *PN, const LSRFixup &LF,
+  void RewriteForPHI(PHINode *PN, const LSRUse &LU, const LSRFixup &LF,
                      const Formula &F,
                      SCEVExpander &Rewriter,
                      SmallVectorImpl<WeakVH> &DeadInsts) const;
-  void Rewrite(const LSRFixup &LF,
+  void Rewrite(const LSRUse &LU, const LSRFixup &LF,
               const Formula &F,
               SCEVExpander &Rewriter,
               SmallVectorImpl<WeakVH> &DeadInsts) const;
@@ -2241,8 +2239,6 @@
   LU.MinOffset = NewMinOffset;
   LU.MaxOffset = NewMaxOffset;
   LU.AccessTy = NewAccessTy;
-  if (NewOffset != LU.Offsets.back())
-    LU.Offsets.push_back(NewOffset);
   return true;
 }
@@ -2279,11 +2275,6 @@
   Uses.push_back(LSRUse(Kind, AccessTy));
   LSRUse &LU = Uses[LUIdx];
 
-  // We don't need to track redundant offsets, but we don't need to go out
-  // of our way here to avoid them.
-  if (LU.Offsets.empty() || Offset != LU.Offsets.back())
-    LU.Offsets.push_back(Offset);
-
   LU.MinOffset = Offset;
   LU.MaxOffset = Offset;
   return std::make_pair(LUIdx, Offset);
@@ -2941,33 +2932,28 @@
     if (IVIncSet.count(UseI))
       continue;
 
-    // Record the uses.
-    LSRFixup &LF = getNewFixup();
-    LF.UserInst = UserInst;
-    LF.OperandValToReplace = U.getOperandValToReplace();
-    LF.PostIncLoops = U.getPostIncLoops();
-
     LSRUse::KindType Kind = LSRUse::Basic;
     MemAccessTy AccessTy;
-    if (isAddressUse(LF.UserInst, LF.OperandValToReplace)) {
+    if (isAddressUse(UserInst, U.getOperandValToReplace())) {
       Kind = LSRUse::Address;
-      AccessTy = getAccessType(LF.UserInst);
+      AccessTy = getAccessType(UserInst);
     }
 
     const SCEV *S = IU.getExpr(U);
-
+    PostIncLoopSet TmpPostIncLoops = U.getPostIncLoops();
+
     // Equality (== and !=) ICmps are special. We can rewrite (i == N) as
     // (N - i == 0), and this allows (N - i) to be the expression that we work
     // with rather than just N or i, so we can consider the register
     // requirements for both N and i at the same time. Limiting this code to
    // equality icmps is not a problem because all interesting loops use
     // equality icmps, thanks to IndVarSimplify.
-    if (ICmpInst *CI = dyn_cast<ICmpInst>(LF.UserInst))
+    if (ICmpInst *CI = dyn_cast<ICmpInst>(UserInst))
       if (CI->isEquality()) {
         // Swap the operands if needed to put the OperandValToReplace on the
         // left, for consistency.
         Value *NV = CI->getOperand(1);
-        if (NV == LF.OperandValToReplace) {
+        if (NV == U.getOperandValToReplace()) {
           CI->setOperand(1, CI->getOperand(0));
           CI->setOperand(0, NV);
           NV = CI->getOperand(1);
@@ -2980,7 +2966,7 @@
           // S is normalized, so normalize N before folding it into S
           // to keep the result normalized.
           N = TransformForPostIncUse(Normalize, N, CI, nullptr,
-                                     LF.PostIncLoops, SE, DT);
+                                     TmpPostIncLoops, SE, DT);
           Kind = LSRUse::ICmpZero;
           S = SE.getMinusSCEV(N, S);
         }
@@ -2993,12 +2979,20 @@
       Factors.insert(-1);
     }
 
-    // Set up the initial formula for this use.
+    // Get or create an LSRUse.
     std::pair<size_t, int64_t> P = getUse(S, Kind, AccessTy);
-    LF.LUIdx = P.first;
-    LF.Offset = P.second;
-    LSRUse &LU = Uses[LF.LUIdx];
+    size_t LUIdx = P.first;
+    int64_t Offset = P.second;
+    LSRUse &LU = Uses[LUIdx];
+
+    // Record the fixup.
+    LSRFixup &LF = LU.getNewFixup();
+    LF.UserInst = UserInst;
+    LF.OperandValToReplace = U.getOperandValToReplace();
+    LF.PostIncLoops = TmpPostIncLoops;
+    LF.Offset = Offset;
     LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
+
     if (!LU.WidestFixupType ||
         SE.getTypeSizeInBits(LU.WidestFixupType) <
         SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
@@ -3006,8 +3000,8 @@
 
     // If this is the first use of this LSRUse, give it a formula.
     if (LU.Formulae.empty()) {
-      InsertInitialFormula(S, LU, LF.LUIdx);
-      CountRegisters(LU.Formulae.back(), LF.LUIdx);
+      InsertInitialFormula(S, LU, LUIdx);
+      CountRegisters(LU.Formulae.back(), LUIdx);
     }
   }
 
@@ -3133,20 +3127,21 @@
         continue;
       }
 
-      LSRFixup &LF = getNewFixup();
-      LF.UserInst = const_cast<Instruction *>(UserInst);
-      LF.OperandValToReplace = U;
       std::pair<size_t, int64_t> P = getUse(
           S, LSRUse::Basic, MemAccessTy());
-      LF.LUIdx = P.first;
-      LF.Offset = P.second;
-      LSRUse &LU = Uses[LF.LUIdx];
+      size_t LUIdx = P.first;
+      int64_t Offset = P.second;
+      LSRUse &LU = Uses[LUIdx];
+      LSRFixup &LF = LU.getNewFixup();
+      LF.UserInst = const_cast<Instruction *>(UserInst);
+      LF.OperandValToReplace = U;
+      LF.Offset = Offset;
       LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
       if (!LU.WidestFixupType ||
           SE.getTypeSizeInBits(LU.WidestFixupType) <
          SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
        LU.WidestFixupType = LF.OperandValToReplace->getType();
-      InsertSupplementalFormula(US, LU, LF.LUIdx);
+      InsertSupplementalFormula(US, LU, LUIdx);
       CountRegisters(LU.Formulae.back(), Uses.size() - 1);
       break;
     }
@@ -3875,8 +3870,7 @@
       // the corresponding bad register from the Regs set.
       Cost CostF;
       Regs.clear();
-      CostF.RateFormula(TTI, F, Regs, VisitedRegs, L, LU.Offsets, SE, DT, LU,
-                        &LoserRegs);
+      CostF.RateFormula(TTI, F, Regs, VisitedRegs, L, SE, DT, LU, &LoserRegs);
      if (CostF.isLoser()) {
         // During initial formula generation, undesirable formulae are generated
         // by uses within other loops that have some non-trivial address mode or
@@ -3909,8 +3903,7 @@
 
         Cost CostBest;
         Regs.clear();
-        CostBest.RateFormula(TTI, Best, Regs, VisitedRegs, L, LU.Offsets, SE,
-                             DT, LU);
+        CostBest.RateFormula(TTI, Best, Regs, VisitedRegs, L, SE, DT, LU);
         if (CostF < CostBest)
           std::swap(F, Best);
         DEBUG(dbgs() << "  Filtering out formula "; F.print(dbgs());
@@ -4056,25 +4049,13 @@
 
         LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
 
-        // Update the relocs to reference the new use.
-        for (LSRFixup &Fixup : Fixups) {
-          if (Fixup.LUIdx == LUIdx) {
-            Fixup.LUIdx = LUThatHas - &Uses.front();
-            Fixup.Offset += F.BaseOffset;
-            // Add the new offset to LUThatHas' offset list.
-            if (LUThatHas->Offsets.back() != Fixup.Offset) {
-              LUThatHas->Offsets.push_back(Fixup.Offset);
-              if (Fixup.Offset > LUThatHas->MaxOffset)
-                LUThatHas->MaxOffset = Fixup.Offset;
-              if (Fixup.Offset < LUThatHas->MinOffset)
-                LUThatHas->MinOffset = Fixup.Offset;
-            }
-            DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n');
-          }
-          if (Fixup.LUIdx == NumUses-1)
-            Fixup.LUIdx = LUIdx;
-        }
-
+        // Transfer the fixups of LU to LUThatHas.
+        for (LSRFixup &Fixup : LU.Fixups) {
+          Fixup.Offset += F.BaseOffset;
+          LUThatHas->pushFixup(Fixup);
+          DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n');
+        }
-
+
         // Delete formulae from the new use which are no longer legal.
         bool Any = false;
         for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
@@ -4249,8 +4230,7 @@
       // the current best, prune the search at that point.
       NewCost = CurCost;
       NewRegs = CurRegs;
-      NewCost.RateFormula(TTI, F, NewRegs, VisitedRegs, L, LU.Offsets, SE, DT,
-                          LU);
+      NewCost.RateFormula(TTI, F, NewRegs, VisitedRegs, L, SE, DT, LU);
       if (NewCost < SolutionCost) {
         Workspace.push_back(&F);
         if (Workspace.size() != Uses.size()) {
@@ -4433,12 +4413,12 @@
 
 /// Emit instructions for the leading candidate expression for this LSRUse (this
 /// is called "expanding").
-Value *LSRInstance::Expand(const LSRFixup &LF,
+Value *LSRInstance::Expand(const LSRUse &LU,
+                           const LSRFixup &LF,
                            const Formula &F,
                            BasicBlock::iterator IP,
                           SCEVExpander &Rewriter,
                           SmallVectorImpl<WeakVH> &DeadInsts) const {
-  const LSRUse &LU = Uses[LF.LUIdx];
   if (LU.RigidFormula)
     return LF.OperandValToReplace;
 
@@ -4619,6 +4599,7 @@
 /// effectively happens in their predecessor blocks, so the expression may need
 /// to be expanded in multiple places.
 void LSRInstance::RewriteForPHI(PHINode *PN,
+                                const LSRUse &LU,
                                 const LSRFixup &LF,
                                 const Formula &F,
                                 SCEVExpander &Rewriter,
@@ -4672,7 +4653,7 @@
       if (!Pair.second)
         PN->setIncomingValue(i, Pair.first->second);
       else {
-        Value *FullV = Expand(LF, F, BB->getTerminator()->getIterator(),
+        Value *FullV = Expand(LU, LF, F, BB->getTerminator()->getIterator(),
                               Rewriter, DeadInsts);
 
         // If this is reuse-by-noop-cast, insert the noop cast.
@@ -4693,17 +4674,18 @@
 /// Emit instructions for the leading candidate expression for this LSRUse (this
 /// is called "expanding"), and update the UserInst to reference the newly
 /// expanded value.
-void LSRInstance::Rewrite(const LSRFixup &LF,
+void LSRInstance::Rewrite(const LSRUse &LU,
+                          const LSRFixup &LF,
                           const Formula &F,
                           SCEVExpander &Rewriter,
                          SmallVectorImpl<WeakVH> &DeadInsts) const {
   // First, find an insertion point that dominates UserInst. For PHI nodes,
   // find the nearest block which dominates all the relevant uses.
   if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) {
-    RewriteForPHI(PN, LF, F, Rewriter, DeadInsts);
+    RewriteForPHI(PN, LU, LF, F, Rewriter, DeadInsts);
   } else {
     Value *FullV =
-        Expand(LF, F, LF.UserInst->getIterator(), Rewriter, DeadInsts);
+        Expand(LU, LF, F, LF.UserInst->getIterator(), Rewriter, DeadInsts);
 
     // If this is reuse-by-noop-cast, insert the noop cast.
     Type *OpTy = LF.OperandValToReplace->getType();
@@ -4719,7 +4701,7 @@
     // its new value may happen to be equal to LF.OperandValToReplace, in
     // which case doing replaceUsesOfWith leads to replacing both operands
     // with the same value. TODO: Reorganize this.
-    if (Uses[LF.LUIdx].Kind == LSRUse::ICmpZero)
+    if (LU.Kind == LSRUse::ICmpZero)
       LF.UserInst->setOperand(0, FullV);
     else
       LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, FullV);
@@ -4752,11 +4734,11 @@
   }
 
   // Expand the new value definitions and update the users.
-  for (const LSRFixup &Fixup : Fixups) {
-    Rewrite(Fixup, *Solution[Fixup.LUIdx], Rewriter, DeadInsts);
-
-    Changed = true;
-  }
+  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx)
+    for (const LSRFixup &Fixup : Uses[LUIdx].Fixups) {
+      Rewrite(Uses[LUIdx], Fixup, *Solution[LUIdx], Rewriter, DeadInsts);
+      Changed = true;
+    }
 
   for (const IVChain &Chain : IVChainVec) {
     GenerateIVChain(Chain, Rewriter, DeadInsts);
@@ -4900,11 +4882,12 @@
 
 void LSRInstance::print_fixups(raw_ostream &OS) const {
   OS << "LSR is examining the following fixup sites:\n";
-  for (const LSRFixup &LF : Fixups) {
-    dbgs() << "  ";
-    LF.print(OS);
-    OS << '\n';
-  }
+  for (const LSRUse &LU : Uses)
+    for (const LSRFixup &LF : LU.Fixups) {
+      dbgs() << "  ";
+      LF.print(OS);
+      OS << '\n';
+    }
 }
 
 void LSRInstance::print_uses(raw_ostream &OS) const {
Index: test/CodeGen/SystemZ/loop-01.ll
===================================================================
--- test/CodeGen/SystemZ/loop-01.ll
+++ test/CodeGen/SystemZ/loop-01.ll
@@ -1,6 +1,8 @@
 ; Test loop tuning.
 ;
 ; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 \
+; RUN:   | FileCheck %s -check-prefix=CHECK-Z13
 
 ; Test that strength reduction is applied to addresses with a scale factor,
 ; but that indexed addressing can still be used.
@@ -122,3 +124,118 @@
 exit:
   ret void
 }
+
+; Test that negative offsets are avoided for loads of floating point.
+%s.float = type { float, float, float }
+define void @f5(%s.float* nocapture %a,
+                %s.float* nocapture readonly %b,
+                i32 zeroext %S) {
+; CHECK-LABEL: f5:
+; CHECK-NOT: -{{[0-9]+}}(%r
+
+entry:
+  %cmp9 = icmp eq i32 %S, 0
+  br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:
+  br label %for.body
+
+for.cond.cleanup.loopexit:
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %a1 = getelementptr inbounds %s.float, %s.float* %b, i64 %indvars.iv, i32 0
+  %0 = load float, float* %a1, align 4
+  %b4 = getelementptr inbounds %s.float, %s.float* %b, i64 %indvars.iv, i32 1
+  %1 = load float, float* %b4, align 4
+  %add = fadd float %0, %1
+  %c = getelementptr inbounds %s.float, %s.float* %b, i64 %indvars.iv, i32 2
+  %2 = load float, float* %c, align 4
+  %add7 = fadd float %add, %2
+  %a10 = getelementptr inbounds %s.float, %s.float* %a, i64 %indvars.iv, i32 0
+  store float %add7, float* %a10, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %S
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; Test that negative offsets are avoided for loads of double.
+%s.double = type { double, double, double }
+define void @f6(%s.double* nocapture %a,
+                %s.double* nocapture readonly %b,
+                i32 zeroext %S) {
+; CHECK-LABEL: f6:
+; CHECK-NOT: -{{[0-9]+}}(%r
+entry:
+  %cmp9 = icmp eq i32 %S, 0
+  br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:
+  br label %for.body
+
+for.cond.cleanup.loopexit:
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %a1 = getelementptr inbounds %s.double, %s.double* %b, i64 %indvars.iv, i32 0
+  %0 = load double, double* %a1, align 4
+  %b4 = getelementptr inbounds %s.double, %s.double* %b, i64 %indvars.iv, i32 1
+  %1 = load double, double* %b4, align 4
+  %add = fadd double %0, %1
+  %c = getelementptr inbounds %s.double, %s.double* %b, i64 %indvars.iv, i32 2
+  %2 = load double, double* %c, align 4
+  %add7 = fadd double %add, %2
+  %a10 = getelementptr inbounds %s.double, %s.double* %a, i64 %indvars.iv, i32 0
+  store double %add7, double* %a10, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %S
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; Test that negative offsets are avoided for memory accesses of vector type.
+%s.vec = type { <4 x i32>, <4 x i32>, <4 x i32> }
+define void @f7(%s.vec* nocapture %a,
+                %s.vec* nocapture readonly %b,
+                i32 zeroext %S) {
+; CHECK-Z13-LABEL: f7:
+; CHECK-Z13-NOT: -{{[0-9]+}}(%r
+entry:
+  %cmp9 = icmp eq i32 %S, 0
+  br i1 %cmp9, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:
+  br label %for.body
+
+for.cond.cleanup.loopexit:
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %a1 = getelementptr inbounds %s.vec, %s.vec* %b, i64 %indvars.iv, i32 0
+  %0 = load <4 x i32>, <4 x i32>* %a1, align 4
+  %b4 = getelementptr inbounds %s.vec, %s.vec* %b, i64 %indvars.iv, i32 1
+  %1 = load <4 x i32>, <4 x i32>* %b4, align 4
+  %add = add <4 x i32> %0, %1
+  %c = getelementptr inbounds %s.vec, %s.vec* %b, i64 %indvars.iv, i32 2
+  %2 = load <4 x i32>, <4 x i32>* %c, align 4
+  %add7 = add <4 x i32> %add, %2
+  %a10 = getelementptr inbounds %s.vec, %s.vec* %a, i64 %indvars.iv, i32 0
+  store <4 x i32> %add7, <4 x i32>* %a10, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %S
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
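
[Reviewer note] The three new tests correspond to source loops of roughly the following shape (a hand-written reconstruction for illustration only; nothing below is part of the patch). Without the new hook, LSR may choose a formula that reaches the second and third struct fields through negative displacements, which SystemZ cannot fold into FP or vector loads and stores:

  // Approximate C++ equivalent of test @f5 (names are invented).
  struct SFloat { float A, B, C; };

  void f5(SFloat *Dst, const SFloat *Src, unsigned S) {
    for (unsigned I = 0; I < S; ++I)
      Dst[I].A = Src[I].A + Src[I].B + Src[I].C;
  }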