Index: include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- include/llvm/Analysis/TargetTransformInfo.h +++ include/llvm/Analysis/TargetTransformInfo.h @@ -63,6 +63,26 @@ Value *PtrVal; }; +/// Decision that was taken by vectorizer during cost calculation for memory +/// instruction. +enum InstWidening { + CM_Unknown, + CM_Widen, + CM_Interleave, + CM_GatherScatter, + CM_Scalarize +}; + +/// A map for cost model vectorization decision and cost for instructions. +struct DecisionList : DenseMap, + std::pair> { + + /// Return the cost model decision for the given instruction \p I and + /// vector width \p VF. Return CM_Unknown if this instruction did not + /// pass through the cost modeling. + InstWidening getWideningDecision(Instruction *I, unsigned VF); +}; + /// \brief This pass provides access to the codegen interfaces that are needed /// for IR-level transformations. class TargetTransformInfo { @@ -434,6 +454,16 @@ unsigned getOperandsScalarizationOverhead(ArrayRef Args, unsigned VF) const; + /// If a target wants to avoid the unrolling / widening of loops in general + /// based on the number of stores in the resulting loop, it should return a + /// pair of values here that represent the maximum number of stores in a + /// resulting loop, and the actual counted number of stores in L. + /// WideningDecisions is provided because if a store is scalarized, it will + /// result in VF stores. This is needed when a target is known to suffer + /// from store-tags depletion in cases of loops with too many stores. + std::pair getStoresLimOfResultingLoop(Loop *L, + DecisionList WideningDecisions, unsigned VF) const; + /// \brief Don't restrict interleaved unrolling to small loops. bool enableAggressiveInterleaving(bool LoopHasReductions) const; @@ -771,6 +801,8 @@ getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) = 0; virtual unsigned getOperandsScalarizationOverhead(ArrayRef Args, unsigned VF) = 0; + virtual std::pair getStoresLimOfResultingLoop(Loop *L, + DecisionList WideningDecisions, unsigned VF) = 0; virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0; virtual bool enableInterleavedAccessVectorization() = 0; virtual bool isFPVectorizationPotentiallyUnsafe() = 0; @@ -974,7 +1006,10 @@ unsigned VF) override { return Impl.getOperandsScalarizationOverhead(Args, VF); } - + std::pair getStoresLimOfResultingLoop(Loop *L, + DecisionList WideningDecisions, unsigned VF) override { + return Impl.getStoresLimOfResultingLoop(L, WideningDecisions, VF); + } bool enableAggressiveInterleaving(bool LoopHasReductions) override { return Impl.enableAggressiveInterleaving(LoopHasReductions); } Index: include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- include/llvm/Analysis/TargetTransformInfoImpl.h +++ include/llvm/Analysis/TargetTransformInfoImpl.h @@ -262,6 +262,11 @@ unsigned getOperandsScalarizationOverhead(ArrayRef Args, unsigned VF) { return 0; } + std::pair getStoresLimOfResultingLoop(Loop *L, + DecisionList WideningDecisions, unsigned VF) { + return std::make_pair(0, 0); + } + bool enableAggressiveInterleaving(bool LoopHasReductions) { return false; } bool enableInterleavedAccessVectorization() { return false; } Index: lib/Analysis/TargetTransformInfo.cpp =================================================================== --- lib/Analysis/TargetTransformInfo.cpp +++ lib/Analysis/TargetTransformInfo.cpp @@ -197,6 +197,22 @@ return TTIImpl->getOperandsScalarizationOverhead(Args, VF); } +InstWidening DecisionList:: +getWideningDecision(Instruction *I, unsigned VF) { + assert(VF >= 2 && "Expected VF >=2"); + std::pair InstOnVF = std::make_pair(I, VF); + auto Itr = find(InstOnVF); + if (Itr == end()) + return CM_Unknown; + return Itr->second.first; +} + +std::pair TargetTransformInfo:: +getStoresLimOfResultingLoop(Loop *L, + DecisionList WideningDecisions, unsigned VF) const { + return TTIImpl->getStoresLimOfResultingLoop(L, WideningDecisions, VF); +} + bool TargetTransformInfo::enableAggressiveInterleaving(bool LoopHasReductions) const { return TTIImpl->enableAggressiveInterleaving(LoopHasReductions); } Index: lib/Target/SystemZ/SystemZTargetTransformInfo.h =================================================================== --- lib/Target/SystemZ/SystemZTargetTransformInfo.h +++ lib/Target/SystemZ/SystemZTargetTransformInfo.h @@ -53,6 +53,9 @@ unsigned getNumberOfRegisters(bool Vector); unsigned getRegisterBitWidth(bool Vector); + std::pair getStoresLimOfResultingLoop(Loop *L, + DecisionList WideningDecisions, unsigned VF); + /// @} }; Index: lib/Target/SystemZ/SystemZTargetTransformInfo.cpp =================================================================== --- lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -313,3 +313,22 @@ return 0; } +std::pair SystemZTTIImpl::getStoresLimOfResultingLoop( + Loop *L, DecisionList WideningDecisions, unsigned VF) { + unsigned NumStores = 0; + for (BasicBlock *BB : L->blocks()) { + for (Instruction &I : *BB) { + if (isa(&I)) { + Type *MemAccessTy = I.getOperand(0)->getType(); + unsigned N = getMemoryOpCost(Instruction::Store, MemAccessTy, 0, 0); + if (VF > 1 && + WideningDecisions.getWideningDecision(&I, VF) == + InstWidening::CM_Scalarize) + N *= VF; + NumStores += N; + } + } + } + return std::make_pair(12, NumStores); +} + Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1946,15 +1946,6 @@ !isScalarAfterVectorization(I, VF); } - /// Decision that was taken during cost calculation for memory instruction. - enum InstWidening { - CM_Unknown, - CM_Widen, - CM_Interleave, - CM_GatherScatter, - CM_Scalarize - }; - /// Save vectorization decision \p W and \p Cost taken by the cost model for /// instruction \p I and vector width \p VF. void setWideningDecision(Instruction *I, unsigned VF, InstWidening W, @@ -1984,12 +1975,7 @@ /// width \p VF. Return CM_Unknown if this instruction did not pass /// through the cost modeling. InstWidening getWideningDecision(Instruction *I, unsigned VF) { - assert(VF >= 2 && "Expected VF >=2"); - std::pair InstOnVF = std::make_pair(I, VF); - auto Itr = WideningDecisions.find(InstOnVF); - if (Itr == WideningDecisions.end()) - return CM_Unknown; - return Itr->second.first; + return WideningDecisions.getWideningDecision(I, VF); } /// Return the vectorization cost for the given instruction \p I and vector @@ -2153,12 +2139,40 @@ /// Keeps cost model vectorization decision and cost for instructions. /// Right now it is used for memory instructions only. - typedef DenseMap, - std::pair> - DecisionList; - DecisionList WideningDecisions; + // Do an extra check to see if VF is ok, in the context of memory + // accesses. If the target has specified a limit for the number of stores + // in the resulting loop, the stores will be counted (and multiplied by VF + // in case of scalarization), and then true will be returned only if the + // sum is less than the limit. + bool checkVectorizationFactorForMem(unsigned VF) { + std::pair StoresVals = + TTI.getStoresLimOfResultingLoop(TheLoop, WideningDecisions, VF); + unsigned MaxNumStores = StoresVals.first; + unsigned NumStores = StoresVals.second; + + if (!MaxNumStores) + return true; + return (NumStores <= MaxNumStores); + } + + // Similar to above, except that this involves the interleaving factor + // (unrolling) of the loop after VF has been decided on. If the target + // specifies a limit for the number of stores, a limit for the interleave + // factor is returned. + unsigned limitUnrollForMem(unsigned VF) { + std::pair StoresVals = + TTI.getStoresLimOfResultingLoop(TheLoop, WideningDecisions, VF); + unsigned MaxNumStores = StoresVals.first; + unsigned NumStores = StoresVals.second; + + if (!MaxNumStores) + return UINT_MAX; + unsigned Max = (NumStores ? (MaxNumStores / NumStores) : UINT_MAX); + return (Max > 0 ? Max : 1); + } + public: /// The loop that we evaluate. Loop *TheLoop; @@ -2929,11 +2943,10 @@ assert((LI || SI) && "Invalid Load/Store instruction"); - LoopVectorizationCostModel::InstWidening Decision = - Cost->getWideningDecision(Instr, VF); - assert(Decision != LoopVectorizationCostModel::CM_Unknown && + InstWidening Decision = Cost->getWideningDecision(Instr, VF); + assert(Decision != InstWidening::CM_Unknown && "CM decision should be taken at this point"); - if (Decision == LoopVectorizationCostModel::CM_Interleave) + if (Decision == InstWidening::CM_Interleave) return vectorizeInterleaveGroup(Instr); Type *ScalarDataTy = getMemInstValueType(Instr); @@ -2948,7 +2961,7 @@ unsigned AddressSpace = getMemInstAddressSpace(Instr); // Scalarize the memory instruction if necessary. - if (Decision == LoopVectorizationCostModel::CM_Scalarize) + if (Decision == InstWidening::CM_Scalarize) return scalarizeInstruction(Instr, Legal->isScalarWithPredication(Instr)); // Determine if the pointer operand of the access is either consecutive or @@ -2956,7 +2969,7 @@ int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); bool Reverse = ConsecutiveStride < 0; bool CreateGatherScatter = - (Decision == LoopVectorizationCostModel::CM_GatherScatter); + (Decision == InstWidening::CM_GatherScatter); VectorParts VectorGep; @@ -6272,6 +6285,13 @@ // we need to divide the cost of the vector loops by the width of // the vector elements. VectorizationCostTy C = expectedCost(i); + + // Target may put a limit on memory intenisve loops. + if (!checkVectorizationFactorForMem(i)) { + DEBUG (dbgs() << "LV: VF limited for memory intensive loop.\n"); + break; + } + float VectorCost = C.first / (float)i; DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: " << (int)VectorCost << ".\n"); @@ -6420,6 +6440,13 @@ // Clamp the interleave ranges to reasonable counts. unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); + // Target may put a limit on memory intenisve loops. + unsigned Lim = limitUnrollForMem(VF); + if (Lim < MaxInterleaveCount) { + DEBUG (dbgs() << "LV: Interleaving limited for memory intensive loop.\n"); + MaxInterleaveCount = Lim; + } + // Check if the user has overridden the max. if (VF == 1) { if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)