Index: include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- include/llvm/Analysis/TargetTransformInfo.h +++ include/llvm/Analysis/TargetTransformInfo.h @@ -63,6 +63,26 @@ Value *PtrVal; }; +/// Decision that was taken by vectorizer during cost calculation for memory +/// instruction. +enum InstWidening { + CM_Unknown, + CM_Widen, + CM_Interleave, + CM_GatherScatter, + CM_Scalarize +}; + +/// A map for cost model vectorization decision and cost for instructions. +struct DecisionList : DenseMap, + std::pair> { + + /// Return the cost model decision for the given instruction \p I and + /// vector width \p VF. Return CM_Unknown if this instruction did not + /// pass through the cost modeling. + InstWidening getWideningDecision(Instruction *I, unsigned VF); +}; + /// \brief This pass provides access to the codegen interfaces that are needed /// for IR-level transformations. class TargetTransformInfo { @@ -434,6 +454,16 @@ unsigned getOperandsScalarizationOverhead(ArrayRef Args, unsigned VF) const; + /// If a target wants to avoid vectorization of a loop due to memory + /// considerations, it should return false from this method. + /// WideningDecisions is provided because if e.g. a store is scalarized, it + /// will result in VF stores. This is needed when a target is known to + /// suffer from store-tags depletion in cases of loops with too many + /// stores. + bool checkVectorizationFactorForMem(DecisionList &WideningDecisions, + unsigned VF, unsigned Cost, + float ScalarCost) const; + bool supportsVectorElementLoadStore() const; /// \brief Don't restrict interleaved unrolling to small loops. @@ -545,7 +575,8 @@ /// \return The maximum interleave factor that any transform should try to /// perform for this target. This number depends on the level of parallelism /// and the number of execution units in the CPU. - unsigned getMaxInterleaveFactor(unsigned VF) const; + unsigned getMaxInterleaveFactor(unsigned VF, + DecisionList *WideningDecisions = nullptr) const; /// \return The expected cost of arithmetic ops, such as mul, xor, fsub, etc. /// \p Args is an optional argument which holds the instruction operands @@ -783,6 +814,9 @@ getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) = 0; virtual unsigned getOperandsScalarizationOverhead(ArrayRef Args, unsigned VF) = 0; + virtual bool checkVectorizationFactorForMem(DecisionList &WideningDecisions, + unsigned VF, unsigned Cost, + float ScalarCost) = 0; virtual bool supportsVectorElementLoadStore() = 0; virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0; virtual bool enableInterleavedAccessVectorization() = 0; @@ -808,7 +842,8 @@ virtual unsigned getPrefetchDistance() = 0; virtual unsigned getMinPrefetchStride() = 0; virtual unsigned getMaxPrefetchIterationsAhead() = 0; - virtual unsigned getMaxInterleaveFactor(unsigned VF) = 0; + virtual unsigned getMaxInterleaveFactor(unsigned VF, + DecisionList *WideningDecisions) = 0; virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind Opd1Info, OperandValueKind Opd2Info, @@ -987,11 +1022,15 @@ unsigned VF) override { return Impl.getOperandsScalarizationOverhead(Args, VF); } - + bool checkVectorizationFactorForMem(DecisionList &WideningDecisions, + unsigned VF, unsigned Cost, + float ScalarCost) override { + return Impl.checkVectorizationFactorForMem(WideningDecisions, VF, + Cost, ScalarCost); + } bool supportsVectorElementLoadStore() override { return Impl.supportsVectorElementLoadStore(); } - bool enableAggressiveInterleaving(bool LoopHasReductions) override { return Impl.enableAggressiveInterleaving(LoopHasReductions); } @@ -1046,8 +1085,9 @@ unsigned getMaxPrefetchIterationsAhead() override { return Impl.getMaxPrefetchIterationsAhead(); } - unsigned getMaxInterleaveFactor(unsigned VF) override { - return Impl.getMaxInterleaveFactor(VF); + unsigned getMaxInterleaveFactor(unsigned VF, + DecisionList *WideningDecisions) override { + return Impl.getMaxInterleaveFactor(VF, WideningDecisions); } unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind Opd1Info, Index: include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- include/llvm/Analysis/TargetTransformInfoImpl.h +++ include/llvm/Analysis/TargetTransformInfoImpl.h @@ -262,6 +262,12 @@ unsigned getOperandsScalarizationOverhead(ArrayRef Args, unsigned VF) { return 0; } + bool checkVectorizationFactorForMem(DecisionList &WideningDecisions, + unsigned VF, unsigned Cost, + float ScalarCost) { + return true; + } + bool supportsVectorElementLoadStore() { return false; } bool enableAggressiveInterleaving(bool LoopHasReductions) { return false; } @@ -313,7 +319,10 @@ unsigned getMaxPrefetchIterationsAhead() { return UINT_MAX; } - unsigned getMaxInterleaveFactor(unsigned VF) { return 1; } + unsigned getMaxInterleaveFactor(unsigned VF, + DecisionList *WideningDecisions = nullptr) { + return 1; + } unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, Index: include/llvm/CodeGen/BasicTTIImpl.h =================================================================== --- include/llvm/CodeGen/BasicTTIImpl.h +++ include/llvm/CodeGen/BasicTTIImpl.h @@ -349,7 +349,10 @@ return Cost; } - unsigned getMaxInterleaveFactor(unsigned VF) { return 1; } + unsigned getMaxInterleaveFactor(unsigned VF, + DecisionList *WideningDecisions = nullptr) { + return 1; + } unsigned getArithmeticInstrCost( unsigned Opcode, Type *Ty, Index: lib/Analysis/TargetTransformInfo.cpp =================================================================== --- lib/Analysis/TargetTransformInfo.cpp +++ lib/Analysis/TargetTransformInfo.cpp @@ -197,6 +197,23 @@ return TTIImpl->getOperandsScalarizationOverhead(Args, VF); } +InstWidening DecisionList:: +getWideningDecision(Instruction *I, unsigned VF) { + assert(VF >= 2 && "Expected VF >=2"); + std::pair InstOnVF = std::make_pair(I, VF); + auto Itr = find(InstOnVF); + if (Itr == end()) + return CM_Unknown; + return Itr->second.first; +} + +bool TargetTransformInfo::checkVectorizationFactorForMem( + DecisionList &WideningDecisions, unsigned VF, unsigned Cost, + float ScalarCost) const { + return TTIImpl->checkVectorizationFactorForMem(WideningDecisions, VF, + Cost, ScalarCost); +} + bool TargetTransformInfo::supportsVectorElementLoadStore() const { return TTIImpl->supportsVectorElementLoadStore(); } @@ -289,8 +306,9 @@ return TTIImpl->getMaxPrefetchIterationsAhead(); } -unsigned TargetTransformInfo::getMaxInterleaveFactor(unsigned VF) const { - return TTIImpl->getMaxInterleaveFactor(VF); +unsigned TargetTransformInfo:: +getMaxInterleaveFactor(unsigned VF, DecisionList *WideningDecisions) const { + return TTIImpl->getMaxInterleaveFactor(VF, WideningDecisions); } int TargetTransformInfo::getArithmeticInstrCost( Index: lib/Target/AArch64/AArch64TargetTransformInfo.h =================================================================== --- lib/Target/AArch64/AArch64TargetTransformInfo.h +++ lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -84,7 +84,7 @@ return 64; } - unsigned getMaxInterleaveFactor(unsigned VF); + unsigned getMaxInterleaveFactor(unsigned VF, DecisionList *WideningDecisions = nullptr); int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, const Instruction *I = nullptr); Index: lib/Target/AArch64/AArch64TargetTransformInfo.cpp =================================================================== --- lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -532,7 +532,8 @@ return Cost; } -unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) { +unsigned AArch64TTIImpl:: +getMaxInterleaveFactor(unsigned VF, DecisionList *WideningDecisions) { return ST->getMaxInterleaveFactor(); } Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -89,7 +89,7 @@ unsigned Alignment, unsigned AddrSpace) const; - unsigned getMaxInterleaveFactor(unsigned VF); + unsigned getMaxInterleaveFactor(unsigned VF, DecisionList *WideningDecisions = nullptr); int getArithmeticInstrCost( unsigned Opcode, Type *Ty, Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -154,7 +154,8 @@ return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace); } -unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) { +unsigned AMDGPUTTIImpl:: +getMaxInterleaveFactor(unsigned VF, DecisionList *WideningDecisions) { // Disable unrolling if the loop is not vectorized. if (VF == 1) return 1; Index: lib/Target/ARM/ARMTargetTransformInfo.h =================================================================== --- lib/Target/ARM/ARMTargetTransformInfo.h +++ lib/Target/ARM/ARMTargetTransformInfo.h @@ -88,7 +88,8 @@ return 32; } - unsigned getMaxInterleaveFactor(unsigned VF) { + unsigned getMaxInterleaveFactor(unsigned VF, + DecisionList *WideningDecisions = nullptr) { return ST->getMaxInterleaveFactor(); } Index: lib/Target/PowerPC/PPCTargetTransformInfo.h =================================================================== --- lib/Target/PowerPC/PPCTargetTransformInfo.h +++ lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -65,7 +65,8 @@ unsigned getRegisterBitWidth(bool Vector); unsigned getCacheLineSize(); unsigned getPrefetchDistance(); - unsigned getMaxInterleaveFactor(unsigned VF); + unsigned getMaxInterleaveFactor(unsigned VF, + DecisionList *WideningDecisions = nullptr); int getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, Index: lib/Target/PowerPC/PPCTargetTransformInfo.cpp =================================================================== --- lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -250,7 +250,8 @@ return 300; } -unsigned PPCTTIImpl::getMaxInterleaveFactor(unsigned VF) { +unsigned PPCTTIImpl:: +getMaxInterleaveFactor(unsigned VF, DecisionList *WideningDecisions) { unsigned Directive = ST->getDarwinDirective(); // The 440 has no SIMD support, but floating-point instructions // have a 5-cycle latency, so unroll by 5x for latency hiding. Index: lib/Target/SystemZ/SystemZTargetTransformInfo.h =================================================================== --- lib/Target/SystemZ/SystemZTargetTransformInfo.h +++ lib/Target/SystemZ/SystemZTargetTransformInfo.h @@ -55,8 +55,13 @@ unsigned getNumberOfRegisters(bool Vector); unsigned getRegisterBitWidth(bool Vector); - bool supportsVectorElementLoadStore() { return true; } + bool checkVectorizationFactorForMem(DecisionList &WideningDecisions, + unsigned VF, unsigned Cost, float ScalarCost); + bool supportsVectorElementLoadStore() { return true; } + bool enableAggressiveInterleaving(bool LoopHasReductions) { + return LoopHasReductions; + } bool enableInterleavedAccessVectorization() { return true; } bool isFPVectorizationPotentiallyUnsafe() { return false; } @@ -83,6 +88,13 @@ ArrayRef Indices, unsigned Alignment, unsigned AddressSpace); + + std::pair + countStores(DecisionList &WideningDecisions, unsigned VF); + + unsigned getMaxInterleaveFactor(unsigned VF, + DecisionList *WideningDecisions = nullptr); + /// @} }; Index: lib/Target/SystemZ/SystemZTargetTransformInfo.cpp =================================================================== --- lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -25,6 +25,19 @@ #define DEBUG_TYPE "systemztti" +static cl::opt MaxILFactor("max-interleave-factor", + cl::Hidden, + cl::init(1)); + +// Experimental +static cl::opt SafeStoresRatio("safe-stores-ratio", + cl::Hidden, + cl::init(0.125)); + +static cl::opt MaxStoresRatioIncrease("max-stores-ratio-increase", + cl::Hidden, + cl::init(0.15)); + //===----------------------------------------------------------------------===// // // SystemZ cost model. @@ -781,3 +794,106 @@ return Cost + NumOps; } + +std::pair SystemZTTIImpl:: +countStores(DecisionList &WideningDecisions, unsigned VF) { + unsigned NumScalarStores = 0; + unsigned NumVecStores = 0; + unsigned InterleavedBits = 0; + + for (auto &I : WideningDecisions) { + Instruction *Ins = I.first.first; + if (I.first.second != VF) + continue; + InstWidening Decision = I.second.first; + + if (Ins->getOpcode() != Instruction::Store) + continue; + + Type *MemAccessTy = Ins->getOperand(0)->getType(); + VectorType *VecTy = VectorType::get(MemAccessTy, VF); + unsigned ScalarCount = getMemoryOpCost(Instruction::Store, MemAccessTy, 0, 0); + NumScalarStores += ScalarCount; + + switch (Decision) { + case InstWidening::CM_Scalarize: + NumVecStores += (ScalarCount * VF); + break; + case InstWidening::CM_Widen: + NumVecStores += getMemoryOpCost(Instruction::Store, VecTy, 0, 0); + break; + case InstWidening::CM_Interleave: + InterleavedBits += VecTy->getBitWidth(); + break; + case InstWidening::CM_GatherScatter: + assert(0 && "GatherScatter needs handling here"); + break; + case InstWidening::CM_Unknown: + assert(0 && "Store not processed?"); + break; + } + } + + NumVecStores += InterleavedBits / 128; + if (InterleavedBits % 128) + NumVecStores++; + + return std::make_pair(NumScalarStores, NumVecStores); +} + +bool SystemZTTIImpl::checkVectorizationFactorForMem( + DecisionList &WideningDecisions, unsigned VF, + unsigned Cost, float ScalarCost) { + std::pair StoresCount = countStores(WideningDecisions, VF); + unsigned NumScalarStores = StoresCount.first; + unsigned NumVecStores = StoresCount.second; + + float StoresRatioScalar = NumScalarStores / ScalarCost; + float StoresRatioVec = NumVecStores / (float) Cost; + assert (StoresRatioScalar <= 1.0 && StoresRatioVec <= 1.0 && + "Bad ratio calculation."); + + if (StoresRatioVec < SafeStoresRatio) + return true; + + if (StoresRatioVec <= StoresRatioScalar) + return true; + + float StoresRatioIncrease = StoresRatioVec - StoresRatioScalar; + + float Improvement = (ScalarCost / (Cost / (float) VF)); + DEBUG (dbgs() << "STTI: Increased stores ratio -- " << "R_vec: " << StoresRatioVec + << " / R_scal: " << StoresRatioScalar + << " Diff: " << StoresRatioIncrease + << " Imp: " << Improvement << "\n"); + + if (NumVecStores <= 12 && StoresRatioIncrease < MaxStoresRatioIncrease) + return true; + + return false; +} + +unsigned SystemZTTIImpl:: +getMaxInterleaveFactor(unsigned VF, DecisionList *WideningDecisions) { + // If the loop will not be vectorized, don't interleave the loop. + // Let regular unroll unroll the loop, which saves the overflow + // check and memory check cost. + if (VF == 1 || !ST->hasVector()) + return 1; + + unsigned Max = MaxILFactor; + + // Num stores after vectorization. Currently just limiting unrolling like + // for the regular unroller. TODO: we could be more generous, because this + // might result in reductions optimisation. + unsigned NumVecStores = countStores(*WideningDecisions, VF).second; + unsigned StoresLim = (NumVecStores ? (12 / NumVecStores) : UINT_MAX); + if (StoresLim == 0) + StoresLim = 1; + if (StoresLim < Max) { + DEBUG (dbgs() << "STTI: Interleaving limited for memory intensive loop.\n"); + Max = StoresLim; + } + + return Max; +} Index: lib/Target/X86/X86TargetTransformInfo.h =================================================================== --- lib/Target/X86/X86TargetTransformInfo.h +++ lib/Target/X86/X86TargetTransformInfo.h @@ -52,7 +52,8 @@ unsigned getNumberOfRegisters(bool Vector); unsigned getRegisterBitWidth(bool Vector); - unsigned getMaxInterleaveFactor(unsigned VF); + unsigned getMaxInterleaveFactor(unsigned VF, + DecisionList *WideningDecisions = nullptr); int getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, Index: lib/Target/X86/X86TargetTransformInfo.cpp =================================================================== --- lib/Target/X86/X86TargetTransformInfo.cpp +++ lib/Target/X86/X86TargetTransformInfo.cpp @@ -95,7 +95,8 @@ return 32; } -unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) { +unsigned X86TTIImpl:: +getMaxInterleaveFactor(unsigned VF, DecisionList *WideningDecisions) { // If the loop will not be vectorized, don't interleave the loop. // Let regular unroll to unroll the loop, which saves the overflow // check and memory check cost. Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1966,15 +1966,6 @@ !isScalarAfterVectorization(I, VF); } - /// Decision that was taken during cost calculation for memory instruction. - enum InstWidening { - CM_Unknown, - CM_Widen, - CM_Interleave, - CM_GatherScatter, - CM_Scalarize - }; - /// Save vectorization decision \p W and \p Cost taken by the cost model for /// instruction \p I and vector width \p VF. void setWideningDecision(Instruction *I, unsigned VF, InstWidening W, @@ -2004,12 +1995,7 @@ /// width \p VF. Return CM_Unknown if this instruction did not pass /// through the cost modeling. InstWidening getWideningDecision(Instruction *I, unsigned VF) { - assert(VF >= 2 && "Expected VF >=2"); - std::pair InstOnVF = std::make_pair(I, VF); - auto Itr = WideningDecisions.find(InstOnVF); - if (Itr == WideningDecisions.end()) - return CM_Unknown; - return Itr->second.first; + return WideningDecisions.getWideningDecision(I, VF); } /// Return the vectorization cost for the given instruction \p I and vector @@ -2177,10 +2163,6 @@ /// Keeps cost model vectorization decision and cost for instructions. /// Right now it is used for memory instructions only. - typedef DenseMap, - std::pair> - DecisionList; - DecisionList WideningDecisions; public: @@ -2970,11 +2952,10 @@ assert((LI || SI) && "Invalid Load/Store instruction"); - LoopVectorizationCostModel::InstWidening Decision = - Cost->getWideningDecision(Instr, VF); - assert(Decision != LoopVectorizationCostModel::CM_Unknown && + InstWidening Decision = Cost->getWideningDecision(Instr, VF); + assert(Decision != InstWidening::CM_Unknown && "CM decision should be taken at this point"); - if (Decision == LoopVectorizationCostModel::CM_Interleave) + if (Decision == InstWidening::CM_Interleave) return vectorizeInterleaveGroup(Instr); Type *ScalarDataTy = getMemInstValueType(Instr); @@ -2989,7 +2970,7 @@ unsigned AddressSpace = getMemInstAddressSpace(Instr); // Scalarize the memory instruction if necessary. - if (Decision == LoopVectorizationCostModel::CM_Scalarize) + if (Decision == InstWidening::CM_Scalarize) return scalarizeInstruction(Instr, Legal->isScalarWithPredication(Instr)); // Determine if the pointer operand of the access is either consecutive or @@ -2997,7 +2978,7 @@ int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); bool Reverse = ConsecutiveStride < 0; bool CreateGatherScatter = - (Decision == LoopVectorizationCostModel::CM_GatherScatter); + (Decision == InstWidening::CM_GatherScatter); VectorParts VectorGep; @@ -6353,6 +6334,7 @@ // we need to divide the cost of the vector loops by the width of // the vector elements. VectorizationCostTy C = expectedCost(i); + float VectorCost = C.first / (float)i; DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: " << (int)VectorCost << ".\n"); @@ -6363,6 +6345,14 @@ continue; } if (VectorCost < Cost) { + // Target may put a limit on memory intenisve loops. This depends on + // widening decisions, so it must be done after expectedCost(). + if (!TTI.checkVectorizationFactorForMem(WideningDecisions, i, C.first, + ScalarCost)) { + DEBUG (dbgs() << "LV: VF limited for memory intensive loop.\n"); + continue; + } + Cost = VectorCost; Width = i; } @@ -6498,7 +6488,7 @@ std::max(1U, (R.MaxLocalUsers - 1))); // Clamp the interleave ranges to reasonable counts. - unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); + unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF, &WideningDecisions); // Check if the user has overridden the max. if (VF == 1) {