Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2164,6 +2164,48 @@
 
   DecisionList WideningDecisions;
 
+  /// Sum, for every store in the loop, the target's memory-op cost for the
+  /// type of its first operand, multiplied by \p VF when the store's widening
+  /// decision is CM_Scalarize. The sum is used as the loop's store count.
+  unsigned countResultingNumStores(unsigned VF) {
+    unsigned NumStores = 0;
+    for (BasicBlock *BB : TheLoop->blocks()) {
+      for (Instruction &I : *BB) {
+        if (!isa<StoreInst>(&I))
+          continue;
+        Type *MemAccessTy = I.getOperand(0)->getType();
+        unsigned N = TTI.getMemoryOpCost(Instruction::Store, MemAccessTy, 0, 0);
+        if (VF > 1 && getWideningDecision(&I, VF) ==
+                          LoopVectorizationCostModel::CM_Scalarize)
+          N *= VF;
+        NumStores += N;
+      }
+    }
+    return NumStores;
+  }
+
+  /// Extra check of \p VF in the context of memory accesses. If the target
+  /// reports a non-zero limit for the number of stores in the resulting
+  /// loop, return true only if the value of countResultingNumStores(VF) does
+  /// not exceed that limit. A limit of 0 means no restriction (always true).
+  bool checkVectorizationFactorForMem(unsigned VF) {
+    unsigned MaxNumStores = TTI.getMaxNumStoresInResultingLoop();
+    return !MaxNumStores || countResultingNumStores(VF) <= MaxNumStores;
+  }
+
+  /// Counterpart of checkVectorizationFactorForMem for the interleave
+  /// (unroll) factor, used once \p VF has been decided on. Returns the store
+  /// limit divided by the store count, but never less than 1. Returns
+  /// UINT_MAX if the target sets no limit or the loop contains no stores.
+  unsigned limitUnrollForMem(unsigned VF) {
+    unsigned MaxNumStores = TTI.getMaxNumStoresInResultingLoop();
+    if (!MaxNumStores)
+      return UINT_MAX;
+    unsigned NumStores = countResultingNumStores(VF);
+    unsigned Max = (NumStores ? (MaxNumStores / NumStores) : UINT_MAX);
+    return (Max > 0 ? Max : 1);
+  }
+
 public:
   /// The loop that we evaluate.
   Loop *TheLoop;
@@ -6312,6 +6354,11 @@
     // we need to divide the cost of the vector loops by the width of
     // the vector elements.
VectorizationCostTy C = expectedCost(i); + + // Target may put a limit on memory-intensive loops. + if (!checkVectorizationFactorForMem(i)) + break; + float VectorCost = C.first / (float)i; DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: " << (int)VectorCost << ".\n"); @@ -6460,6 +6507,11 @@ // Clamp the interleave ranges to reasonable counts. unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); + // Target may put a limit on memory-intensive loops. + unsigned Lim = limitUnrollForMem(VF); + if (Lim < MaxInterleaveCount) + MaxInterleaveCount = Lim; + // Check if the user has overridden the max. if (VF == 1) { if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)