Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -63,6 +63,26 @@
   Value *PtrVal;
 };
 
+/// Decision that was taken by vectorizer during cost calculation for memory
+/// instruction.
+enum InstWidening {
+  CM_Unknown,
+  CM_Widen,
+  CM_Interleave,
+  CM_GatherScatter,
+  CM_Scalarize
+};
+
+/// A map for cost model vectorization decision and cost for instructions.
+struct DecisionList : DenseMap<std::pair<Instruction *, unsigned>,
+                               std::pair<InstWidening, unsigned>> {
+
+  /// Return the cost model decision for the given instruction \p I and
+  /// vector width \p VF. Return CM_Unknown if this instruction did not
+  /// pass through the cost modeling.
+  InstWidening getWideningDecision(Instruction *I, unsigned VF);
+};
+
 /// \brief This pass provides access to the codegen interfaces that are needed
 /// for IR-level transformations.
 class TargetTransformInfo {
@@ -434,6 +454,16 @@
   unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
                                             unsigned VF) const;
 
+  /// If a target wants to avoid vectorization of a loop due to memory
+  /// considerations, it should return false from this method.
+  /// WideningDecisions is provided because if e.g. a store is scalarized, it
+  /// will result in VF stores. This is needed when a target is known to
+  /// suffer from store-tags depletion in cases of loops with too many
+  /// stores.
+  bool checkVectorizationFactorForMem(DecisionList &WideningDecisions,
+                                      unsigned VF, unsigned Cost,
+                                      float ScalarCost) const;
+
   bool supportsVectorElementLoadStore() const;
 
   /// \brief Don't restrict interleaved unrolling to small loops.
@@ -545,7 +575,8 @@
   /// \return The maximum interleave factor that any transform should try to
   /// perform for this target. This number depends on the level of parallelism
   /// and the number of execution units in the CPU.
-  unsigned getMaxInterleaveFactor(unsigned VF) const;
+  unsigned getMaxInterleaveFactor(unsigned VF,
+                              DecisionList *WideningDecisions = nullptr) const;
 
   /// \return The expected cost of arithmetic ops, such as mul, xor, fsub, etc.
   /// \p Args is an optional argument which holds the instruction operands  
@@ -783,6 +814,9 @@
   getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) = 0;
   virtual unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
                                                     unsigned VF) = 0;
+  virtual bool checkVectorizationFactorForMem(DecisionList &WideningDecisions,
+                                              unsigned VF, unsigned Cost,
+                                              float ScalarCost) = 0;
   virtual bool supportsVectorElementLoadStore() = 0;
   virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0;
   virtual bool enableInterleavedAccessVectorization() = 0;
@@ -808,7 +842,8 @@
   virtual unsigned getPrefetchDistance() = 0;
   virtual unsigned getMinPrefetchStride() = 0;
   virtual unsigned getMaxPrefetchIterationsAhead() = 0;
-  virtual unsigned getMaxInterleaveFactor(unsigned VF) = 0;
+  virtual unsigned getMaxInterleaveFactor(unsigned VF,
+                                          DecisionList *WideningDecisions) = 0;
   virtual unsigned
   getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind Opd1Info,
                          OperandValueKind Opd2Info,
@@ -987,11 +1022,15 @@
                                             unsigned VF) override {
     return Impl.getOperandsScalarizationOverhead(Args, VF);
   }
-
+  bool checkVectorizationFactorForMem(DecisionList &WideningDecisions,
+                                      unsigned VF, unsigned Cost,
+                                      float ScalarCost) override {
+    return  Impl.checkVectorizationFactorForMem(WideningDecisions, VF,
+                                                Cost, ScalarCost);
+  }
   bool supportsVectorElementLoadStore() override {
     return Impl.supportsVectorElementLoadStore();
   }
-
   bool enableAggressiveInterleaving(bool LoopHasReductions) override {
     return Impl.enableAggressiveInterleaving(LoopHasReductions);
   }
@@ -1046,8 +1085,9 @@
   unsigned getMaxPrefetchIterationsAhead() override {
     return Impl.getMaxPrefetchIterationsAhead();
   }
-  unsigned getMaxInterleaveFactor(unsigned VF) override {
-    return Impl.getMaxInterleaveFactor(VF);
+  unsigned getMaxInterleaveFactor(unsigned VF,
+                                  DecisionList *WideningDecisions) override {
+    return Impl.getMaxInterleaveFactor(VF, WideningDecisions);
   }
   unsigned
   getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind Opd1Info,
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -262,6 +262,12 @@
   unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
                                             unsigned VF) { return 0; }
 
+  bool checkVectorizationFactorForMem(DecisionList &WideningDecisions,
+                                      unsigned VF, unsigned Cost,
+                                      float ScalarCost) {
+    return true;
+  }
+
   bool supportsVectorElementLoadStore() { return false; }
 
   bool enableAggressiveInterleaving(bool LoopHasReductions) { return false; }
@@ -313,7 +319,10 @@
 
   unsigned getMaxPrefetchIterationsAhead() { return UINT_MAX; }
 
-  unsigned getMaxInterleaveFactor(unsigned VF) { return 1; }
+  unsigned getMaxInterleaveFactor(unsigned VF,
+                                  DecisionList *WideningDecisions = nullptr) {
+    return 1;
+  }
 
   unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
                                   TTI::OperandValueKind Opd1Info,
Index: include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- include/llvm/CodeGen/BasicTTIImpl.h
+++ include/llvm/CodeGen/BasicTTIImpl.h
@@ -349,7 +349,10 @@
     return Cost;
   }
 
-  unsigned getMaxInterleaveFactor(unsigned VF) { return 1; }
+  unsigned getMaxInterleaveFactor(unsigned VF,
+                                  DecisionList *WideningDecisions = nullptr) {
+    return 1;
+  }
 
   unsigned getArithmeticInstrCost(
       unsigned Opcode, Type *Ty,
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -197,6 +197,23 @@
   return TTIImpl->getOperandsScalarizationOverhead(Args, VF);
 }
 
+InstWidening DecisionList::
+getWideningDecision(Instruction *I, unsigned VF) {
+  assert(VF >= 2 && "Expected VF >=2");
+  std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
+  auto Itr = find(InstOnVF);
+  if (Itr == end())
+    return CM_Unknown;
+  return Itr->second.first;
+}
+
+bool TargetTransformInfo::checkVectorizationFactorForMem(
+           DecisionList &WideningDecisions, unsigned VF, unsigned Cost,
+           float ScalarCost) const {
+  return TTIImpl->checkVectorizationFactorForMem(WideningDecisions, VF,
+                                                 Cost, ScalarCost);
+}
+
 bool TargetTransformInfo::supportsVectorElementLoadStore() const {
   return TTIImpl->supportsVectorElementLoadStore();
 }
@@ -289,8 +306,9 @@
   return TTIImpl->getMaxPrefetchIterationsAhead();
 }
 
-unsigned TargetTransformInfo::getMaxInterleaveFactor(unsigned VF) const {
-  return TTIImpl->getMaxInterleaveFactor(VF);
+unsigned TargetTransformInfo::
+getMaxInterleaveFactor(unsigned VF, DecisionList *WideningDecisions) const {
+  return TTIImpl->getMaxInterleaveFactor(VF, WideningDecisions);
 }
 
 int TargetTransformInfo::getArithmeticInstrCost(
Index: lib/Target/AArch64/AArch64TargetTransformInfo.h
===================================================================
--- lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -84,7 +84,7 @@
     return 64;
   }
 
-  unsigned getMaxInterleaveFactor(unsigned VF);
+  unsigned getMaxInterleaveFactor(unsigned VF, DecisionList *WideningDecisions = nullptr);
 
   int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                        const Instruction *I = nullptr);
Index: lib/Target/AArch64/AArch64TargetTransformInfo.cpp
===================================================================
--- lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -532,7 +532,8 @@
   return Cost;
 }
 
-unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) {
+unsigned AArch64TTIImpl::
+getMaxInterleaveFactor(unsigned VF, DecisionList *WideningDecisions) {
   return ST->getMaxInterleaveFactor();
 }
 
Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -89,7 +89,7 @@
                                     unsigned Alignment,
                                     unsigned AddrSpace) const;
 
-  unsigned getMaxInterleaveFactor(unsigned VF);
+  unsigned getMaxInterleaveFactor(unsigned VF, DecisionList *WideningDecisions = nullptr);
 
   int getArithmeticInstrCost(
     unsigned Opcode, Type *Ty,
Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -154,7 +154,8 @@
   return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
 }
 
-unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) {
+unsigned AMDGPUTTIImpl::
+getMaxInterleaveFactor(unsigned VF, DecisionList *WideningDecisions) {
   // Disable unrolling if the loop is not vectorized.
   if (VF == 1)
     return 1;
Index: lib/Target/ARM/ARMTargetTransformInfo.h
===================================================================
--- lib/Target/ARM/ARMTargetTransformInfo.h
+++ lib/Target/ARM/ARMTargetTransformInfo.h
@@ -88,7 +88,8 @@
     return 32;
   }
 
-  unsigned getMaxInterleaveFactor(unsigned VF) {
+  unsigned getMaxInterleaveFactor(unsigned VF,
+                                  DecisionList *WideningDecisions = nullptr) {
     return ST->getMaxInterleaveFactor();
   }
 
Index: lib/Target/PowerPC/PPCTargetTransformInfo.h
===================================================================
--- lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -65,7 +65,8 @@
   unsigned getRegisterBitWidth(bool Vector);
   unsigned getCacheLineSize();
   unsigned getPrefetchDistance();
-  unsigned getMaxInterleaveFactor(unsigned VF);
+  unsigned getMaxInterleaveFactor(unsigned VF,
+                                  DecisionList *WideningDecisions = nullptr);
   int getArithmeticInstrCost(
       unsigned Opcode, Type *Ty,
       TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
Index: lib/Target/PowerPC/PPCTargetTransformInfo.cpp
===================================================================
--- lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -250,7 +250,8 @@
   return 300;
 }
 
-unsigned PPCTTIImpl::getMaxInterleaveFactor(unsigned VF) {
+unsigned PPCTTIImpl::
+getMaxInterleaveFactor(unsigned VF, DecisionList *WideningDecisions) {
   unsigned Directive = ST->getDarwinDirective();
   // The 440 has no SIMD support, but floating-point instructions
   // have a 5-cycle latency, so unroll by 5x for latency hiding.
Index: lib/Target/SystemZ/SystemZTargetTransformInfo.h
===================================================================
--- lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -55,8 +55,13 @@
   unsigned getNumberOfRegisters(bool Vector);
   unsigned getRegisterBitWidth(bool Vector);
 
-  bool supportsVectorElementLoadStore() { return true; }
+  bool checkVectorizationFactorForMem(DecisionList &WideningDecisions,
+                                      unsigned VF, unsigned Cost, float ScalarCost);
 
+  bool supportsVectorElementLoadStore() { return true; }
+  bool enableAggressiveInterleaving(bool LoopHasReductions) {
+    return LoopHasReductions;
+  }
   bool enableInterleavedAccessVectorization() { return true; }
   bool isFPVectorizationPotentiallyUnsafe() { return false; }
 
@@ -83,6 +88,13 @@
                                  ArrayRef<unsigned> Indices,
                                  unsigned Alignment,
                                  unsigned AddressSpace);
+
+  std::pair<unsigned, unsigned>
+    countStores(DecisionList &WideningDecisions, unsigned VF);
+
+  unsigned getMaxInterleaveFactor(unsigned VF,
+                                  DecisionList *WideningDecisions = nullptr);
+
   /// @}
 };
 
Index: lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
===================================================================
--- lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -25,6 +25,19 @@
 
 #define DEBUG_TYPE "systemztti"
 
+static cl::opt<unsigned> MaxILFactor("max-interleave-factor",
+                                                  cl::Hidden,
+                                                  cl::init(1));
+
+// Experimental
+static cl::opt<float> SafeStoresRatio("safe-stores-ratio",
+                                      cl::Hidden,
+                                      cl::init(0.125));
+
+static cl::opt<float> MaxStoresRatioIncrease("max-stores-ratio-increase",
+                                      cl::Hidden,
+                                      cl::init(0.15));
+
 //===----------------------------------------------------------------------===//
 //
 // SystemZ cost model.
@@ -781,3 +794,106 @@
 
   return Cost + NumOps;
 }
+
+std::pair<unsigned, unsigned> SystemZTTIImpl::
+countStores(DecisionList &WideningDecisions, unsigned VF) {
+  unsigned NumScalarStores = 0;
+  unsigned NumVecStores = 0;
+  unsigned InterleavedBits = 0;
+
+  for (auto &I : WideningDecisions) {
+    Instruction *Ins = I.first.first;
+    if (I.first.second != VF)
+      continue;
+    InstWidening Decision = I.second.first;
+
+    if (Ins->getOpcode() != Instruction::Store)
+      continue;
+
+    Type *MemAccessTy = Ins->getOperand(0)->getType();
+    VectorType *VecTy = VectorType::get(MemAccessTy, VF);
+    unsigned ScalarCount = getMemoryOpCost(Instruction::Store, MemAccessTy, 0, 0);
+    NumScalarStores += ScalarCount;
+
+    switch (Decision) {
+    case InstWidening::CM_Scalarize:
+      NumVecStores += (ScalarCount * VF);
+      break;
+    case InstWidening::CM_Widen:
+      NumVecStores += getMemoryOpCost(Instruction::Store, VecTy, 0, 0);
+      break;
+    case InstWidening::CM_Interleave:
+      InterleavedBits += VecTy->getBitWidth();
+      break;
+    case InstWidening::CM_GatherScatter:
+      assert(0 && "GatherScatter needs handling here");
+      break;
+    case InstWidening::CM_Unknown:
+      assert(0 && "Store not processed?");
+      break;
+    }
+  }
+
+  NumVecStores += InterleavedBits / 128;
+  if (InterleavedBits % 128)
+    NumVecStores++;
+
+  return std::make_pair(NumScalarStores, NumVecStores);
+}
+
+bool SystemZTTIImpl::checkVectorizationFactorForMem(
+        DecisionList &WideningDecisions, unsigned VF,
+        unsigned Cost, float ScalarCost) {
+  std::pair<unsigned, unsigned> StoresCount = countStores(WideningDecisions, VF);
+  unsigned NumScalarStores = StoresCount.first;
+  unsigned NumVecStores = StoresCount.second;
+
+  float StoresRatioScalar = NumScalarStores / ScalarCost;
+  float StoresRatioVec    = NumVecStores / (float) Cost;
+  assert (StoresRatioScalar <= 1.0 && StoresRatioVec <= 1.0 &&
+          "Bad ratio calculation.");
+
+  if (StoresRatioVec < SafeStoresRatio)
+    return true;
+
+  if (StoresRatioVec <= StoresRatioScalar)
+    return true;
+
+  float StoresRatioIncrease = StoresRatioVec - StoresRatioScalar;
+
+  float Improvement = (ScalarCost / (Cost / (float) VF));
+  DEBUG (dbgs() << "STTI: Increased stores ratio -- " << "R_vec: " << StoresRatioVec
+         << " / R_scal: " << StoresRatioScalar
+         << " Diff: " << StoresRatioIncrease
+         << " Imp: " << Improvement << "\n");
+
+  if (NumVecStores <= 12 && StoresRatioIncrease < MaxStoresRatioIncrease)
+    return true;
+
+  return false;
+}
+
+unsigned SystemZTTIImpl::
+getMaxInterleaveFactor(unsigned VF, DecisionList *WideningDecisions) {
+  // If the loop will not be vectorized, don't interleave the loop.
+  // Let regular unroll unroll the loop, which saves the overflow
+  // check and memory check cost.
+  if (VF == 1 || !ST->hasVector())
+    return 1;
+
+  unsigned Max = MaxILFactor;
+
+  // Num stores after vectorization. Currently just limiting unrolling like
+  // for the regular unroller. TODO: we could be more generous, because this
+  // might result in reductions optimisation.
+  unsigned NumVecStores = countStores(*WideningDecisions, VF).second;
+  unsigned StoresLim = (NumVecStores ? (12 / NumVecStores) : UINT_MAX);
+  if (StoresLim == 0)
+    StoresLim = 1;
+  if (StoresLim < Max) {
+    DEBUG (dbgs() << "STTI: Interleaving limited for memory intensive loop.\n");
+    Max = StoresLim;
+  }
+
+  return Max;
+}
Index: lib/Target/X86/X86TargetTransformInfo.h
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.h
+++ lib/Target/X86/X86TargetTransformInfo.h
@@ -52,7 +52,8 @@
 
   unsigned getNumberOfRegisters(bool Vector);
   unsigned getRegisterBitWidth(bool Vector);
-  unsigned getMaxInterleaveFactor(unsigned VF);
+  unsigned getMaxInterleaveFactor(unsigned VF,
+                                  DecisionList *WideningDecisions = nullptr);
   int getArithmeticInstrCost(
       unsigned Opcode, Type *Ty,
       TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -95,7 +95,8 @@
   return 32;
 }
 
-unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
+unsigned X86TTIImpl::
+getMaxInterleaveFactor(unsigned VF, DecisionList *WideningDecisions) {
   // If the loop will not be vectorized, don't interleave the loop.
   // Let regular unroll to unroll the loop, which saves the overflow
   // check and memory check cost.
Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1966,15 +1966,6 @@
            !isScalarAfterVectorization(I, VF);
   }
 
-  /// Decision that was taken during cost calculation for memory instruction.
-  enum InstWidening {
-    CM_Unknown,
-    CM_Widen,
-    CM_Interleave,
-    CM_GatherScatter,
-    CM_Scalarize
-  };
-
   /// Save vectorization decision \p W and \p Cost taken by the cost model for
   /// instruction \p I and vector width \p VF.
   void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
@@ -2004,12 +1995,7 @@
   /// width \p VF. Return CM_Unknown if this instruction did not pass
   /// through the cost modeling.
   InstWidening getWideningDecision(Instruction *I, unsigned VF) {
-    assert(VF >= 2 && "Expected VF >=2");
-    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
-    auto Itr = WideningDecisions.find(InstOnVF);
-    if (Itr == WideningDecisions.end())
-      return CM_Unknown;
-    return Itr->second.first;
+    return WideningDecisions.getWideningDecision(I, VF);
   }
 
   /// Return the vectorization cost for the given instruction \p I and vector
@@ -2177,10 +2163,6 @@
 
   /// Keeps cost model vectorization decision and cost for instructions.
   /// Right now it is used for memory instructions only.
-  typedef DenseMap<std::pair<Instruction *, unsigned>,
-                   std::pair<InstWidening, unsigned>>
-      DecisionList;
-
   DecisionList WideningDecisions;
 
 public:
@@ -2970,11 +2952,10 @@
 
   assert((LI || SI) && "Invalid Load/Store instruction");
 
-  LoopVectorizationCostModel::InstWidening Decision =
-      Cost->getWideningDecision(Instr, VF);
-  assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
+  InstWidening Decision = Cost->getWideningDecision(Instr, VF);
+  assert(Decision != InstWidening::CM_Unknown &&
          "CM decision should be taken at this point");
-  if (Decision == LoopVectorizationCostModel::CM_Interleave)
+  if (Decision == InstWidening::CM_Interleave)
     return vectorizeInterleaveGroup(Instr);
 
   Type *ScalarDataTy = getMemInstValueType(Instr);
@@ -2989,7 +2970,7 @@
   unsigned AddressSpace = getMemInstAddressSpace(Instr);
 
   // Scalarize the memory instruction if necessary.
-  if (Decision == LoopVectorizationCostModel::CM_Scalarize)
+  if (Decision == InstWidening::CM_Scalarize)
     return scalarizeInstruction(Instr, Legal->isScalarWithPredication(Instr));
 
   // Determine if the pointer operand of the access is either consecutive or
@@ -2997,7 +2978,7 @@
   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
   bool Reverse = ConsecutiveStride < 0;
   bool CreateGatherScatter =
-      (Decision == LoopVectorizationCostModel::CM_GatherScatter);
+      (Decision == InstWidening::CM_GatherScatter);
 
   VectorParts VectorGep;
 
@@ -6353,6 +6334,7 @@
     // we need to divide the cost of the vector loops by the width of
     // the vector elements.
     VectorizationCostTy C = expectedCost(i);
+
     float VectorCost = C.first / (float)i;
     DEBUG(dbgs() << "LV: Vector loop of width " << i
                  << " costs: " << (int)VectorCost << ".\n");
@@ -6363,6 +6345,14 @@
       continue;
     }
     if (VectorCost < Cost) {
+      // Target may put a limit on memory intenisve loops. This depends on
+      // widening decisions, so it must be done after expectedCost().
+      if (!TTI.checkVectorizationFactorForMem(WideningDecisions, i, C.first,
+                                              ScalarCost)) {
+        DEBUG (dbgs() << "LV: VF limited for memory intensive loop.\n");
+        continue;
+      }
+
       Cost = VectorCost;
       Width = i;
     }
@@ -6498,7 +6488,7 @@
                        std::max(1U, (R.MaxLocalUsers - 1)));
 
   // Clamp the interleave ranges to reasonable counts.
-  unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
+  unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF, &WideningDecisions);
 
   // Check if the user has overridden the max.
   if (VF == 1) {