Index: ../lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- ../lib/Transforms/Vectorize/LoopVectorize.cpp
+++ ../lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1608,12 +1608,6 @@
   /// Returns true if the value V is uniform within the loop.
   bool isUniform(Value *V);
 
-  /// Returns true if \p I is known to be uniform after vectorization.
-  bool isUniformAfterVectorization(Instruction *I) { return Uniforms.count(I); }
-
-  /// Returns true if \p I is known to be scalar after vectorization.
-  bool isScalarAfterVectorization(Instruction *I) { return Scalars.count(I); }
-
   /// Returns the information that we collected about runtime memory check.
   const RuntimePointerChecking *getRuntimePointerChecking() const {
     return LAI->getRuntimePointerChecking();
@@ -1690,15 +1684,9 @@
   /// instructions that may divide by zero.
   bool isScalarWithPredication(Instruction *I);
 
-  /// Returns true if \p I is a memory instruction that has a consecutive or
-  /// consecutive-like pointer operand. Consecutive-like pointers are pointers
-  /// that are treated like consecutive pointers during vectorization. The
-  /// pointer operands of interleaved accesses are an example.
-  bool hasConsecutiveLikePtrOperand(Instruction *I);
-
-  /// Returns true if \p I is a memory instruction that must be scalarized
-  /// during vectorization.
-  bool memoryInstructionMustBeScalarized(Instruction *I, unsigned VF = 1);
+  /// Returns true if \p I is a memory instruction with a consecutive memory
+  /// access that can be widened.
+  bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
 
 private:
   /// Check if a single basic block loop is vectorizable.
@@ -1716,24 +1704,6 @@
   /// transformation.
   bool canVectorizeWithIfConvert();
 
-  /// Collect the instructions that are uniform after vectorization. An
-  /// instruction is uniform if we represent it with a single scalar value in
-  /// the vectorized loop corresponding to each vector iteration. Examples of
-  /// uniform instructions include pointer operands of consecutive or
-  /// interleaved memory accesses. Note that although uniformity implies an
-  /// instruction will be scalar, the reverse is not true. In general, a
-  /// scalarized instruction will be represented by VF scalar values in the
-  /// vectorized loop, each corresponding to an iteration of the original
-  /// scalar loop.
-  void collectLoopUniforms();
-
-  /// Collect the instructions that are scalar after vectorization. An
-  /// instruction is scalar if it is known to be uniform or will be scalarized
-  /// during vectorization. Non-uniform scalarized instructions will be
-  /// represented by VF values in the vectorized loop, each corresponding to an
-  /// iteration of the original scalar loop.
-  void collectLoopScalars();
-
   /// Return true if all of the instructions in the block can be speculatively
   /// executed. \p SafePtrs is a list of addresses that are known to be legal
   /// and we know that we can read from them without segfault.
@@ -1823,12 +1793,6 @@
   /// vars which can be accessed from outside the loop.
   SmallPtrSet<Value *, 4> AllowedExit;
 
-  /// Holds the instructions known to be uniform after vectorization.
-  SmallPtrSet<Instruction *, 4> Uniforms;
-
-  /// Holds the instructions known to be scalar after vectorization.
-  SmallPtrSet<Instruction *, 4> Scalars;
-
   /// Can we assume the absence of NaNs.
   bool HasFunNoNaNAttr;
 
@@ -1885,6 +1849,13 @@
   unsigned selectInterleaveCount(bool OptForSize, unsigned VF,
                                  unsigned LoopCost);
 
+  /// A memory access instruction may be vectorized in more than one way, and
+  /// its form after vectorization depends on cost. This function makes
+  /// cost-based widening decisions for Load/Store instructions and collects
+  /// them in a map. The decision map is later used to build the lists of
+  /// loop-uniform and loop-scalar instructions.
+  void setCostBasedWideningDecision(unsigned VF);
+
   /// \brief A struct that represents some properties of the register usage
   /// of a loop.
   struct RegisterUsage {
@@ -1919,11 +1890,68 @@
     return Scalars->second.count(I);
   }
 
+  /// Returns true if \p I is known to be uniform after vectorization.
+  bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
+    if (VF == 1)
+      return true;
+    auto UniformsPerVF = Uniforms.find(VF);
+    assert(UniformsPerVF != Uniforms.end() &&
+           "VF not yet analyzed for uniformity");
+    return UniformsPerVF->second.count(I);
+  }
+
+  /// Returns true if \p I is known to be scalar after vectorization.
+  bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
+    if (VF == 1)
+      return true;
+    auto ScalarsPerVF = Scalars.find(VF);
+    assert(ScalarsPerVF != Scalars.end() &&
+           "Scalar values are not calculated for VF");
+    return ScalarsPerVF->second.count(I);
+  }
+
   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
   /// for vectorization factor \p VF.
   bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
     return VF > 1 && MinBWs.count(I) && !isProfitableToScalarize(I, VF) &&
-           !Legal->isScalarAfterVectorization(I);
+           !isScalarAfterVectorization(I, VF);
+  }
+
+  /// The widening decision taken by the cost model for a memory instruction.
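+  /// For example, on a target without gather/scatter support, a strided load
+  /// such as A[3 * i] cannot be widened and would typically be scalarized
+  /// (CM_DECISION_SCALARIZE), while a unit-stride load A[i] would be widened
+  /// (CM_DECISION_WIDEN).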
+  enum InstWidening {
+    CM_DECISION_NONE,
+    CM_DECISION_WIDEN,
+    CM_DECISION_INTERLEAVE,
+    CM_DECISION_GATHER_SCATTER,
+    CM_DECISION_SCALARIZE
+  };
+
+  /// Save the vectorization decision \p W taken by the cost model for
+  /// instruction \p I and vector width \p VF.
+  void setWideningDecision(Instruction *I, unsigned VF, InstWidening W) {
+    assert(VF >= 2 && "Expected VF >=2");
+    WideningDecisions[std::make_pair(I, VF)] = W;
+  }
+
+  /// Save the vectorization decision \p W taken by the cost model for the
+  /// interleave group \p Grp and vector width \p VF.
+  void setWideningDecision(const InterleaveGroup *Grp, unsigned VF,
+                           InstWidening W) {
+    assert(VF >= 2 && "Expected VF >=2");
+    // Broadcast this decision to all instructions inside the group.
+    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
+      if (auto *I = Grp->getMember(i))
+        WideningDecisions[std::make_pair(I, VF)] = W;
+    }
+  }
+
+  /// Return the cost model decision for the given instruction \p I and vector
+  /// width \p VF. Return CM_DECISION_NONE if this instruction was not
+  /// considered by the cost model.
+  InstWidening getWideningDecision(Instruction *I, unsigned VF) {
+    assert(VF >= 2 && "Expected VF >=2");
+    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
+    auto Itr = WideningDecisions.find(InstOnVF);
+    if (Itr == WideningDecisions.end())
+      return CM_DECISION_NONE;
+    return Itr->second;
   }
 
 private:
@@ -1950,6 +1978,12 @@
   /// the vector type as an output parameter.
   unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
 
+  /// The cost computation for a scalarized memory instruction.
+  unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
+
+  /// The cost computation for an interleave group of memory instructions.
+  unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
+
   /// Returns whether the instruction is a load or store and will be emitted
   /// as a vector operation.
   bool isConsecutiveLoadOrStore(Instruction *I);
@@ -1979,6 +2013,14 @@
   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
   DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
 
+  /// Holds the instructions known to be uniform after vectorization.
+  /// The data is collected per VF.
+  DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
+
+  /// Holds the instructions known to be scalar after vectorization.
+  /// The data is collected per VF.
+  DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
+
   /// Returns the expected difference in cost from scalarizing the expression
   /// feeding a predicated instruction \p PredInst. The instructions to
   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
@@ -1991,6 +2033,43 @@
   /// the loop.
   void collectInstsToScalarize(unsigned VF);
 
+  /// Collect the instructions that are uniform after vectorization. An
+  /// instruction is uniform if we represent it with a single scalar value in
+  /// the vectorized loop corresponding to each vector iteration. Examples of
+  /// uniform instructions include pointer operands of consecutive or
+  /// interleaved memory accesses. Note that although uniformity implies an
+  /// instruction will be scalar, the reverse is not true. In general, a
+  /// scalarized instruction will be represented by VF scalar values in the
+  /// vectorized loop, each corresponding to an iteration of the original
+  /// scalar loop.
+  void collectLoopUniforms(unsigned VF);
+
+  /// Collect the instructions that are scalar after vectorization. An
+  /// instruction is scalar if it is known to be uniform or will be scalarized
+  /// during vectorization. Non-uniform scalarized instructions will be
+  /// represented by VF values in the vectorized loop, each corresponding to an
+  /// iteration of the original scalar loop.
+  void collectLoopScalars(unsigned VF);
+
+  /// Collect Uniform and Scalar values for the given \p VF.
+  /// The sets depend on the CM decisions for Load/Store instructions, which
+  /// may be vectorized by widening, as an interleave group, as a
+  /// gather/scatter, or by scalarization.
+  void collectUniformsAndScalars(unsigned VF) {
+    // Do the analysis only once per VF.
+    if (VF == 1 || Uniforms.count(VF))
+      return;
+    setCostBasedWideningDecision(VF);
+    collectLoopUniforms(VF);
+    collectLoopScalars(VF);
+  }
+
+  /// Keeps cost model widening decisions, keyed by instruction and VF.
+  /// Currently it is used for memory instructions only.
+  typedef DenseMap<std::pair<Instruction *, unsigned>, InstWidening>
+      DecisionList;
+
+  DecisionList WideningDecisions;
+
 public:
   /// The loop that we evaluate.
   Loop *TheLoop;
@@ -2224,7 +2303,7 @@
 }
 
 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
-  return Legal->isScalarAfterVectorization(I) ||
+  return Cost->isScalarAfterVectorization(I, VF) ||
          Cost->isProfitableToScalarize(I, VF);
 }
 
@@ -2400,7 +2479,7 @@
   // iteration. If EntryVal is uniform, we only need to generate the first
   // lane. Otherwise, we generate all VF values.
   unsigned Lanes =
-      Legal->isUniformAfterVectorization(cast<Instruction>(EntryVal)) ? 1 : VF;
+    Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1 : VF;
 
   // Compute the scalar steps and save the results in VectorLoopValueMap.
   ScalarParts Entry(UF);
@@ -2468,7 +2547,7 @@
     // known to be uniform after vectorization, this corresponds to lane zero
     // of the last unroll iteration. Otherwise, the last instruction is the one
     // we created for the last vector lane of the last unroll iteration.
-    unsigned LastLane = Legal->isUniformAfterVectorization(I) ? 0 : VF - 1;
+    unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
     auto *LastInst = cast<Instruction>(getScalarValue(V, UF - 1, LastLane));
 
     // Set the insert point after the last scalarized instruction. This ensures
@@ -2485,7 +2564,7 @@
     // VectorLoopValueMap, we will only generate the insertelements once.
     for (unsigned Part = 0; Part < UF; ++Part) {
       Value *VectorValue = nullptr;
-      if (Legal->isUniformAfterVectorization(I)) {
+      if (Cost->isUniformAfterVectorization(I, VF)) {
         VectorValue = getBroadcastInstrs(getScalarValue(V, Part, 0));
       } else {
         VectorValue = UndefValue::get(VectorType::get(V->getType(), VF));
@@ -2514,8 +2593,9 @@
   if (OrigLoop->isLoopInvariant(V))
     return V;
 
-  assert(Lane > 0 ? !Legal->isUniformAfterVectorization(cast<Instruction>(V))
-                  : true && "Uniform values only have lane zero");
+  assert(Lane > 0 ?
+         !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
+         : true && "Uniform values only have lane zero");
 
   // If the value from the original loop has not been vectorized, it is
   // represented by UF x VF scalar values in the new loop. Return the requested
@@ -2815,8 +2895,11 @@
 
   assert((LI || SI) && "Invalid Load/Store instruction");
 
-  // Try to vectorize the interleave group if this access is interleaved.
-  if (Legal->isAccessInterleaved(Instr))
+  LoopVectorizationCostModel::InstWidening Decision =
+      Cost->getWideningDecision(Instr, VF);
+  assert(Decision != LoopVectorizationCostModel::CM_DECISION_NONE &&
+         "CM decision should be taken at this point");
+  if (Decision == LoopVectorizationCostModel::CM_DECISION_INTERLEAVE)
     return vectorizeInterleaveGroup(Instr);
 
   Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType();
@@ -2831,17 +2914,15 @@
   unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
 
   // Scalarize the memory instruction if necessary.
-  if (Legal->memoryInstructionMustBeScalarized(Instr, VF))
+  if (Decision == LoopVectorizationCostModel::CM_DECISION_SCALARIZE)
     return scalarizeInstruction(Instr, Legal->isScalarWithPredication(Instr));
 
   // Determine if the pointer operand of the access is either consecutive or
   // reverse consecutive.
   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
   bool Reverse = ConsecutiveStride < 0;
-
-  // Determine if either a gather or scatter operation is legal.
   bool CreateGatherScatter =
-      !ConsecutiveStride && Legal->isLegalGatherOrScatter(Instr);
+      (Decision == LoopVectorizationCostModel::CM_DECISION_GATHER_SCATTER);
 
   VectorParts VectorGep;
 
@@ -3026,7 +3107,7 @@
   // Determine the number of scalars we need to generate for each unroll
   // iteration. If the instruction is uniform, we only need to generate the
   // first lane. Otherwise, we generate all VF values.
-  unsigned Lanes = Legal->isUniformAfterVectorization(Instr) ? 1 : VF;
+  unsigned Lanes = Cost->isUniformAfterVectorization(Instr, VF) ? 1 : VF;
 
   // For each vector unroll 'part':
   for (unsigned Part = 0; Part < UF; ++Part) {
@@ -4637,7 +4718,7 @@
     // Determine the number of scalars we need to generate for each unroll
     // iteration. If the instruction is uniform, we only need to generate the
     // first lane. Otherwise, we generate all VF values.
-    unsigned Lanes = Legal->isUniformAfterVectorization(P) ? 1 : VF;
+    unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
     // These are the scalar results. Notice that we don't generate vector GEPs
     // because scalar GEPs result in better code.
     ScalarParts Entry(UF);
@@ -5141,12 +5222,6 @@
   if (UseInterleaved)
     InterleaveInfo.analyzeInterleaving(*getSymbolicStrides());
 
-  // Collect all instructions that are known to be uniform after vectorization.
-  collectLoopUniforms();
-
-  // Collect all instructions that are known to be scalar after vectorization.
-  collectLoopScalars();
-
   unsigned SCEVThreshold = VectorizeSCEVCheckThreshold;
   if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
     SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;
@@ -5412,10 +5487,18 @@
   return true;
 }
 
-void LoopVectorizationLegality::collectLoopScalars() {
+void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
+  // We should not collect Scalars more than once per VF. Right now, this
+  // function is called from collectUniformsAndScalars(), which already does
+  // this check. Collecting Scalars for VF = 1 does not make any sense.
+  assert(VF >= 2 && !Scalars.count(VF) &&
+         "This function should not be visited twice for the same VF");
 
   // If an instruction is uniform after vectorization, it will remain scalar.
-  Scalars.insert(Uniforms.begin(), Uniforms.end());
+  Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
 
   // Collect the getelementptr instructions that will not be vectorized. A
   // getelementptr instruction is only vectorized if it is used for a legal
@@ -5423,21 +5506,21 @@
   for (auto *BB : TheLoop->blocks())
     for (auto &I : *BB) {
       if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
-        Scalars.insert(GEP);
+        Scalars[VF].insert(GEP);
         continue;
       }
       auto *Ptr = getPointerOperand(&I);
       if (!Ptr)
         continue;
       auto *GEP = getGEPInstruction(Ptr);
-      if (GEP && isLegalGatherOrScatter(&I))
-        Scalars.erase(GEP);
+      if (GEP && getWideningDecision(&I, VF) == CM_DECISION_GATHER_SCATTER)
+        Scalars[VF].erase(GEP);
     }
 
   // An induction variable will remain scalar if all users of the induction
   // variable and induction variable update remain scalar.
   auto *Latch = TheLoop->getLoopLatch();
-  for (auto &Induction : *getInductionVars()) {
+  for (auto &Induction : *Legal->getInductionVars()) {
     auto *Ind = Induction.first;
     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
 
@@ -5445,7 +5528,7 @@
     // vectorization.
     auto ScalarInd = all_of(Ind->users(), [&](User *U) -> bool {
       auto *I = cast<Instruction>(U);
-      return I == IndUpdate || !TheLoop->contains(I) || Scalars.count(I);
+      return I == IndUpdate || !TheLoop->contains(I) || Scalars[VF].count(I);
     });
     if (!ScalarInd)
       continue;
@@ -5454,25 +5537,17 @@
     // scalar after vectorization.
     auto ScalarIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
       auto *I = cast<Instruction>(U);
-      return I == Ind || !TheLoop->contains(I) || Scalars.count(I);
+      return I == Ind || !TheLoop->contains(I) || Scalars[VF].count(I);
     });
     if (!ScalarIndUpdate)
       continue;
 
     // The induction variable and its update instruction will remain scalar.
-    Scalars.insert(Ind);
-    Scalars.insert(IndUpdate);
+    Scalars[VF].insert(Ind);
+    Scalars[VF].insert(IndUpdate);
   }
 }
 
-bool LoopVectorizationLegality::hasConsecutiveLikePtrOperand(Instruction *I) {
-  if (isAccessInterleaved(I))
-    return true;
-  if (auto *Ptr = getPointerOperand(I))
-    return isConsecutivePtr(Ptr);
-  return false;
-}
-
 bool LoopVectorizationLegality::isScalarWithPredication(Instruction *I) {
   if (!blockNeedsPredication(I->getParent()))
     return false;
@@ -5490,14 +5565,8 @@
   return false;
 }
 
-bool LoopVectorizationLegality::memoryInstructionMustBeScalarized(
-    Instruction *I, unsigned VF) {
-
-  // If the memory instruction is in an interleaved group, it will be
-  // vectorized and its pointer will remain uniform.
-  if (isAccessInterleaved(I))
-    return false;
-
+bool LoopVectorizationLegality::memoryInstructionCanBeWidened(Instruction *I,
+                                                              unsigned VF) {
   // Get and ensure we have a valid memory instruction.
   LoadInst *LI = dyn_cast<LoadInst>(I);
   StoreInst *SI = dyn_cast<StoreInst>(I);
@@ -5507,31 +5576,40 @@
   // will be scalarized.
   auto *Ptr = getPointerOperand(I);
   if (LI && isUniform(Ptr))
-    return true;
+    return false;
 
-  // If the pointer operand is non-consecutive and neither a gather nor a
-  // scatter operation is legal, the memory instruction will be scalarized.
-  if (!isConsecutivePtr(Ptr) && !isLegalGatherOrScatter(I))
-    return true;
+  if (!isConsecutivePtr(Ptr))
+    return false;
 
   // If the instruction is a store located in a predicated block, it will be
   // scalarized.
   if (isScalarWithPredication(I))
-    return true;
+    return false;
 
   // If the instruction's allocated size doesn't equal it's type size, it
   // requires padding and will be scalarized.
   auto &DL = I->getModule()->getDataLayout();
   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
   if (hasIrregularType(ScalarTy, DL, VF))
-    return true;
+    return false;
 
-  // Otherwise, the memory instruction should be vectorized if the rest of the
-  // loop is.
-  return false;
+  return true;
 }
 
-void LoopVectorizationLegality::collectLoopUniforms() {
+void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
+  // We should not collect Uniforms more than once per VF. Right now, this
+  // function is called from collectUniformsAndScalars(), which already does
+  // this check. Collecting Uniforms for VF = 1 does not make any sense.
+  assert(VF >= 2 && !Uniforms.count(VF) &&
+         "This function should not be visited twice for the same VF");
+
+  // Create an entry for this VF even if no uniform value is found, so that we
+  // do not analyze it again: Uniforms.count(VF) will then return 1.
+  Uniforms[VF].clear();
+
   // We now know that the loop is vectorizable!
   // Collect instructions inside the loop that will remain uniform after
   // vectorization.
@@ -5564,6 +5642,14 @@
   // Holds pointer operands of instructions that are possibly non-uniform.
   SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
 
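+  // A pointer operand can remain uniform only if the cost model has decided
+  // to widen or interleave the corresponding access; gather, scatter, and
+  // scalarized accesses need an address for every vector lane.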
+  auto isUniformDecision = [&](Instruction *I, unsigned VF) {
+    InstWidening WideningDecision = getWideningDecision(I, VF);
+    assert(WideningDecision != CM_DECISION_NONE &&
+           "Widening decision should be ready at this moment");
+
+    return (WideningDecision == CM_DECISION_WIDEN ||
+            WideningDecision == CM_DECISION_INTERLEAVE);
+  };
   // Iterate over the instructions in the loop, and collect all
   // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
   // that a consecutive-like pointer operand will be scalarized, we collect it
@@ -5586,25 +5672,18 @@
         return getPointerOperand(U) == Ptr;
       });
 
-      // Ensure the memory instruction will not be scalarized, making its
-      // pointer operand non-uniform. If the pointer operand is used by some
-      // instruction other than a memory access, we're not going to check if
-      // that other instruction may be scalarized here. Thus, conservatively
-      // assume the pointer operand may be non-uniform.
-      if (!UsersAreMemAccesses || memoryInstructionMustBeScalarized(&I))
+      // Ensure the memory instruction will not be scalarized or used by
+      // gather/scatter, making its pointer operand non-uniform. If the pointer
+      // operand is used by any instruction other than a memory access, we
+      // conservatively assume the pointer operand may be non-uniform.
+      if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
         PossibleNonUniformPtrs.insert(Ptr);
 
       // If the memory instruction will be vectorized and its pointer operand
-      // is consecutive-like, the pointer operand should remain uniform.
-      else if (hasConsecutiveLikePtrOperand(&I))
-        ConsecutiveLikePtrs.insert(Ptr);
-
-      // Otherwise, if the memory instruction will be vectorized and its
-      // pointer operand is non-consecutive-like, the memory instruction should
-      // be a gather or scatter operation. Its pointer operand will be
-      // non-uniform.
+      // is consecutive-like, or it belongs to an interleaved group, the
+      // pointer operand should remain uniform.
       else
-        PossibleNonUniformPtrs.insert(Ptr);
+        ConsecutiveLikePtrs.insert(Ptr);
     }
 
   // Add to the Worklist all consecutive and consecutive-like pointers that
@@ -5639,7 +5718,7 @@
   // Returns true if Ptr is the pointer operand of a memory access instruction
-  // I, and I is known to not require scalarization.
+  // I, and the cost model has decided that I's pointer operand will remain
+  // uniform (the access will be widened or interleaved).
   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
-    return getPointerOperand(I) == Ptr && !memoryInstructionMustBeScalarized(I);
+    return getPointerOperand(I) == Ptr && isUniformDecision(I, VF);
   };
 
   // For an instruction to be added into Worklist above, all its users inside
@@ -5648,7 +5727,7 @@
   // nodes separately. An induction variable will remain uniform if all users
   // of the induction variable and induction variable update remain uniform.
   // The code below handles both pointer and non-pointer induction variables.
-  for (auto &Induction : Inductions) {
+  for (auto &Induction : *Legal->getInductionVars()) {
     auto *Ind = Induction.first;
     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
 
@@ -5679,7 +5758,7 @@
     DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate << "\n");
   }
 
-  Uniforms.insert(Worklist.begin(), Worklist.end());
+  Uniforms[VF].insert(Worklist.begin(), Worklist.end());
 }
 
 bool LoopVectorizationLegality::canVectorizeMemory() {
@@ -6192,6 +6271,8 @@
     DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
 
     Factor.Width = UserVF;
+
+    collectUniformsAndScalars(UserVF);
     collectInstsToScalarize(UserVF);
     return Factor;
   }
@@ -6558,12 +6639,13 @@
         MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size());
         continue;
       }
-
+      collectUniformsAndScalars(VFs[j]);
       // Count the number of live intervals.
       unsigned RegUsage = 0;
       for (auto Inst : OpenIntervals) {
         // Skip ignored values for VF > 1.
-        if (VecValuesToIgnore.count(Inst))
+        if (VecValuesToIgnore.count(Inst) ||
+            isScalarAfterVectorization(Inst, VFs[j]))
           continue;
         RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
       }
@@ -6632,7 +6714,7 @@
     Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
     unsigned VF) {
 
-  assert(!Legal->isUniformAfterVectorization(PredInst) &&
+  assert(!isUniformAfterVectorization(PredInst, VF) &&
          "Instruction marked uniform-after-vectorization will be predicated");
 
   // Initialize the discount to zero, meaning that the scalar version and the
@@ -6653,7 +6735,7 @@
     // already be scalar to avoid traversing chains that are unlikely to be
     // beneficial.
     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
-        Legal->isScalarAfterVectorization(I))
+        isScalarAfterVectorization(I, VF))
       return false;
 
     // If the instruction is scalar with predication, it will be analyzed
@@ -6673,7 +6755,7 @@
     // the lane zero values for uniforms rather than asserting.
     for (Use &U : I->operands())
       if (auto *J = dyn_cast<Instruction>(U.get()))
-        if (Legal->isUniformAfterVectorization(J))
+        if (isUniformAfterVectorization(J, VF))
           return false;
 
     // Otherwise, we can scalarize the instruction.
@@ -6686,7 +6768,7 @@
   // and their return values are inserted into vectors. Thus, an extract would
   // still be required.
   auto needsExtract = [&](Instruction *I) -> bool {
-    return TheLoop->contains(I) && !Legal->isScalarAfterVectorization(I);
+    return TheLoop->contains(I) && !isScalarAfterVectorization(I, VF);
   };
 
   // Compute the expected cost discount from scalarizing the entire expression
@@ -6749,6 +6831,9 @@
 LoopVectorizationCostModel::expectedCost(unsigned VF) {
   VectorizationCostTy Cost;
 
+  // Collect Uniform and Scalar instructions after vectorization with VF.
+  collectUniformsAndScalars(VF);
+
   // Collect the instructions (and their associated costs) that will be more
   // profitable to scalarize.
   collectInstsToScalarize(VF);
@@ -6828,11 +6913,83 @@
          Legal->hasStride(I->getOperand(1));
 }
 
+unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
+                                                                 unsigned VF) {
+  StoreInst *SI = dyn_cast<StoreInst>(I);
+  LoadInst *LI = dyn_cast<LoadInst>(I);
+  Type *ValTy = (SI ? SI->getValueOperand()->getType() : LI->getType());
+  auto SE = PSE.getSE();
+
+  unsigned Alignment = SI ? SI->getAlignment() : LI->getAlignment();
+  unsigned AS =
+      SI ? SI->getPointerAddressSpace() : LI->getPointerAddressSpace();
+  Value *Ptr = getPointerOperand(I);
+  Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
+
+  // Figure out whether the access is strided, and get the stride value if it
+  // is known at compile time.
+  const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, SE, TheLoop);
+
+  // Get the cost of the scalar memory instruction and address computation.
+  unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
+
+  Cost += VF *
+          TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
+                              AS);
+
+  // Get the overhead of the extractelement and insertelement instructions
+  // we might create due to scalarization.
+  Cost += getScalarizationOverhead(I, VF, TTI);
+
+  // If we have a predicated store, it may not be executed for each vector
+  // lane. Scale the cost by the probability of executing the predicated
+  // block.
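+  // For example, with the default reciprocal probability of 2 (the predicated
+  // block is assumed to execute 50% of the time), the cost is halved.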
+  if (Legal->isScalarWithPredication(I))
+    Cost /= getReciprocalPredBlockProb();
+
+  return Cost;
+}
+
+unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
+                                                            unsigned VF) {
+  StoreInst *SI = dyn_cast<StoreInst>(I);
+  LoadInst *LI = dyn_cast<LoadInst>(I);
+  Type *ValTy = (SI ? SI->getValueOperand()->getType() : LI->getType());
+  Type *VectorTy = ToVectorTy(ValTy, VF);
+  unsigned AS =
+      SI ? SI->getPointerAddressSpace() : LI->getPointerAddressSpace();
+
+  auto Group = Legal->getInterleavedAccessGroup(I);
+  assert(Group && "Fail to get an interleaved access group.");
+
+  unsigned InterleaveFactor = Group->getFactor();
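+  // The wide vector type covers all members of the group for one vector
+  // iteration; e.g., a factor-2 group of i32 accesses at VF = 4 is modeled as
+  // a single <8 x i32> memory operation.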
+  Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
+
+  // Holds the indices of existing members in an interleaved load group.
+  // An interleaved store group doesn't need this as it doesn't allow gaps.
+  SmallVector<unsigned, 4> Indices;
+  if (LI) {
+    for (unsigned i = 0; i < InterleaveFactor; i++)
+      if (Group->getMember(i))
+        Indices.push_back(i);
+  }
+
+  // Calculate the cost of the whole interleaved group.
+  unsigned Cost = TTI.getInterleavedMemoryOpCost(I->getOpcode(), WideVecTy,
+                                                 Group->getFactor(), Indices,
+                                                 Group->getAlignment(), AS);
+
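+  // A reversed group needs an extra reverse shuffle for each of its members
+  // on top of the memory cost.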
+  if (Group->isReverse())
+    Cost += Group->getNumMembers() *
+            TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
+  return Cost;
+}
+
 LoopVectorizationCostModel::VectorizationCostTy
 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
   // If we know that this instruction will remain uniform, check the cost of
   // the scalar version.
-  if (Legal->isUniformAfterVectorization(I))
+  if (isUniformAfterVectorization(I, VF))
     VF = 1;
 
   if (VF > 1 && isProfitableToScalarize(I, VF))
@@ -6846,6 +7003,92 @@
   return VectorizationCostTy(C, TypeNotScalarized);
 }
 
+void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
+  if (VF == 1)
+    return;
+  for (BasicBlock *BB : TheLoop->blocks()) {
+    // For each instruction in the old loop.
+    for (Instruction &I : *BB) {
+      if (I.getOpcode() != Instruction::Store &&
+          I.getOpcode() != Instruction::Load)
+        continue;
+
+      StoreInst *SI = dyn_cast<StoreInst>(&I);
+      LoadInst *LI = dyn_cast<LoadInst>(&I);
+
+      Type *ValTy = (SI ? SI->getValueOperand()->getType() : LI->getType());
+      Type *VectorTy = ToVectorTy(ValTy, VF);
+
+      unsigned Alignment = SI ? SI->getAlignment() : LI->getAlignment();
+      Value *Ptr = getPointerOperand(&I);
+
+      if (LI && Legal->isUniform(Ptr)) {
+        // Scalar load + broadcast
+        setWideningDecision(&I, VF, CM_DECISION_SCALARIZE);
+        continue;
+      }
+
+      // We assume that widening is the best solution when possible.
+      if (Legal->memoryInstructionCanBeWidened(&I, VF)) {
+        setWideningDecision(&I, VF, CM_DECISION_WIDEN);
+        continue;
+      }
+
+      // Choose between Interleaving, Gather/Scatter or Scalarization.
+      unsigned InterleaveGrpCost = 0;
+      unsigned InterleaveInstCost = (unsigned)(-1);
+      unsigned GatherScatterCost = (unsigned)(-1);
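+      // The per-instruction costs start out effectively infinite, so an
+      // option that turns out to be unavailable cannot win the comparison
+      // below.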
+      if (Legal->isAccessInterleaved(&I)) {
+        auto Group = Legal->getInterleavedAccessGroup(&I);
+        assert(Group && "Fail to get an interleaved access group.");
+
+        // Make one decision for the whole group.
+        if (Group->getInsertPos() != &I)
+          continue;
+
+        InterleaveGrpCost = getInterleaveGroupCost(&I, VF);
+
+        if (InterleaveGrpCost / Group->getNumMembers() <= 1) {
+          // Interleaving is cheap enough: the cost per group member is at
+          // most that of a single scalar memory operation.
+          setWideningDecision(Group, VF, CM_DECISION_INTERLEAVE);
+          continue;
+        }
+        // Interleaving is not obviously profitable here; remember the
+        // per-member cost and compare it against the other options below.
+        InterleaveInstCost = InterleaveGrpCost / Group->getNumMembers();
+      }
+
+      if (Legal->isLegalGatherOrScatter(&I)) {
+        GatherScatterCost =
+            TTI.getAddressComputationCost(VectorTy) +
+            TTI.getGatherScatterOpCost(I.getOpcode(), VectorTy, Ptr,
+                                       Legal->isMaskRequired(&I), Alignment);
+      }
+      unsigned ScalarizationCost = getMemInstScalarizationCost(&I, VF);
+
+      // Choose the cheapest option for the current VF, record the decision,
+      // and use it when vectorizing the instruction.
+      InstWidening Decision;
+      if (InterleaveInstCost <= GatherScatterCost &&
+          InterleaveInstCost <= ScalarizationCost)
+        Decision = CM_DECISION_INTERLEAVE;
+      else if (GatherScatterCost < InterleaveInstCost &&
+               GatherScatterCost < ScalarizationCost)
+        Decision = CM_DECISION_GATHER_SCATTER;
+      else
+        Decision = CM_DECISION_SCALARIZE;
+
+      // If the instruction belongs to an interleave group, the whole group
+      // receives the same decision.
+      if (auto Group = Legal->getInterleavedAccessGroup(&I))
+        setWideningDecision(Group, VF, Decision);
+      else
+        setWideningDecision(&I, VF, Decision);
+    }
+  }
+}
+
 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
                                                         unsigned VF,
                                                         Type *&VectorTy) {
@@ -7000,12 +7243,15 @@
       Cost += TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
                                   Alignment, AS);
       return Cost +
-             TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, ValTy);
+             TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
     }
 
     // For an interleaved access, calculate the total cost of the whole
     // interleave group.
-    if (Legal->isAccessInterleaved(I)) {
+    InstWidening Decision = getWideningDecision(I, VF);
+    assert(Decision != CM_DECISION_NONE &&
+           "Widening decision should be taken at this moment");
+    if (Decision == CM_DECISION_INTERLEAVE) {
       auto Group = Legal->getInterleavedAccessGroup(I);
       assert(Group && "Fail to get an interleaved access group.");
 
@@ -7037,14 +7283,11 @@
             Group->getNumMembers() *
             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
 
-      // FIXME: The interleaved load group with a huge gap could be even more
-      // expensive than scalar operations. Then we could ignore such group and
-      // use scalar operations instead.
       return Cost;
     }
 
     // Check if the memory instruction will be scalarized.
-    if (Legal->memoryInstructionMustBeScalarized(I, VF)) {
+    if (Decision == CM_DECISION_SCALARIZE) {
       unsigned Cost = 0;
       Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
 
@@ -7074,14 +7317,9 @@
     // Determine if the pointer operand of the access is either consecutive or
     // reverse consecutive.
     int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
-    bool Reverse = ConsecutiveStride < 0;
-
-    // Determine if either a gather or scatter operation is legal.
-    bool UseGatherOrScatter =
-        !ConsecutiveStride && Legal->isLegalGatherOrScatter(I);
 
     unsigned Cost = TTI.getAddressComputationCost(VectorTy);
-    if (UseGatherOrScatter) {
+    if (Decision == CM_DECISION_GATHER_SCATTER) {
       assert(ConsecutiveStride == 0 &&
              "Gather/Scatter are not used for consecutive stride");
       return Cost +
@@ -7089,12 +7327,15 @@
                                         Legal->isMaskRequired(I), Alignment);
     }
     // Wide load/stores.
+    assert(Decision == CM_DECISION_WIDEN && ConsecutiveStride != 0 &&
+           "Unexpected CM decision to handle");
     if (Legal->isMaskRequired(I))
       Cost +=
           TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
     else
       Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
 
+    bool Reverse = ConsecutiveStride < 0;
     if (Reverse)
       Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
     return Cost;
@@ -7200,19 +7441,6 @@
     SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
   }
-
-  // Insert values known to be scalar into VecValuesToIgnore. This is a
-  // conservative estimation of the values that will later be scalarized.
-  //
-  // FIXME: Even though an instruction is not scalar-after-vectoriztion, it may
-  //        still be scalarized. For example, we may find an instruction to be
-  //        more profitable for a given vectorization factor if it were to be
-  //        scalarized. But at this point, we haven't yet computed the
-  //        vectorization factor.
-  for (auto *BB : TheLoop->getBlocks())
-    for (auto &I : *BB)
-      if (Legal->isScalarAfterVectorization(&I))
-        VecValuesToIgnore.insert(&I);
 }
 
 void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
Index: ../test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll
===================================================================
--- ../test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll
+++ ../test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll
@@ -0,0 +1,37 @@
+; REQUIRES: asserts
+; RUN: opt < %s -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -S --debug-only=loop-vectorize 2>&1 | FileCheck %s
+
+; This test exposes an extremely high interleaving cost that should probably
+; be fixed. Because of that cost, interleaving is not beneficial here, and the
+; cost model chooses to scalarize the load instructions instead.
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+%pair = type { i8, i8 }
+
+; CHECK-LABEL: test
+; CHECK: Found an estimated cost of 10 for VF 2 For instruction:   {{.*}} load i8
+; CHECK: vector.body
+; CHECK: load i8
+; CHECK: load i8
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
+
+define void @test(%pair* %p, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, %pair* %p, i64 %i, i32 0
+  %tmp1 = load i8, i8* %tmp0, align 1
+  %tmp2 = getelementptr %pair, %pair* %p, i64 %i, i32 1
+  %tmp3 = load i8, i8* %tmp2, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, %n
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
Index: ../test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
===================================================================
--- ../test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
+++ ../test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
@@ -18,8 +18,9 @@
 ; CHECK-NOT: LV: Found uniform instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
 ; CHECK-NOT: LV: Found uniform instruction: %i.next = add nuw nsw i64 %i, 5
 ; CHECK:     vector.body:
+; CHECK:       %index = phi i64
 ; CHECK:       %vec.ind = phi <16 x i64>
-; CHECK:       %[[T0:.+]] = extractelement <16 x i64> %vec.ind, i32 0
+; CHECK:       %[[T0:.+]] = mul i64 %index, 5
 ; CHECK:       %[[T1:.+]] = getelementptr inbounds %data, %data* %d, i64 0, i32 3, i64 %[[T0]]
 ; CHECK:       %[[T2:.+]] = bitcast float* %[[T1]] to <80 x float>*
 ; CHECK:       load <80 x float>, <80 x float>* %[[T2]], align 4
Index: ../test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll
===================================================================
--- ../test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll
+++ ../test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll
@@ -0,0 +1,41 @@
+; RUN: opt -loop-vectorize -S -mcpu=skylake-avx512 < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; This test checks that "gather" operation is choosen since it's cost is better
+; than interleaving pattern.
+;
+;unsigned long A[SIZE];
+;unsigned long B[SIZE];
+;
+;void foo() {
+;  for (int i=0; i<N; i+=8) {
+;    B[i] = A[i] + 5;
+;  }
+;}
+
+@A = global [10240 x i64] zeroinitializer, align 16
+@B = global [10240 x i64] zeroinitializer, align 16
+
+
+; CHECK-LABEL: strided_load_i64
+; CHECK: masked.gather
+define void @strided_load_i64() {
+  br label %1
+
+; <label>:1:                                      ; preds = %0, %1
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds [10240 x i64], [10240 x i64]* @A, i64 0, i64 %indvars.iv
+  %3 = load i64, i64* %2, align 16
+  %4 = add i64 %3, 5
+  %5 = getelementptr inbounds [10240 x i64], [10240 x i64]* @B, i64 0, i64 %indvars.iv
+  store i64 %4, i64* %5, align 16
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 8
+  %6 = icmp slt i64 %indvars.iv.next, 1024
+  br i1 %6, label %1, label %7
+
+; <label>:7:                                      ; preds = %1
+  ret void
+}
+