Index: ../lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- ../lib/Transforms/Vectorize/LoopVectorize.cpp
+++ ../lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1694,6 +1694,50 @@
   /// during vectorization.
   bool memoryInstructionMustBeScalarized(Instruction *I, unsigned VF = 1);
 
+  /// Returns true if \p I is a memory instruction with consecutive memory
+  /// access that can be widened.
+  bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
+
+  /// Decision that was taken during cost calculation for a memory
+  /// instruction.
+  enum InstWidening {
+    LV_NONE,
+    LV_WIDEN,
+    LV_INTERLEAVE,
+    LV_GATHER_SCATTER,
+    LV_SCALARIZE
+  };
+
+  typedef DenseMap<std::pair<Instruction *, unsigned>, InstWidening>
+      DecisionList;
+
+  /// Save vectorization decision \p W taken by the cost model for
+  /// instruction \p I and vector width \p VF.
+  void setCostModelDecision(Instruction *I, unsigned VF, InstWidening W) {
+    assert(VF >= 2 && "Expected VF >= 2");
+    WideningDecisions[std::make_pair(I, VF)] = W;
+  }
+
+  /// Save vectorization decision \p W taken by the cost model for
+  /// interleaving group \p Grp and vector width \p VF.
+  void setCostModelDecision(const InterleaveGroup *Grp, unsigned VF,
+                            InstWidening W) {
+    assert(VF >= 2 && "Expected VF >= 2");
+    // Broadcast the decision to all instructions inside the group.
+    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
+      if (auto *I = Grp->getMember(i))
+        WideningDecisions[std::make_pair(I, VF)] = W;
+    }
+  }
+
+  /// Return the cost model decision for the given instruction \p I and
+  /// vector width \p VF. Return LV_NONE if this instruction did not pass
+  /// through the cost modeling.
+  InstWidening getCostModelDecision(Instruction *I, unsigned VF) {
+    assert(VF >= 2 && "Expected VF >= 2");
+    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
+    if (!WideningDecisions.count(InstOnVF))
+      return LV_NONE;
+    return WideningDecisions[InstOnVF];
+  }
+
 private:
   /// Check if a single basic block loop is vectorizable.
   /// At this point we know that this is a loop with a constant trip count
@@ -1835,6 +1879,10 @@
   /// While vectorizing these instructions we have to generate a
   /// call to the appropriate masked intrinsic
   SmallPtrSet<const Instruction *, 8> MaskedOp;
+
+  /// Keeps cost model decisions for instructions.
+  /// Right now it is used for memory instructions only.
+  DecisionList WideningDecisions;
 };
 
 /// LoopVectorizationCostModel - estimates the expected speedups due to
@@ -1951,6 +1999,12 @@
   /// the vector type as an output parameter.
   unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
 
+  /// The cost computation for a scalarized memory instruction.
+  unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
+
+  /// The cost computation for an interleaving group of memory instructions.
+  unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
+
   /// Returns whether the instruction is a load or store and will be a emitted
   /// as a vector operation.
   bool isConsecutiveLoadOrStore(Instruction *I);
@@ -2818,10 +2872,22 @@
 
   assert((LI || SI) && "Invalid Load/Store instruction");
 
+  // The vectorization decision taken during the cost computation.
+  LoopVectorizationLegality::InstWidening Decision =
+      Legal->getCostModelDecision(Instr, VF);
+  bool NoCostModelDecision = (Decision == LoopVectorizationLegality::LV_NONE);
+
   // Try to vectorize the interleave group if this access is interleaved.
-  if (Legal->isAccessInterleaved(Instr))
+  if (Decision == LoopVectorizationLegality::LV_INTERLEAVE ||
+      (NoCostModelDecision && Legal->isAccessInterleaved(Instr)))
     return vectorizeInterleaveGroup(Instr);
 
+  // Scalarize the memory instruction if necessary.
+  if (Decision == LoopVectorizationLegality::LV_SCALARIZE ||
+      (NoCostModelDecision &&
+       Legal->memoryInstructionMustBeScalarized(Instr, VF)))
+    return scalarizeInstruction(Instr, Legal->isScalarWithPredication(Instr));
+
   Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType();
   Type *DataTy = VectorType::get(ScalarDataTy, VF);
   Value *Ptr = getPointerOperand(Instr);
@@ -2833,18 +2899,14 @@
     Alignment = DL.getABITypeAlignment(ScalarDataTy);
   unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
 
-  // Scalarize the memory instruction if necessary.
-  if (Legal->memoryInstructionMustBeScalarized(Instr, VF))
-    return scalarizeInstruction(Instr, Legal->isScalarWithPredication(Instr));
-
   // Determine if the pointer operand of the access is either consecutive or
   // reverse consecutive.
   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
   bool Reverse = ConsecutiveStride < 0;
-
-  // Determine if either a gather or scatter operation is legal.
   bool CreateGatherScatter =
-      !ConsecutiveStride && Legal->isLegalGatherOrScatter(Instr);
+      (Decision == LoopVectorizationLegality::LV_GATHER_SCATTER ||
+       (NoCostModelDecision && !ConsecutiveStride &&
+        Legal->isLegalGatherOrScatter(Instr)));
 
   VectorParts VectorGep;
@@ -5487,14 +5549,8 @@
     return false;
 }
 
-bool LoopVectorizationLegality::memoryInstructionMustBeScalarized(
-    Instruction *I, unsigned VF) {
-
-  // If the memory instruction is in an interleaved group, it will be
-  // vectorized and its pointer will remain uniform.
-  if (isAccessInterleaved(I))
-    return false;
-
+bool LoopVectorizationLegality::memoryInstructionCanBeWidened(Instruction *I,
+                                                              unsigned VF) {
   // Get and ensure we have a valid memory instruction.
   LoadInst *LI = dyn_cast<LoadInst>(I);
   StoreInst *SI = dyn_cast<StoreInst>(I);
@@ -5504,28 +5560,35 @@
   // will be scalarized.
   auto *Ptr = getPointerOperand(I);
   if (LI && isUniform(Ptr))
-    return true;
+    return false;
 
-  // If the pointer operand is non-consecutive and neither a gather nor a
-  // scatter operation is legal, the memory instruction will be scalarized.
-  if (!isConsecutivePtr(Ptr) && !isLegalGatherOrScatter(I))
-    return true;
+  if (!isConsecutivePtr(Ptr))
+    return false;
 
   // If the instruction is a store located in a predicated block, it will be
   // scalarized.
   if (isScalarWithPredication(I))
-    return true;
+    return false;
 
   // If the instruction's allocated size doesn't equal it's type size, it
   // requires padding and will be scalarized.
   auto &DL = I->getModule()->getDataLayout();
   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
   if (hasIrregularType(ScalarTy, DL, VF))
-    return true;
+    return false;
 
-  // Otherwise, the memory instruction should be vectorized if the rest of the
-  // loop is.
-  return false;
+  return true;
+}
+
+bool LoopVectorizationLegality::memoryInstructionMustBeScalarized(
+    Instruction *I, unsigned VF) {
+
+  // If the memory instruction is in an interleaved group, it will be
+  // vectorized and its pointer will remain uniform. An access for which a
+  // gather or scatter is legal is not scalarized either.
+  if (isAccessInterleaved(I) || isLegalGatherOrScatter(I))
+    return false;
+
+  return !memoryInstructionCanBeWidened(I, VF);
 }
 
 void LoopVectorizationLegality::collectLoopUniforms() {
@@ -6843,6 +6906,79 @@
          Legal->hasStride(I->getOperand(1));
 }
 
+unsigned
+LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
+                                                        unsigned VF) {
+  StoreInst *SI = dyn_cast<StoreInst>(I);
+  LoadInst *LI = dyn_cast<LoadInst>(I);
+  Type *ValTy = (SI ? SI->getValueOperand()->getType() : LI->getType());
+  auto SE = PSE.getSE();
+
+  unsigned Alignment = SI ? SI->getAlignment() : LI->getAlignment();
+  unsigned AS =
+      SI ? SI->getPointerAddressSpace() : LI->getPointerAddressSpace();
+  Value *Ptr = getPointerOperand(I);
+  Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
+
+  // True if the memory instruction's address computation is complex.
+  bool IsComplexComputation =
+      isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop);
+
+  // Get the cost of the scalar memory instruction and address computation.
+  unsigned Cost =
+      VF * TTI.getAddressComputationCost(PtrTy, IsComplexComputation);
+  Cost += VF *
+          TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
+                              Alignment, AS);
+
+  // Get the overhead of the extractelement and insertelement instructions
+  // we might create due to scalarization.
+  Cost += getScalarizationOverhead(I, VF, TTI);
+
+  // If we have a predicated store, it may not be executed for each vector
+  // lane. Scale the cost by the probability of executing the predicated
+  // block.
+  if (Legal->isScalarWithPredication(I))
+    Cost /= getReciprocalPredBlockProb();
+
+  return Cost;
+}
+
+unsigned
+LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
+                                                   unsigned VF) {
+  StoreInst *SI = dyn_cast<StoreInst>(I);
+  LoadInst *LI = dyn_cast<LoadInst>(I);
+  Type *ValTy = (SI ? SI->getValueOperand()->getType() : LI->getType());
+  Type *VectorTy = ToVectorTy(ValTy, VF);
+  unsigned AS =
+      SI ? SI->getPointerAddressSpace() : LI->getPointerAddressSpace();
+
+  auto Group = Legal->getInterleavedAccessGroup(I);
+  assert(Group && "Fail to get an interleaved access group.");
+
+  unsigned InterleaveFactor = Group->getFactor();
+  Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
+
+  // Holds the indices of existing members in an interleaved load group.
+  // An interleaved store group doesn't need this as it doesn't allow gaps.
+  SmallVector<unsigned, 4> Indices;
+  if (LI) {
+    for (unsigned i = 0; i < InterleaveFactor; i++)
+      if (Group->getMember(i))
+        Indices.push_back(i);
+  }
+
+  // Calculate the cost of the whole interleaved group.
+  unsigned Cost = TTI.getInterleavedMemoryOpCost(
+      I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
+      Group->getAlignment(), AS);
+
+  if (Group->isReverse())
+    Cost += Group->getNumMembers() *
+            TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
+  return Cost;
+}
+
 LoopVectorizationCostModel::VectorizationCostTy
 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
   // If we know that this instruction will remain uniform, check the cost of
@@ -7011,107 +7147,100 @@
 
     if (LI && Legal->isUniform(Ptr)) {
       // Scalar load + broadcast
+      Legal->setCostModelDecision(I, VF,
+                                  LoopVectorizationLegality::LV_SCALARIZE);
       unsigned Cost = TTI.getAddressComputationCost(ValTy->getScalarType());
       Cost += TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
                                   Alignment, AS);
       return Cost +
-             TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, ValTy);
+             TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
     }
 
-    // For an interleaved access, calculate the total cost of the whole
-    // interleave group.
-    if (Legal->isAccessInterleaved(I)) {
-      auto Group = Legal->getInterleavedAccessGroup(I);
-      assert(Group && "Fail to get an interleaved access group.");
+    // We assume that widening is the best solution when possible.
+    if (Legal->memoryInstructionCanBeWidened(I, VF)) {
+      int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
+      assert(ConsecutiveStride != 0 &&
+             "Can't widen non-consecutive memory instruction");
+      bool Reverse = ConsecutiveStride < 0;
 
-      // Only calculate the cost once at the insert position.
-      if (Group->getInsertPos() != I)
-        return 0;
-
-      unsigned InterleaveFactor = Group->getFactor();
-      Type *WideVecTy =
-          VectorType::get(VectorTy->getVectorElementType(),
-                          VectorTy->getVectorNumElements() * InterleaveFactor);
-
-      // Holds the indices of existing members in an interleaved load group.
-      // An interleaved store group doesn't need this as it doesn't allow gaps.
-      SmallVector<unsigned, 4> Indices;
-      if (LI) {
-        for (unsigned i = 0; i < InterleaveFactor; i++)
-          if (Group->getMember(i))
-            Indices.push_back(i);
-      }
-
-      // Calculate the cost of the whole interleaved group.
-      unsigned Cost = TTI.getInterleavedMemoryOpCost(
-          I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
-          Group->getAlignment(), AS);
-
-      if (Group->isReverse())
+      unsigned Cost = TTI.getAddressComputationCost(VectorTy);
+      if (Legal->isMaskRequired(I))
         Cost +=
-            Group->getNumMembers() *
-            TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
+            TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
+      else
+        Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
 
-      // FIXME: The interleaved load group with a huge gap could be even more
-      // expensive than scalar operations. Then we could ignore such group and
-      // use scalar operations instead.
+      if (Reverse)
+        Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
+                                   0);
+      Legal->setCostModelDecision(I, VF, LoopVectorizationLegality::LV_WIDEN);
       return Cost;
     }
 
-    // Check if the memory instruction will be scalarized.
-    if (Legal->memoryInstructionMustBeScalarized(I, VF)) {
-      unsigned Cost = 0;
-      Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
-
-      // True if the memory instruction's address computation is complex.
-      bool IsComplexComputation =
-          isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop);
-
-      // Get the cost of the scalar memory instruction and address computation.
-      Cost += VF * TTI.getAddressComputationCost(PtrTy, IsComplexComputation);
-      Cost += VF *
-              TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
-                                  Alignment, AS);
-
-      // Get the overhead of the extractelement and insertelement instructions
-      // we might create due to scalarization.
-      Cost += getScalarizationOverhead(I, VF, TTI);
-
-      // If we have a predicated store, it may not be executed for each vector
-      // lane. Scale the cost by the probability of executing the predicated
-      // block.
-      if (Legal->isScalarWithPredication(I))
-        Cost /= getReciprocalPredBlockProb();
-
-      return Cost;
-    }
+    // For an interleaved access, calculate the total cost of the whole
+    // interleave group.
+    unsigned InterleaveGrpCost = 0;
+    unsigned InterleaveInstCost = (unsigned)(-1);
+    unsigned GatherScatterCost = (unsigned)(-1);
+    if (Legal->isAccessInterleaved(I)) {
+      auto Group = Legal->getInterleavedAccessGroup(I);
+      assert(Group && "Fail to get an interleaved access group.");
 
-    // Determine if the pointer operand of the access is either consecutive or
-    // reverse consecutive.
-    int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
-    bool Reverse = ConsecutiveStride < 0;
-
-    // Determine if either a gather or scatter operation is legal.
-    bool UseGatherOrScatter =
-        !ConsecutiveStride && Legal->isLegalGatherOrScatter(I);
-
-    unsigned Cost = TTI.getAddressComputationCost(VectorTy);
-    if (UseGatherOrScatter) {
-      assert(ConsecutiveStride == 0 &&
-             "Gather/Scatter are not used for consecutive stride");
-      return Cost +
-             TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
-                                        Legal->isMaskRequired(I), Alignment);
+      LoopVectorizationLegality::InstWidening Decision =
+          Legal->getCostModelDecision(I, VF);
+      if (Decision == LoopVectorizationLegality::LV_NONE) {
+        // Only calculate the cost once for the group.
+        InterleaveGrpCost = getInterleaveGroupCost(I, VF);
+
+        if (Group->getNumMembers() > 1)
+          DEBUG(dbgs() << "LV: Found an estimated cost of "
+                       << InterleaveGrpCost << " for VF " << VF
+                       << " For interleaving group: " << *I << '\n');
+
+        if (InterleaveGrpCost / Group->getNumMembers() <= 1) {
+          // The interleaving cost is good enough.
+          Legal->setCostModelDecision(Group, VF,
+                                      LoopVectorizationLegality::LV_INTERLEAVE);
+          return InterleaveGrpCost;
+        }
+        // We are going to check all other options.
+        InterleaveInstCost = InterleaveGrpCost / Group->getNumMembers();
+      } else if (Decision == LoopVectorizationLegality::LV_INTERLEAVE)
+        // This instruction belongs to an interleaving group and will be
+        // vectorized inside this group.
+        return 0;
+      // If we reach this point, the instruction belongs to an interleaving
+      // group, but interleaving is not profitable for the current VF, so we
+      // need to choose another option.
+    }
+
+    if (Legal->isLegalGatherOrScatter(I)) {
+      GatherScatterCost = TTI.getAddressComputationCost(VectorTy) +
+          TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
+                                     Legal->isMaskRequired(I), Alignment);
+    }
+    unsigned ScalarizationCost = getMemInstScalarizationCost(I, VF);
+
+    // Choose the cheapest option for the current VF and return its cost.
+    // Record the decision so it can be reused during vectorization.
+    unsigned Cost;
+    LoopVectorizationLegality::InstWidening Decision;
+    if (InterleaveInstCost <= GatherScatterCost &&
+        InterleaveInstCost <= ScalarizationCost) {
+      Decision = LoopVectorizationLegality::LV_INTERLEAVE;
+      Cost = InterleaveGrpCost;
+    } else if (GatherScatterCost < InterleaveInstCost &&
+               GatherScatterCost < ScalarizationCost) {
+      Decision = LoopVectorizationLegality::LV_GATHER_SCATTER;
+      Cost = GatherScatterCost;
+    } else {
+      Cost = ScalarizationCost;
+      Decision = LoopVectorizationLegality::LV_SCALARIZE;
+    }
 
-    // Wide load/stores.
-    if (Legal->isMaskRequired(I))
-      Cost +=
-          TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
+    if (auto Group = Legal->getInterleavedAccessGroup(I))
+      Legal->setCostModelDecision(Group, VF, Decision);
     else
-      Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
-
-    if (Reverse)
-      Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
+      Legal->setCostModelDecision(I, VF, Decision);
 
     return Cost;
   }
   case Instruction::ZExt:
Index: ../test/Analysis/CostModel/X86/interleave-load-i32.ll
===================================================================
--- ../test/Analysis/CostModel/X86/interleave-load-i32.ll
+++ ../test/Analysis/CostModel/X86/interleave-load-i32.ll
@@ -10,11 +10,10 @@
 ; Function Attrs: nounwind uwtable
 define void @load_i32_interleave4() {
 ;CHECK-LABEL: load_i32_interleave4
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %0 = load
-;CHECK: Found an estimated cost of 5 for VF 2 For instruction: %0 = load
-;CHECK: Found an estimated cost of 5 for VF 4 For instruction: %0 = load
-;CHECK: Found an estimated cost of 8 for VF 8 For instruction: %0 = load
-;CHECK: Found an estimated cost of 22 for VF 16 For instruction: %0 = load
+;CHECK: Found an estimated cost of 5 for VF 2 For interleaving group: {{.*}} = load
+;CHECK: Found an estimated cost of 5 for VF 4 For interleaving group: {{.*}} = load
+;CHECK: Found an estimated cost of 8 for VF 8 For interleaving group: {{.*}} = load
+;CHECK: Found an estimated cost of 22 for VF 16 For interleaving group: {{.*}} = load
 entry:
   br label %for.body
 
@@ -46,11 +45,10 @@
 
 define void @load_i32_interleave5() {
 ;CHECK-LABEL: load_i32_interleave5
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %0 = load
-;CHECK: Found an estimated cost of 6 for VF 2 For instruction: %0 = load
-;CHECK: Found an estimated cost of 9 for VF 4 For instruction: %0 = load
-;CHECK: Found an estimated cost of 18 for VF 8 For instruction: %0 = load
-;CHECK: Found an estimated cost of 35 for VF 16 For instruction: %0 = load
+;CHECK: Found an estimated cost of 6 for VF 2 For interleaving group: {{.*}} = load
+;CHECK: Found an estimated cost of 9 for VF 4 For interleaving group: {{.*}} = load
+;CHECK: Found an estimated cost of 18 for VF 8 For interleaving group: {{.*}} = load
+;CHECK: Found an estimated cost of 35 for VF 16 For interleaving group: {{.*}} = load
 entry:
   br label %for.body
Index: ../test/Analysis/CostModel/X86/interleave-store-i32.ll
===================================================================
--- ../test/Analysis/CostModel/X86/interleave-store-i32.ll
+++ ../test/Analysis/CostModel/X86/interleave-store-i32.ll
@@ -10,11 +10,10 @@
 ; Function Attrs: nounwind uwtable
 define void @store_i32_interleave4() {
 ;CHECK-LABEL: store_i32_interleave4
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction: store i32 %add16
-;CHECK: Found an estimated cost of 5 for VF 2 For instruction: store i32 %add16
-;CHECK: Found an estimated cost of 5 for VF 4 For instruction: store i32 %add16
-;CHECK: Found an estimated cost of 11 for VF 8 For instruction: store i32 %add16
-;CHECK: Found an estimated cost of 22 for VF 16 For instruction: store i32 %add16
+;CHECK: Found an estimated cost of 5 for VF 2 For interleaving group: store
+;CHECK: Found an estimated cost of 5 for VF 4 For interleaving group: store
+;CHECK: Found an estimated cost of 11 for VF 8 For interleaving group: store
+;CHECK: Found an estimated cost of 22 for VF 16 For interleaving group: store
 entry:
   br label %for.body
 
@@ -46,11 +45,10 @@
 
 define void @store_i32_interleave5() {
 ;CHECK-LABEL: store_i32_interleave5
-;CHECK: Found an estimated cost of 1 for VF 1 For instruction: store i32 %add22
-;CHECK: Found an estimated cost of 7 for VF 2 For instruction: store i32 %add22
-;CHECK: Found an estimated cost of 14 for VF 4 For instruction: store i32 %add22
-;CHECK: Found an estimated cost of 21 for VF 8 For instruction: store i32 %add22
-;CHECK: Found an estimated cost of 35 for VF 16 For instruction: store i32 %add22
+;CHECK: Found an estimated cost of 7 for VF 2 For interleaving group: store
+;CHECK: Found an estimated cost of 14 for VF 4 For interleaving group: store
+;CHECK: Found an estimated cost of 21 for VF 8 For interleaving group: store
+;CHECK: Found an estimated cost of 35 for VF 16 For interleaving group: store
 entry:
   br label %for.body
Index: ../test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll
===================================================================
--- ../test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll
+++ ../test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll
@@ -0,0 +1,41 @@
+; RUN: opt -loop-vectorize -S -mcpu=skylake-avx512 < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; This test checks that the "gather" operation is chosen since its cost is
+; lower than that of the interleaving pattern.
+;
+;unsigned long A[SIZE];
+;unsigned long B[SIZE];
+;
+;void foo() {
+;  for (int i=0; i<SIZE; i+=8) {
+;    B[i] = A[i] + 5;
+;  }
+;}
+
+@A = global [10240 x i64] zeroinitializer, align 16
+@B = global [10240 x i64] zeroinitializer, align 16
+
+; CHECK-LABEL: foo
+; CHECK: llvm.masked.gather
+define void @foo() {
+  br label %1
+
+; <label>:1:                                      ; preds = %0, %1
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds [10240 x i64], [10240 x i64]* @A, i64 0, i64 %indvars.iv
+  %3 = load i64, i64* %2, align 16
+  %4 = add i64 %3, 5
+  %5 = getelementptr inbounds [10240 x i64], [10240 x i64]* @B, i64 0, i64 %indvars.iv
+  store i64 %4, i64* %5, align 16
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 8
+  %6 = icmp slt i64 %indvars.iv.next, 1024
+  br i1 %6, label %1, label %7
+
+; <label>:7:                                      ; preds = %1
+  ret void
+}
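
Note on the selection policy: the new code in getInstructionCost picks the
cheapest of three ways to handle a memory access that cannot simply be widened
(interleave, gather/scatter, scalarize) and records the choice so that
vectorizeMemoryInstruction can replay it. The standalone C++ sketch below
restates that policy outside of LLVM; the names MemAccessCosts and
chooseWidening are illustrative, not part of the patch.

#include <cstdint>
#include <limits>

// Illustrative stand-in for the patch's InstWidening values that remain
// candidates at this point (LV_WIDEN has already been handled earlier).
enum class Widening { Interleave, GatherScatter, Scalarize };

struct MemAccessCosts {
  static constexpr uint64_t Infinite = std::numeric_limits<uint64_t>::max();
  uint64_t InterleaveGrp = 0;         // Total cost of the whole group.
  uint64_t InterleaveInst = Infinite; // Group cost / number of members.
  uint64_t GatherScatter = Infinite;  // Infinite if gather/scatter illegal.
  uint64_t Scalarization = 0;         // Always-computable fallback.
};

// Mirrors the if/else-if/else chain added above: ties go to interleaving,
// gather/scatter must beat both other options strictly, and scalarization
// is the fallback.
Widening chooseWidening(const MemAccessCosts &C, uint64_t &Cost) {
  if (C.InterleaveInst <= C.GatherScatter &&
      C.InterleaveInst <= C.Scalarization) {
    Cost = C.InterleaveGrp; // The whole group's cost is charged once, here.
    return Widening::Interleave;
  }
  if (C.GatherScatter < C.InterleaveInst &&
      C.GatherScatter < C.Scalarization) {
    Cost = C.GatherScatter;
    return Widening::GatherScatter;
  }
  Cost = C.Scalarization;
  return Widening::Scalarize;
}

Because the per-instruction interleave cost defaults to "infinite" when the
access is not in a profitable group, and the gather/scatter cost defaults to
"infinite" when the target cannot gather/scatter, the else branch
(scalarization) is always reachable as a safe default.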