Index: llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h +++ llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h @@ -467,7 +467,11 @@ SK_Reverse, ///< Reverse the order of the vector. SK_Alternate, ///< Choose alternate elements from vector. SK_InsertSubvector, ///< InsertSubvector. Index indicates start offset. - SK_ExtractSubvector ///< ExtractSubvector Index indicates start offset. + SK_ExtractSubvector,///< ExtractSubvector Index indicates start offset. + SK_PermuteTwoSrc, ///< Merge elements from two source vectors into one + ///< with any shuffle mask. + SK_PermuteSingleSrc ///< Shuffle elements of single source vector with any + ///< shuffle mask. }; /// \brief Additional information about an operand's possible values. Index: llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h =================================================================== --- llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h +++ llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h @@ -60,8 +60,9 @@ return Cost; } - /// Estimate the cost overhead of SK_Alternate shuffle. - unsigned getAltShuffleOverhead(Type *Ty) { + /// Estimate a cost of shuffle as a sequence of extract and insert + /// operations. 
+ unsigned getPermuteShuffleOverhead(Type *Ty) { assert(Ty->isVectorTy() && "Can only shuffle vectors"); unsigned Cost = 0; // Shuffle cost is equal to the cost of extracting element from its argument @@ -351,8 +352,9 @@ unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) { - if (Kind == TTI::SK_Alternate) { - return getAltShuffleOverhead(Tp); + if (Kind == TTI::SK_Alternate || Kind == TTI::SK_PermuteTwoSrc || + Kind == TTI::SK_PermuteSingleSrc) { + return getPermuteShuffleOverhead(Tp); } return 1; } Index: llvm/trunk/lib/Analysis/CostModel.cpp =================================================================== --- llvm/trunk/lib/Analysis/CostModel.cpp +++ llvm/trunk/lib/Analysis/CostModel.cpp @@ -97,6 +97,27 @@ return true; } +static bool isSingleSourceVectorMask(ArrayRef Mask) { + bool Vec0 = false; + bool Vec1 = false; + for (unsigned i = 0, NumVecElts = Mask.size(); i < NumVecElts; ++i) { + if (Mask[i] >= 0) { + if ((unsigned)Mask[i] >= NumVecElts) + Vec1 = true; + else + Vec0 = true; + } + } + return !(Vec0 && Vec1); +} + +static bool isZeroEltBroadcastVectorMask(ArrayRef Mask) { + for (unsigned i = 0; i < Mask.size(); ++i) + if (Mask[i] > 0) + return false; + return true; +} + static bool isAlternateVectorMask(ArrayRef Mask) { bool isAlternate = true; unsigned MaskSize = Mask.size(); @@ -501,6 +522,17 @@ if (isAlternateVectorMask(Mask)) return TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTypOp0, 0, nullptr); + + if (isZeroEltBroadcastVectorMask(Mask)) + return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, + VecTypOp0, 0, nullptr); + + if (isSingleSourceVectorMask(Mask)) + return TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, + VecTypOp0, 0, nullptr); + + return TTI->getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, + VecTypOp0, 0, nullptr); } return -1; Index: llvm/trunk/lib/Target/X86/X86TargetTransformInfo.h =================================================================== --- 
llvm/trunk/lib/Target/X86/X86TargetTransformInfo.h +++ llvm/trunk/lib/Target/X86/X86TargetTransformInfo.h @@ -80,6 +80,13 @@ int getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm); + int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned Factor, ArrayRef Indices, + unsigned Alignment, unsigned AddressSpace); + int getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy, + unsigned Factor, ArrayRef Indices, + unsigned Alignment, unsigned AddressSpace); + int getIntImmCost(int64_t); int getIntImmCost(const APInt &Imm, Type *Ty); Index: llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp +++ llvm/trunk/lib/Target/X86/X86TargetTransformInfo.cpp @@ -598,9 +598,6 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp) { - // We only estimate the cost of reverse and alternate shuffles. - if (Kind != TTI::SK_Reverse && Kind != TTI::SK_Alternate) - return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); if (Kind == TTI::SK_Reverse) { std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); @@ -700,9 +697,8 @@ if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second)) return LT.first * Entry->Cost; - } - if (Kind == TTI::SK_Alternate) { + } else if (Kind == TTI::SK_Alternate) { // 64-bit packed float vectors (v2f32) are widened to type v4f32. // 64-bit packed integer vectors (v2i32) are promoted to type v2i64. std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); @@ -792,7 +788,132 @@ if (const auto *Entry = CostTableLookup(SSEAltShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second)) return LT.first * Entry->Cost; - return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); + + } else if (Kind == TTI::SK_PermuteTwoSrc) { + // We assume that source and destination have the same vector type. 
+ std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); + int NumOfDests = LT.first; + int NumOfShufflesPerDest = LT.first * 2 - 1; + int NumOfShuffles = NumOfDests * NumOfShufflesPerDest; + + static const CostTblEntry AVX512VBMIShuffleTbl[] = { + {ISD::VECTOR_SHUFFLE, MVT::v64i8, 1}, // vpermt2b + {ISD::VECTOR_SHUFFLE, MVT::v32i8, 1}, // vpermt2b + {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1} // vpermt2b + }; + + if (ST->hasVBMI()) + if (const auto *Entry = CostTableLookup(AVX512VBMIShuffleTbl, + ISD::VECTOR_SHUFFLE, LT.second)) + return NumOfShuffles * Entry->Cost; + + static const CostTblEntry AVX512BWShuffleTbl[] = { + {ISD::VECTOR_SHUFFLE, MVT::v32i16, 1}, // vpermt2w + {ISD::VECTOR_SHUFFLE, MVT::v16i16, 1}, // vpermt2w + {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1}, // vpermt2w + {ISD::VECTOR_SHUFFLE, MVT::v32i8, 3}, // zext + vpermt2w + trunc + {ISD::VECTOR_SHUFFLE, MVT::v64i8, 19}, // 6 * v32i8 + 1 + {ISD::VECTOR_SHUFFLE, MVT::v16i8, 3} // zext + vpermt2w + trunc + }; + + if (ST->hasBWI()) + if (const auto *Entry = CostTableLookup(AVX512BWShuffleTbl, + ISD::VECTOR_SHUFFLE, LT.second)) + return NumOfShuffles * Entry->Cost; + + static const CostTblEntry AVX512ShuffleTbl[] = { + {ISD::VECTOR_SHUFFLE, MVT::v8f64, 1}, // vpermt2pd + {ISD::VECTOR_SHUFFLE, MVT::v16f32, 1}, // vpermt2ps + {ISD::VECTOR_SHUFFLE, MVT::v8i64, 1}, // vpermt2q + {ISD::VECTOR_SHUFFLE, MVT::v16i32, 1}, // vpermt2d + {ISD::VECTOR_SHUFFLE, MVT::v4f64, 1}, // vpermt2pd + {ISD::VECTOR_SHUFFLE, MVT::v8f32, 1}, // vpermt2ps + {ISD::VECTOR_SHUFFLE, MVT::v4i64, 1}, // vpermt2q + {ISD::VECTOR_SHUFFLE, MVT::v8i32, 1}, // vpermt2d + {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, // vpermt2pd + {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1}, // vpermt2ps + {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, // vpermt2q + {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1} // vpermt2d + }; + + if (ST->hasAVX512()) + if (const auto *Entry = + CostTableLookup(AVX512ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second)) + return NumOfShuffles * Entry->Cost; + + } else if (Kind 
== TTI::SK_PermuteSingleSrc) { + std::pair LT = TLI->getTypeLegalizationCost(DL, Tp); + if (LT.first == 1) { + + static const CostTblEntry AVX512VBMIShuffleTbl[] = { + {ISD::VECTOR_SHUFFLE, MVT::v64i8, 1}, // vpermb + {ISD::VECTOR_SHUFFLE, MVT::v32i8, 1} // vpermb + }; + + if (ST->hasVBMI()) + if (const auto *Entry = CostTableLookup(AVX512VBMIShuffleTbl, + ISD::VECTOR_SHUFFLE, LT.second)) + return Entry->Cost; + + static const CostTblEntry AVX512BWShuffleTbl[] = { + {ISD::VECTOR_SHUFFLE, MVT::v32i16, 1}, // vpermw + {ISD::VECTOR_SHUFFLE, MVT::v16i16, 1}, // vpermw + {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1}, // vpermw + {ISD::VECTOR_SHUFFLE, MVT::v64i8, 8}, // extend to v32i16 + {ISD::VECTOR_SHUFFLE, MVT::v32i8, 3} // vpermw + zext/trunc + }; + + if (ST->hasBWI()) + if (const auto *Entry = CostTableLookup(AVX512BWShuffleTbl, + ISD::VECTOR_SHUFFLE, LT.second)) + return Entry->Cost; + + static const CostTblEntry AVX512ShuffleTbl[] = { + {ISD::VECTOR_SHUFFLE, MVT::v8f64, 1}, // vpermpd + {ISD::VECTOR_SHUFFLE, MVT::v4f64, 1}, // vpermpd + {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1}, // vpermpd + {ISD::VECTOR_SHUFFLE, MVT::v16f32, 1}, // vpermps + {ISD::VECTOR_SHUFFLE, MVT::v8f32, 1}, // vpermps + {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1}, // vpermps + {ISD::VECTOR_SHUFFLE, MVT::v8i64, 1}, // vpermq + {ISD::VECTOR_SHUFFLE, MVT::v4i64, 1}, // vpermq + {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1}, // vpermq + {ISD::VECTOR_SHUFFLE, MVT::v16i32, 1}, // vpermd + {ISD::VECTOR_SHUFFLE, MVT::v8i32, 1}, // vpermd + {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1}, // vpermd + {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1} // pshufb + }; + + if (ST->hasAVX512()) + if (const auto *Entry = + CostTableLookup(AVX512ShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second)) + return Entry->Cost; + + } else { + // We are going to permute multiple sources and the result will be in + // multiple destinations. Providing an accurate cost only for splits where + // the element type remains the same. 
+ + MVT LegalVT = LT.second; + if (LegalVT.getVectorElementType().getSizeInBits() == + Tp->getVectorElementType()->getPrimitiveSizeInBits() && + LegalVT.getVectorNumElements() < Tp->getVectorNumElements()) { + + unsigned VecTySize = DL.getTypeStoreSize(Tp); + unsigned LegalVTSize = LegalVT.getStoreSize(); + // Number of source vectors after legalization: + unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize; + // Number of destination vectors after legalization: + unsigned NumOfDests = LT.first; + + Type *SingleOpTy = VectorType::get(Tp->getVectorElementType(), + LegalVT.getVectorNumElements()); + + unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests; + return NumOfShuffles * + getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 0, nullptr); + } + } } return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); @@ -1942,13 +2063,14 @@ // Vector-4 of gather/scatter instruction does not exist on KNL. // We can extend it to 8 elements, but zeroing upper bits of // the mask vector will add more instructions. Right now we give the scalar - // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction is - // better in the VariableMask case. + // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction + // is better in the VariableMask case. if (VF == 2 || (VF == 4 && !ST->hasVLX())) Scalarize = true; if (Scalarize) - return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment, AddressSpace); + return getGSScalarCost(Opcode, SrcVTy, VariableMask, Alignment, + AddressSpace); return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace); } @@ -2013,3 +2135,115 @@ // As a temporary solution, disable on Atom. return !(ST->isAtom() || ST->isSLM()); } + +// Get estimation for interleaved load/store operations and strided load. +// \p Indices contains indices for strided load. +// \p Factor - the factor of interleaving. +// AVX-512 provides 3-src shuffles that significantly reduces the cost. 
+int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy, + unsigned Factor, + ArrayRef Indices, + unsigned Alignment, + unsigned AddressSpace) { + + // VecTy for interleave memop is . + // So, for VF=4, Interleave Factor = 3, Element type = i32 we have + // VecTy = <12 x i32>. + + // Calculate the number of memory operations (NumOfMemOps), required + // for load/store the VecTy. + MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second; + unsigned VecTySize = DL.getTypeStoreSize(VecTy); + unsigned LegalVTSize = LegalVT.getStoreSize(); + unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize; + + // Get the cost of one memory operation. + Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(), + LegalVT.getVectorNumElements()); + unsigned MemOpCost = + getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace); + + if (Opcode == Instruction::Load) { + // Kind of shuffle depends on number of loaded values. + // If we load the entire data in one register, we can use a 1-src shuffle. + // Otherwise, we'll merge 2 sources in each operation. + TTI::ShuffleKind ShuffleKind = + (NumOfMemOps > 1) ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc; + + unsigned ShuffleCost = + getShuffleCost(ShuffleKind, SingleMemOpTy, 0, nullptr); + + unsigned NumOfLoadsInInterleaveGrp = + Indices.size() ? Indices.size() : Factor; + Type *ResultTy = VectorType::get(VecTy->getVectorElementType(), + VecTy->getVectorNumElements() / Factor); + unsigned NumOfResults = + getTLI()->getTypeLegalizationCost(DL, ResultTy).first * + NumOfLoadsInInterleaveGrp; + + // About a half of the loads may be folded in shuffles when we have only + // one result. If we have more than one result, we do not fold loads at all. + unsigned NumOfUnfoldedLoads = + NumOfResults > 1 ? NumOfMemOps : NumOfMemOps / 2; + + // Get a number of shuffle operations per result. 
+ unsigned NumOfShufflesPerResult = + std::max((unsigned)1, (unsigned)(NumOfMemOps - 1)); + + // The SK_PermuteTwoSrc shuffle clobbers one of src operands. + // When we have more than one destination, we need additional instructions + // to keep sources. + unsigned NumOfMoves = 0; + if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc) + NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2; + + int Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost + + NumOfUnfoldedLoads * MemOpCost + NumOfMoves; + + return Cost; + } + + // Store. + assert(Opcode == Instruction::Store && + "Expected Store Instruction at this point"); + + // There are no strided stores at the moment. And a store can't be folded in + // shuffle. + unsigned NumOfSources = Factor; // The number of values to be merged. + unsigned ShuffleCost = + getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, 0, nullptr); + unsigned NumOfShufflesPerStore = NumOfSources - 1; + + // The SK_PermuteTwoSrc shuffle clobbers one of src operands. + // We need additional instructions to keep sources. 
+ unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2; + int Cost = NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) + + NumOfMoves; + return Cost; +} + +int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned Factor, + ArrayRef Indices, + unsigned Alignment, + unsigned AddressSpace) { + auto isSupportedOnAVX512 = [](Type *VecTy, bool &RequiresBW) { + RequiresBW = false; + Type *EltTy = VecTy->getVectorElementType(); + if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) || + EltTy->isIntegerTy(32) || EltTy->isPointerTy()) + return true; + if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8)) { + RequiresBW = true; + return true; + } + return false; + }; + bool RequiresBW; + bool HasAVX512Solution = isSupportedOnAVX512(VecTy, RequiresBW); + if (ST->hasAVX512() && HasAVX512Solution && (!RequiresBW || ST->hasBWI())) + return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices, + Alignment, AddressSpace); + return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, + Alignment, AddressSpace); +} Index: llvm/trunk/test/Analysis/CostModel/X86/interleave-load-i32.ll =================================================================== --- llvm/trunk/test/Analysis/CostModel/X86/interleave-load-i32.ll +++ llvm/trunk/test/Analysis/CostModel/X86/interleave-load-i32.ll @@ -0,0 +1,85 @@ +; REQUIRES: asserts +; RUN: opt -loop-vectorize -S -mcpu=skx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@A = global [10240 x i32] zeroinitializer, align 16 +@B = global [10240 x i32] zeroinitializer, align 16 + +; Function Attrs: nounwind uwtable +define void @load_i32_interleave4() { +;CHECK-LABEL: load_i32_interleave4 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %0 = load +;CHECK: Found an estimated cost of 5 for VF 2 For instruction: %0 = load +;CHECK: Found an 
estimated cost of 5 for VF 4 For instruction: %0 = load +;CHECK: Found an estimated cost of 8 for VF 8 For instruction: %0 = load +;CHECK: Found an estimated cost of 22 for VF 16 For instruction: %0 = load +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 16 + %1 = or i64 %indvars.iv, 1 + %arrayidx2 = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %1 + %2 = load i32, i32* %arrayidx2, align 4 + %add3 = add nsw i32 %2, %0 + %3 = or i64 %indvars.iv, 2 + %arrayidx6 = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %3 + %4 = load i32, i32* %arrayidx6, align 8 + %add7 = add nsw i32 %add3, %4 + %5 = or i64 %indvars.iv, 3 + %arrayidx10 = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %5 + %6 = load i32, i32* %arrayidx10, align 4 + %add11 = add nsw i32 %add7, %6 + %arrayidx13 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %indvars.iv + store i32 %add11, i32* %arrayidx13, align 16 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 4 + %cmp = icmp slt i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +define void @load_i32_interleave5() { +;CHECK-LABEL: load_i32_interleave5 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %0 = load +;CHECK: Found an estimated cost of 6 for VF 2 For instruction: %0 = load +;CHECK: Found an estimated cost of 9 for VF 4 For instruction: %0 = load +;CHECK: Found an estimated cost of 18 for VF 8 For instruction: %0 = load +;CHECK: Found an estimated cost of 35 for VF 16 For instruction: %0 = load +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 
0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %1 = add nuw nsw i64 %indvars.iv, 1 + %arrayidx2 = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %1 + %2 = load i32, i32* %arrayidx2, align 4 + %add3 = add nsw i32 %2, %0 + %3 = add nuw nsw i64 %indvars.iv, 2 + %arrayidx6 = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %3 + %4 = load i32, i32* %arrayidx6, align 4 + %add7 = add nsw i32 %add3, %4 + %5 = add nuw nsw i64 %indvars.iv, 3 + %arrayidx10 = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %5 + %6 = load i32, i32* %arrayidx10, align 4 + %add11 = add nsw i32 %add7, %6 + %7 = add nuw nsw i64 %indvars.iv, 4 + %arrayidx14 = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %7 + %8 = load i32, i32* %arrayidx14, align 4 + %add15 = add nsw i32 %add11, %8 + %arrayidx17 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %indvars.iv + store i32 %add15, i32* %arrayidx17, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 5 + %cmp = icmp slt i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} Index: llvm/trunk/test/Analysis/CostModel/X86/interleave-store-i32.ll =================================================================== --- llvm/trunk/test/Analysis/CostModel/X86/interleave-store-i32.ll +++ llvm/trunk/test/Analysis/CostModel/X86/interleave-store-i32.ll @@ -0,0 +1,85 @@ +; REQUIRES: asserts +; RUN: opt -loop-vectorize -S -mcpu=skx --debug-only=loop-vectorize < %s 2>&1| FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@A = global [10240 x i32] zeroinitializer, align 16 +@B = global [10240 x i32] zeroinitializer, align 16 + +; Function Attrs: nounwind uwtable +define void @store_i32_interleave4() { +;CHECK-LABEL: 
store_i32_interleave4 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: store i32 %add16 +;CHECK: Found an estimated cost of 5 for VF 2 For instruction: store i32 %add16 +;CHECK: Found an estimated cost of 5 for VF 4 For instruction: store i32 %add16 +;CHECK: Found an estimated cost of 11 for VF 8 For instruction: store i32 %add16 +;CHECK: Found an estimated cost of 22 for VF 16 For instruction: store i32 %add16 +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 16 + %arrayidx2 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %indvars.iv + store i32 %0, i32* %arrayidx2, align 16 + %add = add nsw i32 %0, 1 + %1 = or i64 %indvars.iv, 1 + %arrayidx7 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %1 + store i32 %add, i32* %arrayidx7, align 4 + %add10 = add nsw i32 %0, 2 + %2 = or i64 %indvars.iv, 2 + %arrayidx13 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %2 + store i32 %add10, i32* %arrayidx13, align 8 + %add16 = add nsw i32 %0, 3 + %3 = or i64 %indvars.iv, 3 + %arrayidx19 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %3 + store i32 %add16, i32* %arrayidx19, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 4 + %cmp = icmp slt i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} + +define void @store_i32_interleave5() { +;CHECK-LABEL: store_i32_interleave5 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: store i32 %add22 +;CHECK: Found an estimated cost of 7 for VF 2 For instruction: store i32 %add22 +;CHECK: Found an estimated cost of 14 for VF 4 For instruction: store i32 %add22 +;CHECK: Found an estimated cost of 21 for VF 8 For 
instruction: store i32 %add22 +;CHECK: Found an estimated cost of 35 for VF 16 For instruction: store i32 %add22 +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %indvars.iv + store i32 %0, i32* %arrayidx2, align 4 + %add = add nsw i32 %0, 1 + %1 = add nuw nsw i64 %indvars.iv, 1 + %arrayidx7 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %1 + store i32 %add, i32* %arrayidx7, align 4 + %add10 = add nsw i32 %0, 2 + %2 = add nuw nsw i64 %indvars.iv, 2 + %arrayidx13 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %2 + store i32 %add10, i32* %arrayidx13, align 4 + %add16 = add nsw i32 %0, 3 + %3 = add nuw nsw i64 %indvars.iv, 3 + %arrayidx19 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %3 + store i32 %add16, i32* %arrayidx19, align 4 + %add22 = add nsw i32 %0, 4 + %4 = add nuw nsw i64 %indvars.iv, 4 + %arrayidx25 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %4 + store i32 %add22, i32* %arrayidx25, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 5 + %cmp = icmp slt i64 %indvars.iv.next, 1024 + br i1 %cmp, label %for.body, label %for.cond.cleanup +} Index: llvm/trunk/test/Analysis/CostModel/X86/shuffle-broadcast.ll =================================================================== --- llvm/trunk/test/Analysis/CostModel/X86/shuffle-broadcast.ll +++ llvm/trunk/test/Analysis/CostModel/X86/shuffle-broadcast.ll @@ -12,19 +12,19 @@ ; CHECK-LABEL: 'test_vXf64' define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double> %src512) { - ; SSE: Unknown cost {{.*}} %V128 = shufflevector - ; AVX: Unknown 
cost {{.*}} %V128 = shufflevector - ; AVX512: Unknown cost {{.*}} %V128 = shufflevector + ; SSE: cost of 1 {{.*}} %V128 = shufflevector + ; AVX: cost of 1 {{.*}} %V128 = shufflevector + ; AVX512: cost of 1 {{.*}} %V128 = shufflevector %V128 = shufflevector <2 x double> %src128, <2 x double> undef, <2 x i32> zeroinitializer - ; SSE: Unknown cost {{.*}} %V256 = shufflevector - ; AVX: Unknown cost {{.*}} %V256 = shufflevector - ; AVX512: Unknown cost {{.*}} %V256 = shufflevector + ; SSE: cost of 1 {{.*}} %V256 = shufflevector + ; AVX: cost of 1 {{.*}} %V256 = shufflevector + ; AVX512: cost of 1 {{.*}} %V256 = shufflevector %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> zeroinitializer - ; SSE: Unknown cost {{.*}} %V512 = shufflevector - ; AVX: Unknown cost {{.*}} %V512 = shufflevector - ; AVX512: Unknown cost {{.*}} %V512 = shufflevector + ; SSE: cost of 1 {{.*}} %V512 = shufflevector + ; AVX: cost of 1 {{.*}} %V512 = shufflevector + ; AVX512: cost of 1 {{.*}} %V512 = shufflevector %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> zeroinitializer ret void Index: llvm/trunk/test/Analysis/CostModel/X86/shuffle-single-src.ll =================================================================== --- llvm/trunk/test/Analysis/CostModel/X86/shuffle-single-src.ll +++ llvm/trunk/test/Analysis/CostModel/X86/shuffle-single-src.ll @@ -0,0 +1,94 @@ +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake-avx512 | FileCheck %s --check-prefix=SKX + +; +; Verify the cost model for 1 src shuffles +; + +; SKX-LABEL: 'test_vXf64' +define void @test_vXf64(<4 x double> %src256, <8 x double> %src512, <16 x double> %src1024) { + ; SKX: cost of 1 {{.*}} %V256 = shufflevector + %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> + + ; SKX: cost of 1 {{.*}} %V512 = shufflevector + %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> + + ; SKX: cost of 2 {{.*}} %V1024 = 
shufflevector + %V1024 = shufflevector <16 x double> %src1024, <16 x double> undef, <16 x i32> + + ret void +} + +; SKX-LABEL: 'test_vXi64' +define void @test_vXi64(<4 x i64> %src256, <8 x i64> %src512) { + + ; SKX: cost of 1 {{.*}} %V256 = shufflevector + %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> + + ; SKX: cost of 1 {{.*}} %V512 = shufflevector + %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> + + ret void +} + +; CHECK-LABEL: 'test_vXf32' +define void @test_vXf32(<4 x float> %src128, <8 x float> %src256, <16 x float> %src512) { + + ; SKX: cost of 1 {{.*}} %V128 = shufflevector + %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> + + ; SKX: cost of 1 {{.*}} %V256 = shufflevector + %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> + + ; SKX: cost of 1 {{.*}} %V512 = shufflevector + %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> + + ret void +} + +; CHECK-LABEL: 'test_vXi32' +define void @test_vXi32(<4 x i32> %src128, <8 x i32> %src256, <16 x i32> %src512, <32 x i32> %src1024) { + + ; SKX: cost of 1 {{.*}} %V128 = shufflevector + %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> + + ; SKX: cost of 1 {{.*}} %V256 = shufflevector + %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> + + ; SKX: cost of 1 {{.*}} %V512 = shufflevector + %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> + + ; SKX: cost of 2 {{.*}} %V1024 = shufflevector + %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> undef, <32 x i32> + ret void +} + +; CHECK-LABEL: 'test_vXi16' +define void @test_vXi16(<8 x i16> %src128, <16 x i16> %src256, <32 x i16> %src512, <64 x i16> %src1024) { + + ; SKX: cost of 1 {{.*}} %V128 = shufflevector + %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> + + ; SKX: cost of 1 {{.*}} %V256 = shufflevector + %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> + + 
; SKX: cost of 1 {{.*}} %V512 = shufflevector + %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> + + ; SKX: cost of 2 {{.*}} %V1024 = shufflevector + %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> undef, <64 x i32> + ret void +} + +; CHECK-LABEL: 'test_vXi8' +define void @test_vXi8(<16 x i8> %src128, <32 x i8> %src256, <64 x i8> %src512) { + ; SKX: cost of 1 {{.*}} %V128 = shufflevector + %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> + + ; SKX: cost of 3 {{.*}} %V256 = shufflevector + %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> + + ; SKX: cost of 8 {{.*}} %V512 = shufflevector + %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> + + ret void +} Index: llvm/trunk/test/Analysis/CostModel/X86/shuffle-two-src.ll =================================================================== --- llvm/trunk/test/Analysis/CostModel/X86/shuffle-two-src.ll +++ llvm/trunk/test/Analysis/CostModel/X86/shuffle-two-src.ll @@ -0,0 +1,68 @@ +; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake-avx512 | FileCheck %s --check-prefix=SKX + +; +; Verify the cost model for 2 src shuffles +; + +; SKX-LABEL: 'test_vXf64' +define void @test_vXf64(<4 x double> %src256, <8 x double> %src512, <16 x double> %src1024, <4 x double> %src256_1, <8 x double> %src512_1, <16 x double> %src1024_1) { + ; SKX: cost of 1 {{.*}} %V256 = shufflevector + %V256 = shufflevector <4 x double> %src256, <4 x double> %src256_1, <4 x i32> + + ; SKX: cost of 1 {{.*}} %V512 = shufflevector + %V512 = shufflevector <8 x double> %src512, <8 x double> %src512_1, <8 x i32> + + ; SKX: cost of 6 {{.*}} %V1024 = shufflevector + %V1024 = shufflevector <16 x double> %src1024, <16 x double> %src1024_1, <16 x i32> + + ret void +} + +; CHECK-LABEL: 'test_vXf32' +define void @test_vXf32(<4 x float> %src128, <8 x float> %src256, <16 x float> %src512, <32 x float> %src1024, <4 x float> %src128_1, <8 x float> %src256_1, 
<16 x float> %src512_1, <32 x float> %src1024_1) { + + ; SKX: cost of 1 {{.*}} %V128 = shufflevector + %V128 = shufflevector <4 x float> %src128, <4 x float> %src128_1, <4 x i32> + + ; SKX: cost of 1 {{.*}} %V256 = shufflevector + %V256 = shufflevector <8 x float> %src256, <8 x float> %src256_1, <8 x i32> + + ; SKX: cost of 1 {{.*}} %V512 = shufflevector + %V512 = shufflevector <16 x float> %src512, <16 x float> %src512_1, <16 x i32> + + ; SKX: cost of 6 {{.*}} %V1024 = shufflevector + %V1024 = shufflevector <32 x float> %src1024, <32 x float> %src1024_1, <32 x i32> + + ret void +} + +; CHECK-LABEL: 'test_vXi16' +define void @test_vXi16(<8 x i16> %src128, <16 x i16> %src256, <32 x i16> %src512, <64 x i16> %src1024, <8 x i16> %src128_1, <16 x i16> %src256_1, <32 x i16> %src512_1, <64 x i16> %src1024_1) { + + ; SKX: cost of 1 {{.*}} %V128 = shufflevector + %V128 = shufflevector <8 x i16> %src128, <8 x i16> %src128_1, <8 x i32> + + ; SKX: cost of 1 {{.*}} %V256 = shufflevector + %V256 = shufflevector <16 x i16> %src256, <16 x i16> %src256_1, <16 x i32> + + ; SKX: cost of 1 {{.*}} %V512 = shufflevector + %V512 = shufflevector <32 x i16> %src512, <32 x i16> %src512_1, <32 x i32> + + ; SKX: cost of 6 {{.*}} %V1024 = shufflevector + %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> %src1024_1, <64 x i32> + ret void +} + +; CHECK-LABEL: 'test_vXi8' +define void @test_vXi8(<16 x i8> %src128, <32 x i8> %src256, <64 x i8> %src512, <16 x i8> %src128_1, <32 x i8> %src256_1, <64 x i8> %src512_1) { + ; SKX: cost of 3 {{.*}} %V128 = shufflevector + %V128 = shufflevector <16 x i8> %src128, <16 x i8> %src128_1, <16 x i32> + + ; SKX: cost of 3 {{.*}} %V256 = shufflevector + %V256 = shufflevector <32 x i8> %src256, <32 x i8> %src256_1, <32 x i32> + + ; SKX: cost of 19 {{.*}} %V512 = shufflevector + %V512 = shufflevector <64 x i8> %src512, <64 x i8> %src512_1, <64 x i32> + + ret void +} Index: llvm/trunk/test/Analysis/CostModel/X86/strided-load-i16.ll 
=================================================================== --- llvm/trunk/test/Analysis/CostModel/X86/strided-load-i16.ll +++ llvm/trunk/test/Analysis/CostModel/X86/strided-load-i16.ll @@ -0,0 +1,113 @@ +; REQUIRES: asserts +; RUN: opt -loop-vectorize -S -mcpu=skx --debug-only=loop-vectorize < %s 2>&1| FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@A = global [10240 x i16] zeroinitializer, align 16 +@B = global [10240 x i16] zeroinitializer, align 16 + +; Function Attrs: nounwind uwtable +define void @load_i16_stride2() { +;CHECK-LABEL: load_i16_stride2 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 8 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 16 For instruction: %1 = load +;CHECK: Found an estimated cost of 2 for VF 32 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = shl nsw i64 %indvars.iv, 1 + %arrayidx = getelementptr inbounds [10240 x i16], [10240 x i16]* @A, i64 0, i64 %0 + %1 = load i16, i16* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds [10240 x i16], [10240 x i16]* @B, i64 0, i64 %indvars.iv + store i16 %1, i16* %arrayidx2, align 2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +define void @load_i16_stride3() { +;CHECK-LABEL: load_i16_stride3 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 4 For 
instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 8 For instruction: %1 = load +;CHECK: Found an estimated cost of 2 for VF 16 For instruction: %1 = load +;CHECK: Found an estimated cost of 3 for VF 32 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = mul nsw i64 %indvars.iv, 3 + %arrayidx = getelementptr inbounds [10240 x i16], [10240 x i16]* @A, i64 0, i64 %0 + %1 = load i16, i16* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds [10240 x i16], [10240 x i16]* @B, i64 0, i64 %indvars.iv + store i16 %1, i16* %arrayidx2, align 2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +define void @load_i16_stride4() { +;CHECK-LABEL: load_i16_stride4 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 8 For instruction: %1 = load +;CHECK: Found an estimated cost of 2 for VF 16 For instruction: %1 = load +;CHECK: Found an estimated cost of 5 for VF 32 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = shl nsw i64 %indvars.iv, 2 + %arrayidx = getelementptr inbounds [10240 x i16], [10240 x i16]* @A, i64 0, i64 %0 + %1 = load i16, i16* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds [10240 x i16], [10240 x i16]* @B, i64 0, i64 %indvars.iv + store i16 %1, i16* %arrayidx2, align 2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; 
preds = %for.body + ret void +} + +define void @load_i16_stride5() { +;CHECK-LABEL: load_i16_stride5 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 2 for VF 8 For instruction: %1 = load +;CHECK: Found an estimated cost of 3 for VF 16 For instruction: %1 = load +;CHECK: Found an estimated cost of 6 for VF 32 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = mul nsw i64 %indvars.iv, 5 + %arrayidx = getelementptr inbounds [10240 x i16], [10240 x i16]* @A, i64 0, i64 %0 + %1 = load i16, i16* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds [10240 x i16], [10240 x i16]* @B, i64 0, i64 %indvars.iv + store i16 %1, i16* %arrayidx2, align 2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} Index: llvm/trunk/test/Analysis/CostModel/X86/strided-load-i32.ll =================================================================== --- llvm/trunk/test/Analysis/CostModel/X86/strided-load-i32.ll +++ llvm/trunk/test/Analysis/CostModel/X86/strided-load-i32.ll @@ -0,0 +1,110 @@ +; REQUIRES: asserts +; RUN: opt -loop-vectorize -S -mcpu=skx --debug-only=loop-vectorize < %s 2>&1| FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@A = global [10240 x i32] zeroinitializer, align 16 +@B = global [10240 x i32] zeroinitializer, align 16 + +; Function Attrs: nounwind uwtable +define void @load_int_stride2() { +;CHECK-LABEL: load_int_stride2 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated 
cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 8 For instruction: %1 = load +;CHECK: Found an estimated cost of 2 for VF 16 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = shl nsw i64 %indvars.iv, 1 + %arrayidx = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %0 + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %indvars.iv + store i32 %1, i32* %arrayidx2, align 2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +define void @load_int_stride3() { +;CHECK-LABEL: load_int_stride3 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 2 for VF 8 For instruction: %1 = load +;CHECK: Found an estimated cost of 3 for VF 16 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = mul nsw i64 %indvars.iv, 3 + %arrayidx = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %0 + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %indvars.iv + store i32 %1, i32* %arrayidx2, align 2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +define void 
@load_int_stride4() { +;CHECK-LABEL: load_int_stride4 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 2 for VF 8 For instruction: %1 = load +;CHECK: Found an estimated cost of 5 for VF 16 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = shl nsw i64 %indvars.iv, 2 + %arrayidx = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %0 + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %indvars.iv + store i32 %1, i32* %arrayidx2, align 2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +define void @load_int_stride5() { +;CHECK-LABEL: load_int_stride5 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 2 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 3 for VF 8 For instruction: %1 = load +;CHECK: Found an estimated cost of 6 for VF 16 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = mul nsw i64 %indvars.iv, 5 + %arrayidx = getelementptr inbounds [10240 x i32], [10240 x i32]* @A, i64 0, i64 %0 + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds [10240 x i32], [10240 x i32]* @B, i64 0, i64 %indvars.iv + store i32 %1, i32* %arrayidx2, align 2 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + 
%exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + Index: llvm/trunk/test/Analysis/CostModel/X86/strided-load-i64.ll =================================================================== --- llvm/trunk/test/Analysis/CostModel/X86/strided-load-i64.ll +++ llvm/trunk/test/Analysis/CostModel/X86/strided-load-i64.ll @@ -0,0 +1,81 @@ +; REQUIRES: asserts +; RUN: opt -loop-vectorize -S -mcpu=skx --debug-only=loop-vectorize < %s 2>&1| FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@A = global [10240 x i64] zeroinitializer, align 16 +@B = global [10240 x i64] zeroinitializer, align 16 + +; Function Attrs: nounwind uwtable +define void @load_i64_stride2() { +;CHECK-LABEL: load_i64_stride2 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 2 for VF 8 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = shl nsw i64 %indvars.iv, 1 + %arrayidx = getelementptr inbounds [10240 x i64], [10240 x i64]* @A, i64 0, i64 %0 + %1 = load i64, i64* %arrayidx, align 16 + %arrayidx2 = getelementptr inbounds [10240 x i64], [10240 x i64]* @B, i64 0, i64 %indvars.iv + store i64 %1, i64* %arrayidx2, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +define void @load_i64_stride3() { +;CHECK-LABEL: load_i64_stride3 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 
= load +;CHECK: Found an estimated cost of 2 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 3 for VF 8 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = mul nsw i64 %indvars.iv, 3 + %arrayidx = getelementptr inbounds [10240 x i64], [10240 x i64]* @A, i64 0, i64 %0 + %1 = load i64, i64* %arrayidx, align 16 + %arrayidx2 = getelementptr inbounds [10240 x i64], [10240 x i64]* @B, i64 0, i64 %indvars.iv + store i64 %1, i64* %arrayidx2, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +define void @load_i64_stride4() { +;CHECK-LABEL: load_i64_stride4 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 2 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 5 for VF 8 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = mul nsw i64 %indvars.iv, 4 + %arrayidx = getelementptr inbounds [10240 x i64], [10240 x i64]* @A, i64 0, i64 %0 + %1 = load i64, i64* %arrayidx, align 16 + %arrayidx2 = getelementptr inbounds [10240 x i64], [10240 x i64]* @B, i64 0, i64 %indvars.iv + store i64 %1, i64* %arrayidx2, align 8 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} Index: llvm/trunk/test/Analysis/CostModel/X86/strided-load-i8.ll =================================================================== --- llvm/trunk/test/Analysis/CostModel/X86/strided-load-i8.ll +++ 
llvm/trunk/test/Analysis/CostModel/X86/strided-load-i8.ll @@ -0,0 +1,117 @@ +; REQUIRES: asserts +; RUN: opt -loop-vectorize -S -mcpu=skx --debug-only=loop-vectorize < %s 2>&1| FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@A = global [10240 x i8] zeroinitializer, align 16 +@B = global [10240 x i8] zeroinitializer, align 16 + +; Function Attrs: nounwind uwtable +define void @load_i8_stride2() { +;CHECK-LABEL: load_i8_stride2 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 8 For instruction: %1 = load +;CHECK: Found an estimated cost of 3 for VF 16 For instruction: %1 = load +;CHECK: Found an estimated cost of 8 for VF 32 For instruction: %1 = load +;CHECK: Found an estimated cost of 20 for VF 64 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = shl nsw i64 %indvars.iv, 1 + %arrayidx = getelementptr inbounds [10240 x i8], [10240 x i8]* @A, i64 0, i64 %0 + %1 = load i8, i8* %arrayidx, align 2 + %arrayidx2 = getelementptr inbounds [10240 x i8], [10240 x i8]* @B, i64 0, i64 %indvars.iv + store i8 %1, i8* %arrayidx2, align 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +define void @load_i8_stride3() { +;CHECK-LABEL: load_i8_stride3 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 3 for VF 8 For 
instruction: %1 = load +;CHECK: Found an estimated cost of 8 for VF 16 For instruction: %1 = load +;CHECK: Found an estimated cost of 20 for VF 32 For instruction: %1 = load +;CHECK: Found an estimated cost of 39 for VF 64 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = mul nsw i64 %indvars.iv, 3 + %arrayidx = getelementptr inbounds [10240 x i8], [10240 x i8]* @A, i64 0, i64 %0 + %1 = load i8, i8* %arrayidx, align 2 + %arrayidx2 = getelementptr inbounds [10240 x i8], [10240 x i8]* @B, i64 0, i64 %indvars.iv + store i8 %1, i8* %arrayidx2, align 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +define void @load_i8_stride4() { +;CHECK-LABEL: load_i8_stride4 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 3 for VF 8 For instruction: %1 = load +;CHECK: Found an estimated cost of 8 for VF 16 For instruction: %1 = load +;CHECK: Found an estimated cost of 20 for VF 32 For instruction: %1 = load +;CHECK: Found an estimated cost of 59 for VF 64 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = shl nsw i64 %indvars.iv, 2 + %arrayidx = getelementptr inbounds [10240 x i8], [10240 x i8]* @A, i64 0, i64 %0 + %1 = load i8, i8* %arrayidx, align 2 + %arrayidx2 = getelementptr inbounds [10240 x i8], [10240 x i8]* @B, i64 0, i64 %indvars.iv + store i8 %1, i8* %arrayidx2, align 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + 
br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +define void @load_i8_stride5() { +;CHECK-LABEL: load_i8_stride5 +;CHECK: Found an estimated cost of 1 for VF 1 For instruction: %1 = load +;CHECK: Found an estimated cost of 1 for VF 2 For instruction: %1 = load +;CHECK: Found an estimated cost of 3 for VF 4 For instruction: %1 = load +;CHECK: Found an estimated cost of 8 for VF 8 For instruction: %1 = load +;CHECK: Found an estimated cost of 20 for VF 16 For instruction: %1 = load +;CHECK: Found an estimated cost of 39 for VF 32 For instruction: %1 = load +;CHECK: Found an estimated cost of 78 for VF 64 For instruction: %1 = load +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = mul nsw i64 %indvars.iv, 5 + %arrayidx = getelementptr inbounds [10240 x i8], [10240 x i8]* @A, i64 0, i64 %0 + %1 = load i8, i8* %arrayidx, align 2 + %arrayidx2 = getelementptr inbounds [10240 x i8], [10240 x i8]* @B, i64 0, i64 %indvars.iv + store i8 %1, i8* %arrayidx2, align 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +}