Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2549,6 +2549,45 @@
   return !(ST->isAtom());
 }
 
+// optimizeInterleavedCost - Returns true if the X86InterleavedAccess pass can
+// lower this interleaved i8 access, and false otherwise. On success 'Cost' is
+// set to the sum of the per-instruction-kind counts computed below; on failure
+// 'Cost' is left untouched. 'VecTy' is the wide vector type of the whole group
+// and 'Factor' is the stride (number of interleaved members).
+static bool optimizeInterleavedCost(unsigned Opcode, Type *VecTy,
+                                    unsigned Factor, int &Cost) {
+  if (VecTy->getScalarSizeInBits() != 8)
+    return false;
+  // Number of elements in each de-interleaved member of the group.
+  unsigned VF = VecTy->getVectorNumElements() / Factor;
+  // Supported member types are v8i8, v16i8, v32i8 and v64i8. Testing VF
+  // directly avoids building a heap-allocated lookup table on every query.
+  if (VF != 8 && VF != 16 && VF != 32 && VF != 64)
+    return false;
+  if (Opcode == Instruction::Store && Factor == 4) {
+    unsigned Vinsert = VF > 8 ? 3 * (VF / 16) : 1;
+    unsigned Vmov = VF > 8 ? VF / 16 : 1;
+    unsigned Extra = VF == 8 ? 1 : 0;
+    unsigned Vpunpck = 8;
+    unsigned Vextract = VF == 64 ? 4 : 0;
+    Cost = Vmov + Vpunpck + Vinsert + Vextract + Extra;
+    return true;
+  }
+  // Stride 3 is not restricted by Opcode; v8i8 is blocked by the pass here.
+  if (Factor == 3 && VF != 8) {
+    unsigned Vmov = VF == 64 ? 6 : 3 + 1;
+    unsigned Vpshufb = 3;
+    unsigned Vpalignr = 6;
+    // 0/3/9 inserts for VF 16/32/64, as exact integers rather than std::pow,
+    // whose double result could truncate to the wrong count.
+    unsigned Vinsert = VF == 16 ? 0 : (VF == 32 ? 3 : 9);
+    unsigned Vextract = VF == 64 ? 4 : 0;
+    Cost = Vmov + Vpshufb + Vinsert + Vpalignr + Vextract;
+    return true;
+  }
+  return false;
+}
+
 // Get estimation for interleaved load/store operations for AVX2.
 // \p Factor is the interleaved-access factor (stride) - number of
 // (interleaved) elements in the group.
@@ -2566,6 +2605,9 @@
                                                ArrayRef<unsigned> Indices,
                                                unsigned Alignment,
                                                unsigned AddressSpace) {
+  int Cost;
+  if (optimizeInterleavedCost(Opcode, VecTy, Factor, Cost))
+    return Cost;
 
   // We currently Support only fully-interleaved groups, with no gaps.
   // TODO: Support also strided loads (interleaved-groups with gaps).
@@ -2683,7 +2725,9 @@
                                         LegalVT.getVectorNumElements());
   unsigned MemOpCost =
       getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);
-
+  int Cost;
+  if (optimizeInterleavedCost(Opcode, VecTy, Factor, Cost))
+    return Cost;
   if (Opcode == Instruction::Load) {
     // Kind of shuffle depends on number of loaded values.
     // If we load the entire data in one register, we can use a 1-src shuffle.
@@ -2718,7 +2762,7 @@
     if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
       NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
 
-    int Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
+    Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
                NumOfUnfoldedLoads * MemOpCost + NumOfMoves;
 
     return Cost;
@@ -2732,13 +2776,13 @@
   // shuffle.
   unsigned NumOfSources = Factor; // The number of values to be merged.
   unsigned ShuffleCost =
-      getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, 0, nullptr);
+        getShuffleCost(TTI::SK_PermuteTwoSrc, SingleMemOpTy, 0, nullptr);
   unsigned NumOfShufflesPerStore = NumOfSources - 1;
 
   // The SK_MergeTwoSrc shuffle clobbers one of src operands.
   // We need additional instructions to keep sources.
   unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
-  int Cost = NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
+  Cost = NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
              NumOfMoves;
   return Cost;
 }
Index: test/Analysis/CostModel/X86/interleaved-load-i8.ll
===================================================================
--- test/Analysis/CostModel/X86/interleaved-load-i8.ll
+++ test/Analysis/CostModel/X86/interleaved-load-i8.ll
@@ -10,8 +10,8 @@
 ;CHECK: LV: Found an estimated cost of 11 for VF 2 For instruction:   %0 = load i8
 ;CHECK: LV: Found an estimated cost of 5 for VF 4 For instruction:   %0 = load i8
 ;CHECK: LV: Found an estimated cost of 10 for VF 8 For instruction:   %0 = load i8
-;CHECK: LV: Found an estimated cost of 20 for VF 16 For instruction:   %0 = load i8
-;CHECK: LV: Found an estimated cost of 45 for VF 32 For instruction:   %0 = load i8
+;CHECK: LV: Found an estimated cost of 13 for VF 16 For instruction:   %0 = load i8
+;CHECK: LV: Found an estimated cost of 16 for VF 32 For instruction:   %0 = load i8
 entry:
   %cmp13 = icmp sgt i32 %Nels, 0
   br i1 %cmp13, label %for.body.preheader, label %for.end
Index: test/Analysis/CostModel/X86/interleaved-store-i8.ll
===================================================================
--- test/Analysis/CostModel/X86/interleaved-store-i8.ll
+++ test/Analysis/CostModel/X86/interleaved-store-i8.ll
@@ -10,8 +10,8 @@
 ;CHECK: LV: Found an estimated cost of 8 for VF 2 For instruction:   store i8 %conv4
 ;CHECK: LV: Found an estimated cost of 9 for VF 4 For instruction:   store i8 %conv4
 ;CHECK: LV: Found an estimated cost of 12 for VF 8 For instruction:   store i8 %conv4
-;CHECK: LV: Found an estimated cost of 19 for VF 16 For instruction:   store i8 %conv4
-;CHECK: LV: Found an estimated cost of 35 for VF 32 For instruction:   store i8 %conv4
+;CHECK: LV: Found an estimated cost of 13 for VF 16 For instruction:   store i8 %conv4
+;CHECK: LV: Found an estimated cost of 16 for VF 32 For instruction:   store i8 %conv4
 entry:
   %cmp14 = icmp sgt i32 %Nels, 0
   br i1 %cmp14, label %for.body.lr.ph, label %for.end
@@ -47,9 +47,9 @@
 ;CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i8 %conv7
 ;CHECK: LV: Found an estimated cost of 13 for VF 2 For instruction:   store i8 %conv7
 ;CHECK: LV: Found an estimated cost of 10 for VF 4 For instruction:   store i8 %conv7
-;CHECK: LV: Found an estimated cost of 17 for VF 8 For instruction:   store i8 %conv7
-;CHECK: LV: Found an estimated cost of 22 for VF 16 For instruction:   store i8 %conv7
-;CHECK: LV: Found an estimated cost of 44 for VF 32 For instruction:   store i8 %conv7
+;CHECK: LV: Found an estimated cost of 11 for VF 8 For instruction:   store i8 %conv7
+;CHECK: LV: Found an estimated cost of 12 for VF 16 For instruction:   store i8 %conv7
+;CHECK: LV: Found an estimated cost of 16 for VF 32 For instruction:   store i8 %conv7
 entry:
   %cmp19 = icmp sgt i32 %Nels, 0
   br i1 %cmp19, label %for.body.lr.ph, label %for.end
Index: test/Analysis/CostModel/X86/strided-load-i8.ll
===================================================================
--- test/Analysis/CostModel/X86/strided-load-i8.ll
+++ test/Analysis/CostModel/X86/strided-load-i8.ll
@@ -41,9 +41,9 @@
 ;CHECK: Found an estimated cost of 1 for VF 2 For instruction:   %1 = load
 ;CHECK: Found an estimated cost of 1 for VF 4 For instruction:   %1 = load
 ;CHECK: Found an estimated cost of 3 for VF 8 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 8 for VF 16 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 20 for VF 32 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 39 for VF 64 For instruction:   %1 = load
+;CHECK: Found an estimated cost of 13 for VF 16 For instruction:   %1 = load
+;CHECK: Found an estimated cost of 16 for VF 32 For instruction:   %1 = load
+;CHECK: Found an estimated cost of 28 for VF 64 For instruction:   %1 = load
 entry:
   br label %for.body