Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2549,6 +2549,62 @@
   return !(ST->isAtom());
 }
 
+// X86InterleavedPassCost - Returns true when the X86InterleavedAccess pass
+// supports the given interleaved access group and false otherwise. On
+// success, \p Cost is set to the cost of the optimized load/store+shuffle
+// sequence that the pass will generate for the group; on failure \p Cost is
+// left unmodified. \p VecTy is the wide vector type of the whole group and
+// \p Factor is the stride (interleave factor) of the group.
+static bool X86InterleavedPassCost(unsigned Opcode, Type *VecTy,
+                                   unsigned Factor, int &Cost) {
+  // Currently the X86InterleavedAccess pass supports only char accesses.
+  if (VecTy->getScalarSizeInBits() != 8)
+    return false;
+
+  // Per-member vector types handled by the pass (constant table, no heap).
+  static const MVT SupportType[] = {MVT::v8i8, MVT::v16i8, MVT::v32i8,
+                                    MVT::v64i8};
+  const unsigned VF = VecTy->getVectorNumElements() / Factor;
+  const MVT VT = MVT::getVectorVT(MVT::i8, VF);
+  if (std::find(std::begin(SupportType), std::end(SupportType), VT) ==
+      std::end(SupportType))
+    return false;
+
+  // Only stride-4 stores and stride-3 accesses are modeled here; the type
+  // 'v8i8' is blocked by the pass for stride 3.
+  const bool IsStride4Store = Opcode == Instruction::Store && Factor == 4;
+  const bool IsStride3 = Factor == 3 && VT != MVT::v8i8;
+  if (!IsStride4Store && !IsStride3)
+    return false;
+
+  // Base - contains the number of instructions for a single lane.
+  const unsigned Base = Factor == 4 ? 8 : 9;
+  const unsigned Lanes = std::max(int(VT.getSizeInBits() / 128), 1);
+
+  // The number of moves is equal to the 'Factor' (for load or store) + the
+  // number of moves for rearranging the data inside the lanes. For
+  // Factor == 3 there is an extra move for the shuffle instruction.
+  unsigned Vmov = std::max(int(std::log2(Lanes)), 1) * Factor;
+  Vmov += Factor == 4 ? 0 : 1;
+
+  // 'NumberOfPairs' is the number of instructions needed to reconstruct the
+  // chosen registers: the number of pairs merged at each level from the basic
+  // lane size (128) up to the register size, summed over all 'Factor'
+  // registers. For example, one 512-bit register contains 4 lanes of 128
+  // bits: two pairs of 128 plus one pair of 256, a total of 3 pairs.
+  unsigned NumberOfPairs = 0;
+  for (unsigned TotalLanes = Lanes * Factor; TotalLanes > Factor;) {
+    TotalLanes /= 2;
+    NumberOfPairs += TotalLanes;
+  }
+
+  // Adjustments for the VF-64 stride-3 store and the VF-8 stride-4 store.
+  Cost = Vmov + Base + NumberOfPairs;
+  Cost += (VF == 64 && Factor != 4 && Opcode == Instruction::Store) ? 4 : 0;
+  Cost -= (VF == 8 && Factor == 4) ? 1 : 0;
+  return true;
+}
+
 // Get estimation for interleaved load/store operations for AVX2.
 // \p Factor is the interleaved-access factor (stride) - number of
 // (interleaved) elements in the group.
@@ -2566,6 +2622,7 @@
                                                ArrayRef<unsigned> Indices,
                                                unsigned Alignment,
                                                unsigned AddressSpace) {
+  int Cost;
 
   // We currently Support only fully-interleaved groups, with no gaps.
   // TODO: Support also strided loads (interleaved-groups with gaps).
@@ -2573,6 +2630,9 @@
     return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
                                              Alignment, AddressSpace);
 
+  if (X86InterleavedPassCost(Opcode, VecTy, Factor, Cost))
+    return Cost;
+
   // VecTy for interleave memop is <VF*Factor x Elt>.
   // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
   // VecTy = <12 x i32>.
@@ -2671,6 +2731,10 @@
   // So, for VF=4, Interleave Factor = 3, Element type = i32 we have
   // VecTy = <12 x i32>.
 
+  int Cost;
+  if (X86InterleavedPassCost(Opcode, VecTy, Factor, Cost))
+    return Cost;
+
   // Calculate the number of memory operations (NumOfMemOps), required
   // for load/store the VecTy.
   MVT LegalVT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
@@ -2718,8 +2782,8 @@
     if (NumOfResults > 1 && ShuffleKind == TTI::SK_PermuteTwoSrc)
       NumOfMoves = NumOfResults * NumOfShufflesPerResult / 2;
 
-    int Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
-               NumOfUnfoldedLoads * MemOpCost + NumOfMoves;
+    Cost = NumOfResults * NumOfShufflesPerResult * ShuffleCost +
+           NumOfUnfoldedLoads * MemOpCost + NumOfMoves;
 
     return Cost;
   }
@@ -2738,8 +2802,8 @@
   // The SK_MergeTwoSrc shuffle clobbers one of src operands.
   // We need additional instructions to keep sources.
   unsigned NumOfMoves = NumOfMemOps * NumOfShufflesPerStore / 2;
-  int Cost = NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
-             NumOfMoves;
+  Cost = NumOfMemOps * (MemOpCost + NumOfShufflesPerStore * ShuffleCost) +
+         NumOfMoves;
   return Cost;
 }
 
Index: test/Analysis/CostModel/X86/interleaved-load-i8.ll
===================================================================
--- test/Analysis/CostModel/X86/interleaved-load-i8.ll
+++ test/Analysis/CostModel/X86/interleaved-load-i8.ll
@@ -10,8 +10,8 @@
 ;CHECK: LV: Found an estimated cost of 11 for VF 2 For instruction:   %0 = load i8
 ;CHECK: LV: Found an estimated cost of 5 for VF 4 For instruction:   %0 = load i8
 ;CHECK: LV: Found an estimated cost of 10 for VF 8 For instruction:   %0 = load i8
-;CHECK: LV: Found an estimated cost of 20 for VF 16 For instruction:   %0 = load i8
-;CHECK: LV: Found an estimated cost of 45 for VF 32 For instruction:   %0 = load i8
+;CHECK: LV: Found an estimated cost of 13 for VF 16 For instruction:   %0 = load i8
+;CHECK: LV: Found an estimated cost of 16 for VF 32 For instruction:   %0 = load i8
 entry:
   %cmp13 = icmp sgt i32 %Nels, 0
   br i1 %cmp13, label %for.body.preheader, label %for.end
Index: test/Analysis/CostModel/X86/interleaved-store-i8.ll
===================================================================
--- test/Analysis/CostModel/X86/interleaved-store-i8.ll
+++ test/Analysis/CostModel/X86/interleaved-store-i8.ll
@@ -10,8 +10,8 @@
 ;CHECK: LV: Found an estimated cost of 8 for VF 2 For instruction:   store i8 %conv4
 ;CHECK: LV: Found an estimated cost of 9 for VF 4 For instruction:   store i8 %conv4
 ;CHECK: LV: Found an estimated cost of 12 for VF 8 For instruction:   store i8 %conv4
-;CHECK: LV: Found an estimated cost of 19 for VF 16 For instruction:   store i8 %conv4
-;CHECK: LV: Found an estimated cost of 35 for VF 32 For instruction:   store i8 %conv4
+;CHECK: LV: Found an estimated cost of 13 for VF 16 For instruction:   store i8 %conv4
+;CHECK: LV: Found an estimated cost of 16 for VF 32 For instruction:   store i8 %conv4
 entry:
   %cmp14 = icmp sgt i32 %Nels, 0
   br i1 %cmp14, label %for.body.lr.ph, label %for.end
@@ -47,9 +47,9 @@
 ;CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction:   store i8 %conv7
 ;CHECK: LV: Found an estimated cost of 13 for VF 2 For instruction:   store i8 %conv7
 ;CHECK: LV: Found an estimated cost of 10 for VF 4 For instruction:   store i8 %conv7
-;CHECK: LV: Found an estimated cost of 17 for VF 8 For instruction:   store i8 %conv7
-;CHECK: LV: Found an estimated cost of 22 for VF 16 For instruction:   store i8 %conv7
-;CHECK: LV: Found an estimated cost of 44 for VF 32 For instruction:   store i8 %conv7
+;CHECK: LV: Found an estimated cost of 11 for VF 8 For instruction:   store i8 %conv7
+;CHECK: LV: Found an estimated cost of 12 for VF 16 For instruction:   store i8 %conv7
+;CHECK: LV: Found an estimated cost of 16 for VF 32 For instruction:   store i8 %conv7
 entry:
   %cmp19 = icmp sgt i32 %Nels, 0
   br i1 %cmp19, label %for.body.lr.ph, label %for.end
Index: test/Analysis/CostModel/X86/strided-load-i8.ll
===================================================================
--- test/Analysis/CostModel/X86/strided-load-i8.ll
+++ test/Analysis/CostModel/X86/strided-load-i8.ll
@@ -41,9 +41,9 @@
 ;CHECK: Found an estimated cost of 1 for VF 2 For instruction:   %1 = load
 ;CHECK: Found an estimated cost of 1 for VF 4 For instruction:   %1 = load
 ;CHECK: Found an estimated cost of 3 for VF 8 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 8 for VF 16 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 20 for VF 32 For instruction:   %1 = load
-;CHECK: Found an estimated cost of 39 for VF 64 For instruction:   %1 = load
+;CHECK: Found an estimated cost of 13 for VF 16 For instruction:   %1 = load
+;CHECK: Found an estimated cost of 16 for VF 32 For instruction:   %1 = load
+;CHECK: Found an estimated cost of 28 for VF 64 For instruction:   %1 = load
 entry:
   br label %for.body