diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -938,7 +938,8 @@
   /// \p AddressSpace is address space of the pointer.
   /// \p UseMaskForCond indicates if the memory access is predicated.
   /// \p UseMaskForGaps indicates if gaps should be masked.
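+  /// \p I is the scalar memory instruction being vectorized, if available
+  /// (may be null), and \p VF is the vectorization factor; targets may use
+  /// them to refine the cost.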
-  int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
+  int getInterleavedMemoryOpCost(Instruction *I, unsigned VF,
+                                 unsigned Opcode, Type *VecTy, unsigned Factor,
                                  ArrayRef<unsigned> Indices, unsigned Alignment,
                                  unsigned AddressSpace,
                                  bool UseMaskForCond = false,
@@ -1319,7 +1320,8 @@
                                      bool VariableMask, unsigned Alignment,
                                      const Instruction *I = nullptr) = 0;
   virtual int
-  getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
+  getInterleavedMemoryOpCost(Instruction *I, unsigned VF,
+                             unsigned Opcode, Type *VecTy, unsigned Factor,
                              ArrayRef<unsigned> Indices, unsigned Alignment,
                              unsigned AddressSpace, bool UseMaskForCond = false,
                              bool UseMaskForGaps = false) = 0;
@@ -1731,11 +1733,12 @@
     return Impl.getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
                                        Alignment, I);
   }
-  int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
+  int getInterleavedMemoryOpCost(Instruction *I, unsigned VF,
+                                 unsigned Opcode, Type *VecTy, unsigned Factor,
                                  ArrayRef<unsigned> Indices, unsigned Alignment,
                                  unsigned AddressSpace, bool UseMaskForCond,
                                  bool UseMaskForGaps) override {
-    return Impl.getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+    return Impl.getInterleavedMemoryOpCost(I, VF, Opcode, VecTy, Factor, Indices,
                                            Alignment, AddressSpace,
                                            UseMaskForCond, UseMaskForGaps);
   }
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -445,7 +445,8 @@
     return 1;
   }
 
-  unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+  unsigned getInterleavedMemoryOpCost(Instruction *I, unsigned VF,
+                                      unsigned Opcode, Type *VecTy,
                                       unsigned Factor,
                                       ArrayRef<unsigned> Indices,
                                       unsigned Alignment, unsigned AddressSpace,
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -900,7 +900,8 @@
     return Cost;
   }
 
-  unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+  unsigned getInterleavedMemoryOpCost(Instruction *I, unsigned VF,
+                                      unsigned Opcode, Type *VecTy,
                                       unsigned Factor,
                                       ArrayRef<unsigned> Indices,
                                       unsigned Alignment, unsigned AddressSpace,
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2575,6 +2575,12 @@
     return false;
   }
 
+  /// Try to lower a shufflevector to target-specific intrinsics. Return
+  /// true on success.
+  ///
+  /// \p SI is a shufflevector whose result type does not match its single
+  /// user's type; the InterleavedAccess pass calls this hook for such
+  /// shuffles.
+  virtual bool lowerShuffleVector(ShuffleVectorInst *SI) const { return false; }
+
   /// Return true if zero-extending the specific node Val to type VT2 is free
   /// (either because it's implicitly zero-extended such as ARM ldrb / ldrh or
   /// because it's folded such as X86 zero-extending loads).
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -570,6 +570,10 @@
 def int_aarch64_neon_st4lane  : AdvSIMD_4Vec_Store_Lane_Intrinsic;
 
 let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
+  class AdvSIMD_Tbl1_temp_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty, llvm_v16i8_ty],
+                [IntrNoMem]>;
+
   class AdvSIMD_Tbl1_Intrinsic
     : Intrinsic<[llvm_anyvector_ty], [llvm_v16i8_ty, LLVMMatchType<0>],
                 [IntrNoMem]>;
@@ -607,6 +611,7 @@
                  llvm_v16i8_ty, llvm_v16i8_ty, LLVMMatchType<0>],
                 [IntrNoMem]>;
 }
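+// Temporary tbl1 variant used when lowering type-mismatched shuffles. Unlike
+// int_aarch64_neon_tbl1, its result and table types are overloaded
+// independently, and the v16i8 byte-index mask is the second operand.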
+def int_aarch64_neon_tbl1_temp : AdvSIMD_Tbl1_temp_Intrinsic;
 def int_aarch64_neon_tbl1 : AdvSIMD_Tbl1_Intrinsic;
 def int_aarch64_neon_tbl2 : AdvSIMD_Tbl2_Intrinsic;
 def int_aarch64_neon_tbl3 : AdvSIMD_Tbl3_Intrinsic;
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -668,11 +668,11 @@
   return Cost;
 }
 
-int TargetTransformInfo::getInterleavedMemoryOpCost(
+int TargetTransformInfo::getInterleavedMemoryOpCost(Instruction *I, unsigned VF,
     unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
     unsigned Alignment, unsigned AddressSpace, bool UseMaskForCond,
     bool UseMaskForGaps) const {
-  int Cost = TTIImpl->getInterleavedMemoryOpCost(
+  int Cost = TTIImpl->getInterleavedMemoryOpCost(I, VF,
       Opcode, VecTy, Factor, Indices, Alignment, AddressSpace, UseMaskForCond,
       UseMaskForGaps);
   assert(Cost >= 0 && "TTI should not produce negative costs!");
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -112,6 +112,10 @@
   bool lowerInterleavedStore(StoreInst *SI,
                              SmallVector<Instruction *, 32> &DeadInsts);
 
+  /// Transform a shufflevector whose result type does not match its user's
+  /// type into target-specific intrinsics.
+  bool lowerShuffleVector(ShuffleVectorInst *SI,
+                          SmallVector<Instruction *, 32> &DeadInsts);
+
   /// Returns true if the uses of an interleaved load by the
   /// extractelement instructions in \p Extracts can be replaced by uses of the
   /// shufflevector instructions in \p Shuffles instead. If so, the necessary
@@ -443,6 +447,22 @@
   return true;
 }
 
+bool InterleavedAccess::lowerShuffleVector(
+    ShuffleVectorInst *SI, SmallVector<Instruction *, 32> &DeadInsts) {
+  LLVM_DEBUG(dbgs() << "IA: Found a shufflevector: " << *SI << "\n");
+
+  // Try to create target specific intrinsics to replace the shuffle.
+  if (!TLI->lowerShuffleVector(SI))
+    return false;
+
+  // We now have a target-specific tbl instruction; erase the old
+  // shufflevector.
+  DeadInsts.push_back(SI);
+
+  return true;
+}
+
 bool InterleavedAccess::runOnFunction(Function &F) {
   auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
   if (!TPC || !LowerInterleavedAccesses)
@@ -470,5 +490,14 @@
   for (auto I : DeadInsts)
     I->eraseFromParent();
 
+  SmallVector<Instruction *, 32> SFDeadInsts;
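+  // Separately, try to lower type-mismatched shufflevectors to
+  // target-specific intrinsics.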
+  for (auto &I : instructions(F)) {
+    if (ShuffleVectorInst *SHI = dyn_cast<ShuffleVectorInst>(&I))
+      Changed |= lowerShuffleVector(SHI, SFDeadInsts);
+  }
+
+  for (auto *I : SFDeadInsts)
+    I->eraseFromParent();
+
   return Changed;
 }
diff --git a/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp b/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp
--- a/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp
+++ b/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp
@@ -1207,8 +1207,8 @@
   for (unsigned i = 0; i < Factor; i++)
     Indices.push_back(i);
   InterleavedCost = TTI.getInterleavedMemoryOpCost(
-      Instruction::Load, ILTy, Factor, Indices, InsertionPoint->getAlignment(),
-      InsertionPoint->getPointerAddressSpace());
+      nullptr, 0, Instruction::Load, ILTy, Factor, Indices,
+      InsertionPoint->getAlignment(), InsertionPoint->getPointerAddressSpace());
 
   if (InterleavedCost >= InstructionCost) {
     return false;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -3678,6 +3678,19 @@
     case Intrinsic::aarch64_tagp:
       SelectTagP(Node);
       return;
+    case Intrinsic::aarch64_neon_tbl1_temp: {
+      SDLoc DL(Node);
+
+      SmallVector<SDValue, 2> Ops;
+      // The table (source) vector.
+      Ops.push_back(Node->getOperand(1));
+      // The v16i8 byte-index mask.
+      Ops.push_back(Node->getOperand(2));
+      ReplaceNode(Node,
+                  CurDAG->getMachineNode(AArch64::TBLv16i8One, DL, VT, Ops));
+
+      return;
+    }
     case Intrinsic::aarch64_neon_tbl2:
       SelectTable(Node, 2,
                   VT == MVT::v8i8 ? AArch64::TBLv8i8Two : AArch64::TBLv16i8Two,
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -457,6 +457,7 @@
                             unsigned Factor) const override;
   bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
                              unsigned Factor) const override;
+  bool lowerShuffleVector(ShuffleVectorInst *SI) const override;
 
   bool isLegalAddImmediate(int64_t) const override;
   bool isLegalICmpImmediate(int64_t) const override;
@@ -867,6 +868,13 @@
 
   bool shouldLocalize(const MachineInstr &MI,
                       const TargetTransformInfo *TTI) const override;
+
+  /// Create a tbl1 byte-index mask whose unused elements default to 0xff.
+  /// An 0xff entry is an out-of-range index, so tbl1 fills the corresponding
+  /// output byte with zero.
+  Constant *createTbl1Mask(IRBuilderBase &Builder,
+                           ArrayRef<int> InputMask, unsigned NumElts,
+                           unsigned InputEltSize, unsigned OutputEltSize) const;
 };
 
 namespace AArch64 {
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -9622,6 +9622,123 @@
   return true;
 }
 
+bool AArch64TargetLowering::lowerShuffleVector(ShuffleVectorInst *SI) const {
+  IRBuilder<> Builder(SI);
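+
+  // A sketch of the pattern this targets (a shuffle whose single user widens
+  // the element type):
+  //   %s = shufflevector <8 x i16> %v, <8 x i16> undef,
+  //                      <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  //   %f = uitofp <4 x i16> %s to <4 x float>
+  // The shuffle is replaced by a tbl1 producing <4 x i32>, with the byte mask
+  // <0, 1, 255, 255, 2, 3, 255, 255, 4, 5, 255, 255, 6, 7, 255, 255>, where
+  // the out-of-range 255 entries zero-fill the upper bytes of each lane; the
+  // user is then rewritten to consume the tbl1 result.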
+
+  // First check the shufflevector instruction:
+  // 1) The first operand has to be 128 bits wide, since the tbl1 byte mask
+  //    addresses a 16 x i8 table. We do not handle smaller vectors here (for
+  //    example, v2i16 is only 32 bits).
+  // 2) The second operand has to be undef for the tbl1 instruction.
+  if (SI->getOperand(0)->getType()->getPrimitiveSizeInBits() != 128)
+    return false;
+
+  // The second operand has to be undef.
+  if (!isa<UndefValue>(SI->getOperand(1)))
+    return false;
+
+  // We only handle a shufflevector with a single user here, because multiple
+  // users would each require their own tbl1 instruction; we leave that to a
+  // later stage.
+  if (!SI->hasOneUse())
+    return false;
+
+  // Now check the single user instruction. We only handle UIToFP and a few
+  // other instructions at this stage; the list can be expanded later.
+  auto UI = SI->user_begin();
+  Instruction *I = cast<Instruction>(*UI);
+
+  // Only the following opcodes are supported for now.
+  if (I->getOpcode() != Instruction::UIToFP &&
+      I->getOpcode() != Instruction::FAdd &&
+      I->getOpcode() != Instruction::FSub &&
+      I->getOpcode() != Instruction::FMul &&
+      I->getOpcode() != Instruction::Add &&
+      I->getOpcode() != Instruction::Sub) {
+    LLVM_DEBUG(dbgs() << "Shufflevector user instruction does not qualify: "
+                      << *I << "\n");
+    return false;
+  }
+
+  // Now do the type check on the vector. If the input vector of the user
+  // instruction has the same type as the user instruction's result, it is
+  // already handled in the later DAG lowering stage, so there is no need to
+  // handle it here.
+  VectorType *SVTy = SI->getType();
+  if (SVTy == I->getType())
+    return false;
+
+  // At this point all unhandled cases have been excluded, so we can build
+  // the intrinsic call.
+  Type *SVEltTy = SVTy->getElementType();
+  unsigned SVNum = SVTy->getNumElements();
+  Type *PromotedIntTy;
+
+  // Decide the tbl1 instruction's result type based on its user's (e.g.
+  // UIToFP) result type. As that type can only have 64-bit or 32-bit
+  // elements, we pick the corresponding integer type for the tbl1 result.
+  unsigned UIEltSize = I->getType()->getScalarSizeInBits();
+  if (UIEltSize == 64 && SVNum == 2)
+    PromotedIntTy = Type::getInt64Ty(SI->getType()->getContext());
+  else if (UIEltSize == 32 && SVNum == 4)
+    PromotedIntTy = Type::getInt32Ty(SI->getType()->getContext());
+  else
+    return false;
+
+  VectorType *VecTy = VectorType::get(PromotedIntTy, SVNum);
+
+  // The overloaded types are the tbl1 result type, followed by the tbl1
+  // input (table) vector type.
+  Type *Tys[2] = {VecTy, SI->getOperand(0)->getType()};
+
+  // Get the input Mask
+  auto Mask = SI->getShuffleMask();
+
+  // Get the declaration of the tbl1 intrinsic with these overloads.
+  Function *Tbl1Func = Intrinsic::getDeclaration(
+      SI->getModule(), Intrinsic::aarch64_neon_tbl1_temp, Tys);
+
+  // Generate one tbl1 per user; these could be merged when the users agree
+  // on the input type.
+  for (auto UI = SI->user_begin(), E = SI->user_end(); UI != E;) {
+    // Advance the iterator before rewriting the use below, since the rewrite
+    // removes this use from SI's use list.
+    Instruction *User = cast<Instruction>(*UI++);
+    Type *UserTy = User->getType();
+
+    // Two operands: the input (table) vector first, then the mask.
+    SmallVector<Value *, 2> Ops;
+
+    // This is the vector operand to the tbl1 intrinsic; any vector type is
+    // OK, but we need to adjust it to match the user's result type. It
+    // should be safe to change the type here, though later passes could be
+    // affected.
+    Ops.push_back(SI->getOperand(0));
+
+    // This is the mask operand to the tbl1 intrinsic, which has to be of
+    // v16i8 type. We work it out from the shuffle mask together with the
+    // result type of SI's user (UserTy).
+    unsigned InputEltSize = SVEltTy->getPrimitiveSizeInBits();
+    unsigned OutputEltSize = UserTy->getScalarSizeInBits();
+    Value *Tbl1mask =
+        createTbl1Mask(Builder, Mask, SVNum, InputEltSize, OutputEltSize);
+    LLVM_DEBUG(dbgs() << "Tbl1 mask: "; Tbl1mask->dump());
+    Ops.push_back(Tbl1mask);
+
+    // Make the call for this user and rewrite its use of SI.
+    CallInst *Tbl1 = Builder.CreateCall(Tbl1Func, Ops);
+    User->replaceUsesOfWith(SI, Tbl1);
+  }
+
+  // The shufflevector was successfully lowered.
+  return true;
+}
 
 EVT AArch64TargetLowering::getOptimalMemOpType(
     const MemOp &Op, const AttributeList &FuncAttributes) const {
@@ -14043,3 +14160,51 @@
   }
   return TargetLoweringBase::shouldLocalize(MI, TTI);
 }
+
+Constant *AArch64TargetLowering::createTbl1Mask(IRBuilderBase &Builder,
+                                                ArrayRef<int> InputMask,
+                                                unsigned NumElts,
+                                                unsigned InputEltSize,
+                                                unsigned OutputEltSize) const {
+
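+  // For example, InputMask = {0, 1, 2, 3} with InputEltSize = 16 and
+  // OutputEltSize = 32 produces
+  //   <0, 1, 255, 255, 2, 3, 255, 255, 4, 5, 255, 255, 6, 7, 255, 255>:
+  // each 16-bit input element is copied to the first two bytes of its 32-bit
+  // output lane, and the 255 entries zero-fill the remaining bytes.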
+  unsigned InputEltIdx = 0;
+  unsigned CurrInputIdx = 0;
+  unsigned CurrOffset;
+  unsigned OffsetLeft = 0;
+  unsigned OffsetRight = InputEltSize;
+
+  SmallVector<Constant *, 16> Mask;
+  for (unsigned Idx = 0; Idx < 16; Idx++) {
+    // If all input elements have been placed in the output vector, just fill
+    // the rest with out-of-range indices.
+    if (InputEltIdx >= NumElts)
+      Mask.push_back(Builder.getInt8(255));
+    else {
+      CurrOffset = Idx * 8;
+      if (CurrOffset >= OffsetLeft && CurrOffset < OffsetRight) {
+        CurrInputIdx = InputMask[InputEltIdx] * InputEltSize / 8 +
+                       (CurrOffset - OffsetLeft) / 8;
+        Mask.push_back(Builder.getInt8(CurrInputIdx));
+      }
+      // Finished one input element; move to the next.
+      else if (CurrOffset == OffsetRight) {
+        InputEltIdx++;
+        if (InputEltIdx >= NumElts) {
+          Mask.push_back(Builder.getInt8(255));
+          continue;
+        }
+        OffsetLeft = OutputEltSize * InputEltIdx;
+        OffsetRight = OffsetLeft + InputEltSize;
+        // Check whether this byte falls within the new element.
+        if (CurrOffset >= OffsetLeft && CurrOffset < OffsetRight) {
+          CurrInputIdx = InputMask[InputEltIdx] * InputEltSize / 8 +
+                         (CurrOffset - OffsetLeft) / 8;
+          Mask.push_back(Builder.getInt8(CurrInputIdx));
+        } else
+          Mask.push_back(Builder.getInt8(255));
+      } else
+        Mask.push_back(Builder.getInt8(255));
+    }
+  }
+  return ConstantVector::get(Mask);
+}
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -189,7 +189,8 @@
     return BaseT::isLegalNTStore(DataType, Alignment);
   }
 
-  int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
+  int getInterleavedMemoryOpCost(Instruction *I, unsigned VF, unsigned Opcode,
+                                 Type *VecTy, unsigned Factor,
                                  ArrayRef<unsigned> Indices, unsigned Alignment,
                                  unsigned AddressSpace,
                                  bool UseMaskForCond = false,
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -683,13 +683,11 @@
   return LT.first;
 }
 
-int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
-                                               unsigned Factor,
-                                               ArrayRef<unsigned> Indices,
-                                               unsigned Alignment,
-                                               unsigned AddressSpace,
-                                               bool UseMaskForCond,
-                                               bool UseMaskForGaps) {
+int AArch64TTIImpl::getInterleavedMemoryOpCost(
+    Instruction *I, unsigned VF, unsigned Opcode, Type *VecTy, unsigned Factor,
+    ArrayRef<unsigned> Indices, unsigned Alignment, unsigned AddressSpace,
+    bool UseMaskForCond, bool UseMaskForGaps) {
   assert(Factor >= 2 && "Invalid interleave factor");
   auto *VecVTy = cast<VectorType>(VecTy);
 
@@ -706,8 +704,32 @@
       return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL);
   }
 
-  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
-                                           Alignment, AddressSpace,
+  // Check whether this interleaved memory access can be lowered to a TBL1
+  // instruction later in the InterleavedAccessPass. If so, the cost is the
+  // number of TBL1 instructions times the base TBL1 cost, which is set to 1
+  // for now.
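+  // For example, a scalar i16 load in a factor-2 group whose single user is
+  // a uitofp to float, vectorized at VF = 4: 32 * 4 == 128 and
+  // 16 * 2 * 4 == 128, so the group costs Factor == 2.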
+  if (I && VF > 1 && I->hasOneUse()) {
+    auto UI = I->user_begin();
+    Instruction *UserInstruction = cast<Instruction>(*UI);
+    // We currently support only the following instructions; the list can be
+    // expanded.
+    if (UserInstruction->getOpcode() == Instruction::UIToFP ||
+        UserInstruction->getOpcode() == Instruction::FAdd ||
+        UserInstruction->getOpcode() == Instruction::FSub ||
+        UserInstruction->getOpcode() == Instruction::FMul ||
+        UserInstruction->getOpcode() == Instruction::Add ||
+        UserInstruction->getOpcode() == Instruction::Sub) {
+      // The first check makes sure the result can form a 128-bit vector; the
+      // second makes sure the input data fits into a 128-bit vector, so a
+      // tbl1 instruction can be used. Factor tbl1 instructions will be
+      // generated, each costing 1.
+      if ((UserInstruction->getType()->getScalarSizeInBits() * VF == 128) &&
+          (I->getType()->getScalarSizeInBits() * Factor * VF == 128))
+        return Factor;
+    }
+  }
+
+  return BaseT::getInterleavedMemoryOpCost(I, VF, Opcode, VecTy, Factor,
+                                           Indices, Alignment, AddressSpace,
                                            UseMaskForCond, UseMaskForGaps);
 }
 
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -216,7 +216,8 @@
   int getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
                       unsigned AddressSpace, const Instruction *I = nullptr);
 
-  int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
+  int getInterleavedMemoryOpCost(Instruction *I, unsigned VF,
+                                 unsigned Opcode, Type *VecTy, unsigned Factor,
                                  ArrayRef<unsigned> Indices, unsigned Alignment,
                                  unsigned AddressSpace,
                                  bool UseMaskForCond = false,
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -822,7 +822,7 @@
   return BaseCost * LT.first;
 }
 
-int ARMTTIImpl::getInterleavedMemoryOpCost(
+int ARMTTIImpl::getInterleavedMemoryOpCost(Instruction *I, unsigned VF,
     unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
     unsigned Alignment, unsigned AddressSpace, bool UseMaskForCond,
     bool UseMaskForGaps) {
@@ -855,7 +855,7 @@
       return 2 * BaseCost;
   }
 
-  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+  return BaseT::getInterleavedMemoryOpCost(I, VF, Opcode, VecTy, Factor, Indices,
                                            Alignment, AddressSpace,
                                            UseMaskForCond, UseMaskForGaps);
 }
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -124,7 +124,8 @@
   unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
                                   bool VariableMask, unsigned Alignment,
                                   const Instruction *I);
-  unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+  unsigned getInterleavedMemoryOpCost(Instruction *I, unsigned VF,
+            unsigned Opcode, Type *VecTy,
             unsigned Factor, ArrayRef<unsigned> Indices, unsigned Alignment,
             unsigned AddressSpace, bool UseMaskForCond = false,
             bool UseMaskForGaps = false);
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -220,12 +220,14 @@
                                        Alignment, I);
 }
 
-unsigned HexagonTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode,
+unsigned HexagonTTIImpl::getInterleavedMemoryOpCost(Instruction *I,
+      unsigned VF, unsigned Opcode,
       Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
       unsigned Alignment, unsigned AddressSpace, bool UseMaskForCond,
       bool UseMaskForGaps) {
   if (Indices.size() != Factor || UseMaskForCond || UseMaskForGaps)
-    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+    return BaseT::getInterleavedMemoryOpCost(I, VF,
+                                             Opcode, VecTy, Factor, Indices,
                                              Alignment, AddressSpace,
                                              UseMaskForCond, UseMaskForGaps);
   return getMemoryOpCost(Opcode, VecTy, MaybeAlign(Alignment), AddressSpace,
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -104,7 +104,8 @@
   int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
   int getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
                       unsigned AddressSpace, const Instruction *I = nullptr);
-  int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+  int getInterleavedMemoryOpCost(Instruction *I, unsigned VF,
+                                 unsigned Opcode, Type *VecTy,
                                  unsigned Factor,
                                  ArrayRef<unsigned> Indices,
                                  unsigned Alignment,
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -907,7 +907,8 @@
   return Cost;
 }
 
-int PPCTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+int PPCTTIImpl::getInterleavedMemoryOpCost(Instruction *I, unsigned VF,
+                                           unsigned Opcode, Type *VecTy,
                                            unsigned Factor,
                                            ArrayRef<unsigned> Indices,
                                            unsigned Alignment,
@@ -915,7 +916,8 @@
                                            bool UseMaskForCond,
                                            bool UseMaskForGaps) {
   if (UseMaskForCond || UseMaskForGaps)
-    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+    return BaseT::getInterleavedMemoryOpCost(I, VF,
+                                             Opcode, VecTy, Factor, Indices,
                                              Alignment, AddressSpace,
                                              UseMaskForCond, UseMaskForGaps);
 
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -96,7 +96,8 @@
   int getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
                       unsigned AddressSpace, const Instruction *I = nullptr);
 
-  int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+  int getInterleavedMemoryOpCost(Instruction *I, unsigned VFactor,
+                                 unsigned Opcode, Type *VecTy,
                                  unsigned Factor,
                                  ArrayRef<unsigned> Indices,
                                  unsigned Alignment,
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -1072,7 +1072,8 @@
 // needed for using / defining the vector operands. The SystemZ version does
 // roughly the same but bases the computations on vector permutations
 // instead.
-int SystemZTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+int SystemZTTIImpl::getInterleavedMemoryOpCost(Instruction *I, unsigned VFactor,
+                                               unsigned Opcode, Type *VecTy,
                                                unsigned Factor,
                                                ArrayRef<unsigned> Indices,
                                                unsigned Alignment,
@@ -1080,7 +1081,8 @@
                                                bool UseMaskForCond,
                                                bool UseMaskForGaps) {
   if (UseMaskForCond || UseMaskForGaps)
-    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+    return BaseT::getInterleavedMemoryOpCost(I, VFactor,
+                                             Opcode, VecTy, Factor, Indices,
                                              Alignment, AddressSpace,
                                              UseMaskForCond, UseMaskForGaps);
   assert(isa<VectorType>(VecTy) &&
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -161,7 +161,8 @@
   int getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
                              bool IsPairwiseForm, bool IsUnsigned);
 
-  int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+  int getInterleavedMemoryOpCost(Instruction *I, unsigned VF,
+                                 unsigned Opcode, Type *VecTy,
                                  unsigned Factor, ArrayRef<unsigned> Indices,
                                  unsigned Alignment, unsigned AddressSpace,
                                  bool UseMaskForCond = false,
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -3904,14 +3904,16 @@
                                                bool UseMaskForGaps) {
 
   if (UseMaskForCond || UseMaskForGaps)
-    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+    return BaseT::getInterleavedMemoryOpCost(nullptr, 0,
+                                             Opcode, VecTy, Factor, Indices,
                                              Alignment, AddressSpace,
                                              UseMaskForCond, UseMaskForGaps);
 
   // We currently Support only fully-interleaved groups, with no gaps.
   // TODO: Support also strided loads (interleaved-groups with gaps).
   if (Indices.size() && Indices.size() != Factor)
-    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+    return BaseT::getInterleavedMemoryOpCost(nullptr, 0,
+                                             Opcode, VecTy, Factor, Indices,
                                              Alignment, AddressSpace);
 
   // VecTy for interleave memop is <VF*Factor x Elt>.
@@ -3923,7 +3925,8 @@
   // the VF=2, while v2i128 is an unsupported MVT vector type
   // (see MachineValueType.h::getVectorVT()).
   if (!LegalVT.isVector())
-    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+    return BaseT::getInterleavedMemoryOpCost(nullptr, 0,
+                                             Opcode, VecTy, Factor, Indices,
                                              Alignment, AddressSpace);
 
   unsigned VF = cast<VectorType>(VecTy)->getNumElements() / Factor;
@@ -3945,7 +3948,8 @@
   VectorType *VT = VectorType::get(ScalarTy, VF);
   EVT ETy = TLI->getValueType(DL, VT);
   if (!ETy.isSimple())
-    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+    return BaseT::getInterleavedMemoryOpCost(nullptr, 0,
+                                             Opcode, VecTy, Factor, Indices,
                                              Alignment, AddressSpace);
 
   // TODO: Complete for other data-types and strides.
@@ -4004,7 +4008,8 @@
       return NumOfMemOps * MemOpCost + Entry->Cost;
   }
 
-  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+  return BaseT::getInterleavedMemoryOpCost(nullptr, 0,
+                                           Opcode, VecTy, Factor, Indices,
                                            Alignment, AddressSpace);
 }
 
@@ -4021,7 +4026,8 @@
                                                  bool UseMaskForGaps) {
 
   if (UseMaskForCond || UseMaskForGaps)
-    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+    return BaseT::getInterleavedMemoryOpCost(nullptr, 0,
+                                             Opcode, VecTy, Factor, Indices,
                                              Alignment, AddressSpace,
                                              UseMaskForCond, UseMaskForGaps);
 
@@ -4139,7 +4145,8 @@
   return Cost;
 }
 
-int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+int X86TTIImpl::getInterleavedMemoryOpCost(Instruction *I, unsigned VF,
+                                           unsigned Opcode, Type *VecTy,
                                            unsigned Factor,
                                            ArrayRef<unsigned> Indices,
                                            unsigned Alignment,
@@ -4164,7 +4171,8 @@
                                           Alignment, AddressSpace,
                                           UseMaskForCond, UseMaskForGaps);
 
-  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+  return BaseT::getInterleavedMemoryOpCost(I, VF,
+                                           Opcode, VecTy, Factor, Indices,
                                            Alignment, AddressSpace,
                                            UseMaskForCond, UseMaskForGaps);
 }
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5938,7 +5938,7 @@
   bool UseMaskForGaps =
       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
   unsigned Cost = TTI.getInterleavedMemoryOpCost(
-      I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
+      I, VF, I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
       Group->getAlign().value(), AS, Legal->isMaskRequired(I), UseMaskForGaps);
 
   if (Group->isReverse()) {