Index: llvm/include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- llvm/include/llvm/Analysis/TargetTransformInfo.h +++ llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -938,7 +938,8 @@ /// \p AddressSpace is address space of the pointer. /// \p UseMaskForCond indicates if the memory access is predicated. /// \p UseMaskForGaps indicates if gaps should be masked. - int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, + int getInterleavedMemoryOpCost(Instruction *I, unsigned VF, unsigned Opcode, + Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, unsigned AddressSpace, bool UseMaskForCond = false, @@ -1319,7 +1320,8 @@ bool VariableMask, unsigned Alignment, const Instruction *I = nullptr) = 0; virtual int - getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, + getInterleavedMemoryOpCost(Instruction *I, unsigned VF, unsigned Opcode, + Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, unsigned AddressSpace, bool UseMaskForCond = false, bool UseMaskForGaps = false) = 0; @@ -1731,12 +1733,13 @@ return Impl.getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, Alignment, I); } - int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, + int getInterleavedMemoryOpCost(Instruction *I, unsigned VF, unsigned Opcode, + Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, unsigned AddressSpace, bool UseMaskForCond, bool UseMaskForGaps) override { - return Impl.getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace, + return Impl.getInterleavedMemoryOpCost(I, VF, Opcode, VecTy, Factor, + Indices, Alignment, AddressSpace, UseMaskForCond, UseMaskForGaps); } int getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -445,7 +445,8 @@ return 1; } - unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned getInterleavedMemoryOpCost(Instruction *I, unsigned VF, + unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, unsigned AddressSpace, Index: llvm/include/llvm/CodeGen/BasicTTIImpl.h =================================================================== --- llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -900,7 +900,8 @@ return Cost; } - unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, + unsigned getInterleavedMemoryOpCost(Instruction *I, unsigned VF, + unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, unsigned Alignment, unsigned AddressSpace, Index: llvm/include/llvm/CodeGen/TargetLowering.h =================================================================== --- llvm/include/llvm/CodeGen/TargetLowering.h +++ llvm/include/llvm/CodeGen/TargetLowering.h @@ -2575,6 +2575,12 @@ return false; } + /// Lower a shufflevector to target specific intrinsics. Return + /// true on success. + /// + /// \p SI is the shufflevector to RE-interleave the stored vector. 
+ virtual bool lowerShuffleVector(ShuffleVectorInst *SI) const { return false; } + /// Return true if zero-extending the specific node Val to type VT2 is free /// (either because it's implicitly zero-extended such as ARM ldrb / ldrh or /// because it's folded such as X86 zero-extending loads). Index: llvm/include/llvm/IR/IntrinsicsAArch64.td =================================================================== --- llvm/include/llvm/IR/IntrinsicsAArch64.td +++ llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -570,6 +570,10 @@ def int_aarch64_neon_st4lane : AdvSIMD_4Vec_Store_Lane_Intrinsic; let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.". + class AdvSIMD_Tbl1_temp_Intrinsic + : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty, llvm_v16i8_ty], + [IntrNoMem]>; + class AdvSIMD_Tbl1_Intrinsic : Intrinsic<[llvm_anyvector_ty], [llvm_v16i8_ty, LLVMMatchType<0>], [IntrNoMem]>; @@ -607,6 +611,7 @@ llvm_v16i8_ty, llvm_v16i8_ty, LLVMMatchType<0>], [IntrNoMem]>; } +def int_aarch64_neon_tbl1_temp : AdvSIMD_Tbl1_temp_Intrinsic; def int_aarch64_neon_tbl1 : AdvSIMD_Tbl1_Intrinsic; def int_aarch64_neon_tbl2 : AdvSIMD_Tbl2_Intrinsic; def int_aarch64_neon_tbl3 : AdvSIMD_Tbl3_Intrinsic; Index: llvm/lib/Analysis/TargetTransformInfo.cpp =================================================================== --- llvm/lib/Analysis/TargetTransformInfo.cpp +++ llvm/lib/Analysis/TargetTransformInfo.cpp @@ -669,12 +669,12 @@ } int TargetTransformInfo::getInterleavedMemoryOpCost( - unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, - unsigned Alignment, unsigned AddressSpace, bool UseMaskForCond, - bool UseMaskForGaps) const { + Instruction *I, unsigned VF, unsigned Opcode, Type *VecTy, unsigned Factor, + ArrayRef Indices, unsigned Alignment, unsigned AddressSpace, + bool UseMaskForCond, bool UseMaskForGaps) const { int Cost = TTIImpl->getInterleavedMemoryOpCost( - Opcode, VecTy, Factor, Indices, Alignment, AddressSpace, UseMaskForCond, - UseMaskForGaps); + I, VF, Opcode, VecTy, Factor, Indices, Alignment, AddressSpace, + UseMaskForCond, UseMaskForGaps); assert(Cost >= 0 && "TTI should not produce negative costs!"); return Cost; } Index: llvm/lib/CodeGen/InterleavedAccessPass.cpp =================================================================== --- llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -112,6 +112,10 @@ bool lowerInterleavedStore(StoreInst *SI, SmallVector &DeadInsts); + /// Transform an type unmatched shufflevector into target specific intrinsics. + bool lowerShuffleVector(ShuffleVectorInst *SI, + SmallVector &DeadInsts); + /// Returns true if the uses of an interleaved load by the /// extractelement instructions in \p Extracts can be replaced by uses of the /// shufflevector instructions in \p Shuffles instead. If so, the necessary @@ -443,6 +447,22 @@ return true; } +bool InterleavedAccess::lowerShuffleVector( + ShuffleVectorInst *SI, SmallVector &DeadInsts) { + + LLVM_DEBUG(dbgs() << "IA: Found a shufflevector: " << *SI << "\n"); + + // Try to create target specific intrinsics to replace the shuffle. + if (!TLI->lowerShuffleVector(SI)) + return false; + + // Already have a new target specific tbl instruction. Erase the old + // shufflevector. 
+ DeadInsts.push_back(SI); + + return true; +} + bool InterleavedAccess::runOnFunction(Function &F) { auto *TPC = getAnalysisIfAvailable(); if (!TPC || !LowerInterleavedAccesses) @@ -470,5 +490,14 @@ for (auto I : DeadInsts) I->eraseFromParent(); + SmallVector SFDeadInsts; + for (auto &I : instructions(F)) { + if (ShuffleVectorInst *SHI = dyn_cast(&I)) + Changed |= lowerShuffleVector(SHI, SFDeadInsts); + } + + for (auto *I : SFDeadInsts) + I->eraseFromParent(); + return Changed; } Index: llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp =================================================================== --- llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp +++ llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp @@ -1207,8 +1207,8 @@ for (unsigned i = 0; i < Factor; i++) Indices.push_back(i); InterleavedCost = TTI.getInterleavedMemoryOpCost( - Instruction::Load, ILTy, Factor, Indices, InsertionPoint->getAlignment(), - InsertionPoint->getPointerAddressSpace()); + nullptr, 0, Instruction::Load, ILTy, Factor, Indices, + InsertionPoint->getAlignment(), InsertionPoint->getPointerAddressSpace()); if (InterleavedCost >= InstructionCost) { return false; Index: llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -3678,6 +3678,19 @@ case Intrinsic::aarch64_tagp: SelectTagP(Node); return; + case Intrinsic::aarch64_neon_tbl1_temp: { + SDLoc Dl(Node); + + SmallVector Ops; + // the source vector + Ops.push_back(Node->getOperand(1)); + // the mask + Ops.push_back(Node->getOperand(2)); + ReplaceNode(Node, + CurDAG->getMachineNode(AArch64::TBLv16i8One, Dl, VT, Ops)); + + return; + } case Intrinsic::aarch64_neon_tbl2: SelectTable(Node, 2, VT == MVT::v8i8 ? AArch64::TBLv8i8Two : AArch64::TBLv16i8Two, Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -457,6 +457,7 @@ unsigned Factor) const override; bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const override; + bool lowerShuffleVector(ShuffleVectorInst *SI) const override; bool isLegalAddImmediate(int64_t) const override; bool isLegalICmpImmediate(int64_t) const override; @@ -867,6 +868,13 @@ bool shouldLocalize(const MachineInstr &MI, const TargetTransformInfo *TTI) const override; + + /// Create a tbl1 mask with default 0xFF. + /// This function creates tbl1 mask whose elements are defaults to 0xff which + /// means to fill '0' to the output vector. + Constant *createTbl1Mask(IRBuilderBase &Builder, ArrayRef InputMask, + unsigned NumElts, unsigned InputEltSize, + unsigned OutputEltSize) const; }; namespace AArch64 { Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -9622,6 +9622,123 @@ return true; } +bool AArch64TargetLowering::lowerShuffleVector(ShuffleVectorInst *SI) const { + IRBuilder<> Builder(SI); + + // First check the shuffle_vector instruction + // 1) The first operand has to be 128 bit, byte mask requires the vector + // size has to be 16*i8. 
+  //    We do not handle smaller vector shuffles here; a v2i16 shuffle, for
+  //    example, is only 32 bits wide.
+  // 2) The second operand has to be UNDEF for the TBL1 form.
+  if (SI->getOperand(0)->getType()->isVectorTy() &&
+      SI->getOperand(0)->getType()->getPrimitiveSizeInBits() != 128)
+    return false;
+
+  // The second operand has to be UNDEF.
+  if (Constant *C = dyn_cast<Constant>(SI->getOperand(1)))
+    if (!(isa<UndefValue>(C)))
+      return false;
+
+  // We only handle a shufflevector with a single user here; multiple users
+  // would require multiple TBL1 instructions and are left to a later stage.
+  if (!SI->hasOneUse())
+    return false;
+
+  // Check the single user. Only UIToFP and a few other instructions are
+  // handled at this stage; the list can be extended later.
+  auto UI = SI->user_begin();
+  Instruction *I = cast<Instruction>(*UI);
+
+  if (I->getOpcode() != Instruction::UIToFP &&
+      I->getOpcode() != Instruction::FAdd &&
+      I->getOpcode() != Instruction::FSub &&
+      I->getOpcode() != Instruction::FMul &&
+      I->getOpcode() != Instruction::Add &&
+      I->getOpcode() != Instruction::Sub) {
+    LLVM_DEBUG(dbgs() << "IA: shufflevector user does not qualify: " << *I
+                      << "\n");
+    return false;
+  }
+
+  // Type check on the vector: if the shuffle's result type is the same as the
+  // user's result type, the case is already handled during DAG lowering and
+  // there is nothing to do here.
+  VectorType *SVTy = SI->getType();
+  if (SVTy == I->getType())
+    return false;
+
+  // All unsupported cases have been excluded; now build the intrinsic call.
+  Type *SVEltTy = SVTy->getElementType();
+  unsigned SVNum = SVTy->getNumElements();
+  Type *PromotedIntTy;
+
+  // Decide the TBL1 result type from the user's (UIToFP) result type. Since
+  // that result can only be a 64-bit or 32-bit float, use the integer type of
+  // the same width for the TBL1 result.
+  unsigned UIEltSize = I->getType()->getScalarSizeInBits();
+  if (UIEltSize == 64 && SVNum == 2)
+    PromotedIntTy = Type::getInt64Ty(SI->getType()->getContext());
+  else if (UIEltSize == 32 && SVNum == 4)
+    PromotedIntTy = Type::getInt32Ty(SI->getType()->getContext());
+  else
+    return false;
+
+  VectorType *VecTy = VectorType::get(PromotedIntTy, SVNum);
+
+  // The overloaded types are the TBL1 result type followed by the TBL1 source
+  // vector type.
+  Type *Tys[2] = {VecTy, SI->getOperand(0)->getType()};
+
+  // Get the shuffle mask.
+  auto Mask = SI->getShuffleMask();
+
+  // Get the intrinsic declaration.
+  Function *Tbl1Func = Intrinsic::getDeclaration(
+      SI->getModule(), Intrinsic::aarch64_neon_tbl1_temp, Tys);
+
+  // Generate one TBL1 per use; uses with the same input type could be merged.
+  for (auto UI = SI->user_begin(), E = SI->user_end(); UI != E; UI++) {
+    Instruction *I = cast<Instruction>(*UI);
+    Type *UserTy = I->getType();
+
+    // Two operands: the source vector followed by the mask.
+    SmallVector<Value *, 2> Ops;
+
+    // The vector operand of the TBL1 intrinsic. Any vector type is accepted,
+    // but it has to be adjusted to match the user's result type; changing the
+    // type here is believed to be safe, although later passes could be
+    // affected.
+    Ops.push_back(SI->getOperand(0));
+
+    // The mask operand of the TBL1 intrinsic; it has to be a v16i8 value. It
+    // is derived from the shuffle mask together with the element sizes of the
+    // source vector and of the user's result type.
+    unsigned InputEltSize = SVEltTy->getPrimitiveSizeInBits();
+    unsigned OutputEltSize = UserTy->getScalarSizeInBits();
+    Value *Tbl1mask =
+        createTbl1Mask(Builder, Mask, SVNum, InputEltSize, OutputEltSize);
+    LLVM_DEBUG(dbgs() << "Tbl1 mask: "; Tbl1mask->dump());
+    Ops.push_back(Tbl1mask);
+
+    // Make the call for this user.
+    CallInst *Tbl1 = Builder.CreateCall(Tbl1Func, Ops);
+    UI->replaceUsesOfWith(SI, Tbl1);
+  }
+
+  return true;
+}
 
 EVT AArch64TargetLowering::getOptimalMemOpType(
     const MemOp &Op, const AttributeList &FuncAttributes) const {
@@ -14043,3 +14160,51 @@
   }
   return TargetLoweringBase::shouldLocalize(MI, TTI);
 }
+
+Constant *AArch64TargetLowering::createTbl1Mask(IRBuilderBase &Builder,
+                                                ArrayRef<int> InputMask,
+                                                unsigned NumElts,
+                                                unsigned InputEltSize,
+                                                unsigned OutputEltSize) const {
+
+  unsigned InputEltIdx = 0;
+  unsigned CurrInputIdx = 0;
+  unsigned CurrOffset;
+  unsigned OffsetLeft = 0;
+  unsigned OffsetRight = InputEltSize;
+
+  SmallVector<Constant *, 16> Mask;
+  for (unsigned Idx = 0; Idx < 16; Idx++) {
+    // Once all input elements have been placed in the output vector, fill the
+    // remaining bytes with an out-of-range index (0xFF), which TBL turns into
+    // zero.
+    if (InputEltIdx >= NumElts)
+      Mask.push_back(Builder.getInt8(255));
+    else {
+      CurrOffset = Idx * 8;
+      if (CurrOffset >= OffsetLeft && CurrOffset < OffsetRight) {
+        CurrInputIdx = InputMask[InputEltIdx] * InputEltSize / 8 +
+                       (CurrOffset - OffsetLeft) / 8;
+        Mask.push_back(Builder.getInt8(CurrInputIdx));
+      }
+      // Finished one input element; move on to the next.
+      else if (CurrOffset == OffsetRight) {
+        InputEltIdx++;
+        if (InputEltIdx >= NumElts) {
+          Mask.push_back(Builder.getInt8(255));
+          continue;
+        }
+        OffsetLeft = OutputEltSize * InputEltIdx;
+        OffsetRight = OffsetLeft + InputEltSize;
+        // Check whether this byte belongs to the new element.
+        if (CurrOffset >= OffsetLeft && CurrOffset < OffsetRight) {
+          CurrInputIdx = InputMask[InputEltIdx] * InputEltSize / 8 +
+                         (CurrOffset - OffsetLeft) / 8;
+          Mask.push_back(Builder.getInt8(CurrInputIdx));
+        } else
+          Mask.push_back(Builder.getInt8(255));
+      } else
+        Mask.push_back(Builder.getInt8(255));
+    }
+  }
+  return ConstantVector::get(Mask);
+}
Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -189,7 +189,8 @@
     return BaseT::isLegalNTStore(DataType, Alignment);
   }
 
-  int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
+  int getInterleavedMemoryOpCost(Instruction *I, unsigned VF, unsigned Opcode,
+                                 Type *VecTy, unsigned Factor,
                                  ArrayRef<unsigned> Indices, unsigned Alignment,
                                  unsigned AddressSpace,
                                  bool UseMaskForCond = false,
Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -683,13 +683,11 @@
   return LT.first;
 }
 
-int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
-                                               unsigned Factor,
-                                               ArrayRef<unsigned> Indices,
-                                               unsigned Alignment,
-                                               unsigned AddressSpace,
-                                               bool UseMaskForCond,
-                                               bool UseMaskForGaps) {
+int AArch64TTIImpl::getInterleavedMemoryOpCost(
+    Instruction *I, unsigned VF, unsigned Opcode, Type *VecTy, unsigned Factor,
+    ArrayRef<unsigned> Indices, unsigned Alignment, unsigned AddressSpace,
+    bool UseMaskForCond, bool UseMaskForGaps) {
+
   assert(Factor >= 2 && "Invalid interleave factor");
   auto *VecVTy = cast<VectorType>(VecTy);
@@ -706,8 +704,32 @@
     return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL);
   }
 
-  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
-                                           Alignment, AddressSpace,
+  // Check whether this interleaved memory access can later be lowered to TBL1
+  // in the InterleavedAccessPass. If it can, the cost is the number of TBL1s
+  // times the basic cost of a TBL1, which is set to 1 for now.
+  if (I && VF > 1 && I->hasOneUse()) {
+    auto UI = I->user_begin();
+    Instruction *UserInstruction = cast<Instruction>(*UI);
+    // Only the following instructions are supported for now; the list can be
+    // extended later.
+    if (UserInstruction->getOpcode() == Instruction::UIToFP ||
+        UserInstruction->getOpcode() == Instruction::FAdd ||
+        UserInstruction->getOpcode() == Instruction::FSub ||
+        UserInstruction->getOpcode() == Instruction::FMul ||
+        UserInstruction->getOpcode() == Instruction::Add ||
+        UserInstruction->getOpcode() == Instruction::Sub) {
+      // The first check ensures the user's result forms a 128-bit vector; the
+      // second ensures the loaded data fits into a 128-bit vector, so that a
+      // TBL1 can be used. The access is then lowered to Factor TBL1
+      // instructions, each costing 1.
+      if ((UserInstruction->getType()->getScalarSizeInBits() * VF == 128) &&
+          (I->getType()->getScalarSizeInBits() * Factor * VF == 128))
+        return Factor * 1;
+    }
+  }
+
+  return BaseT::getInterleavedMemoryOpCost(I, VF, Opcode, VecTy, Factor,
+                                           Indices, Alignment, AddressSpace,
                                            UseMaskForCond, UseMaskForGaps);
 }
 
Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.h
===================================================================
--- llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -216,7 +216,8 @@
   int getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
                       unsigned AddressSpace, const Instruction *I = nullptr);
 
-  int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
+  int getInterleavedMemoryOpCost(Instruction *I, unsigned VF, unsigned Opcode,
+                                 Type *VecTy, unsigned Factor,
                                  ArrayRef<unsigned> Indices, unsigned Alignment,
                                  unsigned AddressSpace,
                                  bool UseMaskForCond = false,
Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -823,9 +823,9 @@
 }
 
 int ARMTTIImpl::getInterleavedMemoryOpCost(
-    unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
-    unsigned Alignment, unsigned AddressSpace, bool UseMaskForCond,
-    bool UseMaskForGaps) {
+    Instruction *I, unsigned VF, unsigned Opcode, Type *VecTy, unsigned Factor,
+    ArrayRef<unsigned> Indices, unsigned Alignment, unsigned AddressSpace,
+    bool UseMaskForCond, bool UseMaskForGaps) {
   assert(Factor >= 2 && "Invalid interleave factor");
   assert(isa<VectorType>(VecTy) && "Expect a vector type");
 
@@ -855,8 +855,8 @@
     return 2 * BaseCost;
   }
 
-  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
-                                           Alignment, AddressSpace,
+  return BaseT::getInterleavedMemoryOpCost(I, VF, Opcode, VecTy, Factor,
+                                           Indices, Alignment, AddressSpace,
                                            UseMaskForCond, UseMaskForGaps);
 }
 
Index: llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
=================================================================== --- llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h +++ llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h @@ -124,10 +124,13 @@ unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr, bool VariableMask, unsigned Alignment, const Instruction *I); - unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, - unsigned Factor, ArrayRef Indices, unsigned Alignment, - unsigned AddressSpace, bool UseMaskForCond = false, - bool UseMaskForGaps = false); + unsigned getInterleavedMemoryOpCost(Instruction *I, unsigned VF, + unsigned Opcode, Type *VecTy, + unsigned Factor, + ArrayRef Indices, + unsigned Alignment, unsigned AddressSpace, + bool UseMaskForCond = false, + bool UseMaskForGaps = false); unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, const Instruction *I); unsigned getArithmeticInstrCost( Index: llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp =================================================================== --- llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp +++ llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp @@ -220,13 +220,13 @@ Alignment, I); } -unsigned HexagonTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, - Type *VecTy, unsigned Factor, ArrayRef Indices, - unsigned Alignment, unsigned AddressSpace, bool UseMaskForCond, - bool UseMaskForGaps) { +unsigned HexagonTTIImpl::getInterleavedMemoryOpCost( + Instruction *I, unsigned VF, unsigned Opcode, Type *VecTy, unsigned Factor, + ArrayRef Indices, unsigned Alignment, unsigned AddressSpace, + bool UseMaskForCond, bool UseMaskForGaps) { if (Indices.size() != Factor || UseMaskForCond || UseMaskForGaps) - return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace, + return BaseT::getInterleavedMemoryOpCost(I, VF, Opcode, VecTy, Factor, + Indices, Alignment, AddressSpace, UseMaskForCond, UseMaskForGaps); return getMemoryOpCost(Opcode, VecTy, MaybeAlign(Alignment), AddressSpace, nullptr); Index: llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h =================================================================== --- llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h +++ llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h @@ -104,10 +104,9 @@ int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index); int getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, const Instruction *I = nullptr); - int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, - unsigned Factor, - ArrayRef Indices, - unsigned Alignment, + int getInterleavedMemoryOpCost(Instruction *I, unsigned VF, unsigned Opcode, + Type *VecTy, unsigned Factor, + ArrayRef Indices, unsigned Alignment, unsigned AddressSpace, bool UseMaskForCond = false, bool UseMaskForGaps = false); Index: llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp =================================================================== --- llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -907,16 +907,13 @@ return Cost; } -int PPCTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, - unsigned Factor, - ArrayRef Indices, - unsigned Alignment, - unsigned AddressSpace, - bool UseMaskForCond, - bool UseMaskForGaps) { +int PPCTTIImpl::getInterleavedMemoryOpCost( + Instruction *I, unsigned VF, unsigned Opcode, Type *VecTy, unsigned Factor, + ArrayRef Indices, unsigned Alignment, unsigned AddressSpace, + bool UseMaskForCond, 
bool UseMaskForGaps) { if (UseMaskForCond || UseMaskForGaps) - return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace, + return BaseT::getInterleavedMemoryOpCost(I, VF, Opcode, VecTy, Factor, + Indices, Alignment, AddressSpace, UseMaskForCond, UseMaskForGaps); assert(isa(VecTy) && Index: llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h =================================================================== --- llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h +++ llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h @@ -96,10 +96,9 @@ int getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, const Instruction *I = nullptr); - int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, - unsigned Factor, - ArrayRef Indices, - unsigned Alignment, + int getInterleavedMemoryOpCost(Instruction *I, unsigned VFactor, + unsigned Opcode, Type *VecTy, unsigned Factor, + ArrayRef Indices, unsigned Alignment, unsigned AddressSpace, bool UseMaskForCond = false, bool UseMaskForGaps = false); Index: llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp =================================================================== --- llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -1072,16 +1072,13 @@ // needed for using / defining the vector operands. The SystemZ version does // roughly the same but bases the computations on vector permutations // instead. -int SystemZTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, - unsigned Factor, - ArrayRef Indices, - unsigned Alignment, - unsigned AddressSpace, - bool UseMaskForCond, - bool UseMaskForGaps) { +int SystemZTTIImpl::getInterleavedMemoryOpCost( + Instruction *I, unsigned VFactor, unsigned Opcode, Type *VecTy, + unsigned Factor, ArrayRef Indices, unsigned Alignment, + unsigned AddressSpace, bool UseMaskForCond, bool UseMaskForGaps) { if (UseMaskForCond || UseMaskForGaps) - return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace, + return BaseT::getInterleavedMemoryOpCost(I, VFactor, Opcode, VecTy, Factor, + Indices, Alignment, AddressSpace, UseMaskForCond, UseMaskForGaps); assert(isa(VecTy) && "Expect a vector type for interleaved memory op"); Index: llvm/lib/Target/X86/X86TargetTransformInfo.h =================================================================== --- llvm/lib/Target/X86/X86TargetTransformInfo.h +++ llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -161,9 +161,10 @@ int getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsPairwiseForm, bool IsUnsigned); - int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, - unsigned Factor, ArrayRef Indices, - unsigned Alignment, unsigned AddressSpace, + int getInterleavedMemoryOpCost(Instruction *I, unsigned VF, unsigned Opcode, + Type *VecTy, unsigned Factor, + ArrayRef Indices, unsigned Alignment, + unsigned AddressSpace, bool UseMaskForCond = false, bool UseMaskForGaps = false); int getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy, Index: llvm/lib/Target/X86/X86TargetTransformInfo.cpp =================================================================== --- llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -3904,15 +3904,15 @@ bool UseMaskForGaps) { if (UseMaskForCond || UseMaskForGaps) - return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace, + return 
BaseT::getInterleavedMemoryOpCost(nullptr, 0, Opcode, VecTy, Factor, + Indices, Alignment, AddressSpace, UseMaskForCond, UseMaskForGaps); // We currently Support only fully-interleaved groups, with no gaps. // TODO: Support also strided loads (interleaved-groups with gaps). if (Indices.size() && Indices.size() != Factor) - return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace); + return BaseT::getInterleavedMemoryOpCost(nullptr, 0, Opcode, VecTy, Factor, + Indices, Alignment, AddressSpace); // VecTy for interleave memop is . // So, for VF=4, Interleave Factor = 3, Element type = i32 we have @@ -3923,8 +3923,8 @@ // the VF=2, while v2i128 is an unsupported MVT vector type // (see MachineValueType.h::getVectorVT()). if (!LegalVT.isVector()) - return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace); + return BaseT::getInterleavedMemoryOpCost(nullptr, 0, Opcode, VecTy, Factor, + Indices, Alignment, AddressSpace); unsigned VF = cast(VecTy)->getNumElements() / Factor; Type *ScalarTy = cast(VecTy)->getElementType(); @@ -3945,8 +3945,8 @@ VectorType *VT = VectorType::get(ScalarTy, VF); EVT ETy = TLI->getValueType(DL, VT); if (!ETy.isSimple()) - return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace); + return BaseT::getInterleavedMemoryOpCost(nullptr, 0, Opcode, VecTy, Factor, + Indices, Alignment, AddressSpace); // TODO: Complete for other data-types and strides. // Each combination of Stride, ElementTy and VF results in a different @@ -4004,8 +4004,8 @@ return NumOfMemOps * MemOpCost + Entry->Cost; } - return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace); + return BaseT::getInterleavedMemoryOpCost(nullptr, 0, Opcode, VecTy, Factor, + Indices, Alignment, AddressSpace); } // Get estimation for interleaved load/store operations and strided load. @@ -4021,8 +4021,8 @@ bool UseMaskForGaps) { if (UseMaskForCond || UseMaskForGaps) - return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace, + return BaseT::getInterleavedMemoryOpCost(nullptr, 0, Opcode, VecTy, Factor, + Indices, Alignment, AddressSpace, UseMaskForCond, UseMaskForGaps); // VecTy for interleave memop is . 
@@ -4139,13 +4139,11 @@ return Cost; } -int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, - unsigned Factor, - ArrayRef Indices, - unsigned Alignment, - unsigned AddressSpace, - bool UseMaskForCond, - bool UseMaskForGaps) { +int X86TTIImpl::getInterleavedMemoryOpCost( + Instruction *I, unsigned VF, unsigned Opcode, Type *VecTy, unsigned Factor, + ArrayRef Indices, unsigned Alignment, unsigned AddressSpace, + bool UseMaskForCond, bool UseMaskForGaps) { + auto isSupportedOnAVX512 = [](Type *VecTy, bool HasBW) { Type *EltTy = cast(VecTy)->getElementType(); if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) || @@ -4164,7 +4162,7 @@ Alignment, AddressSpace, UseMaskForCond, UseMaskForGaps); - return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, - Alignment, AddressSpace, + return BaseT::getInterleavedMemoryOpCost(I, VF, Opcode, VecTy, Factor, + Indices, Alignment, AddressSpace, UseMaskForCond, UseMaskForGaps); } Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5938,7 +5938,7 @@ bool UseMaskForGaps = Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); unsigned Cost = TTI.getInterleavedMemoryOpCost( - I->getOpcode(), WideVecTy, Group->getFactor(), Indices, + I, VF, I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign().value(), AS, Legal->isMaskRequired(I), UseMaskForGaps); if (Group->isReverse()) {
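Note (illustration only, not part of the patch): the byte layout that createTbl1Mask is intended to produce can be sanity-checked with the standalone sketch below. It assumes little-endian lanes, that each output lane is OutputEltSize bits wide with its low InputEltSize bits copied from the source element selected by the shuffle mask, and that every remaining byte is set to 0xFF so TBL writes zeros there (which is what makes a later uitofp on the widened lanes see a zero-extended value). The helper name tbl1MaskSketch and the concrete sizes are invented for the example; for this case the output matches the loop in the patch.

// Standalone sketch of the TBL1 byte-mask layout described above.
#include <cstdint>
#include <cstdio>
#include <vector>

// Returns the 16 TBL1 byte indices for a shuffle whose result has NumElts
// elements of InputEltSize bits, consumed by a user whose lanes are
// OutputEltSize bits wide. 0xFF marks "out of range", which TBL turns into 0.
static std::vector<uint8_t> tbl1MaskSketch(const std::vector<int> &ShuffleMask,
                                           unsigned NumElts,
                                           unsigned InputEltSize,
                                           unsigned OutputEltSize) {
  std::vector<uint8_t> Mask(16, 0xFF);
  unsigned InBytes = InputEltSize / 8, OutBytes = OutputEltSize / 8;
  for (unsigned Elt = 0; Elt < NumElts && (Elt + 1) * OutBytes <= 16; ++Elt)
    for (unsigned B = 0; B < InBytes; ++B)
      Mask[Elt * OutBytes + B] = ShuffleMask[Elt] * InBytes + B;
  return Mask;
}

int main() {
  // Example: the even lanes of a <8 x i16> load (shuffle mask <0,2,4,6>),
  // zero-extended into four 32-bit lanes for a uitofp user.
  for (uint8_t B : tbl1MaskSketch({0, 2, 4, 6}, 4, 16, 32))
    std::printf("%u ", B);
  std::printf("\n"); // 0 1 255 255 4 5 255 255 8 9 255 255 12 13 255 255
}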
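Worked example of the new AArch64 cost check (an interpretation of the conditions in the patch, not additional functionality): with VF = 4 and an interleave group of factor 2 over i16 loads, the input side is 16 * 2 * 4 = 128 bits, and if the single user converts to float the result side is 32 * 4 = 128 bits, so both conditions hold and the returned cost is Factor * 1 = 2, i.e. one TBL1 per group member. A factor-4 group of i16 loads with VF = 2 feeding a conversion to double similarly satisfies 16 * 4 * 2 = 128 and 64 * 2 = 128 and is costed at 4.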