diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -977,7 +977,8 @@
   /// \p AddressSpace is address space of the pointer.
   /// \p UseMaskForCond indicates if the memory access is predicated.
   /// \p UseMaskForGaps indicates if gaps should be masked.
-  int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
+  int getInterleavedMemoryOpCost(Instruction *I, unsigned VF,
+                                 unsigned Opcode, Type *VecTy, unsigned Factor,
                                  ArrayRef<unsigned> Indices, unsigned Alignment,
                                  unsigned AddressSpace,
                                  bool UseMaskForCond = false,
@@ -1341,7 +1342,8 @@
   virtual int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
                                      bool VariableMask,
                                      unsigned Alignment) = 0;
-  virtual int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+  virtual int getInterleavedMemoryOpCost(Instruction *I, unsigned VF,
+                                         unsigned Opcode, Type *VecTy,
                                          unsigned Factor,
                                          ArrayRef<unsigned> Indices,
                                          unsigned Alignment,
@@ -1763,11 +1765,12 @@
     return Impl.getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
                                        Alignment);
   }
-  int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
+  int getInterleavedMemoryOpCost(Instruction *I, unsigned VF,
+                                 unsigned Opcode, Type *VecTy, unsigned Factor,
                                  ArrayRef<unsigned> Indices, unsigned Alignment,
                                  unsigned AddressSpace, bool UseMaskForCond,
                                  bool UseMaskForGaps) override {
-    return Impl.getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+    return Impl.getInterleavedMemoryOpCost(I, VF, Opcode, VecTy, Factor, Indices,
                                            Alignment, AddressSpace,
                                            UseMaskForCond, UseMaskForGaps);
   }
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -481,7 +481,8 @@
     return 1;
   }
 
-  unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+  unsigned getInterleavedMemoryOpCost(Instruction *I, unsigned VF,
+                                      unsigned Opcode, Type *VecTy,
                                       unsigned Factor,
                                       ArrayRef<unsigned> Indices,
                                       unsigned Alignment, unsigned AddressSpace,
diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h
--- a/llvm/include/llvm/Analysis/VectorUtils.h
+++ b/llvm/include/llvm/Analysis/VectorUtils.h
@@ -464,6 +464,14 @@
 Constant *createSequentialMask(IRBuilderBase &Builder, unsigned Start,
                                unsigned NumInts, unsigned NumUndefs);
 
+/// Create a TBL1 byte mask whose unused bytes default to 0xFF.
+///
+/// This function creates a TBL1 mask whose elements default to 0xff; an
+/// out-of-range index makes the TBL1 instruction write zero to the
+/// corresponding byte of the output vector.
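+///
+/// For example (assuming a little-endian element layout), InputMask = {0, 2},
+/// NumElts = 2, InputEltSize = 16 and OutputEltSize = 64 yield the byte mask
+///   <0, 1, 0xff x 6, 4, 5, 0xff x 6>,
+/// i.e. each selected i16 element is zero-extended into its own 64-bit output
+/// lane.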
+Constant *createTbl1Mask(IRBuilderBase &Builder,
+                         SmallVector<int, 16> &InputMask, unsigned NumElts,
+                         unsigned InputEltSize, unsigned OutputEltSize);
+
 /// Concatenate a list of vectors.
 ///
 /// This function generates code that concatenate the vectors in \p Vecs into a
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -914,7 +914,8 @@
     return Cost;
   }
 
-  unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+  unsigned getInterleavedMemoryOpCost(Instruction *I, unsigned VF,
+                                      unsigned Opcode, Type *VecTy,
                                       unsigned Factor,
                                       ArrayRef<unsigned> Indices,
                                       unsigned Alignment, unsigned AddressSpace,
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2562,6 +2562,14 @@
     return false;
   }
 
+  /// Lower a shufflevector to target-specific intrinsics. Return
+  /// true on success.
+  ///
+  /// \p SI is the shufflevector instruction to be lowered.
+  virtual bool lowerShuffleVector(ShuffleVectorInst *SI) const {
+    return false;
+  }
+
   /// Return true if zero-extending the specific node Val to type VT2 is free
   /// (either because it's implicitly zero-extended such as ARM ldrb / ldrh or
   /// because it's folded such as X86 zero-extending loads).
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -570,6 +570,9 @@
 def int_aarch64_neon_st4lane : AdvSIMD_4Vec_Store_Lane_Intrinsic;
 
 let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
+  class AdvSIMD_Tbl1_temp_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty, llvm_v16i8_ty],
+                [IntrNoMem]>;
   class AdvSIMD_Tbl1_Intrinsic
     : Intrinsic<[llvm_anyvector_ty], [llvm_v16i8_ty, LLVMMatchType<0>],
                 [IntrNoMem]>;
@@ -611,6 +614,7 @@
 def int_aarch64_neon_tbl2 : AdvSIMD_Tbl2_Intrinsic;
 def int_aarch64_neon_tbl3 : AdvSIMD_Tbl3_Intrinsic;
 def int_aarch64_neon_tbl4 : AdvSIMD_Tbl4_Intrinsic;
+def int_aarch64_neon_tbl1_temp : AdvSIMD_Tbl1_temp_Intrinsic;
 
 def int_aarch64_neon_tbx1 : AdvSIMD_Tbx1_Intrinsic;
 def int_aarch64_neon_tbx2 : AdvSIMD_Tbx2_Intrinsic;
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -681,11 +681,12 @@
   return Cost;
 }
 
-int TargetTransformInfo::getInterleavedMemoryOpCost(
+int TargetTransformInfo::getInterleavedMemoryOpCost(Instruction *I, unsigned VF,
     unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
     unsigned Alignment, unsigned AddressSpace, bool UseMaskForCond,
     bool UseMaskForGaps) const {
-  int Cost = TTIImpl->getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+  int Cost = TTIImpl->getInterleavedMemoryOpCost(I, VF,
+                                                 Opcode, VecTy, Factor, Indices,
                                                  Alignment, AddressSpace,
                                                  UseMaskForCond,
                                                  UseMaskForGaps);
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -726,6 +726,48 @@
   return ConstantVector::get(Mask);
 }
 
+Constant *llvm::createTbl1Mask(IRBuilderBase &Builder,
+                               SmallVector<int, 16> &InputMask,
+                               unsigned NumElts, unsigned InputEltSize,
+                               unsigned OutputEltSize) {
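+  // Build the 16 mask bytes one at a time: bytes that fall inside the window
+  // of the current input element copy the corresponding byte of the selected
+  // source element; every other byte gets 0xff so that TBL1 writes zero there.
+  // The byte offsets below assume a little-endian element layout.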
+  unsigned InputEltIdx = 0;
+  unsigned CurrInputIdx = 0;
+  unsigned CurrOffset;
+  unsigned OffsetLeft = 0;
+  unsigned OffsetRight = InputEltSize;
+
+  SmallVector<Constant *, 16> Mask;
+  for (unsigned i = 0; i < 16; i++) {
+    // If all input elements have already been placed in the output vector,
+    // fill the remaining bytes with an out-of-range index.
+    if (InputEltIdx >= NumElts)
+      Mask.push_back(Builder.getInt8(255));
+    else {
+      CurrOffset = i * 8;
+      if (CurrOffset >= OffsetLeft && CurrOffset < OffsetRight) {
+        CurrInputIdx = InputMask[InputEltIdx] * InputEltSize / 8 +
+                       (CurrOffset - OffsetLeft) / 8;
+        Mask.push_back(Builder.getInt8(CurrInputIdx));
+      } else if (CurrOffset == OffsetRight) {
+        // Finished one input element; move on to the next one.
+        InputEltIdx++;
+        if (InputEltIdx >= NumElts) {
+          Mask.push_back(Builder.getInt8(255));
+          continue;
+        }
+        OffsetLeft = OutputEltSize * InputEltIdx;
+        OffsetRight = OffsetLeft + InputEltSize;
+        // Check whether this byte already belongs to the new element.
+        if (CurrOffset >= OffsetLeft && CurrOffset < OffsetRight) {
+          CurrInputIdx = InputMask[InputEltIdx] * InputEltSize / 8 +
+                         (CurrOffset - OffsetLeft) / 8;
+          Mask.push_back(Builder.getInt8(CurrInputIdx));
+        } else
+          Mask.push_back(Builder.getInt8(255));
+      } else
+        Mask.push_back(Builder.getInt8(255));
+    }
+  }
+  return ConstantVector::get(Mask);
+}
+
 /// A helper function for concatenating vectors. This function concatenates two
 /// vectors having the same element type. If the second vector has fewer
 /// elements than the first, it is padded with undefs.
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -112,6 +112,10 @@
   bool lowerInterleavedStore(StoreInst *SI,
                              SmallVector<Instruction *, 32> &DeadInsts);
 
+  /// Transform a shufflevector whose type does not match its user into
+  /// target-specific intrinsics.
+  bool lowerShuffleVector(ShuffleVectorInst *SI,
+                          SmallVector<Instruction *, 32> &DeadInsts);
+
   /// Returns true if the uses of an interleaved load by the
   /// extractelement instructions in \p Extracts can be replaced by uses of the
   /// shufflevector instructions in \p Shuffles instead. If so, the necessary
@@ -442,6 +446,21 @@
   return true;
 }
 
+bool InterleavedAccess::lowerShuffleVector(
+    ShuffleVectorInst *SI, SmallVector<Instruction *, 32> &DeadInsts) {
+
+  LLVM_DEBUG(dbgs() << "IA: Found a shufflevector: " << *SI << "\n");
+
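+  // A typical candidate is a partial shuffle that feeds a widening
+  // conversion, e.g. (illustrative IR only):
+  //   %s = shufflevector <8 x i16> %v, <8 x i16> undef, <2 x i32> <i32 0, i32 2>
+  //   %f = uitofp <2 x i16> %s to <2 x double>
+  // which the target may be able to select as a single TBL1.
+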
+  // Try to create target-specific intrinsics to replace the shuffle.
+  if (!TLI->lowerShuffleVector(SI))
+    return false;
+
+  // We now have a target-specific tbl1 intrinsic; mark the old shufflevector
+  // as dead so it can be erased.
+  DeadInsts.push_back(SI);
+
+  return true;
+}
+
 bool InterleavedAccess::runOnFunction(Function &F) {
   auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
   if (!TPC || !LowerInterleavedAccesses)
@@ -469,5 +488,14 @@
   for (auto I : DeadInsts)
     I->eraseFromParent();
 
+  SmallVector<Instruction *, 32> SFDeadInsts;
+  for (auto &I : instructions(F)) {
+    if (ShuffleVectorInst *SHI = dyn_cast<ShuffleVectorInst>(&I))
+      Changed |= lowerShuffleVector(SHI, SFDeadInsts);
+  }
+
+  for (auto I : SFDeadInsts)
+    I->eraseFromParent();
+
   return Changed;
 }
diff --git a/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp b/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp
--- a/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp
+++ b/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp
@@ -1206,7 +1206,7 @@
   SmallVector<unsigned, 4> Indices;
   for (unsigned i = 0; i < Factor; i++)
     Indices.push_back(i);
-  InterleavedCost = TTI.getInterleavedMemoryOpCost(
+  InterleavedCost = TTI.getInterleavedMemoryOpCost(nullptr, 0,
       Instruction::Load, ILTy, Factor, Indices, InsertionPoint->getAlignment(),
      InsertionPoint->getPointerAddressSpace());
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAG.cpp
@@ -3581,6 +3581,19 @@
   case Intrinsic::aarch64_tagp:
     SelectTagP(Node);
     return;
+  case Intrinsic::aarch64_neon_tbl1_temp: {
+    SDLoc dl(Node);
+
+    SmallVector<SDValue, 2> Ops;
+    // The source vector.
+    Ops.push_back(Node->getOperand(1));
+    // The byte mask.
+    Ops.push_back(Node->getOperand(2));
+    ReplaceNode(Node, CurDAG->getMachineNode(AArch64::TBLv16i8One, dl, VT, Ops));
+    return;
+  }
   case Intrinsic::aarch64_neon_tbl2:
     SelectTable(Node, 2,
                 VT == MVT::v8i8 ? AArch64::TBLv8i8Two : AArch64::TBLv16i8Two,
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -423,6 +423,8 @@
   bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
                              unsigned Factor) const override;
 
+  bool lowerShuffleVector(ShuffleVectorInst *SI) const override;
+
   bool isLegalAddImmediate(int64_t) const override;
   bool isLegalICmpImmediate(int64_t) const override;
 
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -9454,6 +9454,119 @@
   return true;
 }
 
+bool AArch64TargetLowering::lowerShuffleVector(ShuffleVectorInst *SI) const {
+  IRBuilder<> Builder(SI);
+
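+  // The intended rewrite, sketched on an illustrative example:
+  //   %s = shufflevector <8 x i16> %v, <8 x i16> undef, <2 x i32> <i32 0, i32 2>
+  //   %f = uitofp <2 x i16> %s to <2 x double>
+  // becomes (modulo the exact intrinsic name mangling)
+  //   %t = call <2 x i64> @llvm.aarch64.neon.tbl1.temp.v2i64.v8i16(
+  //            <8 x i16> %v, <16 x i8> <byte mask>)
+  //   %f = uitofp <2 x i64> %t to <2 x double>
+  // so a single TBL1 performs both the lane selection and the zero-extension.
+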
+  // First check the shufflevector instruction itself:
+  // 1) The first operand has to be 128 bits wide, since the byte mask requires
+  //    a 16 x i8 vector. Smaller shuffles are not handled here (e.g. a v2i16
+  //    source is only 32 bits).
+  // 2) The second operand has to be undef, as TBL1 reads a single register.
+  if (SI->getOperand(0)->getType()->isVectorTy() &&
+      SI->getOperand(0)->getType()->getPrimitiveSizeInBits() != 128)
+    return false;
+
+  // The second operand has to be undef.
+  if (Constant *C = dyn_cast<Constant>(SI->getOperand(1)))
+    if (!isa<UndefValue>(C))
+      return false;
+
+  // We only handle a shufflevector with a single user instruction here;
+  // multiple users would require multiple tbl1 instructions, which is left to
+  // a later stage of the implementation.
+  if (!SI->hasOneUse())
+    return false;
+
+  // Now check the single user. Only UIToFP and a few other instructions are
+  // handled at this stage; the list can be extended later.
+  auto UI = SI->user_begin();
+  Instruction *I = cast<Instruction>(*UI);
+
+  if (I->getOpcode() != Instruction::UIToFP &&
+      I->getOpcode() != Instruction::FAdd &&
+      I->getOpcode() != Instruction::FSub &&
+      I->getOpcode() != Instruction::FMul &&
+      I->getOpcode() != Instruction::Add &&
+      I->getOpcode() != Instruction::Sub) {
+    LLVM_DEBUG(dbgs() << "Shufflevector user instruction does not qualify: "
+                      << *I << "\n");
+    return false;
+  }
+
+  // Now do the type check on the vector. If the type of the input vector to
+  // the user instruction is the same as the user's result type, the shuffle is
+  // already handled by the later DAG lowering and there is nothing to do here.
+  VectorType *SVTy = SI->getType();
+  if (SVTy == I->getType())
+    return false;
+
+  // All unhandled cases have been excluded; now build the intrinsic call.
+  Type *SVEltTy = SVTy->getVectorElementType();
+  unsigned SVNum = SVTy->getVectorNumElements();
+  Type *PromotedIntTy;
+
+  // Decide the tbl1 result type from the user's (e.g. UIToFP) result type.
+  // Only 64-bit elements with two lanes and 32-bit elements with four lanes
+  // are supported, so pick the matching integer type for the tbl1 result.
+  unsigned UIEltSize =
+      I->getType()->getVectorElementType()->getScalarSizeInBits();
+  if (UIEltSize == 64 && SVNum == 2)
+    PromotedIntTy = Type::getInt64Ty(SI->getType()->getContext());
+  else if (UIEltSize == 32 && SVNum == 4)
+    PromotedIntTy = Type::getInt32Ty(SI->getType()->getContext());
+  else
+    return false;
+
+  VectorType *VecTy = VectorType::get(PromotedIntTy, SVNum);
+
+  // The overloaded types of the intrinsic: the tbl1 result type followed by
+  // the tbl1 input source vector type.
+  Type *Tys[2] = {VecTy, SI->getOperand(0)->getType()};
+
+  // Get the input shuffle mask.
+  auto Mask = SI->getShuffleMask();
+
+  // Get the intrinsic declaration.
+  Function *Tbl1Func = Intrinsic::getDeclaration(
+      SI->getModule(), Intrinsic::aarch64_neon_tbl1_temp, Tys);
+
+  // Generate one tbl1 per use; these could be merged when the uses agree on
+  // the result type.
+  for (auto UI = SI->user_begin(), E = SI->user_end(); UI != E; UI++) {
+    Instruction *UserInst = cast<Instruction>(*UI);
+    Type *UserTy = UserInst->getType();
+    // Two operands: the first is the input vector, the second is the mask.
+    SmallVector<Value *, 2> Ops;
+
+    // This is the vector operand of the tbl1 intrinsic; any vector type is
+    // accepted, but it has to be adjusted to match the user's result type.
+    // Changing the type here should be safe, although later passes could be
+    // affected.
+    Ops.push_back(SI->getOperand(0));
+
+    // This is the mask operand of the tbl1 intrinsic, which has to be v16i8.
+    // It is derived from the input shuffle mask together with the user's
+    // result type.
+    unsigned InputEltSize = SVEltTy->getPrimitiveSizeInBits();
+    unsigned OutputEltSize =
+        UserTy->getVectorElementType()->getPrimitiveSizeInBits();
+    Value *Tbl1Mask =
+        createTbl1Mask(Builder, Mask, SVNum, InputEltSize, OutputEltSize);
+    LLVM_DEBUG(dbgs() << "Tbl1 mask: "; Tbl1Mask->dump());
+    Ops.push_back(Tbl1Mask);
+
+    // Make the call for this user and rewrite the user to consume it.
+    CallInst *Tbl1 = Builder.CreateCall(Tbl1Func, Ops);
+    UI->replaceUsesOfWith(SI, Tbl1);
+  }
+
+  return true;
+}
+
 EVT AArch64TargetLowering::getOptimalMemOpType(
     const MemOp &Op, const AttributeList &FuncAttributes) const {
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -190,7 +190,8 @@
     return BaseT::isLegalNTStore(DataType, Alignment);
   }
 
-  int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
+  int getInterleavedMemoryOpCost(Instruction *I, unsigned VF,
+                                 unsigned Opcode, Type *VecTy, unsigned Factor,
                                  ArrayRef<unsigned> Indices, unsigned Alignment,
                                  unsigned AddressSpace,
                                  bool UseMaskForCond = false,
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -677,7 +677,8 @@
   return LT.first;
 }
 
-int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+int AArch64TTIImpl::getInterleavedMemoryOpCost(Instruction *I, unsigned VF,
+                                               unsigned Opcode, Type *VecTy,
                                                unsigned Factor,
                                               ArrayRef<unsigned> Indices,
                                               unsigned Alignment,
@@ -700,7 +701,32 @@
       return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL);
   }
 
-  return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
+  // Now check whether this interleaved memory access can be lowered to TBL1
+  // later in the InterleavedAccessPass. If so, the cost is the number of TBL1
+  // instructions times the per-TBL1 cost, which is currently set to 1.
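+  // For example, a factor-4 interleaved load of i16 elements vectorized at
+  // VF == 2 whose single user converts to <2 x double>: 64 * 2 == 128 and
+  // 16 * 4 * 2 == 128, so the checks below succeed and the returned cost is
+  // Factor == 4, i.e. one TBL1 per member of the interleave group.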
+  if (I && VF > 1 && I->hasOneUse()) {
+    auto UI = I->user_begin();
+    Instruction *UserInstruction = cast<Instruction>(*UI);
+    // We currently support only the following user instructions; the list can
+    // be extended.
+    if (UserInstruction->getOpcode() == Instruction::UIToFP ||
+        UserInstruction->getOpcode() == Instruction::FAdd ||
+        UserInstruction->getOpcode() == Instruction::FSub ||
+        UserInstruction->getOpcode() == Instruction::FMul ||
+        UserInstruction->getOpcode() == Instruction::Add ||
+        UserInstruction->getOpcode() == Instruction::Sub) {
+      // The first check makes sure the results form a 128-bit vector; the
+      // second makes sure the input data fits into a 128-bit vector, so that
+      // a TBL1 instruction can be used. Group->getFactor() TBL1s will be
+      // generated, and each TBL1 costs 1.
+      if ((UserInstruction->getType()->getScalarSizeInBits() * VF == 128) &&
+          (I->getType()->getScalarSizeInBits() * Factor * VF == 128))
+        return Factor * 1;
+    }
+  }
+
+  return BaseT::getInterleavedMemoryOpCost(I, VF, Opcode, VecTy, Factor, Indices,
                                            Alignment, AddressSpace,
                                            UseMaskForCond, UseMaskForGaps);
 }
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5914,7 +5914,7 @@
   // Calculate the cost of the whole interleaved group.
   bool UseMaskForGaps =
       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
-  unsigned Cost = TTI.getInterleavedMemoryOpCost(
+  unsigned Cost = TTI.getInterleavedMemoryOpCost(I, VF,
       I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
       Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps);