Index: docs/LangRef.rst
===================================================================
--- docs/LangRef.rst
+++ docs/LangRef.rst
@@ -9833,6 +9833,124 @@
       %res = select <16 x i1> %mask, <16 x float> %value, <16 x float> %oldval
       store <16 x float> %res, <16 x float>* %ptr, align 4
 
+Indexed Vector Load and Store Intrinsics
+----------------------------------------
+
+LLVM provides intrinsics for indexed vector load and store operations, which allow read/write access to multiple memory addresses.
+The addresses are specified by a base address and an index vector.
+
+.. _int_iload:
+
+'``llvm.indexed.load.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic. The loaded data is a vector of any integer or floating point data type.
+
+::
+
+      declare <16 x i32>   @llvm.indexed.load.v16i32 (i32* <ptr>, <16 x i32> <index>, i32 <alignment>)
+      declare <8 x double> @llvm.indexed.load.v8f64 (double* <ptr>, <8 x i32> <index>, i32 <alignment>)
+
+Overview:
+"""""""""
+
+Reads vector elements from multiple memory addresses. The address of each element is computed from the base address and the corresponding index element.
+
+
+Arguments:
+""""""""""
+
+The first operand is the base pointer for the load. It must be a pointer to the element type of the loaded vector.
+The second operand is the index vector, whose element type is always 'i32'. It must have the same number of elements as the loaded vector.
+The third operand is the alignment of the source locations. It is always of 'i32' type.
+
+The index must be a constant vector, and the alignment must be a constant.
+
+Semantics:
+""""""""""
+
+The '``llvm.indexed.load``' intrinsic is designed for reading vector elements from multiple addresses in a single IR operation.
+It can be used to express interleaved loads and strided loads.
+
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+       %res = call <8 x float> @llvm.indexed.load.v8f32 (float* %ptr, <8 x i32> <i32 idx0, i32 idx1, i32 idx2, i32 idx3, i32 idx4, i32 idx5, i32 idx6, i32 idx7>, i32 4)
+
+       ;; Apart from potential memory access exceptions, the result is identical to the
+       ;; one produced by the following instruction sequence:
+       %ptr0 = getelementptr float, float* %ptr, i32 idx0            ; Address for lane 0
+       %ptr1 = getelementptr float, float* %ptr, i32 idx1            ; Address for lane 1
+       ...
+       %ptr7 = getelementptr float, float* %ptr, i32 idx7            ; Address for lane 7
+       %lane0 = load float, float* %ptr0, align 4                    ; Load for lane 0
+       %lane1 = load float, float* %ptr1, align 4                    ; Load for lane 1
+       ...
+       %lane7 = load float, float* %ptr7, align 4                    ; Load for lane 7
+       %res0 = insertelement <8 x float> undef, float %lane0, i32 0  ; Insert lane 0
+       %res1 = insertelement <8 x float> %res0, float %lane1, i32 1  ; Insert lane 1
+       ...
+       %res = insertelement <8 x float> %res6, float %lane7, i32 7   ; Insert lane 7
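+
+The index vector directly encodes the access pattern. As a sketch (the pointers
+``%ptr`` and ``%base`` and the concrete index values below are illustrative only,
+following the interleaving pattern 0, N, 2N, ..., 1, N+1, ... targeted by this patch),
+a stride-2 load and a 2-way de-interleaving load can be written as:
+
+.. code-block:: llvm
+
+       ;; Strided load: gather 4 floats from %ptr with a stride of 2 elements.
+       %strided = call <4 x float> @llvm.indexed.load.v4f32 (float* %ptr, <4 x i32> <i32 0, i32 2, i32 4, i32 6>, i32 4)
+
+       ;; 2-way de-interleaving load: even-indexed elements end up in the low half of
+       ;; the result and odd-indexed elements in the high half
+       ;; (index pattern 0, 2, 4, 6, 1, 3, 5, 7).
+       %deinterleaved = call <8 x i32> @llvm.indexed.load.v8i32 (i32* %base, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>, i32 4)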
+
+.. _int_istore:
+
+'``llvm.indexed.store.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic. The data stored in memory is a vector of any integer or floating point data type.
+
+::
+
+       declare void @llvm.indexed.store.v16i32 (<16 x i32> <value>, i32* <ptr>, <16 x i32> <index>, i32 <alignment>)
+       declare void @llvm.indexed.store.v8f64 (<8 x double> <value>, double* <ptr>, <8 x i32> <index>, i32 <alignment>)
+
+Overview:
+"""""""""
+
+Writes vector elements to multiple memory addresses. The address of each element is computed from the base address and the corresponding index element.
+
+Arguments:
+""""""""""
+
+The first operand is the vector value to be written to memory.
+The second operand is the base pointer for the store. It must be a pointer to the element type of the stored vector.
+The third operand is the index vector, whose element type is always 'i32'. It must have the same number of elements as the stored vector.
+The fourth operand is the alignment of the destination locations. It is always of 'i32' type.
+
+The index must be a constant vector, and the alignment must be a constant.
+
+Semantics:
+""""""""""
+
+The '``llvm.indexed.store``' intrinsic is designed for writing vector elements to multiple addresses in a single IR operation.
+It can be used to express interleaved stores and strided stores.
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+       call void @llvm.indexed.store.v8f32(<8 x float> %value, float* %ptr, <8 x i32> <i32 idx0, i32 idx1, i32 idx2, i32 idx3, i32 idx4, i32 idx5, i32 idx6, i32 idx7>, i32 4)
+
+       ;; Apart from potential memory access exceptions, the effect is identical to the
+       ;; one produced by the following instruction sequence:
+       %ptr0 = getelementptr float, float* %ptr, i32 idx0            ; Address for lane 0
+       %ptr1 = getelementptr float, float* %ptr, i32 idx1            ; Address for lane 1
+       ...
+       %ptr7 = getelementptr float, float* %ptr, i32 idx7            ; Address for lane 7
+       %lane0 = extractelement <8 x float> %value, i32 0             ; Extract lane 0
+       %lane1 = extractelement <8 x float> %value, i32 1             ; Extract lane 1
+       ...
+       %lane7 = extractelement <8 x float> %value, i32 7             ; Extract lane 7
+       store float %lane0, float* %ptr0, align 4                     ; Store lane 0
+       store float %lane1, float* %ptr1, align 4                     ; Store lane 1
+       ...
+       store float %lane7, float* %ptr7, align 4                     ; Store lane 7
 
 Memory Use Markers
 ------------------
 
Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -312,6 +312,11 @@
   bool isLegalMaskedStore(Type *DataType, int Consecutive) const;
   bool isLegalMaskedLoad(Type *DataType, int Consecutive) const;
+  /// \brief Return true if the target supports an indexed store/load of the
+  /// given data type with the given constant indices.
+  bool supportIndexedStore(Type *DataType, ArrayRef<unsigned> Indices) const;
+  bool supportIndexedLoad(Type *DataType, ArrayRef<unsigned> Indices) const;
+
   /// \brief Return the cost of the scaling factor used in the addressing
   /// mode represented by AM for this target, for a load/store
   /// of the specified type.
@@ -542,6 +547,10 @@ int64_t Scale) = 0; virtual bool isLegalMaskedStore(Type *DataType, int Consecutive) = 0; virtual bool isLegalMaskedLoad(Type *DataType, int Consecutive) = 0; + virtual bool supportIndexedStore(Type *DataType, + ArrayRef Indices) = 0; + virtual bool supportIndexedLoad(Type *DataType, + ArrayRef Indices) = 0; virtual int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale) = 0; @@ -658,6 +667,13 @@ bool isLegalMaskedLoad(Type *DataType, int Consecutive) override { return Impl.isLegalMaskedLoad(DataType, Consecutive); } + bool supportIndexedStore(Type *DataType, + ArrayRef Indices) override { + return Impl.supportIndexedStore(DataType, Indices); + } + bool supportIndexedLoad(Type *DataType, ArrayRef Indices) override { + return Impl.supportIndexedLoad(DataType, Indices); + } int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale) override { return Impl.getScalingFactorCost(Ty, BaseGV, BaseOffset, HasBaseReg, Scale); Index: include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- include/llvm/Analysis/TargetTransformInfoImpl.h +++ include/llvm/Analysis/TargetTransformInfoImpl.h @@ -217,6 +217,14 @@ bool isLegalMaskedLoad(Type *DataType, int Consecutive) { return false; } + bool supportIndexedStore(Type *DataType, ArrayRef Indices) { + return false; + } + + bool supportIndexedLoad(Type *DataType, ArrayRef Indices) { + return false; + } + int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale) { // Guess that all legal addressing mode are free. Index: include/llvm/CodeGen/ISDOpcodes.h =================================================================== --- include/llvm/CodeGen/ISDOpcodes.h +++ include/llvm/CodeGen/ISDOpcodes.h @@ -690,6 +690,9 @@ // Masked load and store MLOAD, MSTORE, + // Indexed load and store + ILOAD, ISTORE, + /// This corresponds to the llvm.lifetime.* intrinsics. The first operand /// is the chain and the second operand is the alloca pointer. LIFETIME_START, LIFETIME_END, Index: include/llvm/CodeGen/SelectionDAG.h =================================================================== --- include/llvm/CodeGen/SelectionDAG.h +++ include/llvm/CodeGen/SelectionDAG.h @@ -856,6 +856,15 @@ SDValue getMaskedStore(SDValue Chain, SDLoc dl, SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT, MachineMemOperand *MMO, bool IsTrunc); + + // Construct a ILOAD node + SDValue getIndexedLoad(EVT VT, SDValue Chain, SDValue Ptr, SDValue Index, + EVT MemVT, MachineMemOperand *MMO, SDLoc dl); + // Construct a ISTORE node + SDValue getIndexedStore(SDValue Chain, SDValue Val, SDValue Ptr, + SDValue Index, EVT MemVT, MachineMemOperand *MMO, + SDLoc dl); + /// Construct a node to track a Value* through the backend. 
SDValue getSrcValue(const Value *v); Index: include/llvm/CodeGen/SelectionDAGNodes.h =================================================================== --- include/llvm/CodeGen/SelectionDAGNodes.h +++ include/llvm/CodeGen/SelectionDAGNodes.h @@ -1151,6 +1151,8 @@ N->getOpcode() == ISD::ATOMIC_STORE || N->getOpcode() == ISD::MLOAD || N->getOpcode() == ISD::MSTORE || + N->getOpcode() == ISD::ILOAD || + N->getOpcode() == ISD::ISTORE || N->isMemIntrinsic() || N->isTargetMemoryOpcode(); } @@ -1987,6 +1989,59 @@ } }; +/// This base class is used to represent ILOAD and ISTORE nodes +class IndexedLoadStoreSDNode : public MemSDNode { + // Operands + SDUse Ops[4]; + +public: + friend class SelectionDAG; + IndexedLoadStoreSDNode(ISD::NodeType NodeTy, unsigned Order, DebugLoc dl, + SDValue *Operands, unsigned numOperands, SDVTList VTs, + EVT MemVT, MachineMemOperand *MMO) + : MemSDNode(NodeTy, Order, dl, VTs, MemVT, MMO) { + InitOperands(Ops, Operands, numOperands); + } + + // IndexedLoadSDNode (Chain, Ptr, Index) + // IndexedStoreSDNode (Chain, Ptr, Index, Src) + // In the both nodes address is Op1, Index is Op2. + const SDValue &getBasePtr() const { return getOperand(1); } + const SDValue &getIndex() const { return getOperand(2); } + + static bool classof(const SDNode *N) { + return N->getOpcode() == ISD::ILOAD || N->getOpcode() == ISD::ISTORE; + } +}; + +/// This class is used to represent an ILOAD node +class IndexedLoadSDNode : public IndexedLoadStoreSDNode { +public: + friend class SelectionDAG; + IndexedLoadSDNode(unsigned Order, DebugLoc dl, SDValue *Operands, + unsigned numOperands, SDVTList VTs, EVT MemVT, + MachineMemOperand *MMO) + : IndexedLoadStoreSDNode(ISD::ILOAD, Order, dl, Operands, numOperands, + VTs, MemVT, MMO) {} + + static bool classof(const SDNode *N) { return N->getOpcode() == ISD::ILOAD; } +}; + +/// This class is used to represent an ISTORE node +class IndexedStoreSDNode : public IndexedLoadStoreSDNode { +public: + friend class SelectionDAG; + IndexedStoreSDNode(unsigned Order, DebugLoc dl, SDValue *Operands, + unsigned numOperands, SDVTList VTs, EVT MemVT, + MachineMemOperand *MMO) + : IndexedLoadStoreSDNode(ISD::ISTORE, Order, dl, Operands, numOperands, + VTs, MemVT, MMO) {} + + const SDValue &getValue() const { return getOperand(3); } + + static bool classof(const SDNode *N) { return N->getOpcode() == ISD::ISTORE; } +}; + /// An SDNode that represents everything that will be needed /// to construct a MachineInstr. These nodes are created during the /// instruction selection proper phase. 
Index: include/llvm/IR/Intrinsics.h =================================================================== --- include/llvm/IR/Intrinsics.h +++ include/llvm/IR/Intrinsics.h @@ -77,7 +77,7 @@ Void, VarArg, MMX, Metadata, Half, Float, Double, Integer, Vector, Pointer, Struct, Argument, ExtendArgument, TruncArgument, HalfVecArgument, - SameVecWidthArgument, PtrToArgument, VecOfPtrsToElt + SameVecWidthArgument, PtrToArgument, VecOfPtrsToElt, PtrToVecElt } Kind; union { @@ -100,14 +100,14 @@ assert(Kind == Argument || Kind == ExtendArgument || Kind == TruncArgument || Kind == HalfVecArgument || Kind == SameVecWidthArgument || Kind == PtrToArgument || - Kind == VecOfPtrsToElt); + Kind == VecOfPtrsToElt || Kind == PtrToVecElt); return Argument_Info >> 3; } ArgKind getArgumentKind() const { assert(Kind == Argument || Kind == ExtendArgument || Kind == TruncArgument || Kind == HalfVecArgument || Kind == SameVecWidthArgument || Kind == PtrToArgument || - Kind == VecOfPtrsToElt); + Kind == VecOfPtrsToElt || Kind == PtrToVecElt); return (ArgKind)(Argument_Info & 7); } Index: include/llvm/IR/Intrinsics.td =================================================================== --- include/llvm/IR/Intrinsics.td +++ include/llvm/IR/Intrinsics.td @@ -118,6 +118,7 @@ } class LLVMPointerTo : LLVMMatchType; class LLVMVectorOfPointersToElt : LLVMMatchType; +class LLVMPointerToVectorElt : LLVMMatchType; // Match the type of another intrinsic parameter that is expected to be a // vector type, but change the element count to be half as many @@ -608,6 +609,20 @@ LLVMVectorSameWidth<0, llvm_i1_ty>], [IntrReadWriteArgMem]>; +//===--------------------- Indexed load/store Intrinsics ------------------===// +// +def int_indexed_load : Intrinsic<[llvm_anyvector_ty], + [LLVMPointerToVectorElt<0>, + LLVMVectorSameWidth<0, llvm_i32_ty>, + llvm_i32_ty], + [IntrReadArgMem]>; + +def int_indexed_store : Intrinsic<[], + [llvm_anyvector_ty, LLVMPointerToVectorElt<0>, + LLVMVectorSameWidth<0, llvm_i32_ty>, + llvm_i32_ty], + [IntrReadWriteArgMem]>; + // Intrinsics to support bit sets. 
def int_bitset_test : Intrinsic<[llvm_i1_ty], [llvm_ptr_ty, llvm_metadata_ty], [IntrNoMem]>; Index: lib/Analysis/TargetTransformInfo.cpp =================================================================== --- lib/Analysis/TargetTransformInfo.cpp +++ lib/Analysis/TargetTransformInfo.cpp @@ -115,6 +115,16 @@ return TTIImpl->isLegalMaskedLoad(DataType, Consecutive); } +bool TargetTransformInfo::supportIndexedStore( + Type *DataType, ArrayRef Indices) const { + return TTIImpl->supportIndexedStore(DataType, Indices); +} + +bool TargetTransformInfo::supportIndexedLoad(Type *DataType, + ArrayRef Indices) const { + return TTIImpl->supportIndexedLoad(DataType, Indices); +} + int TargetTransformInfo::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, Index: lib/CodeGen/CodeGenPrepare.cpp =================================================================== --- lib/CodeGen/CodeGenPrepare.cpp +++ lib/CodeGen/CodeGenPrepare.cpp @@ -1251,6 +1251,91 @@ CI->eraseFromParent(); } +// Translate index load intrinsic, like +// <4 x i32> @llvm.indexed.load( %, +// i32 align) +// to scalar loads and insertelements: +// %ptr0 = getelementptr i32, i32 *%ptr, i32 %idx0 +// %lane0 = load i32, i32 *%ptr0 ; Load lane 0 +// %res0 = insertelement <4 x i32> undef, i32 %lane0, i32 0 ; Insert lane 0 +// %ptr1 = getelementptr i32, i32 *%ptr, i32 %idx1 +// %lane1 = load i32, i32 *%ptr1 ; Load lane 1 +// %res1 = insertelement <4 x i32> %res0, i32 %lane1, i32 1 ; Insert lane 1 +// %ptr2 = getelementptr i32, i32 *%ptr, i32 %idx2 +// %lane2 = load i32, i32 *%ptr2 ; Load lane 2 +// %res2 = insertelement <4 x i32> %res0, i32 %lane2, i32 1 ; Insert lane 2 +// %ptr3 = getelementptr i32, i32 *%ptr, i32 %idx3 +// %lane3 = load i32, i32 *%ptr3 ; Load lane 3 +// %res = insertelement <4 x i32> %res2, i32 %lane3, i32 3 ; Insert lane 3 +static void ScalarizeIndexedLoad(CallInst *CI) { + const Constant *CIdx = dyn_cast(CI->getArgOperand(1)); + assert(CIdx && (isa(CIdx) || isa(CIdx)) && + "Expect a constant index vector"); + Value *Ptr = CI->getArgOperand(0); + ConstantInt *Alignment = dyn_cast(CI->getArgOperand(2)); + assert(Alignment && "The alignment must be a constant"); + unsigned Align = Alignment->getZExtValue(); + + VectorType *VecTy = dyn_cast(CI->getType()); + Type *EltTy = VecTy->getElementType(); + Type *PtrTy = EltTy->getPointerTo(Ptr->getType()->getPointerAddressSpace()); + IRBuilder<> Builder(CI); + Ptr = Builder.CreateBitCast(Ptr, PtrTy); + Value *Result = UndefValue::get(VecTy); + for (unsigned i = 0; i < VecTy->getNumElements(); i++) { + ConstantInt *IdxElt = dyn_cast(CIdx->getAggregateElement(i)); + assert(IdxElt && "Expect a constant index element"); + Value *EltPtr = Builder.CreateGEP(Ptr, IdxElt); + Value *Elt = Builder.CreateAlignedLoad(EltPtr, Align); + Result = Builder.CreateInsertElement(Result, Elt, Builder.getInt32(i)); + } + + CI->replaceAllUsesWith(Result); + CI->eraseFromParent(); +} + +// Translate index load intrinsic, like +// void @llvm.indexed.store(i32* %ptr, <4 x i32> %vec, +// <4 x i32> %, i32 align) +// to extractelements and scalar stores: +// %ptr0 = getelementptr i32, i32* %ptr, i32 %idx0 +// %lane0 = extractelement <4 x i32> %vec, i32 0 ; Extract lane 0 +// store i32 %lane0, i32* %ptr0 ; Store lane 0 +// %ptr1 = getelementptr i32, i32* %ptr, i32 %idx1 +// %lane1 = extractelement <4 x i32> %vec, i32 1 ; Extract lane 1 +// store i32 %lane1, i32* %ptr1 ; Store lane 1 +// %ptr2 = getelementptr i32, i32* %ptr, i32 %idx2 +// %lane2 = extractelement <4 x i32> %vec, i32 
1 ; Extract lane 2 +// store i32 %lane2, i32* %ptr2 ; Store lane 2 +// %ptr3 = getelementptr i32, i32* %ptr, i32 %idx3 +// %lane3 = extractelement <4 x i32> %vec, i32 3 ; Extract lane 3 +// store i32 %lane3, i32* %ptr3 ; Store lane 3 +static void ScalarizeIndexedStore(CallInst *CI) { + const Constant *CIdx = dyn_cast(CI->getArgOperand(2)); + assert(CIdx && (isa(CIdx) || isa(CIdx)) && + "Expect a constant index vector"); + Value *VecVal = CI->getArgOperand(0); + Value *Ptr = CI->getArgOperand(1); + ConstantInt *Alignment = dyn_cast(CI->getArgOperand(3)); + assert(Alignment && "The alignment must be a constant"); + unsigned Align = Alignment->getZExtValue(); + + VectorType *VecTy = dyn_cast(VecVal->getType()); + Type *EltTy = VecTy->getElementType(); + Type *PtrTy = EltTy->getPointerTo(Ptr->getType()->getPointerAddressSpace()); + IRBuilder<> Builder(CI); + Ptr = Builder.CreateBitCast(Ptr, PtrTy); + for (unsigned i = 0; i < VecTy->getNumElements(); i++) { + ConstantInt *IdxElt = dyn_cast(CIdx->getAggregateElement(i)); + assert(IdxElt && "Expect a constant index element"); + Value *Elt = Builder.CreateExtractElement(VecVal, Builder.getInt32(i)); + Value *EllPtr = Builder.CreateGEP(Ptr, IdxElt); + Builder.CreateAlignedStore(Elt, EllPtr, Align); + } + + CI->eraseFromParent(); +} + bool CodeGenPrepare::OptimizeCallInst(CallInst *CI, bool& ModifiedDT) { BasicBlock *BB = CI->getParent(); @@ -1362,6 +1447,52 @@ } return false; } + case Intrinsic::indexed_load: { + SmallVector Indices; + Constant *CIdx = dyn_cast(CI->getArgOperand(1)); + assert(CIdx && + (isa(CIdx) || isa(CIdx)) && + "Expect a constant index vector"); + unsigned NumElts = CIdx->getType()->getVectorNumElements(); + for (unsigned i = 0; i < NumElts; i++) { + ConstantInt *IdxElt = + dyn_cast(CIdx->getAggregateElement(i)); + assert(IdxElt && "Expect a constant index element"); + Indices.push_back(IdxElt->getZExtValue()); + } + + if (!TTI->supportIndexedLoad(CI->getType(), Indices)) { + // TODO: Some llvm.indexed.load can be optimized by vector load and + // shufflevector. + ScalarizeIndexedLoad(CI); + ModifiedDT = true; + return true; + } + return false; + } + case Intrinsic::indexed_store: { + SmallVector Indices; + Constant *CIdx = dyn_cast(CI->getArgOperand(2)); + assert(CIdx && + (isa(CIdx) || isa(CIdx)) && + "Expect a constant index vector"); + unsigned NumElts = CIdx->getType()->getVectorNumElements(); + for (unsigned i = 0; i < NumElts; i++) { + ConstantInt *IdxElt = + dyn_cast(CIdx->getAggregateElement(i)); + assert(IdxElt && "Expect a constant index element"); + Indices.push_back(IdxElt->getZExtValue()); + } + + if (!TTI->supportIndexedStore(CI->getArgOperand(0)->getType(), Indices)) { + // TODO: Some llvm.indexed.store can be optimized by shufflevector and + // vector store. 
+ ScalarizeIndexedStore(CI); + ModifiedDT = true; + return true; + } + return false; + } } if (TLI) { Index: lib/CodeGen/SelectionDAG/SelectionDAG.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -5082,6 +5082,56 @@ return SDValue(N, 0); } +SDValue SelectionDAG::getIndexedLoad(EVT VT, SDValue Chain, SDValue Ptr, + SDValue Index, EVT MemVT, + MachineMemOperand *MMO, SDLoc dl) { + assert(Chain.getValueType() == MVT::Other && "Invalid chain type"); + SDVTList VTs = getVTList(VT, MVT::Other); + SDValue Ops[] = {Chain, Ptr, Index}; + FoldingSetNodeID ID; + AddNodeIDNode(ID, ISD::ILOAD, VTs, Ops); + ID.AddInteger(VT.getRawBits()); + ID.AddInteger(encodeMemSDNodeFlags(ISD::NON_EXTLOAD, ISD::UNINDEXED, + MMO->isVolatile(), MMO->isNonTemporal(), + MMO->isInvariant())); + ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); + void *IP = nullptr; + if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) { + cast(E)->refineAlignment(MMO); + return SDValue(E, 0); + } + SDNode *N = new (NodeAllocator) IndexedLoadSDNode( + dl.getIROrder(), dl.getDebugLoc(), Ops, 3, VTs, MemVT, MMO); + CSEMap.InsertNode(N, IP); + InsertNode(N); + return SDValue(N, 0); +} + +SDValue SelectionDAG::getIndexedStore(SDValue Chain, SDValue Val, SDValue Ptr, + SDValue Index, EVT MemVT, + MachineMemOperand *MMO, SDLoc dl) { + assert(Chain.getValueType() == MVT::Other && "Invalid chain type"); + EVT VT = Val.getValueType(); + SDVTList VTs = getVTList(MVT::Other); + SDValue Ops[] = {Chain, Ptr, Index, Val}; + FoldingSetNodeID ID; + AddNodeIDNode(ID, ISD::ISTORE, VTs, Ops); + ID.AddInteger(VT.getRawBits()); + ID.AddInteger(encodeMemSDNodeFlags(false, ISD::UNINDEXED, MMO->isVolatile(), + MMO->isNonTemporal(), MMO->isInvariant())); + ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); + void *IP = nullptr; + if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) { + cast(E)->refineAlignment(MMO); + return SDValue(E, 0); + } + SDNode *N = new (NodeAllocator) IndexedStoreSDNode( + dl.getIROrder(), dl.getDebugLoc(), Ops, 4, VTs, MemVT, MMO); + CSEMap.InsertNode(N, IP); + InsertNode(N); + return SDValue(N, 0); +} + SDValue SelectionDAG::getVAArg(EVT VT, SDLoc dl, SDValue Chain, SDValue Ptr, SDValue SV, Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -774,6 +774,8 @@ void visitStore(const StoreInst &I); void visitMaskedLoad(const CallInst &I); void visitMaskedStore(const CallInst &I); + void visitIndexedLoad(const CallInst &I); + void visitIndexedStore(const CallInst &I); void visitAtomicCmpXchg(const AtomicCmpXchgInst &I); void visitAtomicRMW(const AtomicRMWInst &I); void visitFence(const FenceInst &I); Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -3705,6 +3705,65 @@ setValue(&I, Load); } +void SelectionDAGBuilder::visitIndexedStore(const CallInst &I) { + SDLoc sdl = getCurSDLoc(); + + // llvm.indexed.store.*(Src, Ptr, Index, alignemt) + Value *PtrOperand = I.getArgOperand(1); + SDValue Ptr = getValue(PtrOperand); + SDValue Src = getValue(I.getArgOperand(0)); + SDValue Index = getValue(I.getArgOperand(2)); + EVT VT = Src.getValueType(); + unsigned Alignment = 
(cast(I.getArgOperand(3)))->getZExtValue(); + if (!Alignment) + Alignment = DAG.getEVTAlignment(VT); + + AAMDNodes AAInfo; + I.getAAMetadata(AAInfo); + + MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( + MachinePointerInfo(PtrOperand), MachineMemOperand::MOStore, + VT.getStoreSize(), Alignment, AAInfo); + SDValue StoreNode = + DAG.getIndexedStore(getRoot(), Src, Ptr, Index, VT, MMO, sdl); + DAG.setRoot(StoreNode); + setValue(&I, StoreNode); +} + +void SelectionDAGBuilder::visitIndexedLoad(const CallInst &I) { + SDLoc sdl = getCurSDLoc(); + // @llvm.indexed.load.*(Ptr, Index, alignment) + Value *PtrOperand = I.getArgOperand(0); + SDValue Ptr = getValue(PtrOperand); + SDValue Index = getValue(I.getArgOperand(1)); + + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + EVT VT = TLI.getValueType(I.getType()); + unsigned Alignment = (cast(I.getArgOperand(2)))->getZExtValue(); + if (!Alignment) + Alignment = DAG.getEVTAlignment(VT); + + AAMDNodes AAInfo; + I.getAAMetadata(AAInfo); + const MDNode *Ranges = I.getMetadata(LLVMContext::MD_range); + + SDValue InChain = DAG.getRoot(); + if (AA->pointsToConstantMemory(AliasAnalysis::Location( + PtrOperand, AA->getTypeStoreSize(I.getType()), AAInfo))) { + // Do not serialize (non-volatile) loads of constant memory with anything. + InChain = DAG.getEntryNode(); + } + + MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( + MachinePointerInfo(PtrOperand), MachineMemOperand::MOLoad, + VT.getStoreSize(), Alignment, AAInfo, Ranges); + + SDValue Load = DAG.getIndexedLoad(VT, InChain, Ptr, Index, VT, MMO, sdl); + SDValue OutChain = Load.getValue(1); + DAG.setRoot(OutChain); + setValue(&I, Load); +} + void SelectionDAGBuilder::visitAtomicCmpXchg(const AtomicCmpXchgInst &I) { SDLoc dl = getCurSDLoc(); AtomicOrdering SuccessOrder = I.getSuccessOrdering(); @@ -4864,6 +4923,12 @@ case Intrinsic::masked_store: visitMaskedStore(I); return nullptr; + case Intrinsic::indexed_load: + visitIndexedLoad(I); + return nullptr; + case Intrinsic::indexed_store: + visitIndexedStore(I); + return nullptr; case Intrinsic::x86_mmx_pslli_w: case Intrinsic::x86_mmx_pslli_d: case Intrinsic::x86_mmx_pslli_q: Index: lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -273,6 +273,8 @@ case ISD::STORE: return "store"; case ISD::MLOAD: return "masked_load"; case ISD::MSTORE: return "masked_store"; + case ISD::ILOAD: return "indexed_load"; + case ISD::ISTORE: return "indexed_store"; case ISD::VAARG: return "vaarg"; case ISD::VACOPY: return "vacopy"; case ISD::VAEND: return "vaend"; Index: lib/IR/Function.cpp =================================================================== --- lib/IR/Function.cpp +++ lib/IR/Function.cpp @@ -548,10 +548,10 @@ IIT_HALF_VEC_ARG = 29, IIT_SAME_VEC_WIDTH_ARG = 30, IIT_PTR_TO_ARG = 31, - IIT_VEC_OF_PTRS_TO_ELT = 32 + IIT_VEC_OF_PTRS_TO_ELT = 32, + IIT_PTR_TO_VEC_ELT = 33 }; - static void DecodeIITType(unsigned &NextElt, ArrayRef Infos, SmallVectorImpl &OutputTable) { IIT_Info Info = IIT_Info(Infos[NextElt++]); @@ -674,6 +674,12 @@ ArgInfo)); return; } + case IIT_PTR_TO_VEC_ELT: { + unsigned ArgInfo = (NextElt == Infos.size() ? 
0 : Infos[NextElt++]); + OutputTable.push_back( + IITDescriptor::get(IITDescriptor::PtrToVecElt, ArgInfo)); + return; + } case IIT_EMPTYSTRUCT: OutputTable.push_back(IITDescriptor::get(IITDescriptor::Struct, 0)); return; @@ -802,6 +808,14 @@ return VectorType::get(PointerType::getUnqual(EltTy), VTy->getNumElements()); } + case IITDescriptor::PtrToVecElt: { + Type *Ty = Tys[D.getArgumentNumber()]; + VectorType *VTy = dyn_cast(Ty); + if (!VTy) + llvm_unreachable("Expected an argument of Vector Type"); + Type *EltTy = VTy->getVectorElementType(); + return PointerType::getUnqual(EltTy); + } } llvm_unreachable("unhandled"); } Index: lib/IR/Verifier.cpp =================================================================== --- lib/IR/Verifier.cpp +++ lib/IR/Verifier.cpp @@ -3042,6 +3042,18 @@ return (!(ThisArgEltTy->getElementType() == ReferenceType->getVectorElementType())); } + case IITDescriptor::PtrToVecElt: { + if (D.getArgumentNumber() >= ArgTys.size()) + return true; + VectorType *ReferenceType = + dyn_cast(ArgTys[D.getArgumentNumber()]); + if (!ReferenceType) + return true; + PointerType *ThisArgType = dyn_cast(Ty); + return (!ThisArgType || + ThisArgType->getPointerElementType() != + ReferenceType->getElementType()); + } } llvm_unreachable("unhandled"); } Index: lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- lib/Target/AArch64/AArch64ISelLowering.cpp +++ lib/Target/AArch64/AArch64ISelLowering.cpp @@ -495,6 +495,8 @@ setTargetDAGCombine(ISD::INTRINSIC_VOID); setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN); setTargetDAGCombine(ISD::INSERT_VECTOR_ELT); + setTargetDAGCombine(ISD::ILOAD); + setTargetDAGCombine(ISD::ISTORE); MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8; MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4; @@ -8145,6 +8147,113 @@ return SDValue(); } +static unsigned getLdNStNIntrinsicID(unsigned NumVec, bool IsLoad) { + static unsigned LoadInt[3] = {Intrinsic::aarch64_neon_ld2, + Intrinsic::aarch64_neon_ld3, + Intrinsic::aarch64_neon_ld4}; + static unsigned StoreInt[3] = {Intrinsic::aarch64_neon_st2, + Intrinsic::aarch64_neon_st3, + Intrinsic::aarch64_neon_st4}; + + return IsLoad ? LoadInt[NumVec - 2] : StoreInt[NumVec - 2]; +} + +// Check if the given indices are interleaved by N (N = 2,3,4). +bool static isInterleavedIndices(ArrayRef Indices, unsigned &NumVec, + unsigned &NumElts) { + if (Indices.size() <= 2) + return false; + if (Indices[0] != 0) + return false; + NumVec = Indices[1]; + if (NumVec < 2 || NumVec > 4) + return false; + + NumElts = Indices.size() / NumVec; + // The index should match: 0, NumVec, 2*NumVec, ..., 1, NumVec + 1, ... 
+ for (unsigned i = 0; i < NumVec; i++) + for (unsigned j = 0; j < NumElts; j++) + if (Indices[j + i * NumElts] != j * NumVec + i) + return false; + + return true; +} + +static SDValue +performIndexedLoadStoreCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + bool IsLoad = N->getOpcode() == ISD::ILOAD; + // VecVal = ILOAD (Chain, Pointer, index) + // ISTORE (Chain, Pointer, Index, VecVal) + SDNode *IdxNode = N->getOperand(2).getNode(); // Indexed Node + SmallVector Indices; + for (unsigned i = 0; i < IdxNode->getNumOperands(); i++) { + ConstantSDNode *IdxElt = + dyn_cast(IdxNode->getOperand(i).getNode()); + assert(IdxElt && "Expect a constant index element"); + Indices.push_back(IdxElt->getZExtValue()); + } + + unsigned NumVec, NumElts; + if (!isInterleavedIndices(Indices, NumVec, NumElts)) + return SDValue(); + + // For store, get the stored vector type. For load, get the result type. + EVT VT = + IsLoad ? N->getValueType(0) : N->getOperand(3).getNode()->getValueType(0); + EVT ValVT = + EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumElts); + if (!DAG.getTargetLoweringInfo().isTypeLegal(ValVT)) + return SDValue(); + + // Build the operand list. + SmallVector Ops; + Ops.push_back(N->getOperand(0)); // The Chain + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDLoc DL(N); + // Push the intrinsic ID for ldN stN. + Ops.push_back(DAG.getTargetConstant(getLdNStNIntrinsicID(NumVec, IsLoad), + TLI.getPointerTy())); + if (!IsLoad) { + SDValue StoreVec = N->getOperand(3); + for (unsigned i = 0; i < NumVec; i++) { + SDValue ValVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ValVT, StoreVec, + DAG.getConstant(0 + i * NumElts, MVT::i64)); + Ops.push_back(ValVec); // The stored vectors + } + } + Ops.push_back(N->getOperand(1)); // The pointer + + EVT Tys[4]; + unsigned n; + unsigned NumRetVecs = IsLoad ? NumVec : 0; + for (n = 0; n < NumRetVecs; ++n) + Tys[n] = ValVT; + Tys[n] = MVT::Other; // Type of the chain + SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumRetVecs + 1)); + + IndexedLoadStoreSDNode *MemNode = cast(N); + unsigned NewOp = IsLoad ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID; + SDValue NewNode = DAG.getMemIntrinsicNode( + NewOp, DL, SDTys, Ops, MemNode->getMemoryVT(), MemNode->getMemOperand()); + + if (!IsLoad) + return NewNode; + + SDValue ResVec; + SDValue Res[4]; + for (unsigned i = 0; i < NumVec; i++) + Res[i] = SDValue(NewNode.getNode(), i); + ResVec = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, makeArrayRef(Res, NumVec)); + // Replace the result + DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), ResVec); + // Replace the Chain + DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), + SDValue(NewNode.getNode(), NumVec)); + + return SDValue(); +} + /// Target-specific DAG combine function for NEON load/store intrinsics /// to merge base address updates. 
static SDValue performNEONPostLDSTCombine(SDNode *N, @@ -8703,6 +8812,9 @@ return performVSelectCombine(N, DCI.DAG); case ISD::STORE: return performSTORECombine(N, DCI, DAG, Subtarget); + case ISD::ILOAD: + case ISD::ISTORE: + return performIndexedLoadStoreCombine(N, DCI, DAG); case AArch64ISD::BRCOND: return performBRCONDCombine(N, DCI, DAG); case AArch64ISD::CSEL: @@ -8739,6 +8851,7 @@ default: break; } + break; } return SDValue(); } Index: lib/Target/AArch64/AArch64TargetTransformInfo.h =================================================================== --- lib/Target/AArch64/AArch64TargetTransformInfo.h +++ lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -139,6 +139,8 @@ bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info); + bool supportIndexedStore(Type *DataType, ArrayRef Indices); + bool supportIndexedLoad(Type *DataType, ArrayRef Indices); /// @} }; Index: lib/Target/AArch64/AArch64TargetTransformInfo.cpp =================================================================== --- lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -407,6 +407,55 @@ return LT.first; } +// Check if the given indices are interleaved by N (N = 2,3,4). +bool static isInterleavedIndices(ArrayRef Indices, unsigned &NumVec, + unsigned &NumElts) { + if (Indices.size() <= 2) + return false; + if (Indices[0] != 0) + return false; + NumVec = Indices[1]; + if (NumVec < 2 || NumVec > 4) + return false; + + NumElts = Indices.size() / NumVec; + // The index should match: 0, NumVec, 2*NumVec, ..., 1, NumVec + 1, ... + for (unsigned i = 0; i < NumVec; i++) + for (unsigned j = 0; j < NumElts; j++) + if (Indices[j + i * NumElts] != j * NumVec + i) + return false; + + return true; +} + +bool AArch64TTIImpl::supportIndexedStore(Type *DataType, + ArrayRef Indices) { + unsigned NumVec, NumElts; + if (!isInterleavedIndices(Indices, NumVec, NumElts)) + return false; + + VectorType *VecType = dyn_cast(DataType); + assert(VecType && VecType->getNumElements() == NumVec * NumElts && + "Expected a vector type"); + + VectorType *ValVec = VectorType::get(VecType->getElementType(), NumElts); + return isTypeLegal(ValVec); +} + +bool AArch64TTIImpl::supportIndexedLoad(Type *DataType, + ArrayRef Indices) { + unsigned NumVec, NumElts; + if (!isInterleavedIndices(Indices, NumVec, NumElts)) + return false; + + VectorType *VecType = dyn_cast(DataType); + assert(VecType && VecType->getNumElements() == NumVec * NumElts && + "Expected a vector type"); + + VectorType *ValVec = VectorType::get(VecType->getElementType(), NumElts); + return isTypeLegal(ValVec); +} + unsigned AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef Tys) { unsigned Cost = 0; for (auto *I : Tys) { Index: test/CodeGen/AArch64/indexed-load-store-noninterleaved.ll =================================================================== --- /dev/null +++ test/CodeGen/AArch64/indexed-load-store-noninterleaved.ll @@ -0,0 +1,76 @@ +; RUN: llc -print-after codegenprepare < %s 2>&1 | FileCheck %s +; REQUIRES: asserts + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-linux-gnueabi" + +; CHECK-LABEL: @test_v4i32(i32* %ptr) { +; CHECK: load i32 +; CHECK: insertelement <4 x i32> {{.*}}, i32 0 +; CHECK: getelementptr {{.*}}, i32 1 +; CHECK: load i32 +; CHECK: insertelement <4 x i32> {{.*}}, i32 1 +; CHECK: getelementptr {{.*}}, i32 2 +; CHECK: load i32 +; CHECK: insertelement <4 x i32> {{.*}}, i32 2 +; CHECK: getelementptr {{.*}}, i32 3 +; CHECK: load i32 +; CHECK: insertelement <4 
x i32> {{.*}}, i32 3 + +; CHECK: extractelement <4 x i32> {{.*}}, i32 0 +; CHECK: getelementptr {{.*}}, i32 3 +; CHECK: store i32 +; CHECK: extractelement <4 x i32> {{.*}}, i32 1 +; CHECK: getelementptr {{.*}}, i32 2 +; CHECK: store i32 +; CHECK: extractelement <4 x i32> {{.*}}, i32 2 +; CHECK: getelementptr {{.*}}, i32 1 +; CHECK: store i32 +; CHECK: extractelement <4 x i32> {{.*}}, i32 3 +; CHECK: store i32 + +define void @test_v4i32(i32* %ptr) { +entry: + %indexed.load = call <4 x i32> @llvm.indexed.load.v4i32(i32* %ptr, <4 x i32> , i32 4) + %0 = add nsw <4 x i32> %indexed.load, + call void @llvm.indexed.store.v4i32(<4 x i32> %0, i32* %ptr, <4 x i32> , i32 4) + ret void +} + +; CHECK-LABEL: @test_v4f32(float* %ptr) { +; CHECK: load float +; CHECK: insertelement <4 x float> {{.*}}, i32 0 +; CHECK: getelementptr {{.*}}, i32 2 +; CHECK: load float +; CHECK: insertelement <4 x float> {{.*}}, i32 1 +; CHECK: getelementptr {{.*}}, i32 4 +; CHECK: load float +; CHECK: insertelement <4 x float> {{.*}}, i32 2 +; CHECK: getelementptr {{.*}}, i32 6 +; CHECK: load float +; CHECK: insertelement <4 x float> {{.*}}, i32 3 + +; CHECK: extractelement <4 x float> {{.*}}, i32 0 +; CHECK: store float +; CHECK: extractelement <4 x float> {{.*}}, i32 1 +; CHECK: getelementptr {{.*}}, i32 2 +; CHECK: store float +; CHECK: extractelement <4 x float> {{.*}}, i32 2 +; CHECK: getelementptr {{.*}}, i32 4 +; CHECK: store float +; CHECK: extractelement <4 x float> {{.*}}, i32 3 +; CHECK: getelementptr {{.*}}, i32 6 +; CHECK: store float + +define void @test_v4f32(float* %ptr) { +entry: + %indexed.load = call <4 x float> @llvm.indexed.load.v4f32(float* %ptr, <4 x i32> , i32 4) + %0 = fadd <4 x float> %indexed.load, + call void @llvm.indexed.store.v4f32(<4 x float> %0, float* %ptr, <4 x i32> , i32 4) + ret void +} + +declare <4 x i32> @llvm.indexed.load.v4i32(i32*, <4 x i32>, i32) +declare void @llvm.indexed.store.v4i32(<4 x i32>, i32*, <4 x i32>, i32) +declare <4 x float> @llvm.indexed.load.v4f32(float*, <4 x i32>, i32) +declare void @llvm.indexed.store.v4f32(<4 x float>, float*, <4 x i32>, i32) Index: test/CodeGen/AArch64/interleaved-load-store.ll =================================================================== --- /dev/null +++ test/CodeGen/AArch64/interleaved-load-store.ll @@ -0,0 +1,73 @@ +; RUN: llc < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-linux-gnueabi" + +; Make sure the intrinsic about 2 interleaved vectors can be matched +; CHECK-LABEL: test_ld2_st2: +; CHECK: ld2 +; CHECK: st2 + +define void @test_ld2_st2(i32* %ptr) { +entry: + %interleave.load = call <8 x i32> @llvm.indexed.load.v8i32(i32* %ptr, <8 x i32> , i32 4) + %0 = shufflevector <8 x i32> %interleave.load, <8 x i32> undef, <4 x i32> + %1 = shufflevector <8 x i32> %interleave.load, <8 x i32> undef, <4 x i32> + %2 = add nsw <4 x i32> %0, + %3 = add nsw <4 x i32> %1, + %4 = shufflevector <4 x i32> %2, <4 x i32> %3, <8 x i32> + call void @llvm.indexed.store.v8i32(<8 x i32> %4, i32* %ptr, <8 x i32> , i32 4) + ret void +} + +; Make sure the intrinsic about 3 interleaved vectors can be matched +; CHECK-LABEL: test_ld3_st3: +; CHECK: ld3 +; CHECK: st3 + +define void @test_ld3_st3(float* %ptr) { +entry: + %interleave.load = call <12 x float> @llvm.indexed.load.v12f32(float* %ptr, <12 x i32> , i32 4) + %0 = shufflevector <12 x float> %interleave.load, <12 x float> undef, <4 x i32> + %1 = shufflevector <12 x float> %interleave.load, <12 x float> undef, <4 x i32> + %2 = shufflevector <12 x float> 
%interleave.load, <12 x float> undef, <4 x i32> + %3 = fadd <4 x float> %0, + %4 = fadd <4 x float> %1, + %5 = fadd <4 x float> %2, + %6 = shufflevector <4 x float> %3, <4 x float> %4, <8 x i32> + %7 = shufflevector <4 x float> %5, <4 x float> undef, <8 x i32> + %8 = shufflevector <8 x float> %6, <8 x float> %7, <12 x i32> + call void @llvm.indexed.store.v12f32(<12 x float> %8, float* %ptr, <12 x i32> , i32 4) + ret void +} + +; Make sure the intrinsic about 3 interleaved vectors can be matched +; CHECK-LABEL: test_ld4_st4: +; CHECK: ld4 +; CHECK: st4 + +define void @test_ld4_st4(i64* %ptr) { +entry: + %interleave.load = call <8 x i64> @llvm.indexed.load.v8i64(i64* %ptr, <8 x i32> , i32 4) + %0 = shufflevector <8 x i64> %interleave.load, <8 x i64> undef, <2 x i32> + %1 = shufflevector <8 x i64> %interleave.load, <8 x i64> undef, <2 x i32> + %2 = shufflevector <8 x i64> %interleave.load, <8 x i64> undef, <2 x i32> + %3 = shufflevector <8 x i64> %interleave.load, <8 x i64> undef, <2 x i32> + %4 = add nsw <2 x i64> %0, + %5 = add nsw <2 x i64> %1, + %6 = add nsw <2 x i64> %2, + %7 = add nsw <2 x i64> %3, + %8 = shufflevector <2 x i64> %4, <2 x i64> %5, <4 x i32> + %9 = shufflevector <2 x i64> %6, <2 x i64> %7, <4 x i32> + %10 = shufflevector <4 x i64> %8, <4 x i64> %9, <8 x i32> + call void @llvm.indexed.store.v8i64(<8 x i64> %10, i64* %ptr, <8 x i32> , i32 4) + ret void +} + + +declare <8 x i32> @llvm.indexed.load.v8i32(i32*, <8 x i32>, i32) +declare void @llvm.indexed.store.v8i32(<8 x i32>, i32*, <8 x i32>, i32) +declare <12 x float> @llvm.indexed.load.v12f32(float*, <12 x i32>, i32) +declare void @llvm.indexed.store.v12f32(<12 x float>, float*, <12 x i32>, i32) +declare <8 x i64> @llvm.indexed.load.v8i64(i64*, <8 x i32>, i32) +declare void @llvm.indexed.store.v8i64(<8 x i64>, i64*, <8 x i32>, i32) Index: utils/TableGen/IntrinsicEmitter.cpp =================================================================== --- utils/TableGen/IntrinsicEmitter.cpp +++ utils/TableGen/IntrinsicEmitter.cpp @@ -260,10 +260,10 @@ IIT_HALF_VEC_ARG = 29, IIT_SAME_VEC_WIDTH_ARG = 30, IIT_PTR_TO_ARG = 31, - IIT_VEC_OF_PTRS_TO_ELT = 32 + IIT_VEC_OF_PTRS_TO_ELT = 32, + IIT_PTR_TO_VEC_ELT = 33 }; - static void EncodeFixedValueType(MVT::SimpleValueType VT, std::vector &Sig) { if (MVT(VT).isInteger()) { @@ -319,6 +319,8 @@ Sig.push_back(IIT_PTR_TO_ARG); else if (R->isSubClassOf("LLVMVectorOfPointersToElt")) Sig.push_back(IIT_VEC_OF_PTRS_TO_ELT); + else if (R->isSubClassOf("LLVMPointerToVectorElt")) + Sig.push_back(IIT_PTR_TO_VEC_ELT); else Sig.push_back(IIT_ARG); return Sig.push_back((Number << 3) | ArgCodes[Number]);