Index: llvm/include/llvm/CodeGen/TargetLowering.h
===================================================================
--- llvm/include/llvm/CodeGen/TargetLowering.h
+++ llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2564,6 +2564,16 @@
     return false;
   }
 
+  /// Lower an SVE (AArch64) structured load returning a wide type to a
+  /// target-specific intrinsic that splits the wide type and creates a
+  /// multi-result node.
+  virtual SDValue lowerStructuredLoad(unsigned Intrinsic, const CallInst &I,
+                                      ArrayRef<SDValue> LoadOps,
+                                      SelectionDAG &DAG,
+                                      const SDLoc &DL) const {
+    llvm_unreachable("Not Implemented");
+  }
+
   /// Return true if zero-extending the specific node Val to type VT2 is free
   /// (either because it's implicitly zero-extended such as ARM ldrb / ldrh or
   /// because it's folded such as X86 zero-extending loads).
Index: llvm/include/llvm/IR/IntrinsicsAArch64.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -802,6 +802,10 @@
     : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty, llvm_i32_ty],
                 [IntrReadMem, IntrArgMemOnly, ImmArg<1>]>;
 
+  class AdvSIMD_ManyVec_PredLoad_Intrinsic
+    : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty, llvm_anyptr_ty],
+                [IntrReadMem, IntrArgMemOnly]>;
+
   class AdvSIMD_1Vec_PredLoad_Intrinsic
     : Intrinsic<[llvm_anyvector_ty],
                 [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
@@ -1298,6 +1302,10 @@
 // Loads
 //
 
+def int_aarch64_sve_ld2 : AdvSIMD_ManyVec_PredLoad_Intrinsic;
+def int_aarch64_sve_ld3 : AdvSIMD_ManyVec_PredLoad_Intrinsic;
+def int_aarch64_sve_ld4 : AdvSIMD_ManyVec_PredLoad_Intrinsic;
+
 def int_aarch64_sve_ldnt1 : AdvSIMD_1Vec_PredLoad_Intrinsic;
 
 def int_aarch64_sve_ldnf1 : AdvSIMD_1Vec_PredFaultingLoad_Intrinsic;
Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7033,6 +7033,17 @@
                              DAG.getZExtOrTrunc(Const, getCurSDLoc(), DestVT)));
     return;
   }
+  case Intrinsic::aarch64_sve_ld2:
+  case Intrinsic::aarch64_sve_ld3:
+  case Intrinsic::aarch64_sve_ld4: {
+    SDValue Chain = getRoot();
+    SDValue Mask = getValue(I.getArgOperand(0));
+    SDValue BasePtr = getValue(I.getArgOperand(1));
+    SDValue LoadOps[] = {Chain, Mask, BasePtr};
+    SDValue Result = TLI.lowerStructuredLoad(Intrinsic, I, LoadOps, DAG, sdl);
+    setValue(&I, Result);
+    return;
+  }
   case Intrinsic::aarch64_sve_tuple_get: {
     SDValue Src1 = getValue(I.getOperand(0));
     SDValue Idx = getValue(I.getOperand(1));
Index: llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -237,6 +237,8 @@
                   unsigned SubRegIdx);
   void SelectLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
   void SelectPostLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
+  void SelectPredicatedLoad(SDNode *N, unsigned NumVecs, const unsigned Opc,
+                            unsigned SubRegIdx);
 
   bool SelectAddrModeFrameIndexSVE(SDValue N, SDValue &Base, SDValue &OffImm);
   /// SVE Reg+Imm addressing mode.
@@ -1387,6 +1389,29 @@
   CurDAG->RemoveDeadNode(N);
 }
 
+void AArch64DAGToDAGISel::SelectPredicatedLoad(SDNode *N, unsigned NumVecs,
+                                               const unsigned Opc,
+                                               unsigned SubRegIdx) {
+  SDLoc dl(N);
+  EVT VT = N->getValueType(0);
+  SDValue Chain = N->getOperand(0);
+
+  SDValue Ops[] = {N->getOperand(1), // Predicate
+                   N->getOperand(2), // Memory operand
+                   CurDAG->getTargetConstant(0, dl, MVT::i64), Chain};
+
+  const EVT ResTys[] = {MVT::Untyped, MVT::Other};
+
+  SDNode *Load = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+  SDValue SuperReg = SDValue(Load, 0);
+  for (unsigned i = 0; i < NumVecs; ++i)
+    ReplaceUses(SDValue(N, i), CurDAG->getTargetExtractSubreg(SubRegIdx + i, dl,
+                                                              VT, SuperReg));
+
+  ReplaceUses(SDValue(N, NumVecs), SDValue(Load, 1));
+  CurDAG->RemoveDeadNode(N);
+}
+
 void AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs,
                                       unsigned Opc) {
   SDLoc dl(N);
@@ -4445,6 +4470,54 @@
     }
     break;
   }
+  case AArch64ISD::SVE_LD2: {
+    if (VT == MVT::nxv16i8) {
+      SelectPredicatedLoad(Node, 2, AArch64::LD2B_IMM, AArch64::zsub0);
+      return;
+    } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16) {
+      SelectPredicatedLoad(Node, 2, AArch64::LD2H_IMM, AArch64::zsub0);
+      return;
+    } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
+      SelectPredicatedLoad(Node, 2, AArch64::LD2W_IMM, AArch64::zsub0);
+      return;
+    } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
+      SelectPredicatedLoad(Node, 2, AArch64::LD2D_IMM, AArch64::zsub0);
+      return;
+    }
+    break;
+  }
+  case AArch64ISD::SVE_LD3: {
+    if (VT == MVT::nxv16i8) {
+      SelectPredicatedLoad(Node, 3, AArch64::LD3B_IMM, AArch64::zsub0);
+      return;
+    } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16) {
+      SelectPredicatedLoad(Node, 3, AArch64::LD3H_IMM, AArch64::zsub0);
+      return;
+    } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
+      SelectPredicatedLoad(Node, 3, AArch64::LD3W_IMM, AArch64::zsub0);
+      return;
+    } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
+      SelectPredicatedLoad(Node, 3, AArch64::LD3D_IMM, AArch64::zsub0);
+      return;
+    }
+    break;
+  }
+  case AArch64ISD::SVE_LD4: {
+    if (VT == MVT::nxv16i8) {
+      SelectPredicatedLoad(Node, 4, AArch64::LD4B_IMM, AArch64::zsub0);
+      return;
+    } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16) {
+      SelectPredicatedLoad(Node, 4, AArch64::LD4H_IMM, AArch64::zsub0);
+      return;
+    } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
+      SelectPredicatedLoad(Node, 4, AArch64::LD4W_IMM, AArch64::zsub0);
+      return;
+    } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
+      SelectPredicatedLoad(Node, 4, AArch64::LD4D_IMM, AArch64::zsub0);
+      return;
+    }
+    break;
+  }
   }
 
   // Select the default instruction
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -225,6 +225,11 @@
   LDFF1,
   LDFF1S,
 
+  // Structured loads.
+  SVE_LD2,
+  SVE_LD3,
+  SVE_LD4,
+
   // Unsigned gather loads.
   GLD1,
   GLD1_SCALED,
@@ -463,6 +468,10 @@
   bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
                              unsigned Factor) const override;
 
+  SDValue lowerStructuredLoad(unsigned Intrinsic, const CallInst &I,
+                              ArrayRef<SDValue> LoadOps, SelectionDAG &DAG,
+                              const SDLoc &DL) const override;
+
   bool isLegalAddImmediate(int64_t) const override;
   bool isLegalICmpImmediate(int64_t) const override;
 
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1404,6 +1404,9 @@
   case AArch64ISD::INSR:              return "AArch64ISD::INSR";
   case AArch64ISD::PTEST:             return "AArch64ISD::PTEST";
   case AArch64ISD::PTRUE:             return "AArch64ISD::PTRUE";
+  case AArch64ISD::SVE_LD2:           return "AArch64ISD::SVE_LD2";
+  case AArch64ISD::SVE_LD3:           return "AArch64ISD::SVE_LD3";
+  case AArch64ISD::SVE_LD4:           return "AArch64ISD::SVE_LD4";
   case AArch64ISD::LDNF1:             return "AArch64ISD::LDNF1";
   case AArch64ISD::LDNF1S:            return "AArch64ISD::LDNF1S";
   case AArch64ISD::LDFF1:             return "AArch64ISD::LDFF1";
@@ -9617,6 +9620,67 @@
   return true;
 }
 
+// Lower an SVE structured load intrinsic returning a tuple type to a
+// target-specific intrinsic taking the same input but returning a
+// multi-result value of the split tuple type.
+//
+// E.g. Lowering an ld3:
+//
+//  call <vscale x 12 x i32> @llvm.aarch64.sve.ld3.nxv12i32(
+//                                                  <vscale x 4 x i1> %pred,
+//                                                  <vscale x 12 x i32>* %addr)
+//
+//  Output DAG:
+//
+//    t0: ch = EntryToken
+//        t2: nxv4i1,ch = CopyFromReg t0, Register:nxv4i1 %0
+//        t4: i64,ch = CopyFromReg t0, Register:i64 %1
+//      t5: nxv4i32,nxv4i32,nxv4i32,ch = AArch64ISD::SVE_LD3 t0, t2, t4
+//    t6: nxv12i32 = concat_vectors t5, t5:1, t5:2
+//
+// This is called from SelectionDAGBuilder to avoid legalization issues with
+// wide irregular types such as nxv12i32 in the example above and other types
+// which aren't a power of 2 wide.
+SDValue AArch64TargetLowering::lowerStructuredLoad(unsigned Intrinsic,
+                                                   const CallInst &I,
+                                                   ArrayRef<SDValue> LoadOps,
+                                                   SelectionDAG &DAG,
+                                                   const SDLoc &DL) const {
+  auto VT = EVT::getEVT(I.getType());
+  assert(VT.isScalableVector() && "Can only lower scalable vectors");
+
+  unsigned N, Opcode;
+  switch (Intrinsic) {
+  case Intrinsic::aarch64_sve_ld2:
+    N = 2;
+    Opcode = AArch64ISD::SVE_LD2;
+    break;
+  case Intrinsic::aarch64_sve_ld3:
+    N = 3;
+    Opcode = AArch64ISD::SVE_LD3;
+    break;
+  case Intrinsic::aarch64_sve_ld4:
+    N = 4;
+    Opcode = AArch64ISD::SVE_LD4;
+    break;
+  default:
+    llvm_unreachable("unhandled intrinsic");
+  }
+
+  EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
+                                 VT.getVectorElementCount() / N);
+  assert(isTypeLegal(SplitVT));
+
+  SmallVector<EVT, 5> VTs(N, SplitVT);
+  VTs.push_back(MVT::Other); // Chain
+  SDVTList NodeTys = DAG.getVTList(VTs);
+
+  SDValue PseudoLoad = DAG.getNode(Opcode, DL, NodeTys, LoadOps);
+  SmallVector<SDValue, 4> PseudoLoadOps;
+  for (unsigned I = 0; I < N; ++I)
+    PseudoLoadOps.push_back(SDValue(PseudoLoad.getNode(), I));
+  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, PseudoLoadOps);
+}
+
 EVT AArch64TargetLowering::getOptimalMemOpType(
     const MemOp &Op, const AttributeList &FuncAttributes) const {
Index: llvm/test/CodeGen/AArch64/sve-intrinsics-loads-with-extract.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/sve-intrinsics-loads-with-extract.ll
@@ -0,0 +1,727 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -asm-verbose=0 < %s | FileCheck %s
+
+;
+; LD2B
+;
+
+define <vscale x 16 x i8> @ld2b_i8_0(<vscale x 16 x i1> %pred, <vscale x 32 x i8>* %addr) {
+; CHECK-LABEL: ld2b_i8_0:
+; CHECK: ld2b { z0.b, z1.b }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 32 x i8> @llvm.aarch64.sve.ld2.nxv32i8(<vscale x 16 x i1> %pred,
+                                                               <vscale x 32 x i8>* %addr)
+  %v1 = call <vscale x 16 x i8> @llvm.aarch64.sve.tuple.get.nxv32i8(<vscale x 32 x i8> %res, i32 0)
+  ret <vscale x 16 x i8> %v1
+}
+
+define <vscale x 16 x i8> @ld2b_i8_1(<vscale x 16 x i1> %pred, <vscale x 32 x i8>* %addr) {
+; CHECK-LABEL: ld2b_i8_1:
+; CHECK: ld2b { z31.b, z0.b }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 32 x i8> @llvm.aarch64.sve.ld2.nxv32i8(<vscale x 16 x i1> %pred,
+                                                               <vscale x 32 x i8>* %addr)
+  %v2 = call <vscale x 16 x i8> @llvm.aarch64.sve.tuple.get.nxv32i8(<vscale x 32 x i8> %res, i32 1)
+  ret <vscale x 16 x i8> %v2
+}
+
+;
+; LD2H
+;
+
+define <vscale x 8 x i16> @ld2h_i16_0(<vscale x 8 x i1> %pred, <vscale x 16 x i16>* %addr) {
+; CHECK-LABEL: ld2h_i16_0:
+; CHECK: ld2h { z0.h, z1.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 16 x i16> @llvm.aarch64.sve.ld2.nxv16i16(<vscale x 8 x i1> %pred,
+                                                                 <vscale x 16 x i16>* %addr)
+  %v1 = call <vscale x 8 x i16> @llvm.aarch64.sve.tuple.get.nxv16i16(<vscale x 16 x i16> %res, i32 0)
+  ret <vscale x 8 x i16> %v1
+}
+
+define <vscale x 8 x i16> @ld2h_i16_1(<vscale x 8 x i1> %pred, <vscale x 16 x i16>* %addr) {
+; CHECK-LABEL: ld2h_i16_1:
+; CHECK: ld2h { z31.h, z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 16 x i16> @llvm.aarch64.sve.ld2.nxv16i16(<vscale x 8 x i1> %pred,
+                                                                 <vscale x 16 x i16>* %addr)
+  %v2 = call <vscale x 8 x i16> @llvm.aarch64.sve.tuple.get.nxv16i16(<vscale x 16 x i16> %res, i32 1)
+  ret <vscale x 8 x i16> %v2
+}
+
+define <vscale x 8 x half> @ld2h_f16_0(<vscale x 8 x i1> %pred, <vscale x 16 x half>* %addr) {
+; CHECK-LABEL: ld2h_f16_0:
+; CHECK: ld2h { z0.h, z1.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 16 x half> @llvm.aarch64.sve.ld2.nxv16f16(<vscale x 8 x i1> %pred,
+                                                                  <vscale x 16 x half>* %addr)
+  %v1 = call <vscale x 8 x half> @llvm.aarch64.sve.tuple.get.nxv16f16(<vscale x 16 x half> %res, i32 0)
+  ret <vscale x 8 x half> %v1
+}
+
+define <vscale x 8 x half> @ld2h_f16_1(<vscale x 8 x i1> %pred, <vscale x 16 x half>* %addr) {
+; CHECK-LABEL: ld2h_f16_1:
+; CHECK: ld2h { z31.h, z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 16 x half> @llvm.aarch64.sve.ld2.nxv16f16(<vscale x 8 x i1> %pred,
+                                                                  <vscale x 16 x half>* %addr)
+  %v2 = call <vscale x 8 x half> @llvm.aarch64.sve.tuple.get.nxv16f16(<vscale x 16 x half> %res, i32 1)
+  ret <vscale x 8 x half> %v2
+}
+
+;
+; LD2W
+;
+
+define <vscale x 4 x i32> @ld2w_i32_0(<vscale x 4 x i1> %pred, <vscale x 8 x i32>* %addr) {
+; CHECK-LABEL: ld2w_i32_0:
+; CHECK: ld2w { z0.s, z1.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 8 x i32> @llvm.aarch64.sve.ld2.nxv8i32(<vscale x 4 x i1> %pred,
+                                                               <vscale x 8 x i32>* %addr)
+  %v1 = call <vscale x 4 x i32> @llvm.aarch64.sve.tuple.get.nxv8i32(<vscale x 8 x i32> %res, i32 0)
+  ret <vscale x 4 x i32> %v1
+}
+
+define <vscale x 4 x i32> @ld2w_i32_1(<vscale x 4 x i1> %pred, <vscale x 8 x i32>*
%addr) { +; CHECK-LABEL: ld2w_i32_1: +; CHECK: ld2w { z31.s, z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld2.nxv8i32( %pred, + * %addr) + %v2 = call @llvm.aarch64.sve.tuple.get.nxv8i32( %res, i32 1) + ret %v2 +} + +define @ld2w_f32_0( %pred, * %addr) { +; CHECK-LABEL: ld2w_f32_0: +; CHECK: ld2w { z0.s, z1.s }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld2.nxv8f32( %pred, + * %addr) + %v1 = call @llvm.aarch64.sve.tuple.get.nxv8f32( %res, i32 0) + ret %v1 +} + +define @ld2w_f32_1( %pred, * %addr) { +; CHECK-LABEL: ld2w_f32_1: +; CHECK: ld2w { z31.s, z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld2.nxv8f32( %pred, + * %addr) + %v2 = call @llvm.aarch64.sve.tuple.get.nxv8f32( %res, i32 1) + ret %v2 +} + +; +; LD2D +; + +define @ld2d_i64_0( %pred, * %addr) { +; CHECK-LABEL: ld2d_i64_0: +; CHECK: ld2d { z0.d, z1.d }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld2.nxv4i64( %pred, + * %addr) + %v1 = call @llvm.aarch64.sve.tuple.get.nxv4i64( %res, i32 0) + ret %v1 +} + +define @ld2d_i64_1( %pred, * %addr) { +; CHECK-LABEL: ld2d_i64_1: +; CHECK: ld2d { z31.d, z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld2.nxv4i64( %pred, + * %addr) + %v2 = call @llvm.aarch64.sve.tuple.get.nxv4i64( %res, i32 1) + ret %v2 +} + +define @ld2d_f64_0( %pred, * %addr) { +; CHECK-LABEL: ld2d_f64_0: +; CHECK: ld2d { z0.d, z1.d }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld2.nxv4f64( %pred, + * %addr) + %v1 = call @llvm.aarch64.sve.tuple.get.nxv4f64( %res, i32 0) + ret %v1 +} + +define @ld2d_f64_1( %pred, * %addr) { +; CHECK-LABEL: ld2d_f64_1: +; CHECK: ld2d { z31.d, z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld2.nxv4f64( %pred, + * %addr) + %v2 = call @llvm.aarch64.sve.tuple.get.nxv4f64( %res, i32 1) + ret %v2 +} + +; +; LD3B +; + +define @ld3b_i8_0( %pred, * %addr) { +; CHECK-LABEL: ld3b_i8_0: +; CHECK: ld3b { z0.b, z1.b, z2.b }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld3.nxv48i8( %pred, + * %addr) + %v1 = call @llvm.aarch64.sve.tuple.get.nxv48i8( %res, i32 0); + ret %v1 +} + +define @ld3b_i8_1( %pred, * %addr) { +; CHECK-LABEL: ld3b_i8_1: +; CHECK: ld3b { z31.b, z0.b, z1.b }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld3.nxv48i8( %pred, + * %addr) + %v2 = call @llvm.aarch64.sve.tuple.get.nxv48i8( %res, i32 1); + ret %v2 +} + +define @ld3b_i8_2( %pred, * %addr) { +; CHECK-LABEL: ld3b_i8_2: +; CHECK: ld3b { z30.b, z31.b, z0.b }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld3.nxv48i8( %pred, + * %addr) + %v3 = call @llvm.aarch64.sve.tuple.get.nxv48i8( %res, i32 2); + ret %v3 +} + +; +; LD3H +; + +define @ld3h_i16_0( %pred, * %addr) { +; CHECK-LABEL: ld3h_i16_0: +; CHECK: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld3.nxv24i16( %pred, + * %addr) + %v1 = call @llvm.aarch64.sve.tuple.get.nxv24i16( %res, i32 0); + ret %v1 +} + +define @ld3h_i16_1( %pred, * %addr) { +; CHECK-LABEL: ld3h_i16_1: +; CHECK: ld3h { z31.h, z0.h, z1.h }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld3.nxv24i16( %pred, + * %addr) + %v2 = call @llvm.aarch64.sve.tuple.get.nxv24i16( %res, i32 1); + ret %v2 +} + +define @ld3h_i16_2( %pred, * %addr) { +; CHECK-LABEL: ld3h_i16_2: +; CHECK: ld3h { z30.h, z31.h, z0.h }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld3.nxv24i16( %pred, + * %addr) + %v3 = call @llvm.aarch64.sve.tuple.get.nxv24i16( %res, i32 2); + 
ret %v3 +} + +define @ld3h_f16_0( %pred, * %addr) { +; CHECK-LABEL: ld3h_f16_0: +; CHECK: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld3.nxv24f16( %pred, + * %addr) + %v1 = call @llvm.aarch64.sve.tuple.get.nxv24f16( %res, i32 0); + ret %v1 +} + +define @ld3h_f16_1( %pred, * %addr) { +; CHECK-LABEL: ld3h_f16_1: +; CHECK: ld3h { z31.h, z0.h, z1.h }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld3.nxv24f16( %pred, + * %addr) + %v2 = call @llvm.aarch64.sve.tuple.get.nxv24f16( %res, i32 1); + ret %v2 +} + +define @ld3h_f16_2( %pred, * %addr) { +; CHECK-LABEL: ld3h_f16_2: +; CHECK: ld3h { z30.h, z31.h, z0.h }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld3.nxv24f16( %pred, + * %addr) + %v3 = call @llvm.aarch64.sve.tuple.get.nxv24f16( %res, i32 2); + ret %v3 +} + +; +; LD3W +; + +define @ld3w_i32_0( %pred, * %addr) { +; CHECK-LABEL: ld3w_i32_0: +; CHECK: ld3w { z0.s, z1.s, z2.s }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld3.nxv12i32( %pred, + * %addr) + %v1 = call @llvm.aarch64.sve.tuple.get.nxv12i32( %res, i32 0); + ret %v1 +} + +define @ld3w_i32_1( %pred, * %addr) { +; CHECK-LABEL: ld3w_i32_1: +; CHECK: ld3w { z31.s, z0.s, z1.s }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld3.nxv12i32( %pred, + * %addr) + %v2 = call @llvm.aarch64.sve.tuple.get.nxv12i32( %res, i32 1); + ret %v2 +} + +define @ld3w_i32_2( %pred, * %addr) { +; CHECK-LABEL: ld3w_i32_2: +; CHECK: ld3w { z30.s, z31.s, z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld3.nxv12i32( %pred, + * %addr) + %v3 = call @llvm.aarch64.sve.tuple.get.nxv12i32( %res, i32 2); + ret %v3 +} + +define @ld3w_f32_0( %pred, * %addr) { +; CHECK-LABEL: ld3w_f32_0: +; CHECK: ld3w { z0.s, z1.s, z2.s }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld3.nxv12f32( %pred, + * %addr) + %v1 = call @llvm.aarch64.sve.tuple.get.nxv12f32( %res, i32 0); + ret %v1 +} + +define @ld3w_f32_1( %pred, * %addr) { +; CHECK-LABEL: ld3w_f32_1: +; CHECK: ld3w { z31.s, z0.s, z1.s }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld3.nxv12f32( %pred, + * %addr) + %v2 = call @llvm.aarch64.sve.tuple.get.nxv12f32( %res, i32 1); + ret %v2 +} + +define @ld3w_f32_2( %pred, * %addr) { +; CHECK-LABEL: ld3w_f32_2: +; CHECK: ld3w { z30.s, z31.s, z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld3.nxv12f32( %pred, + * %addr) + %v3 = call @llvm.aarch64.sve.tuple.get.nxv12f32( %res, i32 2); + ret %v3 +} + +; +; LD3D +; + +define @ld3d_i64_0( %pred, * %addr) { +; CHECK-LABEL: ld3d_i64_0: +; CHECK: ld3d { z0.d, z1.d, z2.d }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld3.nxv6i64( %pred, + * %addr) + %v1 = call @llvm.aarch64.sve.tuple.get.nxv6i64( %res, i32 0); + ret %v1 +} + +define @ld3d_i64_1( %pred, * %addr) { +; CHECK-LABEL: ld3d_i64_1: +; CHECK: ld3d { z31.d, z0.d, z1.d }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld3.nxv6i64( %pred, + * %addr) + %v2 = call @llvm.aarch64.sve.tuple.get.nxv6i64( %res, i32 1); + ret %v2 +} + +define @ld3d_i64_2( %pred, * %addr) { +; CHECK-LABEL: ld3d_i64_2: +; CHECK: ld3d { z30.d, z31.d, z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld3.nxv6i64( %pred, + * %addr) + %v3 = call @llvm.aarch64.sve.tuple.get.nxv6i64( %res, i32 2); + ret %v3 +} + +define @ld3d_f64_0( %pred, * %addr) { +; CHECK-LABEL: ld3d_f64_0: +; CHECK: ld3d { z0.d, z1.d, z2.d }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call 
@llvm.aarch64.sve.ld3.nxv6f64( %pred, + * %addr) + %v1 = call @llvm.aarch64.sve.tuple.get.nxv6f64( %res, i32 0); + ret %v1 +} + +define @ld3d_f64_1( %pred, * %addr) { +; CHECK-LABEL: ld3d_f64_1: +; CHECK: ld3d { z31.d, z0.d, z1.d }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld3.nxv6f64( %pred, + * %addr) + %v2 = call @llvm.aarch64.sve.tuple.get.nxv6f64( %res, i32 1); + ret %v2 +} + +define @ld3d_f64_2( %pred, * %addr) { +; CHECK-LABEL: ld3d_f64_2: +; CHECK: ld3d { z30.d, z31.d, z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld3.nxv6f64( %pred, + * %addr) + %v3 = call @llvm.aarch64.sve.tuple.get.nxv6f64( %res, i32 2); + ret %v3 +} + +; +; LD4B +; + +define @ld4b_i8_0( %pred, * %addr) { +; CHECK-LABEL: ld4b_i8_0: +; CHECK: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv64i8( %pred, + * %addr) + %v1 = call @llvm.aarch64.sve.tuple.get.nxv64i8( %res, i32 0); + ret %v1 +} + +define @ld4b_i8_1( %pred, * %addr) { +; CHECK-LABEL: ld4b_i8_1: +; CHECK: ld4b { z31.b, z0.b, z1.b, z2.b }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv64i8( %pred, + * %addr) + %v2 = call @llvm.aarch64.sve.tuple.get.nxv64i8( %res, i32 1); + ret %v2 +} + +define @ld4b_i8_2( %pred, * %addr) { +; CHECK-LABEL: ld4b_i8_2: +; CHECK: ld4b { z30.b, z31.b, z0.b, z1.b }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv64i8( %pred, + * %addr) + %v3 = call @llvm.aarch64.sve.tuple.get.nxv64i8( %res, i32 2); + ret %v3 +} + +define @ld4b_i8_3( %pred, * %addr) { +; CHECK-LABEL: ld4b_i8_3: +; CHECK: ld4b { z29.b, z30.b, z31.b, z0.b }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv64i8( %pred, + * %addr) + %v4 = call @llvm.aarch64.sve.tuple.get.nxv64i8( %res, i32 3); + ret %v4 +} + +; +; LD4H +; + +define @ld4h_i16_0( %pred, * %addr) { +; CHECK-LABEL: ld4h_i16_0: +; CHECK: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv32i16( %pred, + * %addr) + %v1 = call @llvm.aarch64.sve.tuple.get.nxv32i16( %res, i32 0); + ret %v1 +} + +define @ld4h_i16_1( %pred, * %addr) { +; CHECK-LABEL: ld4h_i16_1: +; CHECK: ld4h { z31.h, z0.h, z1.h, z2.h }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv32i16( %pred, + * %addr) + %v2 = call @llvm.aarch64.sve.tuple.get.nxv32i16( %res, i32 1); + ret %v2 +} + +define @ld4h_i16_2( %pred, * %addr) { +; CHECK-LABEL: ld4h_i16_2: +; CHECK: ld4h { z30.h, z31.h, z0.h, z1.h }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv32i16( %pred, + * %addr) + %v3 = call @llvm.aarch64.sve.tuple.get.nxv32i16( %res, i32 2); + ret %v3 +} + +define @ld4h_i16_3( %pred, * %addr) { +; CHECK-LABEL: ld4h_i16_3: +; CHECK: ld4h { z29.h, z30.h, z31.h, z0.h }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv32i16( %pred, + * %addr) + %v4 = call @llvm.aarch64.sve.tuple.get.nxv32i16( %res, i32 3); + ret %v4 +} + +define @ld4h_f16_0( %pred, * %addr) { +; CHECK-LABEL: ld4h_f16_0: +; CHECK: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv32f16( %pred, + * %addr) + %v1 = call @llvm.aarch64.sve.tuple.get.nxv32f16( %res, i32 0); + ret %v1 +} + +define @ld4h_f16_1( %pred, * %addr) { +; CHECK-LABEL: ld4h_f16_1: +; CHECK: ld4h { z31.h, z0.h, z1.h, z2.h }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv32f16( %pred, + * %addr) + %v2 = call @llvm.aarch64.sve.tuple.get.nxv32f16( %res, i32 1); + ret %v2 +} + 
+define @ld4h_f16_2( %pred, * %addr) { +; CHECK-LABEL: ld4h_f16_2: +; CHECK: ld4h { z30.h, z31.h, z0.h, z1.h }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv32f16( %pred, + * %addr) + %v3 = call @llvm.aarch64.sve.tuple.get.nxv32f16( %res, i32 2); + ret %v3 +} + +define @ld4h_f16_3( %pred, * %addr) { +; CHECK-LABEL: ld4h_f16_3: +; CHECK: ld4h { z29.h, z30.h, z31.h, z0.h }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv32f16( %pred, + * %addr) + %v4 = call @llvm.aarch64.sve.tuple.get.nxv32f16( %res, i32 3); + ret %v4 +} + +; +; LD4W +; + +define @ld4w_i32_0( %pred, * %addr) { +; CHECK-LABEL: ld4w_i32_0: +; CHECK: ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv16i32( %pred, + * %addr) + %v1 = call @llvm.aarch64.sve.tuple.get.nxv16i32( %res, i32 0); + ret %v1 +} + +define @ld4w_i32_1( %pred, * %addr) { +; CHECK-LABEL: ld4w_i32_1: +; CHECK: ld4w { z31.s, z0.s, z1.s, z2.s }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv16i32( %pred, + * %addr) + %v2 = call @llvm.aarch64.sve.tuple.get.nxv16i32( %res, i32 1); + ret %v2 +} + +define @ld4w_i32_2( %pred, * %addr) { +; CHECK-LABEL: ld4w_i32_2: +; CHECK: ld4w { z30.s, z31.s, z0.s, z1.s }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv16i32( %pred, + * %addr) + %v3 = call @llvm.aarch64.sve.tuple.get.nxv16i32( %res, i32 2); + ret %v3 +} + +define @ld4w_i32_3( %pred, * %addr) { +; CHECK-LABEL: ld4w_i32_3: +; CHECK: ld4w { z29.s, z30.s, z31.s, z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv16i32( %pred, + * %addr) + %v4 = call @llvm.aarch64.sve.tuple.get.nxv16i32( %res, i32 3); + ret %v4 +} + +define @ld4w_f32_0( %pred, * %addr) { +; CHECK-LABEL: ld4w_f32_0: +; CHECK: ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv16f32( %pred, + * %addr) + %v1 = call @llvm.aarch64.sve.tuple.get.nxv16f32( %res, i32 0); + ret %v1 +} + +define @ld4w_f32_1( %pred, * %addr) { +; CHECK-LABEL: ld4w_f32_1: +; CHECK: ld4w { z31.s, z0.s, z1.s, z2.s }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv16f32( %pred, + * %addr) + %v2 = call @llvm.aarch64.sve.tuple.get.nxv16f32( %res, i32 1); + ret %v2 +} + +define @ld4w_f32_2( %pred, * %addr) { +; CHECK-LABEL: ld4w_f32_2: +; CHECK: ld4w { z30.s, z31.s, z0.s, z1.s }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv16f32( %pred, + * %addr) + %v3 = call @llvm.aarch64.sve.tuple.get.nxv16f32( %res, i32 2); + ret %v3 +} + +define @ld4w_f32_3( %pred, * %addr) { +; CHECK-LABEL: ld4w_f32_3: +; CHECK: ld4w { z29.s, z30.s, z31.s, z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv16f32( %pred, + * %addr) + %v4 = call @llvm.aarch64.sve.tuple.get.nxv16f32( %res, i32 3); + ret %v4 +} + +; +; LD4D +; + +define @ld4d_i64_0( %pred, * %addr) { +; CHECK-LABEL: ld4d_i64_0: +; CHECK: ld4d { z0.d, z1.d, z2.d, z3.d }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv8i64( %pred, + * %addr) + %v1 = call @llvm.aarch64.sve.tuple.get.nxv8i64( %res, i32 0); + ret %v1 +} + +define @ld4d_i64_1( %pred, * %addr) { +; CHECK-LABEL: ld4d_i64_1: +; CHECK: ld4d { z31.d, z0.d, z1.d, z2.d }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv8i64( %pred, + * %addr) + %v2 = call @llvm.aarch64.sve.tuple.get.nxv8i64( %res, i32 1); + ret %v2 +} + +define @ld4d_i64_2( %pred, * %addr) { +; CHECK-LABEL: ld4d_i64_2: +; CHECK: ld4d { z30.d, z31.d, 
z0.d, z1.d }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv8i64( %pred, + * %addr) + %v3 = call @llvm.aarch64.sve.tuple.get.nxv8i64( %res, i32 2); + ret %v3 +} + +define @ld4d_i64_3( %pred, * %addr) { +; CHECK-LABEL: ld4d_i64_3: +; CHECK: ld4d { z29.d, z30.d, z31.d, z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv8i64( %pred, + * %addr) + %v4 = call @llvm.aarch64.sve.tuple.get.nxv8i64( %res, i32 3); + ret %v4 +} + +define @ld4d_f64_0( %pred, * %addr) { +; CHECK-LABEL: ld4d_f64_0: +; CHECK: ld4d { z0.d, z1.d, z2.d, z3.d }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv8f64( %pred, + * %addr) + %v1 = call @llvm.aarch64.sve.tuple.get.nxv8f64( %res, i32 0); + ret %v1 +} + +define @ld4d_f64_1( %pred, * %addr) { +; CHECK-LABEL: ld4d_f64_1: +; CHECK: ld4d { z31.d, z0.d, z1.d, z2.d }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv8f64( %pred, + * %addr) + %v2 = call @llvm.aarch64.sve.tuple.get.nxv8f64( %res, i32 1); + ret %v2 +} + +define @ld4d_f64_2( %pred, * %addr) { +; CHECK-LABEL: ld4d_f64_2: +; CHECK: ld4d { z30.d, z31.d, z0.d, z1.d }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv8f64( %pred, + * %addr) + %v3 = call @llvm.aarch64.sve.tuple.get.nxv8f64( %res, i32 2); + ret %v3 +} + +define @ld4d_f64_3( %pred, * %addr) { +; CHECK-LABEL: ld4d_f64_3: +; CHECK: ld4d { z29.d, z30.d, z31.d, z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv8f64( %pred, + * %addr) + %v4 = call @llvm.aarch64.sve.tuple.get.nxv8f64( %res, i32 3); + ret %v4 +} + +declare @llvm.aarch64.sve.tuple.get.nxv32i8(, i32) +declare @llvm.aarch64.sve.tuple.get.nxv16i16(, i32) +declare @llvm.aarch64.sve.tuple.get.nxv8i32(, i32) +declare @llvm.aarch64.sve.tuple.get.nxv4i64(, i32) +declare @llvm.aarch64.sve.tuple.get.nxv16f16(, i32) +declare @llvm.aarch64.sve.tuple.get.nxv8f32(, i32) +declare @llvm.aarch64.sve.tuple.get.nxv4f64(, i32) + +declare @llvm.aarch64.sve.tuple.get.nxv48i8(, i32) +declare @llvm.aarch64.sve.tuple.get.nxv24i16(, i32) +declare @llvm.aarch64.sve.tuple.get.nxv12i32(, i32) +declare @llvm.aarch64.sve.tuple.get.nxv6i64(, i32) +declare @llvm.aarch64.sve.tuple.get.nxv24f16(, i32) +declare @llvm.aarch64.sve.tuple.get.nxv12f32(, i32) +declare @llvm.aarch64.sve.tuple.get.nxv6f64(, i32) + +declare @llvm.aarch64.sve.tuple.get.nxv64i8(, i32) +declare @llvm.aarch64.sve.tuple.get.nxv32i16(, i32) +declare @llvm.aarch64.sve.tuple.get.nxv16i32(, i32) +declare @llvm.aarch64.sve.tuple.get.nxv8i64(, i32) +declare @llvm.aarch64.sve.tuple.get.nxv32f16(, i32) +declare @llvm.aarch64.sve.tuple.get.nxv16f32(, i32) +declare @llvm.aarch64.sve.tuple.get.nxv8f64(, i32) + +declare @llvm.aarch64.sve.ld2.nxv32i8(, *) +declare @llvm.aarch64.sve.ld2.nxv16i16(, *) +declare @llvm.aarch64.sve.ld2.nxv8i32(, *) +declare @llvm.aarch64.sve.ld2.nxv4i64(, *) +declare @llvm.aarch64.sve.ld2.nxv16f16(, *) +declare @llvm.aarch64.sve.ld2.nxv8f32(, *) +declare @llvm.aarch64.sve.ld2.nxv4f64(, *) + +declare @llvm.aarch64.sve.ld3.nxv48i8(, *) +declare @llvm.aarch64.sve.ld3.nxv24i16(, *) +declare @llvm.aarch64.sve.ld3.nxv12i32(, *) +declare @llvm.aarch64.sve.ld3.nxv6i64(, *) +declare @llvm.aarch64.sve.ld3.nxv24f16(, *) +declare @llvm.aarch64.sve.ld3.nxv12f32(, *) +declare @llvm.aarch64.sve.ld3.nxv6f64(, *) + +declare @llvm.aarch64.sve.ld4.nxv64i8(, *) +declare @llvm.aarch64.sve.ld4.nxv32i16(, *) +declare @llvm.aarch64.sve.ld4.nxv16i32(, *) +declare @llvm.aarch64.sve.ld4.nxv8i64(, *) +declare 
@llvm.aarch64.sve.ld4.nxv32f16(, *) +declare @llvm.aarch64.sve.ld4.nxv16f32(, *) +declare @llvm.aarch64.sve.ld4.nxv8f64(, *) Index: llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll =================================================================== --- llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll +++ llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -asm-verbose=0 < %s | FileCheck %s ; ; LDNT1B @@ -79,6 +79,243 @@ ret %res } +; +; LD2B +; + +define @ld2b_i8( %pred, * %addr) { +; CHECK-LABEL: ld2b_i8: +; CHECK: ld2b { z0.b, z1.b }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld2.nxv32i8( %pred, + * %addr) + ret %res +} + +; +; LD2H +; + +define @ld2h_i16( %pred, * %addr) { +; CHECK-LABEL: ld2h_i16: +; CHECK: ld2h { z0.h, z1.h }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld2.nxv16i16( %pred, + * %addr) + ret %res +} + +define @ld2h_f16( %pred, * %addr) { +; CHECK-LABEL: ld2h_f16: +; CHECK: ld2h { z0.h, z1.h }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld2.nxv16f16( %pred, + * %addr) + ret %res +} + +; +; LD2W +; + +define @ld2w_i32( %pred, * %addr) { +; CHECK-LABEL: ld2w_i32: +; CHECK: ld2w { z0.s, z1.s }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld2.nxv8i32( %pred, + * %addr) + ret %res +} + +define @ld2w_f32( %pred, * %addr) { +; CHECK-LABEL: ld2w_f32: +; CHECK: ld2w { z0.s, z1.s }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld2.nxv8f32( %pred, + * %addr) + ret %res +} + +; +; LD2D +; + +define @ld2d_i64( %pred, * %addr) { +; CHECK-LABEL: ld2d_i64: +; CHECK: ld2d { z0.d, z1.d }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld2.nxv4i64( %pred, + * %addr) + ret %res +} + +define @ld2d_f64( %pred, * %addr) { +; CHECK-LABEL: ld2d_f64: +; CHECK: ld2d { z0.d, z1.d }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld2.nxv4f64( %pred, + * %addr) + ret %res +} + +; +; LD3B +; + +define @ld3b_i8( %pred, * %addr) { +; CHECK-LABEL: ld3b_i8: +; CHECK: ld3b { z0.b, z1.b, z2.b }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld3.nxv48i8( %pred, + * %addr) + ret %res +} + +; +; LD3H +; + +define @ld3h_i16( %pred, * %addr) { +; CHECK-LABEL: ld3h_i16: +; CHECK: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld3.nxv24i16( %pred, + * %addr) + ret %res +} + +define @ld3h_f16( %pred, * %addr) { +; CHECK-LABEL: ld3h_f16: +; CHECK: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld3.nxv24f16( %pred, + * %addr) + ret %res +} + +; +; LD3W +; + +define @ld3w_i32( %pred, * %addr) { +; CHECK-LABEL: ld3w_i32: +; CHECK: ld3w { z0.s, z1.s, z2.s }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld3.nxv12i32( %pred, + * %addr) + ret %res +} + +define @ld3w_f32( %pred, * %addr) { +; CHECK-LABEL: ld3w_f32: +; CHECK: ld3w { z0.s, z1.s, z2.s }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld3.nxv12f32( %pred, + * %addr) + ret %res +} + +; +; LD3D +; + +define @ld3d_i64( %pred, * %addr) { +; CHECK-LABEL: ld3d_i64: +; CHECK: ld3d { z0.d, z1.d, z2.d }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld3.nxv6i64( %pred, + * %addr) + ret %res +} + +define @ld3d_f64( %pred, * %addr) { +; CHECK-LABEL: ld3d_f64: +; CHECK: ld3d { z0.d, z1.d, z2.d }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld3.nxv6f64( 
%pred, + * %addr) + ret %res +} + +; +; LD4B +; + +define @ld4b_i8( %pred, * %addr) { +; CHECK-LABEL: ld4b_i8: +; CHECK: ld4b { z0.b, z1.b, z2.b, z3.b }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv64i8( %pred, + * %addr) + ret %res +} + +; +; LD4H +; + +define @ld4h_i16( %pred, * %addr) { +; CHECK-LABEL: ld4h_i16: +; CHECK: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv32i16( %pred, + * %addr) + ret %res +} + +define @ld4h_f16( %pred, * %addr) { +; CHECK-LABEL: ld4h_f16: +; CHECK: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv32f16( %pred, + * %addr) + ret %res +} + +; +; LD4W +; + +define @ld4w_i32( %pred, * %addr) { +; CHECK-LABEL: ld4w_i32: +; CHECK: ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv16i32( %pred, + * %addr) + ret %res +} + +define @ld4w_f32( %pred, * %addr) { +; CHECK-LABEL: ld4w_f32: +; CHECK: ld4w { z0.s, z1.s, z2.s, z3.s }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv16f32( %pred, + * %addr) + ret %res +} + +; +; LD4D +; + +define @ld4d_i64( %pred, * %addr) { +; CHECK-LABEL: ld4d_i64: +; CHECK: ld4d { z0.d, z1.d, z2.d, z3.d }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv8i64( %pred, + * %addr) + ret %res +} + +define @ld4d_f64( %pred, * %addr) { +; CHECK-LABEL: ld4d_f64: +; CHECK: ld4d { z0.d, z1.d, z2.d, z3.d }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld4.nxv8f64( %pred, + * %addr) + ret %res +} + declare @llvm.aarch64.sve.ldnt1.nxv16i8(, *) declare @llvm.aarch64.sve.ldnt1.nxv8i16(, *) declare @llvm.aarch64.sve.ldnt1.nxv4i32(, *) @@ -86,3 +323,27 @@ declare @llvm.aarch64.sve.ldnt1.nxv8f16(, *) declare @llvm.aarch64.sve.ldnt1.nxv4f32(, *) declare @llvm.aarch64.sve.ldnt1.nxv2f64(, *) + +declare @llvm.aarch64.sve.ld2.nxv32i8(, *) +declare @llvm.aarch64.sve.ld2.nxv16i16(, *) +declare @llvm.aarch64.sve.ld2.nxv8i32(, *) +declare @llvm.aarch64.sve.ld2.nxv4i64(, *) +declare @llvm.aarch64.sve.ld2.nxv16f16(, *) +declare @llvm.aarch64.sve.ld2.nxv8f32(, *) +declare @llvm.aarch64.sve.ld2.nxv4f64(, *) + +declare @llvm.aarch64.sve.ld3.nxv48i8(, *) +declare @llvm.aarch64.sve.ld3.nxv24i16(, *) +declare @llvm.aarch64.sve.ld3.nxv12i32(, *) +declare @llvm.aarch64.sve.ld3.nxv6i64(, *) +declare @llvm.aarch64.sve.ld3.nxv24f16(, *) +declare @llvm.aarch64.sve.ld3.nxv12f32(, *) +declare @llvm.aarch64.sve.ld3.nxv6f64(, *) + +declare @llvm.aarch64.sve.ld4.nxv64i8(, *) +declare @llvm.aarch64.sve.ld4.nxv32i16(, *) +declare @llvm.aarch64.sve.ld4.nxv16i32(, *) +declare @llvm.aarch64.sve.ld4.nxv8i64(, *) +declare @llvm.aarch64.sve.ld4.nxv32f16(, *) +declare @llvm.aarch64.sve.ld4.nxv16f32(, *) +declare @llvm.aarch64.sve.ld4.nxv8f64(, *)
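
For illustration, the short IR sketch below (not part of the diff) shows how a caller might combine one ld3 with several tuple.get extracts so that a single ld3w serves all three parts, each extract becoming a subregister copy of the same result tuple. The function name and the adds are hypothetical, and the operand types assume the convention used in the tests above (a predicate sized for the part type and a pointer to the wide tuple type).

; Sketch only: one ld3 feeding all three tuple parts (assumed operand types).
define <vscale x 4 x i32> @ld3w_all_parts(<vscale x 4 x i1> %pred, <vscale x 12 x i32>* %addr) {
  ; A single wide structured load; selection should emit one ld3w.
  %tuple = call <vscale x 12 x i32> @llvm.aarch64.sve.ld3.nxv12i32(<vscale x 4 x i1> %pred,
                                                                   <vscale x 12 x i32>* %addr)
  ; Each extract maps to one zsub0-zsub2 subregister of the ld3w result.
  %p0 = call <vscale x 4 x i32> @llvm.aarch64.sve.tuple.get.nxv12i32(<vscale x 12 x i32> %tuple, i32 0)
  %p1 = call <vscale x 4 x i32> @llvm.aarch64.sve.tuple.get.nxv12i32(<vscale x 12 x i32> %tuple, i32 1)
  %p2 = call <vscale x 4 x i32> @llvm.aarch64.sve.tuple.get.nxv12i32(<vscale x 12 x i32> %tuple, i32 2)
  ; Hypothetical arithmetic just to keep all three parts live.
  %s01 = add <vscale x 4 x i32> %p0, %p1
  %s012 = add <vscale x 4 x i32> %s01, %p2
  ret <vscale x 4 x i32> %s012
}

declare <vscale x 12 x i32> @llvm.aarch64.sve.ld3.nxv12i32(<vscale x 4 x i1>, <vscale x 12 x i32>*)
declare <vscale x 4 x i32> @llvm.aarch64.sve.tuple.get.nxv12i32(<vscale x 12 x i32>, i32)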