diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -1241,6 +1241,31 @@ ], [IntrWriteMem, IntrArgMemOnly]>; +// +// Non-temporal gather load/scatter store +// + +class SVE2_NTGatherLoad_VectorBase_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [ + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_anyvector_ty, + llvm_i64_ty + ], + [IntrReadMem, IntrArgMemOnly]>; + +class SVE2_NTScatterStore_VectorBase_Intrinsic + : Intrinsic<[], + [ + llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_anyvector_ty, llvm_i64_ty + ], + [IntrWriteMem, IntrArgMemOnly]>; + +def int_aarch64_sve_ldnt1_gather : SVE2_NTGatherLoad_VectorBase_Intrinsic; +def int_aarch64_sve_stnt1_scatter : SVE2_NTScatterStore_VectorBase_Intrinsic; + // // Loads // diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -240,6 +240,11 @@ GLD1S_UXTW_SCALED, GLD1S_SXTW_SCALED, GLD1S_IMM, + + // Non-temporal gather loads + GLDNT1, + GLDNT1S, + // Scatter store SST1, SST1_SCALED, @@ -249,6 +254,9 @@ SST1_SXTW_SCALED, SST1_IMM, + // Non-temporal scatter store + SSTNT1, + // Strict (exception-raising) floating point comparison STRICT_FCMP = ISD::FIRST_TARGET_STRICTFP_OPCODE, STRICT_FCMPE, diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1415,6 +1415,8 @@ case AArch64ISD::GLD1S_SXTW_SCALED: return "AArch64ISD::GLD1S_SXTW_SCALED"; case AArch64ISD::GLD1S_UXTW_SCALED: return "AArch64ISD::GLD1S_UXTW_SCALED"; case AArch64ISD::GLD1S_IMM: return "AArch64ISD::GLD1S_IMM"; + case AArch64ISD::GLDNT1: return "AArch64ISD::GLDNT1"; + case AArch64ISD::GLDNT1S: return "AArch64ISD::GLDNT1S"; case AArch64ISD::SST1: return "AArch64ISD::SST1"; case AArch64ISD::SST1_SCALED: return "AArch64ISD::SST1_SCALED"; case AArch64ISD::SST1_SXTW: return "AArch64ISD::SST1_SXTW"; @@ -1422,6 +1424,7 @@ case AArch64ISD::SST1_SXTW_SCALED: return "AArch64ISD::SST1_SXTW_SCALED"; case AArch64ISD::SST1_UXTW_SCALED: return "AArch64ISD::SST1_UXTW_SCALED"; case AArch64ISD::SST1_IMM: return "AArch64ISD::SST1_IMM"; + case AArch64ISD::SSTNT1: return "AArch64ISD::SSTNT1"; case AArch64ISD::LDP: return "AArch64ISD::LDP"; case AArch64ISD::STP: return "AArch64ISD::STP"; case AArch64ISD::STNP: return "AArch64ISD::STNP"; @@ -10348,6 +10351,7 @@ case AArch64ISD::GLD1_UXTW: case AArch64ISD::GLD1_UXTW_SCALED: case AArch64ISD::GLD1_IMM: + case AArch64ISD::GLDNT1: MemVT = cast(Src->getOperand(4))->getVT(); break; default: @@ -12508,7 +12512,7 @@ DAG.getConstant(MinOffset, DL, MVT::i64)); } -static SDValue performST1ScatterCombine(SDNode *N, SelectionDAG &DAG, +static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG, unsigned Opcode, bool OnlyPackedOffsets = true) { const SDValue Src = N->getOperand(2); @@ -12536,11 +12540,11 @@ SDValue Offset = N->getOperand(5); // SST1_IMM requires that the offset is an immediate: - // * multiple of #SizeInBytes - // * in the range [0, 31 x #SizeInBytes] - // where #SizeInBytes is the size in bytes of the stored - // items. For immediates outside that range and non-immediate scalar offsets use - // SST1 or SST1_UXTW instead. 
+ // * multiple of #SizeInBytes + // * in the range [0, 31 x #SizeInBytes] + // where #SizeInBytes is the size in bytes of the stored items. For + // immediates outside that range and non-immediate scalar offsets use SST1 or + // SST1_UXTW instead. if (Opcode == AArch64ISD::SST1_IMM) { uint64_t MaxIndex = 31; uint64_t SrcElSize = SrcElVT.getStoreSize().getKnownMinSize(); @@ -12576,7 +12580,8 @@ EVT HwSrcVt = getSVEContainerType(SrcVT); // Keep the original type of the input data to store - this is needed to - // differentiate between ST1B, ST1H, ST1W and ST1D. For FP values we want the + // differentiate between the actual data sizes and instructions, e.g. ST1B, + // ST1H, ST1W and ST1D for regular scatter stores. For FP values we want the // integer equivalent, so just use HwSrcVt. SDValue InputVT = DAG.getValueType(SrcVT); if (SrcVT.isFloatingPoint()) @@ -12600,7 +12605,7 @@ return DAG.getNode(Opcode, DL, VTs, Ops); } -static SDValue performLD1GatherCombine(SDNode *N, SelectionDAG &DAG, +static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG, unsigned Opcode, bool OnlyPackedOffsets = true) { EVT RetVT = N->getValueType(0); @@ -12608,6 +12613,7 @@ "Gather loads are only possible for SVE vectors"); SDLoc DL(N); + // Make sure that the loaded data will fit into an SVE register if (RetVT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock) return SDValue(); @@ -12619,9 +12625,9 @@ SDValue Offset = N->getOperand(4); // GLD1_IMM requires that the offset is an immediate: - // * multiple of #SizeInBytes - // * in the range [0, 31 x #SizeInBytes] - // where #SizeInBytes is the size in bytes of the loaded items. For immediates + // * multiple of #SizeInBytes + // * in the range [0, 31 x #SizeInBytes] + // where #SizeInBytes is the size in bytes of the loaded items. For immediates // outside that range and non-immediate scalar offsets use GLD1 or GLD1_UXTW // instead. if (Opcode == AArch64ISD::GLD1_IMM) { @@ -12658,10 +12664,10 @@ // Return value type that is representable in hardware EVT HwRetVt = getSVEContainerType(RetVT); - // Keep the original output value type around - this will better inform - // optimisations (e.g. instruction folding when load is followed by - // zext/sext). This will only be used for ints, so the value for FPs - // doesn't matter. + // Keep the original output value type around - this is needed to + // differentiate between the actual data sizes and instructions, e.g. LD1B, + // LD1H, LD1W and LD1D. For FP values we want the integer equivalent, so just + // use HwRetVT. 
SDValue OutVT = DAG.getValueType(RetVT); if (RetVT.isFloatingPoint()) OutVT = DAG.getValueType(HwRetVt); @@ -12685,7 +12691,6 @@ return DAG.getMergeValues({Load, LoadChain}, DL); } - static SDValue performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { @@ -12729,6 +12734,9 @@ case AArch64ISD::GLD1_IMM: NewOpc = AArch64ISD::GLD1S_IMM; break; + case AArch64ISD::GLDNT1: + NewOpc = AArch64ISD::GLDNT1S; + break; default: return SDValue(); } @@ -12842,48 +12850,52 @@ return performNEONPostLDSTCombine(N, DCI, DAG); case Intrinsic::aarch64_sve_ldnt1: return performLDNT1Combine(N, DAG); + case Intrinsic::aarch64_sve_ldnt1_gather: + return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1); case Intrinsic::aarch64_sve_ldnf1: return performLDNF1Combine(N, DAG, AArch64ISD::LDNF1); case Intrinsic::aarch64_sve_ldff1: return performLDNF1Combine(N, DAG, AArch64ISD::LDFF1); case Intrinsic::aarch64_sve_stnt1: return performSTNT1Combine(N, DAG); + case Intrinsic::aarch64_sve_stnt1_scatter: + return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1); case Intrinsic::aarch64_sve_ld1_gather: - return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1); + return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1); case Intrinsic::aarch64_sve_ld1_gather_index: - return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SCALED); + return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_SCALED); case Intrinsic::aarch64_sve_ld1_gather_sxtw: - return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SXTW, + return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_SXTW, /*OnlyPackedOffsets=*/false); case Intrinsic::aarch64_sve_ld1_gather_uxtw: - return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_UXTW, + return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_UXTW, /*OnlyPackedOffsets=*/false); case Intrinsic::aarch64_sve_ld1_gather_sxtw_index: - return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SXTW_SCALED, + return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_SXTW_SCALED, /*OnlyPackedOffsets=*/false); case Intrinsic::aarch64_sve_ld1_gather_uxtw_index: - return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_UXTW_SCALED, + return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_UXTW_SCALED, /*OnlyPackedOffsets=*/false); case Intrinsic::aarch64_sve_ld1_gather_scalar_offset: - return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_IMM); + return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_IMM); case Intrinsic::aarch64_sve_st1_scatter: - return performST1ScatterCombine(N, DAG, AArch64ISD::SST1); + return performScatterStoreCombine(N, DAG, AArch64ISD::SST1); case Intrinsic::aarch64_sve_st1_scatter_index: - return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_SCALED); + return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SCALED); case Intrinsic::aarch64_sve_st1_scatter_sxtw: - return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_SXTW, + return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SXTW, /*OnlyPackedOffsets=*/false); case Intrinsic::aarch64_sve_st1_scatter_uxtw: - return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_UXTW, + return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_UXTW, /*OnlyPackedOffsets=*/false); case Intrinsic::aarch64_sve_st1_scatter_sxtw_index: - return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_SXTW_SCALED, + return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SXTW_SCALED, /*OnlyPackedOffsets=*/false); case Intrinsic::aarch64_sve_st1_scatter_uxtw_index: - return 
performST1ScatterCombine(N, DAG, AArch64ISD::SST1_UXTW_SCALED, + return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_UXTW_SCALED, /*OnlyPackedOffsets=*/false); case Intrinsic::aarch64_sve_st1_scatter_scalar_offset: - return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_IMM); + return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_IMM); default: break; } diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -15,6 +15,7 @@ SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1> ]>; +// Gather Loads def SDT_AArch64_GLD1 : SDTypeProfile<1, 4, [ SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>, SDTCisVec<3>, SDTCisVT<4, OtherVT>, SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1> @@ -25,6 +26,12 @@ SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1> ]>; +def SDT_AArch64_GLDNT1 : SDTypeProfile<1, 4, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>, SDTCisVT<4, OtherVT>, + SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1> +]>; + +// Scatter Stores def SDT_AArch64_SST1 : SDTypeProfile<0, 5, [ SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>, SDTCisVec<3>, SDTCisVT<4, OtherVT>, SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1> @@ -35,6 +42,11 @@ SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1> ]>; +def SDT_AArch64_SSTNT1 : SDTypeProfile<0, 5, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>, SDTCisVT<4, OtherVT>, + SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1> +]>; + def AArch64st1_scatter : SDNode<"AArch64ISD::SST1", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>; def AArch64st1_scatter_scaled : SDNode<"AArch64ISD::SST1_SCALED", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>; def AArch64st1_scatter_uxtw : SDNode<"AArch64ISD::SST1_UXTW", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>; @@ -53,6 +65,11 @@ def AArch64ld1_gather_sxtw_scaled : SDNode<"AArch64ISD::GLD1_SXTW_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; def AArch64ld1_gather_imm : SDNode<"AArch64ISD::GLD1_IMM", SDT_AArch64_GLD1_IMM, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; +def AArch64ldnt1_gather : SDNode<"AArch64ISD::GLDNT1", SDT_AArch64_GLDNT1, [SDNPHasChain, SDNPMayLoad]>; +def AArch64ldnt1s_gather : SDNode<"AArch64ISD::GLDNT1S", SDT_AArch64_GLDNT1, [SDNPHasChain, SDNPMayLoad]>; + +def AArch64stnt1_scatter : SDNode<"AArch64ISD::SSTNT1", SDT_AArch64_SSTNT1, [SDNPHasChain, SDNPMayStore]>; + // SVE CNT/INC/RDVL def sve_rdvl_imm : ComplexPattern">; def sve_cnth_imm : ComplexPattern">; @@ -1752,32 +1769,32 @@ def EXT_ZZI_B : sve2_int_perm_extract_i_cons<"ext">; // SVE2 non-temporal gather loads - defm LDNT1SB_ZZR_S : sve2_mem_gldnt_vs<0b00000, "ldnt1sb", Z_s, ZPR32>; - defm LDNT1B_ZZR_S : sve2_mem_gldnt_vs<0b00001, "ldnt1b", Z_s, ZPR32>; - defm LDNT1SH_ZZR_S : sve2_mem_gldnt_vs<0b00100, "ldnt1sh", Z_s, ZPR32>; - defm LDNT1H_ZZR_S : sve2_mem_gldnt_vs<0b00101, "ldnt1h", Z_s, ZPR32>; - defm LDNT1W_ZZR_S : sve2_mem_gldnt_vs<0b01001, "ldnt1w", Z_s, ZPR32>; - - defm LDNT1SB_ZZR_D : sve2_mem_gldnt_vs<0b10000, "ldnt1sb", Z_d, ZPR64>; - defm LDNT1B_ZZR_D : sve2_mem_gldnt_vs<0b10010, "ldnt1b", Z_d, ZPR64>; - defm LDNT1SH_ZZR_D : sve2_mem_gldnt_vs<0b10100, "ldnt1sh", Z_d, ZPR64>; - defm LDNT1H_ZZR_D : sve2_mem_gldnt_vs<0b10110, "ldnt1h", Z_d, ZPR64>; - defm LDNT1SW_ZZR_D : sve2_mem_gldnt_vs<0b11000, "ldnt1sw", Z_d, ZPR64>; - defm LDNT1W_ZZR_D : sve2_mem_gldnt_vs<0b11010, "ldnt1w", Z_d, ZPR64>; - defm LDNT1D_ZZR_D 
: sve2_mem_gldnt_vs<0b11110, "ldnt1d", Z_d, ZPR64>; + defm LDNT1SB_ZZR_S : sve2_mem_gldnt_32b_ptrs<0b00000, "ldnt1sb", AArch64ldnt1s_gather, nxv4i8>; + defm LDNT1B_ZZR_S : sve2_mem_gldnt_32b_ptrs<0b00001, "ldnt1b", AArch64ldnt1_gather, nxv4i8>; + defm LDNT1SH_ZZR_S : sve2_mem_gldnt_32b_ptrs<0b00100, "ldnt1sh", AArch64ldnt1s_gather, nxv4i16>; + defm LDNT1H_ZZR_S : sve2_mem_gldnt_32b_ptrs<0b00101, "ldnt1h", AArch64ldnt1_gather, nxv4i16>; + defm LDNT1W_ZZR_S : sve2_mem_gldnt_32b_ptrs<0b01001, "ldnt1w", AArch64ldnt1_gather, nxv4i32>; + + defm LDNT1SB_ZZR_D : sve2_mem_gldnt_64b_ptrs<0b10000, "ldnt1sb", AArch64ldnt1s_gather, nxv2i8>; + defm LDNT1B_ZZR_D : sve2_mem_gldnt_64b_ptrs<0b10010, "ldnt1b", AArch64ldnt1_gather, nxv2i8>; + defm LDNT1SH_ZZR_D : sve2_mem_gldnt_64b_ptrs<0b10100, "ldnt1sh", AArch64ldnt1s_gather, nxv2i16>; + defm LDNT1H_ZZR_D : sve2_mem_gldnt_64b_ptrs<0b10110, "ldnt1h", AArch64ldnt1_gather, nxv2i16>; + defm LDNT1SW_ZZR_D : sve2_mem_gldnt_64b_ptrs<0b11000, "ldnt1sw", AArch64ldnt1s_gather, nxv2i32>; + defm LDNT1W_ZZR_D : sve2_mem_gldnt_64b_ptrs<0b11010, "ldnt1w", AArch64ldnt1_gather, nxv2i32>; + defm LDNT1D_ZZR_D : sve2_mem_gldnt_64b_ptrs<0b11110, "ldnt1d", AArch64ldnt1_gather, nxv2i64>; // SVE2 vector splice (constructive) defm SPLICE_ZPZZ : sve2_int_perm_splice_cons<"splice">; // SVE2 non-temporal scatter stores - defm STNT1B_ZZR_S : sve2_mem_sstnt_vs<0b001, "stnt1b", Z_s, ZPR32>; - defm STNT1H_ZZR_S : sve2_mem_sstnt_vs<0b011, "stnt1h", Z_s, ZPR32>; - defm STNT1W_ZZR_S : sve2_mem_sstnt_vs<0b101, "stnt1w", Z_s, ZPR32>; - - defm STNT1B_ZZR_D : sve2_mem_sstnt_vs<0b000, "stnt1b", Z_d, ZPR64>; - defm STNT1H_ZZR_D : sve2_mem_sstnt_vs<0b010, "stnt1h", Z_d, ZPR64>; - defm STNT1W_ZZR_D : sve2_mem_sstnt_vs<0b100, "stnt1w", Z_d, ZPR64>; - defm STNT1D_ZZR_D : sve2_mem_sstnt_vs<0b110, "stnt1d", Z_d, ZPR64>; + defm STNT1B_ZZR_S : sve2_mem_sstnt_32b_ptrs<0b001, "stnt1b", AArch64stnt1_scatter, nxv4i8>; + defm STNT1H_ZZR_S : sve2_mem_sstnt_32b_ptrs<0b011, "stnt1h", AArch64stnt1_scatter, nxv4i16>; + defm STNT1W_ZZR_S : sve2_mem_sstnt_32b_ptrs<0b101, "stnt1w", AArch64stnt1_scatter, nxv4i32>; + + defm STNT1B_ZZR_D : sve2_mem_sstnt_64b_ptrs<0b000, "stnt1b", AArch64stnt1_scatter, nxv2i8>; + defm STNT1H_ZZR_D : sve2_mem_sstnt_64b_ptrs<0b010, "stnt1h", AArch64stnt1_scatter, nxv2i16>; + defm STNT1W_ZZR_D : sve2_mem_sstnt_64b_ptrs<0b100, "stnt1w", AArch64stnt1_scatter, nxv2i32>; + defm STNT1D_ZZR_D : sve2_mem_sstnt_64b_ptrs<0b110, "stnt1d", AArch64stnt1_scatter, nxv2i64>; // SVE2 table lookup (three sources) defm TBL_ZZZZ : sve2_int_perm_tbl<"tbl">; diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -4944,16 +4944,36 @@ let mayStore = 1; } -multiclass sve2_mem_sstnt_vs opc, string asm, - RegisterOperand listty, ZPRRegOp zprty> { - def _REAL : sve2_mem_sstnt_vs_base; +multiclass sve2_mem_sstnt_32b_ptrs opc, string asm, + SDPatternOperator op, + ValueType vt> { + def _REAL : sve2_mem_sstnt_vs_base; def : InstAlias(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>; + (!cast(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, GPR64:$Rm), 0>; def : InstAlias(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 0>; + (!cast(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, XZR), 0>; def : InstAlias(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 1>; + (!cast(NAME # _REAL) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, XZR), 1>; + + def : Pat <(op 
(nxv4i32 ZPR32:$Zt), (nxv4i1 PPR3bAny:$Pg), (nxv4i32 ZPR32:$Zn), (i64 GPR64:$Rm), vt), + (!cast(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, GPR64:$Rm)>; +} + +multiclass sve2_mem_sstnt_64b_ptrs opc, string asm, + SDPatternOperator op, + ValueType vt> { + def _REAL : sve2_mem_sstnt_vs_base; + + def : InstAlias(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm), 0>; + def : InstAlias(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 0>; + def : InstAlias(NAME # _REAL) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 1>; + + def : Pat <(op (nxv2i64 ZPR64:$Zt), (nxv2i1 PPR3bAny:$Pg), (nxv2i64 ZPR64:$Zn), (i64 GPR64:$Rm), vt), + (!cast(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm)>; } class sve_mem_sst_sv opc, bit xs, bit scaled, string asm, @@ -6377,17 +6397,38 @@ let mayLoad = 1; } -multiclass sve2_mem_gldnt_vs opc, string asm, - RegisterOperand listty, ZPRRegOp zprty> { - def _REAL : sve2_mem_gldnt_vs_base; +multiclass sve2_mem_gldnt_32b_ptrs opc, string asm, + SDPatternOperator op, + ValueType vt> { + def _REAL : sve2_mem_gldnt_vs_base; + + def : InstAlias(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, GPR64:$Rm), 0>; + def : InstAlias(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, XZR), 0>; + def : InstAlias(NAME # _REAL) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, XZR), 1>; + + def : Pat <(nxv4i32 (op (nxv4i1 PPR3bAny:$Pg), (nxv4i32 ZPR32:$Zd), (i64 GPR64:$Rm), vt)), + (!cast(NAME # _REAL) PPR3bAny:$Pg, ZPR32:$Zd, GPR64:$Rm)>; +} + +multiclass sve2_mem_gldnt_64b_ptrs opc, string asm, + SDPatternOperator op, + ValueType vt> { + def _REAL : sve2_mem_gldnt_vs_base; def : InstAlias(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>; + (!cast(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm), 0>; def : InstAlias(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 0>; + (!cast(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 0>; def : InstAlias(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 1>; + (!cast(NAME # _REAL) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 1>; + + def : Pat <(nxv2i64 (op (nxv2i1 PPR3bAny:$Pg), (nxv2i64 ZPR64:$Zd), (i64 GPR64:$Rm), vt)), + (!cast(NAME # _REAL) PPR3bAny:$Pg, ZPR64:$Zd, GPR64:$Rm)>; } //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-gather-loads.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-gather-loads.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-gather-loads.ll @@ -0,0 +1,188 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s + +; +; LDNT1B, LDNT1W, LDNT1H, LDNT1D: vector base + scalar offset +; ldnt1b { z0.s }, p0/z, [z0.s, x0] +; + +; LDNT1B +define @gldnt1b_s( %pg, %base, i64 %offset) { +; CHECK-LABEL: gldnt1b_s: +; CHECK: ldnt1b { z0.s }, p0/z, [z0.s, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.nxv4i8.nxv4i32( %pg, + %base, + i64 %offset) + %res = zext %load to + ret %res +} + +define @gldnt1b_d( %pg, %base, i64 %offset) { +; CHECK-LABEL: gldnt1b_d: +; CHECK: ldnt1b { z0.d }, p0/z, [z0.d, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.nxv2i8.nxv2i64( %pg, + %base, + i64 %offset) + %res = zext %load to + ret %res +} + +; LDNT1H +define @gldnt1h_s( %pg, %base, i64 %offset) { +; CHECK-LABEL: gldnt1h_s: +; CHECK: ldnt1h { z0.s }, p0/z, [z0.s, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.nxv416.nxv4i32( %pg, + %base, + i64 %offset) + %res = zext %load to + ret %res 
+}
+
+define <vscale x 2 x i64> @gldnt1h_d(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: gldnt1h_d:
+; CHECK: ldnt1h { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ldnt1.gather.nxv2i16.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset)
+  %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+; LDNT1W
+define <vscale x 4 x i32> @gldnt1w_s(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset) {
+; CHECK-LABEL: gldnt1w_s:
+; CHECK: ldnt1w { z0.s }, p0/z, [z0.s, x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 4 x i32> @llvm.aarch64.sve.ldnt1.gather.nxv4i32.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset)
+  ret <vscale x 4 x i32> %load
+}
+
+define <vscale x 4 x float> @gldnt1w_s_float(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset) {
+; CHECK-LABEL: gldnt1w_s_float:
+; CHECK: ldnt1w { z0.s }, p0/z, [z0.s, x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 4 x float> @llvm.aarch64.sve.ldnt1.gather.nxv4f32.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset)
+  ret <vscale x 4 x float> %load
+}
+
+define <vscale x 2 x i64> @gldnt1w_d(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: gldnt1w_d:
+; CHECK: ldnt1w { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldnt1.gather.nxv2i32.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset)
+  %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+; LDNT1D
+define <vscale x 2 x i64> @gldnt1d_d(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: gldnt1d_d:
+; CHECK: ldnt1d { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.gather.nxv2i64.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset)
+  ret <vscale x 2 x i64> %load
+}
+
+; LDNT1D
+define <vscale x 2 x double> @gldnt1d_d_double(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: gldnt1d_d_double:
+; CHECK: ldnt1d { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 2 x double> @llvm.aarch64.sve.ldnt1.gather.nxv2f64.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset)
+  ret <vscale x 2 x double> %load
+}
+
+;
+; LDNT1SB, LDNT1SH, LDNT1SW: vector base + scalar offset
+;   ldnt1sb { z0.s }, p0/z, [z0.s, x0]
+;
+
+; LDNT1SB
+define <vscale x 4 x i32> @gldnt1sb_s(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset) {
+; CHECK-LABEL: gldnt1sb_s:
+; CHECK: ldnt1sb { z0.s }, p0/z, [z0.s, x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ldnt1.gather.nxv4i8.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset)
+  %res = sext <vscale x 4 x i8> %load to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 2 x i64> @gldnt1sb_d(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: gldnt1sb_d:
+; CHECK: ldnt1sb { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ldnt1.gather.nxv2i8.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset)
+  %res = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+; LDNT1SH
+define <vscale x 4 x i32> @gldnt1sh_s(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset) {
+; CHECK-LABEL: gldnt1sh_s:
+; CHECK: ldnt1sh { z0.s }, p0/z, [z0.s, x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ldnt1.gather.nxv4i16.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset)
+  %res = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 2 x i64> @gldnt1sh_d(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: gldnt1sh_d:
+; CHECK: ldnt1sh { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ldnt1.gather.nxv2i16.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset)
+  %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+; LDNT1SW
+define <vscale x 2 x i64> @gldnt1sw_d(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: gldnt1sw_d:
+; CHECK: ldnt1sw { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldnt1.gather.nxv2i32.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset)
+  %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+; LDNT1B/LDNT1SB
+declare <vscale x 4 x i8> @llvm.aarch64.sve.ldnt1.gather.nxv4i8.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, i64)
+declare <vscale x 2 x i8> @llvm.aarch64.sve.ldnt1.gather.nxv2i8.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+
+; LDNT1H/LDNT1SH
+declare <vscale x 4 x i16> @llvm.aarch64.sve.ldnt1.gather.nxv4i16.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, i64)
+declare <vscale x 2 x i16> @llvm.aarch64.sve.ldnt1.gather.nxv2i16.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+
+; LDNT1W/LDNT1SW
+declare <vscale x 4 x i32> @llvm.aarch64.sve.ldnt1.gather.nxv4i32.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, i64)
+declare <vscale x 2 x i32> @llvm.aarch64.sve.ldnt1.gather.nxv2i32.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+
+declare <vscale x 4 x float> @llvm.aarch64.sve.ldnt1.gather.nxv4f32.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, i64)
+
+; LDNT1D
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.gather.nxv2i64.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+
+declare <vscale x 2 x double> @llvm.aarch64.sve.ldnt1.gather.nxv2f64.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, i64)
diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-scatter-stores.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-scatter-stores.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-scatter-stores.ll
@@ -0,0 +1,134 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s
+
+;
+; STNT1B, STNT1W, STNT1H, STNT1D: vector base + scalar offset
+;   stnt1b { z0.s }, p0, [z1.s, x0]
+;
+
+; STNT1B
+define void @stnt1b_s(<vscale x 4 x i32> %data, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset) {
+; CHECK-LABEL: stnt1b_s:
+; CHECK: stnt1b { z0.s }, p0, [z1.s, x0]
+; CHECK-NEXT: ret
+  %data_trunc = trunc <vscale x 4 x i32> %data to <vscale x 4 x i8>
+  call void @llvm.aarch64.sve.stnt1.scatter.nxv4i8.nxv4i32(<vscale x 4 x i8> %data_trunc, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset)
+  ret void
+}
+
+define void @stnt1b_d(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: stnt1b_d:
+; CHECK: stnt1b { z0.d }, p0, [z1.d, x0]
+; CHECK-NEXT: ret
+  %data_trunc = trunc <vscale x 2 x i64> %data to <vscale x 2 x i8>
+  call void @llvm.aarch64.sve.stnt1.scatter.nxv2i8.nxv2i64(<vscale x 2 x i8> %data_trunc, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset)
+  ret void
+}
+
+; STNT1H
+define void @stnt1h_s(<vscale x 4 x i32> %data, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset) {
+; CHECK-LABEL: stnt1h_s:
+; CHECK: stnt1h { z0.s }, p0, [z1.s, x0]
+; CHECK-NEXT: ret
+  %data_trunc = trunc <vscale x 4 x i32> %data to <vscale x 4 x i16>
+  call void @llvm.aarch64.sve.stnt1.scatter.nxv4i16.nxv4i32(<vscale x 4 x i16> %data_trunc, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset)
+  ret void
+}
+
+define void @stnt1h_d(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: stnt1h_d:
+; CHECK: stnt1h { z0.d }, p0, [z1.d, x0]
+; CHECK-NEXT: ret
+  %data_trunc = trunc <vscale x 2 x i64> %data to <vscale x 2 x i16>
+  call void @llvm.aarch64.sve.stnt1.scatter.nxv2i16.nxv2i64(<vscale x 2 x i16> %data_trunc, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset)
+  ret void
+}
+
+; STNT1W
+define void @stnt1w_s(<vscale x 4 x i32> %data, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset) {
+; CHECK-LABEL: stnt1w_s:
+; CHECK: stnt1w { z0.s }, p0, [z1.s, x0]
+; CHECK-NEXT: ret
+  call void @llvm.aarch64.sve.stnt1.scatter.nxv4i32.nxv4i32(<vscale x 4 x i32> %data, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset)
+  ret void
+}
+
+define void @stnt1w_f32_s(<vscale x 4 x float> %data, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset) {
+; CHECK-LABEL: stnt1w_f32_s:
+; CHECK: stnt1w { z0.s }, p0, [z1.s, x0]
+; CHECK-NEXT: ret
+  call void @llvm.aarch64.sve.stnt1.scatter.nxv4f32.nxv4i32(<vscale x 4 x float> %data, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset)
+  ret void
+}
+
+define void @stnt1w_d(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: stnt1w_d:
+; CHECK: stnt1w { z0.d }, p0, [z1.d, x0]
+; CHECK-NEXT: ret
+  %data_trunc = trunc <vscale x 2 x i64> %data to <vscale x 2 x i32>
+  call void @llvm.aarch64.sve.stnt1.scatter.nxv2i32.nxv2i64(<vscale x 2 x i32> %data_trunc, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset)
+  ret void
+}
+
+; STNT1D
+define void @stnt1d_d(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: stnt1d_d:
+; CHECK: stnt1d { z0.d }, p0, [z1.d, x0]
+; CHECK-NEXT: ret
+  call void @llvm.aarch64.sve.stnt1.scatter.nxv2i64.nxv2i64(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset)
+  ret void
+}
+
+define void @stnt1d_f64_d(<vscale x 2 x double> %data, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: stnt1d_f64_d:
+; CHECK: stnt1d { z0.d }, p0, [z1.d, x0]
+; CHECK-NEXT: ret
+  call void @llvm.aarch64.sve.stnt1.scatter.nxv2f64.nxv2i64(<vscale x 2 x double> %data, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset)
+  ret void
+}
+
+; STNT1B
+declare void @llvm.aarch64.sve.stnt1.scatter.nxv2i8.nxv2i64(<vscale x 2 x i8>, <vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+declare void @llvm.aarch64.sve.stnt1.scatter.nxv4i8.nxv4i32(<vscale x 4 x i8>, <vscale x 4 x i1>, <vscale x 4 x i32>, i64)
+
+; STNT1H
+declare void @llvm.aarch64.sve.stnt1.scatter.nxv2i16.nxv2i64(<vscale x 2 x i16>, <vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+declare void @llvm.aarch64.sve.stnt1.scatter.nxv4i16.nxv4i32(<vscale x 4 x i16>, <vscale x 4 x i1>, <vscale x 4 x i32>, i64)
+
+; STNT1W
+declare void @llvm.aarch64.sve.stnt1.scatter.nxv2i32.nxv2i64(<vscale x 2 x i32>, <vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+declare void @llvm.aarch64.sve.stnt1.scatter.nxv4i32.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, <vscale x 4 x i32>, i64)
+
+declare void @llvm.aarch64.sve.stnt1.scatter.nxv4f32.nxv4i32(<vscale x 4 x float>, <vscale x 4 x i1>, <vscale x 4 x i32>, i64)
+
+; STNT1D
+declare void @llvm.aarch64.sve.stnt1.scatter.nxv2i64.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+
+declare void @llvm.aarch64.sve.stnt1.scatter.nxv2f32.nxv2i64(<vscale x 2 x float>, <vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+declare void @llvm.aarch64.sve.stnt1.scatter.nxv2f64.nxv2i64(<vscale x 2 x double>, <vscale x 2 x i1>, <vscale x 2 x i64>, i64)
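
Usage sketch (illustrative, not part of the patch): at the source level, non-temporal gathers and scatters with a vector of bases plus a scalar byte offset are exposed through the SVE2 ACLE. The snippet below assumes the arm_sve.h spellings svldnt1_gather_u32base_offset_u32 and svstnt1_scatter_u32base_offset_u32 and a compiler with SVE2 enabled (e.g. -march=armv8-a+sve2); such calls are expected to reach the backend as the llvm.aarch64.sve.ldnt1.gather / llvm.aarch64.sve.stnt1.scatter intrinsics introduced here and to select the LDNT1W/STNT1W forms exercised by the tests above.

    #include <arm_sve.h>

    // Gather 32-bit elements from [bases.s + offset] without allocating them
    // in the cache, then scatter them back one element (4 bytes) further on.
    void copy_nt(svbool_t pg, svuint32_t bases, int64_t offset) {
      svuint32_t v = svldnt1_gather_u32base_offset_u32(pg, bases, offset);
      svstnt1_scatter_u32base_offset_u32(pg, bases, offset + 4, v);
    }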