Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -9428,13 +9428,13 @@
 }
 
 // Fold sext/zext of index into index type.
-bool refineIndexType(MaskedScatterSDNode *MSC, SDValue &Index, bool Scaled,
-                     SelectionDAG &DAG) {
+bool refineIndexType(MaskedGatherScatterSDNode *MGS, SDValue &Index,
+                     bool Scaled, SelectionDAG &DAG) {
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   SDValue Op = Index.getOperand(0);
 
   if (Index.getOpcode() == ISD::ZERO_EXTEND) {
-    MSC->setIndexType(Scaled ? ISD::UNSIGNED_SCALED : ISD::UNSIGNED_UNSCALED);
+    MGS->setIndexType(Scaled ? ISD::UNSIGNED_SCALED : ISD::UNSIGNED_UNSCALED);
     if (TLI.shouldRemoveExtendFromGSIndex(Op.getValueType())) {
       Index = Op;
       return true;
@@ -9442,7 +9442,7 @@
   }
 
   if (Index.getOpcode() == ISD::SIGN_EXTEND) {
-    MSC->setIndexType(Scaled ? ISD::SIGNED_SCALED : ISD::SIGNED_UNSCALED);
+    MGS->setIndexType(Scaled ? ISD::SIGNED_SCALED : ISD::SIGNED_UNSCALED);
     if (TLI.shouldRemoveExtendFromGSIndex(Op.getValueType())) {
       Index = Op;
       return true;
@@ -9511,11 +9511,30 @@
 SDValue DAGCombiner::visitMGATHER(SDNode *N) {
   MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(N);
   SDValue Mask = MGT->getMask();
+  SDValue Chain = MGT->getChain();
+  SDValue Index = MGT->getIndex();
+  SDValue Scale = MGT->getScale();
+  SDValue PassThru = MGT->getPassThru();
+  SDValue BasePtr = MGT->getBasePtr();
   SDLoc DL(N);
 
   // Zap gathers with a zero mask.
   if (ISD::isBuildVectorAllZeros(Mask.getNode()))
-    return CombineTo(N, MGT->getPassThru(), MGT->getChain());
+    return CombineTo(N, PassThru, MGT->getChain());
+
+  if (refineUniformBase(BasePtr, Index, DAG)) {
+    SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
+    return DAG.getMaskedGather(DAG.getVTList(N->getValueType(0), MVT::Other),
+                               PassThru.getValueType(), DL, Ops,
+                               MGT->getMemOperand(), MGT->getIndexType());
+  }
+
+  if (refineIndexType(MGT, Index, MGT->isIndexScaled(), DAG)) {
+    SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
+    return DAG.getMaskedGather(DAG.getVTList(N->getValueType(0), MVT::Other),
+                               PassThru.getValueType(), DL, Ops,
+                               MGT->getMemOperand(), MGT->getIndexType());
+  }
 
   return SDValue();
 }
Index: llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1746,6 +1746,7 @@
   SDValue PassThru = MGT->getPassThru();
   SDValue Index = MGT->getIndex();
   SDValue Scale = MGT->getScale();
+  EVT MemoryVT = MGT->getMemoryVT();
   Align Alignment = MGT->getOriginalAlign();
 
   // Split Mask operand
@@ -1759,6 +1760,10 @@
       std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl);
   }
 
+  EVT LoMemVT, HiMemVT;
+  // Split MemoryVT
+  std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
+
   SDValue PassThruLo, PassThruHi;
   if (getTypeAction(PassThru.getValueType()) == TargetLowering::TypeSplitVector)
     GetSplitVector(PassThru, PassThruLo, PassThruHi);
@@ -1777,11 +1782,11 @@
       MGT->getRanges());
 
   SDValue OpsLo[] = {Ch, PassThruLo, MaskLo, Ptr, IndexLo, Scale};
-  Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoVT, dl, OpsLo,
+  Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoMemVT, dl, OpsLo,
                            MMO, MGT->getIndexType());
 
   SDValue OpsHi[] = {Ch, PassThruHi, MaskHi, Ptr, IndexHi, Scale};
-  Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiVT, dl, OpsHi,
+  Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiMemVT, dl, OpsHi,
                            MMO, MGT->getIndexType());
 
   // Build a factor node to remember that this load is independent of the
@@ -2421,11 +2426,11 @@
       MGT->getRanges());
 
   SDValue OpsLo[] = {Ch, PassThruLo, MaskLo, Ptr, IndexLo, Scale};
-  SDValue Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoVT, dl,
+  SDValue Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoMemVT, dl,
                                    OpsLo, MMO, MGT->getIndexType());
 
   SDValue OpsHi[] = {Ch, PassThruHi, MaskHi, Ptr, IndexHi, Scale};
-  SDValue Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiVT, dl,
+  SDValue Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiMemVT, dl,
                                    OpsHi, MMO, MGT->getIndexType());
 
   // Build a factor node to remember that this load is independent of the
Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -7339,17 +7339,22 @@
     return SDValue(E, 0);
   }
 
+  IndexType = TLI->getCanonicalIndexType(IndexType, VT, Ops[4]);
   auto *N = newSDNode<MaskedGatherSDNode>(dl.getIROrder(), dl.getDebugLoc(),
                                           VTs, VT, MMO, IndexType);
   createOperands(N, Ops);
 
   assert(N->getPassThru().getValueType() == N->getValueType(0) &&
          "Incompatible type of the PassThru value in MaskedGatherSDNode");
-  assert(N->getMask().getValueType().getVectorNumElements() ==
-             N->getValueType(0).getVectorNumElements() &&
+  assert(N->getMask().getValueType().getVectorElementCount() ==
+             N->getValueType(0).getVectorElementCount() &&
          "Vector width mismatch between mask and data");
-  assert(N->getIndex().getValueType().getVectorNumElements() >=
-             N->getValueType(0).getVectorNumElements() &&
+  assert(N->getIndex().getValueType().getVectorElementCount().isScalable() ==
+             N->getValueType(0).getVectorElementCount().isScalable() &&
+         "Scalable flags of index and data do not match");
+  assert(ElementCount::isKnownGE(
+             N->getIndex().getValueType().getVectorElementCount(),
+             N->getValueType(0).getVectorElementCount()) &&
          "Vector width mismatch between index and data");
   assert(isa<ConstantSDNode>(N->getScale()) &&
          cast<ConstantSDNode>(N->getScale())->getAPIntValue().isPowerOf2() &&
Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -4416,7 +4416,7 @@
   if (!UniformBase) {
     Base = DAG.getConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout()));
     Index = getValue(Ptr);
-    IndexType = ISD::SIGNED_SCALED;
+    IndexType = ISD::SIGNED_UNSCALED;
     Scale = DAG.getTargetConstant(1, sdl, TLI.getPointerTy(DAG.getDataLayout()));
   }
   SDValue Ops[] = { Root, Src0, Mask, Base, Index, Scale };
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -805,6 +805,7 @@
 
   SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
 
+  SDValue LowerMGATHER(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerMSCATTER(SDValue Op, SelectionDAG &DAG) const;
 
   SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1056,6 +1056,7 @@
       setOperationAction(ISD::SINT_TO_FP, VT, Custom);
       setOperationAction(ISD::FP_TO_UINT, VT, Custom);
       setOperationAction(ISD::FP_TO_SINT, VT, Custom);
+      setOperationAction(ISD::MGATHER, VT, Custom);
       setOperationAction(ISD::MSCATTER, VT, Custom);
       setOperationAction(ISD::MUL, VT, Custom);
       setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
@@ -1108,6 +1109,7 @@
                     MVT::nxv4f32, MVT::nxv2f64}) {
       setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
       setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
+      setOperationAction(ISD::MGATHER, VT, Custom);
       setOperationAction(ISD::MSCATTER, VT, Custom);
       setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
       setOperationAction(ISD::SELECT, VT, Custom);
@@ -3772,6 +3774,29 @@
   return ExtVal.getValueType().isScalableVector();
 }
 
+unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
+  std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
+      {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
+       AArch64ISD::GLD1_MERGE_ZERO},
+      {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
+       AArch64ISD::GLD1_UXTW_MERGE_ZERO},
+      {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
+       AArch64ISD::GLD1_MERGE_ZERO},
+      {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
+       AArch64ISD::GLD1_SXTW_MERGE_ZERO},
+      {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
+       AArch64ISD::GLD1_SCALED_MERGE_ZERO},
+      {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
+       AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO},
+      {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
+       AArch64ISD::GLD1_SCALED_MERGE_ZERO},
+      {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
+       AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO},
+  };
+  auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
+  return AddrModes.find(Key)->second;
+}
+
 unsigned getScatterVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
   std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
       {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
@@ -3795,7 +3820,7 @@
   return AddrModes.find(Key)->second;
 }
 
-bool getScatterIndexIsExtended(SDValue Index) {
+bool getGatherScatterIndexIsExtended(SDValue Index) {
   unsigned Opcode = Index.getOpcode();
   if (Opcode == ISD::SIGN_EXTEND_INREG)
     return true;
@@ -3813,6 +3838,57 @@
   return false;
 }
 
+SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op);
+  assert(MGT && "Can only custom lower gather load nodes");
+
+  SDValue Index = MGT->getIndex();
+  SDValue Chain = MGT->getChain();
+  SDValue PassThru = MGT->getPassThru();
+  SDValue Mask = MGT->getMask();
+  SDValue BasePtr = MGT->getBasePtr();
+
+  ISD::MemIndexType IndexType = MGT->getIndexType();
+  bool IsScaled =
+      IndexType == ISD::SIGNED_SCALED || IndexType == ISD::UNSIGNED_SCALED;
+  bool IsSigned =
+      IndexType == ISD::SIGNED_SCALED || IndexType == ISD::SIGNED_UNSCALED;
+  bool IdxNeedsExtend =
+      getGatherScatterIndexIsExtended(Index) ||
+      Index.getSimpleValueType().getVectorElementType() == MVT::i32;
+
+  EVT VT = PassThru.getSimpleValueType();
+  EVT MemVT = MGT->getMemoryVT();
+  SDValue InputVT = DAG.getValueType(MemVT);
+
+  if (VT.getVectorElementType() == MVT::bf16 &&
+      !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
+    return SDValue();
+
+  // Handle FP data
+  if (VT.isFloatingPoint()) {
+    VT = VT.changeVectorElementTypeToInteger();
+    ElementCount EC = VT.getVectorElementCount();
+    auto ScalarIntVT =
+        MVT::getIntegerVT(AArch64::SVEBitsPerBlock / EC.getKnownMinValue());
+    PassThru = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL,
+                           MVT::getVectorVT(ScalarIntVT, EC), PassThru);
+
+    InputVT = DAG.getValueType(MemVT.changeVectorElementTypeToInteger());
+  }
+
+  SDVTList VTs = DAG.getVTList(PassThru.getSimpleValueType(), MVT::Other);
+
+  if (getGatherScatterIndexIsExtended(Index))
+    Index = Index.getOperand(0);
+
+  SDValue Ops[] = {Chain, Mask, BasePtr, Index, InputVT, PassThru};
+  return DAG.getNode(getGatherVecOpcode(IsScaled, IsSigned, IdxNeedsExtend), DL,
+                     VTs, Ops);
+}
+
 SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
                                              SelectionDAG &DAG) const {
   SDLoc DL(Op);
@@ -3831,7 +3907,7 @@
   bool IsSigned =
       IndexType == ISD::SIGNED_SCALED || IndexType == ISD::SIGNED_UNSCALED;
   bool NeedsExtend =
-      getScatterIndexIsExtended(Index) ||
+      getGatherScatterIndexIsExtended(Index) ||
       Index.getSimpleValueType().getVectorElementType() == MVT::i32;
 
   EVT VT = StoreVal.getSimpleValueType();
@@ -3855,7 +3931,7 @@
     InputVT = DAG.getValueType(MemVT.changeVectorElementTypeToInteger());
   }
 
-  if (getScatterIndexIsExtended(Index))
+  if (getGatherScatterIndexIsExtended(Index))
     Index = Index.getOperand(0);
 
   SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, InputVT};
@@ -4140,6 +4216,8 @@
     return LowerINTRINSIC_WO_CHAIN(Op, DAG);
   case ISD::STORE:
     return LowerSTORE(Op, DAG);
+  case ISD::MGATHER:
+    return LowerMGATHER(Op, DAG);
   case ISD::MSCATTER:
     return LowerMSCATTER(Op, DAG);
   case ISD::VECREDUCE_SEQ_FADD:
Index: llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-scaled.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-scaled.ll
@@ -0,0 +1,158 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; unscaled unpacked 32-bit offsets
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define <vscale x 2 x i64> @masked_gather_nxv2i16(i16* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i16, i16* %base, <vscale x 2 x i32> %offsets
+  %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
+  %vals.zext = zext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x i64> @masked_gather_nxv2i32(i32* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i32, i32* %base, <vscale x 2 x i32> %offsets
+  %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
+  %vals.zext = zext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x i64> @masked_gather_nxv2i64(i64* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i64, i64* %base, <vscale x 2 x i32> %offsets
+  %vals = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x i64> undef)
+  ret <vscale x 2 x i64> %vals
+}
+
+define <vscale x 2 x half> @masked_gather_nxv2f16(half* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr half, half* %base, <vscale x 2 x i32> %offsets
+  %vals = call <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x half> undef)
+  ret <vscale x 2 x half> %vals
+}
+
+define <vscale x 2 x float> @masked_gather_nxv2f32(float* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr float, float* %base, <vscale x 2 x i32> %offsets
+  %vals = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x float> undef)
+  ret <vscale x 2 x float> %vals
+}
+
+define <vscale x 2 x double> @masked_gather_nxv2f64(double* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr double, double* %base, <vscale x 2 x i32> %offsets
+  %vals = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x double> undef)
+  ret <vscale x 2 x double> %vals
+}
+
+define <vscale x 2 x i64> @masked_sgather_nxv2i16(i16* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1sh { z0.d }, p0/z, [x0, z0.d, sxtw #1]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i16, i16* %base, <vscale x 2 x i32> %offsets
+  %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
+  %vals.sext = sext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.sext
+}
+
+define <vscale x 2 x i64> @masked_sgather_nxv2i32(i32* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1sw { z0.d }, p0/z, [x0, z0.d, sxtw #2]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i32, i32* %base, <vscale x 2 x i32> %offsets
+  %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
+  %vals.sext = sext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.sext
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; unscaled packed 32-bit offsets
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define <vscale x 4 x i32> @masked_gather_nxv4i16(i16* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0, z0.s, sxtw #1]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i16, i16* %base, <vscale x 4 x i32> %offsets
+  %vals = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
+  %vals.zext = zext <vscale x 4 x i16> %vals to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %vals.zext
+}
+
+define <vscale x 4 x i32> @masked_gather_nxv4i32(i32* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0, z0.s, sxtw #2]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i32, i32* %base, <vscale x 4 x i32> %offsets
+  %vals = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x i32*> %ptrs, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef)
+  ret <vscale x 4 x i32> %vals
+}
+
+define <vscale x 4 x half> @masked_gather_nxv4f16(half* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv4f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0, z0.s, sxtw #1]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr half, half* %base, <vscale x 4 x i32> %offsets
+  %vals = call <vscale x 4 x half> @llvm.masked.gather.nxv4f16(<vscale x 4 x half*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x half> undef)
+  ret <vscale x 4 x half> %vals
+}
+
+define <vscale x 4 x float> @masked_gather_nxv4f32(float* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0, z0.s, sxtw #2]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr float, float* %base, <vscale x 4 x i32> %offsets
+  %vals = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x float*> %ptrs, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x float> undef)
+  ret <vscale x 4 x float> %vals
+}
+
+define <vscale x 4 x i32> @masked_sgather_nxv4i16(i16* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1sh { z0.s }, p0/z, [x0, z0.s, sxtw #1]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i16, i16* %base, <vscale x 4 x i32> %offsets
+  %vals = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
+  %vals.sext = sext <vscale x 4 x i16> %vals to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %vals.sext
+}
+
+declare <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*>, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
+declare <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*>, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
+declare <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*>, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
+declare <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*>, i32, <vscale x 2 x i1>, <vscale x 2 x half>)
+declare <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*>, i32, <vscale x 2 x i1>, <vscale x 2 x float>)
+declare <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*>, i32, <vscale x 2 x i1>, <vscale x 2 x double>)
+
+declare <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*>, i32, <vscale x 4 x i1>, <vscale x 4 x i16>)
+declare <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x i32*>, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
+declare <vscale x 4 x half> @llvm.masked.gather.nxv4f16(<vscale x 4 x half*>, i32, <vscale x 4 x i1>, <vscale x 4 x half>)
+declare <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x float*>, i32, <vscale x 4 x i1>, <vscale x 4 x float>)
Index: llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-unscaled.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-unscaled.ll
@@ -0,0 +1,217 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; unscaled unpacked 32-bit offsets
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define <vscale x 2 x i64> @masked_gather_nxv2i8(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1b { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
+  %vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
+  %vals.zext = zext <vscale x 2 x i8> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x i64> @masked_gather_nxv2i16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
+  %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
+  %vals.zext = zext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x i64> @masked_gather_nxv2i32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
+  %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
+  %vals.zext = zext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x i64> @masked_gather_nxv2i64(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i64*>
+  %vals = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x i64> undef)
+  ret <vscale x 2 x i64> %vals
+}
+
+define <vscale x 2 x half> @masked_gather_nxv2f16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x half*>
+  %vals = call <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x half> undef)
+  ret <vscale x 2 x half> %vals
+}
+
+define <vscale x 2 x float> @masked_gather_nxv2f32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x float*>
+  %vals = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x float> undef)
+  ret <vscale x 2 x float> %vals
+}
+
+define <vscale x 2 x double> @masked_gather_nxv2f64(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x double*>
+  %vals = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x double> undef)
+  ret <vscale x 2 x double> %vals
+}
+
+define <vscale x 2 x i64> @masked_sgather_nxv2i8(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv2i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1sb { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
+  %vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
+  %vals.sext = sext <vscale x 2 x i8> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.sext
+}
+
+define <vscale x 2 x i64> @masked_sgather_nxv2i16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1sh { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
+  %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
+  %vals.sext = sext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.sext
+}
+
+define <vscale x 2 x i64> @masked_sgather_nxv2i32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1sw { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
+  %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
+  %vals.sext = sext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.sext
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; unscaled packed 32-bit offsets
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define <vscale x 4 x i32> @masked_gather_nxv4i8(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv4i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1b { z0.s }, p0/z, [x0, z0.s, sxtw]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
+  %vals = call <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x i8*> %ptrs, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
+  %vals.zext = zext <vscale x 4 x i8> %vals to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %vals.zext
+}
+
+define <vscale x 4 x i32> @masked_gather_nxv4i16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0, z0.s, sxtw]
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
+  %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i16*>
+  %vals = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
+  %vals.zext = zext <vscale x 4 x i16> %vals to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %vals.zext
+}
+
+define <vscale x 4 x i32> @masked_gather_nxv4i32(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0, z0.s, sxtw]
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
+  %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i32*>
+  %vals = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x i32*> %ptrs, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef)
+  ret <vscale x 4 x i32> %vals
+}
+
+define <vscale x 4 x half> @masked_gather_nxv4f16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv4f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0, z0.s, sxtw]
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
+  %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x half*>
+  %vals = call <vscale x 4 x half> @llvm.masked.gather.nxv4f16(<vscale x 4 x half*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x half> undef)
+  ret <vscale x 4 x half> %vals
+}
+
+define <vscale x 4 x float> @masked_gather_nxv4f32(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0, z0.s, sxtw]
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
+  %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x float*>
+  %vals = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x float*> %ptrs, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x float> undef)
+  ret <vscale x 4 x float> %vals
+}
+
+define <vscale x 4 x i32> @masked_sgather_nxv4i8(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv4i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1sb { z0.s }, p0/z, [x0, z0.s, sxtw]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
+  %vals = call <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x i8*> %ptrs, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
+  %vals.sext = sext <vscale x 4 x i8> %vals to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %vals.sext
+}
+
+define <vscale x 4 x i32> @masked_sgather_nxv4i16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1sh { z0.s }, p0/z, [x0, z0.s, sxtw]
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
+  %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i16*>
+  %vals = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
+  %vals.sext = sext <vscale x 4 x i16> %vals to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %vals.sext
+}
+
+declare <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*>, i32, <vscale x 2 x i1>, <vscale x 2 x i8>)
+declare <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*>, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
+declare <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*>, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
+declare <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*>, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
+declare <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*>, i32, <vscale x 2 x i1>, <vscale x 2 x half>)
+declare <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*>, i32, <vscale x 2 x i1>, <vscale x 2 x float>)
+declare <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*>, i32, <vscale x 2 x i1>, <vscale x 2 x double>)
+
+declare <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x i8*>, i32, <vscale x 4 x i1>, <vscale x 4 x i8>)
+declare <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*>, i32, <vscale x 4 x i1>, <vscale x 4 x i16>)
+declare <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x i32*>, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
+declare <vscale x 4 x half> @llvm.masked.gather.nxv4f16(<vscale x 4 x half*>, i32, <vscale x 4 x i1>, <vscale x 4 x half>)
+declare <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x float*>, i32, <vscale x 4 x i1>, <vscale x 4 x float>)
Index: llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-scaled.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-scaled.ll
@@ -0,0 +1,171 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; unscaled unpacked 32-bit offsets
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define <vscale x 2 x i64> @masked_gather_nxv2i16(i16* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d, uxtw #1]
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
+  %ptrs = getelementptr i16, i16* %base, <vscale x 2 x i64> %offsets.zext
+  %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
+  %vals.zext = zext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x i64> @masked_gather_nxv2i32(i32* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, uxtw #2]
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
+  %ptrs = getelementptr i32, i32* %base, <vscale x 2 x i64> %offsets.zext
+  %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
+  %vals.zext = zext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x i64> @masked_gather_nxv2i64(i64* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, z0.d, uxtw #3]
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
+  %ptrs = getelementptr i64, i64* %base, <vscale x 2 x i64> %offsets.zext
+  %vals = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x i64> undef)
+  ret <vscale x 2 x i64> %vals
+}
+
+define <vscale x 2 x half> @masked_gather_nxv2f16(half* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d, uxtw #1]
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
+  %ptrs = getelementptr half, half* %base, <vscale x 2 x i64> %offsets.zext
+  %vals = call <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x half> undef)
+  ret <vscale x 2 x half> %vals
+}
+
+define <vscale x 2 x float> @masked_gather_nxv2f32(float* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, uxtw #2]
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
+  %ptrs = getelementptr float, float* %base, <vscale x 2 x i64> %offsets.zext
+  %vals = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x float> undef)
+  ret <vscale x 2 x float> %vals
+}
+
+define <vscale x 2 x double> @masked_gather_nxv2f64(double* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, z0.d, uxtw #3]
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
+  %ptrs = getelementptr double, double* %base, <vscale x 2 x i64> %offsets.zext
+  %vals = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x double> undef)
+  ret <vscale x 2 x double> %vals
+}
+
+define <vscale x 2 x i64> @masked_sgather_nxv2i16(i16* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1sh { z0.d }, p0/z, [x0, z0.d, uxtw #1]
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
+  %ptrs = getelementptr i16, i16* %base, <vscale x 2 x i64> %offsets.zext
+  %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
+  %vals.sext = sext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.sext
+}
+
+define <vscale x 2 x i64> @masked_sgather_nxv2i32(i32* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1sw { z0.d }, p0/z, [x0, z0.d, uxtw #2]
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
+  %ptrs = getelementptr i32, i32* %base, <vscale x 2 x i64> %offsets.zext
+  %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
+  %vals.sext = sext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.sext
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; unscaled packed 32-bit offsets
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define <vscale x 4 x i32> @masked_gather_nxv4i16(i16* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0, z0.s, uxtw #1]
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
+  %ptrs = getelementptr i16, i16* %base, <vscale x 4 x i64> %offsets.zext
+  %vals = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
+  %vals.zext = zext <vscale x 4 x i16> %vals to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %vals.zext
+}
+
+define <vscale x 4 x i32> @masked_gather_nxv4i32(i32* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0, z0.s, uxtw #2]
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
+  %ptrs = getelementptr i32, i32* %base, <vscale x 4 x i64> %offsets.zext
+  %vals = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x i32*> %ptrs, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef)
+  ret <vscale x 4 x i32> %vals
+}
+
+define <vscale x 4 x half> @masked_gather_nxv4f16(half* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv4f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0, z0.s, uxtw #1]
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
+  %ptrs = getelementptr half, half* %base, <vscale x 4 x i64> %offsets.zext
+  %vals = call <vscale x 4 x half> @llvm.masked.gather.nxv4f16(<vscale x 4 x half*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x half> undef)
+  ret <vscale x 4 x half> %vals
+}
+
+define <vscale x 4 x float> @masked_gather_nxv4f32(float* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0, z0.s, uxtw #2]
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
+  %ptrs = getelementptr float, float* %base, <vscale x 4 x i64> %offsets.zext
+  %vals = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x float*> %ptrs, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x float> undef)
+  ret <vscale x 4 x float> %vals
+}
+
+define <vscale x 4 x i32> @masked_sgather_nxv4i16(i16* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1sh { z0.s }, p0/z, [x0, z0.s, uxtw #1]
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
+  %ptrs = getelementptr i16, i16* %base, <vscale x 4 x i64> %offsets.zext
+  %vals = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
+  %vals.sext = sext <vscale x 4 x i16> %vals to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %vals.sext
+}
+
+declare <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*>, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
+declare <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*>, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
+declare <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*>, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
+declare <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*>, i32, <vscale x 2 x i1>, <vscale x 2 x half>)
+declare <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*>, i32, <vscale x 2 x i1>, <vscale x 2 x float>)
+declare <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*>, i32, <vscale x 2 x i1>, <vscale x 2 x double>)
+
+declare <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*>, i32, <vscale x 4 x i1>, <vscale x 4 x i16>)
+declare <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x i32*>, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
+declare <vscale x 4 x half> @llvm.masked.gather.nxv4f16(<vscale x 4 x half*>, i32, <vscale x 4 x i1>, <vscale x 4 x half>)
+declare <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x float*>, i32, <vscale x 4 x i1>, <vscale x 4 x float>)
Index: llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-unscaled.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-unscaled.ll
@@ -0,0 +1,234 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; unscaled unpacked 32-bit offsets
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define <vscale x 2 x i64> @masked_gather_nxv2i8(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1b { z0.d }, p0/z, [x0, z0.d, uxtw]
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
+  %ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
+  %vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
+  %vals.zext = zext <vscale x 2 x i8> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x i64> @masked_gather_nxv2i16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d, uxtw]
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
+  %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
+  %vals.zext = zext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x i64> @masked_gather_nxv2i32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, uxtw]
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
+  %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
+  %vals.zext = zext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x i64> @masked_gather_nxv2i64(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, z0.d, uxtw]
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i64*>
+  %vals = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x i64> undef)
+  ret <vscale x 2 x i64> %vals
+}
+
+define <vscale x 2 x half> @masked_gather_nxv2f16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d, uxtw]
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x half*>
+  %vals = call <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x half> undef)
+  ret <vscale x 2 x half> %vals
+}
+
+define <vscale x 2 x float> @masked_gather_nxv2f32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, uxtw]
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x float*>
+  %vals = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x float> undef)
+  ret <vscale x 2 x float> %vals
+}
+
+define <vscale x 2 x double> @masked_gather_nxv2f64(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, z0.d, uxtw]
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x double*>
+  %vals = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x double> undef)
+  ret <vscale x 2 x double> %vals
+}
+
+define <vscale x 2 x i64> @masked_sgather_nxv2i8(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv2i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1sb { z0.d }, p0/z, [x0, z0.d, uxtw]
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
+  %ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
+  %vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
+  %vals.sext = sext <vscale x 2 x i8> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.sext
+}
+
+define <vscale x 2 x i64> @masked_sgather_nxv2i16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1sh { z0.d }, p0/z, [x0, z0.d, uxtw]
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
+  %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
+  %vals.sext = sext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.sext
+}
+
+define <vscale x 2 x i64> @masked_sgather_nxv2i32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1sw { z0.d }, p0/z, [x0, z0.d, uxtw]
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
+  %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
+  %vals.sext = sext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.sext
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; unscaled packed 32-bit offsets
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define <vscale x 4 x i32> @masked_gather_nxv4i8(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv4i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1b { z0.s }, p0/z, [x0, z0.s, uxtw]
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
+  %ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
+  %vals = call <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x i8*> %ptrs, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
+  %vals.zext = zext <vscale x 4 x i8> %vals to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %vals.zext
+}
+
+define <vscale x 4 x i32> @masked_gather_nxv4i16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0, z0.s, uxtw]
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
+  %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i16*>
+  %vals = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
+  %vals.zext = zext <vscale x 4 x i16> %vals to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %vals.zext
+}
+
+define <vscale x 4 x i32> @masked_gather_nxv4i32(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0, z0.s, uxtw]
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
+  %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i32*>
+  %vals = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x i32*> %ptrs, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef)
+  ret <vscale x 4 x i32> %vals
+}
+
+define <vscale x 4 x half> @masked_gather_nxv4f16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv4f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0, z0.s, uxtw]
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
+  %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x half*>
+  %vals = call <vscale x 4 x half> @llvm.masked.gather.nxv4f16(<vscale x 4 x half*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x half> undef)
+  ret <vscale x 4 x half> %vals
+}
+
+define <vscale x 4 x float> @masked_gather_nxv4f32(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv4f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0, z0.s, uxtw]
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
+  %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x float*>
+  %vals = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x float*> %ptrs, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x float> undef)
+  ret <vscale x 4 x float> %vals
+}
+
+define <vscale x 4 x i32> @masked_sgather_nxv4i8(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv4i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1sb { z0.s }, p0/z, [x0, z0.s, uxtw]
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
+  %ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
+  %vals = call <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x i8*> %ptrs, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
+  %vals.sext = sext <vscale x 4 x i8> %vals to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %vals.sext
+}
+
+define <vscale x 4 x i32> @masked_sgather_nxv4i16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1sh { z0.s }, p0/z, [x0, z0.s, uxtw]
+; CHECK-NEXT:    ret
+  %offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
+  %ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i16*>
+  %vals = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
+  %vals.sext = sext <vscale x 4 x i16> %vals to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %vals.sext
+}
+
+declare <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*>, i32, <vscale x 2 x i1>, <vscale x 2 x i8>)
+declare <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*>, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
+declare <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*>, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
+declare <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*>, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
+declare <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*>, i32, <vscale x 2 x i1>, <vscale x 2 x half>)
+declare <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*>, i32, <vscale x 2 x i1>, <vscale x 2 x float>)
+declare <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*>, i32, <vscale x 2 x i1>, <vscale x 2 x double>)
+
+declare <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x i8*>, i32, <vscale x 4 x i1>, <vscale x 4 x i8>)
+declare <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*>, i32, <vscale x 4 x i1>, <vscale x 4 x i16>)
+declare <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x i32*>, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
+declare <vscale x 4 x half> @llvm.masked.gather.nxv4f16(<vscale x 4 x half*>, i32, <vscale x 4 x i1>, <vscale x 4 x half>)
+declare <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x float*>, i32, <vscale x 4 x i1>, <vscale x 4 x float>)
Index: llvm/test/CodeGen/AArch64/sve-masked-gather-64b-scaled.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/sve-masked-gather-64b-scaled.ll
@@ -0,0 +1,93 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
+
+define <vscale x 2 x i64> @masked_gather_nxv2i16(i16* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d, lsl #1]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i16, i16* %base, <vscale x 2 x i64> %offsets
+  %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
+  %vals.zext = zext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x i64> @masked_gather_nxv2i32(i32* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, lsl #2]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i32, i32* %base, <vscale x 2 x i64> %offsets
+  %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
+  %vals.zext = zext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x i64> @masked_gather_nxv2i64(i64* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, z0.d, lsl #3]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i64, i64* %base, <vscale x 2 x i64> %offsets
+  %vals = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x i64> undef)
+  ret <vscale x 2 x i64> %vals
+}
+
+define <vscale x 2 x half> @masked_gather_nxv2f16(half* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d, lsl #1]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr half, half* %base, <vscale x 2 x i64> %offsets
+  %vals = call <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x half> undef)
+  ret <vscale x 2 x half> %vals
+}
+
+define <vscale x 2 x float> @masked_gather_nxv2f32(float* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, lsl #2]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr float, float* %base, <vscale x 2 x i64> %offsets
+  %vals = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x float> undef)
+  ret <vscale x 2 x float> %vals
+}
+
+define <vscale x 2 x double> @masked_gather_nxv2f64(double* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, z0.d, lsl #3]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr double, double* %base, <vscale x 2 x i64> %offsets
+  %vals.sext = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x double> undef)
+  ret <vscale x 2 x double> %vals.sext
+}
+
+define <vscale x 2 x i64> @masked_sgather_nxv2i16(i16* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1sh { z0.d }, p0/z, [x0, z0.d, lsl #1]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i16, i16* %base, <vscale x 2 x i64> %offsets
+  %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
+  %vals.sext = sext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.sext
+}
+
+define <vscale x 2 x i64> @masked_sgather_nxv2i32(i32* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1sw { z0.d }, p0/z, [x0, z0.d, lsl #2]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i32, i32* %base, <vscale x 2 x i64> %offsets
+  %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
+  %vals.sext = sext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.sext
+}
+
+declare <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*>, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
+declare <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*>, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
+declare <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*>, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
+declare <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*>, i32, <vscale x 2 x i1>, <vscale x 2 x half>)
+declare <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*>, i32, <vscale x 2 x i1>, <vscale x 2 x float>)
+declare <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*>, i32, <vscale x 2 x i1>, <vscale x 2 x double>)
Index: llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll
@@ -0,0 +1,124 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
+
+define <vscale x 2 x i64> @masked_gather_nxv2i8(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1b { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+  %vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
+  %vals.zext = zext <vscale x 2 x i8> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x i64> @masked_gather_nxv2i16(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
+  %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
+  %vals.zext = zext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x i64> @masked_gather_nxv2i32(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
+  %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
+  %vals.zext = zext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x i64> @masked_gather_nxv2i64(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i64*>
+  %vals = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x i64> undef)
+  ret <vscale x 2 x i64> %vals
+}
+
+define <vscale x 2 x half> @masked_gather_nxv2f16(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x half*>
+  %vals = call <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x half> undef)
+  ret <vscale x 2 x half> %vals
+}
+
+define <vscale x 2 x float> @masked_gather_nxv2f32(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x float*>
+  %vals = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x float> undef)
+  ret <vscale x 2 x float> %vals
+}
+
+define <vscale x 2 x double> @masked_gather_nxv2f64(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x double*>
+  %vals = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x double> undef)
+  ret <vscale x 2 x double> %vals
+}
+
+define <vscale x 2 x i64> @masked_sgather_nxv2i8(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv2i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1sb { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+  %vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
+  %vals.sext = sext <vscale x 2 x i8> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.sext
+}
+
+define <vscale x 2 x i64> @masked_sgather_nxv2i16(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1sh { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
+  %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
+  %vals.sext = sext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.sext
+}
+
+define <vscale x 2 x i64> @masked_sgather_nxv2i32(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1sw { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
+  %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
+  %vals.sext = sext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.sext
+}
+
+declare <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*>, i32, <vscale x 2 x i1>, <vscale x 2 x i8>)
+declare <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*>, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
+declare <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*>, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
+declare <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*>, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
+declare <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*>, i32, <vscale x 2 x i1>, <vscale x 2 x half>)
+declare <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*>, i32, <vscale x 2 x i1>, <vscale x 2 x float>)
+declare <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*>, i32, <vscale x 2 x i1>, <vscale x 2 x double>)
Index: llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll
@@ -0,0 +1,62 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
+
+; Tests that exercise various type legalisation scenarios for ISD::MGATHER.
+
+; Code generate load of an illegal datatype via promotion.
+define <vscale x 2 x i32> @masked_gather_nxv2i32(<vscale x 2 x i32*> %ptrs, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i32:
+; CHECK-DAG: mov x8, xzr
+; CHECK-DAG: ld1w { z0.d }, p0/z, [x8, z0.d]
+; CHECK:     ret
+  %data = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
+  ret <vscale x 2 x i32> %data
+}
+
+; Code generate the worst case scenario when all vector types are illegal.
+define <vscale x 32 x i32> @masked_gather_nxv32i32(i32* %base, <vscale x 32 x i32> %indices, <vscale x 32 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv32i32:
+; CHECK-NOT: unpkhi
+; CHECK-DAG: ld1w { {{z[0-9]+}}.s }, {{p[0-9]+}}/z, [x0, z0.s, sxtw #2]
+; CHECK-DAG: ld1w { {{z[0-9]+}}.s }, {{p[0-9]+}}/z, [x0, z1.s, sxtw #2]
+; CHECK-DAG: ld1w { {{z[0-9]+}}.s }, {{p[0-9]+}}/z, [x0, z2.s, sxtw #2]
+; CHECK-DAG: ld1w { {{z[0-9]+}}.s }, {{p[0-9]+}}/z, [x0, z3.s, sxtw #2]
+; CHECK-DAG: ld1w { {{z[0-9]+}}.s }, {{p[0-9]+}}/z, [x0, z4.s, sxtw #2]
+; CHECK-DAG: ld1w { {{z[0-9]+}}.s }, {{p[0-9]+}}/z, [x0, z5.s, sxtw #2]
+; CHECK-DAG: ld1w { {{z[0-9]+}}.s }, {{p[0-9]+}}/z, [x0, z6.s, sxtw #2]
+; CHECK-DAG: ld1w { {{z[0-9]+}}.s }, {{p[0-9]+}}/z, [x0, z7.s, sxtw #2]
+; CHECK: ret
+  %ptrs = getelementptr i32, i32* %base, <vscale x 32 x i32> %indices
+  %data = call <vscale x 32 x i32> @llvm.masked.gather.nxv32i32(<vscale x 32 x i32*> %ptrs, i32 4, <vscale x 32 x i1> %mask, <vscale x 32 x i32> undef)
+  ret <vscale x 32 x i32> %data
+}
+
+; TODO: Currently, the sign extend gets applied to the values after a 'uzp1' of two
+; registers, so it doesn't get folded away. Same for any other vector-of-pointers
+; style gathers which don't fit in an <vscale x 2 x type*> single register. Better folding
+; is required before we can check those off.
+define <vscale x 4 x i32> @masked_sgather_nxv4i8(<vscale x 4 x i8*> %ptrs, <vscale x 4 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv4i8:
+; CHECK:         pfalse p1.b
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    zip2 p2.s, p0.s, p1.s
+; CHECK-NEXT:    zip1 p0.s, p0.s, p1.s
+; CHECK-NEXT:    ld1b { z1.d }, p2/z, [x8, z1.d]
+; CHECK-NEXT:    ld1b { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    sxtb z0.s, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %vals = call <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x i8*> %ptrs, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
+  %svals = sext <vscale x 4 x i8> %vals to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %svals
+}
+
+declare <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*>, i32, <vscale x 2 x i1>, <vscale x 2 x i8>)
+declare <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*>, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
+declare <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*>, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
+
+declare <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x i8*>, i32, <vscale x 4 x i1>, <vscale x 4 x i8>)
+
+declare <vscale x 16 x i8> @llvm.masked.gather.nxv16i8(<vscale x 16 x i8*>, i32, <vscale x 16 x i1>, <vscale x 16 x i8>)
+declare <vscale x 32 x i32> @llvm.masked.gather.nxv32i32(<vscale x 32 x i32*>, i32, <vscale x 32 x i1>, <vscale x 32 x i32>)
Index: llvm/test/CodeGen/X86/masked_gather_scatter.ll
===================================================================
--- llvm/test/CodeGen/X86/masked_gather_scatter.ll
+++ llvm/test/CodeGen/X86/masked_gather_scatter.ll
@@ -765,45 +765,41 @@
 define <16 x float> @test14(float* %base, i32 %ind, <16 x float*> %vec) {
 ; KNL_64-LABEL: test14:
 ; KNL_64:       # %bb.0:
-; KNL_64-NEXT:    vpbroadcastq %xmm0, %zmm0
-; KNL_64-NEXT:    vmovd %esi, %xmm1
-; KNL_64-NEXT:    vpbroadcastd %xmm1, %ymm1
-; KNL_64-NEXT:    vpmovsxdq %ymm1, %zmm1
-; KNL_64-NEXT:    vpsllq $2, %zmm1, %zmm1
-; KNL_64-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; KNL_64-NEXT:    vmovq %xmm0, %rax
+; KNL_64-NEXT:    vmovd %esi, %xmm0
+; KNL_64-NEXT:    vpbroadcastd %xmm0, %ymm0
+; KNL_64-NEXT:    vpmovsxdq %ymm0, %zmm0
+; KNL_64-NEXT:    vpsllq $2, %zmm0, %zmm0
 ; KNL_64-NEXT:    kxnorw %k0, %k0, %k1
-; KNL_64-NEXT:    vgatherqps (,%zmm0), %ymm1 {%k1}
+; KNL_64-NEXT:    vgatherqps (%rax,%zmm0), %ymm1 {%k1}
 ; KNL_64-NEXT:    vinsertf64x4 $1, %ymm1, %zmm1, %zmm0
 ; KNL_64-NEXT:    retq
 ;
 ; KNL_32-LABEL: test14:
 ; KNL_32:       # %bb.0:
-; KNL_32-NEXT:    vpbroadcastd %xmm0, %zmm0
+; KNL_32-NEXT:    vmovd %xmm0, %eax
 ; KNL_32-NEXT:    vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
-; KNL_32-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
 ; KNL_32-NEXT:    kxnorw %k0, %k0, %k1
-; KNL_32-NEXT:    vgatherdps (,%zmm1), %zmm0 {%k1}
+; KNL_32-NEXT:    vgatherdps (%eax,%zmm1), %zmm0 {%k1}
 ; KNL_32-NEXT:    retl
 ;
 ; SKX-LABEL: test14:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpbroadcastq %xmm0, %zmm0
-; SKX-NEXT:    vpbroadcastd %esi, %ymm1
-; SKX-NEXT:    vpmovsxdq %ymm1, %zmm1
-; SKX-NEXT:    vpsllq $2, %zmm1, %zmm1
-; SKX-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
+; SKX-NEXT:    vmovq %xmm0, %rax
+; SKX-NEXT:    vpbroadcastd %esi, %ymm0
+; SKX-NEXT:    vpmovsxdq %ymm0, %zmm0
+; SKX-NEXT:    vpsllq $2, %zmm0, %zmm0
 ; SKX-NEXT:    kxnorw %k0, %k0, %k1
-; SKX-NEXT:    vgatherqps (,%zmm0), %ymm1 {%k1}
+; SKX-NEXT:    vgatherqps (%rax,%zmm0), %ymm1 {%k1}
 ; SKX-NEXT:    vinsertf64x4 $1, %ymm1, %zmm1, %zmm0
 ; SKX-NEXT:    retq
 ;
 ; SKX_32-LABEL: test14:
 ; SKX_32:       # %bb.0:
-; SKX_32-NEXT:    vpbroadcastd %xmm0, %zmm0
+; SKX_32-NEXT:    vmovd %xmm0, %eax
 ; SKX_32-NEXT:    vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
-; SKX_32-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
 ; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
-; SKX_32-NEXT:    vgatherdps (,%zmm1), %zmm0 {%k1}
+; SKX_32-NEXT:    vgatherdps (%eax,%zmm1), %zmm0 {%k1}
 ; SKX_32-NEXT:    retl
 
   %broadcast.splatinsert = insertelement <16 x float*> %vec, float* %base, i32 1