Index: llvm/include/llvm/CodeGen/SelectionDAG.h =================================================================== --- llvm/include/llvm/CodeGen/SelectionDAG.h +++ llvm/include/llvm/CodeGen/SelectionDAG.h @@ -1361,7 +1361,8 @@ ISD::MemIndexType IndexType); SDValue getMaskedScatter(SDVTList VTs, EVT VT, const SDLoc &dl, ArrayRef Ops, MachineMemOperand *MMO, - ISD::MemIndexType IndexType); + ISD::MemIndexType IndexType, + bool IsTruncating = false); /// Construct a node to track a Value* through the backend. SDValue getSrcValue(const Value *v); Index: llvm/include/llvm/CodeGen/SelectionDAGNodes.h =================================================================== --- llvm/include/llvm/CodeGen/SelectionDAGNodes.h +++ llvm/include/llvm/CodeGen/SelectionDAGNodes.h @@ -523,6 +523,7 @@ class StoreSDNodeBitfields { friend class StoreSDNode; friend class MaskedStoreSDNode; + friend class MaskedScatterSDNode; uint16_t : NumLSBaseSDNodeBits; @@ -2390,6 +2391,9 @@ ISD::MemIndexType getIndexType() const { return static_cast(LSBaseSDNodeBits.AddressingMode); } + void setIndexType(ISD::MemIndexType IndexType) { + LSBaseSDNodeBits.AddressingMode = IndexType; + } bool isIndexScaled() const { return (getIndexType() == ISD::SIGNED_SCALED) || (getIndexType() == ISD::UNSIGNED_SCALED); @@ -2440,10 +2444,17 @@ friend class SelectionDAG; MaskedScatterSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs, - EVT MemVT, MachineMemOperand *MMO, + bool IsTrunc, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexType IndexType) : MaskedGatherScatterSDNode(ISD::MSCATTER, Order, dl, VTs, MemVT, MMO, - IndexType) {} + IndexType) { + StoreSDNodeBits.IsTruncating = IsTrunc; + } + + /// Return true if the op does a truncation before store. + /// For integers this is the same as doing a TRUNCATE and storing the result. + /// For floats, it is the same as doing an FP_ROUND and storing the result. + bool isTruncatingStore() const { return StoreSDNodeBits.IsTruncating; } const SDValue &getValue() const { return getOperand(1); } Index: llvm/include/llvm/CodeGen/TargetLowering.h =================================================================== --- llvm/include/llvm/CodeGen/TargetLowering.h +++ llvm/include/llvm/CodeGen/TargetLowering.h @@ -1312,6 +1312,12 @@ getIndexedMaskedStoreAction(IdxMode, VT.getSimpleVT()) == Custom); } + // Returns true if VT is a legal index type for masked gathers/scatters + // on this target + virtual bool isLegalMaskedGSIndexType(EVT VT) const { + return false; + } + /// Return how the condition code should be treated: either it is legal, needs /// to be expanded to some other code sequence, or the target has a custom /// expander for it. @@ -4504,6 +4510,13 @@ // combiner can fold the new nodes. SDValue lowerCmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) const; + /// Give targets the chance to reduce the number of distinct addresing modes. + virtual ISD::MemIndexType getCanonicalIndexType(ISD::MemIndexType IndexType, + EVT MemVT, + SDValue Offsets) const { + return IndexType; + } + private: SDValue foldSetCCWithAnd(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond, const SDLoc &DL, DAGCombinerInfo &DCI) const; Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -9374,16 +9374,79 @@ TopHalf->isNullValue() ? 
RHS->getOperand(1) : LHS->getOperand(1)); } +bool refineUniformBase(SDValue &BasePtr, SDValue &Index) { + if (!isNullConstant(BasePtr) || Index.getOpcode() != ISD::ADD) + return false; + + // For now we check only the LHS of the add. + SDValue LHS = Index.getOperand(0); + if (LHS.getOpcode() != ISD::SPLAT_VECTOR) + return false; + + SDValue SplatValue = LHS.getOperand(0); + if (SplatValue.getSimpleValueType() != MVT::i64) + return false; + + BasePtr = SplatValue; + Index = Index.getOperand(1); + return true; +} + +// Fold sext/zext of index into index type. +bool refineIndexType(MaskedScatterSDNode *MSC, SDValue &Index, + bool Scaled, SelectionDAG &DAG) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDValue Op = Index.getOperand(0); + + if (Index.getOpcode() == ISD::ZERO_EXTEND) { + MSC->setIndexType(Scaled ? ISD::UNSIGNED_SCALED : ISD::UNSIGNED_UNSCALED); + if (TLI.isLegalMaskedGSIndexType(Op.getValueType())) { + Index = Op; + return true; + } + } + + if (Index.getOpcode() == ISD::SIGN_EXTEND) { + MSC->setIndexType(Scaled ? ISD::SIGNED_SCALED : ISD::SIGNED_UNSCALED); + if (TLI.isLegalMaskedGSIndexType(Op.getValueType())) { + Index = Op; + return true; + } + } + + return false; +} + SDValue DAGCombiner::visitMSCATTER(SDNode *N) { MaskedScatterSDNode *MSC = cast(N); SDValue Mask = MSC->getMask(); SDValue Chain = MSC->getChain(); + SDValue Index = MSC->getIndex(); + SDValue Scale = MSC->getScale(); + SDValue StoreVal = MSC->getValue(); + SDValue BasePtr = MSC->getBasePtr(); SDLoc DL(N); // Zap scatters with a zero mask. if (ISD::isBuildVectorAllZeros(Mask.getNode())) return Chain; + if (refineUniformBase(BasePtr, Index)) { + SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale}; + return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), + StoreVal.getValueType(), DL, Ops, + MSC->getMemOperand(), + MSC->getIndexType(), MSC->isTruncatingStore()); + } + + if (refineIndexType(MSC, Index, MSC->isIndexScaled(), DAG)) { + SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale}; + return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), + StoreVal.getValueType(), DL, Ops, + MSC->getMemOperand(), + MSC->getIndexType(), MSC->isTruncatingStore()); + } + return SDValue(); } Index: llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -1851,6 +1851,7 @@ SDValue DAGTypeLegalizer::PromoteIntOp_MSCATTER(MaskedScatterSDNode *N, unsigned OpNo) { + bool TruncateStore = N->isTruncatingStore(); SmallVector NewOps(N->op_begin(), N->op_end()); if (OpNo == 2) { // The Mask @@ -1863,9 +1864,17 @@ NewOps[OpNo] = SExtPromotedInteger(N->getOperand(OpNo)); else NewOps[OpNo] = ZExtPromotedInteger(N->getOperand(OpNo)); - } else + + N->setIndexType(TLI.getCanonicalIndexType(N->getIndexType(), + N->getMemoryVT(), NewOps[OpNo])); + } else { NewOps[OpNo] = GetPromotedInteger(N->getOperand(OpNo)); - return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0); + TruncateStore = true; + } + + return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), N->getMemoryVT(), + SDLoc(N), NewOps, N->getMemOperand(), + N->getIndexType(), TruncateStore); } SDValue DAGTypeLegalizer::PromoteIntOp_TRUNCATE(SDNode *N) { Index: llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ 
llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -2498,11 +2498,15 @@ SDValue Index = N->getIndex(); SDValue Scale = N->getScale(); SDValue Data = N->getValue(); + EVT MemoryVT = N->getMemoryVT(); Align Alignment = N->getOriginalAlign(); SDLoc DL(N); // Split all operands + EVT LoMemVT, HiMemVT; + std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); + SDValue DataLo, DataHi; if (getTypeAction(Data.getValueType()) == TargetLowering::TypeSplitVector) // Split Data operand @@ -2533,15 +2537,17 @@ MemoryLocation::UnknownSize, Alignment, N->getAAInfo(), N->getRanges()); SDValue OpsLo[] = {Ch, DataLo, MaskLo, Ptr, IndexLo, Scale}; - Lo = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), DataLo.getValueType(), - DL, OpsLo, MMO, N->getIndexType()); + Lo = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), LoMemVT, + DL, OpsLo, MMO, N->getIndexType(), + N->isTruncatingStore()); // The order of the Scatter operation after split is well defined. The "Hi" // part comes after the "Lo". So these two operations should be chained one // after another. SDValue OpsHi[] = {Lo, DataHi, MaskHi, Ptr, IndexHi, Scale}; - return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), DataHi.getValueType(), - DL, OpsHi, MMO, N->getIndexType()); + return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), HiMemVT, + DL, OpsHi, MMO, N->getIndexType(), + N->isTruncatingStore()); } SDValue DAGTypeLegalizer::SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo) { @@ -4718,7 +4724,8 @@ Scale}; return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), MSC->getMemoryVT(), SDLoc(N), Ops, - MSC->getMemOperand(), MSC->getIndexType()); + MSC->getMemOperand(), MSC->getIndexType(), + MSC->isTruncatingStore()); } SDValue DAGTypeLegalizer::WidenVecOp_SETCC(SDNode *N) { Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -7398,29 +7398,36 @@ SDValue SelectionDAG::getMaskedScatter(SDVTList VTs, EVT VT, const SDLoc &dl, ArrayRef Ops, MachineMemOperand *MMO, - ISD::MemIndexType IndexType) { + ISD::MemIndexType IndexType, + bool IsTrunc) { assert(Ops.size() == 6 && "Incompatible number of operands"); FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::MSCATTER, VTs, Ops); ID.AddInteger(VT.getRawBits()); ID.AddInteger(getSyntheticNodeSubclassData( - dl.getIROrder(), VTs, VT, MMO, IndexType)); + dl.getIROrder(), VTs, IsTrunc, VT, MMO, IndexType)); ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); void *IP = nullptr; if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) { cast(E)->refineAlignment(MMO); return SDValue(E, 0); } + + IndexType = TLI->getCanonicalIndexType(IndexType, VT, Ops[4]); auto *N = newSDNode(dl.getIROrder(), dl.getDebugLoc(), - VTs, VT, MMO, IndexType); + VTs, IsTrunc, VT, MMO, IndexType); createOperands(N, Ops); - assert(N->getMask().getValueType().getVectorNumElements() == - N->getValue().getValueType().getVectorNumElements() && + assert(N->getMask().getValueType().getVectorElementCount() == + N->getValue().getValueType().getVectorElementCount() && "Vector width mismatch between mask and data"); - assert(N->getIndex().getValueType().getVectorNumElements() >= - N->getValue().getValueType().getVectorNumElements() && + assert(N->getIndex().getValueType().getVectorElementCount().isScalable() == + N->getValue().getValueType().getVectorElementCount().isScalable() && + "Scalable flags of index and data do not match"); + assert(ElementCount::isKnownGE( + 
N->getIndex().getValueType().getVectorElementCount(),
+             N->getValue().getValueType().getVectorElementCount()) &&
          "Vector width mismatch between index and data");
   assert(isa<ConstantSDNode>(N->getScale()) &&
          cast<ConstantSDNode>(N->getScale())->getAPIntValue().isPowerOf2() &&
Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -4297,12 +4297,12 @@
   if (!UniformBase) {
     Base = DAG.getConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout()));
     Index = getValue(Ptr);
-    IndexType = ISD::SIGNED_SCALED;
+    IndexType = ISD::SIGNED_UNSCALED;
     Scale = DAG.getTargetConstant(1, sdl, TLI.getPointerTy(DAG.getDataLayout()));
   }
   SDValue Ops[] = { getMemoryRoot(), Src0, Mask, Base, Index, Scale };
   SDValue Scatter = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), VT, sdl,
-                                         Ops, MMO, IndexType);
+                                         Ops, MMO, IndexType, false);
   DAG.setRoot(Scatter);
   setValue(&I, Scatter);
 }
Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -735,7 +735,19 @@
       OS << ", compressing";

     OS << ">";
-  } else if (const MemSDNode* M = dyn_cast<MemSDNode>(this)) {
+  } else if (const auto *MScatter = dyn_cast<MaskedScatterSDNode>(this)) {
+    OS << "<";
+    printMemOperand(OS, *MScatter->getMemOperand(), G);
+
+    if (MScatter->isTruncatingStore())
+      OS << ", trunc to " << MScatter->getMemoryVT().getEVTString();
+
+    auto Signed = MScatter->isIndexSigned() ? "signed" : "unsigned";
+    auto Scaled = MScatter->isIndexScaled() ? "scaled" : "unscaled";
+    OS << ", " << Signed << " " << Scaled << " offset";
+
+    OS << ">";
+  } else if (const MemSDNode *M = dyn_cast<MemSDNode>(this)) {
     OS << "<";
     printMemOperand(OS, *M->getMemOperand(), G);
     OS << ">";
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -777,6 +777,12 @@
     return !useSVEForFixedLengthVectors();
   }

+  // Convert redundant addressing modes (e.g. scaling is redundant
+  // when accessing bytes).
+  ISD::MemIndexType getCanonicalIndexType(ISD::MemIndexType IndexType,
+                                          EVT MemVT,
+                                          SDValue Offsets) const override;
+
 private:
   /// Keep a pointer to the AArch64Subtarget around so that we can
   /// make the right decision when generating code for different targets.
@@ -807,6 +813,8 @@
   SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;

+  SDValue LowerMSCATTER(SDValue Op, SelectionDAG &DAG) const;
+
   SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;

   bool isEligibleForTailCallOptimization(
@@ -979,6 +987,7 @@
     return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
   }

+  bool isLegalMaskedGSIndexType(EVT VT) const override;
   bool isVectorLoadExtDesirable(SDValue ExtVal) const override;
   bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override;
   bool mayBeEmittedAsTailCall(const CallInst *CI) const override;
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -782,6 +782,8 @@
   if (Subtarget->supportsAddressTopByteIgnored())
     setTargetDAGCombine(ISD::LOAD);

+  setTargetDAGCombine(ISD::MSCATTER);
+
   setTargetDAGCombine(ISD::MUL);

   setTargetDAGCombine(ISD::SELECT);
@@ -1001,6 +1003,7 @@
       setOperationAction(ISD::SINT_TO_FP, VT, Custom);
       setOperationAction(ISD::FP_TO_UINT, VT, Custom);
       setOperationAction(ISD::FP_TO_SINT, VT, Custom);
+      setOperationAction(ISD::MSCATTER, VT, Custom);
       setOperationAction(ISD::MUL, VT, Custom);
       setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
       setOperationAction(ISD::SELECT, VT, Custom);
@@ -1041,6 +1044,7 @@
                     MVT::nxv4f32, MVT::nxv2f64}) {
       setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
       setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
+      setOperationAction(ISD::MSCATTER, VT, Custom);
       setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
       setOperationAction(ISD::SELECT, VT, Custom);
       setOperationAction(ISD::FADD, VT, Custom);
@@ -1064,6 +1068,9 @@
     setOperationAction(ISD::SPLAT_VECTOR, MVT::nxv8bf16, Custom);

+    for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16})
+      setOperationAction(ISD::MSCATTER, VT, Custom);
+
     setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom);
     setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);

@@ -3690,6 +3697,14 @@
   }
 }

+bool AArch64TargetLowering::isLegalMaskedGSIndexType(EVT VT) const {
+  if (VT.getVectorElementType() == MVT::i32 &&
+      VT.getVectorElementCount().getKnownMinValue() >= 4)
+    return true;
+
+  return false;
+}
+
 bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
   return ExtVal.getValueType().isScalableVector();
 }
@@ -3726,6 +3741,96 @@
                       ST->getBasePtr(), ST->getMemOperand());
 }

+unsigned getScatterVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
+  std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
+      {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
+       AArch64ISD::SST1_PRED},
+      {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
+       AArch64ISD::SST1_UXTW_PRED},
+      {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false),
+       AArch64ISD::SST1_PRED},
+      {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true),
+       AArch64ISD::SST1_SXTW_PRED},
+      {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false),
+       AArch64ISD::SST1_SCALED_PRED},
+      {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true),
+       AArch64ISD::SST1_UXTW_SCALED_PRED},
+      {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false),
+       AArch64ISD::SST1_SCALED_PRED},
+      {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true),
+       AArch64ISD::SST1_SXTW_SCALED_PRED},
+  };
+  auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
+  return AddrModes.find(Key)->second;
+}
+
+bool getScatterIndexIsExtended(SDValue Index) {
+  unsigned Opcode = Index.getOpcode();
+  if (Opcode == ISD::SIGN_EXTEND_INREG)
+    return true;
+
+  if (Opcode == ISD::AND) {
+    SDValue Splat = Index.getOperand(1);
+    if (Splat.getOpcode() != ISD::SPLAT_VECTOR)
+      return false;
+    ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(Splat.getOperand(0));
+    if (!Mask || Mask->getZExtValue() != 0xFFFFFFFF)
+      return false;
+    return true;
+  }
+
+  return false;
+}
+
+SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op);
+  assert(MSC && "Can only custom lower scatter store nodes");
+
+  SDValue Index = MSC->getIndex();
+  SDValue Chain = MSC->getChain();
+  SDValue StoreVal = MSC->getValue();
+  SDValue Mask = MSC->getMask();
+  SDValue BasePtr = MSC->getBasePtr();
+
+  ISD::MemIndexType IndexType = MSC->getIndexType();
+  bool IsScaled =
+      IndexType == ISD::SIGNED_SCALED || IndexType == ISD::UNSIGNED_SCALED;
+  bool IsSigned =
+      IndexType == ISD::SIGNED_SCALED || IndexType == ISD::SIGNED_UNSCALED;
+  bool NeedsExtend =
+      getScatterIndexIsExtended(Index) ||
+      Index.getSimpleValueType().getVectorElementType() == MVT::i32;
+
+  EVT VT = StoreVal.getSimpleValueType();
+  SDVTList VTs = DAG.getVTList(MVT::Other);
+  EVT MemVT = MSC->getMemoryVT();
+  SDValue InputVT = DAG.getValueType(MemVT);
+
+  if (VT.getVectorElementType() == MVT::bf16 &&
+      !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
+    return SDValue();
+
+  // Handle FP data
+  if (VT.isFloatingPoint()) {
+    VT = VT.changeVectorElementTypeToInteger();
+    ElementCount EC = VT.getVectorElementCount();
+    auto ScalarIntVT =
+        MVT::getIntegerVT(AArch64::SVEBitsPerBlock / EC.getKnownMinValue());
+    StoreVal = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL,
+                           MVT::getVectorVT(ScalarIntVT, EC), StoreVal);
+
+    InputVT = DAG.getValueType(MemVT.changeVectorElementTypeToInteger());
+  }
+
+  if (getScatterIndexIsExtended(Index))
+    Index = Index.getOperand(0);
+
+  SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, InputVT};
+  return DAG.getNode(getScatterVecOpcode(IsScaled, IsSigned, NeedsExtend), DL,
+                     VTs, Ops);
+}
+
 // Custom lowering for any store, vector or scalar and/or default or with
 // a truncate operations. Currently only custom lower truncate operation
 // from vector v4i16 to v4i8 or volatile stores of i128.
@@ -3973,6 +4078,8 @@
     return LowerSTORE(Op, DAG);
   case ISD::VECREDUCE_SEQ_FADD:
     return LowerVECREDUCE_SEQ_FADD(Op, DAG);
+  case ISD::MSCATTER:
+    return LowerMSCATTER(Op, DAG);
   case ISD::VECREDUCE_ADD:
   case ISD::VECREDUCE_AND:
   case ISD::VECREDUCE_OR:
@@ -13764,6 +13871,44 @@
   return SDValue();
 }

+static SDValue performMSCATTERCombine(SDNode *N,
+                                      TargetLowering::DAGCombinerInfo &DCI,
+                                      SelectionDAG &DAG) {
+  MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(N);
+  assert(MSC && "Can only combine scatter store nodes");
+
+  SDLoc DL(MSC);
+  SDValue Chain = MSC->getChain();
+  SDValue Scale = MSC->getScale();
+  SDValue Index = MSC->getIndex();
+  SDValue Data = MSC->getValue();
+  SDValue Mask = MSC->getMask();
+  SDValue BasePtr = MSC->getBasePtr();
+  ISD::MemIndexType IndexType = MSC->getIndexType();
+
+  EVT IdxVT = Index.getValueType();
+
+  if (DCI.isBeforeLegalize()) {
+    // SVE gather/scatter requires indices of i32/i64. Promote anything smaller
+    // prior to legalisation so the result can be split if required.
+    if ((IdxVT.getVectorElementType() == MVT::i8) ||
+        (IdxVT.getVectorElementType() == MVT::i16)) {
+      EVT NewIdxVT = IdxVT.changeVectorElementType(MVT::i32);
+      if (MSC->isIndexSigned())
+        Index = DAG.getNode(ISD::SIGN_EXTEND, DL, NewIdxVT, Index);
+      else
+        Index = DAG.getNode(ISD::ZERO_EXTEND, DL, NewIdxVT, Index);
+
+      SDValue Ops[] = { Chain, Data, Mask, BasePtr, Index, Scale };
+      return DAG.getMaskedScatter(DAG.getVTList(MVT::Other),
+                                  MSC->getMemoryVT(), DL, Ops,
+                                  MSC->getMemOperand(), IndexType,
+                                  MSC->isTruncatingStore());
+    }
+  }
+
+  return SDValue();
+}
+
 /// Target-specific DAG combine function for NEON load/store intrinsics
 /// to merge base address updates.
@@ -14956,6 +15101,8 @@
     break;
   case ISD::STORE:
     return performSTORECombine(N, DCI, DAG, Subtarget);
+  case ISD::MSCATTER:
+    return performMSCATTERCombine(N, DCI, DAG);
   case AArch64ISD::BRCOND:
     return performBRCONDCombine(N, DCI, DAG);
   case AArch64ISD::TBNZ:
@@ -15999,6 +16146,23 @@
   return false;
 }

+ISD::MemIndexType
+AArch64TargetLowering::getCanonicalIndexType(ISD::MemIndexType IndexType,
+                                             EVT MemVT, SDValue Offsets) const {
+  bool ScaledIndex = (IndexType == ISD::SIGNED_SCALED) ||
+                     (IndexType == ISD::UNSIGNED_SCALED);
+  bool SignedIndex = (IndexType == ISD::SIGNED_SCALED) ||
+                     (IndexType == ISD::SIGNED_UNSCALED);
+
+  // Scaling is unimportant for bytes, canonicalize to unscaled.
+  if (ScaledIndex && MemVT.getScalarType() == MVT::i8) {
+    ScaledIndex = false;
+    IndexType = SignedIndex ? ISD::SIGNED_UNSCALED : ISD::UNSIGNED_UNSCALED;
+  }
+
+  return IndexType;
+}
+
 // Return the largest legal scalable vector type that matches VT's element type.
 static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT) {
   assert(VT.isFixedLengthVector() &&
Index: llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1769,6 +1769,16 @@
   def : Pat<(nxv2i1 (reinterpret_cast (nxv8i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
   def : Pat<(nxv2i1 (reinterpret_cast (nxv4i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;

+  def : Pat<(nxv2i64 (reinterpret_cast (nxv2f64 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>;
+  def : Pat<(nxv2i64 (reinterpret_cast (nxv2f32 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>;
+  def : Pat<(nxv2i64 (reinterpret_cast (nxv2f16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>;
+  def : Pat<(nxv4i32 (reinterpret_cast (nxv4f32 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>;
+  def : Pat<(nxv4i32 (reinterpret_cast (nxv4f16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>;
+  let Predicates = [HasSVE, HasBF16] in {
+    def : Pat<(nxv2i64 (reinterpret_cast (nxv2bf16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>;
+    def : Pat<(nxv4i32 (reinterpret_cast (nxv4bf16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>;
+  }
+
   def : Pat<(nxv16i1 (and PPR:$Ps1, PPR:$Ps2)),
             (AND_PPzPP (PTRUE_B 31), PPR:$Ps1, PPR:$Ps2)>;
   def : Pat<(nxv8i1 (and PPR:$Ps1, PPR:$Ps2)),
Index: llvm/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -47550,7 +47550,8 @@
     return DAG.getMaskedScatter(Scatter->getVTList(),
                                 Scatter->getMemoryVT(), DL,
                                 Ops, Scatter->getMemOperand(),
-                                Scatter->getIndexType());
+                                Scatter->getIndexType(),
+                                Scatter->isTruncatingStore());
 }

 static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
Index:
llvm/test/CodeGen/AArch64/sve-masked-scatter-32b-scaled.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/sve-masked-scatter-32b-scaled.ll @@ -0,0 +1,265 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s -asm-verbose=0 | FileCheck %s + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; scaled unpacked 32-bit offsets +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; CHECK-LABEL: masked_scatter_nxv2i16_sext: +; CHECK-NEXT: st1h { z0.d }, p0, [x0, z1.d, sxtw #1] +; CHECK-NEXT: ret +define void @masked_scatter_nxv2i16_sext( %data, i16* %base, %indexes, %masks) nounwind { + %ext = sext %indexes to + %ptrs = getelementptr i16, i16* %base, %ext + call void @llvm.masked.scatter.nxv2i16( %data, %ptrs, i32 0, %masks) + ret void +} + +; CHECK-LABEL: masked_scatter_nxv2i32_sext: +; CHECK-NEXT: st1w { z0.d }, p0, [x0, z1.d, sxtw #2] +; CHECK-NEXT: ret +define void @masked_scatter_nxv2i32_sext( %data, i32* %base, %indexes, %masks) nounwind { + %ext = sext %indexes to + %ptrs = getelementptr i32, i32* %base, %ext + call void @llvm.masked.scatter.nxv2i32( %data, %ptrs, i32 0, %masks) + ret void +} + +; CHECK-LABEL: masked_scatter_nxv2i64_sext: +; CHECK-NEXT: st1d { z0.d }, p0, [x0, z1.d, sxtw #3] +; CHECK-NEXT: ret +define void @masked_scatter_nxv2i64_sext( %data, i64* %base, %indexes, %masks) nounwind { + %ext = sext %indexes to + %ptrs = getelementptr i64, i64* %base, %ext + call void @llvm.masked.scatter.nxv2i64( %data, %ptrs, i32 0, %masks) + ret void +} + +; CHECK-LABEL: masked_scatter_nxv2f16_sext: +; CHECK-NEXT: st1h { z0.d }, p0, [x0, z1.d, sxtw #1] +; CHECK-NEXT: ret +define void @masked_scatter_nxv2f16_sext( %data, half* %base, %indexes, %masks) nounwind { + %ext = sext %indexes to + %ptrs = getelementptr half, half* %base, %ext + call void @llvm.masked.scatter.nxv2f16( %data, %ptrs, i32 0, %masks) + ret void +} + +; CHECK-LABEL: masked_scatter_nxv2bf16_sext: +; CHECK-NEXT: st1h { z0.d }, p0, [x0, z1.d, sxtw #1] +; CHECK-NEXT: ret +define void @masked_scatter_nxv2bf16_sext( %data, bfloat* %base, %indexes, %masks) nounwind #0 { + %ext = sext %indexes to + %ptrs = getelementptr bfloat, bfloat* %base, %ext + call void @llvm.masked.scatter.nxv2bf16( %data, %ptrs, i32 0, %masks) + ret void +} + +; CHECK-LABEL: masked_scatter_nxv2f32_sext: +; CHECK-NEXT: st1w { z0.d }, p0, [x0, z1.d, sxtw #2] +; CHECK-NEXT: ret +define void @masked_scatter_nxv2f32_sext( %data, float* %base, %indexes, %masks) nounwind { + %ext = sext %indexes to + %ptrs = getelementptr float, float* %base, %ext + call void @llvm.masked.scatter.nxv2f32( %data, %ptrs, i32 0, %masks) + ret void +} + +; CHECK-LABEL: masked_scatter_nxv2f64_sext: +; CHECK-NEXT: st1d { z0.d }, p0, [x0, z1.d, sxtw #3] +; CHECK-NEXT: ret +define void @masked_scatter_nxv2f64_sext( %data, double* %base, %indexes, %masks) nounwind { + %ext = sext %indexes to + %ptrs = getelementptr double, double* %base, %ext + call void @llvm.masked.scatter.nxv2f64( %data, %ptrs, i32 0, %masks) + ret void +} + +; CHECK-LABEL: masked_scatter_nxv2i16_zext: +; CHECK-NEXT: st1h { z0.d }, p0, [x0, z1.d, uxtw #1] +; CHECK-NEXT: ret +define void @masked_scatter_nxv2i16_zext( %data, i16* %base, %indexes, %masks) nounwind { + %ext = zext %indexes to + %ptrs = getelementptr i16, i16* %base, %ext + call void @llvm.masked.scatter.nxv2i16( %data, %ptrs, i32 0, %masks) + ret void +} + +; CHECK-LABEL: masked_scatter_nxv2i32_zext: +; CHECK-NEXT: st1w { 
z0.d }, p0, [x0, z1.d, uxtw #2] +; CHECK-NEXT: ret +define void @masked_scatter_nxv2i32_zext( %data, i32* %base, %indexes, %masks) nounwind { + %ext = zext %indexes to + %ptrs = getelementptr i32, i32* %base, %ext + call void @llvm.masked.scatter.nxv2i32( %data, %ptrs, i32 0, %masks) + ret void +} + +; CHECK-LABEL: masked_scatter_nxv2i64_zext: +; CHECK-NEXT: st1d { z0.d }, p0, [x0, z1.d, uxtw #3] +; CHECK-NEXT: ret +define void @masked_scatter_nxv2i64_zext( %data, i64* %base, %indexes, %masks) nounwind { + %ext = zext %indexes to + %ptrs = getelementptr i64, i64* %base, %ext + call void @llvm.masked.scatter.nxv2i64( %data, %ptrs, i32 0, %masks) + ret void +} + +; CHECK-LABEL: masked_scatter_nxv2f16_zext: +; CHECK-NEXT: st1h { z0.d }, p0, [x0, z1.d, uxtw #1] +; CHECK-NEXT: ret +define void @masked_scatter_nxv2f16_zext( %data, half* %base, %indexes, %masks) nounwind { + %ext = zext %indexes to + %ptrs = getelementptr half, half* %base, %ext + call void @llvm.masked.scatter.nxv2f16( %data, %ptrs, i32 0, %masks) + ret void +} + +; CHECK-LABEL: masked_scatter_nxv2bf16_zext: +; CHECK-NEXT: st1h { z0.d }, p0, [x0, z1.d, uxtw #1] +; CHECK-NEXT: ret +define void @masked_scatter_nxv2bf16_zext( %data, bfloat* %base, %indexes, %masks) nounwind #0 { + %ext = zext %indexes to + %ptrs = getelementptr bfloat, bfloat* %base, %ext + call void @llvm.masked.scatter.nxv2bf16( %data, %ptrs, i32 0, %masks) + ret void +} + +; CHECK-LABEL: masked_scatter_nxv2f32_zext: +; CHECK-NEXT: st1w { z0.d }, p0, [x0, z1.d, uxtw #2] +; CHECK-NEXT: ret +define void @masked_scatter_nxv2f32_zext( %data, float* %base, %indexes, %masks) nounwind { + %ext = zext %indexes to + %ptrs = getelementptr float, float* %base, %ext + call void @llvm.masked.scatter.nxv2f32( %data, %ptrs, i32 0, %masks) + ret void +} + +; CHECK-LABEL: masked_scatter_nxv2f64_zext: +; CHECK-NEXT: st1d { z0.d }, p0, [x0, z1.d, uxtw #3] +; CHECK-NEXT: ret +define void @masked_scatter_nxv2f64_zext( %data, double* %base, %indexes, %masks) nounwind { + %ext = zext %indexes to + %ptrs = getelementptr double, double* %base, %ext + call void @llvm.masked.scatter.nxv2f64( %data, %ptrs, i32 0, %masks) + ret void +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; scaled packed 32-bit offset +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; CHECK-LABEL: masked_scatter_nxv4i16_sext: +; CHECK-NEXT: st1h { z0.s }, p0, [x0, z1.s, sxtw #1] +; CHECK-NEXT: ret +define void @masked_scatter_nxv4i16_sext( %data, i16* %base, %indexes, %masks) nounwind { + %ext = sext %indexes to + %ptrs = getelementptr i16, i16* %base, %ext + call void @llvm.masked.scatter.nxv4i16( %data, %ptrs, i32 0, %masks) + ret void +} + +; CHECK-LABEL: masked_scatter_nxv4i32_sext: +; CHECK-NEXT: st1w { z0.s }, p0, [x0, z1.s, sxtw #2] +; CHECK-NEXT: ret +define void @masked_scatter_nxv4i32_sext( %data, i32* %base, %indexes, %masks) nounwind { + %ext = sext %indexes to + %ptrs = getelementptr i32, i32* %base, %ext + call void @llvm.masked.scatter.nxv4i32( %data, %ptrs, i32 0, %masks) + ret void +} + +; CHECK-LABEL: masked_scatter_nxv4f16_sext: +; CHECK-NEXT: st1h { z0.s }, p0, [x0, z1.s, sxtw #1] +; CHECK-NEXT: ret +define void @masked_scatter_nxv4f16_sext( %data, half* %base, %indexes, %masks) nounwind { + %ext = sext %indexes to + %ptrs = getelementptr half, half* %base, %ext + call void @llvm.masked.scatter.nxv4f16( %data, %ptrs, i32 0, %masks) + ret void +} + +; CHECK-LABEL: masked_scatter_nxv4bf16_sext: +; CHECK-NEXT: st1h { z0.s }, 
p0, [x0, z1.s, sxtw #1] +; CHECK-NEXT: ret +define void @masked_scatter_nxv4bf16_sext( %data, bfloat* %base, %indexes, %masks) nounwind #0 { + %ext = sext %indexes to + %ptrs = getelementptr bfloat, bfloat* %base, %ext + call void @llvm.masked.scatter.nxv4bf16( %data, %ptrs, i32 0, %masks) + ret void +} + +; CHECK-LABEL: masked_scatter_nxv4f32_sext: +; CHECK-NEXT: st1w { z0.s }, p0, [x0, z1.s, sxtw #2] +; CHECK-NEXT: ret +define void @masked_scatter_nxv4f32_sext( %data, float* %base, %indexes, %masks) nounwind #0 { + %ext = sext %indexes to + %ptrs = getelementptr float, float* %base, %ext + call void @llvm.masked.scatter.nxv4f32( %data, %ptrs, i32 0, %masks) + ret void +} + +; CHECK-LABEL: masked_scatter_nxv4i16_zext: +; CHECK-NEXT: st1h { z0.s }, p0, [x0, z1.s, uxtw #1] +; CHECK-NEXT: ret +define void @masked_scatter_nxv4i16_zext( %data, i16* %base, %indexes, %masks) nounwind { + %ext = zext %indexes to + %ptrs = getelementptr i16, i16* %base, %ext + call void @llvm.masked.scatter.nxv4i16( %data, %ptrs, i32 0, %masks) + ret void +} + +; CHECK-LABEL: masked_scatter_nxv4i32_zext: +; CHECK-NEXT: st1w { z0.s }, p0, [x0, z1.s, uxtw #2] +; CHECK-NEXT: ret +define void @masked_scatter_nxv4i32_zext( %data, i32* %base, %indexes, %masks) nounwind { + %ext = zext %indexes to + %ptrs = getelementptr i32, i32* %base, %ext + call void @llvm.masked.scatter.nxv4i32( %data, %ptrs, i32 0, %masks) + ret void +} + +; CHECK-LABEL: masked_scatter_nxv4f16_zext: +; CHECK-NEXT: st1h { z0.s }, p0, [x0, z1.s, uxtw #1] +; CHECK-NEXT: ret +define void @masked_scatter_nxv4f16_zext( %data, half* %base, %indexes, %masks) nounwind { + %ext = zext %indexes to + %ptrs = getelementptr half, half* %base, %ext + call void @llvm.masked.scatter.nxv4f16( %data, %ptrs, i32 0, %masks) + ret void +} + +; CHECK-LABEL: masked_scatter_nxv4bf16_zext: +; CHECK-NEXT: st1h { z0.s }, p0, [x0, z1.s, uxtw #1] +; CHECK-NEXT: ret +define void @masked_scatter_nxv4bf16_zext( %data, bfloat* %base, %indexes, %masks) nounwind #0 { + %ext = zext %indexes to + %ptrs = getelementptr bfloat, bfloat* %base, %ext + call void @llvm.masked.scatter.nxv4bf16( %data, %ptrs, i32 0, %masks) + ret void +} + +; CHECK-LABEL: masked_scatter_nxv4f32_zext: +; CHECK-NEXT: st1w { z0.s }, p0, [x0, z1.s, uxtw #2] +; CHECK-NEXT: ret +define void @masked_scatter_nxv4f32_zext( %data, float* %base, %indexes, %masks) nounwind #0 { + %ext = zext %indexes to + %ptrs = getelementptr float, float* %base, %ext + call void @llvm.masked.scatter.nxv4f32( %data, %ptrs, i32 0, %masks) + ret void +} + +declare void @llvm.masked.scatter.nxv2f16(, , i32, ) +declare void @llvm.masked.scatter.nxv4f16(, , i32, ) +declare void @llvm.masked.scatter.nxv4bf16(, , i32, ) +declare void @llvm.masked.scatter.nxv4f32(, , i32, ) +declare void @llvm.masked.scatter.nxv2bf16(, , i32, ) +declare void @llvm.masked.scatter.nxv2f32(, , i32, ) +declare void @llvm.masked.scatter.nxv2f64(, , i32, ) +declare void @llvm.masked.scatter.nxv2i16(, , i32, ) +declare void @llvm.masked.scatter.nxv2i32(, , i32, ) +declare void @llvm.masked.scatter.nxv2i64(, , i32, ) +declare void @llvm.masked.scatter.nxv2i8(, , i32, ) +declare void @llvm.masked.scatter.nxv4i16(, , i32, ) +declare void @llvm.masked.scatter.nxv4i32(, , i32, ) +declare void @llvm.masked.scatter.nxv4i8(, , i32, ) +attributes #0 = { "target-features"="+sve,+bf16" } Index: llvm/test/CodeGen/AArch64/sve-masked-scatter-32b-unscaled.ll =================================================================== --- /dev/null +++ 
llvm/test/CodeGen/AArch64/sve-masked-scatter-32b-unscaled.ll @@ -0,0 +1,332 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t -asm-verbose=0 | FileCheck %s + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; unscaled unpacked 32-bit offsets +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; CHECK-LABEL: masked_scatter_nxv2i8_sext_offsets: +; CHECK-NEXT: st1b { z0.d }, p0, [x0, z1.d, sxtw] +; CHECK-NEXT: ret +define void @masked_scatter_nxv2i8_sext_offsets( %data, i8* %base, %i32offsets, %masks) nounwind { + %offsets = sext %i32offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv2i8( %data, %ptrs, i32 0, %masks) + ret void +} + +; CHECK-LABEL: masked_scatter_nxv2i16_sext_offsets: +; CHECK-NEXT: st1h { z0.d }, p0, [x0, z1.d, sxtw] +; CHECK-NEXT: ret +define void @masked_scatter_nxv2i16_sext_offsets( %data, i8* %base, %i32offsets, %masks) nounwind { + %offsets = sext %i32offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv2i16( %data, %ptrs, i32 0, %masks) + ret void +} + +; CHECK-LABEL: masked_scatter_nxv2i32_sext_offsets: +; CHECK-NEXT: st1w { z0.d }, p0, [x0, z1.d, sxtw] +; CHECK-NEXT: ret +define void @masked_scatter_nxv2i32_sext_offsets( %data, i8* %base, %i32offsets, %masks) nounwind { + %offsets = sext %i32offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv2i32( %data, %ptrs, i32 0, %masks) + ret void +} + +; CHECK-LABEL: masked_scatter_nxv2i64_sext_offsets: +; CHECK-NEXT: st1d { z0.d }, p0, [x0, z1.d, sxtw] +; CHECK-NEXT: ret +define void @masked_scatter_nxv2i64_sext_offsets( %data, i8* %base, %i32offsets, %masks) nounwind { + %offsets = sext %i32offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv2i64( %data, %ptrs, i32 0, %masks) + ret void +} + +; CHECK-LABEL: masked_scatter_nxv2f16_sext_offsets: +; CHECK-NEXT: st1h { z0.d }, p0, [x0, z1.d, sxtw] +; CHECK-NEXT: ret +define void @masked_scatter_nxv2f16_sext_offsets( %data, i8* %base, %i32offsets, %masks) nounwind { + %offsets = sext %i32offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv2f16( %data, %ptrs, i32 0, %masks) + ret void +} + +; CHECK-LABEL: masked_scatter_nxv2bf16_sext_offsets: +; CHECK-NEXT: st1h { z0.d }, p0, [x0, z1.d, sxtw] +; CHECK-NEXT: ret +define void @masked_scatter_nxv2bf16_sext_offsets( %data, i8* %base, %i32offsets, %masks) nounwind #0 { + %offsets = sext %i32offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv2bf16( %data, %ptrs, i32 0, %masks) + ret void +} + +; CHECK-LABEL: masked_scatter_nxv2f32_sext_offsets: +; CHECK-NEXT: st1w { z0.d }, p0, [x0, z1.d, sxtw] +; CHECK-NEXT: ret +define void @masked_scatter_nxv2f32_sext_offsets( %data, i8* %base, %i32offsets, %masks) nounwind { + %offsets = sext %i32offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv2f32( %data, %ptrs, i32 0, %masks) + ret void +} + +; CHECK-LABEL: masked_scatter_nxv2f64_sext_offsets: +; CHECK-NEXT: st1d { z0.d }, p0, [x0, z1.d, sxtw] +; CHECK-NEXT: ret +define void @masked_scatter_nxv2f64_sext_offsets( 
%data, i8* %base, %i32offsets, %masks) nounwind { + %offsets = sext %i32offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv2f64( %data, %ptrs, i32 0, %masks) + ret void +} + +; CHECK-LABEL: masked_scatter_nxv2i8_zext_offsets: +; CHECK-NEXT: st1b { z0.d }, p0, [x0, z1.d, uxtw] +; CHECK-NEXT: ret +define void @masked_scatter_nxv2i8_zext_offsets( %data, i8* %base, %i32offsets, %masks) nounwind { + %offsets = zext %i32offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv2i8( %data, %ptrs, i32 0, %masks) + ret void +} + +; CHECK-LABEL: masked_scatter_nxv2i16_zext_offsets: +; CHECK-NEXT: st1h { z0.d }, p0, [x0, z1.d, uxtw] +; CHECK-NEXT: ret +define void @masked_scatter_nxv2i16_zext_offsets( %data, i8* %base, %i32offsets, %masks) nounwind { + %offsets = zext %i32offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv2i16( %data, %ptrs, i32 0, %masks) + ret void +} + +; CHECK-LABEL: masked_scatter_nxv2i32_zext_offsets: +; CHECK-NEXT: st1w { z0.d }, p0, [x0, z1.d, uxtw] +; CHECK-NEXT: ret +define void @masked_scatter_nxv2i32_zext_offsets( %data, i8* %base, %i32offsets, %masks) nounwind { + %offsets = zext %i32offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv2i32( %data, %ptrs, i32 0, %masks) + ret void +} + +; CHECK-LABEL: masked_scatter_nxv2i64_zext_offsets: +; CHECK-NEXT: st1d { z0.d }, p0, [x0, z1.d, uxtw] +; CHECK-NEXT: ret +define void @masked_scatter_nxv2i64_zext_offsets( %data, i8* %base, %i32offsets, %masks) nounwind { + %offsets = zext %i32offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv2i64( %data, %ptrs, i32 0, %masks) + ret void +} + +; CHECK-LABEL: masked_scatter_nxv2f16_zext_offsets: +; CHECK-NEXT: st1h { z0.d }, p0, [x0, z1.d, uxtw] +; CHECK-NEXT: ret +define void @masked_scatter_nxv2f16_zext_offsets( %data, i8* %base, %i32offsets, %masks) nounwind { + %offsets = zext %i32offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv2f16( %data, %ptrs, i32 0, %masks) + ret void +} + +; CHECK-LABEL: masked_scatter_nxv2bf16_zext_offsets: +; CHECK-NEXT: st1h { z0.d }, p0, [x0, z1.d, uxtw] +; CHECK-NEXT: ret +define void @masked_scatter_nxv2bf16_zext_offsets( %data, i8* %base, %i32offsets, %masks) nounwind #0 { + %offsets = zext %i32offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv2bf16( %data, %ptrs, i32 0, %masks) + ret void +} + +; CHECK-LABEL: masked_scatter_nxv2f32_zext_offsets: +; CHECK-NEXT: st1w { z0.d }, p0, [x0, z1.d, uxtw] +; CHECK-NEXT: ret +define void @masked_scatter_nxv2f32_zext_offsets( %data, i8* %base, %i32offsets, %masks) nounwind { + %offsets = zext %i32offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv2f32( %data, %ptrs, i32 0, %masks) + ret void +} + +; CHECK-LABEL: masked_scatter_nxv2f64_zext_offsets: +; CHECK-NEXT: st1d { z0.d }, p0, [x0, z1.d, uxtw] +; CHECK-NEXT: ret +define void @masked_scatter_nxv2f64_zext_offsets( %data, i8* %base, %i32offsets, %masks) nounwind { + %offsets = zext %i32offsets to + %byte_ptrs = getelementptr i8, i8* 
%base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv2f64( %data, %ptrs, i32 0, %masks) + ret void +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; unscaled packed 32-bit offsets +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; CHECK-LABEL: masked_scatter_nxv4i8_sext_offsets: +; CHECK-NEXT: st1b { z0.s }, p0, [x0, z1.s, sxtw] +; CHECK-NEXT: ret +define void @masked_scatter_nxv4i8_sext_offsets( %data, i8* %base, %i32offsets, %masks) nounwind { + %offsets = sext %i32offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv4i8( %data, %ptrs, i32 0, %masks) + ret void +} + +; CHECK-LABEL: masked_scatter_nxv4i16_sext_offsets: +; CHECK-NEXT: st1h { z0.s }, p0, [x0, z1.s, sxtw] +; CHECK-NEXT: ret +define void @masked_scatter_nxv4i16_sext_offsets( %data, i8* %base, %i32offsets, %masks) nounwind { + %offsets = sext %i32offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv4i16( %data, %ptrs, i32 0, %masks) + ret void +} + +; CHECK-LABEL: masked_scatter_nxv4i32_sext_offsets: +; CHECK-NEXT: st1w { z0.s }, p0, [x0, z1.s, sxtw] +; CHECK-NEXT: ret +define void @masked_scatter_nxv4i32_sext_offsets( %data, i8* %base, %i32offsets, %masks) nounwind { + %offsets = sext %i32offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv4i32( %data, %ptrs, i32 0, %masks) + ret void +} + +; CHECK-LABEL: masked_scatter_nxv4f16_sext_offsets: +; CHECK-NEXT: st1h { z0.s }, p0, [x0, z1.s, sxtw] +; CHECK-NEXT: ret +define void @masked_scatter_nxv4f16_sext_offsets( %data, i8* %base, %i32offsets, %masks) nounwind { + %offsets = sext %i32offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv4f16( %data, %ptrs, i32 0, %masks) + ret void +} + +; CHECK-LABEL: masked_scatter_nxv4bf16_sext_offsets: +; CHECK-NEXT: st1h { z0.s }, p0, [x0, z1.s, sxtw] +; CHECK-NEXT: ret +define void @masked_scatter_nxv4bf16_sext_offsets( %data, i8* %base, %i32offsets, %masks) nounwind #0 { + %offsets = sext %i32offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv4bf16( %data, %ptrs, i32 0, %masks) + ret void +} + +; CHECK-LABEL: masked_scatter_nxv4f32_sext_offsets: +; CHECK-NEXT: st1w { z0.s }, p0, [x0, z1.s, sxtw] +; CHECK-NEXT: ret +define void @masked_scatter_nxv4f32_sext_offsets( %data, i8* %base, %i32offsets, %masks) nounwind #0 { + %offsets = sext %i32offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv4f32( %data, %ptrs, i32 0, %masks) + ret void +} + +; CHECK-LABEL: masked_scatter_nxv4i8_zext_offsets: +; CHECK-NEXT: st1b { z0.s }, p0, [x0, z1.s, uxtw] +; CHECK-NEXT: ret +define void @masked_scatter_nxv4i8_zext_offsets( %data, i8* %base, %i32offsets, %masks) nounwind { + %offsets = zext %i32offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv4i8( %data, %ptrs, i32 0, %masks) + ret void +} + +; CHECK-LABEL: masked_scatter_nxv4i16_zext_offsets: +; CHECK-NEXT: st1h { z0.s }, p0, [x0, z1.s, uxtw] +; CHECK-NEXT: ret +define void @masked_scatter_nxv4i16_zext_offsets( %data, i8* %base, %i32offsets, %masks) 
nounwind { + %offsets = zext %i32offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv4i16( %data, %ptrs, i32 0, %masks) + ret void +} + +; CHECK-LABEL: masked_scatter_nxv4i32_zext_offsets: +; CHECK-NEXT: st1w { z0.s }, p0, [x0, z1.s, uxtw] +; CHECK-NEXT: ret +define void @masked_scatter_nxv4i32_zext_offsets( %data, i8* %base, %i32offsets, %masks) nounwind { + %offsets = zext %i32offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv4i32( %data, %ptrs, i32 0, %masks) + ret void +} + +; CHECK-LABEL: masked_scatter_nxv4f16_zext_offsets: +; CHECK-NEXT: st1h { z0.s }, p0, [x0, z1.s, uxtw] +; CHECK-NEXT: ret +define void @masked_scatter_nxv4f16_zext_offsets( %data, i8* %base, %i32offsets, %masks) nounwind { + %offsets = zext %i32offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv4f16( %data, %ptrs, i32 0, %masks) + ret void +} + +; CHECK-LABEL: masked_scatter_nxv4bf16_zext_offsets: +; CHECK-NEXT: st1h { z0.s }, p0, [x0, z1.s, uxtw] +; CHECK-NEXT: ret +define void @masked_scatter_nxv4bf16_zext_offsets( %data, i8* %base, %i32offsets, %masks) nounwind #0 { + %offsets = zext %i32offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv4bf16( %data, %ptrs, i32 0, %masks) + ret void +} + +; CHECK-LABEL: masked_scatter_nxv4f32_zext_offsets: +; CHECK-NEXT: st1w { z0.s }, p0, [x0, z1.s, uxtw] +; CHECK-NEXT: ret +define void @masked_scatter_nxv4f32_zext_offsets( %data, i8* %base, %i32offsets, %masks) nounwind #0 { + %offsets = zext %i32offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv4f32( %data, %ptrs, i32 0, %masks) + ret void +} + +declare void @llvm.masked.scatter.nxv2f16(, , i32, ) +declare void @llvm.masked.scatter.nxv4f16(, , i32, ) +declare void @llvm.masked.scatter.nxv4bf16(, , i32, ) +declare void @llvm.masked.scatter.nxv4f32(, , i32, ) +declare void @llvm.masked.scatter.nxv2bf16(, , i32, ) +declare void @llvm.masked.scatter.nxv2f32(, , i32, ) +declare void @llvm.masked.scatter.nxv2f64(, , i32, ) +declare void @llvm.masked.scatter.nxv2i16(, , i32, ) +declare void @llvm.masked.scatter.nxv2i32(, , i32, ) +declare void @llvm.masked.scatter.nxv2i64(, , i32, ) +declare void @llvm.masked.scatter.nxv2i8(, , i32, ) +declare void @llvm.masked.scatter.nxv4i16(, , i32, ) +declare void @llvm.masked.scatter.nxv4i32(, , i32, ) +declare void @llvm.masked.scatter.nxv4i8(, , i32, ) +attributes #0 = { "target-features"="+sve,+bf16" } Index: llvm/test/CodeGen/AArch64/sve-masked-scatter-64b-scaled.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/sve-masked-scatter-64b-scaled.ll @@ -0,0 +1,66 @@ +; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; scaled 64-bit offsets +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +define void @masked_scatter_nxv2i16( %data, i16* %base, %offsets, %mask) { +; CHECK-LABEL: masked_scatter_nxv2i16: +; CHECK: st1h { z0.d }, p0, [x0, z1.d, lsl #1] +; CHECK-NEXT: ret + %ptrs = getelementptr i16, i16* %base, %offsets + call void @llvm.masked.scatter.nxv2i16( %data, %ptrs, i32 2, %mask) + ret 
void +} + +define void @masked_scatter_nxv2i32( %data, i32* %base, %offsets, %mask) { +; CHECK-LABEL: masked_scatter_nxv2i32: +; CHECK: st1w { z0.d }, p0, [x0, z1.d, lsl #2] +; CHECK-NEXT: ret + %ptrs = getelementptr i32, i32* %base, %offsets + call void @llvm.masked.scatter.nxv2i32( %data, %ptrs, i32 4, %mask) + ret void +} + +define void @masked_scatter_nxv2i64( %data, i64* %base, %offsets, %mask) { +; CHECK-LABEL: masked_scatter_nxv2i64: +; CHECK: st1d { z0.d }, p0, [x0, z1.d, lsl #3] +; CHECK-NEXT: ret + %ptrs = getelementptr i64, i64* %base, %offsets + call void @llvm.masked.scatter.nxv2i64( %data, %ptrs, i32 8, %mask) + ret void +} + +define void @masked_scatter_nxv2f16( %data, half* %base, %offsets, %mask) { +; CHECK-LABEL: masked_scatter_nxv2f16: +; CHECK: st1h { z0.d }, p0, [x0, z1.d, lsl #1] +; CHECK-NEXT: ret + %ptrs = getelementptr half, half* %base, %offsets + call void @llvm.masked.scatter.nxv2f16( %data, %ptrs, i32 2, %mask) + ret void +} + +define void @masked_scatter_nxv2f32( %data, float* %base, %offsets, %mask) { +; CHECK-LABEL: masked_scatter_nxv2f32: +; CHECK: st1w { z0.d }, p0, [x0, z1.d, lsl #2] +; CHECK-NEXT: ret + %ptrs = getelementptr float, float* %base, %offsets + call void @llvm.masked.scatter.nxv2f32( %data, %ptrs, i32 4, %mask) + ret void +} + +define void @masked_scatter_nxv2f64( %data, double* %base, %offsets, %mask) { +; CHECK-LABEL: masked_scatter_nxv2f64: +; CHECK: st1d { z0.d }, p0, [x0, z1.d, lsl #3] +; CHECK-NEXT: ret + %ptrs = getelementptr double, double* %base, %offsets + call void @llvm.masked.scatter.nxv2f64( %data, %ptrs, i32 8, %mask) + ret void +} + +declare void @llvm.masked.scatter.nxv2i16(, , i32, ) +declare void @llvm.masked.scatter.nxv2i32(, , i32, ) +declare void @llvm.masked.scatter.nxv2i64(, , i32, ) +declare void @llvm.masked.scatter.nxv2f16(, , i32, ) +declare void @llvm.masked.scatter.nxv2f32(, , i32, ) +declare void @llvm.masked.scatter.nxv2f64(, , i32, ) Index: llvm/test/CodeGen/AArch64/sve-masked-scatter-64b-unscaled.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/sve-masked-scatter-64b-unscaled.ll @@ -0,0 +1,99 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t -asm-verbose=0 | FileCheck %s + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; unscaled 64-bit offsets +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; CHECK-LABEL: masked_scatter_nxv2i8_unscaled_64bit_offsets: +; CHECK-NEXT: st1b { z0.d }, p0, [x0, z1.d] +; CHECK-NEXT: ret +define void @masked_scatter_nxv2i8_unscaled_64bit_offsets( %data, i8* %base, %offsets, %masks) nounwind { + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv2i8( %data, %ptrs, i32 0, %masks) + ret void +} + +; CHECK-LABEL: masked_scatter_nxv2i16_unscaled_64bit_offsets: +; CHECK-NEXT: st1h { z0.d }, p0, [x0, z1.d] +; CHECK-NEXT: ret +define void @masked_scatter_nxv2i16_unscaled_64bit_offsets( %data, i8* %base, %offsets, %masks) nounwind { + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv2i16( %data, %ptrs, i32 0, %masks) + ret void +} + +; CHECK-LABEL: masked_scatter_nxv2i32_unscaled_64bit_offsets: +; CHECK-NEXT: st1w { z0.d }, p0, [x0, z1.d] +; CHECK-NEXT: ret +define void @masked_scatter_nxv2i32_unscaled_64bit_offsets( %data, i8* %base, %offsets, %masks) nounwind { + %byte_ptrs = getelementptr i8, i8* %base, 
%offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv2i32( %data, %ptrs, i32 0, %masks) + ret void +} + +; CHECK-LABEL: masked_scatter_nxv2i64_unscaled_64bit_offsets: +; CHECK-NEXT: st1d { z0.d }, p0, [x0, z1.d] +; CHECK-NEXT: ret +define void @masked_scatter_nxv2i64_unscaled_64bit_offsets( %data, i8* %base, %offsets, %masks) nounwind { + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv2i64( %data, %ptrs, i32 0, %masks) + ret void +} + +; CHECK-LABEL: masked_scatter_nxv2f16_unscaled_64bit_offsets: +; CHECK-NEXT: st1h { z0.d }, p0, [x0, z1.d] +; CHECK-NEXT: ret +define void @masked_scatter_nxv2f16_unscaled_64bit_offsets( %data, i8* %base, %offsets, %masks) nounwind { + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv2f16( %data, %ptrs, i32 0, %masks) + ret void +} + +; CHECK-LABEL: masked_scatter_nxv2bf16_unscaled_64bit_offsets: +; CHECK-NEXT: st1h { z0.d }, p0, [x0, z1.d] +; CHECK-NEXT: ret +define void @masked_scatter_nxv2bf16_unscaled_64bit_offsets( %data, i8* %base, %offsets, %masks) nounwind #0 { + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv2bf16( %data, %ptrs, i32 0, %masks) + ret void +} + +; CHECK-LABEL: masked_scatter_nxv2f32_unscaled_64bit_offsets: +; CHECK-NEXT: st1w { z0.d }, p0, [x0, z1.d] +; CHECK-NEXT: ret +define void @masked_scatter_nxv2f32_unscaled_64bit_offsets( %data, i8* %base, %offsets, %masks) nounwind #0 { + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv2f32( %data, %ptrs, i32 0, %masks) + ret void +} + +; CHECK-LABEL: masked_scatter_nxv2f64_unscaled_64bit_offsets: +; CHECK-NEXT: st1d { z0.d }, p0, [x0, z1.d] +; CHECK-NEXT: ret +define void @masked_scatter_nxv2f64_unscaled_64bit_offsets( %data, i8* %base, %offsets, %masks) nounwind #0 { + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv2f64( %data, %ptrs, i32 0, %masks) + ret void +} + +declare void @llvm.masked.scatter.nxv2f16(, , i32, ) +declare void @llvm.masked.scatter.nxv4f16(, , i32, ) +declare void @llvm.masked.scatter.nxv2bf16(, , i32, ) +declare void @llvm.masked.scatter.nxv2f32(, , i32, ) +declare void @llvm.masked.scatter.nxv2f64(, , i32, ) +declare void @llvm.masked.scatter.nxv2i16(, , i32, ) +declare void @llvm.masked.scatter.nxv2i32(, , i32, ) +declare void @llvm.masked.scatter.nxv2i64(, , i32, ) +declare void @llvm.masked.scatter.nxv2i8(, , i32, ) +declare void @llvm.masked.scatter.nxv4i16(, , i32, ) +declare void @llvm.masked.scatter.nxv4i32(, , i32, ) +declare void @llvm.masked.scatter.nxv4i8(, , i32, ) +attributes #0 = { "target-features"="+sve,+bf16" } Index: llvm/test/CodeGen/AArch64/sve-masked-scatter-legalise.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/sve-masked-scatter-legalise.ll @@ -0,0 +1,59 @@ +; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s + +; Tests that exercise various type legalisation scenarios for ISD::MSCATTER. + +; Code generate the scenario where the offset vector type is illegal. 
+define void @masked_scatter_nxv16i8( %data, i8* %base, %offsets, %mask) { +; CHECK-LABEL: masked_scatter_nxv16i8: +; CHECK-DAG: st1b { {{z[0-9]+}}.s }, {{p[0-9]+}}, [x0, {{z[0-9]+}}.s, sxtw] +; CHECK-DAG: st1b { {{z[0-9]+}}.s }, {{p[0-9]+}}, [x0, {{z[0-9]+}}.s, sxtw] +; CHECK-DAG: st1b { {{z[0-9]+}}.s }, {{p[0-9]+}}, [x0, {{z[0-9]+}}.s, sxtw] +; CHECK-DAG: st1b { {{z[0-9]+}}.s }, {{p[0-9]+}}, [x0, {{z[0-9]+}}.s, sxtw] +; CHECK: ret + %ptrs = getelementptr i8, i8* %base, %offsets + call void @llvm.masked.scatter.nxv16i8( %data, %ptrs, i32 1, %mask) + ret void +} + +define void @masked_scatter_nxv8i16( %data, i16* %base, %offsets, %mask) { +; CHECK-LABEL: masked_scatter_nxv8i16 +; CHECK-DAG: st1h { {{z[0-9]+}}.s }, {{p[0-9]+}}, [x0, {{z[0-9]+}}.s, sxtw #1] +; CHECK-DAG: st1h { {{z[0-9]+}}.s }, {{p[0-9]+}}, [x0, {{z[0-9]+}}.s, sxtw #1] +; CHECK: ret + %ptrs = getelementptr i16, i16* %base, %offsets + call void @llvm.masked.scatter.nxv8i16( %data, %ptrs, i32 1, %mask) + ret void +} + +define void @masked_scatter_nxv8f32( %data, float* %base, %indexes, %masks) { +; CHECK-LABEL: masked_scatter_nxv8f32 +; CHECK-DAG: st1w { z0.s }, {{p[0-9]+}}, [x0, {{z[0-9]+}}.s, uxtw #2] +; CHECK-DAG: st1w { z1.s }, {{p[0-9]+}}, [x0, {{z[0-9]+}}.s, uxtw #2] + %ext = zext %indexes to + %ptrs = getelementptr float, float* %base, %ext + call void @llvm.masked.scatter.nxv8f32( %data, %ptrs, i32 0, %masks) + ret void +} + +; Code generate the worst case scenario when all vector types are illegal. +define void @masked_scatter_nxv32i32( %data, i32* %base, %offsets, %mask) { +; CHECK-LABEL: masked_scatter_nxv32i32: +; CHECK-NOT: unpkhi +; CHECK-DAG: st1w { z0.s }, {{p[0-9]+}}, [x0, {{z[0-9]+}}.s, sxtw #2] +; CHECK-DAG: st1w { z1.s }, {{p[0-9]+}}, [x0, {{z[0-9]+}}.s, sxtw #2] +; CHECK-DAG: st1w { z2.s }, {{p[0-9]+}}, [x0, {{z[0-9]+}}.s, sxtw #2] +; CHECK-DAG: st1w { z3.s }, {{p[0-9]+}}, [x0, {{z[0-9]+}}.s, sxtw #2] +; CHECK-DAG: st1w { z4.s }, {{p[0-9]+}}, [x0, {{z[0-9]+}}.s, sxtw #2] +; CHECK-DAG: st1w { z5.s }, {{p[0-9]+}}, [x0, {{z[0-9]+}}.s, sxtw #2] +; CHECK-DAG: st1w { z6.s }, {{p[0-9]+}}, [x0, {{z[0-9]+}}.s, sxtw #2] +; CHECK-DAG: st1w { z7.s }, {{p[0-9]+}}, [x0, {{z[0-9]+}}.s, sxtw #2] +; CHECK: ret + %ptrs = getelementptr i32, i32* %base, %offsets + call void @llvm.masked.scatter.nxv32i32( %data, %ptrs, i32 4, %mask) + ret void +} + +declare void @llvm.masked.scatter.nxv16i8(, , i32, ) +declare void @llvm.masked.scatter.nxv8i16(, , i32, ) +declare void @llvm.masked.scatter.nxv8f32(, , i32, ) +declare void @llvm.masked.scatter.nxv32i32(, , i32, )
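; Illustrative sketch (not part of the patch above): how the new truncating
; scatter path can be reached. For <vscale x 2 x i8> data the stored value is
; integer-promoted during type legalisation, so PromoteIntOp_MSCATTER rebuilds
; the node with TruncateStore = true while the memory type stays nxv2i8; the
; AArch64 lowering is then expected to keep the byte-sized memory type and
; select an st1b, as in the nxv2i8 tests earlier in this patch. The function
; name below is hypothetical and this is a sketch, not a committed test.
define void @masked_scatter_nxv2i8_trunc_sketch(<vscale x 2 x i8> %data, i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
  %ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
  call void @llvm.masked.scatter.nxv2i8(<vscale x 2 x i8> %data, <vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask)
  ret void
}
declare void @llvm.masked.scatter.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8*>, i32, <vscale x 2 x i1>)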