Index: llvm/include/llvm/CodeGen/SelectionDAGNodes.h =================================================================== --- llvm/include/llvm/CodeGen/SelectionDAGNodes.h +++ llvm/include/llvm/CodeGen/SelectionDAGNodes.h @@ -2391,6 +2391,9 @@ ISD::MemIndexType getIndexType() const { return static_cast<ISD::MemIndexType>(LSBaseSDNodeBits.AddressingMode); } + void setIndexType(ISD::MemIndexType IndexType) { + LSBaseSDNodeBits.AddressingMode = IndexType; + } bool isIndexScaled() const { return (getIndexType() == ISD::SIGNED_SCALED) || (getIndexType() == ISD::UNSIGNED_SCALED); Index: llvm/include/llvm/CodeGen/TargetLowering.h =================================================================== --- llvm/include/llvm/CodeGen/TargetLowering.h +++ llvm/include/llvm/CodeGen/TargetLowering.h @@ -4504,6 +4504,10 @@ // combiner can fold the new nodes. SDValue lowerCmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) const; + /// Give targets the chance to reduce the number of distinct addressing modes. + ISD::MemIndexType getCanonicalIndexType(ISD::MemIndexType IndexType, + EVT MemVT, SDValue Offsets) const; + private: SDValue foldSetCCWithAnd(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond, const SDLoc &DL, DAGCombinerInfo &DCI) const; Index: llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -1865,6 +1865,8 @@ else NewOps[OpNo] = ZExtPromotedInteger(N->getOperand(OpNo)); + N->setIndexType(TLI.getCanonicalIndexType(N->getIndexType(), + N->getMemoryVT(), NewOps[OpNo])); } else { NewOps[OpNo] = GetPromotedInteger(N->getOperand(OpNo)); TruncateStore = true; Index: llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -2500,11 
+2500,15 @@ SDValue Index = N->getIndex(); SDValue Scale = N->getScale(); SDValue Data = N->getValue(); + EVT MemoryVT = N->getMemoryVT(); Align Alignment = N->getOriginalAlign(); SDLoc DL(N); // Split all operands + EVT LoMemVT, HiMemVT; + std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); + SDValue DataLo, DataHi; if (getTypeAction(Data.getValueType()) == TargetLowering::TypeSplitVector) // Split Data operand @@ -2535,7 +2539,7 @@ MemoryLocation::UnknownSize, Alignment, N->getAAInfo(), N->getRanges()); SDValue OpsLo[] = {Ch, DataLo, MaskLo, Ptr, IndexLo, Scale}; - Lo = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), DataLo.getValueType(), + Lo = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), LoMemVT, DL, OpsLo, MMO, N->getIndexType(), N->isTruncatingStore()); @@ -2543,7 +2547,7 @@ // part comes after the "Lo". So these two operations should be chained one // after another. SDValue OpsHi[] = {Lo, DataHi, MaskHi, Ptr, IndexHi, Scale}; - return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), DataHi.getValueType(), + return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), HiMemVT, DL, OpsHi, MMO, N->getIndexType(), N->isTruncatingStore()); } Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -7402,15 +7402,20 @@ return SDValue(E, 0); } + IndexType = TLI->getCanonicalIndexType(IndexType, VT, Ops[4]); auto *N = newSDNode<MaskedScatterSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs, IsTrunc, VT, MMO, IndexType); createOperands(N, Ops); - assert(N->getMask().getValueType().getVectorNumElements() == - N->getValue().getValueType().getVectorNumElements() && + assert(N->getMask().getValueType().getVectorElementCount() == + N->getValue().getValueType().getVectorElementCount() && "Vector width mismatch between mask and data"); - assert(N->getIndex().getValueType().getVectorNumElements() >= - 
N->getValue().getValueType().getVectorNumElements() && + assert(N->getIndex().getValueType().getVectorElementCount().isScalable() == + N->getValue().getValueType().getVectorElementCount().isScalable() && + "Scalable flags of index and data do not match"); + assert(ElementCount::isKnownGE( + N->getIndex().getValueType().getVectorElementCount(), + N->getValue().getValueType().getVectorElementCount()) && "Vector width mismatch between index and data"); assert(isa<ConstantSDNode>(N->getScale()) && cast<ConstantSDNode>(N->getScale())->getAPIntValue().isPowerOf2() && Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -4297,7 +4297,7 @@ if (!UniformBase) { Base = DAG.getConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout())); Index = getValue(Ptr); - IndexType = ISD::SIGNED_SCALED; + IndexType = ISD::SIGNED_UNSCALED; Scale = DAG.getTargetConstant(1, sdl, TLI.getPointerTy(DAG.getDataLayout())); } SDValue Ops[] = { getMemoryRoot(), Src0, Mask, Base, Index, Scale }; Index: llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -7433,6 +7433,26 @@ return SDValue(); } +// Convert redundant addressing modes (e.g. scaling is redundant +// when accessing bytes). +ISD::MemIndexType TargetLowering::getCanonicalIndexType( + ISD::MemIndexType IndexType, + EVT MemVT, + SDValue Offsets) const { + bool isScaledIndex = (IndexType == ISD::SIGNED_SCALED) || + (IndexType == ISD::UNSIGNED_SCALED); + bool isSignedIndex = (IndexType == ISD::SIGNED_SCALED) || + (IndexType == ISD::SIGNED_UNSCALED); + + // Scaling is unimportant for bytes, canonicalize to unscaled. 
+ if (isScaledIndex && MemVT.getScalarType() == MVT::i8) { + isScaledIndex = false; + IndexType = isSignedIndex ? ISD::SIGNED_UNSCALED : ISD::UNSIGNED_UNSCALED; + } + + return IndexType; +} + SDValue TargetLowering::expandAddSubSat(SDNode *Node, SelectionDAG &DAG) const { unsigned Opcode = Node->getOpcode(); SDValue LHS = Node->getOperand(0); Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -807,6 +807,8 @@ SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerMSCATTER(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; bool isEligibleForTailCallOptimization( Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1001,6 +1001,7 @@ setOperationAction(ISD::SINT_TO_FP, VT, Custom); setOperationAction(ISD::FP_TO_UINT, VT, Custom); setOperationAction(ISD::FP_TO_SINT, VT, Custom); + setOperationAction(ISD::MSCATTER, VT, Custom); setOperationAction(ISD::MUL, VT, Custom); setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); @@ -1052,6 +1053,7 @@ MVT::nxv4f32, MVT::nxv2f64}) { setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); + setOperationAction(ISD::MSCATTER, VT, Custom); setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); setOperationAction(ISD::SELECT, VT, Custom); setOperationAction(ISD::FADD, VT, Custom); @@ -1073,6 +1075,9 @@ setOperationAction(ISD::FP_ROUND, VT, Custom); } + for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) + setOperationAction(ISD::MSCATTER, VT, Custom); + setOperationAction(ISD::SPLAT_VECTOR, MVT::nxv8bf16, Custom); 
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom); @@ -3705,6 +3710,99 @@ return ExtVal.getValueType().isScalableVector(); } +unsigned getScatterVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) { + std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = { + {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false), + AArch64ISD::SST1_PRED}, + {std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true), + AArch64ISD::SST1_UXTW_PRED}, + {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ false), + AArch64ISD::SST1_PRED}, + {std::make_tuple(/*Scaled*/ false, /*Signed*/ true, /*Extend*/ true), + AArch64ISD::SST1_SXTW_PRED}, + {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ false), + AArch64ISD::SST1_SCALED_PRED}, + {std::make_tuple(/*Scaled*/ true, /*Signed*/ false, /*Extend*/ true), + AArch64ISD::SST1_UXTW_SCALED_PRED}, + {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ false), + AArch64ISD::SST1_SCALED_PRED}, + {std::make_tuple(/*Scaled*/ true, /*Signed*/ true, /*Extend*/ true), + AArch64ISD::SST1_SXTW_SCALED_PRED}, + }; + auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend); + return AddrModes.find(Key)->second; +} + +bool getScatterIndexIsExtended(SDValue Index) { + unsigned Opcode = Index.getOpcode(); + if (Opcode == ISD::SIGN_EXTEND_INREG) + return true; + + if (Opcode == ISD::AND) { + SDValue Splat = Index.getOperand(1); + if (Splat.getOpcode() != ISD::SPLAT_VECTOR) + return false; + ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(Splat.getOperand(0)); + if (!Mask || Mask->getZExtValue() != 0xFFFFFFFF) + return false; + return true; + } + + return false; +} + +SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op); + assert(MSC && "Can only custom lower scatter store nodes"); + + SDValue Index = MSC->getIndex(); + SDValue Chain = MSC->getChain(); + SDValue StoreVal = MSC->getValue(); + SDValue Mask = MSC->getMask(); + SDValue 
BasePtr = MSC->getBasePtr(); + + ISD::MemIndexType IndexType = MSC->getIndexType(); + bool IsScaled = + IndexType == ISD::SIGNED_SCALED || IndexType == ISD::UNSIGNED_SCALED; + bool IsSigned = + IndexType == ISD::SIGNED_SCALED || IndexType == ISD::SIGNED_UNSCALED; + bool NeedsExtend = getScatterIndexIsExtended(Index) || + Index.getSimpleValueType().getVectorElementType() == MVT::i32; + + EVT VT = StoreVal.getSimpleValueType(); + SDVTList VTs = DAG.getVTList(MVT::Other); + EVT MemVT = MSC->getMemoryVT(); + SDValue InputVT = DAG.getValueType(MemVT); + + if (VT.getVectorElementType() == MVT::bf16 && + !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16()) + return SDValue(); + + // Handle FP data + if (VT.isFloatingPoint()) { + VT = VT.changeVectorElementTypeToInteger(); + ElementCount EC = VT.getVectorElementCount(); + auto ScalarIntVT = + MVT::getIntegerVT(AArch64::SVEBitsPerBlock / EC.getKnownMinValue()); + StoreVal = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, + MVT::getVectorVT(ScalarIntVT, EC), StoreVal); + + InputVT = DAG.getValueType(MemVT.changeVectorElementTypeToInteger()); + } + + if (getScatterIndexIsExtended(Index)) { + if (Index.getOpcode() == ISD::AND) + IsSigned = false; + Index = Index.getOperand(0); + } + + SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, InputVT}; + return DAG.getNode(getScatterVecOpcode(IsScaled, IsSigned, NeedsExtend), DL, + VTs, Ops); +} + // Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16. 
static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST, EVT VT, EVT MemVT, @@ -3982,6 +4080,8 @@ return LowerINTRINSIC_WO_CHAIN(Op, DAG); case ISD::STORE: return LowerSTORE(Op, DAG); + case ISD::MSCATTER: + return LowerMSCATTER(Op, DAG); case ISD::VECREDUCE_SEQ_FADD: return LowerVECREDUCE_SEQ_FADD(Op, DAG); case ISD::VECREDUCE_ADD: Index: llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td =================================================================== --- llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -1191,6 +1191,13 @@ def : Pat<(nxv2f16 (extract_subvector (nxv4f16 ZPR:$Zs), (i64 2))), (UUNPKHI_ZZ_D ZPR:$Zs)>; + let Predicates = [HasSVE, HasBF16] in { + def : Pat<(nxv2bf16 (extract_subvector (nxv4bf16 ZPR:$Zs), (i64 0))), + (UUNPKLO_ZZ_D ZPR:$Zs)>; + def : Pat<(nxv2bf16 (extract_subvector (nxv4bf16 ZPR:$Zs), (i64 2))), + (UUNPKHI_ZZ_D ZPR:$Zs)>; + } + def : Pat<(nxv4f16 (extract_subvector (nxv8f16 ZPR:$Zs), (i64 0))), (UUNPKLO_ZZ_S ZPR:$Zs)>; def : Pat<(nxv4f16 (extract_subvector (nxv8f16 ZPR:$Zs), (i64 4))), @@ -1769,6 +1776,16 @@ def : Pat<(nxv2i1 (reinterpret_cast (nxv8i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; def : Pat<(nxv2i1 (reinterpret_cast (nxv4i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>; + def : Pat<(nxv2i64 (reinterpret_cast (nxv2f64 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>; + def : Pat<(nxv2i64 (reinterpret_cast (nxv2f32 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>; + def : Pat<(nxv2i64 (reinterpret_cast (nxv2f16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>; + def : Pat<(nxv4i32 (reinterpret_cast (nxv4f32 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>; + def : Pat<(nxv4i32 (reinterpret_cast (nxv4f16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>; + let Predicates = [HasSVE, HasBF16] in { + def : Pat<(nxv2i64 (reinterpret_cast (nxv2bf16 ZPR:$src))), (COPY_TO_REGCLASS ZPR:$src, ZPR)>; + def : Pat<(nxv4i32 (reinterpret_cast (nxv4bf16 ZPR:$src))), (COPY_TO_REGCLASS 
ZPR:$src, ZPR)>; + } + def : Pat<(nxv16i1 (and PPR:$Ps1, PPR:$Ps2)), (AND_PPzPP (PTRUE_B 31), PPR:$Ps1, PPR:$Ps2)>; def : Pat<(nxv8i1 (and PPR:$Ps1, PPR:$Ps2)), Index: llvm/test/CodeGen/AArch64/sve-masked-scatter-32b-scaled.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/sve-masked-scatter-32b-scaled.ll @@ -0,0 +1,370 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; scaled unpacked 32-bit offsets +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +define void @masked_scatter_nxv2i16_sext( %data, i16* %base, %indexes, %masks) nounwind { +; CHECK-LABEL: masked_scatter_nxv2i16_sext: +; CHECK: // %bb.0: +; CHECK-NEXT: st1h { z0.d }, p0, [x0, z1.d, sxtw #1] +; CHECK-NEXT: ret + %ext = sext %indexes to + %ptrs = getelementptr i16, i16* %base, %ext + call void @llvm.masked.scatter.nxv2i16( %data, %ptrs, i32 0, %masks) + ret void +} + +define void @masked_scatter_nxv2i32_sext( %data, i32* %base, %indexes, %masks) nounwind { +; CHECK-LABEL: masked_scatter_nxv2i32_sext: +; CHECK: // %bb.0: +; CHECK-NEXT: st1w { z0.d }, p0, [x0, z1.d, sxtw #2] +; CHECK-NEXT: ret + %ext = sext %indexes to + %ptrs = getelementptr i32, i32* %base, %ext + call void @llvm.masked.scatter.nxv2i32( %data, %ptrs, i32 0, %masks) + ret void +} + +define void @masked_scatter_nxv2i64_sext( %data, i64* %base, %indexes, %masks) nounwind { +; CHECK-LABEL: masked_scatter_nxv2i64_sext: +; CHECK: // %bb.0: +; CHECK-NEXT: st1d { z0.d }, p0, [x0, z1.d, sxtw #3] +; CHECK-NEXT: ret + %ext = sext %indexes to + %ptrs = getelementptr i64, i64* %base, %ext + call void @llvm.masked.scatter.nxv2i64( %data, %ptrs, i32 0, %masks) + ret void +} + +define void @masked_scatter_nxv2f16_sext( %data, half* %base, %indexes, %masks) nounwind { 
+; CHECK-LABEL: masked_scatter_nxv2f16_sext: +; CHECK: // %bb.0: +; CHECK-NEXT: st1h { z0.d }, p0, [x0, z1.d, sxtw #1] +; CHECK-NEXT: ret + %ext = sext %indexes to + %ptrs = getelementptr half, half* %base, %ext + call void @llvm.masked.scatter.nxv2f16( %data, %ptrs, i32 0, %masks) + ret void +} + +define void @masked_scatter_nxv2bf16_sext( %data, bfloat* %base, %indexes, %masks) nounwind #0 { +; CHECK-LABEL: masked_scatter_nxv2bf16_sext: +; CHECK: // %bb.0: +; CHECK-NEXT: st1h { z0.d }, p0, [x0, z1.d, sxtw #1] +; CHECK-NEXT: ret + %ext = sext %indexes to + %ptrs = getelementptr bfloat, bfloat* %base, %ext + call void @llvm.masked.scatter.nxv2bf16( %data, %ptrs, i32 0, %masks) + ret void +} + +define void @masked_scatter_nxv2f32_sext( %data, float* %base, %indexes, %masks) nounwind { +; CHECK-LABEL: masked_scatter_nxv2f32_sext: +; CHECK: // %bb.0: +; CHECK-NEXT: st1w { z0.d }, p0, [x0, z1.d, sxtw #2] +; CHECK-NEXT: ret + %ext = sext %indexes to + %ptrs = getelementptr float, float* %base, %ext + call void @llvm.masked.scatter.nxv2f32( %data, %ptrs, i32 0, %masks) + ret void +} + +define void @masked_scatter_nxv2f64_sext( %data, double* %base, %indexes, %masks) nounwind { +; CHECK-LABEL: masked_scatter_nxv2f64_sext: +; CHECK: // %bb.0: +; CHECK-NEXT: st1d { z0.d }, p0, [x0, z1.d, sxtw #3] +; CHECK-NEXT: ret + %ext = sext %indexes to + %ptrs = getelementptr double, double* %base, %ext + call void @llvm.masked.scatter.nxv2f64( %data, %ptrs, i32 0, %masks) + ret void +} + +define void @masked_scatter_nxv2i16_zext( %data, i16* %base, %indexes, %masks) nounwind { +; CHECK-LABEL: masked_scatter_nxv2i16_zext: +; CHECK: // %bb.0: +; CHECK-NEXT: st1h { z0.d }, p0, [x0, z1.d, uxtw #1] +; CHECK-NEXT: ret + %ext = zext %indexes to + %ptrs = getelementptr i16, i16* %base, %ext + call void @llvm.masked.scatter.nxv2i16( %data, %ptrs, i32 0, %masks) + ret void +} + +define void @masked_scatter_nxv2i32_zext( %data, i32* %base, %indexes, %masks) nounwind { +; CHECK-LABEL: 
masked_scatter_nxv2i32_zext: +; CHECK: // %bb.0: +; CHECK-NEXT: st1w { z0.d }, p0, [x0, z1.d, uxtw #2] +; CHECK-NEXT: ret + %ext = zext %indexes to + %ptrs = getelementptr i32, i32* %base, %ext + call void @llvm.masked.scatter.nxv2i32( %data, %ptrs, i32 0, %masks) + ret void +} + +define void @masked_scatter_nxv2i64_zext( %data, i64* %base, %indexes, %masks) nounwind { +; CHECK-LABEL: masked_scatter_nxv2i64_zext: +; CHECK: // %bb.0: +; CHECK-NEXT: st1d { z0.d }, p0, [x0, z1.d, uxtw #3] +; CHECK-NEXT: ret + %ext = zext %indexes to + %ptrs = getelementptr i64, i64* %base, %ext + call void @llvm.masked.scatter.nxv2i64( %data, %ptrs, i32 0, %masks) + ret void +} + +define void @masked_scatter_nxv2f16_zext( %data, half* %base, %indexes, %masks) nounwind { +; CHECK-LABEL: masked_scatter_nxv2f16_zext: +; CHECK: // %bb.0: +; CHECK-NEXT: st1h { z0.d }, p0, [x0, z1.d, uxtw #1] +; CHECK-NEXT: ret + %ext = zext %indexes to + %ptrs = getelementptr half, half* %base, %ext + call void @llvm.masked.scatter.nxv2f16( %data, %ptrs, i32 0, %masks) + ret void +} + +define void @masked_scatter_nxv2bf16_zext( %data, bfloat* %base, %indexes, %masks) nounwind #0 { +; CHECK-LABEL: masked_scatter_nxv2bf16_zext: +; CHECK: // %bb.0: +; CHECK-NEXT: st1h { z0.d }, p0, [x0, z1.d, uxtw #1] +; CHECK-NEXT: ret + %ext = zext %indexes to + %ptrs = getelementptr bfloat, bfloat* %base, %ext + call void @llvm.masked.scatter.nxv2bf16( %data, %ptrs, i32 0, %masks) + ret void +} + +define void @masked_scatter_nxv2f32_zext( %data, float* %base, %indexes, %masks) nounwind { +; CHECK-LABEL: masked_scatter_nxv2f32_zext: +; CHECK: // %bb.0: +; CHECK-NEXT: st1w { z0.d }, p0, [x0, z1.d, uxtw #2] +; CHECK-NEXT: ret + %ext = zext %indexes to + %ptrs = getelementptr float, float* %base, %ext + call void @llvm.masked.scatter.nxv2f32( %data, %ptrs, i32 0, %masks) + ret void +} + +define void @masked_scatter_nxv2f64_zext( %data, double* %base, %indexes, %masks) nounwind { +; CHECK-LABEL: masked_scatter_nxv2f64_zext: +; 
CHECK: // %bb.0: +; CHECK-NEXT: st1d { z0.d }, p0, [x0, z1.d, uxtw #3] +; CHECK-NEXT: ret + %ext = zext %indexes to + %ptrs = getelementptr double, double* %base, %ext + call void @llvm.masked.scatter.nxv2f64( %data, %ptrs, i32 0, %masks) + ret void +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; scaled packed 32-bit offset +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +define void @masked_scatter_nxv4i16_sext( %data, i16* %base, %indexes, %masks) nounwind { +; CHECK-LABEL: masked_scatter_nxv4i16_sext: +; CHECK: // %bb.0: +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: uunpkhi z2.d, z1.s +; CHECK-NEXT: uunpklo z1.d, z1.s +; CHECK-NEXT: uunpklo z3.d, z0.s +; CHECK-NEXT: uunpkhi z0.d, z0.s +; CHECK-NEXT: zip1 p2.s, p0.s, p1.s +; CHECK-NEXT: zip2 p0.s, p0.s, p1.s +; CHECK-NEXT: st1h { z3.d }, p2, [x0, z1.d, sxtw #1] +; CHECK-NEXT: st1h { z0.d }, p0, [x0, z2.d, sxtw #1] +; CHECK-NEXT: ret + %ext = sext %indexes to + %ptrs = getelementptr i16, i16* %base, %ext + call void @llvm.masked.scatter.nxv4i16( %data, %ptrs, i32 0, %masks) + ret void +} + +define void @masked_scatter_nxv4i32_sext( %data, i32* %base, %indexes, %masks) nounwind { +; CHECK-LABEL: masked_scatter_nxv4i32_sext: +; CHECK: // %bb.0: +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: uunpkhi z2.d, z1.s +; CHECK-NEXT: uunpklo z1.d, z1.s +; CHECK-NEXT: uunpklo z3.d, z0.s +; CHECK-NEXT: uunpkhi z0.d, z0.s +; CHECK-NEXT: zip1 p2.s, p0.s, p1.s +; CHECK-NEXT: zip2 p0.s, p0.s, p1.s +; CHECK-NEXT: st1w { z3.d }, p2, [x0, z1.d, sxtw #2] +; CHECK-NEXT: st1w { z0.d }, p0, [x0, z2.d, sxtw #2] +; CHECK-NEXT: ret + %ext = sext %indexes to + %ptrs = getelementptr i32, i32* %base, %ext + call void @llvm.masked.scatter.nxv4i32( %data, %ptrs, i32 0, %masks) + ret void +} + +define void @masked_scatter_nxv4f16_sext( %data, half* %base, %indexes, %masks) nounwind { +; CHECK-LABEL: masked_scatter_nxv4f16_sext: +; CHECK: // %bb.0: +; CHECK-NEXT: pfalse p1.b +; 
CHECK-NEXT: uunpkhi z2.d, z1.s +; CHECK-NEXT: uunpklo z1.d, z1.s +; CHECK-NEXT: uunpklo z3.d, z0.s +; CHECK-NEXT: uunpkhi z0.d, z0.s +; CHECK-NEXT: zip1 p2.s, p0.s, p1.s +; CHECK-NEXT: zip2 p0.s, p0.s, p1.s +; CHECK-NEXT: st1h { z3.d }, p2, [x0, z1.d, sxtw #1] +; CHECK-NEXT: st1h { z0.d }, p0, [x0, z2.d, sxtw #1] +; CHECK-NEXT: ret + %ext = sext %indexes to + %ptrs = getelementptr half, half* %base, %ext + call void @llvm.masked.scatter.nxv4f16( %data, %ptrs, i32 0, %masks) + ret void +} + +define void @masked_scatter_nxv4bf16_sext( %data, bfloat* %base, %indexes, %masks) nounwind #0 { +; CHECK-LABEL: masked_scatter_nxv4bf16_sext: +; CHECK: // %bb.0: +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: uunpkhi z2.d, z1.s +; CHECK-NEXT: uunpklo z1.d, z1.s +; CHECK-NEXT: uunpklo z3.d, z0.s +; CHECK-NEXT: uunpkhi z0.d, z0.s +; CHECK-NEXT: zip1 p2.s, p0.s, p1.s +; CHECK-NEXT: zip2 p0.s, p0.s, p1.s +; CHECK-NEXT: st1h { z3.d }, p2, [x0, z1.d, sxtw #1] +; CHECK-NEXT: st1h { z0.d }, p0, [x0, z2.d, sxtw #1] +; CHECK-NEXT: ret + %ext = sext %indexes to + %ptrs = getelementptr bfloat, bfloat* %base, %ext + call void @llvm.masked.scatter.nxv4bf16( %data, %ptrs, i32 0, %masks) + ret void +} + +define void @masked_scatter_nxv4f32_sext( %data, float* %base, %indexes, %masks) nounwind #0 { +; CHECK-LABEL: masked_scatter_nxv4f32_sext: +; CHECK: // %bb.0: +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: uunpkhi z2.d, z1.s +; CHECK-NEXT: uunpklo z1.d, z1.s +; CHECK-NEXT: uunpklo z3.d, z0.s +; CHECK-NEXT: uunpkhi z0.d, z0.s +; CHECK-NEXT: zip1 p2.s, p0.s, p1.s +; CHECK-NEXT: zip2 p0.s, p0.s, p1.s +; CHECK-NEXT: st1w { z3.d }, p2, [x0, z1.d, sxtw #2] +; CHECK-NEXT: st1w { z0.d }, p0, [x0, z2.d, sxtw #2] +; CHECK-NEXT: ret + %ext = sext %indexes to + %ptrs = getelementptr float, float* %base, %ext + call void @llvm.masked.scatter.nxv4f32( %data, %ptrs, i32 0, %masks) + ret void +} + +define void @masked_scatter_nxv4i16_zext( %data, i16* %base, %indexes, %masks) nounwind { +; CHECK-LABEL: 
masked_scatter_nxv4i16_zext: +; CHECK: // %bb.0: +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: uunpkhi z2.d, z1.s +; CHECK-NEXT: uunpklo z1.d, z1.s +; CHECK-NEXT: uunpklo z3.d, z0.s +; CHECK-NEXT: uunpkhi z0.d, z0.s +; CHECK-NEXT: zip1 p2.s, p0.s, p1.s +; CHECK-NEXT: zip2 p0.s, p0.s, p1.s +; CHECK-NEXT: st1h { z3.d }, p2, [x0, z1.d, uxtw #1] +; CHECK-NEXT: st1h { z0.d }, p0, [x0, z2.d, uxtw #1] +; CHECK-NEXT: ret + %ext = zext %indexes to + %ptrs = getelementptr i16, i16* %base, %ext + call void @llvm.masked.scatter.nxv4i16( %data, %ptrs, i32 0, %masks) + ret void +} + +define void @masked_scatter_nxv4i32_zext( %data, i32* %base, %indexes, %masks) nounwind { +; CHECK-LABEL: masked_scatter_nxv4i32_zext: +; CHECK: // %bb.0: +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: uunpkhi z2.d, z1.s +; CHECK-NEXT: uunpklo z1.d, z1.s +; CHECK-NEXT: uunpklo z3.d, z0.s +; CHECK-NEXT: uunpkhi z0.d, z0.s +; CHECK-NEXT: zip1 p2.s, p0.s, p1.s +; CHECK-NEXT: zip2 p0.s, p0.s, p1.s +; CHECK-NEXT: st1w { z3.d }, p2, [x0, z1.d, uxtw #2] +; CHECK-NEXT: st1w { z0.d }, p0, [x0, z2.d, uxtw #2] +; CHECK-NEXT: ret + %ext = zext %indexes to + %ptrs = getelementptr i32, i32* %base, %ext + call void @llvm.masked.scatter.nxv4i32( %data, %ptrs, i32 0, %masks) + ret void +} + +define void @masked_scatter_nxv4f16_zext( %data, half* %base, %indexes, %masks) nounwind { +; CHECK-LABEL: masked_scatter_nxv4f16_zext: +; CHECK: // %bb.0: +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: uunpkhi z2.d, z1.s +; CHECK-NEXT: uunpklo z1.d, z1.s +; CHECK-NEXT: uunpklo z3.d, z0.s +; CHECK-NEXT: uunpkhi z0.d, z0.s +; CHECK-NEXT: zip1 p2.s, p0.s, p1.s +; CHECK-NEXT: zip2 p0.s, p0.s, p1.s +; CHECK-NEXT: st1h { z3.d }, p2, [x0, z1.d, uxtw #1] +; CHECK-NEXT: st1h { z0.d }, p0, [x0, z2.d, uxtw #1] +; CHECK-NEXT: ret + %ext = zext %indexes to + %ptrs = getelementptr half, half* %base, %ext + call void @llvm.masked.scatter.nxv4f16( %data, %ptrs, i32 0, %masks) + ret void +} + +define void @masked_scatter_nxv4bf16_zext( %data, bfloat* %base, 
%indexes, %masks) nounwind #0 { +; CHECK-LABEL: masked_scatter_nxv4bf16_zext: +; CHECK: // %bb.0: +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: uunpkhi z2.d, z1.s +; CHECK-NEXT: uunpklo z1.d, z1.s +; CHECK-NEXT: uunpklo z3.d, z0.s +; CHECK-NEXT: uunpkhi z0.d, z0.s +; CHECK-NEXT: zip1 p2.s, p0.s, p1.s +; CHECK-NEXT: zip2 p0.s, p0.s, p1.s +; CHECK-NEXT: st1h { z3.d }, p2, [x0, z1.d, uxtw #1] +; CHECK-NEXT: st1h { z0.d }, p0, [x0, z2.d, uxtw #1] +; CHECK-NEXT: ret + %ext = zext %indexes to + %ptrs = getelementptr bfloat, bfloat* %base, %ext + call void @llvm.masked.scatter.nxv4bf16( %data, %ptrs, i32 0, %masks) + ret void +} + +define void @masked_scatter_nxv4f32_zext( %data, float* %base, %indexes, %masks) nounwind #0 { +; CHECK-LABEL: masked_scatter_nxv4f32_zext: +; CHECK: // %bb.0: +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: uunpkhi z2.d, z1.s +; CHECK-NEXT: uunpklo z1.d, z1.s +; CHECK-NEXT: uunpklo z3.d, z0.s +; CHECK-NEXT: uunpkhi z0.d, z0.s +; CHECK-NEXT: zip1 p2.s, p0.s, p1.s +; CHECK-NEXT: zip2 p0.s, p0.s, p1.s +; CHECK-NEXT: st1w { z3.d }, p2, [x0, z1.d, uxtw #2] +; CHECK-NEXT: st1w { z0.d }, p0, [x0, z2.d, uxtw #2] +; CHECK-NEXT: ret + %ext = zext %indexes to + %ptrs = getelementptr float, float* %base, %ext + call void @llvm.masked.scatter.nxv4f32( %data, %ptrs, i32 0, %masks) + ret void +} + +declare void @llvm.masked.scatter.nxv2f16(, , i32, ) +declare void @llvm.masked.scatter.nxv4f16(, , i32, ) +declare void @llvm.masked.scatter.nxv4bf16(, , i32, ) +declare void @llvm.masked.scatter.nxv4f32(, , i32, ) +declare void @llvm.masked.scatter.nxv2bf16(, , i32, ) +declare void @llvm.masked.scatter.nxv2f32(, , i32, ) +declare void @llvm.masked.scatter.nxv2f64(, , i32, ) +declare void @llvm.masked.scatter.nxv2i16(, , i32, ) +declare void @llvm.masked.scatter.nxv2i32(, , i32, ) +declare void @llvm.masked.scatter.nxv2i64(, , i32, ) +declare void @llvm.masked.scatter.nxv2i8(, , i32, ) +declare void @llvm.masked.scatter.nxv4i16(, , i32, ) +declare void 
@llvm.masked.scatter.nxv4i32(, , i32, ) +declare void @llvm.masked.scatter.nxv4i8(, , i32, ) +attributes #0 = { "target-features"="+sve,+bf16" } Index: llvm/test/CodeGen/AArch64/sve-masked-scatter-32b-unscaled.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/sve-masked-scatter-32b-unscaled.ll @@ -0,0 +1,577 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; unscaled unpacked 32-bit offsets +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +define void @masked_scatter_nxv2i8_sext_offsets( %data, i8* %base, %i32offsets, %masks) nounwind { +; CHECK-LABEL: masked_scatter_nxv2i8_sext_offsets: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: sxtw z1.d, p1/m, z1.d +; CHECK-NEXT: mov z2.d, x0 +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z1.d, z2.d, z1.d +; CHECK-NEXT: st1b { z0.d }, p0, [x8, z1.d] +; CHECK-NEXT: ret + %offsets = sext %i32offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv2i8( %data, %ptrs, i32 0, %masks) + ret void +} + +define void @masked_scatter_nxv2i16_sext_offsets( %data, i8* %base, %i32offsets, %masks) nounwind { +; CHECK-LABEL: masked_scatter_nxv2i16_sext_offsets: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: sxtw z1.d, p1/m, z1.d +; CHECK-NEXT: mov z2.d, x0 +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z1.d, z2.d, z1.d +; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d] +; CHECK-NEXT: ret + %offsets = sext %i32offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv2i16( %data, %ptrs, i32 0, %masks) + ret void +} + +define void @masked_scatter_nxv2i32_sext_offsets( %data, i8* %base, 
%i32offsets, %masks) nounwind { +; CHECK-LABEL: masked_scatter_nxv2i32_sext_offsets: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: sxtw z1.d, p1/m, z1.d +; CHECK-NEXT: mov z2.d, x0 +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z1.d, z2.d, z1.d +; CHECK-NEXT: st1w { z0.d }, p0, [x8, z1.d] +; CHECK-NEXT: ret + %offsets = sext %i32offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv2i32( %data, %ptrs, i32 0, %masks) + ret void +} + +define void @masked_scatter_nxv2i64_sext_offsets( %data, i8* %base, %i32offsets, %masks) nounwind { +; CHECK-LABEL: masked_scatter_nxv2i64_sext_offsets: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: sxtw z1.d, p1/m, z1.d +; CHECK-NEXT: mov z2.d, x0 +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z1.d, z2.d, z1.d +; CHECK-NEXT: st1d { z0.d }, p0, [x8, z1.d] +; CHECK-NEXT: ret + %offsets = sext %i32offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv2i64( %data, %ptrs, i32 0, %masks) + ret void +} + +define void @masked_scatter_nxv2f16_sext_offsets( %data, i8* %base, %i32offsets, %masks) nounwind { +; CHECK-LABEL: masked_scatter_nxv2f16_sext_offsets: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: sxtw z1.d, p1/m, z1.d +; CHECK-NEXT: mov z2.d, x0 +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z1.d, z2.d, z1.d +; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d] +; CHECK-NEXT: ret + %offsets = sext %i32offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv2f16( %data, %ptrs, i32 0, %masks) + ret void +} + +define void @masked_scatter_nxv2bf16_sext_offsets( %data, i8* %base, %i32offsets, %masks) nounwind #0 { +; CHECK-LABEL: masked_scatter_nxv2bf16_sext_offsets: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: sxtw z1.d, p1/m, z1.d +; CHECK-NEXT: mov z2.d, x0 +; 
CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z1.d, z2.d, z1.d +; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d] +; CHECK-NEXT: ret + %offsets = sext %i32offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv2bf16( %data, %ptrs, i32 0, %masks) + ret void +} + +define void @masked_scatter_nxv2f32_sext_offsets( %data, i8* %base, %i32offsets, %masks) nounwind { +; CHECK-LABEL: masked_scatter_nxv2f32_sext_offsets: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: sxtw z1.d, p1/m, z1.d +; CHECK-NEXT: mov z2.d, x0 +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z1.d, z2.d, z1.d +; CHECK-NEXT: st1w { z0.d }, p0, [x8, z1.d] +; CHECK-NEXT: ret + %offsets = sext %i32offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv2f32( %data, %ptrs, i32 0, %masks) + ret void +} + +define void @masked_scatter_nxv2f64_sext_offsets( %data, i8* %base, %i32offsets, %masks) nounwind { +; CHECK-LABEL: masked_scatter_nxv2f64_sext_offsets: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: sxtw z1.d, p1/m, z1.d +; CHECK-NEXT: mov z2.d, x0 +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z1.d, z2.d, z1.d +; CHECK-NEXT: st1d { z0.d }, p0, [x8, z1.d] +; CHECK-NEXT: ret + %offsets = sext %i32offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv2f64( %data, %ptrs, i32 0, %masks) + ret void +} + +define void @masked_scatter_nxv2i8_zext_offsets( %data, i8* %base, %i32offsets, %masks) nounwind { +; CHECK-LABEL: masked_scatter_nxv2i8_zext_offsets: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z2.d, x0 +; CHECK-NEXT: and z1.d, z1.d, #0xffffffff +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z1.d, z2.d, z1.d +; CHECK-NEXT: st1b { z0.d }, p0, [x8, z1.d] +; CHECK-NEXT: ret + %offsets = zext %i32offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast 
%byte_ptrs to + call void @llvm.masked.scatter.nxv2i8( %data, %ptrs, i32 0, %masks) + ret void +} + +define void @masked_scatter_nxv2i16_zext_offsets( %data, i8* %base, %i32offsets, %masks) nounwind { +; CHECK-LABEL: masked_scatter_nxv2i16_zext_offsets: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z2.d, x0 +; CHECK-NEXT: and z1.d, z1.d, #0xffffffff +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z1.d, z2.d, z1.d +; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d] +; CHECK-NEXT: ret + %offsets = zext %i32offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv2i16( %data, %ptrs, i32 0, %masks) + ret void +} + +define void @masked_scatter_nxv2i32_zext_offsets( %data, i8* %base, %i32offsets, %masks) nounwind { +; CHECK-LABEL: masked_scatter_nxv2i32_zext_offsets: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z2.d, x0 +; CHECK-NEXT: and z1.d, z1.d, #0xffffffff +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z1.d, z2.d, z1.d +; CHECK-NEXT: st1w { z0.d }, p0, [x8, z1.d] +; CHECK-NEXT: ret + %offsets = zext %i32offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv2i32( %data, %ptrs, i32 0, %masks) + ret void +} + +define void @masked_scatter_nxv2i64_zext_offsets( %data, i8* %base, %i32offsets, %masks) nounwind { +; CHECK-LABEL: masked_scatter_nxv2i64_zext_offsets: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z2.d, x0 +; CHECK-NEXT: and z1.d, z1.d, #0xffffffff +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z1.d, z2.d, z1.d +; CHECK-NEXT: st1d { z0.d }, p0, [x8, z1.d] +; CHECK-NEXT: ret + %offsets = zext %i32offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv2i64( %data, %ptrs, i32 0, %masks) + ret void +} + +define void @masked_scatter_nxv2f16_zext_offsets( %data, i8* %base, %i32offsets, %masks) nounwind { +; CHECK-LABEL: masked_scatter_nxv2f16_zext_offsets: +; CHECK: // 
%bb.0: +; CHECK-NEXT: mov z2.d, x0 +; CHECK-NEXT: and z1.d, z1.d, #0xffffffff +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z1.d, z2.d, z1.d +; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d] +; CHECK-NEXT: ret + %offsets = zext %i32offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv2f16( %data, %ptrs, i32 0, %masks) + ret void +} + +define void @masked_scatter_nxv2bf16_zext_offsets( %data, i8* %base, %i32offsets, %masks) nounwind #0 { +; CHECK-LABEL: masked_scatter_nxv2bf16_zext_offsets: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z2.d, x0 +; CHECK-NEXT: and z1.d, z1.d, #0xffffffff +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z1.d, z2.d, z1.d +; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d] +; CHECK-NEXT: ret + %offsets = zext %i32offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv2bf16( %data, %ptrs, i32 0, %masks) + ret void +} + +define void @masked_scatter_nxv2f32_zext_offsets( %data, i8* %base, %i32offsets, %masks) nounwind { +; CHECK-LABEL: masked_scatter_nxv2f32_zext_offsets: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z2.d, x0 +; CHECK-NEXT: and z1.d, z1.d, #0xffffffff +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z1.d, z2.d, z1.d +; CHECK-NEXT: st1w { z0.d }, p0, [x8, z1.d] +; CHECK-NEXT: ret + %offsets = zext %i32offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv2f32( %data, %ptrs, i32 0, %masks) + ret void +} + +define void @masked_scatter_nxv2f64_zext_offsets( %data, i8* %base, %i32offsets, %masks) nounwind { +; CHECK-LABEL: masked_scatter_nxv2f64_zext_offsets: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z2.d, x0 +; CHECK-NEXT: and z1.d, z1.d, #0xffffffff +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z1.d, z2.d, z1.d +; CHECK-NEXT: st1d { z0.d }, p0, [x8, z1.d] +; CHECK-NEXT: ret + %offsets = zext %i32offsets to + %byte_ptrs = 
getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv2f64( %data, %ptrs, i32 0, %masks) + ret void +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; unscaled packed 32-bit offsets +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +define void @masked_scatter_nxv4i8_sext_offsets( %data, i8* %base, %i32offsets, %masks) nounwind { +; CHECK-LABEL: masked_scatter_nxv4i8_sext_offsets: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z2.d, x0 +; CHECK-NEXT: sunpklo z3.d, z1.s +; CHECK-NEXT: sunpkhi z1.d, z1.s +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z1.d, z2.d, z1.d +; CHECK-NEXT: add z2.d, z2.d, z3.d +; CHECK-NEXT: uunpklo z3.d, z0.s +; CHECK-NEXT: uunpkhi z0.d, z0.s +; CHECK-NEXT: zip1 p2.s, p0.s, p1.s +; CHECK-NEXT: zip2 p0.s, p0.s, p1.s +; CHECK-NEXT: st1b { z3.d }, p2, [x8, z2.d] +; CHECK-NEXT: st1b { z0.d }, p0, [x8, z1.d] +; CHECK-NEXT: ret + %offsets = sext %i32offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv4i8( %data, %ptrs, i32 0, %masks) + ret void +} + +define void @masked_scatter_nxv4i16_sext_offsets( %data, i8* %base, %i32offsets, %masks) nounwind { +; CHECK-LABEL: masked_scatter_nxv4i16_sext_offsets: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z2.d, x0 +; CHECK-NEXT: sunpklo z3.d, z1.s +; CHECK-NEXT: sunpkhi z1.d, z1.s +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z1.d, z2.d, z1.d +; CHECK-NEXT: add z2.d, z2.d, z3.d +; CHECK-NEXT: uunpklo z3.d, z0.s +; CHECK-NEXT: uunpkhi z0.d, z0.s +; CHECK-NEXT: zip1 p2.s, p0.s, p1.s +; CHECK-NEXT: zip2 p0.s, p0.s, p1.s +; CHECK-NEXT: st1h { z3.d }, p2, [x8, z2.d] +; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d] +; CHECK-NEXT: ret + %offsets = sext %i32offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void 
@llvm.masked.scatter.nxv4i16( %data, %ptrs, i32 0, %masks) + ret void +} + +define void @masked_scatter_nxv4i32_sext_offsets( %data, i8* %base, %i32offsets, %masks) nounwind { +; CHECK-LABEL: masked_scatter_nxv4i32_sext_offsets: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z2.d, x0 +; CHECK-NEXT: sunpklo z3.d, z1.s +; CHECK-NEXT: sunpkhi z1.d, z1.s +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z1.d, z2.d, z1.d +; CHECK-NEXT: add z2.d, z2.d, z3.d +; CHECK-NEXT: uunpklo z3.d, z0.s +; CHECK-NEXT: uunpkhi z0.d, z0.s +; CHECK-NEXT: zip1 p2.s, p0.s, p1.s +; CHECK-NEXT: zip2 p0.s, p0.s, p1.s +; CHECK-NEXT: st1w { z3.d }, p2, [x8, z2.d] +; CHECK-NEXT: st1w { z0.d }, p0, [x8, z1.d] +; CHECK-NEXT: ret + %offsets = sext %i32offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv4i32( %data, %ptrs, i32 0, %masks) + ret void +} + +define void @masked_scatter_nxv4f16_sext_offsets( %data, i8* %base, %i32offsets, %masks) nounwind { +; CHECK-LABEL: masked_scatter_nxv4f16_sext_offsets: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z2.d, x0 +; CHECK-NEXT: sunpklo z3.d, z1.s +; CHECK-NEXT: sunpkhi z1.d, z1.s +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z1.d, z2.d, z1.d +; CHECK-NEXT: add z2.d, z2.d, z3.d +; CHECK-NEXT: uunpklo z3.d, z0.s +; CHECK-NEXT: uunpkhi z0.d, z0.s +; CHECK-NEXT: zip1 p2.s, p0.s, p1.s +; CHECK-NEXT: zip2 p0.s, p0.s, p1.s +; CHECK-NEXT: st1h { z3.d }, p2, [x8, z2.d] +; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d] +; CHECK-NEXT: ret + %offsets = sext %i32offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv4f16( %data, %ptrs, i32 0, %masks) + ret void +} + +define void @masked_scatter_nxv4bf16_sext_offsets( %data, i8* %base, %i32offsets, %masks) nounwind #0 { +; CHECK-LABEL: masked_scatter_nxv4bf16_sext_offsets: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z2.d, x0 +; 
CHECK-NEXT: sunpklo z3.d, z1.s +; CHECK-NEXT: sunpkhi z1.d, z1.s +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z1.d, z2.d, z1.d +; CHECK-NEXT: add z2.d, z2.d, z3.d +; CHECK-NEXT: uunpklo z3.d, z0.s +; CHECK-NEXT: uunpkhi z0.d, z0.s +; CHECK-NEXT: zip1 p2.s, p0.s, p1.s +; CHECK-NEXT: zip2 p0.s, p0.s, p1.s +; CHECK-NEXT: st1h { z3.d }, p2, [x8, z2.d] +; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d] +; CHECK-NEXT: ret + %offsets = sext %i32offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv4bf16( %data, %ptrs, i32 0, %masks) + ret void +} + +define void @masked_scatter_nxv4f32_sext_offsets( %data, i8* %base, %i32offsets, %masks) nounwind #0 { +; CHECK-LABEL: masked_scatter_nxv4f32_sext_offsets: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z2.d, x0 +; CHECK-NEXT: sunpklo z3.d, z1.s +; CHECK-NEXT: sunpkhi z1.d, z1.s +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z1.d, z2.d, z1.d +; CHECK-NEXT: add z2.d, z2.d, z3.d +; CHECK-NEXT: uunpklo z3.d, z0.s +; CHECK-NEXT: uunpkhi z0.d, z0.s +; CHECK-NEXT: zip1 p2.s, p0.s, p1.s +; CHECK-NEXT: zip2 p0.s, p0.s, p1.s +; CHECK-NEXT: st1w { z3.d }, p2, [x8, z2.d] +; CHECK-NEXT: st1w { z0.d }, p0, [x8, z1.d] +; CHECK-NEXT: ret + %offsets = sext %i32offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv4f32( %data, %ptrs, i32 0, %masks) + ret void +} + +define void @masked_scatter_nxv4i8_zext_offsets( %data, i8* %base, %i32offsets, %masks) nounwind { +; CHECK-LABEL: masked_scatter_nxv4i8_zext_offsets: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z2.d, x0 +; CHECK-NEXT: uunpklo z3.d, z1.s +; CHECK-NEXT: uunpkhi z1.d, z1.s +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z1.d, z2.d, z1.d +; CHECK-NEXT: add z2.d, z2.d, z3.d +; CHECK-NEXT: uunpklo z3.d, z0.s +; CHECK-NEXT: uunpkhi z0.d, z0.s +; CHECK-NEXT: zip1 p2.s, p0.s, 
p1.s +; CHECK-NEXT: zip2 p0.s, p0.s, p1.s +; CHECK-NEXT: st1b { z3.d }, p2, [x8, z2.d] +; CHECK-NEXT: st1b { z0.d }, p0, [x8, z1.d] +; CHECK-NEXT: ret + %offsets = zext %i32offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv4i8( %data, %ptrs, i32 0, %masks) + ret void +} + +define void @masked_scatter_nxv4i16_zext_offsets( %data, i8* %base, %i32offsets, %masks) nounwind { +; CHECK-LABEL: masked_scatter_nxv4i16_zext_offsets: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z2.d, x0 +; CHECK-NEXT: uunpklo z3.d, z1.s +; CHECK-NEXT: uunpkhi z1.d, z1.s +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z1.d, z2.d, z1.d +; CHECK-NEXT: add z2.d, z2.d, z3.d +; CHECK-NEXT: uunpklo z3.d, z0.s +; CHECK-NEXT: uunpkhi z0.d, z0.s +; CHECK-NEXT: zip1 p2.s, p0.s, p1.s +; CHECK-NEXT: zip2 p0.s, p0.s, p1.s +; CHECK-NEXT: st1h { z3.d }, p2, [x8, z2.d] +; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d] +; CHECK-NEXT: ret + %offsets = zext %i32offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv4i16( %data, %ptrs, i32 0, %masks) + ret void +} + +define void @masked_scatter_nxv4i32_zext_offsets( %data, i8* %base, %i32offsets, %masks) nounwind { +; CHECK-LABEL: masked_scatter_nxv4i32_zext_offsets: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z2.d, x0 +; CHECK-NEXT: uunpklo z3.d, z1.s +; CHECK-NEXT: uunpkhi z1.d, z1.s +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z1.d, z2.d, z1.d +; CHECK-NEXT: add z2.d, z2.d, z3.d +; CHECK-NEXT: uunpklo z3.d, z0.s +; CHECK-NEXT: uunpkhi z0.d, z0.s +; CHECK-NEXT: zip1 p2.s, p0.s, p1.s +; CHECK-NEXT: zip2 p0.s, p0.s, p1.s +; CHECK-NEXT: st1w { z3.d }, p2, [x8, z2.d] +; CHECK-NEXT: st1w { z0.d }, p0, [x8, z1.d] +; CHECK-NEXT: ret + %offsets = zext %i32offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void 
@llvm.masked.scatter.nxv4i32( %data, %ptrs, i32 0, %masks) + ret void +} + +define void @masked_scatter_nxv4f16_zext_offsets( %data, i8* %base, %i32offsets, %masks) nounwind { +; CHECK-LABEL: masked_scatter_nxv4f16_zext_offsets: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z2.d, x0 +; CHECK-NEXT: uunpklo z3.d, z1.s +; CHECK-NEXT: uunpkhi z1.d, z1.s +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z1.d, z2.d, z1.d +; CHECK-NEXT: add z2.d, z2.d, z3.d +; CHECK-NEXT: uunpklo z3.d, z0.s +; CHECK-NEXT: uunpkhi z0.d, z0.s +; CHECK-NEXT: zip1 p2.s, p0.s, p1.s +; CHECK-NEXT: zip2 p0.s, p0.s, p1.s +; CHECK-NEXT: st1h { z3.d }, p2, [x8, z2.d] +; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d] +; CHECK-NEXT: ret + %offsets = zext %i32offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv4f16( %data, %ptrs, i32 0, %masks) + ret void +} + +define void @masked_scatter_nxv4bf16_zext_offsets( %data, i8* %base, %i32offsets, %masks) nounwind #0 { +; CHECK-LABEL: masked_scatter_nxv4bf16_zext_offsets: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z2.d, x0 +; CHECK-NEXT: uunpklo z3.d, z1.s +; CHECK-NEXT: uunpkhi z1.d, z1.s +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z1.d, z2.d, z1.d +; CHECK-NEXT: add z2.d, z2.d, z3.d +; CHECK-NEXT: uunpklo z3.d, z0.s +; CHECK-NEXT: uunpkhi z0.d, z0.s +; CHECK-NEXT: zip1 p2.s, p0.s, p1.s +; CHECK-NEXT: zip2 p0.s, p0.s, p1.s +; CHECK-NEXT: st1h { z3.d }, p2, [x8, z2.d] +; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d] +; CHECK-NEXT: ret + %offsets = zext %i32offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv4bf16( %data, %ptrs, i32 0, %masks) + ret void +} + +define void @masked_scatter_nxv4f32_zext_offsets( %data, i8* %base, %i32offsets, %masks) nounwind #0 { +; CHECK-LABEL: masked_scatter_nxv4f32_zext_offsets: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z2.d, x0 +; 
CHECK-NEXT: uunpklo z3.d, z1.s +; CHECK-NEXT: uunpkhi z1.d, z1.s +; CHECK-NEXT: pfalse p1.b +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z1.d, z2.d, z1.d +; CHECK-NEXT: add z2.d, z2.d, z3.d +; CHECK-NEXT: uunpklo z3.d, z0.s +; CHECK-NEXT: uunpkhi z0.d, z0.s +; CHECK-NEXT: zip1 p2.s, p0.s, p1.s +; CHECK-NEXT: zip2 p0.s, p0.s, p1.s +; CHECK-NEXT: st1w { z3.d }, p2, [x8, z2.d] +; CHECK-NEXT: st1w { z0.d }, p0, [x8, z1.d] +; CHECK-NEXT: ret + %offsets = zext %i32offsets to + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv4f32( %data, %ptrs, i32 0, %masks) + ret void +} + +declare void @llvm.masked.scatter.nxv2f16(, , i32, ) +declare void @llvm.masked.scatter.nxv4f16(, , i32, ) +declare void @llvm.masked.scatter.nxv4bf16(, , i32, ) +declare void @llvm.masked.scatter.nxv4f32(, , i32, ) +declare void @llvm.masked.scatter.nxv2bf16(, , i32, ) +declare void @llvm.masked.scatter.nxv2f32(, , i32, ) +declare void @llvm.masked.scatter.nxv2f64(, , i32, ) +declare void @llvm.masked.scatter.nxv2i16(, , i32, ) +declare void @llvm.masked.scatter.nxv2i32(, , i32, ) +declare void @llvm.masked.scatter.nxv2i64(, , i32, ) +declare void @llvm.masked.scatter.nxv2i8(, , i32, ) +declare void @llvm.masked.scatter.nxv4i16(, , i32, ) +declare void @llvm.masked.scatter.nxv4i32(, , i32, ) +declare void @llvm.masked.scatter.nxv4i8(, , i32, ) +attributes #0 = { "target-features"="+sve,+bf16" } Index: llvm/test/CodeGen/AArch64/sve-masked-scatter-64b-scaled.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/sve-masked-scatter-64b-scaled.ll @@ -0,0 +1,73 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; scaled 64-bit offsets 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +define void @masked_scatter_nxv2i16( %data, i16* %base, %offsets, %mask) { +; CHECK-LABEL: masked_scatter_nxv2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: st1h { z0.d }, p0, [x0, z1.d, lsl #1] +; CHECK-NEXT: ret + %ptrs = getelementptr i16, i16* %base, %offsets + call void @llvm.masked.scatter.nxv2i16( %data, %ptrs, i32 2, %mask) + ret void +} + +define void @masked_scatter_nxv2i32( %data, i32* %base, %offsets, %mask) { +; CHECK-LABEL: masked_scatter_nxv2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: st1w { z0.d }, p0, [x0, z1.d, lsl #2] +; CHECK-NEXT: ret + %ptrs = getelementptr i32, i32* %base, %offsets + call void @llvm.masked.scatter.nxv2i32( %data, %ptrs, i32 4, %mask) + ret void +} + +define void @masked_scatter_nxv2i64( %data, i64* %base, %offsets, %mask) { +; CHECK-LABEL: masked_scatter_nxv2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: st1d { z0.d }, p0, [x0, z1.d, lsl #3] +; CHECK-NEXT: ret + %ptrs = getelementptr i64, i64* %base, %offsets + call void @llvm.masked.scatter.nxv2i64( %data, %ptrs, i32 8, %mask) + ret void +} + +define void @masked_scatter_nxv2f16( %data, half* %base, %offsets, %mask) { +; CHECK-LABEL: masked_scatter_nxv2f16: +; CHECK: // %bb.0: +; CHECK-NEXT: st1h { z0.d }, p0, [x0, z1.d, lsl #1] +; CHECK-NEXT: ret + %ptrs = getelementptr half, half* %base, %offsets + call void @llvm.masked.scatter.nxv2f16( %data, %ptrs, i32 2, %mask) + ret void +} + +define void @masked_scatter_nxv2f32( %data, float* %base, %offsets, %mask) { +; CHECK-LABEL: masked_scatter_nxv2f32: +; CHECK: // %bb.0: +; CHECK-NEXT: st1w { z0.d }, p0, [x0, z1.d, lsl #2] +; CHECK-NEXT: ret + %ptrs = getelementptr float, float* %base, %offsets + call void @llvm.masked.scatter.nxv2f32( %data, %ptrs, i32 4, %mask) + ret void +} + +define void @masked_scatter_nxv2f64( %data, double* %base, %offsets, %mask) { +; CHECK-LABEL: masked_scatter_nxv2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: st1d { z0.d }, p0, [x0, z1.d, 
lsl #3] +; CHECK-NEXT: ret + %ptrs = getelementptr double, double* %base, %offsets + call void @llvm.masked.scatter.nxv2f64( %data, %ptrs, i32 8, %mask) + ret void +} + +declare void @llvm.masked.scatter.nxv2i16(, , i32, ) +declare void @llvm.masked.scatter.nxv2i32(, , i32, ) +declare void @llvm.masked.scatter.nxv2i64(, , i32, ) +declare void @llvm.masked.scatter.nxv2f16(, , i32, ) +declare void @llvm.masked.scatter.nxv2f32(, , i32, ) +declare void @llvm.masked.scatter.nxv2f64(, , i32, ) Index: llvm/test/CodeGen/AArch64/sve-masked-scatter-64b-unscaled.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/sve-masked-scatter-64b-unscaled.ll @@ -0,0 +1,132 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; unscaled 64-bit offsets +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +define void @masked_scatter_nxv2i8_unscaled_64bit_offsets( %data, i8* %base, %offsets, %masks) nounwind { +; CHECK-LABEL: masked_scatter_nxv2i8_unscaled_64bit_offsets: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z2.d, x0 +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z1.d, z2.d, z1.d +; CHECK-NEXT: st1b { z0.d }, p0, [x8, z1.d] +; CHECK-NEXT: ret + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv2i8( %data, %ptrs, i32 0, %masks) + ret void +} + +define void @masked_scatter_nxv2i16_unscaled_64bit_offsets( %data, i8* %base, %offsets, %masks) nounwind { +; CHECK-LABEL: masked_scatter_nxv2i16_unscaled_64bit_offsets: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z2.d, x0 +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z1.d, z2.d, z1.d +; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d] +; CHECK-NEXT: ret + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = 
bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv2i16( %data, %ptrs, i32 0, %masks) + ret void +} + +define void @masked_scatter_nxv2i32_unscaled_64bit_offsets( %data, i8* %base, %offsets, %masks) nounwind { +; CHECK-LABEL: masked_scatter_nxv2i32_unscaled_64bit_offsets: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z2.d, x0 +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z1.d, z2.d, z1.d +; CHECK-NEXT: st1w { z0.d }, p0, [x8, z1.d] +; CHECK-NEXT: ret + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv2i32( %data, %ptrs, i32 0, %masks) + ret void +} + +define void @masked_scatter_nxv2i64_unscaled_64bit_offsets( %data, i8* %base, %offsets, %masks) nounwind { +; CHECK-LABEL: masked_scatter_nxv2i64_unscaled_64bit_offsets: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z2.d, x0 +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z1.d, z2.d, z1.d +; CHECK-NEXT: st1d { z0.d }, p0, [x8, z1.d] +; CHECK-NEXT: ret + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv2i64( %data, %ptrs, i32 0, %masks) + ret void +} + +define void @masked_scatter_nxv2f16_unscaled_64bit_offsets( %data, i8* %base, %offsets, %masks) nounwind { +; CHECK-LABEL: masked_scatter_nxv2f16_unscaled_64bit_offsets: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z2.d, x0 +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z1.d, z2.d, z1.d +; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d] +; CHECK-NEXT: ret + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv2f16( %data, %ptrs, i32 0, %masks) + ret void +} + +define void @masked_scatter_nxv2bf16_unscaled_64bit_offsets( %data, i8* %base, %offsets, %masks) nounwind #0 { +; CHECK-LABEL: masked_scatter_nxv2bf16_unscaled_64bit_offsets: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z2.d, x0 +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z1.d, z2.d, z1.d +; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d] +; 
CHECK-NEXT: ret + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv2bf16( %data, %ptrs, i32 0, %masks) + ret void +} + +define void @masked_scatter_nxv2f32_unscaled_64bit_offsets( %data, i8* %base, %offsets, %masks) nounwind #0 { +; CHECK-LABEL: masked_scatter_nxv2f32_unscaled_64bit_offsets: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z2.d, x0 +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z1.d, z2.d, z1.d +; CHECK-NEXT: st1w { z0.d }, p0, [x8, z1.d] +; CHECK-NEXT: ret + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv2f32( %data, %ptrs, i32 0, %masks) + ret void +} + +define void @masked_scatter_nxv2f64_unscaled_64bit_offsets( %data, i8* %base, %offsets, %masks) nounwind #0 { +; CHECK-LABEL: masked_scatter_nxv2f64_unscaled_64bit_offsets: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z2.d, x0 +; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add z1.d, z2.d, z1.d +; CHECK-NEXT: st1d { z0.d }, p0, [x8, z1.d] +; CHECK-NEXT: ret + %byte_ptrs = getelementptr i8, i8* %base, %offsets + %ptrs = bitcast %byte_ptrs to + call void @llvm.masked.scatter.nxv2f64( %data, %ptrs, i32 0, %masks) + ret void +} + +declare void @llvm.masked.scatter.nxv2f16(, , i32, ) +declare void @llvm.masked.scatter.nxv4f16(, , i32, ) +declare void @llvm.masked.scatter.nxv2bf16(, , i32, ) +declare void @llvm.masked.scatter.nxv2f32(, , i32, ) +declare void @llvm.masked.scatter.nxv2f64(, , i32, ) +declare void @llvm.masked.scatter.nxv2i16(, , i32, ) +declare void @llvm.masked.scatter.nxv2i32(, , i32, ) +declare void @llvm.masked.scatter.nxv2i64(, , i32, ) +declare void @llvm.masked.scatter.nxv2i8(, , i32, ) +declare void @llvm.masked.scatter.nxv4i16(, , i32, ) +declare void @llvm.masked.scatter.nxv4i32(, , i32, ) +declare void @llvm.masked.scatter.nxv4i8(, , i32, ) +attributes #0 = { "target-features"="+sve,+bf16" }