diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -952,7 +952,7 @@ llvm_i32_ty], [IntrNoMem]>; -class AdvSIMD_GatherLoad_64bitOffset_Intrinsic + class AdvSIMD_GatherLoad_64bitOffset_Intrinsic : Intrinsic<[llvm_anyvector_ty], [ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, @@ -961,6 +961,50 @@ ], [IntrReadMem, IntrArgMemOnly]>; + class AdvSIMD_GatherLoad_32bitOffset_Intrinsic + : Intrinsic<[ llvm_anyvector_ty ], + [ + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + LLVMPointerToElt<0>, llvm_anyvector_ty + ], + [IntrReadMem, IntrArgMemOnly]>; + + class AdvSIMD_GatherLoad_VectorBase_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [ + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_anyvector_ty, + llvm_i64_ty + ], + [IntrReadMem, IntrArgMemOnly]>; + + class AdvSIMD_ScatterStore_64bitOffset_Intrinsic + : Intrinsic< + [], + [ + llvm_anyvector_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + LLVMPointerToElt<0>, LLVMScalarOrSameVectorWidth<0, llvm_i64_ty> + ], + [IntrWriteMem, IntrArgMemOnly]>; + + class AdvSIMD_ScatterStore_32bitOffset_Intrinsic + : Intrinsic<[], + [ + llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + LLVMPointerToElt<0>, llvm_anyvector_ty + ], + [IntrWriteMem, IntrArgMemOnly]>; + + class AdvSIMD_ScatterStore_VectorBase_Intrinsic + : Intrinsic<[], + [ + llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_anyvector_ty, llvm_i64_ty + ], + [IntrWriteMem, IntrArgMemOnly]>; + class SVE2_3VectorArg_Long_Intrinsic : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, @@ -980,13 +1024,6 @@ // to reuse currently identical class definitions. class AdvSIMD_SVE_LOGB_Intrinsic : AdvSIMD_SVE_CNT_Intrinsic; -class AdvSIMD_GatherLoad_32bitOffset_Intrinsic - : Intrinsic<[ llvm_anyvector_ty ], - [ - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - LLVMPointerToElt<0>, llvm_anyvector_ty - ], - [ IntrReadMem, IntrArgMemOnly ]>; // This class of intrinsics are not intended to be useful within LLVM IR but // are instead here to support some of the more regid parts of the ACLE. 
@@ -1017,14 +1054,6 @@ [llvm_anyint_ty, LLVMMatchType<1>], [IntrNoMem]>; -class AdvSIMD_GatherLoad_VecTorBase_Intrinsic - : Intrinsic<[llvm_anyvector_ty], - [ - LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, - llvm_anyvector_ty, - llvm_i64_ty - ], - [IntrReadMem, IntrArgMemOnly]>; // // Integer arithmetic @@ -1324,7 +1353,37 @@ def int_aarch64_sve_ld1_gather_uxtw_index : AdvSIMD_GatherLoad_32bitOffset_Intrinsic; // vector base + immediate index -def int_aarch64_sve_ld1_gather_imm : AdvSIMD_GatherLoad_VecTorBase_Intrinsic; +def int_aarch64_sve_ld1_gather_imm : AdvSIMD_GatherLoad_VectorBase_Intrinsic; + +// +// Scatter stores: +// + +// scalar + vector, 64 bit unscaled offsets +def int_aarch64_sve_st1_scatter : AdvSIMD_ScatterStore_64bitOffset_Intrinsic; + +// scalar + vector, 64 bit scaled offsets +def int_aarch64_sve_st1_scatter_index + : AdvSIMD_ScatterStore_64bitOffset_Intrinsic; + +// scalar + vector, 32 bit unscaled offsets, sign (sxtw) or zero (zxtw) +// extended to 64 bits +def int_aarch64_sve_st1_scatter_sxtw + : AdvSIMD_ScatterStore_32bitOffset_Intrinsic; + +def int_aarch64_sve_st1_scatter_uxtw + : AdvSIMD_ScatterStore_32bitOffset_Intrinsic; + +// scalar + vector, 32 bit scaled offsets, sign (sxtw) or zero (zxtw) extended +// to 64 bits +def int_aarch64_sve_st1_scatter_sxtw_index + : AdvSIMD_ScatterStore_32bitOffset_Intrinsic; + +def int_aarch64_sve_st1_scatter_uxtw_index + : AdvSIMD_ScatterStore_32bitOffset_Intrinsic; + +// vector base + immediate index +def int_aarch64_sve_st1_scatter_imm : AdvSIMD_ScatterStore_VectorBase_Intrinsic; // // SVE2 - Non-widening pairwise arithmetic @@ -1354,4 +1413,5 @@ // def int_aarch64_sve_flogb : AdvSIMD_SVE_LOGB_Intrinsic; + } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -224,6 +224,15 @@ GLD1S_SXTW_SCALED, GLD1S_IMM, + // Unsigned scatter store + SST1, + SST1_SCALED, + SST1_UXTW, + SST1_SXTW, + SST1_UXTW_SCALED, + SST1_SXTW_SCALED, + SST1_IMM, + // NEON Load/Store with post-increment base updates LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE, LD3post, diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1358,6 +1358,13 @@ case AArch64ISD::GLD1S_SXTW_SCALED: return "AArch64ISD::GLD1S_SXTW_SCALED"; case AArch64ISD::GLD1S_UXTW_SCALED: return "AArch64ISD::GLD1S_UXTW_SCALED"; case AArch64ISD::GLD1S_IMM: return "AArch64ISD::GLD1S_IMM"; + case AArch64ISD::SST1: return "AArch64ISD::SST1"; + case AArch64ISD::SST1_SCALED: return "AArch64ISD::SST1_SCALED"; + case AArch64ISD::SST1_SXTW: return "AArch64ISD::SST1_SXTW"; + case AArch64ISD::SST1_UXTW: return "AArch64ISD::SST1_UXTW"; + case AArch64ISD::SST1_SXTW_SCALED: return "AArch64ISD::SST1_SXTW_SCALED"; + case AArch64ISD::SST1_UXTW_SCALED: return "AArch64ISD::SST1_UXTW_SCALED"; + case AArch64ISD::SST1_IMM: return "AArch64ISD::SST1_IMM"; } return nullptr; } @@ -11906,6 +11913,62 @@ } } +static SDValue performST1ScatterCombine(SDNode *N, SelectionDAG &DAG, + unsigned Opcode) { + EVT SrcVT = N->getOperand(2)->getValueType(0); + assert(SrcVT.isScalableVector() && + "Scatter stores are only possible for SVE vectors"); + + SDLoc DL(N); + MVT RetElVT = SrcVT.getVectorElementType().getSimpleVT(); + unsigned NumElements = AArch64::SVEBitsPerBlock / RetElVT.getSizeInBits(); + + EVT MaxVT = 
llvm::MVT::getScalableVectorVT(RetElVT, NumElements); + if (SrcVT.getSizeInBits().getKnownMinSize() > + MaxVT.getSizeInBits().getKnownMinSize()) + return SDValue(); + + const SDValue Dst = N->getOperand(2); + // Depending on the addressing mode, this is either a pointer or a vector of + // pointers (that fits into one register) + const SDValue Base = N->getOperand(4); + // Depending on the addressing mode, this is either a single offset or a + // vector of offsets (that fits into one register) + const SDValue Offset = N->getOperand(5); + + if (!DAG.getTargetLoweringInfo().isTypeLegal(Base.getValueType()) || + !DAG.getTargetLoweringInfo().isTypeLegal(Offset.getValueType())) + return SDValue(); + + // Return value type that is representable in hardware + EVT HwRetVt = getSVEContainerType(SrcVT); + + // Keep the original type of the data to store - this is needed to + // differentiate between ST1B, ST1H, ST1W and ST1D. For FP values we want the + // integer equivalent, so just use HwRetVt. + SDValue OutVT = DAG.getValueType(SrcVT); + if (SrcVT.isFloatingPoint()) + OutVT = DAG.getValueType(HwRetVt); + + SDVTList VTs = DAG.getVTList(MVT::Other); + SDValue DstNew; + + if (Dst.getValueType().isFloatingPoint()) + DstNew = DAG.getNode(ISD::BITCAST, DL, HwRetVt, Dst); + else + DstNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwRetVt, Dst); + + SDValue Ops[] = {N->getOperand(0), // Chain + DstNew, + N->getOperand(3), // Pg + Base, Offset, OutVT}; + + SDValue Store = DAG.getNode(Opcode, DL, VTs, Ops); + SDValue StoreChain = SDValue(Store.getNode(), 0); + + return DAG.getMergeValues({Store, StoreChain}, DL); +} + static SDValue performLD1GatherCombine(SDNode *N, SelectionDAG &DAG, unsigned Opcode) { EVT RetVT = N->getValueType(0); @@ -12122,6 +12185,20 @@ return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_UXTW_SCALED); case Intrinsic::aarch64_sve_ld1_gather_imm: return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_IMM); + case Intrinsic::aarch64_sve_st1_scatter: + return performST1ScatterCombine(N, DAG, AArch64ISD::SST1); + case Intrinsic::aarch64_sve_st1_scatter_index: + return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_SCALED); + case Intrinsic::aarch64_sve_st1_scatter_sxtw: + return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_SXTW); + case Intrinsic::aarch64_sve_st1_scatter_uxtw: + return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_UXTW); + case Intrinsic::aarch64_sve_st1_scatter_sxtw_index: + return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_SXTW_SCALED); + case Intrinsic::aarch64_sve_st1_scatter_uxtw_index: + return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_UXTW_SCALED); + case Intrinsic::aarch64_sve_st1_scatter_imm: + return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_IMM); default: break; } diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -20,6 +20,24 @@ SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1> ]>; +def SDT_AArch64_SST1 : SDTypeProfile<0, 5, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>, SDTCisVec<3>, SDTCisVT<4, OtherVT>, + SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1> +]>; + +def SDT_AArch64_SST1_IMM : SDTypeProfile<0, 5, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>, SDTCisVT<4, OtherVT>, + SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1> +]>; + +def AArch64st1_scatter : SDNode<"AArch64ISD::SST1", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>; +def 
AArch64st1_scatter_scaled : SDNode<"AArch64ISD::SST1_SCALED", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>; +def AArch64st1_scatter_uxtw : SDNode<"AArch64ISD::SST1_UXTW", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>; +def AArch64st1_scatter_sxtw : SDNode<"AArch64ISD::SST1_SXTW", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>; +def AArch64st1_scatter_uxtw_scaled : SDNode<"AArch64ISD::SST1_UXTW_SCALED", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>; +def AArch64st1_scatter_sxtw_scaled : SDNode<"AArch64ISD::SST1_SXTW_SCALED", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>; +def AArch64st1_scatter_imm : SDNode<"AArch64ISD::SST1_IMM", SDT_AArch64_SST1_IMM, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>; + def AArch64ld1_gather : SDNode<"AArch64ISD::GLD1", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; def AArch64ld1_gather_scaled : SDNode<"AArch64ISD::GLD1_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; def AArch64ld1_gather_uxtw : SDNode<"AArch64ISD::GLD1_UXTW", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; @@ -584,51 +602,55 @@ defm ST1W_D : sve_mem_cst_ss<0b1011, "st1w", Z_d, ZPR64, GPR64NoXZRshifted32>; defm ST1D : sve_mem_cst_ss<0b1111, "st1d", Z_d, ZPR64, GPR64NoXZRshifted64>; - // Scatters using unscaled 32-bit offsets, e.g. - // st1h z0.s, p0, [x0, z0.s, uxtw] - // and unpacked: + // Scatters using unpacked, unscaled 32-bit offsets, e.g. // st1h z0.d, p0, [x0, z0.d, uxtw] - defm SST1B_D : sve_mem_sst_sv_32_unscaled<0b000, "st1b", Z_d, ZPR64, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only>; - defm SST1B_S : sve_mem_sst_sv_32_unscaled<0b001, "st1b", Z_s, ZPR32, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only>; - defm SST1H_D : sve_mem_sst_sv_32_unscaled<0b010, "st1h", Z_d, ZPR64, ZPR64ExtSXTW8, ZPR64ExtUXTW8>; - defm SST1H_S : sve_mem_sst_sv_32_unscaled<0b011, "st1h", Z_s, ZPR32, ZPR32ExtSXTW8, ZPR32ExtUXTW8>; - defm SST1W_D : sve_mem_sst_sv_32_unscaled<0b100, "st1w", Z_d, ZPR64, ZPR64ExtSXTW8, ZPR64ExtUXTW8>; - defm SST1W : sve_mem_sst_sv_32_unscaled<0b101, "st1w", Z_s, ZPR32, ZPR32ExtSXTW8, ZPR32ExtUXTW8>; - defm SST1D : sve_mem_sst_sv_32_unscaled<0b110, "st1d", Z_d, ZPR64, ZPR64ExtSXTW8, ZPR64ExtUXTW8>; - - // Scatters using scaled 32-bit offsets, e.g. + defm SST1B_D : sve_mem_64b_sst_sv_32_unscaled<0b000, "st1b", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>; + defm SST1H_D : sve_mem_64b_sst_sv_32_unscaled<0b010, "st1h", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>; + defm SST1W_D : sve_mem_64b_sst_sv_32_unscaled<0b100, "st1w", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>; + defm SST1D : sve_mem_64b_sst_sv_32_unscaled<0b110, "st1d", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i64>; + + // Scatters using packed, unscaled 32-bit offsets, e.g. + // st1h z0.s, p0, [x0, z0.s, uxtw] + defm SST1B_S : sve_mem_32b_sst_sv_32_unscaled<0b001, "st1b", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>; + defm SST1H_S : sve_mem_32b_sst_sv_32_unscaled<0b011, "st1h", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>; + defm SST1W : sve_mem_32b_sst_sv_32_unscaled<0b101, "st1w", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i32>; + + // Scatters using packed, scaled 32-bit offsets, e.g. 
// st1h z0.s, p0, [x0, z0.s, uxtw #1] - // and unpacked: + defm SST1H_S : sve_mem_32b_sst_sv_32_scaled<0b011, "st1h", AArch64st1_scatter_sxtw_scaled, AArch64st1_scatter_uxtw_scaled, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>; + defm SST1W : sve_mem_32b_sst_sv_32_scaled<0b101, "st1w", AArch64st1_scatter_sxtw_scaled, AArch64st1_scatter_uxtw_scaled, ZPR32ExtSXTW32, ZPR32ExtUXTW32, nxv4i32>; + + // Scatters using unpacked, scaled 32-bit offsets, e.g. // st1h z0.d, p0, [x0, z0.d, uxtw #1] - defm SST1H_D : sve_mem_sst_sv_32_scaled<0b010, "st1h", Z_d, ZPR64, ZPR64ExtSXTW16, ZPR64ExtUXTW16>; - defm SST1H_S : sve_mem_sst_sv_32_scaled<0b011, "st1h", Z_s, ZPR32, ZPR32ExtSXTW16, ZPR32ExtUXTW16>; - defm SST1W_D : sve_mem_sst_sv_32_scaled<0b100, "st1w", Z_d, ZPR64, ZPR64ExtSXTW32, ZPR64ExtUXTW32>; - defm SST1W : sve_mem_sst_sv_32_scaled<0b101, "st1w", Z_s, ZPR32, ZPR32ExtSXTW32, ZPR32ExtUXTW32>; - defm SST1D : sve_mem_sst_sv_32_scaled<0b110, "st1d", Z_d, ZPR64, ZPR64ExtSXTW64, ZPR64ExtUXTW64>; + defm SST1H_D : sve_mem_64b_sst_sv_32_scaled<0b010, "st1h", AArch64st1_scatter_sxtw_scaled, AArch64st1_scatter_uxtw_scaled, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>; + defm SST1W_D : sve_mem_64b_sst_sv_32_scaled<0b100, "st1w", AArch64st1_scatter_sxtw_scaled, AArch64st1_scatter_uxtw_scaled, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>; + defm SST1D : sve_mem_64b_sst_sv_32_scaled<0b110, "st1d", AArch64st1_scatter_sxtw_scaled, AArch64st1_scatter_uxtw_scaled, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>; // Scatters using 32/64-bit pointers with offset, e.g. // st1h z0.s, p0, [z0.s, #16] + defm SST1B_S : sve_mem_32b_sst_vi_ptrs<0b001, "st1b", imm0_31, AArch64st1_scatter_imm, nxv4i8>; + defm SST1H_S : sve_mem_32b_sst_vi_ptrs<0b011, "st1h", uimm5s2, AArch64st1_scatter_imm, nxv4i16>; + defm SST1W : sve_mem_32b_sst_vi_ptrs<0b101, "st1w", uimm5s4, AArch64st1_scatter_imm, nxv4i32>; + + // Scatters using 32/64-bit pointers with offset, e.g. // st1h z0.d, p0, [z0.d, #16] - defm SST1B_D : sve_mem_sst_vi_ptrs<0b000, "st1b", Z_d, ZPR64, imm0_31>; - defm SST1B_S : sve_mem_sst_vi_ptrs<0b001, "st1b", Z_s, ZPR32, imm0_31>; - defm SST1H_D : sve_mem_sst_vi_ptrs<0b010, "st1h", Z_d, ZPR64, uimm5s2>; - defm SST1H_S : sve_mem_sst_vi_ptrs<0b011, "st1h", Z_s, ZPR32, uimm5s2>; - defm SST1W_D : sve_mem_sst_vi_ptrs<0b100, "st1w", Z_d, ZPR64, uimm5s4>; - defm SST1W : sve_mem_sst_vi_ptrs<0b101, "st1w", Z_s, ZPR32, uimm5s4>; - defm SST1D : sve_mem_sst_vi_ptrs<0b110, "st1d", Z_d, ZPR64, uimm5s8>; + defm SST1B_D : sve_mem_64b_sst_vi_ptrs<0b000, "st1b", imm0_31, AArch64st1_scatter_imm, nxv2i8>; + defm SST1H_D : sve_mem_64b_sst_vi_ptrs<0b010, "st1h", uimm5s2, AArch64st1_scatter_imm, nxv2i16>; + defm SST1W_D : sve_mem_64b_sst_vi_ptrs<0b100, "st1w", uimm5s4, AArch64st1_scatter_imm, nxv2i32>; + defm SST1D : sve_mem_64b_sst_vi_ptrs<0b110, "st1d", uimm5s8, AArch64st1_scatter_imm, nxv2i64>; // Scatters using unscaled 64-bit offsets, e.g. 
// st1h z0.d, p0, [x0, z0.d] - defm SST1B_D : sve_mem_sst_sv_64_unscaled<0b00, "st1b">; - defm SST1H_D : sve_mem_sst_sv_64_unscaled<0b01, "st1h">; - defm SST1W_D : sve_mem_sst_sv_64_unscaled<0b10, "st1w">; - defm SST1D : sve_mem_sst_sv_64_unscaled<0b11, "st1d">; + defm SST1B_D : sve_mem_sst_sv_64_unscaled<0b00, "st1b", AArch64st1_scatter, nxv2i8>; + defm SST1H_D : sve_mem_sst_sv_64_unscaled<0b01, "st1h", AArch64st1_scatter, nxv2i16>; + defm SST1W_D : sve_mem_sst_sv_64_unscaled<0b10, "st1w", AArch64st1_scatter, nxv2i32>; + defm SST1D : sve_mem_sst_sv_64_unscaled<0b11, "st1d", AArch64st1_scatter, nxv2i64>; // Scatters using scaled 64-bit offsets, e.g. // st1h z0.d, p0, [x0, z0.d, lsl #1] - defm SST1H_D_SCALED : sve_mem_sst_sv_64_scaled<0b01, "st1h", ZPR64ExtLSL16>; - defm SST1W_D_SCALED : sve_mem_sst_sv_64_scaled<0b10, "st1w", ZPR64ExtLSL32>; - defm SST1D_SCALED : sve_mem_sst_sv_64_scaled<0b11, "st1d", ZPR64ExtLSL64>; + defm SST1H_D_SCALED : sve_mem_sst_sv_64_scaled<0b01, "st1h", AArch64st1_scatter_scaled, ZPR64ExtLSL16, nxv2i16>; + defm SST1W_D_SCALED : sve_mem_sst_sv_64_scaled<0b10, "st1w", AArch64st1_scatter_scaled, ZPR64ExtLSL32, nxv2i32>; + defm SST1D_SCALED : sve_mem_sst_sv_64_scaled<0b11, "st1d", AArch64st1_scatter_scaled, ZPR64ExtLSL64, nxv2i64>; // ST(2|3|4) structured stores (register + immediate) defm ST2B_IMM : sve_mem_est_si<0b00, 0b01, ZZ_b, "st2b", simm4s2>; diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -4402,32 +4402,84 @@ let mayStore = 1; } -multiclass sve_mem_sst_sv_32_scaled opc, string asm, - RegisterOperand listty, - ZPRRegOp zprty, +multiclass sve_mem_32b_sst_sv_32_scaled opc, string asm, + SDPatternOperator sxtw_op, + SDPatternOperator uxtw_op, RegisterOperand sxtw_opnd, - RegisterOperand uxtw_opnd > { - def _UXTW_SCALED : sve_mem_sst_sv; - def _SXTW_SCALED : sve_mem_sst_sv; + RegisterOperand uxtw_opnd, + ValueType vt > { + def _UXTW_SCALED : sve_mem_sst_sv; + def _SXTW_SCALED : sve_mem_sst_sv; def : InstAlias(NAME # _UXTW_SCALED) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>; + (!cast(NAME # _UXTW_SCALED) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>; def : InstAlias(NAME # _SXTW_SCALED) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>; + (!cast(NAME # _SXTW_SCALED) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>; + + def : Pat<(uxtw_op (nxv4i32 ZPR:$data), (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$offsets), vt), + (!cast(NAME # _UXTW_SCALED) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; + def : Pat<(sxtw_op (nxv4i32 ZPR:$data), (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$offsets), vt), + (!cast(NAME # _SXTW_SCALED) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; } -multiclass sve_mem_sst_sv_32_unscaled opc, string asm, - RegisterOperand listty, - ZPRRegOp zprty, - RegisterOperand sxtw_opnd, - RegisterOperand uxtw_opnd> { - def _UXTW : sve_mem_sst_sv; - def _SXTW : sve_mem_sst_sv; +multiclass sve_mem_64b_sst_sv_32_scaled opc, string asm, + SDPatternOperator sxtw_op, + SDPatternOperator uxtw_op, + RegisterOperand sxtw_opnd, + RegisterOperand uxtw_opnd, + ValueType vt > { + def _UXTW_SCALED : sve_mem_sst_sv; + def _SXTW_SCALED : sve_mem_sst_sv; + + def : InstAlias(NAME # _UXTW_SCALED) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>; + def : InstAlias(NAME # _SXTW_SCALED) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>; + + def : Pat<(uxtw_op 
(nxv2i64 ZPR:$data), (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt), + (!cast(NAME # _UXTW_SCALED) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; + def : Pat<(sxtw_op (nxv2i64 ZPR:$data), (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt), + (!cast(NAME # _SXTW_SCALED) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; +} + +multiclass sve_mem_64b_sst_sv_32_unscaled opc, string asm, + SDPatternOperator sxtw_op, + SDPatternOperator uxtw_op, + RegisterOperand sxtw_opnd, + RegisterOperand uxtw_opnd, + ValueType vt> { + def _UXTW : sve_mem_sst_sv; + def _SXTW : sve_mem_sst_sv; def : InstAlias(NAME # _UXTW) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>; + (!cast(NAME # _UXTW) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>; def : InstAlias(NAME # _SXTW) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>; + (!cast(NAME # _SXTW) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>; + + def : Pat<(uxtw_op (nxv2i64 ZPR:$data), (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt), + (!cast(NAME # _UXTW) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; + def : Pat<(sxtw_op (nxv2i64 ZPR:$data), (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt), + (!cast(NAME # _SXTW) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; +} + +multiclass sve_mem_32b_sst_sv_32_unscaled opc, string asm, + SDPatternOperator sxtw_op, + SDPatternOperator uxtw_op, + RegisterOperand sxtw_opnd, + RegisterOperand uxtw_opnd, + ValueType vt> { + def _UXTW : sve_mem_sst_sv; + def _SXTW : sve_mem_sst_sv; + + def : InstAlias(NAME # _UXTW) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>; + def : InstAlias(NAME # _SXTW) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>; + + def : Pat<(uxtw_op (nxv4i32 ZPR:$data), (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$offsets), vt), + (!cast(NAME # _UXTW) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; + def : Pat<(sxtw_op (nxv4i32 ZPR:$data), (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$offsets), vt), + (!cast(NAME # _SXTW) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; } class sve_mem_sst_sv2 msz, bit scaled, string asm, @@ -4454,19 +4506,28 @@ } multiclass sve_mem_sst_sv_64_scaled msz, string asm, - RegisterOperand zprext> { - def "" : sve_mem_sst_sv2; + SDPatternOperator op, + RegisterOperand zprext, + ValueType vt> { + def _SCALED_REAL : sve_mem_sst_sv2; def : InstAlias(NAME) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm), 0>; + (!cast(NAME # _SCALED_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm), 0>; + def : Pat<(op (nxv2i64 ZPR:$data), (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$indices), vt), + (!cast(NAME # _SCALED_REAL) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$indices)>; } -multiclass sve_mem_sst_sv_64_unscaled msz, string asm> { - def "" : sve_mem_sst_sv2; +multiclass sve_mem_sst_sv_64_unscaled msz, string asm, + SDPatternOperator op, + ValueType vt> { + def _REAL : sve_mem_sst_sv2; def : InstAlias(NAME) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm), 0>; + (!cast(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm), 0>; + + def : Pat<(op (nxv2i64 ZPR:$data), (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt), + (!cast(NAME # _REAL) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; } class sve_mem_sst_vi opc, string asm, ZPRRegOp zprty, @@ -4492,16 +4553,38 @@ let mayStore = 1; } -multiclass sve_mem_sst_vi_ptrs opc, string asm, RegisterOperand listty, - ZPRRegOp zprty, Operand imm_ty> { - def _IMM : sve_mem_sst_vi; +multiclass 
sve_mem_32b_sst_vi_ptrs opc, string asm, + Operand imm_ty, + SDPatternOperator op, + ValueType vt> { + def _IMM : sve_mem_sst_vi; def : InstAlias(NAME # _IMM) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, 0), 0>; + (!cast(NAME # _IMM) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, 0), 0>; def : InstAlias(NAME # _IMM) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, imm_ty:$imm5), 0>; + (!cast(NAME # _IMM) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, imm_ty:$imm5), 0>; def : InstAlias(NAME # _IMM) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, 0), 1>; + (!cast(NAME # _IMM) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, 0), 1>; + + def : Pat<(op (nxv4i32 ZPR:$data), (nxv4i1 PPR:$gp), (nxv4i32 ZPR:$ptrs), imm_ty:$index, vt), + (!cast(NAME # _IMM) ZPR:$data, PPR:$gp, ZPR:$ptrs, imm_ty:$index)>; +} + +multiclass sve_mem_64b_sst_vi_ptrs opc, string asm, + Operand imm_ty, + SDPatternOperator op, + ValueType vt> { + def _IMM : sve_mem_sst_vi; + + def : InstAlias(NAME # _IMM) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, 0), 0>; + def : InstAlias(NAME # _IMM) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, imm_ty:$imm5), 0>; + def : InstAlias(NAME # _IMM) Z_s:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, 0), 1>; + + def : Pat<(op (nxv2i64 ZPR:$data), (nxv2i1 PPR:$gp), (nxv2i64 ZPR:$ptrs), imm_ty:$index, vt), + (!cast(NAME # _IMM) ZPR:$data, PPR:$gp, ZPR:$ptrs, imm_ty:$index)>; } class sve_mem_z_spill diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-scatter-stores-32bit-scaled-offsets.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-scatter-stores-32bit-scaled-offsets.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-scatter-stores-32bit-scaled-offsets.ll @@ -0,0 +1,193 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s + +; +; ST1H, ST1W, ST1D: base + 32-bit scaled offset, sign (sxtw) or zero +; (uxtw) extended to 64 bits. +; e.g. 
st1h { z0.d }, p0, [x0, z1.d, uxtw #1] +; + +; ST1H +define void @sst1h_s_uxtw( %data, %pg, i16* %base, %indices) { +; CHECK-LABEL: sst1h_s_uxtw: +; CHECK: st1h { z0.s }, p0, [x0, z1.s, uxtw #1] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.st1.scatter.uxtw.index.nxv4i16.nxv4i32( %data_trunc, + %pg, + i16* %base, + %indices) + ret void +} + +define void @sst1h_s_sxtw( %data, %pg, i16* %base, %indices) { +; CHECK-LABEL: sst1h_s_sxtw: +; CHECK: st1h { z0.s }, p0, [x0, z1.s, sxtw #1] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv4i16.nxv4i32( %data_trunc, + %pg, + i16* %base, + %indices) + ret void +} + +define void @sst1h_d_uxtw( %data, %pg, i16* %base, %indices) { +; CHECK-LABEL: sst1h_d_uxtw: +; CHECK: st1h { z0.d }, p0, [x0, z1.d, uxtw #1] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.st1.scatter.uxtw.index.nxv2i16.nxv2i64( %data_trunc, + %pg, + i16* %base, + %indices) + ret void +} + +define void @sst1h_d_sxtw( %data, %pg, i16* %base, %indices) { +; CHECK-LABEL: sst1h_d_sxtw: +; CHECK: st1h { z0.d }, p0, [x0, z1.d, sxtw #1] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv2i16.nxv2i64( %data_trunc, + %pg, + i16* %base, + %indices) + ret void +} + +; ST1W +define void @sst1w_s_uxtw( %data, %pg, i32* %base, %indices) { +; CHECK-LABEL: sst1w_s_uxtw: +; CHECK: st1w { z0.s }, p0, [x0, z1.s, uxtw #2] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.scatter.uxtw.index.nxv4i32.nxv4i32( %data, + %pg, + i32* %base, + %indices) + ret void +} + +define void @sst1w_s_sxtw( %data, %pg, i32* %base, %indices) { +; CHECK-LABEL: sst1w_s_sxtw: +; CHECK: st1w { z0.s }, p0, [x0, z1.s, sxtw #2] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv4i32.nxv4i32( %data, + %pg, + i32* %base, + %indices) + ret void +} + +define void @sst1w_d_uxtw( %data, %pg, i32* %base, %indices) { +; CHECK-LABEL: sst1w_d_uxtw: +; CHECK: st1w { z0.d }, p0, [x0, z1.d, uxtw #2] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.st1.scatter.uxtw.index.nxv2i32.nxv2i64( %data_trunc, + %pg, + i32* %base, + %indices) + ret void +} + +define void @sst1w_d_sxtw( %data, %pg, i32* %base, %indices) { +; CHECK-LABEL: sst1w_d_sxtw: +; CHECK: st1w { z0.d }, p0, [x0, z1.d, sxtw #2] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv2i32.nxv2i64( %data_trunc, + %pg, + i32* %base, + %indices) + ret void +} + +define void @sst1w_s_uxtw_float( %data, %pg, float* %base, %indices) { +; CHECK-LABEL: sst1w_s_uxtw_float: +; CHECK: st1w { z0.s }, p0, [x0, z1.s, uxtw #2] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.scatter.uxtw.index.nxv4f32.nxv4i32( %data, + %pg, + float* %base, + %indices) + ret void +} + +define void @sst1w_s_sxtw_float( %data, %pg, float* %base, %indices) { +; CHECK-LABEL: sst1w_s_sxtw_float: +; CHECK: st1w { z0.s }, p0, [x0, z1.s, sxtw #2] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv4f32.nxv4i32( %data, + %pg, + float* %base, + %indices) + ret void +} + +; ST1D +define void @sst1d_d_uxtw( %data, %pg, i64* %base, %indices) { +; CHECK-LABEL: sst1d_d_uxtw: +; CHECK: st1d { z0.d }, p0, [x0, z1.d, uxtw #3] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.scatter.uxtw.index.nxv2i64.nxv2i64( %data, + %pg, + i64* %base, + %indices) + ret void +} + +define void @sst1d_d_sxtw( %data, %pg, i64* %base, %indices) { 
+; CHECK-LABEL: sst1d_d_sxtw: +; CHECK: st1d { z0.d }, p0, [x0, z1.d, sxtw #3] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv2i64.nxv2i64( %data, + %pg, + i64* %base, + %indices) + ret void +} + +define void @sst1d_d_uxtw_double( %data, %pg, double* %base, %indices) { +; CHECK-LABEL: sst1d_d_uxtw_double: +; CHECK: st1d { z0.d }, p0, [x0, z1.d, uxtw #3] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.scatter.uxtw.index.nxv2f64.nxv2i64( %data, + %pg, + double* %base, + %indices) + ret void +} + +define void @sst1d_d_sxtw_double( %data, %pg, double* %base, %indices) { +; CHECK-LABEL: sst1d_d_sxtw_double: +; CHECK: st1d { z0.d }, p0, [x0, z1.d, sxtw #3] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv2f64.nxv2i64( %data, + %pg, + double* %base, + %indices) + ret void +} + + +; ST1H +declare void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv4i16.nxv4i32(, , i16*, ) +declare void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv2i16.nxv2i64(, , i16*, ) +declare void @llvm.aarch64.sve.st1.scatter.uxtw.index.nxv4i16.nxv4i32(, , i16*, ) +declare void @llvm.aarch64.sve.st1.scatter.uxtw.index.nxv2i16.nxv2i64(, , i16*, ) + +; ST1W +declare void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv4i32.nxv4i32(, , i32*, ) +declare void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv2i32.nxv2i64(, , i32*, ) +declare void @llvm.aarch64.sve.st1.scatter.uxtw.index.nxv4i32.nxv4i32(, , i32*, ) +declare void @llvm.aarch64.sve.st1.scatter.uxtw.index.nxv2i32.nxv2i64(, , i32*, ) + +declare void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv4f32.nxv4i32(, , float*, ) +declare void @llvm.aarch64.sve.st1.scatter.uxtw.index.nxv4f32.nxv4i32(, , float*, ) + +; ST1D +declare void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv2i64.nxv2i64(, , i64*, ) +declare void @llvm.aarch64.sve.st1.scatter.uxtw.index.nxv2i64.nxv2i64(, , i64*, ) + +declare void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv2f64.nxv2i64(, , double*, ) +declare void @llvm.aarch64.sve.st1.scatter.uxtw.index.nxv2f64.nxv2i64(, , double*, ) diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-scatter-stores-32bit-unscaled-offsets.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-scatter-stores-32bit-unscaled-offsets.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-scatter-stores-32bit-unscaled-offsets.ll @@ -0,0 +1,248 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s + +; +; ST1B, ST1W, ST1H, ST1D: base + 32-bit unscaled offset, sign (sxtw) or zero +; (uxtw) extended to 64 bits. +; e.g. 
st1h { z0.d }, p0, [x0, z1.d, uxtw] +; + +; ST1B +define void @sst1b_s_uxtw( %data, %pg, i8* %base, %offsets) { +; CHECK-LABEL: sst1b_s_uxtw: +; CHECK: st1b { z0.s }, p0, [x0, z1.s, uxtw] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.st1.scatter.uxtw.nxv4i8.nxv4i32( %data_trunc, + %pg, + i8* %base, + %offsets) + ret void +} + +define void @sst1b_s_sxtw( %data, %pg, i8* %base, %offsets) { +; CHECK-LABEL: sst1b_s_sxtw: +; CHECK: st1b { z0.s }, p0, [x0, z1.s, sxtw] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.st1.scatter.sxtw.nxv4i8.nxv4i32( %data_trunc, + %pg, + i8* %base, + %offsets) + ret void +} + +define void @sst1b_d_uxtw( %data, %pg, i8* %base, %offsets) { +; CHECK-LABEL: sst1b_d_uxtw: +; CHECK: st1b { z0.d }, p0, [x0, z1.d, uxtw] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.st1.scatter.uxtw.nxv2i8.nxv2i64( %data_trunc, + %pg, + i8* %base, + %offsets) + ret void +} + +define void @sst1b_d_sxtw( %data, %pg, i8* %base, %offsets) { +; CHECK-LABEL: sst1b_d_sxtw: +; CHECK: st1b { z0.d }, p0, [x0, z1.d, sxtw] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.st1.scatter.sxtw.nxv2i8.nxv2i64( %data_trunc, + %pg, + i8* %base, + %offsets) + ret void +} + +; ST1H +define void @sst1h_s_uxtw( %data, %pg, i16* %base, %offsets) { +; CHECK-LABEL: sst1h_s_uxtw: +; CHECK: st1h { z0.s }, p0, [x0, z1.s, uxtw] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.st1.scatter.uxtw.nxv4i16.nxv4i32( %data_trunc, + %pg, + i16* %base, + %offsets) + ret void +} + +define void @sst1h_s_sxtw( %data, %pg, i16* %base, %offsets) { +; CHECK-LABEL: sst1h_s_sxtw: +; CHECK: st1h { z0.s }, p0, [x0, z1.s, sxtw] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.st1.scatter.sxtw.nxv4i16.nxv4i32( %data_trunc, + %pg, + i16* %base, + %offsets) + ret void +} + +define void @sst1h_d_uxtw( %data, %pg, i16* %base, %offsets) { +; CHECK-LABEL: sst1h_d_uxtw: +; CHECK: st1h { z0.d }, p0, [x0, z1.d, uxtw] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.st1.scatter.uxtw.nxv2i16.nxv2i64( %data_trunc, + %pg, + i16* %base, + %offsets) + ret void +} + +define void @sst1h_d_sxtw( %data, %pg, i16* %base, %offsets) { +; CHECK-LABEL: sst1h_d_sxtw: +; CHECK: st1h { z0.d }, p0, [x0, z1.d, sxtw] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.st1.scatter.sxtw.nxv2i16.nxv2i64( %data_trunc, + %pg, + i16* %base, + %offsets) + ret void +} + +; ST1W +define void @sst1w_s_uxtw( %data, %pg, i32* %base, %offsets) { +; CHECK-LABEL: sst1w_s_uxtw: +; CHECK: st1w { z0.s }, p0, [x0, z1.s, uxtw] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.scatter.uxtw.nxv4i32.nxv4i32( %data, + %pg, + i32* %base, + %offsets) + ret void +} + +define void @sst1w_s_sxtw( %data, %pg, i32* %base, %offsets) { +; CHECK-LABEL: sst1w_s_sxtw: +; CHECK: st1w { z0.s }, p0, [x0, z1.s, sxtw] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.scatter.sxtw.nxv4i32.nxv4i32( %data, + %pg, + i32* %base, + %offsets) + ret void +} + +define void @sst1w_d_uxtw( %data, %pg, i32* %base, %offsets) { +; CHECK-LABEL: sst1w_d_uxtw: +; CHECK: st1w { z0.d }, p0, [x0, z1.d, uxtw] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.st1.scatter.uxtw.nxv2i32.nxv2i64( %data_trunc, + %pg, + i32* %base, + %offsets) + ret void +} + +define void @sst1w_d_sxtw( %data, %pg, i32* %base, %offsets) { +; CHECK-LABEL: sst1w_d_sxtw: +; 
CHECK: st1w { z0.d }, p0, [x0, z1.d, sxtw] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.st1.scatter.sxtw.nxv2i32.nxv2i64( %data_trunc, + %pg, + i32* %base, + %offsets) + ret void +} + +define void @sst1w_s_uxtw_float( %data, %pg, float* %base, %offsets) { +; CHECK-LABEL: sst1w_s_uxtw_float: +; CHECK: st1w { z0.s }, p0, [x0, z1.s, uxtw] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.scatter.uxtw.nxv4f32.nxv4i32( %data, + %pg, + float* %base, + %offsets) + ret void +} + +define void @sst1w_s_sxtw_float( %data, %pg, float* %base, %offsets) { +; CHECK-LABEL: sst1w_s_sxtw_float: +; CHECK: st1w { z0.s }, p0, [x0, z1.s, sxtw] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.scatter.sxtw.nxv4f32.nxv4i32( %data, + %pg, + float* %base, + %offsets) + ret void +} + +; ST1D +define void @sst1d_d_uxtw( %data, %pg, i64* %base, %offsets) { +; CHECK-LABEL: sst1d_d_uxtw: +; CHECK: st1d { z0.d }, p0, [x0, z1.d, uxtw] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.scatter.uxtw.nxv2i64.nxv2i64( %data, + %pg, + i64* %base, + %offsets) + ret void +} + +define void @sst1d_d_sxtw( %data, %pg, i64* %base, %offsets) { +; CHECK-LABEL: sst1d_d_sxtw: +; CHECK: st1d { z0.d }, p0, [x0, z1.d, sxtw] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.scatter.sxtw.nxv2i64.nxv2i64( %data, + %pg, + i64* %base, + %offsets) + ret void +} + +define void @sst1d_d_uxtw_double( %data, %pg, double* %base, %offsets) { +; CHECK-LABEL: sst1d_d_uxtw_double: +; CHECK: st1d { z0.d }, p0, [x0, z1.d, uxtw] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.scatter.uxtw.nxv2f64.nxv2i64( %data, + %pg, + double* %base, + %offsets) + ret void +} + +define void @sst1d_d_sxtw_double( %data, %pg, double* %base, %offsets) { +; CHECK-LABEL: sst1d_d_sxtw_double: +; CHECK: st1d { z0.d }, p0, [x0, z1.d, sxtw] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.scatter.sxtw.nxv2f64.nxv2i64( %data, + %pg, + double* %base, + %offsets) + ret void +} + + +; ST1B +declare void @llvm.aarch64.sve.st1.scatter.uxtw.nxv4i8.nxv4i32(, , i8*, ) +declare void @llvm.aarch64.sve.st1.scatter.uxtw.nxv2i8.nxv2i64(, , i8*, ) +declare void @llvm.aarch64.sve.st1.scatter.sxtw.nxv4i8.nxv4i32(, , i8*, ) +declare void @llvm.aarch64.sve.st1.scatter.sxtw.nxv2i8.nxv2i64(, , i8*, ) + +; ST1H +declare void @llvm.aarch64.sve.st1.scatter.sxtw.nxv4i16.nxv4i32(, , i16*, ) +declare void @llvm.aarch64.sve.st1.scatter.sxtw.nxv2i16.nxv2i64(, , i16*, ) +declare void @llvm.aarch64.sve.st1.scatter.uxtw.nxv4i16.nxv4i32(, , i16*, ) +declare void @llvm.aarch64.sve.st1.scatter.uxtw.nxv2i16.nxv2i64(, , i16*, ) + +; ST1W +declare void @llvm.aarch64.sve.st1.scatter.sxtw.nxv4i32.nxv4i32(, , i32*, ) +declare void @llvm.aarch64.sve.st1.scatter.sxtw.nxv2i32.nxv2i64(, , i32*, ) +declare void @llvm.aarch64.sve.st1.scatter.uxtw.nxv4i32.nxv4i32(, , i32*, ) +declare void @llvm.aarch64.sve.st1.scatter.uxtw.nxv2i32.nxv2i64(, , i32*, ) + +declare void @llvm.aarch64.sve.st1.scatter.sxtw.nxv4f32.nxv4i32(, , float*, ) +declare void @llvm.aarch64.sve.st1.scatter.uxtw.nxv4f32.nxv4i32(, , float*, ) + +; ST1D +declare void @llvm.aarch64.sve.st1.scatter.sxtw.nxv2i64.nxv2i64(, , i64*, ) +declare void @llvm.aarch64.sve.st1.scatter.uxtw.nxv2i64.nxv2i64(, , i64*, ) + +declare void @llvm.aarch64.sve.st1.scatter.sxtw.nxv2f64.nxv2i64(, , double*, ) +declare void @llvm.aarch64.sve.st1.scatter.uxtw.nxv2f64.nxv2i64(, , double*, ) diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-scatter-stores-64bit-scaled-offset.ll 
b/llvm/test/CodeGen/AArch64/sve-intrinsics-scatter-stores-64bit-scaled-offset.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-scatter-stores-64bit-scaled-offset.ll @@ -0,0 +1,58 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s + +; +; ST1H, ST1W, ST1D: base + 64-bit scaled offset +; e.g. st1h { z0.d }, p0, [x0, z0.d, lsl #1] +; + +define void @sst1h_index( %data, %pg, i16* %base, %offsets) { +; CHECK-LABEL: sst1h_index +; CHECK: st1h { z0.d }, p0, [x0, z1.d, lsl #1] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.st1.scatter.index.nxv2i16( %data_trunc, + %pg, + i16* %base, + %offsets) + ret void +} + +define void @sst1w_index( %data, %pg, i32* %base, %offsets) { +; CHECK-LABEL: sst1w_index +; CHECK: st1w { z0.d }, p0, [x0, z1.d, lsl #2] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.st1.scatter.index.nxv2i32( %data_trunc, + %pg, + i32* %base, + %offsets) + ret void +} + +define void @sst1d_index( %data, %pg, i64* %base, %offsets) { +; CHECK-LABEL: sst1d_index +; CHECK: st1d { z0.d }, p0, [x0, z1.d, lsl #3] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.scatter.index.nxv2i64( %data, + %pg, + i64* %base, + %offsets) + ret void +} + +define void @sst1d_index_double( %data, %pg, double* %base, %offsets) { +; CHECK-LABEL: sst1d_index_double +; CHECK: st1d { z0.d }, p0, [x0, z1.d, lsl #3] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.scatter.index.nxv2f64( %data, + %pg, + double* %base, + %offsets) + ret void +} + + +declare void @llvm.aarch64.sve.st1.scatter.index.nxv2i16(, , i16*, ) +declare void @llvm.aarch64.sve.st1.scatter.index.nxv2i32(, , i32*, ) +declare void @llvm.aarch64.sve.st1.scatter.index.nxv2i64(, , i64*, ) +declare void @llvm.aarch64.sve.st1.scatter.index.nxv2f64(, , double*, ) diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-scatter-stores-64bit-unscaled-offset.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-scatter-stores-64bit-unscaled-offset.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-scatter-stores-64bit-unscaled-offset.ll @@ -0,0 +1,70 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s + +; +; ST1B, ST1W, ST1H, ST1D: base + 64-bit unscaled offset +; e.g. 
st1h { z0.d }, p0, [x0, z1.d] +; + +define void @sst1b_d( %data, %pg, i8* %base, %b) { +; CHECK-LABEL: sst1b_d: +; CHECK: st1b { z0.d }, p0, [x0, z1.d] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.st1.scatter.nxv2i8( %data_trunc, + %pg, + i8* %base, + %b) + ret void +} + +define void @sst1h_d( %data, %pg, i16* %base, %b) { +; CHECK-LABEL: sst1h_d: +; CHECK: st1h { z0.d }, p0, [x0, z1.d] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.st1.scatter.nxv2i16( %data_trunc, + %pg, + i16* %base, + %b) + ret void +} + +define void @sst1w_d( %data, %pg, i32* %base, %b) { +; CHECK-LABEL: sst1w_d: +; CHECK: st1w { z0.d }, p0, [x0, z1.d] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.st1.scatter.nxv2i32( %data_trunc, + %pg, + i32* %base, + %b) + ret void +} + +define void @sst1d_d( %data, %pg, i64* %base, %b) { +; CHECK-LABEL: sst1d_d: +; CHECK: st1d { z0.d }, p0, [x0, z1.d] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.scatter.nxv2i64( %data, + %pg, + i64* %base, + %b) + ret void +} + +define void @sst1d_d_double( %data, %pg, double* %base, %b) { +; CHECK-LABEL: sst1d_d_double: +; CHECK: st1d { z0.d }, p0, [x0, z1.d] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.scatter.nxv2f64( %data, + %pg, + double* %base, + %b) + ret void +} + +declare void @llvm.aarch64.sve.st1.scatter.nxv2i8(, , i8*, ) +declare void @llvm.aarch64.sve.st1.scatter.nxv2i16(, , i16*, ) +declare void @llvm.aarch64.sve.st1.scatter.nxv2i32(, , i32*, ) +declare void @llvm.aarch64.sve.st1.scatter.nxv2i64(, , i64*, ) +declare void @llvm.aarch64.sve.st1.scatter.nxv2f64(, , double*, ) diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-scatter-stores-vector-base.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-scatter-stores-vector-base.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-scatter-stores-vector-base.ll @@ -0,0 +1,133 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s + +; +; ST1B, ST1W, ST1H, ST1D: vector + immediate (index) +; e.g. 
st1h { z0.s }, p0, [z1.s, #16] +; + +; ST1B +define void @sst1b_s_imm( %data, %pg, %base) { +; CHECK-LABEL: sst1b_s_imm: +; CHECK: st1b { z0.s }, p0, [z1.s, #16] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.st1.scatter.imm.nxv4i8.nxv4i32( %data_trunc, + %pg, + %base, + i64 16) + ret void +} + +define void @sst1b_d_imm( %data, %pg, %base) { +; CHECK-LABEL: sst1b_d_imm: +; CHECK: st1b { z0.d }, p0, [z1.d, #16] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.st1.scatter.imm.nxv2i8.nxv2i64( %data_trunc, + %pg, + %base, + i64 16) + ret void +} + +; ST1H +define void @sst1h_s_imm( %data, %pg, %base) { +; CHECK-LABEL: sst1h_s_imm: +; CHECK: st1h { z0.s }, p0, [z1.s, #16] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.st1.scatter.imm.nxv4i16.nxv4i32( %data_trunc, + %pg, + %base, + i64 16) + ret void +} + +define void @sst1h_d_imm( %data, %pg, %base) { +; CHECK-LABEL: sst1h_d_imm: +; CHECK: st1h { z0.d }, p0, [z1.d, #16] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.st1.scatter.imm.nxv2i16.nxv2i64( %data_trunc, + %pg, + %base, + i64 16) + ret void +} + +; ST1W +define void @sst1w_s_imm( %data, %pg, %base) { +; CHECK-LABEL: sst1w_s_imm: +; CHECK: st1w { z0.s }, p0, [z1.s, #16] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.scatter.imm.nxv4i32.nxv4i32( %data, + %pg, + %base, + i64 16) + ret void +} + +define void @sst1w_d_imm( %data, %pg, %base) { +; CHECK-LABEL: sst1w_d_imm: +; CHECK: st1w { z0.d }, p0, [z1.d, #16] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.st1.scatter.imm.nxv2i32.nxv2i64( %data_trunc, + %pg, + %base, + i64 16) + ret void +} + +define void @sst1w_s_imm_float( %data, %pg, %base) { +; CHECK-LABEL: sst1w_s_imm_float: +; CHECK: st1w { z0.s }, p0, [z1.s, #16] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.scatter.imm.nxv4f32.nxv4i32( %data, + %pg, + %base, + i64 16) + ret void +} + +; ST1D +define void @sst1d_d_imm( %data, %pg, %base) { +; CHECK-LABEL: sst1d_d_imm: +; CHECK: st1d { z0.d }, p0, [z1.d, #16] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.scatter.imm.nxv2i64.nxv2i64( %data, + %pg, + %base, + i64 16) + ret void +} + +define void @sst1d_d_imm_double( %data, %pg, %base) { +; CHECK-LABEL: sst1d_d_imm_double: +; CHECK: st1d { z0.d }, p0, [z1.d, #16] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.scatter.imm.nxv2f64.nxv2i64( %data, + %pg, + %base, + i64 16) + ret void +} + +; ST1B +declare void @llvm.aarch64.sve.st1.scatter.imm.nxv4i8.nxv4i32(, , , i64) +declare void @llvm.aarch64.sve.st1.scatter.imm.nxv2i8.nxv2i64(, , , i64) + +; ST1H +declare void @llvm.aarch64.sve.st1.scatter.imm.nxv4i16.nxv4i32(, , , i64) +declare void @llvm.aarch64.sve.st1.scatter.imm.nxv2i16.nxv2i64(, , , i64) + +; ST1W +declare void @llvm.aarch64.sve.st1.scatter.imm.nxv4i32.nxv4i32(, , , i64) +declare void @llvm.aarch64.sve.st1.scatter.imm.nxv2i32.nxv2i64(, , , i64) + +declare void @llvm.aarch64.sve.st1.scatter.imm.nxv4f32.nxv4i32(, , , i64) + +; ST1D +declare void @llvm.aarch64.sve.st1.scatter.imm.nxv2i64.nxv2i64(, , , i64) + +declare void @llvm.aarch64.sve.st1.scatter.imm.nxv2f64.nxv2i64(, , , i64)
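
Usage sketch for one of the new intrinsics (not part of the patch itself): the unscaled 64-bit-offset form stores each active lane of the data vector to the scalar base plus the corresponding byte offset. The scalable-vector operand types (<vscale x 2 x i64>, <vscale x 2 x i1>) are assumed here from the nxv2i64 name mangling and the CHECK lines in the tests above; with the patch applied, this should lower to a single st1d.

define void @example_sst1d(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg,
                           i64* %base, <vscale x 2 x i64> %offsets) {
; expected codegen: st1d { z0.d }, p0, [x0, z1.d]
  call void @llvm.aarch64.sve.st1.scatter.nxv2i64(<vscale x 2 x i64> %data,
                                                  <vscale x 2 x i1> %pg,
                                                  i64* %base,
                                                  <vscale x 2 x i64> %offsets)
  ret void
}

declare void @llvm.aarch64.sve.st1.scatter.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i64*, <vscale x 2 x i64>)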