diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -1064,6 +1064,35 @@ llvm_i32_ty], [IntrNoMem, ImmArg<1>]>; +class AdvSIMD_ScatterStore_64bitOffset_Intrinsic + : Intrinsic<[], + [ + llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + LLVMPointerToElt<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i64_ty> + ], + [IntrWriteMem, IntrArgMemOnly]>; + +class AdvSIMD_ScatterStore_32bitOffset_Intrinsic + : Intrinsic<[], + [ + llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + LLVMPointerToElt<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i32_ty> + ], + [IntrWriteMem, IntrArgMemOnly]>; + +class AdvSIMD_ScatterStore_VectorBase_Intrinsic + : Intrinsic<[], + [ + llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_anyvector_ty, llvm_i64_ty + ], + [IntrWriteMem, IntrArgMemOnly, ImmArg<3>]>; + // // Loads // @@ -1407,6 +1436,36 @@ def int_aarch64_sve_ld1_gather_imm : AdvSIMD_GatherLoad_VecTorBase_Intrinsic; // +// Scatter stores: +// + +// scalar + vector, 64 bit unscaled offsets +def int_aarch64_sve_st1_scatter : AdvSIMD_ScatterStore_64bitOffset_Intrinsic; + +// scalar + vector, 64 bit scaled offsets +def int_aarch64_sve_st1_scatter_index + : AdvSIMD_ScatterStore_64bitOffset_Intrinsic; + +// scalar + vector, 32 bit unscaled offsets, sign (sxtw) or zero (zxtw) +// extended to 64 bits +def int_aarch64_sve_st1_scatter_sxtw + : AdvSIMD_ScatterStore_32bitOffset_Intrinsic; + +def int_aarch64_sve_st1_scatter_uxtw + : AdvSIMD_ScatterStore_32bitOffset_Intrinsic; + +// scalar + vector, 32 bit scaled offsets, sign (sxtw) or zero (zxtw) extended +// to 64 bits +def int_aarch64_sve_st1_scatter_sxtw_index + : AdvSIMD_ScatterStore_32bitOffset_Intrinsic; + +def int_aarch64_sve_st1_scatter_uxtw_index + : AdvSIMD_ScatterStore_32bitOffset_Intrinsic; + +// vector base + immediate index +def int_aarch64_sve_st1_scatter_imm : AdvSIMD_ScatterStore_VectorBase_Intrinsic; + +// // SVE2 - Non-widening pairwise arithmetic // diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -223,6 +223,14 @@ GLD1S_UXTW_SCALED, GLD1S_SXTW_SCALED, GLD1S_IMM, + // Scatter store + SST1, + SST1_SCALED, + SST1_UXTW, + SST1_SXTW, + SST1_UXTW_SCALED, + SST1_SXTW_SCALED, + SST1_IMM, // NEON Load/Store with post-increment base updates LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE, diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1357,6 +1357,13 @@ case AArch64ISD::GLD1S_SXTW_SCALED: return "AArch64ISD::GLD1S_SXTW_SCALED"; case AArch64ISD::GLD1S_UXTW_SCALED: return "AArch64ISD::GLD1S_UXTW_SCALED"; case AArch64ISD::GLD1S_IMM: return "AArch64ISD::GLD1S_IMM"; + case AArch64ISD::SST1: return "AArch64ISD::SST1"; + case AArch64ISD::SST1_SCALED: return "AArch64ISD::SST1_SCALED"; + case AArch64ISD::SST1_SXTW: return "AArch64ISD::SST1_SXTW"; + case AArch64ISD::SST1_UXTW: return "AArch64ISD::SST1_UXTW"; + case AArch64ISD::SST1_SXTW_SCALED: return "AArch64ISD::SST1_SXTW_SCALED"; + case AArch64ISD::SST1_UXTW_SCALED: return "AArch64ISD::SST1_UXTW_SCALED"; + case AArch64ISD::SST1_IMM: return "AArch64ISD::SST1_IMM"; } return nullptr; } @@ -12080,6 +12087,75 
@@ } } +static SDValue performST1ScatterCombine(SDNode *N, SelectionDAG &DAG, + unsigned Opcode, + bool OnlyPackedOffsets = true) { + const SDValue Src = N->getOperand(2); + const EVT SrcVT = Src->getValueType(0); + assert(SrcVT.isScalableVector() && + "Scatter stores are only possible for SVE vectors"); + + SDLoc DL(N); + MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT(); + + // Make sure that source data will fit into an SVE register + if (SrcVT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock) + return SDValue(); + + // For FPs, ACLE only supports _packed_ single and double precision types. + if (SrcElVT.isFloatingPoint()) + if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64)) + return SDValue(); + + // Depending on the addressing mode, this is either a pointer or a vector of + // pointers (that fits into one register) + const SDValue Base = N->getOperand(4); + // Depending on the addressing mode, this is either a single offset or a + // vector of offsets (that fits into one register) + SDValue Offset = N->getOperand(5); + + auto &TLI = DAG.getTargetLoweringInfo(); + if (!TLI.isTypeLegal(Base.getValueType())) + return SDValue(); + + // Some scatter store variants allow unpacked offsets, but only as nxv2i32 + // vectors. These are implicitly sign (sxtw) or zero (zxtw) extended to + // nxv2i64. Legalize accordingly. + if (!OnlyPackedOffsets && + Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32) + Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0); + + if (!TLI.isTypeLegal(Offset.getValueType())) + return SDValue(); + + // Source value type that is representable in hardware + EVT HwSrcVt = getSVEContainerType(SrcVT); + + // Keep the original type of the input data to store - this is needed to + // differentiate between ST1B, ST1H, ST1W and ST1D. For FP values we want the + // integer equivalent, so just use HwSrcVt.
+ SDValue InputVT = DAG.getValueType(SrcVT); + if (SrcVT.isFloatingPoint()) + InputVT = DAG.getValueType(HwSrcVt); + + SDVTList VTs = DAG.getVTList(MVT::Other); + SDValue SrcNew; + + if (Src.getValueType().isFloatingPoint()) + SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Src); + else + SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Src); + + SDValue Ops[] = {N->getOperand(0), // Chain + SrcNew, + N->getOperand(3), // Pg + Base, + Offset, + InputVT}; + + return DAG.getNode(Opcode, DL, VTs, Ops); +} + static SDValue performLD1GatherCombine(SDNode *N, SelectionDAG &DAG, unsigned Opcode) { EVT RetVT = N->getValueType(0); @@ -12300,6 +12376,24 @@ return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_UXTW_SCALED); case Intrinsic::aarch64_sve_ld1_gather_imm: return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_IMM); + case Intrinsic::aarch64_sve_st1_scatter: + return performST1ScatterCombine(N, DAG, AArch64ISD::SST1); + case Intrinsic::aarch64_sve_st1_scatter_index: + return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_SCALED); + case Intrinsic::aarch64_sve_st1_scatter_sxtw: + return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_SXTW, + /*OnlyPackedOffsets=*/false); + case Intrinsic::aarch64_sve_st1_scatter_uxtw: + return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_UXTW, + /*OnlyPackedOffsets=*/false); + case Intrinsic::aarch64_sve_st1_scatter_sxtw_index: + return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_SXTW_SCALED, + /*OnlyPackedOffsets=*/false); + case Intrinsic::aarch64_sve_st1_scatter_uxtw_index: + return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_UXTW_SCALED, + /*OnlyPackedOffsets=*/false); + case Intrinsic::aarch64_sve_st1_scatter_imm: + return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_IMM); default: break; } diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -393,6 +393,27 @@ let PrintMethod = "printImmScale<8>"; } +// tuimm5sN predicate - similar to uimm5sN, but use TImmLeaf (TargetConstant) +// instead of ImmLeaf (Constant) +def tuimm5s2 : Operand<i64>, TImmLeaf<i64, [{ return Imm >= 0 && Imm < (32*2) && ((Imm % 2) == 0); }], + UImmS2XForm> { + let ParserMatchClass = UImm5s2Operand; + let PrintMethod = "printImmScale<2>"; +} +def tuimm5s4 : Operand<i64>, TImmLeaf<i64, [{ return Imm >= 0 && Imm < (32*4) && ((Imm % 4) == 0); }], + UImmS4XForm> { + let ParserMatchClass = UImm5s4Operand; + let PrintMethod = "printImmScale<4>"; +} +def tuimm5s8 : Operand<i64>, TImmLeaf<i64, [{ return Imm >= 0 && Imm < (32*8) && ((Imm % 8) == 0); }], + UImmS8XForm> { + let ParserMatchClass = UImm5s8Operand; + let PrintMethod = "printImmScale<8>"; +} + // uimm6sN predicate - True if the immediate is a multiple of N in the range // [0 * N, 64 * N].
def UImm6s1Operand : UImmScaledMemoryIndexed<6, 1>; @@ -750,6 +771,14 @@ let ParserMatchClass = Imm0_31Operand; } +// timm0_31 predicate - same as imm0_31, but use TargetConstant (TImmLeaf) +// instead of Constant (ImmLeaf) +def timm0_31 : Operand<i64>, TImmLeaf<i64, [{ return ((uint64_t)Imm) < 32; }]> { + let ParserMatchClass = Imm0_31Operand; +} + // True if the 32-bit immediate is in the range [0,31] def imm32_0_31 : Operand<i32>, ImmLeaf<i32, [{ diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td def SDT_AArch64_GLD1 : SDTypeProfile<1, 4, [ SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>, SDTCisVec<3>, SDTCisVT<4, OtherVT>, SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1> ]>; +def SDT_AArch64_SST1 : SDTypeProfile<0, 5, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>, SDTCisVec<3>, SDTCisVT<4, OtherVT>, + SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1> +]>; + +def SDT_AArch64_SST1_IMM : SDTypeProfile<0, 5, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>, SDTCisVT<4, OtherVT>, + SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1> +]>; + +def AArch64st1_scatter : SDNode<"AArch64ISD::SST1", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>; +def AArch64st1_scatter_scaled : SDNode<"AArch64ISD::SST1_SCALED", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>; +def AArch64st1_scatter_uxtw : SDNode<"AArch64ISD::SST1_UXTW", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>; +def AArch64st1_scatter_sxtw : SDNode<"AArch64ISD::SST1_SXTW", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>; +def AArch64st1_scatter_uxtw_scaled : SDNode<"AArch64ISD::SST1_UXTW_SCALED", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>; +def AArch64st1_scatter_sxtw_scaled : SDNode<"AArch64ISD::SST1_SXTW_SCALED", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>; +def AArch64st1_scatter_imm : SDNode<"AArch64ISD::SST1_IMM", SDT_AArch64_SST1_IMM, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>; + def AArch64ld1_gather : SDNode<"AArch64ISD::GLD1", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; def AArch64ld1_gather_scaled : SDNode<"AArch64ISD::GLD1_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; def AArch64ld1_gather_uxtw : SDNode<"AArch64ISD::GLD1_UXTW", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; @@ -584,51 +602,55 @@ defm ST1W_D : sve_mem_cst_ss<0b1011, "st1w", Z_d, ZPR64, GPR64NoXZRshifted32>; defm ST1D : sve_mem_cst_ss<0b1111, "st1d", Z_d, ZPR64, GPR64NoXZRshifted64>; - // Scatters using unscaled 32-bit offsets, e.g. - // st1h z0.s, p0, [x0, z0.s, uxtw] - // and unpacked: + // Scatters using unpacked, unscaled 32-bit offsets, e.g. // st1h z0.d, p0, [x0, z0.d, uxtw] - defm SST1B_D : sve_mem_sst_sv_32_unscaled<0b000, "st1b", Z_d, ZPR64, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only>; - defm SST1B_S : sve_mem_sst_sv_32_unscaled<0b001, "st1b", Z_s, ZPR32, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only>; - defm SST1H_D : sve_mem_sst_sv_32_unscaled<0b010, "st1h", Z_d, ZPR64, ZPR64ExtSXTW8, ZPR64ExtUXTW8>; - defm SST1H_S : sve_mem_sst_sv_32_unscaled<0b011, "st1h", Z_s, ZPR32, ZPR32ExtSXTW8, ZPR32ExtUXTW8>; - defm SST1W_D : sve_mem_sst_sv_32_unscaled<0b100, "st1w", Z_d, ZPR64, ZPR64ExtSXTW8, ZPR64ExtUXTW8>; - defm SST1W : sve_mem_sst_sv_32_unscaled<0b101, "st1w", Z_s, ZPR32, ZPR32ExtSXTW8, ZPR32ExtUXTW8>; - defm SST1D : sve_mem_sst_sv_32_unscaled<0b110, "st1d", Z_d, ZPR64, ZPR64ExtSXTW8, ZPR64ExtUXTW8>; - - // Scatters using scaled 32-bit offsets, e.g.
+ defm SST1B_D : sve_mem_64b_sst_sv_32_unscaled<0b000, "st1b", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>; + defm SST1H_D : sve_mem_64b_sst_sv_32_unscaled<0b010, "st1h", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>; + defm SST1W_D : sve_mem_64b_sst_sv_32_unscaled<0b100, "st1w", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8,nxv2i32>; + defm SST1D : sve_mem_64b_sst_sv_32_unscaled<0b110, "st1d", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i64>; + + // Scatters using packed, unscaled 32-bit offsets, e.g. + // st1h z0.s, p0, [x0, z0.s, uxtw] + defm SST1B_S : sve_mem_32b_sst_sv_32_unscaled<0b001, "st1b", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>; + defm SST1H_S : sve_mem_32b_sst_sv_32_unscaled<0b011, "st1h", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>; + defm SST1W : sve_mem_32b_sst_sv_32_unscaled<0b101, "st1w", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i32>; + + // Scatters using packed, scaled 32-bit offsets, e.g. // st1h z0.s, p0, [x0, z0.s, uxtw #1] - // and unpacked: + defm SST1H_S : sve_mem_32b_sst_sv_32_scaled<0b011, "st1h", AArch64st1_scatter_sxtw_scaled, AArch64st1_scatter_uxtw_scaled, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>; + defm SST1W : sve_mem_32b_sst_sv_32_scaled<0b101, "st1w", AArch64st1_scatter_sxtw_scaled, AArch64st1_scatter_uxtw_scaled, ZPR32ExtSXTW32, ZPR32ExtUXTW32, nxv4i32>; + + // Scatters using unpacked, scaled 32-bit offsets, e.g. // st1h z0.d, p0, [x0, z0.d, uxtw #1] - defm SST1H_D : sve_mem_sst_sv_32_scaled<0b010, "st1h", Z_d, ZPR64, ZPR64ExtSXTW16, ZPR64ExtUXTW16>; - defm SST1H_S : sve_mem_sst_sv_32_scaled<0b011, "st1h", Z_s, ZPR32, ZPR32ExtSXTW16, ZPR32ExtUXTW16>; - defm SST1W_D : sve_mem_sst_sv_32_scaled<0b100, "st1w", Z_d, ZPR64, ZPR64ExtSXTW32, ZPR64ExtUXTW32>; - defm SST1W : sve_mem_sst_sv_32_scaled<0b101, "st1w", Z_s, ZPR32, ZPR32ExtSXTW32, ZPR32ExtUXTW32>; - defm SST1D : sve_mem_sst_sv_32_scaled<0b110, "st1d", Z_d, ZPR64, ZPR64ExtSXTW64, ZPR64ExtUXTW64>; + defm SST1H_D : sve_mem_64b_sst_sv_32_scaled<0b010, "st1h", AArch64st1_scatter_sxtw_scaled, AArch64st1_scatter_uxtw_scaled, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>; + defm SST1W_D : sve_mem_64b_sst_sv_32_scaled<0b100, "st1w", AArch64st1_scatter_sxtw_scaled, AArch64st1_scatter_uxtw_scaled, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>; + defm SST1D : sve_mem_64b_sst_sv_32_scaled<0b110, "st1d", AArch64st1_scatter_sxtw_scaled, AArch64st1_scatter_uxtw_scaled, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>; // Scatters using 32/64-bit pointers with offset, e.g. // st1h z0.s, p0, [z0.s, #16] + defm SST1B_S : sve_mem_32b_sst_vi_ptrs<0b001, "st1b", timm0_31, AArch64st1_scatter_imm, nxv4i8>; + defm SST1H_S : sve_mem_32b_sst_vi_ptrs<0b011, "st1h", tuimm5s2, AArch64st1_scatter_imm, nxv4i16>; + defm SST1W : sve_mem_32b_sst_vi_ptrs<0b101, "st1w", tuimm5s4, AArch64st1_scatter_imm, nxv4i32>; + + // Scatters using 32/64-bit pointers with offset, e.g. 
// st1h z0.d, p0, [z0.d, #16] - defm SST1B_D : sve_mem_sst_vi_ptrs<0b000, "st1b", Z_d, ZPR64, imm0_31>; - defm SST1B_S : sve_mem_sst_vi_ptrs<0b001, "st1b", Z_s, ZPR32, imm0_31>; - defm SST1H_D : sve_mem_sst_vi_ptrs<0b010, "st1h", Z_d, ZPR64, uimm5s2>; - defm SST1H_S : sve_mem_sst_vi_ptrs<0b011, "st1h", Z_s, ZPR32, uimm5s2>; - defm SST1W_D : sve_mem_sst_vi_ptrs<0b100, "st1w", Z_d, ZPR64, uimm5s4>; - defm SST1W : sve_mem_sst_vi_ptrs<0b101, "st1w", Z_s, ZPR32, uimm5s4>; - defm SST1D : sve_mem_sst_vi_ptrs<0b110, "st1d", Z_d, ZPR64, uimm5s8>; + defm SST1B_D : sve_mem_64b_sst_vi_ptrs<0b000, "st1b", timm0_31, AArch64st1_scatter_imm, nxv2i8>; + defm SST1H_D : sve_mem_64b_sst_vi_ptrs<0b010, "st1h", tuimm5s2, AArch64st1_scatter_imm, nxv2i16>; + defm SST1W_D : sve_mem_64b_sst_vi_ptrs<0b100, "st1w", tuimm5s4, AArch64st1_scatter_imm, nxv2i32>; + defm SST1D : sve_mem_64b_sst_vi_ptrs<0b110, "st1d", tuimm5s8, AArch64st1_scatter_imm, nxv2i64>; // Scatters using unscaled 64-bit offsets, e.g. // st1h z0.d, p0, [x0, z0.d] - defm SST1B_D : sve_mem_sst_sv_64_unscaled<0b00, "st1b">; - defm SST1H_D : sve_mem_sst_sv_64_unscaled<0b01, "st1h">; - defm SST1W_D : sve_mem_sst_sv_64_unscaled<0b10, "st1w">; - defm SST1D : sve_mem_sst_sv_64_unscaled<0b11, "st1d">; + defm SST1B_D : sve_mem_sst_sv_64_unscaled<0b00, "st1b", AArch64st1_scatter, nxv2i8>; + defm SST1H_D : sve_mem_sst_sv_64_unscaled<0b01, "st1h", AArch64st1_scatter, nxv2i16>; + defm SST1W_D : sve_mem_sst_sv_64_unscaled<0b10, "st1w", AArch64st1_scatter, nxv2i32>; + defm SST1D : sve_mem_sst_sv_64_unscaled<0b11, "st1d", AArch64st1_scatter, nxv2i64>; // Scatters using scaled 64-bit offsets, e.g. // st1h z0.d, p0, [x0, z0.d, lsl #1] - defm SST1H_D_SCALED : sve_mem_sst_sv_64_scaled<0b01, "st1h", ZPR64ExtLSL16>; - defm SST1W_D_SCALED : sve_mem_sst_sv_64_scaled<0b10, "st1w", ZPR64ExtLSL32>; - defm SST1D_SCALED : sve_mem_sst_sv_64_scaled<0b11, "st1d", ZPR64ExtLSL64>; + defm SST1H_D_SCALED : sve_mem_sst_sv_64_scaled<0b01, "st1h", AArch64st1_scatter_scaled, ZPR64ExtLSL16, nxv2i16>; + defm SST1W_D_SCALED : sve_mem_sst_sv_64_scaled<0b10, "st1w", AArch64st1_scatter_scaled, ZPR64ExtLSL32, nxv2i32>; + defm SST1D_SCALED : sve_mem_sst_sv_64_scaled<0b11, "st1d", AArch64st1_scatter_scaled, ZPR64ExtLSL64, nxv2i64>; // ST(2|3|4) structured stores (register + immediate) defm ST2B_IMM : sve_mem_est_si<0b00, 0b01, ZZ_b, "st2b", simm4s2>; diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -4564,32 +4564,84 @@ let mayStore = 1; } -multiclass sve_mem_sst_sv_32_scaled opc, string asm, - RegisterOperand listty, - ZPRRegOp zprty, +multiclass sve_mem_32b_sst_sv_32_scaled opc, string asm, + SDPatternOperator sxtw_op, + SDPatternOperator uxtw_op, RegisterOperand sxtw_opnd, - RegisterOperand uxtw_opnd > { - def _UXTW_SCALED : sve_mem_sst_sv; - def _SXTW_SCALED : sve_mem_sst_sv; + RegisterOperand uxtw_opnd, + ValueType vt > { + def _UXTW_SCALED : sve_mem_sst_sv; + def _SXTW_SCALED : sve_mem_sst_sv; def : InstAlias(NAME # _UXTW_SCALED) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>; + (!cast(NAME # _UXTW_SCALED) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>; def : InstAlias(NAME # _SXTW_SCALED) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>; + (!cast(NAME # _SXTW_SCALED) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>; + + def : Pat<(uxtw_op (nxv4i32 ZPR:$data), (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 
ZPR:$offsets), vt), + (!cast(NAME # _UXTW_SCALED) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; + def : Pat<(sxtw_op (nxv4i32 ZPR:$data), (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$offsets), vt), + (!cast(NAME # _SXTW_SCALED) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; } -multiclass sve_mem_sst_sv_32_unscaled opc, string asm, - RegisterOperand listty, - ZPRRegOp zprty, - RegisterOperand sxtw_opnd, - RegisterOperand uxtw_opnd> { - def _UXTW : sve_mem_sst_sv; - def _SXTW : sve_mem_sst_sv; +multiclass sve_mem_64b_sst_sv_32_scaled opc, string asm, + SDPatternOperator sxtw_op, + SDPatternOperator uxtw_op, + RegisterOperand sxtw_opnd, + RegisterOperand uxtw_opnd, + ValueType vt > { + def _UXTW_SCALED : sve_mem_sst_sv; + def _SXTW_SCALED : sve_mem_sst_sv; + + def : InstAlias(NAME # _UXTW_SCALED) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>; + def : InstAlias(NAME # _SXTW_SCALED) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>; + + def : Pat<(uxtw_op (nxv2i64 ZPR:$data), (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt), + (!cast(NAME # _UXTW_SCALED) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; + def : Pat<(sxtw_op (nxv2i64 ZPR:$data), (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt), + (!cast(NAME # _SXTW_SCALED) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; +} + +multiclass sve_mem_64b_sst_sv_32_unscaled opc, string asm, + SDPatternOperator sxtw_op, + SDPatternOperator uxtw_op, + RegisterOperand sxtw_opnd, + RegisterOperand uxtw_opnd, + ValueType vt> { + def _UXTW : sve_mem_sst_sv; + def _SXTW : sve_mem_sst_sv; def : InstAlias(NAME # _UXTW) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>; + (!cast(NAME # _UXTW) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>; def : InstAlias(NAME # _SXTW) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>; + (!cast(NAME # _SXTW) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>; + + def : Pat<(uxtw_op (nxv2i64 ZPR:$data), (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt), + (!cast(NAME # _UXTW) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; + def : Pat<(sxtw_op (nxv2i64 ZPR:$data), (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt), + (!cast(NAME # _SXTW) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; +} + +multiclass sve_mem_32b_sst_sv_32_unscaled opc, string asm, + SDPatternOperator sxtw_op, + SDPatternOperator uxtw_op, + RegisterOperand sxtw_opnd, + RegisterOperand uxtw_opnd, + ValueType vt> { + def _UXTW : sve_mem_sst_sv; + def _SXTW : sve_mem_sst_sv; + + def : InstAlias(NAME # _UXTW) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), 0>; + def : InstAlias(NAME # _SXTW) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>; + + def : Pat<(uxtw_op (nxv4i32 ZPR:$data), (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$offsets), vt), + (!cast(NAME # _UXTW) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; + def : Pat<(sxtw_op (nxv4i32 ZPR:$data), (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$offsets), vt), + (!cast(NAME # _SXTW) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; } class sve_mem_sst_sv2 msz, bit scaled, string asm, @@ -4616,19 +4668,28 @@ } multiclass sve_mem_sst_sv_64_scaled msz, string asm, - RegisterOperand zprext> { - def "" : sve_mem_sst_sv2; + SDPatternOperator op, + RegisterOperand zprext, + ValueType vt> { + def _SCALED_REAL : sve_mem_sst_sv2; def : InstAlias(NAME) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm), 0>; + (!cast(NAME # _SCALED_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm), 0>; + 
def : Pat<(op (nxv2i64 ZPR:$data), (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$indices), vt), + (!cast(NAME # _SCALED_REAL) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$indices)>; } -multiclass sve_mem_sst_sv_64_unscaled msz, string asm> { - def "" : sve_mem_sst_sv2; +multiclass sve_mem_sst_sv_64_unscaled msz, string asm, + SDPatternOperator op, + ValueType vt> { + def _REAL : sve_mem_sst_sv2; def : InstAlias(NAME) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm), 0>; + (!cast(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm), 0>; + + def : Pat<(op (nxv2i64 ZPR:$data), (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt), + (!cast(NAME # _REAL) ZPR:$data, PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; } class sve_mem_sst_vi opc, string asm, ZPRRegOp zprty, @@ -4654,16 +4715,38 @@ let mayStore = 1; } -multiclass sve_mem_sst_vi_ptrs opc, string asm, RegisterOperand listty, - ZPRRegOp zprty, Operand imm_ty> { - def _IMM : sve_mem_sst_vi; +multiclass sve_mem_32b_sst_vi_ptrs opc, string asm, + Operand imm_ty, + SDPatternOperator op, + ValueType vt> { + def _IMM : sve_mem_sst_vi; def : InstAlias(NAME # _IMM) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, 0), 0>; + (!cast(NAME # _IMM) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, 0), 0>; def : InstAlias(NAME # _IMM) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, imm_ty:$imm5), 0>; + (!cast(NAME # _IMM) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, imm_ty:$imm5), 0>; def : InstAlias(NAME # _IMM) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, 0), 1>; + (!cast(NAME # _IMM) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, 0), 1>; + + def : Pat<(op (nxv4i32 ZPR:$data), (nxv4i1 PPR:$gp), (nxv4i32 ZPR:$ptrs), imm_ty:$index, vt), + (!cast(NAME # _IMM) ZPR:$data, PPR:$gp, ZPR:$ptrs, imm_ty:$index)>; +} + +multiclass sve_mem_64b_sst_vi_ptrs opc, string asm, + Operand imm_ty, + SDPatternOperator op, + ValueType vt> { + def _IMM : sve_mem_sst_vi; + + def : InstAlias(NAME # _IMM) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, 0), 0>; + def : InstAlias(NAME # _IMM) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, imm_ty:$imm5), 0>; + def : InstAlias(NAME # _IMM) Z_s:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, 0), 1>; + + def : Pat<(op (nxv2i64 ZPR:$data), (nxv2i1 PPR:$gp), (nxv2i64 ZPR:$ptrs), imm_ty:$index, vt), + (!cast(NAME # _IMM) ZPR:$data, PPR:$gp, ZPR:$ptrs, imm_ty:$index)>; } class sve_mem_z_spill diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-scatter-stores-32bit-scaled-offsets.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-scatter-stores-32bit-scaled-offsets.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-scatter-stores-32bit-scaled-offsets.ll @@ -0,0 +1,193 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s + +; +; ST1H, ST1W, ST1D: base + 32-bit scaled offset, sign (sxtw) or zero +; (uxtw) extended to 64 bits. +; e.g. 
st1h { z0.d }, p0, [x0, z1.d, uxtw #1] +; + +; ST1H +define void @sst1h_s_uxtw( %data, %pg, i16* %base, %indices) { +; CHECK-LABEL: sst1h_s_uxtw: +; CHECK: st1h { z0.s }, p0, [x0, z1.s, uxtw #1] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.st1.scatter.uxtw.index.nxv4i16( %data_trunc, + %pg, + i16* %base, + %indices) + ret void +} + +define void @sst1h_s_sxtw( %data, %pg, i16* %base, %indices) { +; CHECK-LABEL: sst1h_s_sxtw: +; CHECK: st1h { z0.s }, p0, [x0, z1.s, sxtw #1] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv4i16( %data_trunc, + %pg, + i16* %base, + %indices) + ret void +} + +define void @sst1h_d_uxtw( %data, %pg, i16* %base, %indices) { +; CHECK-LABEL: sst1h_d_uxtw: +; CHECK: st1h { z0.d }, p0, [x0, z1.d, uxtw #1] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.st1.scatter.uxtw.index.nxv2i16( %data_trunc, + %pg, + i16* %base, + %indices) + ret void +} + +define void @sst1h_d_sxtw( %data, %pg, i16* %base, %indices) { +; CHECK-LABEL: sst1h_d_sxtw: +; CHECK: st1h { z0.d }, p0, [x0, z1.d, sxtw #1] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv2i16( %data_trunc, + %pg, + i16* %base, + %indices) + ret void +} + +; ST1W +define void @sst1w_s_uxtw( %data, %pg, i32* %base, %indices) { +; CHECK-LABEL: sst1w_s_uxtw: +; CHECK: st1w { z0.s }, p0, [x0, z1.s, uxtw #2] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.scatter.uxtw.index.nxv4i32( %data, + %pg, + i32* %base, + %indices) + ret void +} + +define void @sst1w_s_sxtw( %data, %pg, i32* %base, %indices) { +; CHECK-LABEL: sst1w_s_sxtw: +; CHECK: st1w { z0.s }, p0, [x0, z1.s, sxtw #2] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv4i32( %data, + %pg, + i32* %base, + %indices) + ret void +} + +define void @sst1w_d_uxtw( %data, %pg, i32* %base, %indices) { +; CHECK-LABEL: sst1w_d_uxtw: +; CHECK: st1w { z0.d }, p0, [x0, z1.d, uxtw #2] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.st1.scatter.uxtw.index.nxv2i32( %data_trunc, + %pg, + i32* %base, + %indices) + ret void +} + +define void @sst1w_d_sxtw( %data, %pg, i32* %base, %indices) { +; CHECK-LABEL: sst1w_d_sxtw: +; CHECK: st1w { z0.d }, p0, [x0, z1.d, sxtw #2] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv2i32( %data_trunc, + %pg, + i32* %base, + %indices) + ret void +} + +define void @sst1w_s_uxtw_float( %data, %pg, float* %base, %indices) { +; CHECK-LABEL: sst1w_s_uxtw_float: +; CHECK: st1w { z0.s }, p0, [x0, z1.s, uxtw #2] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.scatter.uxtw.index.nxv4f32( %data, + %pg, + float* %base, + %indices) + ret void +} + +define void @sst1w_s_sxtw_float( %data, %pg, float* %base, %indices) { +; CHECK-LABEL: sst1w_s_sxtw_float: +; CHECK: st1w { z0.s }, p0, [x0, z1.s, sxtw #2] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv4f32( %data, + %pg, + float* %base, + %indices) + ret void +} + +; ST1D +define void @sst1d_d_uxtw( %data, %pg, i64* %base, %indices) { +; CHECK-LABEL: sst1d_d_uxtw: +; CHECK: st1d { z0.d }, p0, [x0, z1.d, uxtw #3] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.scatter.uxtw.index.nxv2i64( %data, + %pg, + i64* %base, + %indices) + ret void +} + +define void @sst1d_d_sxtw( %data, %pg, i64* %base, %indices) { +; CHECK-LABEL: sst1d_d_sxtw: +; CHECK: st1d { z0.d }, p0, [x0, z1.d, sxtw #3] +; 
CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv2i64( %data, + %pg, + i64* %base, + %indices) + ret void +} + +define void @sst1d_d_uxtw_double( %data, %pg, double* %base, %indices) { +; CHECK-LABEL: sst1d_d_uxtw_double: +; CHECK: st1d { z0.d }, p0, [x0, z1.d, uxtw #3] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.scatter.uxtw.index.nxv2f64( %data, + %pg, + double* %base, + %indices) + ret void +} + +define void @sst1d_d_sxtw_double( %data, %pg, double* %base, %indices) { +; CHECK-LABEL: sst1d_d_sxtw_double: +; CHECK: st1d { z0.d }, p0, [x0, z1.d, sxtw #3] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv2f64( %data, + %pg, + double* %base, + %indices) + ret void +} + + +; ST1H +declare void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv4i16(, , i16*, ) +declare void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv2i16(, , i16*, ) +declare void @llvm.aarch64.sve.st1.scatter.uxtw.index.nxv4i16(, , i16*, ) +declare void @llvm.aarch64.sve.st1.scatter.uxtw.index.nxv2i16(, , i16*, ) + +; ST1W +declare void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv4i32(, , i32*, ) +declare void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv2i32(, , i32*, ) +declare void @llvm.aarch64.sve.st1.scatter.uxtw.index.nxv4i32(, , i32*, ) +declare void @llvm.aarch64.sve.st1.scatter.uxtw.index.nxv2i32(, , i32*, ) + +declare void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv4f32(, , float*, ) +declare void @llvm.aarch64.sve.st1.scatter.uxtw.index.nxv4f32(, , float*, ) + +; ST1D +declare void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv2i64(, , i64*, ) +declare void @llvm.aarch64.sve.st1.scatter.uxtw.index.nxv2i64(, , i64*, ) + +declare void @llvm.aarch64.sve.st1.scatter.sxtw.index.nxv2f64(, , double*, ) +declare void @llvm.aarch64.sve.st1.scatter.uxtw.index.nxv2f64(, , double*, ) diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-scatter-stores-32bit-unscaled-offsets.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-scatter-stores-32bit-unscaled-offsets.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-scatter-stores-32bit-unscaled-offsets.ll @@ -0,0 +1,248 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s + +; +; ST1B, ST1W, ST1H, ST1D: base + 32-bit unscaled offset, sign (sxtw) or zero +; (uxtw) extended to 64 bits. +; e.g. 
st1h { z0.d }, p0, [x0, z1.d, uxtw] +; + +; ST1B +define void @sst1b_s_uxtw( %data, %pg, i8* %base, %offsets) { +; CHECK-LABEL: sst1b_s_uxtw: +; CHECK: st1b { z0.s }, p0, [x0, z1.s, uxtw] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.st1.scatter.uxtw.nxv4i8( %data_trunc, + %pg, + i8* %base, + %offsets) + ret void +} + +define void @sst1b_s_sxtw( %data, %pg, i8* %base, %offsets) { +; CHECK-LABEL: sst1b_s_sxtw: +; CHECK: st1b { z0.s }, p0, [x0, z1.s, sxtw] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.st1.scatter.sxtw.nxv4i8( %data_trunc, + %pg, + i8* %base, + %offsets) + ret void +} + +define void @sst1b_d_uxtw( %data, %pg, i8* %base, %offsets) { +; CHECK-LABEL: sst1b_d_uxtw: +; CHECK: st1b { z0.d }, p0, [x0, z1.d, uxtw] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.st1.scatter.uxtw.nxv2i8( %data_trunc, + %pg, + i8* %base, + %offsets) + ret void +} + +define void @sst1b_d_sxtw( %data, %pg, i8* %base, %offsets) { +; CHECK-LABEL: sst1b_d_sxtw: +; CHECK: st1b { z0.d }, p0, [x0, z1.d, sxtw] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.st1.scatter.sxtw.nxv2i8( %data_trunc, + %pg, + i8* %base, + %offsets) + ret void +} + +; ST1H +define void @sst1h_s_uxtw( %data, %pg, i16* %base, %offsets) { +; CHECK-LABEL: sst1h_s_uxtw: +; CHECK: st1h { z0.s }, p0, [x0, z1.s, uxtw] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.st1.scatter.uxtw.nxv4i16( %data_trunc, + %pg, + i16* %base, + %offsets) + ret void +} + +define void @sst1h_s_sxtw( %data, %pg, i16* %base, %offsets) { +; CHECK-LABEL: sst1h_s_sxtw: +; CHECK: st1h { z0.s }, p0, [x0, z1.s, sxtw] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.st1.scatter.sxtw.nxv4i16( %data_trunc, + %pg, + i16* %base, + %offsets) + ret void +} + +define void @sst1h_d_uxtw( %data, %pg, i16* %base, %offsets) { +; CHECK-LABEL: sst1h_d_uxtw: +; CHECK: st1h { z0.d }, p0, [x0, z1.d, uxtw] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.st1.scatter.uxtw.nxv2i16( %data_trunc, + %pg, + i16* %base, + %offsets) + ret void +} + +define void @sst1h_d_sxtw( %data, %pg, i16* %base, %offsets) { +; CHECK-LABEL: sst1h_d_sxtw: +; CHECK: st1h { z0.d }, p0, [x0, z1.d, sxtw] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.st1.scatter.sxtw.nxv2i16( %data_trunc, + %pg, + i16* %base, + %offsets) + ret void +} + +; ST1W +define void @sst1w_s_uxtw( %data, %pg, i32* %base, %offsets) { +; CHECK-LABEL: sst1w_s_uxtw: +; CHECK: st1w { z0.s }, p0, [x0, z1.s, uxtw] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.scatter.uxtw.nxv4i32( %data, + %pg, + i32* %base, + %offsets) + ret void +} + +define void @sst1w_s_sxtw( %data, %pg, i32* %base, %offsets) { +; CHECK-LABEL: sst1w_s_sxtw: +; CHECK: st1w { z0.s }, p0, [x0, z1.s, sxtw] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.scatter.sxtw.nxv4i32( %data, + %pg, + i32* %base, + %offsets) + ret void +} + +define void @sst1w_d_uxtw( %data, %pg, i32* %base, %offsets) { +; CHECK-LABEL: sst1w_d_uxtw: +; CHECK: st1w { z0.d }, p0, [x0, z1.d, uxtw] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.st1.scatter.uxtw.nxv2i32( %data_trunc, + %pg, + i32* %base, + %offsets) + ret void +} + +define void @sst1w_d_sxtw( %data, %pg, i32* %base, %offsets) { +; CHECK-LABEL: sst1w_d_sxtw: +; CHECK: st1w { z0.d }, p0, [x0, z1.d, sxtw] +; CHECK-NEXT: ret + %data_trunc = trunc %data 
to + call void @llvm.aarch64.sve.st1.scatter.sxtw.nxv2i32( %data_trunc, + %pg, + i32* %base, + %offsets) + ret void +} + +define void @sst1w_s_uxtw_float( %data, %pg, float* %base, %offsets) { +; CHECK-LABEL: sst1w_s_uxtw_float: +; CHECK: st1w { z0.s }, p0, [x0, z1.s, uxtw] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.scatter.uxtw.nxv4f32( %data, + %pg, + float* %base, + %offsets) + ret void +} + +define void @sst1w_s_sxtw_float( %data, %pg, float* %base, %offsets) { +; CHECK-LABEL: sst1w_s_sxtw_float: +; CHECK: st1w { z0.s }, p0, [x0, z1.s, sxtw] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.scatter.sxtw.nxv4f32( %data, + %pg, + float* %base, + %offsets) + ret void +} + +; ST1D +define void @sst1d_d_uxtw( %data, %pg, i64* %base, %offsets) { +; CHECK-LABEL: sst1d_d_uxtw: +; CHECK: st1d { z0.d }, p0, [x0, z1.d, uxtw] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.scatter.uxtw.nxv2i64( %data, + %pg, + i64* %base, + %offsets) + ret void +} + +define void @sst1d_d_sxtw( %data, %pg, i64* %base, %offsets) { +; CHECK-LABEL: sst1d_d_sxtw: +; CHECK: st1d { z0.d }, p0, [x0, z1.d, sxtw] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.scatter.sxtw.nxv2i64( %data, + %pg, + i64* %base, + %offsets) + ret void +} + +define void @sst1d_d_uxtw_double( %data, %pg, double* %base, %offsets) { +; CHECK-LABEL: sst1d_d_uxtw_double: +; CHECK: st1d { z0.d }, p0, [x0, z1.d, uxtw] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.scatter.uxtw.nxv2f64( %data, + %pg, + double* %base, + %offsets) + ret void +} + +define void @sst1d_d_sxtw_double( %data, %pg, double* %base, %offsets) { +; CHECK-LABEL: sst1d_d_sxtw_double: +; CHECK: st1d { z0.d }, p0, [x0, z1.d, sxtw] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.scatter.sxtw.nxv2f64( %data, + %pg, + double* %base, + %offsets) + ret void +} + + +; ST1B +declare void @llvm.aarch64.sve.st1.scatter.uxtw.nxv4i8(, , i8*, ) +declare void @llvm.aarch64.sve.st1.scatter.uxtw.nxv2i8(, , i8*, ) +declare void @llvm.aarch64.sve.st1.scatter.sxtw.nxv4i8(, , i8*, ) +declare void @llvm.aarch64.sve.st1.scatter.sxtw.nxv2i8(, , i8*, ) + +; ST1H +declare void @llvm.aarch64.sve.st1.scatter.sxtw.nxv4i16(, , i16*, ) +declare void @llvm.aarch64.sve.st1.scatter.sxtw.nxv2i16(, , i16*, ) +declare void @llvm.aarch64.sve.st1.scatter.uxtw.nxv4i16(, , i16*, ) +declare void @llvm.aarch64.sve.st1.scatter.uxtw.nxv2i16(, , i16*, ) + +; ST1W +declare void @llvm.aarch64.sve.st1.scatter.sxtw.nxv4i32(, , i32*, ) +declare void @llvm.aarch64.sve.st1.scatter.sxtw.nxv2i32(, , i32*, ) +declare void @llvm.aarch64.sve.st1.scatter.uxtw.nxv4i32(, , i32*, ) +declare void @llvm.aarch64.sve.st1.scatter.uxtw.nxv2i32(, , i32*, ) + +declare void @llvm.aarch64.sve.st1.scatter.sxtw.nxv4f32(, , float*, ) +declare void @llvm.aarch64.sve.st1.scatter.uxtw.nxv4f32(, , float*, ) + +; ST1D +declare void @llvm.aarch64.sve.st1.scatter.sxtw.nxv2i64(, , i64*, ) +declare void @llvm.aarch64.sve.st1.scatter.uxtw.nxv2i64(, , i64*, ) + +declare void @llvm.aarch64.sve.st1.scatter.sxtw.nxv2f64(, , double*, ) +declare void @llvm.aarch64.sve.st1.scatter.uxtw.nxv2f64(, , double*, ) diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-scatter-stores-64bit-scaled-offset.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-scatter-stores-64bit-scaled-offset.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-scatter-stores-64bit-scaled-offset.ll @@ -0,0 +1,58 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s + +; +; ST1H, ST1W, ST1D: base + 64-bit scaled offset +; 
e.g. st1h { z0.d }, p0, [x0, z0.d, lsl #1] +; + +define void @sst1h_index( %data, %pg, i16* %base, %offsets) { +; CHECK-LABEL: sst1h_index +; CHECK: st1h { z0.d }, p0, [x0, z1.d, lsl #1] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.st1.scatter.index.nxv2i16( %data_trunc, + %pg, + i16* %base, + %offsets) + ret void +} + +define void @sst1w_index( %data, %pg, i32* %base, %offsets) { +; CHECK-LABEL: sst1w_index +; CHECK: st1w { z0.d }, p0, [x0, z1.d, lsl #2] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.st1.scatter.index.nxv2i32( %data_trunc, + %pg, + i32* %base, + %offsets) + ret void +} + +define void @sst1d_index( %data, %pg, i64* %base, %offsets) { +; CHECK-LABEL: sst1d_index +; CHECK: st1d { z0.d }, p0, [x0, z1.d, lsl #3] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.scatter.index.nxv2i64( %data, + %pg, + i64* %base, + %offsets) + ret void +} + +define void @sst1d_index_double( %data, %pg, double* %base, %offsets) { +; CHECK-LABEL: sst1d_index_double +; CHECK: st1d { z0.d }, p0, [x0, z1.d, lsl #3] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.scatter.index.nxv2f64( %data, + %pg, + double* %base, + %offsets) + ret void +} + + +declare void @llvm.aarch64.sve.st1.scatter.index.nxv2i16(, , i16*, ) +declare void @llvm.aarch64.sve.st1.scatter.index.nxv2i32(, , i32*, ) +declare void @llvm.aarch64.sve.st1.scatter.index.nxv2i64(, , i64*, ) +declare void @llvm.aarch64.sve.st1.scatter.index.nxv2f64(, , double*, ) diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-scatter-stores-64bit-unscaled-offset.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-scatter-stores-64bit-unscaled-offset.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-scatter-stores-64bit-unscaled-offset.ll @@ -0,0 +1,70 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s + +; +; ST1B, ST1W, ST1H, ST1D: base + 64-bit unscaled offset +; e.g. 
st1h { z0.d }, p0, [x0, z1.d] +; + +define void @sst1b_d( %data, %pg, i8* %base, %b) { +; CHECK-LABEL: sst1b_d: +; CHECK: st1b { z0.d }, p0, [x0, z1.d] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.st1.scatter.nxv2i8( %data_trunc, + %pg, + i8* %base, + %b) + ret void +} + +define void @sst1h_d( %data, %pg, i16* %base, %b) { +; CHECK-LABEL: sst1h_d: +; CHECK: st1h { z0.d }, p0, [x0, z1.d] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.st1.scatter.nxv2i16( %data_trunc, + %pg, + i16* %base, + %b) + ret void +} + +define void @sst1w_d( %data, %pg, i32* %base, %b) { +; CHECK-LABEL: sst1w_d: +; CHECK: st1w { z0.d }, p0, [x0, z1.d] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.st1.scatter.nxv2i32( %data_trunc, + %pg, + i32* %base, + %b) + ret void +} + +define void @sst1d_d( %data, %pg, i64* %base, %b) { +; CHECK-LABEL: sst1d_d: +; CHECK: st1d { z0.d }, p0, [x0, z1.d] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.scatter.nxv2i64( %data, + %pg, + i64* %base, + %b) + ret void +} + +define void @sst1d_d_double( %data, %pg, double* %base, %b) { +; CHECK-LABEL: sst1d_d_double: +; CHECK: st1d { z0.d }, p0, [x0, z1.d] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.scatter.nxv2f64( %data, + %pg, + double* %base, + %b) + ret void +} + +declare void @llvm.aarch64.sve.st1.scatter.nxv2i8(, , i8*, ) +declare void @llvm.aarch64.sve.st1.scatter.nxv2i16(, , i16*, ) +declare void @llvm.aarch64.sve.st1.scatter.nxv2i32(, , i32*, ) +declare void @llvm.aarch64.sve.st1.scatter.nxv2i64(, , i64*, ) +declare void @llvm.aarch64.sve.st1.scatter.nxv2f64(, , double*, ) diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-scatter-stores-vector-base.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-scatter-stores-vector-base.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-scatter-stores-vector-base.ll @@ -0,0 +1,133 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s + +; +; ST1B, ST1W, ST1H, ST1D: vector + immediate (index) +; e.g. 
st1h { z0.s }, p0, [z1.s, #16] +; + +; ST1B +define void @sst1b_s_imm( %data, %pg, %base) { +; CHECK-LABEL: sst1b_s_imm: +; CHECK: st1b { z0.s }, p0, [z1.s, #16] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.st1.scatter.imm.nxv4i8.nxv4i32( %data_trunc, + %pg, + %base, + i64 16) + ret void +} + +define void @sst1b_d_imm( %data, %pg, %base) { +; CHECK-LABEL: sst1b_d_imm: +; CHECK: st1b { z0.d }, p0, [z1.d, #16] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.st1.scatter.imm.nxv2i8.nxv2i64( %data_trunc, + %pg, + %base, + i64 16) + ret void +} + +; ST1H +define void @sst1h_s_imm( %data, %pg, %base) { +; CHECK-LABEL: sst1h_s_imm: +; CHECK: st1h { z0.s }, p0, [z1.s, #16] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.st1.scatter.imm.nxv4i16.nxv4i32( %data_trunc, + %pg, + %base, + i64 16) + ret void +} + +define void @sst1h_d_imm( %data, %pg, %base) { +; CHECK-LABEL: sst1h_d_imm: +; CHECK: st1h { z0.d }, p0, [z1.d, #16] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.st1.scatter.imm.nxv2i16.nxv2i64( %data_trunc, + %pg, + %base, + i64 16) + ret void +} + +; ST1W +define void @sst1w_s_imm( %data, %pg, %base) { +; CHECK-LABEL: sst1w_s_imm: +; CHECK: st1w { z0.s }, p0, [z1.s, #16] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.scatter.imm.nxv4i32.nxv4i32( %data, + %pg, + %base, + i64 16) + ret void +} + +define void @sst1w_d_imm( %data, %pg, %base) { +; CHECK-LABEL: sst1w_d_imm: +; CHECK: st1w { z0.d }, p0, [z1.d, #16] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.st1.scatter.imm.nxv2i32.nxv2i64( %data_trunc, + %pg, + %base, + i64 16) + ret void +} + +define void @sst1w_s_imm_float( %data, %pg, %base) { +; CHECK-LABEL: sst1w_s_imm_float: +; CHECK: st1w { z0.s }, p0, [z1.s, #16] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.scatter.imm.nxv4f32.nxv4i32( %data, + %pg, + %base, + i64 16) + ret void +} + +; ST1D +define void @sst1d_d_imm( %data, %pg, %base) { +; CHECK-LABEL: sst1d_d_imm: +; CHECK: st1d { z0.d }, p0, [z1.d, #16] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.scatter.imm.nxv2i64.nxv2i64( %data, + %pg, + %base, + i64 16) + ret void +} + +define void @sst1d_d_imm_double( %data, %pg, %base) { +; CHECK-LABEL: sst1d_d_imm_double: +; CHECK: st1d { z0.d }, p0, [z1.d, #16] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.scatter.imm.nxv2f64.nxv2i64( %data, + %pg, + %base, + i64 16) + ret void +} + +; ST1B +declare void @llvm.aarch64.sve.st1.scatter.imm.nxv4i8.nxv4i32(, , , i64) +declare void @llvm.aarch64.sve.st1.scatter.imm.nxv2i8.nxv2i64(, , , i64) + +; ST1H +declare void @llvm.aarch64.sve.st1.scatter.imm.nxv4i16.nxv4i32(, , , i64) +declare void @llvm.aarch64.sve.st1.scatter.imm.nxv2i16.nxv2i64(, , , i64) + +; ST1W +declare void @llvm.aarch64.sve.st1.scatter.imm.nxv4i32.nxv4i32(, , , i64) +declare void @llvm.aarch64.sve.st1.scatter.imm.nxv2i32.nxv2i64(, , , i64) + +declare void @llvm.aarch64.sve.st1.scatter.imm.nxv4f32.nxv4i32(, , , i64) + +; ST1D +declare void @llvm.aarch64.sve.st1.scatter.imm.nxv2i64.nxv2i64(, , , i64) + +declare void @llvm.aarch64.sve.st1.scatter.imm.nxv2f64.nxv2i64(, , , i64)
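For reference, the following is a minimal, fully-typed usage sketch of one of the new intrinsics, mirroring the @sst1h_d test in sve-intrinsics-scatter-stores-64bit-unscaled-offset.ll above. The function name @example_sst1h_d is illustrative only; the operand order (data, governing predicate, scalar base pointer, vector of offsets) follows the AdvSIMD_ScatterStore_64bitOffset_Intrinsic class defined in this patch, with the scalable vector types written out in full.

; Sketch only: a fully-typed call of the 64-bit unscaled-offset scatter store.
; The data is first truncated to the element type actually stored (i16 for st1h).
define void @example_sst1h_d(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg,
                             i16* %base, <vscale x 2 x i64> %offsets) {
  %trunc = trunc <vscale x 2 x i64> %data to <vscale x 2 x i16>
  call void @llvm.aarch64.sve.st1.scatter.nxv2i16(<vscale x 2 x i16> %trunc,
                                                  <vscale x 2 x i1> %pg,
                                                  i16* %base,
                                                  <vscale x 2 x i64> %offsets)
  ret void
}

declare void @llvm.aarch64.sve.st1.scatter.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i1>, i16*, <vscale x 2 x i64>)

With -mattr=+sve this is expected to select to a single "st1h { z0.d }, p0, [x0, z1.d]", as checked by the corresponding test above.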