diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -1263,6 +1263,27 @@
    ],
    [IntrWriteMem, IntrArgMemOnly]>;
+
+class SVE_gather_prf_scalar_base_vector_offset_scaled
+  : Intrinsic<[],
+              [
+                LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, // Predicate
+                llvm_ptr_ty,       // Base address
+                llvm_anyvector_ty, // Offsets
+                llvm_i32_ty        // Prfop
+              ],
+              [IntrInaccessibleMemOrArgMemOnly, NoCapture<1>, ImmArg<3>]>;
+
+class SVE_gather_prf_vector_base_scalar_offset
+  : Intrinsic<[],
+              [
+                LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, // Predicate
+                llvm_anyvector_ty, // Base addresses
+                llvm_i64_ty,       // Scalar offset
+                llvm_i32_ty        // Prfop
+              ],
+              [IntrInaccessibleMemOrArgMemOnly, ImmArg<3>]>;
+
 //
 // Loads
 //
@@ -1279,13 +1300,39 @@
 def int_aarch64_sve_stnt1 : AdvSIMD_1Vec_PredStore_Intrinsic;

 //
-// Prefetch
+// Prefetches
 //

 def int_aarch64_sve_prf
   : Intrinsic<[], [llvm_anyvector_ty, llvm_ptr_ty, llvm_i32_ty],
                   [IntrArgMemOnly, ImmArg<2>]>;

+// Scalar + 32-bit scaled offset vector, zero extend, packed and
+// unpacked.
+def int_aarch64_sve_gather_prfb_scaled_uxtw : SVE_gather_prf_scalar_base_vector_offset_scaled;
+def int_aarch64_sve_gather_prfh_scaled_uxtw : SVE_gather_prf_scalar_base_vector_offset_scaled;
+def int_aarch64_sve_gather_prfw_scaled_uxtw : SVE_gather_prf_scalar_base_vector_offset_scaled;
+def int_aarch64_sve_gather_prfd_scaled_uxtw : SVE_gather_prf_scalar_base_vector_offset_scaled;
+
+// Scalar + 32-bit scaled offset vector, sign extend, packed and
+// unpacked.
+def int_aarch64_sve_gather_prfb_scaled_sxtw : SVE_gather_prf_scalar_base_vector_offset_scaled;
+def int_aarch64_sve_gather_prfw_scaled_sxtw : SVE_gather_prf_scalar_base_vector_offset_scaled;
+def int_aarch64_sve_gather_prfh_scaled_sxtw : SVE_gather_prf_scalar_base_vector_offset_scaled;
+def int_aarch64_sve_gather_prfd_scaled_sxtw : SVE_gather_prf_scalar_base_vector_offset_scaled;
+
+// Scalar + 64-bit scaled offset vector.
+def int_aarch64_sve_gather_prfb_scaled : SVE_gather_prf_scalar_base_vector_offset_scaled;
+def int_aarch64_sve_gather_prfh_scaled : SVE_gather_prf_scalar_base_vector_offset_scaled;
+def int_aarch64_sve_gather_prfw_scaled : SVE_gather_prf_scalar_base_vector_offset_scaled;
+def int_aarch64_sve_gather_prfd_scaled : SVE_gather_prf_scalar_base_vector_offset_scaled;
+
+// Vector + scalar.
+def int_aarch64_sve_gather_prfb : SVE_gather_prf_vector_base_scalar_offset;
+def int_aarch64_sve_gather_prfh : SVE_gather_prf_vector_base_scalar_offset;
+def int_aarch64_sve_gather_prfw : SVE_gather_prf_vector_base_scalar_offset;
+def int_aarch64_sve_gather_prfd : SVE_gather_prf_vector_base_scalar_offset;
+
 //
 // Scalar to vector operations
 //
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -12646,6 +12646,20 @@
   return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, SplatShift);
 }

+/// Check if the value of \p Offset represents a valid immediate for the SVE
+/// gather load/prefetch and scatter store instructions with vector base and
+/// immediate offset addressing mode:
+///
+///    [<Zn>.[S|D]{, #<imm>}]
+///
+/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
+static bool isValidImmForSVEVecImmAddrMode(SDValue Offset,
+                                           unsigned ScalarSizeInBytes) {
+  ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Offset.getNode());
+  return OffsetConst && AArch64_AM::isValidImmForSVEVecImmAddrMode(
+                            OffsetConst->getZExtValue(), ScalarSizeInBytes);
+}
+
 static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
                                           unsigned Opcode,
                                           bool OnlyPackedOffsets = true) {
@@ -12697,13 +12711,9 @@
   // immediates outside that range and non-immediate scalar offsets use SST1 or
   // SST1_UXTW instead.
   if (Opcode == AArch64ISD::SST1_IMM) {
-    uint64_t MaxIndex = 31;
-    uint64_t SrcElSize = SrcElVT.getStoreSize().getKnownMinSize();
-    ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Offset.getNode());
-    if (nullptr == OffsetConst ||
-        OffsetConst->getZExtValue() > MaxIndex * SrcElSize ||
-        OffsetConst->getZExtValue() % SrcElSize) {
+    if (!isValidImmForSVEVecImmAddrMode(Offset,
+                                        SrcVT.getScalarSizeInBits() / 8)) {
       if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
         Opcode = AArch64ISD::SST1_UXTW;
       else
@@ -12763,7 +12773,6 @@
          "Gather loads are only possible for SVE vectors");

   SDLoc DL(N);
-  MVT RetElVT = RetVT.getVectorElementType().getSimpleVT();

   // Make sure that the loaded data will fit into an SVE register
   if (RetVT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
@@ -12780,8 +12789,8 @@
   // applies to non-temporal gathers because there's no instruction that takes
   // indicies.
   if (Opcode == AArch64ISD::GLDNT1_INDEX) {
-    Offset =
-        getScaledOffsetForBitWidth(DAG, Offset, DL, RetElVT.getSizeInBits());
+    Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
+                                        RetVT.getScalarSizeInBits());
     Opcode = AArch64ISD::GLDNT1;
   }

@@ -12800,13 +12809,8 @@
   // immediates outside that range and non-immediate scalar offsets use GLD1 or
   // GLD1_UXTW instead.
   if (Opcode == AArch64ISD::GLD1_IMM || Opcode == AArch64ISD::GLDFF1_IMM) {
-    uint64_t MaxIndex = 31;
-    uint64_t RetElSize = RetElVT.getStoreSize().getKnownMinSize();
-
-    ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Offset.getNode());
-    if (nullptr == OffsetConst ||
-        OffsetConst->getZExtValue() > MaxIndex * RetElSize ||
-        OffsetConst->getZExtValue() % RetElSize) {
+    if (!isValidImmForSVEVecImmAddrMode(Offset,
+                                        RetVT.getScalarSizeInBits() / 8)) {
       if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
         Opcode = (Opcode == AArch64ISD::GLD1_IMM) ? AArch64ISD::GLD1_UXTW
                                                   : AArch64ISD::GLDFF1_UXTW;
@@ -12950,6 +12954,51 @@
   return SDValue(N, 0);
 }

+/// Legalize the gather prefetch (scalar + vector addressing mode) when the
+/// offset vector is an unpacked 32-bit scalable vector. The other cases (Offset
+/// != nxv2i32) do not need legalization.
+static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG) {
+  const unsigned OffsetPos = 4;
+  SDValue Offset = N->getOperand(OffsetPos);
+
+  // Not an unpacked vector, bail out.
+  if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32)
+    return SDValue();
+
+  // Extend the unpacked offset vector to 64-bit lanes.
+  SDLoc DL(N);
+  Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset);
+  SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
+  // Replace the offset operand with the 64-bit one.
+  Ops[OffsetPos] = Offset;
+
+  return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
+}
+
+/// Combines a node carrying the intrinsic `aarch64_sve_gather_prf<T>` into a
+/// node that uses `aarch64_sve_gather_prf<T>_scaled_uxtw` when the scalar
+/// offset passed to `aarch64_sve_gather_prf<T>` is not a valid immediate for
+/// the SVE gather prefetch instruction with vector plus immediate addressing
+/// mode.
+static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG,
+                                               unsigned NewIID,
+                                               unsigned ScalarSizeInBytes) {
+  const unsigned ImmPos = 4, OffsetPos = 3;
+  // No need to combine the node if the immediate is valid...
+  if (isValidImmForSVEVecImmAddrMode(N->getOperand(ImmPos), ScalarSizeInBytes))
+    return SDValue();
+
+  // ...otherwise swap the vector base with the scalar offset...
+  SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
+  std::swap(Ops[ImmPos], Ops[OffsetPos]);
+  // ...and remap the intrinsic `aarch64_sve_gather_prf<T>` to
+  // `aarch64_sve_gather_prf<T>_scaled_uxtw`.
+  SDLoc DL(N);
+  Ops[1] = DAG.getConstant(NewIID, DL, MVT::i64);
+
+  return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
+}
+
 SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -13014,6 +13063,31 @@
   case ISD::INTRINSIC_VOID:
   case ISD::INTRINSIC_W_CHAIN:
     switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+    case Intrinsic::aarch64_sve_gather_prfb:
+      return combineSVEPrefetchVecBaseImmOff(
+          N, DAG, Intrinsic::aarch64_sve_gather_prfb_scaled_uxtw,
+          1 /*=ScalarSizeInBytes*/);
+    case Intrinsic::aarch64_sve_gather_prfh:
+      return combineSVEPrefetchVecBaseImmOff(
+          N, DAG, Intrinsic::aarch64_sve_gather_prfh_scaled_uxtw,
+          2 /*=ScalarSizeInBytes*/);
+    case Intrinsic::aarch64_sve_gather_prfw:
+      return combineSVEPrefetchVecBaseImmOff(
+          N, DAG, Intrinsic::aarch64_sve_gather_prfw_scaled_uxtw,
+          4 /*=ScalarSizeInBytes*/);
+    case Intrinsic::aarch64_sve_gather_prfd:
+      return combineSVEPrefetchVecBaseImmOff(
+          N, DAG, Intrinsic::aarch64_sve_gather_prfd_scaled_uxtw,
+          8 /*=ScalarSizeInBytes*/);
+    case Intrinsic::aarch64_sve_gather_prfb_scaled_uxtw:
+    case Intrinsic::aarch64_sve_gather_prfb_scaled_sxtw:
+    case Intrinsic::aarch64_sve_gather_prfh_scaled_uxtw:
+    case Intrinsic::aarch64_sve_gather_prfh_scaled_sxtw:
+    case Intrinsic::aarch64_sve_gather_prfw_scaled_uxtw:
+    case Intrinsic::aarch64_sve_gather_prfw_scaled_sxtw:
+    case Intrinsic::aarch64_sve_gather_prfd_scaled_uxtw:
+    case Intrinsic::aarch64_sve_gather_prfd_scaled_sxtw:
+      return legalizeSVEGatherPrefetchOffsVec(N, DAG);
     case Intrinsic::aarch64_neon_ld2:
     case Intrinsic::aarch64_neon_ld3:
     case Intrinsic::aarch64_neon_ld4:
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -880,37 +880,37 @@
  // Gather prefetch using scaled 32-bit offsets, e.g.
// prfh pldl1keep, p0, [x0, z0.s, uxtw #1] - defm PRFB_S : sve_mem_32b_prfm_sv_scaled<0b00, "prfb", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only>; - defm PRFH_S : sve_mem_32b_prfm_sv_scaled<0b01, "prfh", ZPR32ExtSXTW16, ZPR32ExtUXTW16>; - defm PRFW_S : sve_mem_32b_prfm_sv_scaled<0b10, "prfw", ZPR32ExtSXTW32, ZPR32ExtUXTW32>; - defm PRFD_S : sve_mem_32b_prfm_sv_scaled<0b11, "prfd", ZPR32ExtSXTW64, ZPR32ExtUXTW64>; + defm PRFB_S : sve_mem_32b_prfm_sv_scaled<0b00, "prfb", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, int_aarch64_sve_gather_prfb_scaled_sxtw, int_aarch64_sve_gather_prfb_scaled_uxtw>; + defm PRFH_S : sve_mem_32b_prfm_sv_scaled<0b01, "prfh", ZPR32ExtSXTW16, ZPR32ExtUXTW16, int_aarch64_sve_gather_prfh_scaled_sxtw, int_aarch64_sve_gather_prfh_scaled_uxtw>; + defm PRFW_S : sve_mem_32b_prfm_sv_scaled<0b10, "prfw", ZPR32ExtSXTW32, ZPR32ExtUXTW32, int_aarch64_sve_gather_prfw_scaled_sxtw, int_aarch64_sve_gather_prfw_scaled_uxtw>; + defm PRFD_S : sve_mem_32b_prfm_sv_scaled<0b11, "prfd", ZPR32ExtSXTW64, ZPR32ExtUXTW64, int_aarch64_sve_gather_prfd_scaled_sxtw, int_aarch64_sve_gather_prfd_scaled_uxtw>; // Gather prefetch using unpacked, scaled 32-bit offsets, e.g. // prfh pldl1keep, p0, [x0, z0.d, uxtw #1] - defm PRFB_D : sve_mem_64b_prfm_sv_ext_scaled<0b00, "prfb", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only>; - defm PRFH_D : sve_mem_64b_prfm_sv_ext_scaled<0b01, "prfh", ZPR64ExtSXTW16, ZPR64ExtUXTW16>; - defm PRFW_D : sve_mem_64b_prfm_sv_ext_scaled<0b10, "prfw", ZPR64ExtSXTW32, ZPR64ExtUXTW32>; - defm PRFD_D : sve_mem_64b_prfm_sv_ext_scaled<0b11, "prfd", ZPR64ExtSXTW64, ZPR64ExtUXTW64>; + defm PRFB_D : sve_mem_64b_prfm_sv_ext_scaled<0b00, "prfb", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, int_aarch64_sve_gather_prfb_scaled_sxtw, int_aarch64_sve_gather_prfb_scaled_uxtw>; + defm PRFH_D : sve_mem_64b_prfm_sv_ext_scaled<0b01, "prfh", ZPR64ExtSXTW16, ZPR64ExtUXTW16, int_aarch64_sve_gather_prfh_scaled_sxtw, int_aarch64_sve_gather_prfh_scaled_uxtw>; + defm PRFW_D : sve_mem_64b_prfm_sv_ext_scaled<0b10, "prfw", ZPR64ExtSXTW32, ZPR64ExtUXTW32, int_aarch64_sve_gather_prfw_scaled_sxtw, int_aarch64_sve_gather_prfw_scaled_uxtw>; + defm PRFD_D : sve_mem_64b_prfm_sv_ext_scaled<0b11, "prfd", ZPR64ExtSXTW64, ZPR64ExtUXTW64, int_aarch64_sve_gather_prfd_scaled_sxtw, int_aarch64_sve_gather_prfd_scaled_uxtw>; // Gather prefetch using scaled 64-bit offsets, e.g. // prfh pldl1keep, p0, [x0, z0.d, lsl #1] - defm PRFB_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b00, "prfb", ZPR64ExtLSL8>; - defm PRFH_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b01, "prfh", ZPR64ExtLSL16>; - defm PRFW_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b10, "prfw", ZPR64ExtLSL32>; - defm PRFD_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b11, "prfd", ZPR64ExtLSL64>; + defm PRFB_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b00, "prfb", ZPR64ExtLSL8, int_aarch64_sve_gather_prfb_scaled>; + defm PRFH_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b01, "prfh", ZPR64ExtLSL16, int_aarch64_sve_gather_prfh_scaled>; + defm PRFW_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b10, "prfw", ZPR64ExtLSL32, int_aarch64_sve_gather_prfw_scaled>; + defm PRFD_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b11, "prfd", ZPR64ExtLSL64, int_aarch64_sve_gather_prfd_scaled>; // Gather prefetch using 32/64-bit pointers with offset, e.g. 
  // prfh pldl1keep, p0, [z0.s, #16]
  // prfh pldl1keep, p0, [z0.d, #16]
-  defm PRFB_S_PZI : sve_mem_32b_prfm_vi<0b00, "prfb", imm0_31>;
-  defm PRFH_S_PZI : sve_mem_32b_prfm_vi<0b01, "prfh", uimm5s2>;
-  defm PRFW_S_PZI : sve_mem_32b_prfm_vi<0b10, "prfw", uimm5s4>;
-  defm PRFD_S_PZI : sve_mem_32b_prfm_vi<0b11, "prfd", uimm5s8>;
-
-  defm PRFB_D_PZI : sve_mem_64b_prfm_vi<0b00, "prfb", imm0_31>;
-  defm PRFH_D_PZI : sve_mem_64b_prfm_vi<0b01, "prfh", uimm5s2>;
-  defm PRFW_D_PZI : sve_mem_64b_prfm_vi<0b10, "prfw", uimm5s4>;
-  defm PRFD_D_PZI : sve_mem_64b_prfm_vi<0b11, "prfd", uimm5s8>;
+  defm PRFB_S_PZI : sve_mem_32b_prfm_vi<0b00, "prfb", imm0_31, int_aarch64_sve_gather_prfb>;
+  defm PRFH_S_PZI : sve_mem_32b_prfm_vi<0b01, "prfh", uimm5s2, int_aarch64_sve_gather_prfh>;
+  defm PRFW_S_PZI : sve_mem_32b_prfm_vi<0b10, "prfw", uimm5s4, int_aarch64_sve_gather_prfw>;
+  defm PRFD_S_PZI : sve_mem_32b_prfm_vi<0b11, "prfd", uimm5s8, int_aarch64_sve_gather_prfd>;
+
+  defm PRFB_D_PZI : sve_mem_64b_prfm_vi<0b00, "prfb", imm0_31, int_aarch64_sve_gather_prfb>;
+  defm PRFH_D_PZI : sve_mem_64b_prfm_vi<0b01, "prfh", uimm5s2, int_aarch64_sve_gather_prfh>;
+  defm PRFW_D_PZI : sve_mem_64b_prfm_vi<0b10, "prfw", uimm5s4, int_aarch64_sve_gather_prfw>;
+  defm PRFD_D_PZI : sve_mem_64b_prfm_vi<0b11, "prfd", uimm5s8, int_aarch64_sve_gather_prfd>;

  defm ADR_SXTW_ZZZ_D : sve_int_bin_cons_misc_0_a_sxtw<0b00, "adr">;
  defm ADR_UXTW_ZZZ_D : sve_int_bin_cons_misc_0_a_uxtw<0b01, "adr">;
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
@@ -840,6 +840,26 @@
   return isAnyMOVZMovAlias(Value, RegWidth);
 }

+/// Check if the value of \p OffsetInBytes can be used as an immediate for
+/// the gather load/prefetch and scatter store instructions with vector base and
+/// immediate offset addressing mode:
+///
+///    [<Zn>.[S|D]{, #<imm>}]
+///
+/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
+static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes,
+                                           unsigned ScalarSizeInBytes) {
+  // The immediate is not a multiple of the scalar size.
+  if (OffsetInBytes % ScalarSizeInBytes)
+    return false;
+
+  // The immediate is out of range.
+ if (OffsetInBytes / ScalarSizeInBytes > 31) + return false; + + return true; +} + } // end namespace AArch64_AM } // end namespace llvm diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -6455,9 +6455,17 @@ multiclass sve_mem_32b_prfm_sv_scaled msz, string asm, RegisterOperand sxtw_opnd, - RegisterOperand uxtw_opnd> { + RegisterOperand uxtw_opnd, + PatFrag op_sxtw, + PatFrag op_uxtw> { def _UXTW_SCALED : sve_mem_32b_prfm_sv; def _SXTW_SCALED : sve_mem_32b_prfm_sv; + + def : Pat<(op_uxtw (nxv4i1 PPR3bAny:$Pg), (i64 GPR64sp:$Rn), (nxv4i32 uxtw_opnd:$Zm), (i32 sve_prfop:$prfop)), + (!cast(NAME # _UXTW_SCALED) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>; + + def : Pat<(op_sxtw (nxv4i1 PPR3bAny:$Pg), (i64 GPR64sp:$Rn), (nxv4i32 sxtw_opnd:$Zm), (i32 sve_prfop:$prfop)), + (!cast(NAME # _SXTW_SCALED) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>; } class sve_mem_32b_prfm_vi msz, string asm, Operand imm_ty> @@ -6480,11 +6488,14 @@ let Inst{3-0} = prfop; } -multiclass sve_mem_32b_prfm_vi msz, string asm, Operand imm_ty> { +multiclass sve_mem_32b_prfm_vi msz, string asm, Operand imm_ty, SDPatternOperator op> { def NAME : sve_mem_32b_prfm_vi; def : InstAlias(NAME) sve_prfop:$prfop, PPR3bAny:$Pg, ZPR32:$Zn, 0), 1>; + + def : Pat<(op (nxv4i1 PPR_3b:$Pg), (nxv4i32 ZPR32:$Zn), (i64 imm_ty:$imm), (i32 sve_prfop:$prfop)), + (!cast(NAME) sve_prfop:$prfop, PPR_3b:$Pg, ZPR32:$Zn, imm_ty:$imm)>; } class sve_mem_z_fill @@ -6798,14 +6809,27 @@ multiclass sve_mem_64b_prfm_sv_ext_scaled msz, string asm, RegisterOperand sxtw_opnd, - RegisterOperand uxtw_opnd> { + RegisterOperand uxtw_opnd, + PatFrag op_sxtw, + PatFrag op_uxtw> { def _UXTW_SCALED : sve_mem_64b_prfm_sv; def _SXTW_SCALED : sve_mem_64b_prfm_sv; + + def : Pat<(op_uxtw (nxv2i1 PPR3bAny:$Pg), (i64 GPR64sp:$Rn), (nxv2i64 uxtw_opnd:$Zm), (i32 sve_prfop:$prfop)), + (!cast(NAME # _UXTW_SCALED) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>; + + def : Pat<(op_sxtw (nxv2i1 PPR3bAny:$Pg), (i64 GPR64sp:$Rn), (nxv2i64 sxtw_opnd:$Zm), (i32 sve_prfop:$prfop)), + (!cast(NAME # _SXTW_SCALED) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>; + } multiclass sve_mem_64b_prfm_sv_lsl_scaled msz, string asm, - RegisterOperand zprext> { + RegisterOperand zprext, PatFrag frag> { def NAME : sve_mem_64b_prfm_sv; + + def : Pat<(frag (nxv2i1 PPR3bAny:$Pg), (i64 GPR64sp:$Rn), (nxv2i64 zprext:$Zm), (i32 sve_prfop:$prfop)), + (!cast(NAME) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm)>; + } @@ -6831,11 +6855,14 @@ let hasSideEffects = 1; } -multiclass sve_mem_64b_prfm_vi msz, string asm, Operand imm_ty> { +multiclass sve_mem_64b_prfm_vi msz, string asm, Operand imm_ty, SDPatternOperator op> { def NAME : sve_mem_64b_prfm_vi; def : InstAlias(NAME) sve_prfop:$prfop, PPR3bAny:$Pg, ZPR64:$Zn, 0), 1>; + + def : Pat<(op (nxv2i1 PPR_3b:$Pg), (nxv2i64 ZPR32:$Zn), (i64 imm_ty:$imm), (i32 sve_prfop:$prfop)), + (!cast(NAME) sve_prfop:$prfop, PPR_3b:$Pg, ZPR32:$Zn, imm_ty:$imm)>; } //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-prefetches-scaled-offset.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-prefetches-scaled-offset.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-prefetches-scaled-offset.ll @@ -0,0 +1,200 @@ +; RUN: llc 
-mtriple=aarch64--linux-gnu -mattr=+sve --asm-verbose=false < %s | FileCheck %s + +; PRFB , , [, .S, ] -> 32-bit scaled offset +define void @llvm_aarch64_sve_gather_prfb_scaled_uxtw_nx4vi32( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfb_scaled_uxtw_nx4vi32: +; CHECK-NEXT: prfb pldl1strm, p0, [x0, z0.s, uxtw] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfb.scaled.uxtw.nx4vi32( %Pg, i8* %base, %offset, i32 1) + ret void + } + +define void @llvm_aarch64_sve_gather_prfb_scaled_sxtw_nx4vi32( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfb_scaled_sxtw_nx4vi32: +; CHECK-NEXT: prfb pldl1strm, p0, [x0, z0.s, sxtw] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfb.scaled.sxtw.nx4vi32( %Pg, i8* %base, %offset, i32 1) + ret void + } + +; PRFB , , [, .D, ] -> 32-bit unpacked scaled offset + +define void @llvm_aarch64_sve_gather_prfb_scaled_uxtw_nx2vi64( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfb_scaled_uxtw_nx2vi64: +; CHECK-NEXT: prfb pldl1strm, p0, [x0, z0.d, uxtw] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfb.scaled.uxtw.nx2vi64( %Pg, i8* %base, %offset, i32 1) + ret void + } + +define void @llvm_aarch64_sve_gather_prfb_scaled_sxtw_nx2vi64( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfb_scaled_sxtw_nx2vi64: +; CHECK-NEXT: prfb pldl1strm, p0, [x0, z0.d, sxtw] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfb.scaled.sxtw.nx2vi64( %Pg, i8* %base, %offset, i32 1) + ret void + } +; PRFB , , [, .D] -> 64-bit scaled offset +define void @llvm_aarch64_sve_gather_prfb_scaled_nx2vi64( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfb_scaled_nx2vi64: +; CHECK-NEXT: prfb pldl1strm, p0, [x0, z0.d] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfb.scaled.nx2vi64( %Pg, i8* %base, %offset, i32 1) + ret void + } + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; PRFH , , [, .S, ] -> 32-bit scaled offset +define void @llvm_aarch64_sve_gather_prfh_scaled_uxtw_nx4vi32( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_scaled_uxtw_nx4vi32: +; CHECK-NEXT: prfh pldl1strm, p0, [x0, z0.s, uxtw #1] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfh.scaled.uxtw.nx4vi32( %Pg, i8* %base, %offset, i32 1) + ret void + } + +define void @llvm_aarch64_sve_gather_prfh_scaled_sxtw_nx4vi32( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_scaled_sxtw_nx4vi32: +; CHECK-NEXT: prfh pldl1strm, p0, [x0, z0.s, sxtw #1] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfh.scaled.sxtw.nx4vi32( %Pg, i8* %base, %offset, i32 1) + ret void + } + +; PRFH , , [, .D, #1] -> 32-bit unpacked scaled offset +define void @llvm_aarch64_sve_gather_prfh_scaled_uxtw_nx2vi64( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_scaled_uxtw_nx2vi64: +; CHECK-NEXT: prfh pldl1strm, p0, [x0, z0.d, uxtw #1] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfh.scaled.uxtw.nx2vi64( %Pg, i8* %base, %offset, i32 1) + ret void + } + +define void @llvm_aarch64_sve_gather_prfh_scaled_sxtw_nx2vi64( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_scaled_sxtw_nx2vi64: +; CHECK-NEXT: prfh pldl1strm, p0, [x0, z0.d, sxtw #1] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfh.scaled.sxtw.nx2vi64( %Pg, i8* %base, %offset, i32 1) + ret void + } + +; PRFH , , 
[, .D] -> 64-bit scaled offset +define void @llvm_aarch64_sve_gather_prfh_scaled_nx2vi64( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_scaled_nx2vi64: +; CHECK-NEXT: prfh pldl1strm, p0, [x0, z0.d, lsl #1] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfh.scaled.nx2vi64( %Pg, i8* %base, %offset, i32 1) + ret void + } + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; PRFW , , [, .S, ] -> 32-bit scaled offset +define void @llvm_aarch64_sve_gather_prfw_scaled_uxtw_nx4vi32( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_scaled_uxtw_nx4vi32: +; CHECK-NEXT: prfw pldl1strm, p0, [x0, z0.s, uxtw #2] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfw.scaled.uxtw.nx4vi32( %Pg, i8* %base, %offset, i32 1) + ret void + } + +define void @llvm_aarch64_sve_gather_prfw_scaled_sxtw_nx4vi32( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_scaled_sxtw_nx4vi32: +; CHECK-NEXT: prfw pldl1strm, p0, [x0, z0.s, sxtw #2] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfw.scaled.sxtw.nx4vi32( %Pg, i8* %base, %offset, i32 1) + ret void + } + +; PRFW , , [, .D, #2] -> 32-bit unpacked scaled offset +define void @llvm_aarch64_sve_gather_prfw_scaled_uxtw_nx2vi64( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_scaled_uxtw_nx2vi64: +; CHECK-NEXT: prfw pldl1strm, p0, [x0, z0.d, uxtw #2] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfw.scaled.uxtw.nx2vi64( %Pg, i8* %base, %offset, i32 1) + ret void + } + +define void @llvm_aarch64_sve_gather_prfw_scaled_sxtw_nx2vi64( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_scaled_sxtw_nx2vi64: +; CHECK-NEXT: prfw pldl1strm, p0, [x0, z0.d, sxtw #2] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfw.scaled.sxtw.nx2vi64( %Pg, i8* %base, %offset, i32 1) + ret void + } + +; PRFW , , [, .D] -> 64-bit scaled offset +define void @llvm_aarch64_sve_gather_prfw_scaled_nx2vi64( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_scaled_nx2vi64: +; CHECK-NEXT: prfw pldl1strm, p0, [x0, z0.d, lsl #2] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfw.scaled.nx2vi64( %Pg, i8* %base, %offset, i32 1) + ret void + } + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; PRFD , , [, .S, ] -> 32-bit scaled offset +define void @llvm_aarch64_sve_gather_prfd_scaled_uxtw_nx4vi32( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_scaled_uxtw_nx4vi32: +; CHECK-NEXT: prfd pldl1strm, p0, [x0, z0.s, uxtw #3] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfd.scaled.uxtw.nx4vi32( %Pg, i8* %base, %offset, i32 1) + ret void + } + +define void @llvm_aarch64_sve_gather_prfd_scaled_sxtw_nx4vi32( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_scaled_sxtw_nx4vi32: +; CHECK-NEXT: prfd pldl1strm, p0, [x0, z0.s, sxtw #3] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfd.scaled.sxtw.nx4vi32( %Pg, i8* %base, %offset, i32 1) + ret void + } + +; PRFD , , [, .D, #3] -> 32-bit unpacked scaled offset +define void @llvm_aarch64_sve_gather_prfd_scaled_uxtw_nx2vi64( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_scaled_uxtw_nx2vi64: +; CHECK-NEXT: prfd pldl1strm, p0, [x0, z0.d, uxtw #3] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfd.scaled.uxtw.nx2vi64( %Pg, i8* %base, 
%offset, i32 1) + ret void + } + +define void @llvm_aarch64_sve_gather_prfd_scaled_sxtw_nx2vi64( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_scaled_sxtw_nx2vi64: +; CHECK-NEXT: prfd pldl1strm, p0, [x0, z0.d, sxtw #3] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfd.scaled.sxtw.nx2vi64( %Pg, i8* %base, %offset, i32 1) + ret void + } + +; PRFD , , [, .D] -> 64-bit scaled offset +define void @llvm_aarch64_sve_gather_prfd_scaled_nx2vi64( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_scaled_nx2vi64: +; CHECK-NEXT: prfd pldl1strm, p0, [x0, z0.d, lsl #3] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfd.scaled.nx2vi64( %Pg, i8* %base, %offset, i32 1) + ret void + } + +declare void @llvm.aarch64.sve.gather.prfb.scaled.uxtw.nx4vi32( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.gather.prfb.scaled.sxtw.nx4vi32( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.gather.prfb.scaled.uxtw.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.gather.prfb.scaled.sxtw.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.gather.prfb.scaled.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.gather.prfh.scaled.uxtw.nx4vi32( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.gather.prfh.scaled.sxtw.nx4vi32( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.gather.prfh.scaled.uxtw.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.gather.prfh.scaled.sxtw.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.gather.prfh.scaled.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.gather.prfw.scaled.uxtw.nx4vi32( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.gather.prfw.scaled.sxtw.nx4vi32( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.gather.prfw.scaled.uxtw.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.gather.prfw.scaled.sxtw.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.gather.prfw.scaled.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.gather.prfd.scaled.uxtw.nx4vi32( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.gather.prfd.scaled.sxtw.nx4vi32( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.gather.prfd.scaled.uxtw.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.gather.prfd.scaled.sxtw.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.gather.prfd.scaled.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-prefetches-vect-base-imm-offset.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-prefetches-vect-base-imm-offset.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-prefetches-vect-base-imm-offset.ll @@ -0,0 +1,82 @@ +; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve --asm-verbose=false < %s | FileCheck %s + +; PRFB , , [.S{, #}] -> 32-bit element +define void @llvm_aarch64_sve_gather_prfb_nx4vi32( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfb_nx4vi32: +; CHECK-NEXT: prfb pldl1strm, p0, [z0.s, #7] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfb.nx4vi32( %Pg, %bases, i64 7, i32 1) + ret void +} + +; PRFB , , [.D{, #}] -> 
64-bit element +define void @llvm_aarch64_sve_gather_prfb_nx2vi64( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfb_nx2vi64: +; CHECK-NEXT: prfb pldl1strm, p0, [z0.d, #7] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfb.nx2vi64( %Pg, %bases, i64 7, i32 1) + ret void +} + +; PRFH , , [.S{, #}] -> 32-bit element +define void @llvm_aarch64_sve_gather_prfh_nx4vi32( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_nx4vi32: +; CHECK-NEXT: prfh pldl1strm, p0, [z0.s, #6] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfh.nx4vi32( %Pg, %bases, i64 6, i32 1) + ret void +} + +; PRFH , , [.D{, #}] -> 64-bit element +define void @llvm_aarch64_sve_gather_prfh_nx2vi64( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_nx2vi64: +; CHECK-NEXT: prfh pldl1strm, p0, [z0.d, #6] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfh.nx2vi64( %Pg, %bases, i64 6, i32 1) + ret void +} + +; PRFW , , [.S{, #}] -> 32-bit element +define void @llvm_aarch64_sve_gather_prfw_nx4vi32( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_nx4vi32: +; CHECK-NEXT: prfw pldl1strm, p0, [z0.s, #12] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfw.nx4vi32( %Pg, %bases, i64 12, i32 1) + ret void +} + +; PRFW , , [.D{, #}] -> 64-bit element +define void @llvm_aarch64_sve_gather_prfw_nx2vi64( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_nx2vi64: +; CHECK-NEXT: prfw pldl1strm, p0, [z0.d, #12] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfw.nx2vi64( %Pg, %bases, i64 12, i32 1) + ret void +} + +; PRFD , , [.S{, #}] -> 32-bit element +define void @llvm_aarch64_sve_gather_prfd_nx4vi32( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_nx4vi32: +; CHECK-NEXT: prfd pldl1strm, p0, [z0.s, #16] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfd.nx4vi32( %Pg, %bases, i64 16, i32 1) + ret void +} + +; PRFD , , [.D{, #}] -> 64-bit element +define void @llvm_aarch64_sve_gather_prfd_nx2vi64( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_nx2vi64: +; CHECK-NEXT: prfd pldl1strm, p0, [z0.d, #16] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfd.nx2vi64( %Pg, %bases, i64 16, i32 1) + ret void +} + +declare void @llvm.aarch64.sve.gather.prfb.nx4vi32( %Pg, %bases, i64 %imm, i32 %prfop) +declare void @llvm.aarch64.sve.gather.prfb.nx2vi64( %Pg, %bases, i64 %imm, i32 %prfop) +declare void @llvm.aarch64.sve.gather.prfh.nx4vi32( %Pg, %bases, i64 %imm, i32 %prfop) +declare void @llvm.aarch64.sve.gather.prfh.nx2vi64( %Pg, %bases, i64 %imm, i32 %prfop) +declare void @llvm.aarch64.sve.gather.prfw.nx4vi32( %Pg, %bases, i64 %imm, i32 %prfop) +declare void @llvm.aarch64.sve.gather.prfw.nx2vi64( %Pg, %bases, i64 %imm, i32 %prfop) +declare void @llvm.aarch64.sve.gather.prfd.nx4vi32( %Pg, %bases, i64 %imm, i32 %prfop) +declare void @llvm.aarch64.sve.gather.prfd.nx2vi64( %Pg, %bases, i64 %imm, i32 %prfop) diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-prefetches-vect-base-invalid-imm-offset.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-prefetches-vect-base-invalid-imm-offset.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-prefetches-vect-base-invalid-imm-offset.ll @@ -0,0 +1,286 @@ +; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve --asm-verbose=false < %s | FileCheck %s + +; PRFB , , [.S{, #}] -> 32-bit element, imm = 0, 1, ..., 31 +define void 
@llvm_aarch64_sve_gather_prfb_nx4vi32_runtime_offset( %bases, i64 %imm, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfb_nx4vi32_runtime_offset: +; CHECK-NEXT: prfb pldl1strm, p0, [x0, z0.s, uxtw] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfb.nx4vi32( %Pg, %bases, i64 %imm, i32 1) + ret void +} + +define void @llvm_aarch64_sve_gather_prfb_nx4vi32_invalid_immediate_offset_upper_bound( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfb_nx4vi32_invalid_immediate_offset_upper_bound: +; CHECK-NEXT: mov w[[N:[0-9]+]], #32 +; CHECK-NEXT: prfb pldl1strm, p0, [x[[N]], z0.s, uxtw] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfb.nx4vi32( %Pg, %bases, i64 32, i32 1) + ret void +} + +define void @llvm_aarch64_sve_gather_prfb_nx4vi32_invalid_immediate_offset_lower_bound( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfb_nx4vi32_invalid_immediate_offset_lower_bound: +; CHECK-NEXT: mov x[[N:[0-9]+]], #-1 +; CHECK-NEXT: prfb pldl1strm, p0, [x[[N:[0-9]+]], z0.s, uxtw] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfb.nx4vi32( %Pg, %bases, i64 -1, i32 1) + ret void +} + +; PRFB , , [.D{, #}] -> 64-bit element, imm = 0, 1, ..., 31 +define void @llvm_aarch64_sve_gather_prfb_nx2vi64_runtime_offset( %bases, i64 %imm, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfb_nx2vi64_runtime_offset: +; CHECK-NEXT: prfb pldl1strm, p0, [x0, z0.d, uxtw] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfb.nx2vi64( %Pg, %bases, i64 %imm, i32 1) + ret void +} + +define void @llvm_aarch64_sve_gather_prfb_nx2vi64_invalid_immediate_offset_upper_bound( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfb_nx2vi64_invalid_immediate_offset_upper_bound: +; CHECK-NEXT: mov w[[N:[0-9]+]], #32 +; CHECK-NEXT: prfb pldl1strm, p0, [x[[N]], z0.d, uxtw] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfb.nx2vi64( %Pg, %bases, i64 32, i32 1) + ret void +} + +define void @llvm_aarch64_sve_gather_prfb_nx2vi64_invalid_immediate_offset_lower_bound( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfb_nx2vi64_invalid_immediate_offset_lower_bound: +; CHECK-NEXT: mov x[[N:[0-9]+]], #-1 +; CHECK-NEXT: prfb pldl1strm, p0, [x[[N:[0-9]+]], z0.d, uxtw] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfb.nx2vi64( %Pg, %bases, i64 -1, i32 1) + ret void +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; PRFH , , [.S{, #}] -> 32-bit element, imm = 0, 2, ..., 62 +define void @llvm_aarch64_sve_gather_prfh_nx4vi32_runtime_offset( %bases, i64 %imm, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_nx4vi32_runtime_offset: +; CHECK-NEXT: prfh pldl1strm, p0, [x0, z0.s, uxtw #1] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfh.nx4vi32( %Pg, %bases, i64 %imm, i32 1) + ret void +} + +define void @llvm_aarch64_sve_gather_prfh_nx4vi32_invalid_immediate_offset_upper_bound( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_nx4vi32_invalid_immediate_offset_upper_bound: +; CHECK-NEXT: mov w[[N:[0-9]+]], #63 +; CHECK-NEXT: prfh pldl1strm, p0, [x[[N]], z0.s, uxtw #1] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfh.nx4vi32( %Pg, %bases, i64 63, i32 1) + ret void +} + +define void @llvm_aarch64_sve_gather_prfh_nx4vi32_invalid_immediate_offset_lower_bound( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_nx4vi32_invalid_immediate_offset_lower_bound: +; CHECK-NEXT: mov x[[N:[0-9]+]], #-1 +; CHECK-NEXT: prfh 
pldl1strm, p0, [x[[N:[0-9]+]], z0.s, uxtw #1] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfh.nx4vi32( %Pg, %bases, i64 -1, i32 1) + ret void +} + +define void @llvm_aarch64_sve_gather_prfh_nx4vi32_invalid_immediate_offset_inbound_not_multiple_of_2( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_nx4vi32_invalid_immediate_offset_inbound_not_multiple_of_2: +; CHECK-NEXT: mov w[[N:[0-9]+]], #33 +; CHECK-NEXT: prfh pldl1strm, p0, [x[[N:[0-9]+]], z0.s, uxtw #1] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfh.nx4vi32( %Pg, %bases, i64 33, i32 1) + ret void +} + +; PRFH , , [.D{, #}] -> 64-bit element, imm = 0, 2, ..., 62 +define void @llvm_aarch64_sve_gather_prfh_nx2vi64_runtime_offset( %bases, i64 %imm, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_nx2vi64_runtime_offset: +; CHECK-NEXT: prfh pldl1strm, p0, [x0, z0.d, uxtw #1] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfh.nx2vi64( %Pg, %bases, i64 %imm, i32 1) + ret void +} + +define void @llvm_aarch64_sve_gather_prfh_nx2vi64_invalid_immediate_offset_upper_bound( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_nx2vi64_invalid_immediate_offset_upper_bound: +; CHECK-NEXT: mov w[[N:[0-9]+]], #63 +; CHECK-NEXT: prfh pldl1strm, p0, [x[[N]], z0.d, uxtw #1] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfh.nx2vi64( %Pg, %bases, i64 63, i32 1) + ret void +} + +define void @llvm_aarch64_sve_gather_prfh_nx2vi64_invalid_immediate_offset_lower_bound( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_nx2vi64_invalid_immediate_offset_lower_bound: +; CHECK-NEXT: mov x[[N:[0-9]+]], #-1 +; CHECK-NEXT: prfh pldl1strm, p0, [x[[N:[0-9]+]], z0.d, uxtw #1] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfh.nx2vi64( %Pg, %bases, i64 -1, i32 1) + ret void +} + +define void @llvm_aarch64_sve_gather_prfh_nx2vi64_invalid_immediate_offset_inbound_not_multiple_of_2( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_nx2vi64_invalid_immediate_offset_inbound_not_multiple_of_2: +; CHECK-NEXT: mov w[[N:[0-9]+]], #33 +; CHECK-NEXT: prfh pldl1strm, p0, [x[[N:[0-9]+]], z0.d, uxtw #1] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfh.nx2vi64( %Pg, %bases, i64 33, i32 1) + ret void +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; PRFW , , [.S{, #}] -> 32-bit element, imm = 0, 4, ..., 124 +define void @llvm_aarch64_sve_gather_prfw_nx4vi32_runtime_offset( %bases, i64 %imm, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_nx4vi32_runtime_offset: +; CHECK-NEXT: prfw pldl1strm, p0, [x0, z0.s, uxtw #2] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfw.nx4vi32( %Pg, %bases, i64 %imm, i32 1) + ret void +} + +define void @llvm_aarch64_sve_gather_prfw_nx4vi32_invalid_immediate_offset_upper_bound( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_nx4vi32_invalid_immediate_offset_upper_bound: +; CHECK-NEXT: mov w[[N:[0-9]+]], #125 +; CHECK-NEXT: prfw pldl1strm, p0, [x[[N]], z0.s, uxtw #2] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfw.nx4vi32( %Pg, %bases, i64 125, i32 1) + ret void +} + +define void @llvm_aarch64_sve_gather_prfw_nx4vi32_invalid_immediate_offset_lower_bound( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_nx4vi32_invalid_immediate_offset_lower_bound: +; CHECK-NEXT: mov x[[N:[0-9]+]], #-1 +; CHECK-NEXT: prfw pldl1strm, p0, [x[[N:[0-9]+]], z0.s, uxtw #2] +; CHECK-NEXT: ret + call void 
@llvm.aarch64.sve.gather.prfw.nx4vi32( %Pg, %bases, i64 -1, i32 1) + ret void +} + +define void @llvm_aarch64_sve_gather_prfw_nx4vi32_invalid_immediate_offset_inbound_not_multiple_of_4( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_nx4vi32_invalid_immediate_offset_inbound_not_multiple_of_4: +; CHECK-NEXT: mov w[[N:[0-9]+]], #33 +; CHECK-NEXT: prfw pldl1strm, p0, [x[[N:[0-9]+]], z0.s, uxtw #2] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfw.nx4vi32( %Pg, %bases, i64 33, i32 1) + ret void +} + +; PRFW , , [.D{, #}] -> 64-bit element, imm = 0, 4, ..., 124 +define void @llvm_aarch64_sve_gather_prfw_nx2vi64_runtime_offset( %bases, i64 %imm, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_nx2vi64_runtime_offset: +; CHECK-NEXT: prfw pldl1strm, p0, [x0, z0.d, uxtw #2] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfw.nx2vi64( %Pg, %bases, i64 %imm, i32 1) + ret void +} + +define void @llvm_aarch64_sve_gather_prfw_nx2vi64_invalid_immediate_offset_upper_bound( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_nx2vi64_invalid_immediate_offset_upper_bound: +; CHECK-NEXT: mov w[[N:[0-9]+]], #125 +; CHECK-NEXT: prfw pldl1strm, p0, [x[[N]], z0.d, uxtw #2] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfw.nx2vi64( %Pg, %bases, i64 125, i32 1) + ret void +} + +define void @llvm_aarch64_sve_gather_prfw_nx2vi64_invalid_immediate_offset_lower_bound( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_nx2vi64_invalid_immediate_offset_lower_bound: +; CHECK-NEXT: mov x[[N:[0-9]+]], #-1 +; CHECK-NEXT: prfw pldl1strm, p0, [x[[N:[0-9]+]], z0.d, uxtw #2] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfw.nx2vi64( %Pg, %bases, i64 -1, i32 1) + ret void +} + +define void @llvm_aarch64_sve_gather_prfw_nx2vi64_invalid_immediate_offset_inbound_not_multiple_of_4( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_nx2vi64_invalid_immediate_offset_inbound_not_multiple_of_4: +; CHECK-NEXT: mov w[[N:[0-9]+]], #33 +; CHECK-NEXT: prfw pldl1strm, p0, [x[[N:[0-9]+]], z0.d, uxtw #2] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfw.nx2vi64( %Pg, %bases, i64 33, i32 1) + ret void +} + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +; PRFD , , [.S{, #}] -> 32-bit element, imm = 0, 8, ..., 248 +define void @llvm_aarch64_sve_gather_prfd_nx4vi32_runtime_offset( %bases, i64 %imm, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_nx4vi32_runtime_offset: +; CHECK-NEXT: prfd pldl1strm, p0, [x0, z0.s, uxtw #3] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfd.nx4vi32( %Pg, %bases, i64 %imm, i32 1) + ret void +} + +define void @llvm_aarch64_sve_gather_prfd_nx4vi32_invalid_immediate_offset_upper_bound( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_nx4vi32_invalid_immediate_offset_upper_bound: +; CHECK-NEXT: mov w[[N:[0-9]+]], #125 +; CHECK-NEXT: prfd pldl1strm, p0, [x[[N]], z0.s, uxtw #3] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfd.nx4vi32( %Pg, %bases, i64 125, i32 1) + ret void +} + +define void @llvm_aarch64_sve_gather_prfd_nx4vi32_invalid_immediate_offset_lower_bound( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_nx4vi32_invalid_immediate_offset_lower_bound: +; CHECK-NEXT: mov x[[N:[0-9]+]], #-1 +; CHECK-NEXT: prfd pldl1strm, p0, [x[[N:[0-9]+]], z0.s, uxtw #3] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfd.nx4vi32( %Pg, %bases, i64 -1, i32 1) + ret void +} + 
+define void @llvm_aarch64_sve_gather_prfd_nx4vi32_invalid_immediate_offset_inbound_not_multiple_of_8( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_nx4vi32_invalid_immediate_offset_inbound_not_multiple_of_8: +; CHECK-NEXT: mov w[[N:[0-9]+]], #33 +; CHECK-NEXT: prfd pldl1strm, p0, [x[[N:[0-9]+]], z0.s, uxtw #3] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfd.nx4vi32( %Pg, %bases, i64 33, i32 1) + ret void +} + +; PRFD , , [.D{, #}] -> 64-bit element, imm = 0, 4, ..., 248 +define void @llvm_aarch64_sve_gather_prfd_nx2vi64_runtime_offset( %bases, i64 %imm, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_nx2vi64_runtime_offset: +; CHECK-NEXT: prfd pldl1strm, p0, [x0, z0.d, uxtw #3] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfd.nx2vi64( %Pg, %bases, i64 %imm, i32 1) + ret void +} + +define void @llvm_aarch64_sve_gather_prfd_nx2vi64_invalid_immediate_offset_upper_bound( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_nx2vi64_invalid_immediate_offset_upper_bound: +; CHECK-NEXT: mov w[[N:[0-9]+]], #125 +; CHECK-NEXT: prfd pldl1strm, p0, [x[[N]], z0.d, uxtw #3] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfd.nx2vi64( %Pg, %bases, i64 125, i32 1) + ret void +} + +define void @llvm_aarch64_sve_gather_prfd_nx2vi64_invalid_immediate_offset_lower_bound( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_nx2vi64_invalid_immediate_offset_lower_bound: +; CHECK-NEXT: mov x[[N:[0-9]+]], #-1 +; CHECK-NEXT: prfd pldl1strm, p0, [x[[N:[0-9]+]], z0.d, uxtw #3] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfd.nx2vi64( %Pg, %bases, i64 -1, i32 1) + ret void +} + +define void @llvm_aarch64_sve_gather_prfd_nx2vi64_invalid_immediate_offset_inbound_not_multiple_of_8( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_nx2vi64_invalid_immediate_offset_inbound_not_multiple_of_8: +; CHECK-NEXT: mov w[[N:[0-9]+]], #33 +; CHECK-NEXT: prfd pldl1strm, p0, [x[[N:[0-9]+]], z0.d, uxtw #3] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.gather.prfd.nx2vi64( %Pg, %bases, i64 33, i32 1) + ret void +} + +declare void @llvm.aarch64.sve.gather.prfb.nx4vi32( %Pg, %bases, i64 %imm, i32 %prfop) +declare void @llvm.aarch64.sve.gather.prfb.nx2vi64( %Pg, %bases, i64 %imm, i32 %prfop) +declare void @llvm.aarch64.sve.gather.prfh.nx4vi32( %Pg, %bases, i64 %imm, i32 %prfop) +declare void @llvm.aarch64.sve.gather.prfh.nx2vi64( %Pg, %bases, i64 %imm, i32 %prfop) +declare void @llvm.aarch64.sve.gather.prfw.nx4vi32( %Pg, %bases, i64 %imm, i32 %prfop) +declare void @llvm.aarch64.sve.gather.prfw.nx2vi64( %Pg, %bases, i64 %imm, i32 %prfop) +declare void @llvm.aarch64.sve.gather.prfd.nx4vi32( %Pg, %bases, i64 %imm, i32 %prfop) +declare void @llvm.aarch64.sve.gather.prfd.nx2vi64( %Pg, %bases, i64 %imm, i32 %prfop)
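For reference, the vector-plus-immediate encoding rule that the new AArch64_AM::isValidImmForSVEVecImmAddrMode helper implements is: the byte offset must be sizeof(<T>) * k with k = 0, ..., 31, i.e. a multiple of the prefetched element size whose index fits the 5-bit immediate field. The following is a standalone sketch of that rule in plain C++ (not code from the patch; isValidSVEVecImmOffset is a hypothetical name), with the same inputs the invalid-immediate tests above exercise:

#include <cassert>

// Mirrors the predicate's logic: offset is valid iff it is a multiple of the
// element size and the resulting index is at most 31.
static bool isValidSVEVecImmOffset(unsigned OffsetInBytes,
                                   unsigned ScalarSizeInBytes) {
  if (OffsetInBytes % ScalarSizeInBytes) // not a multiple of the element size
    return false;
  return OffsetInBytes / ScalarSizeInBytes <= 31; // index must fit in 5 bits
}

int main() {
  assert(isValidSVEVecImmOffset(16, 8));  // prfd: 16 = 8 * 2          -> valid
  assert(isValidSVEVecImmOffset(248, 8)); // prfd: 248 = 8 * 31, bound -> valid
  assert(!isValidSVEVecImmOffset(33, 2)); // prfh: 33 is odd           -> invalid
  assert(!isValidSVEVecImmOffset(32, 1)); // prfb: index 32 > 31       -> invalid
  return 0;
}

These are exactly the offsets for which the tests expect the fallback to the scalar-base plus uxtw-extended vector-offset form (mov into a GPR followed by prfb/prfh/prfw/prfd with uxtw).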
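Similarly, a minimal model of the operand rewrite done by combineSVEPrefetchVecBaseImmOff may help: when the scalar offset of a vector-base prefetch is not encodable, operands 3 and 4 of the intrinsic node are swapped, so the scalar lands in the base slot and the vector of addresses becomes the offset vector, and operand 1 is remapped to the corresponding _scaled_uxtw intrinsic. The sketch below uses plain C++ with placeholder strings instead of SDValues (the operand positions follow the patch: 0 = chain, 1 = intrinsic ID, 2 = predicate, 3 = vector base, 4 = scalar offset, 5 = prfop; the values "Zbases", "#32", "pldl1strm" are illustrative only):

#include <array>
#include <cstdio>
#include <string>
#include <utility>

int main() {
  std::array<std::string, 6> Ops = {"chain", "gather_prfb", "Pg",
                                    "Zbases", "#32",        "pldl1strm"};
  // #32 is out of range for the vector-plus-immediate form of prfb (0..31),
  // so the scalar moves into the base slot and the vector of addresses
  // becomes the offset vector...
  std::swap(Ops[3], Ops[4]);
  // ...and the intrinsic is remapped to its scalar-base + vector-offset form.
  Ops[1] = "gather_prfb_scaled_uxtw";
  for (const std::string &Op : Ops)
    std::printf("%s ", Op.c_str());
  // Prints: chain gather_prfb_scaled_uxtw Pg #32 Zbases pldl1strm
  std::printf("\n");
  return 0;
}

This is the same shape as the CHECK lines of the invalid-immediate tests, where the out-of-range constant is materialized into a GPR and the prefetch is emitted in its [Xn, Zd.s/d, uxtw] form.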