diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -1727,6 +1727,32 @@ def int_aarch64_sve_ld1_gather_scalar_offset : AdvSIMD_GatherLoad_VS_Intrinsic; + +// +// First-faulting gather loads: scalar base + vector offsets +// + +// 64 bit unscaled offsets +def int_aarch64_sve_ldff1_gather : AdvSIMD_GatherLoad_SV_64b_Offsets_Intrinsic; + +// 64 bit scaled offsets +def int_aarch64_sve_ldff1_gather_index : AdvSIMD_GatherLoad_SV_64b_Offsets_Intrinsic; + +// 32 bit unscaled offsets, sign (sxtw) or zero (zxtw) extended to 64 bits +def int_aarch64_sve_ldff1_gather_sxtw : AdvSIMD_GatherLoad_SV_32b_Offsets_Intrinsic; +def int_aarch64_sve_ldff1_gather_uxtw : AdvSIMD_GatherLoad_SV_32b_Offsets_Intrinsic; + +// 32 bit scaled offsets, sign (sxtw) or zero (zxtw) extended to 64 bits +def int_aarch64_sve_ldff1_gather_sxtw_index : AdvSIMD_GatherLoad_SV_32b_Offsets_Intrinsic; +def int_aarch64_sve_ldff1_gather_uxtw_index : AdvSIMD_GatherLoad_SV_32b_Offsets_Intrinsic; + +// +// First-faulting gather loads: vector base + scalar offset +// + +def int_aarch64_sve_ldff1_gather_scalar_offset : AdvSIMD_GatherLoad_VS_Intrinsic; + + // // Scatter stores: scalar base + vector offsets // diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -242,6 +242,25 @@ GLD1S_UXTW_SCALED, GLD1S_SXTW_SCALED, GLD1S_IMM, + + // Unsigned gather loads. + GLDFF1, + GLDFF1_SCALED, + GLDFF1_UXTW, + GLDFF1_SXTW, + GLDFF1_UXTW_SCALED, + GLDFF1_SXTW_SCALED, + GLDFF1_IMM, + + // Signed gather loads. + GLDFF1S, + GLDFF1S_SCALED, + GLDFF1S_UXTW, + GLDFF1S_SXTW, + GLDFF1S_UXTW_SCALED, + GLDFF1S_SXTW_SCALED, + GLDFF1S_IMM, + // Scatter store SST1, SST1_SCALED, diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1422,6 +1422,22 @@ case AArch64ISD::GLD1S_SXTW_SCALED: return "AArch64ISD::GLD1S_SXTW_SCALED"; case AArch64ISD::GLD1S_UXTW_SCALED: return "AArch64ISD::GLD1S_UXTW_SCALED"; case AArch64ISD::GLD1S_IMM: return "AArch64ISD::GLD1S_IMM"; + case AArch64ISD::GLDFF1: return "AArch64ISD::GLDFF1"; + case AArch64ISD::GLDFF1_SCALED: return "AArch64ISD::GLDFF1_SCALED"; + case AArch64ISD::GLDFF1_SXTW: return "AArch64ISD::GLDFF1_SXTW"; + case AArch64ISD::GLDFF1_UXTW: return "AArch64ISD::GLDFF1_UXTW"; + case AArch64ISD::GLDFF1_SXTW_SCALED:return "AArch64ISD::GLDFF1_SXTW_SCALED"; + case AArch64ISD::GLDFF1_UXTW_SCALED:return "AArch64ISD::GLDFF1_UXTW_SCALED"; + case AArch64ISD::GLDFF1_IMM: return "AArch64ISD::GLDFF1_IMM"; + case AArch64ISD::GLDFF1S: return "AArch64ISD::GLDFF1S"; + case AArch64ISD::GLDFF1S_SCALED: return "AArch64ISD::GLDFF1S_SCALED"; + case AArch64ISD::GLDFF1S_SXTW: return "AArch64ISD::GLDFF1S_SXTW"; + case AArch64ISD::GLDFF1S_UXTW: return "AArch64ISD::GLDFF1S_UXTW"; + case AArch64ISD::GLDFF1S_SXTW_SCALED: + return "AArch64ISD::GLDFF1S_SXTW_SCALED"; + case AArch64ISD::GLDFF1S_UXTW_SCALED: + return "AArch64ISD::GLDFF1S_UXTW_SCALED"; + case AArch64ISD::GLDFF1S_IMM: return "AArch64ISD::GLDFF1S_IMM"; case AArch64ISD::SST1: return "AArch64ISD::SST1"; case AArch64ISD::SST1_SCALED: return "AArch64ISD::SST1_SCALED"; case AArch64ISD::SST1_SXTW: return "AArch64ISD::SST1_SXTW"; @@ -10423,6
+10439,13 @@ case AArch64ISD::GLD1_UXTW: case AArch64ISD::GLD1_UXTW_SCALED: case AArch64ISD::GLD1_IMM: + case AArch64ISD::GLDFF1: + case AArch64ISD::GLDFF1_SCALED: + case AArch64ISD::GLDFF1_SXTW: + case AArch64ISD::GLDFF1_SXTW_SCALED: + case AArch64ISD::GLDFF1_UXTW: + case AArch64ISD::GLDFF1_UXTW_SCALED: + case AArch64ISD::GLDFF1_IMM: MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT(); break; default: @@ -12696,13 +12719,13 @@ // vector of offsets (that fits into one register) SDValue Offset = N->getOperand(4); - // GLD1_IMM requires that the offset is an immediate that is: + // GLD{FF}1_IMM requires that the offset is an immediate that is: // * a multiple of #SizeInBytes, // * in the range [0, 31 x #SizeInBytes], // where #SizeInBytes is the size in bytes of the loaded items. For // immediates outside that range and non-immediate scalar offsets use GLD1 or // GLD1_UXTW instead. - if (Opcode == AArch64ISD::GLD1_IMM) { + if (Opcode == AArch64ISD::GLD1_IMM || Opcode == AArch64ISD::GLDFF1_IMM) { uint64_t MaxIndex = 31; uint64_t RetElSize = RetElVT.getStoreSize().getKnownMinSize(); @@ -12711,9 +12734,11 @@ OffsetConst->getZExtValue() > MaxIndex * RetElSize || OffsetConst->getZExtValue() % RetElSize) { if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy) - Opcode = AArch64ISD::GLD1_UXTW; + Opcode = (Opcode == AArch64ISD::GLD1_IMM) ? AArch64ISD::GLD1_UXTW + : AArch64ISD::GLDFF1_UXTW; else - Opcode = AArch64ISD::GLD1; + Opcode = (Opcode == AArch64ISD::GLD1_IMM) ? AArch64ISD::GLD1 + : AArch64ISD::GLDFF1; std::swap(Base, Offset); } @@ -12802,6 +12827,27 @@ case AArch64ISD::GLD1_IMM: NewOpc = AArch64ISD::GLD1S_IMM; break; + case AArch64ISD::GLDFF1: + NewOpc = AArch64ISD::GLDFF1S; + break; + case AArch64ISD::GLDFF1_SCALED: + NewOpc = AArch64ISD::GLDFF1S_SCALED; + break; + case AArch64ISD::GLDFF1_SXTW: + NewOpc = AArch64ISD::GLDFF1S_SXTW; + break; + case AArch64ISD::GLDFF1_SXTW_SCALED: + NewOpc = AArch64ISD::GLDFF1S_SXTW_SCALED; + break; + case AArch64ISD::GLDFF1_UXTW: + NewOpc = AArch64ISD::GLDFF1S_UXTW; + break; + case AArch64ISD::GLDFF1_UXTW_SCALED: + NewOpc = AArch64ISD::GLDFF1S_UXTW_SCALED; + break; + case AArch64ISD::GLDFF1_IMM: + NewOpc = AArch64ISD::GLDFF1S_IMM; + break; default: return SDValue(); } @@ -12939,6 +12985,24 @@ /*OnlyPackedOffsets=*/false); case Intrinsic::aarch64_sve_ld1_gather_scalar_offset: return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_IMM); + case Intrinsic::aarch64_sve_ldff1_gather: + return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1); + case Intrinsic::aarch64_sve_ldff1_gather_index: + return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1_SCALED); + case Intrinsic::aarch64_sve_ldff1_gather_sxtw: + return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1_SXTW, + /*OnlyPackedOffsets=*/false); + case Intrinsic::aarch64_sve_ldff1_gather_uxtw: + return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1_UXTW, + /*OnlyPackedOffsets=*/false); + case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index: + return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1_SXTW_SCALED, + /*OnlyPackedOffsets=*/false); + case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index: + return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1_UXTW_SCALED, + /*OnlyPackedOffsets=*/false); + case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset: + return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1_IMM); case Intrinsic::aarch64_sve_st1_scatter: return performScatterStoreCombine(N, DAG, AArch64ISD::SST1); case Intrinsic::aarch64_sve_st1_scatter_index: diff --git
a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -53,6 +53,22 @@ def AArch64ld1s_gather_sxtw_scaled : SDNode<"AArch64ISD::GLD1S_SXTW_SCALED", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; def AArch64ld1s_gather_imm : SDNode<"AArch64ISD::GLD1S_IMM", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; +def AArch64ldff1_gather : SDNode<"AArch64ISD::GLDFF1", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; +def AArch64ldff1_gather_scaled : SDNode<"AArch64ISD::GLDFF1_SCALED", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; +def AArch64ldff1_gather_uxtw : SDNode<"AArch64ISD::GLDFF1_UXTW", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; +def AArch64ldff1_gather_sxtw : SDNode<"AArch64ISD::GLDFF1_SXTW", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; +def AArch64ldff1_gather_uxtw_scaled : SDNode<"AArch64ISD::GLDFF1_UXTW_SCALED", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; +def AArch64ldff1_gather_sxtw_scaled : SDNode<"AArch64ISD::GLDFF1_SXTW_SCALED", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; +def AArch64ldff1_gather_imm : SDNode<"AArch64ISD::GLDFF1_IMM", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; + +def AArch64ldff1s_gather : SDNode<"AArch64ISD::GLDFF1S", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; +def AArch64ldff1s_gather_scaled : SDNode<"AArch64ISD::GLDFF1S_SCALED", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; +def AArch64ldff1s_gather_uxtw : SDNode<"AArch64ISD::GLDFF1S_UXTW", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; +def AArch64ldff1s_gather_sxtw : SDNode<"AArch64ISD::GLDFF1S_SXTW", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; +def AArch64ldff1s_gather_uxtw_scaled : SDNode<"AArch64ISD::GLDFF1S_UXTW_SCALED", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; +def AArch64ldff1s_gather_sxtw_scaled : SDNode<"AArch64ISD::GLDFF1S_SXTW_SCALED", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; +def AArch64ldff1s_gather_imm : SDNode<"AArch64ISD::GLDFF1S_IMM", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; + // Scatter stores - node definitions // def SDT_AArch64_SCATTER_SV : SDTypeProfile<0, 5, [ @@ -581,114 +597,114 @@ // Gathers using unscaled 32-bit offsets, e.g. 
// ld1h z0.s, p0/z, [x0, z0.s, uxtw] defm GLD1SB_S : sve_mem_32b_gld_vs_32_unscaled<0b0000, "ld1sb", AArch64ld1s_gather_sxtw, AArch64ld1s_gather_uxtw, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>; - defm GLDFF1SB_S : sve_mem_32b_gld_vs_32_unscaled<0b0001, "ldff1sb", null_frag, null_frag, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>; + defm GLDFF1SB_S : sve_mem_32b_gld_vs_32_unscaled<0b0001, "ldff1sb", AArch64ldff1s_gather_sxtw, AArch64ldff1s_gather_uxtw, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>; defm GLD1B_S : sve_mem_32b_gld_vs_32_unscaled<0b0010, "ld1b", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>; - defm GLDFF1B_S : sve_mem_32b_gld_vs_32_unscaled<0b0011, "ldff1b", null_frag, null_frag, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>; - defm GLD1SH_S : sve_mem_32b_gld_vs_32_unscaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw, AArch64ld1s_gather_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>; - defm GLDFF1SH_S : sve_mem_32b_gld_vs_32_unscaled<0b0101, "ldff1sh", null_frag, null_frag, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>; - defm GLD1H_S : sve_mem_32b_gld_vs_32_unscaled<0b0110, "ld1h", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>; - defm GLDFF1H_S : sve_mem_32b_gld_vs_32_unscaled<0b0111, "ldff1h", null_frag, null_frag, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>; - defm GLD1W : sve_mem_32b_gld_vs_32_unscaled<0b1010, "ld1w", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i32>; - defm GLDFF1W : sve_mem_32b_gld_vs_32_unscaled<0b1011, "ldff1w", null_frag, null_frag, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i32>; + defm GLDFF1B_S : sve_mem_32b_gld_vs_32_unscaled<0b0011, "ldff1b", AArch64ldff1_gather_sxtw, AArch64ldff1_gather_uxtw, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>; + defm GLD1SH_S : sve_mem_32b_gld_vs_32_unscaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw, AArch64ld1s_gather_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>; + defm GLDFF1SH_S : sve_mem_32b_gld_vs_32_unscaled<0b0101, "ldff1sh", AArch64ldff1s_gather_sxtw, AArch64ldff1s_gather_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>; + defm GLD1H_S : sve_mem_32b_gld_vs_32_unscaled<0b0110, "ld1h", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>; + defm GLDFF1H_S : sve_mem_32b_gld_vs_32_unscaled<0b0111, "ldff1h", AArch64ldff1_gather_sxtw, AArch64ldff1_gather_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>; + defm GLD1W : sve_mem_32b_gld_vs_32_unscaled<0b1010, "ld1w", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i32>; + defm GLDFF1W : sve_mem_32b_gld_vs_32_unscaled<0b1011, "ldff1w", AArch64ldff1_gather_sxtw, AArch64ldff1_gather_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i32>; // Gathers using scaled 32-bit offsets, e.g. 
// ld1h z0.s, p0/z, [x0, z0.s, uxtw #1] - defm GLD1SH_S : sve_mem_32b_gld_sv_32_scaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw_scaled, AArch64ld1s_gather_uxtw_scaled, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>; - defm GLDFF1SH_S : sve_mem_32b_gld_sv_32_scaled<0b0101, "ldff1sh", null_frag, null_frag, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>; - defm GLD1H_S : sve_mem_32b_gld_sv_32_scaled<0b0110, "ld1h", AArch64ld1_gather_sxtw_scaled, AArch64ld1_gather_uxtw_scaled, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>; - defm GLDFF1H_S : sve_mem_32b_gld_sv_32_scaled<0b0111, "ldff1h", null_frag, null_frag, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>; - defm GLD1W : sve_mem_32b_gld_sv_32_scaled<0b1010, "ld1w", AArch64ld1_gather_sxtw_scaled, AArch64ld1_gather_uxtw_scaled, ZPR32ExtSXTW32, ZPR32ExtUXTW32, nxv4i32>; - defm GLDFF1W : sve_mem_32b_gld_sv_32_scaled<0b1011, "ldff1w", null_frag, null_frag, ZPR32ExtSXTW32, ZPR32ExtUXTW32, nxv4i32>; + defm GLD1SH_S : sve_mem_32b_gld_sv_32_scaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw_scaled, AArch64ld1s_gather_uxtw_scaled, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>; + defm GLDFF1SH_S : sve_mem_32b_gld_sv_32_scaled<0b0101, "ldff1sh", AArch64ldff1s_gather_sxtw_scaled, AArch64ldff1s_gather_uxtw_scaled, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>; + defm GLD1H_S : sve_mem_32b_gld_sv_32_scaled<0b0110, "ld1h", AArch64ld1_gather_sxtw_scaled, AArch64ld1_gather_uxtw_scaled, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>; + defm GLDFF1H_S : sve_mem_32b_gld_sv_32_scaled<0b0111, "ldff1h", AArch64ldff1_gather_sxtw_scaled, AArch64ldff1_gather_uxtw_scaled, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>; + defm GLD1W : sve_mem_32b_gld_sv_32_scaled<0b1010, "ld1w", AArch64ld1_gather_sxtw_scaled, AArch64ld1_gather_uxtw_scaled, ZPR32ExtSXTW32, ZPR32ExtUXTW32, nxv4i32>; + defm GLDFF1W : sve_mem_32b_gld_sv_32_scaled<0b1011, "ldff1w", AArch64ldff1_gather_sxtw_scaled, AArch64ldff1_gather_uxtw_scaled, ZPR32ExtSXTW32, ZPR32ExtUXTW32, nxv4i32>; // Gathers using 32-bit pointers with scaled offset, e.g. 
// ld1h z0.s, p0/z, [z0.s, #16] - defm GLD1SB_S : sve_mem_32b_gld_vi_32_ptrs<0b0000, "ld1sb", imm0_31, AArch64ld1s_gather_imm, nxv4i8>; - defm GLDFF1SB_S : sve_mem_32b_gld_vi_32_ptrs<0b0001, "ldff1sb", imm0_31, null_frag, nxv4i8>; - defm GLD1B_S : sve_mem_32b_gld_vi_32_ptrs<0b0010, "ld1b", imm0_31, AArch64ld1_gather_imm, nxv4i8>; - defm GLDFF1B_S : sve_mem_32b_gld_vi_32_ptrs<0b0011, "ldff1b", imm0_31, null_frag, nxv4i8>; - defm GLD1SH_S : sve_mem_32b_gld_vi_32_ptrs<0b0100, "ld1sh", uimm5s2, AArch64ld1s_gather_imm, nxv4i16>; - defm GLDFF1SH_S : sve_mem_32b_gld_vi_32_ptrs<0b0101, "ldff1sh", uimm5s2, null_frag, nxv4i16>; - defm GLD1H_S : sve_mem_32b_gld_vi_32_ptrs<0b0110, "ld1h", uimm5s2, AArch64ld1_gather_imm, nxv4i16>; - defm GLDFF1H_S : sve_mem_32b_gld_vi_32_ptrs<0b0111, "ldff1h", uimm5s2, null_frag, nxv4i16>; - defm GLD1W : sve_mem_32b_gld_vi_32_ptrs<0b1010, "ld1w", uimm5s4, AArch64ld1_gather_imm, nxv4i32>; - defm GLDFF1W : sve_mem_32b_gld_vi_32_ptrs<0b1011, "ldff1w", uimm5s4, null_frag, nxv4i32>; + defm GLD1SB_S : sve_mem_32b_gld_vi_32_ptrs<0b0000, "ld1sb", imm0_31, AArch64ld1s_gather_imm, nxv4i8>; + defm GLDFF1SB_S : sve_mem_32b_gld_vi_32_ptrs<0b0001, "ldff1sb", imm0_31, AArch64ldff1s_gather_imm, nxv4i8>; + defm GLD1B_S : sve_mem_32b_gld_vi_32_ptrs<0b0010, "ld1b", imm0_31, AArch64ld1_gather_imm, nxv4i8>; + defm GLDFF1B_S : sve_mem_32b_gld_vi_32_ptrs<0b0011, "ldff1b", imm0_31, AArch64ldff1_gather_imm, nxv4i8>; + defm GLD1SH_S : sve_mem_32b_gld_vi_32_ptrs<0b0100, "ld1sh", uimm5s2, AArch64ld1s_gather_imm, nxv4i16>; + defm GLDFF1SH_S : sve_mem_32b_gld_vi_32_ptrs<0b0101, "ldff1sh", uimm5s2, AArch64ldff1s_gather_imm, nxv4i16>; + defm GLD1H_S : sve_mem_32b_gld_vi_32_ptrs<0b0110, "ld1h", uimm5s2, AArch64ld1_gather_imm, nxv4i16>; + defm GLDFF1H_S : sve_mem_32b_gld_vi_32_ptrs<0b0111, "ldff1h", uimm5s2, AArch64ldff1_gather_imm, nxv4i16>; + defm GLD1W : sve_mem_32b_gld_vi_32_ptrs<0b1010, "ld1w", uimm5s4, AArch64ld1_gather_imm, nxv4i32>; + defm GLDFF1W : sve_mem_32b_gld_vi_32_ptrs<0b1011, "ldff1w", uimm5s4, AArch64ldff1_gather_imm, nxv4i32>; // Gathers using 64-bit pointers with scaled offset, e.g. 
// ld1h z0.d, p0/z, [z0.d, #16] - defm GLD1SB_D : sve_mem_64b_gld_vi_64_ptrs<0b0000, "ld1sb", imm0_31, AArch64ld1s_gather_imm, nxv2i8>; - defm GLDFF1SB_D : sve_mem_64b_gld_vi_64_ptrs<0b0001, "ldff1sb", imm0_31, null_frag, nxv2i8>; - defm GLD1B_D : sve_mem_64b_gld_vi_64_ptrs<0b0010, "ld1b", imm0_31, AArch64ld1_gather_imm, nxv2i8>; - defm GLDFF1B_D : sve_mem_64b_gld_vi_64_ptrs<0b0011, "ldff1b", imm0_31, null_frag, nxv2i8>; - defm GLD1SH_D : sve_mem_64b_gld_vi_64_ptrs<0b0100, "ld1sh", uimm5s2, AArch64ld1s_gather_imm, nxv2i16>; - defm GLDFF1SH_D : sve_mem_64b_gld_vi_64_ptrs<0b0101, "ldff1sh", uimm5s2, null_frag, nxv2i16>; - defm GLD1H_D : sve_mem_64b_gld_vi_64_ptrs<0b0110, "ld1h", uimm5s2, AArch64ld1_gather_imm, nxv2i16>; - defm GLDFF1H_D : sve_mem_64b_gld_vi_64_ptrs<0b0111, "ldff1h", uimm5s2, null_frag, nxv2i16>; - defm GLD1SW_D : sve_mem_64b_gld_vi_64_ptrs<0b1000, "ld1sw", uimm5s4, AArch64ld1s_gather_imm, nxv2i32>; - defm GLDFF1SW_D : sve_mem_64b_gld_vi_64_ptrs<0b1001, "ldff1sw", uimm5s4, null_frag, nxv2i32>; - defm GLD1W_D : sve_mem_64b_gld_vi_64_ptrs<0b1010, "ld1w", uimm5s4, AArch64ld1_gather_imm, nxv2i32>; - defm GLDFF1W_D : sve_mem_64b_gld_vi_64_ptrs<0b1011, "ldff1w", uimm5s4, null_frag, nxv2i32>; - defm GLD1D : sve_mem_64b_gld_vi_64_ptrs<0b1110, "ld1d", uimm5s8, AArch64ld1_gather_imm, nxv2i64>; - defm GLDFF1D : sve_mem_64b_gld_vi_64_ptrs<0b1111, "ldff1d", uimm5s8, null_frag, nxv2i64>; + defm GLD1SB_D : sve_mem_64b_gld_vi_64_ptrs<0b0000, "ld1sb", imm0_31, AArch64ld1s_gather_imm, nxv2i8>; + defm GLDFF1SB_D : sve_mem_64b_gld_vi_64_ptrs<0b0001, "ldff1sb", imm0_31, AArch64ldff1s_gather_imm, nxv2i8>; + defm GLD1B_D : sve_mem_64b_gld_vi_64_ptrs<0b0010, "ld1b", imm0_31, AArch64ld1_gather_imm, nxv2i8>; + defm GLDFF1B_D : sve_mem_64b_gld_vi_64_ptrs<0b0011, "ldff1b", imm0_31, AArch64ldff1_gather_imm, nxv2i8>; + defm GLD1SH_D : sve_mem_64b_gld_vi_64_ptrs<0b0100, "ld1sh", uimm5s2, AArch64ld1s_gather_imm, nxv2i16>; + defm GLDFF1SH_D : sve_mem_64b_gld_vi_64_ptrs<0b0101, "ldff1sh", uimm5s2, AArch64ldff1s_gather_imm, nxv2i16>; + defm GLD1H_D : sve_mem_64b_gld_vi_64_ptrs<0b0110, "ld1h", uimm5s2, AArch64ld1_gather_imm, nxv2i16>; + defm GLDFF1H_D : sve_mem_64b_gld_vi_64_ptrs<0b0111, "ldff1h", uimm5s2, AArch64ldff1_gather_imm, nxv2i16>; + defm GLD1SW_D : sve_mem_64b_gld_vi_64_ptrs<0b1000, "ld1sw", uimm5s4, AArch64ld1s_gather_imm, nxv2i32>; + defm GLDFF1SW_D : sve_mem_64b_gld_vi_64_ptrs<0b1001, "ldff1sw", uimm5s4, AArch64ldff1s_gather_imm, nxv2i32>; + defm GLD1W_D : sve_mem_64b_gld_vi_64_ptrs<0b1010, "ld1w", uimm5s4, AArch64ld1_gather_imm, nxv2i32>; + defm GLDFF1W_D : sve_mem_64b_gld_vi_64_ptrs<0b1011, "ldff1w", uimm5s4, AArch64ldff1_gather_imm, nxv2i32>; + defm GLD1D : sve_mem_64b_gld_vi_64_ptrs<0b1110, "ld1d", uimm5s8, AArch64ld1_gather_imm, nxv2i64>; + defm GLDFF1D : sve_mem_64b_gld_vi_64_ptrs<0b1111, "ldff1d", uimm5s8, AArch64ldff1_gather_imm, nxv2i64>; // Gathers using unscaled 64-bit offsets, e.g. 
// ld1h z0.d, p0/z, [x0, z0.d] - defm GLD1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0000, "ld1sb", AArch64ld1s_gather, nxv2i8>; - defm GLDFF1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0001, "ldff1sb", null_frag, nxv2i8>; - defm GLD1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0010, "ld1b", AArch64ld1_gather, nxv2i8>; - defm GLDFF1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0011, "ldff1b", null_frag, nxv2i8>; - defm GLD1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0100, "ld1sh", AArch64ld1s_gather, nxv2i16>; - defm GLDFF1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0101, "ldff1sh", null_frag, nxv2i16>; - defm GLD1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0110, "ld1h", AArch64ld1_gather, nxv2i16>; - defm GLDFF1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0111, "ldff1h", null_frag, nxv2i16>; - defm GLD1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1000, "ld1sw", AArch64ld1s_gather, nxv2i32>; - defm GLDFF1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1001, "ldff1sw", null_frag, nxv2i32>; - defm GLD1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1010, "ld1w", AArch64ld1_gather, nxv2i32>; - defm GLDFF1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1011, "ldff1w", null_frag, nxv2i32>; - defm GLD1D : sve_mem_64b_gld_vs2_64_unscaled<0b1110, "ld1d", AArch64ld1_gather, nxv2i64>; - defm GLDFF1D : sve_mem_64b_gld_vs2_64_unscaled<0b1111, "ldff1d", null_frag, nxv2i64>; + defm GLD1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0000, "ld1sb", AArch64ld1s_gather, nxv2i8>; + defm GLDFF1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0001, "ldff1sb", AArch64ldff1s_gather, nxv2i8>; + defm GLD1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0010, "ld1b", AArch64ld1_gather, nxv2i8>; + defm GLDFF1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0011, "ldff1b", AArch64ldff1_gather, nxv2i8>; + defm GLD1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0100, "ld1sh", AArch64ld1s_gather, nxv2i16>; + defm GLDFF1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0101, "ldff1sh", AArch64ldff1s_gather, nxv2i16>; + defm GLD1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0110, "ld1h", AArch64ld1_gather, nxv2i16>; + defm GLDFF1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0111, "ldff1h", AArch64ldff1_gather, nxv2i16>; + defm GLD1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1000, "ld1sw", AArch64ld1s_gather, nxv2i32>; + defm GLDFF1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1001, "ldff1sw", AArch64ldff1s_gather, nxv2i32>; + defm GLD1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1010, "ld1w", AArch64ld1_gather, nxv2i32>; + defm GLDFF1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1011, "ldff1w", AArch64ldff1_gather, nxv2i32>; + defm GLD1D : sve_mem_64b_gld_vs2_64_unscaled<0b1110, "ld1d", AArch64ld1_gather, nxv2i64>; + defm GLDFF1D : sve_mem_64b_gld_vs2_64_unscaled<0b1111, "ldff1d", AArch64ldff1_gather, nxv2i64>; // Gathers using scaled 64-bit offsets, e.g. 
// ld1h z0.d, p0/z, [x0, z0.d, lsl #1] - defm GLD1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0100, "ld1sh", AArch64ld1s_gather_scaled, ZPR64ExtLSL16, nxv2i16>; - defm GLDFF1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0101, "ldff1sh", null_frag, ZPR64ExtLSL16, nxv2i16>; - defm GLD1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0110, "ld1h", AArch64ld1_gather_scaled, ZPR64ExtLSL16, nxv2i16>; - defm GLDFF1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0111, "ldff1h", null_frag, ZPR64ExtLSL16, nxv2i16>; - defm GLD1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1000, "ld1sw", AArch64ld1s_gather_scaled, ZPR64ExtLSL32, nxv2i32>; - defm GLDFF1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1001, "ldff1sw", null_frag, ZPR64ExtLSL32, nxv2i32>; - defm GLD1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1010, "ld1w", AArch64ld1_gather_scaled, ZPR64ExtLSL32, nxv2i32>; - defm GLDFF1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1011, "ldff1w", null_frag, ZPR64ExtLSL32, nxv2i32>; - defm GLD1D : sve_mem_64b_gld_sv2_64_scaled<0b1110, "ld1d", AArch64ld1_gather_scaled, ZPR64ExtLSL64, nxv2i64>; - defm GLDFF1D : sve_mem_64b_gld_sv2_64_scaled<0b1111, "ldff1d", null_frag, ZPR64ExtLSL64, nxv2i64>; + defm GLD1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0100, "ld1sh", AArch64ld1s_gather_scaled, ZPR64ExtLSL16, nxv2i16>; + defm GLDFF1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0101, "ldff1sh", AArch64ldff1s_gather_scaled, ZPR64ExtLSL16, nxv2i16>; + defm GLD1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0110, "ld1h", AArch64ld1_gather_scaled, ZPR64ExtLSL16, nxv2i16>; + defm GLDFF1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0111, "ldff1h", AArch64ldff1_gather_scaled, ZPR64ExtLSL16, nxv2i16>; + defm GLD1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1000, "ld1sw", AArch64ld1s_gather_scaled, ZPR64ExtLSL32, nxv2i32>; + defm GLDFF1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1001, "ldff1sw", AArch64ldff1s_gather_scaled, ZPR64ExtLSL32, nxv2i32>; + defm GLD1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1010, "ld1w", AArch64ld1_gather_scaled, ZPR64ExtLSL32, nxv2i32>; + defm GLDFF1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1011, "ldff1w", AArch64ldff1_gather_scaled, ZPR64ExtLSL32, nxv2i32>; + defm GLD1D : sve_mem_64b_gld_sv2_64_scaled<0b1110, "ld1d", AArch64ld1_gather_scaled, ZPR64ExtLSL64, nxv2i64>; + defm GLDFF1D : sve_mem_64b_gld_sv2_64_scaled<0b1111, "ldff1d", AArch64ldff1_gather_scaled, ZPR64ExtLSL64, nxv2i64>; // Gathers using unscaled 32-bit offsets unpacked in 64-bits elements, e.g. 
// ld1h z0.d, p0/z, [x0, z0.d, uxtw] defm GLD1SB_D : sve_mem_64b_gld_vs_32_unscaled<0b0000, "ld1sb", AArch64ld1s_gather_sxtw, AArch64ld1s_gather_uxtw, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>; - defm GLDFF1SB_D : sve_mem_64b_gld_vs_32_unscaled<0b0001, "ldff1sb", null_frag, null_frag, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>; + defm GLDFF1SB_D : sve_mem_64b_gld_vs_32_unscaled<0b0001, "ldff1sb", AArch64ldff1s_gather_sxtw, AArch64ldff1s_gather_uxtw, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>; defm GLD1B_D : sve_mem_64b_gld_vs_32_unscaled<0b0010, "ld1b", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>; - defm GLDFF1B_D : sve_mem_64b_gld_vs_32_unscaled<0b0011, "ldff1b", null_frag, null_frag, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>; + defm GLDFF1B_D : sve_mem_64b_gld_vs_32_unscaled<0b0011, "ldff1b", AArch64ldff1_gather_sxtw, AArch64ldff1_gather_uxtw, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>; defm GLD1SH_D : sve_mem_64b_gld_vs_32_unscaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw, AArch64ld1s_gather_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>; - defm GLDFF1SH_D : sve_mem_64b_gld_vs_32_unscaled<0b0101, "ldff1sh", null_frag, null_frag, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>; + defm GLDFF1SH_D : sve_mem_64b_gld_vs_32_unscaled<0b0101, "ldff1sh", AArch64ldff1s_gather_sxtw, AArch64ldff1s_gather_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>; defm GLD1H_D : sve_mem_64b_gld_vs_32_unscaled<0b0110, "ld1h", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>; - defm GLDFF1H_D : sve_mem_64b_gld_vs_32_unscaled<0b0111, "ldff1h", null_frag, null_frag, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>; + defm GLDFF1H_D : sve_mem_64b_gld_vs_32_unscaled<0b0111, "ldff1h", AArch64ldff1_gather_sxtw, AArch64ldff1_gather_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>; defm GLD1SW_D : sve_mem_64b_gld_vs_32_unscaled<0b1000, "ld1sw", AArch64ld1s_gather_sxtw, AArch64ld1s_gather_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>; - defm GLDFF1SW_D : sve_mem_64b_gld_vs_32_unscaled<0b1001, "ldff1sw", null_frag, null_frag, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>; + defm GLDFF1SW_D : sve_mem_64b_gld_vs_32_unscaled<0b1001, "ldff1sw", AArch64ldff1s_gather_sxtw, AArch64ldff1s_gather_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>; defm GLD1W_D : sve_mem_64b_gld_vs_32_unscaled<0b1010, "ld1w", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>; - defm GLDFF1W_D : sve_mem_64b_gld_vs_32_unscaled<0b1011, "ldff1w", null_frag, null_frag, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>; + defm GLDFF1W_D : sve_mem_64b_gld_vs_32_unscaled<0b1011, "ldff1w", AArch64ldff1_gather_sxtw, AArch64ldff1_gather_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>; defm GLD1D : sve_mem_64b_gld_vs_32_unscaled<0b1110, "ld1d", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i64>; - defm GLDFF1D : sve_mem_64b_gld_vs_32_unscaled<0b1111, "ldff1d", null_frag, null_frag, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i64>; + defm GLDFF1D : sve_mem_64b_gld_vs_32_unscaled<0b1111, "ldff1d", AArch64ldff1_gather_sxtw, AArch64ldff1_gather_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i64>; // Gathers using scaled 32-bit offsets unpacked in 64-bits elements, e.g. 
// ld1h z0.d, p0/z, [x0, z0.d, uxtw #1] - defm GLD1SH_D : sve_mem_64b_gld_sv_32_scaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw_scaled, AArch64ld1s_gather_uxtw_scaled, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>; - defm GLDFF1SH_D : sve_mem_64b_gld_sv_32_scaled<0b0101, "ldff1sh", null_frag, null_frag, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>; - defm GLD1H_D : sve_mem_64b_gld_sv_32_scaled<0b0110, "ld1h", AArch64ld1_gather_sxtw_scaled, AArch64ld1_gather_uxtw_scaled, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>; - defm GLDFF1H_D : sve_mem_64b_gld_sv_32_scaled<0b0111, "ldff1h", null_frag, null_frag, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>; - defm GLD1SW_D : sve_mem_64b_gld_sv_32_scaled<0b1000, "ld1sw", AArch64ld1s_gather_sxtw_scaled, AArch64ld1s_gather_uxtw_scaled, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>; - defm GLDFF1SW_D : sve_mem_64b_gld_sv_32_scaled<0b1001, "ldff1sw", null_frag, null_frag, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>; - defm GLD1W_D : sve_mem_64b_gld_sv_32_scaled<0b1010, "ld1w", AArch64ld1_gather_sxtw_scaled, AArch64ld1_gather_uxtw_scaled, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>; - defm GLDFF1W_D : sve_mem_64b_gld_sv_32_scaled<0b1011, "ldff1w", null_frag, null_frag, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>; - defm GLD1D : sve_mem_64b_gld_sv_32_scaled<0b1110, "ld1d", AArch64ld1_gather_sxtw_scaled, AArch64ld1_gather_uxtw_scaled, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>; - defm GLDFF1D : sve_mem_64b_gld_sv_32_scaled<0b1111, "ldff1d", null_frag, null_frag, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>; + defm GLD1SH_D : sve_mem_64b_gld_sv_32_scaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw_scaled, AArch64ld1s_gather_uxtw_scaled, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>; + defm GLDFF1SH_D : sve_mem_64b_gld_sv_32_scaled<0b0101, "ldff1sh", AArch64ldff1s_gather_sxtw_scaled, AArch64ldff1s_gather_uxtw_scaled, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>; + defm GLD1H_D : sve_mem_64b_gld_sv_32_scaled<0b0110, "ld1h", AArch64ld1_gather_sxtw_scaled, AArch64ld1_gather_uxtw_scaled, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>; + defm GLDFF1H_D : sve_mem_64b_gld_sv_32_scaled<0b0111, "ldff1h", AArch64ldff1_gather_sxtw_scaled, AArch64ldff1_gather_uxtw_scaled, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>; + defm GLD1SW_D : sve_mem_64b_gld_sv_32_scaled<0b1000, "ld1sw", AArch64ld1s_gather_sxtw_scaled, AArch64ld1s_gather_uxtw_scaled, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>; + defm GLDFF1SW_D : sve_mem_64b_gld_sv_32_scaled<0b1001, "ldff1sw", AArch64ldff1s_gather_sxtw_scaled, AArch64ldff1s_gather_uxtw_scaled, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>; + defm GLD1W_D : sve_mem_64b_gld_sv_32_scaled<0b1010, "ld1w", AArch64ld1_gather_sxtw_scaled, AArch64ld1_gather_uxtw_scaled, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>; + defm GLDFF1W_D : sve_mem_64b_gld_sv_32_scaled<0b1011, "ldff1w", AArch64ldff1_gather_sxtw_scaled, AArch64ldff1_gather_uxtw_scaled, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>; + defm GLD1D : sve_mem_64b_gld_sv_32_scaled<0b1110, "ld1d", AArch64ld1_gather_sxtw_scaled, AArch64ld1_gather_uxtw_scaled, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>; + defm GLDFF1D : sve_mem_64b_gld_sv_32_scaled<0b1111, "ldff1d", AArch64ldff1_gather_sxtw_scaled, AArch64ldff1_gather_uxtw_scaled, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>; // Non-temporal contiguous loads (register + immediate) defm LDNT1B_ZRI : sve_mem_cldnt_si<0b00, "ldnt1b", Z_b, ZPR8>; diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ 
b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -6203,10 +6203,19 @@ def : InstAlias(NAME # _SXTW_SCALED_REAL) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>; + // We need a layer of indirection because early machine code passes balk at + // physical register (i.e. FFR) uses that have no previous definition. + let hasSideEffects = 1, hasNoSchedulingInfo = 1 in { + def _UXTW_SCALED : Pseudo<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), []>, + PseudoInstExpansion<(!cast(NAME # _UXTW_SCALED_REAL) Z_s:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>; + def _SXTW_SCALED : Pseudo<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), []>, + PseudoInstExpansion<(!cast(NAME # _SXTW_SCALED_REAL) Z_s:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>; + } + def : Pat<(nxv4i32 (uxtw_op (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$indices), vt)), - (!cast(NAME # _UXTW_SCALED_REAL) PPR:$gp, GPR64sp:$base, ZPR:$indices)>; + (!cast(NAME # _UXTW_SCALED) PPR:$gp, GPR64sp:$base, ZPR:$indices)>; def : Pat<(nxv4i32 (sxtw_op (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$indices), vt)), - (!cast(NAME # _SXTW_SCALED_REAL) PPR:$gp, GPR64sp:$base, ZPR:$indices)>; + (!cast(NAME # _SXTW_SCALED) PPR:$gp, GPR64sp:$base, ZPR:$indices)>; } multiclass sve_mem_32b_gld_vs_32_unscaled opc, string asm, @@ -6223,10 +6232,19 @@ def : InstAlias(NAME # _SXTW_REAL) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>; + // We need a layer of indirection because early machine code passes balk at + // physical register (i.e. FFR) uses that have no previous definition. + let hasSideEffects = 1, hasNoSchedulingInfo = 1 in { + def _UXTW : Pseudo<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), []>, + PseudoInstExpansion<(!cast(NAME # _UXTW_REAL) Z_s:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>; + def _SXTW : Pseudo<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), []>, + PseudoInstExpansion<(!cast(NAME # _SXTW_REAL) Z_s:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>; + } + def : Pat<(nxv4i32 (uxtw_op (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$offsets), vt)), - (!cast(NAME # _UXTW_REAL) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; + (!cast(NAME # _UXTW) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; def : Pat<(nxv4i32 (sxtw_op (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$offsets), vt)), - (!cast(NAME # _SXTW_REAL) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; + (!cast(NAME # _SXTW) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; } @@ -6265,8 +6283,15 @@ def : InstAlias(NAME # _IMM_REAL) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, 0), 1>; + // We need a layer of indirection because early machine code passes balk at + // physical register (i.e. FFR) uses that have no previous definition. + let hasSideEffects = 1, hasNoSchedulingInfo = 1 in { + def _IMM : Pseudo<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, ZPR32:$Zn, imm_ty:$imm5), []>, + PseudoInstExpansion<(!cast(NAME # _IMM_REAL) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, imm_ty:$imm5)>; + } + def : Pat<(nxv4i32 (op (nxv4i1 PPR:$gp), (nxv4i32 ZPR:$ptrs), imm_ty:$index, vt)), - (!cast(NAME # _IMM_REAL) PPR:$gp, ZPR:$ptrs, imm_ty:$index)>; + (!cast(NAME # _IMM) PPR:$gp, ZPR:$ptrs, imm_ty:$index)>; } class sve_mem_prfm_si msz, string asm> @@ -6515,10 +6540,19 @@ def : InstAlias(NAME # _SXTW_SCALED_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>; + // We need a layer of indirection because early machine code passes balk at + // physical register (i.e. FFR) uses that have no previous definition. 
+ let hasSideEffects = 1, hasNoSchedulingInfo = 1 in { + def _UXTW_SCALED : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), []>, + PseudoInstExpansion<(!cast(NAME # _UXTW_SCALED_REAL) Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>; + def _SXTW_SCALED : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), []>, + PseudoInstExpansion<(!cast(NAME # _SXTW_SCALED_REAL) Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>; + } + def : Pat<(nxv2i64 (uxtw_op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$indices), vt)), - (!cast(NAME # _UXTW_SCALED_REAL) PPR:$gp, GPR64sp:$base, ZPR:$indices)>; + (!cast(NAME # _UXTW_SCALED) PPR:$gp, GPR64sp:$base, ZPR:$indices)>; def : Pat<(nxv2i64 (sxtw_op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$indices), vt)), - (!cast(NAME # _SXTW_SCALED_REAL) PPR:$gp, GPR64sp:$base, ZPR:$indices)>; + (!cast(NAME # _SXTW_SCALED) PPR:$gp, GPR64sp:$base, ZPR:$indices)>; } multiclass sve_mem_64b_gld_vs_32_unscaled opc, string asm, @@ -6535,10 +6569,19 @@ def : InstAlias(NAME # _SXTW_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>; + // We need a layer of indirection because early machine code passes balk at + // physical register (i.e. FFR) uses that have no previous definition. + let hasSideEffects = 1, hasNoSchedulingInfo = 1 in { + def _UXTW : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), []>, + PseudoInstExpansion<(!cast(NAME # _UXTW_REAL) Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>; + def _SXTW : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), []>, + PseudoInstExpansion<(!cast(NAME # _SXTW_REAL) Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>; + } + def : Pat<(nxv2i64 (uxtw_op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt)), - (!cast(NAME # _UXTW_REAL) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; + (!cast(NAME # _UXTW) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; def : Pat<(nxv2i64 (sxtw_op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt)), - (!cast(NAME # _SXTW_REAL) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; + (!cast(NAME # _SXTW) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; } multiclass sve_mem_64b_gld_sv2_64_scaled opc, string asm, @@ -6549,8 +6592,15 @@ def : InstAlias(NAME # _SCALED_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm), 0>; + // We need a layer of indirection because early machine code passes balk at + // physical register (i.e. FFR) uses that have no previous definition. + let hasSideEffects = 1, hasNoSchedulingInfo = 1 in { + def _SCALED : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm), []>, + PseudoInstExpansion<(!cast(NAME # _SCALED_REAL) Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm)>; + } + def : Pat<(nxv2i64 (op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$indices), vt)), - (!cast(NAME # _SCALED_REAL) PPR:$gp, GPR64sp:$base, ZPR:$indices)>; + (!cast(NAME # _SCALED) PPR:$gp, GPR64sp:$base, ZPR:$indices)>; } multiclass sve_mem_64b_gld_vs2_64_unscaled opc, string asm, @@ -6560,8 +6610,15 @@ def : InstAlias(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm), 0>; + // We need a layer of indirection because early machine code passes balk at + // physical register (i.e. FFR) uses that have no previous definition. 
+ let hasSideEffects = 1, hasNoSchedulingInfo = 1 in { + def "" : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm), []>, + PseudoInstExpansion<(!cast(NAME # _REAL) Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm)>; + } + def : Pat<(nxv2i64 (op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt)), - (!cast(NAME # _REAL) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; + (!cast(NAME) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>; } class sve_mem_64b_gld_vi opc, string asm, Operand imm_ty> @@ -6599,8 +6656,15 @@ def : InstAlias(NAME # _IMM_REAL) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, 0), 1>; + // We need a layer of indirection because early machine code passes balk at + // physical register (i.e. FFR) uses that have no previous definition. + let hasSideEffects = 1, hasNoSchedulingInfo = 1 in { + def _IMM : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, ZPR64:$Zn, imm_ty:$imm5), []>, + PseudoInstExpansion<(!cast(NAME # _IMM_REAL) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, imm_ty:$imm5)>; + } + def : Pat<(nxv2i64 (op (nxv2i1 PPR:$gp), (nxv2i64 ZPR:$ptrs), imm_ty:$index, vt)), - (!cast(NAME # _IMM_REAL) PPR:$gp, ZPR:$ptrs, imm_ty:$index)>; + (!cast(NAME # _IMM) PPR:$gp, ZPR:$ptrs, imm_ty:$index)>; } // bit lsl is '0' if the offsets are extended (uxtw/sxtw), '1' if shifted (lsl) diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-ff-gather-loads-32bit-scaled-offsets.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-ff-gather-loads-32bit-scaled-offsets.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-ff-gather-loads-32bit-scaled-offsets.ll @@ -0,0 +1,255 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s + +; +; LDFF1H, LDFF1W, LDFF1D: base + 32-bit scaled offset, sign (sxtw) or zero (uxtw) +; extended to 64 bits +; e.g. 
ldff1h z0.d, p0/z, [x0, z0.d, uxtw #1] +; + +; LDFF1H +define @gldff1h_s_uxtw_index( %pg, i16* %base, %b) { +; CHECK-LABEL: gldff1h_s_uxtw_index: +; CHECK: ldff1h { z0.s }, p0/z, [x0, z0.s, uxtw #1] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.uxtw.index.nxv4i16( %pg, + i16* %base, + %b) + %res = zext %load to + ret %res +} + +define @gldff1h_s_sxtw_index( %pg, i16* %base, %b) { +; CHECK-LABEL: gldff1h_s_sxtw_index: +; CHECK: ldff1h { z0.s }, p0/z, [x0, z0.s, sxtw #1] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.sxtw.index.nxv4i16( %pg, + i16* %base, + %b) + %res = zext %load to + ret %res +} + +define @gldff1h_d_uxtw_index( %pg, i16* %base, %b) { +; CHECK-LABEL: gldff1h_d_uxtw_index: +; CHECK: ldff1h { z0.d }, p0/z, [x0, z0.d, uxtw #1] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.uxtw.index.nxv2i16( %pg, + i16* %base, + %b) + %res = zext %load to + ret %res +} + +define @gldff1h_d_sxtw_index( %pg, i16* %base, %b) { +; CHECK-LABEL: gldff1h_d_sxtw_index: +; CHECK: ldff1h { z0.d }, p0/z, [x0, z0.d, sxtw #1] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.sxtw.index.nxv2i16( %pg, + i16* %base, + %b) + %res = zext %load to + ret %res +} + +; LDFF1W +define @gldff1w_s_uxtw_index( %pg, i32* %base, %b) { +; CHECK-LABEL: gldff1w_s_uxtw_index: +; CHECK: ldff1w { z0.s }, p0/z, [x0, z0.s, uxtw #2] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.uxtw.index.nxv4i32( %pg, + i32* %base, + %b) + ret %load +} + +define @gldff1w_s_sxtw_index( %pg, i32* %base, %b) { +; CHECK-LABEL: gldff1w_s_sxtw_index: +; CHECK: ldff1w { z0.s }, p0/z, [x0, z0.s, sxtw #2] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.sxtw.index.nxv4i32( %pg, + i32* %base, + %b) + ret %load +} + +define @gldff1w_d_uxtw_index( %pg, i32* %base, %b) { +; CHECK-LABEL: gldff1w_d_uxtw_index: +; CHECK: ldff1w { z0.d }, p0/z, [x0, z0.d, uxtw #2] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.uxtw.index.nxv2i32( %pg, + i32* %base, + %b) + %res = zext %load to + ret %res +} + +define @gldff1w_d_sxtw_index( %pg, i32* %base, %b) { +; CHECK-LABEL: gldff1w_d_sxtw_index: +; CHECK: ldff1w { z0.d }, p0/z, [x0, z0.d, sxtw #2] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.sxtw.index.nxv2i32( %pg, + i32* %base, + %b) + %res = zext %load to + ret %res +} + +define @gldff1w_s_uxtw_index_float( %pg, float* %base, %b) { +; CHECK-LABEL: gldff1w_s_uxtw_index_float: +; CHECK: ldff1w { z0.s }, p0/z, [x0, z0.s, uxtw #2] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.uxtw.index.nxv4f32( %pg, + float* %base, + %b) + ret %load +} + +define @gldff1w_s_sxtw_index_float( %pg, float* %base, %b) { +; CHECK-LABEL: gldff1w_s_sxtw_index_float: +; CHECK: ldff1w { z0.s }, p0/z, [x0, z0.s, sxtw #2] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.sxtw.index.nxv4f32( %pg, + float* %base, + %b) + ret %load +} + +; LDFF1D +define @gldff1d_s_uxtw_index( %pg, i64* %base, %b) { +; CHECK-LABEL: gldff1d_s_uxtw_index: +; CHECK: ldff1d { z0.d }, p0/z, [x0, z0.d, uxtw #3] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.uxtw.index.nxv2i64( %pg, + i64* %base, + %b) + ret %load +} + +define @gldff1d_sxtw_index( %pg, i64* %base, %b) { +; CHECK-LABEL: gldff1d_sxtw_index: +; CHECK: ldff1d { z0.d }, p0/z, [x0, z0.d, sxtw #3] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.sxtw.index.nxv2i64( %pg, + i64* %base, + %b) + ret %load +} + +define @gldff1d_uxtw_index_double( %pg, double* %base, 
%b) { +; CHECK-LABEL: gldff1d_uxtw_index_double: +; CHECK: ldff1d { z0.d }, p0/z, [x0, z0.d, uxtw #3] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.uxtw.index.nxv2f64( %pg, + double* %base, + %b) + ret %load +} + +define @gldff1d_sxtw_index_double( %pg, double* %base, %b) { +; CHECK-LABEL: gldff1d_sxtw_index_double: +; CHECK: ldff1d { z0.d }, p0/z, [x0, z0.d, sxtw #3] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.sxtw.index.nxv2f64( %pg, + double* %base, + %b) + ret %load +} + +; +; LDFF1SH, LDFF1SW, LDFF1SD: base + 32-bit scaled offset, sign (sxtw) or zero (uxtw) +; extended to 64 bits +; e.g. ldff1sh z0.d, p0/z, [x0, z0.d, uxtw #1] +; + +; LDFF1SH +define @gldff1sh_s_uxtw_index( %pg, i16* %base, %b) { +; CHECK-LABEL: gldff1sh_s_uxtw_index: +; CHECK: ldff1sh { z0.s }, p0/z, [x0, z0.s, uxtw #1] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.uxtw.index.nxv4i16( %pg, + i16* %base, + %b) + %res = sext %load to + ret %res +} + +define @gldff1sh_s_sxtw_index( %pg, i16* %base, %b) { +; CHECK-LABEL: gldff1sh_s_sxtw_index: +; CHECK: ldff1sh { z0.s }, p0/z, [x0, z0.s, sxtw #1] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.sxtw.index.nxv4i16( %pg, + i16* %base, + %b) + %res = sext %load to + ret %res +} + +define @gldff1sh_d_uxtw_index( %pg, i16* %base, %b) { +; CHECK-LABEL: gldff1sh_d_uxtw_index: +; CHECK: ldff1sh { z0.d }, p0/z, [x0, z0.d, uxtw #1] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.uxtw.index.nxv2i16( %pg, + i16* %base, + %b) + %res = sext %load to + ret %res +} + +define @gldff1sh_d_sxtw_index( %pg, i16* %base, %b) { +; CHECK-LABEL: gldff1sh_d_sxtw_index: +; CHECK: ldff1sh { z0.d }, p0/z, [x0, z0.d, sxtw #1] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.sxtw.index.nxv2i16( %pg, + i16* %base, + %b) + %res = sext %load to + ret %res +} + +; LDFF1SW +define @gldff1sw_d_uxtw_index( %pg, i32* %base, %b) { +; CHECK-LABEL: gldff1sw_d_uxtw_index: +; CHECK: ldff1sw { z0.d }, p0/z, [x0, z0.d, uxtw #2] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.uxtw.index.nxv2i32( %pg, + i32* %base, + %b) + %res = sext %load to + ret %res +} + +define @gldff1sw_d_sxtw_index( %pg, i32* %base, %b) { +; CHECK-LABEL: gldff1sw_d_sxtw_index: +; CHECK: ldff1sw { z0.d }, p0/z, [x0, z0.d, sxtw #2] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.sxtw.index.nxv2i32( %pg, + i32* %base, + %b) + %res = sext %load to + ret %res +} + + +; LDFF1H/LDFF1SH +declare @llvm.aarch64.sve.ldff1.gather.uxtw.index.nxv4i16(, i16*, ) +declare @llvm.aarch64.sve.ldff1.gather.sxtw.index.nxv4i16(, i16*, ) + +declare @llvm.aarch64.sve.ldff1.gather.uxtw.index.nxv2i16(, i16*, ) +declare @llvm.aarch64.sve.ldff1.gather.sxtw.index.nxv2i16(, i16*, ) + +; LDFF1W/LDFF1SW +declare @llvm.aarch64.sve.ldff1.gather.uxtw.index.nxv4i32(, i32*, ) +declare @llvm.aarch64.sve.ldff1.gather.sxtw.index.nxv4i32(, i32*, ) + +declare @llvm.aarch64.sve.ldff1.gather.uxtw.index.nxv2i32(, i32*, ) +declare @llvm.aarch64.sve.ldff1.gather.sxtw.index.nxv2i32(, i32*, ) + +declare @llvm.aarch64.sve.ldff1.gather.uxtw.index.nxv4f32(, float*, ) +declare @llvm.aarch64.sve.ldff1.gather.sxtw.index.nxv4f32(, float*, ) + +; LDFF1D +declare @llvm.aarch64.sve.ldff1.gather.uxtw.index.nxv2i64(, i64*, ) +declare @llvm.aarch64.sve.ldff1.gather.sxtw.index.nxv2i64(, i64*, ) + +declare @llvm.aarch64.sve.ldff1.gather.uxtw.index.nxv2f64(, double*, ) +declare @llvm.aarch64.sve.ldff1.gather.sxtw.index.nxv2f64(, double*, ) diff --git 
a/llvm/test/CodeGen/AArch64/sve-intrinsics-ff-gather-loads-32bit-unscaled-offsets.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-ff-gather-loads-32bit-unscaled-offsets.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-ff-gather-loads-32bit-unscaled-offsets.ll @@ -0,0 +1,348 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s + +; +; LDFF1B, LDFF1W, LDFF1H, LDFF1D: base + 32-bit unscaled offset, sign (sxtw) or zero +; (uxtw) extended to 64 bits. +; e.g. ldff1h { z0.d }, p0/z, [x0, z0.d, uxtw] +; + +; LDFF1B +define @gldff1b_s_uxtw( %pg, i8* %base, %b) { +; CHECK-LABEL: gldff1b_s_uxtw: +; CHECK: ldff1b { z0.s }, p0/z, [x0, z0.s, uxtw] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.uxtw.nxv4i8( %pg, + i8* %base, + %b) + %res = zext %load to + ret %res +} + +define @gldff1b_s_sxtw( %pg, i8* %base, %b) { +; CHECK-LABEL: gldff1b_s_sxtw: +; CHECK: ldff1b { z0.s }, p0/z, [x0, z0.s, sxtw] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.sxtw.nxv4i8( %pg, + i8* %base, + %b) + %res = zext %load to + ret %res +} + +define @gldff1b_d_uxtw( %pg, i8* %base, %b) { +; CHECK-LABEL: gldff1b_d_uxtw: +; CHECK: ldff1b { z0.d }, p0/z, [x0, z0.d, uxtw] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.uxtw.nxv2i8( %pg, + i8* %base, + %b) + %res = zext %load to + ret %res +} + +define @gldff1b_d_sxtw( %pg, i8* %base, %b) { +; CHECK-LABEL: gldff1b_d_sxtw: +; CHECK: ldff1b { z0.d }, p0/z, [x0, z0.d, sxtw] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.sxtw.nxv2i8( %pg, + i8* %base, + %b) + %res = zext %load to + ret %res +} + +; LDFF1H +define @gldff1h_s_uxtw( %pg, i16* %base, %b) { +; CHECK-LABEL: gldff1h_s_uxtw: +; CHECK: ldff1h { z0.s }, p0/z, [x0, z0.s, uxtw] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.uxtw.nxv4i16( %pg, + i16* %base, + %b) + %res = zext %load to + ret %res +} + +define @gldff1h_s_sxtw( %pg, i16* %base, %b) { +; CHECK-LABEL: gldff1h_s_sxtw: +; CHECK: ldff1h { z0.s }, p0/z, [x0, z0.s, sxtw] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.sxtw.nxv4i16( %pg, + i16* %base, + %b) + %res = zext %load to + ret %res +} + +define @gldff1h_d_uxtw( %pg, i16* %base, %b) { +; CHECK-LABEL: gldff1h_d_uxtw: +; CHECK: ldff1h { z0.d }, p0/z, [x0, z0.d, uxtw] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.uxtw.nxv2i16( %pg, + i16* %base, + %b) + %res = zext %load to + ret %res +} + +define @gldff1h_d_sxtw( %pg, i16* %base, %b) { +; CHECK-LABEL: gldff1h_d_sxtw: +; CHECK: ldff1h { z0.d }, p0/z, [x0, z0.d, sxtw] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.sxtw.nxv2i16( %pg, + i16* %base, + %b) + %res = zext %load to + ret %res +} + +; LDFF1W +define @gldff1w_s_uxtw( %pg, i32* %base, %b) { +; CHECK-LABEL: gldff1w_s_uxtw: +; CHECK: ldff1w { z0.s }, p0/z, [x0, z0.s, uxtw] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.uxtw.nxv4i32( %pg, + i32* %base, + %b) + ret %load +} + +define @gldff1w_s_sxtw( %pg, i32* %base, %b) { +; CHECK-LABEL: gldff1w_s_sxtw: +; CHECK: ldff1w { z0.s }, p0/z, [x0, z0.s, sxtw] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.sxtw.nxv4i32( %pg, + i32* %base, + %b) + ret %load +} + +define @gldff1w_d_uxtw( %pg, i32* %base, %b) { +; CHECK-LABEL: gldff1w_d_uxtw: +; CHECK: ldff1w { z0.d }, p0/z, [x0, z0.d, uxtw] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.uxtw.nxv2i32( %pg, + i32* %base, + %b) + %res = zext %load to + ret %res +} + +define 
@gldff1w_d_sxtw( %pg, i32* %base, %b) { +; CHECK-LABEL: gldff1w_d_sxtw: +; CHECK: ldff1w { z0.d }, p0/z, [x0, z0.d, sxtw] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.sxtw.nxv2i32( %pg, + i32* %base, + %b) + %res = zext %load to + ret %res +} + +define @gldff1w_s_uxtw_float( %pg, float* %base, %b) { +; CHECK-LABEL: gldff1w_s_uxtw_float: +; CHECK: ldff1w { z0.s }, p0/z, [x0, z0.s, uxtw] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.uxtw.nxv4f32( %pg, + float* %base, + %b) + ret %load +} + +define @gldff1w_s_sxtw_float( %pg, float* %base, %b) { +; CHECK-LABEL: gldff1w_s_sxtw_float: +; CHECK: ldff1w { z0.s }, p0/z, [x0, z0.s, sxtw] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.sxtw.nxv4f32( %pg, + float* %base, + %b) + ret %load +} + +; LDFF1D +define @gldff1d_d_uxtw( %pg, i64* %base, %b) { +; CHECK-LABEL: gldff1d_d_uxtw: +; CHECK: ldff1d { z0.d }, p0/z, [x0, z0.d, uxtw] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.uxtw.nxv2i64( %pg, + i64* %base, + %b) + ret %load +} + +define @gldff1d_d_sxtw( %pg, i64* %base, %b) { +; CHECK-LABEL: gldff1d_d_sxtw: +; CHECK: ldff1d { z0.d }, p0/z, [x0, z0.d, sxtw] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.sxtw.nxv2i64( %pg, + i64* %base, + %b) + ret %load +} + +define @gldff1d_d_uxtw_double( %pg, double* %base, %b) { +; CHECK-LABEL: gldff1d_d_uxtw_double: +; CHECK: ldff1d { z0.d }, p0/z, [x0, z0.d, uxtw] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.uxtw.nxv2f64( %pg, + double* %base, + %b) + ret %load +} + +define @gldff1d_d_sxtw_double( %pg, double* %base, %b) { +; CHECK-LABEL: gldff1d_d_sxtw_double: +; CHECK: ldff1d { z0.d }, p0/z, [x0, z0.d, sxtw] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.sxtw.nxv2f64( %pg, + double* %base, + %b) + ret %load +} + +; +; LDFF1SB, LDFF1SW, LDFF1SH: base + 32-bit unscaled offset, sign (sxtw) or zero +; (uxtw) extended to 64 bits. +; e.g. 
ldff1sh { z0.d }, p0/z, [x0, z0.d, uxtw] +; + +; LDFF1SB +define @gldff1sb_s_uxtw( %pg, i8* %base, %b) { +; CHECK-LABEL: gldff1sb_s_uxtw: +; CHECK: ldff1sb { z0.s }, p0/z, [x0, z0.s, uxtw] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.uxtw.nxv4i8( %pg, + i8* %base, + %b) + %res = sext %load to + ret %res +} + +define @gldff1sb_s_sxtw( %pg, i8* %base, %b) { +; CHECK-LABEL: gldff1sb_s_sxtw: +; CHECK: ldff1sb { z0.s }, p0/z, [x0, z0.s, sxtw] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.sxtw.nxv4i8( %pg, + i8* %base, + %b) + %res = sext %load to + ret %res +} + +define @gldff1sb_d_uxtw( %pg, i8* %base, %b) { +; CHECK-LABEL: gldff1sb_d_uxtw: +; CHECK: ldff1sb { z0.d }, p0/z, [x0, z0.d, uxtw] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.uxtw.nxv2i8( %pg, + i8* %base, + %b) + %res = sext %load to + ret %res +} + +define @gldff1sb_d_sxtw( %pg, i8* %base, %b) { +; CHECK-LABEL: gldff1sb_d_sxtw: +; CHECK: ldff1sb { z0.d }, p0/z, [x0, z0.d, sxtw] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.sxtw.nxv2i8( %pg, + i8* %base, + %b) + %res = sext %load to + ret %res +} + +; LDFF1SH +define @gldff1sh_s_uxtw( %pg, i16* %base, %b) { +; CHECK-LABEL: gldff1sh_s_uxtw: +; CHECK: ldff1sh { z0.s }, p0/z, [x0, z0.s, uxtw] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.uxtw.nxv4i16( %pg, + i16* %base, + %b) + %res = sext %load to + ret %res +} + +define @gldff1sh_s_sxtw( %pg, i16* %base, %b) { +; CHECK-LABEL: gldff1sh_s_sxtw: +; CHECK: ldff1sh { z0.s }, p0/z, [x0, z0.s, sxtw] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.sxtw.nxv4i16( %pg, + i16* %base, + %b) + %res = sext %load to + ret %res +} + +define @gldff1sh_d_uxtw( %pg, i16* %base, %b) { +; CHECK-LABEL: gldff1sh_d_uxtw: +; CHECK: ldff1sh { z0.d }, p0/z, [x0, z0.d, uxtw] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.uxtw.nxv2i16( %pg, + i16* %base, + %b) + %res = sext %load to + ret %res +} + +define @gldff1sh_d_sxtw( %pg, i16* %base, %b) { +; CHECK-LABEL: gldff1sh_d_sxtw: +; CHECK: ldff1sh { z0.d }, p0/z, [x0, z0.d, sxtw] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.sxtw.nxv2i16( %pg, + i16* %base, + %b) + %res = sext %load to + ret %res +} + +; LDFF1SW +define @gldff1sw_d_uxtw( %pg, i32* %base, %b) { +; CHECK-LABEL: gldff1sw_d_uxtw: +; CHECK: ldff1sw { z0.d }, p0/z, [x0, z0.d, uxtw] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.uxtw.nxv2i32( %pg, + i32* %base, + %b) + %res = sext %load to + ret %res +} + +define @gldff1sw_d_sxtw( %pg, i32* %base, %b) { +; CHECK-LABEL: gldff1sw_d_sxtw: +; CHECK: ldff1sw { z0.d }, p0/z, [x0, z0.d, sxtw] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.sxtw.nxv2i32( %pg, + i32* %base, + %b) + %res = sext %load to + ret %res +} + +; LDFF1B/LDFF1SB +declare @llvm.aarch64.sve.ldff1.gather.uxtw.nxv4i8(, i8*, ) +declare @llvm.aarch64.sve.ldff1.gather.uxtw.nxv2i8(, i8*, ) +declare @llvm.aarch64.sve.ldff1.gather.sxtw.nxv4i8(, i8*, ) +declare @llvm.aarch64.sve.ldff1.gather.sxtw.nxv2i8(, i8*, ) + +; LDFF1H/LDFF1SH +declare @llvm.aarch64.sve.ldff1.gather.sxtw.nxv4i16(, i16*, ) +declare @llvm.aarch64.sve.ldff1.gather.sxtw.nxv2i16(, i16*, ) +declare @llvm.aarch64.sve.ldff1.gather.uxtw.nxv4i16(, i16*, ) +declare @llvm.aarch64.sve.ldff1.gather.uxtw.nxv2i16(, i16*, ) + +; LDFF1W/LDFF1SW +declare @llvm.aarch64.sve.ldff1.gather.sxtw.nxv4i32(, i32*, ) +declare @llvm.aarch64.sve.ldff1.gather.sxtw.nxv2i32(, i32*, ) +declare 
@llvm.aarch64.sve.ldff1.gather.uxtw.nxv4i32(, i32*, ) +declare @llvm.aarch64.sve.ldff1.gather.uxtw.nxv2i32(, i32*, ) + +declare @llvm.aarch64.sve.ldff1.gather.sxtw.nxv4f32(, float*, ) +declare @llvm.aarch64.sve.ldff1.gather.uxtw.nxv4f32(, float*, ) + +; LDFF1D +declare @llvm.aarch64.sve.ldff1.gather.sxtw.nxv2i64(, i64*, ) +declare @llvm.aarch64.sve.ldff1.gather.uxtw.nxv2i64(, i64*, ) + +declare @llvm.aarch64.sve.ldff1.gather.sxtw.nxv2f64(, double*, ) +declare @llvm.aarch64.sve.ldff1.gather.uxtw.nxv2f64(, double*, ) diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-ff-gather-loads-64bit-scaled-offset.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-ff-gather-loads-64bit-scaled-offset.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-ff-gather-loads-64bit-scaled-offset.ll @@ -0,0 +1,80 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s + +; +; LDFF1H, LDFF1W, LDFF1D: base + 64-bit scaled offset +; e.g. ldff1h z0.d, p0/z, [x0, z0.d, lsl #1] +; + +define @gldff1h_index( %pg, i16* %base, %b) { +; CHECK-LABEL: gldff1h_index +; CHECK: ldff1h { z0.d }, p0/z, [x0, z0.d, lsl #1] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.index.nxv2i16( %pg, + i16* %base, + %b) + %res = zext %load to + ret %res +} + +define @gldff1w_index( %pg, i32* %base, %b) { +; CHECK-LABEL: gldff1w_index +; CHECK: ldff1w { z0.d }, p0/z, [x0, z0.d, lsl #2] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.index.nxv2i32( %pg, + i32* %base, + %b) + %res = zext %load to + ret %res +} + +define @gldff1d_index( %pg, i64* %base, %b) { +; CHECK-LABEL: gldff1d_index +; CHECK: ldff1d { z0.d }, p0/z, [x0, z0.d, lsl #3] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.index.nxv2i64( %pg, + i64* %base, + %b) + ret %load +} + +define @gldff1d_index_double( %pg, double* %base, %b) { +; CHECK-LABEL: gldff1d_index_double +; CHECK: ldff1d { z0.d }, p0/z, [x0, z0.d, lsl #3] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.index.nxv2f64( %pg, + double* %base, + %b) + ret %load +} + +; +; LDFF1SH, LDFF1SW: base + 64-bit scaled offset +; e.g. ldff1sh z0.d, p0/z, [x0, z0.d, lsl #1] +; + +define @gldff1sh_index( %pg, i16* %base, %b) { +; CHECK-LABEL: gldff1sh_index +; CHECK: ldff1sh { z0.d }, p0/z, [x0, z0.d, lsl #1] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.index.nxv2i16( %pg, + i16* %base, + %b) + %res = sext %load to + ret %res +} + +define @gldff1sw_index( %pg, i32* %base, %b) { +; CHECK-LABEL: gldff1sw_index +; CHECK: ldff1sw { z0.d }, p0/z, [x0, z0.d, lsl #2] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.index.nxv2i32( %pg, + i32* %base, + %b) + %res = sext %load to + ret %res +} + +declare @llvm.aarch64.sve.ldff1.gather.index.nxv2i16(, i16*, ) +declare @llvm.aarch64.sve.ldff1.gather.index.nxv2i32(, i32*, ) +declare @llvm.aarch64.sve.ldff1.gather.index.nxv2i64(, i64*, ) +declare @llvm.aarch64.sve.ldff1.gather.index.nxv2f64(, double*, ) diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-ff-gather-loads-64bit-unscaled-offset.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-ff-gather-loads-64bit-unscaled-offset.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-ff-gather-loads-64bit-unscaled-offset.ll @@ -0,0 +1,103 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s + +; +; LDFF1B, LDFF1W, LDFF1H, LDFF1D: base + 64-bit unscaled offset +; e.g. 
ldff1h { z0.d }, p0/z, [x0, z0.d] +; + +define @gldff1b_d( %pg, i8* %base, %b) { +; CHECK-LABEL: gldff1b_d: +; CHECK: ldff1b { z0.d }, p0/z, [x0, z0.d] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.nxv2i8( %pg, + i8* %base, + %b) + %res = zext %load to + ret %res +} + +define @gldff1h_d( %pg, i16* %base, %b) { +; CHECK-LABEL: gldff1h_d: +; CHECK: ldff1h { z0.d }, p0/z, [x0, z0.d] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.nxv2i16( %pg, + i16* %base, + %b) + %res = zext %load to + ret %res +} + +define @gldff1w_d( %pg, i32* %base, %offsets) { +; CHECK-LABEL: gldff1w_d: +; CHECK: ldff1w { z0.d }, p0/z, [x0, z0.d] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.nxv2i32( %pg, + i32* %base, + %offsets) + %res = zext %load to + ret %res +} + +define @gldff1d_d( %pg, i64* %base, %b) { +; CHECK-LABEL: gldff1d_d: +; CHECK: ldff1d { z0.d }, p0/z, [x0, z0.d] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.nxv2i64( %pg, + i64* %base, + %b) + ret %load +} + +define @gldff1d_d_double( %pg, double* %base, %b) { +; CHECK-LABEL: gldff1d_d_double: +; CHECK: ldff1d { z0.d }, p0/z, [x0, z0.d] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.nxv2f64( %pg, + double* %base, + %b) + ret %load +} + +; +; LDFF1SB, LDFF1SW, LDFF1SH: base + 64-bit unscaled offset +; e.g. ldff1sh { z0.d }, p0/z, [x0, z0.d] +; + +define @gldff1sb_d( %pg, i8* %base, %b) { +; CHECK-LABEL: gldff1sb_d: +; CHECK: ldff1sb { z0.d }, p0/z, [x0, z0.d] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.nxv2i8( %pg, + i8* %base, + %b) + %res = sext %load to + ret %res +} + +define @gldff1sh_d( %pg, i16* %base, %b) { +; CHECK-LABEL: gldff1sh_d: +; CHECK: ldff1sh { z0.d }, p0/z, [x0, z0.d] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.nxv2i16( %pg, + i16* %base, + %b) + %res = sext %load to + ret %res +} + +define @gldff1sw_d( %pg, i32* %base, %offsets) { +; CHECK-LABEL: gldff1sw_d: +; CHECK: ldff1sw { z0.d }, p0/z, [x0, z0.d] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.nxv2i32( %pg, + i32* %base, + %offsets) + %res = sext %load to + ret %res +} + +declare @llvm.aarch64.sve.ldff1.gather.nxv2i8(, i8*, ) +declare @llvm.aarch64.sve.ldff1.gather.nxv2i16(, i16*, ) +declare @llvm.aarch64.sve.ldff1.gather.nxv2i32(, i32*, ) +declare @llvm.aarch64.sve.ldff1.gather.nxv2i64(, i64*, ) +declare @llvm.aarch64.sve.ldff1.gather.nxv2f64(, double*, ) diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-ff-gather-loads-vector-base-imm-offset.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-ff-gather-loads-vector-base-imm-offset.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-ff-gather-loads-vector-base-imm-offset.ll @@ -0,0 +1,368 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s + +; +; LDFF1B, LDFF1W, LDFF1H, LDFF1D: vector base + immediate offset (index) +; e.g. 
ldff1h { z0.s }, p0/z, [z0.s, #16] +; + +; LDFF1B +define @gldff1b_s_imm_offset( %pg, %base) { +; CHECK-LABEL: gldff1b_s_imm_offset: +; CHECK: ldff1b { z0.s }, p0/z, [z0.s, #16] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv4i8.nxv4i32( %pg, + %base, + i64 16) + %res = zext %load to + ret %res +} + +define @gldff1b_d_imm_offset( %pg, %base) { +; CHECK-LABEL: gldff1b_d_imm_offset: +; CHECK: ldff1b { z0.d }, p0/z, [z0.d, #16] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i8.nxv2i64( %pg, + %base, + i64 16) + %res = zext %load to + ret %res +} + +; LDFF1H +define @gldff1h_s_imm_offset( %pg, %base) { +; CHECK-LABEL: gldff1h_s_imm_offset: +; CHECK: ldff1h { z0.s }, p0/z, [z0.s, #16] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv4i16.nxv4i32( %pg, + %base, + i64 16) + %res = zext %load to + ret %res +} + +define @gldff1h_d_imm_offset( %pg, %base) { +; CHECK-LABEL: gldff1h_d_imm_offset: +; CHECK: ldff1h { z0.d }, p0/z, [z0.d, #16] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i16.nxv2i64( %pg, + %base, + i64 16) + %res = zext %load to + ret %res +} + +; LDFF1W +define @gldff1w_s_imm_offset( %pg, %base) { +; CHECK-LABEL: gldff1w_s_imm_offset: +; CHECK: ldff1w { z0.s }, p0/z, [z0.s, #16] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv4i32.nxv4i32( %pg, + %base, + i64 16) + ret %load +} + +define @gldff1w_d_imm_offset( %pg, %base) { +; CHECK-LABEL: gldff1w_d_imm_offset: +; CHECK: ldff1w { z0.d }, p0/z, [z0.d, #16] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i32.nxv2i64( %pg, + %base, + i64 16) + %res = zext %load to + ret %res +} + +define @gldff1w_s_imm_offset_float( %pg, %base) { +; CHECK-LABEL: gldff1w_s_imm_offset_float: +; CHECK: ldff1w { z0.s }, p0/z, [z0.s, #16] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv4f32.nxv4i32( %pg, + %base, + i64 16) + ret %load +} + +; LDFF1D +define @gldff1d_d_imm_offset( %pg, %base) { +; CHECK-LABEL: gldff1d_d_imm_offset: +; CHECK: ldff1d { z0.d }, p0/z, [z0.d, #16] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i64.nxv2i64( %pg, + %base, + i64 16) + ret %load +} + +define @gldff1d_d_imm_offset_double( %pg, %base) { +; CHECK-LABEL: gldff1d_d_imm_offset_double: +; CHECK: ldff1d { z0.d }, p0/z, [z0.d, #16] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2f64.nxv2i64( %pg, + %base, + i64 16) + ret %load +} + +; +; LDFF1SB, LDFF1SW, LDFF1SH: vector base + immediate offset (index) +; e.g. 
ldff1sh { z0.s }, p0/z, [z0.s, #16] +; + +; LDFF1SB +define @gldff1sb_s_imm_offset( %pg, %base) { +; CHECK-LABEL: gldff1sb_s_imm_offset: +; CHECK: ldff1sb { z0.s }, p0/z, [z0.s, #16] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv4i8.nxv4i32( %pg, + %base, + i64 16) + %res = sext %load to + ret %res +} + +define @gldff1sb_d_imm_offset( %pg, %base) { +; CHECK-LABEL: gldff1sb_d_imm_offset: +; CHECK: ldff1sb { z0.d }, p0/z, [z0.d, #16] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i8.nxv2i64( %pg, + %base, + i64 16) + %res = sext %load to + ret %res +} + +; LDFF1SH +define @gldff1sh_s_imm_offset( %pg, %base) { +; CHECK-LABEL: gldff1sh_s_imm_offset: +; CHECK: ldff1sh { z0.s }, p0/z, [z0.s, #16] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv4i16.nxv4i32( %pg, + %base, + i64 16) + %res = sext %load to + ret %res +} + +define @gldff1sh_d_imm_offset( %pg, %base) { +; CHECK-LABEL: gldff1sh_d_imm_offset: +; CHECK: ldff1sh { z0.d }, p0/z, [z0.d, #16] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i16.nxv2i64( %pg, + %base, + i64 16) + %res = sext %load to + ret %res +} + +; LDFF1SW +define @gldff1sw_d_imm_offset( %pg, %base) { +; CHECK-LABEL: gldff1sw_d_imm_offset: +; CHECK: ldff1sw { z0.d }, p0/z, [z0.d, #16] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i32.nxv2i64( %pg, + %base, + i64 16) + %res = sext %load to + ret %res +} + +; +; LDFF1B, LDFF1W, LDFF1H, LDFF1D: vector base + out of range immediate offset +; e.g. ldff1b { z0.d }, p0/z, [x0, z0.d] +; + +; LDFF1B +define @gldff1b_s_imm_offset_out_of_range( %pg, %base) { +; CHECK-LABEL: gldff1b_s_imm_offset_out_of_range: +; CHECK: mov w8, #32 +; CHECK-NEXT: ldff1b { z0.s }, p0/z, [x8, z0.s, uxtw] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv4i8.nxv4i32( %pg, + %base, + i64 32) + %res = zext %load to + ret %res +} + +define @gldff1b_d_imm_offset_out_of_range( %pg, %base) { +; CHECK-LABEL: gldff1b_d_imm_offset_out_of_range: +; CHECK: mov w8, #32 +; CHECK-NEXT: ldff1b { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i8.nxv2i64( %pg, + %base, + i64 32) + %res = zext %load to + ret %res +} + +; LDFF1H +define @gldff1h_s_imm_offset_out_of_range( %pg, %base) { +; CHECK-LABEL: gldff1h_s_imm_offset_out_of_range: +; CHECK: mov w8, #63 +; CHECK-NEXT: ldff1h { z0.s }, p0/z, [x8, z0.s, uxtw] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv4i16.nxv4i32( %pg, + %base, + i64 63) + %res = zext %load to + ret %res +} + +define @gldff1h_d_imm_offset_out_of_range( %pg, %base) { +; CHECK-LABEL: gldff1h_d_imm_offset_out_of_range: +; CHECK: mov w8, #63 +; CHECK-NEXT: ldff1h { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i16.nxv2i64( %pg, + %base, + i64 63) + %res = zext %load to + ret %res +} + +; LDFF1W +define @gldff1w_s_imm_offset_out_of_range( %pg, %base) { +; CHECK-LABEL: gldff1w_s_imm_offset_out_of_range: +; CHECK: mov w8, #125 +; CHECK-NEXT: ldff1w { z0.s }, p0/z, [x8, z0.s, uxtw] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv4i32.nxv4i32( %pg, + %base, + i64 125) + ret %load +} + +define @gldff1w_d_imm_offset_out_of_range( %pg, %base) { +; CHECK-LABEL: gldff1w_d_imm_offset_out_of_range: +; CHECK: mov w8, #125 +; CHECK-NEXT: ldff1w { z0.d }, p0/z, [x8, z0.d] 
+; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i32.nxv2i64( %pg, + %base, + i64 125) + %res = zext %load to + ret %res +} + +define @gldff1w_s_imm_offset_out_of_range_float( %pg, %base) { +; CHECK-LABEL: gldff1w_s_imm_offset_out_of_range_float: +; CHECK: mov w8, #125 +; CHECK-NEXT: ldff1w { z0.s }, p0/z, [x8, z0.s, uxtw] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv4f32.nxv4i32( %pg, + %base, + i64 125) + ret %load +} + +; LDFF1D +define @gldff1d_d_imm_offset_out_of_range( %pg, %base) { +; CHECK-LABEL: gldff1d_d_imm_offset_out_of_range: +; CHECK: mov w8, #249 +; CHECK-NEXT: ldff1d { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i64.nxv2i64( %pg, + %base, + i64 249) + ret %load +} + +define @gldff1d_d_imm_offset_out_of_range_double( %pg, %base) { +; CHECK-LABEL: gldff1d_d_imm_offset_out_of_range_double: +; CHECK: mov w8, #249 +; CHECK-NEXT: ldff1d { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2f64.nxv2i64( %pg, + %base, + i64 249) + ret %load +} + +; +; LDFF1SB, LDFF1SW, LDFF1SH: vector base + out of range immediate offset +; e.g. ldff1sb { z0.s }, p0/z, [x8, z0.s, uxtw] +; + +; LDFF1SB +define @gldff1sb_s_imm_offset_out_of_range( %pg, %base) { +; CHECK-LABEL: gldff1sb_s_imm_offset_out_of_range: +; CHECK: mov w8, #32 +; CHECK-NEXT: ldff1sb { z0.s }, p0/z, [x8, z0.s, uxtw] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv4i8.nxv4i32( %pg, + %base, + i64 32) + %res = sext %load to + ret %res +} + +define @gldff1sb_d_imm_offset_out_of_range( %pg, %base) { +; CHECK-LABEL: gldff1sb_d_imm_offset_out_of_range: +; CHECK: mov w8, #32 +; CHECK-NEXT: ldff1sb { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i8.nxv2i64( %pg, + %base, + i64 32) + %res = sext %load to + ret %res +} + +; LDFF1SH +define @gldff1sh_s_imm_offset_out_of_range( %pg, %base) { +; CHECK-LABEL: gldff1sh_s_imm_offset_out_of_range: +; CHECK: mov w8, #63 +; CHECK-NEXT: ldff1sh { z0.s }, p0/z, [x8, z0.s, uxtw] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv4i16.nxv4i32( %pg, + %base, + i64 63) + %res = sext %load to + ret %res +} + +define @gldff1sh_d_imm_offset_out_of_range( %pg, %base) { +; CHECK-LABEL: gldff1sh_d_imm_offset_out_of_range: +; CHECK: mov w8, #63 +; CHECK-NEXT: ldff1sh { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i16.nxv2i64( %pg, + %base, + i64 63) + %res = sext %load to + ret %res +} + +; LDFF1SW +define @gldff1sw_d_imm_offset_out_of_range( %pg, %base) { +; CHECK-LABEL: gldff1sw_d_imm_offset_out_of_range: +; CHECK: mov w8, #125 +; CHECK-NEXT: ldff1sw { z0.d }, p0/z, [x8, z0.d] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i32.nxv2i64( %pg, + %base, + i64 125) + %res = sext %load to + ret %res +} + +; LDFF1B/LDFF1SB +declare @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv4i8.nxv4i32(, , i64) +declare @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i8.nxv2i64(, , i64) + +; LDFF1H/LDFF1SH +declare @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv4i16.nxv4i32(, , i64) +declare @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i16.nxv2i64(, , i64) + +; LDFF1W/LDFF1SW +declare @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv4i32.nxv4i32(, , i64) +declare 
@llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i32.nxv2i64(, , i64) + +declare @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv4f32.nxv4i32(, , i64) + +; LDFF1D +declare @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i64.nxv2i64(, , i64) + +declare @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2f64.nxv2i64(, , i64) diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-ff-gather-loads-vector-base-scalar-offset.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-ff-gather-loads-vector-base-scalar-offset.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-ff-gather-loads-vector-base-scalar-offset.ll @@ -0,0 +1,186 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s + +; +; LDFF1B, LDFF1W, LDFF1H, LDFF1D: vector base + scalar offset (index) +; e.g. ldff1b { z0.d }, p0/z, [x0, z0.d] +; + +; LDFF1B +define @gldff1b_s_scalar_offset( %pg, %base, i64 %offset) { +; CHECK-LABEL: gldff1b_s_scalar_offset: +; CHECK: ldff1b { z0.s }, p0/z, [x0, z0.s, uxtw] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv4i8.nxv4i32( %pg, + %base, + i64 %offset) + %res = zext %load to + ret %res +} + +define @gldff1b_d_scalar_offset( %pg, %base, i64 %offset) { +; CHECK-LABEL: gldff1b_d_scalar_offset: +; CHECK: ldff1b { z0.d }, p0/z, [x0, z0.d] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i8.nxv2i64( %pg, + %base, + i64 %offset) + %res = zext %load to + ret %res +} + +; LDFF1H +define @gldff1h_s_scalar_offset( %pg, %base, i64 %offset) { +; CHECK-LABEL: gldff1h_s_scalar_offset: +; CHECK: ldff1h { z0.s }, p0/z, [x0, z0.s, uxtw] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv4i16.nxv4i32( %pg, + %base, + i64 %offset) + %res = zext %load to + ret %res +} + +define @gldff1h_d_scalar_offset( %pg, %base, i64 %offset) { +; CHECK-LABEL: gldff1h_d_scalar_offset: +; CHECK: ldff1h { z0.d }, p0/z, [x0, z0.d] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i16.nxv2i64( %pg, + %base, + i64 %offset) + %res = zext %load to + ret %res +} + +; LDFF1W +define @gldff1w_s_scalar_offset( %pg, %base, i64 %offset) { +; CHECK-LABEL: gldff1w_s_scalar_offset: +; CHECK: ldff1w { z0.s }, p0/z, [x0, z0.s, uxtw] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv4i32.nxv4i32( %pg, + %base, + i64 %offset) + ret %load +} + +define @gldff1w_d_scalar_offset( %pg, %base, i64 %offset) { +; CHECK-LABEL: gldff1w_d_scalar_offset: +; CHECK: ldff1w { z0.d }, p0/z, [x0, z0.d] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i32.nxv2i64( %pg, + %base, + i64 %offset) + %res = zext %load to + ret %res +} + +define @gldff1w_s_scalar_offset_float( %pg, %base, i64 %offset) { +; CHECK-LABEL: gldff1w_s_scalar_offset_float: +; CHECK: ldff1w { z0.s }, p0/z, [x0, z0.s, uxtw] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv4f32.nxv4i32( %pg, + %base, + i64 %offset) + ret %load +} + +; LDFF1D +define @gldff1d_d_scalar_offset( %pg, %base, i64 %offset) { +; CHECK-LABEL: gldff1d_d_scalar_offset: +; CHECK: ldff1d { z0.d }, p0/z, [x0, z0.d] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i64.nxv2i64( %pg, + %base, + i64 %offset) + ret %load +} + +define @gldff1d_d_scalar_offset_double( %pg, %base, i64 %offset) { +; CHECK-LABEL: gldff1d_d_scalar_offset_double: +; CHECK: ldff1d { z0.d }, p0/z, [x0, z0.d] +; CHECK-NEXT: ret + %load = call 
<vscale x 2 x double> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2f64.nxv2i64(<vscale x 2 x i1> %pg,
+                                                                                   <vscale x 2 x i64> %base,
+                                                                                   i64 %offset)
+  ret <vscale x 2 x double> %load
+}
+
+; LDFF1SB, LDFF1SW, LDFF1SH: vector base + scalar offset (index)
+; e.g. ldff1sb { z0.d }, p0/z, [x0, z0.d]
+;
+
+; LDFF1SB
+define <vscale x 4 x i32> @gldff1sb_s_scalar_offset(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset) {
+; CHECK-LABEL: gldff1sb_s_scalar_offset:
+; CHECK: ldff1sb { z0.s }, p0/z, [x0, z0.s, uxtw]
+; CHECK-NEXT: ret
+  %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv4i8.nxv4i32(<vscale x 4 x i1> %pg,
+                                                                                              <vscale x 4 x i32> %base,
+                                                                                              i64 %offset)
+  %res = sext <vscale x 4 x i8> %load to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 2 x i64> @gldff1sb_d_scalar_offset(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: gldff1sb_d_scalar_offset:
+; CHECK: ldff1sb { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT: ret
+  %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i8.nxv2i64(<vscale x 2 x i1> %pg,
+                                                                                              <vscale x 2 x i64> %base,
+                                                                                              i64 %offset)
+  %res = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+; LDFF1SH
+define <vscale x 4 x i32> @gldff1sh_s_scalar_offset(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset) {
+; CHECK-LABEL: gldff1sh_s_scalar_offset:
+; CHECK: ldff1sh { z0.s }, p0/z, [x0, z0.s, uxtw]
+; CHECK-NEXT: ret
+  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv4i16.nxv4i32(<vscale x 4 x i1> %pg,
+                                                                                                <vscale x 4 x i32> %base,
+                                                                                                i64 %offset)
+  %res = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 2 x i64> @gldff1sh_d_scalar_offset(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: gldff1sh_d_scalar_offset:
+; CHECK: ldff1sh { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT: ret
+  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i16.nxv2i64(<vscale x 2 x i1> %pg,
+                                                                                                <vscale x 2 x i64> %base,
+                                                                                                i64 %offset)
+  %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+; LDFF1SW
+define <vscale x 2 x i64> @gldff1sw_d_scalar_offset(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: gldff1sw_d_scalar_offset:
+; CHECK: ldff1sw { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT: ret
+  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i32.nxv2i64(<vscale x 2 x i1> %pg,
+                                                                                                <vscale x 2 x i64> %base,
+                                                                                                i64 %offset)
+  %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+; LDFF1B/LDFF1SB
+declare <vscale x 4 x i8> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv4i8.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, i64)
+declare <vscale x 2 x i8> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i8.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+
+; LDFF1H/LDFF1SH
+declare <vscale x 4 x i16> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv4i16.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, i64)
+declare <vscale x 2 x i16> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i16.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+
+; LDFF1W/LDFF1SW
+declare <vscale x 4 x i32> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv4i32.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, i64)
+declare <vscale x 2 x i32> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i32.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+
+declare <vscale x 4 x float> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv4f32.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, i64)
+
+; LDFF1D
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i64.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+
+declare <vscale x 2 x double> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2f64.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, i64)
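
For reference, below is a minimal, self-contained sketch (illustrative only, not part of the patch) showing how two of the new first-faulting gather intrinsics are called with their scalable-vector types written out in full. It assumes the same intrinsic signatures as the declarations in the tests above; the function names and the constant offset 16 are made up for the example.

; Hypothetical example, not taken from the patch: the same ldff1 gather
; intrinsic serves both instruction forms, and the extension applied to the
; narrow load result picks the unsigned (ldff1*) or signed (ldff1s*) variant.
define <vscale x 2 x i64> @example_ldff1b_imm(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %bases) {
  ; Vector base + in-range immediate offset; expected to select
  ; ldff1b { z0.d }, p0/z, [z0.d, #16] (zext of the i8 load -> unsigned LDFF1B).
  %ld = call <vscale x 2 x i8> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i8.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %bases, i64 16)
  %res = zext <vscale x 2 x i8> %ld to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}

define <vscale x 4 x i32> @example_ldff1sh_sxtw(<vscale x 4 x i1> %pg, i16* %base, <vscale x 4 x i32> %offsets) {
  ; Scalar base + sign-extended 32-bit offsets; expected to select
  ; ldff1sh { z0.s }, p0/z, [x0, z0.s, sxtw] (sext of the i16 load -> signed LDFF1SH).
  %ld = call <vscale x 4 x i16> @llvm.aarch64.sve.ldff1.gather.sxtw.nxv4i16(<vscale x 4 x i1> %pg, i16* %base, <vscale x 4 x i32> %offsets)
  %res = sext <vscale x 4 x i16> %ld to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %res
}

declare <vscale x 2 x i8> @llvm.aarch64.sve.ldff1.gather.scalar.offset.nxv2i8.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, i64)
declare <vscale x 4 x i16> @llvm.aarch64.sve.ldff1.gather.sxtw.nxv4i16(<vscale x 4 x i1>, i16*, <vscale x 4 x i32>)

As in the tests above, there is no separate signed intrinsic: the zext or sext of the narrow gather result is what distinguishes the unsigned and signed first-faulting load instructions during selection.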