diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -1761,6 +1761,22 @@ // +// Non-temporal gather loads: scalar base + vector offsets +// + +// 64 bit unscaled offsets +def int_aarch64_sve_ldnt1_gather : AdvSIMD_GatherLoad_SV_64b_Offsets_Intrinsic; + +// 32 bit unscaled offsets, zero (uxtw) extended to 64 bits +def int_aarch64_sve_ldnt1_gather_uxtw : AdvSIMD_GatherLoad_SV_32b_Offsets_Intrinsic; + +// +// Non-temporal gather loads: vector base + scalar offset +// + +def int_aarch64_sve_ldnt1_gather_scalar_offset : AdvSIMD_GatherLoad_VS_Intrinsic; + +// // Scatter stores: scalar base + vector offsets // @@ -1792,6 +1808,22 @@ def int_aarch64_sve_st1_scatter_scalar_offset : AdvSIMD_ScatterStore_VS_Intrinsic; // +// Non-temporal scatter stores: scalar base + vector offsets +// + +// 64 bit unscaled offsets +def int_aarch64_sve_stnt1_scatter : AdvSIMD_ScatterStore_SV_64b_Offsets_Intrinsic; + +// 32 bit unscaled offsets, zero (uxtw) extended to 64 bits +def int_aarch64_sve_stnt1_scatter_uxtw : AdvSIMD_ScatterStore_SV_32b_Offsets_Intrinsic; + +// +// Non-temporal scatter stores: vector base + scalar offset +// + +def int_aarch64_sve_stnt1_scatter_scalar_offset : AdvSIMD_ScatterStore_VS_Intrinsic; + +// // SVE2 - Uniform DSP operations // diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -261,6 +261,10 @@ GLDFF1S_SXTW_SCALED, GLDFF1S_IMM, + // Non-temporal gather loads + GLDNT1, + GLDNT1S, + // Scatter store SST1, SST1_SCALED, @@ -270,6 +274,9 @@ SST1_SXTW_SCALED, SST1_IMM, + // Non-temporal scatter store + SSTNT1, + // Strict (exception-raising) floating point comparison STRICT_FCMP = ISD::FIRST_TARGET_STRICTFP_OPCODE, STRICT_FCMPE, diff --git 
a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1438,6 +1438,10 @@ case AArch64ISD::GLDFF1S_UXTW_SCALED: return "AArch64ISD::GLDFF1S_UXTW_SCALED"; case AArch64ISD::GLDFF1S_IMM: return "AArch64ISD::GLDFF1S_IMM"; + + case AArch64ISD::GLDNT1: return "AArch64ISD::GLDNT1"; + case AArch64ISD::GLDNT1S: return "AArch64ISD::GLDNT1S"; + case AArch64ISD::SST1: return "AArch64ISD::SST1"; case AArch64ISD::SST1_SCALED: return "AArch64ISD::SST1_SCALED"; case AArch64ISD::SST1_SXTW: return "AArch64ISD::SST1_SXTW"; @@ -1445,6 +1449,9 @@ case AArch64ISD::SST1_SXTW_SCALED: return "AArch64ISD::SST1_SXTW_SCALED"; case AArch64ISD::SST1_UXTW_SCALED: return "AArch64ISD::SST1_UXTW_SCALED"; case AArch64ISD::SST1_IMM: return "AArch64ISD::SST1_IMM"; + + case AArch64ISD::SSTNT1: return "AArch64ISD::SSTNT1"; + case AArch64ISD::LDP: return "AArch64ISD::LDP"; case AArch64ISD::STP: return "AArch64ISD::STP"; case AArch64ISD::STNP: return "AArch64ISD::STNP"; @@ -10457,6 +10464,7 @@ case AArch64ISD::GLDFF1_UXTW: case AArch64ISD::GLDFF1_UXTW_SCALED: case AArch64ISD::GLDFF1_IMM: + case AArch64ISD::GLDNT1: MemVT = cast(Src->getOperand(4))->getVT(); break; default: @@ -12644,6 +12652,14 @@ // vector of offsets (that fits into one register) SDValue Offset = N->getOperand(5); + // In the case of non-temporal scatter stores there's only one SVE + // instruction per data-size: "vector + scalar", i.e. + // * stnt1{b|h|w|d} { z0.s }, p0, [z0.s, x0] + // Since we do have intrinsics that allow the arguments to be in a different + // order, we may need to swap them to match the spec. 
+ if (Opcode == AArch64ISD::SSTNT1 && Offset.getValueType().isVector()) + std::swap(Base, Offset); + // SST1_IMM requires that the offset is an immediate that is: // * a multiple of #SizeInBytes, // * in the range [0, 31 x #SizeInBytes], @@ -12730,6 +12746,14 @@ // vector of offsets (that fits into one register) SDValue Offset = N->getOperand(4); + // In the case of non-temporal gather loads there's only one SVE instruction + // per data-size: "vector + scalar", i.e. + // * ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0] + // Since we do have intrinsics that allow the arguments to be in a different + // order, we may need to swap them to match the spec. + if (Opcode == AArch64ISD::GLDNT1 && Offset.getValueType().isVector()) + std::swap(Base, Offset); + // GLD{FF}1_IMM requires that the offset is an immediate that is: // * a multiple of #SizeInBytes, // * in the range [0, 31 x #SizeInBytes], @@ -12859,6 +12883,9 @@ case AArch64ISD::GLDFF1_IMM: NewOpc = AArch64ISD::GLDFF1S_IMM; break; + case AArch64ISD::GLDNT1: + NewOpc = AArch64ISD::GLDNT1S; + break; default: return SDValue(); } @@ -12972,12 +12999,24 @@ return performNEONPostLDSTCombine(N, DCI, DAG); case Intrinsic::aarch64_sve_ldnt1: return performLDNT1Combine(N, DAG); + case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset: + return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1); + case Intrinsic::aarch64_sve_ldnt1_gather: + return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1); + case Intrinsic::aarch64_sve_ldnt1_gather_uxtw: + return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1); case Intrinsic::aarch64_sve_ldnf1: return performLDNF1Combine(N, DAG, AArch64ISD::LDNF1); case Intrinsic::aarch64_sve_ldff1: return performLDNF1Combine(N, DAG, AArch64ISD::LDFF1); case Intrinsic::aarch64_sve_stnt1: return performSTNT1Combine(N, DAG); + case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset: + return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1); + case Intrinsic::aarch64_sve_stnt1_scatter_uxtw: 
+ return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1); + case Intrinsic::aarch64_sve_stnt1_scatter: + return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1); case Intrinsic::aarch64_sve_ld1_gather: return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1); case Intrinsic::aarch64_sve_ld1_gather_index: diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -69,6 +69,9 @@ def AArch64ldff1s_gather_sxtw_scaled : SDNode<"AArch64ISD::GLDFF1S_SXTW_SCALED", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; def AArch64ldff1s_gather_imm : SDNode<"AArch64ISD::GLDFF1S_IMM", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; +def AArch64ldnt1_gather : SDNode<"AArch64ISD::GLDNT1", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad]>; +def AArch64ldnt1s_gather : SDNode<"AArch64ISD::GLDNT1S", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad]>; + // Scatter stores - node definitions // def SDT_AArch64_SCATTER_SV : SDTypeProfile<0, 5, [ @@ -89,6 +92,8 @@ def AArch64st1_scatter_sxtw_scaled : SDNode<"AArch64ISD::SST1_SXTW_SCALED", SDT_AArch64_SCATTER_SV, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>; def AArch64st1_scatter_imm : SDNode<"AArch64ISD::SST1_IMM", SDT_AArch64_SCATTER_VS, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>; +def AArch64stnt1_scatter : SDNode<"AArch64ISD::SSTNT1", SDT_AArch64_SCATTER_VS, [SDNPHasChain, SDNPMayStore]>; + // AArch64 SVE/SVE2 - the remaining node definitions // @@ -1909,32 +1914,32 @@ def EXT_ZZI_B : sve2_int_perm_extract_i_cons<"ext">; // SVE2 non-temporal gather loads - defm LDNT1SB_ZZR_S : sve2_mem_gldnt_vs<0b00000, "ldnt1sb", Z_s, ZPR32>; - defm LDNT1B_ZZR_S : sve2_mem_gldnt_vs<0b00001, "ldnt1b", Z_s, ZPR32>; - defm LDNT1SH_ZZR_S : sve2_mem_gldnt_vs<0b00100, "ldnt1sh", Z_s, ZPR32>; - defm LDNT1H_ZZR_S : sve2_mem_gldnt_vs<0b00101, 
"ldnt1h", Z_s, ZPR32>; - defm LDNT1W_ZZR_S : sve2_mem_gldnt_vs<0b01001, "ldnt1w", Z_s, ZPR32>; - - defm LDNT1SB_ZZR_D : sve2_mem_gldnt_vs<0b10000, "ldnt1sb", Z_d, ZPR64>; - defm LDNT1B_ZZR_D : sve2_mem_gldnt_vs<0b10010, "ldnt1b", Z_d, ZPR64>; - defm LDNT1SH_ZZR_D : sve2_mem_gldnt_vs<0b10100, "ldnt1sh", Z_d, ZPR64>; - defm LDNT1H_ZZR_D : sve2_mem_gldnt_vs<0b10110, "ldnt1h", Z_d, ZPR64>; - defm LDNT1SW_ZZR_D : sve2_mem_gldnt_vs<0b11000, "ldnt1sw", Z_d, ZPR64>; - defm LDNT1W_ZZR_D : sve2_mem_gldnt_vs<0b11010, "ldnt1w", Z_d, ZPR64>; - defm LDNT1D_ZZR_D : sve2_mem_gldnt_vs<0b11110, "ldnt1d", Z_d, ZPR64>; + defm LDNT1SB_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b00000, "ldnt1sb", AArch64ldnt1s_gather, nxv4i8>; + defm LDNT1B_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b00001, "ldnt1b", AArch64ldnt1_gather, nxv4i8>; + defm LDNT1SH_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b00100, "ldnt1sh", AArch64ldnt1s_gather, nxv4i16>; + defm LDNT1H_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b00101, "ldnt1h", AArch64ldnt1_gather, nxv4i16>; + defm LDNT1W_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b01001, "ldnt1w", AArch64ldnt1_gather, nxv4i32>; + + defm LDNT1SB_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b10000, "ldnt1sb", AArch64ldnt1s_gather, nxv2i8>; + defm LDNT1B_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b10010, "ldnt1b", AArch64ldnt1_gather, nxv2i8>; + defm LDNT1SH_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b10100, "ldnt1sh", AArch64ldnt1s_gather, nxv2i16>; + defm LDNT1H_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b10110, "ldnt1h", AArch64ldnt1_gather, nxv2i16>; + defm LDNT1SW_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b11000, "ldnt1sw", AArch64ldnt1s_gather, nxv2i32>; + defm LDNT1W_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b11010, "ldnt1w", AArch64ldnt1_gather, nxv2i32>; + defm LDNT1D_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b11110, "ldnt1d", AArch64ldnt1_gather, nxv2i64>; // SVE2 vector splice (constructive) defm SPLICE_ZPZZ : sve2_int_perm_splice_cons<"splice">; // SVE2 non-temporal scatter stores - defm STNT1B_ZZR_S : sve2_mem_sstnt_vs<0b001, "stnt1b", Z_s, ZPR32>; - 
defm STNT1H_ZZR_S : sve2_mem_sstnt_vs<0b011, "stnt1h", Z_s, ZPR32>; - defm STNT1W_ZZR_S : sve2_mem_sstnt_vs<0b101, "stnt1w", Z_s, ZPR32>; - - defm STNT1B_ZZR_D : sve2_mem_sstnt_vs<0b000, "stnt1b", Z_d, ZPR64>; - defm STNT1H_ZZR_D : sve2_mem_sstnt_vs<0b010, "stnt1h", Z_d, ZPR64>; - defm STNT1W_ZZR_D : sve2_mem_sstnt_vs<0b100, "stnt1w", Z_d, ZPR64>; - defm STNT1D_ZZR_D : sve2_mem_sstnt_vs<0b110, "stnt1d", Z_d, ZPR64>; + defm STNT1B_ZZR_S : sve2_mem_sstnt_vs_32_ptrs<0b001, "stnt1b", AArch64stnt1_scatter, nxv4i8>; + defm STNT1H_ZZR_S : sve2_mem_sstnt_vs_32_ptrs<0b011, "stnt1h", AArch64stnt1_scatter, nxv4i16>; + defm STNT1W_ZZR_S : sve2_mem_sstnt_vs_32_ptrs<0b101, "stnt1w", AArch64stnt1_scatter, nxv4i32>; + + defm STNT1B_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b000, "stnt1b", AArch64stnt1_scatter, nxv2i8>; + defm STNT1H_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b010, "stnt1h", AArch64stnt1_scatter, nxv2i16>; + defm STNT1W_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b100, "stnt1w", AArch64stnt1_scatter, nxv2i32>; + defm STNT1D_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b110, "stnt1d", AArch64stnt1_scatter, nxv2i64>; // SVE2 table lookup (three sources) defm TBL_ZZZZ : sve2_int_perm_tbl<"tbl", int_aarch64_sve_tbl2>; diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -5071,16 +5071,36 @@ let mayStore = 1; } -multiclass sve2_mem_sstnt_vs opc, string asm, - RegisterOperand listty, ZPRRegOp zprty> { - def _REAL : sve2_mem_sstnt_vs_base; +multiclass sve2_mem_sstnt_vs_32_ptrs opc, string asm, + SDPatternOperator op, + ValueType vt> { + def _REAL : sve2_mem_sstnt_vs_base; def : InstAlias(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>; + (!cast(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, GPR64:$Rm), 0>; def : InstAlias(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 0>; + (!cast(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, 
XZR), 0>; def : InstAlias(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 1>; + (!cast(NAME # _REAL) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, XZR), 1>; + + def : Pat <(op (nxv4i32 ZPR32:$Zt), (nxv4i1 PPR3bAny:$Pg), (nxv4i32 ZPR32:$Zn), (i64 GPR64:$Rm), vt), + (!cast(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, GPR64:$Rm)>; +} + +multiclass sve2_mem_sstnt_vs_64_ptrs opc, string asm, + SDPatternOperator op, + ValueType vt> { + def _REAL : sve2_mem_sstnt_vs_base; + + def : InstAlias(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm), 0>; + def : InstAlias(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 0>; + def : InstAlias(NAME # _REAL) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 1>; + + def : Pat <(op (nxv2i64 ZPR64:$Zt), (nxv2i1 PPR3bAny:$Pg), (nxv2i64 ZPR64:$Zn), (i64 GPR64:$Rm), vt), + (!cast(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm)>; } class sve_mem_sst_sv opc, bit xs, bit scaled, string asm, @@ -6529,17 +6549,38 @@ let mayLoad = 1; } -multiclass sve2_mem_gldnt_vs opc, string asm, - RegisterOperand listty, ZPRRegOp zprty> { - def _REAL : sve2_mem_gldnt_vs_base; +multiclass sve2_mem_gldnt_vs_32_ptrs opc, string asm, + SDPatternOperator op, + ValueType vt> { + def _REAL : sve2_mem_gldnt_vs_base; + + def : InstAlias(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, GPR64:$Rm), 0>; + def : InstAlias(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, XZR), 0>; + def : InstAlias(NAME # _REAL) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, XZR), 1>; + + def : Pat <(nxv4i32 (op (nxv4i1 PPR3bAny:$Pg), (nxv4i32 ZPR32:$Zd), (i64 GPR64:$Rm), vt)), + (!cast(NAME # _REAL) PPR3bAny:$Pg, ZPR32:$Zd, GPR64:$Rm)>; +} + +multiclass sve2_mem_gldnt_vs_64_ptrs opc, string asm, + SDPatternOperator op, + ValueType vt> { + def _REAL : sve2_mem_gldnt_vs_base; def : InstAlias(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>; + (!cast(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm), 0>; def : InstAlias(NAME # _REAL) zprty:$Zt, 
PPR3bAny:$Pg, zprty:$Zn, XZR), 0>; + (!cast(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 0>; def : InstAlias(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 1>; + (!cast(NAME # _REAL) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 1>; + + def : Pat <(nxv2i64 (op (nxv2i1 PPR3bAny:$Pg), (nxv2i64 ZPR64:$Zd), (i64 GPR64:$Rm), vt)), + (!cast(NAME # _REAL) PPR3bAny:$Pg, ZPR64:$Zd, GPR64:$Rm)>; } //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-gather-loads-32bit-unscaled-offset.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-gather-loads-32bit-unscaled-offset.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-gather-loads-32bit-unscaled-offset.ll @@ -0,0 +1,96 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s + +; +; LDNT1B, LDNT1W, LDNT1H, LDNT1D: base + 32-bit unscaled offsets, zero (uxtw) +; extended to 64 bits. +; e.g. ldnt1h { z0.s }, p0/z, [z0.s, x0] +; + +; LDNT1B +define @gldnt1b_s_uxtw( %pg, i8* %base, %b) { +; CHECK-LABEL: gldnt1b_s_uxtw: +; CHECK: ldnt1b { z0.s }, p0/z, [z0.s, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.uxtw.nxv4i8( %pg, + i8* %base, + %b) + %res = zext %load to + ret %res +} + +; LDNT1H +define @gldnt1h_s_uxtw( %pg, i16* %base, %b) { +; CHECK-LABEL: gldnt1h_s_uxtw: +; CHECK: ldnt1h { z0.s }, p0/z, [z0.s, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.uxtw.nxv4i16( %pg, + i16* %base, + %b) + %res = zext %load to + ret %res +} + +; LDNT1W +define @gldnt1w_s_uxtw( %pg, i32* %base, %b) { +; CHECK-LABEL: gldnt1w_s_uxtw: +; CHECK: ldnt1w { z0.s }, p0/z, [z0.s, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.uxtw.nxv4i32( %pg, + i32* %base, + %b) + ret %load +} + +define @gldnt1w_s_uxtw_float( %pg, float* %base, %b) { +; CHECK-LABEL: gldnt1w_s_uxtw_float: +; CHECK: ldnt1w { z0.s }, p0/z, [z0.s, x0] +; CHECK-NEXT: ret 
+ %load = call @llvm.aarch64.sve.ldnt1.gather.uxtw.nxv4f32( %pg, + float* %base, + %b) + ret %load +} + +; LDNT1SB, LDNT1SW, LDNT1SH: base + 32-bit unscaled offsets, zero (uxtw) +; extended to 64 bits. +; e.g. ldnt1sh { z0.s }, p0/z, [z0.s, x0] +; + +; LDNT1SB +define @gldnt1sb_s_uxtw( %pg, i8* %base, %b) { +; CHECK-LABEL: gldnt1sb_s_uxtw: +; CHECK: ldnt1sb { z0.s }, p0/z, [z0.s, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.uxtw.nxv4i8( %pg, + i8* %base, + %b) + %res = sext %load to + ret %res +} + +; LDNT1SH +define @gldnt1sh_s_uxtw( %pg, i16* %base, %b) { +; CHECK-LABEL: gldnt1sh_s_uxtw: +; CHECK: ldnt1sh { z0.s }, p0/z, [z0.s, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.uxtw.nxv4i16( %pg, + i16* %base, + %b) + %res = sext %load to + ret %res +} + +; LDNT1B/LDNT1SB +declare @llvm.aarch64.sve.ldnt1.gather.uxtw.nxv4i8(, i8*, ) +declare @llvm.aarch64.sve.ldnt1.gather.sxtw.nxv4i8(, i8*, ) + +; LDNT1H/LDNT1SH +declare @llvm.aarch64.sve.ldnt1.gather.sxtw.nxv4i16(, i16*, ) +declare @llvm.aarch64.sve.ldnt1.gather.uxtw.nxv4i16(, i16*, ) + +; LDNT1W/LDNT1SW +declare @llvm.aarch64.sve.ldnt1.gather.sxtw.nxv4i32(, i32*, ) +declare @llvm.aarch64.sve.ldnt1.gather.uxtw.nxv4i32(, i32*, ) + +declare @llvm.aarch64.sve.ldnt1.gather.sxtw.nxv4f32(, float*, ) +declare @llvm.aarch64.sve.ldnt1.gather.uxtw.nxv4f32(, float*, ) diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-gather-loads-64bit-unscaled-offset.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-gather-loads-64bit-unscaled-offset.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-gather-loads-64bit-unscaled-offset.ll @@ -0,0 +1,103 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s + +; +; LDNT1B, LDNT1W, LDNT1H, LDNT1D: base + 64-bit unscaled offsets +; e.g. 
ldnt1h { z0.d }, p0/z, [z0.d, x0] +; + +define @gldnt1b_d( %pg, i8* %base, %b) { +; CHECK-LABEL: gldnt1b_d: +; CHECK: ldnt1b { z0.d }, p0/z, [z0.d, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.nxv2i8( %pg, + i8* %base, + %b) + %res = zext %load to + ret %res +} + +define @gldnt1h_d( %pg, i16* %base, %b) { +; CHECK-LABEL: gldnt1h_d: +; CHECK: ldnt1h { z0.d }, p0/z, [z0.d, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.nxv2i16( %pg, + i16* %base, + %b) + %res = zext %load to + ret %res +} + +define @gldnt1w_d( %pg, i32* %base, %offsets) { +; CHECK-LABEL: gldnt1w_d: +; CHECK: ldnt1w { z0.d }, p0/z, [z0.d, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.nxv2i32( %pg, + i32* %base, + %offsets) + %res = zext %load to + ret %res +} + +define @gldnt1d_d( %pg, i64* %base, %b) { +; CHECK-LABEL: gldnt1d_d: +; CHECK: ldnt1d { z0.d }, p0/z, [z0.d, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.nxv2i64( %pg, + i64* %base, + %b) + ret %load +} + +define @gldnt1d_d_double( %pg, double* %base, %b) { +; CHECK-LABEL: gldnt1d_d_double: +; CHECK: ldnt1d { z0.d }, p0/z, [z0.d, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.nxv2f64( %pg, + double* %base, + %b) + ret %load +} + +; +; LDNT1SB, LDNT1SW, LDNT1SH: base + 64-bit unscaled offsets +; e.g. 
ldnt1sh { z0.d }, p0/z, [z0.d, x0] +; + +define @gldnt1sb_d( %pg, i8* %base, %b) { +; CHECK-LABEL: gldnt1sb_d: +; CHECK: ldnt1sb { z0.d }, p0/z, [z0.d, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.nxv2i8( %pg, + i8* %base, + %b) + %res = sext %load to + ret %res +} + +define @gldnt1sh_d( %pg, i16* %base, %b) { +; CHECK-LABEL: gldnt1sh_d: +; CHECK: ldnt1sh { z0.d }, p0/z, [z0.d, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.nxv2i16( %pg, + i16* %base, + %b) + %res = sext %load to + ret %res +} + +define @gldnt1sw_d( %pg, i32* %base, %offsets) { +; CHECK-LABEL: gldnt1sw_d: +; CHECK: ldnt1sw { z0.d }, p0/z, [z0.d, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.nxv2i32( %pg, + i32* %base, + %offsets) + %res = sext %load to + ret %res +} + +declare @llvm.aarch64.sve.ldnt1.gather.nxv2i8(, i8*, ) +declare @llvm.aarch64.sve.ldnt1.gather.nxv2i16(, i16*, ) +declare @llvm.aarch64.sve.ldnt1.gather.nxv2i32(, i32*, ) +declare @llvm.aarch64.sve.ldnt1.gather.nxv2i64(, i64*, ) +declare @llvm.aarch64.sve.ldnt1.gather.nxv2f64(, double*, ) diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-gather-loads-vector-base-scalar-offset.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-gather-loads-vector-base-scalar-offset.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-gather-loads-vector-base-scalar-offset.ll @@ -0,0 +1,188 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s + +; +; LDNT1B, LDNT1W, LDNT1H, LDNT1D: vector base + scalar offset +; ldnt1b { z0.s }, p0/z, [z0.s, x0] +; + +; LDNT1B +define @gldnt1b_s( %pg, %base, i64 %offset) { +; CHECK-LABEL: gldnt1b_s: +; CHECK: ldnt1b { z0.s }, p0/z, [z0.s, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv4i8.nxv4i32( %pg, + %base, + i64 %offset) + %res = zext %load to + ret %res +} + +define @gldnt1b_d( %pg, %base, i64 %offset) { +; CHECK-LABEL: gldnt1b_d: +; CHECK: 
ldnt1b { z0.d }, p0/z, [z0.d, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv2i8.nxv2i64( %pg, + %base, + i64 %offset) + %res = zext %load to + ret %res +} + +; LDNT1H +define @gldnt1h_s( %pg, %base, i64 %offset) { +; CHECK-LABEL: gldnt1h_s: +; CHECK: ldnt1h { z0.s }, p0/z, [z0.s, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv416.nxv4i32( %pg, + %base, + i64 %offset) + %res = zext %load to + ret %res +} + +define @gldnt1h_d( %pg, %base, i64 %offset) { +; CHECK-LABEL: gldnt1h_d: +; CHECK: ldnt1h { z0.d }, p0/z, [z0.d, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv2i16.nxv2i64( %pg, + %base, + i64 %offset) + %res = zext %load to + ret %res +} + +; LDNT1W +define @gldnt1w_s( %pg, %base, i64 %offset) { +; CHECK-LABEL: gldnt1w_s: +; CHECK: ldnt1w { z0.s }, p0/z, [z0.s, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv4i32.nxv4i32( %pg, + %base, + i64 %offset) + ret %load +} + +define @gldnt1w_s_float( %pg, %base, i64 %offset) { +; CHECK-LABEL: gldnt1w_s_float: +; CHECK: ldnt1w { z0.s }, p0/z, [z0.s, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv4f32.nxv4i32( %pg, + %base, + i64 %offset) + ret %load +} + +define @gldnt1w_d( %pg, %base, i64 %offset) { +; CHECK-LABEL: gldnt1w_d: +; CHECK: ldnt1w { z0.d }, p0/z, [z0.d, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv2i32.nxv2i64( %pg, + %base, + i64 %offset) + %res = zext %load to + ret %res +} + +; LDNT1D +define @gldnt1d_d( %pg, %base, i64 %offset) { +; CHECK-LABEL: gldnt1d_d: +; CHECK: ldnt1d { z0.d }, p0/z, [z0.d, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv2i64.nxv2i64( %pg, + %base, + i64 %offset) + ret %load +} + +; LDNT1D +define @gldnt1d_d_double( %pg, %base, i64 %offset) { +; CHECK-LABEL: gldnt1d_d_double: +; CHECK: ldnt1d { z0.d }, p0/z, 
[z0.d, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv2f64.nxv2i64( %pg, + %base, + i64 %offset) + ret %load +} + +; +; LDNT1SB, LDNT1SH, LDNT1SW: vector base + scalar offset +; ldnt1sb { z0.s }, p0/z, [z0.s, x0] +; + +; LDNT1SB +define @gldnt1sb_s( %pg, %base, i64 %offset) { +; CHECK-LABEL: gldnt1sb_s: +; CHECK: ldnt1sb { z0.s }, p0/z, [z0.s, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv4i8.nxv4i32( %pg, + %base, + i64 %offset) + %res = sext %load to + ret %res +} + +define @gldnt1sb_d( %pg, %base, i64 %offset) { +; CHECK-LABEL: gldnt1sb_d: +; CHECK: ldnt1sb { z0.d }, p0/z, [z0.d, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv2i8.nxv2i64( %pg, + %base, + i64 %offset) + %res = sext %load to + ret %res +} + +; LDNT1SH +define @gldnt1sh_s( %pg, %base, i64 %offset) { +; CHECK-LABEL: gldnt1sh_s: +; CHECK: ldnt1sh { z0.s }, p0/z, [z0.s, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv416.nxv4i32( %pg, + %base, + i64 %offset) + %res = sext %load to + ret %res +} + +define @gldnt1sh_d( %pg, %base, i64 %offset) { +; CHECK-LABEL: gldnt1sh_d: +; CHECK: ldnt1sh { z0.d }, p0/z, [z0.d, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv2i16.nxv2i64( %pg, + %base, + i64 %offset) + %res = sext %load to + ret %res +} + +; LDNT1SW +define @gldnt1sw_d( %pg, %base, i64 %offset) { +; CHECK-LABEL: gldnt1sw_d: +; CHECK: ldnt1sw { z0.d }, p0/z, [z0.d, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv2i32.nxv2i64( %pg, + %base, + i64 %offset) + %res = sext %load to + ret %res +} + +; LDNT1B/LDNT1SB +declare @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv4i8.nxv4i32(, , i64) +declare @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv2i8.nxv2i64(, , i64) + +; LDNT1H/LDNT1SH +declare @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv416.nxv4i32(, , i64) 
+declare @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv2i16.nxv2i64(, , i64) + +; LDNT1W/LDNT1SW +declare @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv4i32.nxv4i32(, , i64) +declare @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv2i32.nxv2i64(, , i64) + +declare @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv4f32.nxv4i32(, , i64) + +; LDNT1D +declare @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv2i64.nxv2i64(, , i64) + +declare @llvm.aarch64.sve.ldnt1.gather.scalar.offset.nxv2f64.nxv2i64(, , i64) diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-scatter-stores-32bit-unscaled-offset.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-scatter-stores-32bit-unscaled-offset.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-scatter-stores-32bit-unscaled-offset.ll @@ -0,0 +1,77 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s + +; +; STNT1B, STNT1W, STNT1H, STNT1D: base + 32-bit unscaled offset, zero (uxtw) +; extended to 64 bits. +; e.g. 
stnt1h { z0.s }, p0, [z1.s, x0] +; + +; STNT1B +define void @sstnt1b_s_uxtw( %data, %pg, i8* %base, %offsets) { +; CHECK-LABEL: sstnt1b_s_uxtw: +; CHECK: stnt1b { z0.s }, p0, [z1.s, x0] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.stnt1.scatter.uxtw.nxv4i8( %data_trunc, + %pg, + i8* %base, + %offsets) + ret void +} + +; STNT1H +define void @sstnt1h_s_uxtw( %data, %pg, i16* %base, %offsets) { +; CHECK-LABEL: sstnt1h_s_uxtw: +; CHECK: stnt1h { z0.s }, p0, [z1.s, x0] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.stnt1.scatter.uxtw.nxv4i16( %data_trunc, + %pg, + i16* %base, + %offsets) + ret void +} + +; STNT1W +define void @sstnt1w_s_uxtw( %data, %pg, i32* %base, %offsets) { +; CHECK-LABEL: sstnt1w_s_uxtw: +; CHECK: stnt1w { z0.s }, p0, [z1.s, x0] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.stnt1.scatter.uxtw.nxv4i32( %data, + %pg, + i32* %base, + %offsets) + ret void +} + +define void @sstnt1w_s_uxtw_float( %data, %pg, float* %base, %offsets) { +; CHECK-LABEL: sstnt1w_s_uxtw_float: +; CHECK: stnt1w { z0.s }, p0, [z1.s, x0] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.stnt1.scatter.uxtw.nxv4f32( %data, + %pg, + float* %base, + %offsets) + ret void +} + +; STNT1B +declare void @llvm.aarch64.sve.stnt1.scatter.uxtw.nxv4i8(, , i8*, ) +declare void @llvm.aarch64.sve.stnt1.scatter.uxtw.nxv2i8(, , i8*, ) +declare void @llvm.aarch64.sve.stnt1.scatter.sxtw.nxv4i8(, , i8*, ) +declare void @llvm.aarch64.sve.stnt1.scatter.sxtw.nxv2i8(, , i8*, ) + +; STNT1H +declare void @llvm.aarch64.sve.stnt1.scatter.sxtw.nxv4i16(, , i16*, ) +declare void @llvm.aarch64.sve.stnt1.scatter.sxtw.nxv2i16(, , i16*, ) +declare void @llvm.aarch64.sve.stnt1.scatter.uxtw.nxv4i16(, , i16*, ) +declare void @llvm.aarch64.sve.stnt1.scatter.uxtw.nxv2i16(, , i16*, ) + +; STNT1W +declare void @llvm.aarch64.sve.stnt1.scatter.sxtw.nxv4i32(, , i32*, ) +declare void @llvm.aarch64.sve.stnt1.scatter.sxtw.nxv2i32(, , i32*, ) +declare void 
@llvm.aarch64.sve.stnt1.scatter.uxtw.nxv4i32(, , i32*, ) +declare void @llvm.aarch64.sve.stnt1.scatter.uxtw.nxv2i32(, , i32*, ) + +declare void @llvm.aarch64.sve.stnt1.scatter.sxtw.nxv4f32(, , float*, ) +declare void @llvm.aarch64.sve.stnt1.scatter.uxtw.nxv4f32(, , float*, ) diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-scatter-stores-64bit-unscaled-offset.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-scatter-stores-64bit-unscaled-offset.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-scatter-stores-64bit-unscaled-offset.ll @@ -0,0 +1,70 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s + +; +; STNT1B, STNT1W, STNT1H, STNT1D: base + 64-bit unscaled offset +; e.g. stnt1h { z0.d }, p0, [z1.d, x0] +; + +define void @sstnt1b_d( %data, %pg, i8* %base, %b) { +; CHECK-LABEL: sstnt1b_d: +; CHECK: stnt1b { z0.d }, p0, [z1.d, x0] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.stnt1.scatter.nxv2i8( %data_trunc, + %pg, + i8* %base, + %b) + ret void +} + +define void @sstnt1h_d( %data, %pg, i16* %base, %b) { +; CHECK-LABEL: sstnt1h_d: +; CHECK: stnt1h { z0.d }, p0, [z1.d, x0] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.stnt1.scatter.nxv2i16( %data_trunc, + %pg, + i16* %base, + %b) + ret void +} + +define void @sstnt1w_d( %data, %pg, i32* %base, %b) { +; CHECK-LABEL: sstnt1w_d: +; CHECK: stnt1w { z0.d }, p0, [z1.d, x0] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.stnt1.scatter.nxv2i32( %data_trunc, + %pg, + i32* %base, + %b) + ret void +} + +define void @sstnt1d_d( %data, %pg, i64* %base, %b) { +; CHECK-LABEL: sstnt1d_d: +; CHECK: stnt1d { z0.d }, p0, [z1.d, x0] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.stnt1.scatter.nxv2i64( %data, + %pg, + i64* %base, + %b) + ret void +} + +define void @sstnt1d_d_double( %data, %pg, double* %base, %b) { +; CHECK-LABEL: sstnt1d_d_double: +; CHECK: stnt1d 
{ z0.d }, p0, [z1.d, x0] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.stnt1.scatter.nxv2f64( %data, + %pg, + double* %base, + %b) + ret void +} + +declare void @llvm.aarch64.sve.stnt1.scatter.nxv2i8(, , i8*, ) +declare void @llvm.aarch64.sve.stnt1.scatter.nxv2i16(, , i16*, ) +declare void @llvm.aarch64.sve.stnt1.scatter.nxv2i32(, , i32*, ) +declare void @llvm.aarch64.sve.stnt1.scatter.nxv2i64(, , i64*, ) +declare void @llvm.aarch64.sve.stnt1.scatter.nxv2f64(, , double*, ) diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-scatter-stores-vector-base-scalar-offset.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-scatter-stores-vector-base-scalar-offset.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-scatter-stores-vector-base-scalar-offset.ll @@ -0,0 +1,134 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s + +; +; STNT1B, STNT1W, STNT1H, STNT1D: vector base + scalar offset +; stnt1b { z0.s }, p0, [z1.s, x0] +; + +; STNT1B +define void @stnt1b_s( %data, %pg, %base, i64 %offset) { +; CHECK-LABEL: stnt1b_s: +; CHECK: stnt1b { z0.s }, p0, [z1.s, x0] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv4i8.nxv4i32( %data_trunc, + %pg, + %base, + i64 %offset) + ret void +} + +define void @stnt1b_d( %data, %pg, %base, i64 %offset) { +; CHECK-LABEL: stnt1b_d: +; CHECK: stnt1b { z0.d }, p0, [z1.d, x0] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv2i8.nxv2i64( %data_trunc, + %pg, + %base, + i64 %offset) + ret void +} + +; STNT1H +define void @stnt1h_s( %data, %pg, %base, i64 %offset) { +; CHECK-LABEL: stnt1h_s: +; CHECK: stnt1h { z0.s }, p0, [z1.s, x0] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv4i16.nxv4i32( %data_trunc, + %pg, + %base, + i64 %offset) + ret void +} + +define void @stnt1h_d( %data, %pg, %base, 
i64 %offset) { +; CHECK-LABEL: stnt1h_d: +; CHECK: stnt1h { z0.d }, p0, [z1.d, x0] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv2i16.nxv2i64( %data_trunc, + %pg, + %base, + i64 %offset) + ret void +} + +; STNT1W +define void @stnt1w_s( %data, %pg, %base, i64 %offset) { +; CHECK-LABEL: stnt1w_s: +; CHECK: stnt1w { z0.s }, p0, [z1.s, x0] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv4i32.nxv4i32( %data, + %pg, + %base, + i64 %offset) + ret void +} + +define void @stnt1w_f32_s( %data, %pg, %base, i64 %offset) { +; CHECK-LABEL: stnt1w_f32_s: +; CHECK: stnt1w { z0.s }, p0, [z1.s, x0] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv4f32.nxv4i32( %data, + %pg, + %base, + i64 %offset) + ret void +} + +define void @stnt1w_d( %data, %pg, %base, i64 %offset) { +; CHECK-LABEL: stnt1w_d: +; CHECK: stnt1w { z0.d }, p0, [z1.d, x0] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv2i32.nxv2i64( %data_trunc, + %pg, + %base, + i64 %offset) + ret void +} + +; STNT1D +define void @stnt1d_d( %data, %pg, %base, i64 %offset) { +; CHECK-LABEL: stnt1d_d: +; CHECK: stnt1d { z0.d }, p0, [z1.d, x0] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv2i64.nxv2i64( %data, + %pg, + %base, + i64 %offset) + ret void +} + +define void @stnt1d_f64_d( %data, %pg, %base, i64 %offset) { +; CHECK-LABEL: stnt1d_f64_d: +; CHECK: stnt1d { z0.d }, p0, [z1.d, x0] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv2f64.nxv2i64( %data, + %pg, + %base, + i64 %offset) + ret void +} + +; STNT1B +declare void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv2i8.nxv2i64(, , , i64) +declare void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv4i8.nxv4i32(, , , i64) + +; STNT1H +declare void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv2i16.nxv2i64(, , , i64) 
+declare void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv4i16.nxv4i32(, , , i64) + +; STNT1W +declare void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv2i32.nxv2i64(, , , i64) +declare void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv4i32.nxv4i32(, , , i64) + +declare void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv4f32.nxv4i32(, , , i64) + +; STNT1D +declare void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv2i64.nxv2i64(, , , i64) + +declare void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv2f32.nxv2i64(, , , i64) +declare void @llvm.aarch64.sve.stnt1.scatter.scalar.offset.nxv2f64.nxv2i64(, , , i64)