diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -1767,6 +1767,9 @@
 // 64 bit unscaled offsets
 def int_aarch64_sve_ldnt1_gather : AdvSIMD_GatherLoad_SV_64b_Offsets_Intrinsic;
 
+// 64 bit indices
+def int_aarch64_sve_ldnt1_gather_index : AdvSIMD_GatherLoad_SV_64b_Offsets_Intrinsic;
+
 // 32 bit unscaled offsets, zero (zxtw) extended to 64 bits
 def int_aarch64_sve_ldnt1_gather_uxtw : AdvSIMD_GatherLoad_SV_32b_Offsets_Intrinsic;
 
@@ -1814,6 +1817,10 @@
 // 64 bit unscaled offsets
 def int_aarch64_sve_stnt1_scatter : AdvSIMD_ScatterStore_SV_64b_Offsets_Intrinsic;
 
+// 64 bit indices
+def int_aarch64_sve_stnt1_scatter_index
+    : AdvSIMD_ScatterStore_SV_64b_Offsets_Intrinsic;
+
 // 32 bit unscaled offsets, zero (zxtw) extended to 64 bits
 def int_aarch64_sve_stnt1_scatter_uxtw : AdvSIMD_ScatterStore_SV_32b_Offsets_Intrinsic;
 
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -263,6 +263,7 @@
 
   // Non-temporal gather loads
   GLDNT1,
+  GLDNT1_INDEX,
   GLDNT1S,
 
   // Scatter store
@@ -276,6 +277,7 @@
 
   // Non-temporal scatter store
   SSTNT1,
+  SSTNT1_INDEX,
 
   // Strict (exception-raising) floating point comparison
   STRICT_FCMP = ISD::FIRST_TARGET_STRICTFP_OPCODE,
 
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1440,6 +1440,7 @@
   case AArch64ISD::GLDFF1S_IMM:     return "AArch64ISD::GLDFF1S_IMM";
 
   case AArch64ISD::GLDNT1:          return "AArch64ISD::GLDNT1";
+  case AArch64ISD::GLDNT1_INDEX:    return "AArch64ISD::GLDNT1_INDEX";
   case AArch64ISD::GLDNT1S:         return "AArch64ISD::GLDNT1S";
 
   case AArch64ISD::SST1:            return "AArch64ISD::SST1";
@@ -1451,6 +1452,7 @@
   case AArch64ISD::SST1_IMM:        return "AArch64ISD::SST1_IMM";
 
   case AArch64ISD::SSTNT1:          return "AArch64ISD::SSTNT1";
+  case AArch64ISD::SSTNT1_INDEX:    return "AArch64ISD::SSTNT1_INDEX";
 
   case AArch64ISD::LDP:             return "AArch64ISD::LDP";
   case AArch64ISD::STP:             return "AArch64ISD::STP";
@@ -12652,6 +12654,19 @@
   // vector of offsets (that fits into one register)
   SDValue Offset = N->getOperand(5);
 
+  // For "scalar + vector of indices", just scale the indices. This only
+  // applies to non-temporal scatters because there's no instruction that takes
+  // indices.
+  if (Opcode == AArch64ISD::SSTNT1_INDEX) {
+    SDValue BytesPerElt =
+        DAG.getConstant(SrcElVT.getScalarSizeInBits() / 8, DL, MVT::i64);
+    SDValue SplatBytesPerElt =
+        DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, BytesPerElt);
+
+    Offset = DAG.getNode(ISD::MUL, DL, MVT::nxv2i64, SplatBytesPerElt, Offset);
+    Opcode = AArch64ISD::SSTNT1;
+  }
+
   // In the case of non-temporal gather loads there's only one SVE instruction
   // per data-size: "scalar + vector", i.e.
   //    * stnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
@@ -12746,6 +12761,19 @@
   // vector of offsets (that fits into one register)
   SDValue Offset = N->getOperand(4);
 
+  // For "scalar + vector of indices", just scale the indices. This only
+  // applies to non-temporal gathers because there's no instruction that takes
+  // indices.
+  if (Opcode == AArch64ISD::GLDNT1_INDEX) {
+    SDValue BytesPerElt =
+        DAG.getConstant(RetElVT.getScalarSizeInBits() / 8, DL, MVT::i64);
+    SDValue SplatBytesPerElt =
+        DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, BytesPerElt);
+
+    Offset = DAG.getNode(ISD::MUL, DL, MVT::nxv2i64, SplatBytesPerElt, Offset);
+    Opcode = AArch64ISD::GLDNT1;
+  }
+
   // In the case of non-temporal gather loads there's only one SVE instruction
   // per data-size: "scalar + vector", i.e.
   //    * ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
@@ -13003,6 +13031,8 @@
     return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1);
   case Intrinsic::aarch64_sve_ldnt1_gather:
     return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1);
+  case Intrinsic::aarch64_sve_ldnt1_gather_index:
+    return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_INDEX);
   case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
     return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1);
   case Intrinsic::aarch64_sve_ldnf1:
@@ -13017,6 +13047,8 @@
     return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1);
   case Intrinsic::aarch64_sve_stnt1_scatter:
     return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1);
+  case Intrinsic::aarch64_sve_stnt1_scatter_index:
+    return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_INDEX);
   case Intrinsic::aarch64_sve_ld1_gather:
     return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1);
   case Intrinsic::aarch64_sve_ld1_gather_index:
diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-gather-loads-64bit-scaled-offset.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-gather-loads-64bit-scaled-offset.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-gather-loads-64bit-scaled-offset.ll
@@ -0,0 +1,90 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s
+
+;
+; LDNT1H, LDNT1W, LDNT1D: base + 64-bit index
+; e.g.
+;   mul z0.d, z0.d, #2
+;   ldnt1h z0.d, p0/z, [z0.d, x0]
+;
+
+define <vscale x 2 x i64> @gldnt1h_index(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gldnt1h_index
+; CHECK:       mul z0.d, z0.d, #2
+; CHECK-NEXT:  ldnt1h { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT:  ret
+  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ldnt1.gather.index.nxv2i16(<vscale x 2 x i1> %pg,
+                                                                               i16* %base,
+                                                                               <vscale x 2 x i64> %b)
+  %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gldnt1w_index(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gldnt1w_index
+; CHECK:       mul z0.d, z0.d, #4
+; CHECK-NEXT:  ldnt1w { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT:  ret
+  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldnt1.gather.index.nxv2i32(<vscale x 2 x i1> %pg,
+                                                                               i32* %base,
+                                                                               <vscale x 2 x i64> %b)
+  %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gldnt1d_index(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gldnt1d_index
+; CHECK:       mul z0.d, z0.d, #8
+; CHECK-NEXT:  ldnt1d { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT:  ret
+  %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.gather.index.nxv2i64(<vscale x 2 x i1> %pg,
+                                                                               i64* %base,
+                                                                               <vscale x 2 x i64> %b)
+  ret <vscale x 2 x i64> %load
+}
+
+define <vscale x 2 x double> @gldnt1d_index_double(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gldnt1d_index_double
+; CHECK:       mul z0.d, z0.d, #8
+; CHECK-NEXT:  ldnt1d { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT:  ret
+  %load = call <vscale x 2 x double> @llvm.aarch64.sve.ldnt1.gather.index.nxv2f64(<vscale x 2 x i1> %pg,
+                                                                                  double* %base,
+                                                                                  <vscale x 2 x i64> %b)
+  ret <vscale x 2 x double> %load
+}
+
+;
+; LDNT1SH, LDNT1SW: base + 64-bit index
+; e.g.
+;   mul z0.d, z0.d, #2
+;   ldnt1sh z0.d, p0/z, [z0.d, x0]
+;
+
+define <vscale x 2 x i64> @gldnt1sh_index(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gldnt1sh_index
+; CHECK:       mul z0.d, z0.d, #2
+; CHECK-NEXT:  ldnt1sh { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT:  ret
+  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ldnt1.gather.index.nxv2i16(<vscale x 2 x i1> %pg,
+                                                                               i16* %base,
+                                                                               <vscale x 2 x i64> %b)
+  %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gldnt1sw_index(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gldnt1sw_index
+; CHECK:       mul z0.d, z0.d, #4
+; CHECK-NEXT:  ldnt1sw { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT:  ret
+  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldnt1.gather.index.nxv2i32(<vscale x 2 x i1> %pg,
+                                                                               i32* %base,
+                                                                               <vscale x 2 x i64> %b)
+  %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+declare <vscale x 2 x i16> @llvm.aarch64.sve.ldnt1.gather.index.nxv2i16(<vscale x 2 x i1>, i16*, <vscale x 2 x i64>)
+declare <vscale x 2 x i32> @llvm.aarch64.sve.ldnt1.gather.index.nxv2i32(<vscale x 2 x i1>, i32*, <vscale x 2 x i64>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.gather.index.nxv2i64(<vscale x 2 x i1>, i64*, <vscale x 2 x i64>)
+declare <vscale x 2 x double> @llvm.aarch64.sve.ldnt1.gather.index.nxv2f64(<vscale x 2 x i1>, double*, <vscale x 2 x i64>)
diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-scatter-stores-64bit-scaled-offset.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-scatter-stores-64bit-scaled-offset.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-scatter-stores-64bit-scaled-offset.ll
@@ -0,0 +1,64 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s
+
+;
+; STNT1H, STNT1W, STNT1D: base + 64-bit index
+; e.g.
+;   mul z1.d, z1.d, #2
+;   stnt1h { z0.d }, p0, [z0.d, x0]
+;
+
+define void @sstnt1h_index(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %offsets) {
+; CHECK-LABEL: sstnt1h_index
+; CHECK:       mul z1.d, z1.d, #2
+; CHECK-NEXT:  stnt1h { z0.d }, p0, [z1.d, x0]
+; CHECK-NEXT:  ret
+  %data_trunc = trunc <vscale x 2 x i64> %data to <vscale x 2 x i16>
+  call void @llvm.aarch64.sve.stnt1.scatter.index.nxv2i16(<vscale x 2 x i16> %data_trunc,
+                                                          <vscale x 2 x i1> %pg,
+                                                          i16* %base,
+                                                          <vscale x 2 x i64> %offsets)
+  ret void
+}
+
+define void @sstnt1w_index(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %offsets) {
+; CHECK-LABEL: sstnt1w_index
+; CHECK:       mul z1.d, z1.d, #4
+; CHECK-NEXT:  stnt1w { z0.d }, p0, [z1.d, x0]
+; CHECK-NEXT:  ret
+  %data_trunc = trunc <vscale x 2 x i64> %data to <vscale x 2 x i32>
+  call void @llvm.aarch64.sve.stnt1.scatter.index.nxv2i32(<vscale x 2 x i32> %data_trunc,
+                                                          <vscale x 2 x i1> %pg,
+                                                          i32* %base,
+                                                          <vscale x 2 x i64> %offsets)
+  ret void
+}
+
+define void @sstnt1d_index(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %offsets) {
+; CHECK-LABEL: sstnt1d_index
+; CHECK:       mul z1.d, z1.d, #8
+; CHECK-NEXT:  stnt1d { z0.d }, p0, [z1.d, x0]
+; CHECK-NEXT:  ret
+  call void @llvm.aarch64.sve.stnt1.scatter.index.nxv2i64(<vscale x 2 x i64> %data,
+                                                          <vscale x 2 x i1> %pg,
+                                                          i64* %base,
+                                                          <vscale x 2 x i64> %offsets)
+  ret void
+}
+
+define void @sstnt1d_index_double(<vscale x 2 x double> %data, <vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i64> %offsets) {
+; CHECK-LABEL: sstnt1d_index_double
+; CHECK:       mul z1.d, z1.d, #8
+; CHECK-NEXT:  stnt1d { z0.d }, p0, [z1.d, x0]
+; CHECK-NEXT:  ret
+  call void @llvm.aarch64.sve.stnt1.scatter.index.nxv2f64(<vscale x 2 x double> %data,
+                                                          <vscale x 2 x i1> %pg,
+                                                          double* %base,
+                                                          <vscale x 2 x i64> %offsets)
+  ret void
+}
+
+
+declare void @llvm.aarch64.sve.stnt1.scatter.index.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i1>, i16*, <vscale x 2 x i64>)
+declare void @llvm.aarch64.sve.stnt1.scatter.index.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i1>, i32*, <vscale x 2 x i64>)
+declare void @llvm.aarch64.sve.stnt1.scatter.index.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i64*, <vscale x 2 x i64>)
+declare void @llvm.aarch64.sve.stnt1.scatter.index.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, double*, <vscale x 2 x i64>)
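
The DAG combines in this patch lower the new indexed intrinsics by splatting the element size, multiplying it into the index vector, and then falling through to the existing unscaled GLDNT1/SSTNT1 handling, because the non-temporal gather/scatter instructions only accept unscaled byte offsets. As a rough IR-level sketch of that equivalence (the function names below are invented for illustration and are not part of the patch; @llvm.aarch64.sve.ldnt1.gather is the pre-existing unscaled-offset intrinsic dispatched to GLDNT1 above), both functions should lower to the same mul + ldnt1d sequence seen in the tests:

; Illustrative only: indexed form vs. manually pre-scaled unscaled form.
define <vscale x 2 x i64> @ldnt1d_via_index(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %indices) {
  ; Indexed intrinsic added by this patch: indices are scaled by 8 during lowering.
  %v = call <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.gather.index.nxv2i64(<vscale x 2 x i1> %pg,
                                                                            i64* %base,
                                                                            <vscale x 2 x i64> %indices)
  ret <vscale x 2 x i64> %v
}

define <vscale x 2 x i64> @ldnt1d_via_scaled_offsets(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %indices) {
  ; Splat the element size (8 bytes for i64) and turn the indices into byte offsets by hand.
  %ins = insertelement <vscale x 2 x i64> undef, i64 8, i32 0
  %splat = shufflevector <vscale x 2 x i64> %ins, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
  %offsets = mul <vscale x 2 x i64> %indices, %splat
  ; Existing unscaled-offset intrinsic.
  %v = call <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.gather.nxv2i64(<vscale x 2 x i1> %pg,
                                                                      i64* %base,
                                                                      <vscale x 2 x i64> %offsets)
  ret <vscale x 2 x i64> %v
}

declare <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.gather.index.nxv2i64(<vscale x 2 x i1>, i64*, <vscale x 2 x i64>)
declare <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.gather.nxv2i64(<vscale x 2 x i1>, i64*, <vscale x 2 x i64>)

The hand-scaled version mirrors what performGatherLoadCombine produces in the DAG for the indexed opcode: a SPLAT_VECTOR of the element size, an ISD::MUL with the offset vector, and the unscaled GLDNT1 node.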