diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -14366,6 +14366,64 @@
   return SDValue();
 }
 
+static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG) {
+  unsigned Opc = N->getOpcode();
+
+  assert(((Opc >= AArch64ISD::GLD1_MERGE_ZERO && // unsigned gather loads
+           Opc <= AArch64ISD::GLD1_IMM_MERGE_ZERO) ||
+          (Opc >= AArch64ISD::GLD1S_MERGE_ZERO && // signed gather loads
+           Opc <= AArch64ISD::GLD1S_IMM_MERGE_ZERO)) &&
+         "Invalid opcode.");
+
+  const bool Scaled = Opc == AArch64ISD::GLD1_SCALED_MERGE_ZERO ||
+                      Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
+  const bool Signed = Opc == AArch64ISD::GLD1S_MERGE_ZERO ||
+                      Opc == AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
+  const bool SExt = Opc == AArch64ISD::GLD1_SXTW_MERGE_ZERO ||
+                    Opc == AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO;
+  const bool ZExt = Opc == AArch64ISD::GLD1_UXTW_MERGE_ZERO ||
+                    Opc == AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO;
+  const bool Extended = SExt || ZExt;
+
+  SDLoc DL(N);
+  SDValue Chain = N->getOperand(0);
+  SDValue Pg = N->getOperand(1);
+  SDValue Base = N->getOperand(2);
+  SDValue Offset = N->getOperand(3);
+  SDValue Ty = N->getOperand(4);
+
+  EVT ResVT = N->getValueType(0);
+
+  const auto OffsetOpc = Offset.getOpcode();
+  const bool OffsetIsZExt =
+      OffsetOpc == AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU;
+  const bool OffsetIsSExt =
+      OffsetOpc == AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU;
+
+  // Fold sign/zero extensions of vector offsets into GLD1 nodes where possible.
+  if (!Extended && (OffsetIsSExt || OffsetIsZExt)) {
+    SDValue ExtPg = Offset.getOperand(0);
+    VTSDNode *ExtFrom = cast<VTSDNode>(Offset.getOperand(2).getNode());
+    EVT ExtFromEVT = ExtFrom->getVT().getVectorElementType();
+
+    // If the predicate for the sign- or zero-extended offset is the same as
+    // the predicate used for this load, and the extension was from a 32-bit
+    // element type, the extension can instead be folded into the gather's
+    // extending addressing mode.
+    if (ExtPg == Pg && ExtFromEVT == MVT::i32) {
+      SDValue UnextendedOffset = Offset.getOperand(1);
+
+      unsigned NewOpc = getGatherVecOpcode(Scaled, OffsetIsSExt, true);
+      if (Signed)
+        NewOpc = getSignExtendedGatherOpcode(NewOpc);
+
+      return DAG.getNode(NewOpc, DL, {ResVT, MVT::Other},
+                         {Chain, Pg, Base, UnextendedOffset, Ty});
+    }
+  }
+
+  return SDValue();
+}
+
 /// Target-specific DAG combine function for post-increment LD1 (lane) and
 /// post-increment LD1R.
 static SDValue performPostLD1Combine(SDNode *N,
@@ -15743,6 +15801,21 @@
     return performNVCASTCombine(N);
   case AArch64ISD::UZP1:
     return performUzpCombine(N, DAG);
+  case AArch64ISD::GLD1_MERGE_ZERO:
+  case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
+  case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
+  case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
+  case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
+  case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
+  case AArch64ISD::GLD1_IMM_MERGE_ZERO:
+  case AArch64ISD::GLD1S_MERGE_ZERO:
+  case AArch64ISD::GLD1S_SCALED_MERGE_ZERO:
+  case AArch64ISD::GLD1S_UXTW_MERGE_ZERO:
+  case AArch64ISD::GLD1S_SXTW_MERGE_ZERO:
+  case AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO:
+  case AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO:
+  case AArch64ISD::GLD1S_IMM_MERGE_ZERO:
+    return performGLD1Combine(N, DAG);
   case ISD::INSERT_VECTOR_ELT:
     return performPostLD1Combine(N, DCI, true);
   case ISD::EXTRACT_VECTOR_ELT:
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-scaled-offset.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-scaled-offset.ll
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-scaled-offset.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-scaled-offset.ll
@@ -78,7 +78,194 @@
   ret <vscale x 2 x i64> %res
 }
 
+;
+; LD1H, LD1W, LD1D: base + 64-bit sxtw'd scaled offset
+;   e.g. ld1h z0.d, p0/z, [x0, z0.d, sxtw #1]
+;
+
+define <vscale x 2 x i64> @gld1h_index_sxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1h_index_sxtw
+; CHECK: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
+; CHECK-NEXT: ret
+  %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.index.nxv2i16(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %sxtw)
+  %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1w_index_sxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1w_index_sxtw
+; CHECK: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
+; CHECK-NEXT: ret
+  %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.index.nxv2i32(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %sxtw)
+  %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1d_index_sxtw(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1d_index_sxtw
+; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3]
+; CHECK-NEXT: ret
+  %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.index.nxv2i64(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %sxtw)
+  ret <vscale x 2 x i64> %load
+}
+
+define <vscale x 2 x double> @gld1d_index_double_sxtw(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1d_index_double_sxtw
+; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3]
+; CHECK-NEXT: ret
+  %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.index.nxv2f64(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i64> %sxtw)
+  ret <vscale x 2 x double> %load
+}
+
+;
+; LD1SH, LD1SW: base + 64-bit sxtw'd scaled offset
+;   e.g. ld1sh z0.d, p0/z, [x0, z0.d, sxtw #1]
+;
+
+define <vscale x 2 x i64> @gld1sh_index_sxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1sh_index_sxtw
+; CHECK: ld1sh { z0.d }, p0/z, [x0, z0.d, sxtw #1]
+; CHECK-NEXT: ret
+  %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.index.nxv2i16(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %sxtw)
+  %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1sw_index_sxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1sw_index_sxtw
+; CHECK: ld1sw { z0.d }, p0/z, [x0, z0.d, sxtw #2]
+; CHECK-NEXT: ret
+  %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.index.nxv2i32(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %sxtw)
+  %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+;
+; LD1H, LD1W, LD1D: base + 64-bit uxtw'd scaled offset
+;   e.g. ld1h z0.d, p0/z, [x0, z0.d, uxtw #1]
+;
+
+define <vscale x 2 x i64> @gld1h_index_uxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1h_index_uxtw
+; CHECK: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw #1]
+; CHECK-NEXT: ret
+  %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.index.nxv2i16(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %uxtw)
+  %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1w_index_uxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1w_index_uxtw
+; CHECK: ld1w { z0.d }, p0/z, [x0, z0.d, uxtw #2]
+; CHECK-NEXT: ret
+  %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.index.nxv2i32(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %uxtw)
+  %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1d_index_uxtw(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1d_index_uxtw
+; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, uxtw #3]
+; CHECK-NEXT: ret
+  %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.index.nxv2i64(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %uxtw)
+  ret <vscale x 2 x i64> %load
+}
+
+define <vscale x 2 x double> @gld1d_index_double_uxtw(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1d_index_double_uxtw
+; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, uxtw #3]
+; CHECK-NEXT: ret
+  %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.index.nxv2f64(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i64> %uxtw)
+  ret <vscale x 2 x double> %load
+}
+
+;
+; LD1SH, LD1SW: base + 64-bit uxtw'd scaled offset
+;   e.g. ld1sh z0.d, p0/z, [x0, z0.d, uxtw #1]
+;
+
+define <vscale x 2 x i64> @gld1sh_index_uxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1sh_index_uxtw
+; CHECK: ld1sh { z0.d }, p0/z, [x0, z0.d, uxtw #1]
+; CHECK-NEXT: ret
+  %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.index.nxv2i16(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %uxtw)
+  %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1sw_index_uxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1sw_index_uxtw
+; CHECK: ld1sw { z0.d }, p0/z, [x0, z0.d, uxtw #2]
+; CHECK-NEXT: ret
+  %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.index.nxv2i32(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %uxtw)
+  %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
 declare <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.index.nxv2i16(<vscale x 2 x i1>, i16*, <vscale x 2 x i64>)
 declare <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.index.nxv2i32(<vscale x 2 x i1>, i32*, <vscale x 2 x i64>)
 declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.index.nxv2i64(<vscale x 2 x i1>, i64*, <vscale x 2 x i64>)
 declare <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.index.nxv2f64(<vscale x 2 x i1>, double*, <vscale x 2 x i64>)
+
+declare <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, <vscale x 2 x i64>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, <vscale x 2 x i64>)
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-unscaled-offset.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-unscaled-offset.ll
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-unscaled-offset.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-unscaled-offset.ll
@@ -100,8 +100,251 @@
   ret <vscale x 2 x i64> %res
 }
 
+;
+; LD1B, LD1H, LD1W, LD1D: base + 64-bit sxtw'd unscaled offset
+;   e.g. ld1h { z0.d }, p0/z, [x0, z0.d, sxtw]
+;
+
+define <vscale x 2 x i64> @gld1b_d_sxtw(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1b_d_sxtw:
+; CHECK: ld1b { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT: ret
+  %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.nxv2i8(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i64> %sxtw)
+  %res = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1h_d_sxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1h_d_sxtw:
+; CHECK: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT: ret
+  %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.nxv2i16(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %sxtw)
+  %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1w_d_sxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %offsets) {
+; CHECK-LABEL: gld1w_d_sxtw:
+; CHECK: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT: ret
+  %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %offsets)
+  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.nxv2i32(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %sxtw)
+  %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1d_d_sxtw(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1d_d_sxtw:
+; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT: ret
+  %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.nxv2i64(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %sxtw)
+  ret <vscale x 2 x i64> %load
+}
+
+define <vscale x 2 x double> @gld1d_d_double_sxtw(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1d_d_double_sxtw:
+; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT: ret
+  %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.nxv2f64(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i64> %sxtw)
+  ret <vscale x 2 x double> %load
+}
+
+;
+; LD1SB, LD1SH, LD1SW: base + 64-bit sxtw'd unscaled offset
+;   e.g. ld1sh { z0.d }, p0/z, [x0, z0.d, sxtw]
+;
+
+define <vscale x 2 x i64> @gld1sb_d_sxtw(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1sb_d_sxtw:
+; CHECK: ld1sb { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT: ret
+  %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.nxv2i8(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i64> %sxtw)
+  %res = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1sh_d_sxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1sh_d_sxtw:
+; CHECK: ld1sh { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT: ret
+  %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.nxv2i16(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %sxtw)
+  %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1sw_d_sxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %offsets) {
+; CHECK-LABEL: gld1sw_d_sxtw:
+; CHECK: ld1sw { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT: ret
+  %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %offsets)
+  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.nxv2i32(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %sxtw)
+  %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+;
+; LD1B, LD1H, LD1W, LD1D: base + 64-bit uxtw'd unscaled offset
+;   e.g. ld1h { z0.d }, p0/z, [x0, z0.d, uxtw]
+;
+
+define <vscale x 2 x i64> @gld1b_d_uxtw(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1b_d_uxtw:
+; CHECK: ld1b { z0.d }, p0/z, [x0, z0.d, uxtw]
+; CHECK-NEXT: ret
+  %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.nxv2i8(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i64> %uxtw)
+  %res = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1h_d_uxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1h_d_uxtw:
+; CHECK: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw]
+; CHECK-NEXT: ret
+  %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.nxv2i16(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %uxtw)
+  %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1w_d_uxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %offsets) {
+; CHECK-LABEL: gld1w_d_uxtw:
+; CHECK: ld1w { z0.d }, p0/z, [x0, z0.d, uxtw]
+; CHECK-NEXT: ret
+  %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %offsets)
+  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.nxv2i32(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %uxtw)
+  %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1d_d_uxtw(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1d_d_uxtw:
+; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, uxtw]
+; CHECK-NEXT: ret
+  %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.nxv2i64(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %uxtw)
+  ret <vscale x 2 x i64> %load
+}
+
+define <vscale x 2 x double> @gld1d_d_double_uxtw(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1d_d_double_uxtw:
+; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, uxtw]
+; CHECK-NEXT: ret
+  %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.nxv2f64(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i64> %uxtw)
+  ret <vscale x 2 x double> %load
+}
+
+;
+; LD1SB, LD1SH, LD1SW: base + 64-bit uxtw'd unscaled offset
+;   e.g. ld1sh { z0.d }, p0/z, [x0, z0.d, uxtw]
+;
+
+define <vscale x 2 x i64> @gld1sb_d_uxtw(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1sb_d_uxtw:
+; CHECK: ld1sb { z0.d }, p0/z, [x0, z0.d, uxtw]
+; CHECK-NEXT: ret
+  %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.nxv2i8(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i64> %uxtw)
+  %res = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1sh_d_uxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1sh_d_uxtw:
+; CHECK: ld1sh { z0.d }, p0/z, [x0, z0.d, uxtw]
+; CHECK-NEXT: ret
+  %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.nxv2i16(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %uxtw)
+  %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1sw_d_uxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %offsets) {
+; CHECK-LABEL: gld1sw_d_uxtw:
+; CHECK: ld1sw { z0.d }, p0/z, [x0, z0.d, uxtw]
+; CHECK-NEXT: ret
+  %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %offsets)
+  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.nxv2i32(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %uxtw)
+  %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
 declare <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.nxv2i8(<vscale x 2 x i1>, i8*, <vscale x 2 x i64>)
 declare <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.nxv2i16(<vscale x 2 x i1>, i16*, <vscale x 2 x i64>)
 declare <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.nxv2i32(<vscale x 2 x i1>, i32*, <vscale x 2 x i64>)
 declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.nxv2i64(<vscale x 2 x i1>, i64*, <vscale x 2 x i64>)
 declare <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.nxv2f64(<vscale x 2 x i1>, double*, <vscale x 2 x i64>)
+
+declare <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, <vscale x 2 x i64>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, <vscale x 2 x i64>)
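
Note: for quick local verification of the fold outside the full test suite, a minimal standalone reproducer like the sketch below can be fed to llc directly. The RUN line, triple, and function name are illustrative assumptions and are not part of this patch (the existing test files carry their own RUN lines, which are outside the hunks above); the intrinsic declarations mirror the ones used in the tests.

; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s

; With performGLD1Combine in place, the sxtw of the 32-bit offsets is expected
; to fold into the gather's addressing mode, so a single [x0, z0.d, sxtw #1]
; form is emitted instead of a separate sxtw followed by an lsl-indexed gather.
define <vscale x 2 x i64> @sxtw_fold_repro(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %offsets) {
; CHECK-LABEL: sxtw_fold_repro
; CHECK: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
; CHECK-NEXT: ret
  %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %offsets)
  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.index.nxv2i16(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %sxtw)
  %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %res
}

declare <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, <vscale x 2 x i64>)
declare <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.index.nxv2i16(<vscale x 2 x i1>, i16*, <vscale x 2 x i64>)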