diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -14366,6 +14366,63 @@
   return SDValue();
 }
 
+static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG) {
+  unsigned Opc = N->getOpcode();
+
+  SDLoc DL(N);
+  SDValue Op0 = N->getOperand(0); // Ch
+  SDValue Op1 = N->getOperand(1); // Pg
+  SDValue Op2 = N->getOperand(2); // Base
+  SDValue Op3 = N->getOperand(3); // Offset
+  SDValue Op4 = N->getOperand(4); // Ty
+
+  EVT ResVT = N->getValueType(0);
+
+  const auto OffsetOpc = Op3.getOpcode();
+  const bool OffsetIsZExt =
+      OffsetOpc == AArch64ISD::ZERO_EXTEND_INREG_MERGE_PASSTHRU;
+  const bool OffsetIsSExt =
+      OffsetOpc == AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU;
+
+  // If the offset is sign- or zero-extended...
+  if (OffsetIsSExt || OffsetIsZExt) {
+    SDValue ExtPg = Op3.getOperand(0);
+
+    // If the predicate for the sign- or zero-extended offset is the
+    // same as the predicate used for this load...
+    if (ExtPg == Op1) {
+      SDValue UnextendedOffset = Op3.getOperand(1);
+
+      unsigned NewOpc;
+      switch (Opc) {
+      case AArch64ISD::GLD1_MERGE_ZERO:
+        NewOpc = OffsetIsZExt ? AArch64ISD::GLD1_UXTW_MERGE_ZERO
+                              : AArch64ISD::GLD1_SXTW_MERGE_ZERO;
+        break;
+      case AArch64ISD::GLD1S_MERGE_ZERO:
+        NewOpc = OffsetIsZExt ? AArch64ISD::GLD1S_UXTW_MERGE_ZERO
+                              : AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
+        break;
+      case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
+        NewOpc = OffsetIsZExt ? AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO
+                              : AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO;
+        break;
+      case AArch64ISD::GLD1S_SCALED_MERGE_ZERO:
+        NewOpc = OffsetIsZExt ? AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO
+                              : AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
+        break;
+      default:
+        llvm_unreachable("Unexpected opcode.");
+      }
+
+      return DAG.getNode(NewOpc, DL, {ResVT, MVT::Other},
+                         {Op0, Op1, Op2, UnextendedOffset, Op4});
+    }
+  }
+
+  return SDValue();
+}
+
 /// Target-specific DAG combine function for post-increment LD1 (lane) and
 /// post-increment LD1R.
 static SDValue performPostLD1Combine(SDNode *N,
@@ -15743,6 +15800,11 @@
     return performNVCASTCombine(N);
   case AArch64ISD::UZP1:
     return performUzpCombine(N, DAG);
+  case AArch64ISD::GLD1_MERGE_ZERO:
+  case AArch64ISD::GLD1S_MERGE_ZERO:
+  case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
+  case AArch64ISD::GLD1S_SCALED_MERGE_ZERO:
+    return performGLD1Combine(N, DAG);
   case ISD::INSERT_VECTOR_ELT:
     return performPostLD1Combine(N, DCI, true);
   case ISD::EXTRACT_VECTOR_ELT:
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-scaled-offset.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-scaled-offset.ll
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-scaled-offset.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-scaled-offset.ll
@@ -78,7 +78,194 @@
   ret <vscale x 2 x i64> %res
 }
 
+;
+; LD1H, LD1W, LD1D: base + 64-bit sign-extended scaled offset
+; e.g. ld1h z0.d, p0/z, [x0, z0.d, sxtw #1]
+;
+
+define <vscale x 2 x i64> @gld1h_index_sxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1h_index_sxtw
+; CHECK: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
+; CHECK-NEXT: ret
+  %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.index.nxv2i16(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %sxtw)
+  %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1w_index_sxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1w_index_sxtw
+; CHECK: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
+; CHECK-NEXT: ret
+  %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.index.nxv2i32(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %sxtw)
+  %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1d_index_sxtw(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1d_index_sxtw
+; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3]
+; CHECK-NEXT: ret
+  %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.index.nxv2i64(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %sxtw)
+  ret <vscale x 2 x i64> %load
+}
+
+define <vscale x 2 x double> @gld1d_index_double_sxtw(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1d_index_double_sxtw
+; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3]
+; CHECK-NEXT: ret
+  %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.index.nxv2f64(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i64> %sxtw)
+  ret <vscale x 2 x double> %load
+}
+
+;
+; LD1SH, LD1SW: base + 64-bit sign-extended scaled offset
+; e.g. ld1sh z0.d, p0/z, [x0, z0.d, sxtw #1]
+;
+
+define <vscale x 2 x i64> @gld1sh_index_sxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1sh_index_sxtw
+; CHECK: ld1sh { z0.d }, p0/z, [x0, z0.d, sxtw #1]
+; CHECK-NEXT: ret
+  %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.index.nxv2i16(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %sxtw)
+  %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1sw_index_sxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1sw_index_sxtw
+; CHECK: ld1sw { z0.d }, p0/z, [x0, z0.d, sxtw #2]
+; CHECK-NEXT: ret
+  %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.index.nxv2i32(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %sxtw)
+  %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+;
+; LD1H, LD1W, LD1D: base + 64-bit zero-extended scaled offset
+; e.g. ld1h z0.d, p0/z, [x0, z0.d, uxtw #1]
+;
+
+define <vscale x 2 x i64> @gld1h_index_uxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1h_index_uxtw
+; CHECK: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw #1]
+; CHECK-NEXT: ret
+  %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.index.nxv2i16(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %uxtw)
+  %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1w_index_uxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1w_index_uxtw
+; CHECK: ld1w { z0.d }, p0/z, [x0, z0.d, uxtw #2]
+; CHECK-NEXT: ret
+  %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.index.nxv2i32(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %uxtw)
+  %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1d_index_uxtw(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1d_index_uxtw
+; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, uxtw #3]
+; CHECK-NEXT: ret
+  %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.index.nxv2i64(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %uxtw)
+  ret <vscale x 2 x i64> %load
+}
+
+define <vscale x 2 x double> @gld1d_index_double_uxtw(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1d_index_double_uxtw
+; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, uxtw #3]
+; CHECK-NEXT: ret
+  %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.index.nxv2f64(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i64> %uxtw)
+  ret <vscale x 2 x double> %load
+}
+
+;
+; LD1SH, LD1SW: base + 64-bit zero-extended scaled offset
+; e.g. ld1sh z0.d, p0/z, [x0, z0.d, uxtw #1]
+;
+
+define <vscale x 2 x i64> @gld1sh_index_uxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1sh_index_uxtw
+; CHECK: ld1sh { z0.d }, p0/z, [x0, z0.d, uxtw #1]
+; CHECK-NEXT: ret
+  %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.index.nxv2i16(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %uxtw)
+  %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1sw_index_uxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1sw_index_uxtw
+; CHECK: ld1sw { z0.d }, p0/z, [x0, z0.d, uxtw #2]
+; CHECK-NEXT: ret
+  %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.index.nxv2i32(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %uxtw)
+  %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
 declare <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.index.nxv2i16(<vscale x 2 x i1>, i16*, <vscale x 2 x i64>)
 declare <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.index.nxv2i32(<vscale x 2 x i1>, i32*, <vscale x 2 x i64>)
 declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.index.nxv2i64(<vscale x 2 x i1>, i64*, <vscale x 2 x i64>)
 declare <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.index.nxv2f64(<vscale x 2 x i1>, double*, <vscale x 2 x i64>)
+
+declare <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, <vscale x 2 x i64>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, <vscale x 2 x i64>)
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-unscaled-offset.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-unscaled-offset.ll
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-unscaled-offset.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-loads-64bit-unscaled-offset.ll
@@ -100,8 +100,251 @@
   ret <vscale x 2 x i64> %res
 }
 
+;
+; LD1B, LD1W, LD1H, LD1D: base + 64-bit sign-extended unscaled offset
+; e.g. ld1h { z0.d }, p0/z, [x0, z0.d, sxtw]
+;
+
+define <vscale x 2 x i64> @gld1b_d_sxtw(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1b_d_sxtw:
+; CHECK: ld1b { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT: ret
+  %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.nxv2i8(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i64> %sxtw)
+  %res = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1h_d_sxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1h_d_sxtw:
+; CHECK: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT: ret
+  %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.nxv2i16(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %sxtw)
+  %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1w_d_sxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %offsets) {
+; CHECK-LABEL: gld1w_d_sxtw:
+; CHECK: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT: ret
+  %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %offsets)
+  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.nxv2i32(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %sxtw)
+  %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1d_d_sxtw(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1d_d_sxtw:
+; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT: ret
+  %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.nxv2i64(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %sxtw)
+  ret <vscale x 2 x i64> %load
+}
+
+define <vscale x 2 x double> @gld1d_d_double_sxtw(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1d_d_double_sxtw:
+; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT: ret
+  %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.nxv2f64(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i64> %sxtw)
+  ret <vscale x 2 x double> %load
+}
+
+;
+; LD1SB, LD1SW, LD1SH: base + 64-bit sign-extended unscaled offset
+; e.g. ld1sh { z0.d }, p0/z, [x0, z0.d]
+;
+
+define <vscale x 2 x i64> @gld1sb_d_sxtw(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1sb_d_sxtw:
+; CHECK: ld1sb { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT: ret
+  %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.nxv2i8(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i64> %sxtw)
+  %res = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1sh_d_sxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1sh_d_sxtw:
+; CHECK: ld1sh { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT: ret
+  %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.nxv2i16(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %sxtw)
+  %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1sw_d_sxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %offsets) {
+; CHECK-LABEL: gld1sw_d_sxtw:
+; CHECK: ld1sw { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT: ret
+  %sxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %offsets)
+  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.nxv2i32(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %sxtw)
+  %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+;
+; LD1B, LD1W, LD1H, LD1D: base + 64-bit zero-extended unscaled offset
+; e.g. ld1h { z0.d }, p0/z, [x0, z0.d, uxtw]
+;
+
+define <vscale x 2 x i64> @gld1b_d_uxtw(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1b_d_uxtw:
+; CHECK: ld1b { z0.d }, p0/z, [x0, z0.d, uxtw]
+; CHECK-NEXT: ret
+  %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.nxv2i8(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i64> %uxtw)
+  %res = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1h_d_uxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1h_d_uxtw:
+; CHECK: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw]
+; CHECK-NEXT: ret
+  %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.nxv2i16(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %uxtw)
+  %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1w_d_uxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %offsets) {
+; CHECK-LABEL: gld1w_d_uxtw:
+; CHECK: ld1w { z0.d }, p0/z, [x0, z0.d, uxtw]
+; CHECK-NEXT: ret
+  %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %offsets)
+  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.nxv2i32(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %uxtw)
+  %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1d_d_uxtw(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1d_d_uxtw:
+; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, uxtw]
+; CHECK-NEXT: ret
+  %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.nxv2i64(<vscale x 2 x i1> %pg, i64* %base, <vscale x 2 x i64> %uxtw)
+  ret <vscale x 2 x i64> %load
+}
+
+define <vscale x 2 x double> @gld1d_d_double_uxtw(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1d_d_double_uxtw:
+; CHECK: ld1d { z0.d }, p0/z, [x0, z0.d, uxtw]
+; CHECK-NEXT: ret
+  %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.nxv2f64(<vscale x 2 x i1> %pg, double* %base, <vscale x 2 x i64> %uxtw)
+  ret <vscale x 2 x double> %load
+}
+
+;
+; LD1SB, LD1SW, LD1SH: base + 64-bit zero-extended unscaled offset
+; e.g. ld1sh { z0.d }, p0/z, [x0, z0.d]
+;
+
+define <vscale x 2 x i64> @gld1sb_d_uxtw(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1sb_d_uxtw:
+; CHECK: ld1sb { z0.d }, p0/z, [x0, z0.d, uxtw]
+; CHECK-NEXT: ret
+  %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.nxv2i8(<vscale x 2 x i1> %pg, i8* %base, <vscale x 2 x i64> %uxtw)
+  %res = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1sh_d_uxtw(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: gld1sh_d_uxtw:
+; CHECK: ld1sh { z0.d }, p0/z, [x0, z0.d, uxtw]
+; CHECK-NEXT: ret
+  %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
+  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.nxv2i16(<vscale x 2 x i1> %pg, i16* %base, <vscale x 2 x i64> %uxtw)
+  %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 2 x i64> @gld1sw_d_uxtw(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %offsets) {
+; CHECK-LABEL: gld1sw_d_uxtw:
+; CHECK: ld1sw { z0.d }, p0/z, [x0, z0.d, uxtw]
+; CHECK-NEXT: ret
+  %uxtw = call <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %offsets)
+  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.nxv2i32(<vscale x 2 x i1> %pg, i32* %base, <vscale x 2 x i64> %uxtw)
+  %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
 declare <vscale x 2 x i8> @llvm.aarch64.sve.ld1.gather.nxv2i8(<vscale x 2 x i1>, i8*, <vscale x 2 x i64>)
 declare <vscale x 2 x i16> @llvm.aarch64.sve.ld1.gather.nxv2i16(<vscale x 2 x i1>, i16*, <vscale x 2 x i64>)
 declare <vscale x 2 x i32> @llvm.aarch64.sve.ld1.gather.nxv2i32(<vscale x 2 x i1>, i32*, <vscale x 2 x i64>)
 declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1.gather.nxv2i64(<vscale x 2 x i1>, i64*, <vscale x 2 x i64>)
 declare <vscale x 2 x double> @llvm.aarch64.sve.ld1.gather.nxv2f64(<vscale x 2 x i1>, double*, <vscale x 2 x i64>)
+
+declare <vscale x 2 x i64> @llvm.aarch64.sve.sxtw.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, <vscale x 2 x i64>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.uxtw.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, <vscale x 2 x i64>)