diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -1767,6 +1767,9 @@ // 64 bit unscaled offsets def int_aarch64_sve_ldnt1_gather : AdvSIMD_GatherLoad_SV_64b_Offsets_Intrinsic; +// 64 bit indices +def int_aarch64_sve_ldnt1_gather_index : AdvSIMD_GatherLoad_SV_64b_Offsets_Intrinsic; + // 32 bit unscaled offsets, zero (zxtw) extended to 64 bits def int_aarch64_sve_ldnt1_gather_uxtw : AdvSIMD_GatherLoad_SV_32b_Offsets_Intrinsic; @@ -1814,6 +1817,10 @@ // 64 bit unscaled offsets def int_aarch64_sve_stnt1_scatter : AdvSIMD_ScatterStore_SV_64b_Offsets_Intrinsic; +// 64 bit indices +def int_aarch64_sve_stnt1_scatter_index + : AdvSIMD_ScatterStore_SV_64b_Offsets_Intrinsic; + // 32 bit unscaled offsets, zero (zxtw) extended to 64 bits def int_aarch64_sve_stnt1_scatter_uxtw : AdvSIMD_ScatterStore_SV_32b_Offsets_Intrinsic; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -5262,7 +5262,8 @@ // amounts. This catches things like trying to shift an i1024 value by an // i8, which is easy to fall into in generic code that uses // TLI.getShiftAmount(). 
- assert(N2.getValueSizeInBits() >= Log2_32_Ceil(N1.getValueSizeInBits()) && + assert(N2.getValueType().getScalarSizeInBits().getFixedSize() >= + Log2_32_Ceil(VT.getScalarSizeInBits().getFixedSize()) && "Invalid use of small shift amount with oversized value!"); // Always fold shifts of i1 values so the code generator doesn't need to diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -190,6 +190,11 @@ return SelectSVELogicalImm(N, VT, Imm); } + template + bool SelectSVEShiftImm64(SDValue N, SDValue &Imm) { + return SelectSVEShiftImm64(N, Low, High, Imm); + } + // Returns a suitable CNT/INC/DEC/RDVL multiplier to calculate VSCALE*N. template bool SelectCntImm(SDValue N, SDValue &Imm) { @@ -307,6 +312,8 @@ bool SelectSVELogicalImm(SDValue N, MVT VT, SDValue &Imm); bool SelectSVESignedArithImm(SDValue N, SDValue &Imm); + bool SelectSVEShiftImm64(SDValue N, uint64_t Low, uint64_t High, + SDValue &Imm); bool SelectSVEArithImm(SDValue N, SDValue &Imm); bool SelectSVERegRegAddrMode(SDValue N, unsigned Scale, SDValue &Base, @@ -3072,6 +3079,24 @@ return false; } +// This method is only needed to "cast" i64s into i32s when the value +// is a valid shift which has been splatted into a vector with i64 elements. +// Every other type is fine in tablegen. 
+bool AArch64DAGToDAGISel::SelectSVEShiftImm64(SDValue N, uint64_t Low, + uint64_t High, SDValue &Imm) { + if (auto *CN = dyn_cast(N)) { + uint64_t ImmVal = CN->getZExtValue(); + SDLoc DL(N); + + if (ImmVal >= Low && ImmVal <= High) { + Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32); + return true; + } + } + + return false; +} + bool AArch64DAGToDAGISel::trySelectStackSlotTagP(SDNode *N) { // tagp(FrameIndex, IRGstack, tag_offset): // since the offset between FrameIndex and IRGstack is a compile-time diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -263,6 +263,7 @@ // Non-temporal gather loads GLDNT1, + GLDNT1_INDEX, GLDNT1S, // Scatter store @@ -276,6 +277,7 @@ // Non-temporal scatter store SSTNT1, + SSTNT1_INDEX, // Strict (exception-raising) floating point comparison STRICT_FCMP = ISD::FIRST_TARGET_STRICTFP_OPCODE, diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1440,6 +1440,7 @@ case AArch64ISD::GLDFF1S_IMM: return "AArch64ISD::GLDFF1S_IMM"; case AArch64ISD::GLDNT1: return "AArch64ISD::GLDNT1"; + case AArch64ISD::GLDNT1_INDEX: return "AArch64ISD::GLDNT1_INDEX"; case AArch64ISD::GLDNT1S: return "AArch64ISD::GLDNT1S"; case AArch64ISD::SST1: return "AArch64ISD::SST1"; @@ -1451,6 +1452,7 @@ case AArch64ISD::SST1_IMM: return "AArch64ISD::SST1_IMM"; case AArch64ISD::SSTNT1: return "AArch64ISD::SSTNT1"; + case AArch64ISD::SSTNT1_INDEX: return "AArch64ISD::SSTNT1_INDEX"; case AArch64ISD::LDP: return "AArch64ISD::LDP"; case AArch64ISD::STP: return "AArch64ISD::STP"; @@ -12625,6 +12627,19 @@ DAG.getConstant(MinOffset, DL, MVT::i64)); } +// Turns the vector of indices into a vector of byte offsets by scaling Offset +// by (BitWidth
/ 8). +static SDValue getScaledOffsetForLDNT1(SelectionDAG &DAG, SDValue Offset, + SDLoc DL, unsigned BitWidth) { + assert(Offset.getValueType().isScalableVector() && + "This method is only for scalable vectors of offsets"); + + SDValue Shift = DAG.getConstant(Log2_32(BitWidth / 8), DL, MVT::i64); + SDValue SplatShift = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Shift); + + return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, SplatShift); +} + static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG, unsigned Opcode, bool OnlyPackedOffsets = true) { @@ -12652,6 +12667,14 @@ // vector of offsets (that fits into one register) SDValue Offset = N->getOperand(5); + // For "scalar + vector of indices", just scale the indices. This only + // applies to non-temporal scatters because there's no instruction that takes + // indices. + if (Opcode == AArch64ISD::SSTNT1_INDEX) { + Offset = getScaledOffsetForLDNT1(DAG, Offset, DL, SrcElVT.getSizeInBits()); + Opcode = AArch64ISD::SSTNT1; + } + // In the case of non-temporal gather loads there's only one SVE instruction // per data-size: "scalar + vector", i.e. // * stnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0] @@ -12746,6 +12769,14 @@ // vector of offsets (that fits into one register) SDValue Offset = N->getOperand(4); + // For "scalar + vector of indices", just scale the indices. This only + // applies to non-temporal gathers because there's no instruction that takes + // indices. + if (Opcode == AArch64ISD::GLDNT1_INDEX) { + Offset = getScaledOffsetForLDNT1(DAG, Offset, DL, RetElVT.getSizeInBits()); + Opcode = AArch64ISD::GLDNT1; + } + // In the case of non-temporal gather loads there's only one SVE instruction // per data-size: "scalar + vector", i.e.
// * ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0] @@ -13003,6 +13034,8 @@ return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1); case Intrinsic::aarch64_sve_ldnt1_gather: return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1); + case Intrinsic::aarch64_sve_ldnt1_gather_index: + return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_INDEX); case Intrinsic::aarch64_sve_ldnt1_gather_uxtw: return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1); case Intrinsic::aarch64_sve_ldnf1: @@ -13017,6 +13050,8 @@ return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1); case Intrinsic::aarch64_sve_stnt1_scatter: return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1); + case Intrinsic::aarch64_sve_stnt1_scatter_index: + return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_INDEX); case Intrinsic::aarch64_sve_ld1_gather: return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1); case Intrinsic::aarch64_sve_ld1_gather_index: diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// def SVE8BitLslImm : ComplexPattern; +def SVELShiftImm64 : ComplexPattern", []>; // Non-faulting loads - node definitions // @@ -139,7 +140,6 @@ def reinterpret_cast : SDNode<"AArch64ISD::REINTERPRET_CAST", SDTUnaryOp>; let Predicates = [HasSVE] in { - defm RDFFR_PPz : sve_int_rdffr_pred<0b0, "rdffr", int_aarch64_sve_rdffr_z>; def RDFFRS_PPz : sve_int_rdffr_pred<0b1, "rdffrs">; defm RDFFR_P : sve_int_rdffr_unpred<"rdffr", int_aarch64_sve_rdffr>; @@ -1090,9 +1090,23 @@ defm INDEX_II : sve_int_index_ii<"index", index_vector>; // Unpredicated shifts - defm ASR_ZZI : sve_int_bin_cons_shift_imm_right<0b00, "asr">; - defm LSR_ZZI : sve_int_bin_cons_shift_imm_right<0b01, "lsr">; - defm LSL_ZZI : 
sve_int_bin_cons_shift_imm_left< 0b11, "lsl">; + defm ASR_ZZI : sve_int_bin_cons_shift_imm_right<0b00, "asr", sra>; + defm LSR_ZZI : sve_int_bin_cons_shift_imm_right<0b01, "lsr", srl>; + defm LSL_ZZI : sve_int_bin_cons_shift_imm_left< 0b11, "lsl", shl>; + + // Patterns for unpredicated left shift by immediate + def : Pat<(nxv16i8 (shl (nxv16i8 ZPR:$Zs1), + (nxv16i8 (AArch64dup (vecshiftL8:$imm))))), + (LSL_ZZI_B ZPR:$Zs1, vecshiftL8:$imm)>; + def : Pat<(nxv8i16 (shl (nxv8i16 ZPR:$Zs1), + (nxv8i16 (AArch64dup (vecshiftL16:$imm))))), + (LSL_ZZI_H ZPR:$Zs1, vecshiftL16:$imm)>; + def : Pat<(nxv4i32 (shl (nxv4i32 ZPR:$Zs1), + (nxv4i32 (AArch64dup (vecshiftL32:$imm))))), + (LSL_ZZI_S ZPR:$Zs1, vecshiftL32:$imm)>; + def : Pat<(nxv2i64 (shl (nxv2i64 ZPR:$Zs1), + (nxv2i64 (AArch64dup (i64 (SVELShiftImm64 i32:$imm)))))), + (LSL_ZZI_D ZPR:$Zs1, vecshiftL64:$imm)>; defm ASR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b00, "asr">; defm LSR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b01, "lsr">; diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -4818,10 +4818,12 @@ } class sve_int_bin_cons_shift_imm tsz8_64, bits<2> opc, string asm, - ZPRRegOp zprty, Operand immtype> + ZPRRegOp zprty, Operand immtype, ValueType vt, + SDPatternOperator op> : I<(outs zprty:$Zd), (ins zprty:$Zn, immtype:$imm), asm, "\t$Zd, $Zn, $imm", - "", []>, Sched<[]> { + "", + [(set (vt zprty:$Zd), (op (vt zprty:$Zn), immtype:$imm))]>, Sched<[]> { bits<5> Zd; bits<5> Zn; bits<6> imm; @@ -4836,29 +4838,31 @@ let Inst{4-0} = Zd; } -multiclass sve_int_bin_cons_shift_imm_left opc, string asm> { - def _B : sve_int_bin_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>; - def _H : sve_int_bin_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> { +multiclass sve_int_bin_cons_shift_imm_left opc, string asm, + SDPatternOperator op> { + def _B : 
sve_int_bin_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8, nxv16i8, op>; + def _H : sve_int_bin_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16, nxv8i16, op> { let Inst{19} = imm{3}; } - def _S : sve_int_bin_cons_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32> { + def _S : sve_int_bin_cons_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32, nxv4i32, op> { let Inst{20-19} = imm{4-3}; } - def _D : sve_int_bin_cons_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64> { + def _D : sve_int_bin_cons_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64, nxv2i64, op> { let Inst{22} = imm{5}; let Inst{20-19} = imm{4-3}; } } -multiclass sve_int_bin_cons_shift_imm_right opc, string asm> { - def _B : sve_int_bin_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>; - def _H : sve_int_bin_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> { +multiclass sve_int_bin_cons_shift_imm_right opc, string asm, + SDPatternOperator op> { + def _B : sve_int_bin_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8, nxv16i8, op>; + def _H : sve_int_bin_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16, nxv8i16, op> { let Inst{19} = imm{3}; } - def _S : sve_int_bin_cons_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> { + def _S : sve_int_bin_cons_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32, nxv4i32, op> { let Inst{20-19} = imm{4-3}; } - def _D : sve_int_bin_cons_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> { + def _D : sve_int_bin_cons_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64, nxv2i64, op> { let Inst{22} = imm{5}; let Inst{20-19} = imm{4-3}; } diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-gather-loads-64bit-scaled-offset.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-gather-loads-64bit-scaled-offset.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-gather-loads-64bit-scaled-offset.ll @@ -0,0 +1,90 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s + +; +; LDNT1H, LDNT1W, 
LDNT1D: base + 64-bit index +; e.g. +; lsl z0.d, z0.d, #1 +; ldnt1h z0.d, p0/z, [z0.d, x0] +; + +define @gldnt1h_index( %pg, i16* %base, %b) { +; CHECK-LABEL: gldnt1h_index +; CHECK: lsl z0.d, z0.d, #1 +; CHECK-NEXT: ldnt1h { z0.d }, p0/z, [z0.d, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.index.nxv2i16( %pg, + i16* %base, + %b) + %res = zext %load to + ret %res +} + +define @gldnt1w_index( %pg, i32* %base, %b) { +; CHECK-LABEL: gldnt1w_index +; CHECK: lsl z0.d, z0.d, #2 +; CHECK-NEXT: ldnt1w { z0.d }, p0/z, [z0.d, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.index.nxv2i32( %pg, + i32* %base, + %b) + %res = zext %load to + ret %res +} + +define @gldnt1d_index( %pg, i64* %base, %b) { +; CHECK-LABEL: gldnt1d_index +; CHECK: lsl z0.d, z0.d, #3 +; CHECK-NEXT: ldnt1d { z0.d }, p0/z, [z0.d, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.index.nxv2i64( %pg, + i64* %base, + %b) + ret %load +} + +define @gldnt1d_index_double( %pg, double* %base, %b) { +; CHECK-LABEL: gldnt1d_index_double +; CHECK: lsl z0.d, z0.d, #3 +; CHECK-NEXT: ldnt1d { z0.d }, p0/z, [z0.d, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.index.nxv2f64( %pg, + double* %base, + %b) + ret %load +} + +; +; LDNT1SH, LDNT1SW: base + 64-bit index +; e.g. 
+; lsl z0.d, z0.d, #1 +; ldnt1sh z0.d, p0/z, [z0.d, x0] +; + +define @gldnt1sh_index( %pg, i16* %base, %b) { +; CHECK-LABEL: gldnt1sh_index +; CHECK: lsl z0.d, z0.d, #1 +; CHECK-NEXT: ldnt1sh { z0.d }, p0/z, [z0.d, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.index.nxv2i16( %pg, + i16* %base, + %b) + %res = sext %load to + ret %res +} + +define @gldnt1sw_index( %pg, i32* %base, %b) { +; CHECK-LABEL: gldnt1sw_index +; CHECK: lsl z0.d, z0.d, #2 +; CHECK-NEXT: ldnt1sw { z0.d }, p0/z, [z0.d, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.index.nxv2i32( %pg, + i32* %base, + %b) + %res = sext %load to + ret %res +} + +declare @llvm.aarch64.sve.ldnt1.gather.index.nxv2i16(, i16*, ) +declare @llvm.aarch64.sve.ldnt1.gather.index.nxv2i32(, i32*, ) +declare @llvm.aarch64.sve.ldnt1.gather.index.nxv2i64(, i64*, ) +declare @llvm.aarch64.sve.ldnt1.gather.index.nxv2f64(, double*, ) diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-scatter-stores-64bit-scaled-offset.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-scatter-stores-64bit-scaled-offset.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-scatter-stores-64bit-scaled-offset.ll @@ -0,0 +1,64 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s + +; +; STNT1H, STNT1W, STNT1D: base + 64-bit index +; e.g. 
+; lsl z1.d, z1.d, #1 +; stnt1h { z0.d }, p0, [z1.d, x0] +; + +define void @sstnt1h_index( %data, %pg, i16* %base, %offsets) { +; CHECK-LABEL: sstnt1h_index +; CHECK: lsl z1.d, z1.d, #1 +; CHECK-NEXT: stnt1h { z0.d }, p0, [z1.d, x0] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.stnt1.scatter.index.nxv2i16( %data_trunc, + %pg, + i16* %base, + %offsets) + ret void +} + +define void @sstnt1w_index( %data, %pg, i32* %base, %offsets) { +; CHECK-LABEL: sstnt1w_index +; CHECK: lsl z1.d, z1.d, #2 +; CHECK-NEXT: stnt1w { z0.d }, p0, [z1.d, x0] +; CHECK-NEXT: ret + %data_trunc = trunc %data to + call void @llvm.aarch64.sve.stnt1.scatter.index.nxv2i32( %data_trunc, + %pg, + i32* %base, + %offsets) + ret void +} + +define void @sstnt1d_index( %data, %pg, i64* %base, %offsets) { +; CHECK-LABEL: sstnt1d_index +; CHECK: lsl z1.d, z1.d, #3 +; CHECK-NEXT: stnt1d { z0.d }, p0, [z1.d, x0] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.stnt1.scatter.index.nxv2i64( %data, + %pg, + i64* %base, + %offsets) + ret void +} + +define void @sstnt1d_index_double( %data, %pg, double* %base, %offsets) { +; CHECK-LABEL: sstnt1d_index_double +; CHECK: lsl z1.d, z1.d, #3 +; CHECK-NEXT: stnt1d { z0.d }, p0, [z1.d, x0] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.stnt1.scatter.index.nxv2f64( %data, + %pg, + double* %base, + %offsets) + ret void +} + + +declare void @llvm.aarch64.sve.stnt1.scatter.index.nxv2i16(, , i16*, ) +declare void @llvm.aarch64.sve.stnt1.scatter.index.nxv2i32(, , i32*, ) +declare void @llvm.aarch64.sve.stnt1.scatter.index.nxv2i64(, , i64*, ) +declare void @llvm.aarch64.sve.stnt1.scatter.index.nxv2f64(, , double*, )