diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -1241,6 +1241,31 @@ ], [IntrWriteMem, IntrArgMemOnly]>; +// +// Non-temporal gather load/scatter store +// + +class SVE2_NTGatherLoad_VectorBase_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [ + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_anyvector_ty, + llvm_i64_ty + ], + [IntrReadMem, IntrArgMemOnly]>; + +class SVE2_NTScatterStore_VectorBase_Intrinsic + : Intrinsic<[], + [ + llvm_anyvector_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_anyvector_ty, llvm_i64_ty + ], + [IntrWriteMem, IntrArgMemOnly]>; + +def int_aarch64_sve_ldnt1_gather : SVE2_NTGatherLoad_VectorBase_Intrinsic; +def int_aarch64_sve_stnt1_scatter : SVE2_NTScatterStore_VectorBase_Intrinsic; + // // Loads // diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -240,6 +240,11 @@ GLD1S_UXTW_SCALED, GLD1S_SXTW_SCALED, GLD1S_IMM, + + // Non-temporal gather loads + GLDNT1, + GLDNT1S, + // Scatter store SST1, SST1_SCALED, @@ -249,6 +254,9 @@ SST1_SXTW_SCALED, SST1_IMM, + // Non-temporal scatter store + SSTNT1, + // Strict (exception-raising) floating point comparison STRICT_FCMP = ISD::FIRST_TARGET_STRICTFP_OPCODE, STRICT_FCMPE, diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1415,6 +1415,8 @@ case AArch64ISD::GLD1S_SXTW_SCALED: return "AArch64ISD::GLD1S_SXTW_SCALED"; case AArch64ISD::GLD1S_UXTW_SCALED: return "AArch64ISD::GLD1S_UXTW_SCALED"; case AArch64ISD::GLD1S_IMM: return "AArch64ISD::GLD1S_IMM"; + case AArch64ISD::GLDNT1: return "AArch64ISD::GLDNT1"; + case AArch64ISD::GLDNT1S: return "AArch64ISD::GLDNT1S"; case AArch64ISD::SST1: return "AArch64ISD::SST1"; case AArch64ISD::SST1_SCALED: return "AArch64ISD::SST1_SCALED"; case AArch64ISD::SST1_SXTW: return "AArch64ISD::SST1_SXTW"; @@ -1422,6 +1424,7 @@ case AArch64ISD::SST1_SXTW_SCALED: return "AArch64ISD::SST1_SXTW_SCALED"; case AArch64ISD::SST1_UXTW_SCALED: return "AArch64ISD::SST1_UXTW_SCALED"; case AArch64ISD::SST1_IMM: return "AArch64ISD::SST1_IMM"; + case AArch64ISD::SSTNT1: return "AArch64ISD::SSTNT1"; case AArch64ISD::LDP: return "AArch64ISD::LDP"; case AArch64ISD::STP: return "AArch64ISD::STP"; case AArch64ISD::STNP: return "AArch64ISD::STNP"; @@ -10348,6 +10351,7 @@ case AArch64ISD::GLD1_UXTW: case AArch64ISD::GLD1_UXTW_SCALED: case AArch64ISD::GLD1_IMM: + case AArch64ISD::GLDNT1: MemVT = cast(Src->getOperand(4))->getVT(); break; default: @@ -12508,7 +12512,7 @@ DAG.getConstant(MinOffset, DL, MVT::i64)); } -static SDValue performST1ScatterCombine(SDNode *N, SelectionDAG &DAG, +static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG, unsigned Opcode, bool OnlyPackedOffsets = true) { const SDValue Src = N->getOperand(2); @@ -12536,11 +12540,11 @@ SDValue Offset = N->getOperand(5); // SST1_IMM requires that the offset is an immediate: - // * multiple of #SizeInBytes - // * in the range [0, 31 x #SizeInBytes] - // where #SizeInBytes is the size in bytes of the stored - // items. For immediates outside that range and non-immediate scalar offsets use - // SST1 or SST1_UXTW instead. 
+ // * multiple of #SizeInBytes + // * in the range [0, 31 x #SizeInBytes] + // where #SizeInBytes is the size in bytes of the stored items. For + // immediates outside that range and non-immediate scalar offsets use SST1 or + // SST1_UXTW instead. if (Opcode == AArch64ISD::SST1_IMM) { uint64_t MaxIndex = 31; uint64_t SrcElSize = SrcElVT.getStoreSize().getKnownMinSize(); @@ -12576,7 +12580,8 @@ EVT HwSrcVt = getSVEContainerType(SrcVT); // Keep the original type of the input data to store - this is needed to - // differentiate between ST1B, ST1H, ST1W and ST1D. For FP values we want the + // differentiate between the actual data sizes and instructions, e.g. ST1B, + // ST1H, ST1W and ST1D for regular scatter stores. For FP values we want the // integer equivalent, so just use HwSrcVt. SDValue InputVT = DAG.getValueType(SrcVT); if (SrcVT.isFloatingPoint()) @@ -12600,7 +12605,7 @@ return DAG.getNode(Opcode, DL, VTs, Ops); } -static SDValue performLD1GatherCombine(SDNode *N, SelectionDAG &DAG, +static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG, unsigned Opcode, bool OnlyPackedOffsets = true) { EVT RetVT = N->getValueType(0); @@ -12608,6 +12613,7 @@ "Gather loads are only possible for SVE vectors"); SDLoc DL(N); + // Make sure that the loaded data will fit into an SVE register if (RetVT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock) return SDValue(); @@ -12619,9 +12625,9 @@ SDValue Offset = N->getOperand(4); // GLD1_IMM requires that the offset is an immediate: - // * multiple of #SizeInBytes - // * in the range [0, 31 x #SizeInBytes] - // where #SizeInBytes is the size in bytes of the loaded items. For immediates + // * multiple of #SizeInBytes + // * in the range [0, 31 x #SizeInBytes] + // where #SizeInBytes is the size in bytes of the loaded items. For immediates // outside that range and non-immediate scalar offsets use GLD1 or GLD1_UXTW // instead. if (Opcode == AArch64ISD::GLD1_IMM) { @@ -12658,10 +12664,10 @@ // Return value type that is representable in hardware EVT HwRetVt = getSVEContainerType(RetVT); - // Keep the original output value type around - this will better inform - // optimisations (e.g. instruction folding when load is followed by - // zext/sext). This will only be used for ints, so the value for FPs - // doesn't matter. + // Keep the original output value type around - this is needed to + // differentiate between the actual data sizes and instructions, e.g. LD1B, + // LD1H, LD1W and LD1D. For FP values we want the integer equivalent, so just + // use HwRetVT. 
SDValue OutVT = DAG.getValueType(RetVT); if (RetVT.isFloatingPoint()) OutVT = DAG.getValueType(HwRetVt); @@ -12685,7 +12691,6 @@ return DAG.getMergeValues({Load, LoadChain}, DL); } - static SDValue performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { @@ -12729,6 +12734,9 @@ case AArch64ISD::GLD1_IMM: NewOpc = AArch64ISD::GLD1S_IMM; break; + case AArch64ISD::GLDNT1: + NewOpc = AArch64ISD::GLDNT1S; + break; default: return SDValue(); } @@ -12842,48 +12850,52 @@ return performNEONPostLDSTCombine(N, DCI, DAG); case Intrinsic::aarch64_sve_ldnt1: return performLDNT1Combine(N, DAG); + case Intrinsic::aarch64_sve_ldnt1_gather: + return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1); case Intrinsic::aarch64_sve_ldnf1: return performLDNF1Combine(N, DAG, AArch64ISD::LDNF1); case Intrinsic::aarch64_sve_ldff1: return performLDNF1Combine(N, DAG, AArch64ISD::LDFF1); case Intrinsic::aarch64_sve_stnt1: return performSTNT1Combine(N, DAG); + case Intrinsic::aarch64_sve_stnt1_scatter: + return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1); case Intrinsic::aarch64_sve_ld1_gather: - return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1); + return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1); case Intrinsic::aarch64_sve_ld1_gather_index: - return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SCALED); + return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_SCALED); case Intrinsic::aarch64_sve_ld1_gather_sxtw: - return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SXTW, + return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_SXTW, /*OnlyPackedOffsets=*/false); case Intrinsic::aarch64_sve_ld1_gather_uxtw: - return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_UXTW, + return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_UXTW, /*OnlyPackedOffsets=*/false); case Intrinsic::aarch64_sve_ld1_gather_sxtw_index: - return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SXTW_SCALED, + return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_SXTW_SCALED, /*OnlyPackedOffsets=*/false); case Intrinsic::aarch64_sve_ld1_gather_uxtw_index: - return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_UXTW_SCALED, + return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_UXTW_SCALED, /*OnlyPackedOffsets=*/false); case Intrinsic::aarch64_sve_ld1_gather_scalar_offset: - return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_IMM); + return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_IMM); case Intrinsic::aarch64_sve_st1_scatter: - return performST1ScatterCombine(N, DAG, AArch64ISD::SST1); + return performScatterStoreCombine(N, DAG, AArch64ISD::SST1); case Intrinsic::aarch64_sve_st1_scatter_index: - return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_SCALED); + return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SCALED); case Intrinsic::aarch64_sve_st1_scatter_sxtw: - return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_SXTW, + return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SXTW, /*OnlyPackedOffsets=*/false); case Intrinsic::aarch64_sve_st1_scatter_uxtw: - return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_UXTW, + return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_UXTW, /*OnlyPackedOffsets=*/false); case Intrinsic::aarch64_sve_st1_scatter_sxtw_index: - return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_SXTW_SCALED, + return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SXTW_SCALED, /*OnlyPackedOffsets=*/false); case Intrinsic::aarch64_sve_st1_scatter_uxtw_index: - return 
performST1ScatterCombine(N, DAG, AArch64ISD::SST1_UXTW_SCALED, + return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_UXTW_SCALED, /*OnlyPackedOffsets=*/false); case Intrinsic::aarch64_sve_st1_scatter_scalar_offset: - return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_IMM); + return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_IMM); default: break; } diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -15,6 +15,7 @@ SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1> ]>; +// Gather Loads def SDT_AArch64_GLD1 : SDTypeProfile<1, 4, [ SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>, SDTCisVec<3>, SDTCisVT<4, OtherVT>, SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1> @@ -25,6 +26,12 @@ SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1> ]>; +def SDT_AArch64_GLDNT1 : SDTypeProfile<1, 4, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>, SDTCisVT<4, OtherVT>, + SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1> +]>; + +// Scatter Stores def SDT_AArch64_SST1 : SDTypeProfile<0, 5, [ SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>, SDTCisVec<3>, SDTCisVT<4, OtherVT>, SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1> @@ -35,6 +42,11 @@ SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1> ]>; +def SDT_AArch64_SSTNT1 : SDTypeProfile<0, 5, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>, SDTCisVT<4, OtherVT>, + SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1> +]>; + def AArch64st1_scatter : SDNode<"AArch64ISD::SST1", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>; def AArch64st1_scatter_scaled : SDNode<"AArch64ISD::SST1_SCALED", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>; def AArch64st1_scatter_uxtw : SDNode<"AArch64ISD::SST1_UXTW", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>; @@ -53,6 +65,11 @@ def AArch64ld1_gather_sxtw_scaled : SDNode<"AArch64ISD::GLD1_SXTW_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; def AArch64ld1_gather_imm : SDNode<"AArch64ISD::GLD1_IMM", SDT_AArch64_GLD1_IMM, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; +def AArch64ldnt1_gather : SDNode<"AArch64ISD::GLDNT1", SDT_AArch64_GLDNT1, [SDNPHasChain, SDNPMayLoad]>; +def AArch64ldnt1s_gather : SDNode<"AArch64ISD::GLDNT1S", SDT_AArch64_GLDNT1, [SDNPHasChain, SDNPMayLoad]>; + +def AArch64stnt1_scatter : SDNode<"AArch64ISD::SSTNT1", SDT_AArch64_SSTNT1, [SDNPHasChain, SDNPMayStore]>; + // SVE CNT/INC/RDVL def sve_rdvl_imm : ComplexPattern">; def sve_cnth_imm : ComplexPattern">; @@ -1752,32 +1769,32 @@ def EXT_ZZI_B : sve2_int_perm_extract_i_cons<"ext">; // SVE2 non-temporal gather loads - defm LDNT1SB_ZZR_S : sve2_mem_gldnt_vs<0b00000, "ldnt1sb", Z_s, ZPR32>; - defm LDNT1B_ZZR_S : sve2_mem_gldnt_vs<0b00001, "ldnt1b", Z_s, ZPR32>; - defm LDNT1SH_ZZR_S : sve2_mem_gldnt_vs<0b00100, "ldnt1sh", Z_s, ZPR32>; - defm LDNT1H_ZZR_S : sve2_mem_gldnt_vs<0b00101, "ldnt1h", Z_s, ZPR32>; - defm LDNT1W_ZZR_S : sve2_mem_gldnt_vs<0b01001, "ldnt1w", Z_s, ZPR32>; - - defm LDNT1SB_ZZR_D : sve2_mem_gldnt_vs<0b10000, "ldnt1sb", Z_d, ZPR64>; - defm LDNT1B_ZZR_D : sve2_mem_gldnt_vs<0b10010, "ldnt1b", Z_d, ZPR64>; - defm LDNT1SH_ZZR_D : sve2_mem_gldnt_vs<0b10100, "ldnt1sh", Z_d, ZPR64>; - defm LDNT1H_ZZR_D : sve2_mem_gldnt_vs<0b10110, "ldnt1h", Z_d, ZPR64>; - defm LDNT1SW_ZZR_D : sve2_mem_gldnt_vs<0b11000, "ldnt1sw", Z_d, ZPR64>; - defm LDNT1W_ZZR_D : sve2_mem_gldnt_vs<0b11010, "ldnt1w", Z_d, ZPR64>; - defm LDNT1D_ZZR_D 
: sve2_mem_gldnt_vs<0b11110, "ldnt1d", Z_d, ZPR64>; + defm LDNT1SB_ZZR_S : sve2_mem_gldnt_32b_ptrs<0b00000, "ldnt1sb", AArch64ldnt1s_gather, nxv4i8>; + defm LDNT1B_ZZR_S : sve2_mem_gldnt_32b_ptrs<0b00001, "ldnt1b", AArch64ldnt1_gather, nxv4i8>; + defm LDNT1SH_ZZR_S : sve2_mem_gldnt_32b_ptrs<0b00100, "ldnt1sh", AArch64ldnt1s_gather, nxv4i16>; + defm LDNT1H_ZZR_S : sve2_mem_gldnt_32b_ptrs<0b00101, "ldnt1h", AArch64ldnt1_gather, nxv4i16>; + defm LDNT1W_ZZR_S : sve2_mem_gldnt_32b_ptrs<0b01001, "ldnt1w", AArch64ldnt1_gather, nxv4i32>; + + defm LDNT1SB_ZZR_D : sve2_mem_gldnt_64b_ptrs<0b10000, "ldnt1sb", AArch64ldnt1s_gather, nxv2i8>; + defm LDNT1B_ZZR_D : sve2_mem_gldnt_64b_ptrs<0b10010, "ldnt1b", AArch64ldnt1_gather, nxv2i8>; + defm LDNT1SH_ZZR_D : sve2_mem_gldnt_64b_ptrs<0b10100, "ldnt1sh", AArch64ldnt1s_gather, nxv2i16>; + defm LDNT1H_ZZR_D : sve2_mem_gldnt_64b_ptrs<0b10110, "ldnt1h", AArch64ldnt1_gather, nxv2i16>; + defm LDNT1SW_ZZR_D : sve2_mem_gldnt_64b_ptrs<0b11000, "ldnt1sw", AArch64ldnt1s_gather, nxv2i32>; + defm LDNT1W_ZZR_D : sve2_mem_gldnt_64b_ptrs<0b11010, "ldnt1w", AArch64ldnt1_gather, nxv2i32>; + defm LDNT1D_ZZR_D : sve2_mem_gldnt_64b_ptrs<0b11110, "ldnt1d", AArch64ldnt1_gather, nxv2i64>; // SVE2 vector splice (constructive) defm SPLICE_ZPZZ : sve2_int_perm_splice_cons<"splice">; // SVE2 non-temporal scatter stores - defm STNT1B_ZZR_S : sve2_mem_sstnt_vs<0b001, "stnt1b", Z_s, ZPR32>; - defm STNT1H_ZZR_S : sve2_mem_sstnt_vs<0b011, "stnt1h", Z_s, ZPR32>; - defm STNT1W_ZZR_S : sve2_mem_sstnt_vs<0b101, "stnt1w", Z_s, ZPR32>; - - defm STNT1B_ZZR_D : sve2_mem_sstnt_vs<0b000, "stnt1b", Z_d, ZPR64>; - defm STNT1H_ZZR_D : sve2_mem_sstnt_vs<0b010, "stnt1h", Z_d, ZPR64>; - defm STNT1W_ZZR_D : sve2_mem_sstnt_vs<0b100, "stnt1w", Z_d, ZPR64>; - defm STNT1D_ZZR_D : sve2_mem_sstnt_vs<0b110, "stnt1d", Z_d, ZPR64>; + defm STNT1B_ZZR_S : sve2_mem_sstnt_32b_ptrs<0b001, "stnt1b", AArch64stnt1_scatter, nxv4i8>; + defm STNT1H_ZZR_S : sve2_mem_sstnt_32b_ptrs<0b011, "stnt1h", AArch64stnt1_scatter, nxv4i16>; + defm STNT1W_ZZR_S : sve2_mem_sstnt_32b_ptrs<0b101, "stnt1w", AArch64stnt1_scatter, nxv4i32>; + + defm STNT1B_ZZR_D : sve2_mem_sstnt_64b_ptrs<0b000, "stnt1b", AArch64stnt1_scatter, nxv2i8>; + defm STNT1H_ZZR_D : sve2_mem_sstnt_64b_ptrs<0b010, "stnt1h", AArch64stnt1_scatter, nxv2i16>; + defm STNT1W_ZZR_D : sve2_mem_sstnt_64b_ptrs<0b100, "stnt1w", AArch64stnt1_scatter, nxv2i32>; + defm STNT1D_ZZR_D : sve2_mem_sstnt_64b_ptrs<0b110, "stnt1d", AArch64stnt1_scatter, nxv2i64>; // SVE2 table lookup (three sources) defm TBL_ZZZZ : sve2_int_perm_tbl<"tbl">; diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -4944,16 +4944,36 @@ let mayStore = 1; } -multiclass sve2_mem_sstnt_vs opc, string asm, - RegisterOperand listty, ZPRRegOp zprty> { - def _REAL : sve2_mem_sstnt_vs_base; +multiclass sve2_mem_sstnt_32b_ptrs opc, string asm, + SDPatternOperator op, + ValueType vt> { + def _REAL : sve2_mem_sstnt_vs_base; def : InstAlias(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>; + (!cast(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, GPR64:$Rm), 0>; def : InstAlias(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 0>; + (!cast(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, XZR), 0>; def : InstAlias(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 1>; + (!cast(NAME # _REAL) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, XZR), 1>; + + def : Pat <(op 
(nxv4i32 ZPR32:$Zt), (nxv4i1 PPR3bAny:$Pg), (nxv4i32 ZPR32:$Zn), (i64 GPR64:$Rm), vt), + (!cast(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, GPR64:$Rm)>; +} + +multiclass sve2_mem_sstnt_64b_ptrs opc, string asm, + SDPatternOperator op, + ValueType vt> { + def _REAL : sve2_mem_sstnt_vs_base; + + def : InstAlias(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm), 0>; + def : InstAlias(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 0>; + def : InstAlias(NAME # _REAL) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 1>; + + def : Pat <(op (nxv2i64 ZPR64:$Zt), (nxv2i1 PPR3bAny:$Pg), (nxv2i64 ZPR64:$Zn), (i64 GPR64:$Rm), vt), + (!cast(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm)>; } class sve_mem_sst_sv opc, bit xs, bit scaled, string asm, @@ -6377,17 +6397,38 @@ let mayLoad = 1; } -multiclass sve2_mem_gldnt_vs opc, string asm, - RegisterOperand listty, ZPRRegOp zprty> { - def _REAL : sve2_mem_gldnt_vs_base; +multiclass sve2_mem_gldnt_32b_ptrs opc, string asm, + SDPatternOperator op, + ValueType vt> { + def _REAL : sve2_mem_gldnt_vs_base; + + def : InstAlias(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, GPR64:$Rm), 0>; + def : InstAlias(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, XZR), 0>; + def : InstAlias(NAME # _REAL) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, XZR), 1>; + + def : Pat <(nxv4i32 (op (nxv4i1 PPR3bAny:$Pg), (nxv4i32 ZPR32:$Zd), (i64 GPR64:$Rm), vt)), + (!cast(NAME # _REAL) PPR3bAny:$Pg, ZPR32:$Zd, GPR64:$Rm)>; +} + +multiclass sve2_mem_gldnt_64b_ptrs opc, string asm, + SDPatternOperator op, + ValueType vt> { + def _REAL : sve2_mem_gldnt_vs_base; def : InstAlias(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>; + (!cast(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm), 0>; def : InstAlias(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 0>; + (!cast(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 0>; def : InstAlias(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 1>; + (!cast(NAME # _REAL) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 1>; + + def : Pat <(nxv2i64 (op (nxv2i1 PPR3bAny:$Pg), (nxv2i64 ZPR64:$Zd), (i64 GPR64:$Rm), vt)), + (!cast(NAME # _REAL) PPR3bAny:$Pg, ZPR64:$Zd, GPR64:$Rm)>; } //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-gather-loads.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-gather-loads.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-gather-loads.ll @@ -0,0 +1,188 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s + +; +; LDNT1B, LDNT1W, LDNT1H, LDNT1D: vector base + scalar offset +; ldnt1b { z0.s }, p0/z, [z0.s, x0] +; + +; LDNT1B +define @gldnt1b_s( %pg, %base, i64 %offset) { +; CHECK-LABEL: gldnt1b_s: +; CHECK: ldnt1b { z0.s }, p0/z, [z0.s, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.nxv4i8.nxv4i32( %pg, + %base, + i64 %offset) + %res = zext %load to + ret %res +} + +define @gldnt1b_d( %pg, %base, i64 %offset) { +; CHECK-LABEL: gldnt1b_d: +; CHECK: ldnt1b { z0.d }, p0/z, [z0.d, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.nxv2i8.nxv2i64( %pg, + %base, + i64 %offset) + %res = zext %load to + ret %res +} + +; LDNT1H +define @gldnt1h_s( %pg, %base, i64 %offset) { +; CHECK-LABEL: gldnt1h_s: +; CHECK: ldnt1h { z0.s }, p0/z, [z0.s, x0] +; CHECK-NEXT: ret + %load = call @llvm.aarch64.sve.ldnt1.gather.nxv416.nxv4i32( %pg, + %base, + i64 %offset) + %res = zext %load to + ret %res 
+}
+
+define <vscale x 2 x i64> @gldnt1h_d(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: gldnt1h_d:
+; CHECK: ldnt1h { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ldnt1.gather.nxv2i16.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset)
+  %res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+; LDNT1W
+define <vscale x 4 x i32> @gldnt1w_s(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset) {
+; CHECK-LABEL: gldnt1w_s:
+; CHECK: ldnt1w { z0.s }, p0/z, [z0.s, x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 4 x i32> @llvm.aarch64.sve.ldnt1.gather.nxv4i32.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset)
+  ret <vscale x 4 x i32> %load
+}
+
+define <vscale x 4 x float> @gldnt1w_s_float(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset) {
+; CHECK-LABEL: gldnt1w_s_float:
+; CHECK: ldnt1w { z0.s }, p0/z, [z0.s, x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 4 x float> @llvm.aarch64.sve.ldnt1.gather.nxv4f32.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset)
+  ret <vscale x 4 x float> %load
+}
+
+define <vscale x 2 x i64> @gldnt1w_d(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: gldnt1w_d:
+; CHECK: ldnt1w { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldnt1.gather.nxv2i32.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset)
+  %res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+; LDNT1D
+define <vscale x 2 x i64> @gldnt1d_d(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: gldnt1d_d:
+; CHECK: ldnt1d { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.gather.nxv2i64.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset)
+  ret <vscale x 2 x i64> %load
+}
+
+; LDNT1D
+define <vscale x 2 x double> @gldnt1d_d_double(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: gldnt1d_d_double:
+; CHECK: ldnt1d { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 2 x double> @llvm.aarch64.sve.ldnt1.gather.nxv2f64.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset)
+  ret <vscale x 2 x double> %load
+}
+
+;
+; LDNT1SB, LDNT1SH, LDNT1SW: vector base + scalar offset
+;   ldnt1sb { z0.s }, p0/z, [z0.s, x0]
+;
+
+; LDNT1SB
+define <vscale x 4 x i32> @gldnt1sb_s(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset) {
+; CHECK-LABEL: gldnt1sb_s:
+; CHECK: ldnt1sb { z0.s }, p0/z, [z0.s, x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 4 x i8> @llvm.aarch64.sve.ldnt1.gather.nxv4i8.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset)
+  %res = sext <vscale x 4 x i8> %load to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 2 x i64> @gldnt1sb_d(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: gldnt1sb_d:
+; CHECK: ldnt1sb { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 2 x i8> @llvm.aarch64.sve.ldnt1.gather.nxv2i8.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset)
+  %res = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+; LDNT1SH
+define <vscale x 4 x i32> @gldnt1sh_s(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset) {
+; CHECK-LABEL: gldnt1sh_s:
+; CHECK: ldnt1sh { z0.s }, p0/z, [z0.s, x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 4 x i16> @llvm.aarch64.sve.ldnt1.gather.nxv4i16.nxv4i32(<vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset)
+  %res = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 2 x i64> @gldnt1sh_d(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: gldnt1sh_d:
+; CHECK: ldnt1sh { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 2 x i16> @llvm.aarch64.sve.ldnt1.gather.nxv2i16.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset)
+  %res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+; LDNT1SW
+define <vscale x 2 x i64> @gldnt1sw_d(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: gldnt1sw_d:
+; CHECK: ldnt1sw { z0.d }, p0/z, [z0.d, x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldnt1.gather.nxv2i32.nxv2i64(<vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset)
+  %res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+; LDNT1B/LDNT1SB
+declare <vscale x 4 x i8> @llvm.aarch64.sve.ldnt1.gather.nxv4i8.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, i64)
+declare <vscale x 2 x i8> @llvm.aarch64.sve.ldnt1.gather.nxv2i8.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+
+; LDNT1H/LDNT1SH
+declare <vscale x 4 x i16> @llvm.aarch64.sve.ldnt1.gather.nxv4i16.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, i64)
+declare <vscale x 2 x i16> @llvm.aarch64.sve.ldnt1.gather.nxv2i16.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+
+; LDNT1W/LDNT1SW
+declare <vscale x 4 x i32> @llvm.aarch64.sve.ldnt1.gather.nxv4i32.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, i64)
+declare <vscale x 2 x i32> @llvm.aarch64.sve.ldnt1.gather.nxv2i32.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+
+declare <vscale x 4 x float> @llvm.aarch64.sve.ldnt1.gather.nxv4f32.nxv4i32(<vscale x 4 x i1>, <vscale x 4 x i32>, i64)
+
+; LDNT1D
+declare <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.gather.nxv2i64.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+
+declare <vscale x 2 x double> @llvm.aarch64.sve.ldnt1.gather.nxv2f64.nxv2i64(<vscale x 2 x i1>, <vscale x 2 x i64>, i64)
diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-scatter-stores.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-scatter-stores.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-nt-scatter-stores.ll
@@ -0,0 +1,134 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s
+
+;
+; STNT1B, STNT1W, STNT1H, STNT1D: vector base + scalar offset
+;   stnt1b { z0.s }, p0, [z1.s, x0]
+;
+
+; STNT1B
+define void @stnt1b_s(<vscale x 4 x i32> %data, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset) {
+; CHECK-LABEL: stnt1b_s:
+; CHECK: stnt1b { z0.s }, p0, [z1.s, x0]
+; CHECK-NEXT: ret
+  %data_trunc = trunc <vscale x 4 x i32> %data to <vscale x 4 x i8>
+  call void @llvm.aarch64.sve.stnt1.scatter.nxv4i8.nxv4i32(<vscale x 4 x i8> %data_trunc, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset)
+  ret void
+}
+
+define void @stnt1b_d(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: stnt1b_d:
+; CHECK: stnt1b { z0.d }, p0, [z1.d, x0]
+; CHECK-NEXT: ret
+  %data_trunc = trunc <vscale x 2 x i64> %data to <vscale x 2 x i8>
+  call void @llvm.aarch64.sve.stnt1.scatter.nxv2i8.nxv2i64(<vscale x 2 x i8> %data_trunc, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset)
+  ret void
+}
+
+; STNT1H
+define void @stnt1h_s(<vscale x 4 x i32> %data, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset) {
+; CHECK-LABEL: stnt1h_s:
+; CHECK: stnt1h { z0.s }, p0, [z1.s, x0]
+; CHECK-NEXT: ret
+  %data_trunc = trunc <vscale x 4 x i32> %data to <vscale x 4 x i16>
+  call void @llvm.aarch64.sve.stnt1.scatter.nxv4i16.nxv4i32(<vscale x 4 x i16> %data_trunc, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset)
+  ret void
+}
+
+define void @stnt1h_d(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: stnt1h_d:
+; CHECK: stnt1h { z0.d }, p0, [z1.d, x0]
+; CHECK-NEXT: ret
+  %data_trunc = trunc <vscale x 2 x i64> %data to <vscale x 2 x i16>
+  call void @llvm.aarch64.sve.stnt1.scatter.nxv2i16.nxv2i64(<vscale x 2 x i16> %data_trunc, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset)
+  ret void
+}
+
+; STNT1W
+define void @stnt1w_s(<vscale x 4 x i32> %data, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset) {
+; CHECK-LABEL: stnt1w_s:
+; CHECK: stnt1w { z0.s }, p0, [z1.s, x0]
+; CHECK-NEXT: ret
+  call void @llvm.aarch64.sve.stnt1.scatter.nxv4i32.nxv4i32(<vscale x 4 x i32> %data, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset)
+  ret void
+}
+
+define void @stnt1w_f32_s(<vscale x 4 x float> %data, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset) {
+; CHECK-LABEL: stnt1w_f32_s:
+; CHECK: stnt1w { z0.s }, p0, [z1.s, x0]
+; CHECK-NEXT: ret
+  call void @llvm.aarch64.sve.stnt1.scatter.nxv4f32.nxv4i32(<vscale x 4 x float> %data, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %base, i64 %offset)
+  ret void
+}
+
+define void @stnt1w_d(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: stnt1w_d:
+; CHECK: stnt1w { z0.d }, p0, [z1.d, x0]
+; CHECK-NEXT: ret
+  %data_trunc = trunc <vscale x 2 x i64> %data to <vscale x 2 x i32>
+  call void @llvm.aarch64.sve.stnt1.scatter.nxv2i32.nxv2i64(<vscale x 2 x i32> %data_trunc, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset)
+  ret void
+}
+
+; STNT1D
+define void @stnt1d_d(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: stnt1d_d:
+; CHECK: stnt1d { z0.d }, p0, [z1.d, x0]
+; CHECK-NEXT: ret
+  call void @llvm.aarch64.sve.stnt1.scatter.nxv2i64.nxv2i64(<vscale x 2 x i64> %data, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset)
+  ret void
+}
+
+define void @stnt1d_f64_d(<vscale x 2 x double> %data, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset) {
+; CHECK-LABEL: stnt1d_f64_d:
+; CHECK: stnt1d { z0.d }, p0, [z1.d, x0]
+; CHECK-NEXT: ret
+  call void @llvm.aarch64.sve.stnt1.scatter.nxv2f64.nxv2i64(<vscale x 2 x double> %data, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %base, i64 %offset)
+  ret void
+}
+
+; STNT1B
+declare void @llvm.aarch64.sve.stnt1.scatter.nxv2i8.nxv2i64(<vscale x 2 x i8>, <vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+declare void @llvm.aarch64.sve.stnt1.scatter.nxv4i8.nxv4i32(<vscale x 4 x i8>, <vscale x 4 x i1>, <vscale x 4 x i32>, i64)
+
+; STNT1H
+declare void @llvm.aarch64.sve.stnt1.scatter.nxv2i16.nxv2i64(<vscale x 2 x i16>, <vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+declare void @llvm.aarch64.sve.stnt1.scatter.nxv4i16.nxv4i32(<vscale x 4 x i16>, <vscale x 4 x i1>, <vscale x 4 x i32>, i64)
+
+; STNT1W
+declare void @llvm.aarch64.sve.stnt1.scatter.nxv2i32.nxv2i64(<vscale x 2 x i32>, <vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+declare void @llvm.aarch64.sve.stnt1.scatter.nxv4i32.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, <vscale x 4 x i32>, i64)
+
+declare void @llvm.aarch64.sve.stnt1.scatter.nxv4f32.nxv4i32(<vscale x 4 x float>, <vscale x 4 x i1>, <vscale x 4 x i32>, i64)
+
+; STNT1D
+declare void @llvm.aarch64.sve.stnt1.scatter.nxv2i64.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+
+declare void @llvm.aarch64.sve.stnt1.scatter.nxv2f32.nxv2i64(<vscale x 2 x float>, <vscale x 2 x i1>, <vscale x 2 x i64>, i64)
+declare void @llvm.aarch64.sve.stnt1.scatter.nxv2f64.nxv2i64(<vscale x 2 x double>, <vscale x 2 x i1>, <vscale x 2 x i64>, i64)
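
Usage sketch (illustrative, not part of the patch): at the source level, non-temporal gathers and scatters with a vector of bases plus a scalar byte offset are exposed through the SVE2 ACLE. The snippet below assumes the arm_sve.h spellings svldnt1_gather_u32base_offset_u32 and svstnt1_scatter_u32base_offset_u32 and a compiler with SVE2 enabled (e.g. -march=armv8-a+sve2); such calls are expected to reach the backend as the llvm.aarch64.sve.ldnt1.gather / llvm.aarch64.sve.stnt1.scatter intrinsics introduced here and to select the LDNT1W/STNT1W forms exercised by the tests above.

    #include <arm_sve.h>

    // Gather 32-bit elements from [bases.s + offset] without allocating them
    // in the cache, then scatter them back one element (4 bytes) further on.
    void copy_nt(svbool_t pg, svuint32_t bases, int64_t offset) {
      svuint32_t v = svldnt1_gather_u32base_offset_u32(pg, bases, offset);
      svstnt1_scatter_u32base_offset_u32(pg, bases, offset + 4, v);
    }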