Index: llvm/lib/Target/AArch64/AArch64.td =================================================================== --- llvm/lib/Target/AArch64/AArch64.td +++ llvm/lib/Target/AArch64/AArch64.td @@ -103,6 +103,28 @@ def FeatureSVE : SubtargetFeature<"sve", "HasSVE", "true", "Enable Scalable Vector Extension (SVE) instructions", [FeatureFullFP16]>; +// This flag is currently still labeled as Experimental, but when fully +// implemented this should tell the compiler to use the zeroing pseudos to +// benefit from the reverse instructions (e.g. SUB vs SUBR) if the inactive +// lanes are known to be zero. The pseudos will then be expanded using the +// MOVPRFX instruction to zero the inactive lanes. This feature should only be +// enabled if MOVPRFX instructions are known to merge with the destructive +// operations they prefix. +// +// This feature could similarly be extended to support cheap merging of _any_ +// value into the inactive lanes using the MOVPRFX instruction that uses +// merging-predication. +// +// It is recommended to create a separate feature if we want to benefit from the +// reverse instructions without assuming cheap MOVPRFX when the inactive lanes +// are `undef` or when the predicate is known to always be PTRUE. 
+def FeatureExperimentalZeroingPseudos + : SubtargetFeature<"use-experimental-zeroing-pseudos", + "UseExperimentalZeroingPseudos", "true", + "Hint to the compiler that the MOVPRFX instruction is " + "merged with destructive operations", + []>; + def FeatureSVE2 : SubtargetFeature<"sve2", "HasSVE2", "true", "Enable Scalable Vector Extension 2 (SVE2) instructions", [FeatureSVE]>; Index: llvm/lib/Target/AArch64/AArch64InstrInfo.td =================================================================== --- llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -155,6 +155,8 @@ def IsLE : Predicate<"Subtarget->isLittleEndian()">; def IsBE : Predicate<"!Subtarget->isLittleEndian()">; def IsWindows : Predicate<"Subtarget->isTargetWindows()">; +def UseExperimentalZeroingPseudos + : Predicate<"Subtarget->useExperimentalZeroingPseudos()">; def UseAlternateSExtLoadCVTF32 : Predicate<"Subtarget->useAlternateSExtLoadCVTF32Pattern()">; Index: llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td =================================================================== --- llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -222,9 +222,11 @@ defm SUB_ZPmZ : sve_int_bin_pred_arit_0<0b001, "sub", "SUB_ZPZZ", int_aarch64_sve_sub, DestructiveBinaryCommWithRev, "SUBR_ZPmZ", 1>; defm SUBR_ZPmZ : sve_int_bin_pred_arit_0<0b011, "subr", "SUBR_ZPZZ", int_aarch64_sve_subr, DestructiveBinaryCommWithRev, "SUB_ZPmZ", 0>; - defm ADD_ZPZZ : sve_int_bin_pred_zx; - defm SUB_ZPZZ : sve_int_bin_pred_zx; - defm SUBR_ZPZZ : sve_int_bin_pred_zx; + let Predicates = [HasSVE, UseExperimentalZeroingPseudos] in { + defm ADD_ZPZZ : sve_int_bin_pred_zx; + defm SUB_ZPZZ : sve_int_bin_pred_zx; + defm SUBR_ZPZZ : sve_int_bin_pred_zx; + } defm ORR_ZPmZ : sve_int_bin_pred_log<0b000, "orr", int_aarch64_sve_orr>; defm EOR_ZPmZ : sve_int_bin_pred_log<0b001, "eor", int_aarch64_sve_eor>; @@ -345,18 +347,20 @@ defm FDIVR_ZPmZ : 
sve_fp_2op_p_zds<0b1100, "fdivr", "FDIVR_ZPZZ", int_aarch64_sve_fdivr, DestructiveBinaryCommWithRev, "FDIV_ZPmZ", 0>; defm FDIV_ZPmZ : sve_fp_2op_p_zds<0b1101, "fdiv", "FDIV_ZPZZ", int_aarch64_sve_fdiv, DestructiveBinaryCommWithRev, "FDIVR_ZPmZ", 1>; - defm FADD_ZPZZ : sve_fp_2op_p_zds_zx; - defm FSUB_ZPZZ : sve_fp_2op_p_zds_zx; - defm FMUL_ZPZZ : sve_fp_2op_p_zds_zx; - defm FSUBR_ZPZZ : sve_fp_2op_p_zds_zx; - defm FMAXNM_ZPZZ : sve_fp_2op_p_zds_zx; - defm FMINNM_ZPZZ : sve_fp_2op_p_zds_zx; - defm FMAX_ZPZZ : sve_fp_2op_p_zds_zx; - defm FMIN_ZPZZ : sve_fp_2op_p_zds_zx; - defm FABD_ZPZZ : sve_fp_2op_p_zds_zx; - defm FMULX_ZPZZ : sve_fp_2op_p_zds_zx; - defm FDIVR_ZPZZ : sve_fp_2op_p_zds_zx; - defm FDIV_ZPZZ : sve_fp_2op_p_zds_zx; + let Predicates = [HasSVE, UseExperimentalZeroingPseudos] in { + defm FADD_ZPZZ : sve_fp_2op_p_zds_zx; + defm FSUB_ZPZZ : sve_fp_2op_p_zds_zx; + defm FMUL_ZPZZ : sve_fp_2op_p_zds_zx; + defm FSUBR_ZPZZ : sve_fp_2op_p_zds_zx; + defm FMAXNM_ZPZZ : sve_fp_2op_p_zds_zx; + defm FMINNM_ZPZZ : sve_fp_2op_p_zds_zx; + defm FMAX_ZPZZ : sve_fp_2op_p_zds_zx; + defm FMIN_ZPZZ : sve_fp_2op_p_zds_zx; + defm FABD_ZPZZ : sve_fp_2op_p_zds_zx; + defm FMULX_ZPZZ : sve_fp_2op_p_zds_zx; + defm FDIVR_ZPZZ : sve_fp_2op_p_zds_zx; + defm FDIV_ZPZZ : sve_fp_2op_p_zds_zx; + } defm FADD_ZZZ : sve_fp_3op_u_zd<0b000, "fadd", fadd>; defm FSUB_ZZZ : sve_fp_3op_u_zd<0b001, "fsub", fsub>; @@ -1223,11 +1227,6 @@ defm LSL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0011, "lsl">; defm ASRD_ZPmI : sve_int_bin_pred_shift_imm_right<0b0100, "asrd", "ASRD_ZPZI", int_aarch64_sve_asrd>; - defm ASR_ZPZZ : sve_int_bin_pred_zx; - defm LSR_ZPZZ : sve_int_bin_pred_zx; - defm LSL_ZPZZ : sve_int_bin_pred_zx; - defm ASRD_ZPZI : sve_int_bin_pred_shift_0_right_zx; - defm ASR_ZPmZ : sve_int_bin_pred_shift<0b000, "asr", "ASR_ZPZZ", AArch64asr_m1, "ASRR_ZPmZ", 1>; defm LSR_ZPmZ : sve_int_bin_pred_shift<0b001, "lsr", "LSR_ZPZZ", AArch64lsr_m1, "LSRR_ZPmZ", 1>; defm LSL_ZPmZ : 
sve_int_bin_pred_shift<0b011, "lsl", "LSL_ZPZZ", AArch64lsl_m1, "LSLR_ZPmZ", 1>; @@ -1235,6 +1234,13 @@ defm LSRR_ZPmZ : sve_int_bin_pred_shift<0b101, "lsrr", "LSRR_ZPZZ", null_frag, "LSR_ZPmZ", 0>; defm LSLR_ZPmZ : sve_int_bin_pred_shift<0b111, "lslr", "LSLR_ZPZZ", null_frag, "LSL_ZPmZ", 0>; + let Predicates = [HasSVE, UseExperimentalZeroingPseudos] in { + defm ASR_ZPZZ : sve_int_bin_pred_zx; + defm LSR_ZPZZ : sve_int_bin_pred_zx; + defm LSL_ZPZZ : sve_int_bin_pred_zx; + defm ASRD_ZPZI : sve_int_bin_pred_shift_imm_right_zx; + } + defm ASR_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b000, "asr", int_aarch64_sve_asr_wide>; defm LSR_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b001, "lsr", int_aarch64_sve_lsr_wide>; defm LSL_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b011, "lsl", int_aarch64_sve_lsl_wide>; @@ -2230,6 +2236,14 @@ defm URSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1101, "urshr", "URSHR_ZPZI", int_aarch64_sve_urshr>; defm SQSHLU_ZPmI : sve2_int_bin_pred_shift_imm_left< 0b1111, "sqshlu", "SQSHLU_ZPZI", int_aarch64_sve_sqshlu>; + let Predicates = [HasSVE2, UseExperimentalZeroingPseudos] in { + defm SQSHL_ZPZI : sve_int_bin_pred_shift_imm_left_zx; + defm UQSHL_ZPZI : sve_int_bin_pred_shift_imm_left_zx; + defm SRSHR_ZPZI : sve_int_bin_pred_shift_imm_right_zx; + defm URSHR_ZPZI : sve_int_bin_pred_shift_imm_right_zx; + defm SQSHLU_ZPZI : sve_int_bin_pred_shift_imm_left_zx; + } + // SVE2 integer add/subtract long defm SADDLB_ZZZ : sve2_wide_int_arith_long<0b00000, "saddlb", int_aarch64_sve_saddlb>; defm SADDLT_ZZZ : sve2_wide_int_arith_long<0b00001, "saddlt", int_aarch64_sve_saddlt>; Index: llvm/lib/Target/AArch64/AArch64Subtarget.h =================================================================== --- llvm/lib/Target/AArch64/AArch64Subtarget.h +++ llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -104,6 +104,10 @@ bool HasPAN_RWV = false; bool HasCCPP = false; + // Armv8.2 SVE extensions + bool HasSVE = false; + bool UseExperimentalZeroingPseudos = false; + // Armv8.2 
Crypto extensions bool HasSM4 = false; bool HasSHA3 = false; @@ -130,8 +134,6 @@ bool HasRCPC_IMMO = false; bool HasLSLFast = false; - bool HasSVE = false; - bool HasSVE2 = false; bool HasRCPC = false; bool HasAggressiveFMA = false; @@ -158,6 +160,7 @@ bool HasEnhancedCounterVirtualization = false; // Arm SVE2 extensions + bool HasSVE2 = false; bool HasSVE2AES = false; bool HasSVE2SM4 = false; bool HasSVE2SHA3 = false; @@ -398,6 +401,10 @@ unsigned getWideningBaseCost() const { return WideningBaseCost; } + bool useExperimentalZeroingPseudos() const { + return UseExperimentalZeroingPseudos; + } + /// CPU has TBI (top byte of addresses is ignored during HW address /// translation) and OS enables it. bool supportsAddressTopByteIgnored() const; Index: llvm/lib/Target/AArch64/SVEInstrFormats.td =================================================================== --- llvm/lib/Target/AArch64/SVEInstrFormats.td +++ llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -4766,27 +4766,24 @@ let Inst{9-8} = imm{4-3}; } - def _B_Z_UNDEF : PredTwoOpImmPseudo; - def _H_Z_UNDEF : PredTwoOpImmPseudo; - def _S_Z_UNDEF : PredTwoOpImmPseudo; - def _D_Z_UNDEF : PredTwoOpImmPseudo; - - def _B_Z_ZERO : PredTwoOpImmPseudo; - def _H_Z_ZERO : PredTwoOpImmPseudo; - def _S_Z_ZERO : PredTwoOpImmPseudo; - def _D_Z_ZERO : PredTwoOpImmPseudo; - - def : SVE_3_Op_Pat_Shift_Imm_SelZero(NAME # _B_Z_ZERO)>; - def : SVE_3_Op_Pat_Shift_Imm_SelZero(NAME # _H_Z_ZERO)>; - def : SVE_3_Op_Pat_Shift_Imm_SelZero(NAME # _S_Z_ZERO)>; - def : SVE_3_Op_Pat_Shift_Imm_SelZero(NAME # _D_Z_ZERO)>; - def : SVE_3_Op_Imm_Pat(NAME # _B)>; def : SVE_3_Op_Imm_Pat(NAME # _H)>; def : SVE_3_Op_Imm_Pat(NAME # _S)>; def : SVE_3_Op_Imm_Pat(NAME # _D)>; } +multiclass sve_int_bin_pred_shift_imm_left_zx { + def _ZERO_B : PredTwoOpImmPseudo; + def _ZERO_H : PredTwoOpImmPseudo; + def _ZERO_S : PredTwoOpImmPseudo; + def _ZERO_D : PredTwoOpImmPseudo; + + def : SVE_3_Op_Pat_Shift_Imm_SelZero(NAME # _ZERO_B)>; + def : 
SVE_3_Op_Pat_Shift_Imm_SelZero(NAME # _ZERO_H)>; + def : SVE_3_Op_Pat_Shift_Imm_SelZero(NAME # _ZERO_S)>; + def : SVE_3_Op_Pat_Shift_Imm_SelZero(NAME # _ZERO_D)>; +} + multiclass sve_int_bin_pred_shift_imm_right opc, string asm, string Ps, SDPatternOperator op = null_frag> { def _B : SVEPseudo2Instr, @@ -4811,7 +4808,7 @@ def : SVE_3_Op_Imm_Pat(NAME # _D)>; } -multiclass sve_int_bin_pred_shift_0_right_zx { +multiclass sve_int_bin_pred_shift_imm_right_zx { def _ZERO_B : PredTwoOpImmPseudo; def _ZERO_H : PredTwoOpImmPseudo; def _ZERO_S : PredTwoOpImmPseudo; Index: llvm/test/CodeGen/AArch64/sve-intrinsics-fp-arith-merging.ll =================================================================== --- llvm/test/CodeGen/AArch64/sve-intrinsics-fp-arith-merging.ll +++ llvm/test/CodeGen/AArch64/sve-intrinsics-fp-arith-merging.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=sve < %s 2>%t | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+use-experimental-zeroing-pseudos < %s 2>%t | FileCheck %s ; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t ; WARN-NOT: warning Index: llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-merging.ll =================================================================== --- llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-merging.ll +++ llvm/test/CodeGen/AArch64/sve-intrinsics-int-arith-merging.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+use-experimental-zeroing-pseudos < %s 2>%t | FileCheck %s ; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t ; WARN-NOT: warning Index: llvm/test/CodeGen/AArch64/sve-intrinsics-shifts-merging.ll =================================================================== --- llvm/test/CodeGen/AArch64/sve-intrinsics-shifts-merging.ll +++ llvm/test/CodeGen/AArch64/sve-intrinsics-shifts-merging.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < 
%s 2>%t | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -mattr=+use-experimental-zeroing-pseudos < %s 2>%t | FileCheck %s ; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t ; WARN-NOT: warning Index: llvm/test/CodeGen/AArch64/sve2-intrinsics-uniform-dsp-zeroing.ll =================================================================== --- llvm/test/CodeGen/AArch64/sve2-intrinsics-uniform-dsp-zeroing.ll +++ llvm/test/CodeGen/AArch64/sve2-intrinsics-uniform-dsp-zeroing.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 -asm-verbose=0 < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 -asm-verbose=0 -mattr=+use-experimental-zeroing-pseudos < %s | FileCheck %s ; ; SQSHLU