diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -527,6 +527,29 @@ "HasFastVariablePerLaneShuffle", "true", "Per-lane shuffles with variable masks are fast">; +// Goldmont / Tremont (Atom in general) have no bypass delay. +def TuningNoDomainDelay : SubtargetFeature<"no-bypass-delay", + "NoDomainDelay","true", + "Has no bypass delay when using the 'wrong' domain">; + +// Many processors (Nehalem+ on Intel) have no bypass delay when +// using the wrong mov type. +def TuningNoDomainDelayMov : SubtargetFeature<"no-bypass-delay-mov", + "NoDomainDelayMov","true", + "Has no bypass delay when using the 'wrong' mov type">; + +// Newer processors (Skylake+ on Intel) have no bypass delay when +// using the wrong blend type. +def TuningNoDomainDelayBlend : SubtargetFeature<"no-bypass-delay-blend", + "NoDomainDelayBlend","true", + "Has no bypass delay when using the 'wrong' blend type">; + +// Newer processors (Haswell+ on Intel) have no bypass delay when +// using the wrong shuffle type. +def TuningNoDomainDelayShuffle : SubtargetFeature<"no-bypass-delay-shuffle", + "NoDomainDelayShuffle","true", + "Has no bypass delay when using the 'wrong' shuffle type">; + // Prefer lowering shuffles on AVX512 targets (e.g. Skylake Server) to // imm shifts/rotate if they can use more ports than regular shuffles. 
def TuningPreferShiftShuffle : SubtargetFeature<"faster-shift-than-shuffle", @@ -737,7 +760,8 @@ // Nehalem list NHMFeatures = X86_64V2Features; list NHMTuning = [TuningMacroFusion, - TuningInsertVZEROUPPER]; + TuningInsertVZEROUPPER, + TuningNoDomainDelayMov]; // Westmere list WSMAdditionalFeatures = [FeaturePCLMUL]; @@ -757,7 +781,8 @@ TuningFastSHLDRotate, TuningFast15ByteNOP, TuningPOPCNTFalseDeps, - TuningInsertVZEROUPPER]; + TuningInsertVZEROUPPER, + TuningNoDomainDelayMov]; list SNBFeatures = !listconcat(WSMFeatures, SNBAdditionalFeatures); @@ -789,7 +814,9 @@ TuningPOPCNTFalseDeps, TuningLZCNTFalseDeps, TuningInsertVZEROUPPER, - TuningAllowLight256Bit]; + TuningAllowLight256Bit, + TuningNoDomainDelayMov, + TuningNoDomainDelayShuffle]; list HSWFeatures = !listconcat(IVBFeatures, HSWAdditionalFeatures); @@ -818,7 +845,10 @@ TuningFastVariablePerLaneShuffle, TuningPOPCNTFalseDeps, TuningInsertVZEROUPPER, - TuningAllowLight256Bit]; + TuningAllowLight256Bit, + TuningNoDomainDelayMov, + TuningNoDomainDelayShuffle, + TuningNoDomainDelayBlend]; list SKLFeatures = !listconcat(BDWFeatures, SKLAdditionalFeatures); @@ -848,7 +878,10 @@ TuningPOPCNTFalseDeps, TuningInsertVZEROUPPER, TuningAllowLight256Bit, - TuningPreferShiftShuffle]; + TuningPreferShiftShuffle, + TuningNoDomainDelayMov, + TuningNoDomainDelayShuffle, + TuningNoDomainDelayBlend]; list SKXFeatures = !listconcat(BDWFeatures, SKXAdditionalFeatures); @@ -886,7 +919,10 @@ TuningFastVariablePerLaneShuffle, TuningPrefer256Bit, TuningInsertVZEROUPPER, - TuningAllowLight256Bit]; + TuningAllowLight256Bit, + TuningNoDomainDelayMov, + TuningNoDomainDelayShuffle, + TuningNoDomainDelayBlend]; list CNLFeatures = !listconcat(SKLFeatures, CNLAdditionalFeatures); @@ -911,7 +947,10 @@ TuningFastVariablePerLaneShuffle, TuningPrefer256Bit, TuningInsertVZEROUPPER, - TuningAllowLight256Bit]; + TuningAllowLight256Bit, + TuningNoDomainDelayMov, + TuningNoDomainDelayShuffle, + TuningNoDomainDelayBlend]; list ICLFeatures = 
!listconcat(CNLFeatures, ICLAdditionalFeatures); @@ -985,7 +1024,8 @@ TuningSlowTwoMemOps, TuningLEAUsesAG, TuningPadShortFunctions, - TuningInsertVZEROUPPER]; + TuningInsertVZEROUPPER, + TuningNoDomainDelay]; // Silvermont list SLMAdditionalFeatures = [FeatureSSE42, @@ -1003,7 +1043,8 @@ TuningFast7ByteNOP, TuningFastMOVBE, TuningPOPCNTFalseDeps, - TuningInsertVZEROUPPER]; + TuningInsertVZEROUPPER, + TuningNoDomainDelay]; list SLMFeatures = !listconcat(AtomFeatures, SLMAdditionalFeatures); @@ -1023,7 +1064,8 @@ TuningSlowIncDec, TuningFastMOVBE, TuningPOPCNTFalseDeps, - TuningInsertVZEROUPPER]; + TuningInsertVZEROUPPER, + TuningNoDomainDelay]; list GLMFeatures = !listconcat(SLMFeatures, GLMAdditionalFeatures); @@ -1035,7 +1077,8 @@ TuningSlowLEA, TuningSlowIncDec, TuningFastMOVBE, - TuningInsertVZEROUPPER]; + TuningInsertVZEROUPPER, + TuningNoDomainDelay]; list GLPFeatures = !listconcat(GLMFeatures, GLPAdditionalFeatures); diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h --- a/llvm/lib/Target/X86/X86Subtarget.h +++ b/llvm/lib/Target/X86/X86Subtarget.h @@ -249,6 +249,17 @@ return hasBWI() && canExtendTo512DQ(); } + bool hasNoDomainDelay() const { return NoDomainDelay; } + bool hasNoDomainDelayMov() const { + return hasNoDomainDelay() || NoDomainDelayMov; + } + bool hasNoDomainDelayBlend() const { + return hasNoDomainDelay() || NoDomainDelayBlend; + } + bool hasNoDomainDelayShuffle() const { + return hasNoDomainDelay() || NoDomainDelayShuffle; + } + bool hasFasterShiftThanShuffle() const { return PreferLowerShuffleAsShift; } // If there are no 512-bit vectors and we prefer not to use 512-bit registers, // disable them in the legalizer. 
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -89,6 +89,10 @@ X86::TuningUseSLMArithCosts, X86::TuningUseGLMDivSqrtCosts, X86::TuningPreferShiftShuffle, + X86::TuningNoDomainDelay, + X86::TuningNoDomainDelayMov, + X86::TuningNoDomainDelayShuffle, + X86::TuningNoDomainDelayBlend, // Perf-tuning flags. X86::TuningFastGather,