diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -527,6 +527,29 @@ "HasFastVariablePerLaneShuffle", "true", "Per-lane shuffles with variable masks are fast">; +// Goldmont / Tremont (Atom in general) have no bypass delay. +def TuningNoDomainDelay : SubtargetFeature<"no-bypass-delay", + "NoDomainDelay","true", + "Has no bypass delay when using the 'wrong' domain">; + +// Many processors (Nehalem+ on Intel) have no bypass delay when +// using the wrong mov type. +def TuningNoDomainDelayMov : SubtargetFeature<"no-bypass-delay-mov", + "NoDomainDelayMov","true", + "Has no bypass delay when using the 'wrong' mov type">; + +// Newer processors (Skylake+ on Intel) have no bypass delay when +// using the wrong blend type. +def TuningNoDomainDelayBlend : SubtargetFeature<"no-bypass-delay-blend", + "NoDomainDelayBlend","true", + "Has no bypass delay when using the 'wrong' blend type">; + +// Newer processors (Haswell+ on Intel) have no bypass delay when +// using the wrong shuffle type. +def TuningNoDomainDelayShuffle : SubtargetFeature<"no-bypass-delay-shuffle", + "NoDomainDelayShuffle","true", + "Has no bypass delay when using the 'wrong' shuffle type">; + // Skylake server has avx512 for `vpro{r|l}{d|q}` but not that extra // shuffle ports that other avx512 targets have so prefer shuffle with // shifts/rotate. 
@@ -738,7 +761,8 @@ // Nehalem list NHMFeatures = X86_64V2Features; list NHMTuning = [TuningMacroFusion, - TuningInsertVZEROUPPER]; + TuningInsertVZEROUPPER, + TuningNoDomainDelayMov]; // Westmere list WSMAdditionalFeatures = [FeaturePCLMUL]; @@ -758,7 +782,8 @@ TuningFastSHLDRotate, TuningFast15ByteNOP, TuningPOPCNTFalseDeps, - TuningInsertVZEROUPPER]; + TuningInsertVZEROUPPER, + TuningNoDomainDelayMov]; list SNBFeatures = !listconcat(WSMFeatures, SNBAdditionalFeatures); @@ -790,7 +815,9 @@ TuningPOPCNTFalseDeps, TuningLZCNTFalseDeps, TuningInsertVZEROUPPER, - TuningAllowLight256Bit]; + TuningAllowLight256Bit, + TuningNoDomainDelayMov, + TuningNoDomainDelayShuffle]; list HSWFeatures = !listconcat(IVBFeatures, HSWAdditionalFeatures); @@ -819,7 +846,10 @@ TuningFastVariablePerLaneShuffle, TuningPOPCNTFalseDeps, TuningInsertVZEROUPPER, - TuningAllowLight256Bit]; + TuningAllowLight256Bit, + TuningNoDomainDelayMov, + TuningNoDomainDelayShuffle, + TuningNoDomainDelayBlend]; list SKLFeatures = !listconcat(BDWFeatures, SKLAdditionalFeatures); @@ -849,7 +879,10 @@ TuningPOPCNTFalseDeps, TuningInsertVZEROUPPER, TuningAllowLight256Bit, - TuningPreferShiftShuffle]; + TuningPreferShiftShuffle, + TuningNoDomainDelayMov, + TuningNoDomainDelayShuffle, + TuningNoDomainDelayBlend]; list SKXFeatures = !listconcat(BDWFeatures, SKXAdditionalFeatures); @@ -887,7 +920,10 @@ TuningFastVariablePerLaneShuffle, TuningPrefer256Bit, TuningInsertVZEROUPPER, - TuningAllowLight256Bit]; + TuningAllowLight256Bit, + TuningNoDomainDelayMov, + TuningNoDomainDelayShuffle, + TuningNoDomainDelayBlend]; list CNLFeatures = !listconcat(SKLFeatures, CNLAdditionalFeatures); @@ -912,7 +948,10 @@ TuningFastVariablePerLaneShuffle, TuningPrefer256Bit, TuningInsertVZEROUPPER, - TuningAllowLight256Bit]; + TuningAllowLight256Bit, + TuningNoDomainDelayMov, + TuningNoDomainDelayShuffle, + TuningNoDomainDelayBlend]; list ICLFeatures = !listconcat(CNLFeatures, ICLAdditionalFeatures); @@ -986,7 +1025,8 @@ 
TuningSlowTwoMemOps, TuningLEAUsesAG, TuningPadShortFunctions, - TuningInsertVZEROUPPER]; + TuningInsertVZEROUPPER, + TuningNoDomainDelay]; // Silvermont list SLMAdditionalFeatures = [FeatureSSE42, @@ -1004,7 +1044,8 @@ TuningFast7ByteNOP, TuningFastMOVBE, TuningPOPCNTFalseDeps, - TuningInsertVZEROUPPER]; + TuningInsertVZEROUPPER, + TuningNoDomainDelay]; list SLMFeatures = !listconcat(AtomFeatures, SLMAdditionalFeatures); @@ -1024,7 +1065,8 @@ TuningSlowIncDec, TuningFastMOVBE, TuningPOPCNTFalseDeps, - TuningInsertVZEROUPPER]; + TuningInsertVZEROUPPER, + TuningNoDomainDelay]; list GLMFeatures = !listconcat(SLMFeatures, GLMAdditionalFeatures); @@ -1036,7 +1078,8 @@ TuningSlowLEA, TuningSlowIncDec, TuningFastMOVBE, - TuningInsertVZEROUPPER]; + TuningInsertVZEROUPPER, + TuningNoDomainDelay]; list GLPFeatures = !listconcat(GLMFeatures, GLPAdditionalFeatures); diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h --- a/llvm/lib/Target/X86/X86Subtarget.h +++ b/llvm/lib/Target/X86/X86Subtarget.h @@ -249,6 +249,17 @@ return hasBWI() && canExtendTo512DQ(); } + bool hasNoDomainDelay() const { return NoDomainDelay; } + bool hasNoDomainDelayMov() const { + return hasNoDomainDelay() || NoDomainDelayMov; + } + bool hasNoDomainDelayBlend() const { + return hasNoDomainDelay() || NoDomainDelayBlend; + } + bool hasNoDomainDelayShuffle() const { + return hasNoDomainDelay() || NoDomainDelayShuffle; + } + bool hasFasterShiftThanShuffle() const { return PreferLowerShuffleAsShift; } // If there are no 512-bit vectors and we prefer not to use 512-bit registers, // disable them in the legalizer. 
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -89,6 +89,10 @@ X86::TuningUseSLMArithCosts, X86::TuningUseGLMDivSqrtCosts, X86::TuningPreferShiftShuffle, + X86::TuningNoDomainDelay, + X86::TuningNoDomainDelayMov, + X86::TuningNoDomainDelayShuffle, + X86::TuningNoDomainDelayBlend, // Perf-tuning flags. X86::TuningFastGather,