diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -527,6 +527,29 @@ "HasFastVariablePerLaneShuffle", "true", "Per-lane shuffles with variable masks are fast">; +// Goldmont / Tremont (Atom in general) have no bypass delay +def TuningNoDomainDelay : SubtargetFeature<"no-bypass-delay", + "NoDomainDelay","true", + "Has no bypass delay when using the 'wrong' domain">; + +// Many processors (Nehalem+ on Intel) have no bypass delay when +// using the wrong mov type. +def TuningNoDomainDelayMov : SubtargetFeature<"no-bypass-delay-mov", + "NoDomainDelayMov","true", + "Has no bypass delay when using the 'wrong' mov type">; + +// Newer processors (Skylake+ on Intel) have no bypass delay when +// using the wrong blend type. +def TuningNoDomainDelayBlend : SubtargetFeature<"no-bypass-delay-blend", + "NoDomainDelayBlend","true", + "Has no bypass delay when using the 'wrong' blend type">; + +// Newer processors (Haswell+ on Intel) have no bypass delay when +// using the wrong shuffle type. +def TuningNoDomainDelayShuffle : SubtargetFeature<"no-bypass-delay-shuffle", + "NoDomainDelayShuffle","true", + "Has no bypass delay when using the 'wrong' shuffle type">; + // On some X86 processors, a vzeroupper instruction should be inserted after // using ymm/zmm registers before executing code that may use SSE instructions. 
def TuningInsertVZEROUPPER @@ -781,7 +804,8 @@ // Nehalem list<SubtargetFeature> NHMFeatures = X86_64V2Features; list<SubtargetFeature> NHMTuning = [TuningMacroFusion, - TuningInsertVZEROUPPER]; + TuningInsertVZEROUPPER, + TuningNoDomainDelayMov]; // Westmere list<SubtargetFeature> WSMAdditionalFeatures = [FeaturePCLMUL]; @@ -801,7 +825,8 @@ TuningFastSHLDRotate, TuningFast15ByteNOP, TuningPOPCNTFalseDeps, - TuningInsertVZEROUPPER]; + TuningInsertVZEROUPPER, + TuningNoDomainDelayMov]; list<SubtargetFeature> SNBFeatures = !listconcat(WSMFeatures, SNBAdditionalFeatures); @@ -833,7 +858,9 @@ TuningPOPCNTFalseDeps, TuningLZCNTFalseDeps, TuningInsertVZEROUPPER, - TuningAllowLight256Bit]; + TuningAllowLight256Bit, + TuningNoDomainDelayMov, + TuningNoDomainDelayShuffle]; list<SubtargetFeature> HSWFeatures = !listconcat(IVBFeatures, HSWAdditionalFeatures); @@ -862,7 +889,10 @@ TuningFastVariablePerLaneShuffle, TuningPOPCNTFalseDeps, TuningInsertVZEROUPPER, - TuningAllowLight256Bit]; + TuningAllowLight256Bit, + TuningNoDomainDelayMov, + TuningNoDomainDelayShuffle, + TuningNoDomainDelayBlend]; list<SubtargetFeature> SKLFeatures = !listconcat(BDWFeatures, SKLAdditionalFeatures); @@ -891,7 +921,10 @@ TuningPrefer256Bit, TuningPOPCNTFalseDeps, TuningInsertVZEROUPPER, - TuningAllowLight256Bit]; + TuningAllowLight256Bit, + TuningNoDomainDelayMov, + TuningNoDomainDelayShuffle, + TuningNoDomainDelayBlend]; list<SubtargetFeature> SKXFeatures = !listconcat(BDWFeatures, SKXAdditionalFeatures); @@ -929,7 +962,10 @@ TuningFastVariablePerLaneShuffle, TuningPrefer256Bit, TuningInsertVZEROUPPER, - TuningAllowLight256Bit]; + TuningAllowLight256Bit, + TuningNoDomainDelayMov, + TuningNoDomainDelayShuffle, + TuningNoDomainDelayBlend]; list<SubtargetFeature> CNLFeatures = !listconcat(SKLFeatures, CNLAdditionalFeatures); @@ -954,7 +990,10 @@ TuningFastVariablePerLaneShuffle, TuningPrefer256Bit, TuningInsertVZEROUPPER, - TuningAllowLight256Bit]; + TuningAllowLight256Bit, + TuningNoDomainDelayMov, + TuningNoDomainDelayShuffle, + TuningNoDomainDelayBlend]; list<SubtargetFeature> ICLFeatures = !listconcat(CNLFeatures, ICLAdditionalFeatures); @@ -1028,7 
+1067,8 @@ TuningSlowTwoMemOps, TuningLEAUsesAG, TuningPadShortFunctions, - TuningInsertVZEROUPPER]; + TuningInsertVZEROUPPER, + TuningNoDomainDelay]; // Silvermont list<SubtargetFeature> SLMAdditionalFeatures = [FeatureSSE42, @@ -1046,7 +1086,8 @@ TuningFast7ByteNOP, TuningFastMOVBE, TuningPOPCNTFalseDeps, - TuningInsertVZEROUPPER]; + TuningInsertVZEROUPPER, + TuningNoDomainDelay]; list<SubtargetFeature> SLMFeatures = !listconcat(AtomFeatures, SLMAdditionalFeatures); @@ -1066,7 +1107,8 @@ TuningSlowIncDec, TuningFastMOVBE, TuningPOPCNTFalseDeps, - TuningInsertVZEROUPPER]; + TuningInsertVZEROUPPER, + TuningNoDomainDelay]; list<SubtargetFeature> GLMFeatures = !listconcat(SLMFeatures, GLMAdditionalFeatures); @@ -1078,7 +1120,8 @@ TuningSlowLEA, TuningSlowIncDec, TuningFastMOVBE, - TuningInsertVZEROUPPER]; + TuningInsertVZEROUPPER, + TuningNoDomainDelay]; list<SubtargetFeature> GLPFeatures = !listconcat(GLMFeatures, GLPAdditionalFeatures); diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h --- a/llvm/lib/Target/X86/X86Subtarget.h +++ b/llvm/lib/Target/X86/X86Subtarget.h @@ -249,6 +249,17 @@ return hasBWI() && canExtendTo512DQ(); } + bool hasNoDomainDelay() const { return NoDomainDelay; } + bool hasNoDomainDelayMov() const { + return hasNoDomainDelay() || NoDomainDelayMov; + } + bool hasNoDomainDelayBlend() const { + return hasNoDomainDelay() || NoDomainDelayBlend; + } + bool hasNoDomainDelayShuffle() const { + return hasNoDomainDelay() || NoDomainDelayShuffle; + } + // If there are no 512-bit vectors and we prefer not to use 512-bit registers, // disable them in the legalizer. 
bool useAVX512Regs() const { diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -88,6 +88,10 @@ X86::TuningInsertVZEROUPPER, X86::TuningUseSLMArithCosts, X86::TuningUseGLMDivSqrtCosts, + X86::TuningNoDomainDelay, + X86::TuningNoDomainDelayMov, + X86::TuningNoDomainDelayShuffle, + X86::TuningNoDomainDelayBlend, // Perf-tuning flags. X86::TuningFastGather,