Index: lib/Target/X86/X86.td =================================================================== --- lib/Target/X86/X86.td +++ lib/Target/X86/X86.td @@ -263,6 +263,12 @@ def FeatureSoftFloat : SubtargetFeature<"soft-float", "UseSoftFloat", "true", "Use software floating point features.">; +// On recent X86 (port bound) processors, its preferable to combine to a single shuffle +// using a variable mask over multiple fixed shuffles. +def FeatureFastVariableShuffle + : SubtargetFeature<"fast-variable-shuffle", + "HasFastVariableShuffle", + "true", "Shuffles with variable masks are fast">; // On some X86 processors, there is no performance hazard to writing only the // lower parts of a YMM or ZMM register without clearing the upper part. def FeatureFastPartialYMMorZMMWrite @@ -620,7 +626,8 @@ FeatureERMSB, FeatureFMA, FeatureLZCNT, - FeatureMOVBE + FeatureMOVBE, + FeatureFastVariableShuffle ]>; class HaswellProc : ProcModel= 3) || HasVariableMask; + int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 2 : 3; + bool AllowVariableMask = (Depth >= VariableShuffleDepth) || HasVariableMask; bool MaskContainsZeros = any_of(Mask, [](int M) { return M == SM_SentinelZero; }); Index: lib/Target/X86/X86Subtarget.h =================================================================== --- lib/Target/X86/X86Subtarget.h +++ lib/Target/X86/X86Subtarget.h @@ -228,6 +228,10 @@ /// the stack pointer. This is an optimization for Intel Atom processors. bool UseLeaForSP; + /// True if its preferable to combine to a single shuffle using a variable + /// mask over multiple fixed shuffles. + bool HasFastVariableShuffle; + /// True if there is no performance penalty to writing only the lower parts /// of a YMM or ZMM register without clearing the upper part. bool HasFastPartialYMMorZMMWrite; @@ -527,6 +531,9 @@ bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; } bool hasCmpxchg16b() const { return HasCmpxchg16b; } bool useLeaForSP() const { return UseLeaForSP; } + bool hasFastVariableShuffle() const { + return HasFastVariableShuffle; + } bool hasFastPartialYMMorZMMWrite() const { return HasFastPartialYMMorZMMWrite; } Index: test/CodeGen/X86/vector-shuffle-512-v32.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-512-v32.ll +++ test/CodeGen/X86/vector-shuffle-512-v32.ll @@ -186,8 +186,7 @@ ; ; SKX-LABEL: shuffle_v32i16_1_1_0_0_5_5_4_4_9_9_11_11_13_13_12_12_17_17_19_19_21_21_20_20_25_25_27_27_29_29_28_28: ; SKX: ## %bb.0: -; SKX-NEXT: vpshuflw {{.*#+}} zmm0 = zmm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15,17,17,16,16,20,21,22,23,25,25,24,24,28,29,30,31] -; SKX-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12,16,17,18,19,21,21,20,20,24,25,26,27,29,29,28,28] +; SKX-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[2,3,2,3,0,1,0,1,10,11,10,11,8,9,8,9,18,19,18,19,16,17,16,17,26,27,26,27,24,25,24,25,34,35,34,35,32,33,32,33,42,43,42,43,40,41,40,41,50,51,50,51,48,49,48,49,58,59,58,59,56,57,56,57] ; SKX-NEXT: retq %c = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> ret <32 x i16> %c