Index: lib/Target/X86/X86TargetTransformInfo.cpp =================================================================== --- lib/Target/X86/X86TargetTransformInfo.cpp +++ lib/Target/X86/X86TargetTransformInfo.cpp @@ -819,7 +819,14 @@ { TTI::SK_Reverse, MVT::v32i8, 2 }, // vperm2i128 + pshufb { TTI::SK_Alternate, MVT::v16i16, 1 }, // vpblendw - { TTI::SK_Alternate, MVT::v32i8, 1 } // vpblendvb + { TTI::SK_Alternate, MVT::v32i8, 1 }, // vpblendvb + + { TTI::SK_PermuteSingleSrc, MVT::v4i64, 1 }, // vpermq + { TTI::SK_PermuteSingleSrc, MVT::v8i32, 1 }, // vpermd + { TTI::SK_PermuteSingleSrc, MVT::v16i16, 4 }, // vperm2i128 + 2 * vpshufb + // + vpblendvb + { TTI::SK_PermuteSingleSrc, MVT::v32i8, 4 } // vperm2i128 + 2 * vpshufb + // + vpblendvb }; if (ST->hasAVX2()) @@ -876,7 +883,10 @@ { TTI::SK_Reverse, MVT::v16i8, 1 }, // pshufb { TTI::SK_Alternate, MVT::v8i16, 3 }, // pshufb + pshufb + por - { TTI::SK_Alternate, MVT::v16i8, 3 } // pshufb + pshufb + por + { TTI::SK_Alternate, MVT::v16i8, 3 }, // pshufb + pshufb + por + + { TTI::SK_PermuteSingleSrc, MVT::v8i16, 1 }, // pshufb + { TTI::SK_PermuteSingleSrc, MVT::v16i8, 1 } // pshufb }; if (ST->hasSSSE3()) @@ -901,7 +911,10 @@ { TTI::SK_Alternate, MVT::v2f64, 1 }, // movsd { TTI::SK_Alternate, MVT::v4i32, 2 }, // 2*shufps { TTI::SK_Alternate, MVT::v8i16, 3 }, // pand + pandn + por - { TTI::SK_Alternate, MVT::v16i8, 3 } // pand + pandn + por + { TTI::SK_Alternate, MVT::v16i8, 3 }, // pand + pandn + por + + { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // pshufd + { TTI::SK_PermuteSingleSrc, MVT::v4i32, 1 } // pshufd }; if (ST->hasSSE2()) Index: test/Analysis/CostModel/X86/shuffle-single-src.ll =================================================================== --- test/Analysis/CostModel/X86/shuffle-single-src.ll +++ test/Analysis/CostModel/X86/shuffle-single-src.ll @@ -46,7 +46,7 @@ ; SSSE3: cost of 8 {{.*}} %V256 = shufflevector ; SSE42: cost of 8 {{.*}} %V256 = shufflevector ; AVX1: cost of 8 {{.*}} %V256 = shufflevector - ; 
AVX2: cost of 8 {{.*}} %V256 = shufflevector + ; AVX2: cost of 1 {{.*}} %V256 = shufflevector ; AVX512: cost of 1 {{.*}} %V256 = shufflevector %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> @@ -94,11 +94,11 @@ ; CHECK-LABEL: 'test_vXi32' define void @test_vXi32(<4 x i32> %src128, <8 x i32> %src256, <16 x i32> %src512, <32 x i32> %src1024) { - ; SSE2: cost of 8 {{.*}} %V128 = shufflevector - ; SSSE3: cost of 8 {{.*}} %V128 = shufflevector - ; SSE42: cost of 8 {{.*}} %V128 = shufflevector - ; AVX1: cost of 8 {{.*}} %V128 = shufflevector - ; AVX2: cost of 8 {{.*}} %V128 = shufflevector + ; SSE2: cost of 1 {{.*}} %V128 = shufflevector + ; SSSE3: cost of 1 {{.*}} %V128 = shufflevector + ; SSE42: cost of 1 {{.*}} %V128 = shufflevector + ; AVX1: cost of 1 {{.*}} %V128 = shufflevector + ; AVX2: cost of 1 {{.*}} %V128 = shufflevector ; AVX512: cost of 1 {{.*}} %V128 = shufflevector %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> @@ -106,7 +106,7 @@ ; SSSE3: cost of 16 {{.*}} %V256 = shufflevector ; SSE42: cost of 16 {{.*}} %V256 = shufflevector ; AVX1: cost of 16 {{.*}} %V256 = shufflevector - ; AVX2: cost of 16 {{.*}} %V256 = shufflevector + ; AVX2: cost of 1 {{.*}} %V256 = shufflevector ; AVX512: cost of 1 {{.*}} %V256 = shufflevector %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> @@ -132,11 +132,11 @@ define void @test_vXi16(<8 x i16> %src128, <16 x i16> %src256, <32 x i16> %src512, <64 x i16> %src1024) { ; SSE2: cost of 16 {{.*}} %V128 = shufflevector - ; SSSE3: cost of 16 {{.*}} %V128 = shufflevector - ; SSE42: cost of 16 {{.*}} %V128 = shufflevector - ; AVX1: cost of 16 {{.*}} %V128 = shufflevector - ; AVX2: cost of 16 {{.*}} %V128 = shufflevector - ; AVX512F: cost of 16 {{.*}} %V128 = shufflevector + ; SSSE3: cost of 1 {{.*}} %V128 = shufflevector + ; SSE42: cost of 1 {{.*}} %V128 = shufflevector + ; AVX1: cost of 1 {{.*}} %V128 = shufflevector + ; AVX2: cost of 1 {{.*}} %V128 = shufflevector + ; AVX512F: 
cost of 1 {{.*}} %V128 = shufflevector ; AVX512BW: cost of 1 {{.*}} %V128 = shufflevector %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> @@ -144,8 +144,8 @@ ; SSSE3: cost of 32 {{.*}} %V256 = shufflevector ; SSE42: cost of 32 {{.*}} %V256 = shufflevector ; AVX1: cost of 32 {{.*}} %V256 = shufflevector - ; AVX2: cost of 32 {{.*}} %V256 = shufflevector - ; AVX512F: cost of 32 {{.*}} %V256 = shufflevector + ; AVX2: cost of 4 {{.*}} %V256 = shufflevector + ; AVX512F: cost of 4 {{.*}} %V256 = shufflevector - ; AVX512BW cost of 1 {{.*}} %V256 = shufflevector + ; AVX512BW: cost of 1 {{.*}} %V256 = shufflevector %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> @@ -172,10 +172,10 @@ ; CHECK-LABEL: 'test_vXi8' define void @test_vXi8(<16 x i8> %src128, <32 x i8> %src256, <64 x i8> %src512) { ; SSE2: cost of 32 {{.*}} %V128 = shufflevector - ; SSSE3: cost of 32 {{.*}} %V128 = shufflevector - ; SSE42: cost of 32 {{.*}} %V128 = shufflevector - ; AVX1: cost of 32 {{.*}} %V128 = shufflevector - ; AVX2: cost of 32 {{.*}} %V128 = shufflevector + ; SSSE3: cost of 1 {{.*}} %V128 = shufflevector + ; SSE42: cost of 1 {{.*}} %V128 = shufflevector + ; AVX1: cost of 1 {{.*}} %V128 = shufflevector + ; AVX2: cost of 1 {{.*}} %V128 = shufflevector ; AVX512: cost of 1 {{.*}} %V128 = shufflevector %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> @@ -183,8 +183,8 @@ ; SSSE3: cost of 64 {{.*}} %V256 = shufflevector ; SSE42: cost of 64 {{.*}} %V256 = shufflevector ; AVX1: cost of 64 {{.*}} %V256 = shufflevector - ; AVX2: cost of 64 {{.*}} %V256 = shufflevector - ; AVX512F: cost of 64 {{.*}} %V256 = shufflevector + ; AVX2: cost of 4 {{.*}} %V256 = shufflevector + ; AVX512F: cost of 4 {{.*}} %V256 = shufflevector ; AVX512BW: cost of 3 {{.*}} %V256 = shufflevector %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32>