diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -35351,6 +35351,9 @@
   // Depth threshold above which we can efficiently use variable mask shuffles.
   int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 1 : 2;
   AllowVariableMask &= (Depth >= VariableShuffleDepth) || HasVariableMask;
+  // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a
+  // higher depth before combining them.
+  bool AllowBWIVPERMV3 = (Depth >= 2 || HasVariableMask);
 
   bool MaskContainsZeros = isAnyZero(Mask);
 
@@ -35387,9 +35390,9 @@
            MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
            MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
            MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
-         (Subtarget.hasBWI() &&
+         (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
           (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
-         (Subtarget.hasVBMI() &&
+         (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
           (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
       // Adjust shuffle mask - replace SM_SentinelZero with second source index.
       for (unsigned i = 0; i != NumMaskElts; ++i)
@@ -35416,9 +35419,9 @@
            MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
            MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 ||
            MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
-         (Subtarget.hasBWI() &&
+         (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
          (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
-         (Subtarget.hasVBMI() &&
+         (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
          (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
       V1 = DAG.getBitcast(MaskVT, V1);
       V2 = DAG.getBitcast(MaskVT, V2);
@@ -35588,10 +35591,10 @@
          MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 ||
          MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 ||
          MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
-        (Subtarget.hasBWI() && (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 ||
-                                MaskVT == MVT::v32i16)) ||
-        (Subtarget.hasVBMI() && (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 ||
-                                 MaskVT == MVT::v64i8)))) {
+        (Subtarget.hasBWI() && AllowBWIVPERMV3 &&
+         (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) ||
+        (Subtarget.hasVBMI() && AllowBWIVPERMV3 &&
+         (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) {
     V1 = DAG.getBitcast(MaskVT, V1);
     V2 = DAG.getBitcast(MaskVT, V2);
     Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG);
diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
--- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll
+++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll
@@ -857,10 +857,10 @@
 define <8 x i32> @trunc_v8i64_v8i32_zeroes(<8 x i64>* %x) nounwind "min-legal-vector-width"="256" {
 ; CHECK-LABEL: trunc_v8i64_v8i32_zeroes:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpsrlq $48, 32(%rdi), %ymm1
-; CHECK-NEXT:    vpsrlq $48, (%rdi), %ymm2
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]
-; CHECK-NEXT:    vpermi2w %ymm1, %ymm2, %ymm0
+; CHECK-NEXT:    vpsrlq $48, 32(%rdi), %ymm0
+; CHECK-NEXT:    vpsrlq $48, (%rdi), %ymm1
+; CHECK-NEXT:    vpackusdw %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; CHECK-NEXT:    retq
   %a = load <8 x i64>, <8 x i64>* %x
   %b = lshr <8 x i64> %a, <i64 48, i64 48, i64 48, i64 48, i64 48, i64 48, i64 48, i64 48>
@@ -920,9 +920,10 @@
 define <16 x i16> @trunc_v16i32_v16i16_sign(<16 x i32>* %x) nounwind "min-legal-vector-width"="256" {
 ; CHECK-LABEL: trunc_v16i32_v16i16_sign:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vmovdqa (%rdi), %ymm1
-; CHECK-NEXT:    vmovdqa {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31]
-; CHECK-NEXT:    vpermi2w 32(%rdi), %ymm1, %ymm0
+; CHECK-NEXT:    vpsrad $16, 32(%rdi), %ymm0
+; CHECK-NEXT:    vpsrad $16, (%rdi), %ymm1
+; CHECK-NEXT:    vpackssdw %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; CHECK-NEXT:    retq
   %a = load <16 x i32>, <16 x i32>* %x
   %b = ashr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
@@ -931,20 +932,13 @@
 }
 
 define <32 x i8> @trunc_v32i16_v32i8_sign(<32 x i16>* %x) nounwind "min-legal-vector-width"="256" {
-; CHECK-AVX512-LABEL: trunc_v32i16_v32i8_sign:
-; CHECK-AVX512:       # %bb.0:
-; CHECK-AVX512-NEXT:    vpsraw $8, 32(%rdi), %ymm0
-; CHECK-AVX512-NEXT:    vpsraw $8, (%rdi), %ymm1
-; CHECK-AVX512-NEXT:    vpacksswb %ymm0, %ymm1, %ymm0
-; CHECK-AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; CHECK-AVX512-NEXT:    retq
-;
-; CHECK-VBMI-LABEL: trunc_v32i16_v32i8_sign:
-; CHECK-VBMI:       # %bb.0:
-; CHECK-VBMI-NEXT:    vmovdqa (%rdi), %ymm1
-; CHECK-VBMI-NEXT:    vmovdqa {{.*#+}} ymm0 = [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35,37,39,41,43,45,47,49,51,53,55,57,59,61,63]
-; CHECK-VBMI-NEXT:    vpermi2b 32(%rdi), %ymm1, %ymm0
-; CHECK-VBMI-NEXT:    retq
+; CHECK-LABEL: trunc_v32i16_v32i8_sign:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpsraw $8, 32(%rdi), %ymm0
+; CHECK-NEXT:    vpsraw $8, (%rdi), %ymm1
+; CHECK-NEXT:    vpacksswb %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; CHECK-NEXT:    retq
   %a = load <32 x i16>, <32 x i16>* %x
   %b = ashr <32 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
   %c = trunc <32 x i16> %b to <32 x i8>
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -304,24 +304,11 @@
 ; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; AVX1-NEXT:    retq
 ;
-; AVX2-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
-; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX2-NEXT:    retq
-;
-; AVX512VLBW-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
-; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vpbroadcastb %xmm1, %xmm1
-; AVX512VLBW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX512VLBW-NEXT:    retq
-;
-; AVX512VLVBMI-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
-; AVX512VLVBMI:       # %bb.0:
-; AVX512VLVBMI-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,16,0,17,0,18,0,19,0,20,0,21,0,22,0,23]
-; AVX512VLVBMI-NEXT:    vpermi2b %xmm0, %xmm1, %xmm2
-; AVX512VLVBMI-NEXT:    vmovdqa %xmm2, %xmm0
-; AVX512VLVBMI-NEXT:    retq
+; AVX2OR512VL-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
+; AVX2OR512VL:       # %bb.0:
+; AVX2OR512VL-NEXT:    vpbroadcastb %xmm1, %xmm1
+; AVX2OR512VL-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX2OR512VL-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
 ; XOPAVX1:       # %bb.0:
@@ -1335,23 +1322,11 @@
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
 ; SSE-NEXT:    retq
 ;
-; AVX1OR2-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23:
-; AVX1OR2:       # %bb.0:
-; AVX1OR2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX1OR2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; AVX1OR2-NEXT:    retq
-;
-; AVX512VLBW-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23:
-; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX512VLBW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; AVX512VLBW-NEXT:    retq
-;
-; AVX512VLVBMI-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23:
-; AVX512VLVBMI:       # %bb.0:
-; AVX512VLVBMI-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,16,1,17,4,20,5,21,2,18,3,19,6,22,7,23]
-; AVX512VLVBMI-NEXT:    vpermt2b %xmm1, %xmm2, %xmm0
-; AVX512VLVBMI-NEXT:    retq
+; AVX-LABEL: shuffle_v16i8_00_16_01_17_04_20_05_21_02_18_03_19_06_22_07_23:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <16 x i8> %val1, <16 x i8> %val2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 4, i32 20, i32 5, i32 21, i32 2, i32 18, i32 3, i32 19, i32 6, i32 22, i32 7, i32 23>
   ret <16 x i8> %shuffle
 }
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -1017,23 +1017,11 @@
 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; SSE-NEXT:    retq
 ;
-; AVX1OR2-LABEL: shuffle_v8i16_0c1d2e3f:
-; AVX1OR2:       # %bb.0:
-; AVX1OR2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX1OR2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1OR2-NEXT:    retq
-;
-; AVX512VL-SLOW-LABEL: shuffle_v8i16_0c1d2e3f:
-; AVX512VL-SLOW:       # %bb.0:
-; AVX512VL-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX512VL-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512VL-SLOW-NEXT:    retq
-;
-; AVX512VL-FAST-LABEL: shuffle_v8i16_0c1d2e3f:
-; AVX512VL-FAST:       # %bb.0:
-; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,12,1,13,2,14,3,15]
-; AVX512VL-FAST-NEXT:    vpermt2w %xmm1, %xmm2, %xmm0
-; AVX512VL-FAST-NEXT:    retq
+; AVX-LABEL: shuffle_v8i16_0c1d2e3f:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 12, i32 1, i32 13, i32 2, i32 14, i32 3, i32 15>
   ret <8 x i16> %shuffle
 }
@@ -1059,23 +1047,11 @@
 ; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; SSE-NEXT:    retq
 ;
-; AVX1OR2-LABEL: shuffle_v8i16_48596a7b:
-; AVX1OR2:       # %bb.0:
-; AVX1OR2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX1OR2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1OR2-NEXT:    retq
-;
-; AVX512VL-SLOW-LABEL: shuffle_v8i16_48596a7b:
-; AVX512VL-SLOW:       # %bb.0:
-; AVX512VL-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
-; AVX512VL-SLOW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX512VL-SLOW-NEXT:    retq
-;
-; AVX512VL-FAST-LABEL: shuffle_v8i16_48596a7b:
-; AVX512VL-FAST:       # %bb.0:
-; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,8,5,9,6,10,7,11]
-; AVX512VL-FAST-NEXT:    vpermt2w %xmm1, %xmm2, %xmm0
-; AVX512VL-FAST-NEXT:    retq
+; AVX-LABEL: shuffle_v8i16_48596a7b:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; AVX-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 4, i32 8, i32 5, i32 9, i32 6, i32 10, i32 7, i32 11>
   ret <8 x i16> %shuffle
 }
@@ -1424,23 +1400,11 @@
 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
 ; SSE41-NEXT:    retq
 ;
-; AVX1OR2-LABEL: shuffle_v8i16_012dXXXX:
-; AVX1OR2:       # %bb.0:
-; AVX1OR2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; AVX1OR2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
-; AVX1OR2-NEXT:    retq
-;
-; AVX512VL-SLOW-LABEL: shuffle_v8i16_012dXXXX:
-; AVX512VL-SLOW:       # %bb.0:
-; AVX512VL-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; AVX512VL-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
-; AVX512VL-SLOW-NEXT:    retq
-;
-; AVX512VL-FAST-LABEL: shuffle_v8i16_012dXXXX:
-; AVX512VL-FAST:       # %bb.0:
-; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,13,4,5,6,7]
-; AVX512VL-FAST-NEXT:    vpermt2w %xmm1, %xmm2, %xmm0
-; AVX512VL-FAST-NEXT:    retq
+; AVX-LABEL: shuffle_v8i16_012dXXXX:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
   ret <8 x i16> %shuffle
 }
@@ -1475,24 +1439,11 @@
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
 ; AVX1-NEXT:    retq
 ;
-; AVX2-LABEL: shuffle_v8i16_XXXXcde3:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
-; AVX2-NEXT:    retq
-;
-; AVX512VL-SLOW-LABEL: shuffle_v8i16_XXXXcde3:
-; AVX512VL-SLOW:       # %bb.0:
-; AVX512VL-SLOW-NEXT:    vpbroadcastq %xmm0, %xmm0
-; AVX512VL-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
-; AVX512VL-SLOW-NEXT:    retq
-;
-; AVX512VL-FAST-LABEL: shuffle_v8i16_XXXXcde3:
-; AVX512VL-FAST:       # %bb.0:
-; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,11]
-; AVX512VL-FAST-NEXT:    vpermi2w %xmm0, %xmm1, %xmm2
-; AVX512VL-FAST-NEXT:    vmovdqa %xmm2, %xmm0
-; AVX512VL-FAST-NEXT:    retq
+; AVX2OR512VL-LABEL: shuffle_v8i16_XXXXcde3:
+; AVX2OR512VL:       # %bb.0:
+; AVX2OR512VL-NEXT:    vpbroadcastq %xmm0, %xmm0
+; AVX2OR512VL-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5,6],xmm0[7]
+; AVX2OR512VL-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: shuffle_v8i16_XXXXcde3:
 ; XOPAVX1:       # %bb.0:
@@ -1533,24 +1484,11 @@
 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7]
 ; SSE41-NEXT:    retq
 ;
-; AVX1OR2-LABEL: shuffle_v8i16_cde3XXXX:
-; AVX1OR2:       # %bb.0:
-; AVX1OR2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX1OR2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7]
-; AVX1OR2-NEXT:    retq
-;
-; AVX512VL-SLOW-LABEL: shuffle_v8i16_cde3XXXX:
-; AVX512VL-SLOW:       # %bb.0:
-; AVX512VL-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX512VL-SLOW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7]
-; AVX512VL-SLOW-NEXT:    retq
-;
-; AVX512VL-FAST-LABEL: shuffle_v8i16_cde3XXXX:
-; AVX512VL-FAST:       # %bb.0:
-; AVX512VL-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,5,6,11,4,5,6,7]
-; AVX512VL-FAST-NEXT:    vpermi2w %xmm0, %xmm1, %xmm2
-; AVX512VL-FAST-NEXT:    vmovdqa %xmm2, %xmm0
-; AVX512VL-FAST-NEXT:    retq
+; AVX-LABEL: shuffle_v8i16_cde3XXXX:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3],xmm1[4,5,6,7]
+; AVX-NEXT:    retq
   %shuffle = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> <i32 12, i32 13, i32 14, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
   ret <8 x i16> %shuffle
 }
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -4804,29 +4804,11 @@
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
-; AVX2-LABEL: PR28136:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-NEXT:    retq
-;
-; AVX512VLBW-LABEL: PR28136:
-; AVX512VLBW:       # %bb.0:
-; AVX512VLBW-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; AVX512VLBW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512VLBW-NEXT:    retq
-;
-; AVX512VLVBMI-SLOW-LABEL: PR28136:
-; AVX512VLVBMI-SLOW:       # %bb.0:
-; AVX512VLVBMI-SLOW-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
-; AVX512VLVBMI-SLOW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX512VLVBMI-SLOW-NEXT:    retq
-;
-; AVX512VLVBMI-FAST-LABEL: PR28136:
-; AVX512VLVBMI-FAST:       # %bb.0:
-; AVX512VLVBMI-FAST-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,32,1,33,2,34,3,35,16,48,17,49,18,50,19,51,4,36,5,37,6,38,7,39,20,52,21,53,22,54,23,55]
-; AVX512VLVBMI-FAST-NEXT:    vpermt2b %ymm1, %ymm2, %ymm0
-; AVX512VLVBMI-FAST-NEXT:    retq
+; AVX2OR512VL-LABEL: PR28136:
+; AVX2OR512VL:       # %bb.0:
+; AVX2OR512VL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
+; AVX2OR512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
+; AVX2OR512VL-NEXT:    retq
 ;
 ; XOPAVX1-LABEL: PR28136:
 ; XOPAVX1:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-zext.ll b/llvm/test/CodeGen/X86/vector-zext.ll
--- a/llvm/test/CodeGen/X86/vector-zext.ll
+++ b/llvm/test/CodeGen/X86/vector-zext.ll
@@ -1902,20 +1902,11 @@
 ; AVX2-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; AVX2-NEXT:    retq
 ;
-; AVX512F-LABEL: shuf_zext_8i16_to_4i64_offset2:
-; AVX512F:       # %bb.0: # %entry
-; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,3]
-; AVX512F-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
-; AVX512F-NEXT:    retq
-;
-; AVX512BW-LABEL: shuf_zext_8i16_to_4i64_offset2:
-; AVX512BW:       # %bb.0: # %entry
-; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [2,33,34,35,3,37,38,39,4,41,42,43,5,45,46,47]
-; AVX512BW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512BW-NEXT:    vpermt2w %zmm2, %zmm1, %zmm0
-; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
-; AVX512BW-NEXT:    retq
+; AVX512-LABEL: shuf_zext_8i16_to_4i64_offset2:
+; AVX512:       # %bb.0: # %entry
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,2,2,3]
+; AVX512-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX512-NEXT:    retq
 entry:
   %B = shufflevector <8 x i16> %A, <8 x i16> zeroinitializer, <16 x i32> <i32 2, i32 9, i32 10, i32 11, i32 3, i32 13, i32 14, i32 15, i32 4, i32 9, i32 10, i32 11, i32 5, i32 13, i32 14, i32 15>
   %Z = bitcast <16 x i16> %B to <4 x i64>