diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -19302,6 +19302,37 @@
   return false;
 }
 
+static bool canCombineAsMaskOperation(SDValue V1, SDValue V2,
+                                      const X86Subtarget &Subtarget,
+                                      SelectionDAG &DAG) {
+  if (!Subtarget.hasAVX512())
+    return false;
+
+  MVT VT = V1.getSimpleValueType().getScalarType();
+  if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI())
+    return false;
+
+  // i8 is better to be widened to i16, because there is PBLENDW for vXi16
+  // when the vector bit size is 128 or 256.
+  if (VT == MVT::i8 && V1.getSimpleValueType().getSizeInBits() < 512)
+    return false;
+
+  auto HasMaskOperation = [&](SDValue V) {
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    if (!TLI.isBinOp(V->getOpcode()))
+      return false;
+    if (!V->hasOneUse())
+      return false;
+
+    return true;
+  };
+
+  if (HasMaskOperation(V1) || HasMaskOperation(V2))
+    return true;
+
+  return false;
+}
+
 // Forward declaration.
 static SDValue canonicalizeShuffleMaskWithHorizOp(
     MutableArrayRef Ops, MutableArrayRef Mask,
@@ -19377,6 +19408,7 @@
   // integers to handle flipping the low and high halves of AVX 256-bit vectors.
   SmallVector WidenedMask;
   if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
+      !canCombineAsMaskOperation(V1, V2, Subtarget, DAG) &&
       canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
     // Shuffle mask widening should not interfere with a broadcast opportunity
     // by obfuscating the operands with bitcasts.
diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-blend.ll b/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-blend.ll
--- a/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-blend.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-blend.ll
@@ -1,29 +1,16 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512F,X86-AVX512F
 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512F,X64-AVX512F
-; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512BW
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512BW
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512BW,X86-AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512BW,X64-AVX512BW
 
 define <16 x i32> @shuffle_v8i64(<16 x i32> %t0, <16 x i32> %t1) {
-; AVX512F-LABEL: shuffle_v8i64:
-; AVX512F: # %bb.0: # %entry
-; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2
-; AVX512F-NEXT: vpsubd %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT: movb $-86, %al
-; AVX512F-NEXT: kmovw %eax, %k1
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0
-; AVX512F-NEXT: ret{{[l|q]}}
-;
-; AVX512BW-LABEL: shuffle_v8i64:
-; AVX512BW: # %bb.0: # %entry
-; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vpsubd %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: movb $-86, %al
-; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
-; AVX512BW-NEXT: ret{{[l|q]}}
+; CHECK-LABEL: shuffle_v8i64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm2
+; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT: vshufps {{.*#+}} zmm0 = zmm2[0,1],zmm0[2,3],zmm2[4,5],zmm0[6,7],zmm2[8,9],zmm0[10,11],zmm2[12,13],zmm0[14,15]
+; CHECK-NEXT: ret{{[l|q]}}
 entry:
   %t2 = add nsw <16 x i32> %t0, %t1
   %t3 = sub nsw <16 x i32> %t0, %t1
@@ -96,15 +83,24 @@
 ; X64-AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
 ; X64-AVX512F-NEXT: retq
 ;
-; AVX512BW-LABEL: addb_selectw_64xi8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vpsubb %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: movl $1, %eax
-; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
-; AVX512BW-NEXT: ret{{[l|q]}}
+; X86-AVX512BW-LABEL: addb_selectw_64xi8:
+; X86-AVX512BW: # %bb.0:
+; X86-AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm2
+; X86-AVX512BW-NEXT: movl $3, %eax
+; X86-AVX512BW-NEXT: kmovd %eax, %k0
+; X86-AVX512BW-NEXT: kmovd %k0, %k1
+; X86-AVX512BW-NEXT: vpsubb %zmm1, %zmm0, %zmm2 {%k1}
+; X86-AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
+; X86-AVX512BW-NEXT: retl
+;
+; X64-AVX512BW-LABEL: addb_selectw_64xi8:
+; X64-AVX512BW: # %bb.0:
+; X64-AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm2
+; X64-AVX512BW-NEXT: movl $3, %eax
+; X64-AVX512BW-NEXT: kmovq %rax, %k1
+; X64-AVX512BW-NEXT: vpsubb %zmm1, %zmm0, %zmm2 {%k1}
+; X64-AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
+; X64-AVX512BW-NEXT: retq
   %t2 = add nsw <64 x i8> %t0, %t1
   %t3 = sub nsw <64 x i8> %t0, %t1
   %t4 = shufflevector <64 x i8> %t2, <64 x i8> %t3, <64 x i32>
@@ -169,10 +165,9 @@
 ; AVX512BW-LABEL: addw_selectd_32xi16:
 ; AVX512BW: # %bb.0:
 ; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vpsubw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: movw $1, %ax
+; AVX512BW-NEXT: movl $3, %eax
 ; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT: vpsubw %zmm1, %zmm0, %zmm2 {%k1}
 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
 ; AVX512BW-NEXT: ret{{[l|q]}}
   %t2 = add nsw <32 x i16> %t0, %t1
@@ -198,20 +193,18 @@
 ; AVX512F-LABEL: addd_selectq_16xi32:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2
-; AVX512F-NEXT: vpsubd %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT: movb $1, %al
+; AVX512F-NEXT: movw $3, %ax
 ; AVX512F-NEXT: kmovw %eax, %k1
-; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512F-NEXT: vpsubd %zmm1, %zmm0, %zmm2 {%k1}
 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0
 ; AVX512F-NEXT: ret{{[l|q]}}
 ;
 ; AVX512BW-LABEL: addd_selectq_16xi32:
 ; AVX512BW: # %bb.0:
 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: vpsubd %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: movb $1, %al
+; AVX512BW-NEXT: movw $3, %ax
 ; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT: vpsubd %zmm1, %zmm0, %zmm2 {%k1}
 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
 ; AVX512BW-NEXT: ret{{[l|q]}}
   %t2 = add nsw <16 x i32> %t0, %t1
diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll
--- a/llvm/test/CodeGen/X86/combine-sdiv.ll
+++ b/llvm/test/CodeGen/X86/combine-sdiv.ll
@@ -2889,12 +2889,26 @@
 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
 ; AVX1-NEXT: retq
 ;
-; AVX2ORLATER-LABEL: combine_vec_sdiv_nonuniform7:
-; AVX2ORLATER: # %bb.0:
-; AVX2ORLATER-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2ORLATER-NEXT: vpsubw %xmm0, %xmm1, %xmm1
-; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX2ORLATER-NEXT: retq
+; AVX2-LABEL: combine_vec_sdiv_nonuniform7:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpsubw %xmm0, %xmm1, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: combine_vec_sdiv_nonuniform7:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: vpsubw %xmm0, %xmm1, %xmm1
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: combine_vec_sdiv_nonuniform7:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsubw %xmm0, %xmm1, %xmm1
+; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX512BW-NEXT: retq
 ;
 ; XOP-LABEL: combine_vec_sdiv_nonuniform7:
 ; XOP: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/haddsub-undef.ll b/llvm/test/CodeGen/X86/haddsub-undef.ll
--- a/llvm/test/CodeGen/X86/haddsub-undef.ll
+++ b/llvm/test/CodeGen/X86/haddsub-undef.ll
@@ -973,18 +973,25 @@
 ; SSE-FAST-NEXT: movshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
 ; SSE-FAST-NEXT: retq
 ;
-; AVX-SLOW-LABEL: PR45747_2:
-; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
-; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,1,1]
-; AVX-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; AVX-SLOW-NEXT: retq
+; AVX1-SLOW-LABEL: PR45747_2:
+; AVX1-SLOW: # %bb.0:
+; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
+; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,1,1]
+; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; AVX1-SLOW-NEXT: retq
 ;
 ; AVX-FAST-LABEL: PR45747_2:
 ; AVX-FAST: # %bb.0:
 ; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm0
 ; AVX-FAST-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
 ; AVX-FAST-NEXT: retq
+;
+; AVX512-SLOW-LABEL: PR45747_2:
+; AVX512-SLOW: # %bb.0:
+; AVX512-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm1[2,2,2,2]
+; AVX512-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX512-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; AVX512-SLOW-NEXT: retq
   %t0 = shufflevector <4 x float> %b, <4 x float> undef, <4 x i32>
   %t1 = fadd <4 x float> %t0, %b
   %shuffle = shufflevector <4 x float> %t1, <4 x float> undef, <4 x i32>
@@ -1029,17 +1036,29 @@
 ; SSE-FAST-NEXT: haddps %xmm1, %xmm0
 ; SSE-FAST-NEXT: retq
 ;
-; AVX-SLOW-LABEL: PR34724_add_v4f32_0u23:
-; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,1],xmm1[0,3]
-; AVX-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,2]
-; AVX-SLOW-NEXT: vaddps %xmm2, %xmm0, %xmm0
-; AVX-SLOW-NEXT: retq
+; AVX1-SLOW-LABEL: PR34724_add_v4f32_0u23:
+; AVX1-SLOW: # %bb.0:
+; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,1],xmm1[0,3]
+; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,1],xmm1[1,2]
+; AVX1-SLOW-NEXT: vaddps %xmm2, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: retq
 ;
 ; AVX-FAST-LABEL: PR34724_add_v4f32_0u23:
 ; AVX-FAST: # %bb.0:
 ; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
 ; AVX-FAST-NEXT: retq
+;
+; AVX512-SLOW-LABEL: PR34724_add_v4f32_0u23:
+; AVX512-SLOW: # %bb.0:
+; AVX512-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; AVX512-SLOW-NEXT: vaddps %xmm0, %xmm2, %xmm0
+; AVX512-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX512-SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm2
+; AVX512-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm2[0],zero
+; AVX512-SLOW-NEXT: vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2]
+; AVX512-SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1
+; AVX512-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
+; AVX512-SLOW-NEXT: retq
   %3 = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32>
   %4 = fadd <4 x float> %3, %0
   %5 = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32>
@@ -1065,18 +1084,26 @@
 ; SSE-FAST-NEXT: haddps %xmm1, %xmm0
 ; SSE-FAST-NEXT: retq
 ;
-; AVX-SLOW-LABEL: PR34724_add_v4f32_01u3:
-; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vhaddps %xmm0, %xmm0, %xmm0
-; AVX-SLOW-NEXT: vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2]
-; AVX-SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1
-; AVX-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
-; AVX-SLOW-NEXT: retq
+; AVX1-SLOW-LABEL: PR34724_add_v4f32_01u3:
+; AVX1-SLOW: # %bb.0:
+; AVX1-SLOW-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2]
+; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1
+; AVX1-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX1-SLOW-NEXT: retq
 ;
 ; AVX-FAST-LABEL: PR34724_add_v4f32_01u3:
 ; AVX-FAST: # %bb.0:
 ; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
 ; AVX-FAST-NEXT: retq
+;
+; AVX512-SLOW-LABEL: PR34724_add_v4f32_01u3:
+; AVX512-SLOW: # %bb.0:
+; AVX512-SLOW-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX512-SLOW-NEXT: vmovsldup {{.*#+}} xmm2 = xmm1[0,0,2,2]
+; AVX512-SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1
+; AVX512-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
+; AVX512-SLOW-NEXT: retq
   %3 = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32>
   %4 = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32>
   %5 = fadd <2 x float> %3, %4
@@ -1101,18 +1128,26 @@
 ; SSE-FAST-NEXT: haddps %xmm1, %xmm0
 ; SSE-FAST-NEXT: retq
 ;
-; AVX-SLOW-LABEL: PR34724_add_v4f32_012u:
-; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vhaddps %xmm0, %xmm0, %xmm0
-; AVX-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
-; AVX-SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1
-; AVX-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX-SLOW-NEXT: retq
+; AVX1-SLOW-LABEL: PR34724_add_v4f32_012u:
+; AVX1-SLOW: # %bb.0:
+; AVX1-SLOW-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX1-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1
+; AVX1-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-SLOW-NEXT: retq
 ;
 ; AVX-FAST-LABEL: PR34724_add_v4f32_012u:
 ; AVX-FAST: # %bb.0:
 ; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
 ; AVX-FAST-NEXT: retq
+;
+; AVX512-SLOW-LABEL: PR34724_add_v4f32_012u:
+; AVX512-SLOW: # %bb.0:
+; AVX512-SLOW-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX512-SLOW-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
+; AVX512-SLOW-NEXT: vaddps %xmm1, %xmm2, %xmm1
+; AVX512-SLOW-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],zero
+; AVX512-SLOW-NEXT: retq
   %3 = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32>
   %4 = shufflevector <4 x float> %0, <4 x float> undef, <2 x i32>
   %5 = fadd <2 x float> %3, %4
diff --git a/llvm/test/CodeGen/X86/vselect-avx512.ll b/llvm/test/CodeGen/X86/vselect-avx512.ll
--- a/llvm/test/CodeGen/X86/vselect-avx512.ll
+++ b/llvm/test/CodeGen/X86/vselect-avx512.ll
@@ -17,24 +17,21 @@
 ; CHECK-NEXT: vpshufd {{.*#+}} zmm0 = zmm2[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
 ; CHECK-NEXT: vpminsd %zmm2, %zmm0, %zmm1
 ; CHECK-NEXT: vpmaxsd %zmm2, %zmm0, %zmm0
-; CHECK-NEXT: movb $-86, %al
-; CHECK-NEXT: kmovw %eax, %k2
-; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2}
-; CHECK-NEXT: vpshufd {{.*#+}} zmm0 = zmm1[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
-; CHECK-NEXT: vpminsd %zmm1, %zmm0, %zmm2
-; CHECK-NEXT: vpmaxsd %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vpshufd {{.*#+}} zmm0 = zmm2[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
+; CHECK-NEXT: vshufps {{.*#+}} zmm2 = zmm1[0,1],zmm0[2,3],zmm1[4,5],zmm0[6,7],zmm1[8,9],zmm0[10,11],zmm1[12,13],zmm0[14,15]
+; CHECK-NEXT: vshufps {{.*#+}} zmm0 = zmm1[1,0],zmm0[3,2],zmm1[5,4],zmm0[7,6],zmm1[9,8],zmm0[11,10],zmm1[13,12],zmm0[15,14]
+; CHECK-NEXT: vpminsd %zmm2, %zmm0, %zmm1
+; CHECK-NEXT: vpmaxsd %zmm2, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vpshufd {{.*#+}} zmm0 = zmm1[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
 ; CHECK-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5]
+; CHECK-NEXT: vpminsd %zmm1, %zmm0, %zmm2
+; CHECK-NEXT: movw $-3856, %ax # imm = 0xF0F0
+; CHECK-NEXT: kmovw %eax, %k2
+; CHECK-NEXT: vpmaxsd %zmm1, %zmm0, %zmm2 {%k2}
+; CHECK-NEXT: vpshufd {{.*#+}} zmm0 = zmm2[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
 ; CHECK-NEXT: vpminsd %zmm2, %zmm0, %zmm1
 ; CHECK-NEXT: vpmaxsd %zmm2, %zmm0, %zmm0
-; CHECK-NEXT: movb $-52, %al
-; CHECK-NEXT: kmovw %eax, %k3
-; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 {%k3}
-; CHECK-NEXT: vpshufd {{.*#+}} zmm0 = zmm1[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
-; CHECK-NEXT: vpminsd %zmm1, %zmm0, %zmm2
-; CHECK-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0
-; CHECK-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2}
-; CHECK-NEXT: vpshufd {{.*#+}} zmm0 = zmm2[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
+; CHECK-NEXT: vshufps {{.*#+}} zmm2 = zmm1[0,1],zmm0[2,3],zmm1[4,5],zmm0[6,7],zmm1[8,9],zmm0[10,11],zmm1[12,13],zmm0[14,15]
+; CHECK-NEXT: vshufps {{.*#+}} zmm0 = zmm1[1,0],zmm0[3,2],zmm1[5,4],zmm0[7,6],zmm1[9,8],zmm0[11,10],zmm1[13,12],zmm0[15,14]
 ; CHECK-NEXT: vpminsd %zmm2, %zmm0, %zmm1
 ; CHECK-NEXT: vpmaxsd %zmm2, %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT: vpshufd {{.*#+}} zmm0 = zmm1[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
@@ -44,16 +41,15 @@
 ; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[2,3,0,1],zmm0[6,7,4,5]
 ; CHECK-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[4,5,6,7]
 ; CHECK-NEXT: vpminsd %zmm0, %zmm1, %zmm2
-; CHECK-NEXT: vpmaxsd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT: vmovdqa64 %zmm0, %zmm2 {%k3}
+; CHECK-NEXT: vpmaxsd %zmm0, %zmm1, %zmm2 {%k2}
 ; CHECK-NEXT: vpshufd {{.*#+}} zmm0 = zmm2[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
 ; CHECK-NEXT: vpminsd %zmm2, %zmm0, %zmm1
 ; CHECK-NEXT: vpmaxsd %zmm2, %zmm0, %zmm0
-; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2}
-; CHECK-NEXT: vpshufd {{.*#+}} zmm0 = zmm1[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
-; CHECK-NEXT: vpminsd %zmm1, %zmm0, %zmm2
-; CHECK-NEXT: vpmaxsd %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vmovdqu64 %zmm2, (%rdi)
+; CHECK-NEXT: vshufps {{.*#+}} zmm2 = zmm1[0,1],zmm0[2,3],zmm1[4,5],zmm0[6,7],zmm1[8,9],zmm0[10,11],zmm1[12,13],zmm0[14,15]
+; CHECK-NEXT: vshufps {{.*#+}} zmm0 = zmm1[1,0],zmm0[3,2],zmm1[5,4],zmm0[7,6],zmm1[9,8],zmm0[11,10],zmm1[13,12],zmm0[15,14]
+; CHECK-NEXT: vpminsd %zmm2, %zmm0, %zmm1
+; CHECK-NEXT: vpmaxsd %zmm2, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovdqu64 %zmm1, (%rdi)
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: retq
   %2 = load <16 x i32>, ptr %0, align 1
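
For context, a minimal standalone sketch (not part of the patch; the function name and shuffle mask are illustrative) of the pattern canCombineAsMaskOperation targets: a blend-style shufflevector whose inputs are single-use binops on an AVX-512 target. With this change the shuffle is no longer widened to 64-bit elements first, so, as the updated addd_selectq_16xi32 checks above show, a blend of this shape can lower to a merge-masked vpsubd into the vpaddd result instead of add + sub + kmov + masked vmovdqa64.

define <16 x i32> @masked_binop_blend(<16 x i32> %a, <16 x i32> %b) {
  %add = add <16 x i32> %a, %b
  %sub = sub <16 x i32> %a, %b
  ; lanes 0-1 come from %sub (operand indices 16-17), lanes 2-15 from %add
  %blend = shufflevector <16 x i32> %add, <16 x i32> %sub, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i32> %blend
}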