diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -19304,44 +19304,6 @@
   return false;
 }
 
-static bool canCombineAsMaskOperation(SDValue V1, SDValue V2,
-                                      const X86Subtarget &Subtarget) {
-  if (!Subtarget.hasAVX512())
-    return false;
-
-  MVT VT = V1.getSimpleValueType().getScalarType();
-  if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI())
-    return false;
-
-  // i8 is better to be widen to i16, because there is PBLENDW for vXi16
-  // when the vector bit size is 128 or 256.
-  if (VT == MVT::i8 && V1.getSimpleValueType().getSizeInBits() < 512)
-    return false;
-
-  auto HasMaskOperation = [&](SDValue V) {
-    // TODO: Currently we only check limited opcode. We probably extend
-    // it to all binary operation by checking TLI.isBinOp().
-    switch (V->getOpcode()) {
-    default:
-      return false;
-    case ISD::ADD:
-    case ISD::SUB:
-    case ISD::AND:
-    case ISD::XOR:
-      break;
-    }
-    if (!V->hasOneUse())
-      return false;
-
-    return true;
-  };
-
-  if (HasMaskOperation(V1) || HasMaskOperation(V2))
-    return true;
-
-  return false;
-}
-
 // Forward declaration.
 static SDValue canonicalizeShuffleMaskWithHorizOp(
     MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
@@ -19417,7 +19379,6 @@
   // integers to handle flipping the low and high halves of AVX 256-bit vectors.
   SmallVector<int, 64> WidenedMask;
   if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
-      !canCombineAsMaskOperation(V1, V2, Subtarget) &&
       canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
     // Shuffle mask widening should not interfere with a broadcast opportunity
     // by obfuscating the operands with bitcasts.
diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-blend.ll b/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-blend.ll
--- a/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-blend.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-blend.ll
@@ -1,16 +1,29 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512F,X86-AVX512F
 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512F,X64-AVX512F
-; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512BW,X86-AVX512BW
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512BW,X64-AVX512BW
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512BW
 
 define <16 x i32> @shuffle_v8i64(<16 x i32> %t0, <16 x i32> %t1) {
-; CHECK-LABEL: shuffle_v8i64:
-; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm2
-; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm0
-; CHECK-NEXT: vshufps {{.*#+}} zmm0 = zmm2[0,1],zmm0[2,3],zmm2[4,5],zmm0[6,7],zmm2[8,9],zmm0[10,11],zmm2[12,13],zmm0[14,15]
-; CHECK-NEXT: ret{{[l|q]}}
+; AVX512F-LABEL: shuffle_v8i64:
+; AVX512F: # %bb.0: # %entry
+; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2
+; AVX512F-NEXT: vpsubd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: movb $-86, %al
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0
+; AVX512F-NEXT: ret{{[l|q]}}
+;
+; AVX512BW-LABEL: shuffle_v8i64:
+; AVX512BW: # %bb.0: # %entry
+; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT: vpsubd %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: movb $-86, %al
+; AVX512BW-NEXT: kmovd %eax, %k1
+; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
+; AVX512BW-NEXT: ret{{[l|q]}}
 entry:
   %t2 = add nsw <16 x i32> %t0, %t1
   %t3 = sub nsw <16 x i32> %t0, %t1
@@ -83,24 +96,15 @@
 ; X64-AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
 ; X64-AVX512F-NEXT: retq
 ;
-; X86-AVX512BW-LABEL: addb_selectw_64xi8:
-; X86-AVX512BW: # %bb.0:
-; X86-AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm2
-; X86-AVX512BW-NEXT: movl $3, %eax
-; X86-AVX512BW-NEXT: kmovd %eax, %k0
-; X86-AVX512BW-NEXT: kmovd %k0, %k1
-; X86-AVX512BW-NEXT: vpsubb %zmm1, %zmm0, %zmm2 {%k1}
-; X86-AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
-; X86-AVX512BW-NEXT: retl
-;
-; X64-AVX512BW-LABEL: addb_selectw_64xi8:
-; X64-AVX512BW: # %bb.0:
-; X64-AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm2
-; X64-AVX512BW-NEXT: movl $3, %eax
-; X64-AVX512BW-NEXT: kmovq %rax, %k1
-; X64-AVX512BW-NEXT: vpsubb %zmm1, %zmm0, %zmm2 {%k1}
-; X64-AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
-; X64-AVX512BW-NEXT: retq
+; AVX512BW-LABEL: addb_selectw_64xi8:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT: vpsubb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: movl $1, %eax
+; AVX512BW-NEXT: kmovd %eax, %k1
+; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
+; AVX512BW-NEXT: ret{{[l|q]}}
   %t2 = add nsw <64 x i8> %t0, %t1
   %t3 = sub nsw <64 x i8> %t0, %t1
   %t4 = shufflevector <64 x i8> %t2, <64 x i8> %t3, <64 x i32>
@@ -165,9 +169,10 @@
 ; AVX512BW-LABEL: addw_selectd_32xi16:
 ; AVX512BW: # %bb.0:
 ; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: movl $3, %eax
+; AVX512BW-NEXT: vpsubw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: movw $1, %ax
 ; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: vpsubw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1}
 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
 ; AVX512BW-NEXT: ret{{[l|q]}}
   %t2 = add nsw <32 x i16> %t0, %t1
@@ -193,18 +198,20 @@
 ; AVX512F-LABEL: addd_selectq_16xi32:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2
-; AVX512F-NEXT: movw $3, %ax
+; AVX512F-NEXT: vpsubd %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: movb $1, %al
 ; AVX512F-NEXT: kmovw %eax, %k1
-; AVX512F-NEXT: vpsubd %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0
 ; AVX512F-NEXT: ret{{[l|q]}}
 ;
 ; AVX512BW-LABEL: addd_selectq_16xi32:
 ; AVX512BW: # %bb.0:
 ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT: movw $3, %ax
+; AVX512BW-NEXT: vpsubd %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: movb $1, %al
 ; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: vpsubd %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
 ; AVX512BW-NEXT: ret{{[l|q]}}
   %t2 = add nsw <16 x i32> %t0, %t1
diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll
--- a/llvm/test/CodeGen/X86/combine-sdiv.ll
+++ b/llvm/test/CodeGen/X86/combine-sdiv.ll
@@ -2889,26 +2889,12 @@
 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
 ; AVX1-NEXT: retq
 ;
-; AVX2-LABEL: combine_vec_sdiv_nonuniform7:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpsubw %xmm0, %xmm1, %xmm1
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX2-NEXT: retq
-;
-; AVX512F-LABEL: combine_vec_sdiv_nonuniform7:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpsubw %xmm0, %xmm1, %xmm1
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX512F-NEXT: retq
-;
-; AVX512BW-LABEL: combine_vec_sdiv_nonuniform7:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT: vpsubw %xmm0, %xmm1, %xmm1
-; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
-; AVX512BW-NEXT: retq
+; AVX2ORLATER-LABEL: combine_vec_sdiv_nonuniform7:
+; AVX2ORLATER: # %bb.0:
+; AVX2ORLATER-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2ORLATER-NEXT: vpsubw %xmm0, %xmm1, %xmm1
+; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2ORLATER-NEXT: retq
 ;
 ; XOP-LABEL: combine_vec_sdiv_nonuniform7:
 ; XOP: # %bb.0: