diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -19302,6 +19302,37 @@
   return false;
 }
 
+static bool canCombineAsMaskOperation(SDValue V1, SDValue V2,
+                                      const X86Subtarget &Subtarget) {
+  if (!Subtarget.hasAVX512())
+    return false;
+
+  MVT VT = V1.getSimpleValueType().getScalarType();
+  if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI())
+    return false;
+
+  auto HasMaskOperation = [](SDValue V) {
+    switch (V->getOpcode()) {
+    default:
+      return false;
+    case ISD::ADD:
+    case ISD::SUB:
+    case ISD::AND:
+    case ISD::XOR:
+      break;
+    }
+    if (!V->hasOneUse())
+      return false;
+
+    return true;
+  };
+
+  if (HasMaskOperation(V1) || HasMaskOperation(V2))
+    return true;
+
+  return false;
+}
+
 // Forward declaration.
 static SDValue canonicalizeShuffleMaskWithHorizOp(
     MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
@@ -19377,6 +19408,7 @@
   // integers to handle flipping the low and high halves of AVX 256-bit vectors.
   SmallVector<int, 16> WidenedMask;
   if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
+      !canCombineAsMaskOperation(V1, V2, Subtarget) &&
       canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
     // Shuffle mask widening should not interfere with a broadcast opportunity
     // by obfuscating the operands with bitcasts.
diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-blend.ll b/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-blend.ll
--- a/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-blend.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-blend.ll
@@ -6,10 +6,7 @@
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm2
 ; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm0
-; CHECK-NEXT: movb $-86, %al
-; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
-; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT: vshufps {{.*#+}} zmm0 = zmm2[0,1],zmm0[2,3],zmm2[4,5],zmm0[6,7],zmm2[8,9],zmm0[10,11],zmm2[12,13],zmm0[14,15]
 ; CHECK-NEXT: retq
 entry:
   %t2 = add nsw <16 x i32> %t0, %t1
@@ -64,10 +61,9 @@
 ; CHECK-LABEL: addb_selectw_64xi8:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vpaddb %zmm1, %zmm0, %zmm2
-; CHECK-NEXT: vpsubb %zmm1, %zmm0, %zmm0
-; CHECK-NEXT: movl $1, %eax
-; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: vmovdqu16 %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: movl $3, %eax
+; CHECK-NEXT: kmovq %rax, %k1
+; CHECK-NEXT: vpsubb %zmm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
 ; CHECK-NEXT: retq
   %t2 = add nsw <64 x i8> %t0, %t1
@@ -80,9 +76,10 @@
 ; CHECK-LABEL: addb_selectw_32xi8:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vpaddb %ymm1, %ymm0, %ymm2
-; CHECK-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; CHECK-NEXT: movl $3, %eax
+; CHECK-NEXT: kmovd %eax, %k1
+; CHECK-NEXT: vpsubb %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vmovdqa %ymm2, %ymm0
 ; CHECK-NEXT: retq
   %t2 = add nsw <32 x i8> %t0, %t1
   %t3 = sub nsw <32 x i8> %t0, %t1
@@ -94,8 +91,10 @@
 ; CHECK-LABEL: addb_selectw_16xi8:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm2
-; CHECK-NEXT: vpsubb %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
+; CHECK-NEXT: movw $3, %ax
+; CHECK-NEXT: kmovd %eax, %k1
+; CHECK-NEXT: vpsubb %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vmovdqa %xmm2, %xmm0
 ; CHECK-NEXT: retq
   %t2 = add nsw <16 x i8> %t0, %t1
   %t3 = sub nsw <16 x i8> %t0, %t1
@@ -107,10 +106,9 @@
 ; CHECK-LABEL: addw_selectd_32xi16:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vpaddw %zmm1, %zmm0, %zmm2
-; CHECK-NEXT: vpsubw %zmm1, %zmm0, %zmm0
-; CHECK-NEXT: movw $1, %ax
+; CHECK-NEXT: movl $3, %eax
 ; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vpsubw %zmm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
 ; CHECK-NEXT: retq
   %t2 = add nsw <32 x i16> %t0, %t1
@@ -136,10 +134,9 @@
 ; CHECK-LABEL: addd_selectq_16xi32:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm2
-; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm0
-; CHECK-NEXT: movb $1, %al
+; CHECK-NEXT: movw $3, %ax
 ; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
+; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT: vmovdqa64 %zmm2, %zmm0
 ; CHECK-NEXT: retq
   %t2 = add nsw <16 x i32> %t0, %t1
diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll
--- a/llvm/test/CodeGen/X86/combine-sdiv.ll
+++ b/llvm/test/CodeGen/X86/combine-sdiv.ll
@@ -2889,12 +2889,26 @@
 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
 ; AVX1-NEXT: retq
 ;
-; AVX2ORLATER-LABEL: combine_vec_sdiv_nonuniform7:
-; AVX2ORLATER: # %bb.0:
-; AVX2ORLATER-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2ORLATER-NEXT: vpsubw %xmm0, %xmm1, %xmm1
-; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX2ORLATER-NEXT: retq
+; AVX2-LABEL: combine_vec_sdiv_nonuniform7:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpsubw %xmm0, %xmm1, %xmm1
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-NEXT: retq
+;
+; AVX512F-LABEL: combine_vec_sdiv_nonuniform7:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: vpsubw %xmm0, %xmm1, %xmm1
+; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX512F-NEXT: retq
+;
+; AVX512BW-LABEL: combine_vec_sdiv_nonuniform7:
+; AVX512BW: # %bb.0:
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsubw %xmm0, %xmm1, %xmm1
+; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX512BW-NEXT: retq
 ;
 ; XOP-LABEL: combine_vec_sdiv_nonuniform7:
 ; XOP: # %bb.0:
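
For context, a minimal IR sketch of the pattern the new canCombineAsMaskOperation hook targets, modeled on the shuffle-blend.ll tests above; the function name and the exact shuffle mask below are illustrative, not copied from the test file. A single-use add and sub over the same operands feed a blend-style shufflevector: previously the v16i32 blend was widened to v8i64 and lowered through kmov + vmovdqa64, whereas with widening skipped the blend folds into the masked vpsubd seen in the updated CHECK lines.

define <16 x i32> @add_sub_blend_16xi32(<16 x i32> %t0, <16 x i32> %t1) {
  ; Both arithmetic results are single-use ADD/SUB nodes, so HasMaskOperation
  ; fires and the blend can become vpsubd %zmm1, %zmm0, %zmm2 {%k1} with k1 = 3
  ; instead of a widened v8i64 shuffle.
  %t2 = add nsw <16 x i32> %t0, %t1
  %t3 = sub nsw <16 x i32> %t0, %t1
  ; Lanes 0-1 take the sub result (operand indices 16-17); the rest keep the add.
  %t4 = shufflevector <16 x i32> %t2, <16 x i32> %t3, <16 x i32> <i32 16, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i32> %t4
}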