diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -19302,6 +19302,40 @@
   return false;
 }
 
+static bool canCombineAsMaskOperation(SDValue V1, SDValue V2,
+                                      const X86Subtarget &Subtarget) {
+  if (!Subtarget.hasAVX512())
+    return false;
+
+  if (V1.getSimpleValueType().getSizeInBits() < 128)
+    return false;
+
+  MVT VT = V1.getSimpleValueType().getScalarType();
+  if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI())
+    return false;
+
+  auto HasMaskOperation = [](SDValue V) {
+    switch (V->getOpcode()) {
+    default:
+      return false;
+    case ISD::ADD:
+    case ISD::SUB:
+    case ISD::AND:
+    case ISD::XOR:
+      break;
+    }
+    if (!V->hasOneUse())
+      return false;
+
+    return true;
+  };
+
+  if (HasMaskOperation(V1) || HasMaskOperation(V2))
+    return true;
+
+  return false;
+}
+
 // Forward declaration.
 static SDValue canonicalizeShuffleMaskWithHorizOp(
     MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask,
@@ -19377,6 +19411,7 @@
   // integers to handle flipping the low and high halves of AVX 256-bit vectors.
   SmallVector<int, 16> WidenedMask;
   if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
+      !canCombineAsMaskOperation(V1, V2, Subtarget) &&
       canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
     // Shuffle mask widening should not interfere with a broadcast opportunity
     // by obfuscating the operands with bitcasts.
diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-blend.ll b/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-blend.ll
--- a/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-blend.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-blend.ll
@@ -1,29 +1,16 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512F,X86-AVX512F
 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512F,X64-AVX512F
-; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512BW
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512BW
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512BW,X86-AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512BW,X64-AVX512BW
 
 define <16 x i32> @shuffle_v8i64(<16 x i32> %t0, <16 x i32> %t1) {
-; AVX512F-LABEL: shuffle_v8i64:
-; AVX512F:       # %bb.0: # %entry
-; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm2
-; AVX512F-NEXT:    vpsubd %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT:    movb $-86, %al
-; AVX512F-NEXT:    kmovw %eax, %k1
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512F-NEXT:    ret{{[l|q]}}
-;
-; AVX512BW-LABEL: shuffle_v8i64:
-; AVX512BW:       # %bb.0: # %entry
-; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vpsubd %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    movb $-86, %al
-; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512BW-NEXT:    ret{{[l|q]}}
+; CHECK-LABEL: shuffle_v8i64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vpaddd %zmm1, %zmm0, %zmm2
+; CHECK-NEXT:    vpsubd %zmm1, %zmm0, %zmm0
+; CHECK-NEXT:    vshufps {{.*#+}} zmm0 = zmm2[0,1],zmm0[2,3],zmm2[4,5],zmm0[6,7],zmm2[8,9],zmm0[10,11],zmm2[12,13],zmm0[14,15]
+; CHECK-NEXT:    ret{{[l|q]}}
 entry:
   %t2 = add nsw <16 x i32> %t0, %t1
   %t3 = sub nsw <16 x i32> %t0, %t1
@@ -96,15 +83,24 @@
 ; X64-AVX512F-NEXT:    vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0
 ; X64-AVX512F-NEXT:    retq
 ;
-; AVX512BW-LABEL: addb_selectw_64xi8:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vpaddb %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vpsubb %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    movl $1, %eax
-; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    vmovdqu16 %zmm0, %zmm2 {%k1}
-; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512BW-NEXT:    ret{{[l|q]}}
+; X86-AVX512BW-LABEL: addb_selectw_64xi8:
+; X86-AVX512BW:       # %bb.0:
+; X86-AVX512BW-NEXT:    vpaddb %zmm1, %zmm0, %zmm2
+; X86-AVX512BW-NEXT:    movl $3, %eax
+; X86-AVX512BW-NEXT:    kmovd %eax, %k0
+; X86-AVX512BW-NEXT:    kmovd %k0, %k1
+; X86-AVX512BW-NEXT:    vpsubb %zmm1, %zmm0, %zmm2 {%k1}
+; X86-AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
+; X86-AVX512BW-NEXT:    retl
+;
+; X64-AVX512BW-LABEL: addb_selectw_64xi8:
+; X64-AVX512BW:       # %bb.0:
+; X64-AVX512BW-NEXT:    vpaddb %zmm1, %zmm0, %zmm2
+; X64-AVX512BW-NEXT:    movl $3, %eax
+; X64-AVX512BW-NEXT:    kmovq %rax, %k1
+; X64-AVX512BW-NEXT:    vpsubb %zmm1, %zmm0, %zmm2 {%k1}
+; X64-AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
+; X64-AVX512BW-NEXT:    retq
   %t2 = add nsw <64 x i8> %t0, %t1
   %t3 = sub nsw <64 x i8> %t0, %t1
   %t4 = shufflevector <64 x i8> %t2, <64 x i8> %t3, <64 x i32>
@@ -112,13 +108,22 @@
 }
 
 define <32 x i8> @addb_selectw_32xi8(<32 x i8> %t0, <32 x i8> %t1) {
-; CHECK-LABEL: addb_selectw_32xi8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpaddb %ymm1, %ymm0, %ymm2
-; CHECK-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
-; CHECK-NEXT:    ret{{[l|q]}}
+; AVX512F-LABEL: addb_selectw_32xi8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpaddb %ymm1, %ymm0, %ymm2
+; AVX512F-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
+; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
+; AVX512F-NEXT:    ret{{[l|q]}}
+;
+; AVX512BW-LABEL: addb_selectw_32xi8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpaddb %ymm1, %ymm0, %ymm2
+; AVX512BW-NEXT:    movl $3, %eax
+; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    vpsubb %ymm1, %ymm0, %ymm2 {%k1}
+; AVX512BW-NEXT:    vmovdqa %ymm2, %ymm0
+; AVX512BW-NEXT:    ret{{[l|q]}}
   %t2 = add nsw <32 x i8> %t0, %t1
   %t3 = sub nsw <32 x i8> %t0, %t1
   %t4 = shufflevector <32 x i8> %t2, <32 x i8> %t3, <32 x i32>
@@ -126,18 +131,49 @@
 }
 
 define <16 x i8> @addb_selectw_16xi8(<16 x i8> %t0, <16 x i8> %t1) {
-; CHECK-LABEL: addb_selectw_16xi8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm2
-; CHECK-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
-; CHECK-NEXT:    ret{{[l|q]}}
+; AVX512F-LABEL: addb_selectw_16xi8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpaddb %xmm1, %xmm0, %xmm2
+; AVX512F-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
+; AVX512F-NEXT:    ret{{[l|q]}}
+;
+; AVX512BW-LABEL: addb_selectw_16xi8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpaddb %xmm1, %xmm0, %xmm2
+; AVX512BW-NEXT:    movw $3, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    vpsubb %xmm1, %xmm0, %xmm2 {%k1}
+; AVX512BW-NEXT:    vmovdqa %xmm2, %xmm0
+; AVX512BW-NEXT:    ret{{[l|q]}}
   %t2 = add nsw <16 x i8> %t0, %t1
   %t3 = sub nsw <16 x i8> %t0, %t1
   %t4 = shufflevector <16 x i8> %t2, <16 x i8> %t3, <16 x i32>
   ret <16 x i8> %t4
 }
 
+define <8 x i8> @addb_selectw_8xi8(<8 x i8> %t0, <8 x i8> %t1) {
+; AVX512F-LABEL: addb_selectw_8xi8:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpaddb %xmm1, %xmm0, %xmm2
+; AVX512F-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
+; AVX512F-NEXT:    ret{{[l|q]}}
+;
+; AVX512BW-LABEL: addb_selectw_8xi8:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpaddb %xmm1, %xmm0, %xmm2
+; AVX512BW-NEXT:    movw $3, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    vpsubb %xmm1, %xmm0, %xmm2 {%k1}
+; AVX512BW-NEXT:    vmovdqa %xmm2, %xmm0
+; AVX512BW-NEXT:    ret{{[l|q]}}
+  %t2 = add nsw <8 x i8> %t0, %t1
+  %t3 = sub nsw <8 x i8> %t0, %t1
+  %t4 = shufflevector <8 x i8> %t2, <8 x i8> %t3, <8 x i32>
+  ret <8 x i8> %t4
+}
+
 define <32 x i16> @addw_selectd_32xi16(<32 x i16> %t0, <32 x i16> %t1) {
 ; AVX512F-LABEL: addw_selectd_32xi16:
 ; AVX512F:       # %bb.0:
@@ -156,10 +192,9 @@
 ; AVX512BW-LABEL: addw_selectd_32xi16:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpaddw %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vpsubw %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    movw $1, %ax
+; AVX512BW-NEXT:    movl $3, %eax
 ; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vpsubw %zmm1, %zmm0, %zmm2 {%k1}
 ; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; AVX512BW-NEXT:    ret{{[l|q]}}
   %t2 = add nsw <32 x i16> %t0, %t1
@@ -185,20 +220,18 @@
 ; AVX512F-LABEL: addd_selectq_16xi32:
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm2
-; AVX512F-NEXT:    vpsubd %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT:    movb $1, %al
+; AVX512F-NEXT:    movw $3, %ax
 ; AVX512F-NEXT:    kmovw %eax, %k1
-; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512F-NEXT:    vpsubd %zmm1, %zmm0, %zmm2 {%k1}
 ; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; AVX512F-NEXT:    ret{{[l|q]}}
 ;
 ; AVX512BW-LABEL: addd_selectq_16xi32:
 ; AVX512BW:       # %bb.0:
 ; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm2
-; AVX512BW-NEXT:    vpsubd %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    movb $1, %al
+; AVX512BW-NEXT:    movw $3, %ax
 ; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    vmovdqa64 %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vpsubd %zmm1, %zmm0, %zmm2 {%k1}
 ; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; AVX512BW-NEXT:    ret{{[l|q]}}
   %t2 = add nsw <16 x i32> %t0, %t1
@@ -221,3 +254,17 @@
 
   ret <8 x i32> %t4
 }
+
+define <4 x i32> @addd_selectq_4xi32(<4 x i32> %t0, <4 x i32> %t1) {
+; CHECK-LABEL: addd_selectq_4xi32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm2
+; CHECK-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
+; CHECK-NEXT:    ret{{[l|q]}}
+  %t2 = add nsw <4 x i32> %t0, %t1
+  %t3 = sub nsw <4 x i32> %t0, %t1
+  %t4 = shufflevector <4 x i32> %t2, <4 x i32> %t3, <4 x i32>
+
+  ret <4 x i32> %t4
+}
diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll
--- a/llvm/test/CodeGen/X86/combine-sdiv.ll
+++ b/llvm/test/CodeGen/X86/combine-sdiv.ll
@@ -2889,12 +2889,26 @@
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
 ; AVX1-NEXT:    retq
 ;
-; AVX2ORLATER-LABEL: combine_vec_sdiv_nonuniform7:
-; AVX2ORLATER:       # %bb.0:
-; AVX2ORLATER-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2ORLATER-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
-; AVX2ORLATER-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
-; AVX2ORLATER-NEXT:    retq
+; AVX2-LABEL: combine_vec_sdiv_nonuniform7:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: combine_vec_sdiv_nonuniform7:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: combine_vec_sdiv_nonuniform7:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpsubw %xmm0, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
+; AVX512BW-NEXT:    retq
 ;
 ; XOP-LABEL: combine_vec_sdiv_nonuniform7:
 ; XOP:       # %bb.0: