diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -16871,6 +16871,12 @@ return SDValue(); } + // Avoid returning the same shuffle operation. For example, + // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5, + // undef:v16i16 + if (CrossLaneMask == Mask || InLaneMask == Mask) + return SDValue(); + SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask); return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT), InLaneMask); @@ -17621,8 +17627,6 @@ RepeatedMask[Idx] = M + (Lane * NumLaneElts); } } - SDValue RepeatedShuffle = - DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask); // Shuffle each source sub-lane to its destination. SmallVector SubLaneMask((unsigned)NumElts, -1); @@ -17634,6 +17638,14 @@ SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts); } + // Avoid returning the same shuffle operation. + // v8i32 = vector_shuffle<0,1,4,5,2,3,6,7> t5, undef:v8i32 + if (RepeatedMask == Mask || SubLaneMask == Mask) + return SDValue(); + + SDValue RepeatedShuffle = + DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask); + return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT), SubLaneMask); }; @@ -19385,6 +19397,44 @@ return false; } +static bool canCombineAsMaskOperation(SDValue V1, SDValue V2, + const X86Subtarget &Subtarget) { + if (!Subtarget.hasAVX512()) + return false; + + MVT VT = V1.getSimpleValueType().getScalarType(); + if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI()) + return false; + + // i8 is better widened to i16, because there is PBLENDW for vXi16 + // when the vector bit size is 128 or 256. + if (VT == MVT::i8 && V1.getSimpleValueType().getSizeInBits() < 512) + return false; + + auto HasMaskOperation = [&](SDValue V) { + // TODO: Currently we only check limited opcodes. 
We probably extend + // it to all binary operations by checking TLI.isBinOp(). + switch (V->getOpcode()) { + default: + return false; + case ISD::ADD: + case ISD::SUB: + case ISD::AND: + case ISD::XOR: + break; + } + if (!V->hasOneUse()) + return false; + + return true; + }; + + if (HasMaskOperation(V1) || HasMaskOperation(V2)) + return true; + + return false; +} + // Forward declaration. static SDValue canonicalizeShuffleMaskWithHorizOp( MutableArrayRef Ops, MutableArrayRef Mask, @@ -19460,6 +19510,7 @@ // integers to handle flipping the low and high halves of AVX 256-bit vectors. SmallVector WidenedMask; if (VT.getScalarSizeInBits() < 64 && !Is1BitVector && + !canCombineAsMaskOperation(V1, V2, Subtarget) && canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) { // Shuffle mask widening should not interfere with a broadcast opportunity // by obfuscating the operands with bitcasts. diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-blend.ll b/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-blend.ll --- a/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-blend.ll +++ b/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-blend.ll @@ -1,29 +1,16 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512F,X86-AVX512F ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX512F,X64-AVX512F -; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512BW -; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512BW +; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=CHECK,AVX512BW,X86-AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512vl,+avx512bw | 
FileCheck %s --check-prefixes=CHECK,AVX512BW,X64-AVX512BW define <16 x i32> @shuffle_v8i64(<16 x i32> %t0, <16 x i32> %t1) { -; AVX512F-LABEL: shuffle_v8i64: -; AVX512F: # %bb.0: # %entry -; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 -; AVX512F-NEXT: vpsubd %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: movb $-86, %al -; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512F-NEXT: ret{{[l|q]}} -; -; AVX512BW-LABEL: shuffle_v8i64: -; AVX512BW: # %bb.0: # %entry -; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpsubd %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: movb $-86, %al -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: ret{{[l|q]}} +; CHECK-LABEL: shuffle_v8i64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm2 +; CHECK-NEXT: vpsubd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vshufps {{.*#+}} zmm0 = zmm2[0,1],zmm0[2,3],zmm2[4,5],zmm0[6,7],zmm2[8,9],zmm0[10,11],zmm2[12,13],zmm0[14,15] +; CHECK-NEXT: ret{{[l|q]}} entry: %t2 = add nsw <16 x i32> %t0, %t1 %t3 = sub nsw <16 x i32> %t0, %t1 @@ -96,15 +83,24 @@ ; X64-AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 ; X64-AVX512F-NEXT: retq ; -; AVX512BW-LABEL: addb_selectw_64xi8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpsubb %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: movl $1, %eax -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 -; AVX512BW-NEXT: ret{{[l|q]}} +; X86-AVX512BW-LABEL: addb_selectw_64xi8: +; X86-AVX512BW: # %bb.0: +; X86-AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm2 +; X86-AVX512BW-NEXT: movl $3, %eax +; X86-AVX512BW-NEXT: kmovd %eax, %k0 +; X86-AVX512BW-NEXT: kmovd %k0, %k1 +; X86-AVX512BW-NEXT: vpsubb %zmm1, %zmm0, %zmm2 {%k1} +; X86-AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; X86-AVX512BW-NEXT: retl +; +; 
X64-AVX512BW-LABEL: addb_selectw_64xi8: +; X64-AVX512BW: # %bb.0: +; X64-AVX512BW-NEXT: vpaddb %zmm1, %zmm0, %zmm2 +; X64-AVX512BW-NEXT: movl $3, %eax +; X64-AVX512BW-NEXT: kmovq %rax, %k1 +; X64-AVX512BW-NEXT: vpsubb %zmm1, %zmm0, %zmm2 {%k1} +; X64-AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 +; X64-AVX512BW-NEXT: retq %t2 = add nsw <64 x i8> %t0, %t1 %t3 = sub nsw <64 x i8> %t0, %t1 %t4 = shufflevector <64 x i8> %t2, <64 x i8> %t3, <64 x i32> @@ -169,10 +165,9 @@ ; AVX512BW-LABEL: addw_selectd_32xi16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpaddw %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpsubw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: movw $1, %ax +; AVX512BW-NEXT: movl $3, %eax ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vpsubw %zmm1, %zmm0, %zmm2 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512BW-NEXT: ret{{[l|q]}} %t2 = add nsw <32 x i16> %t0, %t1 @@ -198,20 +193,18 @@ ; AVX512F-LABEL: addd_selectq_16xi32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm2 -; AVX512F-NEXT: vpsubd %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: movb $1, %al +; AVX512F-NEXT: movw $3, %ax ; AVX512F-NEXT: kmovw %eax, %k1 -; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512F-NEXT: vpsubd %zmm1, %zmm0, %zmm2 {%k1} ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512F-NEXT: ret{{[l|q]}} ; ; AVX512BW-LABEL: addd_selectq_16xi32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vpaddd %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vpsubd %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: movb $1, %al +; AVX512BW-NEXT: movw $3, %ax ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vpsubd %zmm1, %zmm0, %zmm2 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512BW-NEXT: ret{{[l|q]}} %t2 = add nsw <16 x i32> %t0, %t1 diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll --- a/llvm/test/CodeGen/X86/combine-sdiv.ll +++ b/llvm/test/CodeGen/X86/combine-sdiv.ll @@ -2889,12 +2889,26 @@ ; AVX1-NEXT: 
vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] ; AVX1-NEXT: retq ; -; AVX2ORLATER-LABEL: combine_vec_sdiv_nonuniform7: -; AVX2ORLATER: # %bb.0: -; AVX2ORLATER-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2ORLATER-NEXT: vpsubw %xmm0, %xmm1, %xmm1 -; AVX2ORLATER-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] -; AVX2ORLATER-NEXT: retq +; AVX2-LABEL: combine_vec_sdiv_nonuniform7: +; AVX2: # %bb.0: +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsubw %xmm0, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX2-NEXT: retq +; +; AVX512F-LABEL: combine_vec_sdiv_nonuniform7: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512F-NEXT: vpsubw %xmm0, %xmm1, %xmm1 +; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3] +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: combine_vec_sdiv_nonuniform7: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512BW-NEXT: vpsubw %xmm0, %xmm1, %xmm1 +; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; AVX512BW-NEXT: retq ; ; XOP-LABEL: combine_vec_sdiv_nonuniform7: ; XOP: # %bb.0: