diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -45012,6 +45012,12 @@
   if (!IsAnyOf && !IsAllOf)
     return SDValue();
 
+  // TODO: Check whether more combining cases need this one-use restriction.
+  // The number of uses of the cmp result decides whether we combine here.
+  // Currently only the "MOVMSK(CONCAT(..))" and "MOVMSK(PCMPEQ(..))"
+  // combines below are covered by tests and guarded by this constraint.
+  bool IsOneUse = CmpOp.getNode()->hasOneUse();
+
   // See if we can peek through to a vector with a wider element type, if the
   // signbits extend down to all the sub-elements as well.
   // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
@@ -45040,7 +45046,7 @@
   // MOVMSK(CONCAT(X,Y)) != 0 -> MOVMSK(OR(X,Y)).
   // MOVMSK(CONCAT(X,Y)) == -1 -> MOVMSK(AND(X,Y)).
   // MOVMSK(CONCAT(X,Y)) != -1 -> MOVMSK(AND(X,Y)).
-  if (VecVT.is256BitVector() && NumElts <= CmpBits) {
+  if (VecVT.is256BitVector() && NumElts <= CmpBits && IsOneUse) {
     SmallVector<SDValue> Ops;
     if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops) &&
         Ops.size() == 2) {
@@ -45061,7 +45067,7 @@
   // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
   // MOVMSK(PCMPEQ(X,Y)) == -1 -> PTESTZ(SUB(X,Y),SUB(X,Y)).
   // MOVMSK(PCMPEQ(X,Y)) != -1 -> !PTESTZ(SUB(X,Y),SUB(X,Y)).
-  if (IsAllOf && Subtarget.hasSSE41()) {
+  if (IsAllOf && Subtarget.hasSSE41() && IsOneUse) {
     MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
     SDValue BC = peekThroughBitcasts(Vec);
     // Ensure MOVMSK was testing every signbit of BC.
diff --git a/llvm/test/CodeGen/X86/vector-compare-all_of.ll b/llvm/test/CodeGen/X86/vector-compare-all_of.ll
--- a/llvm/test/CodeGen/X86/vector-compare-all_of.ll
+++ b/llvm/test/CodeGen/X86/vector-compare-all_of.ll
@@ -881,7 +881,7 @@
   ret i8 %11
 }
 
-; FIXME: Should not "MOVMSK(PCMPEQ(..)) -> PTESTZ(..)" when cmp result has muti-uses.
+; Should not fold "MOVMSK(PCMPEQ(..)) -> PTESTZ(..)" when the cmp result has multiple uses.
 define i32 @test_v32i8_muti_uses(<32 x i8> %x, <32 x i8>%y, i32 %z) {
 ; SSE-LABEL: test_v32i8_muti_uses:
 ; SSE:       # %bb.0:
@@ -914,10 +914,9 @@
 ;
 ; AVX2-LABEL: test_v32i8_muti_uses:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vpmovmskb %ymm2, %ecx
-; AVX2-NEXT:    vpsubb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vptest %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpmovmskb %ymm0, %ecx
+; AVX2-NEXT:    cmpl $-1, %ecx
 ; AVX2-NEXT:    movl $16, %eax
 ; AVX2-NEXT:    cmovnel %ecx, %eax
 ; AVX2-NEXT:    vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-compare-any_of.ll b/llvm/test/CodeGen/X86/vector-compare-any_of.ll
--- a/llvm/test/CodeGen/X86/vector-compare-any_of.ll
+++ b/llvm/test/CodeGen/X86/vector-compare-any_of.ll
@@ -1358,11 +1358,9 @@
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpcmpeqb %xmm1, %xmm2, %xmm1
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm2
-; AVX2-NEXT:    vpmovmskb %ymm2, %eax
-; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpmovmskb %xmm0, %ecx
-; AVX2-NEXT:    testl %ecx, %ecx
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpmovmskb %ymm0, %eax
+; AVX2-NEXT:    testl %eax, %eax
 +; AVX2-NEXT:    sete %dl
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
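
For context, a minimal IR sketch of the multi-use pattern the new IsOneUse guard targets; this is a hypothetical reduced example, not the actual body of @test_v32i8_muti_uses. Because %mask has a second consumer besides the all-of check, folding the compare into PTEST could not remove the PCMPEQB/PMOVMSKB pair and would only add instructions.

; Hypothetical reduced example, assuming the usual movmsk all-of idiom.
define i32 @allof_mask_multiuse(<32 x i8> %x, <32 x i8> %y, i32 %z) {
  %cmp  = icmp eq <32 x i8> %x, %y            ; lowers to (V)PCMPEQB
  %mask = bitcast <32 x i1> %cmp to i32       ; lowers to (V)PMOVMSKB
  %all  = icmp eq i32 %mask, -1               ; "all lanes equal" check
  %res  = select i1 %all, i32 %z, i32 %mask   ; second use of %mask
  ret i32 %res
}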