diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -39777,6 +39777,75 @@
   return SDValue();
 }
 
+/// If we are inverting a PTEST/TESTP operand, attempt to adjust the CC
+/// to avoid the inversion.
+static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
+                              SelectionDAG &DAG) {
+  // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
+  if (EFLAGS.getOpcode() != X86ISD::PTEST &&
+      EFLAGS.getOpcode() != X86ISD::TESTP)
+    return SDValue();
+
+  // PTEST/TESTP sets EFLAGS as:
+  // TESTZ: ZF = (Op0 & Op1) == 0
+  // TESTC: CF = (~Op0 & Op1) == 0
+  // TESTNZC: ZF == 0 && CF == 0
+  EVT VT = EFLAGS.getValueType();
+  SDValue Op0 = EFLAGS.getOperand(0);
+  SDValue Op1 = EFLAGS.getOperand(1);
+  EVT OpVT = Op0.getValueType();
+
+  // TEST*(~X,Y) == TEST*(X,Y)
+  if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
+    X86::CondCode InvCC;
+    switch (CC) {
+    case X86::COND_B:
+      // testc -> testz.
+      InvCC = X86::COND_E;
+      break;
+    case X86::COND_AE:
+      // !testc -> !testz.
+      InvCC = X86::COND_NE;
+      break;
+    case X86::COND_E:
+      // testz -> testc.
+      InvCC = X86::COND_B;
+      break;
+    case X86::COND_NE:
+      // !testz -> !testc.
+      InvCC = X86::COND_AE;
+      break;
+    case X86::COND_A:
+    case X86::COND_BE:
+      // testnzc -> testnzc (no change).
+      InvCC = CC;
+      break;
+    default:
+      InvCC = X86::COND_INVALID;
+      break;
+    }
+
+    if (InvCC != X86::COND_INVALID) {
+      CC = InvCC;
+      return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
+                         DAG.getBitcast(OpVT, NotOp0), Op1);
+    }
+  }
+
+  // TODO: TESTZ(X,~Y) == TESTC(Y,X)
+
+  // TESTZ(X,-1) == TESTZ(X,X)
+  // TESTZ(-1,X) == TESTZ(X,X)
+  if (CC == X86::COND_E || CC == X86::COND_NE) {
+    if (ISD::isBuildVectorAllOnes(Op0.getNode()))
+      return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
+    if (ISD::isBuildVectorAllOnes(Op1.getNode()))
+      return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
+  }
+
+  return SDValue();
+}
+
 /// Optimize an EFLAGS definition used according to the condition code \p CC
 /// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
 /// uses of chain values.
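The condition-code remapping above follows directly from the flag definitions quoted in the function comment: substituting ~X for Op0 turns the ZF predicate into the CF predicate and vice versa, and the all-ones folds hold because X & -1 == X. A minimal standalone sanity check of those identities (not part of the patch; it models each vector as a single 8-bit lane and brute-forces all operand values):

// Scalar model of the PTEST/TESTP flag identities used by combinePTESTCC.
// ZF = (Op0 & Op1) == 0, CF = (~Op0 & Op1) == 0.
#include <cassert>
#include <cstdint>

static bool testz(uint8_t Op0, uint8_t Op1) { return (Op0 & Op1) == 0; }           // ZF
static bool testc(uint8_t Op0, uint8_t Op1) { return (uint8_t(~Op0) & Op1) == 0; }  // CF

int main() {
  for (unsigned X = 0; X < 256; ++X)
    for (unsigned Y = 0; Y < 256; ++Y) {
      uint8_t NotX = uint8_t(~X);
      assert(testz(NotX, Y) == testc(X, Y)); // TESTZ(~X,Y) == TESTC(X,Y): COND_E <-> COND_B
      assert(testc(NotX, Y) == testz(X, Y)); // TESTC(~X,Y) == TESTZ(X,Y): COND_B <-> COND_E
      assert(testz(X, 0xFF) == testz(X, X)); // TESTZ(X,-1) == TESTZ(X,X)
    }
  return 0;
}

Because inverting Op0 merely swaps ZF and CF, the testnzc predicate (ZF == 0 && CF == 0) is unchanged, which is why COND_A/COND_BE keep the same condition code.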
@@ -39789,6 +39858,10 @@
   if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
     return R;
+
+  if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG))
+    return R;
+
   return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
 }
 
diff --git a/llvm/test/CodeGen/X86/combine-ptest.ll b/llvm/test/CodeGen/X86/combine-ptest.ll
--- a/llvm/test/CodeGen/X86/combine-ptest.ll
+++ b/llvm/test/CodeGen/X86/combine-ptest.ll
@@ -9,10 +9,8 @@
 ; CHECK-LABEL: ptestz_128_invert0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; CHECK-NEXT:    vptest %xmm1, %xmm0
-; CHECK-NEXT:    cmovnel %esi, %eax
+; CHECK-NEXT:    cmovael %esi, %eax
 ; CHECK-NEXT:    retq
   %t1 = xor <2 x i64> %c, <i64 -1, i64 -1>
   %t2 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %t1, <2 x i64> %d)
@@ -25,11 +23,8 @@
 ; CHECK-LABEL: ptestz_256_invert0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmptrueps %ymm2, %ymm2, %ymm2
-; CHECK-NEXT:    vxorps %ymm2, %ymm0, %ymm0
 ; CHECK-NEXT:    vptest %ymm1, %ymm0
-; CHECK-NEXT:    cmovnel %esi, %eax
+; CHECK-NEXT:    cmovael %esi, %eax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %t1 = xor <4 x i64> %c, <i64 -1, i64 -1, i64 -1, i64 -1>
@@ -85,10 +80,8 @@
 ; CHECK-LABEL: ptestc_128_invert0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; CHECK-NEXT:    vptest %xmm1, %xmm0
-; CHECK-NEXT:    cmovael %esi, %eax
+; CHECK-NEXT:    cmovnel %esi, %eax
 ; CHECK-NEXT:    retq
   %t1 = xor <2 x i64> %c, <i64 -1, i64 -1>
   %t2 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %t1, <2 x i64> %d)
@@ -101,11 +94,8 @@
 ; CHECK-LABEL: ptestc_256_invert0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmptrueps %ymm2, %ymm2, %ymm2
-; CHECK-NEXT:    vxorps %ymm2, %ymm0, %ymm0
 ; CHECK-NEXT:    vptest %ymm1, %ymm0
-; CHECK-NEXT:    cmovael %esi, %eax
+; CHECK-NEXT:    cmovnel %esi, %eax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %t1 = xor <4 x i64> %c, <i64 -1, i64 -1, i64 -1, i64 -1>
@@ -123,10 +113,8 @@
 ; CHECK-LABEL: ptestnzc_128_invert0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; CHECK-NEXT:    vptest %xmm1, %xmm0
-; CHECK-NEXT:    cmovael %esi, %eax
+; CHECK-NEXT:    cmovnel %esi, %eax
 ; CHECK-NEXT:    retq
   %t1 = xor <2 x i64> %c, <i64 -1, i64 -1>
   %t2 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %t1, <2 x i64> %d)
@@ -139,9 +127,6 @@
 ; CHECK-LABEL: ptestnzc_256_invert0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmptrueps %ymm2, %ymm2, %ymm2
-; CHECK-NEXT:    vxorps %ymm2, %ymm0, %ymm0
 ; CHECK-NEXT:    vptest %ymm1, %ymm0
 ; CHECK-NEXT:    cmovbel %esi, %eax
 ; CHECK-NEXT:    vzeroupper
@@ -153,6 +138,21 @@
   ret i32 %t4
 }
 
+define i32 @ptestnzc_256_invert0_commute(<4 x i64> %c, <4 x i64> %d, i32 %a, i32 %b) {
+; CHECK-LABEL: ptestnzc_256_invert0_commute:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    vptest %ymm1, %ymm0
+; CHECK-NEXT:    cmoval %esi, %eax
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+  %t1 = xor <4 x i64> %c, <i64 -1, i64 -1, i64 -1, i64 -1>
+  %t2 = call i32 @llvm.x86.avx.ptestnzc.256(<4 x i64> %t1, <4 x i64> %d)
+  %t3 = icmp eq i32 %t2, 0
+  %t4 = select i1 %t3, i32 %a, i32 %b
+  ret i32 %t4
+}
+
 ;
 ; testz(-1,X) -> testz(X,X)
 ;
@@ -161,8 +161,7 @@
 ; CHECK-LABEL: ptestz_128_allones0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vptest %xmm0, %xmm1
+; CHECK-NEXT:    vptest %xmm0, %xmm0
 ; CHECK-NEXT:    cmovnel %esi, %eax
 ; CHECK-NEXT:    retq
   %t1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> <i64 -1, i64 -1>, <2 x i64> %c)
@@ -175,9 +174,7 @@
 ; CHECK-LABEL: ptestz_256_allones0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vcmptrueps %ymm1, %ymm1, %ymm1
-; CHECK-NEXT:    vptest %ymm0, %ymm1
+; CHECK-NEXT:    vptest %ymm0, %ymm0
 ; CHECK-NEXT:    cmovnel %esi, %eax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -195,8 +192,7 @@
 ; CHECK-LABEL: ptestz_128_allones1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vptest %xmm1, %xmm0
+; CHECK-NEXT:    vptest %xmm0, %xmm0
 ; CHECK-NEXT:    cmovnel %esi, %eax
 ; CHECK-NEXT:    retq
   %t1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %c, <2 x i64> <i64 -1, i64 -1>)
@@ -209,9 +205,7 @@
 ; CHECK-LABEL: ptestz_256_allones1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vcmptrueps %ymm1, %ymm1, %ymm1
-; CHECK-NEXT:    vptest %ymm1, %ymm0
+; CHECK-NEXT:    vptest %ymm0, %ymm0
 ; CHECK-NEXT:    cmovnel %esi, %eax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -226,10 +220,8 @@
 ; CHECK:       # %bb.0: # %start
 ; CHECK-NEXT:    vmovdqa (%rdi), %xmm0
 ; CHECK-NEXT:    vpcmpgtb (%rsi), %xmm0, %xmm0
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpxor %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vptest %xmm1, %xmm0
-; CHECK-NEXT:    setb %al
+; CHECK-NEXT:    vptest %xmm0, %xmm0
+; CHECK-NEXT:    sete %al
 ; CHECK-NEXT:    retq
 start:
   %0 = load <16 x i8>, <16 x i8>* %x, align 16
diff --git a/llvm/test/CodeGen/X86/combine-testpd.ll b/llvm/test/CodeGen/X86/combine-testpd.ll
--- a/llvm/test/CodeGen/X86/combine-testpd.ll
+++ b/llvm/test/CodeGen/X86/combine-testpd.ll
@@ -9,10 +9,8 @@
 ; CHECK-LABEL: testpdz_128_invert0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; CHECK-NEXT:    vtestpd %xmm1, %xmm0
-; CHECK-NEXT:    cmovnel %esi, %eax
+; CHECK-NEXT:    cmovael %esi, %eax
 ; CHECK-NEXT:    retq
   %t0 = bitcast <2 x double> %c to <2 x i64>
   %t1 = xor <2 x i64> %t0, <i64 -1, i64 -1>
@@ -27,11 +25,8 @@
 ; CHECK-LABEL: testpdz_256_invert0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmptrueps %ymm2, %ymm2, %ymm2
-; CHECK-NEXT:    vxorps %ymm2, %ymm0, %ymm0
 ; CHECK-NEXT:    vtestpd %ymm1, %ymm0
-; CHECK-NEXT:    cmovnel %esi, %eax
+; CHECK-NEXT:    cmovael %esi, %eax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %t0 = bitcast <4 x double> %c to <4 x i64>
@@ -93,10 +88,8 @@
 ; CHECK-LABEL: testpdc_128_invert0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; CHECK-NEXT:    vtestpd %xmm1, %xmm0
-; CHECK-NEXT:    cmovael %esi, %eax
+; CHECK-NEXT:    cmovnel %esi, %eax
 ; CHECK-NEXT:    retq
   %t0 = bitcast <2 x double> %c to <2 x i64>
   %t1 = xor <2 x i64> %t0, <i64 -1, i64 -1>
@@ -111,11 +104,8 @@
 ; CHECK-LABEL: testpdc_256_invert0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmptrueps %ymm2, %ymm2, %ymm2
-; CHECK-NEXT:    vxorps %ymm2, %ymm0, %ymm0
 ; CHECK-NEXT:    vtestpd %ymm1, %ymm0
-; CHECK-NEXT:    cmovael %esi, %eax
+; CHECK-NEXT:    cmovnel %esi, %eax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %t0 = bitcast <4 x double> %c to <4 x i64>
@@ -135,8 +125,6 @@
 ; CHECK-LABEL: testpdnzc_128_invert0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; CHECK-NEXT:    vtestpd %xmm1, %xmm0
 ; CHECK-NEXT:    cmovbel %esi, %eax
 ; CHECK-NEXT:    retq
@@ -153,9 +141,6 @@
 ; CHECK-LABEL: testpdnzc_256_invert0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmptrueps %ymm2, %ymm2, %ymm2
-; CHECK-NEXT:    vxorps %ymm2, %ymm0, %ymm0
 ; CHECK-NEXT:    vtestpd %ymm1, %ymm0
 ; CHECK-NEXT:    cmovbel %esi, %eax
 ; CHECK-NEXT:    vzeroupper
diff --git a/llvm/test/CodeGen/X86/combine-testps.ll b/llvm/test/CodeGen/X86/combine-testps.ll
--- a/llvm/test/CodeGen/X86/combine-testps.ll
+++ b/llvm/test/CodeGen/X86/combine-testps.ll
@@ -9,10 +9,8 @@
 ; CHECK-LABEL: testpsz_128_invert0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; CHECK-NEXT:    vtestps %xmm1, %xmm0
-; CHECK-NEXT:    cmovnel %esi, %eax
+; CHECK-NEXT:    cmovael %esi, %eax
 ; CHECK-NEXT:    retq
   %t0 = bitcast <4 x float> %c to <2 x i64>
   %t1 = xor <2 x i64> %t0, <i64 -1, i64 -1>
@@ -27,11 +25,8 @@
 ; CHECK-LABEL: testpsz_256_invert0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmptrueps %ymm2, %ymm2, %ymm2
-; CHECK-NEXT:    vxorps %ymm2, %ymm0, %ymm0
 ; CHECK-NEXT:    vtestps %ymm1, %ymm0
-; CHECK-NEXT:    cmovnel %esi, %eax
+; CHECK-NEXT:    cmovael %esi, %eax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %t0 = bitcast <8 x float> %c to <4 x i64>
@@ -93,10 +88,8 @@
 ; CHECK-LABEL: testpsc_128_invert0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; CHECK-NEXT:    vtestps %xmm1, %xmm0
-; CHECK-NEXT:    cmovael %esi, %eax
+; CHECK-NEXT:    cmovnel %esi, %eax
 ; CHECK-NEXT:    retq
   %t0 = bitcast <4 x float> %c to <2 x i64>
   %t1 = xor <2 x i64> %t0, <i64 -1, i64 -1>
@@ -111,11 +104,8 @@
 ; CHECK-LABEL: testpsc_256_invert0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmptrueps %ymm2, %ymm2, %ymm2
-; CHECK-NEXT:    vxorps %ymm2, %ymm0, %ymm0
 ; CHECK-NEXT:    vtestps %ymm1, %ymm0
-; CHECK-NEXT:    cmovael %esi, %eax
+; CHECK-NEXT:    cmovnel %esi, %eax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %t0 = bitcast <8 x float> %c to <4 x i64>
@@ -135,8 +125,6 @@
 ; CHECK-LABEL: testpsnzc_128_invert0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; CHECK-NEXT:    vtestps %xmm1, %xmm0
 ; CHECK-NEXT:    cmovbel %esi, %eax
 ; CHECK-NEXT:    retq
@@ -153,9 +141,6 @@
 ; CHECK-LABEL: testpsnzc_256_invert0:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmptrueps %ymm2, %ymm2, %ymm2
-; CHECK-NEXT:    vxorps %ymm2, %ymm0, %ymm0
 ; CHECK-NEXT:    vtestps %ymm1, %ymm0
 ; CHECK-NEXT:    cmovbel %esi, %eax
 ; CHECK-NEXT:    vzeroupper
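For reference, the inverted-operand pattern these tests exercise typically comes from intrinsics code along the following lines (a hypothetical reproducer, not taken from the patch; the function and variable names are illustrative and it requires SSE4.1, e.g. -msse4.1):

#include <immintrin.h>

// Returns 1 when every bit set in y is also set in x, i.e. (~x & y) == 0.
// Written with an explicit NOT feeding testz; with this combine the NOT is
// folded away and the same vptest result is consumed via the carry flag
// (testc-style condition) instead of the zero flag.
int bits_of_y_covered_by_x(__m128i x, __m128i y) {
  __m128i not_x = _mm_xor_si128(x, _mm_set1_epi32(-1)); // ~x
  return _mm_testz_si128(not_x, y);                     // ZF of (~x & y)
}

_mm_testz_si128 lowers to the @llvm.x86.sse41.ptestz intrinsic checked in the tests above, so before this change the ~x had to be materialized with a pcmpeqd/pxor pair; afterwards the same source compiles to a single vptest whose result is read with the adjusted condition code.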