diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -45136,6 +45136,65 @@
   return SDValue();
 }
 
+static SDValue combinePTEST(SDNode *N, SelectionDAG &DAG) {
+  assert((N->getOpcode() == X86ISD::PTEST || N->getOpcode() == X86ISD::TESTP) &&
+         "Unsupported TEST instruction");
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  EVT OpVT = Op0.getValueType();
+  EVT VT = N->getValueType(0);
+
+  // TEST*(~X,Y) == TEST*(X,Y)
+  if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
+    // TODO - do we need more thorough CondCode coverage?
+    if (llvm::all_of(N->uses(), [](SDNode *U) {
+          return U->getOpcode() == X86ISD::SETCC &&
+                 (U->getConstantOperandVal(0) == X86::COND_A ||
+                  U->getConstantOperandVal(0) == X86::COND_B ||
+                  U->getConstantOperandVal(0) == X86::COND_E);
+        })) {
+      SDLoc DL(N);
+      SDValue NewTEST = DAG.getNode(N->getOpcode(), DL, VT,
+                                    DAG.getBitcast(OpVT, NotOp0), Op1);
+      for (SDNode *U : N->uses()) {
+        X86::CondCode CC = (X86::CondCode)U->getConstantOperandVal(0);
+        X86::CondCode InvCC;
+        switch (CC) {
+        default:
+          llvm_unreachable("Unhandled PTEST/TESTP EFLAG");
+        case X86::COND_B:
+          // testc -> testz.
+          InvCC = X86::COND_E;
+          break;
+        case X86::COND_E:
+          // testz -> testc.
+          InvCC = X86::COND_B;
+          break;
+        case X86::COND_A:
+          // testnzc -> testnzc (no change).
+          InvCC = CC;
+          break;
+        }
+        SDValue InvU = getSETCC(InvCC, NewTEST, DL, DAG);
+        DAG.ReplaceAllUsesWith(U, InvU.getNode());
+      }
+      return NewTEST;
+    }
+  }
+
+  // TODO: Handle TEST*(X,~Y) == TEST*(Y,X)
+
+  // TESTZ(X,-1) == TESTZ(X,X)
+  if (ISD::isBuildVectorAllOnes(Op1.getNode()) &&
+      llvm::all_of(N->uses(), [](SDNode *U) {
+        return U->getOpcode() == X86ISD::SETCC &&
+               U->getConstantOperandVal(0) == X86::COND_E;
+      })) {
+    return DAG.getNode(N->getOpcode(), SDLoc(N), VT, Op0, Op0);
+  }
+
+  return SDValue();
+}
+
 static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG,
                                        TargetLowering::DAGCombinerInfo &DCI) {
   // With vector masks we only demand the upper bit of the mask.
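For reference, PTEST and TESTP set ZF = ((Op0 & Op1) == 0) and CF = ((~Op0 & Op1) == 0), so inverting the first operand simply exchanges the two flags, and an all-ones second operand makes ZF depend on Op0 alone. A minimal scalar model of these identities (illustrative only, not part of the patch):

    // Scalar sketch of the PTEST/TESTP flag identities used by combinePTEST above.
    #include <cassert>
    #include <cstdint>
    #include <initializer_list>

    struct Flags { bool ZF, CF; };

    static Flags ptest(uint64_t X, uint64_t Y) {
      return {(X & Y) == 0, (~X & Y) == 0};
    }

    int main() {
      for (uint64_t X : {0x0ull, 0x1ull, 0xF0F0ull, ~0x0ull})
        for (uint64_t Y : {0x0ull, 0xFFull, ~0x0ull}) {
          // TEST*(~X,Y) == TEST*(X,Y) with ZF and CF swapped (COND_E <-> COND_B).
          assert(ptest(~X, Y).ZF == ptest(X, Y).CF);
          assert(ptest(~X, Y).CF == ptest(X, Y).ZF);
          // TESTZ(X,-1) == TESTZ(X,X).
          assert(ptest(X, ~0x0ull).ZF == ptest(X, X).ZF);
        }
      return 0;
    }
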
@@ -47205,6 +47264,8 @@
   case ISD::FNEG:           return combineFneg(N, DAG, DCI, Subtarget);
   case ISD::TRUNCATE:       return combineTruncate(N, DAG, Subtarget);
   case X86ISD::VTRUNC:      return combineVTRUNC(N, DAG);
+  case X86ISD::PTEST:
+  case X86ISD::TESTP:       return combinePTEST(N, DAG);
   case X86ISD::ANDNP:       return combineAndnp(N, DAG, DCI, Subtarget);
   case X86ISD::FAND:        return combineFAnd(N, DAG, Subtarget);
   case X86ISD::FANDN:       return combineFAndn(N, DAG, Subtarget);
diff --git a/llvm/test/CodeGen/X86/combine-ptest.ll b/llvm/test/CodeGen/X86/combine-ptest.ll
--- a/llvm/test/CodeGen/X86/combine-ptest.ll
+++ b/llvm/test/CodeGen/X86/combine-ptest.ll
@@ -9,9 +9,7 @@
 ; CHECK-LABEL: ptestz_128_invert:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpxor %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vptest %xmm1, %xmm0
+; CHECK-NEXT:    vptest %xmm0, %xmm0
 ; CHECK-NEXT:    cmovnel %esi, %eax
 ; CHECK-NEXT:    retq
   %t1 = xor <2 x i64> %c, <i64 -1, i64 -1>
@@ -25,10 +23,7 @@
 ; CHECK-LABEL: ptestz_256_invert:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vcmptrueps %ymm1, %ymm1, %ymm1
-; CHECK-NEXT:    vxorps %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    vptest %ymm1, %ymm0
+; CHECK-NEXT:    vptest %ymm0, %ymm0
 ; CHECK-NEXT:    cmovnel %esi, %eax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -47,10 +42,8 @@
 ; CHECK-LABEL: ptestc_128_invert:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpxor %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vptest %xmm1, %xmm0
-; CHECK-NEXT:    cmovael %esi, %eax
+; CHECK-NEXT:    vptest %xmm0, %xmm0
+; CHECK-NEXT:    cmovnel %esi, %eax
 ; CHECK-NEXT:    retq
   %t1 = xor <2 x i64> %c, <i64 -1, i64 -1>
   %t2 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %t1, <2 x i64> <i64 -1, i64 -1>)
@@ -63,11 +56,8 @@
 ; CHECK-LABEL: ptestc_256_invert:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vcmptrueps %ymm1, %ymm1, %ymm1
-; CHECK-NEXT:    vxorps %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    vptest %ymm1, %ymm0
-; CHECK-NEXT:    cmovael %esi, %eax
+; CHECK-NEXT:    vptest %ymm0, %ymm0
+; CHECK-NEXT:    cmovnel %esi, %eax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %t1 = xor <4 x i64> %c, <i64 -1, i64 -1, i64 -1, i64 -1>
@@ -85,10 +75,8 @@
 ; CHECK-LABEL: ptestnzc_128_invert:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpxor %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vptest %xmm1, %xmm0
-; CHECK-NEXT:    cmovael %esi, %eax
+; CHECK-NEXT:    vptest %xmm0, %xmm0
+; CHECK-NEXT:    cmovnel %esi, %eax
 ; CHECK-NEXT:    retq
   %t1 = xor <2 x i64> %c, <i64 -1, i64 -1>
   %t2 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %t1, <2 x i64> <i64 -1, i64 -1>)
@@ -103,7 +91,6 @@
 ; CHECK-NEXT:    movl %edi, %eax
 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    vcmptrueps %ymm1, %ymm1, %ymm1
-; CHECK-NEXT:    vxorps %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    vptest %ymm1, %ymm0
 ; CHECK-NEXT:    cmovbel %esi, %eax
 ; CHECK-NEXT:    vzeroupper
@@ -123,10 +110,8 @@
 ; CHECK-LABEL: ptestnzc_128_flip:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; CHECK-NEXT:    vptest %xmm1, %xmm0
-; CHECK-NEXT:    cmovael %esi, %eax
+; CHECK-NEXT:    cmovnel %esi, %eax
 ; CHECK-NEXT:    retq
   %t1 = xor <2 x i64> %c, <i64 -1, i64 -1>
   %t2 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %t1, <2 x i64> %d)
@@ -139,9 +124,6 @@
 ; CHECK-LABEL: ptestnzc_256_flip:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmptrueps %ymm2, %ymm2, %ymm2
-; CHECK-NEXT:    vxorps %ymm2, %ymm0, %ymm0
 ; CHECK-NEXT:    vptest %ymm1, %ymm0
 ; CHECK-NEXT:    cmovbel %esi, %eax
 ; CHECK-NEXT:    vzeroupper
@@ -158,10 +140,8 @@
 ; CHECK:       # %bb.0: # %start
 ; CHECK-NEXT:    vmovdqa (%rdi), %xmm0
 ; CHECK-NEXT:    vpcmpgtb (%rsi), %xmm0, %xmm0
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpxor %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vptest %xmm1, %xmm0
-; CHECK-NEXT:    setb %al
+; CHECK-NEXT:    vptest %xmm0, %xmm0
+; CHECK-NEXT:    sete %al
 ; CHECK-NEXT:    retq
 start:
   %0 = load <16 x i8>, <16 x i8>* %x, align 16
diff --git a/llvm/test/CodeGen/X86/combine-testpd.ll b/llvm/test/CodeGen/X86/combine-testpd.ll
--- a/llvm/test/CodeGen/X86/combine-testpd.ll
+++ b/llvm/test/CodeGen/X86/combine-testpd.ll
@@ -9,9 +9,7 @@
 ; CHECK-LABEL: testpdz_128_invert:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpxor %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vtestpd %xmm1, %xmm0
+; CHECK-NEXT:    vtestpd %xmm0, %xmm0
 ; CHECK-NEXT:    cmovnel %esi, %eax
 ; CHECK-NEXT:    retq
   %t0 = bitcast <2 x double> %c to <2 x i64>
@@ -27,10 +25,7 @@
 ; CHECK-LABEL: testpdz_256_invert:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vcmptrueps %ymm1, %ymm1, %ymm1
-; CHECK-NEXT:    vxorps %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    vtestpd %ymm1, %ymm0
+; CHECK-NEXT:    vtestpd %ymm0, %ymm0
 ; CHECK-NEXT:    cmovnel %esi, %eax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -51,10 +46,8 @@
 ; CHECK-LABEL: testpdc_128_invert:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpxor %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vtestpd %xmm1, %xmm0
-; CHECK-NEXT:    cmovael %esi, %eax
+; CHECK-NEXT:    vtestpd %xmm0, %xmm0
+; CHECK-NEXT:    cmovnel %esi, %eax
 ; CHECK-NEXT:    retq
   %t0 = bitcast <2 x double> %c to <2 x i64>
   %t1 = xor <2 x i64> %t0, <i64 -1, i64 -1>
@@ -69,11 +62,8 @@
 ; CHECK-LABEL: testpdc_256_invert:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vcmptrueps %ymm1, %ymm1, %ymm1
-; CHECK-NEXT:    vxorps %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    vtestpd %ymm1, %ymm0
-; CHECK-NEXT:    cmovael %esi, %eax
+; CHECK-NEXT:    vtestpd %ymm0, %ymm0
+; CHECK-NEXT:    cmovnel %esi, %eax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %t0 = bitcast <4 x double> %c to <4 x i64>
@@ -94,7 +84,6 @@
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
 ; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    vtestpd %xmm1, %xmm0
 ; CHECK-NEXT:    cmovbel %esi, %eax
 ; CHECK-NEXT:    retq
@@ -113,7 +102,6 @@
 ; CHECK-NEXT:    movl %edi, %eax
 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    vcmptrueps %ymm1, %ymm1, %ymm1
-; CHECK-NEXT:    vxorps %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    vtestpd %ymm1, %ymm0
 ; CHECK-NEXT:    cmovbel %esi, %eax
 ; CHECK-NEXT:    vzeroupper
@@ -135,8 +123,6 @@
 ; CHECK-LABEL: testpdnzc_128_flip:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; CHECK-NEXT:    vtestpd %xmm1, %xmm0
 ; CHECK-NEXT:    cmovbel %esi, %eax
 ; CHECK-NEXT:    retq
@@ -153,9 +139,6 @@
 ; CHECK-LABEL: testpdnzc_256_flip:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmptrueps %ymm2, %ymm2, %ymm2
-; CHECK-NEXT:    vxorps %ymm2, %ymm0, %ymm0
 ; CHECK-NEXT:    vtestpd %ymm1, %ymm0
 ; CHECK-NEXT:    cmovbel %esi, %eax
 ; CHECK-NEXT:    vzeroupper
diff --git a/llvm/test/CodeGen/X86/combine-testps.ll b/llvm/test/CodeGen/X86/combine-testps.ll
--- a/llvm/test/CodeGen/X86/combine-testps.ll
+++ b/llvm/test/CodeGen/X86/combine-testps.ll
@@ -9,9 +9,7 @@
 ; CHECK-LABEL: testpsz_128_invert:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpxor %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vtestps %xmm1, %xmm0
+; CHECK-NEXT:    vtestps %xmm0, %xmm0
 ; CHECK-NEXT:    cmovnel %esi, %eax
 ; CHECK-NEXT:    retq
   %t0 = bitcast <4 x float> %c to <2 x i64>
@@ -27,10 +25,7 @@
 ; CHECK-LABEL: testpsz_256_invert:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vcmptrueps %ymm1, %ymm1, %ymm1
-; CHECK-NEXT:    vxorps %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    vtestps %ymm1, %ymm0
+; CHECK-NEXT:    vtestps %ymm0, %ymm0
 ; CHECK-NEXT:    cmovnel %esi, %eax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
@@ -51,10 +46,8 @@
 ; CHECK-LABEL: testpsc_128_invert:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpxor %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vtestps %xmm1, %xmm0
-; CHECK-NEXT:    cmovael %esi, %eax
+; CHECK-NEXT:    vtestps %xmm0, %xmm0
+; CHECK-NEXT:    cmovnel %esi, %eax
 ; CHECK-NEXT:    retq
   %t0 = bitcast <4 x float> %c to <2 x i64>
   %t1 = xor <2 x i64> %t0, <i64 -1, i64 -1>
@@ -69,11 +62,8 @@
 ; CHECK-LABEL: testpsc_256_invert:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vcmptrueps %ymm1, %ymm1, %ymm1
-; CHECK-NEXT:    vxorps %ymm1, %ymm0, %ymm0
-; CHECK-NEXT:    vtestps %ymm1, %ymm0
-; CHECK-NEXT:    cmovael %esi, %eax
+; CHECK-NEXT:    vtestps %ymm0, %ymm0
+; CHECK-NEXT:    cmovnel %esi, %eax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %t0 = bitcast <8 x float> %c to <4 x i64>
@@ -94,7 +84,6 @@
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
 ; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    vtestps %xmm1, %xmm0
 ; CHECK-NEXT:    cmovbel %esi, %eax
 ; CHECK-NEXT:    retq
@@ -113,7 +102,6 @@
 ; CHECK-NEXT:    movl %edi, %eax
 ; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT:    vcmptrueps %ymm1, %ymm1, %ymm1
-; CHECK-NEXT:    vxorps %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    vtestps %ymm1, %ymm0
 ; CHECK-NEXT:    cmovbel %esi, %eax
 ; CHECK-NEXT:    vzeroupper
@@ -135,8 +123,6 @@
 ; CHECK-LABEL: testpsnzc_128_flip:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; CHECK-NEXT:    vtestps %xmm1, %xmm0
 ; CHECK-NEXT:    cmovbel %esi, %eax
 ; CHECK-NEXT:    retq
@@ -153,9 +139,6 @@
 ; CHECK-LABEL: testpsnzc_256_flip:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; CHECK-NEXT:    vcmptrueps %ymm2, %ymm2, %ymm2
-; CHECK-NEXT:    vxorps %ymm2, %ymm0, %ymm0
 ; CHECK-NEXT:    vtestps %ymm1, %ymm0
 ; CHECK-NEXT:    cmovbel %esi, %eax
 ; CHECK-NEXT:    vzeroupper
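At the source level, the folds exercised by these tests come from the SSE4.1/AVX test intrinsics applied to an inverted vector or to an all-ones mask. A hypothetical C++ sketch (function names are illustrative, assuming clang -O2 -msse4.1) of code that should no longer need the vpcmpeqd/vpxor sequences removed above:

    // Illustrative only: source patterns that produce the PTEST nodes combined above.
    #include <immintrin.h>

    // TESTZ(X,-1) -> TESTZ(X,X): the all-ones operand no longer has to be
    // materialized, leaving a plain ptest of x against itself.
    int is_zero(__m128i x) {
      return _mm_testz_si128(x, _mm_set1_epi32(-1));
    }

    // TEST*(~X,Y) -> TEST*(X,Y) with the flag swapped: testz of an inverted
    // operand becomes testc of the original, so the explicit xor disappears.
    int y_is_subset_of_x(__m128i x, __m128i y) {
      return _mm_testz_si128(_mm_xor_si128(x, _mm_set1_epi32(-1)), y);
    }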