diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -39101,6 +39101,71 @@ return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0), Setnp); } + +// Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef, ...))), C) +// where C is a low-bit mask with as many bits as the setcc has elements and +// the setcc zeroes the upper bits of the k-register for free. We can replace +// the other concat operands with 0s and remove the AND. This mainly helps +// with v2i1/v4i1 setcc being cast to scalar. Returns SDValue() on no match. +static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + assert(N->getOpcode() == ISD::AND && "Unexpected opcode!"); + + EVT VT = N->getValueType(0); + + // Bail out unless the RHS is a constant. Its value is checked further down, + // once the setcc result type is known. + if (!isa<ConstantSDNode>(N->getOperand(1))) + return SDValue(); + + // The ConstantSDNode operand implies a scalar AND. + assert(!VT.isVector() && "Expected scalar VT!"); + + if (N->getOperand(0).getOpcode() != ISD::BITCAST || + !N->getOperand(0).hasOneUse() || + !N->getOperand(0).getOperand(0).hasOneUse()) + return SDValue(); + + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDValue Src = N->getOperand(0).getOperand(0); + EVT SrcVT = Src.getValueType(); + if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 || + !TLI.isTypeLegal(SrcVT)) + return SDValue(); + + if (Src.getOpcode() != ISD::CONCAT_VECTORS) + return SDValue(); + + // Only the first subvector of the concat is inspected. The other subvectors + // are expected to be masked off by the AND if we make the change. + SDValue SubVec = Src.getOperand(0); + EVT SubVecVT = SubVec.getValueType(); + + // The first subvector must be a setcc with a legal result type, and the + // AND's RHS must be a low-bit mask with one bit per setcc result element. 
+ if (SubVec.getOpcode() != ISD::SETCC || !TLI.isTypeLegal(SubVecVT) || + !N->getConstantOperandAPInt(1).isMask(SubVecVT.getVectorNumElements())) + return SDValue(); + + EVT SetccVT = SubVec.getOperand(0).getValueType(); + if (!TLI.isTypeLegal(SetccVT) || + !(Subtarget.hasVLX() || SetccVT.is512BitVector())) + return SDValue(); + + if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32)) + return SDValue(); + + // All checks passed. Rebuild the concat_vectors with zeroes in place of + // every other subvector and bitcast the result back to VT. + SDLoc dl(N); + SmallVector<SDValue, 4> Ops(Src.getNumOperands(), + DAG.getConstant(0, dl, SubVecVT)); + Ops[0] = SubVec; + SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, + Ops); + return DAG.getBitcast(VT, Concat); +} + static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { @@ -39150,6 +39215,9 @@ } } + if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget)) + return V; + if (DCI.isBeforeLegalizeOps()) return SDValue(); diff --git a/llvm/test/CodeGen/X86/movmsk-cmp.ll b/llvm/test/CodeGen/X86/movmsk-cmp.ll --- a/llvm/test/CodeGen/X86/movmsk-cmp.ll +++ b/llvm/test/CodeGen/X86/movmsk-cmp.ll @@ -675,7 +675,6 @@ ; SKX: # %bb.0: ; SKX-NEXT: vpmovd2m %xmm0, %k0 ; SKX-NEXT: kmovd %k0, %eax -; SKX-NEXT: andb $15, %al ; SKX-NEXT: cmpb $15, %al ; SKX-NEXT: sete %al ; SKX-NEXT: retq @@ -714,8 +713,7 @@ ; SKX-LABEL: allzeros_v4i32_sign: ; SKX: # %bb.0: ; SKX-NEXT: vpmovd2m %xmm0, %k0 -; SKX-NEXT: kmovd %k0, %eax -; SKX-NEXT: testb $15, %al +; SKX-NEXT: kortestb %k0, %k0 ; SKX-NEXT: sete %al ; SKX-NEXT: retq %tmp = icmp slt <4 x i32> %arg, zeroinitializer @@ -963,7 +961,6 @@ ; SKX: # %bb.0: ; SKX-NEXT: vpmovq2m %ymm0, %k0 ; SKX-NEXT: kmovd %k0, %eax -; SKX-NEXT: andb $15, %al ; SKX-NEXT: cmpb $15, %al ; SKX-NEXT: sete %al ; SKX-NEXT: vzeroupper @@ -1005,8 +1002,7 @@ ; SKX-LABEL: allzeros_v4i64_sign: ; SKX: # %bb.0: ; SKX-NEXT: vpmovq2m %ymm0, %k0 -; SKX-NEXT: kmovd %k0, 
%eax -; SKX-NEXT: testb $15, %al +; SKX-NEXT: kortestb %k0, %k0 ; SKX-NEXT: sete %al ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq @@ -1971,7 +1967,6 @@ ; SKX: # %bb.0: ; SKX-NEXT: vptestmd {{.*}}(%rip){1to4}, %xmm0, %k0 ; SKX-NEXT: kmovd %k0, %eax -; SKX-NEXT: andb $15, %al ; SKX-NEXT: cmpb $15, %al ; SKX-NEXT: sete %al ; SKX-NEXT: retq @@ -2012,8 +2007,7 @@ ; SKX-LABEL: allzeros_v4i32_and1: ; SKX: # %bb.0: ; SKX-NEXT: vptestmd {{.*}}(%rip){1to4}, %xmm0, %k0 -; SKX-NEXT: kmovd %k0, %eax -; SKX-NEXT: testb $15, %al +; SKX-NEXT: kortestb %k0, %k0 ; SKX-NEXT: sete %al ; SKX-NEXT: retq %tmp = and <4 x i32> %arg, <i32 1, i32 1, i32 1, i32 1> @@ -2310,7 +2304,6 @@ ; SKX: # %bb.0: ; SKX-NEXT: vptestmq {{.*}}(%rip), %xmm0, %k0 ; SKX-NEXT: kmovd %k0, %eax -; SKX-NEXT: andb $3, %al ; SKX-NEXT: cmpb $3, %al ; SKX-NEXT: sete %al ; SKX-NEXT: retq @@ -2352,8 +2345,7 @@ ; SKX-LABEL: allzeros_v2i64_and1: ; SKX: # %bb.0: ; SKX-NEXT: vptestmq {{.*}}(%rip), %xmm0, %k0 -; SKX-NEXT: kmovd %k0, %eax -; SKX-NEXT: testb $3, %al +; SKX-NEXT: kortestb %k0, %k0 ; SKX-NEXT: sete %al ; SKX-NEXT: retq %tmp = and <2 x i64> %arg, <i64 1, i64 1> @@ -2410,7 +2402,6 @@ ; SKX: # %bb.0: ; SKX-NEXT: vptestmq {{.*}}(%rip){1to4}, %ymm0, %k0 ; SKX-NEXT: kmovd %k0, %eax -; SKX-NEXT: andb $15, %al ; SKX-NEXT: cmpb $15, %al ; SKX-NEXT: sete %al ; SKX-NEXT: vzeroupper @@ -2467,8 +2458,7 @@ ; SKX-LABEL: allzeros_v4i64_and1: ; SKX: # %bb.0: ; SKX-NEXT: vptestmq {{.*}}(%rip){1to4}, %ymm0, %k0 -; SKX-NEXT: kmovd %k0, %eax -; SKX-NEXT: testb $15, %al +; SKX-NEXT: kortestb %k0, %k0 ; SKX-NEXT: sete %al ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq @@ -3380,7 +3370,6 @@ ; SKX: # %bb.0: ; SKX-NEXT: vptestmd {{.*}}(%rip){1to4}, %xmm0, %k0 ; SKX-NEXT: kmovd %k0, %eax -; SKX-NEXT: andb $15, %al ; SKX-NEXT: cmpb $15, %al ; SKX-NEXT: sete %al ; SKX-NEXT: retq @@ -3421,8 +3410,7 @@ ; SKX-LABEL: allzeros_v4i32_and4: ; SKX: # %bb.0: ; SKX-NEXT: vptestmd {{.*}}(%rip){1to4}, %xmm0, %k0 -; SKX-NEXT: kmovd %k0, %eax -; SKX-NEXT: testb $15, 
%al +; SKX-NEXT: kortestb %k0, %k0 ; SKX-NEXT: sete %al ; SKX-NEXT: retq %tmp = and <4 x i32> %arg, <i32 4, i32 4, i32 4, i32 4> @@ -3719,7 +3707,6 @@ ; SKX: # %bb.0: ; SKX-NEXT: vptestmq {{.*}}(%rip), %xmm0, %k0 ; SKX-NEXT: kmovd %k0, %eax -; SKX-NEXT: andb $3, %al ; SKX-NEXT: cmpb $3, %al ; SKX-NEXT: sete %al ; SKX-NEXT: retq @@ -3761,8 +3748,7 @@ ; SKX-LABEL: allzeros_v2i64_and4: ; SKX: # %bb.0: ; SKX-NEXT: vptestmq {{.*}}(%rip), %xmm0, %k0 -; SKX-NEXT: kmovd %k0, %eax -; SKX-NEXT: testb $3, %al +; SKX-NEXT: kortestb %k0, %k0 ; SKX-NEXT: sete %al ; SKX-NEXT: retq %tmp = and <2 x i64> %arg, <i64 4, i64 4> @@ -3819,7 +3805,6 @@ ; SKX: # %bb.0: ; SKX-NEXT: vptestmq {{.*}}(%rip){1to4}, %ymm0, %k0 ; SKX-NEXT: kmovd %k0, %eax -; SKX-NEXT: andb $15, %al ; SKX-NEXT: cmpb $15, %al ; SKX-NEXT: sete %al ; SKX-NEXT: vzeroupper @@ -3876,8 +3861,7 @@ ; SKX-LABEL: allzeros_v4i64_and4: ; SKX: # %bb.0: ; SKX-NEXT: vptestmq {{.*}}(%rip){1to4}, %ymm0, %k0 -; SKX-NEXT: kmovd %k0, %eax -; SKX-NEXT: testb $15, %al +; SKX-NEXT: kortestb %k0, %k0 ; SKX-NEXT: sete %al ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-compare-all_of.ll b/llvm/test/CodeGen/X86/vector-compare-all_of.ll --- a/llvm/test/CodeGen/X86/vector-compare-all_of.ll +++ b/llvm/test/CodeGen/X86/vector-compare-all_of.ll @@ -937,7 +937,6 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vcmpltpd %xmm0, %xmm1, %k0 ; AVX512-NEXT: kmovd %k0, %eax -; AVX512-NEXT: andb $3, %al ; AVX512-NEXT: cmpb $3, %al ; AVX512-NEXT: sete %al ; AVX512-NEXT: retq @@ -969,7 +968,6 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vcmpeqps %xmm1, %xmm0, %k0 ; AVX512-NEXT: kmovd %k0, %eax -; AVX512-NEXT: andb $15, %al ; AVX512-NEXT: cmpb $15, %al ; AVX512-NEXT: sete %al ; AVX512-NEXT: retq @@ -1006,7 +1004,6 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vcmplepd %ymm0, %ymm1, %k0 ; AVX512-NEXT: kmovd %k0, %eax -; AVX512-NEXT: andb $15, %al ; AVX512-NEXT: cmpb $15, %al ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper @@ -1087,7 +1084,6 @@ ; 
AVX512: # %bb.0: ; AVX512-NEXT: vpcmpnleuq %xmm1, %xmm0, %k0 ; AVX512-NEXT: kmovd %k0, %eax -; AVX512-NEXT: andb $3, %al ; AVX512-NEXT: cmpb $3, %al ; AVX512-NEXT: sete %al ; AVX512-NEXT: retq @@ -1121,7 +1117,6 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpneqd %xmm1, %xmm0, %k0 ; AVX512-NEXT: kmovd %k0, %eax -; AVX512-NEXT: andb $15, %al ; AVX512-NEXT: cmpb $15, %al ; AVX512-NEXT: sete %al ; AVX512-NEXT: retq @@ -1244,7 +1239,6 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtq %ymm0, %ymm1, %k0 ; AVX512-NEXT: kmovd %k0, %eax -; AVX512-NEXT: andb $15, %al ; AVX512-NEXT: cmpb $15, %al ; AVX512-NEXT: sete %al ; AVX512-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-compare-any_of.ll b/llvm/test/CodeGen/X86/vector-compare-any_of.ll --- a/llvm/test/CodeGen/X86/vector-compare-any_of.ll +++ b/llvm/test/CodeGen/X86/vector-compare-any_of.ll @@ -857,7 +857,7 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vcmpltpd %xmm0, %xmm1, %k0 ; AVX512-NEXT: kmovd %k0, %eax -; AVX512-NEXT: testb $3, %al +; AVX512-NEXT: testb %al, %al ; AVX512-NEXT: setne %al ; AVX512-NEXT: retq %a = fcmp ogt <2 x double> %x, %y @@ -888,7 +888,7 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vcmpeqps %xmm1, %xmm0, %k0 ; AVX512-NEXT: kmovd %k0, %eax -; AVX512-NEXT: testb $15, %al +; AVX512-NEXT: testb %al, %al ; AVX512-NEXT: setne %al ; AVX512-NEXT: retq %a = fcmp oeq <4 x float> %x, %y @@ -924,7 +924,7 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vcmplepd %ymm0, %ymm1, %k0 ; AVX512-NEXT: kmovd %k0, %eax -; AVX512-NEXT: testb $15, %al +; AVX512-NEXT: testb %al, %al ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -1004,7 +1004,7 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpnleuq %xmm1, %xmm0, %k0 ; AVX512-NEXT: kmovd %k0, %eax -; AVX512-NEXT: testb $3, %al +; AVX512-NEXT: testb %al, %al ; AVX512-NEXT: setne %al ; AVX512-NEXT: retq %a = icmp ugt <2 x i64> %x, %y @@ -1035,7 +1035,7 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpneqd %xmm1, %xmm0, %k0 ; AVX512-NEXT: kmovd %k0, %eax -; AVX512-NEXT: testb $15, %al +; 
AVX512-NEXT: testb %al, %al ; AVX512-NEXT: setne %al ; AVX512-NEXT: retq %a = icmp ne <4 x i32> %x, %y @@ -1157,7 +1157,7 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpgtq %ymm0, %ymm1, %k0 ; AVX512-NEXT: kmovd %k0, %eax -; AVX512-NEXT: testb $15, %al +; AVX512-NEXT: testb %al, %al ; AVX512-NEXT: setne %al ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll --- a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll @@ -55,7 +55,6 @@ ; AVX512VL-NEXT: vpsllq $63, %xmm0, %xmm0 ; AVX512VL-NEXT: vptestmq %xmm0, %xmm0, %k0 ; AVX512VL-NEXT: kmovd %k0, %eax -; AVX512VL-NEXT: andb $3, %al ; AVX512VL-NEXT: cmpb $3, %al ; AVX512VL-NEXT: sete %al ; AVX512VL-NEXT: retq @@ -108,7 +107,6 @@ ; AVX512VL-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX512VL-NEXT: vptestmd %xmm0, %xmm0, %k0 ; AVX512VL-NEXT: kmovd %k0, %eax -; AVX512VL-NEXT: andb $15, %al ; AVX512VL-NEXT: cmpb $15, %al ; AVX512VL-NEXT: sete %al ; AVX512VL-NEXT: retq @@ -259,7 +257,6 @@ ; AVX512VL-NEXT: vpsllq $63, %ymm0, %ymm0 ; AVX512VL-NEXT: vptestmq %ymm0, %ymm0, %k0 ; AVX512VL-NEXT: kmovd %k0, %eax -; AVX512VL-NEXT: andb $15, %al ; AVX512VL-NEXT: cmpb $15, %al ; AVX512VL-NEXT: sete %al ; AVX512VL-NEXT: vzeroupper @@ -943,7 +940,6 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vptestnmq %xmm0, %xmm0, %k0 ; AVX512VL-NEXT: kmovd %k0, %eax -; AVX512VL-NEXT: andb $3, %al ; AVX512VL-NEXT: cmpb $3, %al ; AVX512VL-NEXT: sete %al ; AVX512VL-NEXT: retq @@ -997,7 +993,6 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vptestnmd %xmm0, %xmm0, %k0 ; AVX512VL-NEXT: kmovd %k0, %eax -; AVX512VL-NEXT: andb $15, %al ; AVX512VL-NEXT: cmpb $15, %al ; AVX512VL-NEXT: sete %al ; AVX512VL-NEXT: retq @@ -1198,7 +1193,6 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vptestnmq %ymm0, %ymm0, %k0 ; AVX512VL-NEXT: kmovd %k0, %eax -; AVX512VL-NEXT: andb $15, %al ; AVX512VL-NEXT: cmpb $15, %al ; AVX512VL-NEXT: sete %al ; 
AVX512VL-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll --- a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll @@ -53,7 +53,7 @@ ; AVX512VL-NEXT: vpsllq $63, %xmm0, %xmm0 ; AVX512VL-NEXT: vptestmq %xmm0, %xmm0, %k0 ; AVX512VL-NEXT: kmovd %k0, %eax -; AVX512VL-NEXT: testb $3, %al +; AVX512VL-NEXT: testb %al, %al ; AVX512VL-NEXT: setne %al ; AVX512VL-NEXT: retq %a = trunc <2 x i64> %0 to <2 x i1> @@ -103,7 +103,7 @@ ; AVX512VL-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX512VL-NEXT: vptestmd %xmm0, %xmm0, %k0 ; AVX512VL-NEXT: kmovd %k0, %eax -; AVX512VL-NEXT: testb $15, %al +; AVX512VL-NEXT: testb %al, %al ; AVX512VL-NEXT: setne %al ; AVX512VL-NEXT: retq %a = trunc <4 x i32> %0 to <4 x i1> @@ -251,7 +251,7 @@ ; AVX512VL-NEXT: vpsllq $63, %ymm0, %ymm0 ; AVX512VL-NEXT: vptestmq %ymm0, %ymm0, %k0 ; AVX512VL-NEXT: kmovd %k0, %eax -; AVX512VL-NEXT: testb $15, %al +; AVX512VL-NEXT: testb %al, %al ; AVX512VL-NEXT: setne %al ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq @@ -932,7 +932,7 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vptestnmq %xmm0, %xmm0, %k0 ; AVX512VL-NEXT: kmovd %k0, %eax -; AVX512VL-NEXT: testb $3, %al +; AVX512VL-NEXT: testb %al, %al ; AVX512VL-NEXT: setne %al ; AVX512VL-NEXT: retq %a = icmp eq <2 x i64> %0, zeroinitializer @@ -983,7 +983,7 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vptestnmd %xmm0, %xmm0, %k0 ; AVX512VL-NEXT: kmovd %k0, %eax -; AVX512VL-NEXT: testb $15, %al +; AVX512VL-NEXT: testb %al, %al ; AVX512VL-NEXT: setne %al ; AVX512VL-NEXT: retq %a = icmp eq <4 x i32> %0, zeroinitializer @@ -1181,7 +1181,7 @@ ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vptestnmq %ymm0, %ymm0, %k0 ; AVX512VL-NEXT: kmovd %k0, %eax -; AVX512VL-NEXT: testb $15, %al +; AVX512VL-NEXT: testb %al, %al ; AVX512VL-NEXT: setne %al ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq