diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7038,7 +7038,8 @@
 static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
   V = peekThroughBitcasts(V);
   if (V.getOpcode() == ISD::XOR &&
-      ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()))
+      (ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()) ||
+       isAllOnesConstant(V.getOperand(1))))
     return V.getOperand(0);
   if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
       (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
@@ -48133,39 +48134,20 @@
   return SDValue();
 }
 
-/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
-static SDValue combineAndNotIntoANDNP(SDNode *N, SelectionDAG &DAG) {
+static SDValue
+combineAndNotIntoANDNP(SDNode *N, SelectionDAG &DAG,
+                       std::function<SDValue(SDValue, SelectionDAG &)> GetNot) {
   assert(N->getOpcode() == ISD::AND);
 
   MVT VT = N->getSimpleValueType(0);
-  if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
-    return SDValue();
-
   SDValue X, Y;
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
 
-  auto GetNot = [&VT, &DAG](SDValue V) {
-    // Basic X = NOT(Y) detection.
-    if (SDValue Not = IsNOT(V, DAG))
-      return Not;
-    // Fold BROADCAST(NOT(Y)) -> BROADCAST(Y).
-    if (V.getOpcode() == X86ISD::VBROADCAST) {
-      SDValue Src = V.getOperand(0);
-      EVT SrcVT = Src.getValueType();
-      if (!SrcVT.isVector())
-        return SDValue();
-      if (SDValue Not = IsNOT(Src, DAG))
-        return DAG.getNode(X86ISD::VBROADCAST, SDLoc(V), VT,
-                           DAG.getBitcast(SrcVT, Not));
-    }
-    return SDValue();
-  };
-
-  if (SDValue Not = GetNot(N0)) {
+  if (SDValue Not = GetNot(N0, DAG)) {
     X = Not;
     Y = N1;
-  } else if (SDValue Not = GetNot(N1)) {
+  } else if (SDValue Not = GetNot(N1, DAG)) {
     X = Not;
     Y = N0;
   } else
@@ -48176,6 +48158,59 @@
   return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
 }
 
+/// Try to fold:
+///   and (vector_shuffle<Z,...,Z>
+///            (insert_vector_elt undef, (xor X, -1), Z), undef), Y
+///   ->
+///   andnp (vector_shuffle<Z,...,Z>
+///              (insert_vector_elt undef, X, Z), undef), Y
+static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG,
+                                    const X86Subtarget &Subtarget) {
+  assert(N->getOpcode() == ISD::AND);
+
+  EVT VT = N->getValueType(0);
+  if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
+        (VT.is256BitVector() && Subtarget.hasInt256()) ||
+        (VT.is512BitVector() && Subtarget.useAVX512Regs())))
+    return SDValue();
+
+  auto GetNot = [](SDValue V, SelectionDAG &DAG) {
+    auto *SVN = dyn_cast<ShuffleVectorSDNode>(peekThroughOneUseBitcasts(V));
+    if (SVN && SVN->hasOneUse() && SVN->isSplat() &&
+        SVN->getOperand(1).isUndef()) {
+      SDValue IVEN = SVN->getOperand(0);
+      if (IVEN.getOpcode() != ISD::INSERT_VECTOR_ELT ||
+          !IVEN.getOperand(0).isUndef() || !IVEN.hasOneUse())
+        return SDValue();
+      if (!isa<ConstantSDNode>(IVEN.getOperand(2)) ||
+          IVEN.getConstantOperandAPInt(2) != SVN->getSplatIndex())
+        return SDValue();
+      SDValue Src = IVEN.getOperand(1);
+      if (SDValue Not = IsNOT(Src, DAG)) {
+        SDValue NotIVEN = DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(IVEN),
+                                      IVEN.getValueType(), IVEN.getOperand(0),
+                                      Not, IVEN.getOperand(2));
+        return DAG.getVectorShuffle(SVN->getValueType(0), SDLoc(SVN), NotIVEN,
+                                    SVN->getOperand(1), SVN->getMask());
+      }
+    }
+    return SDValue();
+  };
+
+  return combineAndNotIntoANDNP(N, DAG, GetNot);
+}
+
+/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
+static SDValue combineAndNot(SDNode *N, SelectionDAG &DAG) {
+  assert(N->getOpcode() == ISD::AND);
+
+  MVT VT = N->getSimpleValueType(0);
+  if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
+    return SDValue();
+
+  return combineAndNotIntoANDNP(N, DAG, IsNOT);
+}
+
 // Try to widen AND, OR and XOR nodes to VT in order to remove casts around
 // logical operations, like in the example below.
 //   or (and (truncate x, truncate y)),
@@ -48753,13 +48788,16 @@
   if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
     return FPLogic;
 
+  if (SDValue R = combineAndShuffleNot(N, DAG, Subtarget))
+    return R;
+
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
 
   if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
     return R;
 
-  if (SDValue R = combineAndNotIntoANDNP(N, DAG))
+  if (SDValue R = combineAndNot(N, DAG))
     return R;
 
   if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
diff --git a/llvm/test/CodeGen/X86/combine-and.ll b/llvm/test/CodeGen/X86/combine-and.ll
--- a/llvm/test/CodeGen/X86/combine-and.ll
+++ b/llvm/test/CodeGen/X86/combine-and.ll
@@ -473,25 +473,23 @@
 define <2 x i64> @neg_scalar_broadcast_v2i64(i64 %a0, <2 x i64> %a1) {
 ; SSE-LABEL: neg_scalar_broadcast_v2i64:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    notq %rdi
 ; SSE-NEXT:    movq %rdi, %xmm1
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; SSE-NEXT:    pand %xmm1, %xmm0
+; SSE-NEXT:    pandn %xmm0, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX2-LABEL: neg_scalar_broadcast_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    notq %rdi
 ; AVX2-NEXT:    vmovq %rdi, %xmm1
 ; AVX2-NEXT:    vpbroadcastq %xmm1, %xmm1
-; AVX2-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpandn %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: neg_scalar_broadcast_v2i64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    notq %rdi
 ; AVX512-NEXT:    vpbroadcastq %rdi, %xmm1
-; AVX512-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vpandn %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    retq
   %1 = xor i64 %a0, -1
   %2 = insertelement <2 x i64> undef, i64 %1, i64 0
@@ -503,25 +501,23 @@
 define <4 x i32> @neg_scalar_broadcast_v4i32(i32 %a0, <4 x i32> %a1) {
 ; SSE-LABEL: neg_scalar_broadcast_v4i32:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    notl %edi
 ; SSE-NEXT:    movd %edi, %xmm1
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE-NEXT:    pand %xmm1, %xmm0
+; SSE-NEXT:    pandn %xmm0, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX2-LABEL: neg_scalar_broadcast_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    notl %edi
 ; AVX2-NEXT:    vmovd %edi, %xmm1
 ; AVX2-NEXT:    vpbroadcastd %xmm1, %xmm1
-; AVX2-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpandn %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: neg_scalar_broadcast_v4i32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    notl %edi
 ; AVX512-NEXT:    vpbroadcastd %edi, %xmm1
-; AVX512-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vpandn %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    retq
   %1 = xor i32 %a0, -1
   %2 = insertelement <4 x i32> undef, i32 %1, i64 0
@@ -533,26 +529,24 @@
 define <8 x i16> @neg_scalar_broadcast_v8i16(i16 %a0, <8 x i16> %a1) {
 ; SSE-LABEL: neg_scalar_broadcast_v8i16:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    notl %edi
 ; SSE-NEXT:    movd %edi, %xmm1
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE-NEXT:    pand %xmm1, %xmm0
+; SSE-NEXT:    pandn %xmm0, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX2-LABEL: neg_scalar_broadcast_v8i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    notl %edi
 ; AVX2-NEXT:    vmovd %edi, %xmm1
 ; AVX2-NEXT:    vpbroadcastw %xmm1, %xmm1
-; AVX2-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpandn %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: neg_scalar_broadcast_v8i16:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    notl %edi
 ; AVX512-NEXT:    vpbroadcastw %edi, %xmm1
-; AVX512-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vpandn %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    retq
   %1 = xor i16 %a0, -1
   %2 = insertelement <8 x i16> undef, i16 %1, i64 0
@@ -564,27 +558,24 @@
 define <16 x i8> @neg_scalar_broadcast_v16i8(i8 %a0, <16 x i8> %a1) {
 ; SSE-LABEL: neg_scalar_broadcast_v16i8:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    notb %dil
-; SSE-NEXT:    movzbl %dil, %eax
-; SSE-NEXT:    movd %eax, %xmm1
+; SSE-NEXT:    movd %edi, %xmm1
 ; SSE-NEXT:    pxor %xmm2, %xmm2
 ; SSE-NEXT:    pshufb %xmm2, %xmm1
-; SSE-NEXT:    pand %xmm1, %xmm0
+; SSE-NEXT:    pandn %xmm0, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX2-LABEL: neg_scalar_broadcast_v16i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    notb %dil
 ; AVX2-NEXT:    vmovd %edi, %xmm1
 ; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
-; AVX2-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpandn %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: neg_scalar_broadcast_v16i8:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    notb %dil
 ; AVX512-NEXT:    vpbroadcastb %edi, %xmm1
-; AVX512-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vpandn %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    retq
   %1 = xor i8 %a0, -1
   %2 = insertelement <16 x i8> undef, i8 %1, i64 0
@@ -596,27 +587,24 @@
 define <2 x i64> @neg_scalar_broadcast_v16i8_v2i64(i8 %a0, <2 x i64> %a1) {
 ; SSE-LABEL: neg_scalar_broadcast_v16i8_v2i64:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    notb %dil
-; SSE-NEXT:    movzbl %dil, %eax
-; SSE-NEXT:    movd %eax, %xmm1
+; SSE-NEXT:    movd %edi, %xmm1
 ; SSE-NEXT:    pxor %xmm2, %xmm2
 ; SSE-NEXT:    pshufb %xmm2, %xmm1
-; SSE-NEXT:    pand %xmm1, %xmm0
+; SSE-NEXT:    pandn %xmm0, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX2-LABEL: neg_scalar_broadcast_v16i8_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    notb %dil
 ; AVX2-NEXT:    vmovd %edi, %xmm1
 ; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
-; AVX2-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpandn %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: neg_scalar_broadcast_v16i8_v2i64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    notb %dil
 ; AVX512-NEXT:    vpbroadcastb %edi, %xmm1
-; AVX512-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vpandn %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    retq
   %1 = xor i8 %a0, -1
   %2 = insertelement <16 x i8> undef, i8 %1, i64 0
@@ -629,25 +617,23 @@
 define <2 x i64> @neg_scalar_broadcast_v4i32_v2i64(i32 %a0, <2 x i64> %a1) {
 ; SSE-LABEL: neg_scalar_broadcast_v4i32_v2i64:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    notl %edi
 ; SSE-NEXT:    movd %edi, %xmm1
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE-NEXT:    pand %xmm1, %xmm0
+; SSE-NEXT:    pandn %xmm0, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX2-LABEL: neg_scalar_broadcast_v4i32_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    notl %edi
 ; AVX2-NEXT:    vmovd %edi, %xmm1
 ; AVX2-NEXT:    vpbroadcastd %xmm1, %xmm1
-; AVX2-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpandn %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: neg_scalar_broadcast_v4i32_v2i64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    notl %edi
 ; AVX512-NEXT:    vpbroadcastd %edi, %xmm1
-; AVX512-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vpandn %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    retq
   %1 = xor i32 %a0, -1
   %2 = insertelement <4 x i32> undef, i32 %1, i64 0