diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -7038,7 +7038,8 @@
 static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
   V = peekThroughBitcasts(V);
   if (V.getOpcode() == ISD::XOR &&
-      ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()))
+      (ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()) ||
+       isAllOnesConstant(V.getOperand(1))))
     return V.getOperand(0);
   if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
       (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) {
@@ -48177,7 +48178,7 @@
 /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
 static SDValue combineAndNotIntoANDNP(SDNode *N, SelectionDAG &DAG) {
-  assert(N->getOpcode() == ISD::AND);
+  assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
 
   MVT VT = N->getSimpleValueType(0);
   if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
     return SDValue();
@@ -48187,23 +48188,69 @@
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
 
-  auto GetNot = [&VT, &DAG](SDValue V) {
-    // Basic X = NOT(Y) detection.
-    if (SDValue Not = IsNOT(V, DAG))
-      return Not;
-    // Fold BROADCAST(NOT(Y)) -> BROADCAST(Y).
-    if (V.getOpcode() == X86ISD::VBROADCAST) {
-      SDValue Src = V.getOperand(0);
-      EVT SrcVT = Src.getValueType();
-      if (!SrcVT.isVector())
-        return SDValue();
-      if (SDValue Not = IsNOT(Src, DAG))
-        return DAG.getNode(X86ISD::VBROADCAST, SDLoc(V), VT,
-                           DAG.getBitcast(SrcVT, Not));
+  if (SDValue Not = IsNOT(N0, DAG)) {
+    X = Not;
+    Y = N1;
+  } else if (SDValue Not = IsNOT(N1, DAG)) {
+    X = Not;
+    Y = N0;
+  } else
+    return SDValue();
+
+  X = DAG.getBitcast(VT, X);
+  Y = DAG.getBitcast(VT, Y);
+  return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
+}
+
+/// Try to fold:
+///   and (vector_shuffle<Z,...,Z>
+///            (insert_vector_elt undef, (xor X, -1), Z), undef), Y
+///   ->
+///   andnp (vector_shuffle<Z,...,Z>
+///              (insert_vector_elt undef, X, Z), undef), Y
+static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG,
+                                    const X86Subtarget &Subtarget) {
+  assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP");
+
+  EVT VT = N->getValueType(0);
+  // Do not split 256 and 512 bit vectors with SSE2 as they overwrite original
+  // value and require extra moves.
+  if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
+        ((VT.is256BitVector() || VT.is512BitVector()) && Subtarget.hasAVX())))
+    return SDValue();
+
+  auto GetNot = [&DAG](SDValue V) {
+    auto *SVN = dyn_cast<ShuffleVectorSDNode>(peekThroughOneUseBitcasts(V));
+    // TODO: SVN->hasOneUse() is a strong condition. It can be relaxed if all
+    // end-users are ISD::AND including cases
+    // (and(extract_vector_element(SVN), Y)).
+    if (!SVN || !SVN->hasOneUse() || !SVN->isSplat() ||
+        !SVN->getOperand(1).isUndef()) {
+      return SDValue();
+    }
+    SDValue IVEN = SVN->getOperand(0);
+    if (IVEN.getOpcode() != ISD::INSERT_VECTOR_ELT ||
+        !IVEN.getOperand(0).isUndef() || !IVEN.hasOneUse())
+      return SDValue();
+    if (!isa<ConstantSDNode>(IVEN.getOperand(2)) ||
+        IVEN.getConstantOperandAPInt(2) != SVN->getSplatIndex())
+      return SDValue();
+    SDValue Src = IVEN.getOperand(1);
+    if (SDValue Not = IsNOT(Src, DAG)) {
+      SDValue NotSrc = DAG.getBitcast(Src.getValueType(), Not);
+      SDValue NotIVEN =
+          DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(IVEN), IVEN.getValueType(),
+                      IVEN.getOperand(0), NotSrc, IVEN.getOperand(2));
+      return DAG.getVectorShuffle(SVN->getValueType(0), SDLoc(SVN), NotIVEN,
+                                  SVN->getOperand(1), SVN->getMask());
     }
     return SDValue();
   };
 
+  SDValue X, Y;
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
   if (SDValue Not = GetNot(N0)) {
     X = Not;
     Y = N1;
@@ -48215,7 +48262,20 @@
 
   X = DAG.getBitcast(VT, X);
   Y = DAG.getBitcast(VT, Y);
-  return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
+  SDLoc DL(N);
+  // We do not split for SSE at all, but we need to split vectors for AVX1 and
+  // AVX2.
+  if (!Subtarget.useAVX512Regs() && VT.is512BitVector()) {
+    SDValue LoX, HiX;
+    std::tie(LoX, HiX) = splitVector(X, DAG, DL);
+    SDValue LoY, HiY;
+    std::tie(LoY, HiY) = splitVector(Y, DAG, DL);
+    EVT SplitVT = LoX.getValueType();
+    SDValue LoV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {LoX, LoY});
+    SDValue HiV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {HiX, HiY});
+    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, {LoV, HiV});
+  }
+  return DAG.getNode(X86ISD::ANDNP, DL, VT, {X, Y});
 }
 
 // Try to widen AND, OR and XOR nodes to VT in order to remove casts around
@@ -48795,6 +48855,9 @@
   if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
     return FPLogic;
 
+  if (SDValue R = combineAndShuffleNot(N, DAG, Subtarget))
+    return R;
+
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
 
diff --git a/llvm/test/CodeGen/X86/combine-and.ll b/llvm/test/CodeGen/X86/combine-and.ll
--- a/llvm/test/CodeGen/X86/combine-and.ll
+++ b/llvm/test/CodeGen/X86/combine-and.ll
@@ -519,9 +519,8 @@
 ;
 ; AVX512-LABEL: neg_scalar_broadcast_v8i64_arg:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    notq %rdi
 ; AVX512-NEXT:    vpbroadcastq %rdi, %zmm1
-; AVX512-NEXT:    vpandq %zmm0, %zmm1, %zmm0
+; AVX512-NEXT:    vpandnq %zmm0, %zmm1, %zmm0
 ; AVX512-NEXT:    retq
   %1 = xor i64 %a0, -1
   %2 = insertelement <8 x i64> undef, i64 %1, i64 0
@@ -549,38 +548,35 @@
 ; AVX1-LABEL: neg_scalar_broadcast_v8i64:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX1-NEXT:    notq %rdi
-; AVX1-NEXT:    vmovq %rdi, %xmm1
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm2 = xmm0[0,1,0,1]
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm2
+; AVX1-NEXT:    vpermilps {{.*#+}} xmm1 = xmm0[0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,3]
-; AVX1-NEXT:    vandpd %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vandpd %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vmovq %rdi, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm2, %ymm2
+; AVX1-NEXT:    vandnpd %ymm0, %ymm2, %ymm0
+; AVX1-NEXT:    vandnpd %ymm1, %ymm2, %ymm1
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: neg_scalar_broadcast_v8i64:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX2-NEXT:    notq %rdi
-; AVX2-NEXT:    vmovq %rdi, %xmm1
-; AVX2-NEXT:    vpbroadcastq %xmm1, %ymm1
-; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm0[0,1,0,0]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[0,1,0,0]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,0,1,1]
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpand %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vmovq %rdi, %xmm2
+; AVX2-NEXT:    vpbroadcastq %xmm2, %ymm2
+; AVX2-NEXT:    vpandn %ymm0, %ymm2, %ymm0
+; AVX2-NEXT:    vpandn %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: neg_scalar_broadcast_v8i64:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512-NEXT:    notq %rdi
 ; AVX512-NEXT:    vpbroadcastq %rdi, %zmm1
 ; AVX512-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,0,1,1,0,1,0,0]
 ; AVX512-NEXT:    vpermq %zmm0, %zmm2, %zmm0
-; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpandnq %zmm0, %zmm1, %zmm0
 ; AVX512-NEXT:    retq
   %1 = xor i64 %a0, -1
   %2 = insertelement <8 x i64> undef, i64 %1, i64 0
@@ -602,26 +598,23 @@
 ;
 ; AVX1-LABEL: neg_scalar_broadcast_v4i64_arg:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    notq %rdi
 ; AVX1-NEXT:    vmovq %rdi, %xmm1
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
-; AVX1-NEXT:    vandps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vandnps %ymm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: neg_scalar_broadcast_v4i64_arg:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    notq %rdi
 ; AVX2-NEXT:    vmovq %rdi, %xmm1
 ; AVX2-NEXT:    vpbroadcastq %xmm1, %ymm1
-; AVX2-NEXT:    vpand %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpandn %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: neg_scalar_broadcast_v4i64_arg:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    notq %rdi
 ; AVX512-NEXT:    vpbroadcastq %rdi, %ymm1
-; AVX512-NEXT:    vpand %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    vpandn %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT:    retq
   %1 = xor i64 %a0, -1
   %2 = insertelement <4 x i64> undef, i64 %1, i64 0
@@ -645,32 +638,29 @@
 ; AVX1-LABEL: neg_scalar_broadcast_v4i64:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX1-NEXT:    notq %rdi
 ; AVX1-NEXT:    vmovq %rdi, %xmm1
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,3]
-; AVX1-NEXT:    vandpd %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vandnpd %ymm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: neg_scalar_broadcast_v4i64:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX2-NEXT:    notq %rdi
 ; AVX2-NEXT:    vmovq %rdi, %xmm1
 ; AVX2-NEXT:    vpbroadcastq %xmm1, %ymm1
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,0,1,1]
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpandn %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: neg_scalar_broadcast_v4i64:
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
-; AVX512-NEXT:    notq %rdi
 ; AVX512-NEXT:    vpbroadcastq %rdi, %ymm1
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,0,1,1]
-; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpandn %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT:    retq
   %1 = xor i64 %a0, -1
   %2 = insertelement <4 x i64> undef, i64 %1, i64 0
@@ -683,33 +673,30 @@
 define <2 x i64> @neg_scalar_broadcast_v2i64(i64 %a0, <2 x i64> %a1) {
 ; SSE-LABEL: neg_scalar_broadcast_v2i64:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    notq %rdi
 ; SSE-NEXT:    movq %rdi, %xmm1
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; SSE-NEXT:    pand %xmm1, %xmm0
+; SSE-NEXT:    pandn %xmm0, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: neg_scalar_broadcast_v2i64:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    notq %rdi
 ; AVX1-NEXT:    vmovq %rdi, %xmm1
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
-; AVX1-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpandn %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: neg_scalar_broadcast_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    notq %rdi
 ; AVX2-NEXT:    vmovq %rdi, %xmm1
 ; AVX2-NEXT:    vpbroadcastq %xmm1, %xmm1
-; AVX2-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpandn %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: neg_scalar_broadcast_v2i64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    notq %rdi
 ; AVX512-NEXT:    vpbroadcastq %rdi, %xmm1
-; AVX512-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vpandn %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    retq
   %1 = xor i64 %a0, -1
   %2 = insertelement <2 x i64> undef, i64 %1, i64 0
@@ -762,26 +749,23 @@
 ;
 ; AVX1-LABEL: neg_scalar_broadcast_v8i32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    notl %edi
 ; AVX1-NEXT:    vmovd %edi, %xmm1
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
-; AVX1-NEXT:    vandps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vandnps %ymm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: neg_scalar_broadcast_v8i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    notl %edi
 ; AVX2-NEXT:    vmovd %edi, %xmm1
 ; AVX2-NEXT:    vpbroadcastd %xmm1, %ymm1
-; AVX2-NEXT:    vpand %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpandn %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: neg_scalar_broadcast_v8i32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    notl %edi
 ; AVX512-NEXT:    vpbroadcastd %edi, %ymm1
-; AVX512-NEXT:    vpand %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    vpandn %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT:    retq
   %1 = xor i32 %a0, -1
   %2 = insertelement <8 x i32> undef, i32 %1, i64 0
@@ -793,35 +777,32 @@
 define <8 x i16> @neg_scalar_broadcast_v8i16(i16 %a0, <8 x i16> %a1) {
 ; SSE-LABEL: neg_scalar_broadcast_v8i16:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    notl %edi
 ; SSE-NEXT:    movd %edi, %xmm1
 ; SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; SSE-NEXT:    pand %xmm1, %xmm0
+; SSE-NEXT:    pandn %xmm0, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: neg_scalar_broadcast_v8i16:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    notl %edi
 ; AVX1-NEXT:    vmovd %edi, %xmm1
 ; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; AVX1-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpandn %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: neg_scalar_broadcast_v8i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    notl %edi
 ; AVX2-NEXT:    vmovd %edi, %xmm1
 ; AVX2-NEXT:    vpbroadcastw %xmm1, %xmm1
-; AVX2-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpandn %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: neg_scalar_broadcast_v8i16:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    notl %edi
 ; AVX512-NEXT:    vpbroadcastw %edi, %xmm1
-; AVX512-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vpandn %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    retq
   %1 = xor i16 %a0, -1
   %2 = insertelement <8 x i16> undef, i16 %1, i64 0
@@ -833,36 +814,32 @@
 define <16 x i8> @neg_scalar_broadcast_v16i8(i8 %a0, <16 x i8> %a1) {
 ; SSE-LABEL: neg_scalar_broadcast_v16i8:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    notb %dil
-; SSE-NEXT:    movzbl %dil, %eax
-; SSE-NEXT:    movd %eax, %xmm1
+; SSE-NEXT:    movd %edi, %xmm1
 ; SSE-NEXT:    pxor %xmm2, %xmm2
 ; SSE-NEXT:    pshufb %xmm2, %xmm1
-; SSE-NEXT:    pand %xmm1, %xmm0
+; SSE-NEXT:    pandn %xmm0, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: neg_scalar_broadcast_v16i8:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    notb %dil
 ; AVX1-NEXT:    vmovd %edi, %xmm1
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpandn %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: neg_scalar_broadcast_v16i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    notb %dil
 ; AVX2-NEXT:    vmovd %edi, %xmm1
 ; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
-; AVX2-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpandn %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: neg_scalar_broadcast_v16i8:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    notb %dil
 ; AVX512-NEXT:    vpbroadcastb %edi, %xmm1
-; AVX512-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vpandn %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    retq
   %1 = xor i8 %a0, -1
   %2 = insertelement <16 x i8> undef, i8 %1, i64 0
@@ -907,9 +884,8 @@
 ;
 ; AVX512-LABEL: neg_scalar_broadcast_v64i8:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    notb %dil
 ; AVX512-NEXT:    vpbroadcastb %edi, %zmm1
-; AVX512-NEXT:    vpandq %zmm0, %zmm1, %zmm0
+; AVX512-NEXT:    vpandnq %zmm0, %zmm1, %zmm0
 ; AVX512-NEXT:    retq
   %1 = xor i8 %a0, -1
   %2 = insertelement <64 x i8> undef, i8 %1, i64 0
@@ -954,9 +930,8 @@
 ;
 ; AVX512-LABEL: neg_scalar_broadcast_v64i8_v8i64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    notb %dil
 ; AVX512-NEXT:    vpbroadcastb %edi, %zmm1
-; AVX512-NEXT:    vpandq %zmm0, %zmm1, %zmm0
+; AVX512-NEXT:    vpandnq %zmm0, %zmm1, %zmm0
 ; AVX512-NEXT:    retq
   %1 = xor i8 %a0, -1
   %2 = insertelement <64 x i8> undef, i8 %1, i64 0
@@ -980,27 +955,24 @@
 ;
 ; AVX1-LABEL: neg_scalar_broadcast_v32i8_v4i64:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    notb %dil
 ; AVX1-NEXT:    vmovd %edi, %xmm1
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
-; AVX1-NEXT:    vandps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vandnps %ymm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: neg_scalar_broadcast_v32i8_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    notb %dil
 ; AVX2-NEXT:    vmovd %edi, %xmm1
 ; AVX2-NEXT:    vpbroadcastb %xmm1, %ymm1
-; AVX2-NEXT:    vpand %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpandn %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: neg_scalar_broadcast_v32i8_v4i64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    notb %dil
 ; AVX512-NEXT:    vpbroadcastb %edi, %ymm1
-; AVX512-NEXT:    vpand %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    vpandn %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT:    retq
   %1 = xor i8 %a0, -1
   %2 = insertelement <32 x i8> undef, i8 %1, i64 0
@@ -1013,36 +985,32 @@
 define <2 x i64> @neg_scalar_broadcast_v16i8_v2i64(i8 %a0, <2 x i64> %a1) {
 ; SSE-LABEL: neg_scalar_broadcast_v16i8_v2i64:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    notb %dil
-; SSE-NEXT:    movzbl %dil, %eax
-; SSE-NEXT:    movd %eax, %xmm1
+; SSE-NEXT:    movd %edi, %xmm1
 ; SSE-NEXT:    pxor %xmm2, %xmm2
 ; SSE-NEXT:    pshufb %xmm2, %xmm1
-; SSE-NEXT:    pand %xmm1, %xmm0
+; SSE-NEXT:    pandn %xmm0, %xmm1
+; SSE-NEXT:    movdqa %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: neg_scalar_broadcast_v16i8_v2i64:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    notb %dil
 ; AVX1-NEXT:    vmovd %edi, %xmm1
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vpandn %xmm0, %xmm1, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: neg_scalar_broadcast_v16i8_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    notb %dil
 ; AVX2-NEXT:    vmovd %edi, %xmm1
 ; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
-; AVX2-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpandn %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: neg_scalar_broadcast_v16i8_v2i64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    notb %dil
 ; AVX512-NEXT:    vpbroadcastb %edi, %xmm1
-; AVX512-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vpandn %xmm0, %xmm1, %xmm0
 ; AVX512-NEXT:    retq
   %1 = xor i8 %a0, -1
   %2 = insertelement <16 x i8> undef, i8 %1, i64 0
@@ -1064,26 +1032,23 @@
 ;
 ; AVX1-LABEL: neg_scalar_broadcast_v8i32_v4i64:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    notl %edi
 ; AVX1-NEXT:    vmovd %edi, %xmm1
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
-; AVX1-NEXT:    vandps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    vandnps %ymm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: neg_scalar_broadcast_v8i32_v4i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    notl %edi
 ; AVX2-NEXT:    vmovd %edi, %xmm1
 ; AVX2-NEXT:    vpbroadcastd %xmm1, %ymm1
-; AVX2-NEXT:    vpand %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpandn %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: neg_scalar_broadcast_v8i32_v4i64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    notl %edi
 ; AVX512-NEXT:    vpbroadcastd %edi, %ymm1
-; AVX512-NEXT:    vpand %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    vpandn %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT:    retq
   %1 = xor i32 %a0, -1
   %2 = insertelement <8 x i32> undef, i32 %1, i64 0
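
Note: a minimal IR reproducer for the new fold, written in the style of the existing tests (the function name and the v4i32 type are illustrative and not part of the patch). The splatted scalar `not` feeding the `and` matches the insert_vector_elt + splat-shuffle pattern that combineAndShuffleNot looks for, so it should now select `pandn`/`vpandn` instead of a scalar `not` followed by `pand`/`vpand`:

define <4 x i32> @neg_scalar_broadcast_v4i32(i32 %a0, <4 x i32> %a1) {
  %1 = xor i32 %a0, -1
  %2 = insertelement <4 x i32> undef, i32 %1, i64 0
  %3 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
  %4 = and <4 x i32> %3, %a1
  ret <4 x i32> %4
}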