diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -284,10 +284,11 @@
   /// Enum of different potentially desirable ways to fold (and/or (setcc ...),
   /// (setcc ...)).
-  enum class AndOrSETCCFoldKind {
-    None,
-    AddAnd,
-    ABS,
+  enum AndOrSETCCFoldKind : uint8_t {
+    None = 0,   // No fold is preferable.
+    AddAnd = 1, // Fold with `Add` op and `And` op is preferable.
+    NotAnd = 2, // Fold with `Not` op and `And` op is preferable.
+    ABS = 4,    // Fold with `llvm.abs` op is preferable.
   };
 
   class ArgListEntry {
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5926,7 +5926,7 @@
   // Preference is to use ISD::ABS or we already have an ISD::ABS (in which
   // case this is just a compare).
   if (APLhs == (-APRhs) &&
-      (TargetPreference == AndOrSETCCFoldKind::ABS ||
+      ((TargetPreference & AndOrSETCCFoldKind::ABS) ||
        DAG.doesNodeExist(ISD::ABS, DAG.getVTList(OpVT), {LHS0}))) {
     const APInt &C = APLhs.isNegative() ? APRhs : APLhs;
     // (icmp eq A, C) | (icmp eq A, -C)
@@ -5936,23 +5936,45 @@
     SDValue AbsOp = DAG.getNode(ISD::ABS, DL, OpVT, LHS0);
     return DAG.getNode(ISD::SETCC, DL, VT, AbsOp,
                        DAG.getConstant(C, DL, OpVT), LHS.getOperand(2));
-  } else if (TargetPreference == AndOrSETCCFoldKind::AddAnd) {
+  } else if (TargetPreference &
+             (AndOrSETCCFoldKind::AddAnd | AndOrSETCCFoldKind::NotAnd)) {
+
+    // AndOrSETCCFoldKind::AddAnd:
     // A == C0 | A == C1
     //  IF IsPow2(smax(C0, C1)-smin(C0, C1))
     //     -> ((A - smin(C0, C1)) & ~(smax(C0, C1)-smin(C0, C1))) == 0
     // A != C0 & A != C1
     //  IF IsPow2(smax(C0, C1)-smin(C0, C1))
     //     -> ((A - smin(C0, C1)) & ~(smax(C0, C1)-smin(C0, C1))) != 0
+
+    // AndOrSETCCFoldKind::NotAnd:
+    // A == C0 | A == C1
+    //  IF smax(C0, C1) == -1 AND IsPow2(smax(C0, C1) - smin(C0, C1))
+    //     -> ~A & smin(C0, C1) == 0
+    // A != C0 & A != C1
+    //  IF smax(C0, C1) == -1 AND IsPow2(smax(C0, C1) - smin(C0, C1))
+    //     -> ~A & smin(C0, C1) != 0
+
     const APInt &MaxC = APIntOps::smax(APRhs, APLhs);
     const APInt &MinC = APIntOps::smin(APRhs, APLhs);
     APInt Dif = MaxC - MinC;
     if (!Dif.isZero() && Dif.isPowerOf2()) {
-      SDValue AddOp = DAG.getNode(ISD::ADD, DL, OpVT, LHS0,
-                                  DAG.getConstant(-MinC, DL, OpVT));
-      SDValue AndOp = DAG.getNode(ISD::AND, DL, OpVT, AddOp,
-                                  DAG.getConstant(~Dif, DL, OpVT));
-      return DAG.getNode(ISD::SETCC, DL, VT, AndOp,
-                         DAG.getConstant(0, DL, OpVT), LHS.getOperand(2));
+      if (MaxC.isAllOnes() &&
+          (TargetPreference & AndOrSETCCFoldKind::NotAnd)) {
+        SDValue NotOp = DAG.getNOT(DL, LHS0, OpVT);
+        SDValue AndOp = DAG.getNode(ISD::AND, DL, OpVT, NotOp,
+                                    DAG.getConstant(MinC, DL, OpVT));
+        return DAG.getNode(ISD::SETCC, DL, VT, AndOp,
+                           DAG.getConstant(0, DL, OpVT), LHS.getOperand(2));
+      } else if (TargetPreference & AndOrSETCCFoldKind::AddAnd) {
+        SDValue AddOp = DAG.getNode(ISD::ADD, DL, OpVT, LHS0,
+                                    DAG.getConstant(-MinC, DL, OpVT));
+        SDValue AndOp = DAG.getNode(ISD::AND, DL, OpVT, AddOp,
+                                    DAG.getConstant(~Dif, DL, OpVT));
+        return DAG.getNode(ISD::SETCC, DL, VT, AndOp,
+                           DAG.getConstant(0, DL, OpVT), LHS.getOperand(2));
+      }
     }
   }
 }
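
As a sanity check on the two rewrites described in the comments above (this is an illustration added alongside the patch, not part of it), both identities can be brute-forced. The standalone C++ program below, in which MaxC/MinC/Dif deliberately mirror the combine's variable names, exhaustively verifies the eq/or form over every i16 value and every power-of-two difference; the ne/and form follows by negating both sides.

// Standalone sanity check, not part of the patch: brute-force the NotAnd and
// AddAnd identities from the DAGCombiner comments over all i16 values.
#include <cassert>
#include <cstdint>

int main() {
  for (int K = 0; K < 15; ++K) {
    const int16_t MaxC = -1;                         // smax(C0, C1) == -1
    const int16_t MinC = (int16_t)(MaxC - (1 << K)); // Dif = 1 << K, a pow2
    const uint16_t Dif = (uint16_t)(MaxC - MinC);
    for (uint32_t I = 0; I <= 0xFFFF; ++I) {
      const int16_t A = (int16_t)I;
      const bool Or = A == MaxC || A == MinC;
      // NotAnd: (A == C0 | A == C1) -> (~A & smin(C0, C1)) == 0
      const bool NotAnd = (uint16_t)(~(uint16_t)I & (uint16_t)MinC) == 0;
      // AddAnd: (A == C0 | A == C1) -> ((A - smin(C0, C1)) & ~Dif) == 0
      const bool AddAnd =
          (uint16_t)(((uint16_t)I - (uint16_t)MinC) & (uint16_t)~Dif) == 0;
      assert(Or == NotAnd && Or == AddAnd);
    }
  }
  return 0;
}

Neither assert fires: when smax(C0, C1) is all-ones, the only values whose complement is disjoint from smin(C0, C1) are C0 and C1 themselves, which is exactly what the NotAnd branch relies on.
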
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -57260,9 +57260,18 @@
   EVT OpVT = SETCC0->getOperand(0).getValueType();
   if (!VT.isInteger())
     return AndOrSETCCFoldKind::None;
+
   if (VT.isVector())
-    return isOperationLegal(ISD::ABS, OpVT) ? AndOrSETCCFoldKind::ABS
-                                            : AndOrSETCCFoldKind::None;
+    return AndOrSETCCFoldKind(AndOrSETCCFoldKind::NotAnd |
+                              (isOperationLegal(ISD::ABS, OpVT)
+                                   ? AndOrSETCCFoldKind::ABS
+                                   : AndOrSETCCFoldKind::None));
+
+  // Don't use `NotAnd`: even though `not` generally has smaller code size
+  // than `add`, `add` can lower to LEA, which can save moves / spills. Any
+  // case where `NotAnd` applies, `AddAnd` does as well.
+  // TODO: We currently lower (icmp eq/ne (and ~X, Y), 0) -> `test (not X), Y`;
+  // if we change that to `andn Y, X`, it may be worth preferring `NotAnd`
+  // here.
   return AndOrSETCCFoldKind::AddAnd;
 }
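
One mechanical note on the header change: `AndOrSETCCFoldKind` moves from `enum class` to a plain `uint8_t`-backed enum with power-of-two values precisely so a hook like the X86 override above can return several acceptable folds at once and the combiner can test each candidate with `&`. A minimal standalone sketch of that pattern (a simplified mock, not LLVM's actual declarations):

// Minimal sketch of the bitmask-enum pattern used by the patch; a standalone
// mock, not LLVM's actual declarations.
#include <cstdint>

enum AndOrSETCCFoldKind : uint8_t {
  None = 0,
  AddAnd = 1,
  NotAnd = 2,
  ABS = 4,
};

// A target hook may now request several folds at once. operator| on an
// unscoped enum yields int, hence the explicit cast back to the enum type.
AndOrSETCCFoldKind preference(bool AbsIsLegal) {
  return AndOrSETCCFoldKind(NotAnd | (AbsIsLegal ? ABS : None));
}

int main() {
  AndOrSETCCFoldKind Pref = preference(/*AbsIsLegal=*/true);
  // The combiner side tests membership with a bitwise and.
  return (Pref & NotAnd) && (Pref & ABS) ? 0 : 1;
}
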
diff --git a/llvm/test/CodeGen/X86/icmp-pow2-diff.ll b/llvm/test/CodeGen/X86/icmp-pow2-diff.ll
--- a/llvm/test/CodeGen/X86/icmp-pow2-diff.ll
+++ b/llvm/test/CodeGen/X86/icmp-pow2-diff.ll
@@ -39,28 +39,24 @@
 define <4 x i1> @andnot_eq_v4i32(<4 x i32> %x) nounwind {
 ; AVX512-LABEL: andnot_eq_v4i32:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX512-NEXT:    vpcmpeqd %xmm1, %xmm0, %k0
-; AVX512-NEXT:    vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %k1
-; AVX512-NEXT:    korw %k1, %k0, %k1
-; AVX512-NEXT:    vmovdqa32 %xmm1, %xmm0 {%k1} {z}
+; AVX512-NEXT:    vpandnd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
 ;
 ; AVX2-LABEL: andnot_eq_v4i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm1
-; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [4294967287,4294967287,4294967287,4294967287]
-; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [4294967287,4294967287,4294967287,4294967287]
+; AVX2-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    retq
 ;
 ; SSE-LABEL: andnot_eq_v4i32:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pcmpeqd %xmm1, %xmm1
-; SSE-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT:    por %xmm1, %xmm0
+; SSE-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT:    pxor %xmm1, %xmm1
+; SSE-NEXT:    pcmpeqd %xmm1, %xmm0
 ; SSE-NEXT:    retq
   %cmp1 = icmp eq <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
   %cmp2 = icmp eq <4 x i32> %x, <i32 -9, i32 -9, i32 -9, i32 -9>
   %r = or <4 x i1> %cmp1, %cmp2
@@ -115,41 +111,32 @@
 define <2 x i1> @andnot_eq_v2i64(<2 x i64> %x) nounwind {
 ; AVX512-LABEL: andnot_eq_v2i64:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %k0
-; AVX512-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX512-NEXT:    vpcmpeqq %xmm1, %xmm0, %k1
-; AVX512-NEXT:    korw %k1, %k0, %k1
-; AVX512-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1} {z}
+; AVX512-NEXT:    vpandnq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
 ;
 ; AVX2-LABEL: andnot_eq_v2i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
-; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX2-NEXT:    vpcmpeqq %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    retq
 ;
 ; SSE41-LABEL: andnot_eq_v2i64:
 ; SSE41:       # %bb.0:
-; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [18446744073709551611,18446744073709551611]
-; SSE41-NEXT:    pcmpeqq %xmm0, %xmm1
-; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSE41-NEXT:    pcmpeqq %xmm2, %xmm0
-; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pcmpeqq %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; SSE2-LABEL: andnot_eq_v2i64:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [18446744073709551611,18446744073709551611]
-; SSE2-NEXT:    pcmpeqd %xmm0, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2]
-; SSE2-NEXT:    pand %xmm1, %xmm2
-; SSE2-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
 ; SSE2-NEXT:    pcmpeqd %xmm1, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
 ; SSE2-NEXT:    pand %xmm1, %xmm0
-; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    retq
   %cmp1 = icmp eq <2 x i64> %x, <i64 -1, i64 -1>
   %cmp2 = icmp eq <2 x i64> %x, <i64 -5, i64 -5>
   %r = or <2 x i1> %cmp1, %cmp2
@@ -195,30 +182,28 @@
 define <8 x i1> @andnot_ne_v8i16(<8 x i16> %x) nounwind {
 ; AVX512-LABEL: andnot_ne_v8i16:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
-; AVX512-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX512-NEXT:    vpcmpeqw %xmm2, %xmm0, %xmm0
-; AVX512-NEXT:    vpternlogq $18, %xmm2, %xmm1, %xmm0
+; AVX512-NEXT:    vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
 ;
 ; AVX2-LABEL: andnot_ne_v8i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
-; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX2-NEXT:    vpcmpeqw %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
-; AVX2-NEXT:    vpandn %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    retq
 ;
 ; SSE-LABEL: andnot_ne_v8i16:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [49151,49151,49151,49151,49151,49151,49151,49151]
-; SSE-NEXT:    pcmpeqw %xmm0, %xmm1
-; SSE-NEXT:    pcmpeqd %xmm2, %xmm2
+; SSE-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT:    pxor %xmm2, %xmm2
 ; SSE-NEXT:    pcmpeqw %xmm2, %xmm0
-; SSE-NEXT:    pxor %xmm2, %xmm0
-; SSE-NEXT:    pandn %xmm0, %xmm1
-; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    pxor %xmm1, %xmm0
 ; SSE-NEXT:    retq
   %cmp1 = icmp ne <8 x i16> %x, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
   %cmp2 = icmp ne <8 x i16> %x, <i16 -16385, i16 -16385, i16 -16385, i16 -16385, i16 -16385, i16 -16385, i16 -16385, i16 -16385>
   %r = and <8 x i1> %cmp1, %cmp2
@@ -264,30 +249,28 @@
 define <16 x i1> @andnot_ne_v16i8(<16 x i8> %x) nounwind {
 ; AVX512-LABEL: andnot_ne_v16i8:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX512-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm2
-; AVX512-NEXT:    vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX512-NEXT:    vpternlogq $18, %xmm1, %xmm2, %xmm0
+; AVX512-NEXT:    vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpternlogq $15, %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    retq
 ;
 ; AVX2-LABEL: andnot_ne_v16i8:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm2
-; AVX2-NEXT:    vpcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpcmpeqb %xmm2, %xmm0, %xmm0
 ; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpandn %xmm0, %xmm2, %xmm0
 ; AVX2-NEXT:    retq
 ;
 ; SSE-LABEL: andnot_ne_v16i8:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pcmpeqd %xmm2, %xmm2
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    pcmpeqb %xmm2, %xmm1
-; SSE-NEXT:    pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT:    pxor %xmm2, %xmm0
-; SSE-NEXT:    pandn %xmm0, %xmm1
-; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    pcmpeqd %xmm1, %xmm1
+; SSE-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE-NEXT:    pxor %xmm2, %xmm2
+; SSE-NEXT:    pcmpeqb %xmm2, %xmm0
+; SSE-NEXT:    pxor %xmm1, %xmm0
 ; SSE-NEXT:    retq
   %cmp1 = icmp ne <16 x i8> %x,
  %cmp2 = icmp ne <16 x i8> %x,