diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -289,6 +289,7 @@ AddAnd = 1, // Fold with `Add` op and `And` op is preferable. NotAnd = 2, // Fold with `Not` op and `And` op is preferable. ABS = 4, // Fold with `llvm.abs` op is preferable. + NegAnd = 8, // Fold with `Neg` op and `And` op is preferable. }; class ArgListEntry { diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -6158,7 +6158,8 @@ return DAG.getNode(ISD::SETCC, DL, VT, AbsOp, DAG.getConstant(C, DL, OpVT), LHS.getOperand(2)); } else if (TargetPreference & - (AndOrSETCCFoldKind::AddAnd | AndOrSETCCFoldKind::NotAnd)) { + (AndOrSETCCFoldKind::AddAnd | AndOrSETCCFoldKind::NotAnd | + AndOrSETCCFoldKind::NegAnd)) { // AndOrSETCCFoldKind::AddAnd: // A == C0 | A == C1 @@ -6168,6 +6169,14 @@ // IF IsPow2(smax(C0, C1)-smin(C0, C1)) // -> ((A - smin(C0, C1)) & ~(smax(C0, C1)-smin(C0, C1))) != 0 + // AndOrSETCCFoldKind::NegAnd: + // A == C0 | A == C1 + // IF smax(C0, C1) == 0 && IsPow2(smax(C0, C1)-smin(C0, C1)) + // -> ((-A) & ~(smax(C0, C1)-smin(C0, C1))) == 0 + // A != C0 & A != C1 + // IF smax(C0, C1) == 0 && IsPow2(smax(C0, C1)-smin(C0, C1)) + // -> ((-A) & ~(smax(C0, C1)-smin(C0, C1))) != 0 + // AndOrSETCCFoldKind::NotAnd: // A == C0 | A == C1 // IF smax(C0, C1) == -1 AND IsPow2(smax(C0, C1) - smin(C0, C1)) @@ -6180,19 +6189,67 @@ const APInt &MinC = APIntOps::smin(APRhs, APLhs); APInt Dif = MaxC - MinC; if (!Dif.isZero() && Dif.isPowerOf2()) { - if (MaxC.isAllOnes() && - (TargetPreference & AndOrSETCCFoldKind::NotAnd)) { - SDValue NotOp = DAG.getNOT(DL, LHS0, OpVT); - SDValue AndOp = DAG.getNode(ISD::AND, DL, OpVT, NotOp, - DAG.getConstant(MinC, DL, OpVT)); - return DAG.getNode(ISD::SETCC, DL, 
VT, AndOp, - DAG.getConstant(0, DL, OpVT), LHS.getOperand(2)); - } else if (TargetPreference & AndOrSETCCFoldKind::AddAnd) { - - SDValue AddOp = DAG.getNode(ISD::ADD, DL, OpVT, LHS0, - DAG.getConstant(-MinC, DL, OpVT)); - SDValue AndOp = DAG.getNode(ISD::AND, DL, OpVT, AddOp, - DAG.getConstant(~Dif, DL, OpVT)); + SDValue AndOp = SDValue(); + // Try transforms, if we get any of the ops for free for one of them, + // (i.e. we already have -A for NegAnd) then prefer that one. + auto TryOrGetNode = [&DAG, DL, OpVT, TargetPreference, LHS0, + MinC](AndOrSETCCFoldKind Check, bool Try) { + if (!Try) { + Check = AndOrSETCCFoldKind(Check & TargetPreference); + } + switch (Check) { + case AndOrSETCCFoldKind::NegAnd: + if (Try) { + return SDValue( + DAG.getNodeIfExists(ISD::SUB, DAG.getVTList(OpVT), + {DAG.getConstant(0, DL, OpVT), LHS0}), + 0); + } + return DAG.getNode(ISD::SUB, DL, OpVT, DAG.getConstant(0, DL, OpVT), + LHS0); + case AndOrSETCCFoldKind::NotAnd: + if (Try) { + return SDValue( + DAG.getNodeIfExists(ISD::XOR, DAG.getVTList(OpVT), + {LHS0, DAG.getAllOnesConstant(DL, OpVT)}), + 0); + } + return DAG.getNOT(DL, LHS0, OpVT); + case AndOrSETCCFoldKind::AddAnd: + if (Try) { + return SDValue( + DAG.getNodeIfExists(ISD::ADD, DAG.getVTList(OpVT), + {LHS0, DAG.getConstant(-MinC, DL, OpVT)}), + 0); + } + return DAG.getNode(ISD::ADD, DL, OpVT, LHS0, + DAG.getConstant(-MinC, DL, OpVT)); + default: + return SDValue(); + } + }; + auto TryOrGetNodes = [&](bool Try) { + if (MaxC.isZero()) + if (SDValue TransformOp = + TryOrGetNode(AndOrSETCCFoldKind::NegAnd, Try)) + AndOp = DAG.getNode(ISD::AND, DL, OpVT, TransformOp, + DAG.getConstant(~Dif, DL, OpVT)); + if (AndOp) + return true; + if (MaxC.isAllOnes()) + if (SDValue TransformOp = + TryOrGetNode(AndOrSETCCFoldKind::NotAnd, Try)) + AndOp = DAG.getNode(ISD::AND, DL, OpVT, TransformOp, + DAG.getConstant(MinC, DL, OpVT)); + if (AndOp) + return true; + if (SDValue TransformOp = + TryOrGetNode(AndOrSETCCFoldKind::AddAnd, Try)) + AndOp 
= DAG.getNode(ISD::AND, DL, OpVT, TransformOp, + DAG.getConstant(~Dif, DL, OpVT)); + return !!AndOp; + }; + if (TryOrGetNodes(/*Try*/ true) || TryOrGetNodes(/*Try*/ false)) { return DAG.getNode(ISD::SETCC, DL, VT, AndOp, DAG.getConstant(0, DL, OpVT), LHS.getOperand(2)); } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -56004,10 +56004,10 @@ return AndOrSETCCFoldKind::None; if (VT.isVector()) - return AndOrSETCCFoldKind(AndOrSETCCFoldKind::NotAnd | - (isOperationLegal(ISD::ABS, OpVT) - ? AndOrSETCCFoldKind::ABS - : AndOrSETCCFoldKind::None)); + return AndOrSETCCFoldKind( + AndOrSETCCFoldKind::NotAnd | AndOrSETCCFoldKind::NegAnd | + (isOperationLegal(ISD::ABS, OpVT) ? AndOrSETCCFoldKind::ABS + : AndOrSETCCFoldKind::None)); // Don't use `NotAnd` as even though `not` is generally shorter code size than // `add`, `add` can lower to LEA which can save moves / spills. 
Any case where diff --git a/llvm/test/CodeGen/X86/icmp-pow2-diff.ll b/llvm/test/CodeGen/X86/icmp-pow2-diff.ll --- a/llvm/test/CodeGen/X86/icmp-pow2-diff.ll +++ b/llvm/test/CodeGen/X86/icmp-pow2-diff.ll @@ -414,9 +414,9 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: movq %rdi, %rbx -; CHECK-NEXT: negq %rdi +; CHECK-NEXT: negq %rbx +; CHECK-NEXT: movq %rbx, %rdi ; CHECK-NEXT: callq use.i64@PLT -; CHECK-NEXT: addq $256, %rbx # imm = 0x100 ; CHECK-NEXT: testq $-257, %rbx # imm = 0xFEFF ; CHECK-NEXT: sete %al ; CHECK-NEXT: popq %rbx @@ -432,8 +432,9 @@ define <4 x i1> @negand_ne_4xi64(<4 x i64> %x) nounwind { ; AVX512-LABEL: negand_ne_4xi64: ; AVX512: # %bb.0: -; AVX512-NEXT: vpcmpneqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %k1 -; AVX512-NEXT: vptestmq %ymm0, %ymm0, %k1 {%k1} +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpsubq %ymm0, %ymm1, %ymm0 +; AVX512-NEXT: vptestmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; AVX512-NEXT: vzeroupper @@ -442,12 +443,12 @@ ; AVX2-LABEL: negand_ne_4xi64: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm1 -; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [18446744073709551612,18446744073709551612,18446744073709551612,18446744073709551612] -; AVX2-NEXT: vpcmpeqq %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744073709551611,18446744073709551611,18446744073709551611,18446744073709551611] +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vzeroupper @@ -456,39 +457,38 @@ ; 
SSE41-LABEL: negand_ne_4xi64: ; SSE41: # %bb.0: ; SSE41-NEXT: pxor %xmm2, %xmm2 -; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: pxor %xmm3, %xmm3 +; SSE41-NEXT: psubq %xmm0, %xmm3 +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [18446744073709551611,18446744073709551611] +; SSE41-NEXT: pand %xmm0, %xmm3 +; SSE41-NEXT: pxor %xmm4, %xmm4 +; SSE41-NEXT: psubq %xmm1, %xmm4 +; SSE41-NEXT: pand %xmm0, %xmm4 +; SSE41-NEXT: pcmpeqq %xmm2, %xmm4 ; SSE41-NEXT: pcmpeqq %xmm2, %xmm3 -; SSE41-NEXT: pcmpeqq %xmm1, %xmm2 -; SSE41-NEXT: pcmpeqd %xmm4, %xmm4 -; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [18446744073709551612,18446744073709551612] -; SSE41-NEXT: pcmpeqq %xmm5, %xmm0 -; SSE41-NEXT: por %xmm3, %xmm0 -; SSE41-NEXT: pcmpeqq %xmm5, %xmm1 -; SSE41-NEXT: por %xmm2, %xmm1 -; SSE41-NEXT: packssdw %xmm1, %xmm0 -; SSE41-NEXT: pxor %xmm4, %xmm0 +; SSE41-NEXT: packssdw %xmm4, %xmm3 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm0 ; SSE41-NEXT: retq ; ; SSE2-LABEL: negand_ne_4xi64: ; SSE2: # %bb.0: ; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: psubq %xmm0, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [18446744073709551611,18446744073709551611] +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: psubq %xmm1, %xmm4 +; SSE2-NEXT: pand %xmm0, %xmm4 +; SSE2-NEXT: pcmpeqd %xmm2, %xmm4 ; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm3[1,3] -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] -; SSE2-NEXT: andps %xmm4, %xmm2 -; SSE2-NEXT: pcmpeqd %xmm3, %xmm3 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [18446744073709551612,18446744073709551612] -; SSE2-NEXT: pcmpeqd %xmm4, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm4, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm1[1,3] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] -; SSE2-NEXT: andps %xmm4, %xmm0 -; 
SSE2-NEXT: orps %xmm2, %xmm0 -; SSE2-NEXT: xorps %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm4[1,3] +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] +; SSE2-NEXT: andps %xmm0, %xmm3 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm3, %xmm0 ; SSE2-NEXT: retq %cmp1 = icmp ne <4 x i64> %x, zeroinitializer %cmp2 = icmp ne <4 x i64> %x,