Index: ../lib/CodeGen/SelectionDAG/TargetLowering.cpp =================================================================== --- ../lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ ../lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -1468,6 +1468,10 @@ if (isTypeDesirableForOp(ISD::SETCC, MinVT)) { // Will get folded away. SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, MinVT, PreExt); + if (MinBits == 1 && C1 == 1) + // Invert the condition. + return DAG.getSetCC(dl, VT, Trunc, DAG.getConstant(0, dl, MVT::i1), + Cond == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ); SDValue C = DAG.getConstant(C1.trunc(MinBits), dl, MinVT); return DAG.getSetCC(dl, VT, Trunc, C, Cond); } Index: ../lib/Target/X86/X86ISelLowering.cpp =================================================================== --- ../lib/Target/X86/X86ISelLowering.cpp +++ ../lib/Target/X86/X86ISelLowering.cpp @@ -14988,8 +14988,8 @@ } /// Result of 'and' is compared against zero. Change to a BT node if possible. -SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, - const SDLoc &dl, SelectionDAG &DAG) const { +static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, + const SDLoc &dl, SelectionDAG &DAG) { SDValue Op0 = And.getOperand(0); SDValue Op1 = And.getOperand(1); if (Op0.getOpcode() == ISD::TRUNCATE) @@ -15056,6 +15056,51 @@ return SDValue(); } +// Convert (truncate (srl X, N) to i1) to (bt X, N) +static SDValue LowerTruncateToBT(SDValue Op, ISD::CondCode CC, + const SDLoc &dl, SelectionDAG &DAG) { + + assert(Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1 && + "Expected TRUNCATE to i1 node"); + + if (Op.getOperand(0).getOpcode() != ISD::SRL) + return SDValue(); + + SDValue LHS = Op.getOperand(0).getOperand(0); + SDValue RHS = Op.getOperand(0).getOperand(1); + + // If LHS is i8, promote it to i32 with any_extend. There is no i8 BT + // instruction. Since the shift amount is in-range-or-undefined, we know + // that doing a bittest on the i32 value is ok. We extend to i32 because + // the encoding for the i16 version is larger than the i32 version. + // Also promote i16 to i32 for performance / code size reason. + if (LHS.getValueType() == MVT::i8 || + LHS.getValueType() == MVT::i16) + LHS = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS); + + // If the operand types disagree, extend the shift amount to match. Since + // BT ignores high bits (like shifts) we can use anyextend. + if (LHS.getValueType() != RHS.getValueType()) + RHS = DAG.getNode(ISD::ANY_EXTEND, dl, LHS.getValueType(), RHS); + + SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, LHS, RHS); + X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B; + return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(Cond, dl, MVT::i8), BT); +} + +/// Result of 'and' or 'trunc to i1' is compared against zero. +/// Change to a BT node if possible. +SDValue X86TargetLowering::LowerToBT(SDValue Op, ISD::CondCode CC, + const SDLoc &dl, SelectionDAG &DAG) const { + if (Op.getOpcode() == ISD::AND) + return LowerAndToBT(Op, CC, dl, DAG); + if (Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1) + return LowerTruncateToBT(Op, CC, dl, DAG); + return SDValue(); +} + + /// Turns an ISD::CondCode into a value suitable for SSE floating-point mask /// CMPs. static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0, @@ -15583,8 +15628,7 @@ // Lower (X & (1 << N)) == 0 to BT(X, N). // Lower ((X >>u N) & 1) != 0 to BT(X, N). // Lower ((X >>s N) & 1) != 0 to BT(X, N). - if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && - isNullConstant(Op1) && + if (Op0.hasOneUse() && isNullConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { if (SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG)) { if (VT == MVT::i1) { @@ -16779,7 +16823,7 @@ // We know the result of AND is compared against zero. Try to match // it to BT. - if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { + if (Cond.hasOneUse()) { if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG)) { CC = NewSetCC.getOperand(0); Cond = NewSetCC.getOperand(1); Index: ../test/CodeGen/X86/bt.ll =================================================================== --- ../test/CodeGen/X86/bt.ll +++ ../test/CodeGen/X86/bt.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -mtriple=i386-apple-macosx -mcpu=penryn | FileCheck %s +; RUN: llc < %s -mtriple=i386-apple-macosx -mcpu=skylake-avx512 | FileCheck %s ; PR3253 ; The register+memory form of the BT instruction should be usable on