Index: lib/Target/X86/X86ISelDAGToDAG.cpp =================================================================== --- lib/Target/X86/X86ISelDAGToDAG.cpp +++ lib/Target/X86/X86ISelDAGToDAG.cpp @@ -455,7 +455,7 @@ bool foldLoadStoreIntoMemOperand(SDNode *Node); bool matchBEXTRFromAndImm(SDNode *Node); - bool matchBEXTR(SDNode *Node); + bool matchBitExtract(SDNode *Node); bool shrinkAndImmediate(SDNode *N); bool isMaskZeroExtended(SDNode *N) const; bool tryShiftAmountMod(SDNode *N); @@ -2633,15 +2633,15 @@ return true; } -// See if this is an X & Mask that we can match to BEXTR. +// See if this is an X & Mask that we can match to BEXTR/BZHI. // Where Mask is one of the following patterns: // a) x & (1 << nbits) - 1 // b) x & ~(-1 << nbits) // c) x & (-1 >> (32 - y)) // d) x << (32 - y) >> (32 - y) -bool X86DAGToDAGISel::matchBEXTR(SDNode *Node) { - // BEXTR is BMI instruction. However, if we have BMI2, we prefer BZHI. - if (!Subtarget->hasBMI() || Subtarget->hasBMI2()) +bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) { + // BEXTR is a BMI instruction, BZHI is a BMI2 instruction. We need at least one. + if (!Subtarget->hasBMI() && !Subtarget->hasBMI2()) return false; MVT NVT = Node->getSimpleValueType(0); @@ -2652,17 +2652,24 @@ SDValue NBits; + // If we have BMI2's BZHI, we are ok with multi-use patterns. + // Else, if we only have BMI1's BEXTR, we require one-use. + const bool CanHaveExtraUses = Subtarget->hasBMI2(); + auto checkOneUse = [CanHaveExtraUses](SDValue Op) { + return CanHaveExtraUses || Op.hasOneUse(); + }; + // a) x & ((1 << nbits) + (-1)) - auto matchPatternA = [&NBits](SDValue Mask) -> bool { + auto matchPatternA = [&checkOneUse, &NBits](SDValue Mask) -> bool { // Match `add`. Must only have one use! - if (Mask->getOpcode() != ISD::ADD || !Mask->hasOneUse()) + if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask)) return false; // We should be adding all-ones constant (i.e. subtracting one.) 
if (!isAllOnesConstant(Mask->getOperand(1))) return false; // Match `1 << nbits`. Must only have one use! SDValue M0 = Mask->getOperand(0); - if (M0->getOpcode() != ISD::SHL || !M0->hasOneUse()) + if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0)) return false; if (!isOneConstant(M0->getOperand(0))) return false; @@ -2671,13 +2678,13 @@ }; // b) x & ~(-1 << nbits) - auto matchPatternB = [&NBits](SDValue Mask) -> bool { + auto matchPatternB = [&checkOneUse, &NBits](SDValue Mask) -> bool { // Match `~()`. Must only have one use! - if (!isBitwiseNot(Mask) || !Mask->hasOneUse()) + if (!isBitwiseNot(Mask) || !checkOneUse(Mask)) return false; // Match `-1 << nbits`. Must only have one use! SDValue M0 = Mask->getOperand(0); - if (M0->getOpcode() != ISD::SHL || !M0->hasOneUse()) + if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0)) return false; if (!isAllOnesConstant(M0->getOperand(0))) return false; @@ -2713,6 +2720,15 @@ NBits = CurDAG->getTargetInsertSubreg(X86::sub_8bit, DL, NVT, ImplDef, NBits); insertDAGNode(*CurDAG, OrigNBits, NBits); + if (Subtarget->hasBMI2()) { + // Great, just emit the BZHI. + SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, NVT, X, NBits); + ReplaceNode(Node, Extract.getNode()); + SelectCode(Extract.getNode()); + return true; + } + + // Else, emitting BEXTR requires one more step. // The 'control' of BEXTR has the pattern of: // [15...8 bit][ 7...0 bit] location // [ bit count][ shift] name @@ -3127,7 +3143,7 @@ case ISD::AND: if (matchBEXTRFromAndImm(Node)) return; - if (matchBEXTR(Node)) + if (matchBitExtract(Node)) return; if (AndImmShrink && shrinkAndImmediate(Node)) return; Index: lib/Target/X86/X86ISelLowering.h =================================================================== --- lib/Target/X86/X86ISelLowering.h +++ lib/Target/X86/X86ISelLowering.h @@ -355,6 +355,9 @@ // Bit field extract. BEXTR, + // Zero High Bits Starting with Specified Bit Position. + BZHI, + // LOW, HI, FLAGS = umul LHS, RHS. 
UMUL, Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -26636,6 +26636,7 @@ case X86ISD::XOR: return "X86ISD::XOR"; case X86ISD::AND: return "X86ISD::AND"; case X86ISD::BEXTR: return "X86ISD::BEXTR"; + case X86ISD::BZHI: return "X86ISD::BZHI"; case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; case X86ISD::MOVMSK: return "X86ISD::MOVMSK"; case X86ISD::PTEST: return "X86ISD::PTEST"; Index: lib/Target/X86/X86InstrInfo.td =================================================================== --- lib/Target/X86/X86InstrInfo.td +++ lib/Target/X86/X86InstrInfo.td @@ -291,6 +291,8 @@ def X86bextr : SDNode<"X86ISD::BEXTR", SDTIntBinOp>; +def X86bzhi : SDNode<"X86ISD::BZHI", SDTIntBinOp>; + def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>; def X86WinAlloca : SDNode<"X86ISD::WIN_ALLOCA", SDT_X86WIN_ALLOCA, @@ -2454,9 +2456,9 @@ let Predicates = [HasBMI2], Defs = [EFLAGS] in { defm BZHI32 : bmi_bzhi<0xF5, "bzhi{l}", GR32, i32mem, - int_x86_bmi_bzhi_32, loadi32, WriteBZHI>; + X86bzhi, loadi32, WriteBZHI>; defm BZHI64 : bmi_bzhi<0xF5, "bzhi{q}", GR64, i64mem, - int_x86_bmi_bzhi_64, loadi64, WriteBZHI>, VEX_W; + X86bzhi, loadi64, WriteBZHI>, VEX_W; } def CountTrailingOnes : SDNodeXForm { - // x & ((1 << y) - 1) - defm : _bmi_bzhi_pattern<(and RC:$src, (add (shl 1, GR8:$lz), -1)), - (and (x86memop addr:$src), - (add (shl 1, GR8:$lz), -1)), - RC, VT, DstInst, DstMemInst>; - - // x & ~(-1 << y) - defm : _bmi_bzhi_pattern<(and RC:$src, (xor (shl -1, GR8:$lz), -1)), - (and (x86memop addr:$src), - (xor (shl -1, GR8:$lz), -1)), - RC, VT, DstInst, DstMemInst>; - // x & (-1 >> (bitwidth - y)) defm : _bmi_bzhi_pattern<(and RC:$src, (srl -1, (sub bitwidth, GR8:$lz))), (and (x86memop addr:$src), Index: lib/Target/X86/X86IntrinsicsInfo.h =================================================================== --- lib/Target/X86/X86IntrinsicsInfo.h +++ 
lib/Target/X86/X86IntrinsicsInfo.h @@ -1120,6 +1120,8 @@ X86_INTRINSIC_DATA(avx512_vpshrd_w_512, INTR_TYPE_3OP_IMM8, X86ISD::VSHRD, 0), X86_INTRINSIC_DATA(bmi_bextr_32, INTR_TYPE_2OP, X86ISD::BEXTR, 0), X86_INTRINSIC_DATA(bmi_bextr_64, INTR_TYPE_2OP, X86ISD::BEXTR, 0), + X86_INTRINSIC_DATA(bmi_bzhi_32, INTR_TYPE_2OP, X86ISD::BZHI, 0), + X86_INTRINSIC_DATA(bmi_bzhi_64, INTR_TYPE_2OP, X86ISD::BZHI, 0), X86_INTRINSIC_DATA(sse_cmp_ps, INTR_TYPE_3OP, X86ISD::CMPP, 0), X86_INTRINSIC_DATA(sse_comieq_ss, COMI, X86ISD::COMI, ISD::SETEQ), X86_INTRINSIC_DATA(sse_comige_ss, COMI, X86ISD::COMI, ISD::SETGE),