diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -52265,6 +52265,63 @@
   return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
 }
 
+static SDValue combineXorSubCTLZ(SDNode *N, SelectionDAG &DAG,
+                                 const X86Subtarget &Subtarget) {
+  assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::SUB) &&
+         "Invalid opcode for combining with CTLZ");
+  if (Subtarget.hasFastLZCNT())
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+  if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32 &&
+      (VT != MVT::i64 || !Subtarget.is64Bit()))
+    return SDValue();
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  if (N0.getOpcode() != ISD::CTLZ_ZERO_UNDEF &&
+      N1.getOpcode() != ISD::CTLZ_ZERO_UNDEF)
+    return SDValue();
+
+  SDValue OpCTLZ;
+  SDValue OpSizeTM1;
+
+  if (N1.getOpcode() == ISD::CTLZ_ZERO_UNDEF) {
+    OpCTLZ = N1;
+    OpSizeTM1 = N0;
+  } else if (N->getOpcode() == ISD::SUB) {
+    return SDValue();
+  } else {
+    OpCTLZ = N0;
+    OpSizeTM1 = N1;
+  }
+
+  if (!OpCTLZ.hasOneUse())
+    return SDValue();
+  auto *C = dyn_cast<ConstantSDNode>(OpSizeTM1);
+  if (!C)
+    return SDValue();
+
+  if (C->getZExtValue() != uint64_t(OpCTLZ.getValueSizeInBits() - 1))
+    return SDValue();
+  SDLoc DL(N);
+  EVT OpVT = VT;
+  SDValue Op = OpCTLZ.getOperand(0);
+  if (VT == MVT::i8) {
+    // Zero extend to i32 since there is no i8 bsr.
+    OpVT = MVT::i32;
+    Op = DAG.getNode(ISD::ZERO_EXTEND, DL, OpVT, Op);
+  }
+
+  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
+  Op = DAG.getNode(X86ISD::BSR, DL, VTs, Op);
+  if (VT == MVT::i8)
+    Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Op);
+
+  return Op;
+}
+
 static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
@@ -52292,6 +52349,9 @@
   if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
     return FPLogic;
 
+  if (SDValue R = combineXorSubCTLZ(N, DAG, Subtarget))
+    return R;
+
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
@@ -55177,6 +55237,9 @@
                        Op1.getOperand(0));
   }
 
+  if (SDValue V = combineXorSubCTLZ(N, DAG, Subtarget))
+    return V;
+
   return combineAddOrSubToADCOrSBB(N, DAG);
 }
diff --git a/llvm/test/CodeGen/X86/clz.ll b/llvm/test/CodeGen/X86/clz.ll
--- a/llvm/test/CodeGen/X86/clz.ll
+++ b/llvm/test/CodeGen/X86/clz.ll
@@ -968,7 +968,12 @@
 ; Don't generate any xors when a 'ctlz' intrinsic is actually used to compute
 ; the most significant bit, which is what 'bsr' does natively.
-; FIXME: We should probably select BSR instead of LZCNT in these circumstances.
+; NOTE: We intentionally don't select `bsr` when `fast-lzcnt` is
+; available, because 1) `bsr` has some drawbacks, including a
+; dependency on its destination register, 2) `bsr` has very poor
+; performance on some `fast-lzcnt` processors, and 3) `lzcnt` runs
+; at ALU latency/throughput, so `lzcnt` + `xor` still has better
+; throughput than even a 1-uop (1c latency, 1c throughput) `bsr`.
 define i32 @ctlz_bsr(i32 %n) {
 ; X86-LABEL: ctlz_bsr:
 ; X86:       # %bb.0:
@@ -982,14 +987,12 @@
 ;
 ; X86-CLZ-LABEL: ctlz_bsr:
 ; X86-CLZ:       # %bb.0:
-; X86-CLZ-NEXT:    lzcntl {{[0-9]+}}(%esp), %eax
-; X86-CLZ-NEXT:    xorl $31, %eax
+; X86-CLZ-NEXT:    bsrl {{[0-9]+}}(%esp), %eax
 ; X86-CLZ-NEXT:    retl
 ;
 ; X64-CLZ-LABEL: ctlz_bsr:
 ; X64-CLZ:       # %bb.0:
-; X64-CLZ-NEXT:    lzcntl %edi, %eax
-; X64-CLZ-NEXT:    xorl $31, %eax
+; X64-CLZ-NEXT:    bsrl %edi, %eax
 ; X64-CLZ-NEXT:    retq
 ;
 ; X64-FASTLZCNT-LABEL: ctlz_bsr:
@@ -1441,8 +1444,7 @@
 ; X86-CLZ-LABEL: PR47603_zext:
 ; X86-CLZ:       # %bb.0:
 ; X86-CLZ-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-CLZ-NEXT:    lzcntl {{[0-9]+}}(%esp), %ecx
-; X86-CLZ-NEXT:    xorl $31, %ecx
+; X86-CLZ-NEXT:    bsrl {{[0-9]+}}(%esp), %ecx
 ; X86-CLZ-NEXT:    movsbl (%eax,%ecx), %eax
 ; X86-CLZ-NEXT:    retl
 ;
@@ -1566,18 +1568,14 @@
 ; X86-CLZ-LABEL: ctlz_xor7_i8_true:
 ; X86-CLZ:       # %bb.0:
 ; X86-CLZ-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-CLZ-NEXT:    lzcntl %eax, %eax
-; X86-CLZ-NEXT:    addl $-24, %eax
-; X86-CLZ-NEXT:    xorb $7, %al
+; X86-CLZ-NEXT:    bsrl %eax, %eax
 ; X86-CLZ-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-CLZ-NEXT:    retl
 ;
 ; X64-CLZ-LABEL: ctlz_xor7_i8_true:
 ; X64-CLZ:       # %bb.0:
 ; X64-CLZ-NEXT:    movzbl %dil, %eax
-; X64-CLZ-NEXT:    lzcntl %eax, %eax
-; X64-CLZ-NEXT:    addl $-24, %eax
-; X64-CLZ-NEXT:    xorb $7, %al
+; X64-CLZ-NEXT:    bsrl %eax, %eax
 ; X64-CLZ-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-CLZ-NEXT:    retq
 ;
@@ -1692,16 +1690,12 @@
 ;
 ; X86-CLZ-LABEL: ctlz_xor15_i16_true:
 ; X86-CLZ:       # %bb.0:
-; X86-CLZ-NEXT:    lzcntw {{[0-9]+}}(%esp), %ax
-; X86-CLZ-NEXT:    xorl $15, %eax
-; X86-CLZ-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-CLZ-NEXT:    bsrw {{[0-9]+}}(%esp), %ax
 ; X86-CLZ-NEXT:    retl
 ;
 ; X64-CLZ-LABEL: ctlz_xor15_i16_true:
 ; X64-CLZ:       # %bb.0:
-; X64-CLZ-NEXT:    lzcntw %di, %ax
-; X64-CLZ-NEXT:    xorl $15, %eax
-; X64-CLZ-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-CLZ-NEXT:    bsrw %di, %ax
 ; X64-CLZ-NEXT:    retq
 ;
 ; X64-FASTLZCNT-LABEL: ctlz_xor15_i16_true:
@@ -1836,8 +1830,7 @@
 ;
 ; X64-CLZ-LABEL: ctlz_xor63_i64_true:
 ; X64-CLZ:       # %bb.0:
-; X64-CLZ-NEXT:    lzcntq %rdi, %rax
-; X64-CLZ-NEXT:    xorq $63, %rax
+; X64-CLZ-NEXT:    bsrq %rdi, %rax
 ; X64-CLZ-NEXT:    retq
 ;
 ; X64-FASTLZCNT-LABEL: ctlz_xor63_i64_true:
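
For reference, a minimal source-level sketch of the pattern this combine targets. This is an illustration, not part of the patch: the function names msb_index_xor and msb_index_sub are made up, and it assumes Clang/GCC's __builtin_clz (which lowers to llvm.ctlz with i1 true, i.e. ISD::CTLZ_ZERO_UNDEF) compiled for an x86 subtarget without fast-lzcnt.

// Sketch only. x must be nonzero: __builtin_clz(0) is undefined, which is
// what lets CTLZ_ZERO_UNDEF map onto bsr (bsr's output is undefined for 0).

// XOR form: clz(x) ^ 31 == 31 - clz(x), because clz(x) is in [0, 31] and
// 31 is all-ones in the low five bits, so the subtraction never borrows.
// Previously: lzcnt + xor. With this patch (no fast-lzcnt): a single bsr.
unsigned msb_index_xor(unsigned x) { return __builtin_clz(x) ^ 31u; }

// SUB form: at the DAG level the combine only matches
// (bitwidth - 1) - ctlz(x), i.e. the CTLZ node must be the second operand;
// ctlz(x) - 31 is left alone. (The mid-end may already have canonicalized
// this subtraction into the xor form before it reaches the DAG.)
unsigned msb_index_sub(unsigned x) { return 31u - unsigned(__builtin_clz(x)); }

Both expressions compute the bit index of the most significant set bit, which is exactly what bsr produces natively, so the trailing xor/sub disappears entirely.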