diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -51872,6 +51872,62 @@ return getSETCC(NewCC, LHS->getOperand(1), DL, DAG); } +static SDValue combineXorSubCTLZ(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + assert(N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::SUB); + if (Subtarget.hasFastLZCNT()) + return SDValue(); + + EVT VT = N->getValueType(0); + if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32 && + (VT != MVT::i64 || !Subtarget.is64Bit())) + return SDValue(); + + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + + if (N0.getOpcode() != ISD::CTLZ_ZERO_UNDEF && + N1.getOpcode() != ISD::CTLZ_ZERO_UNDEF) + return SDValue(); + + SDValue OpCTLZ; + SDValue OpSizeTM1; + + if (N1.getOpcode() == ISD::CTLZ_ZERO_UNDEF) { + OpCTLZ = N1; + OpSizeTM1 = N0; + } else if (N->getOpcode() == ISD::SUB) { + return SDValue(); + } else { + OpCTLZ = N0; + OpSizeTM1 = N1; + } + + if (!OpCTLZ.hasOneUse()) + return SDValue(); + auto *C = dyn_cast(OpSizeTM1); + if (C == nullptr) + return SDValue(); + + if (C->getZExtValue() != uint64_t(OpCTLZ.getValueSizeInBits() - 1)) + return SDValue(); + SDLoc DL(N); + EVT OpVT = VT; + SDValue Op = OpCTLZ.getOperand(0); + if (VT == MVT::i8) { + // Zero extend to i32 since there is not an i8 bsr. + OpVT = MVT::i32; + Op = DAG.getNode(ISD::ZERO_EXTEND, DL, OpVT, Op); + } + + SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); + Op = DAG.getNode(X86ISD::BSR, DL, VTs, Op); + if (VT == MVT::i8) { + Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Op); + } + return Op; +} + static SDValue combineXor(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { @@ -51899,6 +51955,9 @@ if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget)) return FPLogic; + if (SDValue R = combineXorSubCTLZ(N, DAG, Subtarget)) + return R; + if (DCI.isBeforeLegalizeOps()) return SDValue(); @@ -54784,6 +54843,9 @@ Op1.getOperand(0)); } + if (SDValue V = combineXorSubCTLZ(N, DAG, Subtarget)) + return V; + return combineAddOrSubToADCOrSBB(N, DAG); } diff --git a/llvm/test/CodeGen/X86/clz.ll b/llvm/test/CodeGen/X86/clz.ll --- a/llvm/test/CodeGen/X86/clz.ll +++ b/llvm/test/CodeGen/X86/clz.ll @@ -968,7 +968,12 @@ ; Don't generate any xors when a 'ctlz' intrinsic is actually used to compute ; the most significant bit, which is what 'bsr' does natively. -; FIXME: We should probably select BSR instead of LZCNT in these circumstances. +; NOTE: We intentionally don't select `bsr` when `fast-lzcnt` is +; available. This is 1) because `bsr` has some drawbacks including a +; dependency on dst, 2) very poor performance on some of the +; `fast-lzcnt` processors, and 3) `lzcnt` runs at ALU latency/throughput +; so `lzcnt` + `xor` has better throughput than even the 1-uop +; (1c latency, 1c throughput) `bsr`. define i32 @ctlz_bsr(i32 %n) { ; X86-LABEL: ctlz_bsr: ; X86: # %bb.0: @@ -982,14 +987,12 @@ ; ; X86-CLZ-LABEL: ctlz_bsr: ; X86-CLZ: # %bb.0: -; X86-CLZ-NEXT: lzcntl {{[0-9]+}}(%esp), %eax -; X86-CLZ-NEXT: xorl $31, %eax +; X86-CLZ-NEXT: bsrl {{[0-9]+}}(%esp), %eax ; X86-CLZ-NEXT: retl ; ; X64-CLZ-LABEL: ctlz_bsr: ; X64-CLZ: # %bb.0: -; X64-CLZ-NEXT: lzcntl %edi, %eax -; X64-CLZ-NEXT: xorl $31, %eax +; X64-CLZ-NEXT: bsrl %edi, %eax ; X64-CLZ-NEXT: retq ; ; X64-FASTLZCNT-LABEL: ctlz_bsr: @@ -1441,8 +1444,7 @@ ; X86-CLZ-LABEL: PR47603_zext: ; X86-CLZ: # %bb.0: ; X86-CLZ-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-CLZ-NEXT: lzcntl {{[0-9]+}}(%esp), %ecx -; X86-CLZ-NEXT: xorl $31, %ecx +; X86-CLZ-NEXT: bsrl {{[0-9]+}}(%esp), %ecx ; X86-CLZ-NEXT: movsbl (%eax,%ecx), %eax ; X86-CLZ-NEXT: retl ; @@ -1565,18 +1567,14 @@ ; X86-CLZ-LABEL: ctlz_xor7_i8_true: ; X86-CLZ: # %bb.0: ; X86-CLZ-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-CLZ-NEXT: lzcntl %eax, %eax -; X86-CLZ-NEXT: addl $-24, %eax -; X86-CLZ-NEXT: xorb $7, %al +; X86-CLZ-NEXT: bsrl %eax, %eax ; X86-CLZ-NEXT: # kill: def $al killed $al killed $eax ; X86-CLZ-NEXT: retl ; ; X64-CLZ-LABEL: ctlz_xor7_i8_true: ; X64-CLZ: # %bb.0: ; X64-CLZ-NEXT: movzbl %dil, %eax -; X64-CLZ-NEXT: lzcntl %eax, %eax -; X64-CLZ-NEXT: addl $-24, %eax -; X64-CLZ-NEXT: xorb $7, %al +; X64-CLZ-NEXT: bsrl %eax, %eax ; X64-CLZ-NEXT: # kill: def $al killed $al killed $eax ; X64-CLZ-NEXT: retq ; @@ -1691,16 +1689,12 @@ ; ; X86-CLZ-LABEL: ctlz_xor15_i16_true: ; X86-CLZ: # %bb.0: -; X86-CLZ-NEXT: lzcntw {{[0-9]+}}(%esp), %ax -; X86-CLZ-NEXT: xorl $15, %eax -; X86-CLZ-NEXT: # kill: def $ax killed $ax killed $eax +; X86-CLZ-NEXT: bsrw {{[0-9]+}}(%esp), %ax ; X86-CLZ-NEXT: retl ; ; X64-CLZ-LABEL: ctlz_xor15_i16_true: ; X64-CLZ: # %bb.0: -; X64-CLZ-NEXT: lzcntw %di, %ax -; X64-CLZ-NEXT: xorl $15, %eax -; X64-CLZ-NEXT: # kill: def $ax killed $ax killed $eax +; X64-CLZ-NEXT: bsrw %di, %ax ; X64-CLZ-NEXT: retq ; ; X64-FASTLZCNT-LABEL: ctlz_xor15_i16_true: @@ -1726,13 +1720,13 @@ ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB31_1 +; X86-NEXT: je .LBB30_1 ; X86-NEXT: # %bb.2: # %cond.false ; X86-NEXT: bsrl %eax, %eax ; X86-NEXT: xorl $31, %eax ; X86-NEXT: xorl $31, %eax ; X86-NEXT: retl -; X86-NEXT: .LBB31_1: +; X86-NEXT: .LBB30_1: ; X86-NEXT: movl $32, %eax ; X86-NEXT: xorl $31, %eax ; X86-NEXT: retl @@ -1740,13 +1734,13 @@ ; X64-LABEL: ctlz_xor31_i32_false: ; X64: # %bb.0: ; X64-NEXT: testl %edi, %edi -; X64-NEXT: je .LBB31_1 +; X64-NEXT: je .LBB30_1 ; X64-NEXT: # %bb.2: # %cond.false ; X64-NEXT: bsrl %edi, %eax ; X64-NEXT: xorl $31, %eax ; X64-NEXT: xorl $31, %eax ; X64-NEXT: retq -; X64-NEXT: .LBB31_1: +; X64-NEXT: .LBB30_1: ; X64-NEXT: movl $32, %eax ; X64-NEXT: xorl $31, %eax ; X64-NEXT: retq @@ -1784,16 +1778,16 @@ ; X86-NOCMOV: # %bb.0: ; X86-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NOCMOV-NEXT: testl %eax, %eax -; X86-NOCMOV-NEXT: jne .LBB32_1 +; X86-NOCMOV-NEXT: jne .LBB31_1 ; X86-NOCMOV-NEXT: # %bb.2: ; X86-NOCMOV-NEXT: bsrl {{[0-9]+}}(%esp), %eax ; X86-NOCMOV-NEXT: xorl $31, %eax ; X86-NOCMOV-NEXT: addl $32, %eax -; X86-NOCMOV-NEXT: jmp .LBB32_3 -; X86-NOCMOV-NEXT: .LBB32_1: +; X86-NOCMOV-NEXT: jmp .LBB31_3 +; X86-NOCMOV-NEXT: .LBB31_1: ; X86-NOCMOV-NEXT: bsrl %eax, %eax ; X86-NOCMOV-NEXT: xorl $31, %eax -; X86-NOCMOV-NEXT: .LBB32_3: +; X86-NOCMOV-NEXT: .LBB31_3: ; X86-NOCMOV-NEXT: xorl $63, %eax ; X86-NOCMOV-NEXT: xorl %edx, %edx ; X86-NOCMOV-NEXT: retl @@ -1821,22 +1815,21 @@ ; X86-CLZ: # %bb.0: ; X86-CLZ-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-CLZ-NEXT: testl %eax, %eax -; X86-CLZ-NEXT: jne .LBB32_1 +; X86-CLZ-NEXT: jne .LBB31_1 ; X86-CLZ-NEXT: # %bb.2: ; X86-CLZ-NEXT: lzcntl {{[0-9]+}}(%esp), %eax ; X86-CLZ-NEXT: addl $32, %eax -; X86-CLZ-NEXT: jmp .LBB32_3 -; X86-CLZ-NEXT: .LBB32_1: +; X86-CLZ-NEXT: jmp .LBB31_3 +; X86-CLZ-NEXT: .LBB31_1: ; X86-CLZ-NEXT: lzcntl %eax, %eax -; X86-CLZ-NEXT: .LBB32_3: +; X86-CLZ-NEXT: .LBB31_3: ; X86-CLZ-NEXT: xorl $63, %eax ; X86-CLZ-NEXT: xorl %edx, %edx ; X86-CLZ-NEXT: retl ; ; X64-CLZ-LABEL: ctlz_xor63_i64_true: ; X64-CLZ: # %bb.0: -; X64-CLZ-NEXT: lzcntq %rdi, %rax -; X64-CLZ-NEXT: xorq $63, %rax +; X64-CLZ-NEXT: bsrq %rdi, %rax ; X64-CLZ-NEXT: retq ; ; X64-FASTLZCNT-LABEL: ctlz_xor63_i64_true: @@ -1849,14 +1842,14 @@ ; X86-FASTLZCNT: # %bb.0: ; X86-FASTLZCNT-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-FASTLZCNT-NEXT: testl %eax, %eax -; X86-FASTLZCNT-NEXT: jne .LBB32_1 +; X86-FASTLZCNT-NEXT: jne .LBB31_1 ; X86-FASTLZCNT-NEXT: # %bb.2: ; X86-FASTLZCNT-NEXT: lzcntl {{[0-9]+}}(%esp), %eax ; X86-FASTLZCNT-NEXT: addl $32, %eax -; X86-FASTLZCNT-NEXT: jmp .LBB32_3 -; X86-FASTLZCNT-NEXT: .LBB32_1: +; X86-FASTLZCNT-NEXT: jmp .LBB31_3 +; X86-FASTLZCNT-NEXT: .LBB31_1: ; X86-FASTLZCNT-NEXT: lzcntl %eax, %eax -; X86-FASTLZCNT-NEXT: .LBB32_3: +; X86-FASTLZCNT-NEXT: .LBB31_3: ; X86-FASTLZCNT-NEXT: xorl $63, %eax ; X86-FASTLZCNT-NEXT: xorl %edx, %edx ; X86-FASTLZCNT-NEXT: retl