Index: include/llvm/Target/TargetLowering.h =================================================================== --- include/llvm/Target/TargetLowering.h +++ include/llvm/Target/TargetLowering.h @@ -3068,7 +3068,8 @@ // If we're comparing for equality to zero and isCtlzFast is true, expose the // fact that this can be implemented as a ctlz/srl pair, so that the dag // combiner can fold the new nodes. - SDValue lowerCmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerCmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy, + SelectionDAG &DAG) const; private: SDValue simplifySetCCWithAnd(EVT VT, SDValue N0, SDValue N1, Index: lib/CodeGen/SelectionDAG/TargetLowering.cpp =================================================================== --- lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -3590,7 +3590,7 @@ return CallResult.first; } -SDValue TargetLowering::lowerCmpEqZeroToCtlzSrl(SDValue Op, +SDValue TargetLowering::lowerCmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy, SelectionDAG &DAG) const { assert((Op->getOpcode() == ISD::SETCC) && "Input has to be a SETCC node."); if (!isCtlzFast()) @@ -3607,9 +3607,18 @@ } unsigned Log2b = Log2_32(VT.getSizeInBits()); SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Zext); - SDValue Scc = DAG.getNode(ISD::SRL, dl, VT, Clz, + // The result of the shift is true or false, and on X86, the 32-bit + // encoding of shr and lzcnt is more desirable. 
+      EVT SccTy = VT;
+      SDValue Trunc = Clz;
+      if (!isTypeDesirableForOp(ISD::SRL, VT) &&
+          isTypeDesirableForOp(ISD::SRL, MVT::i32)) {
+        SccTy = MVT::i32;
+        Trunc = DAG.getNode(ISD::TRUNCATE, dl, SccTy, Clz);
+      }
+      SDValue Scc = DAG.getNode(ISD::SRL, dl, SccTy, Trunc,
                                 DAG.getConstant(Log2b, dl, MVT::i32));
-      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Scc);
+      return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
     }
   }
   return SDValue();
Index: lib/Target/PowerPC/PPCISelLowering.cpp
===================================================================
--- lib/Target/PowerPC/PPCISelLowering.cpp
+++ lib/Target/PowerPC/PPCISelLowering.cpp
@@ -2364,7 +2364,7 @@
   // If we're comparing for equality to zero, expose the fact that this is
   // implemented as a ctlz/srl pair on ppc, so that the dag combiner can
   // fold the new nodes.
-  if (SDValue V = lowerCmpEqZeroToCtlzSrl(Op, DAG))
+  if (SDValue V = lowerCmpEqZeroToCtlzSrl(Op, MVT::i32, DAG))
     return V;

   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
Index: lib/Target/X86/X86.td
===================================================================
--- lib/Target/X86/X86.td
+++ lib/Target/X86/X86.td
@@ -262,6 +262,12 @@
 def FeatureFastVectorFSQRT : SubtargetFeature<"fast-vector-fsqrt",
     "HasFastVectorFSQRT", "true",
     "Vector SQRT is fast (disable Newton-Raphson)">;
+// If lzcnt has equivalent latency/throughput to most simple integer ops, it can
+// be used to replace test/set sequences.
+def FeatureFastLZCNT
+    : SubtargetFeature<
+          "fast-lzcnt", "HasFastLZCNT", "true",
+          "LZCNT instructions are as fast as most simple integer ops">;

 //===----------------------------------------------------------------------===//
 // X86 processors supported.
@@ -646,6 +652,7 @@ FeatureF16C, FeatureMOVBE, FeatureLZCNT, + FeatureFastLZCNT, FeaturePOPCNT, FeatureXSAVE, FeatureXSAVEOPT, Index: lib/Target/X86/X86ISelLowering.h =================================================================== --- lib/Target/X86/X86ISelLowering.h +++ lib/Target/X86/X86ISelLowering.h @@ -760,6 +760,8 @@ bool isCheapToSpeculateCtlz() const override; + bool isCtlzFast() const override; + bool hasBitPreservingFPLogic(EVT VT) const override { return VT == MVT::f32 || VT == MVT::f64 || VT.isVector(); } Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -4187,6 +4187,10 @@ return Subtarget.hasLZCNT(); } +bool X86TargetLowering::isCtlzFast() const { + return Subtarget.hasLZCNT() && Subtarget.hasFastLZCNT(); +} + bool X86TargetLowering::hasAndNotCompare(SDValue Y) const { if (!Subtarget.hasBMI()) return false; @@ -28997,9 +29001,89 @@ return DAG.getBitcast(VT, Mask); } +// Try to transform: +// zext(or(setcc (x, 0, eq), setcc (y, 0, eq)) +// into: +// srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)) +// Will also attempt to match more generic cases, eg: +// zext(or(or(setcc, setcc), setcc)) +// Only applies if the target supports the FastLZCNT feature. +static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { + if (DCI.isBeforeLegalize()) + return SDValue(); + + // Check the OR user is a zero extend and that it is extending to 32-bit or + // more. The code generated by srl(ctlz) for 16-bit or less variants of the + // pattern would require extra instructions to clear the upper bits. 
+  if (!N->hasOneUse() || !(N->use_begin()->getOpcode() == ISD::ZERO_EXTEND) ||
+      !N->use_begin()->getSimpleValueType(0).bitsGE(MVT::i32))
+    return SDValue();
+
+  auto isSetCCCandidate = [](SDValue N) {
+    return N->getOpcode() == ISD::SETCC && N->hasOneUse() &&
+           N->getOperand(0).getValueType().bitsGE(MVT::i32);
+  };
+
+  SDNode *OR = N;
+  SDValue LHS = OR->getOperand(0);
+  SDValue RHS = OR->getOperand(1);
+
+  // Save nodes matching or(or, setcc).
+  SmallVector<SDNode *, 2> ORNodes;
+  while (((LHS->getOpcode() == ISD::OR && isSetCCCandidate(RHS)) ||
+          (RHS.getOpcode() == ISD::OR && isSetCCCandidate(LHS)))) {
+    ORNodes.push_back(OR);
+    OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
+    LHS = OR->getOperand(0);
+    RHS = OR->getOperand(1);
+  }
+
+  // The last OR node should match or(setcc, setcc).
+  if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
+      OR->getOpcode() != ISD::OR)
+    return SDValue();
+
+  // We have a or(setcc, setcc) pattern, try to lower it to
+  // or(srl(ctlz),srl(ctlz)). The dag combiner can then fold it into:
+  // srl(or(ctlz, ctlz)).
+  EVT VT = N->getValueType(0);
+  SDValue NewLHS =
+      Subtarget.getTargetLowering()->lowerCmpEqZeroToCtlzSrl(LHS, VT, DAG);
+  SDValue Ret, NewRHS;
+  if (NewLHS && (NewRHS = Subtarget.getTargetLowering()->lowerCmpEqZeroToCtlzSrl(
+                     RHS, VT, DAG)))
+    Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);
+
+  if (!Ret)
+    return SDValue();
+
+  // Try to lower nodes matching the or(or, setcc) pattern.
+  while (ORNodes.size() > 0) {
+    OR = ORNodes.pop_back_val();
+    LHS = OR->getOperand(0);
+    RHS = OR->getOperand(1);
+    // Swap rhs with lhs to match or(setcc, or).
+    if (RHS->getOpcode() == ISD::OR && isSetCCCandidate(LHS))
+      std::swap(LHS, RHS);
+    EVT VT = OR->getValueType(0);
+    SDValue NewRHS =
+        Subtarget.getTargetLowering()->lowerCmpEqZeroToCtlzSrl(RHS, VT, DAG);
+    if (!NewRHS)
+      return SDValue();
+    Ret = DAG.getNode(ISD::OR, SDLoc(N), VT, Ret, NewRHS);
+  }
+
+  return Ret;
+}
+
 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
                          TargetLowering::DAGCombinerInfo &DCI,
                          const X86Subtarget &Subtarget) {
+  if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
+    return R;
+
   if (DCI.isBeforeLegalizeOps())
     return SDValue();

@@ -31709,6 +31793,8 @@
 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
   if (!isTypeLegal(VT))
     return false;
+  if (Opc == ISD::SRL && VT != MVT::i32 && VT != MVT::i8)
+    return false;
   if (VT != MVT::i16)
     return true;
Index: lib/Target/X86/X86InstrInfo.td
===================================================================
--- lib/Target/X86/X86InstrInfo.td
+++ lib/Target/X86/X86InstrInfo.td
@@ -889,6 +889,7 @@
 def FavorMemIndirectCall : Predicate<"!Subtarget->callRegIndirect()">;
 def NotSlowIncDec : Predicate<"!Subtarget->slowIncDec()">;
 def HasFastMem32 : Predicate<"!Subtarget->isUnalignedMem32Slow()">;
+def HasFastLZCNT : Predicate<"Subtarget->hasFastLZCNT()">;
 def HasMFence : Predicate<"Subtarget->hasMFence()">;

 //===----------------------------------------------------------------------===//
Index: lib/Target/X86/X86Subtarget.h
===================================================================
--- lib/Target/X86/X86Subtarget.h
+++ lib/Target/X86/X86Subtarget.h
@@ -215,6 +215,9 @@
   /// 64-bit divisions and should be used when possible.
   bool HasSlowDivide64;

+  /// True if LZCNT instruction is fast.
+  bool HasFastLZCNT;
+
   /// True if the short functions should be padded to prevent
   /// a stall when returning too early.
bool PadShortFunctions; @@ -444,6 +447,7 @@ bool hasFastPartialYMMWrite() const { return HasFastPartialYMMWrite; } bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; } bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; } + bool hasFastLZCNT() const { return HasFastLZCNT; } bool hasSlowDivide32() const { return HasSlowDivide32; } bool hasSlowDivide64() const { return HasSlowDivide64; } bool padShortFunctions() const { return PadShortFunctions; } Index: lib/Target/X86/X86Subtarget.cpp =================================================================== --- lib/Target/X86/X86Subtarget.cpp +++ lib/Target/X86/X86Subtarget.cpp @@ -284,6 +284,7 @@ HasFastPartialYMMWrite = false; HasFastScalarFSQRT = false; HasFastVectorFSQRT = false; + HasFastLZCNT = false; HasSlowDivide32 = false; HasSlowDivide64 = false; PadShortFunctions = false; Index: test/CodeGen/X86/lzcnt-zext-cmp.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/lzcnt-zext-cmp.ll @@ -0,0 +1,283 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; Test patterns which generates lzcnt instructions. +; Eg: zext(or(setcc(cmp), setcc(cmp))) -> shr(or(lzcnt, lzcnt)) +; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=btver2 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=btver2 -mattr=-fast-lzcnt | FileCheck --check-prefix=NOFASTLZCNT %s + +; Test two 32-bit inputs, output is 32-bit. 
+define i32 @bar1(i32 %a, i32 %b) { +; CHECK-LABEL: bar1: +; CHECK: # BB#0: +; CHECK-NEXT: lzcntl %edi, %ecx +; CHECK-NEXT: lzcntl %esi, %eax +; CHECK-NEXT: orl %ecx, %eax +; CHECK-NEXT: shrl $5, %eax +; CHECK-NEXT: retq +; +; NOFASTLZCNT-LABEL: bar1: +; NOFASTLZCNT: # BB#0: +; NOFASTLZCNT-NEXT: testl %edi, %edi +; NOFASTLZCNT-NEXT: sete %al +; NOFASTLZCNT-NEXT: testl %esi, %esi +; NOFASTLZCNT-NEXT: sete %cl +; NOFASTLZCNT-NEXT: orb %al, %cl +; NOFASTLZCNT-NEXT: movzbl %cl, %eax +; NOFASTLZCNT-NEXT: retq + %cmp = icmp eq i32 %a, 0 + %cmp1 = icmp eq i32 %b, 0 + %or = or i1 %cmp, %cmp1 + %lor.ext = zext i1 %or to i32 + ret i32 %lor.ext +} + +; Test two 64-bit inputs, output is 64-bit. +define i64 @bar2(i64 %a, i64 %b) { +; CHECK-LABEL: bar2: +; CHECK: # BB#0: +; CHECK-NEXT: lzcntq %rdi, %rcx +; CHECK-NEXT: lzcntq %rsi, %rax +; CHECK-NEXT: orl %ecx, %eax +; CHECK-NEXT: shrl $6, %eax +; CHECK-NEXT: retq +; +; NOFASTLZCNT-LABEL: bar2: +; NOFASTLZCNT: # BB#0: +; NOFASTLZCNT-NEXT: testq %rdi, %rdi +; NOFASTLZCNT-NEXT: sete %al +; NOFASTLZCNT-NEXT: testq %rsi, %rsi +; NOFASTLZCNT-NEXT: sete %cl +; NOFASTLZCNT-NEXT: orb %al, %cl +; NOFASTLZCNT-NEXT: movzbl %cl, %eax +; NOFASTLZCNT-NEXT: retq + %cmp = icmp eq i64 %a, 0 + %cmp1 = icmp eq i64 %b, 0 + %or = or i1 %cmp, %cmp1 + %lor.ext = zext i1 %or to i64 + ret i64 %lor.ext +} + +; Test two 16-bit inputs, output is 16-bit. +; The transform is disabled for the 16-bit case, as we still have to clear the +; upper 16-bits, adding one more instruction. 
+define i16 @bar3(i16 %a, i16 %b) { +; CHECK-LABEL: bar3: +; CHECK: # BB#0: +; CHECK-NEXT: testw %di, %di +; CHECK-NEXT: sete %al +; CHECK-NEXT: testw %si, %si +; CHECK-NEXT: sete %cl +; CHECK-NEXT: orb %al, %cl +; CHECK-NEXT: movzbl %cl, %eax +; CHECK-NEXT: # kill: %AX %AX %EAX +; CHECK-NEXT: retq +; +; NOFASTLZCNT-LABEL: bar3: +; NOFASTLZCNT: # BB#0: +; NOFASTLZCNT-NEXT: testw %di, %di +; NOFASTLZCNT-NEXT: sete %al +; NOFASTLZCNT-NEXT: testw %si, %si +; NOFASTLZCNT-NEXT: sete %cl +; NOFASTLZCNT-NEXT: orb %al, %cl +; NOFASTLZCNT-NEXT: movzbl %cl, %eax +; NOFASTLZCNT-NEXT: # kill: %AX %AX %EAX +; NOFASTLZCNT-NEXT: retq + %cmp = icmp eq i16 %a, 0 + %cmp1 = icmp eq i16 %b, 0 + %or = or i1 %cmp, %cmp1 + %lor.ext = zext i1 %or to i16 + ret i16 %lor.ext +} + +; Test two 32-bit inputs, output is 64-bit. +define i64 @bar4(i32 %a, i32 %b) { +; CHECK-LABEL: bar4: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: lzcntl %edi, %ecx +; CHECK-NEXT: lzcntl %esi, %eax +; CHECK-NEXT: orl %ecx, %eax +; CHECK-NEXT: shrl $5, %eax +; CHECK-NEXT: retq +; +; NOFASTLZCNT-LABEL: bar4: +; NOFASTLZCNT: # BB#0: # %entry +; NOFASTLZCNT-NEXT: testl %edi, %edi +; NOFASTLZCNT-NEXT: sete %al +; NOFASTLZCNT-NEXT: testl %esi, %esi +; NOFASTLZCNT-NEXT: sete %cl +; NOFASTLZCNT-NEXT: orb %al, %cl +; NOFASTLZCNT-NEXT: movzbl %cl, %eax +; NOFASTLZCNT-NEXT: retq +entry: + %cmp = icmp eq i32 %a, 0 + %cmp1 = icmp eq i32 %b, 0 + %0 = or i1 %cmp, %cmp1 + %conv = zext i1 %0 to i64 + ret i64 %conv +} + +; Test two 64-bit inputs, output is 32-bit. 
+define i32 @bar5(i64 %a, i64 %b) { +; CHECK-LABEL: bar5: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: lzcntq %rdi, %rcx +; CHECK-NEXT: lzcntq %rsi, %rax +; CHECK-NEXT: orl %ecx, %eax +; CHECK-NEXT: shrl $6, %eax +; CHECK-NEXT: # kill: %EAX %EAX %RAX +; CHECK-NEXT: retq +; +; NOFASTLZCNT-LABEL: bar5: +; NOFASTLZCNT: # BB#0: # %entry +; NOFASTLZCNT-NEXT: testq %rdi, %rdi +; NOFASTLZCNT-NEXT: sete %al +; NOFASTLZCNT-NEXT: testq %rsi, %rsi +; NOFASTLZCNT-NEXT: sete %cl +; NOFASTLZCNT-NEXT: orb %al, %cl +; NOFASTLZCNT-NEXT: movzbl %cl, %eax +; NOFASTLZCNT-NEXT: retq +entry: + %cmp = icmp eq i64 %a, 0 + %cmp1 = icmp eq i64 %b, 0 + %0 = or i1 %cmp, %cmp1 + %lor.ext = zext i1 %0 to i32 + ret i32 %lor.ext +} + +; Test three 32-bit inputs, output is 32-bit. +define i32 @bar6(i32 %a, i32 %b, i32 %c) { +; CHECK-LABEL: bar6: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: lzcntl %edi, %eax +; CHECK-NEXT: lzcntl %esi, %ecx +; CHECK-NEXT: orl %eax, %ecx +; CHECK-NEXT: lzcntl %edx, %eax +; CHECK-NEXT: orl %ecx, %eax +; CHECK-NEXT: shrl $5, %eax +; CHECK-NEXT: retq +; +; NOFASTLZCNT-LABEL: bar6: +; NOFASTLZCNT: # BB#0: # %entry +; NOFASTLZCNT-NEXT: testl %edi, %edi +; NOFASTLZCNT-NEXT: sete %al +; NOFASTLZCNT-NEXT: testl %esi, %esi +; NOFASTLZCNT-NEXT: sete %cl +; NOFASTLZCNT-NEXT: orb %al, %cl +; NOFASTLZCNT-NEXT: testl %edx, %edx +; NOFASTLZCNT-NEXT: sete %al +; NOFASTLZCNT-NEXT: orb %cl, %al +; NOFASTLZCNT-NEXT: movzbl %al, %eax +; NOFASTLZCNT-NEXT: retq +entry: + %cmp = icmp eq i32 %a, 0 + %cmp1 = icmp eq i32 %b, 0 + %or.cond = or i1 %cmp, %cmp1 + %cmp2 = icmp eq i32 %c, 0 + %.cmp2 = or i1 %or.cond, %cmp2 + %lor.ext = zext i1 %.cmp2 to i32 + ret i32 %lor.ext +} + +; Test three 32-bit inputs, output is 32-bit, but compared to bar6 test, +; %.cmp2 inputs' order is inverted. 
+define i32 @bar7(i32 %a, i32 %b, i32 %c) { +; CHECK-LABEL: bar7: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: lzcntl %edi, %eax +; CHECK-NEXT: lzcntl %esi, %ecx +; CHECK-NEXT: orl %eax, %ecx +; CHECK-NEXT: lzcntl %edx, %eax +; CHECK-NEXT: orl %ecx, %eax +; CHECK-NEXT: shrl $5, %eax +; CHECK-NEXT: retq +; +; NOFASTLZCNT-LABEL: bar7: +; NOFASTLZCNT: # BB#0: # %entry +; NOFASTLZCNT-NEXT: testl %edi, %edi +; NOFASTLZCNT-NEXT: sete %al +; NOFASTLZCNT-NEXT: testl %esi, %esi +; NOFASTLZCNT-NEXT: sete %cl +; NOFASTLZCNT-NEXT: orb %al, %cl +; NOFASTLZCNT-NEXT: testl %edx, %edx +; NOFASTLZCNT-NEXT: sete %al +; NOFASTLZCNT-NEXT: orb %cl, %al +; NOFASTLZCNT-NEXT: movzbl %al, %eax +; NOFASTLZCNT-NEXT: retq +entry: + %cmp = icmp eq i32 %a, 0 + %cmp1 = icmp eq i32 %b, 0 + %or.cond = or i1 %cmp, %cmp1 + %cmp2 = icmp eq i32 %c, 0 + %.cmp2 = or i1 %cmp2, %or.cond + %lor.ext = zext i1 %.cmp2 to i32 + ret i32 %lor.ext +} + +; Test four 32-bit inputs, output is 32-bit. +define i32 @bar8(i32 %a, i32 %b, i32 %c, i32 %d) { +; CHECK-LABEL: bar8: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: lzcntl %edi, %eax +; CHECK-NEXT: lzcntl %esi, %esi +; CHECK-NEXT: lzcntl %edx, %edx +; CHECK-NEXT: orl %eax, %esi +; CHECK-NEXT: lzcntl %ecx, %eax +; CHECK-NEXT: orl %edx, %eax +; CHECK-NEXT: orl %esi, %eax +; CHECK-NEXT: shrl $5, %eax +; CHECK-NEXT: retq +; +; NOFASTLZCNT-LABEL: bar8: +; NOFASTLZCNT: # BB#0: # %entry +; NOFASTLZCNT-NEXT: testl %edi, %edi +; NOFASTLZCNT-NEXT: sete %dil +; NOFASTLZCNT-NEXT: testl %esi, %esi +; NOFASTLZCNT-NEXT: sete %al +; NOFASTLZCNT-NEXT: orb %dil, %al +; NOFASTLZCNT-NEXT: testl %edx, %edx +; NOFASTLZCNT-NEXT: sete %dl +; NOFASTLZCNT-NEXT: testl %ecx, %ecx +; NOFASTLZCNT-NEXT: sete %cl +; NOFASTLZCNT-NEXT: orb %dl, %cl +; NOFASTLZCNT-NEXT: orb %al, %cl +; NOFASTLZCNT-NEXT: movzbl %cl, %eax +; NOFASTLZCNT-NEXT: retq +entry: + %cmp = icmp eq i32 %a, 0 + %cmp1 = icmp eq i32 %b, 0 + %or.cond = or i1 %cmp, %cmp1 + %cmp3 = icmp eq i32 %c, 0 + %or.cond5 = or i1 %or.cond, %cmp3 + 
%cmp4 = icmp eq i32 %d, 0 + %.cmp4 = or i1 %or.cond5, %cmp4 + %lor.ext = zext i1 %.cmp4 to i32 + ret i32 %lor.ext +} + +; Test one 32-bit input, one 64-bit input, output is 32-bit. +define i32 @bar9(i32 %a, i64 %b) { +; CHECK-LABEL: bar9: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: lzcntq %rsi, %rax +; CHECK-NEXT: lzcntl %edi, %ecx +; CHECK-NEXT: shrl $5, %ecx +; CHECK-NEXT: shrl $6, %eax +; CHECK-NEXT: orl %ecx, %eax +; CHECK-NEXT: # kill: %EAX %EAX %RAX +; CHECK-NEXT: retq +; +; NOFASTLZCNT-LABEL: bar9: +; NOFASTLZCNT: # BB#0: # %entry +; NOFASTLZCNT-NEXT: testl %edi, %edi +; NOFASTLZCNT-NEXT: sete %al +; NOFASTLZCNT-NEXT: testq %rsi, %rsi +; NOFASTLZCNT-NEXT: sete %cl +; NOFASTLZCNT-NEXT: orb %al, %cl +; NOFASTLZCNT-NEXT: movzbl %cl, %eax +; NOFASTLZCNT-NEXT: retq +entry: + %cmp = icmp eq i32 %a, 0 + %cmp1 = icmp eq i64 %b, 0 + %0 = or i1 %cmp, %cmp1 + %lor.ext = zext i1 %0 to i32 + ret i32 %lor.ext +}