Index: include/llvm/Target/TargetLowering.h
===================================================================
--- include/llvm/Target/TargetLowering.h
+++ include/llvm/Target/TargetLowering.h
@@ -3062,7 +3062,8 @@
   // If we're comparing for equality to zero and isCtlzFast is true, expose the
   // fact that this can be implemented as a ctlz/srl pair, so that the dag
   // combiner can fold the new nodes.
-  SDValue lowerCmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerCmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
+                                  SelectionDAG &DAG) const;
 
 private:
   SDValue simplifySetCCWithAnd(EVT VT, SDValue N0, SDValue N1,
Index: lib/CodeGen/SelectionDAG/TargetLowering.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -3563,7 +3563,7 @@
   return CallResult.first;
 }
 
-SDValue TargetLowering::lowerCmpEqZeroToCtlzSrl(SDValue Op,
+SDValue TargetLowering::lowerCmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
                                                 SelectionDAG &DAG) const {
   assert((Op->getOpcode() == ISD::SETCC) && "Input has to be a SETCC node.");
   if (!isCtlzFast())
@@ -3582,7 +3582,10 @@
       SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Zext);
       SDValue Scc = DAG.getNode(ISD::SRL, dl, VT, Clz,
                                 DAG.getConstant(Log2b, dl, MVT::i32));
-      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Scc);
+      if (ExtTy.bitsLE(VT))
+        return DAG.getNode(ISD::TRUNCATE, dl, ExtTy, Scc);
+      else
+        return DAG.getNode(ISD::ZERO_EXTEND, dl, ExtTy, Scc);
     }
   }
   return SDValue();
Index: lib/Target/PowerPC/PPCISelLowering.cpp
===================================================================
--- lib/Target/PowerPC/PPCISelLowering.cpp
+++ lib/Target/PowerPC/PPCISelLowering.cpp
@@ -2362,7 +2362,7 @@
   // If we're comparing for equality to zero, expose the fact that this is
   // implemented as a ctlz/srl pair on ppc, so that the dag combiner can
   // fold the new nodes.
-  if (SDValue V = lowerCmpEqZeroToCtlzSrl(Op, DAG))
+  if (SDValue V = lowerCmpEqZeroToCtlzSrl(Op, MVT::i32, DAG))
     return V;
 
   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
Index: lib/Target/X86/X86.td
===================================================================
--- lib/Target/X86/X86.td
+++ lib/Target/X86/X86.td
@@ -262,6 +262,12 @@
 def FeatureFastVectorFSQRT : SubtargetFeature<"fast-vector-fsqrt",
     "HasFastVectorFSQRT", "true",
     "Vector SQRT is fast (disable Newton-Raphson)">;
+// If lzcnt has equivalent latency/throughput to most simple integer ops, it
+// can be used to replace test/set sequences.
+def FeatureFastLZCNT
+    : SubtargetFeature<
+          "fast-lzcnt", "HasFastLZCNT", "true",
+          "LZCNT instructions are as fast as most simple integer ops">;
 
 //===----------------------------------------------------------------------===//
 // X86 processors supported.
@@ -646,6 +652,7 @@
   FeatureF16C,
   FeatureMOVBE,
   FeatureLZCNT,
+  FeatureFastLZCNT,
   FeaturePOPCNT,
   FeatureXSAVE,
   FeatureXSAVEOPT,
Index: lib/Target/X86/X86ISelLowering.h
===================================================================
--- lib/Target/X86/X86ISelLowering.h
+++ lib/Target/X86/X86ISelLowering.h
@@ -760,6 +760,8 @@
     bool isCheapToSpeculateCtlz() const override;
 
+    bool isCtlzFast() const override;
+
     bool hasBitPreservingFPLogic(EVT VT) const override {
       return VT == MVT::f32 || VT == MVT::f64 || VT.isVector();
     }
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -4189,6 +4189,10 @@
   return Subtarget.hasLZCNT();
 }
 
+bool X86TargetLowering::isCtlzFast() const {
+  return Subtarget.hasLZCNT() && Subtarget.hasFastLZCNT();
+}
+
 bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
   if (!Subtarget.hasBMI())
     return false;
@@ -30919,6 +30923,17 @@
   if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
     return NewAdd;
 
+  // The transform is disabled for the 16-bit case, as we still have to clear
+  // the upper 16 bits, adding one more instruction.
+  if (N0->getOpcode() == ISD::SETCC && N0->hasOneUse() &&
+      N0->getOperand(0).getValueType().bitsGE(MVT::i32) &&
+      N->getSimpleValueType(0).bitsGE(MVT::i32) && !DCI.isBeforeLegalize() &&
+      !DAG.getMachineFunction().getFunction()->optForSize())
+    if (SDValue V = Subtarget.getTargetLowering()->lowerCmpEqZeroToCtlzSrl(
+            N0, N->getValueType(0), DAG)) {
+      return V;
+    }
+
   return SDValue();
 }
Index: lib/Target/X86/X86InstrInfo.td
===================================================================
--- lib/Target/X86/X86InstrInfo.td
+++ lib/Target/X86/X86InstrInfo.td
@@ -889,6 +889,7 @@
 def FavorMemIndirectCall : Predicate<"!Subtarget->callRegIndirect()">;
 def NotSlowIncDec : Predicate<"!Subtarget->slowIncDec()">;
 def HasFastMem32 : Predicate<"!Subtarget->isUnalignedMem32Slow()">;
+def HasFastLZCNT : Predicate<"Subtarget->hasFastLZCNT()">;
 def HasMFence : Predicate<"Subtarget->hasMFence()">;
 
 //===----------------------------------------------------------------------===//
Index: lib/Target/X86/X86Subtarget.h
===================================================================
--- lib/Target/X86/X86Subtarget.h
+++ lib/Target/X86/X86Subtarget.h
@@ -215,6 +215,9 @@
   /// 64-bit divisions and should be used when possible.
   bool HasSlowDivide64;
 
+  /// True if the LZCNT instruction is fast.
+  bool HasFastLZCNT;
+
   /// True if the short functions should be padded to prevent
   /// a stall when returning too early.
   bool PadShortFunctions;
@@ -444,6 +447,7 @@
   bool hasFastPartialYMMWrite() const { return HasFastPartialYMMWrite; }
   bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; }
   bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; }
+  bool hasFastLZCNT() const { return HasFastLZCNT; }
   bool hasSlowDivide32() const { return HasSlowDivide32; }
   bool hasSlowDivide64() const { return HasSlowDivide64; }
   bool padShortFunctions() const { return PadShortFunctions; }
Index: lib/Target/X86/X86Subtarget.cpp
===================================================================
--- lib/Target/X86/X86Subtarget.cpp
+++ lib/Target/X86/X86Subtarget.cpp
@@ -284,6 +284,7 @@
   HasFastPartialYMMWrite = false;
   HasFastScalarFSQRT = false;
   HasFastVectorFSQRT = false;
+  HasFastLZCNT = false;
   HasSlowDivide32 = false;
   HasSlowDivide64 = false;
   PadShortFunctions = false;
Index: test/CodeGen/X86/lzcnt-zext-cmp.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/lzcnt-zext-cmp.ll
@@ -0,0 +1,151 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; Test patterns which generate lzcnt instructions.
+; E.g.: zext(setcc(cmp)) -> shr(lzcnt)
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=btver2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=btver2 -mattr=-fast-lzcnt | FileCheck --check-prefix=NOFASTLZCNT %s
+
+; Test 32-bit input, 32-bit output.
+define i32 @foo1(i32 %a) {
+; CHECK-LABEL: foo1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    lzcntl %edi, %eax
+; CHECK-NEXT:    shrl $5, %eax
+; CHECK-NEXT:    retq
+;
+; NOFASTLZCNT-LABEL: foo1:
+; NOFASTLZCNT:       # BB#0:
+; NOFASTLZCNT-NEXT:    xorl %eax, %eax
+; NOFASTLZCNT-NEXT:    testl %edi, %edi
+; NOFASTLZCNT-NEXT:    sete %al
+; NOFASTLZCNT-NEXT:    retq
+  %cmp = icmp eq i32 %a, 0
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+; Test 32-bit input, 64-bit output.
+define i64 @foo2(i32 %a) {
+; CHECK-LABEL: foo2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    lzcntl %edi, %eax
+; CHECK-NEXT:    shrl $5, %eax
+; CHECK-NEXT:    retq
+;
+; NOFASTLZCNT-LABEL: foo2:
+; NOFASTLZCNT:       # BB#0:
+; NOFASTLZCNT-NEXT:    xorl %eax, %eax
+; NOFASTLZCNT-NEXT:    testl %edi, %edi
+; NOFASTLZCNT-NEXT:    sete %al
+; NOFASTLZCNT-NEXT:    retq
+  %cmp = icmp eq i32 %a, 0
+  %conv1 = zext i1 %cmp to i64
+  ret i64 %conv1
+}
+
+; Test 64-bit input, 64-bit output.
+define i64 @foo3(i64 %a) {
+; CHECK-LABEL: foo3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    lzcntq %rdi, %rax
+; CHECK-NEXT:    shrq $6, %rax
+; CHECK-NEXT:    retq
+;
+; NOFASTLZCNT-LABEL: foo3:
+; NOFASTLZCNT:       # BB#0:
+; NOFASTLZCNT-NEXT:    xorl %eax, %eax
+; NOFASTLZCNT-NEXT:    testq %rdi, %rdi
+; NOFASTLZCNT-NEXT:    sete %al
+; NOFASTLZCNT-NEXT:    retq
+  %cmp = icmp eq i64 %a, 0
+  %conv1 = zext i1 %cmp to i64
+  ret i64 %conv1
+}
+
+; Test 64-bit input, 32-bit output.
+define i32 @foo4(i64 %a) {
+; CHECK-LABEL: foo4:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    lzcntq %rdi, %rax
+; CHECK-NEXT:    shrq $6, %rax
+; CHECK-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; CHECK-NEXT:    retq
+;
+; NOFASTLZCNT-LABEL: foo4:
+; NOFASTLZCNT:       # BB#0:
+; NOFASTLZCNT-NEXT:    xorl %eax, %eax
+; NOFASTLZCNT-NEXT:    testq %rdi, %rdi
+; NOFASTLZCNT-NEXT:    sete %al
+; NOFASTLZCNT-NEXT:    retq
+  %cmp = icmp eq i64 %a, 0
+  %conv1 = zext i1 %cmp to i32
+  ret i32 %conv1
+}
+
+; Test 16-bit input, 16-bit output.
+; The transform is disabled for the 16-bit case, as we still have to clear
+; the upper 16 bits, adding one more instruction.
+define i16 @foo5(i16 %a) {
+; CHECK-LABEL: foo5:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    testw %di, %di
+; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT:    retq
+;
+; NOFASTLZCNT-LABEL: foo5:
+; NOFASTLZCNT:       # BB#0:
+; NOFASTLZCNT-NEXT:    xorl %eax, %eax
+; NOFASTLZCNT-NEXT:    testw %di, %di
+; NOFASTLZCNT-NEXT:    sete %al
+; NOFASTLZCNT-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; NOFASTLZCNT-NEXT:    retq
+  %cmp = icmp eq i16 %a, 0
+  %conv = zext i1 %cmp to i16
+  ret i16 %conv
+}
+
+; Test that the minsize attribute disables the transform.
+; Function Attrs: minsize
+define i32 @foo6(i32 %a) #0 {
+; CHECK-LABEL: foo6:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    retq
+;
+; NOFASTLZCNT-LABEL: foo6:
+; NOFASTLZCNT:       # BB#0:
+; NOFASTLZCNT-NEXT:    xorl %eax, %eax
+; NOFASTLZCNT-NEXT:    testl %edi, %edi
+; NOFASTLZCNT-NEXT:    sete %al
+; NOFASTLZCNT-NEXT:    retq
+  %cmp = icmp eq i32 %a, 0
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+; Test that the optsize attribute disables the transform.
+; Function Attrs: optsize
+define i32 @foo7(i32 %a) #1 {
+; CHECK-LABEL: foo7:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    retq
+;
+; NOFASTLZCNT-LABEL: foo7:
+; NOFASTLZCNT:       # BB#0:
+; NOFASTLZCNT-NEXT:    xorl %eax, %eax
+; NOFASTLZCNT-NEXT:    testl %edi, %edi
+; NOFASTLZCNT-NEXT:    sete %al
+; NOFASTLZCNT-NEXT:    retq
+  %cmp = icmp eq i32 %a, 0
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+attributes #0 = { minsize }
+attributes #1 = { optsize }
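
For reference, the identity behind the lowering: for an operand of bit width w
(a power of two), ctlz(x) equals w exactly when x == 0 and is strictly less
than w otherwise, so bit log2(w) of the ctlz result is precisely the (x == 0)
predicate. A minimal C++ sketch of the same computation (illustrative only;
cmp_eq_zero is a hypothetical name, not part of the patch):

#include <cstdint>

// Computes (x == 0) with the lzcnt/shr idiom instead of test/sete.
// __builtin_clz(0) is undefined in ISO C++, so the zero case is handled
// explicitly here; the ISD::CTLZ node used by the patch has defined
// semantics for a zero input.
uint32_t cmp_eq_zero(uint32_t x) {
  // lzcnt yields 32 (0b100000) for x == 0, and a value in [0, 31] otherwise.
  uint32_t lz = x ? __builtin_clz(x) : 32;
  // Shifting right by log2(32) = 5 isolates the distinguishing bit.
  return lz >> 5; // 1 iff x == 0
}

This mirrors the two-instruction sequence checked in foo1 above:
lzcntl %edi, %eax followed by shrl $5, %eax.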