Index: llvm/lib/Target/NVPTX/NVPTXISelLowering.h
===================================================================
--- llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -517,6 +517,12 @@
 
   bool enableAggressiveFMAFusion(EVT VT) const override { return true; }
 
+  // The default is to transform llvm.ctlz(x, false) (where false indicates that
+  // x == 0 is not undefined behavior) into a branch that checks whether x is 0
+  // and avoids calling ctlz in that case.  We have a dedicated ctlz
+  // instruction, so we say that ctlz is cheap to speculate.
+  bool isCheapToSpeculateCtlz() const override { return true; }
+
 private:
   const NVPTXSubtarget &STI; // cache the subtarget here
   SDValue getParamSymbol(SelectionDAG &DAG, int idx, EVT) const;
Index: llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
===================================================================
--- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -2616,18 +2616,21 @@
 // 32-bit has a direct PTX instruction
 def : Pat<(ctlz Int32Regs:$a), (CLZr32 Int32Regs:$a)>;
 
-// For 64-bit, the result in PTX is actually 32-bit so we zero-extend
-// to 64-bit to match the LLVM semantics
+// The return type of the ctlz ISD node is the same as its input, but the PTX
+// ctz instruction always returns a 32-bit value.  For ctlz.i64, convert the
+// ptx value to 64 bits to match the ISD node's semantics, unless we know we're
+// truncating back down to 32 bits.
 def : Pat<(ctlz Int64Regs:$a), (CVT_u64_u32 (CLZr64 Int64Regs:$a), CvtNONE)>;
+def : Pat<(i32 (trunc (ctlz Int64Regs:$a))), (CLZr64 Int64Regs:$a)>;
 
-// For 16-bit, we zero-extend to 32-bit, then trunc the result back
-// to 16-bits (ctlz of a 16-bit value is guaranteed to require less
-// than 16 bits to store). We also need to subtract 16 because the
-// high-order 16 zeros were counted.
+// For 16-bit ctlz, we zero-extend to 32-bit, perform the count, then trunc the
+// result back to 16-bits if necessary.  We also need to subtract 16 because
+// the high-order 16 zeros were counted.
 def : Pat<(ctlz Int16Regs:$a),
-          (SUBi16ri (CVT_u16_u32 (CLZr32
-            (CVT_u32_u16 Int16Regs:$a, CvtNONE)),
-           CvtNONE), 16)>;
+          (SUBi16ri (CVT_u16_u32
+           (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE), 16)>;
+def : Pat<(i32 (zext (ctlz Int16Regs:$a))),
+          (SUBi32ri (CLZr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), 16)>;
 
 // Population count
 let hasSideEffects = 0 in {
Index: llvm/test/CodeGen/NVPTX/ctlz.ll
===================================================================
--- llvm/test/CodeGen/NVPTX/ctlz.ll
+++ llvm/test/CodeGen/NVPTX/ctlz.ll
@@ -6,39 +6,127 @@
 declare i32 @llvm.ctlz.i32(i32, i1) readnone
 declare i64 @llvm.ctlz.i64(i64, i1) readnone
 
+; There should be no difference between llvm.ctlz.i32(%a, true) and
+; llvm.ctlz.i32(%a, false), as ptx's clz(0) is defined to return 0.
+
+; CHECK-LABEL: myctpop(
 define i32 @myctpop(i32 %a) {
-; CHECK: clz.b32
+; CHECK: ld.param.
+; CHECK-NEXT: clz.b32
+; CHECK-NEXT: st.param.
+; CHECK-NEXT: ret;
   %val = call i32 @llvm.ctlz.i32(i32 %a, i1 false) readnone
   ret i32 %val
 }
-
-define i16 @myctpop16(i16 %a) {
-; CHECK: clz.b32
-  %val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
-  ret i16 %val
-}
-
-define i64 @myctpop64(i64 %a) {
-; CHECK: clz.b64
-  %val = call i64 @llvm.ctlz.i64(i64 %a, i1 false) readnone
-  ret i64 %val
-}
-
-
+; CHECK-LABEL: myctpop_2(
 define i32 @myctpop_2(i32 %a) {
-; CHECK: clz.b32
+; CHECK: ld.param.
+; CHECK-NEXT: clz.b32
+; CHECK-NEXT: st.param.
+; CHECK-NEXT: ret;
   %val = call i32 @llvm.ctlz.i32(i32 %a, i1 true) readnone
   ret i32 %val
 }
 
-define i16 @myctpop16_2(i16 %a) {
-; CHECK: clz.b32
+; PTX's clz.b64 returns a 32-bit value, but LLVM's intrinsic returns a 64-bit
+; value, so here we have to zero-extend it.
+; CHECK-LABEL: myctpop64(
+define i64 @myctpop64(i64 %a) {
+; CHECK: ld.param.
+; CHECK-NEXT: clz.b64
+; CHECK-NEXT: cvt.u64.u32
+; CHECK-NEXT: st.param.
+; CHECK-NEXT: ret;
+  %val = call i64 @llvm.ctlz.i64(i64 %a, i1 false) readnone
+  ret i64 %val
+}
+; CHECK-LABEL: myctpop64_2(
+define i64 @myctpop64_2(i64 %a) {
+; CHECK: ld.param.
+; CHECK-NEXT: clz.b64
+; CHECK-NEXT: cvt.u64.u32
+; CHECK-NEXT: st.param.
+; CHECK-NEXT: ret;
+  %val = call i64 @llvm.ctlz.i64(i64 %a, i1 true) readnone
+  ret i64 %val
+}
+
+; Here we truncate the 64-bit value of LLVM's ctlz intrinsic to 32 bits, the
+; natural return width of ptx's clz.b64 instruction.  No conversions should be
+; necessary in the PTX.
+; CHECK-LABEL: myctpop64_as_32(
+define i32 @myctpop64_as_32(i64 %a) {
+; CHECK: ld.param.
+; CHECK-NEXT: clz.b64
+; CHECK-NEXT: st.param.
+; CHECK-NEXT: ret;
+  %val = call i64 @llvm.ctlz.i64(i64 %a, i1 false) readnone
+  %trunc = trunc i64 %val to i32
+  ret i32 %trunc
+}
+; CHECK-LABEL: myctpop64_as_32_2(
+define i32 @myctpop64_as_32_2(i64 %a) {
+; CHECK: ld.param.
+; CHECK-NEXT: clz.b64
+; CHECK-NEXT: st.param.
+; CHECK-NEXT: ret;
+  %val = call i64 @llvm.ctlz.i64(i64 %a, i1 false) readnone
+  %trunc = trunc i64 %val to i32
+  ret i32 %trunc
+}
+
+; ctlz.i16 is implemented by extending the input to i32, computing the result,
+; and then truncating the result back down to i16.  But the NVPTX ABI
+; zero-extends i16 return values to i32, so the final truncation doesn't appear
+; in this function.
+; CHECK-LABEL: myctpop_ret16(
+define i16 @myctpop_ret16(i16 %a) {
+; CHECK: ld.param.
+; CHECK-NEXT: cvt.u32.u16
+; CHECK-NEXT: clz.b32
+; CHECK-NEXT: sub.
+; CHECK-NEXT: st.param.
+; CHECK-NEXT: ret;
+  %val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
+  ret i16 %val
+}
+; CHECK-LABEL: myctpop_ret16_2(
+define i16 @myctpop_ret16_2(i16 %a) {
+; CHECK: ld.param.
+; CHECK-NEXT: cvt.u32.u16
+; CHECK-NEXT: clz.b32
+; CHECK-NEXT: sub.
+; CHECK-NEXT: st.param.
+; CHECK-NEXT: ret;
   %val = call i16 @llvm.ctlz.i16(i16 %a, i1 true) readnone
   ret i16 %val
 }
 
-define i64 @myctpop64_2(i64 %a) {
-; CHECK: clz.b64
-  %val = call i64 @llvm.ctlz.i64(i64 %a, i1 true) readnone
-  ret i64 %val
+; Here we store the result of ctlz.16 into an i16 pointer, so the trunc should
+; remain.
+; CHECK-LABEL: myctpop_store16(
+define void @myctpop_store16(i16 %a, i16* %b) {
+; CHECK: ld.param.
+; CHECK-NEXT: cvt.u32.u16
+; CHECK-NET: clz.b32
+; CHECK-DAG: cvt.u16.u32
+; CHECK-DAG: sub.
+; CHECK: st.{{[a-z]}}16
+; CHECK: ret;
+  %val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
+  store i16 %val, i16* %b
+  ret void
+}
+; CHECK-LABEL: myctpop_store16_2(
+define void @myctpop_store16_2(i16 %a, i16* %b) {
+; CHECK: ld.param.
+; CHECK-NEXT: cvt.u32.u16
+; CHECK-NET: clz.b32
+; CHECK-DAG: cvt.u16.u32
+; CHECK-DAG: sub.
+; CHECK: st.{{[a-z]}}16
+; CHECK: ret;
+  %val = call i16 @llvm.ctlz.i16(i16 %a, i1 false) readnone
+  store i16 %val, i16* %b
+  ret void
 }