Index: llvm/trunk/lib/Target/NVPTX/NVPTXInstrInfo.td
===================================================================
--- llvm/trunk/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ llvm/trunk/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -2822,15 +2822,19 @@
 // 32-bit has a direct PTX instruction
 def : Pat<(ctpop Int32Regs:$a), (POPCr32 Int32Regs:$a)>;
 
-// For 64-bit, the result in PTX is actually 32-bit so we zero-extend
-// to 64-bit to match the LLVM semantics
+// For 64-bit, the PTX popc result is actually 32-bit, so we zero-extend it to
+// 64-bit to match the LLVM semantics.  Just as with ctlz.i64, we provide a
+// second pattern that skips the conversion when the result is truncated to
+// i32 anyway.
 def : Pat<(ctpop Int64Regs:$a), (CVT_u64_u32 (POPCr64 Int64Regs:$a), CvtNONE)>;
+def : Pat<(i32 (trunc (ctpop Int64Regs:$a))), (POPCr64 Int64Regs:$a)>;
 
-// For 16-bit, we zero-extend to 32-bit, then trunc the result back
-// to 16-bits (ctpop of a 16-bit value is guaranteed to require less
-// than 16 bits to store)
+// For 16-bit, we zero-extend to 32-bit, then trunc the result back to 16 bits.
+// If the result is zero-extended to i32 anyway, we can skip the final trunc.
 def : Pat<(ctpop Int16Regs:$a),
           (CVT_u16_u32 (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE)>;
+def : Pat<(i32 (zext (ctpop Int16Regs:$a))),
+          (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE))>;
 
 // fpround f32 -> f16
 def : Pat<(f16 (fpround Float32Regs:$a)),
Index: llvm/trunk/test/CodeGen/NVPTX/intrinsics.ll
===================================================================
--- llvm/trunk/test/CodeGen/NVPTX/intrinsics.ll
+++ llvm/trunk/test/CodeGen/NVPTX/intrinsics.ll
@@ -36,8 +36,62 @@
   ret i64 %val
 }
 
+; CHECK-LABEL: test_popc32(
+define i32 @test_popc32(i32 %a) { ; i32 ctpop is expected to lower to popc.b32.
+; CHECK: popc.b32
+  %val = call i32 @llvm.ctpop.i32(i32 %a)
+  ret i32 %val
+}
+
+; CHECK-LABEL: test_popc64(
+define i64 @test_popc64(i64 %a) { ; popc.b64's 32-bit result is widened by cvt.
+; CHECK: popc.b64
+; CHECK: cvt.u64.u32
+  %val = call i64 @llvm.ctpop.i64(i64 %a)
+  ret i64 %val
+}
+
+; NVPTX popc.b64 returns an i32 even though @llvm.ctpop.i64 returns an i64, so
+; if this function returns an i32, there's no need to do any type conversions
+; in the PTX.
+; CHECK-LABEL: test_popc64_trunc(
+define i32 @test_popc64_trunc(i64 %a) {
+; CHECK: popc.b64
+; CHECK-NOT: cvt.
+  %val = call i64 @llvm.ctpop.i64(i64 %a)
+  %trunc = trunc i64 %val to i32
+  ret i32 %trunc
+}
+
+; llvm.ctpop.i16 is implemented by converting to i32, running popc.b32, and
+; then converting back to i16.
+; CHECK-LABEL: test_popc16(
+define void @test_popc16(i16 %a, i16* %b) {
+; CHECK: cvt.u32.u16
+; CHECK: popc.b32
+; CHECK: cvt.u16.u32
+  %val = call i16 @llvm.ctpop.i16(i16 %a)
+  store i16 %val, i16* %b
+  ret void
+}
+
+; If we call llvm.ctpop.i16 and then zext the result to i32, we shouldn't need
+; to do any conversions after calling popc.b32, because that returns an i32.
+; CHECK-LABEL: test_popc16_to_32(
+define i32 @test_popc16_to_32(i16 %a) {
+; CHECK: cvt.u32.u16
+; CHECK: popc.b32
+; CHECK-NOT: cvt.
+  %val = call i16 @llvm.ctpop.i16(i16 %a)
+  %zext = zext i16 %val to i32
+  ret i32 %zext
+}
+
 declare float @llvm.fabs.f32(float)
 declare double @llvm.fabs.f64(double)
 declare float @llvm.nvvm.sqrt.f(float)
 declare i32 @llvm.bitreverse.i32(i32)
 declare i64 @llvm.bitreverse.i64(i64)
+declare i16 @llvm.ctpop.i16(i16)
+declare i32 @llvm.ctpop.i32(i32)
+declare i64 @llvm.ctpop.i64(i64)