Index: llvm/trunk/lib/Target/NVPTX/NVPTXInstrInfo.td =================================================================== --- llvm/trunk/lib/Target/NVPTX/NVPTXInstrInfo.td +++ llvm/trunk/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -2822,15 +2822,19 @@ // 32-bit has a direct PTX instruction def : Pat<(ctpop Int32Regs:$a), (POPCr32 Int32Regs:$a)>; -// For 64-bit, the result in PTX is actually 32-bit so we zero-extend -// to 64-bit to match the LLVM semantics +// For 64-bit, the result in PTX is actually 32-bit so we zero-extend to 64-bit +// to match the LLVM semantics. Just as with ctlz.i64, we provide a second +// pattern that avoids the type conversion if we're truncating the result to +// i32 anyway. def : Pat<(ctpop Int64Regs:$a), (CVT_u64_u32 (POPCr64 Int64Regs:$a), CvtNONE)>; +def : Pat<(i32 (trunc (ctpop Int64Regs:$a))), (POPCr64 Int64Regs:$a)>; -// For 16-bit, we zero-extend to 32-bit, then trunc the result back -// to 16-bits (ctpop of a 16-bit value is guaranteed to require less -// than 16 bits to store) +// For 16-bit, we zero-extend to 32-bit, then trunc the result back to 16-bits. +// If we know that we're storing into an i32, we can avoid the final trunc. 
def : Pat<(ctpop Int16Regs:$a), (CVT_u16_u32 (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE)), CvtNONE)>; +def : Pat<(i32 (zext (ctpop Int16Regs:$a))), + (POPCr32 (CVT_u32_u16 Int16Regs:$a, CvtNONE))>; // fpround f32 -> f16 def : Pat<(f16 (fpround Float32Regs:$a)), Index: llvm/trunk/test/CodeGen/NVPTX/intrinsics.ll =================================================================== --- llvm/trunk/test/CodeGen/NVPTX/intrinsics.ll +++ llvm/trunk/test/CodeGen/NVPTX/intrinsics.ll @@ -36,8 +36,62 @@ ret i64 %val } +; CHECK-LABEL: test_popc32( +define i32 @test_popc32(i32 %a) { +; CHECK: popc.b32 + %val = call i32 @llvm.ctpop.i32(i32 %a) + ret i32 %val +} + +; CHECK-LABEL: test_popc64 +define i64 @test_popc64(i64 %a) { +; CHECK: popc.b64 +; CHECK: cvt.u64.u32 + %val = call i64 @llvm.ctpop.i64(i64 %a) + ret i64 %val +} + +; NVPTX popc.b64 returns an i32 even though @llvm.ctpop.i64 returns an i64, so +; if this function returns an i32, there's no need to do any type conversions +; in the ptx. +; CHECK-LABEL: test_popc64_trunc +define i32 @test_popc64_trunc(i64 %a) { +; CHECK: popc.b64 +; CHECK-NOT: cvt. + %val = call i64 @llvm.ctpop.i64(i64 %a) + %trunc = trunc i64 %val to i32 + ret i32 %trunc +} + +; llvm.ctpop.i16 is implemented by converting to i32, running popc.b32, and +; then converting back to i16. +; CHECK-LABEL: test_popc16 +define void @test_popc16(i16 %a, i16* %b) { +; CHECK: cvt.u32.u16 +; CHECK: popc.b32 +; CHECK: cvt.u16.u32 + %val = call i16 @llvm.ctpop.i16(i16 %a) + store i16 %val, i16* %b + ret void +} + +; If we call llvm.ctpop.i16 and then zext the result to i32, we shouldn't need +; to do any conversions after calling popc.b32, because that returns an i32. +; CHECK-LABEL: test_popc16_to_32 +define i32 @test_popc16_to_32(i16 %a) { +; CHECK: cvt.u32.u16 +; CHECK: popc.b32 +; CHECK-NOT: cvt. 
+ %val = call i16 @llvm.ctpop.i16(i16 %a) + %zext = zext i16 %val to i32 + ret i32 %zext +} + declare float @llvm.fabs.f32(float) declare double @llvm.fabs.f64(double) declare float @llvm.nvvm.sqrt.f(float) declare i32 @llvm.bitreverse.i32(i32) declare i64 @llvm.bitreverse.i64(i64) +declare i16 @llvm.ctpop.i16(i16) +declare i32 @llvm.ctpop.i32(i32) +declare i64 @llvm.ctpop.i64(i64)