diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -846,6 +846,8 @@
         AMDGPUAS::CONSTANT_ADDRESS_32BIT)
       return true;
     return false;
+  case AMDGPUISD::SETCC: // ballot-style compare: wave-wide lane mask
+    return true;
   }
   return false;
 }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll
@@ -71,21 +71,14 @@
   ret i32 %ballot
 }
 
-define amdgpu_cs float @ctpop_of_ballot(i32 %x, i32 %y) {
+define amdgpu_cs i32 @ctpop_of_ballot(float %x, float %y) {
 ; CHECK-LABEL: ctpop_of_ballot:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc_lo, v0, v1
-
-; TODO: This should use a scalar s_bcnt1 instruction.
-; NOTE: The final mul is cruft to prevent a "bad VGPR to SGPR copy" error
-
-; CHECK-NEXT:    v_bcnt_u32_b32 v1, vcc_lo, 0
-; CHECK-NEXT:    v_mul_lo_u32 v0, v0, v1
+; CHECK-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v0, v1
+; CHECK-NEXT:    s_bcnt1_i32_b32 s0, vcc_lo
 ; CHECK-NEXT:    ; return to shader part epilog
-  %cmp = icmp ugt i32 %x, %y
+  %cmp = fcmp ogt float %x, %y
   %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp)
   %bcnt = call i32 @llvm.ctpop.i32(i32 %ballot)
-  %r.i = mul i32 %x, %bcnt
-  %r = bitcast i32 %r.i to float
-  ret float %r
+  ret i32 %bcnt
 }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll
@@ -74,23 +74,15 @@
   ret i64 %ballot
 }
 
-define amdgpu_cs float @ctpop_of_ballot(i32 %x, i32 %y) {
+define amdgpu_cs i64 @ctpop_of_ballot(float %x, float %y) {
 ; CHECK-LABEL: ctpop_of_ballot:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, v0, v1
-
-; TODO: This should use a scalar s_bcnt1 instruction.
-; NOTE: The final mul is cruft to prevent a "bad VGPR to SGPR copy" error
-
-; CHECK-NEXT:    v_bcnt_u32_b32 v1, vcc_lo, 0
-; CHECK-NEXT:    v_bcnt_u32_b32 v1, vcc_hi, v1 
-; CHECK-NEXT:    v_mul_lo_u32 v0, v0, v1
+; CHECK-NEXT:    v_cmp_gt_f32_e32 vcc, v0, v1
+; CHECK-NEXT:    s_bcnt1_i32_b64 s0, vcc
+; CHECK-NEXT:    s_mov_b32 s1, 0
 ; CHECK-NEXT:    ; return to shader part epilog
-  %cmp = icmp ugt i32 %x, %y
+  %cmp = fcmp ogt float %x, %y
   %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
   %bcnt = call i64 @llvm.ctpop.i64(i64 %ballot)
-  %bcnt.32 = trunc i64 %bcnt to i32
-  %r.i = mul i32 %x, %bcnt.32
-  %r = bitcast i32 %r.i to float
-  ret float %r
+  ret i64 %bcnt
 }