diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -107,6 +107,7 @@
   bool selectInterpP1F16(MachineInstr &MI) const;
   bool selectDivScale(MachineInstr &MI) const;
   bool selectIntrinsicIcmp(MachineInstr &MI) const;
+  bool selectBallot(MachineInstr &I) const;
   bool selectG_INTRINSIC(MachineInstr &I) const;
   bool selectEndCfIntrinsic(MachineInstr &MI) const;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -891,6 +891,8 @@
     return selectDivScale(I);
   case Intrinsic::amdgcn_icmp:
     return selectIntrinsicIcmp(I);
+  case Intrinsic::amdgcn_ballot:
+    return selectBallot(I);
   default:
     return selectImpl(I, *CoverageInfo);
   }
@@ -1039,6 +1041,51 @@
   return Ret;
 }
 
+/// Manually select the llvm.amdgcn.ballot intrinsic.
+///
+/// \p I is the intrinsic instruction: operand 0 is the result register and
+/// operand 2 is the boolean source. Returns true after replacing \p I with the
+/// selected instruction and erasing it. Returns false, leaving \p I in place,
+/// when the result width differs from the subtarget's wavefront size or when
+/// the source folds to a constant other than 0 or -1.
+bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
+  MachineBasicBlock *BB = I.getParent();
+  const DebugLoc &DL = I.getDebugLoc();
+  Register DstReg = I.getOperand(0).getReg();
+  const unsigned Size = MRI->getType(DstReg).getSizeInBits();
+
+  // Only a result exactly as wide as the wavefront is handled here.
+  if (Size != STI.getWavefrontSize())
+    return false;
+
+  const bool Is64 = Size == 64;
+  const Register SrcReg = I.getOperand(2).getReg();
+  Optional<ValueAndVReg> Arg =
+      getConstantVRegValWithLookThrough(SrcReg, *MRI, true);
+
+  if (Arg.hasValue()) {
+    const int64_t Value = Arg.getValue().Value;
+    if (Value == 0) {
+      // Source is the constant 0: materialize a zero of the result width.
+      unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
+      BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
+    } else if (Value == -1) { // all ones
+      // Copy the exec register of matching width. EXEC and EXEC_LO are
+      // physical registers, so the operand must not carry a subregister
+      // index; the width is chosen by picking the register itself.
+      Register ExecReg = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
+      BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(ExecReg);
+    } else
+      return false;
+  } else {
+    // Non-constant source: the result is a plain copy of the source register.
+    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
+  }
+
+  I.eraseFromParent();
+  return true;
+}
+
 bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
   // FIXME: Manually selecting to avoid dealiing with the SReg_1 trick
   // SelectionDAG uses for wave32 vs wave64.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -2989,6 +2989,7 @@
       constrainOpWithReadfirstlane(MI, MRI, 3); // Index
       return;
     }
+    case Intrinsic::amdgcn_ballot:
     case Intrinsic::amdgcn_interp_p1:
     case Intrinsic::amdgcn_interp_p2:
     case Intrinsic::amdgcn_interp_mov:
@@ -4160,6 +4161,13 @@
       OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
       break;
     }
+    case Intrinsic::amdgcn_ballot: {
+      unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+      unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
+      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
+      OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize);
+      break;
+    }
     }
     break;
   }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll
@@ -0,0 +1,77 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -global-isel < %s | FileCheck %s
+
+declare i32 @llvm.amdgcn.ballot.i32(i1)
+
+; Test ballot(0)
+
+define amdgpu_cs i32 @constant_false() {
+; CHECK-LABEL: 
constant_false: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_mov_b32 s0, 0 +; CHECK-NEXT: ; implicit-def: $vcc_hi +; CHECK-NEXT: ; return to shader part epilog + %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 0) + ret i32 %ballot +} + +; Test ballot(1) + +define amdgpu_cs i32 @constant_true() { +; CHECK-LABEL: constant_true: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_mov_b32 s0, exec_lo +; CHECK-NEXT: ; implicit-def: $vcc_hi +; CHECK-NEXT: ; return to shader part epilog + %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 1) + ret i32 %ballot +} + +; Test ballot of a non-comparison operation + +define amdgpu_cs i32 @non_compare(i32 %x) { +; CHECK-LABEL: non_compare: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 +; CHECK-NEXT: ; implicit-def: $vcc_hi +; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, v0 +; CHECK-NEXT: ; return to shader part epilog + %trunc = trunc i32 %x to i1 + %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %trunc) + ret i32 %ballot +} + +; Test ballot of comparisons + +define amdgpu_cs i32 @compare_ints(i32 %x, i32 %y) { +; CHECK-LABEL: compare_ints: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_cmp_eq_u32_e64 s0, v0, v1 +; CHECK-NEXT: ; implicit-def: $vcc_hi +; CHECK-NEXT: ; return to shader part epilog + %cmp = icmp eq i32 %x, %y + %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp) + ret i32 %ballot +} + +define amdgpu_cs i32 @compare_int_with_constant(i32 %x) { +; CHECK-LABEL: compare_int_with_constant: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_cmp_le_i32_e64 s0, 0x63, v0 +; CHECK-NEXT: ; implicit-def: $vcc_hi +; CHECK-NEXT: ; return to shader part epilog + %cmp = icmp sge i32 %x, 99 + %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp) + ret i32 %ballot +} + +define amdgpu_cs i32 @compare_floats(float %x, float %y) { +; CHECK-LABEL: compare_floats: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_cmp_gt_f32_e64 s0, v0, v1 +; CHECK-NEXT: ; implicit-def: $vcc_hi +; CHECK-NEXT: ; return to shader part epilog + %cmp = fcmp ogt float %x, %y + %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp) + 
ret i32 %ballot +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll @@ -0,0 +1,73 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx900 -global-isel < %s | FileCheck %s + +declare i64 @llvm.amdgcn.ballot.i64(i1) + +; Test ballot(0) + +define amdgpu_cs i64 @constant_false() { +; CHECK-LABEL: constant_false: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_mov_b32 s0, 0 +; CHECK-NEXT: s_mov_b32 s1, 0 +; CHECK-NEXT: ; return to shader part epilog + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 0) + ret i64 %ballot +} + +; Test ballot(1) + +define amdgpu_cs i64 @constant_true() { +; CHECK-LABEL: constant_true: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_mov_b64 s[0:1], exec +; CHECK-NEXT: ; return to shader part epilog + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 1) + ret i64 %ballot +} + +; Test ballot of a non-comparison operation + +define amdgpu_cs i64 @non_compare(i32 %x) { +; CHECK-LABEL: non_compare: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 +; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v0 +; CHECK-NEXT: ; return to shader part epilog + %trunc = trunc i32 %x to i1 + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %trunc) + ret i64 %ballot +} + +; Test ballot of comparisons + +define amdgpu_cs i64 @compare_ints(i32 %x, i32 %y) { +; CHECK-LABEL: compare_ints: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_cmp_eq_u32_e64 s[0:1], v0, v1 +; CHECK-NEXT: ; return to shader part epilog + %cmp = icmp eq i32 %x, %y + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp) + ret i64 %ballot +} + +define amdgpu_cs i64 @compare_int_with_constant(i32 %x) { +; CHECK-LABEL: compare_int_with_constant: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_mov_b32_e32 v1, 0x63 +; CHECK-NEXT: v_cmp_ge_i32_e64 s[0:1], v0, v1 +; CHECK-NEXT: ; return to shader part 
epilog + %cmp = icmp sge i32 %x, 99 + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp) + ret i64 %ballot +} + +define amdgpu_cs i64 @compare_floats(float %x, float %y) { +; CHECK-LABEL: compare_floats: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_cmp_gt_f32_e64 s[0:1], v0, v1 +; CHECK-NEXT: ; return to shader part epilog + %cmp = fcmp ogt float %x, %y + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp) + ret i64 %ballot +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i32.ll @@ -5,44 +5,37 @@ ; Test ballot(0) -define i32 @test0() { -; CHECK-LABEL: test0: +define amdgpu_cs i32 @constant_false() { +; CHECK-LABEL: constant_false: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 -; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: s_mov_b32 s0, 0 ; CHECK-NEXT: ; implicit-def: $vcc_hi -; CHECK-NEXT: s_setpc_b64 s[30:31] +; CHECK-NEXT: ; return to shader part epilog %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 0) ret i32 %ballot } ; Test ballot(1) -define i32 @test1() { -; CHECK-LABEL: test1: +define amdgpu_cs i32 @constant_true() { +; CHECK-LABEL: constant_true: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 -; CHECK-NEXT: v_mov_b32_e32 v0, exec_lo +; CHECK-NEXT: s_mov_b32 s0, exec_lo ; CHECK-NEXT: ; implicit-def: $vcc_hi -; CHECK-NEXT: s_setpc_b64 s[30:31] +; CHECK-NEXT: ; return to shader part epilog %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 1) ret i32 %ballot } ; Test ballot of a non-comparison operation -define i32 @test2(i32 %x) { -; CHECK-LABEL: test2: +define amdgpu_cs i32 @non_compare(i32 %x) { +; CHECK-LABEL: non_compare: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 ; CHECK-NEXT: 
v_and_b32_e32 v0, 1, v0 ; CHECK-NEXT: ; implicit-def: $vcc_hi -; CHECK-NEXT: v_cmp_ne_u32_e64 s4, 0, v0 -; CHECK-NEXT: v_mov_b32_e32 v0, s4 -; CHECK-NEXT: s_setpc_b64 s[30:31] +; CHECK-NEXT: v_cmp_ne_u32_e64 s0, 0, v0 +; CHECK-NEXT: ; return to shader part epilog %trunc = trunc i32 %x to i1 %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %trunc) ret i32 %ballot @@ -50,43 +43,34 @@ ; Test ballot of comparisons -define i32 @test3(i32 %x, i32 %y) { -; CHECK-LABEL: test3: +define amdgpu_cs i32 @compare_ints(i32 %x, i32 %y) { +; CHECK-LABEL: compare_ints: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 -; CHECK-NEXT: v_cmp_eq_u32_e64 s4, v0, v1 +; CHECK-NEXT: v_cmp_eq_u32_e64 s0, v0, v1 ; CHECK-NEXT: ; implicit-def: $vcc_hi -; CHECK-NEXT: v_mov_b32_e32 v0, s4 -; CHECK-NEXT: s_setpc_b64 s[30:31] +; CHECK-NEXT: ; return to shader part epilog %cmp = icmp eq i32 %x, %y %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp) ret i32 %ballot } -define i32 @test4(i32 %x) { -; CHECK-LABEL: test4: +define amdgpu_cs i32 @compare_int_with_constant(i32 %x) { +; CHECK-LABEL: compare_int_with_constant: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 -; CHECK-NEXT: v_cmp_lt_i32_e64 s4, 0x62, v0 +; CHECK-NEXT: v_cmp_lt_i32_e64 s0, 0x62, v0 ; CHECK-NEXT: ; implicit-def: $vcc_hi -; CHECK-NEXT: v_mov_b32_e32 v0, s4 -; CHECK-NEXT: s_setpc_b64 s[30:31] +; CHECK-NEXT: ; return to shader part epilog %cmp = icmp sge i32 %x, 99 %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp) ret i32 %ballot } -define i32 @test5(float %x, float %y) { -; CHECK-LABEL: test5: +define amdgpu_cs i32 @compare_floats(float %x, float %y) { +; CHECK-LABEL: compare_floats: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 -; CHECK-NEXT: v_cmp_gt_f32_e64 s4, v0, v1 +; CHECK-NEXT: v_cmp_gt_f32_e64 s0, v0, v1 ; CHECK-NEXT: ; 
implicit-def: $vcc_hi -; CHECK-NEXT: v_mov_b32_e32 v0, s4 -; CHECK-NEXT: s_setpc_b64 s[30:31] +; CHECK-NEXT: ; return to shader part epilog %cmp = fcmp ogt float %x, %y %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp) ret i32 %ballot diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.ll @@ -5,41 +5,36 @@ ; Test ballot(0) -define i64 @test0() { -; CHECK-LABEL: test0: +define amdgpu_cs i64 @constant_false() { +; CHECK-LABEL: constant_false: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, 0 -; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: s_setpc_b64 s[30:31] +; CHECK-NEXT: s_mov_b32 s0, 0 +; CHECK-NEXT: s_mov_b32 s1, 0 +; CHECK-NEXT: ; return to shader part epilog %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 0) ret i64 %ballot } ; Test ballot(1) -define i64 @test1() { -; CHECK-LABEL: test1: +define amdgpu_cs i64 @constant_true() { +; CHECK-LABEL: constant_true: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, exec_lo -; CHECK-NEXT: v_mov_b32_e32 v1, exec_hi -; CHECK-NEXT: s_setpc_b64 s[30:31] +; CHECK-NEXT: s_mov_b32 s0, exec_lo +; CHECK-NEXT: s_mov_b32 s1, exec_hi +; CHECK-NEXT: ; return to shader part epilog %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 1) ret i64 %ballot } ; Test ballot of a non-comparison operation -define i64 @test2(i32 %x) { -; CHECK-LABEL: test2: +define amdgpu_cs i64 @non_compare(i32 %x) { +; CHECK-LABEL: non_compare: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 -; CHECK-NEXT: v_mov_b32_e32 v0, s4 -; CHECK-NEXT: v_mov_b32_e32 v1, s5 -; CHECK-NEXT: s_setpc_b64 s[30:31] +; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v0 +; CHECK-NEXT: ; return to 
shader part epilog %trunc = trunc i32 %x to i1 %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %trunc) ret i64 %ballot @@ -47,41 +42,32 @@ ; Test ballot of comparisons -define i64 @test3(i32 %x, i32 %y) { -; CHECK-LABEL: test3: +define amdgpu_cs i64 @compare_ints(i32 %x, i32 %y) { +; CHECK-LABEL: compare_ints: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v0, v1 -; CHECK-NEXT: v_mov_b32_e32 v0, s4 -; CHECK-NEXT: v_mov_b32_e32 v1, s5 -; CHECK-NEXT: s_setpc_b64 s[30:31] +; CHECK-NEXT: v_cmp_eq_u32_e64 s[0:1], v0, v1 +; CHECK-NEXT: ; return to shader part epilog %cmp = icmp eq i32 %x, %y %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp) ret i64 %ballot } -define i64 @test4(i32 %x) { -; CHECK-LABEL: test4: +define amdgpu_cs i64 @compare_int_with_constant(i32 %x) { +; CHECK-LABEL: compare_int_with_constant: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_movk_i32 s4, 0x62 -; CHECK-NEXT: v_cmp_lt_i32_e64 s[4:5], s4, v0 -; CHECK-NEXT: v_mov_b32_e32 v0, s4 -; CHECK-NEXT: v_mov_b32_e32 v1, s5 -; CHECK-NEXT: s_setpc_b64 s[30:31] +; CHECK-NEXT: s_movk_i32 s0, 0x62 +; CHECK-NEXT: v_cmp_lt_i32_e64 s[0:1], s0, v0 +; CHECK-NEXT: ; return to shader part epilog %cmp = icmp sge i32 %x, 99 %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp) ret i64 %ballot } -define i64 @test5(float %x, float %y) { -; CHECK-LABEL: test5: +define amdgpu_cs i64 @compare_floats(float %x, float %y) { +; CHECK-LABEL: compare_floats: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_cmp_gt_f32_e64 s[4:5], v0, v1 -; CHECK-NEXT: v_mov_b32_e32 v0, s4 -; CHECK-NEXT: v_mov_b32_e32 v1, s5 -; CHECK-NEXT: s_setpc_b64 s[30:31] +; CHECK-NEXT: v_cmp_gt_f32_e64 s[0:1], v0, v1 +; CHECK-NEXT: ; return to shader part epilog %cmp = fcmp ogt float %x, %y %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp) ret i64 %ballot