Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -107,6 +107,7 @@ bool selectInterpP1F16(MachineInstr &MI) const; bool selectDivScale(MachineInstr &MI) const; bool selectIntrinsicIcmp(MachineInstr &MI) const; + bool selectBallot(MachineInstr &I) const; bool selectG_INTRINSIC(MachineInstr &I) const; bool selectEndCfIntrinsic(MachineInstr &MI) const; Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -891,6 +891,8 @@ return selectDivScale(I); case Intrinsic::amdgcn_icmp: return selectIntrinsicIcmp(I); + case Intrinsic::amdgcn_ballot: + return selectBallot(I); default: return selectImpl(I, *CoverageInfo); } @@ -1039,6 +1041,41 @@ return Ret; } +bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const { + MachineBasicBlock *BB = I.getParent(); + const DebugLoc &DL = I.getDebugLoc(); + Register DstReg = I.getOperand(0).getReg(); + unsigned Size = MRI->getType(DstReg).getSizeInBits(); + const bool Is64 = Size == 64; + + if (Size != STI.getWavefrontSize()) + return false; + + MachineInstr *Value = MRI->getVRegDef(I.getOperand(2).getReg()); + if (Value->getOpcode() == AMDGPU::COPY) + Value = MRI->getVRegDef(Value->getOperand(1).getReg()); + + if (Value->getOpcode() == AMDGPU::G_CONSTANT) { + const APInt &Val = Value->getOperand(1).getCImm()->getValue(); + + if (Val.isNullValue()) { + unsigned Opcode = Is64 ? AMDGPU::V_MOV_B32_e64 : AMDGPU::V_MOV_B32_e32; + BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0); + } else if (Val.isAllOnesValue()) { + unsigned SrcReg = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO; + BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg); + } else + return false; + + } else { + Register SrcReg = I.getOperand(2).getReg(); + BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg); + } + + I.eraseFromParent(); + return true; +} + bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const { // FIXME: Manually selecting to avoid dealiing with the SReg_1 trick // SelectionDAG uses for wave32 vs wave64. Index: llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -2989,6 +2989,7 @@ constrainOpWithReadfirstlane(MI, MRI, 3); // Index return; } + case Intrinsic::amdgcn_ballot: case Intrinsic::amdgcn_interp_p1: case Intrinsic::amdgcn_interp_p2: case Intrinsic::amdgcn_interp_mov: @@ -4158,6 +4159,13 @@ OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32); break; } + case Intrinsic::amdgcn_ballot: { + unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize); + OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize); + break; + } } break; } Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i32.ll @@ -0,0 +1,93 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -global-isel < %s | FileCheck %s + +declare i32 @llvm.amdgcn.ballot.i32(i1) + +; Test ballot(0) + +define i32 @test0() { +; CHECK-LABEL: test0: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: ; implicit-def: $vcc_hi +; CHECK-NEXT: s_setpc_b64 s[30:31] + %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 0) + ret i32 %ballot +} + +; Test ballot(1) + +define i32 @test1() { +; CHECK-LABEL: test1: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 +; CHECK-NEXT: v_mov_b32_e32 v0, exec_lo +; CHECK-NEXT: ; implicit-def: $vcc_hi +; CHECK-NEXT: s_setpc_b64 s[30:31] + %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 1) + ret i32 %ballot +} + +; Test ballot of a non-comparison operation + +define i32 @test2(i32 %x) { +; CHECK-LABEL: test2: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 +; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 +; CHECK-NEXT: ; implicit-def: $vcc_hi +; CHECK-NEXT: v_cmp_ne_u32_e64 s4, 0, v0 +; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %trunc = trunc i32 %x to i1 + %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %trunc) + ret i32 %ballot +} + +; Test ballot of comparisons + +define i32 @test3(i32 %x, i32 %y) { +; CHECK-LABEL: test3: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 +; CHECK-NEXT: v_cmp_eq_u32_e64 s4, v0, v1 +; CHECK-NEXT: ; implicit-def: $vcc_hi +; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %x, %y + %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp) + ret i32 %ballot +} + +define i32 @test4(i32 %x) { +; CHECK-LABEL: test4: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 +; CHECK-NEXT: v_cmp_le_i32_e64 s4, 0x63, v0 +; CHECK-NEXT: ; implicit-def: $vcc_hi +; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp sge i32 %x, 99 + %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp) + ret i32 %ballot +} + +define i32 @test5(float %x, float %y) { +; CHECK-LABEL: test5: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_waitcnt_vscnt null, 0x0 +; CHECK-NEXT: v_cmp_gt_f32_e64 s4, v0, v1 +; CHECK-NEXT: ; implicit-def: $vcc_hi +; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ogt float %x, %y + %ballot = call i32 @llvm.amdgcn.ballot.i32(i1 %cmp) + ret i32 %ballot +} Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ballot.i64.ll @@ -0,0 +1,88 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx900 -global-isel < %s | FileCheck %s + +declare i64 @llvm.amdgcn.ballot.i64(i1) + +; Test ballot(0) + +define i64 @test0() { +; CHECK-LABEL: test0: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 0) + ret i64 %ballot +} + +; Test ballot(1) + +define i64 @test1() { +; CHECK-LABEL: test1: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v0, exec_lo +; CHECK-NEXT: v_mov_b32_e32 v1, exec_hi +; CHECK-NEXT: s_setpc_b64 s[30:31] + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 1) + ret i64 %ballot +} + +; Test ballot of a non-comparison operation + +define i64 @test2(i32 %x) { +; CHECK-LABEL: test2: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 +; CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 +; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: v_mov_b32_e32 v1, s5 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %trunc = trunc i32 %x to i1 + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %trunc) + ret i64 %ballot +} + +; Test ballot of comparisons + +define i64 @test3(i32 %x, i32 %y) { +; CHECK-LABEL: test3: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v0, v1 +; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: v_mov_b32_e32 v1, s5 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp eq i32 %x, %y + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp) + ret i64 %ballot +} + +define i64 @test4(i32 %x) { +; CHECK-LABEL: test4: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v1, 0x63 +; CHECK-NEXT: v_cmp_ge_i32_e64 s[4:5], v0, v1 +; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: v_mov_b32_e32 v1, s5 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %cmp = icmp sge i32 %x, 99 + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp) + ret i64 %ballot +} + +define i64 @test5(float %x, float %y) { +; CHECK-LABEL: test5: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_cmp_gt_f32_e64 s[4:5], v0, v1 +; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: v_mov_b32_e32 v1, s5 +; CHECK-NEXT: s_setpc_b64 s[30:31] + %cmp = fcmp ogt float %x, %y + %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp) + ret i64 %ballot +}