diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1662,6 +1662,12 @@
     Intrinsic<[llvm_anyint_ty], [llvm_i1_ty],
               [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
 
+// Takes an integer mask (expected to be wavefront-sized and uniform) and
+// returns an i1 whose per-lane value is the mask bit for that lane.
+def int_amdgcn_inverse_ballot :
+  Intrinsic<[llvm_i1_ty], [llvm_anyint_ty],
+            [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+
 def int_amdgcn_readfirstlane :
   ClangBuiltin<"__builtin_amdgcn_readfirstlane">,
   Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -2528,6 +2528,19 @@
   case Intrinsic::amdgcn_interp_p1_f16:
     SelectInterpP1F16(N);
     return;
+  case Intrinsic::amdgcn_inverse_ballot:
+    // Choose the pseudo by mask width; it is expanded by the custom inserter.
+    switch (N->getOperand(1).getValueSizeInBits()) {
+    case 32:
+      Opcode = AMDGPU::S_INVERSE_BALLOT_U32;
+      break;
+    case 64:
+      Opcode = AMDGPU::S_INVERSE_BALLOT_U64;
+      break;
+    default:
+      llvm_unreachable("Unsupported size for inverse ballot mask.");
+    }
+    break;
   default:
     SelectCode(N);
     return;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -112,6 +112,7 @@
   bool selectDivScale(MachineInstr &MI) const;
   bool selectIntrinsicCmp(MachineInstr &MI) const;
   bool selectBallot(MachineInstr &I) const;
+  bool selectInverseBallot(MachineInstr &I) const;
   bool selectRelocConstant(MachineInstr &I) const;
   bool selectGroupStaticSize(MachineInstr &I) const;
   bool selectReturnAddress(MachineInstr &I) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1046,6 +1046,8 @@
     return selectIntrinsicCmp(I);
   case Intrinsic::amdgcn_ballot:
     return selectBallot(I);
+  case Intrinsic::amdgcn_inverse_ballot:
+    return selectInverseBallot(I);
   case Intrinsic::amdgcn_reloc_constant:
     return selectRelocConstant(I);
   case Intrinsic::amdgcn_groupstaticsize:
@@ -1351,6 +1353,54 @@
   return true;
 }
 
+/// Select llvm.amdgcn.inverse.ballot. The result is the mask operand itself,
+/// so an SGPR mask only needs a COPY; a VGPR mask is first moved to SGPRs with
+/// V_READFIRSTLANE_B32, one 32-bit half at a time.
+bool AMDGPUInstructionSelector::selectInverseBallot(MachineInstr &I) const {
+  MachineBasicBlock *BB = I.getParent();
+  const DebugLoc &DL = I.getDebugLoc();
+  const Register DstReg = I.getOperand(0).getReg();
+  const Register MaskReg = I.getOperand(2).getReg();
+
+  const unsigned Size = MRI->getType(MaskReg).getSizeInBits();
+  assert(Size == STI.getWavefrontSize() &&
+         "Mask is not the same size as wavefront.");
+
+  const RegisterBank *MaskBank = RBI.getRegBank(MaskReg, *MRI, TRI);
+  if (MaskBank->getID() != AMDGPU::VGPRRegBankID) {
+    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(MaskReg);
+    I.eraseFromParent();
+    return true;
+  }
+
+  if (Size == 32) {
+    // A single readfirstlane can write the destination directly.
+    BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
+        .addReg(MaskReg);
+    I.eraseFromParent();
+    return true;
+  }
+
+  // Name the sub-registers explicitly rather than computing them from
+  // AMDGPU::sub0 with an offset that depends on the generated enum layout.
+  constexpr unsigned SubRegs[2] = {AMDGPU::sub0, AMDGPU::sub1};
+  Register DstParts[2];
+  for (unsigned Part = 0; Part != 2; ++Part) {
+    DstParts[Part] = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+    BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), DstParts[Part])
+        .addReg(MaskReg, 0, SubRegs[Part]);
+  }
+
+  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
+      .addReg(DstParts[0])
+      .addImm(AMDGPU::sub0)
+      .addReg(DstParts[1])
+      .addImm(AMDGPU::sub1);
+
+  I.eraseFromParent();
+  return true;
+}
+
 bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
   Register DstReg = I.getOperand(0).getReg();
   const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4494,6 +4494,15 @@
       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize);
       break;
     }
+    case Intrinsic::amdgcn_inverse_ballot: {
+      // This must be an SGPR, but accept a VGPR.
+      Register MaskReg = MI.getOperand(2).getReg();
+      unsigned MaskSize = MRI.getType(MaskReg).getSizeInBits();
+      unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
+      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
+      OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
+      break;
+    }
     }
     break;
   }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -376,6 +376,7 @@
 def : SourceOfDivergence;
 def : SourceOfDivergence;
 def : SourceOfDivergence;
+def : SourceOfDivergence<int_amdgcn_inverse_ballot>;
 
 foreach intr = AMDGPUImageDimAtomicIntrinsics in
 def : SourceOfDivergence<intr>;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4477,6 +4477,51 @@
 
     return BB;
   }
+  case AMDGPU::S_INVERSE_BALLOT_U32:
+  case AMDGPU::S_INVERSE_BALLOT_U64: {
+    // Expand the inverse-ballot pseudo: copy an SGPR mask as is, or read a
+    // VGPR mask into SGPRs with V_READFIRSTLANE_B32, 32 bits at a time.
+    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+    const SIRegisterInfo *TRI = ST.getRegisterInfo();
+    const DebugLoc &DL = MI.getDebugLoc();
+    const Register DstReg = MI.getOperand(0).getReg();
+    const Register MaskReg = MI.getOperand(1).getReg();
+
+    if (!TRI->isVectorRegister(MRI, MaskReg)) {
+      BuildMI(*BB, &MI, DL, TII->get(AMDGPU::COPY), DstReg).addReg(MaskReg);
+      MI.eraseFromParent();
+      return BB;
+    }
+
+    if (MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U32) {
+      // A single readfirstlane can write the destination directly.
+      BuildMI(*BB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
+          .addReg(MaskReg);
+      MI.eraseFromParent();
+      return BB;
+    }
+
+    // Name the sub-registers explicitly rather than computing them from
+    // AMDGPU::sub0 with an offset that depends on the generated enum layout.
+    constexpr unsigned SubRegs[2] = {AMDGPU::sub0, AMDGPU::sub1};
+    Register DstParts[2];
+    for (unsigned Part = 0; Part != 2; ++Part) {
+      DstParts[Part] = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+      BuildMI(*BB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+              DstParts[Part])
+          .addReg(MaskReg, 0, SubRegs[Part]);
+    }
+
+    BuildMI(*BB, &MI, DL, TII->get(AMDGPU::REG_SEQUENCE), DstReg)
+        .addReg(DstParts[0])
+        .addImm(AMDGPU::sub0)
+        .addReg(DstParts[1])
+        .addImm(AMDGPU::sub1);
+
+    MI.eraseFromParent();
+    return BB;
+  }
   default:
     return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
   }
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -190,6 +190,12 @@
   let mayStore = 0;
 }
 
+let usesCustomInserter = 1, Defs = [VCC] in {
+def S_INVERSE_BALLOT_U32 : SPseudoInstSI <(outs SReg_32:$sdst), (ins unknown:$src)>;
+
+def S_INVERSE_BALLOT_U64 : SPseudoInstSI <(outs SReg_64:$sdst), (ins unknown:$src)>;
+} // End usesCustomInserter = 1, Defs = [VCC]
+
 // PSEUDO_WM is treated like STRICT_WWM/STRICT_WQM without exec changes.
def ENTER_PSEUDO_WM : SPseudoInstSI <(outs), (ins)> { let Uses = [EXEC]; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.inverse.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.inverse.ballot.i32.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.inverse.ballot.i32.ll @@ -0,0 +1,199 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -global-isel -amdgpu-global-isel-risky-select -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32,-wavefrontsize64 -global-isel -amdgpu-global-isel-risky-select -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s + +declare i1 @llvm.amdgcn.inverse.ballot(i32) + +; Test ballot(0) +define amdgpu_cs void @constant_false_inverse_ballot(ptr addrspace(1) %out) { +; GFX10-LABEL: constant_false_inverse_ballot: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: constant_false_inverse_ballot: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +entry: + %ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 0) + %sel = select i1 %ballot, i32 1, i32 0 + store i32 %sel, ptr addrspace(1) %out + ret void +} + +; Test ballot(1) + +define amdgpu_cs void @constant_true_inverse_ballot(ptr addrspace(1) %out) { +; GFX10-LABEL: constant_true_inverse_ballot: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s0, -1 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: 
constant_true_inverse_ballot: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_mov_b32 s0, -1 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +entry: + %ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 u0xFFFFFFFF) + %sel = select i1 %ballot, i32 1, i32 0 + store i32 %sel, ptr addrspace(1) %out + ret void +} + +define amdgpu_cs void @constant_mask_inverse_ballot(ptr addrspace(1) %out) { +; GFX10-LABEL: constant_mask_inverse_ballot: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_movk_i32 s0, 0x1000 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: constant_mask_inverse_ballot: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_movk_i32 s0, 0x1000 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +entry: + %ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 u0x00001000) + %sel = select i1 %ballot, i32 1, i32 0 + store i32 %sel, ptr addrspace(1) %out + ret void +} + +; Test inverse ballot using a vgpr as input + +define amdgpu_cs void @vgpr_inverse_ballot(i32 %input, ptr addrspace(1) %out) { +; GFX10-LABEL: vgpr_inverse_ballot: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX10-NEXT: global_store_dword v[1:2], v0, off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: vgpr_inverse_ballot: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-NEXT: global_store_b32 v[1:2], v0, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +entry: + %ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 %input) + %sel = select i1 %ballot, i32 1, i32 0 + store i32 %sel, ptr addrspace(1) %out + ret void +} + 
+define amdgpu_cs void @sgpr_inverse_ballot(i32 inreg %input, ptr addrspace(1) %out) { +; GFX10-LABEL: sgpr_inverse_ballot: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: sgpr_inverse_ballot: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +entry: + %ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 %input) + %sel = select i1 %ballot, i32 1, i32 0 + store i32 %sel, ptr addrspace(1) %out + ret void +} + +; Test ballot after phi +define amdgpu_cs void @phi_uniform(i32 inreg %s0_1, i32 inreg %s2, ptr addrspace(1) %out) { +; GFX10-LABEL: phi_uniform: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10-NEXT: s_cbranch_scc1 .LBB5_2 +; GFX10-NEXT: ; %bb.1: ; %if +; GFX10-NEXT: s_add_i32 s0, s0, 1 +; GFX10-NEXT: .LBB5_2: ; %endif +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: phi_uniform: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_cbranch_scc1 .LBB5_2 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: s_add_i32 s0, s0, 1 +; GFX11-NEXT: .LBB5_2: ; %endif +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +entry: + %cc = icmp ne i32 %s2, 0 + br i1 %cc, label %endif, label %if + +if: + %tmp = add i32 %s0_1, 1 + br label %endif + +endif: + %input = phi i32 [ %s0_1, %entry], [ %tmp, %if ] + + %ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 %input) + %sel = select i1 %ballot, i32 1, i32 0 + store i32 %sel, ptr addrspace(1) %out + ret void +} + +; Test for branching +define amdgpu_cs void @inverse_ballot_branch(i32 inreg %s0_1, i32 inreg %s2, ptr addrspace(1) 
%out) { +; GFX10-LABEL: inverse_ballot_branch: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_xor_b32 s2, s1, -1 +; GFX10-NEXT: s_and_saveexec_b32 s1, s2 +; GFX10-NEXT: ; %bb.1: ; %if +; GFX10-NEXT: s_add_i32 s0, s0, 1 +; GFX10-NEXT: ; %bb.2: ; %endif +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: inverse_ballot_branch: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_xor_b32 s2, s1, -1 +; GFX11-NEXT: s_and_saveexec_b32 s1, s2 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: s_add_i32 s0, s0, 1 +; GFX11-NEXT: ; %bb.2: ; %endif +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +entry: + %ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 %s2) + br i1 %ballot, label %endif, label %if + +if: + %tmp = add i32 %s0_1, 1 + br label %endif + +endif: + %input = phi i32 [ %s0_1, %entry], [ %tmp, %if ] + store i32 %input, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.inverse.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.inverse.ballot.i64.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.inverse.ballot.i64.ll @@ -0,0 +1,195 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -global-isel -amdgpu-global-isel-risky-select -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=-wavefrontsize32,+wavefrontsize64 -global-isel -amdgpu-global-isel-risky-select -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s + +declare i1 @llvm.amdgcn.inverse.ballot.i64(i64) + +; Test ballot(0) +define amdgpu_cs void 
@constant_false_inverse_ballot(ptr addrspace(1) %out) { +; GFX10-LABEL: constant_false_inverse_ballot: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b64 s[0:1], 0 +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: constant_false_inverse_ballot: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_mov_b64 s[0:1], 0 +; GFX11-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +entry: + %ballot = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 0) + %sel = select i1 %ballot, i64 1, i64 0 + store i64 %sel, ptr addrspace(1) %out + ret void +} + +; Test ballot(1) + +define amdgpu_cs void @constant_true_inverse_ballot(ptr addrspace(1) %out) { +; GFX10-LABEL: constant_true_inverse_ballot: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b64 s[0:1], -1 +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: constant_true_inverse_ballot: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_mov_b64 s[0:1], -1 +; GFX11-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +entry: + %ballot = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 u0xFFFFFFFFFFFFFFFF) + %sel = select i1 %ballot, i64 1, i64 0 + store i64 %sel, ptr addrspace(1) %out + ret void +} + +; Test inverse ballot using a vgpr as input + +define amdgpu_cs void @vgpr_inverse_ballot(i64 %input, ptr addrspace(1) %out) { +; GFX10-LABEL: vgpr_inverse_ballot: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: 
v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: vgpr_inverse_ballot: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX11-NEXT: global_store_b64 v[2:3], v[0:1], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +entry: + %ballot = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %input) + %sel = select i1 %ballot, i64 1, i64 0 + store i64 %sel, ptr addrspace(1) %out + ret void +} + +define amdgpu_cs void @sgpr_inverse_ballot(i64 inreg %input, ptr addrspace(1) %out) { +; GFX10-LABEL: sgpr_inverse_ballot: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: sgpr_inverse_ballot: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX11-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +entry: + %ballot = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %input) + %sel = select i1 %ballot, i64 1, i64 0 + store i64 %sel, ptr addrspace(1) %out + ret void +} + +; Test ballot after phi +define amdgpu_cs void @phi_uniform(i64 inreg %s0_1, i64 inreg %s2,ptr addrspace(1) %out) { +; GFX10-LABEL: phi_uniform: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX10-NEXT: s_cbranch_scc1 .LBB4_2 +; GFX10-NEXT: ; %bb.1: ; %if +; GFX10-NEXT: s_add_u32 s0, s0, 1 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: .LBB4_2: ; %endif +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; GFX10-NEXT: s_endpgm 
+; +; GFX11-LABEL: phi_uniform: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX11-NEXT: s_cbranch_scc1 .LBB4_2 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: s_add_u32 s0, s0, 1 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: .LBB4_2: ; %endif +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX11-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +entry: + %cc = icmp ne i64 %s2, 0 + br i1 %cc, label %endif, label %if + +if: + %tmp = add i64 %s0_1, 1 + br label %endif + +endif: + %input = phi i64 [ %s0_1, %entry], [ %tmp, %if ] + + %ballot = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %input) + %sel = select i1 %ballot, i64 1, i64 0 + store i64 %sel, ptr addrspace(1) %out + ret void +} + +; Test for branching +define amdgpu_cs void @inverse_ballot_branch(i64 inreg %s0_1, i64 inreg %s2, ptr addrspace(1) %out) { +; GFX10-LABEL: inverse_ballot_branch: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_xor_b64 s[4:5], s[2:3], -1 +; GFX10-NEXT: s_and_saveexec_b64 s[2:3], s[4:5] +; GFX10-NEXT: ; %bb.1: ; %if +; GFX10-NEXT: s_add_u32 s0, s0, 1 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: ; %bb.2: ; %endif +; GFX10-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX10-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: inverse_ballot_branch: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_xor_b64 s[4:5], s[2:3], -1 +; GFX11-NEXT: s_and_saveexec_b64 s[2:3], s[4:5] +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: s_add_u32 s0, s0, 1 +; GFX11-NEXT: s_addc_u32 s1, s1, 0 +; GFX11-NEXT: ; %bb.2: ; %endif +; GFX11-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX11-NEXT: v_mov_b32_e32 v3, s1 +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +entry: + 
%ballot = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %s2) + br i1 %ballot, label %endif, label %if + +if: + %tmp = add i64 %s0_1, 1 + br label %endif + +endif: + %sel = phi i64 [ %s0_1, %entry], [ %tmp, %if ] + store i64 %sel, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll @@ -0,0 +1,201 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s + +declare i1 @llvm.amdgcn.inverse.ballot(i32) + +; Test ballot(0) +define amdgpu_cs void @constant_false_inverse_ballot(ptr addrspace(1) %out) { +; GFX10-LABEL: constant_false_inverse_ballot: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: constant_false_inverse_ballot: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +entry: + %ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 0) + %sel = select i1 %ballot, i32 1, i32 0 + store i32 %sel, ptr addrspace(1) %out + ret void +} + +; Test ballot(1) + +define amdgpu_cs void @constant_true_inverse_ballot(ptr addrspace(1) %out) { +; GFX10-LABEL: constant_true_inverse_ballot: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_mov_b32 s0, -1 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX10-NEXT: 
global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: constant_true_inverse_ballot: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_mov_b32 s0, -1 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +entry: + %ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 u0xFFFFFFFF) + %sel = select i1 %ballot, i32 1, i32 0 + store i32 %sel, ptr addrspace(1) %out + ret void +} + +define amdgpu_cs void @constant_mask_inverse_ballot(ptr addrspace(1) %out) { +; GFX10-LABEL: constant_mask_inverse_ballot: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_movk_i32 s0, 0x1000 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: constant_mask_inverse_ballot: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_movk_i32 s0, 0x1000 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +entry: + %ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 u0x00001000) + %sel = select i1 %ballot, i32 1, i32 0 + store i32 %sel, ptr addrspace(1) %out + ret void +} + +; ; Test inverse ballot using a vgpr as input + +define amdgpu_cs void @vgpr_inverse_ballot(ptr addrspace(1) %out, i32 %input) { +; GFX10-LABEL: vgpr_inverse_ballot: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: v_readfirstlane_b32 s0, v2 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: vgpr_inverse_ballot: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: v_readfirstlane_b32 s0, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +entry: + %ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 %input) + %sel = select i1 
%ballot, i32 1, i32 0 + store i32 %sel, ptr addrspace(1) %out + ret void +} + +define amdgpu_cs void @sgpr_inverse_ballot(ptr addrspace(1) %out, i32 inreg %input) { +; GFX10-LABEL: sgpr_inverse_ballot: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: sgpr_inverse_ballot: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +entry: + %ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 %input) + %sel = select i1 %ballot, i32 1, i32 0 + store i32 %sel, ptr addrspace(1) %out + ret void +} + +; Test ballot after phi +define amdgpu_cs void @phi_uniform(ptr addrspace(1) %out, i32 inreg %s0_1, i32 inreg %s2) { +; GFX10-LABEL: phi_uniform: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10-NEXT: s_cbranch_scc1 .LBB5_2 +; GFX10-NEXT: ; %bb.1: ; %if +; GFX10-NEXT: s_add_i32 s0, s0, 1 +; GFX10-NEXT: .LBB5_2: ; %endif +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: phi_uniform: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: s_cbranch_scc1 .LBB5_2 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: s_add_i32 s0, s0, 1 +; GFX11-NEXT: .LBB5_2: ; %endif +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +entry: + %cc = icmp ne i32 %s2, 0 + br i1 %cc, label %endif, label %if + +if: + %tmp = add i32 %s0_1, 1 + br label %endif + +endif: + %input = phi i32 [ %s0_1, %entry], [ %tmp, %if ] + + %ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 %input) + %sel = select i1 %ballot, i32 1, i32 0 + store i32 %sel, ptr addrspace(1) %out + ret void +} + +; Test for branching +define 
amdgpu_cs void @inverse_ballot_branch(ptr addrspace(1) %out, i32 inreg %s0_1, i32 inreg %s2) { +; GFX10-LABEL: inverse_ballot_branch: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: s_xor_b32 s2, s1, -1 +; GFX10-NEXT: s_and_saveexec_b32 s1, s2 +; GFX10-NEXT: ; %bb.1: ; %if +; GFX10-NEXT: s_add_i32 s0, s0, 1 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: ; %bb.2: ; %endif +; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: inverse_ballot_branch: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: s_xor_b32 s2, s1, -1 +; GFX11-NEXT: s_and_saveexec_b32 s1, s2 +; GFX11-NEXT: ; %bb.1: ; %if +; GFX11-NEXT: s_add_i32 s0, s0, 1 +; GFX11-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-NEXT: ; %bb.2: ; %endif +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1 +; GFX11-NEXT: global_store_b32 v[0:1], v2, off +; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-NEXT: s_endpgm +entry: + %ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 %s2) + br i1 %ballot, label %endif, label %if + +if: + %tmp = add i32 %s0_1, 1 + br label %endif + +endif: + %input = phi i32 [ %s0_1, %entry], [ %tmp, %if ] + store i32 %input, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll @@ -0,0 +1,210 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s + +declare i1 @llvm.amdgcn.inverse.ballot.i64(i64) + +; Test 
ballot(0)
+; The mask operand below is the constant i64 0 (no bits set); the i1 result
+; drives a select whose value is stored to %out.
+define amdgpu_cs void @constant_false_inverse_ballot(ptr addrspace(1) %out) {
+; GFX10-LABEL: constant_false_inverse_ballot:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_mov_b32 s2, 0
+; GFX10-NEXT: s_mov_b64 s[0:1], 0
+; GFX10-NEXT: v_mov_b32_e32 v3, s2
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: constant_false_inverse_ballot:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_mov_b32 s2, 0
+; GFX11-NEXT: s_mov_b64 s[0:1], 0
+; GFX11-NEXT: v_mov_b32_e32 v3, s2
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+entry:
+  %ballot = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 0)
+  %sel = select i1 %ballot, i64 1, i64 0
+  store i64 %sel, ptr addrspace(1) %out
+  ret void
+}
+
+; Test inverse ballot with a constant all-ones mask (i64 u0xFFFFFFFFFFFFFFFF,
+; materialized as -1 in the expected output).
+
+define amdgpu_cs void @constant_true_inverse_ballot(ptr addrspace(1) %out) {
+; GFX10-LABEL: constant_true_inverse_ballot:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_mov_b32 s2, 0
+; GFX10-NEXT: s_mov_b64 s[0:1], -1
+; GFX10-NEXT: v_mov_b32_e32 v3, s2
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: constant_true_inverse_ballot:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_mov_b32 s2, 0
+; GFX11-NEXT: s_mov_b64 s[0:1], -1
+; GFX11-NEXT: v_mov_b32_e32 v3, s2
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+entry:
+  %ballot = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 u0xFFFFFFFFFFFFFFFF)
+  %sel = select i1 %ballot, i64 1, i64 0
+  store i64 %sel, ptr addrspace(1) %out
+  ret void
+}
+
+; Test inverse ballot using a vgpr as input: %input is a plain (non-inreg)
+; argument, and the expected output reads each 32-bit half of the mask into
+; an sgpr with v_readfirstlane_b32 before it is used.
+
+define amdgpu_cs void @vgpr_inverse_ballot(ptr addrspace(1) %out, i64 %input) {
+; GFX10-LABEL: vgpr_inverse_ballot:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: v_readfirstlane_b32 s0, v2
+; GFX10-NEXT: v_readfirstlane_b32 s1, v3
+; GFX10-NEXT: s_mov_b32 s2, 0
+; GFX10-NEXT: v_mov_b32_e32 v3, s2
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: vgpr_inverse_ballot:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: v_readfirstlane_b32 s0, v2
+; GFX11-NEXT: v_readfirstlane_b32 s1, v3
+; GFX11-NEXT: s_mov_b32 s2, 0
+; GFX11-NEXT: v_mov_b32_e32 v3, s2
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+entry:
+  %ballot = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %input)
+  %sel = select i1 %ballot, i64 1, i64 0
+  store i64 %sel, ptr addrspace(1) %out
+  ret void
+}
+
+; Test inverse ballot using an sgpr as input: %input is an inreg argument,
+; and the expected output uses s[0:1] directly as the v_cndmask_b32_e64
+; condition with no v_readfirstlane_b32.
+
+define amdgpu_cs void @sgpr_inverse_ballot(ptr addrspace(1) %out, i64 inreg %input) {
+; GFX10-LABEL: sgpr_inverse_ballot:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX10-NEXT: s_mov_b32 s0, 0
+; GFX10-NEXT: v_mov_b32_e32 v3, s0
+; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: sgpr_inverse_ballot:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_waitcnt_depctr 0xfffe
+; GFX11-NEXT: v_mov_b32_e32 v3, s0
+; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+entry:
+  %ballot = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %input)
+  %sel = select i1 %ballot, i64 1, i64 0
+  store i64 %sel, ptr addrspace(1) %out
+  ret void
+}
+
+; Test inverse ballot after a phi: the mask is a phi of %s0_1 and %s0_1 + 1,
+; both derived from an inreg argument, selected by a compare on the inreg
+; argument %s2.
+define amdgpu_cs void @phi_uniform(ptr addrspace(1) %out, i64 inreg %s0_1, i64 inreg %s2) {
+; GFX10-LABEL: phi_uniform:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX10-NEXT: s_cbranch_scc1 .LBB4_2
+; GFX10-NEXT: ; %bb.1: ; %if
+; GFX10-NEXT: s_add_u32 s0, s0, 1
+; GFX10-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-NEXT: .LBB4_2: ; %endif
+; GFX10-NEXT: s_mov_b32 s2, 0
+; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX10-NEXT: v_mov_b32_e32 v3, s2
+; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: phi_uniform:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX11-NEXT: s_cbranch_scc1 .LBB4_2
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: s_add_u32 s0, s0, 1
+; GFX11-NEXT: s_addc_u32 s1, s1, 0
+; GFX11-NEXT: .LBB4_2: ; %endif
+; GFX11-NEXT: s_mov_b32 s2, 0
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v3, s2
+; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+entry:
+  %cc = icmp ne i64 %s2, 0
+  br i1 %cc, label %endif, label %if
+
+if:
+  %tmp = add i64 %s0_1, 1
+  br label %endif
+
+endif:
+  %input = phi i64 [ %s0_1, %entry], [ %tmp, %if ]
+
+  %ballot = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %input)
+  %sel = select i1 %ballot, i64 1, i64 0
+  store i64 %sel, ptr addrspace(1) %out
+  ret void
+}
+
+; Test inverse ballot used for branching: the i1 result is the condition of
+; the conditional branch in %entry, and the expected output feeds the
+; inverted mask to s_and_saveexec_b64.
+define amdgpu_cs void @inverse_ballot_branch(ptr addrspace(1) %out, i64 inreg %s0_1, i64 inreg %s2) {
+; GFX10-LABEL: inverse_ballot_branch:
+; GFX10: ; %bb.0: ; %entry
+; GFX10-NEXT: v_mov_b32_e32 v3, s1
+; GFX10-NEXT: v_mov_b32_e32 v2, s0
+; GFX10-NEXT: s_xor_b64 s[4:5], s[2:3], -1
+; GFX10-NEXT: s_and_saveexec_b64 s[2:3], s[4:5]
+; GFX10-NEXT: ; %bb.1: ; %if
+; GFX10-NEXT: s_add_u32 s0, s0, 1
+; GFX10-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-NEXT: v_mov_b32_e32 v3, s1
+; GFX10-NEXT: v_mov_b32_e32 v2, s0
+; GFX10-NEXT: ; %bb.2: ; %endif
+; GFX10-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
+; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: inverse_ballot_branch:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NEXT: s_xor_b64 s[4:5], s[2:3], -1
+; GFX11-NEXT: s_and_saveexec_b64 s[2:3], s[4:5]
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: s_add_u32 s0, s0, 1
+; GFX11-NEXT: s_addc_u32 s1, s1, 0
+; GFX11-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-NEXT: ; %bb.2: ; %endif
+; GFX11-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+entry:
+  %ballot = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %s2)
+  br i1 %ballot, label %endif, label %if
+
+if:
+  %tmp = add i64 %s0_1, 1
+  br label %endif
+
+endif:
+  %sel = phi i64 [ %s0_1, %entry], [ %tmp, %if ]
+  store i64 %sel, ptr addrspace(1) %out
+  ret void
+}