diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1662,6 +1662,10 @@
   Intrinsic<[llvm_anyint_ty], [llvm_i1_ty],
             [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
 
+def int_amdgcn_inverse_ballot :
+  Intrinsic<[llvm_i1_ty], [llvm_anyint_ty],
+            [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+
 def int_amdgcn_readfirstlane :
   ClangBuiltin<"__builtin_amdgcn_readfirstlane">,
   Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -2528,6 +2528,21 @@
   case Intrinsic::amdgcn_interp_p1_f16:
     SelectInterpP1F16(N);
     return;
+  case Intrinsic::amdgcn_inverse_ballot:
+    // Select the pseudo that matches the width of the lane-mask operand.
+    switch (N->getOperand(1).getValueSizeInBits()) {
+    case 32:
+      Opcode = AMDGPU::S_INVERSE_BALLOT_U32;
+      break;
+    case 64:
+      Opcode = AMDGPU::S_INVERSE_BALLOT_U64;
+      break;
+    default:
+      // Only 32- and 64-bit pseudos exist; never leave this switch without
+      // having chosen an opcode.
+      llvm_unreachable("Unsupported size for inverse ballot mask.");
+    }
+    break;
   default:
     SelectCode(N);
     return;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -112,6 +112,7 @@
   bool selectDivScale(MachineInstr &MI) const;
   bool selectIntrinsicCmp(MachineInstr &MI) const;
   bool selectBallot(MachineInstr &I) const;
+  bool selectInverseBallot(MachineInstr &I) const;
   bool selectRelocConstant(MachineInstr &I) const;
   bool selectGroupStaticSize(MachineInstr &I) const;
   bool selectReturnAddress(MachineInstr &I) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1046,6 +1046,8 @@
     return selectIntrinsicCmp(I);
   case Intrinsic::amdgcn_ballot:
     return selectBallot(I);
+  case Intrinsic::amdgcn_inverse_ballot:
+    return selectInverseBallot(I);
   case Intrinsic::amdgcn_reloc_constant:
     return selectRelocConstant(I);
   case Intrinsic::amdgcn_groupstaticsize:
@@ -1351,6 +1353,19 @@
   return true;
 }
 
+/// Select llvm.amdgcn.inverse.ballot: the result is a plain COPY of the mask
+/// operand (operand 2) into the destination register (operand 0).
+bool AMDGPUInstructionSelector::selectInverseBallot(MachineInstr &I) const {
+  MachineBasicBlock *BB = I.getParent();
+  const DebugLoc &DL = I.getDebugLoc();
+  const Register DstReg = I.getOperand(0).getReg();
+  const Register MaskReg = I.getOperand(2).getReg();
+
+  BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(MaskReg);
+  I.eraseFromParent();
+  return true;
+}
+
 bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
   Register DstReg = I.getOperand(0).getReg();
   const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3004,6 +3004,10 @@
     case Intrinsic::amdgcn_ubfe:
       applyMappingBFE(OpdMapper, false);
       return;
+    case Intrinsic::amdgcn_inverse_ballot:
+      applyDefaultMapping(OpdMapper);
+      constrainOpWithReadfirstlane(MI, MRI, 2); // Mask
+      return;
     case Intrinsic::amdgcn_ballot:
       // Use default handling and insert copy to vcc source.
       break;
@@ -4494,6 +4498,15 @@
       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize);
       break;
     }
+    case Intrinsic::amdgcn_inverse_ballot: {
+      // This must be an SGPR, but accept a VGPR.
+      Register MaskReg = MI.getOperand(2).getReg();
+      unsigned MaskSize = MRI.getType(MaskReg).getSizeInBits();
+      unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
+      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
+      OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
+      break;
+    }
     }
     break;
   }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -376,6 +376,7 @@
 def : SourceOfDivergence<int_amdgcn_if>;
 def : SourceOfDivergence<int_amdgcn_else>;
 def : SourceOfDivergence<int_amdgcn_loop>;
+def : SourceOfDivergence<int_amdgcn_inverse_ballot>;
 
 foreach intr = AMDGPUImageDimAtomicIntrinsics in
 def : SourceOfDivergence<intr>;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4477,6 +4477,26 @@
 
     return BB;
   }
+  case AMDGPU::S_INVERSE_BALLOT_U32:
+  case AMDGPU::S_INVERSE_BALLOT_U64: {
+    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+    const SIRegisterInfo *TRI = ST.getRegisterInfo();
+    const DebugLoc &DL = MI.getDebugLoc();
+    const Register DstReg = MI.getOperand(0).getReg();
+    Register MaskReg = MI.getOperand(1).getReg();
+
+    const bool IsVALU = TRI->isVectorRegister(MRI, MaskReg);
+
+    // A mask living in a vector register is moved to an SGPR before the copy.
+    if (IsVALU) {
+      MaskReg = TII->readlaneVGPRToSGPR(MaskReg, MI, MRI);
+    }
+
+    BuildMI(*BB, &MI, DL, TII->get(AMDGPU::COPY), DstReg).addReg(MaskReg);
+    MI.eraseFromParent();
+    return BB;
+  }
   default:
     return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
   }
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -190,6 +190,12 @@
   let mayStore = 0;
 }
 
+let usesCustomInserter = 1 in {
+def S_INVERSE_BALLOT_U32 : SPseudoInstSI <(outs SReg_32:$sdst), (ins SSrc_b32:$mask)>;
+
+def S_INVERSE_BALLOT_U64 : SPseudoInstSI <(outs SReg_64:$sdst), (ins SSrc_b64:$mask)>;
+} // End usesCustomInserter = 1
+
 // PSEUDO_WM is treated like STRICT_WWM/STRICT_WQM without exec changes.
 def ENTER_PSEUDO_WM : SPseudoInstSI <(outs), (ins)> {
   let Uses = [EXEC];
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll
@@ -0,0 +1,159 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32,-wavefrontsize64 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GISEL %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32,-wavefrontsize64 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,SDAG %s
+
+declare i1 @llvm.amdgcn.inverse.ballot(i32)
+
+; Test inverse ballot with an all-zero mask
+define amdgpu_cs void @constant_false_inverse_ballot(ptr addrspace(1) %out) {
+; GFX11-LABEL: constant_false_inverse_ballot:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+entry:
+  %ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 0)
+  %sel = select i1 %ballot, i32 1, i32 0
+  store i32 %sel, ptr addrspace(1) %out
+  ret void
+}
+
+; Test inverse ballot with an all-ones mask
+
+define amdgpu_cs void @constant_true_inverse_ballot(ptr addrspace(1) %out) {
+; GFX11-LABEL: constant_true_inverse_ballot:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_mov_b32 s0, -1
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+entry:
+  %ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 u0xFFFFFFFF)
+  %sel = select i1 %ballot, i32 1, i32 0
+  store i32 %sel, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_cs void @constant_mask_inverse_ballot(ptr addrspace(1) %out) {
+; GFX11-LABEL: constant_mask_inverse_ballot:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_movk_i32 s0, 0x1000
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+entry:
+  %ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 u0x00001000)
+  %sel = select i1 %ballot, i32 1, i32 0
+  store i32 %sel, ptr addrspace(1) %out
+  ret void
+}
+
+; Test inverse ballot with the mask supplied in a VGPR
+
+define amdgpu_cs void @vgpr_inverse_ballot(i32 %input, ptr addrspace(1) %out) {
+; GFX11-LABEL: vgpr_inverse_ballot:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT: global_store_b32 v[1:2], v0, off
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+entry:
+  %ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 %input)
+  %sel = select i1 %ballot, i32 1, i32 0
+  store i32 %sel, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_cs void @sgpr_inverse_ballot(i32 inreg %input, ptr addrspace(1) %out) {
+; GFX11-LABEL: sgpr_inverse_ballot:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+entry:
+  %ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 %input)
+  %sel = select i1 %ballot, i32 1, i32 0
+  store i32 %sel, ptr addrspace(1) %out
+  ret void
+}
+
+; Test inverse ballot of a mask produced by a phi
+define amdgpu_cs void @phi_uniform(i32 inreg %s0_1, i32 inreg %s2, ptr addrspace(1) %out) {
+; GFX11-LABEL: phi_uniform:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_cmp_lg_u32 s1, 0
+; GFX11-NEXT: s_cbranch_scc1 .LBB5_2
+; GFX11-NEXT: ; %bb.1: ; %if
+; GFX11-NEXT: s_add_i32 s0, s0, 1
+; GFX11-NEXT: .LBB5_2: ; %endif
+; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-NEXT: global_store_b32 v[0:1], v2, off
+; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT: s_endpgm
+entry:
+  %cc = icmp ne i32 %s2, 0
+  br i1 %cc, label %endif, label %if
+
+if:
+  %tmp = add i32 %s0_1, 1
+  br label %endif
+
+endif:
+  %input = phi i32 [ %s0_1, %entry ], [ %tmp, %if ]
+
+  %ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 %input)
+  %sel = select i1 %ballot, i32 1, i32 0
+  store i32 %sel, ptr addrspace(1) %out
+  ret void
+}
+
+; Test branching on the inverse ballot result.
+; NOTE: the GlobalISel output below is currently incorrect:
+; the update made in the branch reaches all lanes, not just the branching ones.
+; It should be fixed once GlobalISel takes uniformity analysis into account.
+define amdgpu_cs void @inverse_ballot_branch(i32 inreg %s0_1, i32 inreg %s2, ptr addrspace(1) %out) {
+; GISEL-LABEL: inverse_ballot_branch:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_xor_b32 s2, s1, -1
+; GISEL-NEXT: s_and_saveexec_b32 s1, s2
+; GISEL-NEXT: ; %bb.1: ; %if
+; GISEL-NEXT: s_add_i32 s0, s0, 1
+; GISEL-NEXT: ; %bb.2: ; %endif
+; GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-NEXT: global_store_b32 v[0:1], v2, off
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+;
+; SDAG-LABEL: inverse_ballot_branch:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: v_mov_b32_e32 v2, s0
+; SDAG-NEXT: s_xor_b32 s2, s1, -1
+; SDAG-NEXT: s_and_saveexec_b32 s1, s2
+; SDAG-NEXT: ; %bb.1: ; %if
+; SDAG-NEXT: s_add_i32 s0, s0, 1
+; SDAG-NEXT: v_mov_b32_e32 v2, s0
+; SDAG-NEXT: ; %bb.2: ; %endif
+; SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; SDAG-NEXT: global_store_b32 v[0:1], v2, off
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+entry:
+  %ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 %s2)
+  br i1 %ballot, label %endif, label %if
+
+if:
+  %tmp = add i32 %s0_1, 1
+  br label %endif
+
+endif:
+  %input = phi i32 [ %s0_1, %entry ], [ %tmp, %if ]
+  store i32 %input, ptr addrspace(1) %out
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll
@@ -0,0 +1,242 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=-wavefrontsize32,+wavefrontsize64 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GISEL %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=-wavefrontsize32,+wavefrontsize64 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG %s
+
+declare i1 @llvm.amdgcn.inverse.ballot.i64(i64)
+
+; Test inverse ballot with an all-zero mask
+define amdgpu_cs void @constant_false_inverse_ballot(ptr addrspace(1) %out) {
+; GISEL-LABEL: constant_false_inverse_ballot:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_mov_b64 s[0:1], 0
+; GISEL-NEXT: v_mov_b32_e32 v3, 0
+; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+;
+; SDAG-LABEL: constant_false_inverse_ballot:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_mov_b32 s2, 0
+; SDAG-NEXT: s_mov_b64 s[0:1], 0
+; SDAG-NEXT: v_mov_b32_e32 v3, s2
+; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+entry:
+  %ballot = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 0)
+  %sel = select i1 %ballot, i64 1, i64 0
+  store i64 %sel, ptr addrspace(1) %out
+  ret void
+}
+
+; Test inverse ballot with an all-ones mask
+
+define amdgpu_cs void @constant_true_inverse_ballot(ptr addrspace(1) %out) {
+; GISEL-LABEL: constant_true_inverse_ballot:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_mov_b64 s[0:1], -1
+; GISEL-NEXT: v_mov_b32_e32 v3, 0
+; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+;
+; SDAG-LABEL: constant_true_inverse_ballot:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_mov_b32 s2, 0
+; SDAG-NEXT: s_mov_b64 s[0:1], -1
+; SDAG-NEXT: v_mov_b32_e32 v3, s2
+; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+entry:
+  %ballot = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 u0xFFFFFFFFFFFFFFFF)
+  %sel = select i1 %ballot, i64 1, i64 0
+  store i64 %sel, ptr addrspace(1) %out
+  ret void
+}
+
+; Test inverse ballot with the constant mask u0x0040F8010000
+
+define amdgpu_cs void @constant_mask_inverse_ballot(ptr addrspace(1) %out) {
+; GISEL-LABEL: constant_mask_inverse_ballot:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_mov_b32 s0, 0xf8010000
+; GISEL-NEXT: s_mov_b32 s1, 64
+; GISEL-NEXT: v_mov_b32_e32 v3, 0
+; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+;
+; SDAG-LABEL: constant_mask_inverse_ballot:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_mov_b32 s0, 0xf8010000
+; SDAG-NEXT: s_mov_b32 s2, 0
+; SDAG-NEXT: s_mov_b32 s1, 64
+; SDAG-NEXT: v_mov_b32_e32 v3, s2
+; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+entry:
+  %ballot = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 u0x0040F8010000)
+  %sel = select i1 %ballot, i64 1, i64 0
+  store i64 %sel, ptr addrspace(1) %out
+  ret void
+}
+
+; Test inverse ballot with the mask supplied in VGPRs
+
+define amdgpu_cs void @vgpr_inverse_ballot(i64 %input, ptr addrspace(1) %out) {
+; GISEL-LABEL: vgpr_inverse_ballot:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: v_readfirstlane_b32 s0, v0
+; GISEL-NEXT: v_readfirstlane_b32 s1, v1
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GISEL-NEXT: global_store_b64 v[2:3], v[0:1], off
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+;
+; SDAG-LABEL: vgpr_inverse_ballot:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: v_readfirstlane_b32 s0, v0
+; SDAG-NEXT: v_readfirstlane_b32 s1, v1
+; SDAG-NEXT: s_mov_b32 s2, 0
+; SDAG-NEXT: v_mov_b32_e32 v1, s2
+; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; SDAG-NEXT: global_store_b64 v[2:3], v[0:1], off
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+entry:
+  %ballot = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %input)
+  %sel = select i1 %ballot, i64 1, i64 0
+  store i64 %sel, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_cs void @sgpr_inverse_ballot(i64 inreg %input, ptr addrspace(1) %out) {
+; GISEL-LABEL: sgpr_inverse_ballot:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GISEL-NEXT: v_mov_b32_e32 v3, 0
+; GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+;
+; SDAG-LABEL: sgpr_inverse_ballot:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; SDAG-NEXT: s_mov_b32 s0, 0
+; SDAG-NEXT: s_waitcnt_depctr 0xfffe
+; SDAG-NEXT: v_mov_b32_e32 v3, s0
+; SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+entry:
+  %ballot = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %input)
+  %sel = select i1 %ballot, i64 1, i64 0
+  store i64 %sel, ptr addrspace(1) %out
+  ret void
+}
+
+; Test inverse ballot of a mask produced by a phi
+define amdgpu_cs void @phi_uniform(i64 inreg %s0_1, i64 inreg %s2, ptr addrspace(1) %out) {
+; GISEL-LABEL: phi_uniform:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GISEL-NEXT: s_cbranch_scc1 .LBB5_2
+; GISEL-NEXT: ; %bb.1: ; %if
+; GISEL-NEXT: s_add_u32 s0, s0, 1
+; GISEL-NEXT: s_addc_u32 s1, s1, 0
+; GISEL-NEXT: .LBB5_2: ; %endif
+; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GISEL-NEXT: v_mov_b32_e32 v3, 0
+; GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+;
+; SDAG-LABEL: phi_uniform:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_cmp_lg_u64 s[2:3], 0
+; SDAG-NEXT: s_cbranch_scc1 .LBB5_2
+; SDAG-NEXT: ; %bb.1: ; %if
+; SDAG-NEXT: s_add_u32 s0, s0, 1
+; SDAG-NEXT: s_addc_u32 s1, s1, 0
+; SDAG-NEXT: .LBB5_2: ; %endif
+; SDAG-NEXT: s_mov_b32 s2, 0
+; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; SDAG-NEXT: v_mov_b32_e32 v3, s2
+; SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+entry:
+  %cc = icmp ne i64 %s2, 0
+  br i1 %cc, label %endif, label %if
+
+if:
+  %tmp = add i64 %s0_1, 1
+  br label %endif
+
+endif:
+  %input = phi i64 [ %s0_1, %entry ], [ %tmp, %if ]
+
+  %ballot = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %input)
+  %sel = select i1 %ballot, i64 1, i64 0
+  store i64 %sel, ptr addrspace(1) %out
+  ret void
+}
+
+; Test branching on the inverse ballot result.
+; NOTE: the GlobalISel output below is currently incorrect:
+; the update made in the branch reaches all lanes, not just the branching ones.
+; It should be fixed once GlobalISel takes uniformity analysis into account.
+define amdgpu_cs void @inverse_ballot_branch(i64 inreg %s0_1, i64 inreg %s2, ptr addrspace(1) %out) {
+; GISEL-LABEL: inverse_ballot_branch:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_xor_b64 s[4:5], s[2:3], -1
+; GISEL-NEXT: s_and_saveexec_b64 s[2:3], s[4:5]
+; GISEL-NEXT: ; %bb.1: ; %if
+; GISEL-NEXT: s_add_u32 s0, s0, 1
+; GISEL-NEXT: s_addc_u32 s1, s1, 0
+; GISEL-NEXT: ; %bb.2: ; %endif
+; GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
+; GISEL-NEXT: v_mov_b32_e32 v3, s1
+; GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off
+; GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GISEL-NEXT: s_endpgm
+;
+; SDAG-LABEL: inverse_ballot_branch:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-NEXT: v_mov_b32_e32 v2, s0
+; SDAG-NEXT: s_xor_b64 s[4:5], s[2:3], -1
+; SDAG-NEXT: s_and_saveexec_b64 s[2:3], s[4:5]
+; SDAG-NEXT: ; %bb.1: ; %if
+; SDAG-NEXT: s_add_u32 s0, s0, 1
+; SDAG-NEXT: s_addc_u32 s1, s1, 0
+; SDAG-NEXT: v_mov_b32_e32 v3, s1
+; SDAG-NEXT: v_mov_b32_e32 v2, s0
+; SDAG-NEXT: ; %bb.2: ; %endif
+; SDAG-NEXT: s_or_b64 exec, exec, s[2:3]
+; SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off
+; SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; SDAG-NEXT: s_endpgm
+entry:
+  %ballot = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %s2)
+  br i1 %ballot, label %endif, label %if
+
+if:
+  %tmp = add i64 %s0_1, 1
+  br label %endif
+
+endif:
+  %sel = phi i64 [ %s0_1, %entry ], [ %tmp, %if ]
+  store i64 %sel, ptr addrspace(1) %out
+  ret void
+}