Index: include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- include/llvm/IR/IntrinsicsAMDGPU.td
+++ include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1282,6 +1282,9 @@
   Intrinsic<[llvm_i64_ty], [llvm_anyfloat_ty, LLVMMatchType<0>, llvm_i32_ty],
             [IntrNoMem, IntrConvergent]>;
 
+def int_amdgcn_inverse_ballot :
+  Intrinsic<[llvm_i1_ty], [llvm_i64_ty], [IntrNoMem, IntrSpeculatable]>;
+
 def int_amdgcn_readfirstlane :
   GCCBuiltin<"__builtin_amdgcn_readfirstlane">,
   Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem, IntrConvergent]>;
Index: lib/Target/AMDGPU/AMDGPUSearchableTables.td
===================================================================
--- lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -75,6 +75,7 @@
 def : SourceOfDivergence<...>;
 def : SourceOfDivergence<...>;
 def : SourceOfDivergence<...>;
+def : SourceOfDivergence<int_amdgcn_inverse_ballot>;
 
 foreach intr = AMDGPUImageDimAtomicIntrinsics in
 def : SourceOfDivergence<intr>;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3508,6 +3508,9 @@
     MI.eraseFromParent();
     return BB;
   }
+  case AMDGPU::S_INV_BALLOT:
+    MI.setDesc(TII->get(AMDGPU::COPY));
+    return BB;
   default:
     return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
   }
Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3990,6 +3990,14 @@
     return;
   }
 
+  // Legalize S_INV_BALLOT: its mask operand must be uniform, i.e. in SGPRs.
+  if (MI.getOpcode() == AMDGPU::S_INV_BALLOT) {
+    MachineOperand &Src = MI.getOperand(1);
+    if (Src.isReg() && RI.hasVGPRs(MRI.getRegClass(Src.getReg())))
+      Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
+    return;
+  }
+
   // Legalize MIMG and MUBUF/MTBUF for shaders.
   //
   // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
Index: lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- lib/Target/AMDGPU/SIInstructions.td
+++ lib/Target/AMDGPU/SIInstructions.td
@@ -141,6 +141,13 @@
   let Constraints = "$src = $vdst";
 }
 
+// Pseudoinstruction for @llvm.amdgcn.inverse.ballot. It is turned into a
+// copy, potentially preceded by a readlane.
+def S_INV_BALLOT : SPseudoInstSI <(outs VReg_1:$dst), (ins SReg_64:$src),
+  [(set i1:$dst, (int_amdgcn_inverse_ballot i64:$src))]> {
+  let usesCustomInserter = 1;
+}
+
 let usesCustomInserter = 1, Defs = [SCC] in {
 
 def S_ADD_U64_PSEUDO : SPseudoInstSI <
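Note on semantics (commentary, not part of the patch): @llvm.amdgcn.inverse.ballot is the inverse of a ballot. Given a wave-uniform 64-bit mask, each active lane N receives bit N of the mask as an i1. This is why the intrinsic is registered as a SourceOfDivergence, and why the SIInstrInfo.cpp hunk moves a mask that ended up in VGPRs back to SGPRs via readfirstlane. A minimal IR sketch of the intended behavior:

  ; Sketch only: every active lane reads its own bit of the uniform %mask.
  ; Lane 0 sees bit 0, lane 1 sees bit 1, and so on.
  %bit = call i1 @llvm.amdgcn.inverse.ballot(i64 %mask)
  %v = select i1 %bit, float 1.0, float 0.0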
Index: test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.ll
@@ -0,0 +1,83 @@
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=CHECK %s
+
+; CHECK-LABEL: {{^}}ret:
+; CHECK: s_mov_b64 s[[MASK:\[[0-9]+:[0-9]+\]]], 1
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s[[MASK]]
+define float @ret() {
+main_body:
+  %w = call i1 @llvm.amdgcn.inverse.ballot(i64 1)
+  %r = select i1 %w, float 1.0, float 0.0
+  ret float %r
+}
+
+; Make sure it works for masks that end up in VGPRs.
+; CHECK-LABEL: {{^}}vgpr:
+; CHECK: v_readfirstlane_b32 s[[MASKLO:[0-9]+]], v0
+; CHECK-NEXT: v_readfirstlane_b32 s[[MASKHI:[0-9]+]], v1
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s{{\[}}[[MASKLO]]:[[MASKHI]]{{\]}}
define float @vgpr(i64 %v0_1) {
+  %inv = call i1 @llvm.amdgcn.inverse.ballot(i64 %v0_1)
+  %r = select i1 %inv, float 1.0, float 0.0
+  ret float %r
+}
+
+; CHECK-LABEL: {{^}}phi_uniform:
+; CHECK: s_cmp_lg_u32 s2, 0
+; CHECK: s_cbranch_scc0
+; CHECK: v_cndmask_b32_e64 v0, 0, 1.0, s[0:1]
+; CHECK: s_branch
+; CHECK: s_add_u32 s[[MASKLO:[0-9]+]], s0, 1
+; CHECK: s_addc_u32 s[[MASKHI:[0-9]+]], s1, 0
+; CHECK: v_cndmask_b32_e64 v0, 0, 1.0, s{{\[}}[[MASKLO]]:[[MASKHI]]{{\]}}
+define amdgpu_ps float @phi_uniform(i64 inreg %s0_1, i32 inreg %s2) {
+main_body:
+  %cc = icmp ne i32 %s2, 0
+  br i1 %cc, label %endif, label %if
+
+if:
+  %tmp = add i64 %s0_1, 1
+  br label %endif
+
+endif:
+  %sel = phi i64 [ %s0_1, %main_body ], [ %tmp, %if ]
+  %inv = call i1 @llvm.amdgcn.inverse.ballot(i64 %sel)
+  %r = select i1 %inv, float 1.0, float 0.0
+  ret float %r
+}
+
+; CHECK-LABEL: {{^}}phi_divergent:
+; CHECK: v_cmp_eq_u32_e32 vcc, 0, v0
+; CHECK: v_mov_b32_e32 v[[VMASKLO:[0-9]+]], s0
+; CHECK: v_mov_b32_e32 v[[VMASKHI:[0-9]+]], s1
+; CHECK: s_and_saveexec
+; CHECK: s_add_u32
+; CHECK: s_addc_u32
+; CHECK: v_mov_b32_e32 v[[VMASKLO]],
+; CHECK: v_mov_b32_e32 v[[VMASKHI]],
+; CHECK: s_or_b64 exec, exec,
+; CHECK: v_readfirstlane_b32 s[[SMASKLO:[0-9]+]], v[[VMASKLO]]
+; CHECK: v_readfirstlane_b32 s[[SMASKHI:[0-9]+]], v[[VMASKHI]]
+; CHECK: v_cndmask_b32_e64 v0, 0, 1.0, s{{\[}}[[SMASKLO]]:[[SMASKHI]]{{\]}}
+define amdgpu_ps float @phi_divergent(i32 %v0, i64 inreg %s0_1) {
+main_body:
+  %cc = icmp ne i32 %v0, 0
+  br i1 %cc, label %endif, label %if
+
+if:
+  %tmp = add i64 %s0_1, 1
+  br label %endif
+
+endif:
+  %sel = phi i64 [ %s0_1, %main_body ], [ %tmp, %if ]
+  %inv = call i1 @llvm.amdgcn.inverse.ballot(i64 %sel)
+  %r = select i1 %inv, float 1.0, float 0.0
+  ret float %r
+}
+
+declare i1 @llvm.amdgcn.inverse.ballot(i64)
+declare i64 @llvm.amdgcn.icmp.i32(i32, i32, i32)
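For context, a sketch of the intended round trip with the existing @llvm.amdgcn.icmp ballot intrinsic (declared but not exercised in the test above); predicate 33 is CmpInst::ICMP_NE. In every active lane, inverse.ballot applied to the ballot mask recovers the original per-lane condition:

  ; Round-trip sketch (assumed usage, not one of the tests above):
  ; ballot a condition into a uniform i64 mask, then reconstruct it.
  define amdgpu_ps float @round_trip(i32 %x) {
    %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %x, i32 0, i32 33) ; 33 = ne
    %recovered = call i1 @llvm.amdgcn.inverse.ballot(i64 %mask)
    %r = select i1 %recovered, float 1.0, float 0.0
    ret float %r
  }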