Index: include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- include/llvm/IR/IntrinsicsAMDGPU.td
+++ include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1282,6 +1282,9 @@
   Intrinsic<[llvm_i64_ty], [llvm_anyfloat_ty, LLVMMatchType<0>, llvm_i32_ty],
             [IntrNoMem, IntrConvergent]>;
 
+def int_amdgcn_inverse_ballot :
+  Intrinsic<[llvm_i1_ty], [llvm_i64_ty], [IntrNoMem, IntrSpeculatable]>;
+
 def int_amdgcn_readfirstlane :
   GCCBuiltin<"__builtin_amdgcn_readfirstlane">,
   Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem, IntrConvergent]>;
Index: lib/Target/AMDGPU/AMDGPUSearchableTables.td
===================================================================
--- lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -75,6 +75,7 @@
 def : SourceOfDivergence<int_amdgcn_ds_fadd>;
 def : SourceOfDivergence<int_amdgcn_ds_fmin>;
 def : SourceOfDivergence<int_amdgcn_ds_fmax>;
+def : SourceOfDivergence<int_amdgcn_inverse_ballot>;
 
 foreach intr = AMDGPUImageDimAtomicIntrinsics in
 def : SourceOfDivergence<intr>;
Index: lib/Target/AMDGPU/SIFixSGPRCopies.cpp
===================================================================
--- lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -182,7 +182,8 @@
 static bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC,
                              const TargetRegisterClass *DstRC,
                              const SIRegisterInfo &TRI) {
-  return SrcRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(DstRC) &&
+  return SrcRC != &AMDGPU::VReg_1RegClass &&
+         (TRI.isSGPRClass(DstRC) || DstRC == &AMDGPU::VReg_1RegClass) &&
          TRI.hasVGPRs(SrcRC);
 }
 
Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4376,6 +4376,16 @@
       Inst.addOperand(MachineOperand::CreateImm(BitWidth));
     }
 
+    if (Inst.isCopy() && MRI.getRegClass(Inst.getOperand(0).getReg()) ==
+                             &AMDGPU::VReg_1RegClass) {
+      // If this was a COPY from SReg64 to VReg1, as generated by
+      // @llvm.amdgcn.inverse.ballot, then we need to assume the source is
+      // uniform and insert a READFIRSTLANE.
+      unsigned sgpr = readlaneVGPRToSGPR(Inst.getOperand(1).getReg(), Inst, MRI);
+      Inst.getOperand(1).setReg(sgpr);
+      continue;
+    }
+
     bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef();
     unsigned NewDstReg = AMDGPU::NoRegister;
     if (HasDst) {
Index: lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- lib/Target/AMDGPU/SIInstructions.td
+++ lib/Target/AMDGPU/SIInstructions.td
@@ -587,6 +587,12 @@
   (COPY $src) // Return the SGPRs representing i1 src
 >;
 
+def : Pat <
+  (int_amdgcn_inverse_ballot i64:$src),
+  (COPY $src)
+>;
+
+
 //===----------------------------------------------------------------------===//
 // VOP1 Patterns
 //===----------------------------------------------------------------------===//
Index: test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.ll
@@ -0,0 +1,29 @@
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=CHECK %s
+
+; CHECK-LABEL: {{^}}ret:
+; CHECK: s_mov_b64 s[[MASK:\[[0-9]+:[0-9]+\]]], 1
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s[[MASK]]
+define amdgpu_ps float @ret() {
+main_body:
+  %w = call i1 @llvm.amdgcn.inverse.ballot(i64 1)
+  %r = select i1 %w, float 1.0, float 0.0
+  ret float %r
+}
+
+; make sure it works for things that wind up in VGPR's
+; CHECK-LABEL: {{^}}vgpr:
+; CHECK: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: v_readfirstlane_b32 s1, v1
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s[0:1]
+define amdgpu_ps float @vgpr(i32 %v0, i32 %v1) {
+  %tmp = insertelement <2 x i32> undef, i32 %v0, i32 0
+  %tmp2 = insertelement <2 x i32> %tmp, i32 %v1, i32 1
+  %v0_1 = bitcast <2 x i32> %tmp2 to i64
+  %inv = call i1 @llvm.amdgcn.inverse.ballot(i64 %v0_1)
+  %r = select i1 %inv, float 1.0, float 0.0
+  ret float %r
+}
+
+declare i1 @llvm.amdgcn.inverse.ballot(i64)
+declare i64 @llvm.amdgcn.icmp.i32(i32, i32, i32)
+