diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1662,6 +1662,10 @@
   Intrinsic<[llvm_anyint_ty], [llvm_i1_ty],
             [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
 
+def int_amdgcn_inverse_ballot : // Turns an integer lane mask into an i1.
+  Intrinsic<[llvm_i1_ty], [llvm_anyint_ty],
+            [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+
 def int_amdgcn_readfirstlane :
   ClangBuiltin<"__builtin_amdgcn_readfirstlane">,
   Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -2528,6 +2528,18 @@
   case Intrinsic::amdgcn_interp_p1_f16:
     SelectInterpP1F16(N);
     return;
+  case Intrinsic::amdgcn_inverse_ballot:
+    switch (N->getOperand(1).getValueSizeInBits()) { // Mask width in bits.
+    case 32:
+      Opcode = AMDGPU::S_INVERSE_BALLOT_U32;
+      break;
+    case 64:
+      Opcode = AMDGPU::S_INVERSE_BALLOT_U64;
+      break;
+    default:
+      llvm_unreachable("Unsupported size for inverse ballot mask.");
+    }
+    break;
   default:
     SelectCode(N);
     return;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -112,6 +112,7 @@
   bool selectDivScale(MachineInstr &MI) const;
   bool selectIntrinsicCmp(MachineInstr &MI) const;
   bool selectBallot(MachineInstr &I) const;
+  bool selectInverseBallot(MachineInstr &I) const;
   bool selectRelocConstant(MachineInstr &I) const;
   bool selectGroupStaticSize(MachineInstr &I) const;
   bool selectReturnAddress(MachineInstr &I) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1046,6 +1046,8 @@
     return selectIntrinsicCmp(I);
   case Intrinsic::amdgcn_ballot:
     return selectBallot(I);
+  case Intrinsic::amdgcn_inverse_ballot:
+    return selectInverseBallot(I);
   case Intrinsic::amdgcn_reloc_constant:
     return selectRelocConstant(I);
   case Intrinsic::amdgcn_groupstaticsize:
@@ -1351,6 +1353,52 @@
   return true;
 }
 
+/// Selects llvm.amdgcn.inverse.ballot: the wave-sized integer mask (operand 2)
+/// becomes the lane mask in operand 0; a VGPR mask is read back dword by dword.
+bool AMDGPUInstructionSelector::selectInverseBallot(MachineInstr &I) const {
+  MachineBasicBlock *BB = I.getParent();
+  const DebugLoc &DL = I.getDebugLoc();
+  const Register DstReg = I.getOperand(0).getReg();
+  const Register MaskReg = I.getOperand(2).getReg();
+
+  const unsigned Size = MRI->getType(MaskReg).getSizeInBits();
+  assert(Size == STI.getWavefrontSize() &&
+         "Mask is not the same size as wavefront.");
+
+  const RegisterBank *MaskBank = RBI.getRegBank(MaskReg, *MRI, TRI);
+  if (MaskBank->getID() != AMDGPU::VGPRRegBankID) {
+    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(MaskReg);
+    I.eraseFromParent();
+    return true;
+  }
+
+  if (Size == 32 && STI.isWave32()) {
+    // A single dword: read it straight into the destination.
+    BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
+        .addReg(MaskReg);
+    I.eraseFromParent();
+    return true;
+  }
+
+  // Read each dword into its own SGPR. The sub-register index comes from
+  // SIRegisterInfo, so nothing depends on the layout of the generated enum.
+  SmallVector<Register, 2> DstParts;
+  for (unsigned Part = 0, NumParts = Size / 32; Part != NumParts; ++Part) {
+    Register DstPart = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+    BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), DstPart)
+        .addReg(MaskReg, 0, SIRegisterInfo::getSubRegFromChannel(Part));
+    DstParts.push_back(DstPart);
+  }
+
+  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
+      .addReg(DstParts[0])
+      .addImm(AMDGPU::sub0)
+      .addReg(DstParts[1])
+      .addImm(AMDGPU::sub1);
+  I.eraseFromParent();
+  return true;
+}
+
 bool AMDGPUInstructionSelector::selectRelocConstant(MachineInstr &I) const {
   Register DstReg = I.getOperand(0).getReg();
   const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4494,6 +4494,15 @@
       OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize);
       break;
     }
+    case Intrinsic::amdgcn_inverse_ballot: {
+      // The mask is expected to be an SGPR, but a VGPR is tolerated as well.
+      const Register Mask = MI.getOperand(2).getReg();
+      const unsigned Bits = MRI.getType(Mask).getSizeInBits();
+      const unsigned Bank = getRegBankID(Mask, MRI, AMDGPU::SGPRRegBankID);
+      OpdsMapping[2] = AMDGPU::getValueMapping(Bank, Bits);
+      OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
+      break;
+    }
     }
     break;
   }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -376,6 +376,7 @@
 def : SourceOfDivergence<int_amdgcn_if>;
 def : SourceOfDivergence<int_amdgcn_else>;
 def : SourceOfDivergence<int_amdgcn_loop>;
+def : SourceOfDivergence<int_amdgcn_inverse_ballot>;
 
 foreach intr = AMDGPUImageDimAtomicIntrinsics in
 def : SourceOfDivergence<intr>;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4477,6 +4477,49 @@
 
     return BB;
   }
+  case AMDGPU::S_INVERSE_BALLOT_U32:
+  case AMDGPU::S_INVERSE_BALLOT_U64: {
+    // Expand the inverse-ballot pseudos: the result is the mask itself, first
+    // moved to SGPRs with V_READFIRSTLANE_B32 when it is in vector registers.
+    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+    const SIRegisterInfo *TRI = ST.getRegisterInfo();
+    const DebugLoc &DL = MI.getDebugLoc();
+    const Register DstReg = MI.getOperand(0).getReg();
+    const Register MaskReg = MI.getOperand(1).getReg();
+
+    if (!TRI->isVectorRegister(MRI, MaskReg)) {
+      BuildMI(*BB, &MI, DL, TII->get(AMDGPU::COPY), DstReg).addReg(MaskReg);
+      MI.eraseFromParent();
+      return BB;
+    }
+
+    if (MI.getOpcode() == AMDGPU::S_INVERSE_BALLOT_U32) {
+      // One dword: read it straight into the destination.
+      BuildMI(*BB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
+          .addReg(MaskReg);
+      MI.eraseFromParent();
+      return BB;
+    }
+
+    // 64-bit mask: read both dwords separately. The sub-register index comes
+    // from SIRegisterInfo, so nothing depends on the generated enum's layout.
+    Register Parts[2];
+    for (unsigned Idx = 0; Idx != 2; ++Idx) {
+      Parts[Idx] = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+      BuildMI(*BB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), Parts[Idx])
+          .addReg(MaskReg, 0, SIRegisterInfo::getSubRegFromChannel(Idx));
+    }
+
+    BuildMI(*BB, &MI, DL, TII->get(AMDGPU::REG_SEQUENCE), DstReg)
+        .addReg(Parts[0])
+        .addImm(AMDGPU::sub0)
+        .addReg(Parts[1])
+        .addImm(AMDGPU::sub1);
+
+    MI.eraseFromParent();
+    return BB;
+  }
   default:
     return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
   }
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -190,6 +190,12 @@
   let mayStore = 0;
 }
 
+let usesCustomInserter = 1, Defs = [VCC] in {
+def S_INVERSE_BALLOT_U32 : SPseudoInstSI <(outs SReg_32:$sdst), (ins unknown:$src)>;
+// 64-bit mask variant; both pseudos are expanded in SIISelLowering.cpp.
+def S_INVERSE_BALLOT_U64 : SPseudoInstSI <(outs SReg_64:$sdst), (ins unknown:$src)>;
+} // End usesCustomInserter = 1, Defs = [VCC]
+
 // PSEUDO_WM is treated like STRICT_WWM/STRICT_WQM without exec changes.
 def ENTER_PSEUDO_WM : SPseudoInstSI <(outs), (ins)> {
   let Uses = [EXEC];
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.inverse.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.inverse.ballot.i32.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.inverse.ballot.i32.ll
@@ -0,0 +1,199 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -global-isel -amdgpu-global-isel-risky-select -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32,-wavefrontsize64 -global-isel -amdgpu-global-isel-risky-select -verify-machineinstrs < %s | FileCheck  -check-prefixes=GFX11 %s
+
+declare i1 @llvm.amdgcn.inverse.ballot(i32)
+
+; Test inverse ballot of an all-zero mask
+define amdgpu_cs void @constant_false_inverse_ballot(ptr addrspace(1) %out) {
+; GFX10-LABEL: constant_false_inverse_ballot:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_mov_b32 s0, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: constant_false_inverse_ballot:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+entry:
+  %ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 0)
+  %sel    = select i1 %ballot, i32 1, i32 0
+  store i32 %sel, ptr addrspace(1) %out
+  ret void
+}
+
+; Test inverse ballot of an all-ones mask
+
+define amdgpu_cs void @constant_true_inverse_ballot(ptr addrspace(1) %out) {
+; GFX10-LABEL: constant_true_inverse_ballot:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_mov_b32 s0, -1
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: constant_true_inverse_ballot:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_mov_b32 s0, -1
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+entry:
+  %ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 u0xFFFFFFFF)
+  %sel    = select i1 %ballot, i32 1, i32 0
+  store i32 %sel, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_cs void @constant_mask_inverse_ballot(ptr addrspace(1) %out) {
+; GFX10-LABEL: constant_mask_inverse_ballot:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_movk_i32 s0, 0x1000
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: constant_mask_inverse_ballot:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_movk_i32 s0, 0x1000
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+entry:
+  %ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 u0x00001000)
+  %sel    = select i1 %ballot, i32 1, i32 0
+  store i32 %sel, ptr addrspace(1) %out
+  ret void
+}
+
+; Test inverse ballot using a vgpr as input
+
+define amdgpu_cs void @vgpr_inverse_ballot(i32 %input, ptr addrspace(1) %out) {
+; GFX10-LABEL: vgpr_inverse_ballot:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX10-NEXT:    global_store_dword v[1:2], v0, off
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: vgpr_inverse_ballot:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
+; GFX11-NEXT:    global_store_b32 v[1:2], v0, off
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+entry:
+  %ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 %input)
+  %sel = select i1 %ballot, i32 1, i32 0
+  store i32 %sel, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_cs void @sgpr_inverse_ballot(i32 inreg %input, ptr addrspace(1) %out) {
+; GFX10-LABEL: sgpr_inverse_ballot:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: sgpr_inverse_ballot:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+entry:
+  %ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 %input)
+  %sel = select i1 %ballot, i32 1, i32 0
+  store i32 %sel, ptr addrspace(1) %out
+  ret void
+}
+
+; Test inverse ballot after phi
+define amdgpu_cs void @phi_uniform(i32 inreg %s0_1, i32 inreg %s2, ptr addrspace(1) %out) {
+; GFX10-LABEL: phi_uniform:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX10-NEXT:    s_cbranch_scc1 .LBB5_2
+; GFX10-NEXT:  ; %bb.1: ; %if
+; GFX10-NEXT:    s_add_i32 s0, s0, 1
+; GFX10-NEXT:  .LBB5_2: ; %endif
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: phi_uniform:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-NEXT:    s_cbranch_scc1 .LBB5_2
+; GFX11-NEXT:  ; %bb.1: ; %if
+; GFX11-NEXT:    s_add_i32 s0, s0, 1
+; GFX11-NEXT:  .LBB5_2: ; %endif
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+entry:
+  %cc = icmp ne i32 %s2, 0
+  br i1 %cc, label %endif, label %if
+
+if:
+  %tmp = add i32 %s0_1, 1
+  br label %endif
+
+endif:
+  %input = phi i32 [ %s0_1, %entry], [ %tmp, %if ]
+
+  %ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 %input)
+  %sel = select i1 %ballot, i32 1, i32 0
+  store i32 %sel, ptr addrspace(1) %out
+  ret void
+}
+
+; Test for branching
+define amdgpu_cs void @inverse_ballot_branch(i32 inreg %s0_1, i32 inreg %s2, ptr addrspace(1) %out) {
+; GFX10-LABEL: inverse_ballot_branch:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_xor_b32 s2, s1, -1
+; GFX10-NEXT:    s_and_saveexec_b32 s1, s2
+; GFX10-NEXT:  ; %bb.1: ; %if
+; GFX10-NEXT:    s_add_i32 s0, s0, 1
+; GFX10-NEXT:  ; %bb.2: ; %endif
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX10-NEXT:    v_mov_b32_e32 v2, s0
+; GFX10-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: inverse_ballot_branch:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_xor_b32 s2, s1, -1
+; GFX11-NEXT:    s_and_saveexec_b32 s1, s2
+; GFX11-NEXT:  ; %bb.1: ; %if
+; GFX11-NEXT:    s_add_i32 s0, s0, 1
+; GFX11-NEXT:  ; %bb.2: ; %endif
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+entry:
+  %ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 %s2)
+  br i1 %ballot, label %endif, label %if
+
+if:
+  %tmp = add i32 %s0_1, 1
+  br label %endif
+
+endif:
+  %input = phi i32 [ %s0_1, %entry], [ %tmp, %if ]
+  store i32 %input, ptr addrspace(1) %out
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.inverse.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.inverse.ballot.i64.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.inverse.ballot.i64.ll
@@ -0,0 +1,195 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -global-isel -amdgpu-global-isel-risky-select -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=-wavefrontsize32,+wavefrontsize64 -global-isel -amdgpu-global-isel-risky-select -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+
+declare i1 @llvm.amdgcn.inverse.ballot.i64(i64)
+
+; Test inverse ballot of an all-zero mask
+define amdgpu_cs void @constant_false_inverse_ballot(ptr addrspace(1) %out) {
+; GFX10-LABEL: constant_false_inverse_ballot:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_mov_b64 s[0:1], 0
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: constant_false_inverse_ballot:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_mov_b64 s[0:1], 0
+; GFX11-NEXT:    v_mov_b32_e32 v3, 0
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX11-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+entry:
+  %ballot = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 0)
+  %sel    = select i1 %ballot, i64 1, i64 0
+  store i64 %sel, ptr addrspace(1) %out
+  ret void
+}
+
+; Test inverse ballot of an all-ones mask
+
+define amdgpu_cs void @constant_true_inverse_ballot(ptr addrspace(1) %out) {
+; GFX10-LABEL: constant_true_inverse_ballot:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_mov_b64 s[0:1], -1
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: constant_true_inverse_ballot:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_mov_b64 s[0:1], -1
+; GFX11-NEXT:    v_mov_b32_e32 v3, 0
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX11-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+entry:
+  %ballot = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 u0xFFFFFFFFFFFFFFFF)
+  %sel    = select i1 %ballot, i64 1, i64 0
+  store i64 %sel, ptr addrspace(1) %out
+  ret void
+}
+
+; Test inverse ballot using a vgpr as input
+
+define amdgpu_cs void @vgpr_inverse_ballot(i64 %input, ptr addrspace(1) %out) {
+; GFX10-LABEL: vgpr_inverse_ballot:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX10-NEXT:    global_store_dwordx2 v[2:3], v[0:1], off
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: vgpr_inverse_ballot:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX11-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+entry:
+  %ballot = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %input)
+  %sel    = select i1 %ballot, i64 1, i64 0
+  store i64 %sel, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_cs void @sgpr_inverse_ballot(i64 inreg %input, ptr addrspace(1) %out) {
+; GFX10-LABEL: sgpr_inverse_ballot:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: sgpr_inverse_ballot:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX11-NEXT:    v_mov_b32_e32 v3, 0
+; GFX11-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+entry:
+  %ballot = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %input)
+  %sel = select i1 %ballot, i64 1, i64 0
+  store i64 %sel, ptr addrspace(1) %out
+  ret void
+}
+
+; Test inverse ballot after phi
+define amdgpu_cs void @phi_uniform(i64 inreg %s0_1, i64 inreg %s2,ptr addrspace(1) %out) {
+; GFX10-LABEL: phi_uniform:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX10-NEXT:    s_cbranch_scc1 .LBB4_2
+; GFX10-NEXT:  ; %bb.1: ; %if
+; GFX10-NEXT:    s_add_u32 s0, s0, 1
+; GFX10-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-NEXT:  .LBB4_2: ; %endif
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0
+; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: phi_uniform:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX11-NEXT:    s_cbranch_scc1 .LBB4_2
+; GFX11-NEXT:  ; %bb.1: ; %if
+; GFX11-NEXT:    s_add_u32 s0, s0, 1
+; GFX11-NEXT:    s_addc_u32 s1, s1, 0
+; GFX11-NEXT:  .LBB4_2: ; %endif
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX11-NEXT:    v_mov_b32_e32 v3, 0
+; GFX11-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+entry:
+  %cc = icmp ne i64 %s2, 0
+  br i1 %cc, label %endif, label %if
+
+if:
+  %tmp = add  i64 %s0_1, 1
+  br label %endif
+
+endif:
+  %input = phi i64 [ %s0_1, %entry], [ %tmp, %if ]
+
+  %ballot = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %input)
+  %sel = select i1 %ballot, i64 1, i64 0
+  store i64 %sel, ptr addrspace(1) %out
+  ret void
+}
+
+; Test for branching
+define amdgpu_cs void @inverse_ballot_branch(i64 inreg %s0_1, i64 inreg %s2, ptr addrspace(1) %out) {
+; GFX10-LABEL: inverse_ballot_branch:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_xor_b64 s[4:5], s[2:3], -1
+; GFX10-NEXT:    s_and_saveexec_b64 s[2:3], s[4:5]
+; GFX10-NEXT:  ; %bb.1: ; %if
+; GFX10-NEXT:    s_add_u32 s0, s0, 1
+; GFX10-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-NEXT:  ; %bb.2: ; %endif
+; GFX10-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX10-NEXT:    v_mov_b32_e32 v3, s1
+; GFX10-NEXT:    v_mov_b32_e32 v2, s0
+; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: inverse_ballot_branch:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_xor_b64 s[4:5], s[2:3], -1
+; GFX11-NEXT:    s_and_saveexec_b64 s[2:3], s[4:5]
+; GFX11-NEXT:  ; %bb.1: ; %if
+; GFX11-NEXT:    s_add_u32 s0, s0, 1
+; GFX11-NEXT:    s_addc_u32 s1, s1, 0
+; GFX11-NEXT:  ; %bb.2: ; %endif
+; GFX11-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX11-NEXT:    v_mov_b32_e32 v3, s1
+; GFX11-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+entry:
+  %ballot = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %s2)
+  br i1 %ballot, label %endif, label %if
+
+if:
+  %tmp = add  i64 %s0_1, 1
+  br label %endif
+
+endif:
+  %sel = phi i64 [ %s0_1, %entry], [ %tmp, %if ]
+  store i64 %sel, ptr addrspace(1) %out
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll
@@ -0,0 +1,201 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+
+declare i1 @llvm.amdgcn.inverse.ballot(i32)
+
+; Test inverse ballot of an all-zero mask
+define amdgpu_cs void @constant_false_inverse_ballot(ptr addrspace(1) %out) {
+; GFX10-LABEL: constant_false_inverse_ballot:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_mov_b32 s0, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: constant_false_inverse_ballot:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+entry:
+  %ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 0)
+  %sel    = select i1 %ballot, i32 1, i32 0
+  store i32 %sel, ptr addrspace(1) %out
+  ret void
+}
+
+; Test inverse ballot of an all-ones mask
+
+define amdgpu_cs void @constant_true_inverse_ballot(ptr addrspace(1) %out) {
+; GFX10-LABEL: constant_true_inverse_ballot:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_mov_b32 s0, -1
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: constant_true_inverse_ballot:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_mov_b32 s0, -1
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+entry:
+  %ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 u0xFFFFFFFF)
+  %sel    = select i1 %ballot, i32 1, i32 0
+  store i32 %sel, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_cs void @constant_mask_inverse_ballot(ptr addrspace(1) %out) {
+; GFX10-LABEL: constant_mask_inverse_ballot:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_movk_i32 s0, 0x1000
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: constant_mask_inverse_ballot:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_movk_i32 s0, 0x1000
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+entry:
+  %ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 u0x00001000)
+  %sel    = select i1 %ballot, i32 1, i32 0
+  store i32 %sel, ptr addrspace(1) %out
+  ret void
+}
+
+; Test inverse ballot using a vgpr as input
+
+define amdgpu_cs void @vgpr_inverse_ballot(ptr addrspace(1) %out, i32 %input) {
+; GFX10-LABEL: vgpr_inverse_ballot:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: vgpr_inverse_ballot:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+entry:
+  %ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 %input)
+  %sel = select i1 %ballot, i32 1, i32 0
+  store i32 %sel, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_cs void @sgpr_inverse_ballot(ptr addrspace(1) %out, i32 inreg %input) {
+; GFX10-LABEL: sgpr_inverse_ballot:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: sgpr_inverse_ballot:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+entry:
+  %ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 %input)
+  %sel = select i1 %ballot, i32 1, i32 0
+  store i32 %sel, ptr addrspace(1) %out
+  ret void
+}
+
+; Test inverse ballot after phi
+define amdgpu_cs void @phi_uniform(ptr addrspace(1) %out, i32 inreg %s0_1, i32 inreg %s2) {
+; GFX10-LABEL: phi_uniform:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX10-NEXT:    s_cbranch_scc1 .LBB5_2
+; GFX10-NEXT:  ; %bb.1: ; %if
+; GFX10-NEXT:    s_add_i32 s0, s0, 1
+; GFX10-NEXT:  .LBB5_2: ; %endif
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: phi_uniform:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX11-NEXT:    s_cbranch_scc1 .LBB5_2
+; GFX11-NEXT:  ; %bb.1: ; %if
+; GFX11-NEXT:    s_add_i32 s0, s0, 1
+; GFX11-NEXT:  .LBB5_2: ; %endif
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+entry:
+  %cc = icmp ne i32 %s2, 0
+  br i1 %cc, label %endif, label %if
+
+if:
+  %tmp = add i32 %s0_1, 1
+  br label %endif
+
+endif:
+  %input = phi i32 [ %s0_1, %entry], [ %tmp, %if ]
+
+  %ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 %input)
+  %sel = select i1 %ballot, i32 1, i32 0
+  store i32 %sel, ptr addrspace(1) %out
+  ret void
+}
+
+; Test for branching
+define amdgpu_cs void @inverse_ballot_branch(ptr addrspace(1) %out, i32 inreg %s0_1, i32 inreg %s2) {
+; GFX10-LABEL: inverse_ballot_branch:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    v_mov_b32_e32 v2, s0
+; GFX10-NEXT:    s_xor_b32 s2, s1, -1
+; GFX10-NEXT:    s_and_saveexec_b32 s1, s2
+; GFX10-NEXT:  ; %bb.1: ; %if
+; GFX10-NEXT:    s_add_i32 s0, s0, 1
+; GFX10-NEXT:    v_mov_b32_e32 v2, s0
+; GFX10-NEXT:  ; %bb.2: ; %endif
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX10-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: inverse_ballot_branch:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-NEXT:    s_xor_b32 s2, s1, -1
+; GFX11-NEXT:    s_and_saveexec_b32 s1, s2
+; GFX11-NEXT:  ; %bb.1: ; %if
+; GFX11-NEXT:    s_add_i32 s0, s0, 1
+; GFX11-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-NEXT:  ; %bb.2: ; %endif
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+entry:
+  %ballot = call i1 @llvm.amdgcn.inverse.ballot(i32 %s2)
+  br i1 %ballot, label %endif, label %if
+
+if:
+  %tmp = add i32 %s0_1, 1
+  br label %endif
+
+endif:
+  %input = phi i32 [ %s0_1, %entry], [ %tmp, %if ]
+  store i32 %input, ptr addrspace(1) %out
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll
@@ -0,0 +1,210 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
+
+declare i1 @llvm.amdgcn.inverse.ballot.i64(i64)
+
+; Test inverse_ballot(0) (no mask bits set)
+define amdgpu_cs void @constant_false_inverse_ballot(ptr addrspace(1) %out) {
+; GFX10-LABEL: constant_false_inverse_ballot:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_mov_b32 s2, 0
+; GFX10-NEXT:    s_mov_b64 s[0:1], 0
+; GFX10-NEXT:    v_mov_b32_e32 v3, s2
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: constant_false_inverse_ballot:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_mov_b32 s2, 0
+; GFX11-NEXT:    s_mov_b64 s[0:1], 0
+; GFX11-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX11-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+entry:
+  %ballot = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 0)
+  %sel    = select i1 %ballot, i64 1, i64 0
+  store i64 %sel, ptr addrspace(1) %out
+  ret void
+}
+
+; Test inverse_ballot(-1) (all mask bits set)
+
+define amdgpu_cs void @constant_true_inverse_ballot(ptr addrspace(1) %out) {
+; GFX10-LABEL: constant_true_inverse_ballot:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_mov_b32 s2, 0
+; GFX10-NEXT:    s_mov_b64 s[0:1], -1
+; GFX10-NEXT:    v_mov_b32_e32 v3, s2
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: constant_true_inverse_ballot:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_mov_b32 s2, 0
+; GFX11-NEXT:    s_mov_b64 s[0:1], -1
+; GFX11-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX11-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+entry:
+  %ballot = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 u0xFFFFFFFFFFFFFFFF)
+  %sel    = select i1 %ballot, i64 1, i64 0
+  store i64 %sel, ptr addrspace(1) %out
+  ret void
+}
+
+; Test inverse ballot using a vgpr as input
+
+define amdgpu_cs void @vgpr_inverse_ballot(ptr addrspace(1) %out, i64 %input) {
+; GFX10-LABEL: vgpr_inverse_ballot:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX10-NEXT:    v_readfirstlane_b32 s1, v3
+; GFX10-NEXT:    s_mov_b32 s2, 0
+; GFX10-NEXT:    v_mov_b32_e32 v3, s2
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: vgpr_inverse_ballot:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX11-NEXT:    v_readfirstlane_b32 s1, v3
+; GFX11-NEXT:    s_mov_b32 s2, 0
+; GFX11-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX11-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+entry:
+  %ballot = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %input)
+  %sel    = select i1 %ballot, i64 1, i64 0
+  store i64 %sel, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_cs void @sgpr_inverse_ballot(ptr addrspace(1) %out, i64 inreg %input) {
+; GFX10-LABEL: sgpr_inverse_ballot:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX10-NEXT:    s_mov_b32 s0, 0
+; GFX10-NEXT:    v_mov_b32_e32 v3, s0
+; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: sgpr_inverse_ballot:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
+; GFX11-NEXT:    v_mov_b32_e32 v3, s0
+; GFX11-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+entry:
+  %ballot = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %input)
+  %sel = select i1 %ballot, i64 1, i64 0
+  store i64 %sel, ptr addrspace(1) %out
+  ret void
+}
+
+; Test inverse ballot after phi
+define amdgpu_cs void @phi_uniform(ptr addrspace(1) %out, i64 inreg %s0_1, i64 inreg %s2) {
+; GFX10-LABEL: phi_uniform:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX10-NEXT:    s_cbranch_scc1 .LBB4_2
+; GFX10-NEXT:  ; %bb.1: ; %if
+; GFX10-NEXT:    s_add_u32 s0, s0, 1
+; GFX10-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-NEXT:  .LBB4_2: ; %endif
+; GFX10-NEXT:    s_mov_b32 s2, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX10-NEXT:    v_mov_b32_e32 v3, s2
+; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: phi_uniform:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX11-NEXT:    s_cbranch_scc1 .LBB4_2
+; GFX11-NEXT:  ; %bb.1: ; %if
+; GFX11-NEXT:    s_add_u32 s0, s0, 1
+; GFX11-NEXT:    s_addc_u32 s1, s1, 0
+; GFX11-NEXT:  .LBB4_2: ; %endif
+; GFX11-NEXT:    s_mov_b32 s2, 0
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX11-NEXT:    v_mov_b32_e32 v3, s2
+; GFX11-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+entry:
+  %cc = icmp ne i64 %s2, 0
+  br i1 %cc, label %endif, label %if
+
+if:
+  %tmp = add  i64 %s0_1, 1
+  br label %endif
+
+endif:
+  %input = phi i64 [ %s0_1, %entry], [ %tmp, %if ]
+
+  %ballot = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %input)
+  %sel = select i1 %ballot, i64 1, i64 0
+  store i64 %sel, ptr addrspace(1) %out
+  ret void
+}
+
+; Test inverse ballot used for branching
+define amdgpu_cs void @inverse_ballot_branch(ptr addrspace(1) %out, i64 inreg %s0_1, i64 inreg %s2) {
+; GFX10-LABEL: inverse_ballot_branch:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    v_mov_b32_e32 v3, s1
+; GFX10-NEXT:    v_mov_b32_e32 v2, s0
+; GFX10-NEXT:    s_xor_b64 s[4:5], s[2:3], -1
+; GFX10-NEXT:    s_and_saveexec_b64 s[2:3], s[4:5]
+; GFX10-NEXT:  ; %bb.1: ; %if
+; GFX10-NEXT:    s_add_u32 s0, s0, 1
+; GFX10-NEXT:    s_addc_u32 s1, s1, 0
+; GFX10-NEXT:    v_mov_b32_e32 v3, s1
+; GFX10-NEXT:    v_mov_b32_e32 v2, s0
+; GFX10-NEXT:  ; %bb.2: ; %endif
+; GFX10-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX10-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX10-NEXT:    s_endpgm
+;
+; GFX11-LABEL: inverse_ballot_branch:
+; GFX11:       ; %bb.0: ; %entry
+; GFX11-NEXT:    v_mov_b32_e32 v3, s1
+; GFX11-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-NEXT:    s_xor_b64 s[4:5], s[2:3], -1
+; GFX11-NEXT:    s_and_saveexec_b64 s[2:3], s[4:5]
+; GFX11-NEXT:  ; %bb.1: ; %if
+; GFX11-NEXT:    s_add_u32 s0, s0, 1
+; GFX11-NEXT:    s_addc_u32 s1, s1, 0
+; GFX11-NEXT:    v_mov_b32_e32 v3, s1
+; GFX11-NEXT:    v_mov_b32_e32 v2, s0
+; GFX11-NEXT:  ; %bb.2: ; %endif
+; GFX11-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX11-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-NEXT:    s_endpgm
+entry:
+  %ballot = call i1 @llvm.amdgcn.inverse.ballot.i64(i64 %s2)
+  br i1 %ballot, label %endif, label %if
+
+if:
+  %tmp = add  i64 %s0_1, 1
+  br label %endif
+
+endif:
+  %sel = phi i64 [ %s0_1, %entry], [ %tmp, %if ]
+  store i64 %sel, ptr addrspace(1) %out
+  ret void
+}