diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -979,6 +979,11 @@ inputs. Backend will optimize out denormal scaling if marked with the :ref:`afn ` flag. + llvm.amdgcn.reduce.umin/umax Performs the reduction across the wavefront on a given + unsigned value (first operand). It takes a hint for the scan + strategy using the second operand (0 for `DPP` and 1 for + `Iterative` approach). + ========================================= ========================================================== .. TODO:: diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -1928,6 +1928,21 @@ Intrinsic<[llvm_i1_ty], [llvm_anyint_ty], [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; +def int_amdgcn_wave_reduce_umin : + Intrinsic<[llvm_i32_ty], [ + llvm_i32_ty, // llvm value to reduce + llvm_i32_ty // Strategy switch for DPP/Iterative lowering + ], + [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>]>; + +def int_amdgcn_wave_reduce_umax : + Intrinsic<[llvm_i32_ty], [ + llvm_i32_ty, // llvm value to reduce + llvm_i32_ty // Strategy switch for DPP/Iterative lowering + ], + [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree, ImmArg<ArgIndex<1>>]>; + + def int_amdgcn_readfirstlane : ClangBuiltin<"__builtin_amdgcn_readfirstlane">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty], diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4514,6 +4514,18 @@ OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize); break; } + case Intrinsic::amdgcn_wave_reduce_umin: + case Intrinsic::amdgcn_wave_reduce_umax: { + unsigned DstSize = 
MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); + unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); + OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize); + if (isSALUMapping(MI)) + OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, OpSize); + else { + OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize); + } + break; + } } break; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -4065,6 +4065,154 @@ return LoopBB; } +bool IsScanImplementedUsingDPP(const GCNSubtarget &ST) { + // TODO : Set it to true once Scan is implemented using DPP Approach. + bool isScanDPPStrategyImplemented = false; + if (ST.hasDPP() && isScanDPPStrategyImplemented) { + return true; + } + return false; +} + +// Check to see whether the intrinsic hints at the DPP scan strategy for the +// scan implementation. +bool IsImmHintDPP(unsigned ScanStratgyImm) { + ScanOptions ScanStrategy = + ScanStratgyImm == 0 ? ScanOptions::DPP : ScanOptions::Iterative; + if (ScanStrategy == ScanOptions::DPP) { + return true; + } + return false; +} + +static MachineBasicBlock *lowerReduce(MachineInstr &MI, MachineBasicBlock &BB, + const GCNSubtarget &ST, unsigned Opc) { + MachineRegisterInfo &MRI = BB.getParent()->getRegInfo(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const DebugLoc &DL = MI.getDebugLoc(); + const SIInstrInfo *TII = ST.getInstrInfo(); + + // Reduction operations depend on whether the input operand is SGPR or VGPR. + Register SrcReg = MI.getOperand(1).getReg(); + bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg)); + Register DstReg = MI.getOperand(0).getReg(); + MachineBasicBlock *RetBB = nullptr; + if (isSGPR) { + // These operations with a uniform value i.e. SGPR are idempotent. + // The reduced value will be the same as the given SGPR. 
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg); + RetBB = &BB; + } else { + // Check to see which implementation to use for Scan (DPP/Iterative) + // For now, we fall back to the iterative approach for the scan + // implementation even though the intrinsic hints at the DPP strategy. + if (IsImmHintDPP(MI.getOperand(2).getImm()) && + IsScanImplementedUsingDPP(ST)) { + // TODO: Implement DPP approach for scan. + } else { + // To reduce the VGPR using the iterative approach, we need to iterate + // over all the active lanes. Lowering consists of ComputeLoop, + // which iterates over only the active lanes. We use a copy of the EXEC + // register as the induction variable and every active lane modifies it + // using bitset0 so that we will get the next active lane for the next + // iteration. + MachineBasicBlock::iterator I = BB.end(); + Register SrcReg = MI.getOperand(1).getReg(); + + // Create Control flow for loop + MachineBasicBlock *ComputeLoop; + MachineBasicBlock *ComputeEnd; + + // Split MI's Machine Basic block into For loop + std::tie(ComputeLoop, ComputeEnd) = splitBlockForLoop(MI, BB, true); + + bool IsWave32 = ST.isWave32(); + const TargetRegisterClass *RegClass = + IsWave32 ? &AMDGPU::SReg_32RegClass : &AMDGPU::SReg_64RegClass; + + // Create Registers required for lowering. + Register LoopIterator = MRI.createVirtualRegister(RegClass); + Register InitalValReg = + MRI.createVirtualRegister(MRI.getRegClass(DstReg)); + + Register AccumulatorReg = + MRI.createVirtualRegister(MRI.getRegClass(DstReg)); + Register NewAccumulatorReg = + MRI.createVirtualRegister(MRI.getRegClass(DstReg)); + + Register ActiveBitsReg = MRI.createVirtualRegister(RegClass); + Register NewActiveBitsReg = MRI.createVirtualRegister(RegClass); + + Register FF1Reg = MRI.createVirtualRegister(MRI.getRegClass(DstReg)); + Register LaneValueReg = + MRI.createVirtualRegister(MRI.getRegClass(DstReg)); + + unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; + unsigned ExecOpc = IsWave32 ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC; + + // Create initial values of the induction variable from EXEC and the + // Accumulator, then branch to ComputeLoop. + long InitalValue = (Opc == AMDGPU::S_MIN_U32) ? UINT_MAX : 0; + auto &TmpSReg = + BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecOpc); + BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg) + .addImm(InitalValue); + BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH)).addMBB(ComputeLoop); + + // Start constructing ComputeLoop + I = ComputeLoop->end(); + auto Accumulator = + BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg) + .addReg(InitalValReg) + .addMBB(&BB); + auto ActiveBits = + BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg) + .addReg(TmpSReg->getOperand(0).getReg()) + .addMBB(&BB); + + // Perform the computations + unsigned SFFOpc = + IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64; + auto &FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg) + .addReg(ActiveBits->getOperand(0).getReg()); + auto &LaneValue = BuildMI(*ComputeLoop, I, DL, + TII->get(AMDGPU::V_READLANE_B32), LaneValueReg) + .addReg(SrcReg) + .addReg(FF1->getOperand(0).getReg()); + auto &NewAccumulator = + BuildMI(*ComputeLoop, I, DL, TII->get(Opc), NewAccumulatorReg) + .addReg(Accumulator->getOperand(0).getReg()) + .addReg(LaneValue->getOperand(0).getReg()); + + // Manipulate the iterator to get the next active lane + unsigned BITSETOpc = + IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64; + auto &NewActiveBits = + BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg) + .addReg(FF1->getOperand(0).getReg()) + .addReg(ActiveBits->getOperand(0).getReg()); + + // Add phi nodes + Accumulator.addReg(NewAccumulator->getOperand(0).getReg()) + .addMBB(ComputeLoop); + ActiveBits.addReg(NewActiveBits->getOperand(0).getReg()) + .addMBB(ComputeLoop); + + // Create the branching + unsigned CMPOpc = IsWave32 ? 
AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64; + BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc)) + .addReg(NewActiveBits->getOperand(0).getReg()) + .addImm(0); + BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1)) + .addMBB(ComputeLoop); + + MRI.replaceRegWith(DstReg, NewAccumulator->getOperand(0).getReg()); + RetBB = ComputeEnd; + } + } + MI.eraseFromParent(); + return RetBB; +} + MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( MachineInstr &MI, MachineBasicBlock *BB) const { @@ -4073,6 +4221,10 @@ SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); switch (MI.getOpcode()) { + case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32: + return lowerReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32); + case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32: + return lowerReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32); case AMDGPU::S_UADDO_PSEUDO: case AMDGPU::S_USUBO_PSEUDO: { const DebugLoc &DL = MI.getDebugLoc(); diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -258,6 +258,18 @@ } } // End Defs = [SCC] +let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in { + def WAVE_REDUCE_UMIN_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst), + (ins VSrc_b32: $src, VSrc_b32:$strategy), + [(set i32:$sdst, (int_amdgcn_wave_reduce_umin i32:$src, i32:$strategy))]> { + } + + def WAVE_REDUCE_UMAX_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst), + (ins VSrc_b32: $src, VSrc_b32:$strategy), + [(set i32:$sdst, (int_amdgcn_wave_reduce_umax i32:$src, i32:$strategy))]> { + } +} + let usesCustomInserter = 1, Defs = [VCC, EXEC] in { def V_ADD_U64_PSEUDO : VPseudoInstSI < (outs VReg_64:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1), diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll new file mode 100644 --- /dev/null +++ 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll @@ -0,0 +1,998 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc -march=amdgcn -mcpu=tonga -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8GISEL0 %s +; RUN: llc -march=amdgcn -mcpu=tonga -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8GISEL1 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9GISEL0 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9GISEL1 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10GISEL0,GFX1064GISEL0 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10GISEL1,GFX1064GISEL1 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10GISEL0,GFX1032GISEL0 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10GISEL1,GFX1032GISEL1 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11GISEL0,GFX1164GISEL0 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11GISEL1,GFX1164GISEL1 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11GISEL0,GFX1132GISEL0 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -global-isel=1 
-mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11GISEL1,GFX1132GISEL1 %s + +declare i32 @llvm.amdgcn.wave.reduce.umax(i32, i32 immarg) +declare i32 @llvm.amdgcn.workitem.id.x() + +define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { +; GFX8GISEL0-LABEL: uniform_value: +; GFX8GISEL0: ; %bb.0: ; %entry +; GFX8GISEL0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX8GISEL0-NEXT: s_load_dword s0, s[0:1], 0x2c +; GFX8GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL0-NEXT: v_mov_b32_e32 v0, s2 +; GFX8GISEL0-NEXT: v_mov_b32_e32 v1, s3 +; GFX8GISEL0-NEXT: v_mov_b32_e32 v2, s0 +; GFX8GISEL0-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL0-NEXT: s_endpgm +; +; GFX8GISEL1-LABEL: uniform_value: +; GFX8GISEL1: ; %bb.0: ; %entry +; GFX8GISEL1-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX8GISEL1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL1-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL1-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL1-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL1-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL1-NEXT: s_endpgm +; +; GFX9GISEL0-LABEL: uniform_value: +; GFX9GISEL0: ; %bb.0: ; %entry +; GFX9GISEL0-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9GISEL0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9GISEL0-NEXT: v_mov_b32_e32 v0, 0 +; GFX9GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL0-NEXT: v_mov_b32_e32 v1, s4 +; GFX9GISEL0-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9GISEL0-NEXT: s_endpgm +; +; GFX9GISEL1-LABEL: uniform_value: +; GFX9GISEL1: ; %bb.0: ; %entry +; GFX9GISEL1-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9GISEL1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9GISEL1-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL1-NEXT: v_mov_b32_e32 v0, s4 +; GFX9GISEL1-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9GISEL1-NEXT: s_endpgm +; +; GFX10GISEL0-LABEL: uniform_value: +; GFX10GISEL0: ; %bb.0: ; %entry +; 
GFX10GISEL0-NEXT: s_clause 0x1 +; GFX10GISEL0-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10GISEL0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10GISEL0-NEXT: v_mov_b32_e32 v0, 0 +; GFX10GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX10GISEL0-NEXT: v_mov_b32_e32 v1, s4 +; GFX10GISEL0-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10GISEL0-NEXT: s_endpgm +; +; GFX10GISEL1-LABEL: uniform_value: +; GFX10GISEL1: ; %bb.0: ; %entry +; GFX10GISEL1-NEXT: s_clause 0x1 +; GFX10GISEL1-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10GISEL1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10GISEL1-NEXT: v_mov_b32_e32 v1, 0 +; GFX10GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX10GISEL1-NEXT: v_mov_b32_e32 v0, s4 +; GFX10GISEL1-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10GISEL1-NEXT: s_endpgm +; +; GFX1164GISEL0-LABEL: uniform_value: +; GFX1164GISEL0: ; %bb.0: ; %entry +; GFX1164GISEL0-NEXT: s_clause 0x1 +; GFX1164GISEL0-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX1164GISEL0-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164GISEL0-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL0-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164GISEL0-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164GISEL0-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164GISEL0-NEXT: s_endpgm +; +; GFX1164GISEL1-LABEL: uniform_value: +; GFX1164GISEL1: ; %bb.0: ; %entry +; GFX1164GISEL1-NEXT: s_clause 0x1 +; GFX1164GISEL1-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX1164GISEL1-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164GISEL1-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL1-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL1-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL1-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164GISEL1-NEXT: s_endpgm +; +; GFX1132GISEL0-LABEL: uniform_value: +; GFX1132GISEL0: ; %bb.0: ; %entry +; GFX1132GISEL0-NEXT: s_clause 0x1 +; GFX1132GISEL0-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX1132GISEL0-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; 
GFX1132GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL0-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132GISEL0-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132GISEL0-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132GISEL0-NEXT: s_endpgm +; +; GFX1132GISEL1-LABEL: uniform_value: +; GFX1132GISEL1: ; %bb.0: ; %entry +; GFX1132GISEL1-NEXT: s_clause 0x1 +; GFX1132GISEL1-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX1132GISEL1-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL1-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX1132GISEL1-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL1-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132GISEL1-NEXT: s_endpgm +entry: + %result = call i32 @llvm.amdgcn.wave.reduce.umax(i32 %in, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { +; GFX8GISEL0-LABEL: const_value: +; GFX8GISEL0: ; %bb.0: ; %entry +; GFX8GISEL0-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8GISEL0-NEXT: v_mov_b32_e32 v2, 0x7b +; GFX8GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL0-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL0-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL0-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL0-NEXT: s_endpgm +; +; GFX8GISEL1-LABEL: const_value: +; GFX8GISEL1: ; %bb.0: ; %entry +; GFX8GISEL1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8GISEL1-NEXT: v_mov_b32_e32 v2, 0x7b +; GFX8GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL1-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL1-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL1-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL1-NEXT: s_endpgm +; +; GFX9GISEL0-LABEL: const_value: +; GFX9GISEL0: ; %bb.0: ; %entry +; GFX9GISEL0-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9GISEL0-NEXT: v_mov_b32_e32 v0, 0 +; GFX9GISEL0-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX9GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL0-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9GISEL0-NEXT: 
s_endpgm +; +; GFX9GISEL1-LABEL: const_value: +; GFX9GISEL1: ; %bb.0: ; %entry +; GFX9GISEL1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9GISEL1-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX9GISEL1-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL1-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL1-NEXT: s_endpgm +; +; GFX10GISEL0-LABEL: const_value: +; GFX10GISEL0: ; %bb.0: ; %entry +; GFX10GISEL0-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10GISEL0-NEXT: v_mov_b32_e32 v0, 0 +; GFX10GISEL0-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX10GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX10GISEL0-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10GISEL0-NEXT: s_endpgm +; +; GFX10GISEL1-LABEL: const_value: +; GFX10GISEL1: ; %bb.0: ; %entry +; GFX10GISEL1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10GISEL1-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX10GISEL1-NEXT: v_mov_b32_e32 v1, 0 +; GFX10GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX10GISEL1-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10GISEL1-NEXT: s_endpgm +; +; GFX1164GISEL0-LABEL: const_value: +; GFX1164GISEL0: ; %bb.0: ; %entry +; GFX1164GISEL0-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164GISEL0-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164GISEL0-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX1164GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL0-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164GISEL0-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164GISEL0-NEXT: s_endpgm +; +; GFX1164GISEL1-LABEL: const_value: +; GFX1164GISEL1: ; %bb.0: ; %entry +; GFX1164GISEL1-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164GISEL1-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX1164GISEL1-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL1-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL1-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164GISEL1-NEXT: s_endpgm +; +; GFX1132GISEL0-LABEL: const_value: +; GFX1132GISEL0: ; %bb.0: ; %entry +; GFX1132GISEL0-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132GISEL0-NEXT: 
v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7b +; GFX1132GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL0-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132GISEL0-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132GISEL0-NEXT: s_endpgm +; +; GFX1132GISEL1-LABEL: const_value: +; GFX1132GISEL1: ; %bb.0: ; %entry +; GFX1132GISEL1-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132GISEL1-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL1-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL1-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132GISEL1-NEXT: s_endpgm +entry: + %result = call i32 @llvm.amdgcn.wave.reduce.umax(i32 123, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) { +; GFX8GISEL0-LABEL: poison_value: +; GFX8GISEL0: ; %bb.0: ; %entry +; GFX8GISEL0-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL0-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL0-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL0-NEXT: flat_store_dword v[0:1], v0 +; GFX8GISEL0-NEXT: s_endpgm +; +; GFX8GISEL1-LABEL: poison_value: +; GFX8GISEL1: ; %bb.0: ; %entry +; GFX8GISEL1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL1-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL1-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL1-NEXT: flat_store_dword v[0:1], v0 +; GFX8GISEL1-NEXT: s_endpgm +; +; GFX9GISEL0-LABEL: poison_value: +; GFX9GISEL0: ; %bb.0: ; %entry +; GFX9GISEL0-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9GISEL0-NEXT: v_mov_b32_e32 v0, 0 +; GFX9GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL0-NEXT: global_store_dword v0, v0, s[0:1] +; GFX9GISEL0-NEXT: s_endpgm +; +; GFX9GISEL1-LABEL: poison_value: +; GFX9GISEL1: ; %bb.0: ; %entry +; GFX9GISEL1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9GISEL1-NEXT: v_mov_b32_e32 v0, 0 +; GFX9GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX9GISEL1-NEXT: global_store_dword v0, v0, s[0:1] +; GFX9GISEL1-NEXT: s_endpgm +; +; GFX10GISEL0-LABEL: poison_value: +; GFX10GISEL0: ; %bb.0: ; %entry +; GFX10GISEL0-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10GISEL0-NEXT: v_mov_b32_e32 v0, 0 +; GFX10GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX10GISEL0-NEXT: global_store_dword v0, v0, s[0:1] +; GFX10GISEL0-NEXT: s_endpgm +; +; GFX10GISEL1-LABEL: poison_value: +; GFX10GISEL1: ; %bb.0: ; %entry +; GFX10GISEL1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10GISEL1-NEXT: v_mov_b32_e32 v0, 0 +; GFX10GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX10GISEL1-NEXT: global_store_dword v0, v0, s[0:1] +; GFX10GISEL1-NEXT: s_endpgm +; +; GFX11GISEL0-LABEL: poison_value: +; GFX11GISEL0: ; %bb.0: ; %entry +; GFX11GISEL0-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11GISEL0-NEXT: v_mov_b32_e32 v0, 0 +; GFX11GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX11GISEL0-NEXT: global_store_b32 v0, v0, s[0:1] +; GFX11GISEL0-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11GISEL0-NEXT: s_endpgm +; +; GFX11GISEL1-LABEL: poison_value: +; GFX11GISEL1: ; %bb.0: ; %entry +; GFX11GISEL1-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11GISEL1-NEXT: v_mov_b32_e32 v0, 0 +; GFX11GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX11GISEL1-NEXT: global_store_b32 v0, v0, s[0:1] +; GFX11GISEL1-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11GISEL1-NEXT: s_endpgm +entry: + %result = call i32 @llvm.amdgcn.wave.reduce.umax(i32 poison, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { +; GFX8GISEL0-LABEL: divergent_value: +; GFX8GISEL0: ; %bb.0: ; %entry +; GFX8GISEL0-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8GISEL0-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL0-NEXT: s_mov_b32 s4, 0 +; GFX8GISEL0-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL0-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8GISEL0-NEXT: v_readlane_b32 s6, v0, s5 +; GFX8GISEL0-NEXT: s_bitset0_b64 s[2:3], s5 
+; GFX8GISEL0-NEXT: s_max_u32 s4, s4, s6 +; GFX8GISEL0-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8GISEL0-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX8GISEL0-NEXT: ; %bb.2: +; GFX8GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL0-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL0-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL0-NEXT: v_mov_b32_e32 v2, s4 +; GFX8GISEL0-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL0-NEXT: s_endpgm +; +; GFX8GISEL1-LABEL: divergent_value: +; GFX8GISEL1: ; %bb.0: ; %entry +; GFX8GISEL1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8GISEL1-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL1-NEXT: s_mov_b32 s4, 0 +; GFX8GISEL1-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL1-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8GISEL1-NEXT: v_readlane_b32 s6, v0, s5 +; GFX8GISEL1-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX8GISEL1-NEXT: s_max_u32 s4, s4, s6 +; GFX8GISEL1-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8GISEL1-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX8GISEL1-NEXT: ; %bb.2: +; GFX8GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL1-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL1-NEXT: v_mov_b32_e32 v2, s4 +; GFX8GISEL1-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL1-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL1-NEXT: s_endpgm +; +; GFX9GISEL0-LABEL: divergent_value: +; GFX9GISEL0: ; %bb.0: ; %entry +; GFX9GISEL0-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9GISEL0-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL0-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL0-NEXT: s_mov_b32 s4, 0 +; GFX9GISEL0-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL0-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9GISEL0-NEXT: v_readlane_b32 s6, v0, s5 +; GFX9GISEL0-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX9GISEL0-NEXT: s_max_u32 s4, s4, s6 +; GFX9GISEL0-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9GISEL0-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX9GISEL0-NEXT: ; %bb.2: +; GFX9GISEL0-NEXT: v_mov_b32_e32 v0, s4 +; GFX9GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL0-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL0-NEXT: s_endpgm +; +; GFX9GISEL1-LABEL: 
divergent_value: +; GFX9GISEL1: ; %bb.0: ; %entry +; GFX9GISEL1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9GISEL1-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL1-NEXT: s_mov_b32 s4, 0 +; GFX9GISEL1-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL1-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9GISEL1-NEXT: v_readlane_b32 s6, v0, s5 +; GFX9GISEL1-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX9GISEL1-NEXT: s_max_u32 s4, s4, s6 +; GFX9GISEL1-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9GISEL1-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX9GISEL1-NEXT: ; %bb.2: +; GFX9GISEL1-NEXT: v_mov_b32_e32 v0, s4 +; GFX9GISEL1-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL1-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL1-NEXT: s_endpgm +; +; GFX1064GISEL0-LABEL: divergent_value: +; GFX1064GISEL0: ; %bb.0: ; %entry +; GFX1064GISEL0-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064GISEL0-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL0-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL0-NEXT: s_mov_b32 s4, 0 +; GFX1064GISEL0-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL0-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX1064GISEL0-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1064GISEL0-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1064GISEL0-NEXT: s_max_u32 s4, s4, s6 +; GFX1064GISEL0-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064GISEL0-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1064GISEL0-NEXT: ; %bb.2: +; GFX1064GISEL0-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL0-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL0-NEXT: s_endpgm +; +; GFX1064GISEL1-LABEL: divergent_value: +; GFX1064GISEL1: ; %bb.0: ; %entry +; GFX1064GISEL1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064GISEL1-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL1-NEXT: s_mov_b32 s4, 0 +; GFX1064GISEL1-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL1-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX1064GISEL1-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1064GISEL1-NEXT: s_bitset0_b64 
s[2:3], s5 +; GFX1064GISEL1-NEXT: s_max_u32 s4, s4, s6 +; GFX1064GISEL1-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064GISEL1-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1064GISEL1-NEXT: ; %bb.2: +; GFX1064GISEL1-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064GISEL1-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL1-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL1-NEXT: s_endpgm +; +; GFX1032GISEL0-LABEL: divergent_value: +; GFX1032GISEL0: ; %bb.0: ; %entry +; GFX1032GISEL0-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032GISEL0-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL0-NEXT: s_mov_b32 s3, exec_lo +; GFX1032GISEL0-NEXT: s_mov_b32 s2, 0 +; GFX1032GISEL0-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL0-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032GISEL0-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032GISEL0-NEXT: s_bitset0_b32 s3, s4 +; GFX1032GISEL0-NEXT: s_max_u32 s2, s2, s5 +; GFX1032GISEL0-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032GISEL0-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1032GISEL0-NEXT: ; %bb.2: +; GFX1032GISEL0-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL0-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL0-NEXT: s_endpgm +; +; GFX1032GISEL1-LABEL: divergent_value: +; GFX1032GISEL1: ; %bb.0: ; %entry +; GFX1032GISEL1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032GISEL1-NEXT: s_mov_b32 s3, exec_lo +; GFX1032GISEL1-NEXT: s_mov_b32 s2, 0 +; GFX1032GISEL1-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL1-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032GISEL1-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032GISEL1-NEXT: s_bitset0_b32 s3, s4 +; GFX1032GISEL1-NEXT: s_max_u32 s2, s2, s5 +; GFX1032GISEL1-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032GISEL1-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1032GISEL1-NEXT: ; %bb.2: +; GFX1032GISEL1-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL1-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL1-NEXT: global_store_dword v1, v0, s[0:1] +; 
GFX1032GISEL1-NEXT: s_endpgm +; +; GFX1164GISEL0-LABEL: divergent_value: +; GFX1164GISEL0: ; %bb.0: ; %entry +; GFX1164GISEL0-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164GISEL0-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL0-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL0-NEXT: s_mov_b32 s4, 0 +; GFX1164GISEL0-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL0-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164GISEL0-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164GISEL0-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1164GISEL0-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1164GISEL0-NEXT: s_max_u32 s4, s4, s6 +; GFX1164GISEL0-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164GISEL0-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1164GISEL0-NEXT: ; %bb.2: +; GFX1164GISEL0-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL0-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL0-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164GISEL0-NEXT: s_endpgm +; +; GFX1164GISEL1-LABEL: divergent_value: +; GFX1164GISEL1: ; %bb.0: ; %entry +; GFX1164GISEL1-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164GISEL1-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL1-NEXT: s_mov_b32 s4, 0 +; GFX1164GISEL1-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL1-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164GISEL1-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164GISEL1-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1164GISEL1-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1164GISEL1-NEXT: s_max_u32 s4, s4, s6 +; GFX1164GISEL1-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164GISEL1-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1164GISEL1-NEXT: ; %bb.2: +; GFX1164GISEL1-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164GISEL1-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL1-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL1-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164GISEL1-NEXT: s_endpgm +; +; 
GFX1132GISEL0-LABEL: divergent_value: +; GFX1132GISEL0: ; %bb.0: ; %entry +; GFX1132GISEL0-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132GISEL0-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132GISEL0-NEXT: s_mov_b32 s3, exec_lo +; GFX1132GISEL0-NEXT: s_mov_b32 s2, 0 +; GFX1132GISEL0-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL0-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132GISEL0-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132GISEL0-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132GISEL0-NEXT: s_bitset0_b32 s3, s4 +; GFX1132GISEL0-NEXT: s_max_u32 s2, s2, s5 +; GFX1132GISEL0-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132GISEL0-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1132GISEL0-NEXT: ; %bb.2: +; GFX1132GISEL0-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL0-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL0-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132GISEL0-NEXT: s_endpgm +; +; GFX1132GISEL1-LABEL: divergent_value: +; GFX1132GISEL1: ; %bb.0: ; %entry +; GFX1132GISEL1-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132GISEL1-NEXT: s_mov_b32 s3, exec_lo +; GFX1132GISEL1-NEXT: s_mov_b32 s2, 0 +; GFX1132GISEL1-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL1-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132GISEL1-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132GISEL1-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132GISEL1-NEXT: s_bitset0_b32 s3, s4 +; GFX1132GISEL1-NEXT: s_max_u32 s2, s2, s5 +; GFX1132GISEL1-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132GISEL1-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1132GISEL1-NEXT: ; %bb.2: +; GFX1132GISEL1-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL1-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL1-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132GISEL1-NEXT: s_endpgm +entry: + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %result = call i32 
@llvm.amdgcn.wave.reduce.umax(i32 %id.x, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { +; GFX8GISEL0-LABEL: divergent_cfg: +; GFX8GISEL0: ; %bb.0: ; %entry +; GFX8GISEL0-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX8GISEL0-NEXT: ; implicit-def: $sgpr4 +; GFX8GISEL0-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8GISEL0-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8GISEL0-NEXT: s_cbranch_execz .LBB4_2 +; GFX8GISEL0-NEXT: ; %bb.1: ; %else +; GFX8GISEL0-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX8GISEL0-NEXT: ; implicit-def: $vgpr0 +; GFX8GISEL0-NEXT: .LBB4_2: ; %Flow +; GFX8GISEL0-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] +; GFX8GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL0-NEXT: v_mov_b32_e32 v1, s4 +; GFX8GISEL0-NEXT: s_xor_b64 exec, exec, s[2:3] +; GFX8GISEL0-NEXT: s_cbranch_execz .LBB4_6 +; GFX8GISEL0-NEXT: ; %bb.3: ; %if +; GFX8GISEL0-NEXT: s_mov_b64 s[4:5], exec +; GFX8GISEL0-NEXT: s_mov_b32 s6, 0 +; GFX8GISEL0-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL0-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX8GISEL0-NEXT: v_readlane_b32 s8, v0, s7 +; GFX8GISEL0-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX8GISEL0-NEXT: s_max_u32 s6, s6, s8 +; GFX8GISEL0-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8GISEL0-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX8GISEL0-NEXT: ; %bb.5: +; GFX8GISEL0-NEXT: v_mov_b32_e32 v1, s6 +; GFX8GISEL0-NEXT: .LBB4_6: ; %endif +; GFX8GISEL0-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8GISEL0-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL0-NEXT: v_mov_b32_e32 v3, s1 +; GFX8GISEL0-NEXT: v_mov_b32_e32 v2, s0 +; GFX8GISEL0-NEXT: flat_store_dword v[2:3], v1 +; GFX8GISEL0-NEXT: s_endpgm +; +; GFX8GISEL1-LABEL: divergent_cfg: +; GFX8GISEL1: ; %bb.0: ; %entry +; GFX8GISEL1-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX8GISEL1-NEXT: ; implicit-def: $sgpr6 +; GFX8GISEL1-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8GISEL1-NEXT: s_xor_b64 s[2:3], exec, 
s[2:3] +; GFX8GISEL1-NEXT: s_cbranch_execz .LBB4_2 +; GFX8GISEL1-NEXT: ; %bb.1: ; %else +; GFX8GISEL1-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX8GISEL1-NEXT: ; implicit-def: $vgpr0 +; GFX8GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL1-NEXT: s_mov_b32 s6, s4 +; GFX8GISEL1-NEXT: .LBB4_2: ; %Flow +; GFX8GISEL1-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] +; GFX8GISEL1-NEXT: s_cbranch_execz .LBB4_5 +; GFX8GISEL1-NEXT: ; %bb.3: ; %if +; GFX8GISEL1-NEXT: s_mov_b64 s[4:5], exec +; GFX8GISEL1-NEXT: s_mov_b32 s6, 0 +; GFX8GISEL1-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL1-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX8GISEL1-NEXT: v_readlane_b32 s8, v0, s7 +; GFX8GISEL1-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX8GISEL1-NEXT: s_max_u32 s6, s6, s8 +; GFX8GISEL1-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8GISEL1-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX8GISEL1-NEXT: .LBB4_5: ; %endif +; GFX8GISEL1-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8GISEL1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8GISEL1-NEXT: v_mov_b32_e32 v2, s6 +; GFX8GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL1-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL1-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL1-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL1-NEXT: s_endpgm +; +; GFX9GISEL0-LABEL: divergent_cfg: +; GFX9GISEL0: ; %bb.0: ; %entry +; GFX9GISEL0-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX9GISEL0-NEXT: ; implicit-def: $sgpr4 +; GFX9GISEL0-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9GISEL0-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9GISEL0-NEXT: s_cbranch_execz .LBB4_2 +; GFX9GISEL0-NEXT: ; %bb.1: ; %else +; GFX9GISEL0-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9GISEL0-NEXT: ; implicit-def: $vgpr0 +; GFX9GISEL0-NEXT: .LBB4_2: ; %Flow +; GFX9GISEL0-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] +; GFX9GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL0-NEXT: v_mov_b32_e32 v1, s4 +; GFX9GISEL0-NEXT: s_xor_b64 exec, exec, s[2:3] +; GFX9GISEL0-NEXT: s_cbranch_execz .LBB4_6 +; GFX9GISEL0-NEXT: ; %bb.3: ; %if +; GFX9GISEL0-NEXT: s_mov_b64 s[4:5], 
exec +; GFX9GISEL0-NEXT: s_mov_b32 s6, 0 +; GFX9GISEL0-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL0-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9GISEL0-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9GISEL0-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9GISEL0-NEXT: s_max_u32 s6, s6, s8 +; GFX9GISEL0-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9GISEL0-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX9GISEL0-NEXT: ; %bb.5: +; GFX9GISEL0-NEXT: v_mov_b32_e32 v1, s6 +; GFX9GISEL0-NEXT: .LBB4_6: ; %endif +; GFX9GISEL0-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9GISEL0-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9GISEL0-NEXT: v_mov_b32_e32 v0, 0 +; GFX9GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL0-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9GISEL0-NEXT: s_endpgm +; +; GFX9GISEL1-LABEL: divergent_cfg: +; GFX9GISEL1: ; %bb.0: ; %entry +; GFX9GISEL1-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX9GISEL1-NEXT: ; implicit-def: $sgpr6 +; GFX9GISEL1-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9GISEL1-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9GISEL1-NEXT: s_cbranch_execz .LBB4_2 +; GFX9GISEL1-NEXT: ; %bb.1: ; %else +; GFX9GISEL1-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9GISEL1-NEXT: ; implicit-def: $vgpr0 +; GFX9GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL1-NEXT: s_mov_b32 s6, s4 +; GFX9GISEL1-NEXT: .LBB4_2: ; %Flow +; GFX9GISEL1-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] +; GFX9GISEL1-NEXT: s_cbranch_execz .LBB4_5 +; GFX9GISEL1-NEXT: ; %bb.3: ; %if +; GFX9GISEL1-NEXT: s_mov_b64 s[4:5], exec +; GFX9GISEL1-NEXT: s_mov_b32 s6, 0 +; GFX9GISEL1-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL1-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9GISEL1-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9GISEL1-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9GISEL1-NEXT: s_max_u32 s6, s6, s8 +; GFX9GISEL1-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9GISEL1-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX9GISEL1-NEXT: .LBB4_5: ; %endif +; GFX9GISEL1-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9GISEL1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; 
GFX9GISEL1-NEXT: v_mov_b32_e32 v0, s6 +; GFX9GISEL1-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL1-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL1-NEXT: s_endpgm +; +; GFX1064GISEL0-LABEL: divergent_cfg: +; GFX1064GISEL0: ; %bb.0: ; %entry +; GFX1064GISEL0-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX1064GISEL0-NEXT: ; implicit-def: $sgpr4 +; GFX1064GISEL0-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064GISEL0-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064GISEL0-NEXT: s_cbranch_execz .LBB4_2 +; GFX1064GISEL0-NEXT: ; %bb.1: ; %else +; GFX1064GISEL0-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX1064GISEL0-NEXT: ; implicit-def: $vgpr0 +; GFX1064GISEL0-NEXT: .LBB4_2: ; %Flow +; GFX1064GISEL0-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] +; GFX1064GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL0-NEXT: v_mov_b32_e32 v1, s4 +; GFX1064GISEL0-NEXT: s_xor_b64 exec, exec, s[2:3] +; GFX1064GISEL0-NEXT: s_cbranch_execz .LBB4_6 +; GFX1064GISEL0-NEXT: ; %bb.3: ; %if +; GFX1064GISEL0-NEXT: s_mov_b64 s[4:5], exec +; GFX1064GISEL0-NEXT: s_mov_b32 s6, 0 +; GFX1064GISEL0-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL0-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX1064GISEL0-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1064GISEL0-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1064GISEL0-NEXT: s_max_u32 s6, s6, s8 +; GFX1064GISEL0-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064GISEL0-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1064GISEL0-NEXT: ; %bb.5: +; GFX1064GISEL0-NEXT: v_mov_b32_e32 v1, s6 +; GFX1064GISEL0-NEXT: .LBB4_6: ; %endif +; GFX1064GISEL0-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064GISEL0-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064GISEL0-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL0-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064GISEL0-NEXT: s_endpgm +; +; GFX1064GISEL1-LABEL: divergent_cfg: +; GFX1064GISEL1: ; %bb.0: ; %entry +; GFX1064GISEL1-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX1064GISEL1-NEXT: ; implicit-def: 
$sgpr6 +; GFX1064GISEL1-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064GISEL1-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064GISEL1-NEXT: s_cbranch_execz .LBB4_2 +; GFX1064GISEL1-NEXT: ; %bb.1: ; %else +; GFX1064GISEL1-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX1064GISEL1-NEXT: ; implicit-def: $vgpr0 +; GFX1064GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL1-NEXT: s_mov_b32 s6, s4 +; GFX1064GISEL1-NEXT: .LBB4_2: ; %Flow +; GFX1064GISEL1-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] +; GFX1064GISEL1-NEXT: s_cbranch_execz .LBB4_5 +; GFX1064GISEL1-NEXT: ; %bb.3: ; %if +; GFX1064GISEL1-NEXT: s_mov_b64 s[4:5], exec +; GFX1064GISEL1-NEXT: s_mov_b32 s6, 0 +; GFX1064GISEL1-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL1-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX1064GISEL1-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1064GISEL1-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1064GISEL1-NEXT: s_max_u32 s6, s6, s8 +; GFX1064GISEL1-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064GISEL1-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1064GISEL1-NEXT: .LBB4_5: ; %endif +; GFX1064GISEL1-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064GISEL1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064GISEL1-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064GISEL1-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL1-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL1-NEXT: s_endpgm +; +; GFX1032GISEL0-LABEL: divergent_cfg: +; GFX1032GISEL0: ; %bb.0: ; %entry +; GFX1032GISEL0-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0 +; GFX1032GISEL0-NEXT: ; implicit-def: $sgpr3 +; GFX1032GISEL0-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032GISEL0-NEXT: s_xor_b32 s2, exec_lo, s2 +; GFX1032GISEL0-NEXT: s_cbranch_execz .LBB4_2 +; GFX1032GISEL0-NEXT: ; %bb.1: ; %else +; GFX1032GISEL0-NEXT: s_load_dword s3, s[0:1], 0x2c +; GFX1032GISEL0-NEXT: ; implicit-def: $vgpr0 +; GFX1032GISEL0-NEXT: .LBB4_2: ; %Flow +; GFX1032GISEL0-NEXT: s_or_saveexec_b32 s2, s2 +; GFX1032GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL0-NEXT: 
v_mov_b32_e32 v1, s3 +; GFX1032GISEL0-NEXT: s_xor_b32 exec_lo, exec_lo, s2 +; GFX1032GISEL0-NEXT: s_cbranch_execz .LBB4_6 +; GFX1032GISEL0-NEXT: ; %bb.3: ; %if +; GFX1032GISEL0-NEXT: s_mov_b32 s4, exec_lo +; GFX1032GISEL0-NEXT: s_mov_b32 s3, 0 +; GFX1032GISEL0-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL0-NEXT: s_ff1_i32_b32 s5, s4 +; GFX1032GISEL0-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1032GISEL0-NEXT: s_bitset0_b32 s4, s5 +; GFX1032GISEL0-NEXT: s_max_u32 s3, s3, s6 +; GFX1032GISEL0-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032GISEL0-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1032GISEL0-NEXT: ; %bb.5: +; GFX1032GISEL0-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032GISEL0-NEXT: .LBB4_6: ; %endif +; GFX1032GISEL0-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032GISEL0-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032GISEL0-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL0-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032GISEL0-NEXT: s_endpgm +; +; GFX1032GISEL1-LABEL: divergent_cfg: +; GFX1032GISEL1: ; %bb.0: ; %entry +; GFX1032GISEL1-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0 +; GFX1032GISEL1-NEXT: ; implicit-def: $sgpr2 +; GFX1032GISEL1-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032GISEL1-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032GISEL1-NEXT: s_cbranch_execz .LBB4_2 +; GFX1032GISEL1-NEXT: ; %bb.1: ; %else +; GFX1032GISEL1-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX1032GISEL1-NEXT: ; implicit-def: $vgpr0 +; GFX1032GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL1-NEXT: s_mov_b32 s2, s2 +; GFX1032GISEL1-NEXT: .LBB4_2: ; %Flow +; GFX1032GISEL1-NEXT: s_andn2_saveexec_b32 s3, s3 +; GFX1032GISEL1-NEXT: s_cbranch_execz .LBB4_5 +; GFX1032GISEL1-NEXT: ; %bb.3: ; %if +; GFX1032GISEL1-NEXT: s_mov_b32 s4, exec_lo +; GFX1032GISEL1-NEXT: s_mov_b32 s2, 0 +; GFX1032GISEL1-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL1-NEXT: s_ff1_i32_b32 s5, s4 +; GFX1032GISEL1-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1032GISEL1-NEXT: s_bitset0_b32 s4, 
s5 +; GFX1032GISEL1-NEXT: s_max_u32 s2, s2, s6 +; GFX1032GISEL1-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032GISEL1-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1032GISEL1-NEXT: .LBB4_5: ; %endif +; GFX1032GISEL1-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032GISEL1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032GISEL1-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL1-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL1-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL1-NEXT: s_endpgm +; +; GFX1164GISEL0-LABEL: divergent_cfg: +; GFX1164GISEL0: ; %bb.0: ; %entry +; GFX1164GISEL0-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL0-NEXT: ; implicit-def: $sgpr4 +; GFX1164GISEL0-NEXT: v_cmpx_lt_u32_e32 15, v0 +; GFX1164GISEL0-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164GISEL0-NEXT: s_cbranch_execz .LBB4_2 +; GFX1164GISEL0-NEXT: ; %bb.1: ; %else +; GFX1164GISEL0-NEXT: s_load_b32 s4, s[0:1], 0x2c +; GFX1164GISEL0-NEXT: ; implicit-def: $vgpr0 +; GFX1164GISEL0-NEXT: .LBB4_2: ; %Flow +; GFX1164GISEL0-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] +; GFX1164GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL0-NEXT: v_mov_b32_e32 v1, s4 +; GFX1164GISEL0-NEXT: s_xor_b64 exec, exec, s[2:3] +; GFX1164GISEL0-NEXT: s_cbranch_execz .LBB4_6 +; GFX1164GISEL0-NEXT: ; %bb.3: ; %if +; GFX1164GISEL0-NEXT: s_mov_b64 s[4:5], exec +; GFX1164GISEL0-NEXT: s_mov_b32 s6, 0 +; GFX1164GISEL0-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL0-NEXT: s_ctz_i32_b64 s7, s[4:5] +; GFX1164GISEL0-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164GISEL0-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1164GISEL0-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1164GISEL0-NEXT: s_max_u32 s6, s6, s8 +; GFX1164GISEL0-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164GISEL0-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1164GISEL0-NEXT: ; %bb.5: +; GFX1164GISEL0-NEXT: v_mov_b32_e32 v1, s6 +; GFX1164GISEL0-NEXT: .LBB4_6: ; %endif +; GFX1164GISEL0-NEXT: s_or_b64 exec, exec, s[2:3] +; 
GFX1164GISEL0-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164GISEL0-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL0-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164GISEL0-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164GISEL0-NEXT: s_endpgm +; +; GFX1164GISEL1-LABEL: divergent_cfg: +; GFX1164GISEL1: ; %bb.0: ; %entry +; GFX1164GISEL1-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL1-NEXT: ; implicit-def: $sgpr6 +; GFX1164GISEL1-NEXT: v_cmpx_le_u32_e32 16, v0 +; GFX1164GISEL1-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164GISEL1-NEXT: s_cbranch_execz .LBB4_2 +; GFX1164GISEL1-NEXT: ; %bb.1: ; %else +; GFX1164GISEL1-NEXT: s_load_b32 s4, s[0:1], 0x2c +; GFX1164GISEL1-NEXT: ; implicit-def: $vgpr0 +; GFX1164GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL1-NEXT: s_mov_b32 s6, s4 +; GFX1164GISEL1-NEXT: .LBB4_2: ; %Flow +; GFX1164GISEL1-NEXT: s_and_not1_saveexec_b64 s[2:3], s[2:3] +; GFX1164GISEL1-NEXT: s_cbranch_execz .LBB4_5 +; GFX1164GISEL1-NEXT: ; %bb.3: ; %if +; GFX1164GISEL1-NEXT: s_mov_b64 s[4:5], exec +; GFX1164GISEL1-NEXT: s_mov_b32 s6, 0 +; GFX1164GISEL1-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL1-NEXT: s_ctz_i32_b64 s7, s[4:5] +; GFX1164GISEL1-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164GISEL1-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1164GISEL1-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1164GISEL1-NEXT: s_max_u32 s6, s6, s8 +; GFX1164GISEL1-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164GISEL1-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1164GISEL1-NEXT: .LBB4_5: ; %endif +; GFX1164GISEL1-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164GISEL1-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164GISEL1-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164GISEL1-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL1-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL1-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164GISEL1-NEXT: s_endpgm +; +; GFX1132GISEL0-LABEL: 
divergent_cfg: +; GFX1132GISEL0: ; %bb.0: ; %entry +; GFX1132GISEL0-NEXT: s_mov_b32 s2, exec_lo +; GFX1132GISEL0-NEXT: ; implicit-def: $sgpr3 +; GFX1132GISEL0-NEXT: v_cmpx_lt_u32_e32 15, v0 +; GFX1132GISEL0-NEXT: s_xor_b32 s2, exec_lo, s2 +; GFX1132GISEL0-NEXT: s_cbranch_execz .LBB4_2 +; GFX1132GISEL0-NEXT: ; %bb.1: ; %else +; GFX1132GISEL0-NEXT: s_load_b32 s3, s[0:1], 0x2c +; GFX1132GISEL0-NEXT: ; implicit-def: $vgpr0 +; GFX1132GISEL0-NEXT: .LBB4_2: ; %Flow +; GFX1132GISEL0-NEXT: s_or_saveexec_b32 s2, s2 +; GFX1132GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL0-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132GISEL0-NEXT: s_xor_b32 exec_lo, exec_lo, s2 +; GFX1132GISEL0-NEXT: s_cbranch_execz .LBB4_6 +; GFX1132GISEL0-NEXT: ; %bb.3: ; %if +; GFX1132GISEL0-NEXT: s_mov_b32 s4, exec_lo +; GFX1132GISEL0-NEXT: s_mov_b32 s3, 0 +; GFX1132GISEL0-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL0-NEXT: s_ctz_i32_b32 s5, s4 +; GFX1132GISEL0-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132GISEL0-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1132GISEL0-NEXT: s_bitset0_b32 s4, s5 +; GFX1132GISEL0-NEXT: s_max_u32 s3, s3, s6 +; GFX1132GISEL0-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1132GISEL0-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1132GISEL0-NEXT: ; %bb.5: +; GFX1132GISEL0-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132GISEL0-NEXT: .LBB4_6: ; %endif +; GFX1132GISEL0-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132GISEL0-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132GISEL0-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL0-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132GISEL0-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132GISEL0-NEXT: s_endpgm +; +; GFX1132GISEL1-LABEL: divergent_cfg: +; GFX1132GISEL1: ; %bb.0: ; %entry +; GFX1132GISEL1-NEXT: s_mov_b32 s3, exec_lo +; GFX1132GISEL1-NEXT: ; implicit-def: $sgpr2 +; GFX1132GISEL1-NEXT: v_cmpx_le_u32_e32 16, v0 +; GFX1132GISEL1-NEXT: s_xor_b32 s3, exec_lo, s3 +; 
GFX1132GISEL1-NEXT: s_cbranch_execz .LBB4_2 +; GFX1132GISEL1-NEXT: ; %bb.1: ; %else +; GFX1132GISEL1-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX1132GISEL1-NEXT: ; implicit-def: $vgpr0 +; GFX1132GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL1-NEXT: s_mov_b32 s2, s2 +; GFX1132GISEL1-NEXT: .LBB4_2: ; %Flow +; GFX1132GISEL1-NEXT: s_and_not1_saveexec_b32 s3, s3 +; GFX1132GISEL1-NEXT: s_cbranch_execz .LBB4_5 +; GFX1132GISEL1-NEXT: ; %bb.3: ; %if +; GFX1132GISEL1-NEXT: s_mov_b32 s4, exec_lo +; GFX1132GISEL1-NEXT: s_mov_b32 s2, 0 +; GFX1132GISEL1-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL1-NEXT: s_ctz_i32_b32 s5, s4 +; GFX1132GISEL1-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132GISEL1-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1132GISEL1-NEXT: s_bitset0_b32 s4, s5 +; GFX1132GISEL1-NEXT: s_max_u32 s2, s2, s6 +; GFX1132GISEL1-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1132GISEL1-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1132GISEL1-NEXT: .LBB4_5: ; %endif +; GFX1132GISEL1-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132GISEL1-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132GISEL1-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL1-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL1-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132GISEL1-NEXT: s_endpgm +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %d_cmp = icmp ult i32 %tid, 16 + br i1 %d_cmp, label %if, label %else + +if: + %reducedValTid = call i32 @llvm.amdgcn.wave.reduce.umax(i32 %tid, i32 1) + br label %endif + +else: + %reducedValIn = call i32 @llvm.amdgcn.wave.reduce.umax(i32 %in, i32 1) + br label %endif + +endif: + %combine = phi i32 [%reducedValTid, %if], [%reducedValIn, %else] + store i32 %combine, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll new file mode 100644 --- /dev/null +++ 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll @@ -0,0 +1,999 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc -march=amdgcn -mcpu=tonga -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8GISEL0 %s +; RUN: llc -march=amdgcn -mcpu=tonga -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8GISEL1 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9GISEL0 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9GISEL1 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10GISEL0,GFX1064GISEL0 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10GISEL1,GFX1064GISEL1 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10GISEL0,GFX1032GISEL0 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10GISEL1,GFX1032GISEL1 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11GISEL0,GFX1164GISEL0 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11GISEL1,GFX1164GISEL1 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11GISEL0,GFX1132GISEL0 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -global-isel=1 
-mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11GISEL1,GFX1132GISEL1 %s + + +declare i32 @llvm.amdgcn.wave.reduce.umin(i32, i32 immarg) +declare i32 @llvm.amdgcn.workitem.id.x() + +define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) { +; GFX8GISEL0-LABEL: uniform_value: +; GFX8GISEL0: ; %bb.0: ; %entry +; GFX8GISEL0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX8GISEL0-NEXT: s_load_dword s0, s[0:1], 0x2c +; GFX8GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL0-NEXT: v_mov_b32_e32 v0, s2 +; GFX8GISEL0-NEXT: v_mov_b32_e32 v1, s3 +; GFX8GISEL0-NEXT: v_mov_b32_e32 v2, s0 +; GFX8GISEL0-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL0-NEXT: s_endpgm +; +; GFX8GISEL1-LABEL: uniform_value: +; GFX8GISEL1: ; %bb.0: ; %entry +; GFX8GISEL1-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX8GISEL1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL1-NEXT: v_mov_b32_e32 v2, s2 +; GFX8GISEL1-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL1-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL1-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL1-NEXT: s_endpgm +; +; GFX9GISEL0-LABEL: uniform_value: +; GFX9GISEL0: ; %bb.0: ; %entry +; GFX9GISEL0-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9GISEL0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9GISEL0-NEXT: v_mov_b32_e32 v0, 0 +; GFX9GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL0-NEXT: v_mov_b32_e32 v1, s4 +; GFX9GISEL0-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9GISEL0-NEXT: s_endpgm +; +; GFX9GISEL1-LABEL: uniform_value: +; GFX9GISEL1: ; %bb.0: ; %entry +; GFX9GISEL1-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9GISEL1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9GISEL1-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL1-NEXT: v_mov_b32_e32 v0, s4 +; GFX9GISEL1-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9GISEL1-NEXT: s_endpgm +; +; GFX10GISEL0-LABEL: uniform_value: +; GFX10GISEL0: ; %bb.0: ; %entry +; 
GFX10GISEL0-NEXT: s_clause 0x1 +; GFX10GISEL0-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10GISEL0-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10GISEL0-NEXT: v_mov_b32_e32 v0, 0 +; GFX10GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX10GISEL0-NEXT: v_mov_b32_e32 v1, s4 +; GFX10GISEL0-NEXT: global_store_dword v0, v1, s[2:3] +; GFX10GISEL0-NEXT: s_endpgm +; +; GFX10GISEL1-LABEL: uniform_value: +; GFX10GISEL1: ; %bb.0: ; %entry +; GFX10GISEL1-NEXT: s_clause 0x1 +; GFX10GISEL1-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10GISEL1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10GISEL1-NEXT: v_mov_b32_e32 v1, 0 +; GFX10GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX10GISEL1-NEXT: v_mov_b32_e32 v0, s4 +; GFX10GISEL1-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10GISEL1-NEXT: s_endpgm +; +; GFX1164GISEL0-LABEL: uniform_value: +; GFX1164GISEL0: ; %bb.0: ; %entry +; GFX1164GISEL0-NEXT: s_clause 0x1 +; GFX1164GISEL0-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX1164GISEL0-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164GISEL0-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL0-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164GISEL0-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164GISEL0-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164GISEL0-NEXT: s_endpgm +; +; GFX1164GISEL1-LABEL: uniform_value: +; GFX1164GISEL1: ; %bb.0: ; %entry +; GFX1164GISEL1-NEXT: s_clause 0x1 +; GFX1164GISEL1-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX1164GISEL1-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164GISEL1-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL1-NEXT: v_mov_b32_e32 v0, s2 +; GFX1164GISEL1-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL1-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164GISEL1-NEXT: s_endpgm +; +; GFX1132GISEL0-LABEL: uniform_value: +; GFX1132GISEL0: ; %bb.0: ; %entry +; GFX1132GISEL0-NEXT: s_clause 0x1 +; GFX1132GISEL0-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX1132GISEL0-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; 
GFX1132GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL0-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX1132GISEL0-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132GISEL0-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132GISEL0-NEXT: s_endpgm +; +; GFX1132GISEL1-LABEL: uniform_value: +; GFX1132GISEL1: ; %bb.0: ; %entry +; GFX1132GISEL1-NEXT: s_clause 0x1 +; GFX1132GISEL1-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX1132GISEL1-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL1-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX1132GISEL1-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL1-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132GISEL1-NEXT: s_endpgm +entry: + %result = call i32 @llvm.amdgcn.wave.reduce.umin(i32 %in, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @const_value(ptr addrspace(1) %out) { +; GFX8GISEL0-LABEL: const_value: +; GFX8GISEL0: ; %bb.0: ; %entry +; GFX8GISEL0-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8GISEL0-NEXT: v_mov_b32_e32 v2, 0x7b +; GFX8GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL0-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL0-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL0-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL0-NEXT: s_endpgm +; +; GFX8GISEL1-LABEL: const_value: +; GFX8GISEL1: ; %bb.0: ; %entry +; GFX8GISEL1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8GISEL1-NEXT: v_mov_b32_e32 v2, 0x7b +; GFX8GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL1-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL1-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL1-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL1-NEXT: s_endpgm +; +; GFX9GISEL0-LABEL: const_value: +; GFX9GISEL0: ; %bb.0: ; %entry +; GFX9GISEL0-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9GISEL0-NEXT: v_mov_b32_e32 v0, 0 +; GFX9GISEL0-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX9GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL0-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9GISEL0-NEXT: 
s_endpgm +; +; GFX9GISEL1-LABEL: const_value: +; GFX9GISEL1: ; %bb.0: ; %entry +; GFX9GISEL1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9GISEL1-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX9GISEL1-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL1-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL1-NEXT: s_endpgm +; +; GFX10GISEL0-LABEL: const_value: +; GFX10GISEL0: ; %bb.0: ; %entry +; GFX10GISEL0-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10GISEL0-NEXT: v_mov_b32_e32 v0, 0 +; GFX10GISEL0-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX10GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX10GISEL0-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10GISEL0-NEXT: s_endpgm +; +; GFX10GISEL1-LABEL: const_value: +; GFX10GISEL1: ; %bb.0: ; %entry +; GFX10GISEL1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10GISEL1-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX10GISEL1-NEXT: v_mov_b32_e32 v1, 0 +; GFX10GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX10GISEL1-NEXT: global_store_dword v1, v0, s[0:1] +; GFX10GISEL1-NEXT: s_endpgm +; +; GFX1164GISEL0-LABEL: const_value: +; GFX1164GISEL0: ; %bb.0: ; %entry +; GFX1164GISEL0-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164GISEL0-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164GISEL0-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX1164GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL0-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164GISEL0-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164GISEL0-NEXT: s_endpgm +; +; GFX1164GISEL1-LABEL: const_value: +; GFX1164GISEL1: ; %bb.0: ; %entry +; GFX1164GISEL1-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164GISEL1-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX1164GISEL1-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL1-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL1-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164GISEL1-NEXT: s_endpgm +; +; GFX1132GISEL0-LABEL: const_value: +; GFX1132GISEL0: ; %bb.0: ; %entry +; GFX1132GISEL0-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132GISEL0-NEXT: 
v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7b +; GFX1132GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL0-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132GISEL0-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132GISEL0-NEXT: s_endpgm +; +; GFX1132GISEL1-LABEL: const_value: +; GFX1132GISEL1: ; %bb.0: ; %entry +; GFX1132GISEL1-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132GISEL1-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL1-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL1-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132GISEL1-NEXT: s_endpgm +entry: + %result = call i32 @llvm.amdgcn.wave.reduce.umin(i32 123, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) { +; GFX8GISEL0-LABEL: poison_value: +; GFX8GISEL0: ; %bb.0: ; %entry +; GFX8GISEL0-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL0-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL0-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL0-NEXT: flat_store_dword v[0:1], v0 +; GFX8GISEL0-NEXT: s_endpgm +; +; GFX8GISEL1-LABEL: poison_value: +; GFX8GISEL1: ; %bb.0: ; %entry +; GFX8GISEL1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL1-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL1-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL1-NEXT: flat_store_dword v[0:1], v0 +; GFX8GISEL1-NEXT: s_endpgm +; +; GFX9GISEL0-LABEL: poison_value: +; GFX9GISEL0: ; %bb.0: ; %entry +; GFX9GISEL0-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9GISEL0-NEXT: v_mov_b32_e32 v0, 0 +; GFX9GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL0-NEXT: global_store_dword v0, v0, s[0:1] +; GFX9GISEL0-NEXT: s_endpgm +; +; GFX9GISEL1-LABEL: poison_value: +; GFX9GISEL1: ; %bb.0: ; %entry +; GFX9GISEL1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9GISEL1-NEXT: v_mov_b32_e32 v0, 0 +; GFX9GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX9GISEL1-NEXT: global_store_dword v0, v0, s[0:1] +; GFX9GISEL1-NEXT: s_endpgm +; +; GFX10GISEL0-LABEL: poison_value: +; GFX10GISEL0: ; %bb.0: ; %entry +; GFX10GISEL0-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10GISEL0-NEXT: v_mov_b32_e32 v0, 0 +; GFX10GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX10GISEL0-NEXT: global_store_dword v0, v0, s[0:1] +; GFX10GISEL0-NEXT: s_endpgm +; +; GFX10GISEL1-LABEL: poison_value: +; GFX10GISEL1: ; %bb.0: ; %entry +; GFX10GISEL1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX10GISEL1-NEXT: v_mov_b32_e32 v0, 0 +; GFX10GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX10GISEL1-NEXT: global_store_dword v0, v0, s[0:1] +; GFX10GISEL1-NEXT: s_endpgm +; +; GFX11GISEL0-LABEL: poison_value: +; GFX11GISEL0: ; %bb.0: ; %entry +; GFX11GISEL0-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11GISEL0-NEXT: v_mov_b32_e32 v0, 0 +; GFX11GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX11GISEL0-NEXT: global_store_b32 v0, v0, s[0:1] +; GFX11GISEL0-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11GISEL0-NEXT: s_endpgm +; +; GFX11GISEL1-LABEL: poison_value: +; GFX11GISEL1: ; %bb.0: ; %entry +; GFX11GISEL1-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX11GISEL1-NEXT: v_mov_b32_e32 v0, 0 +; GFX11GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX11GISEL1-NEXT: global_store_b32 v0, v0, s[0:1] +; GFX11GISEL1-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11GISEL1-NEXT: s_endpgm +entry: + %result = call i32 @llvm.amdgcn.wave.reduce.umin(i32 poison, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) { +; GFX8GISEL0-LABEL: divergent_value: +; GFX8GISEL0: ; %bb.0: ; %entry +; GFX8GISEL0-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8GISEL0-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL0-NEXT: s_mov_b32 s4, -1 +; GFX8GISEL0-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL0-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8GISEL0-NEXT: v_readlane_b32 s6, v0, s5 +; GFX8GISEL0-NEXT: s_bitset0_b64 s[2:3], s5 
+; GFX8GISEL0-NEXT: s_min_u32 s4, s4, s6 +; GFX8GISEL0-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8GISEL0-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX8GISEL0-NEXT: ; %bb.2: +; GFX8GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL0-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL0-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL0-NEXT: v_mov_b32_e32 v2, s4 +; GFX8GISEL0-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL0-NEXT: s_endpgm +; +; GFX8GISEL1-LABEL: divergent_value: +; GFX8GISEL1: ; %bb.0: ; %entry +; GFX8GISEL1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8GISEL1-NEXT: s_mov_b64 s[2:3], exec +; GFX8GISEL1-NEXT: s_mov_b32 s4, -1 +; GFX8GISEL1-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL1-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX8GISEL1-NEXT: v_readlane_b32 s6, v0, s5 +; GFX8GISEL1-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX8GISEL1-NEXT: s_min_u32 s4, s4, s6 +; GFX8GISEL1-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8GISEL1-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX8GISEL1-NEXT: ; %bb.2: +; GFX8GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL1-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL1-NEXT: v_mov_b32_e32 v2, s4 +; GFX8GISEL1-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL1-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL1-NEXT: s_endpgm +; +; GFX9GISEL0-LABEL: divergent_value: +; GFX9GISEL0: ; %bb.0: ; %entry +; GFX9GISEL0-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9GISEL0-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL0-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL0-NEXT: s_mov_b32 s4, -1 +; GFX9GISEL0-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL0-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9GISEL0-NEXT: v_readlane_b32 s6, v0, s5 +; GFX9GISEL0-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX9GISEL0-NEXT: s_min_u32 s4, s4, s6 +; GFX9GISEL0-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9GISEL0-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX9GISEL0-NEXT: ; %bb.2: +; GFX9GISEL0-NEXT: v_mov_b32_e32 v0, s4 +; GFX9GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL0-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL0-NEXT: s_endpgm +; +; 
GFX9GISEL1-LABEL: divergent_value: +; GFX9GISEL1: ; %bb.0: ; %entry +; GFX9GISEL1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9GISEL1-NEXT: s_mov_b64 s[2:3], exec +; GFX9GISEL1-NEXT: s_mov_b32 s4, -1 +; GFX9GISEL1-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL1-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX9GISEL1-NEXT: v_readlane_b32 s6, v0, s5 +; GFX9GISEL1-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX9GISEL1-NEXT: s_min_u32 s4, s4, s6 +; GFX9GISEL1-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9GISEL1-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX9GISEL1-NEXT: ; %bb.2: +; GFX9GISEL1-NEXT: v_mov_b32_e32 v0, s4 +; GFX9GISEL1-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL1-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL1-NEXT: s_endpgm +; +; GFX1064GISEL0-LABEL: divergent_value: +; GFX1064GISEL0: ; %bb.0: ; %entry +; GFX1064GISEL0-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064GISEL0-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL0-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL0-NEXT: s_mov_b32 s4, -1 +; GFX1064GISEL0-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL0-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX1064GISEL0-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1064GISEL0-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1064GISEL0-NEXT: s_min_u32 s4, s4, s6 +; GFX1064GISEL0-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064GISEL0-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1064GISEL0-NEXT: ; %bb.2: +; GFX1064GISEL0-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL0-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL0-NEXT: s_endpgm +; +; GFX1064GISEL1-LABEL: divergent_value: +; GFX1064GISEL1: ; %bb.0: ; %entry +; GFX1064GISEL1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064GISEL1-NEXT: s_mov_b64 s[2:3], exec +; GFX1064GISEL1-NEXT: s_mov_b32 s4, -1 +; GFX1064GISEL1-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL1-NEXT: s_ff1_i32_b64 s5, s[2:3] +; GFX1064GISEL1-NEXT: v_readlane_b32 s6, v0, s5 +; 
GFX1064GISEL1-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1064GISEL1-NEXT: s_min_u32 s4, s4, s6 +; GFX1064GISEL1-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064GISEL1-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1064GISEL1-NEXT: ; %bb.2: +; GFX1064GISEL1-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064GISEL1-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL1-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL1-NEXT: s_endpgm +; +; GFX1032GISEL0-LABEL: divergent_value: +; GFX1032GISEL0: ; %bb.0: ; %entry +; GFX1032GISEL0-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032GISEL0-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL0-NEXT: s_mov_b32 s3, exec_lo +; GFX1032GISEL0-NEXT: s_mov_b32 s2, -1 +; GFX1032GISEL0-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL0-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032GISEL0-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032GISEL0-NEXT: s_bitset0_b32 s3, s4 +; GFX1032GISEL0-NEXT: s_min_u32 s2, s2, s5 +; GFX1032GISEL0-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032GISEL0-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1032GISEL0-NEXT: ; %bb.2: +; GFX1032GISEL0-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL0-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL0-NEXT: s_endpgm +; +; GFX1032GISEL1-LABEL: divergent_value: +; GFX1032GISEL1: ; %bb.0: ; %entry +; GFX1032GISEL1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032GISEL1-NEXT: s_mov_b32 s3, exec_lo +; GFX1032GISEL1-NEXT: s_mov_b32 s2, -1 +; GFX1032GISEL1-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL1-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1032GISEL1-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1032GISEL1-NEXT: s_bitset0_b32 s3, s4 +; GFX1032GISEL1-NEXT: s_min_u32 s2, s2, s5 +; GFX1032GISEL1-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1032GISEL1-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1032GISEL1-NEXT: ; %bb.2: +; GFX1032GISEL1-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL1-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL1-NEXT: 
global_store_dword v1, v0, s[0:1] +; GFX1032GISEL1-NEXT: s_endpgm +; +; GFX1164GISEL0-LABEL: divergent_value: +; GFX1164GISEL0: ; %bb.0: ; %entry +; GFX1164GISEL0-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164GISEL0-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL0-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL0-NEXT: s_mov_b32 s4, -1 +; GFX1164GISEL0-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL0-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164GISEL0-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164GISEL0-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1164GISEL0-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1164GISEL0-NEXT: s_min_u32 s4, s4, s6 +; GFX1164GISEL0-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164GISEL0-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1164GISEL0-NEXT: ; %bb.2: +; GFX1164GISEL0-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL0-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL0-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164GISEL0-NEXT: s_endpgm +; +; GFX1164GISEL1-LABEL: divergent_value: +; GFX1164GISEL1: ; %bb.0: ; %entry +; GFX1164GISEL1-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164GISEL1-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL1-NEXT: s_mov_b32 s4, -1 +; GFX1164GISEL1-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL1-NEXT: s_ctz_i32_b64 s5, s[2:3] +; GFX1164GISEL1-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164GISEL1-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1164GISEL1-NEXT: s_bitset0_b64 s[2:3], s5 +; GFX1164GISEL1-NEXT: s_min_u32 s4, s4, s6 +; GFX1164GISEL1-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164GISEL1-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1164GISEL1-NEXT: ; %bb.2: +; GFX1164GISEL1-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164GISEL1-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL1-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL1-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; 
GFX1164GISEL1-NEXT: s_endpgm +; +; GFX1132GISEL0-LABEL: divergent_value: +; GFX1132GISEL0: ; %bb.0: ; %entry +; GFX1132GISEL0-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132GISEL0-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132GISEL0-NEXT: s_mov_b32 s3, exec_lo +; GFX1132GISEL0-NEXT: s_mov_b32 s2, -1 +; GFX1132GISEL0-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL0-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132GISEL0-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132GISEL0-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132GISEL0-NEXT: s_bitset0_b32 s3, s4 +; GFX1132GISEL0-NEXT: s_min_u32 s2, s2, s5 +; GFX1132GISEL0-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132GISEL0-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1132GISEL0-NEXT: ; %bb.2: +; GFX1132GISEL0-NEXT: v_mov_b32_e32 v0, s2 +; GFX1132GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL0-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL0-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132GISEL0-NEXT: s_endpgm +; +; GFX1132GISEL1-LABEL: divergent_value: +; GFX1132GISEL1: ; %bb.0: ; %entry +; GFX1132GISEL1-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132GISEL1-NEXT: s_mov_b32 s3, exec_lo +; GFX1132GISEL1-NEXT: s_mov_b32 s2, -1 +; GFX1132GISEL1-NEXT: .LBB3_1: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL1-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1132GISEL1-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132GISEL1-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1132GISEL1-NEXT: s_bitset0_b32 s3, s4 +; GFX1132GISEL1-NEXT: s_min_u32 s2, s2, s5 +; GFX1132GISEL1-NEXT: s_cmp_lg_u32 s3, 0 +; GFX1132GISEL1-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1132GISEL1-NEXT: ; %bb.2: +; GFX1132GISEL1-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL1-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL1-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132GISEL1-NEXT: s_endpgm +entry: + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + 
%result = call i32 @llvm.amdgcn.wave.reduce.umin(i32 %id.x, i32 1) + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { +; GFX8GISEL0-LABEL: divergent_cfg: +; GFX8GISEL0: ; %bb.0: ; %entry +; GFX8GISEL0-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX8GISEL0-NEXT: ; implicit-def: $sgpr4 +; GFX8GISEL0-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8GISEL0-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8GISEL0-NEXT: s_cbranch_execz .LBB4_2 +; GFX8GISEL0-NEXT: ; %bb.1: ; %else +; GFX8GISEL0-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX8GISEL0-NEXT: ; implicit-def: $vgpr0 +; GFX8GISEL0-NEXT: .LBB4_2: ; %Flow +; GFX8GISEL0-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] +; GFX8GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL0-NEXT: v_mov_b32_e32 v1, s4 +; GFX8GISEL0-NEXT: s_xor_b64 exec, exec, s[2:3] +; GFX8GISEL0-NEXT: s_cbranch_execz .LBB4_6 +; GFX8GISEL0-NEXT: ; %bb.3: ; %if +; GFX8GISEL0-NEXT: s_mov_b64 s[4:5], exec +; GFX8GISEL0-NEXT: s_mov_b32 s6, -1 +; GFX8GISEL0-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL0-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX8GISEL0-NEXT: v_readlane_b32 s8, v0, s7 +; GFX8GISEL0-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX8GISEL0-NEXT: s_min_u32 s6, s6, s8 +; GFX8GISEL0-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8GISEL0-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX8GISEL0-NEXT: ; %bb.5: +; GFX8GISEL0-NEXT: v_mov_b32_e32 v1, s6 +; GFX8GISEL0-NEXT: .LBB4_6: ; %endif +; GFX8GISEL0-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8GISEL0-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL0-NEXT: v_mov_b32_e32 v3, s1 +; GFX8GISEL0-NEXT: v_mov_b32_e32 v2, s0 +; GFX8GISEL0-NEXT: flat_store_dword v[2:3], v1 +; GFX8GISEL0-NEXT: s_endpgm +; +; GFX8GISEL1-LABEL: divergent_cfg: +; GFX8GISEL1: ; %bb.0: ; %entry +; GFX8GISEL1-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX8GISEL1-NEXT: ; implicit-def: $sgpr6 +; GFX8GISEL1-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8GISEL1-NEXT: 
s_xor_b64 s[2:3], exec, s[2:3] +; GFX8GISEL1-NEXT: s_cbranch_execz .LBB4_2 +; GFX8GISEL1-NEXT: ; %bb.1: ; %else +; GFX8GISEL1-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX8GISEL1-NEXT: ; implicit-def: $vgpr0 +; GFX8GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL1-NEXT: s_mov_b32 s6, s4 +; GFX8GISEL1-NEXT: .LBB4_2: ; %Flow +; GFX8GISEL1-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] +; GFX8GISEL1-NEXT: s_cbranch_execz .LBB4_5 +; GFX8GISEL1-NEXT: ; %bb.3: ; %if +; GFX8GISEL1-NEXT: s_mov_b64 s[4:5], exec +; GFX8GISEL1-NEXT: s_mov_b32 s6, -1 +; GFX8GISEL1-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX8GISEL1-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX8GISEL1-NEXT: v_readlane_b32 s8, v0, s7 +; GFX8GISEL1-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX8GISEL1-NEXT: s_min_u32 s6, s6, s8 +; GFX8GISEL1-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX8GISEL1-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX8GISEL1-NEXT: .LBB4_5: ; %endif +; GFX8GISEL1-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8GISEL1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8GISEL1-NEXT: v_mov_b32_e32 v2, s6 +; GFX8GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX8GISEL1-NEXT: v_mov_b32_e32 v0, s0 +; GFX8GISEL1-NEXT: v_mov_b32_e32 v1, s1 +; GFX8GISEL1-NEXT: flat_store_dword v[0:1], v2 +; GFX8GISEL1-NEXT: s_endpgm +; +; GFX9GISEL0-LABEL: divergent_cfg: +; GFX9GISEL0: ; %bb.0: ; %entry +; GFX9GISEL0-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX9GISEL0-NEXT: ; implicit-def: $sgpr4 +; GFX9GISEL0-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9GISEL0-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9GISEL0-NEXT: s_cbranch_execz .LBB4_2 +; GFX9GISEL0-NEXT: ; %bb.1: ; %else +; GFX9GISEL0-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9GISEL0-NEXT: ; implicit-def: $vgpr0 +; GFX9GISEL0-NEXT: .LBB4_2: ; %Flow +; GFX9GISEL0-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] +; GFX9GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL0-NEXT: v_mov_b32_e32 v1, s4 +; GFX9GISEL0-NEXT: s_xor_b64 exec, exec, s[2:3] +; GFX9GISEL0-NEXT: s_cbranch_execz .LBB4_6 +; GFX9GISEL0-NEXT: ; %bb.3: ; %if +; 
GFX9GISEL0-NEXT: s_mov_b64 s[4:5], exec +; GFX9GISEL0-NEXT: s_mov_b32 s6, -1 +; GFX9GISEL0-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL0-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9GISEL0-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9GISEL0-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9GISEL0-NEXT: s_min_u32 s6, s6, s8 +; GFX9GISEL0-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9GISEL0-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX9GISEL0-NEXT: ; %bb.5: +; GFX9GISEL0-NEXT: v_mov_b32_e32 v1, s6 +; GFX9GISEL0-NEXT: .LBB4_6: ; %endif +; GFX9GISEL0-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9GISEL0-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9GISEL0-NEXT: v_mov_b32_e32 v0, 0 +; GFX9GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL0-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9GISEL0-NEXT: s_endpgm +; +; GFX9GISEL1-LABEL: divergent_cfg: +; GFX9GISEL1: ; %bb.0: ; %entry +; GFX9GISEL1-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; GFX9GISEL1-NEXT: ; implicit-def: $sgpr6 +; GFX9GISEL1-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9GISEL1-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9GISEL1-NEXT: s_cbranch_execz .LBB4_2 +; GFX9GISEL1-NEXT: ; %bb.1: ; %else +; GFX9GISEL1-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9GISEL1-NEXT: ; implicit-def: $vgpr0 +; GFX9GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL1-NEXT: s_mov_b32 s6, s4 +; GFX9GISEL1-NEXT: .LBB4_2: ; %Flow +; GFX9GISEL1-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] +; GFX9GISEL1-NEXT: s_cbranch_execz .LBB4_5 +; GFX9GISEL1-NEXT: ; %bb.3: ; %if +; GFX9GISEL1-NEXT: s_mov_b64 s[4:5], exec +; GFX9GISEL1-NEXT: s_mov_b32 s6, -1 +; GFX9GISEL1-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX9GISEL1-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX9GISEL1-NEXT: v_readlane_b32 s8, v0, s7 +; GFX9GISEL1-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX9GISEL1-NEXT: s_min_u32 s6, s6, s8 +; GFX9GISEL1-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX9GISEL1-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX9GISEL1-NEXT: .LBB4_5: ; %endif +; GFX9GISEL1-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9GISEL1-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x24 +; GFX9GISEL1-NEXT: v_mov_b32_e32 v0, s6 +; GFX9GISEL1-NEXT: v_mov_b32_e32 v1, 0 +; GFX9GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX9GISEL1-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9GISEL1-NEXT: s_endpgm +; +; GFX1064GISEL0-LABEL: divergent_cfg: +; GFX1064GISEL0: ; %bb.0: ; %entry +; GFX1064GISEL0-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX1064GISEL0-NEXT: ; implicit-def: $sgpr4 +; GFX1064GISEL0-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064GISEL0-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064GISEL0-NEXT: s_cbranch_execz .LBB4_2 +; GFX1064GISEL0-NEXT: ; %bb.1: ; %else +; GFX1064GISEL0-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX1064GISEL0-NEXT: ; implicit-def: $vgpr0 +; GFX1064GISEL0-NEXT: .LBB4_2: ; %Flow +; GFX1064GISEL0-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] +; GFX1064GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL0-NEXT: v_mov_b32_e32 v1, s4 +; GFX1064GISEL0-NEXT: s_xor_b64 exec, exec, s[2:3] +; GFX1064GISEL0-NEXT: s_cbranch_execz .LBB4_6 +; GFX1064GISEL0-NEXT: ; %bb.3: ; %if +; GFX1064GISEL0-NEXT: s_mov_b64 s[4:5], exec +; GFX1064GISEL0-NEXT: s_mov_b32 s6, -1 +; GFX1064GISEL0-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL0-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX1064GISEL0-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1064GISEL0-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1064GISEL0-NEXT: s_min_u32 s6, s6, s8 +; GFX1064GISEL0-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064GISEL0-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1064GISEL0-NEXT: ; %bb.5: +; GFX1064GISEL0-NEXT: v_mov_b32_e32 v1, s6 +; GFX1064GISEL0-NEXT: .LBB4_6: ; %endif +; GFX1064GISEL0-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064GISEL0-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064GISEL0-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL0-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1064GISEL0-NEXT: s_endpgm +; +; GFX1064GISEL1-LABEL: divergent_cfg: +; GFX1064GISEL1: ; %bb.0: ; %entry +; GFX1064GISEL1-NEXT: v_cmp_le_u32_e32 vcc, 16, v0 +; 
GFX1064GISEL1-NEXT: ; implicit-def: $sgpr6 +; GFX1064GISEL1-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064GISEL1-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064GISEL1-NEXT: s_cbranch_execz .LBB4_2 +; GFX1064GISEL1-NEXT: ; %bb.1: ; %else +; GFX1064GISEL1-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX1064GISEL1-NEXT: ; implicit-def: $vgpr0 +; GFX1064GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL1-NEXT: s_mov_b32 s6, s4 +; GFX1064GISEL1-NEXT: .LBB4_2: ; %Flow +; GFX1064GISEL1-NEXT: s_andn2_saveexec_b64 s[2:3], s[2:3] +; GFX1064GISEL1-NEXT: s_cbranch_execz .LBB4_5 +; GFX1064GISEL1-NEXT: ; %bb.3: ; %if +; GFX1064GISEL1-NEXT: s_mov_b64 s[4:5], exec +; GFX1064GISEL1-NEXT: s_mov_b32 s6, -1 +; GFX1064GISEL1-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1064GISEL1-NEXT: s_ff1_i32_b64 s7, s[4:5] +; GFX1064GISEL1-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1064GISEL1-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1064GISEL1-NEXT: s_min_u32 s6, s6, s8 +; GFX1064GISEL1-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1064GISEL1-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1064GISEL1-NEXT: .LBB4_5: ; %endif +; GFX1064GISEL1-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1064GISEL1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064GISEL1-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064GISEL1-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064GISEL1-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1064GISEL1-NEXT: s_endpgm +; +; GFX1032GISEL0-LABEL: divergent_cfg: +; GFX1032GISEL0: ; %bb.0: ; %entry +; GFX1032GISEL0-NEXT: v_cmp_lt_u32_e32 vcc_lo, 15, v0 +; GFX1032GISEL0-NEXT: ; implicit-def: $sgpr3 +; GFX1032GISEL0-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032GISEL0-NEXT: s_xor_b32 s2, exec_lo, s2 +; GFX1032GISEL0-NEXT: s_cbranch_execz .LBB4_2 +; GFX1032GISEL0-NEXT: ; %bb.1: ; %else +; GFX1032GISEL0-NEXT: s_load_dword s3, s[0:1], 0x2c +; GFX1032GISEL0-NEXT: ; implicit-def: $vgpr0 +; GFX1032GISEL0-NEXT: .LBB4_2: ; %Flow +; GFX1032GISEL0-NEXT: s_or_saveexec_b32 s2, s2 +; GFX1032GISEL0-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX1032GISEL0-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032GISEL0-NEXT: s_xor_b32 exec_lo, exec_lo, s2 +; GFX1032GISEL0-NEXT: s_cbranch_execz .LBB4_6 +; GFX1032GISEL0-NEXT: ; %bb.3: ; %if +; GFX1032GISEL0-NEXT: s_mov_b32 s4, exec_lo +; GFX1032GISEL0-NEXT: s_mov_b32 s3, -1 +; GFX1032GISEL0-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL0-NEXT: s_ff1_i32_b32 s5, s4 +; GFX1032GISEL0-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1032GISEL0-NEXT: s_bitset0_b32 s4, s5 +; GFX1032GISEL0-NEXT: s_min_u32 s3, s3, s6 +; GFX1032GISEL0-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032GISEL0-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1032GISEL0-NEXT: ; %bb.5: +; GFX1032GISEL0-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032GISEL0-NEXT: .LBB4_6: ; %endif +; GFX1032GISEL0-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032GISEL0-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032GISEL0-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL0-NEXT: global_store_dword v0, v1, s[0:1] +; GFX1032GISEL0-NEXT: s_endpgm +; +; GFX1032GISEL1-LABEL: divergent_cfg: +; GFX1032GISEL1: ; %bb.0: ; %entry +; GFX1032GISEL1-NEXT: v_cmp_le_u32_e32 vcc_lo, 16, v0 +; GFX1032GISEL1-NEXT: ; implicit-def: $sgpr2 +; GFX1032GISEL1-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032GISEL1-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032GISEL1-NEXT: s_cbranch_execz .LBB4_2 +; GFX1032GISEL1-NEXT: ; %bb.1: ; %else +; GFX1032GISEL1-NEXT: s_load_dword s2, s[0:1], 0x2c +; GFX1032GISEL1-NEXT: ; implicit-def: $vgpr0 +; GFX1032GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL1-NEXT: s_mov_b32 s2, s2 +; GFX1032GISEL1-NEXT: .LBB4_2: ; %Flow +; GFX1032GISEL1-NEXT: s_andn2_saveexec_b32 s3, s3 +; GFX1032GISEL1-NEXT: s_cbranch_execz .LBB4_5 +; GFX1032GISEL1-NEXT: ; %bb.3: ; %if +; GFX1032GISEL1-NEXT: s_mov_b32 s4, exec_lo +; GFX1032GISEL1-NEXT: s_mov_b32 s2, -1 +; GFX1032GISEL1-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1032GISEL1-NEXT: s_ff1_i32_b32 s5, s4 +; GFX1032GISEL1-NEXT: v_readlane_b32 s6, v0, s5 +; 
GFX1032GISEL1-NEXT: s_bitset0_b32 s4, s5 +; GFX1032GISEL1-NEXT: s_min_u32 s2, s2, s6 +; GFX1032GISEL1-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1032GISEL1-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1032GISEL1-NEXT: .LBB4_5: ; %endif +; GFX1032GISEL1-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032GISEL1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032GISEL1-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032GISEL1-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032GISEL1-NEXT: global_store_dword v1, v0, s[0:1] +; GFX1032GISEL1-NEXT: s_endpgm +; +; GFX1164GISEL0-LABEL: divergent_cfg: +; GFX1164GISEL0: ; %bb.0: ; %entry +; GFX1164GISEL0-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL0-NEXT: ; implicit-def: $sgpr4 +; GFX1164GISEL0-NEXT: v_cmpx_lt_u32_e32 15, v0 +; GFX1164GISEL0-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164GISEL0-NEXT: s_cbranch_execz .LBB4_2 +; GFX1164GISEL0-NEXT: ; %bb.1: ; %else +; GFX1164GISEL0-NEXT: s_load_b32 s4, s[0:1], 0x2c +; GFX1164GISEL0-NEXT: ; implicit-def: $vgpr0 +; GFX1164GISEL0-NEXT: .LBB4_2: ; %Flow +; GFX1164GISEL0-NEXT: s_or_saveexec_b64 s[2:3], s[2:3] +; GFX1164GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL0-NEXT: v_mov_b32_e32 v1, s4 +; GFX1164GISEL0-NEXT: s_xor_b64 exec, exec, s[2:3] +; GFX1164GISEL0-NEXT: s_cbranch_execz .LBB4_6 +; GFX1164GISEL0-NEXT: ; %bb.3: ; %if +; GFX1164GISEL0-NEXT: s_mov_b64 s[4:5], exec +; GFX1164GISEL0-NEXT: s_mov_b32 s6, -1 +; GFX1164GISEL0-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL0-NEXT: s_ctz_i32_b64 s7, s[4:5] +; GFX1164GISEL0-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164GISEL0-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1164GISEL0-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1164GISEL0-NEXT: s_min_u32 s6, s6, s8 +; GFX1164GISEL0-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164GISEL0-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1164GISEL0-NEXT: ; %bb.5: +; GFX1164GISEL0-NEXT: v_mov_b32_e32 v1, s6 +; GFX1164GISEL0-NEXT: .LBB4_6: ; %endif +; GFX1164GISEL0-NEXT: 
s_or_b64 exec, exec, s[2:3] +; GFX1164GISEL0-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164GISEL0-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL0-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1164GISEL0-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164GISEL0-NEXT: s_endpgm +; +; GFX1164GISEL1-LABEL: divergent_cfg: +; GFX1164GISEL1: ; %bb.0: ; %entry +; GFX1164GISEL1-NEXT: s_mov_b64 s[2:3], exec +; GFX1164GISEL1-NEXT: ; implicit-def: $sgpr6 +; GFX1164GISEL1-NEXT: v_cmpx_le_u32_e32 16, v0 +; GFX1164GISEL1-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164GISEL1-NEXT: s_cbranch_execz .LBB4_2 +; GFX1164GISEL1-NEXT: ; %bb.1: ; %else +; GFX1164GISEL1-NEXT: s_load_b32 s4, s[0:1], 0x2c +; GFX1164GISEL1-NEXT: ; implicit-def: $vgpr0 +; GFX1164GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL1-NEXT: s_mov_b32 s6, s4 +; GFX1164GISEL1-NEXT: .LBB4_2: ; %Flow +; GFX1164GISEL1-NEXT: s_and_not1_saveexec_b64 s[2:3], s[2:3] +; GFX1164GISEL1-NEXT: s_cbranch_execz .LBB4_5 +; GFX1164GISEL1-NEXT: ; %bb.3: ; %if +; GFX1164GISEL1-NEXT: s_mov_b64 s[4:5], exec +; GFX1164GISEL1-NEXT: s_mov_b32 s6, -1 +; GFX1164GISEL1-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1164GISEL1-NEXT: s_ctz_i32_b64 s7, s[4:5] +; GFX1164GISEL1-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1164GISEL1-NEXT: v_readlane_b32 s8, v0, s7 +; GFX1164GISEL1-NEXT: s_bitset0_b64 s[4:5], s7 +; GFX1164GISEL1-NEXT: s_min_u32 s6, s6, s8 +; GFX1164GISEL1-NEXT: s_cmp_lg_u64 s[4:5], 0 +; GFX1164GISEL1-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1164GISEL1-NEXT: .LBB4_5: ; %endif +; GFX1164GISEL1-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX1164GISEL1-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164GISEL1-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164GISEL1-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164GISEL1-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1164GISEL1-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1164GISEL1-NEXT: 
s_endpgm +; +; GFX1132GISEL0-LABEL: divergent_cfg: +; GFX1132GISEL0: ; %bb.0: ; %entry +; GFX1132GISEL0-NEXT: s_mov_b32 s2, exec_lo +; GFX1132GISEL0-NEXT: ; implicit-def: $sgpr3 +; GFX1132GISEL0-NEXT: v_cmpx_lt_u32_e32 15, v0 +; GFX1132GISEL0-NEXT: s_xor_b32 s2, exec_lo, s2 +; GFX1132GISEL0-NEXT: s_cbranch_execz .LBB4_2 +; GFX1132GISEL0-NEXT: ; %bb.1: ; %else +; GFX1132GISEL0-NEXT: s_load_b32 s3, s[0:1], 0x2c +; GFX1132GISEL0-NEXT: ; implicit-def: $vgpr0 +; GFX1132GISEL0-NEXT: .LBB4_2: ; %Flow +; GFX1132GISEL0-NEXT: s_or_saveexec_b32 s2, s2 +; GFX1132GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL0-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132GISEL0-NEXT: s_xor_b32 exec_lo, exec_lo, s2 +; GFX1132GISEL0-NEXT: s_cbranch_execz .LBB4_6 +; GFX1132GISEL0-NEXT: ; %bb.3: ; %if +; GFX1132GISEL0-NEXT: s_mov_b32 s4, exec_lo +; GFX1132GISEL0-NEXT: s_mov_b32 s3, -1 +; GFX1132GISEL0-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL0-NEXT: s_ctz_i32_b32 s5, s4 +; GFX1132GISEL0-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132GISEL0-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1132GISEL0-NEXT: s_bitset0_b32 s4, s5 +; GFX1132GISEL0-NEXT: s_min_u32 s3, s3, s6 +; GFX1132GISEL0-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1132GISEL0-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1132GISEL0-NEXT: ; %bb.5: +; GFX1132GISEL0-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132GISEL0-NEXT: .LBB4_6: ; %endif +; GFX1132GISEL0-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132GISEL0-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132GISEL0-NEXT: v_mov_b32_e32 v0, 0 +; GFX1132GISEL0-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL0-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX1132GISEL0-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132GISEL0-NEXT: s_endpgm +; +; GFX1132GISEL1-LABEL: divergent_cfg: +; GFX1132GISEL1: ; %bb.0: ; %entry +; GFX1132GISEL1-NEXT: s_mov_b32 s3, exec_lo +; GFX1132GISEL1-NEXT: ; implicit-def: $sgpr2 +; GFX1132GISEL1-NEXT: v_cmpx_le_u32_e32 16, v0 +; GFX1132GISEL1-NEXT: 
s_xor_b32 s3, exec_lo, s3 +; GFX1132GISEL1-NEXT: s_cbranch_execz .LBB4_2 +; GFX1132GISEL1-NEXT: ; %bb.1: ; %else +; GFX1132GISEL1-NEXT: s_load_b32 s2, s[0:1], 0x2c +; GFX1132GISEL1-NEXT: ; implicit-def: $vgpr0 +; GFX1132GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL1-NEXT: s_mov_b32 s2, s2 +; GFX1132GISEL1-NEXT: .LBB4_2: ; %Flow +; GFX1132GISEL1-NEXT: s_and_not1_saveexec_b32 s3, s3 +; GFX1132GISEL1-NEXT: s_cbranch_execz .LBB4_5 +; GFX1132GISEL1-NEXT: ; %bb.3: ; %if +; GFX1132GISEL1-NEXT: s_mov_b32 s4, exec_lo +; GFX1132GISEL1-NEXT: s_mov_b32 s2, -1 +; GFX1132GISEL1-NEXT: .LBB4_4: ; =>This Inner Loop Header: Depth=1 +; GFX1132GISEL1-NEXT: s_ctz_i32_b32 s5, s4 +; GFX1132GISEL1-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1132GISEL1-NEXT: v_readlane_b32 s6, v0, s5 +; GFX1132GISEL1-NEXT: s_bitset0_b32 s4, s5 +; GFX1132GISEL1-NEXT: s_min_u32 s2, s2, s6 +; GFX1132GISEL1-NEXT: s_cmp_lg_u32 s4, 0 +; GFX1132GISEL1-NEXT: s_cbranch_scc1 .LBB4_4 +; GFX1132GISEL1-NEXT: .LBB4_5: ; %endif +; GFX1132GISEL1-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1132GISEL1-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132GISEL1-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0 +; GFX1132GISEL1-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132GISEL1-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX1132GISEL1-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX1132GISEL1-NEXT: s_endpgm +entry: + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %d_cmp = icmp ult i32 %tid, 16 + br i1 %d_cmp, label %if, label %else + +if: + %reducedValTid = call i32 @llvm.amdgcn.wave.reduce.umin(i32 %tid, i32 1) + br label %endif + +else: + %reducedValIn = call i32 @llvm.amdgcn.wave.reduce.umin(i32 %in, i32 1) + br label %endif + +endif: + %combine = phi i32 [%reducedValTid, %if], [%reducedValIn, %else] + store i32 %combine, ptr addrspace(1) %out + ret void +}