Index: llvm/lib/CodeGen/RegAllocGreedy.cpp
===================================================================
--- llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -1244,6 +1244,53 @@
   return RCI.getNumAllocatableRegs(ConstrainedRC);
 }
 
+static LaneBitmask getInstReadLaneMask(const MachineRegisterInfo &MRI,
+                                       const TargetRegisterInfo &TRI,
+                                       const MachineInstr &MI, Register Reg) {
+  LaneBitmask Mask;
+  for (const MachineOperand &MO : MI.operands()) {
+    if (!MO.isReg() || MO.getReg() != Reg)
+      continue;
+
+    unsigned SubReg = MO.getSubReg();
+    if (SubReg == 0 && MO.isUse())
+      return MRI.getMaxLaneMaskForVReg(Reg);
+
+    LaneBitmask SubRegMask = TRI.getSubRegIndexLaneMask(SubReg);
+    if (MO.isDef()) {
+      if (!MO.isUndef())
+        Mask |= ~SubRegMask;
+    } else
+      Mask |= SubRegMask;
+  }
+
+  return Mask;
+}
+
+/// Return true if \p MI at \p Use reads a subset of the lanes live in \p
+/// VirtReg.
+static bool readsLaneSubset(const MachineRegisterInfo &MRI,
+                            const MachineInstr *MI, const LiveInterval &VirtReg,
+                            const TargetRegisterInfo *TRI, SlotIndex Use) {
+  // Early check the common case.
+  if (MI->isCopy() &&
+      MI->getOperand(0).getSubReg() == MI->getOperand(1).getSubReg())
+    return false;
+
+  // FIXME: We're only considering uses, but should we consider defs too?
+  LaneBitmask ReadMask = getInstReadLaneMask(MRI, *TRI, *MI, VirtReg.reg());
+
+  LaneBitmask LiveAtMask;
+  for (const LiveInterval::SubRange &S : VirtReg.subranges()) {
+    if (S.liveAt(Use))
+      LiveAtMask |= S.LaneMask;
+  }
+
+  // If the live lanes aren't different from the lanes used by the instruction,
+  // this doesn't help.
+  return (ReadMask & ~(LiveAtMask & TRI->getCoveringLanes())).any();
+}
+
 /// tryInstructionSplit - Split a live range around individual instructions.
 /// This is normally not worthwhile since the spiller is doing essentially the
 /// same thing. However, when the live range is in a constrained register
@@ -1256,8 +1303,13 @@
                                        SmallVectorImpl<Register> &NewVRegs) {
   const TargetRegisterClass *CurRC = MRI->getRegClass(VirtReg.reg());
   // There is no point to this if there are no larger sub-classes.
-  if (!RegClassInfo.isProperSubClass(CurRC))
-    return 0;
+
+  bool SplitSubClass = true;
+  if (!RegClassInfo.isProperSubClass(CurRC)) {
+    if (!VirtReg.hasSubRanges())
+      return 0;
+    SplitSubClass = false;
+  }
 
   // Always enable split spill mode, since we're effectively spilling to a
   // register.
@@ -1280,14 +1332,19 @@
   // Otherwise, splitting just inserts uncoalescable copies that do not help
   // the allocation.
   for (const SlotIndex Use : Uses) {
-    if (const MachineInstr *MI = Indexes->getInstructionFromIndex(Use))
+    if (const MachineInstr *MI = Indexes->getInstructionFromIndex(Use)) {
       if (MI->isFullCopy() ||
-          SuperRCNumAllocatableRegs ==
-              getNumAllocatableRegsForConstraints(MI, VirtReg.reg(), SuperRC,
-                                                  TII, TRI, RegClassInfo)) {
+          (SplitSubClass &&
+           SuperRCNumAllocatableRegs ==
+               getNumAllocatableRegsForConstraints(MI, VirtReg.reg(), SuperRC,
+                                                   TII, TRI, RegClassInfo)) ||
+          // TODO: Handle split for subranges with subclass constraints?
+          (!SplitSubClass && VirtReg.hasSubRanges() &&
+           !readsLaneSubset(*MRI, MI, VirtReg, TRI, Use))) {
         LLVM_DEBUG(dbgs() << "    skip:\t" << Use << '\t' << *MI);
         continue;
       }
+    }
     SE->openIntv();
     SlotIndex SegStart = SE->enterIntvBefore(Use);
     SlotIndex SegStop = SE->leaveIntvAfter(Use);
Index: llvm/test/CodeGen/AMDGPU/greedy-instruction-split-subrange.mir
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/greedy-instruction-split-subrange.mir
@@ -0,0 +1,94 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-regalloc -stress-regalloc=3 -start-before=greedy,1 -stop-before=virtregrewriter,1 -o - %s | FileCheck %s
+---
+name: split_instruction_subranges
+alignment:       1
+tracksRegLiveness: true
+frameInfo:
+  maxAlignment:    1
+  hasCalls:        true
+machineFunctionInfo:
+  maxKernArgAlign: 1
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr17' }
+  occupancy:       8
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: split_instruction_subranges
+    ; CHECK: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %1:vgpr_32, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR1:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %3:vgpr_32, 4, 0, implicit $exec :: (load (s64), addrspace 1)
+    ; CHECK-NEXT: SI_SPILL_V64_SAVE [[GLOBAL_LOAD_DWORDX2_SADDR1]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
+    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR2:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %5:vgpr_32, 8, 0, implicit $exec :: (load (s64), addrspace 1)
+    ; CHECK-NEXT: undef %9.sub1:vreg_64 = COPY [[GLOBAL_LOAD_DWORDX2_SADDR]].sub1
+    ; CHECK-NEXT: S_NOP 0, implicit %9.sub1
+    ; CHECK-NEXT: [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5)
+    ; CHECK-NEXT: undef %11.sub0:vreg_64 = COPY [[SI_SPILL_V64_RESTORE]].sub0
+    ; CHECK-NEXT: S_NOP 0, implicit %11.sub0
+    ; CHECK-NEXT: undef %7.sub1:vreg_64 = COPY [[GLOBAL_LOAD_DWORDX2_SADDR2]].sub1
+    ; CHECK-NEXT: S_NOP 0, implicit %7.sub1
+    ; CHECK-NEXT: S_ENDPGM 0
+    %1:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %4:vgpr_32, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+    %2:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %5:vgpr_32, 4, 0, implicit $exec :: (load (s64), addrspace 1)
+    %3:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %6:vgpr_32, 8, 0, implicit $exec :: (load (s64), addrspace 1)
+    S_NOP 0, implicit %1.sub1
+    S_NOP 0, implicit %2.sub0
+    S_NOP 0, implicit %3.sub1
+    S_ENDPGM 0
+
+...
+
+---
+name: split_instruction_subranges_use_is_subreg_def
+alignment:       1
+tracksRegLiveness: true
+frameInfo:
+  maxAlignment:    1
+  hasCalls:        true
+machineFunctionInfo:
+  maxKernArgAlign: 1
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr17' }
+  occupancy:       8
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: split_instruction_subranges_use_is_subreg_def
+    ; CHECK: [[GLOBAL_LOAD_DWORDX2_SADDR:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %1:vgpr_32, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR1:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %3:vgpr_32, 4, 0, implicit $exec :: (load (s64), addrspace 1)
+    ; CHECK-NEXT: SI_SPILL_V64_SAVE [[GLOBAL_LOAD_DWORDX2_SADDR1]], %stack.1, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.1, align 4, addrspace 5)
+    ; CHECK-NEXT: [[GLOBAL_LOAD_DWORDX2_SADDR2:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %5:vgpr_32, 8, 0, implicit $exec :: (load (s64), addrspace 1)
+    ; CHECK-NEXT: SI_SPILL_V64_SAVE [[GLOBAL_LOAD_DWORDX2_SADDR2]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
+    ; CHECK-NEXT: S_NOP 0, implicit-def [[GLOBAL_LOAD_DWORDX2_SADDR]].sub0
+    ; CHECK-NEXT: [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.1, align 4, addrspace 5)
+    ; CHECK-NEXT: undef %13.sub0:vreg_64 = COPY [[SI_SPILL_V64_RESTORE]].sub0
+    ; CHECK-NEXT: S_NOP 0, implicit-def %13.sub1
+    ; CHECK-NEXT: undef %15.sub0:vreg_64 = COPY %13.sub0
+    ; CHECK-NEXT: [[SI_SPILL_V64_RESTORE1:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5)
+    ; CHECK-NEXT: undef %7.sub1:vreg_64 = COPY [[SI_SPILL_V64_RESTORE1]].sub1
+    ; CHECK-NEXT: S_NOP 0, implicit-def %7.sub0
+    ; CHECK-NEXT: undef %9.sub1:vreg_64 = COPY %7.sub1
+    ; CHECK-NEXT: S_NOP 0, implicit [[GLOBAL_LOAD_DWORDX2_SADDR]].sub1
+    ; CHECK-NEXT: undef %14.sub0:vreg_64 = COPY %15.sub0
+    ; CHECK-NEXT: S_NOP 0, implicit %14.sub0
+    ; CHECK-NEXT: undef %8.sub1:vreg_64 = COPY %9.sub1
+    ; CHECK-NEXT: S_NOP 0, implicit %8.sub1
+    ; CHECK-NEXT: S_ENDPGM 0
+    %1:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %4:vgpr_32, 0, 0, implicit $exec :: (load (s64), addrspace 1)
+    %2:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %5:vgpr_32, 4, 0, implicit $exec :: (load (s64), addrspace 1)
+    %3:vreg_64 = GLOBAL_LOAD_DWORDX2_SADDR undef $sgpr0_sgpr1, undef %6:vgpr_32, 8, 0, implicit $exec :: (load (s64), addrspace 1)
+    S_NOP 0, implicit-def %1.sub0
+    S_NOP 0, implicit-def %2.sub1
+    S_NOP 0, implicit-def %3.sub0
+    S_NOP 0, implicit %1.sub1
+    S_NOP 0, implicit %2.sub0
+    S_NOP 0, implicit %3.sub1
+    S_ENDPGM 0
+
+...
Index: llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir
@@ -0,0 +1,418 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-regalloc -start-before=greedy,0 -stop-after=virtregrewriter,0  -greedy-regclass-priority-trumps-globalness=1 -o - %s | FileCheck %s
+
+# The allocation would previously fail due to poor ordering based on
+# register class. The super wide tuples should be allocated first so
+# that we don't need to try to evict them later. Currently we cannot
+# partially evict interfering register tuples.
+
+---
+name:            need_large_tuple_split
+alignment:       1
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sreg_64_xexec, preferred-register: '$vcc' }
+  - { id: 1, class: sreg_64, preferred-register: '$vcc' }
+  - { id: 2, class: sreg_64_xexec, preferred-register: '$vcc' }
+  - { id: 3, class: sreg_64, preferred-register: '$vcc' }
+  - { id: 4, class: sreg_64, preferred-register: '$vcc' }
+  - { id: 5, class: sreg_64_xexec, preferred-register: '$vcc' }
+  - { id: 6, class: sreg_64_xexec, preferred-register: '$vcc' }
+  - { id: 7, class: sreg_64_xexec, preferred-register: '$vcc' }
+  - { id: 8, class: sreg_64_xexec, preferred-register: '$vcc' }
+  - { id: 9, class: sreg_64_xexec, preferred-register: '$vcc' }
+  - { id: 10, class: sreg_64_xexec, preferred-register: '$vcc' }
+frameInfo:
+  maxAlignment:    1
+  hasCalls:        true
+machineFunctionInfo:
+  maxKernArgAlign: 1
+  isEntryFunction: true
+  scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    privateSegmentWaveByteOffset: { reg: '$sgpr17' }
+  occupancy:       8
+body:             |
+  ; CHECK-LABEL: name: need_large_tuple_split
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $sgpr33 = COPY $sgpr14
+  ; CHECK-NEXT:   renamable $sgpr34_sgpr35 = COPY $sgpr4_sgpr5
+  ; CHECK-NEXT:   renamable $sgpr12_sgpr13 = V_CMP_GT_I32_e64 1, undef %18:vgpr_32, implicit $exec
+  ; CHECK-NEXT:   renamable $sgpr18_sgpr19 = V_CMP_EQ_U32_e64 0, undef %18:vgpr_32, implicit $exec
+  ; CHECK-NEXT:   renamable $sgpr20_sgpr21 = V_CMP_NE_U32_e64 0, undef %18:vgpr_32, implicit $exec
+  ; CHECK-NEXT:   renamable $sgpr22_sgpr23 = V_CMP_GT_I32_e64 0, undef %18:vgpr_32, implicit $exec
+  ; CHECK-NEXT:   renamable $sgpr52 = S_MOV_B32 0
+  ; CHECK-NEXT:   renamable $sgpr24_sgpr25 = V_CMP_EQ_U32_e64 undef $sgpr4, undef %18:vgpr_32, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:vreg_1024_align2 = COPY renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67, implicit $exec
+  ; CHECK-NEXT:   renamable $sgpr100_sgpr101 = V_CMP_NE_U32_e64 1, undef %18:vgpr_32, implicit $exec
+  ; CHECK-NEXT:   renamable $sgpr53 = S_MOV_B32 1083786240
+  ; CHECK-NEXT:   SI_SPILL_S1024_SAVE renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67, %stack.1, implicit $exec, implicit $sgpr32 :: (store (s1024) into %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   S_BRANCH %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.17(0x40000000)
+  ; CHECK-NEXT:   liveins: $sgpr15, $sgpr16, $sgpr33, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr100_sgpr101
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $vcc = S_AND_B64 $exec, renamable $sgpr100_sgpr101, implicit-def dead $scc
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vreg_1024_align2 = COPY [[COPY]]
+  ; CHECK-NEXT:   S_CBRANCH_VCCNZ %bb.2, implicit $vcc
+  ; CHECK-NEXT:   S_BRANCH %bb.17
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.11(0x40000000), %bb.5(0x40000000)
+  ; CHECK-NEXT:   liveins: $sgpr15, $sgpr16, $sgpr33, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr100_sgpr101
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr52 = COPY renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr53 = COPY renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr54 = COPY renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr55 = COPY renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr56 = COPY renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr57 = COPY renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr58 = COPY renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr59 = COPY renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr60 = COPY renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr61 = COPY renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr62 = COPY renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr63 = COPY renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr64 = COPY renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr65 = COPY renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr66 = COPY renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr67 = COPY killed renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr68 = COPY killed renamable $sgpr84
+  ; CHECK-NEXT:   renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = COPY killed renamable $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67
+  ; CHECK-NEXT:   renamable $sgpr52 = COPY renamable $sgpr68
+  ; CHECK-NEXT:   renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr53 = COPY killed renamable $sgpr72
+  ; CHECK-NEXT:   renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr54 = COPY killed renamable $sgpr72
+  ; CHECK-NEXT:   renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr55 = COPY killed renamable $sgpr72
+  ; CHECK-NEXT:   renamable $sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr56 = COPY killed renamable $sgpr72
+  ; CHECK-NEXT:   renamable $sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr57 = COPY killed renamable $sgpr76
+  ; CHECK-NEXT:   renamable $sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr58 = COPY killed renamable $sgpr76
+  ; CHECK-NEXT:   renamable $sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr59 = COPY killed renamable $sgpr76
+  ; CHECK-NEXT:   renamable $sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr60 = COPY killed renamable $sgpr76
+  ; CHECK-NEXT:   renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr61 = COPY killed renamable $sgpr80
+  ; CHECK-NEXT:   renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr62 = COPY killed renamable $sgpr80
+  ; CHECK-NEXT:   renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr63 = COPY killed renamable $sgpr80
+  ; CHECK-NEXT:   renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr64 = COPY killed renamable $sgpr80
+  ; CHECK-NEXT:   renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr65 = COPY killed renamable $sgpr84
+  ; CHECK-NEXT:   renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr66 = COPY killed renamable $sgpr84
+  ; CHECK-NEXT:   renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95_sgpr96_sgpr97_sgpr98_sgpr99 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   renamable $sgpr67 = COPY killed renamable $sgpr84
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_1024_align2 = COPY killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67, implicit $exec
+  ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.11, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.4(0x80000000)
+  ; CHECK-NEXT:   liveins: $sgpr15, $sgpr16, $sgpr33
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $sgpr60 = COPY killed renamable $sgpr33
+  ; CHECK-NEXT:   renamable $sgpr62 = COPY killed renamable $sgpr15
+  ; CHECK-NEXT:   SI_SPILL_S32_SAVE killed renamable $sgpr16, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s32) into %stack.0, addrspace 5)
+  ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; CHECK-NEXT:   dead $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr4_sgpr5, 0, CustomRegMask($sgpr60,$sgpr62)
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.17(0x80000000)
+  ; CHECK-NEXT:   liveins: $sgpr60, $sgpr62
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; CHECK-NEXT:   $sgpr12 = COPY killed renamable $sgpr60
+  ; CHECK-NEXT:   $sgpr13 = COPY killed renamable $sgpr62
+  ; CHECK-NEXT:   $sgpr14 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s32) from %stack.0, addrspace 5)
+  ; CHECK-NEXT:   dead $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr4_sgpr5, 0, csr_amdgpu_noregs, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; CHECK-NEXT:   S_BRANCH %bb.17
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  ; CHECK-NEXT:   successors: %bb.12(0x40000000), %bb.6(0x40000000)
+  ; CHECK-NEXT:   liveins: $sgpr15, $sgpr16, $sgpr33, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr100_sgpr101
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $sgpr4_sgpr5 = S_AND_B64 renamable $sgpr20_sgpr21, undef renamable $sgpr88_sgpr89, implicit-def dead $scc
+  ; CHECK-NEXT:   renamable $sgpr88_sgpr89 = V_CMP_GT_I32_e64 0, undef %18:vgpr_32, implicit $exec
+  ; CHECK-NEXT:   $exec = S_MOV_B64_term killed renamable $sgpr4_sgpr5
+  ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.12, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.6:
+  ; CHECK-NEXT:   successors: %bb.7(0x80000000)
+  ; CHECK-NEXT:   liveins: $sgpr15, $sgpr16, $sgpr33, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr88_sgpr89, $sgpr100_sgpr101
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   dead %27:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr22_sgpr23, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.7:
+  ; CHECK-NEXT:   successors: %bb.8(0x80000000)
+  ; CHECK-NEXT:   liveins: $sgpr15, $sgpr16, $sgpr33, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr88_sgpr89, $sgpr100_sgpr101
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $sgpr90_sgpr91 = nofpexcept V_CMP_NLT_F64_e64 0, undef $sgpr4_sgpr5, 0, undef %29:vreg_64_align2, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   renamable $sgpr92_sgpr93 = nofpexcept V_CMP_NLT_F64_e64 0, 4607182418800017408, 0, undef %29:vreg_64_align2, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   dead %30:vgpr_32 = V_INDIRECT_REG_READ_GPR_IDX_B32_V32 [[COPY1]], undef $sgpr33, 11, implicit-def $m0, implicit $m0, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.8:
+  ; CHECK-NEXT:   successors: %bb.10(0x40000000), %bb.9(0x40000000)
+  ; CHECK-NEXT:   liveins: $sgpr15, $sgpr16, $sgpr33, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr88_sgpr89, $sgpr90_sgpr91, $sgpr92_sgpr93, $sgpr100_sgpr101
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $vcc = S_AND_B64 $exec, renamable $sgpr90_sgpr91, implicit-def dead $scc
+  ; CHECK-NEXT:   S_CBRANCH_VCCNZ %bb.10, implicit $vcc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.9:
+  ; CHECK-NEXT:   successors: %bb.10(0x40000000), %bb.17(0x40000000)
+  ; CHECK-NEXT:   liveins: $sgpr15, $sgpr16, $sgpr33, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr88_sgpr89, $sgpr90_sgpr91, $sgpr92_sgpr93, $sgpr100_sgpr101
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr32 :: (load (s1024) from %stack.1, align 4, addrspace 5)
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:vreg_64_align2 = COPY killed renamable $sgpr68_sgpr69, implicit $exec
+  ; CHECK-NEXT:   GLOBAL_STORE_DWORDX2_SADDR undef %18:vgpr_32, [[COPY2]], undef renamable $sgpr4_sgpr5, 0, 0, implicit $exec :: (store (s64), addrspace 1)
+  ; CHECK-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, $sgpr12_sgpr13, implicit $exec
+  ; CHECK-NEXT:   dead renamable $sgpr4_sgpr5 = V_CMP_NE_U32_e64 1, [[V_CNDMASK_B32_e64_]], implicit $exec
+  ; CHECK-NEXT:   renamable $sgpr64 = S_ADD_U32 renamable $sgpr8, 32, implicit-def dead $scc
+  ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; CHECK-NEXT:   $sgpr4_sgpr5 = COPY renamable $sgpr34_sgpr35
+  ; CHECK-NEXT:   renamable $sgpr52_sgpr53 = COPY killed renamable $sgpr6_sgpr7
+  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY renamable $sgpr52_sgpr53
+  ; CHECK-NEXT:   renamable $sgpr38_sgpr39 = COPY killed renamable $sgpr10_sgpr11
+  ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY renamable $sgpr38_sgpr39
+  ; CHECK-NEXT:   renamable $sgpr42_sgpr43 = COPY killed renamable $sgpr12_sgpr13
+  ; CHECK-NEXT:   $sgpr12 = COPY renamable $sgpr33
+  ; CHECK-NEXT:   $sgpr13 = COPY renamable $sgpr15
+  ; CHECK-NEXT:   renamable $sgpr36 = COPY killed renamable $sgpr16
+  ; CHECK-NEXT:   renamable $sgpr37 = COPY killed renamable $sgpr15
+  ; CHECK-NEXT:   renamable $sgpr40 = COPY killed renamable $sgpr8
+  ; CHECK-NEXT:   renamable $sgpr44_sgpr45 = COPY killed renamable $sgpr18_sgpr19
+  ; CHECK-NEXT:   renamable $sgpr46_sgpr47 = COPY killed renamable $sgpr20_sgpr21
+  ; CHECK-NEXT:   renamable $sgpr48_sgpr49 = COPY killed renamable $sgpr22_sgpr23
+  ; CHECK-NEXT:   renamable $sgpr50_sgpr51 = COPY killed renamable $sgpr24_sgpr25
+  ; CHECK-NEXT:   dead $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr4_sgpr5, 0, csr_amdgpu_gfx90ainsts, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; CHECK-NEXT:   ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; CHECK-NEXT:   $sgpr8_sgpr9 = COPY renamable $sgpr64_sgpr65
+  ; CHECK-NEXT:   dead $sgpr30_sgpr31 = SI_CALL undef renamable $sgpr4_sgpr5, 0, csr_amdgpu_gfx90ainsts, implicit $sgpr8_sgpr9
+  ; CHECK-NEXT:   renamable $sgpr24_sgpr25 = COPY killed renamable $sgpr50_sgpr51
+  ; CHECK-NEXT:   renamable $sgpr22_sgpr23 = COPY killed renamable $sgpr48_sgpr49
+  ; CHECK-NEXT:   renamable $sgpr20_sgpr21 = COPY killed renamable $sgpr46_sgpr47
+  ; CHECK-NEXT:   renamable $sgpr18_sgpr19 = COPY killed renamable $sgpr44_sgpr45
+  ; CHECK-NEXT:   renamable $sgpr12_sgpr13 = COPY killed renamable $sgpr42_sgpr43
+  ; CHECK-NEXT:   renamable $sgpr8 = COPY killed renamable $sgpr40
+  ; CHECK-NEXT:   renamable $sgpr10_sgpr11 = COPY killed renamable $sgpr38_sgpr39
+  ; CHECK-NEXT:   renamable $sgpr15 = COPY killed renamable $sgpr37
+  ; CHECK-NEXT:   renamable $sgpr16 = COPY killed renamable $sgpr36
+  ; CHECK-NEXT:   renamable $sgpr6_sgpr7 = COPY killed renamable $sgpr52_sgpr53
+  ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+  ; CHECK-NEXT:   $exec = S_MOV_B64_term renamable $sgpr92_sgpr93
+  ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.10, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.17
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.10:
+  ; CHECK-NEXT:   successors: %bb.8(0x40000000), %bb.12(0x40000000)
+  ; CHECK-NEXT:   liveins: $sgpr15, $sgpr16, $sgpr33, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr88_sgpr89, $sgpr90_sgpr91, $sgpr92_sgpr93, $sgpr100_sgpr101
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.8, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.12
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.11:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.17(0x40000000)
+  ; CHECK-NEXT:   liveins: $sgpr15, $sgpr16, $sgpr33, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr100_sgpr101
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.1, implicit $exec
+  ; CHECK-NEXT:   S_BRANCH %bb.17
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.12:
+  ; CHECK-NEXT:   successors: %bb.11(0x40000000), %bb.13(0x40000000)
+  ; CHECK-NEXT:   liveins: $sgpr15, $sgpr16, $sgpr33, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr88_sgpr89, $sgpr100_sgpr101
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $exec = S_MOV_B64_term killed renamable $sgpr88_sgpr89
+  ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.11, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.13:
+  ; CHECK-NEXT:   successors: %bb.15(0x40000000), %bb.14(0x40000000)
+  ; CHECK-NEXT:   liveins: $sgpr15, $sgpr16, $sgpr33, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr100_sgpr101
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $vcc = S_AND_B64 $exec, renamable $sgpr24_sgpr25, implicit-def dead $scc
+  ; CHECK-NEXT:   S_CBRANCH_VCCNZ %bb.15, implicit $vcc
+  ; CHECK-NEXT:   S_BRANCH %bb.14
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.14:
+  ; CHECK-NEXT:   successors: %bb.15(0x80000000)
+  ; CHECK-NEXT:   liveins: $sgpr15, $sgpr16, $sgpr33, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr100_sgpr101
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.15:
+  ; CHECK-NEXT:   successors: %bb.11(0x40000000), %bb.16(0x40000000)
+  ; CHECK-NEXT:   liveins: $sgpr15, $sgpr16, $sgpr33, $sgpr6_sgpr7, $sgpr8_sgpr9:0x0000000000000003, $sgpr10_sgpr11, $sgpr12_sgpr13, $sgpr18_sgpr19, $sgpr20_sgpr21, $sgpr22_sgpr23, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr100_sgpr101
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $vcc = S_AND_B64 $exec, renamable $sgpr18_sgpr19, implicit-def dead $scc
+  ; CHECK-NEXT:   S_CBRANCH_VCCNZ %bb.11, implicit $vcc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.16:
+  ; CHECK-NEXT:   successors: %bb.3(0x40000000), %bb.17(0x40000000)
+  ; CHECK-NEXT:   liveins: $sgpr15, $sgpr16, $sgpr33
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_CBRANCH_EXECZ %bb.3, implicit $exec
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.17:
+  bb.0:
+    liveins: $vgpr0, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr14, $sgpr15, $sgpr16
+
+    %11:sgpr_32 = COPY $sgpr16
+    %12:sgpr_32 = COPY $sgpr15
+    %13:sgpr_32 = COPY $sgpr14
+    %14:sgpr_64 = COPY $sgpr10_sgpr11
+    %15:sgpr_64 = COPY $sgpr8_sgpr9
+    %16:sgpr_64 = COPY $sgpr6_sgpr7
+    %17:sgpr_64 = COPY $sgpr4_sgpr5
+    %5:sreg_64_xexec = V_CMP_GT_I32_e64 1, undef %18:vgpr_32, implicit $exec
+    %6:sreg_64_xexec = V_CMP_EQ_U32_e64 0, undef %18:vgpr_32, implicit $exec
+    %7:sreg_64_xexec = V_CMP_NE_U32_e64 0, undef %18:vgpr_32, implicit $exec
+    %8:sreg_64_xexec = V_CMP_GT_I32_e64 0, undef %18:vgpr_32, implicit $exec
+    undef %19.sub16:sgpr_1024 = S_MOV_B32 0
+    %9:sreg_64_xexec = V_CMP_EQ_U32_e64 undef %20:sreg_32_xm0_xexec, undef %18:vgpr_32, implicit $exec
+    %21:vreg_1024_align2 = COPY %19, implicit $exec
+    %10:sreg_64_xexec = V_CMP_NE_U32_e64 1, undef %18:vgpr_32, implicit $exec
+    %19.sub17:sgpr_1024 = S_MOV_B32 1083786240
+    S_BRANCH %bb.1
+
+  bb.1:
+    $vcc = S_AND_B64 $exec, %10, implicit-def dead $scc
+    %22:vreg_1024_align2 = COPY %21
+    S_CBRANCH_VCCNZ %bb.2, implicit $vcc
+    S_BRANCH %bb.17
+
+  bb.2:
+    undef %23.sub0:sgpr_1024 = COPY %19.sub16
+    %23.sub1:sgpr_1024 = COPY %19.sub16
+    %23.sub2:sgpr_1024 = COPY %19.sub16
+    %23.sub3:sgpr_1024 = COPY %19.sub16
+    %23.sub4:sgpr_1024 = COPY %19.sub16
+    %23.sub5:sgpr_1024 = COPY %19.sub16
+    %23.sub6:sgpr_1024 = COPY %19.sub16
+    %23.sub7:sgpr_1024 = COPY %19.sub16
+    %23.sub8:sgpr_1024 = COPY %19.sub16
+    %23.sub9:sgpr_1024 = COPY %19.sub16
+    %23.sub10:sgpr_1024 = COPY %19.sub16
+    %23.sub11:sgpr_1024 = COPY %19.sub16
+    %23.sub12:sgpr_1024 = COPY %19.sub16
+    %23.sub13:sgpr_1024 = COPY %19.sub16
+    %23.sub14:sgpr_1024 = COPY %19.sub16
+    %23.sub15:sgpr_1024 = COPY %19.sub16
+    %23.sub16:sgpr_1024 = COPY %19.sub16
+    %23.sub17:sgpr_1024 = COPY %19.sub16
+    %23.sub18:sgpr_1024 = COPY %19.sub16
+    %23.sub19:sgpr_1024 = COPY %19.sub16
+    %23.sub20:sgpr_1024 = COPY %19.sub16
+    %23.sub21:sgpr_1024 = COPY %19.sub16
+    %23.sub22:sgpr_1024 = COPY %19.sub16
+    %23.sub23:sgpr_1024 = COPY %19.sub16
+    %23.sub24:sgpr_1024 = COPY %19.sub16
+    %23.sub25:sgpr_1024 = COPY %19.sub16
+    %23.sub26:sgpr_1024 = COPY %19.sub16
+    %23.sub27:sgpr_1024 = COPY %19.sub16
+    %23.sub28:sgpr_1024 = COPY %19.sub16
+    %23.sub29:sgpr_1024 = COPY %19.sub16
+    %23.sub30:sgpr_1024 = COPY %19.sub16
+    %23.sub31:sgpr_1024 = COPY %19.sub16
+    %21:vreg_1024_align2 = COPY %23, implicit $exec
+    S_CBRANCH_EXECZ %bb.11, implicit $exec
+    S_BRANCH %bb.5
+
+  bb.3:
+    ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+    dead $sgpr30_sgpr31 = SI_CALL undef %24:sreg_64_xexec, 0, CustomRegMask($sgpr60,$sgpr62)
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+
+  bb.4:
+    ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+    $sgpr12 = COPY %13
+    $sgpr13 = COPY %12
+    $sgpr14 = COPY %11
+    dead $sgpr30_sgpr31 = SI_CALL undef %25:sreg_64, 0, csr_amdgpu_noregs, implicit killed $sgpr12, implicit killed $sgpr13, implicit $sgpr14
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+    S_BRANCH %bb.17
+
+  bb.5:
+    %26:sreg_64 = S_AND_B64 %7, undef %3, implicit-def dead $scc
+    %3:sreg_64 = V_CMP_GT_I32_e64 0, undef %18:vgpr_32, implicit $exec
+    $exec = S_MOV_B64_term %26
+    S_CBRANCH_EXECZ %bb.12, implicit $exec
+
+  bb.6:
+    dead %27:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %8, implicit $exec
+
+  bb.7:
+    %0:sreg_64_xexec = nofpexcept V_CMP_NLT_F64_e64 0, undef %28:sreg_64, 0, undef %29:vreg_64_align2, 0, implicit $mode, implicit $exec
+    %1:sreg_64 = nofpexcept V_CMP_NLT_F64_e64 0, 4607182418800017408, 0, undef %29:vreg_64_align2, 0, implicit $mode, implicit $exec
+    dead %30:vgpr_32 = V_INDIRECT_REG_READ_GPR_IDX_B32_V32 %22, undef %13, 11, implicit-def $m0, implicit $m0, implicit $exec
+
+  bb.8:
+    $vcc = S_AND_B64 $exec, %0, implicit-def dead $scc
+    S_CBRANCH_VCCNZ %bb.10, implicit $vcc
+
+  bb.9:
+    %31:vreg_64_align2 = COPY %19.sub16_sub17, implicit $exec
+    GLOBAL_STORE_DWORDX2_SADDR undef %18:vgpr_32, %31, undef %24:sreg_64_xexec, 0, 0, implicit $exec :: (store (s64), addrspace 1)
+    %32:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %5, implicit $exec
+    dead %33:sreg_64_xexec = V_CMP_NE_U32_e64 1, %32, implicit $exec
+    undef %34.sub0:sreg_64 = S_ADD_U32 %15.sub0, 32, implicit-def dead $scc
+    ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+    $sgpr4_sgpr5 = COPY %17
+    $sgpr6_sgpr7 = COPY %16
+    $sgpr10_sgpr11 = COPY %14
+    $sgpr12 = COPY %13
+    $sgpr13 = COPY %12
+    dead $sgpr30_sgpr31 = SI_CALL undef %33, 0, csr_amdgpu_gfx90ainsts, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr10_sgpr11, implicit killed $sgpr12, implicit killed $sgpr13
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+    ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+    $sgpr8_sgpr9 = COPY %34
+    dead $sgpr30_sgpr31 = SI_CALL undef %33, 0, csr_amdgpu_gfx90ainsts, implicit $sgpr8_sgpr9
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+    $exec = S_MOV_B64_term %1
+    S_CBRANCH_EXECZ %bb.10, implicit $exec
+    S_BRANCH %bb.17
+
+  bb.10:
+    S_CBRANCH_EXECZ %bb.8, implicit $exec
+    S_BRANCH %bb.12
+
+  bb.11:
+    S_CBRANCH_EXECZ %bb.1, implicit $exec
+    S_BRANCH %bb.17
+
+  bb.12:
+    $exec = S_MOV_B64_term %3
+    S_CBRANCH_EXECZ %bb.11, implicit $exec
+
+  bb.13:
+    $vcc = S_AND_B64 $exec, %9, implicit-def dead $scc
+    S_CBRANCH_VCCNZ %bb.15, implicit $vcc
+    S_BRANCH %bb.14
+
+  bb.14:
+
+  bb.15:
+    $vcc = S_AND_B64 $exec, %6, implicit-def dead $scc
+    S_CBRANCH_VCCNZ %bb.11, implicit $vcc
+
+  bb.16:
+    S_CBRANCH_EXECZ %bb.3, implicit $exec
+
+  bb.17:
+
+...
Index: llvm/test/CodeGen/AMDGPU/splitkit-copy-live-lanes.mir
===================================================================
--- llvm/test/CodeGen/AMDGPU/splitkit-copy-live-lanes.mir
+++ llvm/test/CodeGen/AMDGPU/splitkit-copy-live-lanes.mir
@@ -31,46 +31,28 @@
     ; CHECK-NEXT:   [[BUFFER_LOAD_DWORDX4_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 48, 0, 0, 0, implicit $exec :: (load (s128), addrspace 1)
     ; CHECK-NEXT: }
     ; CHECK-NEXT: undef %47.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET]].sub1, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE %47, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %52.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET]].sub0, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE %52, %stack.1, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.1, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %57.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET]].sub3, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE %57, %stack.2, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.2, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %62.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET]].sub2, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE %62, %stack.3, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.3, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %67.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub1, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE %67, %stack.4, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.4, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %72.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub0, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE %72, %stack.5, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.5, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %77.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub3, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE %77, %stack.6, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.6, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %82.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub2, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE %82, %stack.7, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.7, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %87.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub1, implicit $exec
-    ; CHECK-NEXT: undef %91.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub0, implicit $exec
-    ; CHECK-NEXT: undef %95.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub3, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE %95, %stack.8, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.8, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %19.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub2, implicit $exec
-    ; CHECK-NEXT: undef %153.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub1, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE %153, %stack.14, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.14, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %102.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub0, implicit $exec
-    ; CHECK-NEXT: undef %106.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub3, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE %106, %stack.9, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.9, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %111.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub2, implicit $exec
+    ; CHECK-NEXT: undef %54.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET]].sub0, implicit $exec
+    ; CHECK-NEXT: undef %61.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET]].sub3, implicit $exec
+    ; CHECK-NEXT: undef %68.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET]].sub2, implicit $exec
+    ; CHECK-NEXT: undef %75.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub1, implicit $exec
+    ; CHECK-NEXT: undef %82.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub0, implicit $exec
+    ; CHECK-NEXT: undef %89.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub3, implicit $exec
+    ; CHECK-NEXT: undef %94.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub2, implicit $exec
+    ; CHECK-NEXT: undef %99.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub1, implicit $exec
+    ; CHECK-NEXT: undef %104.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub0, implicit $exec
+    ; CHECK-NEXT: undef %139.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub3, implicit $exec
+    ; CHECK-NEXT: undef %185.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub2, implicit $exec
+    ; CHECK-NEXT: undef %166.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub1, implicit $exec
+    ; CHECK-NEXT: undef %113.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub0, implicit $exec
+    ; CHECK-NEXT: undef %118.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub3, implicit $exec
+    ; CHECK-NEXT: undef %123.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub2, implicit $exec
     ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 64, 0, 0, 0, implicit $exec :: (load (s128), align 64, addrspace 1)
-    ; CHECK-NEXT: undef %115.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub1, implicit $exec
-    ; CHECK-NEXT: undef %119.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub0, implicit $exec
-    ; CHECK-NEXT: undef %123.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub3, implicit $exec
-    ; CHECK-NEXT: undef %127.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub2, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE %127, %stack.10, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.10, align 4, addrspace 5)
+    ; CHECK-NEXT: undef %128.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub1, implicit $exec
+    ; CHECK-NEXT: undef %133.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub0, implicit $exec
+    ; CHECK-NEXT: undef %144.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub3, implicit $exec
+    ; CHECK-NEXT: undef %149.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub2, implicit $exec
     ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 80, 0, 0, 0, implicit $exec :: (load (s128), addrspace 1)
-    ; CHECK-NEXT: undef %138.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub1, implicit $exec
-    ; CHECK-NEXT: undef %142.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub0, implicit $exec
-    ; CHECK-NEXT: undef %146.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub3, implicit $exec
-    ; CHECK-NEXT: undef %150.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub2, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE %150, %stack.13, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.13, align 4, addrspace 5)
     ; CHECK-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %3, 0, 96, 0, 0, 0, implicit $exec :: (load (s128), align 32, addrspace 1)
-    ; CHECK-NEXT: undef %156.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub1, implicit $exec
     ; CHECK-NEXT: undef %36.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub0, implicit $exec
     ; CHECK-NEXT: undef %37.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub3, implicit $exec
     ; CHECK-NEXT: undef %38.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub2, implicit $exec
@@ -80,73 +62,141 @@
     ; CHECK-NEXT: undef %42.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub3, implicit $exec
     ; CHECK-NEXT: undef %43.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET7]].sub2, implicit $exec
     ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET]].sub1, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE]], %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, align 4, addrspace 5)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE1:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.1, align 4, addrspace 5)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE1]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET]].sub0, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE1]], %stack.1, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.1, align 4, addrspace 5)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE2:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE2]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET]].sub3, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE2]], %stack.2, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.2, align 4, addrspace 5)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE3:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.3, align 4, addrspace 5)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE3]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET]].sub2, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE3]], %stack.3, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.3, align 4, addrspace 5)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE4:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE4]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub1, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE4]], %stack.4, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.4, align 4, addrspace 5)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE5:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.5, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.5, align 4, addrspace 5)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE5]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub0, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE5]], %stack.5, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.5, align 4, addrspace 5)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE6:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.6, align 4, addrspace 5)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE6]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub3, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE6]], %stack.6, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.6, align 4, addrspace 5)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE7:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.7, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.7, align 4, addrspace 5)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE7]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub2, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE7]], %stack.7, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.7, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %131.sub2:vreg_128 = COPY %87.sub2
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE %131, %stack.11, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.11, align 4, addrspace 5)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE8:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.11, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.11, align 4, addrspace 5)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE8]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub1, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE8]], %stack.11, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.11, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %134.sub2:vreg_128 = COPY %91.sub2
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE %134, %stack.12, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.12, align 4, addrspace 5)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE9:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.12, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.12, align 4, addrspace 5)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE9]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub0, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE9]], %stack.12, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.12, align 4, addrspace 5)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE10:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.8, align 4, addrspace 5)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE10]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub3, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE10]], %stack.8, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.8, align 4, addrspace 5)
-    ; CHECK-NEXT: %19.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub2, implicit $exec
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE11:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.14, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.14, align 4, addrspace 5)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE11]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub1, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE11]], %stack.14, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.14, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %103.sub2:vreg_128 = COPY %102.sub2
-    ; CHECK-NEXT: %103.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub0, implicit $exec
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE12:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.9, align 4, addrspace 5)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE12]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub3, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE12]], %stack.9, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.9, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %112.sub2:vreg_128 = COPY %111.sub2
-    ; CHECK-NEXT: %112.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub2, implicit $exec
-    ; CHECK-NEXT: undef %116.sub2:vreg_128 = COPY %115.sub2
-    ; CHECK-NEXT: %116.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub1, implicit $exec
-    ; CHECK-NEXT: undef %120.sub2:vreg_128 = COPY %119.sub2
-    ; CHECK-NEXT: %120.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub0, implicit $exec
+    ; CHECK-NEXT: undef %48.sub2:vreg_128 = COPY %47.sub2
+    ; CHECK-NEXT: %48.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET]].sub1, implicit $exec
+    ; CHECK-NEXT: undef %50.sub0:vreg_128 = COPY %48.sub0 {
+    ; CHECK-NEXT:   internal %50.sub2:vreg_128 = COPY %48.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: SI_SPILL_V128_SAVE %50, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, align 4, addrspace 5)
+    ; CHECK-NEXT: undef %55.sub2:vreg_128 = COPY %54.sub2
+    ; CHECK-NEXT: %55.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET]].sub0, implicit $exec
+    ; CHECK-NEXT: undef %57.sub0:vreg_128 = COPY %55.sub0 {
+    ; CHECK-NEXT:   internal %57.sub2:vreg_128 = COPY %55.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: SI_SPILL_V128_SAVE %57, %stack.1, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.1, align 4, addrspace 5)
+    ; CHECK-NEXT: undef %62.sub2:vreg_128 = COPY %61.sub2
+    ; CHECK-NEXT: %62.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET]].sub3, implicit $exec
+    ; CHECK-NEXT: undef %64.sub0:vreg_128 = COPY %62.sub0 {
+    ; CHECK-NEXT:   internal %64.sub2:vreg_128 = COPY %62.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: SI_SPILL_V128_SAVE %64, %stack.2, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.2, align 4, addrspace 5)
+    ; CHECK-NEXT: undef %69.sub2:vreg_128 = COPY %68.sub2
+    ; CHECK-NEXT: %69.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET]].sub2, implicit $exec
+    ; CHECK-NEXT: undef %71.sub0:vreg_128 = COPY %69.sub0 {
+    ; CHECK-NEXT:   internal %71.sub2:vreg_128 = COPY %69.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: SI_SPILL_V128_SAVE %71, %stack.3, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.3, align 4, addrspace 5)
+    ; CHECK-NEXT: undef %76.sub2:vreg_128 = COPY %75.sub2
+    ; CHECK-NEXT: %76.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub1, implicit $exec
+    ; CHECK-NEXT: undef %78.sub0:vreg_128 = COPY %76.sub0 {
+    ; CHECK-NEXT:   internal %78.sub2:vreg_128 = COPY %76.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: SI_SPILL_V128_SAVE %78, %stack.4, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.4, align 4, addrspace 5)
+    ; CHECK-NEXT: undef %83.sub2:vreg_128 = COPY %82.sub2
+    ; CHECK-NEXT: %83.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub0, implicit $exec
+    ; CHECK-NEXT: undef %85.sub0:vreg_128 = COPY %83.sub0 {
+    ; CHECK-NEXT:   internal %85.sub2:vreg_128 = COPY %83.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: SI_SPILL_V128_SAVE %85, %stack.5, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.5, align 4, addrspace 5)
+    ; CHECK-NEXT: undef %90.sub2:vreg_128 = COPY %89.sub2
+    ; CHECK-NEXT: %90.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub3, implicit $exec
+    ; CHECK-NEXT: undef %140.sub0:vreg_128 = COPY %90.sub0 {
+    ; CHECK-NEXT:   internal %140.sub2:vreg_128 = COPY %90.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: SI_SPILL_V128_SAVE %140, %stack.7, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.7, align 4, addrspace 5)
+    ; CHECK-NEXT: undef %95.sub2:vreg_128 = COPY %94.sub2
+    ; CHECK-NEXT: %95.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET1]].sub2, implicit $exec
+    ; CHECK-NEXT: undef %107.sub0:vreg_128 = COPY %95.sub0 {
+    ; CHECK-NEXT:   internal %107.sub2:vreg_128 = COPY %95.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: SI_SPILL_V128_SAVE %107, %stack.6, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.6, align 4, addrspace 5)
+    ; CHECK-NEXT: undef %100.sub2:vreg_128 = COPY %99.sub2
+    ; CHECK-NEXT: %100.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub1, implicit $exec
+    ; CHECK-NEXT: undef %101.sub0:vreg_128 = COPY %100.sub0 {
+    ; CHECK-NEXT:   internal %101.sub2:vreg_128 = COPY %100.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: undef %105.sub2:vreg_128 = COPY %104.sub2
+    ; CHECK-NEXT: %105.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub0, implicit $exec
+    ; CHECK-NEXT: undef %106.sub0:vreg_128 = COPY %105.sub0 {
+    ; CHECK-NEXT:   internal %106.sub2:vreg_128 = COPY %105.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: %139.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub3, implicit $exec
+    ; CHECK-NEXT: undef %158.sub0:vreg_128 = COPY %139.sub0 {
+    ; CHECK-NEXT:   internal %158.sub2:vreg_128 = COPY %139.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: SI_SPILL_V128_SAVE %158, %stack.8, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.8, align 4, addrspace 5)
+    ; CHECK-NEXT: undef %186.sub2:vreg_128 = COPY %185.sub2
+    ; CHECK-NEXT: %186.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET2]].sub2, implicit $exec
+    ; CHECK-NEXT: undef %188.sub0:vreg_128 = COPY %186.sub0 {
+    ; CHECK-NEXT:   internal %188.sub2:vreg_128 = COPY %186.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: SI_SPILL_V128_SAVE %188, %stack.11, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.11, align 4, addrspace 5)
+    ; CHECK-NEXT: undef %167.sub2:vreg_128 = COPY %166.sub2
+    ; CHECK-NEXT: %167.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub1, implicit $exec
+    ; CHECK-NEXT: undef %169.sub0:vreg_128 = COPY %167.sub0 {
+    ; CHECK-NEXT:   internal %169.sub2:vreg_128 = COPY %167.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: SI_SPILL_V128_SAVE %169, %stack.9, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.9, align 4, addrspace 5)
+    ; CHECK-NEXT: undef %114.sub2:vreg_128 = COPY %113.sub2
+    ; CHECK-NEXT: %114.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub0, implicit $exec
+    ; CHECK-NEXT: undef %115.sub0:vreg_128 = COPY %114.sub0 {
+    ; CHECK-NEXT:   internal %115.sub2:vreg_128 = COPY %114.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: undef %119.sub2:vreg_128 = COPY %118.sub2
+    ; CHECK-NEXT: %119.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub3, implicit $exec
+    ; CHECK-NEXT: undef %181.sub0:vreg_128 = COPY %119.sub0 {
+    ; CHECK-NEXT:   internal %181.sub2:vreg_128 = COPY %119.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: SI_SPILL_V128_SAVE %181, %stack.10, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.10, align 4, addrspace 5)
     ; CHECK-NEXT: undef %124.sub2:vreg_128 = COPY %123.sub2
-    ; CHECK-NEXT: %124.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub3, implicit $exec
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE13:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.10, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.10, align 4, addrspace 5)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE13]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub2, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE13]], %stack.10, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.10, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %139.sub2:vreg_128 = COPY %138.sub2
-    ; CHECK-NEXT: %139.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub1, implicit $exec
-    ; CHECK-NEXT: undef %143.sub2:vreg_128 = COPY %142.sub2
-    ; CHECK-NEXT: %143.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub0, implicit $exec
-    ; CHECK-NEXT: undef %147.sub2:vreg_128 = COPY %146.sub2
-    ; CHECK-NEXT: %147.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub3, implicit $exec
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE14:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.13, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.13, align 4, addrspace 5)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE14]].sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub2, implicit $exec
-    ; CHECK-NEXT: SI_SPILL_V128_SAVE [[SI_SPILL_V128_RESTORE14]], %stack.13, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.13, align 4, addrspace 5)
-    ; CHECK-NEXT: %156.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub1, implicit $exec
+    ; CHECK-NEXT: %124.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET3]].sub2, implicit $exec
+    ; CHECK-NEXT: undef %125.sub0:vreg_128 = COPY %124.sub0 {
+    ; CHECK-NEXT:   internal %125.sub2:vreg_128 = COPY %124.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: undef %129.sub2:vreg_128 = COPY %128.sub2
+    ; CHECK-NEXT: %129.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub1, implicit $exec
+    ; CHECK-NEXT: undef %130.sub0:vreg_128 = COPY %129.sub0 {
+    ; CHECK-NEXT:   internal %130.sub2:vreg_128 = COPY %129.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: undef %134.sub2:vreg_128 = COPY %133.sub2
+    ; CHECK-NEXT: %134.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub0, implicit $exec
+    ; CHECK-NEXT: undef %135.sub0:vreg_128 = COPY %134.sub0 {
+    ; CHECK-NEXT:   internal %135.sub2:vreg_128 = COPY %134.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: undef %145.sub2:vreg_128 = COPY %144.sub2
+    ; CHECK-NEXT: %145.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub3, implicit $exec
+    ; CHECK-NEXT: undef %146.sub0:vreg_128 = COPY %145.sub0 {
+    ; CHECK-NEXT:   internal %146.sub2:vreg_128 = COPY %145.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: undef %150.sub2:vreg_128 = COPY %149.sub2
+    ; CHECK-NEXT: %150.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET4]].sub2, implicit $exec
+    ; CHECK-NEXT: undef %151.sub0:vreg_128 = COPY %150.sub0 {
+    ; CHECK-NEXT:   internal %151.sub2:vreg_128 = COPY %150.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: undef %157.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub1, implicit $exec
+    ; CHECK-NEXT: undef %155.sub2:vreg_128 = COPY %157.sub2
+    ; CHECK-NEXT: %155.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub1, implicit $exec
+    ; CHECK-NEXT: undef %156.sub0:vreg_128 = COPY %155.sub0 {
+    ; CHECK-NEXT:   internal %156.sub2:vreg_128 = COPY %155.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: undef %165.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub0, implicit $exec
+    ; CHECK-NEXT: undef %163.sub2:vreg_128 = COPY %165.sub2
+    ; CHECK-NEXT: %163.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub0, implicit $exec
+    ; CHECK-NEXT: undef %164.sub0:vreg_128 = COPY %163.sub0 {
+    ; CHECK-NEXT:   internal %164.sub2:vreg_128 = COPY %163.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: undef %176.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub3, implicit $exec
+    ; CHECK-NEXT: undef %174.sub2:vreg_128 = COPY %176.sub2
+    ; CHECK-NEXT: %174.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub3, implicit $exec
+    ; CHECK-NEXT: undef %175.sub0:vreg_128 = COPY %174.sub0 {
+    ; CHECK-NEXT:   internal %175.sub2:vreg_128 = COPY %174.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: undef %195.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub2, implicit $exec
+    ; CHECK-NEXT: undef %180.sub2:vreg_128 = COPY %195.sub2
+    ; CHECK-NEXT: %180.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET5]].sub2, implicit $exec
+    ; CHECK-NEXT: undef %194.sub2:vreg_128 = V_LSHRREV_B32_e32 16, [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub1, implicit $exec
+    ; CHECK-NEXT: undef %193.sub2:vreg_128 = COPY %194.sub2
+    ; CHECK-NEXT: %193.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub1, implicit $exec
     ; CHECK-NEXT: %36.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub0, implicit $exec
     ; CHECK-NEXT: %37.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub3, implicit $exec
     ; CHECK-NEXT: %38.sub0:vreg_128 = V_AND_B32_e32 [[S_MOV_B32_]], [[BUFFER_LOAD_DWORDX4_OFFSET6]].sub2, implicit $exec
@@ -175,164 +225,164 @@
     ; CHECK-NEXT: %36.sub1:vreg_128 = COPY %43.sub1
     ; CHECK-NEXT: %36.sub3:vreg_128 = COPY %43.sub1
     ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %36, %2, 0, 384, 0, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
-    ; CHECK-NEXT: undef %157.sub0:vreg_128 = COPY %156.sub0 {
-    ; CHECK-NEXT:   internal %157.sub2:vreg_128 = COPY %156.sub2
-    ; CHECK-NEXT: }
-    ; CHECK-NEXT: %157.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: %157.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %157, %2, 0, 400, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE15:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.13, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.13, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %149.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE15]].sub0 {
-    ; CHECK-NEXT:   internal %149.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE15]].sub2
-    ; CHECK-NEXT: }
-    ; CHECK-NEXT: %149.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: %149.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %149, %2, 0, 352, 0, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
-    ; CHECK-NEXT: undef %145.sub0:vreg_128 = COPY %147.sub0 {
-    ; CHECK-NEXT:   internal %145.sub2:vreg_128 = COPY %147.sub2
-    ; CHECK-NEXT: }
-    ; CHECK-NEXT: %145.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: %145.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %145, %2, 0, 368, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
-    ; CHECK-NEXT: undef %141.sub0:vreg_128 = COPY %143.sub0 {
-    ; CHECK-NEXT:   internal %141.sub2:vreg_128 = COPY %143.sub2
-    ; CHECK-NEXT: }
-    ; CHECK-NEXT: %141.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: %141.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %141, %2, 0, 320, 0, 0, 0, implicit $exec :: (store (s128), align 64, addrspace 1)
-    ; CHECK-NEXT: undef %137.sub0:vreg_128 = COPY %139.sub0 {
-    ; CHECK-NEXT:   internal %137.sub2:vreg_128 = COPY %139.sub2
+    ; CHECK-NEXT: undef %191.sub0:vreg_128 = COPY %193.sub0 {
+    ; CHECK-NEXT:   internal %191.sub2:vreg_128 = COPY %193.sub2
     ; CHECK-NEXT: }
-    ; CHECK-NEXT: %137.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: %137.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %137, %2, 0, 336, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE16:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.10, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.10, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %126.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE16]].sub0 {
-    ; CHECK-NEXT:   internal %126.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE16]].sub2
+    ; CHECK-NEXT: %191.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: %191.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %191, %2, 0, 400, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+    ; CHECK-NEXT: undef %178.sub0:vreg_128 = COPY %180.sub0 {
+    ; CHECK-NEXT:   internal %178.sub2:vreg_128 = COPY %180.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: %178.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: %178.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %178, %2, 0, 352, 0, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+    ; CHECK-NEXT: undef %172.sub0:vreg_128 = COPY %175.sub0 {
+    ; CHECK-NEXT:   internal %172.sub2:vreg_128 = COPY %175.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: %172.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: %172.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %172, %2, 0, 368, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+    ; CHECK-NEXT: undef %161.sub0:vreg_128 = COPY %164.sub0 {
+    ; CHECK-NEXT:   internal %161.sub2:vreg_128 = COPY %164.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: %161.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: %161.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %161, %2, 0, 320, 0, 0, 0, implicit $exec :: (store (s128), align 64, addrspace 1)
+    ; CHECK-NEXT: undef %153.sub0:vreg_128 = COPY %156.sub0 {
+    ; CHECK-NEXT:   internal %153.sub2:vreg_128 = COPY %156.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: %153.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: %153.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %153, %2, 0, 336, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+    ; CHECK-NEXT: undef %148.sub0:vreg_128 = COPY %151.sub0 {
+    ; CHECK-NEXT:   internal %148.sub2:vreg_128 = COPY %151.sub2
     ; CHECK-NEXT: }
-    ; CHECK-NEXT: %126.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: %126.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %126, %2, 0, 288, 0, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
-    ; CHECK-NEXT: undef %122.sub0:vreg_128 = COPY %124.sub0 {
-    ; CHECK-NEXT:   internal %122.sub2:vreg_128 = COPY %124.sub2
+    ; CHECK-NEXT: %148.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: %148.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %148, %2, 0, 288, 0, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+    ; CHECK-NEXT: undef %143.sub0:vreg_128 = COPY %146.sub0 {
+    ; CHECK-NEXT:   internal %143.sub2:vreg_128 = COPY %146.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: %143.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: %143.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %143, %2, 0, 304, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+    ; CHECK-NEXT: undef %132.sub0:vreg_128 = COPY %135.sub0 {
+    ; CHECK-NEXT:   internal %132.sub2:vreg_128 = COPY %135.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: %132.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: %132.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %132, %2, 0, 256, 0, 0, 0, implicit $exec :: (store (s128), align 256, addrspace 1)
+    ; CHECK-NEXT: undef %127.sub0:vreg_128 = COPY %130.sub0 {
+    ; CHECK-NEXT:   internal %127.sub2:vreg_128 = COPY %130.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: %127.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: %127.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %127, %2, 0, 272, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+    ; CHECK-NEXT: undef %122.sub0:vreg_128 = COPY %125.sub0 {
+    ; CHECK-NEXT:   internal %122.sub2:vreg_128 = COPY %125.sub2
     ; CHECK-NEXT: }
     ; CHECK-NEXT: %122.sub1:vreg_128 = COPY %43.sub1
     ; CHECK-NEXT: %122.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %122, %2, 0, 304, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
-    ; CHECK-NEXT: undef %118.sub0:vreg_128 = COPY %120.sub0 {
-    ; CHECK-NEXT:   internal %118.sub2:vreg_128 = COPY %120.sub2
-    ; CHECK-NEXT: }
-    ; CHECK-NEXT: %118.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: %118.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %118, %2, 0, 256, 0, 0, 0, implicit $exec :: (store (s128), align 256, addrspace 1)
-    ; CHECK-NEXT: undef %114.sub0:vreg_128 = COPY %116.sub0 {
-    ; CHECK-NEXT:   internal %114.sub2:vreg_128 = COPY %116.sub2
-    ; CHECK-NEXT: }
-    ; CHECK-NEXT: %114.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: %114.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %114, %2, 0, 272, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
-    ; CHECK-NEXT: undef %110.sub0:vreg_128 = COPY %112.sub0 {
-    ; CHECK-NEXT:   internal %110.sub2:vreg_128 = COPY %112.sub2
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %122, %2, 0, 224, 0, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.10, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.10, align 4, addrspace 5)
+    ; CHECK-NEXT: undef %117.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE]].sub0 {
+    ; CHECK-NEXT:   internal %117.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE]].sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: %117.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: %117.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %117, %2, 0, 240, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+    ; CHECK-NEXT: undef %112.sub0:vreg_128 = COPY %115.sub0 {
+    ; CHECK-NEXT:   internal %112.sub2:vreg_128 = COPY %115.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: %112.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: %112.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %112, %2, 0, 192, 0, 0, 0, implicit $exec :: (store (s128), align 64, addrspace 1)
+    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE1:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.9, align 4, addrspace 5)
+    ; CHECK-NEXT: undef %110.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE1]].sub0 {
+    ; CHECK-NEXT:   internal %110.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE1]].sub2
     ; CHECK-NEXT: }
     ; CHECK-NEXT: %110.sub1:vreg_128 = COPY %43.sub1
     ; CHECK-NEXT: %110.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %110, %2, 0, 224, 0, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE17:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.9, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %105.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE17]].sub0 {
-    ; CHECK-NEXT:   internal %105.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE17]].sub2
-    ; CHECK-NEXT: }
-    ; CHECK-NEXT: %105.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: %105.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %105, %2, 0, 240, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
-    ; CHECK-NEXT: undef %101.sub0:vreg_128 = COPY %103.sub0 {
-    ; CHECK-NEXT:   internal %101.sub2:vreg_128 = COPY %103.sub2
-    ; CHECK-NEXT: }
-    ; CHECK-NEXT: %101.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: %101.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %101, %2, 0, 192, 0, 0, 0, implicit $exec :: (store (s128), align 64, addrspace 1)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE18:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.14, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.14, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %99.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE18]].sub0 {
-    ; CHECK-NEXT:   internal %99.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE18]].sub2
-    ; CHECK-NEXT: }
-    ; CHECK-NEXT: %99.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: %99.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %99, %2, 0, 208, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
-    ; CHECK-NEXT: %19.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: %19.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %19, %2, 0, 160, 0, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE19:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.8, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %94.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE19]].sub0 {
-    ; CHECK-NEXT:   internal %94.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE19]].sub2
-    ; CHECK-NEXT: }
-    ; CHECK-NEXT: %94.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: %94.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %94, %2, 0, 176, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE20:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.12, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.12, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %90.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE20]].sub0 {
-    ; CHECK-NEXT:   internal %90.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE20]].sub2
-    ; CHECK-NEXT: }
-    ; CHECK-NEXT: %90.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: %90.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %90, %2, 0, 128, 0, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE21:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.11, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.11, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %86.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE21]].sub0 {
-    ; CHECK-NEXT:   internal %86.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE21]].sub2
-    ; CHECK-NEXT: }
-    ; CHECK-NEXT: %86.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: %86.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %86, %2, 0, 144, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE22:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.7, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.7, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %81.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE22]].sub0 {
-    ; CHECK-NEXT:   internal %81.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE22]].sub2
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %110, %2, 0, 208, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE2:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.11, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.11, align 4, addrspace 5)
+    ; CHECK-NEXT: undef %184.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE2]].sub0 {
+    ; CHECK-NEXT:   internal %184.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE2]].sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: %184.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: %184.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %184, %2, 0, 160, 0, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE3:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.8, align 4, addrspace 5)
+    ; CHECK-NEXT: undef %137.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE3]].sub0 {
+    ; CHECK-NEXT:   internal %137.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE3]].sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: %137.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: %137.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %137, %2, 0, 176, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+    ; CHECK-NEXT: undef %103.sub0:vreg_128 = COPY %106.sub0 {
+    ; CHECK-NEXT:   internal %103.sub2:vreg_128 = COPY %106.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: %103.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: %103.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %103, %2, 0, 128, 0, 0, 0, implicit $exec :: (store (s128), align 128, addrspace 1)
+    ; CHECK-NEXT: undef %98.sub0:vreg_128 = COPY %101.sub0 {
+    ; CHECK-NEXT:   internal %98.sub2:vreg_128 = COPY %101.sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: %98.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: %98.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %98, %2, 0, 144, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE4:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.6, align 4, addrspace 5)
+    ; CHECK-NEXT: undef %93.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE4]].sub0 {
+    ; CHECK-NEXT:   internal %93.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE4]].sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: %93.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: %93.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %93, %2, 0, 96, 0, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE5:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.7, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.7, align 4, addrspace 5)
+    ; CHECK-NEXT: undef %88.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE5]].sub0 {
+    ; CHECK-NEXT:   internal %88.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE5]].sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: %88.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: %88.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %88, %2, 0, 112, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE6:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.5, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.5, align 4, addrspace 5)
+    ; CHECK-NEXT: undef %81.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE6]].sub0 {
+    ; CHECK-NEXT:   internal %81.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE6]].sub2
     ; CHECK-NEXT: }
     ; CHECK-NEXT: %81.sub1:vreg_128 = COPY %43.sub1
     ; CHECK-NEXT: %81.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %81, %2, 0, 96, 0, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE23:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.6, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %76.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE23]].sub0 {
-    ; CHECK-NEXT:   internal %76.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE23]].sub2
-    ; CHECK-NEXT: }
-    ; CHECK-NEXT: %76.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: %76.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %76, %2, 0, 112, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE24:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.5, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.5, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %71.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE24]].sub0 {
-    ; CHECK-NEXT:   internal %71.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE24]].sub2
-    ; CHECK-NEXT: }
-    ; CHECK-NEXT: %71.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: %71.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %71, %2, 0, 64, 0, 0, 0, implicit $exec :: (store (s128), align 64, addrspace 1)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE25:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %66.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE25]].sub0 {
-    ; CHECK-NEXT:   internal %66.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE25]].sub2
-    ; CHECK-NEXT: }
-    ; CHECK-NEXT: %66.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: %66.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %66, %2, 0, 80, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE26:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.3, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %61.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE26]].sub0 {
-    ; CHECK-NEXT:   internal %61.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE26]].sub2
-    ; CHECK-NEXT: }
-    ; CHECK-NEXT: %61.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: %61.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %61, %2, 0, 32, 0, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE27:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %56.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE27]].sub0 {
-    ; CHECK-NEXT:   internal %56.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE27]].sub2
-    ; CHECK-NEXT: }
-    ; CHECK-NEXT: %56.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: %56.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %56, %2, 0, 48, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE28:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.1, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %51.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE28]].sub0 {
-    ; CHECK-NEXT:   internal %51.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE28]].sub2
-    ; CHECK-NEXT: }
-    ; CHECK-NEXT: %51.sub1:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: %51.sub3:vreg_128 = COPY %43.sub1
-    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %51, %2, 0, 0, 0, 0, 0, implicit $exec :: (store (s128), align 512, addrspace 1)
-    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE29:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
-    ; CHECK-NEXT: undef %46.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE29]].sub0 {
-    ; CHECK-NEXT:   internal %46.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE29]].sub2
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %81, %2, 0, 64, 0, 0, 0, implicit $exec :: (store (s128), align 64, addrspace 1)
+    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE7:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.4, align 4, addrspace 5)
+    ; CHECK-NEXT: undef %74.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE7]].sub0 {
+    ; CHECK-NEXT:   internal %74.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE7]].sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: %74.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: %74.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %74, %2, 0, 80, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE8:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.3, align 4, addrspace 5)
+    ; CHECK-NEXT: undef %67.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE8]].sub0 {
+    ; CHECK-NEXT:   internal %67.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE8]].sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: %67.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: %67.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %67, %2, 0, 32, 0, 0, 0, implicit $exec :: (store (s128), align 32, addrspace 1)
+    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE9:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.2, align 4, addrspace 5)
+    ; CHECK-NEXT: undef %60.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE9]].sub0 {
+    ; CHECK-NEXT:   internal %60.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE9]].sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: %60.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: %60.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %60, %2, 0, 48, 0, 0, 0, implicit $exec :: (store (s128), addrspace 1)
+    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE10:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.1, align 4, addrspace 5)
+    ; CHECK-NEXT: undef %53.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE10]].sub0 {
+    ; CHECK-NEXT:   internal %53.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE10]].sub2
+    ; CHECK-NEXT: }
+    ; CHECK-NEXT: %53.sub1:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: %53.sub3:vreg_128 = COPY %43.sub1
+    ; CHECK-NEXT: BUFFER_STORE_DWORDX4_OFFSET %53, %2, 0, 0, 0, 0, 0, implicit $exec :: (store (s128), align 512, addrspace 1)
+    ; CHECK-NEXT: [[SI_SPILL_V128_RESTORE11:%[0-9]+]]:vreg_128 = SI_SPILL_V128_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s128) from %stack.0, align 4, addrspace 5)
+    ; CHECK-NEXT: undef %46.sub0:vreg_128 = COPY [[SI_SPILL_V128_RESTORE11]].sub0 {
+    ; CHECK-NEXT:   internal %46.sub2:vreg_128 = COPY [[SI_SPILL_V128_RESTORE11]].sub2
     ; CHECK-NEXT: }
     ; CHECK-NEXT: %46.sub1:vreg_128 = COPY %43.sub1
     ; CHECK-NEXT: %46.sub3:vreg_128 = COPY %43.sub1
Index: llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll
@@ -0,0 +1,1881 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -greedy-regclass-priority-trumps-globalness=1 -o - %s | FileCheck -check-prefixes=GFX90A,GLOBALNESS1 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -greedy-regclass-priority-trumps-globalness=0 -o - %s | FileCheck -check-prefixes=GFX90A,GLOBALNESS0 %s
+
+declare void @wobble()
+
+define internal fastcc void @widget() {
+; GFX90A-LABEL: widget:
+; GFX90A:       ; %bb.0: ; %bb
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    s_or_saveexec_b64 s[16:17], -1
+; GFX90A-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX90A-NEXT:    s_mov_b64 exec, s[16:17]
+; GFX90A-NEXT:    v_writelane_b32 v40, s33, 2
+; GFX90A-NEXT:    s_mov_b32 s33, s32
+; GFX90A-NEXT:    s_addk_i32 s32, 0x400
+; GFX90A-NEXT:    s_getpc_b64 s[16:17]
+; GFX90A-NEXT:    s_add_u32 s16, s16, wobble@gotpcrel32@lo+4
+; GFX90A-NEXT:    s_addc_u32 s17, s17, wobble@gotpcrel32@hi+12
+; GFX90A-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
+; GFX90A-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX90A-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+bb:
+  tail call void @wobble()
+  unreachable
+}
+
+define amdgpu_kernel void @kernel(i32 addrspace(1)* %arg1.global, i1 %tmp3.i.i, i32 %tmp5.i.i, i32 %tmp427.i, i1 %tmp438.i, double %tmp27.i, i1 %tmp48.i) {
+; GLOBALNESS1-LABEL: kernel:
+; GLOBALNESS1:       ; %bb.0: ; %bb
+; GLOBALNESS1-NEXT:    s_mov_b64 s[38:39], s[8:9]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[36:37], s[6:7]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[40:41], s[4:5]
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v42, v0
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v44, 0
+; GLOBALNESS1-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GLOBALNESS1-NEXT:    s_load_dwordx2 s[56:57], s[8:9], 0x8
+; GLOBALNESS1-NEXT:    s_nop 0
+; GLOBALNESS1-NEXT:    s_load_dword s8, s[8:9], 0x14
+; GLOBALNESS1-NEXT:    s_nop 0
+; GLOBALNESS1-NEXT:    s_load_dwordx2 s[6:7], s[38:39], 0x18
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
+; GLOBALNESS1-NEXT:    global_store_dword v[0:1], v44, off
+; GLOBALNESS1-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBALNESS1-NEXT:    global_load_dword v0, v44, s[4:5]
+; GLOBALNESS1-NEXT:    s_mov_b32 s61, 0
+; GLOBALNESS1-NEXT:    s_mov_b32 s60, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s62, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s63, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s64, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s65, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s66, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s67, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s68, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s69, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s70, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s71, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s72, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s73, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s74, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s75, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s76, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s77, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s78, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s79, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s80, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s81, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s82, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s83, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s84, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s85, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s86, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s87, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s88, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s89, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s90, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s91, s61
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a32, s60
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a33, s61
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a34, s62
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a35, s63
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a36, s64
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a37, s65
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a38, s66
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a39, s67
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a40, s68
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a41, s69
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a42, s70
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a43, s71
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a44, s72
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a45, s73
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a46, s74
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a47, s75
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a48, s76
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a49, s77
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a50, s78
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a51, s79
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a52, s80
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a53, s81
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a54, s82
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a55, s83
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a56, s84
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a57, s85
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a58, s86
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a59, s87
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a60, s88
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a61, s89
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a62, s90
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a63, s91
+; GLOBALNESS1-NEXT:    s_movk_i32 s60, 0x80
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s60, 0
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s61, 1
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s62, 2
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s63, 3
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s64, 4
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s65, 5
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s66, 6
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s67, 7
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s68, 8
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s69, 9
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s70, 10
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s71, 11
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s72, 12
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s73, 13
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s74, 14
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s75, 15
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s76, 16
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s77, 17
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s78, 18
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s79, 19
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s80, 20
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s81, 21
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s82, 22
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s83, 23
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s84, 24
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s85, 25
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s86, 26
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s87, 27
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s88, 28
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s89, 29
+; GLOBALNESS1-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v45, 0x40994400
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s90, 30
+; GLOBALNESS1-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s91, 31
+; GLOBALNESS1-NEXT:    v_cmp_ngt_f64_e64 s[4:5], s[6:7], v[44:45]
+; GLOBALNESS1-NEXT:    s_add_u32 s0, s0, s17
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s4, 32
+; GLOBALNESS1-NEXT:    s_addc_u32 s1, s1, 0
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s5, 33
+; GLOBALNESS1-NEXT:    v_cmp_ngt_f64_e64 s[4:5], s[6:7], 0
+; GLOBALNESS1-NEXT:    s_bitcmp1_b32 s56, 0
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s4, 34
+; GLOBALNESS1-NEXT:    s_load_dword s9, s[38:39], 0x20
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s5, 35
+; GLOBALNESS1-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GLOBALNESS1-NEXT:    s_xor_b64 s[46:47], s[4:5], -1
+; GLOBALNESS1-NEXT:    s_bitcmp1_b32 s8, 0
+; GLOBALNESS1-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GLOBALNESS1-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GLOBALNESS1-NEXT:    s_xor_b64 s[50:51], s[4:5], -1
+; GLOBALNESS1-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBALNESS1-NEXT:    s_bitcmp1_b32 s9, 0
+; GLOBALNESS1-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GLOBALNESS1-NEXT:    s_getpc_b64 s[6:7]
+; GLOBALNESS1-NEXT:    s_add_u32 s6, s6, wobble@gotpcrel32@lo+4
+; GLOBALNESS1-NEXT:    s_addc_u32 s7, s7, wobble@gotpcrel32@hi+12
+; GLOBALNESS1-NEXT:    s_xor_b64 s[52:53], s[4:5], -1
+; GLOBALNESS1-NEXT:    s_waitcnt vmcnt(0)
+; GLOBALNESS1-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v0
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s4, 36
+; GLOBALNESS1-NEXT:    s_load_dwordx2 s[42:43], s[6:7], 0x0
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s5, 37
+; GLOBALNESS1-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s4, 38
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s5, 39
+; GLOBALNESS1-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s4, 40
+; GLOBALNESS1-NEXT:    s_mov_b32 s100, s16
+; GLOBALNESS1-NEXT:    s_mov_b32 s101, s15
+; GLOBALNESS1-NEXT:    s_mov_b32 s44, s14
+; GLOBALNESS1-NEXT:    s_mov_b64 s[34:35], s[10:11]
+; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e64 s[58:59], 1, v1
+; GLOBALNESS1-NEXT:    v_cmp_gt_i32_e64 s[48:49], 1, v0
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s5, 41
+; GLOBALNESS1-NEXT:    s_mov_b32 s45, 0x3ff00000
+; GLOBALNESS1-NEXT:    s_mov_b32 s32, 0
+; GLOBALNESS1-NEXT:    s_branch .LBB1_4
+; GLOBALNESS1-NEXT:  .LBB1_1: ; %bb70.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT:    v_readlane_b32 s6, v41, 40
+; GLOBALNESS1-NEXT:    v_readlane_b32 s7, v41, 41
+; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
+; GLOBALNESS1-NEXT:    s_cbranch_vccz .LBB1_29
+; GLOBALNESS1-NEXT:  .LBB1_2: ; %Flow6
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], 0
+; GLOBALNESS1-NEXT:  .LBB1_3: ; %Flow19
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a63, v31
+; GLOBALNESS1-NEXT:    v_readlane_b32 s4, v41, 42
+; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[6:7]
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a62, v30
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a61, v29
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a60, v28
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a59, v27
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a58, v26
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a57, v25
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a56, v24
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a55, v23
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a54, v22
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a53, v21
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a52, v20
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a51, v19
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a50, v18
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a49, v17
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a48, v16
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a47, v15
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a46, v14
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a45, v13
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a44, v12
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a43, v11
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a42, v10
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a41, v9
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a40, v8
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a39, v7
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a38, v6
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a37, v5
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a36, v4
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a35, v3
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a34, v2
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a33, v1
+; GLOBALNESS1-NEXT:    v_accvgpr_write_b32 a32, v0
+; GLOBALNESS1-NEXT:    v_readlane_b32 s5, v41, 43
+; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_30
+; GLOBALNESS1-NEXT:  .LBB1_4: ; %bb5
+; GLOBALNESS1-NEXT:    ; =>This Loop Header: Depth=1
+; GLOBALNESS1-NEXT:    ; Child Loop BB1_17 Depth 2
+; GLOBALNESS1-NEXT:    v_readlane_b32 s60, v41, 0
+; GLOBALNESS1-NEXT:    v_readlane_b32 s61, v41, 1
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[0:1], s[60:61], s[60:61] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    flat_load_dword v40, v[0:1]
+; GLOBALNESS1-NEXT:    s_add_u32 s8, s38, 40
+; GLOBALNESS1-NEXT:    buffer_store_dword v44, off, s[0:3], 0
+; GLOBALNESS1-NEXT:    flat_load_dword v43, v[0:1]
+; GLOBALNESS1-NEXT:    s_addc_u32 s9, s39, 0
+; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], s[36:37]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[10:11], s[34:35]
+; GLOBALNESS1-NEXT:    s_mov_b32 s12, s44
+; GLOBALNESS1-NEXT:    s_mov_b32 s13, s101
+; GLOBALNESS1-NEXT:    s_mov_b32 s14, s100
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v42
+; GLOBALNESS1-NEXT:    v_readlane_b32 s62, v41, 2
+; GLOBALNESS1-NEXT:    v_readlane_b32 s63, v41, 3
+; GLOBALNESS1-NEXT:    v_readlane_b32 s64, v41, 4
+; GLOBALNESS1-NEXT:    v_readlane_b32 s65, v41, 5
+; GLOBALNESS1-NEXT:    v_readlane_b32 s66, v41, 6
+; GLOBALNESS1-NEXT:    v_readlane_b32 s67, v41, 7
+; GLOBALNESS1-NEXT:    v_readlane_b32 s68, v41, 8
+; GLOBALNESS1-NEXT:    v_readlane_b32 s69, v41, 9
+; GLOBALNESS1-NEXT:    v_readlane_b32 s70, v41, 10
+; GLOBALNESS1-NEXT:    v_readlane_b32 s71, v41, 11
+; GLOBALNESS1-NEXT:    v_readlane_b32 s72, v41, 12
+; GLOBALNESS1-NEXT:    v_readlane_b32 s73, v41, 13
+; GLOBALNESS1-NEXT:    v_readlane_b32 s74, v41, 14
+; GLOBALNESS1-NEXT:    v_readlane_b32 s75, v41, 15
+; GLOBALNESS1-NEXT:    v_readlane_b32 s76, v41, 16
+; GLOBALNESS1-NEXT:    v_readlane_b32 s77, v41, 17
+; GLOBALNESS1-NEXT:    v_readlane_b32 s78, v41, 18
+; GLOBALNESS1-NEXT:    v_readlane_b32 s79, v41, 19
+; GLOBALNESS1-NEXT:    v_readlane_b32 s80, v41, 20
+; GLOBALNESS1-NEXT:    v_readlane_b32 s81, v41, 21
+; GLOBALNESS1-NEXT:    v_readlane_b32 s82, v41, 22
+; GLOBALNESS1-NEXT:    v_readlane_b32 s83, v41, 23
+; GLOBALNESS1-NEXT:    v_readlane_b32 s84, v41, 24
+; GLOBALNESS1-NEXT:    v_readlane_b32 s85, v41, 25
+; GLOBALNESS1-NEXT:    v_readlane_b32 s86, v41, 26
+; GLOBALNESS1-NEXT:    v_readlane_b32 s87, v41, 27
+; GLOBALNESS1-NEXT:    v_readlane_b32 s88, v41, 28
+; GLOBALNESS1-NEXT:    v_readlane_b32 s89, v41, 29
+; GLOBALNESS1-NEXT:    v_readlane_b32 s90, v41, 30
+; GLOBALNESS1-NEXT:    v_readlane_b32 s91, v41, 31
+; GLOBALNESS1-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[42:43]
+; GLOBALNESS1-NEXT:    ; implicit-def: $sgpr4_sgpr5
+; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[58:59]
+; GLOBALNESS1-NEXT:    ; kill: killed $sgpr4_sgpr5
+; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], -1
+; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_10
+; GLOBALNESS1-NEXT:  ; %bb.5: ; %NodeBlock
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT:    s_mov_b64 s[8:9], -1
+; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], 0
+; GLOBALNESS1-NEXT:    s_cmp_lt_i32 s57, 1
+; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], -1
+; GLOBALNESS1-NEXT:    s_cbranch_scc1 .LBB1_7
+; GLOBALNESS1-NEXT:  ; %bb.6: ; %LeafBlock3
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT:    s_cmp_lg_u32 s57, 1
+; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], 0
+; GLOBALNESS1-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GLOBALNESS1-NEXT:  .LBB1_7: ; %Flow17
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
+; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_9
+; GLOBALNESS1-NEXT:  ; %bb.8: ; %LeafBlock
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT:    s_cmp_lg_u32 s57, 0
+; GLOBALNESS1-NEXT:    s_mov_b64 s[8:9], 0
+; GLOBALNESS1-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GLOBALNESS1-NEXT:  .LBB1_9: ; %Flow18
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s8, 42
+; GLOBALNESS1-NEXT:    v_writelane_b32 v41, s9, 43
+; GLOBALNESS1-NEXT:  .LBB1_10: ; %Flow16
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT:    v_readlane_b32 s68, v41, 0
+; GLOBALNESS1-NEXT:    v_readlane_b32 s69, v41, 1
+; GLOBALNESS1-NEXT:    s_mov_b64 s[60:61], s[68:69]
+; GLOBALNESS1-NEXT:    v_readlane_b32 s70, v41, 2
+; GLOBALNESS1-NEXT:    v_readlane_b32 s71, v41, 3
+; GLOBALNESS1-NEXT:    v_readlane_b32 s72, v41, 4
+; GLOBALNESS1-NEXT:    v_readlane_b32 s73, v41, 5
+; GLOBALNESS1-NEXT:    v_readlane_b32 s74, v41, 6
+; GLOBALNESS1-NEXT:    v_readlane_b32 s75, v41, 7
+; GLOBALNESS1-NEXT:    v_readlane_b32 s76, v41, 8
+; GLOBALNESS1-NEXT:    v_readlane_b32 s77, v41, 9
+; GLOBALNESS1-NEXT:    v_readlane_b32 s78, v41, 10
+; GLOBALNESS1-NEXT:    v_readlane_b32 s79, v41, 11
+; GLOBALNESS1-NEXT:    v_readlane_b32 s80, v41, 12
+; GLOBALNESS1-NEXT:    v_readlane_b32 s81, v41, 13
+; GLOBALNESS1-NEXT:    v_readlane_b32 s82, v41, 14
+; GLOBALNESS1-NEXT:    v_readlane_b32 s83, v41, 15
+; GLOBALNESS1-NEXT:    v_readlane_b32 s84, v41, 16
+; GLOBALNESS1-NEXT:    v_readlane_b32 s85, v41, 17
+; GLOBALNESS1-NEXT:    v_readlane_b32 s86, v41, 18
+; GLOBALNESS1-NEXT:    v_readlane_b32 s87, v41, 19
+; GLOBALNESS1-NEXT:    v_readlane_b32 s88, v41, 20
+; GLOBALNESS1-NEXT:    v_readlane_b32 s89, v41, 21
+; GLOBALNESS1-NEXT:    v_readlane_b32 s90, v41, 22
+; GLOBALNESS1-NEXT:    v_readlane_b32 s91, v41, 23
+; GLOBALNESS1-NEXT:    v_readlane_b32 s92, v41, 24
+; GLOBALNESS1-NEXT:    v_readlane_b32 s93, v41, 25
+; GLOBALNESS1-NEXT:    v_readlane_b32 s94, v41, 26
+; GLOBALNESS1-NEXT:    v_readlane_b32 s95, v41, 27
+; GLOBALNESS1-NEXT:    v_readlane_b32 s96, v41, 28
+; GLOBALNESS1-NEXT:    v_readlane_b32 s97, v41, 29
+; GLOBALNESS1-NEXT:    v_readlane_b32 s98, v41, 30
+; GLOBALNESS1-NEXT:    v_readlane_b32 s99, v41, 31
+; GLOBALNESS1-NEXT:    s_mov_b32 s68, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s69, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s70, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s71, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s72, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s73, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s74, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s75, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s76, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s77, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s78, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s79, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s80, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s81, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s82, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s83, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s84, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s85, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s86, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s87, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s88, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s89, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s90, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s91, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s92, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s93, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s94, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s95, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s96, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s97, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s98, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s99, s61
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[0:1], s[68:69], s[68:69] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], -1
+; GLOBALNESS1-NEXT:    s_and_b64 vcc, exec, s[4:5]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[2:3], s[70:71], s[70:71] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[4:5], s[72:73], s[72:73] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[6:7], s[74:75], s[74:75] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[8:9], s[76:77], s[76:77] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[10:11], s[78:79], s[78:79] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[12:13], s[80:81], s[80:81] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[14:15], s[82:83], s[82:83] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[16:17], s[84:85], s[84:85] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[18:19], s[86:87], s[86:87] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[20:21], s[88:89], s[88:89] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[22:23], s[90:91], s[90:91] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[24:25], s[92:93], s[92:93] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[26:27], s[94:95], s[94:95] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[28:29], s[96:97], s[96:97] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[30:31], s[98:99], s[98:99] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    s_cbranch_vccz .LBB1_3
+; GLOBALNESS1-NEXT:  ; %bb.11: ; %baz.exit.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
+; GLOBALNESS1-NEXT:    flat_load_dword v0, v[0:1]
+; GLOBALNESS1-NEXT:    v_readlane_b32 s60, v41, 0
+; GLOBALNESS1-NEXT:    v_readlane_b32 s61, v41, 1
+; GLOBALNESS1-NEXT:    v_readlane_b32 s64, v41, 4
+; GLOBALNESS1-NEXT:    v_readlane_b32 s65, v41, 5
+; GLOBALNESS1-NEXT:    v_readlane_b32 s66, v41, 6
+; GLOBALNESS1-NEXT:    v_readlane_b32 s67, v41, 7
+; GLOBALNESS1-NEXT:    v_readlane_b32 s68, v41, 8
+; GLOBALNESS1-NEXT:    v_readlane_b32 s69, v41, 9
+; GLOBALNESS1-NEXT:    v_readlane_b32 s70, v41, 10
+; GLOBALNESS1-NEXT:    v_readlane_b32 s71, v41, 11
+; GLOBALNESS1-NEXT:    v_readlane_b32 s72, v41, 12
+; GLOBALNESS1-NEXT:    v_readlane_b32 s73, v41, 13
+; GLOBALNESS1-NEXT:    v_readlane_b32 s74, v41, 14
+; GLOBALNESS1-NEXT:    v_readlane_b32 s75, v41, 15
+; GLOBALNESS1-NEXT:    v_readlane_b32 s76, v41, 16
+; GLOBALNESS1-NEXT:    v_readlane_b32 s77, v41, 17
+; GLOBALNESS1-NEXT:    v_readlane_b32 s78, v41, 18
+; GLOBALNESS1-NEXT:    v_readlane_b32 s79, v41, 19
+; GLOBALNESS1-NEXT:    v_readlane_b32 s80, v41, 20
+; GLOBALNESS1-NEXT:    v_readlane_b32 s81, v41, 21
+; GLOBALNESS1-NEXT:    s_mov_b32 s65, s45
+; GLOBALNESS1-NEXT:    s_mov_b32 s64, s61
+; GLOBALNESS1-NEXT:    v_readlane_b32 s82, v41, 22
+; GLOBALNESS1-NEXT:    v_readlane_b32 s83, v41, 23
+; GLOBALNESS1-NEXT:    v_readlane_b32 s84, v41, 24
+; GLOBALNESS1-NEXT:    v_readlane_b32 s85, v41, 25
+; GLOBALNESS1-NEXT:    v_readlane_b32 s86, v41, 26
+; GLOBALNESS1-NEXT:    v_readlane_b32 s87, v41, 27
+; GLOBALNESS1-NEXT:    v_readlane_b32 s88, v41, 28
+; GLOBALNESS1-NEXT:    v_readlane_b32 s89, v41, 29
+; GLOBALNESS1-NEXT:    v_readlane_b32 s90, v41, 30
+; GLOBALNESS1-NEXT:    v_readlane_b32 s91, v41, 31
+; GLOBALNESS1-NEXT:    s_mov_b32 s66, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s67, s45
+; GLOBALNESS1-NEXT:    s_mov_b32 s68, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s69, s45
+; GLOBALNESS1-NEXT:    s_mov_b32 s70, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s71, s45
+; GLOBALNESS1-NEXT:    s_mov_b32 s72, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s73, s45
+; GLOBALNESS1-NEXT:    s_mov_b32 s74, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s75, s45
+; GLOBALNESS1-NEXT:    s_mov_b32 s76, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s77, s45
+; GLOBALNESS1-NEXT:    s_mov_b32 s78, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s79, s45
+; GLOBALNESS1-NEXT:    s_mov_b32 s80, s61
+; GLOBALNESS1-NEXT:    s_mov_b32 s81, s45
+; GLOBALNESS1-NEXT:    v_readlane_b32 s62, v41, 2
+; GLOBALNESS1-NEXT:    v_readlane_b32 s63, v41, 3
+; GLOBALNESS1-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GLOBALNESS1-NEXT:    v_cmp_gt_i32_e64 s[54:55], 0, v0
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[0:1], s[64:65], s[64:65] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[2:3], s[66:67], s[66:67] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[4:5], s[68:69], s[68:69] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[6:7], s[70:71], s[70:71] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[8:9], s[72:73], s[72:73] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[10:11], s[74:75], s[74:75] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[12:13], s[76:77], s[76:77] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[14:15], s[78:79], s[78:79] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[16:17], s[80:81], s[80:81] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[18:19], s[82:83], s[82:83] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[20:21], s[84:85], s[84:85] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[22:23], s[86:87], s[86:87] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[24:25], s[88:89], s[88:89] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[26:27], s[90:91], s[90:91] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[28:29], s[92:93], s[92:93] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[30:31], s[94:95], s[94:95] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    s_and_saveexec_b64 s[90:91], s[54:55]
+; GLOBALNESS1-NEXT:    s_cbranch_execz .LBB1_26
+; GLOBALNESS1-NEXT:  ; %bb.12: ; %bb33.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[2:3], 0, 0
+; GLOBALNESS1-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
+; GLOBALNESS1-NEXT:    v_readlane_b32 s4, v41, 36
+; GLOBALNESS1-NEXT:    v_readlane_b32 s5, v41, 37
+; GLOBALNESS1-NEXT:    s_mov_b64 s[92:93], s[58:59]
+; GLOBALNESS1-NEXT:    s_mov_b32 s89, s57
+; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
+; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_14
+; GLOBALNESS1-NEXT:  ; %bb.13: ; %bb39.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v45, v44
+; GLOBALNESS1-NEXT:    global_store_dwordx2 v[2:3], v[44:45], off
+; GLOBALNESS1-NEXT:  .LBB1_14: ; %bb44.lr.ph.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v43
+; GLOBALNESS1-NEXT:    v_cndmask_b32_e32 v2, 0, v40, vcc
+; GLOBALNESS1-NEXT:    v_readlane_b32 s62, v41, 32
+; GLOBALNESS1-NEXT:    v_readlane_b32 s64, v41, 34
+; GLOBALNESS1-NEXT:    s_waitcnt vmcnt(0)
+; GLOBALNESS1-NEXT:    v_cmp_nlt_f64_e64 s[56:57], 0, v[0:1]
+; GLOBALNESS1-NEXT:    v_cmp_eq_u32_e64 s[58:59], 0, v2
+; GLOBALNESS1-NEXT:    v_readlane_b32 s63, v41, 33
+; GLOBALNESS1-NEXT:    v_readlane_b32 s65, v41, 35
+; GLOBALNESS1-NEXT:    s_branch .LBB1_17
+; GLOBALNESS1-NEXT:  .LBB1_15: ; %Flow7
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_17 Depth=2
+; GLOBALNESS1-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GLOBALNESS1-NEXT:  .LBB1_16: ; %bb63.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_17 Depth=2
+; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[52:53]
+; GLOBALNESS1-NEXT:    s_cbranch_vccz .LBB1_25
+; GLOBALNESS1-NEXT:  .LBB1_17: ; %bb44.i
+; GLOBALNESS1-NEXT:    ; Parent Loop BB1_4 Depth=1
+; GLOBALNESS1-NEXT:    ; => This Inner Loop Header: Depth=2
+; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[46:47]
+; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_16
+; GLOBALNESS1-NEXT:  ; %bb.18: ; %bb46.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_17 Depth=2
+; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[50:51]
+; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_16
+; GLOBALNESS1-NEXT:  ; %bb.19: ; %bb50.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_17 Depth=2
+; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[62:63]
+; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_22
+; GLOBALNESS1-NEXT:  ; %bb.20: ; %bb3.i.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_17 Depth=2
+; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[64:65]
+; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_22
+; GLOBALNESS1-NEXT:  ; %bb.21: ; %bb6.i.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_17 Depth=2
+; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[56:57]
+; GLOBALNESS1-NEXT:  .LBB1_22: ; %spam.exit.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_17 Depth=2
+; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[48:49]
+; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_16
+; GLOBALNESS1-NEXT:  ; %bb.23: ; %bb55.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_17 Depth=2
+; GLOBALNESS1-NEXT:    s_add_u32 s60, s38, 40
+; GLOBALNESS1-NEXT:    s_addc_u32 s61, s39, 0
+; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], s[36:37]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[8:9], s[60:61]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[10:11], s[34:35]
+; GLOBALNESS1-NEXT:    s_mov_b32 s12, s44
+; GLOBALNESS1-NEXT:    s_mov_b32 s13, s101
+; GLOBALNESS1-NEXT:    s_mov_b32 s14, s100
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v42
+; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[42:43]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
+; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], s[36:37]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[8:9], s[60:61]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[10:11], s[34:35]
+; GLOBALNESS1-NEXT:    s_mov_b32 s12, s44
+; GLOBALNESS1-NEXT:    s_mov_b32 s13, s101
+; GLOBALNESS1-NEXT:    s_mov_b32 s14, s100
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v42
+; GLOBALNESS1-NEXT:    global_store_dwordx2 v[0:1], a[32:33], off
+; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[42:43]
+; GLOBALNESS1-NEXT:    s_and_saveexec_b64 s[4:5], s[58:59]
+; GLOBALNESS1-NEXT:    s_cbranch_execz .LBB1_15
+; GLOBALNESS1-NEXT:  ; %bb.24: ; %bb62.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_17 Depth=2
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v45, v44
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
+; GLOBALNESS1-NEXT:    global_store_dwordx2 v[0:1], v[44:45], off
+; GLOBALNESS1-NEXT:    s_branch .LBB1_15
+; GLOBALNESS1-NEXT:  .LBB1_25: ; %Flow14
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT:    v_readlane_b32 s56, v41, 0
+; GLOBALNESS1-NEXT:    v_readlane_b32 s57, v41, 1
+; GLOBALNESS1-NEXT:    v_readlane_b32 s68, v41, 12
+; GLOBALNESS1-NEXT:    v_readlane_b32 s69, v41, 13
+; GLOBALNESS1-NEXT:    v_readlane_b32 s70, v41, 14
+; GLOBALNESS1-NEXT:    v_readlane_b32 s71, v41, 15
+; GLOBALNESS1-NEXT:    v_readlane_b32 s72, v41, 16
+; GLOBALNESS1-NEXT:    v_readlane_b32 s73, v41, 17
+; GLOBALNESS1-NEXT:    v_readlane_b32 s74, v41, 18
+; GLOBALNESS1-NEXT:    v_readlane_b32 s75, v41, 19
+; GLOBALNESS1-NEXT:    v_readlane_b32 s76, v41, 20
+; GLOBALNESS1-NEXT:    v_readlane_b32 s77, v41, 21
+; GLOBALNESS1-NEXT:    v_readlane_b32 s78, v41, 22
+; GLOBALNESS1-NEXT:    v_readlane_b32 s79, v41, 23
+; GLOBALNESS1-NEXT:    v_readlane_b32 s80, v41, 24
+; GLOBALNESS1-NEXT:    v_readlane_b32 s81, v41, 25
+; GLOBALNESS1-NEXT:    v_readlane_b32 s82, v41, 26
+; GLOBALNESS1-NEXT:    v_readlane_b32 s83, v41, 27
+; GLOBALNESS1-NEXT:    v_readlane_b32 s84, v41, 28
+; GLOBALNESS1-NEXT:    v_readlane_b32 s85, v41, 29
+; GLOBALNESS1-NEXT:    s_mov_b32 s68, s57
+; GLOBALNESS1-NEXT:    s_mov_b32 s69, s57
+; GLOBALNESS1-NEXT:    v_readlane_b32 s58, v41, 2
+; GLOBALNESS1-NEXT:    v_readlane_b32 s59, v41, 3
+; GLOBALNESS1-NEXT:    v_readlane_b32 s86, v41, 30
+; GLOBALNESS1-NEXT:    v_readlane_b32 s87, v41, 31
+; GLOBALNESS1-NEXT:    s_mov_b32 s70, s57
+; GLOBALNESS1-NEXT:    s_mov_b32 s71, s57
+; GLOBALNESS1-NEXT:    s_mov_b32 s72, s57
+; GLOBALNESS1-NEXT:    s_mov_b32 s73, s57
+; GLOBALNESS1-NEXT:    s_mov_b32 s74, s57
+; GLOBALNESS1-NEXT:    s_mov_b32 s75, s57
+; GLOBALNESS1-NEXT:    s_mov_b32 s76, s57
+; GLOBALNESS1-NEXT:    s_mov_b32 s77, s57
+; GLOBALNESS1-NEXT:    s_mov_b32 s78, s57
+; GLOBALNESS1-NEXT:    s_mov_b32 s79, s57
+; GLOBALNESS1-NEXT:    s_mov_b32 s80, s57
+; GLOBALNESS1-NEXT:    s_mov_b32 s81, s57
+; GLOBALNESS1-NEXT:    s_mov_b32 s82, s57
+; GLOBALNESS1-NEXT:    s_mov_b32 s83, s57
+; GLOBALNESS1-NEXT:    s_mov_b32 s84, s57
+; GLOBALNESS1-NEXT:    s_mov_b32 s85, s57
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[0:1], s[68:69], s[68:69] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_readlane_b32 s60, v41, 4
+; GLOBALNESS1-NEXT:    v_readlane_b32 s61, v41, 5
+; GLOBALNESS1-NEXT:    v_readlane_b32 s62, v41, 6
+; GLOBALNESS1-NEXT:    v_readlane_b32 s63, v41, 7
+; GLOBALNESS1-NEXT:    v_readlane_b32 s64, v41, 8
+; GLOBALNESS1-NEXT:    v_readlane_b32 s65, v41, 9
+; GLOBALNESS1-NEXT:    v_readlane_b32 s66, v41, 10
+; GLOBALNESS1-NEXT:    v_readlane_b32 s67, v41, 11
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[2:3], s[70:71], s[70:71] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[4:5], s[72:73], s[72:73] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[6:7], s[74:75], s[74:75] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[8:9], s[76:77], s[76:77] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[10:11], s[78:79], s[78:79] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[12:13], s[80:81], s[80:81] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[14:15], s[82:83], s[82:83] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[16:17], s[84:85], s[84:85] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[18:19], s[86:87], s[86:87] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[20:21], s[88:89], s[88:89] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[22:23], s[90:91], s[90:91] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[24:25], s[92:93], s[92:93] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[26:27], s[94:95], s[94:95] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[28:29], s[96:97], s[96:97] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[30:31], s[98:99], s[98:99] op_sel:[0,1]
+; GLOBALNESS1-NEXT:    s_mov_b32 s57, s89
+; GLOBALNESS1-NEXT:    s_mov_b64 s[58:59], s[92:93]
+; GLOBALNESS1-NEXT:  .LBB1_26: ; %Flow15
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT:    s_or_b64 exec, exec, s[90:91]
+; GLOBALNESS1-NEXT:    s_and_saveexec_b64 s[4:5], s[54:55]
+; GLOBALNESS1-NEXT:    s_cbranch_execz .LBB1_2
+; GLOBALNESS1-NEXT:  ; %bb.27: ; %bb67.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT:    v_readlane_b32 s6, v41, 38
+; GLOBALNESS1-NEXT:    v_readlane_b32 s7, v41, 39
+; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
+; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_1
+; GLOBALNESS1-NEXT:  ; %bb.28: ; %bb69.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v45, v44
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[32:33], 0, 0
+; GLOBALNESS1-NEXT:    global_store_dwordx2 v[32:33], v[44:45], off
+; GLOBALNESS1-NEXT:    s_branch .LBB1_1
+; GLOBALNESS1-NEXT:  .LBB1_29: ; %bb73.i
+; GLOBALNESS1-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v45, v44
+; GLOBALNESS1-NEXT:    v_pk_mov_b32 v[32:33], 0, 0
+; GLOBALNESS1-NEXT:    global_store_dwordx2 v[32:33], v[44:45], off
+; GLOBALNESS1-NEXT:    s_branch .LBB1_2
+; GLOBALNESS1-NEXT:  .LBB1_30: ; %loop.exit.guard
+; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], -1
+; GLOBALNESS1-NEXT:    s_cbranch_vccz .LBB1_32
+; GLOBALNESS1-NEXT:  ; %bb.31: ; %bb7.i.i
+; GLOBALNESS1-NEXT:    s_add_u32 s8, s38, 40
+; GLOBALNESS1-NEXT:    s_addc_u32 s9, s39, 0
+; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], s[36:37]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[10:11], s[34:35]
+; GLOBALNESS1-NEXT:    s_mov_b32 s12, s44
+; GLOBALNESS1-NEXT:    s_mov_b32 s13, s101
+; GLOBALNESS1-NEXT:    s_mov_b32 s14, s100
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v42
+; GLOBALNESS1-NEXT:    s_getpc_b64 s[16:17]
+; GLOBALNESS1-NEXT:    s_add_u32 s16, s16, widget@rel32@lo+4
+; GLOBALNESS1-NEXT:    s_addc_u32 s17, s17, widget@rel32@hi+12
+; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], 0
+; GLOBALNESS1-NEXT:  .LBB1_32: ; %Flow
+; GLOBALNESS1-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
+; GLOBALNESS1-NEXT:    s_cbranch_vccnz .LBB1_34
+; GLOBALNESS1-NEXT:  ; %bb.33: ; %bb11.i.i
+; GLOBALNESS1-NEXT:    s_add_u32 s8, s38, 40
+; GLOBALNESS1-NEXT:    s_addc_u32 s9, s39, 0
+; GLOBALNESS1-NEXT:    s_mov_b64 s[4:5], s[40:41]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[6:7], s[36:37]
+; GLOBALNESS1-NEXT:    s_mov_b64 s[10:11], s[34:35]
+; GLOBALNESS1-NEXT:    s_mov_b32 s12, s44
+; GLOBALNESS1-NEXT:    s_mov_b32 s13, s101
+; GLOBALNESS1-NEXT:    s_mov_b32 s14, s100
+; GLOBALNESS1-NEXT:    v_mov_b32_e32 v31, v42
+; GLOBALNESS1-NEXT:    s_getpc_b64 s[16:17]
+; GLOBALNESS1-NEXT:    s_add_u32 s16, s16, widget@rel32@lo+4
+; GLOBALNESS1-NEXT:    s_addc_u32 s17, s17, widget@rel32@hi+12
+; GLOBALNESS1-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GLOBALNESS1-NEXT:  .LBB1_34: ; %UnifiedUnreachableBlock
+;
+; GLOBALNESS0-LABEL: kernel:
+; GLOBALNESS0:       ; %bb.0: ; %bb
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s16, 0
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s15, 1
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s10, 2
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s11, 3
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s6, 4
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s7, 5
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s4, 6
+; GLOBALNESS0-NEXT:    s_mov_b64 s[38:39], s[8:9]
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s5, 7
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v43, v0
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v44, 0
+; GLOBALNESS0-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GLOBALNESS0-NEXT:    s_load_dwordx2 s[56:57], s[8:9], 0x8
+; GLOBALNESS0-NEXT:    s_nop 0
+; GLOBALNESS0-NEXT:    s_load_dword s8, s[8:9], 0x14
+; GLOBALNESS0-NEXT:    s_nop 0
+; GLOBALNESS0-NEXT:    s_load_dwordx2 s[6:7], s[38:39], 0x18
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
+; GLOBALNESS0-NEXT:    global_store_dword v[0:1], v44, off
+; GLOBALNESS0-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBALNESS0-NEXT:    global_load_dword v0, v44, s[4:5]
+; GLOBALNESS0-NEXT:    s_mov_b32 s61, 0
+; GLOBALNESS0-NEXT:    s_mov_b32 s60, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s62, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s63, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s64, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s65, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s66, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s67, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s68, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s69, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s70, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s71, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s72, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s73, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s74, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s75, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s76, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s77, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s78, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s79, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s80, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s81, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s82, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s83, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s84, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s85, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s86, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s87, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s88, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s89, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s90, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s91, s61
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a32, s60
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a33, s61
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a34, s62
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a35, s63
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a36, s64
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a37, s65
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a38, s66
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a39, s67
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a40, s68
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a41, s69
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a42, s70
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a43, s71
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a44, s72
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a45, s73
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a46, s74
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a47, s75
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a48, s76
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a49, s77
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a50, s78
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a51, s79
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a52, s80
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a53, s81
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a54, s82
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a55, s83
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a56, s84
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a57, s85
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a58, s86
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a59, s87
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a60, s88
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a61, s89
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a62, s90
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a63, s91
+; GLOBALNESS0-NEXT:    s_movk_i32 s60, 0x80
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s60, 8
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s61, 9
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s62, 10
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s63, 11
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s64, 12
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s65, 13
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s66, 14
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s67, 15
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s68, 16
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s69, 17
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s70, 18
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s71, 19
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s72, 20
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s73, 21
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s74, 22
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s75, 23
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s76, 24
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s77, 25
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s78, 26
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s79, 27
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s80, 28
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s81, 29
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s82, 30
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s83, 31
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s84, 32
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s85, 33
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s86, 34
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s87, 35
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s88, 36
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s89, 37
+; GLOBALNESS0-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v45, 0x40994400
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s90, 38
+; GLOBALNESS0-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s91, 39
+; GLOBALNESS0-NEXT:    v_cmp_ngt_f64_e64 s[4:5], s[6:7], v[44:45]
+; GLOBALNESS0-NEXT:    s_add_u32 s0, s0, s17
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s4, 40
+; GLOBALNESS0-NEXT:    s_addc_u32 s1, s1, 0
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s5, 41
+; GLOBALNESS0-NEXT:    v_cmp_ngt_f64_e64 s[4:5], s[6:7], 0
+; GLOBALNESS0-NEXT:    s_bitcmp1_b32 s56, 0
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s4, 42
+; GLOBALNESS0-NEXT:    s_load_dword s9, s[38:39], 0x20
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s5, 43
+; GLOBALNESS0-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GLOBALNESS0-NEXT:    s_xor_b64 s[36:37], s[4:5], -1
+; GLOBALNESS0-NEXT:    s_bitcmp1_b32 s8, 0
+; GLOBALNESS0-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GLOBALNESS0-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GLOBALNESS0-NEXT:    s_xor_b64 s[34:35], s[4:5], -1
+; GLOBALNESS0-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBALNESS0-NEXT:    s_bitcmp1_b32 s9, 0
+; GLOBALNESS0-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GLOBALNESS0-NEXT:    s_getpc_b64 s[6:7]
+; GLOBALNESS0-NEXT:    s_add_u32 s6, s6, wobble@gotpcrel32@lo+4
+; GLOBALNESS0-NEXT:    s_addc_u32 s7, s7, wobble@gotpcrel32@hi+12
+; GLOBALNESS0-NEXT:    s_xor_b64 s[100:101], s[4:5], -1
+; GLOBALNESS0-NEXT:    s_waitcnt vmcnt(0)
+; GLOBALNESS0-NEXT:    v_cmp_gt_i32_e64 s[4:5], 0, v0
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s4, 44
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s5, 45
+; GLOBALNESS0-NEXT:    v_cmp_gt_i32_e64 s[4:5], 1, v0
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s4, 46
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s5, 47
+; GLOBALNESS0-NEXT:    v_cmp_eq_u32_e64 s[4:5], 1, v0
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s4, 48
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s5, 49
+; GLOBALNESS0-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s4, 50
+; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e64 s[58:59], 1, v1
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s5, 51
+; GLOBALNESS0-NEXT:    s_mov_b32 s45, 0x3ff00000
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s44, 52
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s56, 0
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s57, 1
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s58, 2
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s59, 3
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s60, 4
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s61, 5
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s62, 6
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s63, 7
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s64, 8
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s65, 9
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s45, 53
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s66, 10
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s46, 54
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s67, 11
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s47, 55
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s68, 12
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s48, 56
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s69, 13
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s49, 57
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s70, 14
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s50, 58
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s71, 15
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s51, 59
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s72, 16
+; GLOBALNESS0-NEXT:    s_load_dwordx2 s[42:43], s[6:7], 0x0
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s52, 60
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s73, 17
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s53, 61
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s74, 18
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s54, 62
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s75, 19
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s55, 63
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s56, 20
+; GLOBALNESS0-NEXT:    s_mov_b32 s33, s14
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s57, 21
+; GLOBALNESS0-NEXT:    s_mov_b32 s32, 0
+; GLOBALNESS0-NEXT:    s_branch .LBB1_4
+; GLOBALNESS0-NEXT:  .LBB1_1: ; %bb70.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT:    v_readlane_b32 s6, v41, 50
+; GLOBALNESS0-NEXT:    v_readlane_b32 s7, v41, 51
+; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
+; GLOBALNESS0-NEXT:    s_cbranch_vccz .LBB1_29
+; GLOBALNESS0-NEXT:  .LBB1_2: ; %Flow6
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], 0
+; GLOBALNESS0-NEXT:  .LBB1_3: ; %Flow19
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a63, v31
+; GLOBALNESS0-NEXT:    v_readlane_b32 s4, v42, 22
+; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[6:7]
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a62, v30
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a61, v29
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a60, v28
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a59, v27
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a58, v26
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a57, v25
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a56, v24
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a55, v23
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a54, v22
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a53, v21
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a52, v20
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a51, v19
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a50, v18
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a49, v17
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a48, v16
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a47, v15
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a46, v14
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a45, v13
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a44, v12
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a43, v11
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a42, v10
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a41, v9
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a40, v8
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a39, v7
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a38, v6
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a37, v5
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a36, v4
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a35, v3
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a34, v2
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a33, v1
+; GLOBALNESS0-NEXT:    v_accvgpr_write_b32 a32, v0
+; GLOBALNESS0-NEXT:    v_readlane_b32 s5, v42, 23
+; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_30
+; GLOBALNESS0-NEXT:  .LBB1_4: ; %bb5
+; GLOBALNESS0-NEXT:    ; =>This Loop Header: Depth=1
+; GLOBALNESS0-NEXT:    ; Child Loop BB1_17 Depth 2
+; GLOBALNESS0-NEXT:    v_readlane_b32 s60, v41, 8
+; GLOBALNESS0-NEXT:    v_readlane_b32 s61, v41, 9
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[0:1], s[60:61], s[60:61] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    flat_load_dword v40, v[0:1]
+; GLOBALNESS0-NEXT:    s_add_u32 s8, s38, 40
+; GLOBALNESS0-NEXT:    buffer_store_dword v44, off, s[0:3], 0
+; GLOBALNESS0-NEXT:    flat_load_dword v46, v[0:1]
+; GLOBALNESS0-NEXT:    v_readlane_b32 s4, v41, 6
+; GLOBALNESS0-NEXT:    v_readlane_b32 s6, v41, 4
+; GLOBALNESS0-NEXT:    v_readlane_b32 s10, v41, 2
+; GLOBALNESS0-NEXT:    s_addc_u32 s9, s39, 0
+; GLOBALNESS0-NEXT:    v_readlane_b32 s5, v41, 7
+; GLOBALNESS0-NEXT:    v_readlane_b32 s7, v41, 5
+; GLOBALNESS0-NEXT:    v_readlane_b32 s11, v41, 3
+; GLOBALNESS0-NEXT:    s_mov_b32 s12, s33
+; GLOBALNESS0-NEXT:    v_readlane_b32 s13, v41, 1
+; GLOBALNESS0-NEXT:    v_readlane_b32 s14, v41, 0
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v43
+; GLOBALNESS0-NEXT:    v_readlane_b32 s62, v41, 10
+; GLOBALNESS0-NEXT:    v_readlane_b32 s63, v41, 11
+; GLOBALNESS0-NEXT:    v_readlane_b32 s64, v41, 12
+; GLOBALNESS0-NEXT:    v_readlane_b32 s65, v41, 13
+; GLOBALNESS0-NEXT:    v_readlane_b32 s66, v41, 14
+; GLOBALNESS0-NEXT:    v_readlane_b32 s67, v41, 15
+; GLOBALNESS0-NEXT:    v_readlane_b32 s68, v41, 16
+; GLOBALNESS0-NEXT:    v_readlane_b32 s69, v41, 17
+; GLOBALNESS0-NEXT:    v_readlane_b32 s70, v41, 18
+; GLOBALNESS0-NEXT:    v_readlane_b32 s71, v41, 19
+; GLOBALNESS0-NEXT:    v_readlane_b32 s72, v41, 20
+; GLOBALNESS0-NEXT:    v_readlane_b32 s73, v41, 21
+; GLOBALNESS0-NEXT:    v_readlane_b32 s74, v41, 22
+; GLOBALNESS0-NEXT:    v_readlane_b32 s75, v41, 23
+; GLOBALNESS0-NEXT:    v_readlane_b32 s76, v41, 24
+; GLOBALNESS0-NEXT:    v_readlane_b32 s77, v41, 25
+; GLOBALNESS0-NEXT:    v_readlane_b32 s78, v41, 26
+; GLOBALNESS0-NEXT:    v_readlane_b32 s79, v41, 27
+; GLOBALNESS0-NEXT:    v_readlane_b32 s80, v41, 28
+; GLOBALNESS0-NEXT:    v_readlane_b32 s81, v41, 29
+; GLOBALNESS0-NEXT:    v_readlane_b32 s82, v41, 30
+; GLOBALNESS0-NEXT:    v_readlane_b32 s83, v41, 31
+; GLOBALNESS0-NEXT:    v_readlane_b32 s84, v41, 32
+; GLOBALNESS0-NEXT:    v_readlane_b32 s85, v41, 33
+; GLOBALNESS0-NEXT:    v_readlane_b32 s86, v41, 34
+; GLOBALNESS0-NEXT:    v_readlane_b32 s87, v41, 35
+; GLOBALNESS0-NEXT:    v_readlane_b32 s88, v41, 36
+; GLOBALNESS0-NEXT:    v_readlane_b32 s89, v41, 37
+; GLOBALNESS0-NEXT:    v_readlane_b32 s90, v41, 38
+; GLOBALNESS0-NEXT:    v_readlane_b32 s91, v41, 39
+; GLOBALNESS0-NEXT:    s_waitcnt lgkmcnt(0)
+; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[42:43]
+; GLOBALNESS0-NEXT:    ; implicit-def: $sgpr4_sgpr5
+; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[58:59]
+; GLOBALNESS0-NEXT:    ; kill: killed $sgpr4_sgpr5
+; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], -1
+; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_10
+; GLOBALNESS0-NEXT:  ; %bb.5: ; %NodeBlock
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT:    s_mov_b64 s[8:9], -1
+; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], 0
+; GLOBALNESS0-NEXT:    s_cmp_lt_i32 s57, 1
+; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], -1
+; GLOBALNESS0-NEXT:    s_cbranch_scc1 .LBB1_7
+; GLOBALNESS0-NEXT:  ; %bb.6: ; %LeafBlock3
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT:    s_cmp_lg_u32 s57, 1
+; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], 0
+; GLOBALNESS0-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GLOBALNESS0-NEXT:  .LBB1_7: ; %Flow17
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
+; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_9
+; GLOBALNESS0-NEXT:  ; %bb.8: ; %LeafBlock
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT:    s_cmp_lg_u32 s57, 0
+; GLOBALNESS0-NEXT:    s_mov_b64 s[8:9], 0
+; GLOBALNESS0-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GLOBALNESS0-NEXT:  .LBB1_9: ; %Flow18
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s8, 22
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s9, 23
+; GLOBALNESS0-NEXT:  .LBB1_10: ; %Flow16
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT:    v_readlane_b32 s60, v41, 8
+; GLOBALNESS0-NEXT:    v_readlane_b32 s61, v41, 9
+; GLOBALNESS0-NEXT:    v_readlane_b32 s68, v41, 16
+; GLOBALNESS0-NEXT:    v_readlane_b32 s69, v41, 17
+; GLOBALNESS0-NEXT:    v_readlane_b32 s70, v41, 18
+; GLOBALNESS0-NEXT:    v_readlane_b32 s71, v41, 19
+; GLOBALNESS0-NEXT:    v_readlane_b32 s72, v41, 20
+; GLOBALNESS0-NEXT:    v_readlane_b32 s73, v41, 21
+; GLOBALNESS0-NEXT:    v_readlane_b32 s74, v41, 22
+; GLOBALNESS0-NEXT:    v_readlane_b32 s75, v41, 23
+; GLOBALNESS0-NEXT:    v_readlane_b32 s76, v41, 24
+; GLOBALNESS0-NEXT:    v_readlane_b32 s77, v41, 25
+; GLOBALNESS0-NEXT:    v_readlane_b32 s78, v41, 26
+; GLOBALNESS0-NEXT:    v_readlane_b32 s79, v41, 27
+; GLOBALNESS0-NEXT:    v_readlane_b32 s80, v41, 28
+; GLOBALNESS0-NEXT:    v_readlane_b32 s81, v41, 29
+; GLOBALNESS0-NEXT:    v_readlane_b32 s82, v41, 30
+; GLOBALNESS0-NEXT:    v_readlane_b32 s83, v41, 31
+; GLOBALNESS0-NEXT:    v_readlane_b32 s84, v41, 32
+; GLOBALNESS0-NEXT:    v_readlane_b32 s85, v41, 33
+; GLOBALNESS0-NEXT:    v_readlane_b32 s86, v41, 34
+; GLOBALNESS0-NEXT:    v_readlane_b32 s87, v41, 35
+; GLOBALNESS0-NEXT:    v_readlane_b32 s88, v41, 36
+; GLOBALNESS0-NEXT:    v_readlane_b32 s89, v41, 37
+; GLOBALNESS0-NEXT:    v_readlane_b32 s90, v41, 38
+; GLOBALNESS0-NEXT:    v_readlane_b32 s91, v41, 39
+; GLOBALNESS0-NEXT:    s_mov_b32 s68, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s69, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s70, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s71, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s72, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s73, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s74, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s75, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s76, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s77, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s78, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s79, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s80, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s81, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s82, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s83, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s84, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s85, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s86, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s87, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s88, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s89, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s90, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s91, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s92, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s93, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s94, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s95, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s96, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s97, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s98, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s99, s61
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[0:1], s[68:69], s[68:69] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], -1
+; GLOBALNESS0-NEXT:    s_and_b64 vcc, exec, s[4:5]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[2:3], s[70:71], s[70:71] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[4:5], s[72:73], s[72:73] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[6:7], s[74:75], s[74:75] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[8:9], s[76:77], s[76:77] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[10:11], s[78:79], s[78:79] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[12:13], s[80:81], s[80:81] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[14:15], s[82:83], s[82:83] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[16:17], s[84:85], s[84:85] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[18:19], s[86:87], s[86:87] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[20:21], s[88:89], s[88:89] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[22:23], s[90:91], s[90:91] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[24:25], s[92:93], s[92:93] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[26:27], s[94:95], s[94:95] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[28:29], s[96:97], s[96:97] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[30:31], s[98:99], s[98:99] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_readlane_b32 s62, v41, 10
+; GLOBALNESS0-NEXT:    v_readlane_b32 s63, v41, 11
+; GLOBALNESS0-NEXT:    v_readlane_b32 s64, v41, 12
+; GLOBALNESS0-NEXT:    v_readlane_b32 s65, v41, 13
+; GLOBALNESS0-NEXT:    v_readlane_b32 s66, v41, 14
+; GLOBALNESS0-NEXT:    v_readlane_b32 s67, v41, 15
+; GLOBALNESS0-NEXT:    s_cbranch_vccz .LBB1_3
+; GLOBALNESS0-NEXT:  ; %bb.11: ; %baz.exit.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
+; GLOBALNESS0-NEXT:    flat_load_dword v0, v[0:1]
+; GLOBALNESS0-NEXT:    v_readlane_b32 s60, v41, 8
+; GLOBALNESS0-NEXT:    v_readlane_b32 s64, v41, 12
+; GLOBALNESS0-NEXT:    v_readlane_b32 s65, v41, 13
+; GLOBALNESS0-NEXT:    v_readlane_b32 s66, v41, 14
+; GLOBALNESS0-NEXT:    v_readlane_b32 s67, v41, 15
+; GLOBALNESS0-NEXT:    v_readlane_b32 s68, v41, 16
+; GLOBALNESS0-NEXT:    v_readlane_b32 s69, v41, 17
+; GLOBALNESS0-NEXT:    v_readlane_b32 s70, v41, 18
+; GLOBALNESS0-NEXT:    v_readlane_b32 s71, v41, 19
+; GLOBALNESS0-NEXT:    v_readlane_b32 s72, v41, 20
+; GLOBALNESS0-NEXT:    v_readlane_b32 s73, v41, 21
+; GLOBALNESS0-NEXT:    v_readlane_b32 s74, v41, 22
+; GLOBALNESS0-NEXT:    v_readlane_b32 s75, v41, 23
+; GLOBALNESS0-NEXT:    v_readlane_b32 s76, v41, 24
+; GLOBALNESS0-NEXT:    v_readlane_b32 s77, v41, 25
+; GLOBALNESS0-NEXT:    v_readlane_b32 s78, v41, 26
+; GLOBALNESS0-NEXT:    v_readlane_b32 s79, v41, 27
+; GLOBALNESS0-NEXT:    v_readlane_b32 s80, v41, 28
+; GLOBALNESS0-NEXT:    v_readlane_b32 s81, v41, 29
+; GLOBALNESS0-NEXT:    v_readlane_b32 s82, v41, 30
+; GLOBALNESS0-NEXT:    v_readlane_b32 s83, v41, 31
+; GLOBALNESS0-NEXT:    v_readlane_b32 s84, v41, 32
+; GLOBALNESS0-NEXT:    v_readlane_b32 s85, v41, 33
+; GLOBALNESS0-NEXT:    v_readlane_b32 s86, v41, 34
+; GLOBALNESS0-NEXT:    v_readlane_b32 s87, v41, 35
+; GLOBALNESS0-NEXT:    v_readlane_b32 s88, v41, 36
+; GLOBALNESS0-NEXT:    v_readlane_b32 s89, v41, 37
+; GLOBALNESS0-NEXT:    v_readlane_b32 s90, v41, 38
+; GLOBALNESS0-NEXT:    v_readlane_b32 s91, v41, 39
+; GLOBALNESS0-NEXT:    v_readlane_b32 s64, v41, 52
+; GLOBALNESS0-NEXT:    v_readlane_b32 s61, v41, 9
+; GLOBALNESS0-NEXT:    v_readlane_b32 s65, v41, 53
+; GLOBALNESS0-NEXT:    v_readlane_b32 s66, v41, 54
+; GLOBALNESS0-NEXT:    v_readlane_b32 s67, v41, 55
+; GLOBALNESS0-NEXT:    v_readlane_b32 s68, v41, 56
+; GLOBALNESS0-NEXT:    v_readlane_b32 s69, v41, 57
+; GLOBALNESS0-NEXT:    v_readlane_b32 s70, v41, 58
+; GLOBALNESS0-NEXT:    v_readlane_b32 s71, v41, 59
+; GLOBALNESS0-NEXT:    v_readlane_b32 s72, v41, 60
+; GLOBALNESS0-NEXT:    v_readlane_b32 s73, v41, 61
+; GLOBALNESS0-NEXT:    v_readlane_b32 s74, v41, 62
+; GLOBALNESS0-NEXT:    v_readlane_b32 s75, v41, 63
+; GLOBALNESS0-NEXT:    v_readlane_b32 s62, v41, 10
+; GLOBALNESS0-NEXT:    v_readlane_b32 s63, v41, 11
+; GLOBALNESS0-NEXT:    v_readlane_b32 s76, v42, 0
+; GLOBALNESS0-NEXT:    v_readlane_b32 s77, v42, 1
+; GLOBALNESS0-NEXT:    v_readlane_b32 s78, v42, 2
+; GLOBALNESS0-NEXT:    v_readlane_b32 s79, v42, 3
+; GLOBALNESS0-NEXT:    v_readlane_b32 s80, v42, 4
+; GLOBALNESS0-NEXT:    v_readlane_b32 s81, v42, 5
+; GLOBALNESS0-NEXT:    s_mov_b32 s64, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s66, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s67, s65
+; GLOBALNESS0-NEXT:    s_mov_b32 s68, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s69, s65
+; GLOBALNESS0-NEXT:    s_mov_b32 s70, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s71, s65
+; GLOBALNESS0-NEXT:    s_mov_b32 s72, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s73, s65
+; GLOBALNESS0-NEXT:    s_mov_b32 s74, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s75, s65
+; GLOBALNESS0-NEXT:    s_mov_b32 s45, s65
+; GLOBALNESS0-NEXT:    v_readlane_b32 s82, v42, 6
+; GLOBALNESS0-NEXT:    v_readlane_b32 s83, v42, 7
+; GLOBALNESS0-NEXT:    v_readlane_b32 s84, v42, 8
+; GLOBALNESS0-NEXT:    v_readlane_b32 s85, v42, 9
+; GLOBALNESS0-NEXT:    v_readlane_b32 s86, v42, 10
+; GLOBALNESS0-NEXT:    v_readlane_b32 s87, v42, 11
+; GLOBALNESS0-NEXT:    v_readlane_b32 s88, v42, 12
+; GLOBALNESS0-NEXT:    v_readlane_b32 s89, v42, 13
+; GLOBALNESS0-NEXT:    v_readlane_b32 s90, v42, 14
+; GLOBALNESS0-NEXT:    v_readlane_b32 s91, v42, 15
+; GLOBALNESS0-NEXT:    v_readlane_b32 s92, v42, 16
+; GLOBALNESS0-NEXT:    v_readlane_b32 s93, v42, 17
+; GLOBALNESS0-NEXT:    v_readlane_b32 s94, v42, 18
+; GLOBALNESS0-NEXT:    v_readlane_b32 s95, v42, 19
+; GLOBALNESS0-NEXT:    s_mov_b32 s76, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s77, s65
+; GLOBALNESS0-NEXT:    s_mov_b32 s78, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s79, s65
+; GLOBALNESS0-NEXT:    s_mov_b32 s80, s61
+; GLOBALNESS0-NEXT:    s_mov_b32 s81, s65
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s44, 52
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s56, 0
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s57, 1
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s58, 2
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s59, 3
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s60, 4
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s61, 5
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s62, 6
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s63, 7
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s64, 8
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s45, 53
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s65, 9
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s46, 54
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s66, 10
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s47, 55
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s67, 11
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s48, 56
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s68, 12
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s49, 57
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s69, 13
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s50, 58
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s70, 14
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s51, 59
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s71, 15
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s52, 60
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s72, 16
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s53, 61
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s73, 17
+; GLOBALNESS0-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GLOBALNESS0-NEXT:    v_cmp_gt_i32_e64 s[96:97], 0, v0
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s54, 62
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s74, 18
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[0:1], s[64:65], s[64:65] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s55, 63
+; GLOBALNESS0-NEXT:    v_writelane_b32 v42, s75, 19
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[2:3], s[66:67], s[66:67] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[4:5], s[68:69], s[68:69] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[6:7], s[70:71], s[70:71] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[8:9], s[72:73], s[72:73] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[10:11], s[74:75], s[74:75] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[12:13], s[76:77], s[76:77] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[14:15], s[78:79], s[78:79] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[16:17], s[80:81], s[80:81] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[18:19], s[82:83], s[82:83] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[20:21], s[84:85], s[84:85] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[22:23], s[86:87], s[86:87] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[24:25], s[88:89], s[88:89] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[26:27], s[90:91], s[90:91] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[28:29], s[92:93], s[92:93] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[30:31], s[94:95], s[94:95] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    s_and_saveexec_b64 s[40:41], s[96:97]
+; GLOBALNESS0-NEXT:    s_cbranch_execz .LBB1_26
+; GLOBALNESS0-NEXT:  ; %bb.12: ; %bb33.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[2:3], 0, 0
+; GLOBALNESS0-NEXT:    global_load_dwordx2 v[0:1], v[2:3], off
+; GLOBALNESS0-NEXT:    v_readlane_b32 s4, v41, 44
+; GLOBALNESS0-NEXT:    v_readlane_b32 s5, v41, 45
+; GLOBALNESS0-NEXT:    s_mov_b64 s[98:99], s[58:59]
+; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
+; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_14
+; GLOBALNESS0-NEXT:  ; %bb.13: ; %bb39.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v45, v44
+; GLOBALNESS0-NEXT:    global_store_dwordx2 v[2:3], v[44:45], off
+; GLOBALNESS0-NEXT:  .LBB1_14: ; %bb44.lr.ph.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v46
+; GLOBALNESS0-NEXT:    v_cndmask_b32_e32 v2, 0, v40, vcc
+; GLOBALNESS0-NEXT:    v_readlane_b32 s60, v41, 40
+; GLOBALNESS0-NEXT:    v_readlane_b32 s62, v41, 42
+; GLOBALNESS0-NEXT:    s_waitcnt vmcnt(0)
+; GLOBALNESS0-NEXT:    v_cmp_nlt_f64_e64 s[56:57], 0, v[0:1]
+; GLOBALNESS0-NEXT:    v_cmp_eq_u32_e64 s[58:59], 0, v2
+; GLOBALNESS0-NEXT:    v_readlane_b32 s61, v41, 41
+; GLOBALNESS0-NEXT:    v_readlane_b32 s63, v41, 43
+; GLOBALNESS0-NEXT:    s_branch .LBB1_17
+; GLOBALNESS0-NEXT:  .LBB1_15: ; %Flow7
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_17 Depth=2
+; GLOBALNESS0-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GLOBALNESS0-NEXT:  .LBB1_16: ; %bb63.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_17 Depth=2
+; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[100:101]
+; GLOBALNESS0-NEXT:    s_cbranch_vccz .LBB1_25
+; GLOBALNESS0-NEXT:  .LBB1_17: ; %bb44.i
+; GLOBALNESS0-NEXT:    ; Parent Loop BB1_4 Depth=1
+; GLOBALNESS0-NEXT:    ; => This Inner Loop Header: Depth=2
+; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[36:37]
+; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_16
+; GLOBALNESS0-NEXT:  ; %bb.18: ; %bb46.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_17 Depth=2
+; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[34:35]
+; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_16
+; GLOBALNESS0-NEXT:  ; %bb.19: ; %bb50.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_17 Depth=2
+; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[60:61]
+; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_22
+; GLOBALNESS0-NEXT:  ; %bb.20: ; %bb3.i.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_17 Depth=2
+; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[62:63]
+; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_22
+; GLOBALNESS0-NEXT:  ; %bb.21: ; %bb6.i.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_17 Depth=2
+; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[56:57]
+; GLOBALNESS0-NEXT:  .LBB1_22: ; %spam.exit.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_17 Depth=2
+; GLOBALNESS0-NEXT:    v_readlane_b32 s4, v41, 46
+; GLOBALNESS0-NEXT:    v_readlane_b32 s5, v41, 47
+; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
+; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_16
+; GLOBALNESS0-NEXT:  ; %bb.23: ; %bb55.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_17 Depth=2
+; GLOBALNESS0-NEXT:    s_add_u32 s64, s38, 40
+; GLOBALNESS0-NEXT:    v_readlane_b32 s46, v41, 6
+; GLOBALNESS0-NEXT:    v_readlane_b32 s48, v41, 4
+; GLOBALNESS0-NEXT:    v_readlane_b32 s44, v41, 2
+; GLOBALNESS0-NEXT:    s_addc_u32 s65, s39, 0
+; GLOBALNESS0-NEXT:    v_readlane_b32 s47, v41, 7
+; GLOBALNESS0-NEXT:    v_readlane_b32 s49, v41, 5
+; GLOBALNESS0-NEXT:    v_readlane_b32 s45, v41, 3
+; GLOBALNESS0-NEXT:    v_readlane_b32 s50, v41, 1
+; GLOBALNESS0-NEXT:    v_readlane_b32 s51, v41, 0
+; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[46:47]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], s[48:49]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[8:9], s[64:65]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[10:11], s[44:45]
+; GLOBALNESS0-NEXT:    s_mov_b32 s12, s33
+; GLOBALNESS0-NEXT:    s_mov_b32 s13, s50
+; GLOBALNESS0-NEXT:    s_mov_b32 s14, s51
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v43
+; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[42:43]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
+; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[46:47]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[6:7], s[48:49]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[8:9], s[64:65]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[10:11], s[44:45]
+; GLOBALNESS0-NEXT:    s_mov_b32 s12, s33
+; GLOBALNESS0-NEXT:    s_mov_b32 s13, s50
+; GLOBALNESS0-NEXT:    s_mov_b32 s14, s51
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v43
+; GLOBALNESS0-NEXT:    global_store_dwordx2 v[0:1], a[32:33], off
+; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[42:43]
+; GLOBALNESS0-NEXT:    s_and_saveexec_b64 s[4:5], s[58:59]
+; GLOBALNESS0-NEXT:    s_cbranch_execz .LBB1_15
+; GLOBALNESS0-NEXT:  ; %bb.24: ; %bb62.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_17 Depth=2
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v45, v44
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
+; GLOBALNESS0-NEXT:    global_store_dwordx2 v[0:1], v[44:45], off
+; GLOBALNESS0-NEXT:    s_branch .LBB1_15
+; GLOBALNESS0-NEXT:  .LBB1_25: ; %Flow14
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT:    v_readlane_b32 s56, v41, 8
+; GLOBALNESS0-NEXT:    v_readlane_b32 s57, v41, 9
+; GLOBALNESS0-NEXT:    v_readlane_b32 s58, v41, 10
+; GLOBALNESS0-NEXT:    v_readlane_b32 s59, v41, 11
+; GLOBALNESS0-NEXT:    s_mov_b64 s[48:49], s[56:57]
+; GLOBALNESS0-NEXT:    v_readlane_b32 s60, v41, 12
+; GLOBALNESS0-NEXT:    v_readlane_b32 s61, v41, 13
+; GLOBALNESS0-NEXT:    v_readlane_b32 s62, v41, 14
+; GLOBALNESS0-NEXT:    v_readlane_b32 s63, v41, 15
+; GLOBALNESS0-NEXT:    v_readlane_b32 s64, v41, 16
+; GLOBALNESS0-NEXT:    v_readlane_b32 s65, v41, 17
+; GLOBALNESS0-NEXT:    s_mov_b32 s56, s49
+; GLOBALNESS0-NEXT:    s_mov_b32 s57, s49
+; GLOBALNESS0-NEXT:    s_mov_b32 s58, s49
+; GLOBALNESS0-NEXT:    s_mov_b32 s59, s49
+; GLOBALNESS0-NEXT:    v_readlane_b32 s66, v41, 18
+; GLOBALNESS0-NEXT:    v_readlane_b32 s67, v41, 19
+; GLOBALNESS0-NEXT:    s_mov_b32 s60, s49
+; GLOBALNESS0-NEXT:    s_mov_b32 s61, s49
+; GLOBALNESS0-NEXT:    s_mov_b32 s62, s49
+; GLOBALNESS0-NEXT:    s_mov_b32 s63, s49
+; GLOBALNESS0-NEXT:    s_mov_b32 s64, s49
+; GLOBALNESS0-NEXT:    s_mov_b32 s65, s49
+; GLOBALNESS0-NEXT:    s_mov_b64 s[52:53], s[56:57]
+; GLOBALNESS0-NEXT:    s_mov_b32 s66, s49
+; GLOBALNESS0-NEXT:    s_mov_b32 s67, s49
+; GLOBALNESS0-NEXT:    s_mov_b64 s[54:55], s[58:59]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[56:57], s[60:61]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[58:59], s[62:63]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[60:61], s[64:65]
+; GLOBALNESS0-NEXT:    v_readlane_b32 s68, v41, 20
+; GLOBALNESS0-NEXT:    v_readlane_b32 s69, v41, 21
+; GLOBALNESS0-NEXT:    v_readlane_b32 s70, v41, 22
+; GLOBALNESS0-NEXT:    v_readlane_b32 s71, v41, 23
+; GLOBALNESS0-NEXT:    v_readlane_b32 s72, v41, 24
+; GLOBALNESS0-NEXT:    v_readlane_b32 s73, v41, 25
+; GLOBALNESS0-NEXT:    v_readlane_b32 s74, v41, 26
+; GLOBALNESS0-NEXT:    v_readlane_b32 s75, v41, 27
+; GLOBALNESS0-NEXT:    v_readlane_b32 s76, v41, 28
+; GLOBALNESS0-NEXT:    v_readlane_b32 s77, v41, 29
+; GLOBALNESS0-NEXT:    v_readlane_b32 s78, v41, 30
+; GLOBALNESS0-NEXT:    v_readlane_b32 s79, v41, 31
+; GLOBALNESS0-NEXT:    s_mov_b64 s[62:63], s[66:67]
+; GLOBALNESS0-NEXT:    v_readlane_b32 s80, v41, 32
+; GLOBALNESS0-NEXT:    v_readlane_b32 s81, v41, 33
+; GLOBALNESS0-NEXT:    v_readlane_b32 s82, v41, 34
+; GLOBALNESS0-NEXT:    v_readlane_b32 s83, v41, 35
+; GLOBALNESS0-NEXT:    v_readlane_b32 s84, v41, 36
+; GLOBALNESS0-NEXT:    v_readlane_b32 s85, v41, 37
+; GLOBALNESS0-NEXT:    v_readlane_b32 s86, v41, 38
+; GLOBALNESS0-NEXT:    v_readlane_b32 s87, v41, 39
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s48, 8
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s49, 9
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s50, 10
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s51, 11
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s52, 12
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s53, 13
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s54, 14
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s55, 15
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s56, 16
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s57, 17
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s58, 18
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s59, 19
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s60, 20
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s61, 21
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s62, 22
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s63, 23
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s64, 24
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s65, 25
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s66, 26
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s67, 27
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s68, 28
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s69, 29
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s70, 30
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s71, 31
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s72, 32
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s73, 33
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s74, 34
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s75, 35
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s76, 36
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s77, 37
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s78, 38
+; GLOBALNESS0-NEXT:    v_writelane_b32 v41, s79, 39
+; GLOBALNESS0-NEXT:    v_readlane_b32 s64, v41, 8
+; GLOBALNESS0-NEXT:    v_readlane_b32 s65, v41, 9
+; GLOBALNESS0-NEXT:    s_mov_b64 s[48:49], s[64:65]
+; GLOBALNESS0-NEXT:    s_mov_b32 s64, s49
+; GLOBALNESS0-NEXT:    s_mov_b64 s[48:49], s[52:53]
+; GLOBALNESS0-NEXT:    v_readlane_b32 s66, v41, 10
+; GLOBALNESS0-NEXT:    v_readlane_b32 s67, v41, 11
+; GLOBALNESS0-NEXT:    v_readlane_b32 s68, v41, 12
+; GLOBALNESS0-NEXT:    v_readlane_b32 s69, v41, 13
+; GLOBALNESS0-NEXT:    v_readlane_b32 s70, v41, 14
+; GLOBALNESS0-NEXT:    v_readlane_b32 s71, v41, 15
+; GLOBALNESS0-NEXT:    v_readlane_b32 s72, v41, 16
+; GLOBALNESS0-NEXT:    v_readlane_b32 s73, v41, 17
+; GLOBALNESS0-NEXT:    v_readlane_b32 s74, v41, 18
+; GLOBALNESS0-NEXT:    v_readlane_b32 s75, v41, 19
+; GLOBALNESS0-NEXT:    v_readlane_b32 s76, v41, 20
+; GLOBALNESS0-NEXT:    v_readlane_b32 s77, v41, 21
+; GLOBALNESS0-NEXT:    v_readlane_b32 s78, v41, 22
+; GLOBALNESS0-NEXT:    v_readlane_b32 s79, v41, 23
+; GLOBALNESS0-NEXT:    v_readlane_b32 s80, v41, 24
+; GLOBALNESS0-NEXT:    v_readlane_b32 s81, v41, 25
+; GLOBALNESS0-NEXT:    v_readlane_b32 s82, v41, 26
+; GLOBALNESS0-NEXT:    v_readlane_b32 s83, v41, 27
+; GLOBALNESS0-NEXT:    v_readlane_b32 s84, v41, 28
+; GLOBALNESS0-NEXT:    v_readlane_b32 s85, v41, 29
+; GLOBALNESS0-NEXT:    v_readlane_b32 s86, v41, 30
+; GLOBALNESS0-NEXT:    v_readlane_b32 s87, v41, 31
+; GLOBALNESS0-NEXT:    v_readlane_b32 s88, v41, 32
+; GLOBALNESS0-NEXT:    v_readlane_b32 s89, v41, 33
+; GLOBALNESS0-NEXT:    v_readlane_b32 s90, v41, 34
+; GLOBALNESS0-NEXT:    v_readlane_b32 s91, v41, 35
+; GLOBALNESS0-NEXT:    v_readlane_b32 s92, v41, 36
+; GLOBALNESS0-NEXT:    v_readlane_b32 s93, v41, 37
+; GLOBALNESS0-NEXT:    v_readlane_b32 s94, v41, 38
+; GLOBALNESS0-NEXT:    v_readlane_b32 s95, v41, 39
+; GLOBALNESS0-NEXT:    s_mov_b64 s[50:51], s[54:55]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[52:53], s[56:57]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[54:55], s[58:59]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[56:57], s[60:61]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[58:59], s[62:63]
+; GLOBALNESS0-NEXT:    s_mov_b32 s60, s64
+; GLOBALNESS0-NEXT:    v_readlane_b32 s64, v41, 8
+; GLOBALNESS0-NEXT:    v_readlane_b32 s65, v41, 9
+; GLOBALNESS0-NEXT:    v_readlane_b32 s66, v41, 10
+; GLOBALNESS0-NEXT:    v_readlane_b32 s67, v41, 11
+; GLOBALNESS0-NEXT:    v_readlane_b32 s68, v41, 12
+; GLOBALNESS0-NEXT:    v_readlane_b32 s69, v41, 13
+; GLOBALNESS0-NEXT:    v_readlane_b32 s70, v41, 14
+; GLOBALNESS0-NEXT:    v_readlane_b32 s71, v41, 15
+; GLOBALNESS0-NEXT:    v_readlane_b32 s72, v41, 16
+; GLOBALNESS0-NEXT:    v_readlane_b32 s73, v41, 17
+; GLOBALNESS0-NEXT:    v_readlane_b32 s74, v41, 18
+; GLOBALNESS0-NEXT:    v_readlane_b32 s75, v41, 19
+; GLOBALNESS0-NEXT:    v_readlane_b32 s76, v41, 20
+; GLOBALNESS0-NEXT:    v_readlane_b32 s77, v41, 21
+; GLOBALNESS0-NEXT:    v_readlane_b32 s78, v41, 22
+; GLOBALNESS0-NEXT:    v_readlane_b32 s79, v41, 23
+; GLOBALNESS0-NEXT:    v_readlane_b32 s80, v41, 24
+; GLOBALNESS0-NEXT:    v_readlane_b32 s81, v41, 25
+; GLOBALNESS0-NEXT:    v_readlane_b32 s82, v41, 26
+; GLOBALNESS0-NEXT:    v_readlane_b32 s83, v41, 27
+; GLOBALNESS0-NEXT:    v_readlane_b32 s84, v41, 28
+; GLOBALNESS0-NEXT:    v_readlane_b32 s85, v41, 29
+; GLOBALNESS0-NEXT:    v_readlane_b32 s86, v41, 30
+; GLOBALNESS0-NEXT:    v_readlane_b32 s87, v41, 31
+; GLOBALNESS0-NEXT:    v_readlane_b32 s88, v41, 32
+; GLOBALNESS0-NEXT:    v_readlane_b32 s89, v41, 33
+; GLOBALNESS0-NEXT:    v_readlane_b32 s90, v41, 34
+; GLOBALNESS0-NEXT:    v_readlane_b32 s91, v41, 35
+; GLOBALNESS0-NEXT:    v_readlane_b32 s92, v41, 36
+; GLOBALNESS0-NEXT:    v_readlane_b32 s93, v41, 37
+; GLOBALNESS0-NEXT:    v_readlane_b32 s94, v41, 38
+; GLOBALNESS0-NEXT:    v_readlane_b32 s95, v41, 39
+; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], s[36:37]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[36:37], s[64:65]
+; GLOBALNESS0-NEXT:    v_readlane_b32 s64, v41, 8
+; GLOBALNESS0-NEXT:    v_readlane_b32 s65, v41, 9
+; GLOBALNESS0-NEXT:    s_mov_b32 s61, s37
+; GLOBALNESS0-NEXT:    s_mov_b64 s[36:37], s[64:65]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[44:45], s[48:49]
+; GLOBALNESS0-NEXT:    v_readlane_b32 s66, v41, 10
+; GLOBALNESS0-NEXT:    v_readlane_b32 s67, v41, 11
+; GLOBALNESS0-NEXT:    v_readlane_b32 s68, v41, 12
+; GLOBALNESS0-NEXT:    v_readlane_b32 s69, v41, 13
+; GLOBALNESS0-NEXT:    v_readlane_b32 s70, v41, 14
+; GLOBALNESS0-NEXT:    v_readlane_b32 s71, v41, 15
+; GLOBALNESS0-NEXT:    v_readlane_b32 s72, v41, 16
+; GLOBALNESS0-NEXT:    v_readlane_b32 s73, v41, 17
+; GLOBALNESS0-NEXT:    v_readlane_b32 s74, v41, 18
+; GLOBALNESS0-NEXT:    v_readlane_b32 s75, v41, 19
+; GLOBALNESS0-NEXT:    v_readlane_b32 s76, v41, 20
+; GLOBALNESS0-NEXT:    v_readlane_b32 s77, v41, 21
+; GLOBALNESS0-NEXT:    v_readlane_b32 s78, v41, 22
+; GLOBALNESS0-NEXT:    v_readlane_b32 s79, v41, 23
+; GLOBALNESS0-NEXT:    v_readlane_b32 s80, v41, 24
+; GLOBALNESS0-NEXT:    v_readlane_b32 s81, v41, 25
+; GLOBALNESS0-NEXT:    v_readlane_b32 s82, v41, 26
+; GLOBALNESS0-NEXT:    v_readlane_b32 s83, v41, 27
+; GLOBALNESS0-NEXT:    v_readlane_b32 s84, v41, 28
+; GLOBALNESS0-NEXT:    v_readlane_b32 s85, v41, 29
+; GLOBALNESS0-NEXT:    v_readlane_b32 s86, v41, 30
+; GLOBALNESS0-NEXT:    v_readlane_b32 s87, v41, 31
+; GLOBALNESS0-NEXT:    v_readlane_b32 s88, v41, 32
+; GLOBALNESS0-NEXT:    v_readlane_b32 s89, v41, 33
+; GLOBALNESS0-NEXT:    v_readlane_b32 s90, v41, 34
+; GLOBALNESS0-NEXT:    v_readlane_b32 s91, v41, 35
+; GLOBALNESS0-NEXT:    s_mov_b32 s62, s37
+; GLOBALNESS0-NEXT:    s_mov_b64 s[46:47], s[50:51]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[48:49], s[52:53]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[50:51], s[54:55]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[52:53], s[56:57]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[54:55], s[58:59]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[56:57], s[60:61]
+; GLOBALNESS0-NEXT:    s_mov_b32 s58, s62
+; GLOBALNESS0-NEXT:    v_readlane_b32 s60, v41, 8
+; GLOBALNESS0-NEXT:    v_readlane_b32 s61, v41, 9
+; GLOBALNESS0-NEXT:    s_mov_b64 s[36:37], s[60:61]
+; GLOBALNESS0-NEXT:    v_readlane_b32 s92, v41, 36
+; GLOBALNESS0-NEXT:    v_readlane_b32 s93, v41, 37
+; GLOBALNESS0-NEXT:    v_readlane_b32 s94, v41, 38
+; GLOBALNESS0-NEXT:    v_readlane_b32 s95, v41, 39
+; GLOBALNESS0-NEXT:    v_readlane_b32 s62, v41, 10
+; GLOBALNESS0-NEXT:    v_readlane_b32 s63, v41, 11
+; GLOBALNESS0-NEXT:    v_readlane_b32 s64, v41, 12
+; GLOBALNESS0-NEXT:    v_readlane_b32 s65, v41, 13
+; GLOBALNESS0-NEXT:    v_readlane_b32 s66, v41, 14
+; GLOBALNESS0-NEXT:    v_readlane_b32 s67, v41, 15
+; GLOBALNESS0-NEXT:    v_readlane_b32 s68, v41, 16
+; GLOBALNESS0-NEXT:    v_readlane_b32 s69, v41, 17
+; GLOBALNESS0-NEXT:    v_readlane_b32 s70, v41, 18
+; GLOBALNESS0-NEXT:    v_readlane_b32 s71, v41, 19
+; GLOBALNESS0-NEXT:    v_readlane_b32 s72, v41, 20
+; GLOBALNESS0-NEXT:    v_readlane_b32 s73, v41, 21
+; GLOBALNESS0-NEXT:    v_readlane_b32 s74, v41, 22
+; GLOBALNESS0-NEXT:    v_readlane_b32 s75, v41, 23
+; GLOBALNESS0-NEXT:    v_readlane_b32 s76, v41, 24
+; GLOBALNESS0-NEXT:    v_readlane_b32 s77, v41, 25
+; GLOBALNESS0-NEXT:    v_readlane_b32 s78, v41, 26
+; GLOBALNESS0-NEXT:    v_readlane_b32 s79, v41, 27
+; GLOBALNESS0-NEXT:    v_readlane_b32 s80, v41, 28
+; GLOBALNESS0-NEXT:    v_readlane_b32 s81, v41, 29
+; GLOBALNESS0-NEXT:    v_readlane_b32 s82, v41, 30
+; GLOBALNESS0-NEXT:    v_readlane_b32 s83, v41, 31
+; GLOBALNESS0-NEXT:    v_readlane_b32 s84, v41, 32
+; GLOBALNESS0-NEXT:    v_readlane_b32 s85, v41, 33
+; GLOBALNESS0-NEXT:    v_readlane_b32 s86, v41, 34
+; GLOBALNESS0-NEXT:    v_readlane_b32 s87, v41, 35
+; GLOBALNESS0-NEXT:    v_readlane_b32 s88, v41, 36
+; GLOBALNESS0-NEXT:    v_readlane_b32 s89, v41, 37
+; GLOBALNESS0-NEXT:    v_readlane_b32 s90, v41, 38
+; GLOBALNESS0-NEXT:    v_readlane_b32 s91, v41, 39
+; GLOBALNESS0-NEXT:    s_mov_b32 s59, s37
+; GLOBALNESS0-NEXT:    s_mov_b64 s[62:63], s[58:59]
+; GLOBALNESS0-NEXT:    v_readlane_b32 s64, v41, 8
+; GLOBALNESS0-NEXT:    s_mov_b64 s[60:61], s[56:57]
+; GLOBALNESS0-NEXT:    v_readlane_b32 s65, v41, 9
+; GLOBALNESS0-NEXT:    v_readlane_b32 s66, v41, 10
+; GLOBALNESS0-NEXT:    v_readlane_b32 s67, v41, 11
+; GLOBALNESS0-NEXT:    v_readlane_b32 s68, v41, 12
+; GLOBALNESS0-NEXT:    v_readlane_b32 s69, v41, 13
+; GLOBALNESS0-NEXT:    v_readlane_b32 s70, v41, 14
+; GLOBALNESS0-NEXT:    v_readlane_b32 s71, v41, 15
+; GLOBALNESS0-NEXT:    v_readlane_b32 s72, v41, 16
+; GLOBALNESS0-NEXT:    v_readlane_b32 s73, v41, 17
+; GLOBALNESS0-NEXT:    v_readlane_b32 s74, v41, 18
+; GLOBALNESS0-NEXT:    v_readlane_b32 s75, v41, 19
+; GLOBALNESS0-NEXT:    s_mov_b64 s[58:59], s[54:55]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[56:57], s[52:53]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[54:55], s[50:51]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[52:53], s[48:49]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[50:51], s[46:47]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[48:49], s[44:45]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[36:37], s[64:65]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[74:75], s[62:63]
+; GLOBALNESS0-NEXT:    v_readlane_b32 s76, v41, 20
+; GLOBALNESS0-NEXT:    s_mov_b64 s[72:73], s[60:61]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[70:71], s[58:59]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[68:69], s[56:57]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[66:67], s[54:55]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[64:65], s[52:53]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[62:63], s[50:51]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[60:61], s[48:49]
+; GLOBALNESS0-NEXT:    v_readlane_b32 s77, v41, 21
+; GLOBALNESS0-NEXT:    v_readlane_b32 s78, v41, 22
+; GLOBALNESS0-NEXT:    v_readlane_b32 s79, v41, 23
+; GLOBALNESS0-NEXT:    v_readlane_b32 s80, v41, 24
+; GLOBALNESS0-NEXT:    v_readlane_b32 s81, v41, 25
+; GLOBALNESS0-NEXT:    v_readlane_b32 s82, v41, 26
+; GLOBALNESS0-NEXT:    v_readlane_b32 s83, v41, 27
+; GLOBALNESS0-NEXT:    v_readlane_b32 s84, v41, 28
+; GLOBALNESS0-NEXT:    v_readlane_b32 s85, v41, 29
+; GLOBALNESS0-NEXT:    v_readlane_b32 s86, v41, 30
+; GLOBALNESS0-NEXT:    v_readlane_b32 s87, v41, 31
+; GLOBALNESS0-NEXT:    v_readlane_b32 s88, v41, 32
+; GLOBALNESS0-NEXT:    v_readlane_b32 s89, v41, 33
+; GLOBALNESS0-NEXT:    v_readlane_b32 s90, v41, 34
+; GLOBALNESS0-NEXT:    v_readlane_b32 s91, v41, 35
+; GLOBALNESS0-NEXT:    v_readlane_b32 s92, v41, 36
+; GLOBALNESS0-NEXT:    v_readlane_b32 s93, v41, 37
+; GLOBALNESS0-NEXT:    v_readlane_b32 s94, v41, 38
+; GLOBALNESS0-NEXT:    v_readlane_b32 s95, v41, 39
+; GLOBALNESS0-NEXT:    s_mov_b32 s76, s37
+; GLOBALNESS0-NEXT:    s_mov_b64 s[44:45], s[60:61]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[46:47], s[62:63]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[48:49], s[64:65]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[50:51], s[66:67]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[52:53], s[68:69]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[54:55], s[70:71]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[56:57], s[72:73]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[58:59], s[74:75]
+; GLOBALNESS0-NEXT:    s_mov_b32 s60, s76
+; GLOBALNESS0-NEXT:    v_readlane_b32 s64, v41, 8
+; GLOBALNESS0-NEXT:    v_readlane_b32 s65, v41, 9
+; GLOBALNESS0-NEXT:    v_readlane_b32 s66, v41, 10
+; GLOBALNESS0-NEXT:    v_readlane_b32 s67, v41, 11
+; GLOBALNESS0-NEXT:    v_readlane_b32 s68, v41, 12
+; GLOBALNESS0-NEXT:    v_readlane_b32 s69, v41, 13
+; GLOBALNESS0-NEXT:    v_readlane_b32 s70, v41, 14
+; GLOBALNESS0-NEXT:    v_readlane_b32 s71, v41, 15
+; GLOBALNESS0-NEXT:    v_readlane_b32 s72, v41, 16
+; GLOBALNESS0-NEXT:    v_readlane_b32 s73, v41, 17
+; GLOBALNESS0-NEXT:    v_readlane_b32 s74, v41, 18
+; GLOBALNESS0-NEXT:    v_readlane_b32 s75, v41, 19
+; GLOBALNESS0-NEXT:    s_mov_b32 s61, s65
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[0:1], s[44:45], s[44:45] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[2:3], s[46:47], s[46:47] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[4:5], s[48:49], s[48:49] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[6:7], s[50:51], s[50:51] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[8:9], s[52:53], s[52:53] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[10:11], s[54:55], s[54:55] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[12:13], s[56:57], s[56:57] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[14:15], s[58:59], s[58:59] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[16:17], s[60:61], s[60:61] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[18:19], s[62:63], s[62:63] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[20:21], s[64:65], s[64:65] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[22:23], s[66:67], s[66:67] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[24:25], s[68:69], s[68:69] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[26:27], s[70:71], s[70:71] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[28:29], s[72:73], s[72:73] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[30:31], s[74:75], s[74:75] op_sel:[0,1]
+; GLOBALNESS0-NEXT:    v_readlane_b32 s56, v42, 20
+; GLOBALNESS0-NEXT:    s_mov_b64 s[36:37], s[4:5]
+; GLOBALNESS0-NEXT:    v_readlane_b32 s57, v42, 21
+; GLOBALNESS0-NEXT:    s_mov_b64 s[58:59], s[98:99]
+; GLOBALNESS0-NEXT:    v_readlane_b32 s76, v41, 20
+; GLOBALNESS0-NEXT:    v_readlane_b32 s77, v41, 21
+; GLOBALNESS0-NEXT:    v_readlane_b32 s78, v41, 22
+; GLOBALNESS0-NEXT:    v_readlane_b32 s79, v41, 23
+; GLOBALNESS0-NEXT:    v_readlane_b32 s80, v41, 24
+; GLOBALNESS0-NEXT:    v_readlane_b32 s81, v41, 25
+; GLOBALNESS0-NEXT:    v_readlane_b32 s82, v41, 26
+; GLOBALNESS0-NEXT:    v_readlane_b32 s83, v41, 27
+; GLOBALNESS0-NEXT:    v_readlane_b32 s84, v41, 28
+; GLOBALNESS0-NEXT:    v_readlane_b32 s85, v41, 29
+; GLOBALNESS0-NEXT:    v_readlane_b32 s86, v41, 30
+; GLOBALNESS0-NEXT:    v_readlane_b32 s87, v41, 31
+; GLOBALNESS0-NEXT:    v_readlane_b32 s88, v41, 32
+; GLOBALNESS0-NEXT:    v_readlane_b32 s89, v41, 33
+; GLOBALNESS0-NEXT:    v_readlane_b32 s90, v41, 34
+; GLOBALNESS0-NEXT:    v_readlane_b32 s91, v41, 35
+; GLOBALNESS0-NEXT:    v_readlane_b32 s92, v41, 36
+; GLOBALNESS0-NEXT:    v_readlane_b32 s93, v41, 37
+; GLOBALNESS0-NEXT:    v_readlane_b32 s94, v41, 38
+; GLOBALNESS0-NEXT:    v_readlane_b32 s95, v41, 39
+; GLOBALNESS0-NEXT:  .LBB1_26: ; %Flow15
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT:    s_or_b64 exec, exec, s[40:41]
+; GLOBALNESS0-NEXT:    s_and_saveexec_b64 s[4:5], s[96:97]
+; GLOBALNESS0-NEXT:    s_cbranch_execz .LBB1_2
+; GLOBALNESS0-NEXT:  ; %bb.27: ; %bb67.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT:    v_readlane_b32 s6, v41, 48
+; GLOBALNESS0-NEXT:    v_readlane_b32 s7, v41, 49
+; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
+; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_1
+; GLOBALNESS0-NEXT:  ; %bb.28: ; %bb69.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v45, v44
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[32:33], 0, 0
+; GLOBALNESS0-NEXT:    global_store_dwordx2 v[32:33], v[44:45], off
+; GLOBALNESS0-NEXT:    s_branch .LBB1_1
+; GLOBALNESS0-NEXT:  .LBB1_29: ; %bb73.i
+; GLOBALNESS0-NEXT:    ; in Loop: Header=BB1_4 Depth=1
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v45, v44
+; GLOBALNESS0-NEXT:    v_pk_mov_b32 v[32:33], 0, 0
+; GLOBALNESS0-NEXT:    global_store_dwordx2 v[32:33], v[44:45], off
+; GLOBALNESS0-NEXT:    s_branch .LBB1_2
+; GLOBALNESS0-NEXT:  .LBB1_30: ; %loop.exit.guard
+; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
+; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], -1
+; GLOBALNESS0-NEXT:    s_cbranch_vccz .LBB1_32
+; GLOBALNESS0-NEXT:  ; %bb.31: ; %bb7.i.i
+; GLOBALNESS0-NEXT:    s_add_u32 s8, s38, 40
+; GLOBALNESS0-NEXT:    v_readlane_b32 s4, v41, 6
+; GLOBALNESS0-NEXT:    v_readlane_b32 s6, v41, 4
+; GLOBALNESS0-NEXT:    v_readlane_b32 s10, v41, 2
+; GLOBALNESS0-NEXT:    s_addc_u32 s9, s39, 0
+; GLOBALNESS0-NEXT:    v_readlane_b32 s5, v41, 7
+; GLOBALNESS0-NEXT:    v_readlane_b32 s7, v41, 5
+; GLOBALNESS0-NEXT:    v_readlane_b32 s11, v41, 3
+; GLOBALNESS0-NEXT:    s_mov_b32 s12, s33
+; GLOBALNESS0-NEXT:    v_readlane_b32 s13, v41, 1
+; GLOBALNESS0-NEXT:    v_readlane_b32 s14, v41, 0
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v43
+; GLOBALNESS0-NEXT:    s_getpc_b64 s[16:17]
+; GLOBALNESS0-NEXT:    s_add_u32 s16, s16, widget@rel32@lo+4
+; GLOBALNESS0-NEXT:    s_addc_u32 s17, s17, widget@rel32@hi+12
+; GLOBALNESS0-NEXT:    s_mov_b32 s34, s33
+; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GLOBALNESS0-NEXT:    s_mov_b32 s33, s34
+; GLOBALNESS0-NEXT:    s_mov_b64 s[4:5], 0
+; GLOBALNESS0-NEXT:  .LBB1_32: ; %Flow
+; GLOBALNESS0-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
+; GLOBALNESS0-NEXT:    s_cbranch_vccnz .LBB1_34
+; GLOBALNESS0-NEXT:  ; %bb.33: ; %bb11.i.i
+; GLOBALNESS0-NEXT:    s_add_u32 s8, s38, 40
+; GLOBALNESS0-NEXT:    v_readlane_b32 s4, v41, 6
+; GLOBALNESS0-NEXT:    v_readlane_b32 s6, v41, 4
+; GLOBALNESS0-NEXT:    v_readlane_b32 s10, v41, 2
+; GLOBALNESS0-NEXT:    s_addc_u32 s9, s39, 0
+; GLOBALNESS0-NEXT:    v_readlane_b32 s5, v41, 7
+; GLOBALNESS0-NEXT:    v_readlane_b32 s7, v41, 5
+; GLOBALNESS0-NEXT:    v_readlane_b32 s11, v41, 3
+; GLOBALNESS0-NEXT:    s_mov_b32 s12, s33
+; GLOBALNESS0-NEXT:    v_readlane_b32 s13, v41, 1
+; GLOBALNESS0-NEXT:    v_readlane_b32 s14, v41, 0
+; GLOBALNESS0-NEXT:    v_mov_b32_e32 v31, v43
+; GLOBALNESS0-NEXT:    s_getpc_b64 s[16:17]
+; GLOBALNESS0-NEXT:    s_add_u32 s16, s16, widget@rel32@lo+4
+; GLOBALNESS0-NEXT:    s_addc_u32 s17, s17, widget@rel32@hi+12
+; GLOBALNESS0-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GLOBALNESS0-NEXT:  .LBB1_34: ; %UnifiedUnreachableBlock
+bb:
+  store i32 0, i32 addrspace(1)* null, align 4
+  %tmp4 = load i32, i32 addrspace(1)* %arg1.global, align 4
+  br label %bb5
+
+bb5:                                              ; preds = %bb5.backedge, %bb
+  %tmp4.i.sroa.0.0 = phi <9 x double> [ undef, %bb ], [ %tmp4.i.sroa.0.1, %bb5.backedge ]
+  %tmp14.1.i = load i32, i32* inttoptr (i64 128 to i32*), align 128
+  store i32 0, i32 addrspace(5)* null, align 4
+  %tmp14.2.i = load i32, i32* inttoptr (i64 128 to i32*), align 128
+  %tmp15.2.i = icmp eq i32 %tmp14.2.i, 0
+  %spec.select.2.i = select i1 %tmp15.2.i, i32 0, i32 %tmp14.1.i
+  tail call void @wobble()
+  br i1 %tmp3.i.i, label %bb4.i.i, label %baz.exit.i
+
+bb4.i.i:                                          ; preds = %bb5
+  switch i32 %tmp5.i.i, label %baz.exit.i [
+    i32 0, label %bb7.i.i
+    i32 1, label %bb11.i.i
+  ]
+
+bb7.i.i:                                          ; preds = %bb4.i.i
+  tail call fastcc void @widget()
+  unreachable
+
+bb11.i.i:                                         ; preds = %bb4.i.i
+  tail call fastcc void @widget()
+  unreachable
+
+baz.exit.i:                                       ; preds = %bb4.i.i, %bb5
+  %tmp26.i = load i32, i32* null, align 4
+  %tmp27.i4 = load double, double addrspace(1)* null, align 8
+  %tmp31.i = icmp slt i32 %tmp26.i, 0
+  br i1 %tmp31.i, label %bb33.i, label %bb64.i
+
+bb33.i:                                           ; preds = %baz.exit.i
+  %tmp38.i = icmp slt i32 %tmp4, 0
+  br i1 %tmp38.i, label %bb39.i, label %bb44.lr.ph.i
+
+bb39.i:                                           ; preds = %bb33.i
+  store double 0.000000e+00, double addrspace(1)* null, align 8
+  br label %bb44.lr.ph.i
+
+bb44.lr.ph.i:                                     ; preds = %bb39.i, %bb33.i
+  br label %bb44.i
+
+bb44.i:                                           ; preds = %bb63.i, %bb44.lr.ph.i
+  br i1 %tmp3.i.i, label %bb63.i, label %bb46.i
+
+bb46.i:                                           ; preds = %bb44.i
+  br i1 %tmp438.i, label %bb63.i, label %bb50.i
+
+bb50.i:                                           ; preds = %bb46.i
+  switch i32 0, label %spam.exit.i [
+    i32 0, label %bb1.i.i
+  ]
+
+bb1.i.i:                                          ; preds = %bb50.i
+  %tmp2.i.i = fcmp ogt double %tmp27.i, 1.617000e+03
+  br i1 %tmp2.i.i, label %spam.exit.i, label %bb3.i.i
+
+bb3.i.i:                                          ; preds = %bb1.i.i
+  %tmp4.i.i = fcmp ogt double %tmp27.i, 0.000000e+00
+  br i1 %tmp4.i.i, label %spam.exit.i, label %bb6.i.i
+
+bb6.i.i:                                          ; preds = %bb3.i.i
+  %tmp7.i.i = fcmp ogt double %tmp27.i4, 0.000000e+00
+  br i1 %tmp7.i.i, label %spam.exit.i, label %bb8.i.i
+
+bb8.i.i:                                          ; preds = %bb6.i.i
+  tail call void null()
+  br label %spam.exit.i
+
+spam.exit.i:                                      ; preds = %bb8.i.i, %bb6.i.i, %bb3.i.i, %bb1.i.i, %bb50.i
+  %tmp22.i = icmp sgt i32 %tmp4, 0
+  br i1 %tmp22.i, label %bb63.i, label %bb55.i
+
+bb55.i:                                           ; preds = %spam.exit.i
+  tail call void @wobble()
+  %tmp0 = extractelement <9 x double> %tmp4.i.sroa.0.0, i32 0
+  store double %tmp0, double addrspace(1)* null, align 8
+  tail call void @wobble()
+  %tmp61.i = icmp eq i32 %spec.select.2.i, 0
+  br i1 %tmp61.i, label %bb62.i, label %bb63.i
+
+bb62.i:                                           ; preds = %bb55.i
+  store double 0.000000e+00, double addrspace(1)* null, align 8
+  br label %bb63.i
+
+bb63.i:                                           ; preds = %bb62.i, %bb55.i, %spam.exit.i, %bb46.i, %bb44.i
+  br i1 %tmp48.i, label %bb44.i, label %bb64.i
+
+bb64.i:                                           ; preds = %bb63.i, %baz.exit.i
+  %tmp4.i.sroa.0.1 = phi <9 x double> [ <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, %baz.exit.i ], [ zeroinitializer, %bb63.i ]
+  br i1 %tmp31.i, label %bb67.i, label %bb5.backedge
+
+bb5.backedge:                                     ; preds = %bb73.i, %bb70.i, %bb64.i
+  br label %bb5
+
+bb67.i:                                           ; preds = %bb64.i
+  %tmp68.i = icmp eq i32 %tmp4, 1
+  br i1 %tmp68.i, label %bb69.i, label %bb70.i
+
+bb69.i:                                           ; preds = %bb67.i
+  store double 0.000000e+00, double addrspace(1)* null, align 8
+  br label %bb70.i
+
+bb70.i:                                           ; preds = %bb69.i, %bb67.i
+  %tmp3.i.i2 = icmp eq i32 %tmp4, 0
+  br i1 %tmp3.i.i2, label %bb73.i, label %bb5.backedge
+
+bb73.i:                                           ; preds = %bb70.i
+  store double 0.000000e+00, double addrspace(1)* null, align 8
+  br label %bb5.backedge
+}
Index: llvm/test/CodeGen/Thumb2/mve-vst3.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-vst3.ll
+++ llvm/test/CodeGen/Thumb2/mve-vst3.ll
@@ -181,13 +181,13 @@
 ; CHECK-NEXT:    vmov.f32 s0, s17
 ; CHECK-NEXT:    vmov.f32 s2, s14
 ; CHECK-NEXT:    vmov.f32 s3, s18
+; CHECK-NEXT:    vmov.f32 s21, s7
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #96] @ 16-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q0, [sp, #144] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s21, s7
-; CHECK-NEXT:    vldrw.u32 q1, [sp] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f64 d0, d4
 ; CHECK-NEXT:    vstrw.32 q5, [r1, #32]
 ; CHECK-NEXT:    vmov.f32 s22, s11
+; CHECK-NEXT:    vldrw.u32 q1, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vmov.f64 d0, d4
 ; CHECK-NEXT:    vmov.f32 s19, s10
 ; CHECK-NEXT:    vldrw.u32 q2, [sp, #80] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.f32 s21, s7
@@ -200,44 +200,45 @@
 ; CHECK-NEXT:    vmov.f32 s16, s1
 ; CHECK-NEXT:    vmov.f32 s13, s0
 ; CHECK-NEXT:    vldrw.u32 q0, [sp, #64] @ 16-byte Reload
-; CHECK-NEXT:    vmov.f32 s18, s6
-; CHECK-NEXT:    vmov.f32 s15, s5
-; CHECK-NEXT:    vmov.f32 s5, s27
-; CHECK-NEXT:    vmov.f32 s8, s24
-; CHECK-NEXT:    vmov.f32 s6, s3
-; CHECK-NEXT:    vmov.f32 s9, s0
-; CHECK-NEXT:    vmov.f32 s24, s1
-; CHECK-NEXT:    vmov.f32 s27, s2
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #32] @ 16-byte Reload
 ; CHECK-NEXT:    vmov r0, r3, d14
 ; CHECK-NEXT:    vldrw.u32 q7, [sp, #48] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.f32 s7, s11
-; CHECK-NEXT:    vstrw.32 q0, [r1, #128]
+; CHECK-NEXT:    vmov.f32 s8, s24
+; CHECK-NEXT:    vmov.f32 s9, s0
 ; CHECK-NEXT:    vmov.f32 s11, s25
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #96] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.f32 s20, s12
-; CHECK-NEXT:    vmov.32 q6[1], r3
 ; CHECK-NEXT:    vmov.f32 s12, s4
-; CHECK-NEXT:    vstrw.32 q6, [r1, #64]
 ; CHECK-NEXT:    vmov.f32 s4, s10
 ; CHECK-NEXT:    vmov.32 q2[2], r0
 ; CHECK-NEXT:    vmov r0, lr, d14
 ; CHECK-NEXT:    vldrw.u32 q7, [sp, #144] @ 16-byte Reload
-; CHECK-NEXT:    vmov.32 q0[1], lr
+; CHECK-NEXT:    vmov.f32 s18, s6
 ; CHECK-NEXT:    vmov.32 q5[2], r0
-; CHECK-NEXT:    vstrw.32 q0, [r1, #160]
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #112] @ 16-byte Reload
-; CHECK-NEXT:    vmov r2, r4, d14
+; CHECK-NEXT:    vmov.f64 d12, d14
 ; CHECK-NEXT:    vstrw.32 q2, [r1, #48]
-; CHECK-NEXT:    vstrw.32 q0, [r1, #176]
-; CHECK-NEXT:    vldrw.u32 q0, [sp, #128] @ 16-byte Reload
+; CHECK-NEXT:    vstrw.32 q5, [r1, #144]
+; CHECK-NEXT:    vmov.f32 s15, s5
+; CHECK-NEXT:    vmov.f32 s5, s27
+; CHECK-NEXT:    vmov.f32 s6, s3
+; CHECK-NEXT:    vmov.f32 s24, s1
+; CHECK-NEXT:    vstrw.32 q1, [r1, #80]
+; CHECK-NEXT:    vmov.f32 s27, s2
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT:    vmov r2, r4, d14
+; CHECK-NEXT:    vmov.32 q6[1], r3
+; CHECK-NEXT:    vstrw.32 q0, [r1, #128]
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #96] @ 16-byte Reload
 ; CHECK-NEXT:    vmov.32 q3[2], r2
 ; CHECK-NEXT:    vmov.32 q4[1], r4
-; CHECK-NEXT:    vmov.32 q0[2], r12
-; CHECK-NEXT:    vstrw.32 q1, [r1, #80]
+; CHECK-NEXT:    vmov.32 q0[1], lr
+; CHECK-NEXT:    vstrw.32 q6, [r1, #64]
+; CHECK-NEXT:    vstrw.32 q0, [r1, #160]
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #112] @ 16-byte Reload
 ; CHECK-NEXT:    vstrw.32 q3, [r1, #96]
 ; CHECK-NEXT:    vstrw.32 q4, [r1, #112]
-; CHECK-NEXT:    vstrw.32 q5, [r1, #144]
+; CHECK-NEXT:    vstrw.32 q0, [r1, #176]
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #128] @ 16-byte Reload
+; CHECK-NEXT:    vmov.32 q0[2], r12
 ; CHECK-NEXT:    vstrw.32 q0, [r1]
 ; CHECK-NEXT:    add sp, #160
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
Index: llvm/test/CodeGen/Thumb2/mve-vst4.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-vst4.ll
+++ llvm/test/CodeGen/Thumb2/mve-vst4.ll
@@ -122,39 +122,29 @@
 ; CHECK-NEXT:    sub sp, #192
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
 ; CHECK-NEXT:    add r2, sp, #64
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #80]
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #176]
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #208]
 ; CHECK-NEXT:    vldrw.u32 q2, [r0, #144]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #80]
 ; CHECK-NEXT:    vstmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #64]
-; CHECK-NEXT:    add r2, sp, #128
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #192]
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #128]
-; CHECK-NEXT:    vldrw.u32 q4, [r0, #240]
-; CHECK-NEXT:    vstmia r2, {d2, d3, d4, d5, d6, d7, d8, d9} @ 64-byte Spill
-; CHECK-NEXT:    add r2, sp, #128
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload
-; CHECK-NEXT:    add r2, sp, #128
-; CHECK-NEXT:    vldrw.u32 q6, [r0, #176]
-; CHECK-NEXT:    vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
-; CHECK-NEXT:    add r2, sp, #128
-; CHECK-NEXT:    vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload
-; CHECK-NEXT:    add r2, sp, #128
-; CHECK-NEXT:    vldrw.u32 q5, [r0, #112]
-; CHECK-NEXT:    vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
-; CHECK-NEXT:    add r2, sp, #128
-; CHECK-NEXT:    vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload
-; CHECK-NEXT:    add r2, sp, #128
-; CHECK-NEXT:    vldrw.u32 q4, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #128]
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #240]
+; CHECK-NEXT:    vmov q6, q4
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #192]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #64]
 ; CHECK-NEXT:    vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q2, [r0, #160]
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #48]
+; CHECK-NEXT:    add r2, sp, #128
+; CHECK-NEXT:    vmov q7, q5
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #224]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #96]
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #112]
 ; CHECK-NEXT:    vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
 ; CHECK-NEXT:    vmov q6, q2
-; CHECK-NEXT:    vmov q7, q3
 ; CHECK-NEXT:    vmov q5, q1
+; CHECK-NEXT:    vmov q7, q3
 ; CHECK-NEXT:    vldmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload
 ; CHECK-NEXT:    add r2, sp, #64
 ; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]
@@ -922,39 +912,29 @@
 ; CHECK-NEXT:    sub sp, #192
 ; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
 ; CHECK-NEXT:    add r2, sp, #64
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #80]
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #176]
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #208]
 ; CHECK-NEXT:    vldrw.u32 q2, [r0, #144]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #80]
 ; CHECK-NEXT:    vstmia r2, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill
-; CHECK-NEXT:    vldrw.u32 q1, [r0, #64]
-; CHECK-NEXT:    add r2, sp, #128
-; CHECK-NEXT:    vldrw.u32 q3, [r0, #192]
-; CHECK-NEXT:    vldrw.u32 q2, [r0, #128]
-; CHECK-NEXT:    vldrw.u32 q4, [r0, #240]
-; CHECK-NEXT:    vstmia r2, {d2, d3, d4, d5, d6, d7, d8, d9} @ 64-byte Spill
-; CHECK-NEXT:    add r2, sp, #128
 ; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload
-; CHECK-NEXT:    add r2, sp, #128
-; CHECK-NEXT:    vldrw.u32 q6, [r0, #176]
-; CHECK-NEXT:    vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
-; CHECK-NEXT:    add r2, sp, #128
-; CHECK-NEXT:    vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload
-; CHECK-NEXT:    add r2, sp, #128
-; CHECK-NEXT:    vldrw.u32 q5, [r0, #112]
-; CHECK-NEXT:    vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
-; CHECK-NEXT:    add r2, sp, #128
-; CHECK-NEXT:    vldmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Reload
-; CHECK-NEXT:    add r2, sp, #128
-; CHECK-NEXT:    vldrw.u32 q4, [r0, #48]
+; CHECK-NEXT:    vldrw.u32 q2, [r0, #128]
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #240]
+; CHECK-NEXT:    vmov q6, q4
+; CHECK-NEXT:    vldrw.u32 q3, [r0, #192]
+; CHECK-NEXT:    vldrw.u32 q1, [r0, #64]
 ; CHECK-NEXT:    vstmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Spill
 ; CHECK-NEXT:    vldrw.u32 q2, [r0, #160]
+; CHECK-NEXT:    vldrw.u32 q4, [r0, #48]
+; CHECK-NEXT:    add r2, sp, #128
+; CHECK-NEXT:    vmov q7, q5
 ; CHECK-NEXT:    vldrw.u32 q3, [r0, #224]
 ; CHECK-NEXT:    vldrw.u32 q1, [r0, #96]
+; CHECK-NEXT:    vldrw.u32 q5, [r0, #112]
 ; CHECK-NEXT:    vstmia r2, {d8, d9, d10, d11, d12, d13, d14, d15} @ 64-byte Spill
 ; CHECK-NEXT:    vmov q6, q2
-; CHECK-NEXT:    vmov q7, q3
 ; CHECK-NEXT:    vmov q5, q1
+; CHECK-NEXT:    vmov q7, q3
 ; CHECK-NEXT:    vldmia sp, {d0, d1, d2, d3, d4, d5, d6, d7} @ 64-byte Reload
 ; CHECK-NEXT:    add r2, sp, #64
 ; CHECK-NEXT:    vldrw.u32 q4, [r0, #32]