diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp --- a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp +++ b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp @@ -7,10 +7,10 @@ //===----------------------------------------------------------------------===// // /// \file -/// This pass tries to remove unnecessary VGPR live range in divergent if-else -/// structure. +/// This pass tries to remove unnecessary VGPR live ranges in divergent if-else +/// structures and waterfall loops. /// -/// When we do structurization, we usually transform a if-else into two +/// When we do structurization, we usually transform an if-else into two /// sucessive if-then (with a flow block to do predicate inversion). Consider a /// simple case after structurization: A divergent value %a was defined before /// if-else and used in both THEN (use in THEN is optional) and ELSE part: @@ -29,10 +29,10 @@ /// /// As register allocator has no idea of the thread-control-flow, it will just /// assume %a would be alive in the whole range of bb.then because of a later -/// use in bb.else. On AMDGPU architecture, the VGPR was accessed with respect +/// use in bb.else. On AMDGPU architecture, the VGPR is accessed with respect /// to exec mask. For this if-else case, the lanes active in bb.then will be -/// inactive in bb.else, and vice-verse. So we are safe to say that %a was dead -/// after the last use in bb.then untill the end of the block. The reason is +/// inactive in bb.else, and vice-versa. So we are safe to say that %a was dead +/// after the last use in bb.then until the end of the block. The reason is /// the instructions in bb.then will only overwrite lanes that will never be /// accessed in bb.else. /// @@ -46,6 +46,28 @@ /// sure the second loop iteration still get correct data. /// 2.) There should be no further uses after the IF-ELSE region. 
/// +/// +/// Waterfall loops get inserted around instructions that use divergent values +/// but can only be executed with a uniform value. For example an indirect call +/// to a divergent address: +/// bb.start: +/// %a = ... +/// %fun = ... +/// ... +/// bb.loop: +/// call %fun (%a) +/// ... // %a can be dead here +/// loop %bb.loop +/// +/// The loop block is executed multiple times, but it is run exactly once for +/// each active lane. Similar to the if-else case, the register allocator +/// assumes that %a is live throughout the loop as it is used again in the next +/// iteration. If %a is a VGPR that is unused after the loop, it does not need +/// to be live after its last use in the loop block. By inserting a phi-node at +/// the start of bb.loop that is undef when coming from bb.loop, the register +/// allocator knows that the value of %a does not need to be preserved through +/// iterations of the loop. +/// // //===----------------------------------------------------------------------===// @@ -89,6 +111,10 @@ SmallSetVector<MachineBasicBlock *, 16> &ElseBlocks, SmallVectorImpl<Register> &CandidateRegs) const; + void collectWaterfallCandidateRegisters( + MachineBasicBlock *Loop, + SmallSetVector<Register, 16> &CandidateRegs) const; + void findNonPHIUsesInBlock(Register Reg, MachineBasicBlock *MBB, SmallVectorImpl<MachineInstr *> &Uses) const; @@ -105,6 +131,8 @@ MachineBasicBlock *Flow, MachineBasicBlock *Endif, SmallSetVector<MachineBasicBlock *, 16> &ElseBlocks) const; + void optimizeWaterfallLiveRange(Register Reg, MachineBasicBlock *Loop) const; + SIOptimizeVGPRLiveRange() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; @@ -278,6 +306,54 @@ } } +/// Collect the registers used in the waterfall loop block that are defined +/// before. 
+void SIOptimizeVGPRLiveRange::collectWaterfallCandidateRegisters( + MachineBasicBlock *Loop, + SmallSetVector<Register, 16> &CandidateRegs) const { + + for (auto &MI : Loop->instrs()) { + if (MI.isDebugInstr()) + continue; + + for (auto &MO : MI.operands()) { + if (!MO.isReg() || !MO.getReg() || MO.isDef()) + continue; + + Register MOReg = MO.getReg(); + // We can only optimize AGPR/VGPR virtual registers + if (MOReg.isPhysical() || !TRI->isVectorRegister(*MRI, MOReg)) + continue; + + if (MO.readsReg()) { + const MachineBasicBlock *DefMBB = MRI->getVRegDef(MOReg)->getParent(); + // Make sure the value is defined before the LOOP block + if (DefMBB != Loop && !CandidateRegs.contains(MOReg)) { + // If the variable is used after the loop, the register coalescer will + // merge the newly created register and remove the phi node again. + // Just do nothing in that case. + LiveVariables::VarInfo &OldVarInfo = LV->getVarInfo(MOReg); + bool IsUsed = false; + for (auto *Succ : Loop->successors()) { + if (Succ != Loop && OldVarInfo.isLiveIn(*Succ, MOReg, *MRI)) { + IsUsed = true; + break; + } + } + if (!IsUsed) { + LLVM_DEBUG(dbgs() << "Found candidate reg: " + << printReg(MOReg, TRI, 0, MRI) << '\n'); + CandidateRegs.insert(MOReg); + } else { + LLVM_DEBUG(dbgs() << "Reg is used after loop, ignoring: " + << printReg(MOReg, TRI, 0, MRI) << '\n'); + } + } + } + } + } +} + // Re-calculate the liveness of \p Reg in the THEN-region void SIOptimizeVGPRLiveRange::updateLiveRangeInThenRegion( Register Reg, MachineBasicBlock *If, MachineBasicBlock *Flow) const { @@ -403,12 +479,8 @@ } // Replace all uses in the ELSE region or the PHIs in ENDIF block - for (auto I = MRI->use_begin(Reg), E = MRI->use_end(); I != E;) { - MachineOperand &O = *I; - // This is a little bit tricky, the setReg() will update the linked list, - // so we have to increment the iterator before setReg() to avoid skipping - // some uses. - ++I; + // Use early increment range because setReg() will update the linked list. 
+ for (auto &O : make_early_inc_range(MRI->use_operands(Reg))) { auto *UseMI = O.getParent(); auto *UseBlock = UseMI->getParent(); // Replace uses in Endif block @@ -431,6 +503,53 @@ updateLiveRangeInThenRegion(Reg, If, Flow); } +void SIOptimizeVGPRLiveRange::optimizeWaterfallLiveRange( + Register Reg, MachineBasicBlock *Loop) const { + // Insert a new PHI, marking the value from the last loop iteration undef. + LLVM_DEBUG(dbgs() << "Optimizing " << printReg(Reg, TRI) << '\n'); + const auto *RC = MRI->getRegClass(Reg); + Register NewReg = MRI->createVirtualRegister(RC); + Register UndefReg = MRI->createVirtualRegister(RC); + + // Replace all uses in the LOOP region + // Use early increment range because setReg() will update the linked list. + for (auto &O : make_early_inc_range(MRI->use_operands(Reg))) { + auto *UseMI = O.getParent(); + auto *UseBlock = UseMI->getParent(); + // Replace uses in Loop block + if (UseBlock == Loop) + O.setReg(NewReg); + } + + MachineInstrBuilder PHI = BuildMI(*Loop, Loop->getFirstNonPHI(), DebugLoc(), + TII->get(TargetOpcode::PHI), NewReg); + for (auto *Pred : Loop->predecessors()) { + if (Pred == Loop) + PHI.addReg(UndefReg, RegState::Undef).addMBB(Pred); + else + PHI.addReg(Reg).addMBB(Pred); + } + + LiveVariables::VarInfo &NewVarInfo = LV->getVarInfo(NewReg); + LiveVariables::VarInfo &OldVarInfo = LV->getVarInfo(Reg); + + // collectWaterfallCandidateRegisters only collects registers that are dead + // after the loop. So we know that the old reg is not live throughout the + // whole block anymore. 
+ OldVarInfo.AliveBlocks.reset(Loop->getNumber()); + + // Mark the last use as kill + for (auto &MI : reverse(Loop->instrs())) { + if (MI.readsRegister(NewReg, TRI)) { + MI.addRegisterKilled(NewReg, TRI); + NewVarInfo.Kills.push_back(&MI); + break; + } + } + assert(!NewVarInfo.Kills.empty() && + "Failed to find last usage of register in loop"); +} + char SIOptimizeVGPRLiveRange::ID = 0; INITIALIZE_PASS_BEGIN(SIOptimizeVGPRLiveRange, DEBUG_TYPE, @@ -491,6 +610,16 @@ // Now we are safe to optimize. for (auto Reg : CandidateRegs) optimizeLiveRange(Reg, &MBB, IfTarget, Endif, ElseBlocks); + } else if (MI.getOpcode() == AMDGPU::SI_WATERFALL_LOOP) { + LLVM_DEBUG(dbgs() << "Checking Waterfall loop: " + << printMBBReference(MBB) << '\n'); + + SmallSetVector CandidateRegs; + collectWaterfallCandidateRegisters(&MBB, CandidateRegs); + MadeChange |= !CandidateRegs.empty(); + // Now we are safe to optimize. + for (auto Reg : CandidateRegs) + optimizeWaterfallLiveRange(Reg, &MBB); } } } diff --git a/llvm/test/CodeGen/AMDGPU/image-sample-waterfall.ll b/llvm/test/CodeGen/AMDGPU/image-sample-waterfall.ll --- a/llvm/test/CodeGen/AMDGPU/image-sample-waterfall.ll +++ b/llvm/test/CodeGen/AMDGPU/image-sample-waterfall.ll @@ -24,6 +24,8 @@ ; GCN-NEXT: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[AND]] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, s{{\[}}[[SREG0]]:[[SREG7]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 +; GCN-NEXT: ; implicit-def: $vgpr8_vgpr9 ; GCN-NEXT: s_xor_b64 exec, exec, [[SAVE]] ; GCN-NEXT: s_cbranch_execnz [[RSRC_LOOP]] define amdgpu_ps <4 x float> @water_loop_rsrc(<8 x i32> %rsrc, <4 x i32> inreg %samp, float %s, float %t) { @@ -48,6 +50,8 @@ ; GCN-NEXT: s_nop 0 ; GCN-NEXT: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, s{{\[}}[[SREG0]]:[[SREG3]]{{\]}} dmask:0x1 +; GCN-NEXT: ; implicit-def: 
$vgpr0_vgpr1_vgpr2_vgpr3 +; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN-NEXT: s_xor_b64 exec, exec, [[SAVE]] ; GCN-NEXT: s_cbranch_execnz [[SAMP_LOOP]] define amdgpu_ps <4 x float> @water_loop_samp(<8 x i32> inreg %rsrc, <4 x i32> %samp, float %s, float %t) { diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll --- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll @@ -202,32 +202,28 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v43, s33, 17 +; GCN-NEXT: v_writelane_b32 v40, s33, 17 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_addk_i32 s32, 0x800 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_writelane_b32 v43, s34, 0 -; GCN-NEXT: v_writelane_b32 v43, s35, 1 -; GCN-NEXT: v_writelane_b32 v43, s36, 2 -; GCN-NEXT: v_writelane_b32 v43, s38, 3 -; GCN-NEXT: v_writelane_b32 v43, s39, 4 -; GCN-NEXT: v_writelane_b32 v43, s40, 5 -; GCN-NEXT: v_writelane_b32 v43, s41, 6 -; GCN-NEXT: v_writelane_b32 v43, s42, 7 -; GCN-NEXT: v_writelane_b32 v43, s43, 8 -; GCN-NEXT: v_writelane_b32 v43, s44, 9 -; GCN-NEXT: v_writelane_b32 v43, s45, 10 -; GCN-NEXT: v_writelane_b32 v43, s46, 11 -; GCN-NEXT: v_writelane_b32 v43, s47, 12 -; GCN-NEXT: v_writelane_b32 v43, s48, 13 -; GCN-NEXT: v_writelane_b32 v43, s49, 14 -; GCN-NEXT: v_writelane_b32 v43, s30, 15 -; GCN-NEXT: v_writelane_b32 v43, s31, 16 -; GCN-NEXT: v_mov_b32_e32 v40, v31 +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: v_writelane_b32 v40, s34, 0 
+; GCN-NEXT: v_writelane_b32 v40, s35, 1 +; GCN-NEXT: v_writelane_b32 v40, s36, 2 +; GCN-NEXT: v_writelane_b32 v40, s38, 3 +; GCN-NEXT: v_writelane_b32 v40, s39, 4 +; GCN-NEXT: v_writelane_b32 v40, s40, 5 +; GCN-NEXT: v_writelane_b32 v40, s41, 6 +; GCN-NEXT: v_writelane_b32 v40, s42, 7 +; GCN-NEXT: v_writelane_b32 v40, s43, 8 +; GCN-NEXT: v_writelane_b32 v40, s44, 9 +; GCN-NEXT: v_writelane_b32 v40, s45, 10 +; GCN-NEXT: v_writelane_b32 v40, s46, 11 +; GCN-NEXT: v_writelane_b32 v40, s47, 12 +; GCN-NEXT: v_writelane_b32 v40, s48, 13 +; GCN-NEXT: v_writelane_b32 v40, s49, 14 +; GCN-NEXT: v_writelane_b32 v40, s30, 15 +; GCN-NEXT: v_writelane_b32 v40, s31, 16 ; GCN-NEXT: s_mov_b32 s34, s14 ; GCN-NEXT: s_mov_b32 s35, s13 ; GCN-NEXT: s_mov_b32 s36, s12 @@ -235,13 +231,11 @@ ; GCN-NEXT: s_mov_b64 s[40:41], s[8:9] ; GCN-NEXT: s_mov_b64 s[42:43], s[6:7] ; GCN-NEXT: s_mov_b64 s[44:45], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v42, v1 -; GCN-NEXT: v_mov_b32_e32 v41, v0 ; GCN-NEXT: s_mov_b64 s[46:47], exec ; GCN-NEXT: BB2_1: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_readfirstlane_b32 s16, v41 -; GCN-NEXT: v_readfirstlane_b32 s17, v42 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[41:42] +; GCN-NEXT: v_readfirstlane_b32 s16, v0 +; GCN-NEXT: v_readfirstlane_b32 s17, v1 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] ; GCN-NEXT: s_and_saveexec_b64 s[48:49], vcc ; GCN-NEXT: s_mov_b64 s[4:5], s[44:45] ; GCN-NEXT: s_mov_b64 s[6:7], s[42:43] @@ -250,36 +244,34 @@ ; GCN-NEXT: s_mov_b32 s12, s36 ; GCN-NEXT: s_mov_b32 s13, s35 ; GCN-NEXT: s_mov_b32 s14, s34 -; GCN-NEXT: v_mov_b32_e32 v31, v40 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr31 ; GCN-NEXT: s_xor_b64 exec, exec, s[48:49] ; GCN-NEXT: s_cbranch_execnz BB2_1 ; GCN-NEXT: ; %bb.2: ; GCN-NEXT: s_mov_b64 exec, s[46:47] -; GCN-NEXT: v_readlane_b32 s4, v43, 15 -; GCN-NEXT: v_readlane_b32 s5, v43, 16 -; GCN-NEXT: v_readlane_b32 s49, v43, 14 -; GCN-NEXT: 
v_readlane_b32 s48, v43, 13 -; GCN-NEXT: v_readlane_b32 s47, v43, 12 -; GCN-NEXT: v_readlane_b32 s46, v43, 11 -; GCN-NEXT: v_readlane_b32 s45, v43, 10 -; GCN-NEXT: v_readlane_b32 s44, v43, 9 -; GCN-NEXT: v_readlane_b32 s43, v43, 8 -; GCN-NEXT: v_readlane_b32 s42, v43, 7 -; GCN-NEXT: v_readlane_b32 s41, v43, 6 -; GCN-NEXT: v_readlane_b32 s40, v43, 5 -; GCN-NEXT: v_readlane_b32 s39, v43, 4 -; GCN-NEXT: v_readlane_b32 s38, v43, 3 -; GCN-NEXT: v_readlane_b32 s36, v43, 2 -; GCN-NEXT: v_readlane_b32 s35, v43, 1 -; GCN-NEXT: v_readlane_b32 s34, v43, 0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: s_addk_i32 s32, 0xf800 -; GCN-NEXT: v_readlane_b32 s33, v43, 17 +; GCN-NEXT: v_readlane_b32 s4, v40, 15 +; GCN-NEXT: v_readlane_b32 s5, v40, 16 +; GCN-NEXT: v_readlane_b32 s49, v40, 14 +; GCN-NEXT: v_readlane_b32 s48, v40, 13 +; GCN-NEXT: v_readlane_b32 s47, v40, 12 +; GCN-NEXT: v_readlane_b32 s46, v40, 11 +; GCN-NEXT: v_readlane_b32 s45, v40, 10 +; GCN-NEXT: v_readlane_b32 s44, v40, 9 +; GCN-NEXT: v_readlane_b32 s43, v40, 8 +; GCN-NEXT: v_readlane_b32 s42, v40, 7 +; GCN-NEXT: v_readlane_b32 s41, v40, 6 +; GCN-NEXT: v_readlane_b32 s40, v40, 5 +; GCN-NEXT: v_readlane_b32 s39, v40, 4 +; GCN-NEXT: v_readlane_b32 s38, v40, 3 +; GCN-NEXT: v_readlane_b32 s36, v40, 2 +; GCN-NEXT: v_readlane_b32 s35, v40, 1 +; GCN-NEXT: v_readlane_b32 s34, v40, 0 +; GCN-NEXT: s_addk_i32 s32, 0xfc00 +; GCN-NEXT: v_readlane_b32 s33, v40, 17 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[4:5] @@ -292,32 +284,28 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v43, s33, 17 +; GCN-NEXT: v_writelane_b32 v40, s33, 17 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_addk_i32 s32, 0x800 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_writelane_b32 v43, s34, 0 -; GCN-NEXT: v_writelane_b32 v43, s35, 1 -; GCN-NEXT: v_writelane_b32 v43, s36, 2 -; GCN-NEXT: v_writelane_b32 v43, s38, 3 -; GCN-NEXT: v_writelane_b32 v43, s39, 4 -; GCN-NEXT: v_writelane_b32 v43, s40, 5 -; GCN-NEXT: v_writelane_b32 v43, s41, 6 -; GCN-NEXT: v_writelane_b32 v43, s42, 7 -; GCN-NEXT: v_writelane_b32 v43, s43, 8 -; GCN-NEXT: v_writelane_b32 v43, s44, 9 -; GCN-NEXT: v_writelane_b32 v43, s45, 10 -; GCN-NEXT: v_writelane_b32 v43, s46, 11 -; GCN-NEXT: v_writelane_b32 v43, s47, 12 -; GCN-NEXT: v_writelane_b32 v43, s48, 13 -; GCN-NEXT: v_writelane_b32 v43, s49, 14 -; GCN-NEXT: v_writelane_b32 v43, s30, 15 -; GCN-NEXT: v_writelane_b32 v43, s31, 16 -; GCN-NEXT: v_mov_b32_e32 v40, v31 +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: v_writelane_b32 v40, s34, 0 +; GCN-NEXT: v_writelane_b32 v40, s35, 1 +; GCN-NEXT: v_writelane_b32 v40, s36, 2 +; GCN-NEXT: v_writelane_b32 v40, s38, 3 +; GCN-NEXT: v_writelane_b32 v40, s39, 4 +; GCN-NEXT: v_writelane_b32 v40, s40, 5 +; GCN-NEXT: v_writelane_b32 v40, s41, 6 +; GCN-NEXT: v_writelane_b32 v40, s42, 7 +; GCN-NEXT: v_writelane_b32 v40, s43, 8 +; GCN-NEXT: v_writelane_b32 v40, s44, 9 +; GCN-NEXT: v_writelane_b32 v40, s45, 10 +; GCN-NEXT: v_writelane_b32 v40, s46, 11 +; GCN-NEXT: v_writelane_b32 v40, s47, 12 +; GCN-NEXT: 
v_writelane_b32 v40, s48, 13 +; GCN-NEXT: v_writelane_b32 v40, s49, 14 +; GCN-NEXT: v_writelane_b32 v40, s30, 15 +; GCN-NEXT: v_writelane_b32 v40, s31, 16 ; GCN-NEXT: s_mov_b32 s34, s14 ; GCN-NEXT: s_mov_b32 s35, s13 ; GCN-NEXT: s_mov_b32 s36, s12 @@ -325,13 +313,11 @@ ; GCN-NEXT: s_mov_b64 s[40:41], s[8:9] ; GCN-NEXT: s_mov_b64 s[42:43], s[6:7] ; GCN-NEXT: s_mov_b64 s[44:45], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v42, v1 -; GCN-NEXT: v_mov_b32_e32 v41, v0 ; GCN-NEXT: s_mov_b64 s[46:47], exec ; GCN-NEXT: BB3_1: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_readfirstlane_b32 s16, v41 -; GCN-NEXT: v_readfirstlane_b32 s17, v42 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[41:42] +; GCN-NEXT: v_readfirstlane_b32 s16, v0 +; GCN-NEXT: v_readfirstlane_b32 s17, v1 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] ; GCN-NEXT: s_and_saveexec_b64 s[48:49], vcc ; GCN-NEXT: v_mov_b32_e32 v0, 0x7b ; GCN-NEXT: s_mov_b64 s[4:5], s[44:45] @@ -341,36 +327,34 @@ ; GCN-NEXT: s_mov_b32 s12, s36 ; GCN-NEXT: s_mov_b32 s13, s35 ; GCN-NEXT: s_mov_b32 s14, s34 -; GCN-NEXT: v_mov_b32_e32 v31, v40 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr31 ; GCN-NEXT: s_xor_b64 exec, exec, s[48:49] ; GCN-NEXT: s_cbranch_execnz BB3_1 ; GCN-NEXT: ; %bb.2: ; GCN-NEXT: s_mov_b64 exec, s[46:47] -; GCN-NEXT: v_readlane_b32 s4, v43, 15 -; GCN-NEXT: v_readlane_b32 s5, v43, 16 -; GCN-NEXT: v_readlane_b32 s49, v43, 14 -; GCN-NEXT: v_readlane_b32 s48, v43, 13 -; GCN-NEXT: v_readlane_b32 s47, v43, 12 -; GCN-NEXT: v_readlane_b32 s46, v43, 11 -; GCN-NEXT: v_readlane_b32 s45, v43, 10 -; GCN-NEXT: v_readlane_b32 s44, v43, 9 -; GCN-NEXT: v_readlane_b32 s43, v43, 8 -; GCN-NEXT: v_readlane_b32 s42, v43, 7 -; GCN-NEXT: v_readlane_b32 s41, v43, 6 -; GCN-NEXT: v_readlane_b32 s40, v43, 5 -; GCN-NEXT: v_readlane_b32 s39, v43, 4 -; GCN-NEXT: v_readlane_b32 s38, v43, 3 -; GCN-NEXT: v_readlane_b32 s36, v43, 2 -; GCN-NEXT: v_readlane_b32 s35, v43, 1 -; 
GCN-NEXT: v_readlane_b32 s34, v43, 0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: s_addk_i32 s32, 0xf800 -; GCN-NEXT: v_readlane_b32 s33, v43, 17 +; GCN-NEXT: v_readlane_b32 s4, v40, 15 +; GCN-NEXT: v_readlane_b32 s5, v40, 16 +; GCN-NEXT: v_readlane_b32 s49, v40, 14 +; GCN-NEXT: v_readlane_b32 s48, v40, 13 +; GCN-NEXT: v_readlane_b32 s47, v40, 12 +; GCN-NEXT: v_readlane_b32 s46, v40, 11 +; GCN-NEXT: v_readlane_b32 s45, v40, 10 +; GCN-NEXT: v_readlane_b32 s44, v40, 9 +; GCN-NEXT: v_readlane_b32 s43, v40, 8 +; GCN-NEXT: v_readlane_b32 s42, v40, 7 +; GCN-NEXT: v_readlane_b32 s41, v40, 6 +; GCN-NEXT: v_readlane_b32 s40, v40, 5 +; GCN-NEXT: v_readlane_b32 s39, v40, 4 +; GCN-NEXT: v_readlane_b32 s38, v40, 3 +; GCN-NEXT: v_readlane_b32 s36, v40, 2 +; GCN-NEXT: v_readlane_b32 s35, v40, 1 +; GCN-NEXT: v_readlane_b32 s34, v40, 0 +; GCN-NEXT: s_addk_i32 s32, 0xfc00 +; GCN-NEXT: v_readlane_b32 s33, v40, 17 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[4:5] @@ -383,32 +367,28 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v43, s33, 17 +; GCN-NEXT: v_writelane_b32 v40, s33, 17 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_addk_i32 s32, 0x800 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; 
GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_writelane_b32 v43, s34, 0 -; GCN-NEXT: v_writelane_b32 v43, s35, 1 -; GCN-NEXT: v_writelane_b32 v43, s36, 2 -; GCN-NEXT: v_writelane_b32 v43, s38, 3 -; GCN-NEXT: v_writelane_b32 v43, s39, 4 -; GCN-NEXT: v_writelane_b32 v43, s40, 5 -; GCN-NEXT: v_writelane_b32 v43, s41, 6 -; GCN-NEXT: v_writelane_b32 v43, s42, 7 -; GCN-NEXT: v_writelane_b32 v43, s43, 8 -; GCN-NEXT: v_writelane_b32 v43, s44, 9 -; GCN-NEXT: v_writelane_b32 v43, s45, 10 -; GCN-NEXT: v_writelane_b32 v43, s46, 11 -; GCN-NEXT: v_writelane_b32 v43, s47, 12 -; GCN-NEXT: v_writelane_b32 v43, s48, 13 -; GCN-NEXT: v_writelane_b32 v43, s49, 14 -; GCN-NEXT: v_writelane_b32 v43, s30, 15 -; GCN-NEXT: v_writelane_b32 v43, s31, 16 -; GCN-NEXT: v_mov_b32_e32 v40, v31 +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: v_writelane_b32 v40, s34, 0 +; GCN-NEXT: v_writelane_b32 v40, s35, 1 +; GCN-NEXT: v_writelane_b32 v40, s36, 2 +; GCN-NEXT: v_writelane_b32 v40, s38, 3 +; GCN-NEXT: v_writelane_b32 v40, s39, 4 +; GCN-NEXT: v_writelane_b32 v40, s40, 5 +; GCN-NEXT: v_writelane_b32 v40, s41, 6 +; GCN-NEXT: v_writelane_b32 v40, s42, 7 +; GCN-NEXT: v_writelane_b32 v40, s43, 8 +; GCN-NEXT: v_writelane_b32 v40, s44, 9 +; GCN-NEXT: v_writelane_b32 v40, s45, 10 +; GCN-NEXT: v_writelane_b32 v40, s46, 11 +; GCN-NEXT: v_writelane_b32 v40, s47, 12 +; GCN-NEXT: v_writelane_b32 v40, s48, 13 +; GCN-NEXT: v_writelane_b32 v40, s49, 14 +; GCN-NEXT: v_writelane_b32 v40, s30, 15 +; GCN-NEXT: v_writelane_b32 v40, s31, 16 ; GCN-NEXT: s_mov_b32 s34, s14 ; GCN-NEXT: s_mov_b32 s35, s13 ; GCN-NEXT: s_mov_b32 s36, s12 @@ -416,13 +396,11 @@ ; GCN-NEXT: s_mov_b64 s[40:41], s[8:9] ; GCN-NEXT: s_mov_b64 s[42:43], s[6:7] ; GCN-NEXT: s_mov_b64 s[44:45], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v42, v1 -; GCN-NEXT: v_mov_b32_e32 v41, v0 ; GCN-NEXT: s_mov_b64 s[46:47], exec ; GCN-NEXT: BB4_1: ; 
=>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_readfirstlane_b32 s16, v41 -; GCN-NEXT: v_readfirstlane_b32 s17, v42 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[41:42] +; GCN-NEXT: v_readfirstlane_b32 s16, v0 +; GCN-NEXT: v_readfirstlane_b32 s17, v1 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] ; GCN-NEXT: s_and_saveexec_b64 s[48:49], vcc ; GCN-NEXT: s_mov_b64 s[4:5], s[44:45] ; GCN-NEXT: s_mov_b64 s[6:7], s[42:43] @@ -431,37 +409,36 @@ ; GCN-NEXT: s_mov_b32 s12, s36 ; GCN-NEXT: s_mov_b32 s13, s35 ; GCN-NEXT: s_mov_b32 s14, s34 -; GCN-NEXT: v_mov_b32_e32 v31, v40 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr31 ; GCN-NEXT: s_xor_b64 exec, exec, s[48:49] ; GCN-NEXT: s_cbranch_execnz BB4_1 ; GCN-NEXT: ; %bb.2: ; GCN-NEXT: s_mov_b64 exec, s[46:47] -; GCN-NEXT: v_add_i32_e32 v0, vcc, 1, v0 -; GCN-NEXT: v_readlane_b32 s4, v43, 15 -; GCN-NEXT: v_readlane_b32 s5, v43, 16 -; GCN-NEXT: v_readlane_b32 s49, v43, 14 -; GCN-NEXT: v_readlane_b32 s48, v43, 13 -; GCN-NEXT: v_readlane_b32 s47, v43, 12 -; GCN-NEXT: v_readlane_b32 s46, v43, 11 -; GCN-NEXT: v_readlane_b32 s45, v43, 10 -; GCN-NEXT: v_readlane_b32 s44, v43, 9 -; GCN-NEXT: v_readlane_b32 s43, v43, 8 -; GCN-NEXT: v_readlane_b32 s42, v43, 7 -; GCN-NEXT: v_readlane_b32 s41, v43, 6 -; GCN-NEXT: v_readlane_b32 s40, v43, 5 -; GCN-NEXT: v_readlane_b32 s39, v43, 4 -; GCN-NEXT: v_readlane_b32 s38, v43, 3 -; GCN-NEXT: v_readlane_b32 s36, v43, 2 -; GCN-NEXT: v_readlane_b32 s35, v43, 1 -; GCN-NEXT: v_readlane_b32 s34, v43, 0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: s_addk_i32 s32, 0xf800 -; GCN-NEXT: v_readlane_b32 s33, v43, 17 +; GCN-NEXT: v_add_i32_e32 v0, vcc, 1, v2 +; GCN-NEXT: v_readlane_b32 s4, 
v40, 15 +; GCN-NEXT: v_readlane_b32 s5, v40, 16 +; GCN-NEXT: v_readlane_b32 s49, v40, 14 +; GCN-NEXT: v_readlane_b32 s48, v40, 13 +; GCN-NEXT: v_readlane_b32 s47, v40, 12 +; GCN-NEXT: v_readlane_b32 s46, v40, 11 +; GCN-NEXT: v_readlane_b32 s45, v40, 10 +; GCN-NEXT: v_readlane_b32 s44, v40, 9 +; GCN-NEXT: v_readlane_b32 s43, v40, 8 +; GCN-NEXT: v_readlane_b32 s42, v40, 7 +; GCN-NEXT: v_readlane_b32 s41, v40, 6 +; GCN-NEXT: v_readlane_b32 s40, v40, 5 +; GCN-NEXT: v_readlane_b32 s39, v40, 4 +; GCN-NEXT: v_readlane_b32 s38, v40, 3 +; GCN-NEXT: v_readlane_b32 s36, v40, 2 +; GCN-NEXT: v_readlane_b32 s35, v40, 1 +; GCN-NEXT: v_readlane_b32 s34, v40, 0 +; GCN-NEXT: s_addk_i32 s32, 0xfc00 +; GCN-NEXT: v_readlane_b32 s33, v40, 17 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[4:5] @@ -475,32 +452,28 @@ ; GCN: ; %bb.0: ; %bb0 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] -; GCN-NEXT: v_writelane_b32 v43, s33, 19 +; GCN-NEXT: v_writelane_b32 v40, s33, 19 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_addk_i32 s32, 0x800 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_writelane_b32 v43, s34, 0 -; GCN-NEXT: v_writelane_b32 v43, s35, 1 -; GCN-NEXT: v_writelane_b32 v43, s36, 2 -; GCN-NEXT: v_writelane_b32 v43, s38, 3 -; GCN-NEXT: v_writelane_b32 v43, s39, 4 -; GCN-NEXT: v_writelane_b32 
v43, s40, 5 -; GCN-NEXT: v_writelane_b32 v43, s41, 6 -; GCN-NEXT: v_writelane_b32 v43, s42, 7 -; GCN-NEXT: v_writelane_b32 v43, s43, 8 -; GCN-NEXT: v_writelane_b32 v43, s44, 9 -; GCN-NEXT: v_writelane_b32 v43, s45, 10 -; GCN-NEXT: v_writelane_b32 v43, s46, 11 -; GCN-NEXT: v_writelane_b32 v43, s47, 12 -; GCN-NEXT: v_writelane_b32 v43, s48, 13 -; GCN-NEXT: v_writelane_b32 v43, s49, 14 -; GCN-NEXT: v_writelane_b32 v43, s50, 15 -; GCN-NEXT: v_writelane_b32 v43, s51, 16 -; GCN-NEXT: v_mov_b32_e32 v40, v31 +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: v_writelane_b32 v40, s34, 0 +; GCN-NEXT: v_writelane_b32 v40, s35, 1 +; GCN-NEXT: v_writelane_b32 v40, s36, 2 +; GCN-NEXT: v_writelane_b32 v40, s38, 3 +; GCN-NEXT: v_writelane_b32 v40, s39, 4 +; GCN-NEXT: v_writelane_b32 v40, s40, 5 +; GCN-NEXT: v_writelane_b32 v40, s41, 6 +; GCN-NEXT: v_writelane_b32 v40, s42, 7 +; GCN-NEXT: v_writelane_b32 v40, s43, 8 +; GCN-NEXT: v_writelane_b32 v40, s44, 9 +; GCN-NEXT: v_writelane_b32 v40, s45, 10 +; GCN-NEXT: v_writelane_b32 v40, s46, 11 +; GCN-NEXT: v_writelane_b32 v40, s47, 12 +; GCN-NEXT: v_writelane_b32 v40, s48, 13 +; GCN-NEXT: v_writelane_b32 v40, s49, 14 +; GCN-NEXT: v_writelane_b32 v40, s50, 15 +; GCN-NEXT: v_writelane_b32 v40, s51, 16 ; GCN-NEXT: s_mov_b32 s34, s14 ; GCN-NEXT: s_mov_b32 s35, s13 ; GCN-NEXT: s_mov_b32 s36, s12 @@ -508,20 +481,18 @@ ; GCN-NEXT: s_mov_b64 s[40:41], s[8:9] ; GCN-NEXT: s_mov_b64 s[42:43], s[6:7] ; GCN-NEXT: s_mov_b64 s[44:45], s[4:5] -; GCN-NEXT: v_mov_b32_e32 v42, v1 -; GCN-NEXT: v_mov_b32_e32 v41, v0 -; GCN-NEXT: v_and_b32_e32 v0, 1, v2 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN-NEXT: v_and_b32_e32 v2, 1, v2 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GCN-NEXT: s_and_saveexec_b64 s[46:47], vcc ; GCN-NEXT: s_cbranch_execz BB5_4 ; GCN-NEXT: ; %bb.1: ; %bb1 -; GCN-NEXT: v_writelane_b32 v43, s30, 17 -; GCN-NEXT: v_writelane_b32 v43, s31, 18 +; GCN-NEXT: v_writelane_b32 v40, s30, 17 +; GCN-NEXT: v_writelane_b32 v40, s31, 18 ; GCN-NEXT: s_mov_b64 
s[48:49], exec ; GCN-NEXT: BB5_2: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_readfirstlane_b32 s16, v41 -; GCN-NEXT: v_readfirstlane_b32 s17, v42 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[41:42] +; GCN-NEXT: v_readfirstlane_b32 s16, v0 +; GCN-NEXT: v_readfirstlane_b32 s17, v1 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[16:17], v[0:1] ; GCN-NEXT: s_and_saveexec_b64 s[50:51], vcc ; GCN-NEXT: s_mov_b64 s[4:5], s[44:45] ; GCN-NEXT: s_mov_b64 s[6:7], s[42:43] @@ -530,40 +501,38 @@ ; GCN-NEXT: s_mov_b32 s12, s36 ; GCN-NEXT: s_mov_b32 s13, s35 ; GCN-NEXT: s_mov_b32 s14, s34 -; GCN-NEXT: v_mov_b32_e32 v31, v40 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN-NEXT: ; implicit-def: $vgpr31 ; GCN-NEXT: s_xor_b64 exec, exec, s[50:51] ; GCN-NEXT: s_cbranch_execnz BB5_2 ; GCN-NEXT: ; %bb.3: ; GCN-NEXT: s_mov_b64 exec, s[48:49] -; GCN-NEXT: v_readlane_b32 s30, v43, 17 -; GCN-NEXT: v_readlane_b32 s31, v43, 18 +; GCN-NEXT: v_readlane_b32 s30, v40, 17 +; GCN-NEXT: v_readlane_b32 s31, v40, 18 ; GCN-NEXT: BB5_4: ; %bb2 ; GCN-NEXT: s_or_b64 exec, exec, s[46:47] -; GCN-NEXT: v_readlane_b32 s51, v43, 16 -; GCN-NEXT: v_readlane_b32 s50, v43, 15 -; GCN-NEXT: v_readlane_b32 s49, v43, 14 -; GCN-NEXT: v_readlane_b32 s48, v43, 13 -; GCN-NEXT: v_readlane_b32 s47, v43, 12 -; GCN-NEXT: v_readlane_b32 s46, v43, 11 -; GCN-NEXT: v_readlane_b32 s45, v43, 10 -; GCN-NEXT: v_readlane_b32 s44, v43, 9 -; GCN-NEXT: v_readlane_b32 s43, v43, 8 -; GCN-NEXT: v_readlane_b32 s42, v43, 7 -; GCN-NEXT: v_readlane_b32 s41, v43, 6 -; GCN-NEXT: v_readlane_b32 s40, v43, 5 -; GCN-NEXT: v_readlane_b32 s39, v43, 4 -; GCN-NEXT: v_readlane_b32 s38, v43, 3 -; GCN-NEXT: v_readlane_b32 s36, v43, 2 -; GCN-NEXT: v_readlane_b32 s35, v43, 1 -; GCN-NEXT: v_readlane_b32 s34, v43, 0 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, 
s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: s_addk_i32 s32, 0xf800 -; GCN-NEXT: v_readlane_b32 s33, v43, 19 +; GCN-NEXT: v_readlane_b32 s51, v40, 16 +; GCN-NEXT: v_readlane_b32 s50, v40, 15 +; GCN-NEXT: v_readlane_b32 s49, v40, 14 +; GCN-NEXT: v_readlane_b32 s48, v40, 13 +; GCN-NEXT: v_readlane_b32 s47, v40, 12 +; GCN-NEXT: v_readlane_b32 s46, v40, 11 +; GCN-NEXT: v_readlane_b32 s45, v40, 10 +; GCN-NEXT: v_readlane_b32 s44, v40, 9 +; GCN-NEXT: v_readlane_b32 s43, v40, 8 +; GCN-NEXT: v_readlane_b32 s42, v40, 7 +; GCN-NEXT: v_readlane_b32 s41, v40, 6 +; GCN-NEXT: v_readlane_b32 s40, v40, 5 +; GCN-NEXT: v_readlane_b32 s39, v40, 4 +; GCN-NEXT: v_readlane_b32 s38, v40, 3 +; GCN-NEXT: v_readlane_b32 s36, v40, 2 +; GCN-NEXT: v_readlane_b32 s35, v40, 1 +; GCN-NEXT: v_readlane_b32 s34, v40, 0 +; GCN-NEXT: s_addk_i32 s32, 0xfc00 +; GCN-NEXT: v_readlane_b32 s33, v40, 19 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -583,48 +552,145 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_writelane_b32 v42, s33, 6 +; GCN-NEXT: v_writelane_b32 v40, s33, 6 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_addk_i32 s32, 0x400 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_writelane_b32 v42, s34, 0 -; GCN-NEXT: v_writelane_b32 v42, s35, 1 -; GCN-NEXT: v_writelane_b32 v42, s36, 2 -; GCN-NEXT: v_writelane_b32 v42, s37, 3 -; GCN-NEXT: 
v_writelane_b32 v42, s30, 4 -; GCN-NEXT: v_writelane_b32 v42, s31, 5 -; GCN-NEXT: v_mov_b32_e32 v41, v1 -; GCN-NEXT: v_mov_b32_e32 v40, v0 +; GCN-NEXT: v_writelane_b32 v40, s34, 0 +; GCN-NEXT: v_writelane_b32 v40, s35, 1 +; GCN-NEXT: v_writelane_b32 v40, s36, 2 +; GCN-NEXT: v_writelane_b32 v40, s37, 3 +; GCN-NEXT: v_writelane_b32 v40, s30, 4 +; GCN-NEXT: v_writelane_b32 v40, s31, 5 ; GCN-NEXT: s_mov_b64 s[34:35], exec ; GCN-NEXT: BB6_1: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_readfirstlane_b32 s6, v40 -; GCN-NEXT: v_readfirstlane_b32 s7, v41 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[40:41] +; GCN-NEXT: v_readfirstlane_b32 s6, v0 +; GCN-NEXT: v_readfirstlane_b32 s7, v1 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[0:1] ; GCN-NEXT: s_and_saveexec_b64 s[36:37], vcc ; GCN-NEXT: s_movk_i32 s4, 0x7b ; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GCN-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN-NEXT: s_xor_b64 exec, exec, s[36:37] ; GCN-NEXT: s_cbranch_execnz BB6_1 ; GCN-NEXT: ; %bb.2: ; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: v_readlane_b32 s4, v42, 4 -; GCN-NEXT: v_readlane_b32 s5, v42, 5 -; GCN-NEXT: v_readlane_b32 s37, v42, 3 -; GCN-NEXT: v_readlane_b32 s36, v42, 2 -; GCN-NEXT: v_readlane_b32 s35, v42, 1 -; GCN-NEXT: v_readlane_b32 s34, v42, 0 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: v_readlane_b32 s4, v40, 4 +; GCN-NEXT: v_readlane_b32 s5, v40, 5 +; GCN-NEXT: v_readlane_b32 s37, v40, 3 +; GCN-NEXT: v_readlane_b32 s36, v40, 2 +; GCN-NEXT: v_readlane_b32 s35, v40, 1 +; GCN-NEXT: v_readlane_b32 s34, v40, 0 ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: v_readlane_b32 s33, v42, 6 +; GCN-NEXT: v_readlane_b32 s33, v40, 6 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload ; 
GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[4:5] call amdgpu_gfx void %fptr(i32 inreg 123) ret void } + +define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, void(i32)* %fptr) { +; GCN-LABEL: test_indirect_call_vgpr_ptr_arg_and_reuse: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_writelane_b32 v41, s33, 6 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: v_writelane_b32 v41, s34, 0 +; GCN-NEXT: v_writelane_b32 v41, s35, 1 +; GCN-NEXT: v_writelane_b32 v41, s36, 2 +; GCN-NEXT: v_writelane_b32 v41, s37, 3 +; GCN-NEXT: v_writelane_b32 v41, s30, 4 +; GCN-NEXT: v_writelane_b32 v41, s31, 5 +; GCN-NEXT: v_mov_b32_e32 v40, v0 +; GCN-NEXT: s_mov_b64 s[34:35], exec +; GCN-NEXT: BB7_1: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: v_readfirstlane_b32 s4, v1 +; GCN-NEXT: v_readfirstlane_b32 s5, v2 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[1:2] +; GCN-NEXT: s_and_saveexec_b64 s[36:37], vcc +; GCN-NEXT: v_mov_b32_e32 v0, v40 +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GCN-NEXT: s_xor_b64 exec, exec, s[36:37] +; GCN-NEXT: s_cbranch_execnz BB7_1 +; GCN-NEXT: ; %bb.2: +; GCN-NEXT: s_mov_b64 exec, s[34:35] +; GCN-NEXT: v_mov_b32_e32 v0, v40 +; GCN-NEXT: v_readlane_b32 s4, v41, 4 +; GCN-NEXT: v_readlane_b32 s5, v41, 5 +; GCN-NEXT: v_readlane_b32 s37, v41, 3 +; GCN-NEXT: v_readlane_b32 s36, v41, 2 +; GCN-NEXT: v_readlane_b32 s35, v41, 1 +; GCN-NEXT: v_readlane_b32 s34, v41, 0 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: s_addk_i32 s32, 0xfc00 +; GCN-NEXT: v_readlane_b32 s33, v41, 6 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: 
buffer_load_dword v41, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[4:5] + call amdgpu_gfx void %fptr(i32 %i) + ret i32 %i +} + +; Use a variable inside a waterfall loop and use the return variable after the loop. +; TODO The argument and return variable could be in the same physical register, but the register +; allocator is not able to do that because the return value clashes with the liverange of an +; IMPLICIT_DEF of the argument. +define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, i32(i32)* %fptr) { +; GCN-LABEL: test_indirect_call_vgpr_ptr_arg_and_return: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: v_writelane_b32 v40, s33, 6 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_addk_i32 s32, 0x400 +; GCN-NEXT: v_writelane_b32 v40, s34, 0 +; GCN-NEXT: v_writelane_b32 v40, s35, 1 +; GCN-NEXT: v_writelane_b32 v40, s36, 2 +; GCN-NEXT: v_writelane_b32 v40, s37, 3 +; GCN-NEXT: v_writelane_b32 v40, s30, 4 +; GCN-NEXT: v_writelane_b32 v40, s31, 5 +; GCN-NEXT: s_mov_b64 s[34:35], exec +; GCN-NEXT: BB8_1: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: v_readfirstlane_b32 s4, v1 +; GCN-NEXT: v_readfirstlane_b32 s5, v2 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[1:2] +; GCN-NEXT: s_and_saveexec_b64 s[36:37], vcc +; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GCN-NEXT: v_mov_b32_e32 v3, v0 +; GCN-NEXT: ; implicit-def: $vgpr1_vgpr2 +; GCN-NEXT: ; implicit-def: $vgpr0 +; GCN-NEXT: s_xor_b64 exec, exec, s[36:37] +; GCN-NEXT: s_cbranch_execnz BB8_1 +; GCN-NEXT: ; %bb.2: +; GCN-NEXT: s_mov_b64 exec, s[34:35] +; GCN-NEXT: v_mov_b32_e32 v0, v3 +; GCN-NEXT: v_readlane_b32 s4, v40, 4 +; GCN-NEXT: v_readlane_b32 s5, v40, 5 +; GCN-NEXT: v_readlane_b32 s37, v40, 3 +; GCN-NEXT: 
v_readlane_b32 s36, v40, 2 +; GCN-NEXT: v_readlane_b32 s35, v40, 1 +; GCN-NEXT: v_readlane_b32 s34, v40, 0 +; GCN-NEXT: s_addk_i32 s32, 0xfc00 +; GCN-NEXT: v_readlane_b32 s33, v40, 6 +; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload +; GCN-NEXT: s_mov_b64 exec, s[6:7] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_setpc_b64 s[4:5] + %ret = call amdgpu_gfx i32 %fptr(i32 %i) + ret i32 %ret +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.v3f16.ll @@ -18,6 +18,8 @@ ; GFX10-NEXT: s_and_b32 s0, vcc_lo, s0 ; GFX10-NEXT: s_and_saveexec_b32 s0, s0 ; GFX10-NEXT: buffer_load_format_d16_xyz v[5:6], v4, s[4:7], 0 idxen +; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX10-NEXT: ; implicit-def: $vgpr4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s0 ; GFX10-NEXT: s_cbranch_execnz BB0_1 @@ -44,6 +46,8 @@ ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_load_format_d16_xyz v[5:6], v4, s[4:7], 0 idxen +; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX9-NEXT: ; implicit-def: $vgpr4 ; GFX9-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_cbranch_execnz BB0_1 ; GFX9-NEXT: ; %bb.2: @@ -68,6 +72,8 @@ ; GFX8-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_load_format_d16_xyz v[5:6], v4, s[4:7], 0 idxen +; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 +; GFX8-NEXT: ; implicit-def: $vgpr4 ; GFX8-NEXT: s_xor_b64 exec, exec, s[0:1] ; GFX8-NEXT: s_cbranch_execnz BB0_1 ; GFX8-NEXT: ; %bb.2: diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll 
b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll --- a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll @@ -13,19 +13,20 @@ ; GCN-NEXT: s_mov_b32 s5, exec_lo ; GCN-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo ; GCN-NEXT: s_clause 0x1 -; GCN-NEXT: flat_load_dwordx2 v[2:3], v[6:7] -; GCN-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GCN-NEXT: flat_load_dwordx2 v[4:5], v[6:7] +; GCN-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GCN-NEXT: BB0_2: ; Parent Loop BB0_1 Depth=1 ; GCN-NEXT: ; => This Inner Loop Header: Depth=2 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: v_readfirstlane_b32 s8, v4 -; GCN-NEXT: v_readfirstlane_b32 s9, v5 -; GCN-NEXT: v_readfirstlane_b32 s10, v2 -; GCN-NEXT: v_readfirstlane_b32 s11, v3 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[4:5] -; GCN-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] +; GCN-NEXT: v_readfirstlane_b32 s8, v2 +; GCN-NEXT: v_readfirstlane_b32 s9, v3 +; GCN-NEXT: v_readfirstlane_b32 s10, v4 +; GCN-NEXT: v_readfirstlane_b32 s11, v5 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[2:3] +; GCN-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[4:5] ; GCN-NEXT: s_and_b32 s4, vcc_lo, s4 ; GCN-NEXT: s_and_saveexec_b32 s4, s4 +; GCN-NEXT: ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5 ; GCN-NEXT: buffer_store_dword v0, v0, s[8:11], 0 offen ; GCN-NEXT: s_waitcnt_depctr 0xffe3 ; GCN-NEXT: s_xor_b32 exec_lo, exec_lo, s4