diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -69,12 +69,22 @@
                           Register PreloadedPrivateBufferReg,
                           Register ScratchRsrcReg,
                           Register ScratchWaveOffsetReg) const;

+  void emitPrologueEntryCFI(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MBBI,
+                            const DebugLoc &DL) const;
+
 public:
   bool hasFP(const MachineFunction &MF) const override;

   /// Create a CFI index for CFIInst and build a MachineInstr around it.
   void buildCFI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                 const DebugLoc &DL, const MCCFIInstruction &CFIInst) const;
+
+  /// Create a CFI index describing a spill of an SGPR to a single lane of
+  /// a VGPR and build a MachineInstr around it.
+  void buildCFIForSGPRToVGPRSpill(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator MBBI,
+                                  const DebugLoc &DL, const Register SGPR,
+                                  const Register VGPR, const int Lane) const;
 };

 } // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -13,6 +13,7 @@
 #include "SIRegisterInfo.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
@@ -20,12 +21,12 @@
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
 #include "llvm/MC/MCDwarf.h"
+#include "llvm/Support/LEB128.h"

 using namespace llvm;

 #define DEBUG_TYPE "frame-info"

-
 // Find a scratch register that we can use at the start of the prologue to
 // re-align the stack pointer. We avoid using callee-save registers since they
 // may appear to be free when this is called from canUseAsPrologue (during
@@ -700,6 +701,55 @@
   return ScratchExecCopy;
 }

+void SIFrameLowering::emitPrologueEntryCFI(MachineBasicBlock &MBB,
+                                           MachineBasicBlock::iterator MBBI,
+                                           const DebugLoc &DL) const {
+  const MachineFunction &MF = *MBB.getParent();
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  const MCRegisterInfo *MCRI = MF.getMMI().getContext().getRegisterInfo();
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo();
+  Register StackPtrReg =
+      MF.getInfo<SIMachineFunctionInfo>()->getStackPtrOffsetReg();
+
+  // DW_ASPACE_AMDGPU_private_wave FIXME: should be defined elsewhere
+  buildCFI(MBB, MBBI, DL,
+           MCCFIInstruction::createLLVMDefAspaceCfa(
+               nullptr, MCRI->getDwarfRegNum(StackPtrReg, false), 0, 6));
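+  // Note: this lowers to the `.cfi_llvm_def_aspace_cfa 64, 0, 6` directive
+  // checked in debug-frame.ll: at entry the CFA is the value of the incoming
+  // stack pointer (SGPR32 is DWARF register 64) in the wave-private scratch
+  // address space (address space 6).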
+
+  static const char PCEncodedInst[] = {
+      dwarf::DW_CFA_expression,
+      16, // PC 64
+      8,  // length
+      static_cast<char>(dwarf::DW_OP_regx),
+      62, // SGPR30
+      static_cast<char>(dwarf::DW_OP_piece),
+      4, // 32 bits
+      static_cast<char>(dwarf::DW_OP_regx),
+      63, // SGPR31
+      static_cast<char>(dwarf::DW_OP_piece),
+      4 // 32 bits
+  };
+  buildCFI(MBB, MBBI, DL,
+           MCCFIInstruction::createEscape(
+               nullptr, StringRef(PCEncodedInst, sizeof(PCEncodedInst))));
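+  // Assembled, these bytes are the `.cfi_escape 0x10, 0x10, 0x08, 0x90,
+  // 0x3e, 0x93, 0x04, 0x90, 0x3f, 0x93, 0x04` directive checked in
+  // debug-frame.ll: the 64-bit PC (return address) is described as SGPR30
+  // (low 4 bytes) concatenated with SGPR31 (high 4 bytes).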
+
+  BitVector IsCalleeSaved(TRI.getNumRegs());
+  const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
+  for (unsigned I = 0; CSRegs[I]; ++I) {
+    IsCalleeSaved.set(CSRegs[I]);
+  }
+  auto ProcessReg = [&](MCPhysReg Reg) {
+    if (IsCalleeSaved.test(Reg) || !MRI.isPhysRegModified(Reg))
+      return;
+    MCRegister DwarfReg = MCRI->getDwarfRegNum(Reg, false);
+    buildCFI(MBB, MBBI, DL,
+             MCCFIInstruction::createUndefined(nullptr, DwarfReg));
+  };
+  for_each(TRI.getAllVGPR32(MF), ProcessReg);
+  for_each(TRI.getAllSGPR32(MF), ProcessReg);
+}
+
 void SIFrameLowering::emitPrologue(MachineFunction &MF,
                                    MachineBasicBlock &MBB) const {
   SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
@@ -713,6 +763,7 @@
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
   const SIRegisterInfo &TRI = TII->getRegisterInfo();
+  const MCRegisterInfo *MCRI = MF.getMMI().getContext().getRegisterInfo();

   Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
   Register FramePtrReg = FuncInfo->getFrameOffsetReg();
@@ -731,6 +782,8 @@
   // turn on all lanes before doing the spill to memory.
   Register ScratchExecCopy;

+  emitPrologueEntryCFI(MBB, MBBI, DL);
+
   bool HasFPSaveIndex = FuncInfo->FramePointerSaveIndex.hasValue();
   bool SpillFPToMemory = false;
   // A StackID of SGPRSpill implies that this is a spill from SGPR to VGPR.
@@ -754,6 +807,11 @@
     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY),
             FuncInfo->SGPRForFPSaveRestoreCopy)
         .addReg(FramePtrReg)
         .setMIFlag(MachineInstr::FrameSetup);
+    buildCFI(
+        MBB, MBBI, DL,
+        MCCFIInstruction::createRegister(
+            nullptr, MCRI->getDwarfRegNum(FramePtrReg, false),
+            MCRI->getDwarfRegNum(FuncInfo->SGPRForFPSaveRestoreCopy, false)));
   }

   // Emit the copy if we need a BP, and are using a free SGPR to save it.
@@ -762,6 +820,11 @@
             FuncInfo->SGPRForBPSaveRestoreCopy)
         .addReg(BasePtrReg)
         .setMIFlag(MachineInstr::FrameSetup);
+    buildCFI(
+        MBB, MBBI, DL,
+        MCCFIInstruction::createRegister(
+            nullptr, MCRI->getDwarfRegNum(BasePtrReg, false),
+            MCRI->getDwarfRegNum(FuncInfo->SGPRForBPSaveRestoreCopy, false)));
   }

   // If a copy has been emitted for FP and/or BP, Make the SGPRs
@@ -790,14 +853,21 @@
     if (!ScratchExecCopy)
       ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true);

+    int FI = Reg.FI.getValue();
+
     buildPrologSpill(LiveRegs, MBB, MBBI, TII, Reg.VGPR,
-                     FuncInfo->getScratchRSrcReg(),
-                     StackPtrReg,
-                     Reg.FI.getValue());
+                     FuncInfo->getScratchRSrcReg(), StackPtrReg, FI);
+
+    // We spill the entire VGPR, so we can get away with just cfi_offset
+    buildCFI(MBB, MBBI, DL,
+             MCCFIInstruction::createOffset(
+                 nullptr, MCRI->getDwarfRegNum(Reg.VGPR, false),
+                 MFI.getObjectOffset(FI) * ST.getWavefrontSize()));
   }

   if (HasFPSaveIndex && SpillFPToMemory) {
-    assert(!MFI.isDeadObjectIndex(FuncInfo->FramePointerSaveIndex.getValue()));
+    const int FI = *FuncInfo->FramePointerSaveIndex;
+    assert(!MFI.isDeadObjectIndex(FI));
     if (!ScratchExecCopy)
       ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true);

@@ -809,12 +879,16 @@
         .addReg(FramePtrReg);

     buildPrologSpill(LiveRegs, MBB, MBBI, TII, TmpVGPR,
-                     FuncInfo->getScratchRSrcReg(), StackPtrReg,
-                     FuncInfo->FramePointerSaveIndex.getValue());
+                     FuncInfo->getScratchRSrcReg(), StackPtrReg, FI);
+    buildCFI(MBB, MBBI, DL,
+             MCCFIInstruction::createOffset(
+                 nullptr, MCRI->getDwarfRegNum(FramePtrReg, false),
+                 MFI.getObjectOffset(FI) * ST.getWavefrontSize()));
   }

   if (HasBPSaveIndex && SpillBPToMemory) {
-    assert(!MFI.isDeadObjectIndex(*FuncInfo->BasePointerSaveIndex));
+    const int BasePtrFI = *FuncInfo->BasePointerSaveIndex;
+    assert(!MFI.isDeadObjectIndex(BasePtrFI));
     if (!ScratchExecCopy)
       ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true);

@@ -826,8 +900,11 @@
         .addReg(BasePtrReg);

     buildPrologSpill(LiveRegs, MBB, MBBI, TII, TmpVGPR,
-                     FuncInfo->getScratchRSrcReg(), StackPtrReg,
-                     *FuncInfo->BasePointerSaveIndex);
+                     FuncInfo->getScratchRSrcReg(), StackPtrReg, BasePtrFI);
+    buildCFI(MBB, MBBI, DL,
+             MCCFIInstruction::createOffset(
+                 nullptr, MCRI->getDwarfRegNum(BasePtrReg, false),
+                 MFI.getObjectOffset(BasePtrFI) * ST.getWavefrontSize()));
   }

   if (ScratchExecCopy) {
@@ -841,7 +918,7 @@

   // In this case, spill the FP to a reserved VGPR.
   if (HasFPSaveIndex && !SpillFPToMemory) {
-    const int FI = FuncInfo->FramePointerSaveIndex.getValue();
+    const int FI = *FuncInfo->FramePointerSaveIndex;
     assert(!MFI.isDeadObjectIndex(FI));

     assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
@@ -853,9 +930,12 @@
     // FIXME: This should respect spillSGPRToVGPR;
     BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
             Spill[0].VGPR)
-      .addReg(FramePtrReg)
-      .addImm(Spill[0].Lane)
-      .addReg(Spill[0].VGPR, RegState::Undef);
+        .addReg(FramePtrReg)
+        .addImm(Spill[0].Lane)
+        .addReg(Spill[0].VGPR, RegState::Undef);
+
+    buildCFIForSGPRToVGPRSpill(MBB, MBBI, DL, FramePtrReg, Spill[0].VGPR,
+                               Spill[0].Lane);
   }

   // In this case, spill the BP to a reserved VGPR.
@@ -875,6 +955,8 @@
         .addReg(BasePtrReg)
         .addImm(Spill[0].Lane)
         .addReg(Spill[0].VGPR, RegState::Undef);
+    buildCFIForSGPRToVGPRSpill(MBB, MBBI, DL, BasePtrReg, Spill[0].VGPR,
+                               Spill[0].Lane);
   }

   if (TRI.needsStackRealignment(MF)) {
@@ -921,6 +1003,12 @@
       .setMIFlag(MachineInstr::FrameSetup);
   }

+  if (HasFP) {
+    buildCFI(MBB, MBBI, DL,
+             MCCFIInstruction::createDefCfaRegister(
+                 nullptr, MCRI->getDwarfRegNum(FramePtrReg, false)));
+  }
+
   if (HasFP && RoundedSize != 0) {
     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg)
       .addReg(StackPtrReg)
@@ -954,6 +1042,7 @@
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
   MachineRegisterInfo &MRI = MF.getRegInfo();
+  const MCRegisterInfo *MCRI = MF.getMMI().getContext().getRegisterInfo();
   const SIRegisterInfo &TRI = TII->getRegisterInfo();
   MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
   LivePhysRegs LiveRegs;
@@ -1004,7 +1093,7 @@

   Register ScratchExecCopy;
   if (HasFPSaveIndex) {
-    const int FI = FuncInfo->FramePointerSaveIndex.getValue();
+    const int FI = *FuncInfo->FramePointerSaveIndex;
     assert(!MFI.isDeadObjectIndex(FI));
     if (SpillFPToMemory) {
       if (!ScratchExecCopy)
@@ -1029,6 +1118,11 @@
     }
   }

+  if (hasFP(MF))
+    buildCFI(MBB, MBBI, DL,
+             MCCFIInstruction::createDefCfaRegister(
+                 nullptr, MCRI->getDwarfRegNum(StackPtrReg, false)));
+
   if (HasBPSaveIndex) {
     const int BasePtrFI = *FuncInfo->BasePointerSaveIndex;
     assert(!MFI.isDeadObjectIndex(BasePtrFI));
@@ -1314,3 +1408,56 @@
       .addCFIIndex(MF.addFrameInst(CFIInst))
       .setMIFlag(MachineInstr::FrameSetup);
 }
+
+static void encodeDwarfRegisterLocation(int DwarfReg, raw_ostream &OS) {
+  if (DwarfReg < 32) {
+    OS << uint8_t(dwarf::DW_OP_reg0 + DwarfReg);
+  } else {
+    OS << uint8_t(dwarf::DW_OP_regx);
+    encodeULEB128(DwarfReg, OS);
+  }
+}
+
+void SIFrameLowering::buildCFIForSGPRToVGPRSpill(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+    const DebugLoc &DL, const Register SGPR, const Register VGPR,
+    const int Lane) const {
+  MachineFunction &MF = *MBB.getParent();
+  const MCRegisterInfo &MCRI = *MF.getMMI().getContext().getRegisterInfo();
+  int DwarfSGPR = MCRI.getDwarfRegNum(SGPR, false);
+  int DwarfVGPR = MCRI.getDwarfRegNum(VGPR, false);
+
+  // CFI for an SGPR spilled to a single lane of a VGPR is implemented as an
+  // expression(E) rule where E is a register location description referencing
+  // a VGPR register location storage at a byte offset of the lane index
+  // multiplied by the size of an SGPR (4 bytes). In other words we generate
+  // the following DWARF:
+  //
+  // DW_CFA_expression: <SGPR>,
+  //    (DW_OP_regx <VGPR>) (DW_OP_LLVM_offset_uconst <Lane>*4)
+  //
+  // The memory location description for the current CFA is pushed on the
+  // stack before E is evaluated, but we choose not to drop it as it would
+  // require a longer expression E and DWARF defines the result of the
+  // evaluation to be the location description on the top of the stack (i.e.
+  // the implicitly pushed one is just ignored.)
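+  //
+  // For example (this is the encoding checked in debug-frame.ll below), for
+  // SGPR33 (DWARF 65) spilled to lane 2 of VGPR40 (DWARF 2600 in wave64),
+  // the escape bytes are:
+  //   0x10, 0x41, 0x05, 0x90, 0xa8, 0x14, 0xe4, 0x08
+  //   i.e. DW_CFA_expression, SGPR33, block length 5,
+  //   DW_OP_regx, ULEB128(2600), DW_OP_LLVM_offset_uconst, ULEB128(2*4)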
+  SmallString<20> CFIInst;
+  raw_svector_ostream OSCFIInst(CFIInst);
+  SmallString<20> Block;
+  raw_svector_ostream OSBlock(Block);
+
+  OSCFIInst << uint8_t(dwarf::DW_CFA_expression);
+  encodeULEB128(DwarfSGPR, OSCFIInst);
+
+  encodeDwarfRegisterLocation(DwarfVGPR, OSBlock);
+  OSBlock << uint8_t(dwarf::DW_OP_LLVM_offset_uconst);
+  // FIXME:
+  const unsigned SGPRByteSize = 4;
+  encodeULEB128(Lane * SGPRByteSize, OSBlock);
+
+  encodeULEB128(Block.size(), OSCFIInst);
+  OSCFIInst << Block;
+
+  buildCFI(MBB, MBBI, DL,
+           MCCFIInstruction::createEscape(nullptr, OSCFIInst.str()));
+}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll
@@ -258,8 +258,8 @@
 ; GFX9-LABEL: func_dynamic_stackalloc_sgpr_align32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_add_u32 s4, s32, 0x7c0
 ; GFX9-NEXT:    s_mov_b32 s6, s33
+; GFX9-NEXT:    s_add_u32 s4, s32, 0x7c0
 ; GFX9-NEXT:    s_and_b32 s33, s4, 0xfffff800
 ; GFX9-NEXT:    s_add_u32 s32, s32, 0x1000
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
@@ -286,14 +286,16 @@
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    s_add_u32 s4, s32, 0x3e0
 ; GFX10-NEXT:    s_mov_b32 s6, s33
+; GFX10-NEXT:    s_add_u32 s4, s32, 0x3e0
 ; GFX10-NEXT:    s_and_b32 s33, s4, 0xfffffc00
 ; GFX10-NEXT:    s_add_u32 s32, s32, 0x800
 ; GFX10-NEXT:    s_getpc_b64 s[4:5]
 ; GFX10-NEXT:    s_add_u32 s4, s4, gv@gotpcrel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s5, s5, gv@gotpcrel32@hi+4
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-NEXT:    ; implicit-def: $vcc_hi
+; GFX10-NEXT:    s_mov_b32 s33, s6
 ; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    s_mov_b32 s33, s6
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll
@@ -166,8 +166,8 @@
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s8, s33
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GCN-NEXT:    s_add_u32 s32, s32, 0x400
 ; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN-NEXT:    s_cbranch_execz BB2_3
@@ -231,8 +231,8 @@
 ; GCN-LABEL: func_non_entry_block_static_alloca_align64:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_add_u32 s4, s32, 0xfc0
 ; GCN-NEXT:    s_mov_b32 s8, s33
+; GCN-NEXT:    s_add_u32 s4, s32, 0xfc0
 ; GCN-NEXT:    s_and_b32 s33, s4, 0xfffff000
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GCN-NEXT:    s_add_u32 s32, s32, 0x2000
diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
--- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
@@ -197,7 +197,7 @@
 ; GCN-NEXT: ;;#ASMEND
 ; GCN-NEXT: v_readlane_b32 s42, v0, 0
 ; GCN-NEXT: s_setpc_b64
-define void @spill_only_csr_sgpr() {
+define void @spill_only_csr_sgpr() #0 {
   call void asm sideeffect "; clobber s42", "~{s42}"()
   ret void
 }
@@ -296,8 +296,8 @@
 ; GCN-LABEL: {{^}}realign_stack_no_fp_elim:
 ; GCN: s_waitcnt
-; GCN-NEXT: s_add_u32 [[SCRATCH:s[0-9]+]], s32, 0x7ffc0
 ; GCN-NEXT: s_mov_b32 s4, s33
+; GCN-NEXT: s_add_u32 [[SCRATCH:s[0-9]+]], s32, 0x7ffc0
 ; GCN-NEXT: s_and_b32 s33, [[SCRATCH]], 0xfff80000
 ; GCN-NEXT: s_add_u32 s32, s32, 0x100000
 ; GCN-NEXT: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
@@ -315,14 +315,14 @@
 ; GCN-LABEL: {{^}}no_unused_non_csr_sgpr_for_fp:
 ; GCN: s_waitcnt
 ; GCN-NEXT: v_writelane_b32 v1, s33, 2
-; GCN-NEXT: v_writelane_b32 v1, s30, 0
 ; GCN-NEXT: s_mov_b32 s33, s32
+; GCN-NEXT: v_writelane_b32 v1, s30, 0
 ; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
 ; GCN: v_writelane_b32 v1, s31, 1
 ; GCN: buffer_store_dword [[ZERO]], off, s[0:3], s33 offset:4
 ; GCN: ;;#ASMSTART
-; GCN: v_readlane_b32 s4, v1, 0
-; GCN-NEXT: s_add_u32 s32, s32, 0x200
+; GCN: s_add_u32 s32, s32, 0x200
+; GCN-NEXT: v_readlane_b32 s4, v1, 0
 ; GCN-NEXT: v_readlane_b32 s5, v1, 1
 ; GCN-NEXT: s_sub_u32 s32, s32, 0x200
 ; GCN-NEXT: v_readlane_b32 s33, v1, 2
@@ -349,8 +349,8 @@
 ; GCN-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
 ; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s33, 2
-; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s30, 0
 ; GCN-NEXT: s_mov_b32 s33, s32
+; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s30, 0

 ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, 1
 ; GCN-DAG: buffer_store_dword
@@ -396,8 +396,8 @@
 ; GCN-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], [[SCRATCH_VGPR]], s[0:3], s32 offen ; 4-byte Folded Spill
 ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
 ; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s33, 2
-; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s30, 0
 ; GCN-NEXT: s_mov_b32 s33, s32
+; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s30, 0
 ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, 1
 ; GCN-DAG: s_add_u32 s32, s32, 0x40300{{$}}
 ; GCN-DAG: buffer_store_dword
diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
--- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
@@ -31,8 +31,8 @@
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    v_writelane_b32 v40, s33, 2
-; GCN-NEXT:    v_writelane_b32 v40, s30, 0
 ; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    v_writelane_b32 v40, s30, 0
 ; GCN-NEXT:    s_add_u32 s32, s32, 0x400
 ; GCN-NEXT:    s_getpc_b64 s[4:5]
 ; GCN-NEXT:    s_add_u32 s4, s4, func_v2f32@rel32@lo+4
@@ -65,8 +65,8 @@
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    v_writelane_b32 v40, s33, 2
-; GCN-NEXT:    v_writelane_b32 v40, s30, 0
 ; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    v_writelane_b32 v40, s30, 0
 ; GCN-NEXT:    s_add_u32 s32, s32, 0x400
 ; GCN-NEXT:    s_getpc_b64 s[4:5]
 ; GCN-NEXT:    s_add_u32 s4, s4, func_v3f32@rel32@lo+4
@@ -99,8 +99,8 @@
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    v_writelane_b32 v40, s33, 2
-; GCN-NEXT:    v_writelane_b32 v40, s30, 0
 ; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    v_writelane_b32 v40, s30, 0
 ; GCN-NEXT:    s_add_u32 s32, s32, 0x400
 ; GCN-NEXT:    s_getpc_b64 s[4:5]
 ; GCN-NEXT:    s_add_u32 s4, s4, func_v4f16@rel32@lo+4
@@ -133,8 +133,8 @@
 ; GCN-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
 ; GCN-NEXT:    s_mov_b64 exec, s[4:5]
 ; GCN-NEXT:    v_writelane_b32 v40, s33, 2
-; GCN-NEXT:    v_writelane_b32 v40, s30, 0
 ; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    v_writelane_b32 v40, s30, 0
 ; GCN-NEXT:    s_add_u32 s32, s32, 0x400
 ; GCN-NEXT:    s_getpc_b64 s[4:5]
 ; GCN-NEXT:    s_add_u32 s4, s4, func_struct@rel32@lo+4
diff --git a/llvm/test/CodeGen/AMDGPU/debug-frame.ll b/llvm/test/CodeGen/AMDGPU/debug-frame.ll
--- a/llvm/test/CodeGen/AMDGPU/debug-frame.ll
+++ b/llvm/test/CodeGen/AMDGPU/debug-frame.ll
@@ -1,4 +1,5 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=asm -o - %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=asm -o - %s | FileCheck --check-prefixes=CHECK,WAVE64 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -filetype=asm -o - %s | FileCheck --check-prefixes=CHECK,WAVE32 %s

 ; CHECK-LABEL: kern1:
 ; CHECK: .cfi_startproc
@@ -23,7 +24,506 @@
   ret void
 }

+; CHECK-LABEL: func_no_clobber:
+; CHECK: .cfi_startproc
+
+; CHECK-NOT: .cfi_{{.*}}
+
+; CHECK: %bb.0:
+; SGPR32 = 64
+; CHECK-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6
+; DW_CFA_expression [0x10]
+; PC_64 ULEB128(16)=[0x10]
+; BLOCK_LENGTH ULEB128(8)=[0x08]
+; DW_OP_regx [0x90]
+; SGPR30 ULEB128(62)=[0x3e]
+; DW_OP_piece [0x93]
+; PIECE_SIZE [0x04]
+; DW_OP_regx [0x90]
+; SGPR31 ULEB128(63)=[0x3f]
+; DW_OP_piece [0x93]
+; PIECE_SIZE [0x04]
+; CHECK-NEXT: .cfi_escape 0x10, 0x10, 0x08, 0x90, 0x3e, 0x93, 0x04, 0x90, 0x3f, 0x93, 0x04
+
+; CHECK-NOT: .cfi_{{.*}}
+
+; CHECK: .cfi_endproc
+define hidden void @func_no_clobber() #0 {
+entry:
+  ret void
+}
+
+; CHECK-LABEL: {{^}}callee_need_to_spill_fp_to_memory:
+; CHECK: .cfi_startproc
+
+; SGPR33 = 65
+; CHECK: v_mov_b32_e32 [[TMP_VGPR1:v[0-9]+]], s33
+; CHECK: buffer_store_dword [[TMP_VGPR1]], off, s[0:3], s32 offset:456 ; 4-byte Folded Spill
+; WAVE64: .cfi_offset 65, 29184
+; WAVE32: .cfi_offset 65, 14592
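+; (The second .cfi_offset operand is the frame-index byte offset scaled by
+; the wavefront size, matching MFI.getObjectOffset(FI) * ST.getWavefrontSize()
+; in SIFrameLowering.cpp: 456 * 64 = 29184 for wave64 and 456 * 32 = 14592
+; for wave32.)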
+
+; CHECK: .cfi_endproc
+define void @callee_need_to_spill_fp_to_memory() #1 {
+  call void asm sideeffect "; clobber nonpreserved SGPRs",
+    "~{s4},~{s5},~{s6},~{s7},~{s8},~{s9}
+    ,~{s10},~{s11},~{s12},~{s13},~{s14},~{s15},~{s16},~{s17},~{s18},~{s19}
+    ,~{s20},~{s21},~{s22},~{s23},~{s24},~{s25},~{s26},~{s27},~{s28},~{s29}
+    ,~{vcc}"()
+
+  call void asm sideeffect "; clobber all VGPRs",
+    "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9}
+    ,~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19}
+    ,~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29}
+    ,~{v30},~{v31},~{v32},~{v33},~{v34},~{v35},~{v36},~{v37},~{v38},~{v39}
+    ,~{v40},~{v41},~{v42},~{v43},~{v44},~{v45},~{v46},~{v47},~{v48},~{v49}
+    ,~{v50},~{v51},~{v52},~{v53},~{v54},~{v55},~{v56},~{v57},~{v58},~{v59}
+    ,~{v60},~{v61},~{v62},~{v63},~{v64},~{v65},~{v66},~{v67},~{v68},~{v69}
+    ,~{v70},~{v71},~{v72},~{v73},~{v74},~{v75},~{v76},~{v77},~{v78},~{v79}
+    ,~{v80},~{v81},~{v82},~{v83},~{v84},~{v85},~{v86},~{v87},~{v88},~{v89}
+    ,~{v90},~{v91},~{v92},~{v93},~{v94},~{v95},~{v96},~{v97},~{v98},~{v99}
+    ,~{v100},~{v101},~{v102},~{v103},~{v104},~{v105},~{v106},~{v107},~{v108},~{v109}
+    ,~{v110},~{v111},~{v112},~{v113},~{v114},~{v115},~{v116},~{v117},~{v118},~{v119}
+    ,~{v120},~{v121},~{v122},~{v123},~{v124},~{v125},~{v126},~{v127},~{v128},~{v129}
+    ,~{v130},~{v131},~{v132},~{v133},~{v134},~{v135},~{v136},~{v137},~{v138},~{v139}
+    ,~{v140},~{v141},~{v142},~{v143},~{v144},~{v145},~{v146},~{v147},~{v148},~{v149}
+    ,~{v150},~{v151},~{v152},~{v153},~{v154},~{v155},~{v156},~{v157},~{v158},~{v159}
+    ,~{v160},~{v161},~{v162},~{v163},~{v164},~{v165},~{v166},~{v167},~{v168},~{v169}
+    ,~{v170},~{v171},~{v172},~{v173},~{v174},~{v175},~{v176},~{v177},~{v178},~{v179}
+    ,~{v180},~{v181},~{v182},~{v183},~{v184},~{v185},~{v186},~{v187},~{v188},~{v189}
+    ,~{v190},~{v191},~{v192},~{v193},~{v194},~{v195},~{v196},~{v197},~{v198},~{v199}
+    ,~{v200},~{v201},~{v202},~{v203},~{v204},~{v205},~{v206},~{v207},~{v208},~{v209}
+    ,~{v210},~{v211},~{v212},~{v213},~{v214},~{v215},~{v216},~{v217},~{v218},~{v219}
+    ,~{v220},~{v221},~{v222},~{v223},~{v224},~{v225},~{v226},~{v227},~{v228},~{v229}
+    ,~{v230},~{v231},~{v232},~{v233},~{v234},~{v235},~{v236},~{v237},~{v238},~{v239}
+    ,~{v240},~{v241},~{v242},~{v243},~{v244},~{v245},~{v246},~{v247},~{v248},~{v249}
+    ,~{v250},~{v251},~{v252},~{v253},~{v254},~{v255}"()
+  ret void
+}
+
+declare hidden void @ex() #0
+
+; CHECK-LABEL: func_call_clobber:
+; CHECK: .cfi_startproc
+
+; CHECK-NOT: .cfi_{{.*}}
+
+; CHECK: %bb.0:
+; CHECK-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6
+; CHECK-NEXT: .cfi_escape 0x10, 0x10, 0x08, 0x90, 0x3e, 0x93, 0x04, 0x90, 0x3f, 0x93, 0x04
+
+; VGPR0_wave64 = 2560
+; WAVE64-NEXT: .cfi_undefined 2560
+; WAVE64-NEXT: .cfi_undefined 2561
+; WAVE64-NEXT: .cfi_undefined 2562
+; WAVE64-NEXT: .cfi_undefined 2563
+; WAVE64-NEXT: .cfi_undefined 2564
+; WAVE64-NEXT: .cfi_undefined 2565
+; WAVE64-NEXT: .cfi_undefined 2566
+; WAVE64-NEXT: .cfi_undefined 2567
+; WAVE64-NEXT: .cfi_undefined 2568
+; WAVE64-NEXT: .cfi_undefined 2569
+; WAVE64-NEXT: .cfi_undefined 2570
+; WAVE64-NEXT: .cfi_undefined 2571
+; WAVE64-NEXT: .cfi_undefined 2572
+; WAVE64-NEXT: .cfi_undefined 2573
+; WAVE64-NEXT: .cfi_undefined 2574
+; WAVE64-NEXT: .cfi_undefined 2575
+; WAVE64-NEXT: .cfi_undefined 2576
+; WAVE64-NEXT: .cfi_undefined 2577
+; WAVE64-NEXT: .cfi_undefined 2578
+; WAVE64-NEXT: .cfi_undefined 2579
+; WAVE64-NEXT: .cfi_undefined 2580
+; WAVE64-NEXT: .cfi_undefined 2581
+; WAVE64-NEXT: .cfi_undefined 2582
+; WAVE64-NEXT: .cfi_undefined 2583
+; WAVE64-NEXT: .cfi_undefined 2584
+; WAVE64-NEXT: .cfi_undefined 2585
+; WAVE64-NEXT: .cfi_undefined 2586
+; WAVE64-NEXT: .cfi_undefined 2587
+; WAVE64-NEXT: .cfi_undefined 2588
+; WAVE64-NEXT: .cfi_undefined 2589
+; WAVE64-NEXT: .cfi_undefined 2590
+; WAVE64-NEXT: .cfi_undefined 2591
+; WAVE64-NEXT: .cfi_undefined 2592
+; WAVE64-NEXT: .cfi_undefined 2593
+; WAVE64-NEXT: .cfi_undefined 2594
+; WAVE64-NEXT: .cfi_undefined 2595
+; WAVE64-NEXT: .cfi_undefined 2596
+; WAVE64-NEXT: .cfi_undefined 2597
+; WAVE64-NEXT: .cfi_undefined 2598
+; WAVE64-NEXT: .cfi_undefined 2599
+
+; VGPR48_wave64 = 2608
+; WAVE64-NEXT: .cfi_undefined 2608
+; WAVE64-NEXT: .cfi_undefined 2609
+; WAVE64-NEXT: .cfi_undefined 2610
+; WAVE64-NEXT: .cfi_undefined 2611
+; WAVE64-NEXT: .cfi_undefined 2612
+; WAVE64-NEXT: .cfi_undefined 2613
+; WAVE64-NEXT: .cfi_undefined 2614
+; WAVE64-NEXT: .cfi_undefined 2615
+
+; WAVE64-NEXT: .cfi_undefined 2624
+; WAVE64-NEXT: .cfi_undefined 2625
+; WAVE64-NEXT: .cfi_undefined 2626
+; WAVE64-NEXT: .cfi_undefined 2627
+; WAVE64-NEXT: .cfi_undefined 2628
+; WAVE64-NEXT: .cfi_undefined 2629
+; WAVE64-NEXT: .cfi_undefined 2630
+; WAVE64-NEXT: .cfi_undefined 2631
+
+; WAVE64-NEXT: .cfi_undefined 2640
+; WAVE64-NEXT: .cfi_undefined 2641
+; WAVE64-NEXT: .cfi_undefined 2642
+; WAVE64-NEXT: .cfi_undefined 2643
+; WAVE64-NEXT: .cfi_undefined 2644
+; WAVE64-NEXT: .cfi_undefined 2645
+; WAVE64-NEXT: .cfi_undefined 2646
+; WAVE64-NEXT: .cfi_undefined 2647
+
+; WAVE64-NEXT: .cfi_undefined 2656
+; WAVE64-NEXT: .cfi_undefined 2657
+; WAVE64-NEXT: .cfi_undefined 2658
+; WAVE64-NEXT: .cfi_undefined 2659
+; WAVE64-NEXT: .cfi_undefined 2660
+; WAVE64-NEXT: .cfi_undefined 2661
+; WAVE64-NEXT: .cfi_undefined 2662
+; WAVE64-NEXT: .cfi_undefined 2663
+
+; WAVE64-NEXT: .cfi_undefined 2672
+; WAVE64-NEXT: .cfi_undefined 2673
+; WAVE64-NEXT: .cfi_undefined 2674
+; WAVE64-NEXT: .cfi_undefined 2675
+; WAVE64-NEXT: .cfi_undefined 2676
+; WAVE64-NEXT: .cfi_undefined 2677
+; WAVE64-NEXT: .cfi_undefined 2678
+; WAVE64-NEXT: .cfi_undefined 2679
+
+; WAVE64-NEXT: .cfi_undefined 2688
+; WAVE64-NEXT: .cfi_undefined 2689
+; WAVE64-NEXT: .cfi_undefined 2690
+; WAVE64-NEXT: .cfi_undefined 2691
+; WAVE64-NEXT: .cfi_undefined 2692
+; WAVE64-NEXT: .cfi_undefined 2693
+; WAVE64-NEXT: .cfi_undefined 2694
+; WAVE64-NEXT: .cfi_undefined 2695
+
+; WAVE64-NEXT: .cfi_undefined 2704
+; WAVE64-NEXT: .cfi_undefined 2705
+; WAVE64-NEXT: .cfi_undefined 2706
+; WAVE64-NEXT: .cfi_undefined 2707
+; WAVE64-NEXT: .cfi_undefined 2708
+; WAVE64-NEXT: .cfi_undefined 2709
+; WAVE64-NEXT: .cfi_undefined 2710
+; WAVE64-NEXT: .cfi_undefined 2711
+
+; WAVE64-NEXT: .cfi_undefined 2720
+; WAVE64-NEXT: .cfi_undefined 2721
+; WAVE64-NEXT: .cfi_undefined 2722
+; WAVE64-NEXT: .cfi_undefined 2723
+; WAVE64-NEXT: .cfi_undefined 2724
+; WAVE64-NEXT: .cfi_undefined 2725
+; WAVE64-NEXT: .cfi_undefined 2726
+; WAVE64-NEXT: .cfi_undefined 2727
+
+; WAVE64-NEXT: .cfi_undefined 2736
+; WAVE64-NEXT: .cfi_undefined 2737
+; WAVE64-NEXT: .cfi_undefined 2738
+; WAVE64-NEXT: .cfi_undefined 2739
+; WAVE64-NEXT: .cfi_undefined 2740
+; WAVE64-NEXT: .cfi_undefined 2741
+; WAVE64-NEXT: .cfi_undefined 2742
+; WAVE64-NEXT: .cfi_undefined 2743
+
+; WAVE64-NEXT: .cfi_undefined 2752
+; WAVE64-NEXT: .cfi_undefined 2753
+; WAVE64-NEXT: .cfi_undefined 2754
+; WAVE64-NEXT: .cfi_undefined 2755
+; WAVE64-NEXT: .cfi_undefined 2756
+; WAVE64-NEXT: .cfi_undefined 2757
+; WAVE64-NEXT: .cfi_undefined 2758
+; WAVE64-NEXT: .cfi_undefined 2759
+
+; WAVE64-NEXT: .cfi_undefined 2768
+; WAVE64-NEXT: .cfi_undefined 2769
+; WAVE64-NEXT: .cfi_undefined 2770
+; WAVE64-NEXT: .cfi_undefined 2771
+; WAVE64-NEXT: .cfi_undefined 2772
+; WAVE64-NEXT: .cfi_undefined 2773
+; WAVE64-NEXT: .cfi_undefined 2774
+; WAVE64-NEXT: .cfi_undefined 2775
+
+; WAVE64-NEXT: .cfi_undefined 2784
+; WAVE64-NEXT: .cfi_undefined 2785
+; WAVE64-NEXT: .cfi_undefined 2786
+; WAVE64-NEXT: .cfi_undefined 2787
+; WAVE64-NEXT: .cfi_undefined 2788
+; WAVE64-NEXT: .cfi_undefined 2789
+; WAVE64-NEXT: .cfi_undefined 2790
+; WAVE64-NEXT: .cfi_undefined 2791
+
+; WAVE64-NEXT: .cfi_undefined 2800
+; WAVE64-NEXT: .cfi_undefined 2801
+; WAVE64-NEXT: .cfi_undefined 2802
+; WAVE64-NEXT: .cfi_undefined 2803
+; WAVE64-NEXT: .cfi_undefined 2804
+; WAVE64-NEXT: .cfi_undefined 2805
+; WAVE64-NEXT: .cfi_undefined 2806
+; WAVE64-NEXT: .cfi_undefined 2807
+
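+; (The AMDGPU DWARF numbering used in these checks: SGPRn is 32+n, wave64
+; VGPRn is 2560+n, and wave32 VGPRn is 1536+n.)
+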
+; VGPR0_wave32 = 1536
+; WAVE32-NEXT: .cfi_undefined 1536
+; WAVE32-NEXT: .cfi_undefined 1537
+; WAVE32-NEXT: .cfi_undefined 1538
+; WAVE32-NEXT: .cfi_undefined 1539
+; WAVE32-NEXT: .cfi_undefined 1540
+; WAVE32-NEXT: .cfi_undefined 1541
+; WAVE32-NEXT: .cfi_undefined 1542
+; WAVE32-NEXT: .cfi_undefined 1543
+; WAVE32-NEXT: .cfi_undefined 1544
+; WAVE32-NEXT: .cfi_undefined 1545
+; WAVE32-NEXT: .cfi_undefined 1546
+; WAVE32-NEXT: .cfi_undefined 1547
+; WAVE32-NEXT: .cfi_undefined 1548
+; WAVE32-NEXT: .cfi_undefined 1549
+; WAVE32-NEXT: .cfi_undefined 1550
+; WAVE32-NEXT: .cfi_undefined 1551
+; WAVE32-NEXT: .cfi_undefined 1552
+; WAVE32-NEXT: .cfi_undefined 1553
+; WAVE32-NEXT: .cfi_undefined 1554
+; WAVE32-NEXT: .cfi_undefined 1555
+; WAVE32-NEXT: .cfi_undefined 1556
+; WAVE32-NEXT: .cfi_undefined 1557
+; WAVE32-NEXT: .cfi_undefined 1558
+; WAVE32-NEXT: .cfi_undefined 1559
+; WAVE32-NEXT: .cfi_undefined 1560
+; WAVE32-NEXT: .cfi_undefined 1561
+; WAVE32-NEXT: .cfi_undefined 1562
+; WAVE32-NEXT: .cfi_undefined 1563
+; WAVE32-NEXT: .cfi_undefined 1564
+; WAVE32-NEXT: .cfi_undefined 1565
+; WAVE32-NEXT: .cfi_undefined 1566
+; WAVE32-NEXT: .cfi_undefined 1567
+; WAVE32-NEXT: .cfi_undefined 1568
+; WAVE32-NEXT: .cfi_undefined 1569
+; WAVE32-NEXT: .cfi_undefined 1570
+; WAVE32-NEXT: .cfi_undefined 1571
+; WAVE32-NEXT: .cfi_undefined 1572
+; WAVE32-NEXT: .cfi_undefined 1573
+; WAVE32-NEXT: .cfi_undefined 1574
+; WAVE32-NEXT: .cfi_undefined 1575
+
+; VGPR48_wave32 = 1584
+; WAVE32-NEXT: .cfi_undefined 1584
+; WAVE32-NEXT: .cfi_undefined 1585
+; WAVE32-NEXT: .cfi_undefined 1586
+; WAVE32-NEXT: .cfi_undefined 1587
+; WAVE32-NEXT: .cfi_undefined 1588
+; WAVE32-NEXT: .cfi_undefined 1589
+; WAVE32-NEXT: .cfi_undefined 1590
+; WAVE32-NEXT: .cfi_undefined 1591
+
+; WAVE32-NEXT: .cfi_undefined 1600
+; WAVE32-NEXT: .cfi_undefined 1601
+; WAVE32-NEXT: .cfi_undefined 1602
+; WAVE32-NEXT: .cfi_undefined 1603
+; WAVE32-NEXT: .cfi_undefined 1604
+; WAVE32-NEXT: .cfi_undefined 1605
+; WAVE32-NEXT: .cfi_undefined 1606
+; WAVE32-NEXT: .cfi_undefined 1607
+
+; WAVE32-NEXT: .cfi_undefined 1616
+; WAVE32-NEXT: .cfi_undefined 1617
+; WAVE32-NEXT: .cfi_undefined 1618
+; WAVE32-NEXT: .cfi_undefined 1619
+; WAVE32-NEXT: .cfi_undefined 1620
+; WAVE32-NEXT: .cfi_undefined 1621
+; WAVE32-NEXT: .cfi_undefined 1622
+; WAVE32-NEXT: .cfi_undefined 1623
+
+; WAVE32-NEXT: .cfi_undefined 1632
+; WAVE32-NEXT: .cfi_undefined 1633
+; WAVE32-NEXT: .cfi_undefined 1634
+; WAVE32-NEXT: .cfi_undefined 1635
+; WAVE32-NEXT: .cfi_undefined 1636
+; WAVE32-NEXT: .cfi_undefined 1637
+; WAVE32-NEXT: .cfi_undefined 1638
+; WAVE32-NEXT: .cfi_undefined 1639
+
+; WAVE32-NEXT: .cfi_undefined 1648
+; WAVE32-NEXT: .cfi_undefined 1649
+; WAVE32-NEXT: .cfi_undefined 1650
+; WAVE32-NEXT: .cfi_undefined 1651
+; WAVE32-NEXT: .cfi_undefined 1652
+; WAVE32-NEXT: .cfi_undefined 1653
+; WAVE32-NEXT: .cfi_undefined 1654
+; WAVE32-NEXT: .cfi_undefined 1655
+
+; WAVE32-NEXT: .cfi_undefined 1664
+; WAVE32-NEXT: .cfi_undefined 1665
+; WAVE32-NEXT: .cfi_undefined 1666
+; WAVE32-NEXT: .cfi_undefined 1667
+; WAVE32-NEXT: .cfi_undefined 1668
+; WAVE32-NEXT: .cfi_undefined 1669
+; WAVE32-NEXT: .cfi_undefined 1670
+; WAVE32-NEXT: .cfi_undefined 1671
+
+; WAVE32-NEXT: .cfi_undefined 1680
+; WAVE32-NEXT: .cfi_undefined 1681
+; WAVE32-NEXT: .cfi_undefined 1682
+; WAVE32-NEXT: .cfi_undefined 1683
+; WAVE32-NEXT: .cfi_undefined 1684
+; WAVE32-NEXT: .cfi_undefined 1685
+; WAVE32-NEXT: .cfi_undefined 1686
+; WAVE32-NEXT: .cfi_undefined 1687
+
+; WAVE32-NEXT: .cfi_undefined 1696
+; WAVE32-NEXT: .cfi_undefined 1697
+; WAVE32-NEXT: .cfi_undefined 1698
+; WAVE32-NEXT: .cfi_undefined 1699
+; WAVE32-NEXT: .cfi_undefined 1700
+; WAVE32-NEXT: .cfi_undefined 1701
+; WAVE32-NEXT: .cfi_undefined 1702
+; WAVE32-NEXT: .cfi_undefined 1703
+
+; WAVE32-NEXT: .cfi_undefined 1712
+; WAVE32-NEXT: .cfi_undefined 1713
+; WAVE32-NEXT: .cfi_undefined 1714
+; WAVE32-NEXT: .cfi_undefined 1715
+; WAVE32-NEXT: .cfi_undefined 1716
+; WAVE32-NEXT: .cfi_undefined 1717
+; WAVE32-NEXT: .cfi_undefined 1718
+; WAVE32-NEXT: .cfi_undefined 1719
+
+; WAVE32-NEXT: .cfi_undefined 1728
+; WAVE32-NEXT: .cfi_undefined 1729
+; WAVE32-NEXT: .cfi_undefined 1730
+; WAVE32-NEXT: .cfi_undefined 1731
+; WAVE32-NEXT: .cfi_undefined 1732
+; WAVE32-NEXT: .cfi_undefined 1733
+; WAVE32-NEXT: .cfi_undefined 1734
+; WAVE32-NEXT: .cfi_undefined 1735
+
+; WAVE32-NEXT: .cfi_undefined 1744
+; WAVE32-NEXT: .cfi_undefined 1745
+; WAVE32-NEXT: .cfi_undefined 1746
+; WAVE32-NEXT: .cfi_undefined 1747
+; WAVE32-NEXT: .cfi_undefined 1748
+; WAVE32-NEXT: .cfi_undefined 1749
+; WAVE32-NEXT: .cfi_undefined 1750
+; WAVE32-NEXT: .cfi_undefined 1751
+
+; WAVE32-NEXT: .cfi_undefined 1760
+; WAVE32-NEXT: .cfi_undefined 1761
+; WAVE32-NEXT: .cfi_undefined 1762
+; WAVE32-NEXT: .cfi_undefined 1763
+; WAVE32-NEXT: .cfi_undefined 1764
+; WAVE32-NEXT: .cfi_undefined 1765
+; WAVE32-NEXT: .cfi_undefined 1766
+; WAVE32-NEXT: .cfi_undefined 1767
+
+; WAVE32-NEXT: .cfi_undefined 1776
+; WAVE32-NEXT: .cfi_undefined 1777
+; WAVE32-NEXT: .cfi_undefined 1778
+; WAVE32-NEXT: .cfi_undefined 1779
+; WAVE32-NEXT: .cfi_undefined 1780
+; WAVE32-NEXT: .cfi_undefined 1781
+; WAVE32-NEXT: .cfi_undefined 1782
+; WAVE32-NEXT: .cfi_undefined 1783
+
+
+; SGPR0 = 32
+; CHECK-NEXT: .cfi_undefined 32
+; CHECK-NEXT: .cfi_undefined 33
+; CHECK-NEXT: .cfi_undefined 34
+; CHECK-NEXT: .cfi_undefined 35
+; CHECK-NEXT: .cfi_undefined 36
+; CHECK-NEXT: .cfi_undefined 37
+; CHECK-NEXT: .cfi_undefined 38
+; CHECK-NEXT: .cfi_undefined 39
+; CHECK-NEXT: .cfi_undefined 40
+; CHECK-NEXT: .cfi_undefined 41
+; CHECK-NEXT: .cfi_undefined 42
+; CHECK-NEXT: .cfi_undefined 43
+; CHECK-NEXT: .cfi_undefined 44
+; CHECK-NEXT: .cfi_undefined 45
+; CHECK-NEXT: .cfi_undefined 46
+; CHECK-NEXT: .cfi_undefined 47
+; CHECK-NEXT: .cfi_undefined 48
+; CHECK-NEXT: .cfi_undefined 49
+; CHECK-NEXT: .cfi_undefined 50
+; CHECK-NEXT: .cfi_undefined 51
+; CHECK-NEXT: .cfi_undefined 52
+; CHECK-NEXT: .cfi_undefined 53
+; CHECK-NEXT: .cfi_undefined 54
+; CHECK-NEXT: .cfi_undefined 55
+; CHECK-NEXT: .cfi_undefined 56
+; CHECK-NEXT: .cfi_undefined 57
+; CHECK-NEXT: .cfi_undefined 58
+; CHECK-NEXT: .cfi_undefined 59
+; CHECK-NEXT: .cfi_undefined 60
+; CHECK-NEXT: .cfi_undefined 61
+; CHECK-NEXT: .cfi_undefined 62
+; CHECK-NEXT: .cfi_undefined 63
+
+; CHECK-NOT: .cfi_{{.*}}
+
+; WAVE64: s_or_saveexec_b64 s[4:5], -1
+; WAVE32: s_or_saveexec_b32 s4, -1
+; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; VGPR40_wave64 = 2600
+; WAVE64-NEXT: .cfi_offset 2600, 0
+; VGPR40_wave32 = 1576
+; WAVE32-NEXT: .cfi_offset 1576, 0
+; CHECK-NOT: .cfi_{{.*}}
+; WAVE64: s_mov_b64 exec, s[4:5]
+; WAVE32: s_mov_b32 exec_lo, s4
+
+; CHECK-NOT: .cfi_{{.*}}
+
+; CHECK: v_writelane_b32 v40, s33, 2
+
+; DW_CFA_expression [0x10] SGPR33 ULEB128(65)=[0x41]
+; BLOCK_LENGTH ULEB128(5)=[0x05]
+; DW_OP_regx [0x90]
+; VGPR40_wave64 ULEB128(2600)=[0xa8, 0x14]
+; DW_OP_LLVM_offset_uconst [0xe4]
+; OFFSET ULEB128(0x08) [0x08]
+; WAVE64-NEXT: .cfi_escape 0x10, 0x41, 0x05, 0x90, 0xa8, 0x14, 0xe4, 0x08
+
+; DW_CFA_expression [0x10] SGPR33 ULEB128(65)=[0x41]
+; BLOCK_LENGTH ULEB128(5)=[0x05]
+; DW_OP_regx [0x90]
+; VGPR40_wave32 ULEB128(1576)=[0xa8, 0x0c]
+; DW_OP_LLVM_offset_uconst [0xe4]
+; OFFSET ULEB128(0x08) [0x08]
+; WAVE32-NEXT: .cfi_escape 0x10, 0x41, 0x05, 0x90, 0xa8, 0x0c, 0xe4, 0x08
+
+; CHECK-NOT: .cfi_{{.*}}
+
+; CHECK: s_mov_b32 s33, s32
+; SGPR33 = 65
+; CHECK-NEXT: .cfi_def_cfa_register 65
+
+; CHECK-NOT: .cfi_{{.*}}
+
+; CHECK: s_sub_u32 s32, s32,
+; CHECK-NEXT: v_readlane_b32 s33, v40, 2
+; SGPR32 = 64
+; CHECK-NEXT: .cfi_def_cfa_register 64
+
+; CHECK-NOT: .cfi_{{.*}}
+
+; CHECK: .cfi_endproc
+define hidden void @func_call_clobber() #0 {
+entry:
+  call void @ex() #0
+  ret void
+}
+
 attributes #0 = { nounwind }
+attributes #1 = { nounwind "frame-pointer"="all" }

 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!2, !3}
diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
--- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
@@ -192,6 +192,7 @@
 ; GFX9-NEXT:    v_writelane_b32 v43, s33, 4
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_add_u32 s32, s32, 0x800
+; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
--- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
+++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll
@@ -151,8 +151,8 @@
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_mov_b32 s7, s33
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GCN-NEXT:    s_mov_b32 s33, s32
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GCN-NEXT:    s_add_u32 s32, s32, 0x400
 ; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GCN-NEXT:    s_cbranch_execz BB2_3
@@ -214,8 +214,8 @@
 ; GCN-LABEL: func_non_entry_block_static_alloca_align64:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_add_u32 s4, s32, 0xfc0
 ; GCN-NEXT:    s_mov_b32 s7, s33
+; GCN-NEXT:    s_add_u32 s4, s32, 0xfc0
 ; GCN-NEXT:    s_and_b32 s33, s4, 0xfffff000
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GCN-NEXT:    s_add_u32 s32, s32, 0x2000
diff --git a/llvm/test/CodeGen/AMDGPU/split-arg-dbg-value.ll b/llvm/test/CodeGen/AMDGPU/split-arg-dbg-value.ll
--- a/llvm/test/CodeGen/AMDGPU/split-arg-dbg-value.ll
+++ b/llvm/test/CodeGen/AMDGPU/split-arg-dbg-value.ll
@@ -13,6 +13,8 @@
 ; GCN-NEXT: ;DEBUG_VALUE: split_v4f32_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 64 32] $vgpr2
 ; GCN-NEXT: ;DEBUG_VALUE: split_v4f32_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 32 32] $vgpr1
 ; GCN-NEXT: ;DEBUG_VALUE: split_v4f32_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 0 32] $vgpr0
+; GCN-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6
+; GCN-NEXT: .cfi_escape 0x10, 0x10, 0x08, 0x90, 0x3e, 0x93, 0x04, 0x90, 0x3f, 0x93, 0x04
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT: .Ltmp0:
 ; GCN-NEXT: .loc 0 4 5 prologue_end ; /tmp/dbg.cl:4:5
@@ -35,6 +37,12 @@
 ; GCN-NEXT: ;DEBUG_VALUE: split_v4f32_multi_arg:arg0 <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 64 32] $vgpr2
 ; GCN-NEXT: ;DEBUG_VALUE: split_v4f32_multi_arg:arg0 <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 32 32] $vgpr1
 ; GCN-NEXT: ;DEBUG_VALUE: split_v4f32_multi_arg:arg0 <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 0 32] $vgpr0
+; GCN-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6
+; GCN-NEXT: .cfi_escape 0x10, 0x10, 0x08, 0x90, 0x3e, 0x93, 0x04, 0x90, 0x3f, 0x93, 0x04
+; GCN-NEXT: .cfi_undefined 2560
+; GCN-NEXT: .cfi_undefined 2561
+; GCN-NEXT: .cfi_undefined 2562
+; GCN-NEXT: .cfi_undefined 2563
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT: .Ltmp2:
 ; GCN-NEXT: .loc 0 8 17 prologue_end ; /tmp/dbg.cl:8:17
@@ -65,6 +73,8 @@
 ; GCN-NEXT: ; %bb.0:
 ; GCN-NEXT: ;DEBUG_VALUE: split_v4f16_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 32 32] $vgpr1
 ; GCN-NEXT: ;DEBUG_VALUE: split_v4f16_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 0 32] $vgpr0
+; GCN-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6
+; GCN-NEXT: .cfi_escape 0x10, 0x10, 0x08, 0x90, 0x3e, 0x93, 0x04, 0x90, 0x3f, 0x93, 0x04
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT: .Ltmp8:
 ; GCN-NEXT: .loc 0 12 5 prologue_end ; /tmp/dbg.cl:12:5
@@ -83,6 +93,8 @@
 ; GCN-NEXT: ; %bb.0:
 ; GCN-NEXT: ;DEBUG_VALUE: split_f64_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 32 32] $vgpr1
 ; GCN-NEXT: ;DEBUG_VALUE: split_f64_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 0 32] $vgpr0
+; GCN-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6
+; GCN-NEXT: .cfi_escape 0x10, 0x10, 0x08, 0x90, 0x3e, 0x93, 0x04, 0x90, 0x3f, 0x93, 0x04
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT: .Ltmp10:
 ; GCN-NEXT: .loc 0 16 5 prologue_end ; /tmp/dbg.cl:16:5
@@ -103,6 +115,8 @@
 ; GCN-NEXT: ;DEBUG_VALUE: split_v2f64_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 64 32] $vgpr2
 ; GCN-NEXT: ;DEBUG_VALUE: split_v2f64_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 32 32] $vgpr1
 ; GCN-NEXT: ;DEBUG_VALUE: split_v2f64_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 0 32] $vgpr0
+; GCN-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6
+; GCN-NEXT: .cfi_escape 0x10, 0x10, 0x08, 0x90, 0x3e, 0x93, 0x04, 0x90, 0x3f, 0x93, 0x04
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT: .Ltmp12:
 ; GCN-NEXT: .loc 0 20 5 prologue_end ; /tmp/dbg.cl:20:5
@@ -121,6 +135,8 @@
 ; GCN-NEXT: ; %bb.0:
 ; GCN-NEXT: ;DEBUG_VALUE: split_i64_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 32 32] $vgpr1
 ; GCN-NEXT: ;DEBUG_VALUE: split_i64_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 0 32] $vgpr0
+; GCN-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6
+; GCN-NEXT: .cfi_escape 0x10, 0x10, 0x08, 0x90, 0x3e, 0x93, 0x04, 0x90, 0x3f, 0x93, 0x04
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT: .Ltmp14:
 ; GCN-NEXT: .loc 0 24 5 prologue_end ; /tmp/dbg.cl:24:5
@@ -139,6 +155,8 @@
 ; GCN-NEXT: ; %bb.0:
 ; GCN-NEXT: ;DEBUG_VALUE: split_ptr_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 32 32] $vgpr1
 ; GCN-NEXT: ;DEBUG_VALUE: split_ptr_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 0 32] $vgpr0
+; GCN-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6
+; GCN-NEXT: .cfi_escape 0x10, 0x10, 0x08, 0x90, 0x3e, 0x93, 0x04, 0x90, 0x3f, 0x93, 0x04
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT: .Ltmp16:
 ; GCN-NEXT: .loc 0 28 5 prologue_end ; /tmp/dbg.cl:28:5
diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
--- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
@@ -124,14 +124,14 @@
 }

 ; GCN-LABEL: {{^}}default_realign_align128:
-; GCN: s_add_u32 [[TMP:s[0-9]+]], s32, 0x1fc0
-; GCN-NEXT: s_mov_b32 [[FP_COPY:s[0-9]+]], s33
+; GCN: s_mov_b32 [[FP_COPY:s[0-9]+]], s33
+; GCN-NEXT: s_add_u32 [[TMP:s[0-9]+]], s32, 0x1fc0
 ; GCN-NEXT: s_and_b32 s33, [[TMP]], 0xffffe000
-; GCN-NEXT: s_add_u32 s32, s32, 0x4000
-; GCN-NOT: s33
+; GCN: s_add_u32 s32, s32, 0x4000
 ; GCN: buffer_store_dword v0, off, s[0:3], s33{{$}}
 ; GCN: s_sub_u32 s32, s32, 0x4000
 ; GCN: s_mov_b32 s33, [[FP_COPY]]
+; GCN-NOT: s33
 define void @default_realign_align128(i32 %idx) #0 {
   %alloca.align = alloca i32, align 128, addrspace(5)
   store volatile i32 9, i32 addrspace(5)* %alloca.align, align 128
@@ -164,17 +164,17 @@
 ; GCN: s_and_b32 s33, [[SCRATCH_REG]], 0xffff0000
 ; GCN: s_mov_b32 s34, s32
-; GCN-NEXT: v_mov_b32_e32 v32, 0
+; GCN: v_mov_b32_e32 v32, 0
+; GCN: s_add_u32 s32, s32, 0x30000
 ; GCN: buffer_store_dword v32, off, s[0:3], s33 offset:1024
 ; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34
-; GCN-NEXT: s_add_u32 s32, s32, 0x30000

 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32
 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]

-; GCN: v_readlane_b32 s33, [[VGPR_REG]], 2
-; GCN-NEXT: s_sub_u32 s32, s32, 0x30000
+; GCN: s_sub_u32 s32, s32, 0x30000
+; GCN-NEXT: v_readlane_b32 s33, [[VGPR_REG]], 2
 ; GCN-NEXT: v_readlane_b32 s34, [[VGPR_REG]], 3
 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
 ; GCN-NEXT: buffer_load_dword [[VGPR_REG]], off, s[0:3], s32 offset:1028 ; 4-byte Folded Reload
@@ -194,11 +194,11 @@
 ; The BP value will get saved/restored in an SGPR at the prologue/epilogue.
 ; GCN-LABEL: needs_align1024_stack_args_used_inside_loop:
-; GCN: s_mov_b32 [[BP_COPY:s[0-9]+]], s34
-; GCN-NEXT: s_mov_b32 s34, s32
+; GCN: s_mov_b32 [[FP_COPY:s[0-9]+]], s33
+; GCN-NEXT: s_mov_b32 [[BP_COPY:s[0-9]+]], s34
 ; GCN-NEXT: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0xffc0
-; GCN-NEXT: s_mov_b32 [[FP_COPY:s[0-9]+]], s33
 ; GCN-NEXT: s_and_b32 s33, [[SCRATCH_REG]], 0xffff0000
+; GCN-NEXT: s_mov_b32 s34, s32
 ; GCN-NEXT: v_mov_b32_e32 v{{[0-9]+}}, 0
 ; GCN-NEXT: v_lshrrev_b32_e64 [[VGPR_REG:v[0-9]+]], 6, s34
 ; GCN: s_add_u32 s32, s32, 0x30000
@@ -235,8 +235,8 @@
 ; GCN-LABEL: no_free_scratch_sgpr_for_bp_copy:
 ; GCN: ; %bb.0:
 ; GCN: v_writelane_b32 [[VGPR_REG:v[0-9]+]], s34, 0
-; GCN-NEXT: s_mov_b32 s34, s32
-; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34
+; GCN: s_mov_b32 s34, s32
+; GCN: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34
 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:128
 ; GCN-NEXT: ;;#ASMSTART
 ; GCN-NEXT: ;;#ASMEND