diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -10,11 +10,11 @@
 #define LLVM_LIB_TARGET_AMDGPU_SIFRAMELOWERING_H
 
 #include "AMDGPUFrameLowering.h"
+#include "SIMachineFunctionInfo.h"
 
 namespace llvm {
 
 class SIInstrInfo;
-class SIMachineFunctionInfo;
 class SIRegisterInfo;
 class GCNSubtarget;
 
@@ -90,6 +90,12 @@
                                  MachineBasicBlock::iterator MBBI,
                                  const DebugLoc &DL, const Register SGPR,
                                  const Register VGPR, const int Lane) const;
+  /// Create a CFI index describing a spill of an SGPR to multiple lanes of
+  /// VGPRs and build a MachineInstr around it.
+  void buildCFIForSGPRToVGPRSpill(
+      MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+      const DebugLoc &DL, Register SGPR,
+      ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills) const;
   /// Create a CFI index describing a spill of a VGPR to VMEM and
   /// build a MachineInstr around it.
   void buildCFIForVGPRToVMEMSpill(MachineBasicBlock &MBB,
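The `ArrayRef<SIMachineFunctionInfo::SpilledReg>` parameter reuses the existing SGPR-to-VGPR spill bookkeeping. For readers of this patch, the struct (paraphrased from SIMachineFunctionInfo.h, which this patch does not modify) is roughly:

```cpp
// Paraphrased sketch of SIMachineFunctionInfo::SpilledReg (unchanged by this
// patch): one entry records the VGPR and lane index holding a 32-bit SGPR.
struct SpilledReg {
  Register VGPR;  // VGPR providing the lane storage
  int Lane = -1;  // lane within the VGPR, -1 if unallocated

  bool hasLane() { return Lane != -1; }
  bool hasReg() { return VGPR != 0; }
};
```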
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -748,7 +748,7 @@
   };
   for_each(TRI.getAllVGPR32(MF), ProcessReg);
   for_each(TRI.getAllSGPR32(MF), ProcessReg);
-};
+}
 
 void SIFrameLowering::emitPrologue(MachineFunction &MF,
                                    MachineBasicBlock &MBB) const {
@@ -916,6 +916,44 @@
     LiveRegs.addReg(ScratchExecCopy);
   }
 
+  if (TRI.isCFISavedRegsSpillEnabled()) {
+    MCRegister ReturnAddressReg = TRI.getReturnAddressReg(MF);
+    ArrayRef<SIMachineFunctionInfo::SpilledReg> ReturnAddressSpill =
+        FuncInfo->getSGPRToVGPRSpills(FuncInfo->ReturnAddressSaveIndex);
+    assert(ReturnAddressSpill.size() == 2);
+    BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
+            ReturnAddressSpill[0].VGPR)
+        .addReg(TRI.getSubReg(ReturnAddressReg, TRI.getSubRegFromChannel(0)))
+        .addImm(ReturnAddressSpill[0].Lane)
+        .addReg(ReturnAddressSpill[0].VGPR, RegState::Undef);
+    BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
+            ReturnAddressSpill[1].VGPR)
+        .addReg(TRI.getSubReg(ReturnAddressReg, TRI.getSubRegFromChannel(1)))
+        .addImm(ReturnAddressSpill[1].Lane)
+        .addReg(ReturnAddressSpill[1].VGPR, RegState::Undef);
+    buildCFIForSGPRToVGPRSpill(MBB, MBBI, DL, AMDGPU::PC_REG,
+                               ReturnAddressSpill);
+
+    ArrayRef<SIMachineFunctionInfo::SpilledReg> EXECSpill =
+        FuncInfo->getSGPRToVGPRSpills(FuncInfo->EXECSaveIndex);
+    assert(EXECSpill.size());
+    BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
+            EXECSpill[0].VGPR)
+        .addReg(AMDGPU::EXEC_LO)
+        .addImm(EXECSpill[0].Lane)
+        .addReg(EXECSpill[0].VGPR, RegState::Undef);
+    if (!ST.isWave32()) {
+      assert(EXECSpill.size() == 2);
+      BuildMI(MBB, MBBI, DL,
+              TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
+              EXECSpill[1].VGPR)
+          .addReg(AMDGPU::EXEC_HI)
+          .addImm(EXECSpill[1].Lane)
+          .addReg(EXECSpill[1].VGPR, RegState::Undef);
+    }
+    buildCFIForSGPRToVGPRSpill(MBB, MBBI, DL, AMDGPU::EXEC, EXECSpill);
+  }
+
   // In this case, spill the FP to a reserved VGPR.
   if (HasFPSaveIndex && !SpillFPToMemory) {
     const int FI = *FuncInfo->FramePointerSaveIndex;
@@ -1242,6 +1280,18 @@
   }
 }
 
+static void allocateCFISave(MachineFunction &MF, int &FI, Register Reg) {
+  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+  int NewFI = MF.getFrameInfo().CreateStackObject(
+      TRI->getSpillSize(*RC), TRI->getSpillAlignment(*RC), true);
+  if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI))
+    llvm_unreachable("allocate SGPR spill should have worked");
+  FI = NewFI;
+}
+
 // Only report VGPRs to generic code.
 void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
                                            BitVector &SavedVGPRs,
@@ -1275,6 +1325,13 @@
   for (auto SSpill : MFI->getSGPRSpillVGPRs())
     SavedVGPRs.reset(SSpill.VGPR);
 
+  if (TRI->isCFISavedRegsSpillEnabled()) {
+    allocateCFISave(MF, MFI->ReturnAddressSaveIndex,
+                    TRI->getReturnAddressReg(MF));
+    allocateCFISave(MF, MFI->EXECSaveIndex,
+                    ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC);
+  }
+
   LivePhysRegs LiveRegs;
   LiveRegs.init(*TRI);
 
@@ -1486,6 +1543,59 @@
            MCCFIInstruction::createEscape(nullptr, OSCFIInst.str()));
 }
 
+void SIFrameLowering::buildCFIForSGPRToVGPRSpill(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+    const DebugLoc &DL, Register SGPR,
+    ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills) const {
+  MachineFunction &MF = *MBB.getParent();
+  const MCRegisterInfo &MCRI = *MF.getMMI().getContext().getRegisterInfo();
+  int DwarfSGPR = MCRI.getDwarfRegNum(SGPR, false);
+
+  // CFI for an SGPR spilled to multiple lanes of VGPRs is implemented as an
+  // expression(E) rule where E is a composite location description
+  // with multiple parts each referencing
+  // VGPR register location storage with a bit offset of the lane index
+  // multiplied by the size of an SGPR (32 bits). In other words we generate
+  // the following DWARF:
+  //
+  // DW_CFA_expression: <SGPR>,
+  //    (DW_OP_regx <VGPR[0]>) (DW_OP_bit_piece 32, <Lane[0]>*32)
+  //    (DW_OP_regx <VGPR[1]>) (DW_OP_bit_piece 32, <Lane[1]>*32)
+  //    ...
+  //    (DW_OP_regx <VGPR[N]>) (DW_OP_bit_piece 32, <Lane[N]>*32)
+  //
+  // The memory location description for the current CFA is pushed on the
+  // stack before E is evaluated, but we choose not to drop it as it would
+  // require a longer expression E and DWARF defines the result of the
+  // evaluation to be the location description on the top of the stack
+  // (i.e. the implicitly pushed one is just ignored).
+  SmallString<20> CFIInst;
+  raw_svector_ostream OSCFIInst(CFIInst);
+  SmallString<20> Block;
+  raw_svector_ostream OSBlock(Block);
+
+  OSCFIInst << uint8_t(dwarf::DW_CFA_expression);
+  encodeULEB128(DwarfSGPR, OSCFIInst);
+
+  // TODO: Detect when we can merge multiple adjacent pieces, or even reduce
+  // this to a register location description (when all pieces are adjacent).
+  for (SIMachineFunctionInfo::SpilledReg Spill : VGPRSpills) {
+    encodeDwarfRegisterLocation(MCRI.getDwarfRegNum(Spill.VGPR, false),
+                                OSBlock);
+    OSBlock << uint8_t(dwarf::DW_OP_bit_piece);
+    // FIXME: Can this be a function of the SGPR?
+    const unsigned SGPRBitSize = 32;
+    encodeULEB128(SGPRBitSize, OSBlock);
+    encodeULEB128(SGPRBitSize * Spill.Lane, OSBlock);
+  }
+
+  encodeULEB128(Block.size(), OSCFIInst);
+  OSCFIInst << Block;
+
+  buildCFI(MBB, MBBI, DL,
+           MCCFIInstruction::createEscape(nullptr, OSCFIInst.str()));
+}
+
 void SIFrameLowering::buildCFIForVGPRToVMEMSpill(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
     const DebugLoc &DL, unsigned VGPR, int64_t Offset) const {
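To make the escape bytes checked by the new test easier to follow, here is a standalone sketch of the encoding loop above. It assumes the AMDGPU DWARF numbering quoted in the test comments (PC_64 = 16, VGPR0_wave64 = 2560) and the two return-address lanes written in the prologue; it prints the same byte sequence the WAVE64 check expects:

```cpp
// Standalone sketch (not part of the patch) of the DW_CFA_expression encoding
// above, for the wave64 return-address rule. Assumes PC_64 = 16 and
// VGPR0_wave64 = 2560, per the test comments. Links against LLVMSupport.
#include "llvm/ADT/SmallString.h"
#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

int main() {
  SmallString<20> CFIInst, Block;
  raw_svector_ostream OSCFIInst(CFIInst), OSBlock(Block);

  // Lanes 0 and 1 of VGPR0 hold the two halves of the return address.
  const unsigned VGPR0Wave64 = 2560, SGPRBitSize = 32;
  for (unsigned Lane : {0u, 1u}) {
    OSBlock << uint8_t(dwarf::DW_OP_regx);       // 0x90
    encodeULEB128(VGPR0Wave64, OSBlock);         // 0x80, 0x14
    OSBlock << uint8_t(dwarf::DW_OP_bit_piece);  // 0x9d
    encodeULEB128(SGPRBitSize, OSBlock);         // piece size: 0x20
    encodeULEB128(SGPRBitSize * Lane, OSBlock);  // piece offset: 0x00 / 0x20
  }

  OSCFIInst << uint8_t(dwarf::DW_CFA_expression); // 0x10
  encodeULEB128(16, OSCFIInst);                   // PC_64
  encodeULEB128(Block.size(), OSCFIInst);         // block length: 0x0c
  OSCFIInst << Block;

  for (uint8_t Byte : CFIInst)
    outs() << format("0x%02x, ", Byte);
  outs() << "\n";
  // => 0x10, 0x10, 0x0c, 0x90, 0x80, 0x14, 0x9d, 0x20, 0x00,
  //    0x90, 0x80, 0x14, 0x9d, 0x20, 0x20
  return 0;
}
```

The sketch uses `DW_OP_regx` directly; the helper in the patch goes through `encodeDwarfRegisterLocation`, which takes the same path for register numbers of 32 or more.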
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -490,6 +490,9 @@
   Register SGPRForBPSaveRestoreCopy;
   Optional<int> BasePointerSaveIndex;
 
+  int ReturnAddressSaveIndex;
+  int EXECSaveIndex;
+
   Register VGPRReservedForSGPRSpill;
   bool isCalleeSavedReg(const MCPhysReg *CSRegs, MCPhysReg Reg);
 
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -53,6 +53,8 @@
     return SpillSGPRToVGPR;
   }
 
+  bool isCFISavedRegsSpillEnabled() const;
+
   /// Return the end register initially reserved for the scratch buffer in case
   /// spilling is needed.
   MCRegister reservedPrivateSegmentBufferReg(const MachineFunction &MF) const;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -42,6 +42,11 @@
 std::array<std::vector<int16_t>, 16> SIRegisterInfo::RegSplitParts;
 
+static cl::opt<bool> EnableSpillCFISavedRegs(
+    "amdgpu-spill-cfi-saved-regs",
+    cl::desc("Enable spilling the registers required for CFI emission"),
+    cl::ReallyHidden, cl::init(false));
+
 SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST)
     : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour()), ST(ST),
       SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) {
@@ -224,6 +229,10 @@
   return SubRegFromChannelTable[NumRegIndex][Channel];
 }
 
+bool SIRegisterInfo::isCFISavedRegsSpillEnabled() const {
+  return EnableSpillCFISavedRegs;
+}
+
 MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg(
     const MachineFunction &MF) const {
   unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-spill-cfi-saved-regs.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-spill-cfi-saved-regs.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-spill-cfi-saved-regs.ll
@@ -0,0 +1,106 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=asm -amdgpu-spill-cfi-saved-regs -o - %s | FileCheck --check-prefixes=CHECK,WAVE64 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -filetype=asm -amdgpu-spill-cfi-saved-regs -o - %s | FileCheck --check-prefixes=CHECK,WAVE32 %s
+
+; CHECK-LABEL: kern:
+; CHECK: .cfi_startproc
+; CHECK-NOT: .cfi_{{.*}}
+; CHECK: %bb.0:
+; CHECK-NEXT: .cfi_escape 0x0f, 0x03, 0x30, 0x36, 0xe1
+; CHECK-NEXT: .cfi_undefined 16
+; CHECK-NOT: .cfi_{{.*}}
+; CHECK: .cfi_endproc
+define protected amdgpu_kernel void @kern() #0 {
+entry:
+  ret void
+}
+
+; CHECK-LABEL: func:
+; CHECK: .cfi_startproc
+; CHECK-NOT: .cfi_{{.*}}
+; CHECK: %bb.0:
+; SGPR32 = 64
+; CHECK-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6
+; CHECK-NEXT: .cfi_escape 0x10, 0x10, 0x08, 0x90, 0x3e, 0x93, 0x04, 0x90, 0x3f, 0x93, 0x04
+
+
+; FIXME: ideally this would not care what VGPR we spill to, but since we are
+; using .cfi_escape it isn't trivial/possible to make this general yet
+
+; CHECK: v_writelane_b32 v0, s30, 0
+; CHECK-NEXT: v_writelane_b32 v0, s31, 1
+
+; DW_CFA_expression [0x10]
+;   PC_64 ULEB128(16)=[0x10]
+;   BLOCK_LENGTH ULEB128(12)=[0x0c]
+;     DW_OP_regx [0x90]
+;       VGPR0_wave64 ULEB128(2560)=[0x80, 0x14]
+;     DW_OP_bit_piece [0x9d]
+;       PIECE_SIZE [0x20]
+;       PIECE_OFFSET [0x00]
+;     DW_OP_regx [0x90]
+;       VGPR0_wave64 ULEB128(2560)=[0x80, 0x14]
+;     DW_OP_bit_piece [0x9d]
+;       PIECE_SIZE [0x20]
+;       PIECE_OFFSET [0x20]
+; WAVE64-NEXT: .cfi_escape 0x10, 0x10, 0x0c, 0x90, 0x80, 0x14, 0x9d, 0x20, 0x00, 0x90, 0x80, 0x14, 0x9d, 0x20, 0x20
+
+; DW_CFA_expression [0x10]
+;   PC_64 ULEB128(16)=[0x10]
+;   BLOCK_LENGTH ULEB128(12)=[0x0c]
+;     DW_OP_regx [0x90]
+;       VGPR0_wave32 ULEB128(1536)=[0x80, 0x0c]
+;     DW_OP_bit_piece [0x9d]
+;       PIECE_SIZE [0x20]
+;       PIECE_OFFSET [0x00]
+;     DW_OP_regx [0x90]
+;       VGPR0_wave32 ULEB128(1536)=[0x80, 0x0c]
+;     DW_OP_bit_piece [0x9d]
+;       PIECE_SIZE [0x20]
+;       PIECE_OFFSET [0x20]
+; WAVE32-NEXT: .cfi_escape 0x10, 0x10, 0x0c, 0x90, 0x80, 0x0c, 0x9d, 0x20, 0x00, 0x90, 0x80, 0x0c, 0x9d, 0x20, 0x20
+
+
+; WAVE64: v_writelane_b32 v0, exec_lo, 2
+; WAVE64-NEXT: v_writelane_b32 v0, exec_hi, 3
+; DW_CFA_expression [0x10]
+;   EXEC_MASK_wave64 ULEB128(17)=[0x11]
+;   BLOCK_LENGTH ULEB128(12)=[0x0c]
+;     DW_OP_regx [0x90]
+;       VGPR0_wave64 ULEB128(2560)=[0x80, 0x14]
+;     DW_OP_bit_piece [0x9d]
+;       PIECE_SIZE [0x20]
+;       PIECE_OFFSET [0x40]
+;     DW_OP_regx [0x90]
+;       VGPR0_wave64 ULEB128(2560)=[0x80, 0x14]
+;     DW_OP_bit_piece [0x9d]
+;       PIECE_SIZE [0x20]
+;       PIECE_OFFSET [0x60]
+; WAVE64-NEXT: .cfi_escape 0x10, 0x11, 0x0c, 0x90, 0x80, 0x14, 0x9d, 0x20, 0x40, 0x90, 0x80, 0x14, 0x9d, 0x20, 0x60
+
+; WAVE32: v_writelane_b32 v0, exec_lo, 2
+; DW_CFA_expression [0x10]
+;   EXEC_MASK_wave32 ULEB128(1)=[0x01]
+;   BLOCK_LENGTH ULEB128(6)=[0x06]
+;     DW_OP_regx [0x90]
+;       VGPR0_wave32 ULEB128(1536)=[0x80, 0x0c]
+;     DW_OP_bit_piece [0x9d]
+;       PIECE_SIZE [0x20]
+;       PIECE_OFFSET [0x40]
+; WAVE32-NEXT: .cfi_escape 0x10, 0x01, 0x06, 0x90, 0x80, 0x0c, 0x9d, 0x20, 0x40
+
+; CHECK-NOT: .cfi_{{.*}}
+; CHECK: .cfi_endproc
+define hidden void @func() #0 {
+entry:
+  ret void
+}
+
+attributes #0 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, emissionKind: FullDebug)
+!1 = !DIFile(filename: "filename", directory: "directory")
+!2 = !{i32 7, !"Dwarf Version", i32 4}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
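The multi-byte operands quoted in the test comments above are plain ULEB128. A minimal encoder, independent of LLVM and shown here only to double-check those byte sequences, behaves as follows:

```cpp
// Minimal ULEB128 encoder sketch, for verifying the operand bytes quoted in
// the test comments above (LLVM's real implementation is encodeULEB128 in
// llvm/Support/LEB128.h).
#include <cassert>
#include <cstdint>
#include <vector>

static std::vector<uint8_t> uleb128(uint64_t Value) {
  std::vector<uint8_t> Bytes;
  do {
    uint8_t Byte = Value & 0x7f; // low 7 bits of the value per byte
    Value >>= 7;
    if (Value)
      Byte |= 0x80; // continuation bit set on all but the last byte
    Bytes.push_back(Byte);
  } while (Value);
  return Bytes;
}

int main() {
  assert(uleb128(2560) == (std::vector<uint8_t>{0x80, 0x14})); // VGPR0_wave64
  assert(uleb128(1536) == (std::vector<uint8_t>{0x80, 0x0c})); // VGPR0_wave32
  assert(uleb128(16) == (std::vector<uint8_t>{0x10}));  // PC_64
  assert(uleb128(17) == (std::vector<uint8_t>{0x11}));  // EXEC_MASK_wave64
  return 0;
}
```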