diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h @@ -10,11 +10,11 @@ #define LLVM_LIB_TARGET_AMDGPU_SIFRAMELOWERING_H #include "AMDGPUFrameLowering.h" +#include "SIMachineFunctionInfo.h" namespace llvm { class SIInstrInfo; -class SIMachineFunctionInfo; class SIRegisterInfo; class GCNSubtarget; @@ -82,6 +82,14 @@ bool hasFP(const MachineFunction &MF) const override; bool requiresStackPointerReference(const MachineFunction &MF) const; + + /// If '-amdgpu-spill-cfi-saved-regs' is enabled, emit RA/EXEC spills to + /// a free VGPR (lanes) or memory and corresponding CFI rules. + void emitCFISavedRegSpills(MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + LivePhysRegs &LiveRegs, Register &ScratchExecCopy, + bool emitSpillsToMem) const; + /// Create a CFI index for CFIInst and build a MachineInstr around it. void buildCFI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, const MCCFIInstruction &CFIInst) const; @@ -91,6 +99,12 @@ MachineBasicBlock::iterator MBBI, const DebugLoc &DL, const Register SGPR, const Register VGPR, const int Lane) const; + /// Create a CFI index describing a spill of an SGPR to multiple lanes of + /// VGPRs and build a MachineInstr around it. + void buildCFIForSGPRToVGPRSpill( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, Register SGPR, + ArrayRef VGPRSpills) const; /// Create a CFI index describing a spill of a VGPR to VMEM and /// build a MachineInstr around it. void buildCFIForVGPRToVMEMSpill(MachineBasicBlock &MBB, diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -123,11 +123,13 @@ MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const SIInstrInfo *TII, Register SpillReg, - Register ScratchRsrcReg, Register SPReg, int FI) { + Register ScratchRsrcReg, Register SPReg, int FI, + int DwordOff) { MachineFunction *MF = MBB.getParent(); MachineFrameInfo &MFI = MF->getFrameInfo(); - int64_t Offset = MFI.getObjectOffset(FI); + // 'DwordOff' is the offset of the lower/upper double word + int64_t Offset = MFI.getObjectOffset(FI) + DwordOff; MachineMemOperand *MMO = MF->getMachineMemOperand( MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore, 4, @@ -483,12 +485,25 @@ } // Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not -// memory. They should have been removed by now. -static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) { +// memory. They should have been removed by now, except CFI Saved Reg spills. 
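+// When '-amdgpu-spill-cfi-saved-regs' is enabled, determineCalleeSaves()
+// creates SGPRSpill stack objects for the return address and EXEC before this
+// point, so those two indices are deliberately tolerated below.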
+static bool allStackObjectsAreDead(const MachineFunction &MF) { + const MachineFrameInfo &MFI = MF.getFrameInfo(); + const GCNSubtarget &ST = MF.getSubtarget(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const SIMachineFunctionInfo *FuncInfo = MF.getInfo(); for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd(); I != E; ++I) { - if (!MFI.isDeadObjectIndex(I)) + if (!MFI.isDeadObjectIndex(I)) { + // determineCalleeSaves() might have added the SGPRSpill stack IDs for + // CFI saves into scratch VGPR, ignore them + if (MFI.getStackID(I) == TargetStackID::SGPRSpill && + TRI->isCFISavedRegsSpillEnabled() && + (I == FuncInfo->ReturnAddressSaveIndex || + I == FuncInfo->EXECSaveIndex)) { + continue; + } return false; + } } return true; @@ -508,8 +523,8 @@ Register ScratchRsrcReg = MFI->getScratchRSrcReg(); - if (!ScratchRsrcReg || (!MRI.isPhysRegUsed(ScratchRsrcReg) && - allStackObjectsAreDead(MF.getFrameInfo()))) + if (!ScratchRsrcReg || + (!MRI.isPhysRegUsed(ScratchRsrcReg) && allStackObjectsAreDead(MF))) return Register(); if (ST.hasSGPRInitBug() || @@ -882,6 +897,166 @@ return ScratchExecCopy; } +// A StackID of SGPRSpill implies that this is a spill from SGPR to VGPR. +// Otherwise we are spilling to memory. +static bool spilledToMemory(const MachineFunction &MF, int SaveIndex) { + const MachineFrameInfo &MFI = MF.getFrameInfo(); + return MFI.getStackID(SaveIndex) != TargetStackID::SGPRSpill; +} + +// Emit the RA and EXEC saves for the non-kernel functions in the +// prologue and the corresponding CFI rules. +void SIFrameLowering::emitCFISavedRegSpills(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + LivePhysRegs &LiveRegs, + Register &ScratchExecCopy, + bool emitSpillsToMem) const { + const GCNSubtarget &ST = MF.getSubtarget(); + const SIInstrInfo *TII = ST.getInstrInfo(); + const SIRegisterInfo &TRI = TII->getRegisterInfo(); + const MCRegisterInfo *MCRI = MF.getMMI().getContext().getRegisterInfo(); + + SIMachineFunctionInfo *FuncInfo = MF.getInfo(); + const MachineFrameInfo &MFI = MF.getFrameInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + Register StackPtrReg = FuncInfo->getStackPtrOffsetReg(); + + Optional RASaveIndex = FuncInfo->ReturnAddressSaveIndex; + Optional EXECSaveIndex = FuncInfo->EXECSaveIndex; + Register RetAddrReg = TRI.getReturnAddressReg(MF); + DebugLoc DL; + + if (emitSpillsToMem) { + // Return address is being spilled into memory at the frame + // index and consumes two double words. And + // build the corresponding CFI rule. 
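+    // Each 32-bit half of the return address is copied through a scratch VGPR
+    // and stored at DwordOff 0 and 4 of the frame index. The CFI offset for
+    // PC_REG is the frame-object offset scaled by the wavefront size, matching
+    // the other memory-spill CFI rules in this file (presumably because
+    // scratch offsets are per lane while the unwind offsets describe the
+    // whole wave).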
+ if (RASaveIndex && spilledToMemory(MF, *RASaveIndex)) { + const int FI = *RASaveIndex; + assert(!MFI.isDeadObjectIndex(FI)); + + if (!ScratchExecCopy) + ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true); + + MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( + MRI, LiveRegs, AMDGPU::VGPR_32RegClass); + + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) + .addReg(TRI.getSubReg(RetAddrReg, AMDGPU::sub0)); + + int DwordOff = 0; + buildPrologSpill(ST, LiveRegs, MBB, MBBI, TII, TmpVGPR, + FuncInfo->getScratchRSrcReg(), StackPtrReg, FI, + DwordOff); + + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) + .addReg(TRI.getSubReg(RetAddrReg, AMDGPU::sub1)); + + DwordOff = 4; + buildPrologSpill(ST, LiveRegs, MBB, MBBI, TII, TmpVGPR, + FuncInfo->getScratchRSrcReg(), StackPtrReg, FI, + DwordOff); + + buildCFI(MBB, MBBI, DL, + MCCFIInstruction::createOffset( + nullptr, MCRI->getDwarfRegNum(AMDGPU::PC_REG, false), + MFI.getObjectOffset(FI) * ST.getWavefrontSize())); + } + + // EXEC mask is being spilled into memory at the frame + // index and consumes two double words in + // wave64 mode and one doble word in wave32 mode. And + // build the corresponding CFI rule. + if (EXECSaveIndex && spilledToMemory(MF, *EXECSaveIndex)) { + const int FI = *EXECSaveIndex; + assert(!MFI.isDeadObjectIndex(FI)); + + if (!ScratchExecCopy) + ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true); + + MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister( + MRI, LiveRegs, AMDGPU::VGPR_32RegClass); + + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) + .addReg(TRI.getSubReg(ScratchExecCopy, AMDGPU::sub0)); + + int DwordOff = 0; + buildPrologSpill(ST, LiveRegs, MBB, MBBI, TII, TmpVGPR, + FuncInfo->getScratchRSrcReg(), StackPtrReg, FI, + DwordOff); + + if (!ST.isWave32()) { + BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) + .addReg(TRI.getSubReg(ScratchExecCopy, AMDGPU::sub1)); + + DwordOff = 4; + buildPrologSpill(ST, LiveRegs, MBB, MBBI, TII, TmpVGPR, + FuncInfo->getScratchRSrcReg(), StackPtrReg, FI, + DwordOff); + } + + buildCFI(MBB, MBBI, DL, + MCCFIInstruction::createOffset( + nullptr, MCRI->getDwarfRegNum(AMDGPU::EXEC, false), + MFI.getObjectOffset(FI) * ST.getWavefrontSize())); + } + } + + if (!emitSpillsToMem) { + // Return address is being spilled into free VGPR lanes + // and consumes two lanes, build the corresponding CFI rule. + if (RASaveIndex && !spilledToMemory(MF, *RASaveIndex)) { + MCRegister RetAddrReg = TRI.getReturnAddressReg(MF); + if (!MBB.isLiveIn(RetAddrReg)) + MBB.addLiveIn(RetAddrReg); + + ArrayRef ReturnAddressSpill = + FuncInfo->getSGPRToVGPRSpills(*RASaveIndex); + assert(ReturnAddressSpill.size() == 2); + BuildMI(MBB, MBBI, DL, + TII->get(AMDGPU::V_WRITELANE_B32), + ReturnAddressSpill[0].VGPR) + .addReg(TRI.getSubReg(RetAddrReg, AMDGPU::sub0)) + .addImm(ReturnAddressSpill[0].Lane) + .addReg(ReturnAddressSpill[0].VGPR, RegState::Undef); + BuildMI(MBB, MBBI, DL, + TII->get(AMDGPU::V_WRITELANE_B32), + ReturnAddressSpill[1].VGPR) + .addReg(TRI.getSubReg(RetAddrReg, AMDGPU::sub1)) + .addImm(ReturnAddressSpill[1].Lane) + .addReg(ReturnAddressSpill[1].VGPR, RegState::Undef); + buildCFIForSGPRToVGPRSpill(MBB, MBBI, DL, AMDGPU::PC_REG, + ReturnAddressSpill); + } + + // EXEC mask is being spilled into free VGPR lanes and consumes + // two lanes in wave64 mode and one lane in wave32 mode, build + // the corresponding CFI rule. 
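+    // EXEC_LO is always written to the first reserved lane; EXEC_HI is only
+    // written to a second lane when the subtarget is not wave32.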
+ if (EXECSaveIndex && !spilledToMemory(MF, *EXECSaveIndex)) { + ArrayRef EXECSpill = + FuncInfo->getSGPRToVGPRSpills(*EXECSaveIndex); + assert(EXECSpill.size()); + BuildMI(MBB, MBBI, DL, + TII->get(AMDGPU::V_WRITELANE_B32), + EXECSpill[0].VGPR) + .addReg(AMDGPU::EXEC_LO) + .addImm(EXECSpill[0].Lane) + .addReg(EXECSpill[0].VGPR, RegState::Undef); + if (!ST.isWave32()) { + assert(EXECSpill.size() == 2); + BuildMI(MBB, MBBI, DL, + TII->get(AMDGPU::V_WRITELANE_B32), + EXECSpill[1].VGPR) + .addReg(AMDGPU::EXEC_HI) + .addImm(EXECSpill[1].Lane) + .addReg(EXECSpill[1].VGPR, RegState::Undef); + } + buildCFIForSGPRToVGPRSpill(MBB, MBBI, DL, AMDGPU::EXEC, EXECSpill); + } + } +} + +// Emit the CFI rules for non-kernel functions in the prologue void SIFrameLowering::emitPrologueEntryCFI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL) const { @@ -974,23 +1149,8 @@ emitPrologueEntryCFI(MBB, MBBI, DL); - bool HasFPSaveIndex = FuncInfo->FramePointerSaveIndex.hasValue(); - bool SpillFPToMemory = false; - // A StackID of SGPRSpill implies that this is a spill from SGPR to VGPR. - // Otherwise we are spilling the FP to memory. - if (HasFPSaveIndex) { - SpillFPToMemory = MFI.getStackID(*FuncInfo->FramePointerSaveIndex) != - TargetStackID::SGPRSpill; - } - - bool HasBPSaveIndex = FuncInfo->BasePointerSaveIndex.hasValue(); - bool SpillBPToMemory = false; - // A StackID of SGPRSpill implies that this is a spill from SGPR to VGPR. - // Otherwise we are spilling the BP to memory. - if (HasBPSaveIndex) { - SpillBPToMemory = MFI.getStackID(*FuncInfo->BasePointerSaveIndex) != - TargetStackID::SGPRSpill; - } + Optional FPSaveIndex = FuncInfo->FramePointerSaveIndex; + Optional BPSaveIndex = FuncInfo->BasePointerSaveIndex; for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg : FuncInfo->getSGPRSpillVGPRs()) { @@ -1001,9 +1161,10 @@ ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true); int FI = Reg.FI.getValue(); + int DwordOff = 0; buildPrologSpill(ST, LiveRegs, MBB, MBBI, TII, Reg.VGPR, - FuncInfo->getScratchRSrcReg(), StackPtrReg, FI); + FuncInfo->getScratchRSrcReg(), StackPtrReg, FI, DwordOff); // We spill the entire VGPR, so we can get away with just cfi_offset buildCFI(MBB, MBBI, DL, @@ -1012,8 +1173,14 @@ MFI.getObjectOffset(FI) * ST.getWavefrontSize())); } - if (HasFPSaveIndex && SpillFPToMemory) { - const int FI = *FuncInfo->FramePointerSaveIndex; + if (TRI.isCFISavedRegsSpillEnabled()) { + bool emitSpillsToMem = true; + emitCFISavedRegSpills(MF, MBB, MBBI, LiveRegs, ScratchExecCopy, + emitSpillsToMem); + } + + if (FPSaveIndex && spilledToMemory(MF, *FPSaveIndex)) { + const int FI = *FPSaveIndex; assert(!MFI.isDeadObjectIndex(FI)); if (!ScratchExecCopy) @@ -1027,16 +1194,17 @@ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) .addReg(FramePtrReg); + int DwordOff = 0; buildPrologSpill(ST, LiveRegs, MBB, MBBI, TII, TmpVGPR, - FuncInfo->getScratchRSrcReg(), StackPtrReg, FI); + FuncInfo->getScratchRSrcReg(), StackPtrReg, FI, DwordOff); buildCFI(MBB, MBBI, DL, MCCFIInstruction::createOffset( nullptr, MCRI->getDwarfRegNum(FramePtrReg, false), MFI.getObjectOffset(FI) * ST.getWavefrontSize())); } - if (HasBPSaveIndex && SpillBPToMemory) { - const int BasePtrFI = *FuncInfo->BasePointerSaveIndex; + if (BPSaveIndex && spilledToMemory(MF, *BPSaveIndex)) { + const int BasePtrFI = *BPSaveIndex; assert(!MFI.isDeadObjectIndex(BasePtrFI)); if (!ScratchExecCopy) @@ -1050,8 +1218,10 @@ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) 
.addReg(BasePtrReg); + int DwordOff = 0; buildPrologSpill(ST, LiveRegs, MBB, MBBI, TII, TmpVGPR, - FuncInfo->getScratchRSrcReg(), StackPtrReg, BasePtrFI); + FuncInfo->getScratchRSrcReg(), StackPtrReg, BasePtrFI, + DwordOff); buildCFI(MBB, MBBI, DL, MCCFIInstruction::createOffset( nullptr, MCRI->getDwarfRegNum(BasePtrReg, false), @@ -1067,9 +1237,15 @@ LiveRegs.addReg(ScratchExecCopy); } + if (TRI.isCFISavedRegsSpillEnabled()) { + bool emitSpillsToMem = false; + emitCFISavedRegSpills(MF, MBB, MBBI, LiveRegs, ScratchExecCopy, + emitSpillsToMem); + } + // In this case, spill the FP to a reserved VGPR. - if (HasFPSaveIndex && !SpillFPToMemory) { - const int FI = *FuncInfo->FramePointerSaveIndex; + if (FPSaveIndex && !spilledToMemory(MF, *FPSaveIndex)) { + const int FI = *FPSaveIndex; assert(!MFI.isDeadObjectIndex(FI)); assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); @@ -1089,8 +1265,8 @@ } // In this case, spill the BP to a reserved VGPR. - if (HasBPSaveIndex && !SpillBPToMemory) { - const int BasePtrFI = *FuncInfo->BasePointerSaveIndex; + if (BPSaveIndex && !spilledToMemory(MF, *BPSaveIndex)) { + const int BasePtrFI = *BPSaveIndex; assert(!MFI.isDeadObjectIndex(BasePtrFI)); assert(MFI.getStackID(BasePtrFI) == TargetStackID::SGPRSpill); @@ -1248,19 +1424,8 @@ const Register BasePtrReg = TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register(); - bool HasFPSaveIndex = FuncInfo->FramePointerSaveIndex.hasValue(); - bool SpillFPToMemory = false; - if (HasFPSaveIndex) { - SpillFPToMemory = MFI.getStackID(*FuncInfo->FramePointerSaveIndex) != - TargetStackID::SGPRSpill; - } - - bool HasBPSaveIndex = FuncInfo->BasePointerSaveIndex.hasValue(); - bool SpillBPToMemory = false; - if (HasBPSaveIndex) { - SpillBPToMemory = MFI.getStackID(*FuncInfo->BasePointerSaveIndex) != - TargetStackID::SGPRSpill; - } + Optional FPSaveIndex = FuncInfo->FramePointerSaveIndex; + Optional BPSaveIndex = FuncInfo->BasePointerSaveIndex; if (RoundedSize != 0 && hasFP(MF)) { BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg) @@ -1282,10 +1447,10 @@ } Register ScratchExecCopy; - if (HasFPSaveIndex) { - const int FI = *FuncInfo->FramePointerSaveIndex; + if (FPSaveIndex) { + const int FI = *FPSaveIndex; assert(!MFI.isDeadObjectIndex(FI)); - if (SpillFPToMemory) { + if (spilledToMemory(MF, *FPSaveIndex)) { if (!ScratchExecCopy) ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false); @@ -1314,10 +1479,10 @@ MCCFIInstruction::createDefCfaRegister( nullptr, MCRI->getDwarfRegNum(StackPtrReg, false))); - if (HasBPSaveIndex) { - const int BasePtrFI = *FuncInfo->BasePointerSaveIndex; + if (BPSaveIndex) { + const int BasePtrFI = *BPSaveIndex; assert(!MFI.isDeadObjectIndex(BasePtrFI)); - if (SpillBPToMemory) { + if (spilledToMemory(MF, *BPSaveIndex)) { if (!ScratchExecCopy) ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false); @@ -1366,14 +1531,25 @@ #ifndef NDEBUG static bool allSGPRSpillsAreDead(const MachineFunction &MF) { const MachineFrameInfo &MFI = MF.getFrameInfo(); + const GCNSubtarget &ST = MF.getSubtarget(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); const SIMachineFunctionInfo *FuncInfo = MF.getInfo(); for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd(); I != E; ++I) { if (!MFI.isDeadObjectIndex(I) && - MFI.getStackID(I) == TargetStackID::SGPRSpill && - (I != FuncInfo->FramePointerSaveIndex && - I != FuncInfo->BasePointerSaveIndex)) { - return false; + MFI.getStackID(I) == TargetStackID::SGPRSpill) { + // Found a non dead SGPR spill + if (I != 
FuncInfo->FramePointerSaveIndex && + I != FuncInfo->BasePointerSaveIndex && + (!TRI->isCFISavedRegsSpillEnabled() || + (I != FuncInfo->ReturnAddressSaveIndex && + I != FuncInfo->EXECSaveIndex))) { + // This is a hack. Consider the SGPR spill as dead if it is for the + // FP, BP or RA/EXEC if '-amdgpu-spill-cfi-saved-regs' is enabled. + return false; + } else { + return true; + } } } @@ -1399,14 +1575,14 @@ const SIRegisterInfo *TRI = ST.getRegisterInfo(); SIMachineFunctionInfo *FuncInfo = MF.getInfo(); - FuncInfo->removeDeadFrameIndices(MFI); + FuncInfo->removeDeadFrameIndices(MF); assert(allSGPRSpillsAreDead(MF) && "SGPR spill should have been removed in SILowerSGPRSpills"); // FIXME: The other checks should be redundant with allStackObjectsAreDead, // but currently hasNonSpillStackObjects is set only from source // allocas. Stack temps produced from legalization are not counted currently. - if (!allStackObjectsAreDead(MFI)) { + if (!allStackObjectsAreDead(MF)) { assert(RS && "RegScavenger required if spilling"); if (FuncInfo->isEntryFunction()) { @@ -1422,6 +1598,35 @@ } } +// Find a register/memory location for RA and EXEC saves +static void allocateCFISave(MachineFunction &MF, int &FI, Register Reg) { + SIMachineFunctionInfo *MFI = MF.getInfo(); + const GCNSubtarget &ST = MF.getSubtarget(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); + if (MFI->haveFreeLanesForSGPRSpill(MF, TRI->getSpillSize(*RC) / 4)) { + int NewFI = MF.getFrameInfo().CreateStackObject( + TRI->getSpillSize(*RC), TRI->getSpillAlign(*RC), true, nullptr, + TargetStackID::SGPRSpill); + if (TRI->spillSGPRToVGPR() && MFI->allocateSGPRSpillToVGPR(MF, NewFI)) { + FI = NewFI; + } + } else { + int NewFI = MF.getFrameInfo().CreateStackObject( + TRI->getSpillSize(*RC), TRI->getSpillAlign(*RC), true, nullptr, + TargetStackID::SGPRSpill); + if (TRI->spillSGPRToVGPR() && MFI->allocateSGPRSpillToVGPR(MF, NewFI)) { + FI = NewFI; + } else { + // Remove dead index + MF.getFrameInfo().RemoveStackObject(NewFI); + FI = MF.getFrameInfo().CreateSpillStackObject( + TRI->getSpillSize(*RC), Align(TRI->getSpillAlign(*RC))); + } + } + return; +} + // Only report VGPRs to generic code. void SIFrameLowering::determineCalleeSaves(MachineFunction &MF, BitVector &SavedVGPRs, @@ -1438,6 +1643,13 @@ // Ignore the SGPRs the default implementation found. SavedVGPRs.clearBitsNotInMask(TRI->getAllVGPRRegMask()); + if (TRI->isCFISavedRegsSpillEnabled()) { + allocateCFISave(MF, MFI->ReturnAddressSaveIndex, + TRI->getReturnAddressReg(MF)); + allocateCFISave(MF, MFI->EXECSaveIndex, + ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC); + } + // hasFP only knows about stack objects that already exist. We're now // determining the stack slots that will be created, so we have to predict // them. Stack objects force FP usage with calls. @@ -1446,9 +1658,8 @@ // don't want to report it here. // // FIXME: Is this really hasReservedCallFrame? - const bool WillHaveFP = - FrameInfo.hasCalls() && - (SavedVGPRs.any() || !allStackObjectsAreDead(FrameInfo)); + bool WillHaveFP = FrameInfo.hasCalls() && + (SavedVGPRs.any() || !allStackObjectsAreDead(MF)); // VGPRs used for SGPR spilling need to be specially inserted in the prolog, // so don't allow the default insertion to handle them. 
@@ -1587,6 +1798,7 @@ return MFI.hasVarSizedObjects() || MFI.hasStackMap() || MFI.hasPatchPoint(); } +// Emit the spill instructions for CSRs bool SIFrameLowering::spillCalleeSavedRegisters( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const ArrayRef CSI, const TargetRegisterInfo *TRI) const { @@ -1685,6 +1897,7 @@ } } +// Emit CFI for an SGPR spilled to a single lane of a VGPR void SIFrameLowering::buildCFIForSGPRToVGPRSpill( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, const Register SGPR, const Register VGPR, @@ -1729,6 +1942,60 @@ MCCFIInstruction::createEscape(nullptr, OSCFIInst.str())); } +// Emit CFI for an SGPR spilled to multiple lanes of VGPRs +void SIFrameLowering::buildCFIForSGPRToVGPRSpill( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + const DebugLoc &DL, Register SGPR, + ArrayRef VGPRSpills) const { + MachineFunction &MF = *MBB.getParent(); + const MCRegisterInfo &MCRI = *MF.getMMI().getContext().getRegisterInfo(); + int DwarfSGPR = MCRI.getDwarfRegNum(SGPR, false); + + // CFI for an SGPR spilled to a multiple lanes of VGPRs is implemented as an + // expression(E) rule where E is a composite location description + // with multiple parts each referencing + // VGPR register location storage with a bit offset of the lane index + // multiplied by the size of an SGPR (32 bits). In other words we generate + // the following DWARF: + // + // DW_CFA_expression: , + // (DW_OP_regx ) (DW_OP_bit_piece 32, *32) + // (DW_OP_regx ) (DW_OP_bit_piece 32, *32) + // ... + // (DW_OP_regx ) (DW_OP_bit_piece 32, *32) + // + // The memory location description for the current CFA is pushed on the + // stack before E is evaluated, but we choose not to drop it as it would + // require a longer expression E and DWARF defines the result of the + // evaulation to be the location description on the top of the stack (i.e. the + // implictly pushed one is just ignored.) + SmallString<20> CFIInst; + raw_svector_ostream OSCFIInst(CFIInst); + SmallString<20> Block; + raw_svector_ostream OSBlock(Block); + + OSCFIInst << uint8_t(dwarf::DW_CFA_expression); + encodeULEB128(DwarfSGPR, OSCFIInst); + + // TODO: Detect when we can merge multiple adjacent pieces, or even reduce + // this to a register location description (when all pieces are adjacent). + for (SIMachineFunctionInfo::SpilledReg Spill : VGPRSpills) { + encodeDwarfRegisterLocation(MCRI.getDwarfRegNum(Spill.VGPR, false), + OSBlock); + OSBlock << uint8_t(dwarf::DW_OP_bit_piece); + // FIXME:Can this be a function of the SGPR? 
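+    // Each piece covers 32 bits (one SGPR worth) at a bit offset of
+    // Spill.Lane * 32 within the holding VGPR.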
+ const unsigned SGPRBitSize = 32; + encodeULEB128(SGPRBitSize, OSBlock); + encodeULEB128(SGPRBitSize * Spill.Lane, OSBlock); + } + + encodeULEB128(Block.size(), OSCFIInst); + OSCFIInst << Block; + + buildCFI(MBB, MBBI, DL, + MCCFIInstruction::createEscape(nullptr, OSCFIInst.str())); +} + void SIFrameLowering::buildCFIForVGPRToVMEMSpill( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, unsigned VGPR, int64_t Offset) const { diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -491,6 +491,9 @@ Register SGPRForBPSaveRestoreCopy; Optional BasePointerSaveIndex; + int ReturnAddressSaveIndex; + int EXECSaveIndex; + Register VGPRReservedForSGPRSpill; bool isCalleeSavedReg(const MCPhysReg *CSRegs, MCPhysReg Reg); @@ -536,7 +539,7 @@ bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI); bool reserveVGPRforSGPRSpills(MachineFunction &MF); bool allocateVGPRSpillToAGPR(MachineFunction &MF, int FI, bool isAGPRtoVGPR); - void removeDeadFrameIndices(MachineFrameInfo &MFI); + void removeDeadFrameIndices(MachineFunction &MF); bool hasCalculatedTID() const { return TIDReg != 0; }; Register getTIDReg() const { return TIDReg; }; diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -420,10 +420,16 @@ return Spill.FullyAllocated; } -void SIMachineFunctionInfo::removeDeadFrameIndices(MachineFrameInfo &MFI) { - // The FP & BP spills haven't been inserted yet, so keep them around. +// Remove the dead spill locations +void SIMachineFunctionInfo::removeDeadFrameIndices(MachineFunction &MF) { + MachineFrameInfo &MFI = MF.getFrameInfo(); + const GCNSubtarget &ST = MF.getSubtarget(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + // RA, EXEC, FP & BP spills haven't been inserted yet, so keep them around. for (auto &R : SGPRToVGPRSpills) { - if (R.first != FramePointerSaveIndex && R.first != BasePointerSaveIndex) + if (R.first != FramePointerSaveIndex && R.first != BasePointerSaveIndex && + (!TRI->isCFISavedRegsSpillEnabled() || + (R.first != ReturnAddressSaveIndex && R.first != EXECSaveIndex))) MFI.RemoveStackObject(R.first); } @@ -431,7 +437,9 @@ // ID. for (int i = MFI.getObjectIndexBegin(), e = MFI.getObjectIndexEnd(); i != e; ++i) - if (i != FramePointerSaveIndex && i != BasePointerSaveIndex) + if (i != FramePointerSaveIndex && i != BasePointerSaveIndex && + (!TRI->isCFISavedRegsSpillEnabled() || + (i != ReturnAddressSaveIndex && i != EXECSaveIndex))) MFI.setStackID(i, TargetStackID::Default); for (auto &R : VGPRToAGPRSpills) { diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -56,6 +56,8 @@ return SpillSGPRToVGPR; } + bool isCFISavedRegsSpillEnabled() const; + /// Return the end register initially reserved for the scratch buffer in case /// spilling is needed. 
MCRegister reservedPrivateSegmentBufferReg(const MachineFunction &MF) const; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -33,6 +33,11 @@ cl::ReallyHidden, cl::init(true)); +static cl::opt EnableSpillCFISavedRegs( + "amdgpu-spill-cfi-saved-regs", + cl::desc("Enable spilling the registers required for CFI emission"), + cl::ReallyHidden, cl::init(false), cl::ZeroOrMore); + std::array, 16> SIRegisterInfo::RegSplitParts; std::array, 9> SIRegisterInfo::SubRegFromChannelTable; @@ -194,6 +199,10 @@ return SubRegFromChannelTable[NumRegIndex - 1][Channel]; } +bool SIRegisterInfo::isCFISavedRegsSpillEnabled() const { + return EnableSpillCFISavedRegs; +} + MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg( const MachineFunction &MF) const { unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4; diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-spill-cfi-saved-regs.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-spill-cfi-saved-regs.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-spill-cfi-saved-regs.ll @@ -0,0 +1,170 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-spill-cfi-saved-regs -verify-machineinstrs -o - %s | FileCheck --check-prefixes=CHECK,WAVE64 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-spill-cfi-saved-regs -verify-machineinstrs -o - %s | FileCheck --check-prefixes=CHECK,WAVE32 %s + +; CHECK-LABEL: kern: +; CHECK: .cfi_startproc +; CHECK-NOT: .cfi_{{.*}} +; CHECK: %bb.0: +; CHECK-NEXT: .cfi_escape 0x0f, 0x03, 0x30, 0x36, 0xe1 +; CHECK-NEXT: .cfi_undefined 16 +; CHECK-NOT: .cfi_{{.*}} +; CHECK: .cfi_endproc +define protected amdgpu_kernel void @kern() #0 { +entry: + ret void +} + +; CHECK-LABEL: func_saved_in_clobbered_vgpr: +; CHECK: .cfi_startproc +; CHECK-NOT: .cfi_{{.*}} +; CHECK: %bb.0: +; SGPR32 = 64 +; CHECK-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 +; CHECK-NEXT: .cfi_escape 0x10, 0x10, 0x08, 0x90, 0x3e, 0x93, 0x04, 0x90, 0x3f, 0x93, 0x04 + + +; FIXME: ideally this would not care what VGPR we spill to, but since we are +; using .cfi_escape it isn't trivial/possible to make this general yet + +; CHECK: v_writelane_b32 v0, s30, 0 +; CHECK-NEXT: v_writelane_b32 v0, s31, 1 + +; DW_CFA_expression [0x10] +; PC_64 ULEB128(17)=[0x10] +; BLOCK_LENGTH ULEB128(12)=[0x0c] +; DW_OP_regx [0x90] +; VGPR0_wave64 ULEB128(2560)=[0x80, 0x14] +; DW_OP_bit_piece [0x9d] +; PIECE_SIZE [0x20] +; PIECE_OFFSET [0x00] +; DW_OP_regx [0x90] +; VGPR0_wave64 ULEB128(2560)=[0x80, 0x14] +; DW_OP_bit_piece [0x9d] +; PIECE_SIZE [0x20] +; PIECE_OFFSET [0x20] +; WAVE64-NEXT: .cfi_escape 0x10, 0x10, 0x0c, 0x90, 0x80, 0x14, 0x9d, 0x20, 0x00, 0x90, 0x80, 0x14, 0x9d, 0x20, 0x20 + +; DW_CFA_expression [0x10] +; PC_64 ULEB128(17)=[0x10] +; BLOCK_LENGTH ULEB128(12)=[0x0c] +; DW_OP_regx [0x90] +; VGPR0_wave32 ULEB128(1536)=[0x80, 0x0c] +; DW_OP_bit_piece [0x9d] +; PIECE_SIZE [0x20] +; PIECE_OFFSET [0x00] +; DW_OP_regx [0x90] +; VGPR0_wave32 ULEB128(1536)=[0x80, 0x0c] +; DW_OP_bit_piece [0x9d] +; PIECE_SIZE [0x20] +; PIECE_OFFSET [0x20] +; WAVE32-NEXT: .cfi_escape 0x10, 0x10, 0x0c, 0x90, 0x80, 0x0c, 0x9d, 0x20, 0x00, 0x90, 0x80, 0x0c, 0x9d, 0x20, 0x20 + + +; WAVE64: v_writelane_b32 v0, exec_lo, 2 +; WAVE64-NEXT: v_writelane_b32 v0, exec_hi, 3 +; DW_CFA_expression [0x10] +; EXEC_MASK_wave64 ULEB128(17)=[0x11] +; BLOCK_LENGTH ULEB128(12)=[0x0c] +; DW_OP_regx [0x90] +; VGPR0_wave64 
ULEB128(2560)=[0x80, 0x14] +; DW_OP_bit_piece [0x9d] +; PIECE_SIZE [0x20] +; PIECE_OFFSET [0x40] +; DW_OP_regx [0x90] +; VGPR0_wave64 ULEB128(2560)=[0x80, 0x14] +; DW_OP_bit_piece [0x9d] +; PIECE_SIZE [0x20] +; PIECE_OFFSET [0x60] +; WAVE64-NEXT: .cfi_escape 0x10, 0x11, 0x0c, 0x90, 0x80, 0x14, 0x9d, 0x20, 0x40, 0x90, 0x80, 0x14, 0x9d, 0x20, 0x60 + +; WAVE32: v_writelane_b32 v0, exec_lo, 2 +; DW_CFA_expression [0x10] +; EXEC_MASK_wave32 ULEB128(1)=[0x01] +; BLOCK_LENGTH ULEB128(6)=[0x06] +; DW_OP_regx [0x90] +; VGPR0_wave32 ULEB128(1536)=[0x80, 0x0c] +; DW_OP_bit_piece [0x9d] +; PIECE_SIZE [0x20] +; PIECE_OFFSET [0x40] +; WAVE32-NEXT: .cfi_escape 0x10, 0x01, 0x06, 0x90, 0x80, 0x0c, 0x9d, 0x20, 0x40 + +; CHECK-NOT: .cfi_{{.*}} +; CHECK: .cfi_endproc +define hidden void @func_saved_in_clobbered_vgpr() #0 { +entry: + ret void +} + +; Check that the option causes a CSR VGPR to spill when needed. + +; CHECK-LABEL: func_saved_in_preserved_vgpr: +; CHECK: %bb.0: + +; CHECK: s_or_saveexec_b{{(32|64)}} +; CHECK: buffer_store_dword [[CSR:v[0-9]+]], off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK: s_mov_b{{(32|64)}} {{(exec|exec_lo)}}, + +; CHECK: v_writelane_b32 [[CSR]], s30, {{[0-9]+}} +; CHECK-NEXT: v_writelane_b32 [[CSR]], s31, {{[0-9]+}} + +; WAVE64: v_writelane_b32 [[CSR]], exec_lo, {{[0-9]+}} +; WAVE64-NEXT: v_writelane_b32 [[CSR]], exec_hi, {{[0-9]+}} + +; WAVE32: v_writelane_b32 [[CSR]], exec_lo, {{[0-9]+}} + +define hidden void @func_saved_in_preserved_vgpr() #0 { +entry: + call void asm sideeffect "; clobber nonpreserved VGPRs", + "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9} + ,~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19} + ,~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29} + ,~{v30},~{v31},~{v32},~{v33},~{v34},~{v35},~{v36},~{v37},~{v38},~{v39}"() + ret void +} + +; There's no return here, so the return address live in was +; deleted. It needs to be re-added as a live in to the entry block. +; CHECK-LABEL: {{^}}empty_func: +; CHECK: v_writelane_b32 v0, s30, 0 +; CHECK: v_writelane_b32 v0, s31, 1 +define void @empty_func() { + unreachable +} + +; Check that the option causes RA and EXEC to be spilled to memory. 
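+; The "amdgpu-waves-per-eu"="10,10" attribute restricts the VGPR budget and the
+; inline asm clobbers the VGPRs that remain, so no free lanes are available and
+; the RA/EXEC saves have to go to scratch memory, as the buffer_store_dword
+; instructions checked below show.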
+ +; CHECK-LABEL: no_vgprs_to_spill_into: +; CHECK: %bb.0: + +; WAVE64: s_or_saveexec_b64 s[4:5], -1 +; WAVE64-NEXT: v_mov_b32_e32 v0, s30 +; WAVE64-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; WAVE64-NEXT: v_mov_b32_e32 v0, s31 +; WAVE64-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_offset 16, 0 +; WAVE64-NEXT: v_mov_b32_e32 v0, s4 +; WAVE64-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; WAVE64-NEXT: v_mov_b32_e32 v0, s5 +; WAVE64-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; WAVE64-NEXT: .cfi_offset 17, 512 +; WAVE64-NEXT: s_mov_b64 exec, s[4:5] + +define void @no_vgprs_to_spill_into() #1 { + call void asm sideeffect "", + "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9} + ,~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19} + ,~{v20},~{v21},~{v22},~{v23},~{v24}"() + + ret void +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind "amdgpu-waves-per-eu"="10,10" } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, emissionKind: FullDebug) +!1 = !DIFile(filename: "filename", directory: "directory") +!2 = !{i32 7, !"Dwarf Version", i32 4} +!3 = !{i32 2, !"Debug Info Version", i32 3} diff --git a/llvm/test/CodeGen/AMDGPU/debug-frame.ll b/llvm/test/CodeGen/AMDGPU/debug-frame.ll --- a/llvm/test/CodeGen/AMDGPU/debug-frame.ll +++ b/llvm/test/CodeGen/AMDGPU/debug-frame.ll @@ -522,102 +522,6 @@ ret void } -; CHECK-LABEL: func_spill_vgpr_to_vmem: -; CHECK: .cfi_startproc - -; CHECK-NOT: .cfi_{{.*}} - -; CHECK: %bb.0: -; SGPR32 = 64 -; CHECK-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6 -; CHECK-NEXT: .cfi_escape 0x10, 0x10, 0x08, 0x90, 0x3e, 0x93, 0x04, 0x90, 0x3f, 0x93, 0x04 - -; CHECK-NOT: .cfi_{{.*}} - -; CHECK: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill - -; DW_CFA_expression [0x10] -; VGPR40_wave64 ULEB128(1576)=[0xa8, 0x14] -; BLOCK_LENGTH ULEB128(14)=[0x0e] -; DW_OP_regx [0x90] -; VGPR40_wave64 ULEB128(1576)=[0xa8, 0x14] -; DW_OP_swap [0x16] -; DW_OP_LLVM_offset_uconst [0xe4] -; OFFSET ULEB128(256)=[0x80, 0x02] -; DW_OP_LLVM_call_frame_entry_reg [0xe6] -; EXEC_MASK_wave64 ULEB128(17)=[0x11] -; DW_OP_deref_size [0x94] -; SIZE [0x08] -; DW_OP_LLVM_select_bit_piece [0xec] -; ELEMENT_SIZE [0x20] -; ELEMENT_COUNT [0x40] -; WAVE64-NEXT: .cfi_escape 0x10, 0xa8, 0x14, 0x0e, 0x90, 0xa8, 0x14, 0x16, 0xe4, 0x80, 0x02, 0xe6, 0x11, 0x94, 0x08, 0xec, 0x20, 0x40 - -; DW_CFA_expression [0x10] -; VGPR40_wave32 ULEB128(1576)=[0xa8, 0x0c] -; BLOCK_LENGTH ULEB128(14)=[0x0e] -; DW_OP_regx [0x90] -; VGPR40_wave32 ULEB128(1576)=[0xa8, 0x0c] -; DW_OP_swap [0x16] -; DW_OP_LLVM_offset_uconst [0xe4] -; OFFSET ULEB128(128)=[0x80, 0x01] -; DW_OP_LLVM_call_frame_entry_reg [0xe6] -; EXEC_MASK_wave32 ULEB128(1)=[0x01] -; DW_OP_deref_size [0x94] -; SIZE [0x04] -; DW_OP_LLVM_select_bit_piece [0xec] -; ELEMENT_SIZE [0x20] -; ELEMENT_COUNT [0x20] -; WAVE32-NEXT: .cfi_escape 0x10, 0xa8, 0x0c, 0x0e, 0x90, 0xa8, 0x0c, 0x16, 0xe4, 0x80, 0x01, 0xe6, 0x01, 0x94, 0x04, 0xec, 0x20, 0x20 - -; CHECK-NOT: .cfi_{{.*}} - -; CHECK: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill - -; DW_CFA_expression [0x10] -; VGPR41_wave64 ULEB128(2601)=[0xa9, 0x14] -; BLOCK_LENGTH ULEB128(13)=[0x0d] -; DW_OP_regx [0x90] -; VGPR41_wave64 ULEB128(2601)=[0xa9, 0x14] -; DW_OP_swap [0x16] -; DW_OP_LLVM_offset_uconst [0xe4] -; OFFSET ULEB128(0)=[0x00] -; 
DW_OP_LLVM_call_frame_entry_reg [0xe6] -; EXEC_MASK_wave64 ULEB128(17)=[0x11] -; DW_OP_deref_size [0x94] -; SIZE [0x08] -; DW_OP_LLVM_select_bit_piece [0xec] -; ELEMENT_SIZE [0x20] -; ELEMENT_COUNT [0x40] -; WAVE64-NEXT: .cfi_escape 0x10, 0xa9, 0x14, 0x0d, 0x90, 0xa9, 0x14, 0x16, 0xe4, 0x00, 0xe6, 0x11, 0x94, 0x08, 0xec, 0x20, 0x40 - -; DW_CFA_expression [0x10] -; VGPR41_wave32 ULEB128(1577)=[0xa9, 0x0c] -; BLOCK_LENGTH ULEB128(13)=[0x0d] -; DW_OP_regx [0x90] -; VGPR41_wave32 ULEB128(1577)=[0xa9, 0x0c] -; DW_OP_swap [0x16] -; DW_OP_LLVM_offset_uconst [0xe4] -; OFFSET ULEB128(0)=[0x00] -; DW_OP_LLVM_call_frame_entry_reg [0xe6] -; EXEC_MASK_wave32 ULEB128(1)=[0x01] -; DW_OP_deref_size [0x94] -; SIZE [0x04] -; DW_OP_LLVM_select_bit_piece [0xec] -; ELEMENT_SIZE [0x20] -; ELEMENT_COUNT [0x20] -; WAVE32-NEXT: .cfi_escape 0x10, 0xa9, 0x0c, 0x0d, 0x90, 0xa9, 0x0c, 0x16, 0xe4, 0x00, 0xe6, 0x01, 0x94, 0x04, 0xec, 0x20, 0x20 - -; CHECK-NOT: .cfi_{{.*}} - -; CHECK: .cfi_endproc -define hidden void @func_spill_vgpr_to_vmem() #0 { -entry: - call void asm sideeffect "; clobber", "~{v40}"() #0 - call void asm sideeffect "; clobber", "~{v41}"() #0 - ret void -} - ; NOTE: Number of VGPRs available to kernel, and in turn number of corresponding CFIs generated, ; is dependent on waves/WG size. Since the intent here is to check whether we generate the correct ; CFIs, doing it for any one set of details is sufficient which also makes the test insensitive to diff --git a/llvm/test/CodeGen/AMDGPU/pei-cfi-saves-bug.ll b/llvm/test/CodeGen/AMDGPU/pei-cfi-saves-bug.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/pei-cfi-saves-bug.ll @@ -0,0 +1,113 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-spill-cfi-saved-regs < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-cfi-saved-regs < %s | FileCheck %s + +; Function Attrs: noinline optnone +define fastcc void @tail_callee() #2 { +; CHECK-LABEL: tail_callee: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-NEXT: v_writelane_b32 v0, s30, 0 +; CHECK-NEXT: v_writelane_b32 v0, s31, 1 +; CHECK-NEXT: v_writelane_b32 v0, exec_lo, 2 +; CHECK-NEXT: v_writelane_b32 v0, exec_hi, 3 +; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1 +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] + ret void +} + +; Function Attrs: noinline +define fastcc void @callee_no_fp() #0 { +; CHECK-LABEL: callee_no_fp: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1 +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-NEXT: v_writelane_b32 v1, s30, 0 +; CHECK-NEXT: v_writelane_b32 v1, s31, 1 +; CHECK-NEXT: v_writelane_b32 v1, exec_lo, 2 +; CHECK-NEXT: v_writelane_b32 v1, exec_hi, 3 +; CHECK-NEXT: v_writelane_b32 v1, s33, 4 +; CHECK-NEXT: s_mov_b32 s33, s32 +; CHECK-NEXT: s_add_u32 s32, s32, 0x400 +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, tail_callee@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, tail_callee@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 
0x0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] +entry: + tail call fastcc void @tail_callee() #3 + unreachable +} + +define protected amdgpu_kernel void @kernel() #1 { +; CHECK-LABEL: kernel: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_add_u32 flat_scratch_lo, s4, s7 +; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 +; CHECK-NEXT: s_add_u32 s0, s0, s7 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: s_cbranch_scc0 BB2_2 +; CHECK-NEXT: ; %bb.1: ; %end +; CHECK-NEXT: s_endpgm +; CHECK-NEXT: BB2_2: ; %body +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, callee_no_fp@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, callee_no_fp@gotpcrel32@hi+12 +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] +entry: + br i1 undef, label %end, label %body + +body: ; preds = %entry + tail call fastcc void @callee_no_fp() #3 + unreachable + +end: ; preds = %entry + ret void +} + +; When we have calls, spilling a CSR VGPR for CFI saves should force FP usage +; Function Attrs: noinline +define dso_local fastcc void @func_needs_fp() unnamed_addr #0 { +; CHECK-LABEL: func_needs_fp: +; CHECK: func_needs_fp$local: +; CHECK-NEXT: ; %bb.0: ; %entry +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_or_saveexec_b64 s[4:5], -1 +; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[4:5] +; CHECK-NEXT: v_writelane_b32 v40, s30, 0 +; CHECK-NEXT: v_writelane_b32 v40, s31, 1 +; CHECK-NEXT: v_writelane_b32 v40, exec_lo, 2 +; CHECK-NEXT: v_writelane_b32 v40, exec_hi, 3 +; CHECK-NEXT: v_writelane_b32 v40, s33, 4 +; CHECK-NEXT: s_mov_b32 s33, s32 +; CHECK-NEXT: s_add_u32 s32, s32, 0x400 +; CHECK-NEXT: s_getpc_b64 s[4:5] +; CHECK-NEXT: s_add_u32 s4, s4, tail_callee_fp@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s5, s5, tail_callee_fp@rel32@hi+12 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] +entry: + tail call fastcc void @tail_callee_fp() #3 + unreachable +} + +; Function Attrs: noinline optnone +declare dso_local fastcc void @tail_callee_fp() unnamed_addr #2 + +attributes #0 = { noinline } +attributes #1 = { "use-soft-float"="false" } +attributes #2 = { noinline optnone } +attributes #3 = { convergent nounwind } +