diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -54,6 +54,11 @@
                               MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MI) const override;
 
+  bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MBBI,
+                                 const ArrayRef<CalleeSavedInfo> CSI,
+                                 const TargetRegisterInfo *TRI) const override;
+
 private:
   void emitEntryFunctionFlatScratchInit(MachineFunction &MF,
                                         MachineBasicBlock &MBB,
@@ -85,6 +90,12 @@
                                   MachineBasicBlock::iterator MBBI,
                                   const DebugLoc &DL, const Register SGPR,
                                   const Register VGPR, const int Lane) const;
+  /// Create a CFI index describing a spill of a VGPR to VMEM and
+  /// build a MachineInstr around it.
+  void buildCFIForVGPRToVMEMSpill(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator MBBI,
+                                  const DebugLoc &DL, unsigned VGPR,
+                                  int64_t Offset) const;
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -1377,6 +1377,30 @@
   return MBB.erase(I);
 }
 
+bool SIFrameLowering::spillCalleeSavedRegisters(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+    const ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
+  MachineFunction &MF = *MBB.getParent();
+  const SIInstrInfo *TII = MF.getSubtarget<GCNSubtarget>().getInstrInfo();
+
+  for (const CalleeSavedInfo &CS : CSI) {
+    // Insert the spill to the stack frame.
+    unsigned Reg = CS.getReg();
+
+    if (CS.isSpilledToReg()) {
+      BuildMI(MBB, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY),
+              CS.getDstReg())
+          .addReg(Reg, getKillRegState(true));
+    } else {
+      const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+      TII->storeRegToStackSlotCFI(MBB, MBBI, Reg, true, CS.getFrameIdx(), RC,
+                                  TRI);
+    }
+  }
+
+  return true;
+}
+
 bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
   const MachineFrameInfo &MFI = MF.getFrameInfo();
 
@@ -1461,3 +1485,42 @@
   buildCFI(MBB, MBBI, DL,
            MCCFIInstruction::createEscape(nullptr, OSCFIInst.str()));
 }
+
+void SIFrameLowering::buildCFIForVGPRToVMEMSpill(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+    const DebugLoc &DL, unsigned VGPR, int64_t Offset) const {
+  MachineFunction &MF = *MBB.getParent();
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const MCRegisterInfo &MCRI = *MF.getMMI().getContext().getRegisterInfo();
+  int DwarfVGPR = MCRI.getDwarfRegNum(VGPR, false);
+
+  SmallString<20> CFIInst;
+  raw_svector_ostream OSCFIInst(CFIInst);
+  SmallString<20> Block;
+  raw_svector_ostream OSBlock(Block);
+
+  OSCFIInst << uint8_t(dwarf::DW_CFA_expression);
+  encodeULEB128(DwarfVGPR, OSCFIInst);
+
+  encodeDwarfRegisterLocation(DwarfVGPR, OSBlock);
+  OSBlock << uint8_t(dwarf::DW_OP_swap);
+  OSBlock << uint8_t(dwarf::DW_OP_LLVM_offset_uconst);
+  encodeULEB128(Offset, OSBlock);
+  OSBlock << uint8_t(dwarf::DW_OP_LLVM_call_frame_entry_reg);
+  encodeULEB128(MCRI.getDwarfRegNum(
+                    ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC, false),
+                OSBlock);
+  OSBlock << uint8_t(dwarf::DW_OP_deref_size);
+  OSBlock << uint8_t(ST.getWavefrontSize() / CHAR_BIT);
+  OSBlock << uint8_t(dwarf::DW_OP_LLVM_select_bit_piece);
+  // FIXME: Can this be a function of the VGPR?
+  const unsigned VGPRLaneBitSize = 32;
+  encodeULEB128(VGPRLaneBitSize, OSBlock);
+  encodeULEB128(ST.getWavefrontSize(), OSBlock);
+
+  encodeULEB128(Block.size(), OSCFIInst);
+  OSCFIInst << Block;
+
+  buildCFI(MBB, MBBI, DL,
+           MCCFIInstruction::createEscape(nullptr, OSCFIInst.str()));
+}
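For reference, the escape sequence built above can be reproduced outside of LLVM. Below is a minimal standalone sketch (not part of the patch; the helper and the hard-coded numbers are illustrative) that assembles the same DW_CFA_expression byte string for v40 spilled at a byte offset of 256 in a wave64 function. The AMD vendor opcode values (0xe4, 0xe6, 0xec) and the DWARF register numbers (v40 = 2600 and EXEC = 17 in the wave64 mapping) are taken from the test expectations in debug-frame.ll further down; encodeULEB128 here is a local stand-in for llvm/Support/LEB128.h.

#include <cstdint>
#include <cstdio>
#include <string>

// Local stand-in for llvm/Support/LEB128.h's encodeULEB128.
static void encodeULEB128(uint64_t Value, std::string &Out) {
  do {
    uint8_t Byte = Value & 0x7f;
    Value >>= 7;
    if (Value != 0)
      Byte |= 0x80; // more bytes follow
    Out.push_back((char)Byte);
  } while (Value != 0);
}

int main() {
  const uint64_t DwarfVGPR = 2600; // v40 in the wave64 DWARF mapping
  const uint64_t DwarfEXEC = 17;   // EXEC in the wave64 DWARF mapping
  const uint64_t Offset = 256;     // 4-byte slot scaled by 64 lanes

  // The expression block: regx VGPR; swap; offset_uconst; EXEC as the
  // call-frame-entry register; deref 8 bytes; select 32-bit lanes x 64.
  std::string Block;
  Block.push_back((char)0x90); // DW_OP_regx
  encodeULEB128(DwarfVGPR, Block);
  Block.push_back((char)0x16); // DW_OP_swap
  Block.push_back((char)0xe4); // DW_OP_LLVM_offset_uconst (vendor)
  encodeULEB128(Offset, Block);
  Block.push_back((char)0xe6); // DW_OP_LLVM_call_frame_entry_reg (vendor)
  encodeULEB128(DwarfEXEC, Block);
  Block.push_back((char)0x94); // DW_OP_deref_size
  Block.push_back((char)8);    // wavefront size in bytes
  Block.push_back((char)0xec); // DW_OP_LLVM_select_bit_piece (vendor)
  encodeULEB128(32, Block);    // lane (element) size in bits
  encodeULEB128(64, Block);    // element count == wavefront size

  // Wrap the block in DW_CFA_expression <reg> <length>.
  std::string CFI;
  CFI.push_back((char)0x10); // DW_CFA_expression
  encodeULEB128(DwarfVGPR, CFI);
  encodeULEB128(Block.size(), CFI);
  CFI += Block;

  for (unsigned char C : CFI)
    printf("0x%02x, ", C);
  printf("\n");
}

Compiled and run, this prints 0x10, 0xa8, 0x14, 0x0e, 0x90, 0xa8, 0x14, 0x16, 0xe4, 0x80, 0x02, 0xe6, 0x11, 0x94, 0x08, 0xec, 0x20, 0x40, which is exactly the WAVE64 .cfi_escape sequence the test checks for v40.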
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -222,12 +222,27 @@
                              MachineBasicBlock::iterator I, const DebugLoc &DL,
                              Register SrcReg, int Value) const;
 
+private:
+  void storeRegToStackSlotImpl(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator MI, Register SrcReg,
+                               bool isKill, int FrameIndex,
+                               const TargetRegisterClass *RC,
+                               const TargetRegisterInfo *TRI,
+                               bool NeedsCFI) const;
+
+public:
   void storeRegToStackSlot(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator MI, Register SrcReg,
                            bool isKill, int FrameIndex,
                            const TargetRegisterClass *RC,
                            const TargetRegisterInfo *TRI) const override;
 
+  void storeRegToStackSlotCFI(MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator MI, Register SrcReg,
+                              bool isKill, int FrameIndex,
+                              const TargetRegisterClass *RC,
+                              const TargetRegisterInfo *TRI) const;
+
   void loadRegFromStackSlot(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator MI, Register DestReg,
                             int FrameIndex, const TargetRegisterClass *RC,
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1161,51 +1161,63 @@
   return get(getIndirectVGPRWritePseudoOpc(VecSize));
 }
 
-static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
+static unsigned getSGPRSpillSaveOpcode(unsigned Size, bool NeedsCFI) {
   switch (Size) {
   case 4:
-    return AMDGPU::SI_SPILL_S32_SAVE;
+    return NeedsCFI ? AMDGPU::SI_SPILL_S32_CFI_SAVE : AMDGPU::SI_SPILL_S32_SAVE;
   case 8:
-    return AMDGPU::SI_SPILL_S64_SAVE;
+    return NeedsCFI ? AMDGPU::SI_SPILL_S64_CFI_SAVE : AMDGPU::SI_SPILL_S64_SAVE;
   case 12:
-    return AMDGPU::SI_SPILL_S96_SAVE;
+    return NeedsCFI ? AMDGPU::SI_SPILL_S96_CFI_SAVE : AMDGPU::SI_SPILL_S96_SAVE;
   case 16:
-    return AMDGPU::SI_SPILL_S128_SAVE;
+    return NeedsCFI ? AMDGPU::SI_SPILL_S128_CFI_SAVE
+                    : AMDGPU::SI_SPILL_S128_SAVE;
   case 20:
-    return AMDGPU::SI_SPILL_S160_SAVE;
+    return NeedsCFI ? AMDGPU::SI_SPILL_S160_CFI_SAVE
+                    : AMDGPU::SI_SPILL_S160_SAVE;
   case 24:
-    return AMDGPU::SI_SPILL_S192_SAVE;
+    return NeedsCFI ? AMDGPU::SI_SPILL_S192_CFI_SAVE
+                    : AMDGPU::SI_SPILL_S192_SAVE;
   case 32:
-    return AMDGPU::SI_SPILL_S256_SAVE;
+    return NeedsCFI ? AMDGPU::SI_SPILL_S256_CFI_SAVE
+                    : AMDGPU::SI_SPILL_S256_SAVE;
   case 64:
-    return AMDGPU::SI_SPILL_S512_SAVE;
+    return NeedsCFI ? AMDGPU::SI_SPILL_S512_CFI_SAVE
+                    : AMDGPU::SI_SPILL_S512_SAVE;
   case 128:
-    return AMDGPU::SI_SPILL_S1024_SAVE;
+    return NeedsCFI ? AMDGPU::SI_SPILL_S1024_CFI_SAVE
+                    : AMDGPU::SI_SPILL_S1024_SAVE;
   default:
     llvm_unreachable("unknown register size");
   }
 }
 
-static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
+static unsigned getVGPRSpillSaveOpcode(unsigned Size, bool NeedsCFI) {
   switch (Size) {
   case 4:
-    return AMDGPU::SI_SPILL_V32_SAVE;
+    return NeedsCFI ? AMDGPU::SI_SPILL_V32_CFI_SAVE : AMDGPU::SI_SPILL_V32_SAVE;
   case 8:
-    return AMDGPU::SI_SPILL_V64_SAVE;
+    return NeedsCFI ? AMDGPU::SI_SPILL_V64_CFI_SAVE : AMDGPU::SI_SPILL_V64_SAVE;
   case 12:
-    return AMDGPU::SI_SPILL_V96_SAVE;
+    return NeedsCFI ? AMDGPU::SI_SPILL_V96_CFI_SAVE : AMDGPU::SI_SPILL_V96_SAVE;
   case 16:
-    return AMDGPU::SI_SPILL_V128_SAVE;
+    return NeedsCFI ? AMDGPU::SI_SPILL_V128_CFI_SAVE
+                    : AMDGPU::SI_SPILL_V128_SAVE;
   case 20:
-    return AMDGPU::SI_SPILL_V160_SAVE;
+    return NeedsCFI ? AMDGPU::SI_SPILL_V160_CFI_SAVE
+                    : AMDGPU::SI_SPILL_V160_SAVE;
   case 24:
-    return AMDGPU::SI_SPILL_V192_SAVE;
+    return NeedsCFI ? AMDGPU::SI_SPILL_V192_CFI_SAVE
+                    : AMDGPU::SI_SPILL_V192_SAVE;
   case 32:
-    return AMDGPU::SI_SPILL_V256_SAVE;
+    return NeedsCFI ? AMDGPU::SI_SPILL_V256_CFI_SAVE
+                    : AMDGPU::SI_SPILL_V256_SAVE;
   case 64:
-    return AMDGPU::SI_SPILL_V512_SAVE;
+    return NeedsCFI ? AMDGPU::SI_SPILL_V512_CFI_SAVE
+                    : AMDGPU::SI_SPILL_V512_SAVE;
   case 128:
-    return AMDGPU::SI_SPILL_V1024_SAVE;
+    return NeedsCFI ? AMDGPU::SI_SPILL_V1024_CFI_SAVE
+                    : AMDGPU::SI_SPILL_V1024_SAVE;
   default:
     llvm_unreachable("unknown register size");
   }
@@ -1228,12 +1240,10 @@
   }
 }
 
-void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
-                                      MachineBasicBlock::iterator MI,
-                                      Register SrcReg, bool isKill,
-                                      int FrameIndex,
-                                      const TargetRegisterClass *RC,
-                                      const TargetRegisterInfo *TRI) const {
+void SIInstrInfo::storeRegToStackSlotImpl(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
+    bool isKill, int FrameIndex, const TargetRegisterClass *RC,
+    const TargetRegisterInfo *TRI, bool NeedsCFI) const {
   MachineFunction *MF = MBB.getParent();
   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
@@ -1254,7 +1264,8 @@
 
   // We are only allowed to create one new instruction when spilling
   // registers, so we need to use pseudo instruction for spilling SGPRs.
-  const MCInstrDesc &OpDesc = get(getSGPRSpillSaveOpcode(SpillSize));
+  const MCInstrDesc &OpDesc =
+      get(getSGPRSpillSaveOpcode(SpillSize, NeedsCFI));
 
   // The SGPR spill/restore instructions only work on number sgprs, so we need
   // to make sure we are using the correct register class.
@@ -1277,8 +1288,9 @@
     return;
   }
 
-  unsigned Opcode = RI.hasAGPRs(RC) ? getAGPRSpillSaveOpcode(SpillSize)
-                                    : getVGPRSpillSaveOpcode(SpillSize);
+  unsigned Opcode = RI.hasAGPRs(RC)
+                        ? getAGPRSpillSaveOpcode(SpillSize)
+                        : getVGPRSpillSaveOpcode(SpillSize, NeedsCFI);
   MFI->setHasSpilledVGPRs();
 
   auto MIB = BuildMI(MBB, MI, DL, get(Opcode));
@@ -1295,6 +1307,24 @@
       .addMemOperand(MMO);
 }
 
+void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+                                      MachineBasicBlock::iterator MI,
+                                      Register SrcReg, bool isKill,
+                                      int FrameIndex,
+                                      const TargetRegisterClass *RC,
+                                      const TargetRegisterInfo *TRI) const {
+  storeRegToStackSlotImpl(MBB, MI, SrcReg, isKill, FrameIndex, RC, TRI, false);
+}
+
+void SIInstrInfo::storeRegToStackSlotCFI(MachineBasicBlock &MBB,
+                                         MachineBasicBlock::iterator MI,
+                                         Register SrcReg, bool isKill,
+                                         int FrameIndex,
+                                         const TargetRegisterClass *RC,
+                                         const TargetRegisterInfo *TRI) const {
+  storeRegToStackSlotImpl(MBB, MI, SrcReg, isKill, FrameIndex, RC, TRI, true);
+}
+
 static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
   switch (Size) {
   case 4:
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -645,6 +645,13 @@
     let mayLoad = 0;
   }
 
+  def _CFI_SAVE : PseudoInstSI <
+    (outs),
+    (ins sgpr_class:$data, i32imm:$addr)> {
+    let mayStore = 1;
+    let mayLoad = 0;
+  }
+
   def _RESTORE : PseudoInstSI <
     (outs sgpr_class:$data),
     (ins i32imm:$addr)> {
@@ -683,6 +690,18 @@
     let Size = !if(!le(MaxSize, 256), MaxSize, 252);
   }
 
+  def _CFI_SAVE : VPseudoInstSI <
+    (outs),
+    (ins vgpr_class:$vdata, i32imm:$vaddr, SReg_128:$srsrc,
+         SReg_32:$soffset, i32imm:$offset)> {
+    let mayStore = 1;
+    let mayLoad = 0;
+    // (2 * 4) + (8 * num_subregs) bytes maximum
+    int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8);
+    // Size field is unsigned char and cannot fit more.
+    let Size = !if(!le(MaxSize, 256), MaxSize, 252);
+  }
+
   def _RESTORE : VPseudoInstSI <
     (outs vgpr_class:$vdata),
     (ins i32imm:$vaddr, SReg_128:$srsrc, SReg_32:$soffset,
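As a quick cross-check of the MaxSize arithmetic in the VGPR _CFI_SAVE pseudo above, assuming vgpr_class.Size is the register width in bits (so !srl(Size, 5) is the number of 32-bit subregisters), here is a small C++ mirror of the TableGen formula with a few spot checks; the helper name is ours, not the patch's:

// (2 * 4) + (8 * num_subregs) bytes, clamped to 252 because the
// pseudo's Size field is an unsigned char.
constexpr unsigned cfiSaveMaxSize(unsigned RegBits) {
  return ((RegBits >> 5) << 3) + 8 <= 256 ? ((RegBits >> 5) << 3) + 8 : 252;
}

static_assert(cfiSaveMaxSize(32) == 16, "V32: one 32-bit subregister");
static_assert(cfiSaveMaxSize(256) == 72, "V256: eight subregisters");
static_assert(cfiSaveMaxSize(1024) == 252, "V1024: 264 is clamped to 252");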
diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -89,7 +89,7 @@
                            ArrayRef<CalleeSavedInfo> CSI, LiveIntervals *LIS) {
   MachineFunction &MF = *SaveBlock.getParent();
 
-  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+  const SIInstrInfo &TII = *MF.getSubtarget<GCNSubtarget>().getInstrInfo();
   const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
 
@@ -103,8 +103,8 @@
       const TargetRegisterClass *RC =
           TRI->getMinimalPhysRegClass(Reg, MVT::i32);
 
-      TII.storeRegToStackSlot(SaveBlock, I, Reg, true, CS.getFrameIdx(), RC,
-                              TRI);
+      TII.storeRegToStackSlotCFI(SaveBlock, I, Reg, true, CS.getFrameIdx(), RC,
+                                 TRI);
 
       if (LIS) {
         assert(std::distance(MIS.begin(), I) == 1);
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -109,9 +109,8 @@
                        bool IsLoad) const;
 
   /// If \p OnlyToVGPR is true, this will only succeed if this
-  bool spillSGPR(MachineBasicBlock::iterator MI,
-                 int FI, RegScavenger *RS,
-                 bool OnlyToVGPR = false) const;
+  bool spillSGPR(MachineBasicBlock::iterator MI, int FI, RegScavenger *RS,
+                 bool OnlyToVGPR = false, bool NeedsCFI = false) const;
 
   bool restoreSGPR(MachineBasicBlock::iterator MI,
                    int FI, RegScavenger *RS,
@@ -326,16 +325,12 @@
   ArrayRef<MCPhysReg> getAllVGPR32(const MachineFunction &MF) const;
 
 private:
-  void buildSpillLoadStore(MachineBasicBlock::iterator MI,
-                           unsigned LoadStoreOp,
-                           int Index,
-                           Register ValueReg,
-                           bool ValueIsKill,
+  void buildSpillLoadStore(MachineBasicBlock::iterator MI, unsigned LoadStoreOp,
+                           int Index, Register ValueReg, bool ValueIsKill,
                            MCRegister ScratchRsrcReg,
-                           MCRegister ScratchOffsetReg,
-                           int64_t InstrOffset,
-                           MachineMemOperand *MMO,
-                           RegScavenger *RS) const;
+                           MCRegister ScratchOffsetReg, int64_t InstrOffset,
+                           MachineMemOperand *MMO, RegScavenger *RS,
+                           bool NeedsCFI = false) const;
 };
 
 } // End namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -14,14 +14,15 @@
 #include "SIRegisterInfo.h"
 #include "AMDGPURegisterBankInfo.h"
 #include "AMDGPUSubtarget.h"
-#include "SIInstrInfo.h"
-#include "SIMachineFunctionInfo.h"
 #include "MCTargetDesc/AMDGPUInstPrinter.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
 #include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
 #include "llvm/CodeGen/SlotIndexes.h"
 #include "llvm/IR/Function.h"
@@ -538,22 +539,28 @@
   switch (Op) {
   case AMDGPU::SI_SPILL_S1024_SAVE:
+  case AMDGPU::SI_SPILL_S1024_CFI_SAVE:
   case AMDGPU::SI_SPILL_S1024_RESTORE:
   case AMDGPU::SI_SPILL_V1024_SAVE:
+  case AMDGPU::SI_SPILL_V1024_CFI_SAVE:
   case AMDGPU::SI_SPILL_V1024_RESTORE:
   case AMDGPU::SI_SPILL_A1024_SAVE:
   case AMDGPU::SI_SPILL_A1024_RESTORE:
     return 32;
   case AMDGPU::SI_SPILL_S512_SAVE:
+  case AMDGPU::SI_SPILL_S512_CFI_SAVE:
   case AMDGPU::SI_SPILL_S512_RESTORE:
   case AMDGPU::SI_SPILL_V512_SAVE:
+  case AMDGPU::SI_SPILL_V512_CFI_SAVE:
   case AMDGPU::SI_SPILL_V512_RESTORE:
   case AMDGPU::SI_SPILL_A512_SAVE:
   case AMDGPU::SI_SPILL_A512_RESTORE:
     return 16;
   case AMDGPU::SI_SPILL_S256_SAVE:
+  case AMDGPU::SI_SPILL_S256_CFI_SAVE:
   case AMDGPU::SI_SPILL_S256_RESTORE:
   case AMDGPU::SI_SPILL_V256_SAVE:
+  case AMDGPU::SI_SPILL_V256_CFI_SAVE:
   case AMDGPU::SI_SPILL_V256_RESTORE:
     return 8;
   case AMDGPU::SI_SPILL_S192_SAVE:
@@ -562,32 +569,42 @@
   case AMDGPU::SI_SPILL_V192_RESTORE:
     return 6;
   case AMDGPU::SI_SPILL_S160_SAVE:
+  case AMDGPU::SI_SPILL_S160_CFI_SAVE:
   case AMDGPU::SI_SPILL_S160_RESTORE:
   case AMDGPU::SI_SPILL_V160_SAVE:
+  case AMDGPU::SI_SPILL_V160_CFI_SAVE:
   case AMDGPU::SI_SPILL_V160_RESTORE:
     return 5;
   case AMDGPU::SI_SPILL_S128_SAVE:
+  case AMDGPU::SI_SPILL_S128_CFI_SAVE:
  case AMDGPU::SI_SPILL_S128_RESTORE:
   case AMDGPU::SI_SPILL_V128_SAVE:
+  case AMDGPU::SI_SPILL_V128_CFI_SAVE:
   case AMDGPU::SI_SPILL_V128_RESTORE:
   case AMDGPU::SI_SPILL_A128_SAVE:
   case AMDGPU::SI_SPILL_A128_RESTORE:
     return 4;
   case AMDGPU::SI_SPILL_S96_SAVE:
+  case AMDGPU::SI_SPILL_S96_CFI_SAVE:
   case AMDGPU::SI_SPILL_S96_RESTORE:
   case AMDGPU::SI_SPILL_V96_SAVE:
+  case AMDGPU::SI_SPILL_V96_CFI_SAVE:
   case AMDGPU::SI_SPILL_V96_RESTORE:
     return 3;
   case AMDGPU::SI_SPILL_S64_SAVE:
+  case AMDGPU::SI_SPILL_S64_CFI_SAVE:
   case AMDGPU::SI_SPILL_S64_RESTORE:
   case AMDGPU::SI_SPILL_V64_SAVE:
+  case AMDGPU::SI_SPILL_V64_CFI_SAVE:
   case AMDGPU::SI_SPILL_V64_RESTORE:
   case AMDGPU::SI_SPILL_A64_SAVE:
   case AMDGPU::SI_SPILL_A64_RESTORE:
     return 2;
   case AMDGPU::SI_SPILL_S32_SAVE:
+  case AMDGPU::SI_SPILL_S32_CFI_SAVE:
   case AMDGPU::SI_SPILL_S32_RESTORE:
   case AMDGPU::SI_SPILL_V32_SAVE:
+  case AMDGPU::SI_SPILL_V32_CFI_SAVE:
   case AMDGPU::SI_SPILL_V32_RESTORE:
   case AMDGPU::SI_SPILL_A32_SAVE:
   case AMDGPU::SI_SPILL_A32_RESTORE:
@@ -721,20 +738,16 @@
   return true;
 }
 
-void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
-                                         unsigned LoadStoreOp,
-                                         int Index,
-                                         Register ValueReg,
-                                         bool IsKill,
-                                         MCRegister ScratchRsrcReg,
-                                         MCRegister ScratchOffsetReg,
-                                         int64_t InstOffset,
-                                         MachineMemOperand *MMO,
-                                         RegScavenger *RS) const {
+void SIRegisterInfo::buildSpillLoadStore(
+    MachineBasicBlock::iterator MI, unsigned LoadStoreOp, int Index,
+    Register ValueReg, bool IsKill, MCRegister ScratchRsrcReg,
+    MCRegister ScratchOffsetReg, int64_t InstOffset, MachineMemOperand *MMO,
+    RegScavenger *RS, bool NeedsCFI) const {
   MachineBasicBlock *MBB = MI->getParent();
   MachineFunction *MF = MI->getParent()->getParent();
   const SIInstrInfo *TII = ST.getInstrInfo();
   const MachineFrameInfo &MFI = MF->getFrameInfo();
+  const SIFrameLowering *TFL = ST.getFrameLowering();
   const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
 
   const MCInstrDesc &Desc = TII->get(LoadStoreOp);
@@ -750,6 +763,7 @@
   unsigned Size = NumSubRegs * EltSize;
   int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
   int64_t ScratchOffsetRegDelta = 0;
+  int64_t AdditionalCFIOffset = 0;
 
   Align Alignment = MFI.getObjectAlign(Index);
   const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();
@@ -792,6 +806,8 @@
     if (!SOffset)
       report_fatal_error("could not scavenge SGPR to spill in entry function");
 
+    AdditionalCFIOffset = Offset;
+
     if (ScratchOffsetReg == AMDGPU::NoRegister) {
       BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset)
           .addImm(Offset);
@@ -850,6 +866,11 @@
         .addImm(0) // swz
         .addMemOperand(NewMMO);
 
+    if (IsStore && NeedsCFI)
+      TFL->buildCFIForVGPRToVMEMSpill(*MBB, MI, DL, SubReg,
+                                      Offset * ST.getWavefrontSize() +
+                                          AdditionalCFIOffset);
+
     if (!IsStore && TmpReg != AMDGPU::NoRegister)
       MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32),
                     FinalReg)
@@ -980,14 +1001,14 @@
   }
 }
 
-bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
-                               int Index,
-                               RegScavenger *RS,
-                               bool OnlyToVGPR) const {
+bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index,
+                               RegScavenger *RS, bool OnlyToVGPR,
+                               bool NeedsCFI) const {
   MachineBasicBlock *MBB = MI->getParent();
   MachineFunction *MF = MBB->getParent();
   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
   DenseSet<unsigned> SGPRSpillVGPRDefinedSet;
+  const SIFrameLowering *TFL = ST.getFrameLowering();
 
   ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills =
       MFI->getSGPRToVGPRSpills(Index);
@@ -1036,6 +1057,10 @@
               .addImm(Spill.Lane)
               .addReg(Spill.VGPR, VGPRDefined ? 0 : RegState::Undef);
 
+      if (NeedsCFI)
+        TFL->buildCFIForSGPRToVGPRSpill(*MBB, MI, DL, SubReg, Spill.VGPR,
+                                        Spill.Lane);
+
       // FIXME: Since this spills to another register instead of an actual
       // frame index, we should delete the frame index when all references to
       // it are fixed.
@@ -1086,6 +1111,8 @@
 
       // Write out VGPR
       buildSGPRSpillLoadStore(MI, Index, Offset, EltSize, TmpVGPR, VGPRLanes,
                               RS, false);
+
+      // TODO: Implement CFI for SpillToVMEM if/when it is fully supported.
     }
   }
@@ -1181,7 +1208,18 @@
   MachineBasicBlock::iterator MI,
   int FI,
   RegScavenger *RS) const {
+  bool NeedsCFI = false;
   switch (MI->getOpcode()) {
+  case AMDGPU::SI_SPILL_S1024_CFI_SAVE:
+  case AMDGPU::SI_SPILL_S512_CFI_SAVE:
+  case AMDGPU::SI_SPILL_S256_CFI_SAVE:
+  case AMDGPU::SI_SPILL_S160_CFI_SAVE:
+  case AMDGPU::SI_SPILL_S128_CFI_SAVE:
+  case AMDGPU::SI_SPILL_S96_CFI_SAVE:
+  case AMDGPU::SI_SPILL_S64_CFI_SAVE:
+  case AMDGPU::SI_SPILL_S32_CFI_SAVE:
+    NeedsCFI = true;
+    LLVM_FALLTHROUGH;
   case AMDGPU::SI_SPILL_S1024_SAVE:
   case AMDGPU::SI_SPILL_S512_SAVE:
   case AMDGPU::SI_SPILL_S256_SAVE:
@@ -1191,7 +1229,7 @@
   case AMDGPU::SI_SPILL_S96_SAVE:
   case AMDGPU::SI_SPILL_S64_SAVE:
   case AMDGPU::SI_SPILL_S32_SAVE:
-    return spillSGPR(MI, FI, RS, true);
+    return spillSGPR(MI, FI, RS, true, NeedsCFI);
   case AMDGPU::SI_SPILL_S1024_RESTORE:
   case AMDGPU::SI_SPILL_S512_RESTORE:
   case AMDGPU::SI_SPILL_S256_RESTORE:
@@ -1226,8 +1264,21 @@
                         ? getBaseRegister()
                         : getFrameRegister(*MF);
 
+  bool NeedsCFI = false;
+
   switch (MI->getOpcode()) {
     // SGPR register spill
+  case AMDGPU::SI_SPILL_S1024_CFI_SAVE:
+  case AMDGPU::SI_SPILL_S512_CFI_SAVE:
+  case AMDGPU::SI_SPILL_S256_CFI_SAVE:
+  case AMDGPU::SI_SPILL_S160_CFI_SAVE:
+  case AMDGPU::SI_SPILL_S128_CFI_SAVE:
+  case AMDGPU::SI_SPILL_S96_CFI_SAVE:
+  case AMDGPU::SI_SPILL_S64_CFI_SAVE:
+  case AMDGPU::SI_SPILL_S32_CFI_SAVE: {
+    NeedsCFI = true;
+    LLVM_FALLTHROUGH;
+  }
   case AMDGPU::SI_SPILL_S1024_SAVE:
   case AMDGPU::SI_SPILL_S512_SAVE:
   case AMDGPU::SI_SPILL_S256_SAVE:
@@ -1256,6 +1307,16 @@
   }
 
     // VGPR register spill
+  case AMDGPU::SI_SPILL_V1024_CFI_SAVE:
+  case AMDGPU::SI_SPILL_V512_CFI_SAVE:
+  case AMDGPU::SI_SPILL_V256_CFI_SAVE:
+  case AMDGPU::SI_SPILL_V160_CFI_SAVE:
+  case AMDGPU::SI_SPILL_V128_CFI_SAVE:
+  case AMDGPU::SI_SPILL_V96_CFI_SAVE:
+  case AMDGPU::SI_SPILL_V64_CFI_SAVE:
+  case AMDGPU::SI_SPILL_V32_CFI_SAVE:
+    NeedsCFI = true;
+    LLVM_FALLTHROUGH;
   case AMDGPU::SI_SPILL_V1024_SAVE:
   case AMDGPU::SI_SPILL_V512_SAVE:
   case AMDGPU::SI_SPILL_V256_SAVE:
@@ -1274,14 +1335,12 @@
       assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
              MFI->getStackPtrOffsetReg());
 
-      buildSpillLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET,
-                          Index,
-                          VData->getReg(), VData->isKill(),
-                          TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(),
-                          FrameReg,
-                          TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
-                          *MI->memoperands_begin(),
-                          RS);
+      buildSpillLoadStore(
+          MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET, Index, VData->getReg(),
+          VData->isKill(),
+          TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(), FrameReg,
+          TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
+          *MI->memoperands_begin(), RS, NeedsCFI);
       MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode()));
       MI->eraseFromParent();
       break;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/dynamic-alloca-uniform.ll
@@ -294,8 +294,6 @@
 ; GFX10-NEXT:    s_add_u32 s4, s4, gv@gotpcrel32@lo+4
 ; GFX10-NEXT:    s_addc_u32 s5, s5, gv@gotpcrel32@hi+4
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
-; GFX10-NEXT:    ; implicit-def: $vcc_hi
-; GFX10-NEXT:    s_mov_b32 s33, s6
 ; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GFX10-NEXT:    ; implicit-def: $vcc_hi
 ; GFX10-NEXT:    s_mov_b32 s33, s6
diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
--- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll
@@ -205,18 +205,18 @@
 ; TODO: Can the SP inc/dec be removed?
 ; GCN-LABEL: {{^}}callee_with_stack_no_fp_elim_csr_vgpr:
 ; GCN: s_waitcnt
-; GCN-NEXT:s_mov_b32 [[FP_COPY:s[0-9]+]], s33
+; GCN-NEXT: s_mov_b32 [[FP_COPY:s[0-9]+]], s33
 ; GCN-NEXT: s_mov_b32 s33, s32
+; GCN-NEXT: s_add_u32 s32, s32, 0x300
+; GCN: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
-; GCN-DAG: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
-; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s33 offset:8
+; GCN: buffer_store_dword [[ZERO]], off, s[0:3], s33 offset:8
 
 ; GCN: ;;#ASMSTART
 ; GCN-NEXT: ; clobber v41
 ; GCN-NEXT: ;;#ASMEND
 
 ; GCN: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN: s_add_u32 s32, s32, 0x300
 ; GCN-NEXT: s_sub_u32 s32, s32, 0x300
 ; GCN-NEXT: s_mov_b32 s33, s4
 ; GCN-NEXT: s_waitcnt vmcnt(0)
@@ -233,14 +233,14 @@
 ; GCN: s_waitcnt
 ; GCN-NEXT: v_writelane_b32 v1, s33, 63
 ; GCN-NEXT: s_mov_b32 s33, s32
+; GCN-NEXT: s_add_u32 s32, s32, 0x300
 ; GCN: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GCN-COUNT-63: v_writelane_b32 v1
 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:8
 
 ; GCN: ;;#ASMSTART
 ; GCN-COUNT-63: v_readlane_b32 s{{[0-9]+}}, v1
-; GCN: s_add_u32 s32, s32, 0x300
-; GCN-NEXT: s_sub_u32 s32, s32, 0x300
+; GCN: s_sub_u32 s32, s32, 0x300
 ; GCN-NEXT: v_readlane_b32 s33, v1, 63
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
@@ -265,6 +265,7 @@
 ; GCN: s_waitcnt
 ; GCN-NEXT: s_mov_b32 [[FP_COPY:s[0-9]+]], s33
 ; GCN-NEXT: s_mov_b32 s33, s32
+; GCN-NEXT: s_add_u32 s32, s32, 0x300
 ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
 
 ; GCN-COUNT-64: v_writelane_b32 v1,
@@ -273,8 +274,7 @@
 ; GCN-COUNT-64: v_readlane_b32 s{{[0-9]+}}, v1
 
 ; GCN: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
-; GCN: s_add_u32 s32, s32, 0x300
-; GCN-NEXT: s_sub_u32 s32, s32, 0x300
+; GCN: s_sub_u32 s32, s32, 0x300
 ; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]]
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: s_setpc_b64
diff --git a/llvm/test/CodeGen/AMDGPU/debug-frame.ll b/llvm/test/CodeGen/AMDGPU/debug-frame.ll
--- a/llvm/test/CodeGen/AMDGPU/debug-frame.ll
+++ b/llvm/test/CodeGen/AMDGPU/debug-frame.ll
@@ -522,6 +522,103 @@
   ret void
 }
 
+
+; CHECK-LABEL: func_spill_vgpr_to_vmem:
+; CHECK: .cfi_startproc
+
+; CHECK-NOT: .cfi_{{.*}}
+
+; CHECK: %bb.0:
+; SGPR32 = 64
+; CHECK-NEXT: .cfi_llvm_def_aspace_cfa 64, 0, 6
+; CHECK-NEXT: .cfi_escape 0x10, 0x10, 0x08, 0x90, 0x3e, 0x93, 0x04, 0x90, 0x3f, 0x93, 0x04
+
+; CHECK-NOT: .cfi_{{.*}}
+
+; CHECK: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+
+; DW_CFA_expression [0x10]
+; VGPR40_wave64 ULEB128(2600)=[0xa8, 0x14]
+; BLOCK_LENGTH ULEB128(14)=[0x0e]
+; DW_OP_regx [0x90]
+; VGPR40_wave64 ULEB128(2600)=[0xa8, 0x14]
+; DW_OP_swap [0x16]
+; DW_OP_LLVM_offset_uconst [0xe4]
+; OFFSET ULEB128(256)=[0x80, 0x02]
+; DW_OP_LLVM_call_frame_entry_reg [0xe6]
+; EXEC_MASK_wave64 ULEB128(17)=[0x11]
+; DW_OP_deref_size [0x94]
+; SIZE [0x08]
+; DW_OP_LLVM_select_bit_piece [0xec]
+; ELEMENT_SIZE [0x20]
+; ELEMENT_COUNT [0x40]
+; WAVE64-NEXT: .cfi_escape 0x10, 0xa8, 0x14, 0x0e, 0x90, 0xa8, 0x14, 0x16, 0xe4, 0x80, 0x02, 0xe6, 0x11, 0x94, 0x08, 0xec, 0x20, 0x40
+
+; DW_CFA_expression [0x10]
+; VGPR40_wave32 ULEB128(1576)=[0xa8, 0x0c]
+; BLOCK_LENGTH ULEB128(14)=[0x0e]
+; DW_OP_regx [0x90]
+; VGPR40_wave32 ULEB128(1576)=[0xa8, 0x0c]
+; DW_OP_swap [0x16]
+; DW_OP_LLVM_offset_uconst [0xe4]
+; OFFSET ULEB128(128)=[0x80, 0x01]
+; DW_OP_LLVM_call_frame_entry_reg [0xe6]
+; EXEC_MASK_wave32 ULEB128(1)=[0x01]
+; DW_OP_deref_size [0x94]
+; SIZE [0x04]
+; DW_OP_LLVM_select_bit_piece [0xec]
+; ELEMENT_SIZE [0x20]
+; ELEMENT_COUNT [0x20]
+; WAVE32-NEXT: .cfi_escape 0x10, 0xa8, 0x0c, 0x0e, 0x90, 0xa8, 0x0c, 0x16, 0xe4, 0x80, 0x01, 0xe6, 0x01, 0x94, 0x04, 0xec, 0x20, 0x20
+
+; CHECK-NOT: .cfi_{{.*}}
+
+; CHECK: buffer_store_dword v41, off, s[0:3], s32 ; 4-byte Folded Spill
+
+; DW_CFA_expression [0x10]
+; VGPR41_wave64 ULEB128(2601)=[0xa9, 0x14]
+; BLOCK_LENGTH ULEB128(13)=[0x0d]
+; DW_OP_regx [0x90]
+; VGPR41_wave64 ULEB128(2601)=[0xa9, 0x14]
+; DW_OP_swap [0x16]
+; DW_OP_LLVM_offset_uconst [0xe4]
+; OFFSET ULEB128(0)=[0x00]
+; DW_OP_LLVM_call_frame_entry_reg [0xe6]
+; EXEC_MASK_wave64 ULEB128(17)=[0x11]
+; DW_OP_deref_size [0x94]
+; SIZE [0x08]
+; DW_OP_LLVM_select_bit_piece [0xec]
+; ELEMENT_SIZE [0x20]
+; ELEMENT_COUNT [0x40]
+; WAVE64-NEXT: .cfi_escape 0x10, 0xa9, 0x14, 0x0d, 0x90, 0xa9, 0x14, 0x16, 0xe4, 0x00, 0xe6, 0x11, 0x94, 0x08, 0xec, 0x20, 0x40
+
+; DW_CFA_expression [0x10]
+; VGPR41_wave32 ULEB128(1577)=[0xa9, 0x0c]
+; BLOCK_LENGTH ULEB128(13)=[0x0d]
+; DW_OP_regx [0x90]
+; VGPR41_wave32 ULEB128(1577)=[0xa9, 0x0c]
+; DW_OP_swap [0x16]
+; DW_OP_LLVM_offset_uconst [0xe4]
+; OFFSET ULEB128(0)=[0x00]
+; DW_OP_LLVM_call_frame_entry_reg [0xe6]
+; EXEC_MASK_wave32 ULEB128(1)=[0x01]
+; DW_OP_deref_size [0x94]
+; SIZE [0x04]
+; DW_OP_LLVM_select_bit_piece [0xec]
+; ELEMENT_SIZE [0x20]
+; ELEMENT_COUNT [0x20]
+; WAVE32-NEXT: .cfi_escape 0x10, 0xa9, 0x0c, 0x0d, 0x90, 0xa9, 0x0c, 0x16, 0xe4, 0x00, 0xe6, 0x01, 0x94, 0x04, 0xec, 0x20, 0x20
+
+; CHECK-NOT: .cfi_{{.*}}
+
+; CHECK: .cfi_endproc
+define hidden void @func_spill_vgpr_to_vmem() #0 {
+entry:
+  call void asm sideeffect "; clobber", "~{v40}"() #0
+  call void asm sideeffect "; clobber", "~{v41}"() #0
+  ret void
+}
+
 attributes #0 = { nounwind }
 attributes #1 = { nounwind "frame-pointer"="all" }
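The DWARF register numbers in the byte-by-byte comments above are easy to verify by hand. Here is a short, self-contained sketch (illustrative only, not part of the patch) that decodes the ULEB128 operands appearing in the .cfi_escape strings:

#include <cstdint>
#include <cstdio>
#include <initializer_list>

// ULEB128: 7 data bits per byte, least-significant group first,
// high bit set on every byte except the last.
static uint64_t decodeULEB128(std::initializer_list<uint8_t> Bytes) {
  uint64_t Value = 0;
  unsigned Shift = 0;
  for (uint8_t B : Bytes) {
    Value |= (uint64_t)(B & 0x7f) << Shift;
    Shift += 7;
    if ((B & 0x80) == 0)
      break;
  }
  return Value;
}

int main() {
  // 2600: v40 in the wave64 DWARF register mapping.
  printf("%llu\n", (unsigned long long)decodeULEB128({0xa8, 0x14}));
  // 1576: v40 in the wave32 DWARF register mapping.
  printf("%llu\n", (unsigned long long)decodeULEB128({0xa8, 0x0c}));
  // 256: the wave64 spill offset for v40 (a 4-byte slot times 64 lanes).
  printf("%llu\n", (unsigned long long)decodeULEB128({0x80, 0x02}));
}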
diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
--- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll
@@ -192,15 +192,14 @@
 ; GFX9-NEXT:    v_writelane_b32 v43, s33, 4
 ; GFX9-NEXT:    s_mov_b32 s33, s32
 ; GFX9-NEXT:    s_add_u32 s32, s32, 0x800
-; GFX9-NEXT:    s_nop 0
 ; GFX9-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
 ; GFX9-NEXT:    buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
 ; GFX9-NEXT:    v_writelane_b32 v43, s34, 0
+; GFX9-NEXT:    v_writelane_b32 v43, s35, 1
 ; GFX9-NEXT:    s_getpc_b64 s[4:5]
 ; GFX9-NEXT:    s_add_u32 s4, s4, foo@gotpcrel32@lo+4
 ; GFX9-NEXT:    s_addc_u32 s5, s5, foo@gotpcrel32@hi+4
-; GFX9-NEXT:    v_writelane_b32 v43, s35, 1
 ; GFX9-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x0
 ; GFX9-NEXT:    v_mov_b32_e32 v40, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v41, v0
diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills.mir b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills.mir
--- a/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-lower-sgpr-spills.mir
@@ -2,9 +2,13 @@
 
 # CHECK-LABEL: name: empty_entry_block
 # CHECK: V_WRITELANE
+# CHECK-NEXT: CFI_INSTRUCTION
 # CHECK-NEXT: V_WRITELANE
+# CHECK-NEXT: CFI_INSTRUCTION
 # CHECK-NEXT: V_WRITELANE
+# CHECK-NEXT: CFI_INSTRUCTION
 # CHECK-NEXT: V_WRITELANE
+# CHECK-NEXT: CFI_INSTRUCTION
 # CHECK: V_READLANE
 # CHECK-NEXT: V_READLANE
 # CHECK-NEXT: V_READLANE