diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -887,7 +887,11 @@
   ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
   ProgInfo.VGPRSpill = MFI->getNumSpilledVGPRs();
-  ProgInfo.LDSSize = MFI->getLDSSize();
+  unsigned MaxWorkGroupSize = STM.getFlatWorkGroupSizes(F).second;
+  unsigned LDSSpillSize = MFI->getLdsSpill().TotalSize * MaxWorkGroupSize;
+
+  ProgInfo.LDSSize = MFI->getLDSSize() + LDSSpillSize;
+
   ProgInfo.LDSBlocks =
       alignTo(ProgInfo.LDSSize, 1ULL << LDSAlignShift) >> LDSAlignShift;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -181,7 +181,8 @@
     Info.UsesFlatScratch = false;
   }
 
-  Info.PrivateSegmentSize = FrameInfo.getStackSize();
+  unsigned LdsSpillTotalSize = MFI->getLdsSpill().TotalSize;
+  Info.PrivateSegmentSize = FrameInfo.getStackSize() - LdsSpillTotalSize;
 
   // Assume a big number if there are any unknown sized objects.
   Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -795,6 +795,11 @@
   return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
 }
 
+unsigned GCNSubtarget::getLdsSpillLimitDwords(const MachineFunction &MF) const {
+  const Function &F = MF.getFunction();
+  return AMDGPU::getIntegerAttribute(F, "amdgpu-lds-spill-limit-dwords", 0);
+}
+
 void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                          int UseOpIdx, SDep &Dep) const {
   if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1064,6 +1064,8 @@
   // hasGFX90AInsts is also true.
   bool hasGFX940Insts() const { return GFX940Insts; }
 
+  bool hasDSAddTid() const { return getGeneration() >= GFX9; }
+
   /// Return the maximum number of waves per SIMD for kernels using \p SGPRs
   /// SGPRs
   unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
@@ -1228,6 +1230,11 @@
   /// unit requirement.
   unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
 
+  /// \returns Maximum amount of LDS space to be used for spilling, as
+  /// explicitly requested with the "amdgpu-lds-spill-limit-dwords" attribute
+  /// attached to the function of \p MF.
+  unsigned getLdsSpillLimitDwords(const MachineFunction &MF) const;
+
   void getPostRAMutations(
       std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations)
       const override;
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -70,6 +70,10 @@
       Register PreloadedPrivateBufferReg, Register ScratchRsrcReg,
       Register ScratchWaveOffsetReg) const;
 
+  void setupLDSSpilling(MachineFunction &MF, MachineBasicBlock &MBB,
+                        MachineBasicBlock::iterator I,
+                        const DebugLoc &DL) const;
+
 public:
   bool hasFP(const MachineFunction &MF) const override;
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -400,6 +400,157 @@
   return ST.enableFlatScratch() ? 1 : ST.getWavefrontSize();
 }
 
+// Determine which stack objects should be spilled to LDS, set up the
+// SIMachineFunctionInfo::LdsSpill structure, and initialize m0 for LDS
+// spilling if possible.
+void SIFrameLowering::setupLDSSpilling(MachineFunction &MF,
+                                       MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator I,
+                                       const DebugLoc &DL) const {
+  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const Function &F = MF.getFunction();
+  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+  unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(F).second;
+
+  assert(MFI->isEntryFunction());
+
+  int LDSSpillLimitInBytes = ST.getLdsSpillLimitDwords(MF) * 4;
+  LDSSpillLimitInBytes =
+      std::max(0, LDSSpillLimitInBytes - (int)MFI->getLDSSize());
+
+  // Go through the stack slots starting from the end and assign them to LDS
+  // as long as they fit in the remaining size.
+  SmallVector<int64_t> LdsOffsets(FrameInfo.getObjectIndexEnd(), -1);
+  bool AllStackSlotsHandled = true;
+  int TotalSize = 0;
+  int RemainingSize = LDSSpillLimitInBytes;
+  for (int i = FrameInfo.getObjectIndexEnd() - 1; i >= 0; --i) {
+    if (FrameInfo.isDeadObjectIndex(i)) {
+      continue;
+    }
+    if (FrameInfo.isObjectPreAllocated(i)) {
+      AllStackSlotsHandled = false;
+      break;
+    }
+    int ObjSize = FrameInfo.getObjectSize(i);
+    assert(ObjSize > 0);
+    int ObjSizeForAllThreads = ObjSize * WorkGroupSize;
+
+    if (ObjSizeForAllThreads <= RemainingSize) {
+      RemainingSize -= ObjSizeForAllThreads;
+      LdsOffsets[i] = TotalSize;
+      TotalSize += ObjSize;
+    } else {
+      AllStackSlotsHandled = false;
+      break;
+    }
+  }
+
+  // No stack slots will use LDS - exit early.
+  if (TotalSize == 0)
+    return;
+
+  // Register used to save/restore m0 around each spill, or NoRegister if the
+  // save/restore is not needed and the initialization takes place here once.
+  Register M0SaveRestoreReg;
+  if (MRI.isPhysRegUsed(AMDGPU::M0)) {
+    if (requiresStackPointerReference(MF)) {
+      unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
+      ArrayRef<MCPhysReg> AllSGPRs = TRI->getAllSGPR32(MF);
+      AllSGPRs = AllSGPRs.slice(
+          std::min(static_cast<unsigned>(AllSGPRs.size()), NumPreloaded));
+      for (MCPhysReg Reg : AllSGPRs) {
+        if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) {
+          M0SaveRestoreReg = Reg;
+          break;
+        }
+      }
+    } else {
+      assert(!requiresStackPointerReference(MF));
+      M0SaveRestoreReg = MFI->getStackPtrOffsetReg();
+    }
+    // Could not find a free SGPR to save/restore m0 into, so exit early.
+    if (M0SaveRestoreReg == AMDGPU::NoRegister)
+      return;
+  }
+
+  Register M0InitVal;
+  // The addtid addressing is as follows:
+  // LDS_Addr = LDS_BASE + {Inst_offset1, Inst_offset0} + TID(0..63)*4 + M0
+  // If the workgroup size is not larger than the wave size, we can safely
+  // initialize m0 with 0. Otherwise, we have to make sure that waves do not
+  // overwrite each other's spill slots, so we initialize m0 to
+  // current_wave_id_in_group * wave_size * 4.
+  if (WorkGroupSize > ST.getWavefrontSize()) {
+    Register PreloadedWorkgroupInfoReg = MFI->getWorkgroupInfoReg();
+    if (!PreloadedWorkgroupInfoReg) {
+      // This should never happen, but it depends on how the front-end sets up
+      // the input SGPRs, so an early out is safer here than an assert.
+      return;
+    }
+
+    if (!MRI.isPhysRegUsed(PreloadedWorkgroupInfoReg)) {
+      M0InitVal = PreloadedWorkgroupInfoReg;
+    } else {
+      unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
+      ArrayRef<MCPhysReg> AllSGPRs = TRI->getAllSGPR32(MF);
+      AllSGPRs = AllSGPRs.slice(
+          std::min(static_cast<unsigned>(AllSGPRs.size()), NumPreloaded));
+      for (MCPhysReg Reg : AllSGPRs) {
+        if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) {
+          M0InitVal = Reg;
+          break;
+        }
+      }
+    }
+
+    // Could not find a free SGPR for the m0 init value, so exit early.
+    // FIXME: We could also check some of the preloads to see if one of them
+    // could be re-used.
+    if (M0InitVal == AMDGPU::NoRegister)
+      return;
+
+    // Extract ordered_append_term to get the current wave id in the group.
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_BFE_U32), M0InitVal)
+        .addReg(PreloadedWorkgroupInfoReg)
+        .addImm(0xc0006);
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MUL_I32), M0InitVal)
+        .addReg(M0InitVal)
+        .addImm(ST.getWavefrontSize() * 4);
+  }
+
+  // If no save/restore is needed, we can init m0 here once and be done with it.
+  if (M0SaveRestoreReg == AMDGPU::NoRegister) {
+    if (M0InitVal == AMDGPU::NoRegister)
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0).addImm(0);
+    else
+      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+          .addReg(M0InitVal);
+  }
+
+  SIMachineFunctionInfo::LdsSpill LdsSpillInfo;
+  LdsSpillInfo.M0InitVal = M0InitVal;
+  LdsSpillInfo.M0SaveRestoreReg = M0SaveRestoreReg;
+  LdsSpillInfo.LdsOffsets = LdsOffsets;
+  LdsSpillInfo.TotalSize = TotalSize;
+  MFI->setLdsSpill(LdsSpillInfo);
+
+  // Earlier we set ScavengeFI based on the fact that there were stack
+  // accesses. If no slots will use the stack anymore, we can safely remove it.
+  if (AllStackSlotsHandled) {
+    int ScavengeFI = MFI->getScavengeFI(FrameInfo, *TRI);
+    FrameInfo.setStackSize(FrameInfo.getStackSize() -
+                           FrameInfo.getObjectSize(ScavengeFI));
+    FrameInfo.RemoveStackObject(ScavengeFI);
+  }
+}
+
 void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
                                                 MachineBasicBlock &MBB) const {
   assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
@@ -425,6 +576,14 @@
   assert(MFI->isEntryFunction());
 
+  // Debug location must be unknown since the first debug location is used to
+  // determine the end of the prologue.
+  DebugLoc DL;
+  MachineBasicBlock::iterator I = MBB.begin();
+
+  if (FrameInfo.getStackSize() > 0 && MFI->ldsSpillingEnabled(MF))
+    setupLDSSpilling(MF, MBB, I, DL);
+
   Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
       AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
 
@@ -461,11 +620,6 @@
     }
   }
 
-  // Debug location must be unknown since the first debug location is used to
-  // determine the end of the prologue.
-  DebugLoc DL;
-  MachineBasicBlock::iterator I = MBB.begin();
-
   // We found the SRSRC first because it needs four registers and has an
   // alignment requirement. If the SRSRC that we found is clobbering with
   // the scratch wave offset, which may be in a fixed SGPR or a free SGPR
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2555,6 +2555,31 @@
     InVals.push_back(Val);
   }
 
+  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  if (MFI->ldsSpillingEnabled(MF) &&
+      ST.getFlatWorkGroupSizes(Fn).second > ST.getWavefrontSize()) {
+
+    int WorkGroupInfoSgprNo =
+        AMDGPU::getIntegerAttribute(Fn, "amdgpu-work-group-info-arg-no", -1);
+    if (WorkGroupInfoSgprNo != -1)
+      for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
+        const ISD::InputArg &Arg = Ins[i];
+        if (Arg.getOrigArgIndex() == (unsigned)WorkGroupInfoSgprNo) {
+
+          CCValAssign &VA = ArgLocs[i];
+          Register WorkGroupInfoReg = VA.getLocReg();
+          assert(AMDGPU::SGPR_32RegClass.contains(WorkGroupInfoReg));
+
+          Info->setWorkgroupInfoReg(WorkGroupInfoReg);
+          MF.addLiveIn(WorkGroupInfoReg, &AMDGPU::SGPR_32RegClass);
+          MF.front().addLiveIn(WorkGroupInfoReg, &AMDGPU::SGPR_32RegClass);
+
+          break;
+        }
+      }
+  }
+
   // Start adding system SGPRs.
   if (IsEntryFunc) {
     allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -364,6 +364,11 @@
   // base to the beginning of the new function's frame.
   Register StackPtrOffsetReg = AMDGPU::SP_REG;
 
+  // The workgroup info register set up for LDS spilling in cases where the
+  // workgroup size is larger than the wave size. It relies on user input
+  // registers set up by the front-end.
+  Register WorkgroupInfoReg = 0;
+
   AMDGPUFunctionArgInfo ArgInfo;
 
   // Graphics info.
@@ -472,6 +477,20 @@
     bool IsDead = false;
   };
 
+  struct LdsSpill {
+    // Value to init m0 with.
+    Register M0InitVal;
+    // Register used to save/restore the current value of m0 around each
+    // spill. If NoRegister, m0 is initialized once in the prologue instead.
+    Register M0SaveRestoreReg;
+    // Offset in LDS indexed by stack object index. A value of -1 means the
+    // stack object is not spilled to LDS. The values are properly
+    // initialized only if TotalSize > 0.
+    SmallVector<int64_t> LdsOffsets;
+    // Total size of all LDS spill objects in bytes (per thread).
+    unsigned TotalSize = 0;
+  };
+
   // Track VGPRs reserved for WWM.
   SmallSetVector<Register, 8> WWMReservedRegs;
 
@@ -509,6 +528,8 @@
   // frame, so save it here and add it to the RegScavenger later.
   Optional<int> ScavengeFI;
 
+  LdsSpill LdsSpillInfo;
+
 private:
   Register VGPRForAGPRCopy;
 
@@ -800,6 +821,13 @@
     StackPtrOffsetReg = Reg;
   }
 
+  void setWorkgroupInfoReg(Register Reg) {
+    assert(Reg != 0);
+    WorkgroupInfoReg = Reg;
+  }
+
+  Register getWorkgroupInfoReg() const { return WorkgroupInfoReg; }
+
   // Note the unset value for this is AMDGPU::SP_REG rather than
   // NoRegister. This is mostly a workaround for MIR tests where state that
   // can't be directly computed from the function is not preserved in serialized
@@ -988,6 +1016,12 @@
   // \returns true if a function needs or may need AGPRs.
   bool usesAGPRs(const MachineFunction &MF) const;
+
+  void setLdsSpill(LdsSpill Info) { LdsSpillInfo = Info; }
+
+  LdsSpill getLdsSpill() const { return LdsSpillInfo; }
+
+  bool ldsSpillingEnabled(const MachineFunction &MF) const;
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -734,3 +734,18 @@
   UsesAGPRs = false;
   return false;
 }
+
+bool SIMachineFunctionInfo::ldsSpillingEnabled(
+    const MachineFunction &MF) const {
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  if (!ST.hasDSAddTid())
+    return false;
+
+  if (MF.getFrameInfo().hasCalls())
+    return false;
+
+  if (ST.getLdsSpillLimitDwords(MF) == 0)
+    return false;
+
+  return true;
+}
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -405,6 +405,12 @@
   /// of the subtarget.
   ArrayRef<MCPhysReg> getAllSGPR32(const MachineFunction &MF) const;
 
+  bool buildLdsSpillLoadStore(MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator MI,
+                              const DebugLoc &DL, bool IsLoad, int Index,
+                              Register ValueReg, bool ValueIsKill,
+                              int64_t InstrOffset,
+                              MachineMemOperand *MMO) const;
   // Insert spill or restore instructions.
   // When lowering spill pseudos, the RegScavenger should be set.
   // For creating spill instructions during frame lowering, where no scavenger
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -1267,6 +1267,89 @@
   return LoadStoreOp;
 }
 
+bool SIRegisterInfo::buildLdsSpillLoadStore(MachineBasicBlock &MBB,
+                                            MachineBasicBlock::iterator MI,
+                                            const DebugLoc &DL, bool IsLoad,
+                                            int Index, Register ValueReg,
+                                            bool IsKill, int64_t InstOffset,
+                                            MachineMemOperand *MMO) const {
+
+  const SIInstrInfo *TII = ST.getInstrInfo();
+
+  MachineFunction *MF = MBB.getParent();
+  const MachineFrameInfo &MFI = MF->getFrameInfo();
+  const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
+
+  SIMachineFunctionInfo::LdsSpill LdsSpillInfo = FuncInfo->getLdsSpill();
+  int64_t LdsOffsetForIndex = LdsSpillInfo.LdsOffsets[Index];
+  if (LdsOffsetForIndex == -1)
+    return false;
+
+  if (LdsSpillInfo.M0SaveRestoreReg) {
+
+    // FIXME: If we could prove that there are no m0 defs/uses between two LDS
+    // spill instructions we could avoid doing some of the save/restore.
+
+    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32),
+            LdsSpillInfo.M0SaveRestoreReg)
+        .addReg(AMDGPU::M0);
+    if (LdsSpillInfo.M0InitVal == AMDGPU::NoRegister)
+      BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0).addImm(0x0);
+    else
+      BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+          .addReg(LdsSpillInfo.M0InitVal);
+  }
+
+  const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
+  unsigned EltCount = AMDGPU::getRegBitWidth(RC->getID()) / 32;
+
+  Align Alignment = MFI.getObjectAlign(Index);
+  const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();
+  for (unsigned R = 0; R < EltCount; ++R) {
+    MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(4 * R);
+    MachineMemOperand *NewMMO = MF->getMachineMemOperand(
+        PInfo, MMO->getFlags(), 4, commonAlignment(Alignment, 4 * R));
+
+    Register SubReg =
+        EltCount == 1 ? ValueReg
+                      : Register(getSubReg(ValueReg, getSubRegFromChannel(R)));
+
+    unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(MF->getFunction()).second;
+    // The addtid addressing is as follows:
+    // LDS_Addr = LDS_BASE + {Inst_offset1, Inst_offset0} + TID(0..63)*4 + M0
+    // We calculate the offset for the zeroth lane and make room for the other
+    // lanes by multiplying by the workgroup size. The earlier m0 setup handles
+    // the case when the workgroup size is larger than the wave size.
+    int64_t StackOffset = InstOffset + LdsOffsetForIndex + 4 * R;
+    int64_t StackOffsetZerothLane =
+        StackOffset * WorkGroupSize + FuncInfo->getLDSSize();
+
+    if (IsLoad) {
+      BuildMI(MBB, MI, DL, TII->get(AMDGPU::DS_READ_ADDTID_B32), SubReg)
+          .addImm(StackOffsetZerothLane)
+          .addImm(0 /* gds */)
+          .addMemOperand(NewMMO);
+    } else {
+      BuildMI(MBB, MI, DL, TII->get(AMDGPU::DS_WRITE_ADDTID_B32))
+          .addReg(SubReg, getKillRegState(R == EltCount - 1 ? IsKill : false))
+          .addImm(StackOffsetZerothLane)
+          .addImm(0 /* gds */)
+          .addMemOperand(NewMMO);
+    }
+  }
+
+  if (LdsSpillInfo.M0SaveRestoreReg) {
+    BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+        .addReg(LdsSpillInfo.M0SaveRestoreReg, RegState::Kill);
+  }
+  return true;
+}
+
 void SIRegisterInfo::buildSpillLoadStore(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL,
     unsigned LoadStoreOp, int Index, Register ValueReg, bool IsKill,
@@ -2030,7 +2113,28 @@
   case AMDGPU::SI_SPILL_V128_SAVE:
   case AMDGPU::SI_SPILL_V96_SAVE:
   case AMDGPU::SI_SPILL_V64_SAVE:
-  case AMDGPU::SI_SPILL_V32_SAVE:
+  case AMDGPU::SI_SPILL_V32_SAVE: {
+    if (MFI->getLdsSpill().TotalSize > 0) {
+
+      const MachineOperand *VData =
+          TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
+      assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
+             MFI->getStackPtrOffsetReg());
+
+      bool SpilledToLds = buildLdsSpillLoadStore(
+          *MBB, MI, DL, /*IsLoad*/ false, Index, VData->getReg(),
+          /*IsKill*/ VData->isKill(),
+          TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
+          *MI->memoperands_begin());
+
+      if (SpilledToLds) {
+        MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode()));
+        MI->eraseFromParent();
+        break;
+      }
+    }
+    LLVM_FALLTHROUGH;
+  }
   case AMDGPU::SI_SPILL_A1024_SAVE:
   case AMDGPU::SI_SPILL_A512_SAVE:
   case AMDGPU::SI_SPILL_A256_SAVE:
@@ -2076,7 +2180,25 @@
   case AMDGPU::SI_SPILL_V224_RESTORE:
   case AMDGPU::SI_SPILL_V256_RESTORE:
   case AMDGPU::SI_SPILL_V512_RESTORE:
-  case AMDGPU::SI_SPILL_V1024_RESTORE:
+  case AMDGPU::SI_SPILL_V1024_RESTORE: {
+    if (MFI->getLdsSpill().TotalSize > 0) {
+      const MachineOperand *VData =
+          TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
+      assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
+             MFI->getStackPtrOffsetReg());
+
+      bool SpilledToLds = buildLdsSpillLoadStore(
+          *MBB, MI, DL, /*IsLoad*/ true, Index, VData->getReg(), false,
+          TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
+          *MI->memoperands_begin());
+      if (SpilledToLds) {
+        MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode()));
+        MI->eraseFromParent();
+        break;
+      }
+    }
+    LLVM_FALLTHROUGH;
+  }
   case AMDGPU::SI_SPILL_A32_RESTORE:
   case AMDGPU::SI_SPILL_A64_RESTORE:
   case AMDGPU::SI_SPILL_A96_RESTORE:
diff --git a/llvm/test/CodeGen/AMDGPU/lds-spill-cs.ll b/llvm/test/CodeGen/AMDGPU/lds-spill-cs.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lds-spill-cs.ll
@@ -0,0 +1,64 @@
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -o - %s | FileCheck %s --check-prefixes=W32
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -o - %s | FileCheck %s --check-prefixes=W64
+; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -o - %s | FileCheck %s --check-prefixes=W32
+; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -o - %s | FileCheck %s --check-prefixes=W64
+
+; The test checks that part of the spilling goes to LDS with the right m0 setup.
+; Without a VGPR limit the test would need 16 VGPRs, since four vec4 values are in flight.
+; With the "amdgpu-num-vgpr"="12" limit, one vec4 has to be spilled to memory (16 bytes per lane).
+; Across the 64-lane workgroup that is 256 dwords, so "amdgpu-lds-spill-limit-dwords"="256" suffices.
+; Note: 16 bytes * 64 (workgroup size) = 1024 bytes = 256 dwords.
+
+define dllexport amdgpu_cs void @_amdgpu_cs_main(i32 inreg %globalTable, i32 inreg %perShaderTable, i32 inreg %descTable0, i32 inreg %spillTable, <3 x i32> inreg %WorkgroupId, i32 inreg %MultiDispatchInfo, <3 x i32> %LocalInvocationId, <2 x i32> inreg %ptr) #3 {
+; W32-LABEL: _amdgpu_cs_main:
+; W32: ; %bb.0: ; %.entry
+; W32: s_bfe_u32 s7, s7, 0xc0006
+; W32: s_mulk_i32 s7, 0x80
+; W32: s_mov_b32 m0, s7
+; W32: ds_write_addtid_b32 v0 ; 4-byte Folded Spill
+; W32: ds_write_addtid_b32 v1 offset:256 ; 4-byte Folded Spill
+; W32: ds_write_addtid_b32 v2 offset:512 ; 4-byte Folded Spill
+; W32: ds_write_addtid_b32 v3 offset:768 ; 4-byte Folded Spill
+; W32: ds_read_addtid_b32 v0 ; 4-byte Folded Reload
+; W32: ds_read_addtid_b32 v1 offset:256 ; 4-byte Folded Reload
+; W32: ds_read_addtid_b32 v2 offset:512 ; 4-byte Folded Reload
+; W32: ds_read_addtid_b32 v3 offset:768 ; 4-byte Folded Reload
+;
+; W64-LABEL: _amdgpu_cs_main:
+; W64: ; %bb.0: ; %.entry
+; W64: s_mov_b32 m0, 0
+; W64: ds_write_addtid_b32 v0 ; 4-byte Folded Spill
+; W64: ds_write_addtid_b32 v1 offset:256 ; 4-byte Folded Spill
+; W64: ds_write_addtid_b32 v2 offset:512 ; 4-byte Folded Spill
+; W64: ds_write_addtid_b32 v3 offset:768 ; 4-byte Folded Spill
+; W64: ds_read_addtid_b32 v0 ; 4-byte Folded Reload
+; W64: ds_read_addtid_b32 v1 offset:256 ; 4-byte Folded Reload
+; W64: ds_read_addtid_b32 v2 offset:512 ; 4-byte Folded Reload
+; W64: ds_read_addtid_b32 v3 offset:768 ; 4-byte Folded Reload
+.entry:
+  %i6 = bitcast <2 x i32> %ptr to i64
+  %i7 = inttoptr i64 %i6 to <4 x i32> addrspace(4)*
+  %i8 = load <4 x i32>, <4 x i32> addrspace(4)* %i7, align 16
+  %i9 = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %i8, i32 0, i32 0, i32 0)
+  %i10 = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %i8, i32 16, i32 0, i32 0)
+  %i11 = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %i8, i32 32, i32 0, i32 0)
+  %i12 = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %i8, i32 48, i32 0, i32 0)
+  fence syncscope("workgroup") acq_rel
+  call void @llvm.amdgcn.s.barrier()
+  call void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32> %i9, <4 x i32> %i8, i32 64, i32 0, i32 0)
+  call void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32> %i10, <4 x i32> %i8, i32 80, i32 0, i32 0)
+  call void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32> %i11, <4 x i32> %i8, i32 96, i32 0, i32 0)
+  call void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32> %i12, <4 x i32> %i8, i32 112, i32 0, i32 0)
+  ret void
+}
+
+; Function Attrs: convergent nounwind willreturn
+declare void @llvm.amdgcn.s.barrier()
+
+; Function Attrs: nounwind readonly willreturn
+declare <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32>, i32, i32, i32 immarg)
+
+; Function Attrs: nounwind willreturn writeonly
+declare void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32 immarg)
+
+attributes #3 = { nounwind "amdgpu-flat-work-group-size"="64,64" "amdgpu-lds-spill-limit-dwords"="256" "amdgpu-work-group-info-arg-no"="5" "amdgpu-num-vgpr"="12" }
diff --git a/llvm/test/CodeGen/AMDGPU/lds-spill-ps.ll b/llvm/test/CodeGen/AMDGPU/lds-spill-ps.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lds-spill-ps.ll
@@ -0,0 +1,64 @@
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -o - %s | FileCheck %s --check-prefixes=W32
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -o - %s | FileCheck %s --check-prefixes=W64
+; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -o - %s | FileCheck %s --check-prefixes=W32
+; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs -o - %s | FileCheck %s --check-prefixes=W64
+
+; The test checks that part of the spilling goes to LDS with the right m0 setup.
+; The "amdgpu-lds-spill-limit-dwords"="256" limit is respected:
+; - In wave32, 8 dword slots get allocated to LDS (8 * 32 = 256 dwords), equaling 1024 bytes.
+; - In wave64, 4 dword slots get allocated to LDS (4 * 64 = 256 dwords), equaling 1024 bytes.
+
+define dllexport amdgpu_ps void @_amdgpu_ps_main(i32 inreg %globalTable, i32 inreg %perShaderTable, i32 inreg %descTable0, i32 inreg %spillTable, i32 inreg %PrimMask, <2 x float> %PerspInterpSample, <2 x float> %PerspInterpCenter, <2 x float> %PerspInterpCentroid, <3 x float> %PerspInterpPullMode, <2 x float> %LinearInterpSample, <2 x float> %LinearInterpCenter, <2 x float> %LinearInterpCentroid, float %LineStipple, float %FragCoordX, float %FragCoordY, float %FragCoordZ, float %FragCoordW, i32 %FrontFacing, i32 %Ancillary, i32 %SampleCoverage, i32 %FixedXY, <2 x i32> inreg %ptr) #0 {
+; W32-LABEL: _amdgpu_ps_main:
+; W32: ; %bb.0: ; %.entry
+; W32: s_mov_b32 m0, 0
+; W32: ds_write_addtid_b32 v0 ; 4-byte Folded Spill
+; W32: ds_write_addtid_b32 v1 offset:128 ; 4-byte Folded Spill
+; W32: ds_write_addtid_b32 v2 offset:256 ; 4-byte Folded Spill
+; W32: ds_write_addtid_b32 v3 offset:384 ; 4-byte Folded Spill
+; W32: ds_write_addtid_b32 v0 offset:512 ; 4-byte Folded Spill
+; W32: ds_write_addtid_b32 v1 offset:640 ; 4-byte Folded Spill
+; W32: ds_write_addtid_b32 v2 offset:768 ; 4-byte Folded Spill
+; W32: ds_write_addtid_b32 v3 offset:896 ; 4-byte Folded Spill
+; W32: ds_read_addtid_b32 v0 ; 4-byte Folded Reload
+; W32: ds_read_addtid_b32 v1 offset:128 ; 4-byte Folded Reload
+; W32: ds_read_addtid_b32 v2 offset:256 ; 4-byte Folded Reload
+; W32: ds_read_addtid_b32 v3 offset:384 ; 4-byte Folded Reload
+; W32: ds_read_addtid_b32 v0 offset:512 ; 4-byte Folded Reload
+; W32: ds_read_addtid_b32 v1 offset:640 ; 4-byte Folded Reload
+; W32: ds_read_addtid_b32 v2 offset:768 ; 4-byte Folded Reload
+; W32: ds_read_addtid_b32 v3 offset:896 ; 4-byte Folded Reload
+;
+; W64-LABEL: _amdgpu_ps_main:
+; W64: ; %bb.0: ; %.entry
+; W64: s_mov_b32 m0, 0
+; W64: ds_write_addtid_b32 v0 ; 4-byte Folded Spill
+; W64: ds_write_addtid_b32 v1 offset:256 ; 4-byte Folded Spill
+; W64: ds_write_addtid_b32 v2 offset:512 ; 4-byte Folded Spill
+; W64: ds_write_addtid_b32 v3 offset:768 ; 4-byte Folded Spill
+; W64: ds_read_addtid_b32 v0 ; 4-byte Folded Reload
+; W64: ds_read_addtid_b32 v1 offset:256 ; 4-byte Folded Reload
+; W64: ds_read_addtid_b32 v2 offset:512 ; 4-byte Folded Reload
+; W64: ds_read_addtid_b32 v3 offset:768 ; 4-byte Folded Reload
+.entry:
+  %i6 = bitcast <2 x i32> %ptr to i64
+  %i7 = inttoptr i64 %i6 to <4 x i32> addrspace(4)*
+  %i8 = load <4 x i32>, <4 x i32> addrspace(4)* %i7, align 16
+  %i9 = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %i8, i32 0, i32 0, i32 0)
+  %i10 = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %i8, i32 16, i32 0, i32 0)
+  %i11 = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %i8, i32 32, i32 0, i32 0)
+  %i12 = call <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32> %i8, i32 48, i32 0, i32 0)
+  fence acq_rel
+  call void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32> %i9, <4 x i32> %i8, i32 64, i32 0, i32 0)
+  call void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32> %i10, <4 x i32> %i8, i32 80, i32 0, i32 0)
+  call void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32> %i11, <4 x i32> %i8, i32 96, i32 0, i32 0)
+  call void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32> %i12, <4 x i32> %i8, i32 112, i32 0, i32 0)
+  ret void
+}
+
+; Function Attrs: nounwind readonly willreturn
+declare <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32>, i32, i32, i32 immarg)
+; Function Attrs: nounwind willreturn writeonly
+declare void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32 immarg)
+
+attributes #0 = { "amdgpu-lds-spill-limit-dwords"="256" "amdgpu-num-vgpr"="12" }
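
Reviewer note (not part of the patch): the sketch below models the address computation that buildLdsSpillLoadStore performs, StackOffset * WorkGroupSize + LDSSize, evaluated with the wave64 parameters used by the tests above (one 16-byte slot, 64-lane workgroup, no other LDS). The helper name modelLdsSpillOffset and the driver program are illustrative assumptions only.

// Standalone model of the LDS-spill offset computation from
// SIRegisterInfo::buildLdsSpillLoadStore (sketch; modelLdsSpillOffset is a
// hypothetical helper, not part of the patch).
#include <cstdint>
#include <cstdio>

// Byte offset used by the addtid instruction for dword R of a spilled stack
// slot: the per-lane offset is scaled by the workgroup size (lane 0 of the
// workgroup) and placed after the LDS the kernel already uses.
static int64_t modelLdsSpillOffset(int64_t InstOffset, int64_t LdsOffsetForIndex,
                                   unsigned R, unsigned WorkGroupSize,
                                   unsigned LDSSize) {
  int64_t StackOffset = InstOffset + LdsOffsetForIndex + 4 * R;
  return StackOffset * WorkGroupSize + LDSSize;
}

int main() {
  // Wave64 runs of the tests above: a single 16-byte slot (4 dwords) is
  // spilled, the workgroup has 64 lanes, and no other LDS is in use.
  const unsigned WorkGroupSize = 64, LDSSize = 0;
  for (unsigned R = 0; R < 4; ++R)
    std::printf("dword %u -> ds offset %lld\n", R,
                (long long)modelLdsSpillOffset(0, 0, R, WorkGroupSize, LDSSize));
  // Prints 0, 256, 512 and 768, matching the offset: operands in the W64
  // CHECK lines (and, with WorkGroupSize = 32, the 128-byte stride in W32).
  return 0;
}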