Index: lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -417,14 +417,17 @@
     }
   }
 
-  if (VCCUsed || FlatUsed)
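+  // VCC occupies a pair of SGPRs; count it independently of FLAT_SCRATCH.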
+  if (VCCUsed)
     MaxSGPR += 2;
 
   if (FlatUsed) {
     MaxSGPR += 2;
     // 2 additional for VI+.
-    if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
-      MaxSGPR += 2;
+    if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+      // Assume XNACK_MASK is not used; otherwise this would need two more.
+      // MaxSGPR += 2;
+    }
   }
 
   // We found the maximum register index. They start at 0, so add one to get the
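
Net effect of the hunk above, as a sketch (assuming XNACK stays disabled, as
the new comment states; this paraphrases the function rather than quoting it):

    if (VCCUsed)
      MaxSGPR += 2; // VCC_LO/VCC_HI always take one pair.
    if (FlatUsed)
      MaxSGPR += 2; // FLAT_SCR_LO/FLAT_SCR_HI take a second, independent pair.
    // No extra pair is reserved for XNACK_MASK on VI+.

In particular, a kernel that uses FLAT but not VCC previously had the VCC pair
counted as well; the two are now tracked independently.
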
Index: lib/Target/AMDGPU/SIFrameLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIFrameLowering.cpp
+++ lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -105,51 +105,56 @@
     MBB.addLiveIn(PreloadedPrivateBufferReg);
   }
 
-  // We reserved the last registers for this. Shift it down to the end of those
-  // which were actually used.
-  //
-  // FIXME: It might be safer to use a pseudoregister before replacement.
-
-  // FIXME: We should be able to eliminate unused input registers. We only
-  // cannot do this for the resources required for scratch access. For now we
-  // skip over user SGPRs and may leave unused holes.
-
-  // We find the resource first because it has an alignment requirement.
-  if (ScratchRsrcReg == TRI->reservedPrivateSegmentBufferReg(MF)) {
-    MachineRegisterInfo &MRI = MF.getRegInfo();
-
-    unsigned NumPreloaded = MFI->getNumPreloadedSGPRs() / 4;
-    // Skip the last 2 elements because the last one is reserved for VCC, and
-    // this is the 2nd to last element already.
-    for (MCPhysReg Reg : getAllSGPR128().drop_back(2).slice(NumPreloaded)) {
-      // Pick the first unallocated one. Make sure we don't clobber the other
-      // reserved input we needed.
-      if (!MRI.isPhysRegUsed(Reg)) {
-        assert(MRI.isAllocatable(Reg));
-        MRI.replaceRegWith(ScratchRsrcReg, Reg);
-        ScratchRsrcReg = Reg;
-        MFI->setScratchRSrcReg(ScratchRsrcReg);
-        break;
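+  // On subtargets with the SGPR init bug (Tonga and Iceland) the SGPR
+  // allocation size is fixed, so there are no trailing unused registers to
+  // shift these into; keep the conservatively reserved defaults instead.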
+  if (!ST.hasSGPRInitBug()) {
+    // We reserved the last registers for this. Shift it down to the end of those
+    // which were actually used.
+    //
+    // FIXME: It might be safer to use a pseudoregister before replacement.
+
+    // FIXME: We should be able to eliminate unused input registers. We only
+    // cannot do this for the resources required for scratch access. For now we
+    // skip over user SGPRs and may leave unused holes.
+
+    // We find the resource first because it has an alignment requirement.
+    if (ScratchRsrcReg == TRI->reservedPrivateSegmentBufferReg(MF)) {
+      MachineRegisterInfo &MRI = MF.getRegInfo();
+
+      unsigned NumPreloaded = MFI->getNumPreloadedSGPRs() / 4;
+      // Skip the last 2 elements because the last one is reserved for VCC, and
+      // this is the 2nd to last element already.
+      for (MCPhysReg Reg : getAllSGPR128().drop_back(2).slice(NumPreloaded)) {
+        // Pick the first unallocated one. Make sure we don't clobber the other
+        // reserved input we needed.
+        if (!MRI.isPhysRegUsed(Reg)) {
+          assert(MRI.isAllocatable(Reg));
+          MRI.replaceRegWith(ScratchRsrcReg, Reg);
+          ScratchRsrcReg = Reg;
+          MFI->setScratchRSrcReg(ScratchRsrcReg);
+          break;
+        }
       }
     }
-  }
 
-  if (ScratchWaveOffsetReg == TRI->reservedPrivateSegmentWaveByteOffsetReg(MF)) {
-    MachineRegisterInfo &MRI = MF.getRegInfo();
-    // Skip the last 2 elements because the last one is reserved for VCC, and
-    // this is the 2nd to last element already.
-    unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
-    for (MCPhysReg Reg : getAllSGPRs().drop_back(6).slice(NumPreloaded)) {
-      // Pick the first unallocated SGPR. Be careful not to pick an alias of the
-      // scratch descriptor, since we haven't added its uses yet.
-      if (!MRI.isPhysRegUsed(Reg)) {
-        assert(MRI.isAllocatable(Reg) &&
-               !TRI->isSubRegisterEq(ScratchRsrcReg, Reg));
-
-        MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
-        ScratchWaveOffsetReg = Reg;
-        MFI->setScratchWaveOffsetReg(ScratchWaveOffsetReg);
-        break;
+    if (ScratchWaveOffsetReg == TRI->reservedPrivateSegmentWaveByteOffsetReg(MF)) {
+      MachineRegisterInfo &MRI = MF.getRegInfo();
+      // Skip the last 6 registers: the end of the SGPR file is reserved for
+      // VCC, FLAT_SCRATCH and (on VI+) XNACK_MASK.
+      unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
+      for (MCPhysReg Reg : getAllSGPRs().drop_back(6).slice(NumPreloaded)) {
+        // Pick the first unallocated SGPR. Be careful not to pick an alias of the
+        // scratch descriptor, since we haven't added its uses yet.
+        if (!MRI.isPhysRegUsed(Reg)) {
+          assert(MRI.isAllocatable(Reg) &&
+                 !TRI->isSubRegisterEq(ScratchRsrcReg, Reg));
+
+          MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
+          ScratchWaveOffsetReg = Reg;
+          MFI->setScratchWaveOffsetReg(ScratchWaveOffsetReg);
+          break;
+        }
       }
     }
   }
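
With the guard in place, subtargets that have the SGPR init bug skip both
searches and keep ScratchRsrcReg and ScratchWaveOffsetReg at their reserved
defaults. For context, a sketch of the subtarget hook this keys off (an
existing upstream query; the Tonga/Iceland mapping follows the test below):

    // AMDGPUSubtarget (existing API, shown for reference):
    bool hasSGPRInitBug() const; // true on Tonga and Iceland, where a fixed
                                 // SGPR allocation size is required and no
                                 // trailing SGPRs are left free to shift into.
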
Index: test/CodeGen/AMDGPU/spill-stress.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/spill-stress.ll
@@ -0,0 +1,43 @@
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s
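+; RUN: llc -march=amdgcn -mcpu=iceland -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s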
+
+; On Tonga and Iceland, limited SGPR availability means care must be taken to
+; allocate scratch registers correctly.
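+; The wide <60 x i32> values held live across the function exceed the register
+; budget and force spills, exercising the scratch setup in SIFrameLowering.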
+; GCN-LABEL: {{^}}test:
+define void @test(<60 x i32> addrspace(1)* %out, i32 addrspace(1)* %foil, <60 x i32> addrspace(1)* %in, <60 x i32> %f) {
+entry:
+  %tid = call i32 @llvm.SI.tid() nounwind readnone
+  %aptr = getelementptr <60 x i32>, <60 x i32> addrspace(1)* %in, i32 %tid
+  %a = load <60 x i32>, <60 x i32> addrspace(1)* %aptr
+  store i32 %tid, i32 addrspace(1)* %foil
+  %bidx = add i32 %tid, 60
+  %bptr = getelementptr <60 x i32>, <60 x i32> addrspace(1)* %in, i32 %bidx
+  %b = load <60 x i32>, <60 x i32> addrspace(1)* %bptr
+  %bfoil = getelementptr i32, i32 addrspace(1)* %foil, i32 1
+  store i32 %bidx, i32 addrspace(1)* %bfoil
+  %cidx = add i32 %tid, 128
+  %cptr = getelementptr <60 x i32>, <60 x i32> addrspace(1)* %in, i32 %cidx
+  %c = load <60 x i32>, <60 x i32> addrspace(1)* %cptr
+  %cfoil = getelementptr i32, i32 addrspace(1)* %bfoil, i32 1
+  store i32 %cidx, i32 addrspace(1)* %cfoil
+  %didx = add i32 %tid, 196
+  %dptr = getelementptr <60 x i32>, <60 x i32> addrspace(1)* %in, i32 %didx
+  %d = load <60 x i32>, <60 x i32> addrspace(1)* %dptr
+  %dfoil = getelementptr i32, i32 addrspace(1)* %cfoil, i32 1
+  store i32 %didx, i32 addrspace(1)* %dfoil
+  %eidx = add i32 %tid, 256
+  %eptr = getelementptr <60 x i32>, <60 x i32> addrspace(1)* %in, i32 %eidx
+  %e = load <60 x i32>, <60 x i32> addrspace(1)* %eptr
+  %am = mul <60 x i32> %a, %f
+  %s1 = add <60 x i32> %b, %am
+  %s2 = add <60 x i32> %s1, %c
+  %s3 = add <60 x i32> %s2, %d
+  %s = add <60 x i32> %e, %s3
+  %outptr = getelementptr <60 x i32>, <60 x i32> addrspace(1)* %out, i32 %tid
+  store <60 x i32> %s, <60 x i32> addrspace(1)* %outptr
+  ret void
+}
+
+declare i32 @llvm.SI.tid() nounwind readnone