Index: lib/Target/AMDGPU/SIFrameLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIFrameLowering.cpp
+++ lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -21,12 +21,6 @@
 using namespace llvm;
 
-static bool hasOnlySGPRSpills(const SIMachineFunctionInfo *FuncInfo,
-                              const MachineFrameInfo &MFI) {
-  return FuncInfo->hasSpilledSGPRs() &&
-    (!FuncInfo->hasSpilledVGPRs() && !FuncInfo->hasNonSpillStackObjects());
-}
-
 static ArrayRef<MCPhysReg> getAllSGPR128(const MachineFunction &MF,
                                          const SIRegisterInfo *TRI) {
   return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(),
@@ -75,7 +69,6 @@
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
 
-  // Add wave offset in bytes to private base offset.
   // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
   BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
@@ -97,7 +90,8 @@
   // We need to insert initialization of the scratch resource descriptor.
   unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
-  assert(ScratchRsrcReg != AMDGPU::NoRegister);
+  if (ScratchRsrcReg == AMDGPU::NoRegister)
+    return AMDGPU::NoRegister;
 
   if (ST.hasSGPRInitBug() ||
       ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
@@ -116,14 +110,17 @@
   MachineRegisterInfo &MRI = MF.getRegInfo();
 
-  unsigned NumPreloaded = MFI->getNumPreloadedSGPRs() / 4;
+  unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
+  ArrayRef<MCPhysReg> AllSGPR128s = getAllSGPR128(MF, TRI);
+  AllSGPR128s = AllSGPR128s.slice(std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));
+
   // Skip the last 2 elements because the last one is reserved for VCC, and
   // this is the 2nd to last element already.
-  for (MCPhysReg Reg : getAllSGPR128(MF, TRI).drop_back(2).slice(NumPreloaded)) {
+  for (MCPhysReg Reg : AllSGPR128s) {
     // Pick the first unallocated one. Make sure we don't clobber the other
     // reserved input we needed.
-    if (!MRI.isPhysRegUsed(Reg)) {
-      assert(MRI.isAllocatable(Reg));
+    if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) {
+      //assert(MRI.isAllocatable(Reg));
       MRI.replaceRegWith(ScratchRsrcReg, Reg);
       MFI->setScratchRSrcReg(Reg);
       return Reg;
@@ -146,8 +143,15 @@
   unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
   MachineRegisterInfo &MRI = MF.getRegInfo();
 
+  unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
+  ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(MF, TRI);
+  if (NumPreloaded > AllSGPRs.size())
+    return ScratchWaveOffsetReg;
+
+  AllSGPRs = AllSGPRs.slice(NumPreloaded);
+
   // We need to drop register from the end of the list that we cannot use
   // for the scratch wave offset.
   // + 2 s102 and s103 do not exist on VI.
@@ -161,7 +165,10 @@
   // are no other free SGPRs, then the value will stay in this register.
   // ----
   //  13
-  for (MCPhysReg Reg : getAllSGPRs(MF, TRI).drop_back(13).slice(NumPreloaded)) {
+  if (AllSGPRs.size() < 13)
+    return ScratchWaveOffsetReg;
+
+  for (MCPhysReg Reg : AllSGPRs.drop_back(13)) {
     // Pick the first unallocated SGPR. Be careful not to pick an alias of the
     // scratch descriptor, since we haven't added its uses yet.
     if (!MRI.isPhysRegUsed(Reg)) {
@@ -186,9 +193,6 @@
   if (ST.debuggerEmitPrologue())
     emitDebuggerPrologue(MF, MBB);
 
-  if (!MF.getFrameInfo().hasStackObjects())
-    return;
-
   assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
 
   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
@@ -198,8 +202,6 @@
   //
   // FIXME: We should be cleaning up these unused SGPR spill frame indices
   // somewhere.
-  if (hasOnlySGPRSpills(MFI, MF.getFrameInfo()))
-    return;
 
   const SIInstrInfo *TII = ST.getInstrInfo();
   const SIRegisterInfo *TRI = &TII->getRegisterInfo();
@@ -209,38 +211,51 @@
     = getReservedPrivateSegmentBufferReg(ST, TII, TRI, MFI, MF);
   unsigned ScratchWaveOffsetReg
     = getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF);
-  assert(ScratchRsrcReg != AMDGPU::NoRegister);
-  assert(ScratchWaveOffsetReg != AMDGPU::NoRegister);
+
+  if (ScratchRsrcReg == AMDGPU::NoRegister) {
+    assert(ScratchWaveOffsetReg == AMDGPU::NoRegister);
+    return;
+  }
+
   assert(!TRI->isSubRegister(ScratchRsrcReg, ScratchWaveOffsetReg));
 
-  if (MFI->hasFlatScratchInit())
+  // We need to do the replacement of the private segment buffer and wave offset
+  // register even if there are no stack objects. There could be stores to undef
+  // or a constant without an associated object.
+
+  // FIXME: We still have implicit uses on SGPR spill instructions in case they
+  // need to spill to vector memory. It's likely that will not happen, but at
+  // this point it appears we need the setup. This part of the prolog should be
+  // emitted after frame indices are eliminated.
+
+  if (MF.getFrameInfo().hasStackObjects() && MFI->hasFlatScratchInit())
     emitFlatScratchInit(TII, TRI, MF, MBB);
 
   // We need to insert initialization of the scratch resource descriptor.
   unsigned PreloadedScratchWaveOffsetReg = TRI->getPreloadedValue(
     MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+
   unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
   if (ST.isAmdCodeObjectV2()) {
     PreloadedPrivateBufferReg = TRI->getPreloadedValue(
       MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
   }
 
-  // If we reserved the original input registers, we don't need to copy to the
-  // reserved registers.
-  if (ScratchRsrcReg == PreloadedPrivateBufferReg) {
-    // We should always reserve these 5 registers at the same time.
-    assert(ScratchWaveOffsetReg == PreloadedScratchWaveOffsetReg &&
-           "scratch wave offset and private segment buffer inconsistent");
-    return;
-  }
+  bool OffsetRegUsed = !MRI.use_empty(ScratchWaveOffsetReg);
+  bool ResourceRegUsed = !MRI.use_empty(ScratchRsrcReg);
 
   // We added live-ins during argument lowering, but since they were not used
   // they were deleted. We're adding the uses now, so add them back.
-  MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
-  MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
+  if (OffsetRegUsed) {
+    assert(PreloadedScratchWaveOffsetReg != AMDGPU::NoRegister &&
+           "scratch wave offset input is required");
+    MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
+    MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
+  }
 
-  if (ST.isAmdCodeObjectV2()) {
+  if (ResourceRegUsed && PreloadedPrivateBufferReg != AMDGPU::NoRegister) {
+    assert(ST.isAmdCodeObjectV2());
     MRI.addLiveIn(PreloadedPrivateBufferReg);
     MBB.addLiveIn(PreloadedPrivateBufferReg);
   }
@@ -250,30 +265,46 @@
     if (&OtherBB == &MBB)
       continue;
 
-    OtherBB.addLiveIn(ScratchRsrcReg);
-    OtherBB.addLiveIn(ScratchWaveOffsetReg);
+    if (OffsetRegUsed)
+      OtherBB.addLiveIn(ScratchWaveOffsetReg);
+
+    if (ResourceRegUsed)
+      OtherBB.addLiveIn(ScratchRsrcReg);
   }
 
   DebugLoc DL;
   MachineBasicBlock::iterator I = MBB.begin();
 
-  if (PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) {
-    // Make sure we emit the copy for the offset first. We may have chosen to
-    // copy the buffer resource into a register that aliases the input offset
-    // register.
+  // If we reserved the original input registers, we don't need to copy to the
+  // reserved registers.
+
+  bool CopyBuffer = ResourceRegUsed &&
+    PreloadedPrivateBufferReg != AMDGPU::NoRegister &&
+    ScratchRsrcReg != PreloadedPrivateBufferReg;
+
+  // This needs to be careful of the copying order to avoid overwriting one of
+  // the input registers before it's been copied to its final
+  // destination. Usually the offset should be copied first.
+  bool CopyBufferFirst = TRI->isSubRegisterEq(PreloadedPrivateBufferReg,
+                                              ScratchWaveOffsetReg);
+  if (CopyBuffer && CopyBufferFirst) {
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
+      .addReg(PreloadedPrivateBufferReg, RegState::Kill);
+  }
+
+  if (OffsetRegUsed &&
+      PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) {
     BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
       .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
   }
 
-  if (ST.isAmdCodeObjectV2()) {
-    // Insert copies from argument register.
-    assert(
-      !TRI->isSubRegisterEq(PreloadedPrivateBufferReg, ScratchRsrcReg) &&
-      !TRI->isSubRegisterEq(PreloadedPrivateBufferReg, ScratchWaveOffsetReg));
-
+  if (CopyBuffer && !CopyBufferFirst) {
     BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
       .addReg(PreloadedPrivateBufferReg, RegState::Kill);
-  } else {
+  }
+
+  if (ResourceRegUsed && PreloadedPrivateBufferReg == AMDGPU::NoRegister) {
+    assert(!ST.isAmdCodeObjectV2());
     const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
 
     unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -614,7 +614,12 @@
     BuildMI(MBB, MI, DL, OpDesc)
       .addReg(SrcReg, getKillRegState(isKill)) // data
       .addFrameIndex(FrameIndex)               // addr
-      .addMemOperand(MMO);
+      .addMemOperand(MMO)
+      .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
+      .addReg(MFI->getScratchWaveOffsetReg(), RegState::Implicit);
+    // Add the scratch resource registers as implicit uses because we may end up
+    // needing them, and need to ensure that the reserved registers are
+    // correctly handled.
     return;
   }
@@ -707,7 +712,9 @@
     BuildMI(MBB, MI, DL, OpDesc, DestReg)
       .addFrameIndex(FrameIndex) // addr
-      .addMemOperand(MMO);
+      .addMemOperand(MMO)
+      .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
+      .addReg(MFI->getScratchWaveOffsetReg(), RegState::Implicit);
     return;
   }
Index: lib/Target/AMDGPU/SIRegisterInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -973,9 +973,20 @@
       F, "amdgpu-num-sgpr", MaxNumSGPRs);
 
   // Make sure requested value does not violate subtarget's specifications.
-  if (Requested && Requested <= getNumReservedSGPRs(ST))
+  if (Requested && (Requested <= getNumReservedSGPRs(ST)))
     Requested = 0;
 
+  // If more SGPRs are required to support the input user/system SGPRs,
+  // increase to accommodate them.
+  //
+  // FIXME: This really ends up using the requested number of SGPRs + number
+  // of reserved special registers in total. Theoretically you could re-use
+  // the last input registers for these special registers, but this would
+  // require a lot of complexity to deal with the weird aliasing.
+  unsigned NumInputSGPRs = MFI.getNumPreloadedSGPRs();
+  if (Requested && Requested < NumInputSGPRs)
+    Requested = NumInputSGPRs;
+
   // Make sure requested value is compatible with values implied by
   // default/requested minimum/maximum number of waves per execution unit.
   if (Requested && Requested > getMaxNumSGPRs(ST, WavesPerEU.first))
Index: test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
===================================================================
--- test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
+++ test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
@@ -1,9 +1,16 @@
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s
 
-; CHECK-LABEL: {{^}}max_18_sgprs:
+; CHECK-LABEL: {{^}}max_14_sgprs:
+
+; FIXME: Should be able to skip this copying of the private segment
+; buffer because all the SGPR spills are to VGPRs.
+
+; CHECK: s_mov_b64 s[6:7], s[2:3]
+; CHECK: s_mov_b64 s[4:5], s[0:1]
+
 ; CHECK: SGPRBlocks: 1
-; CHECK: NumSGPRsForWavesPerEU: 13
-define void @max_18_sgprs(i32 addrspace(1)* %out1,
+; CHECK: NumSGPRsForWavesPerEU: 14
+define void @max_14_sgprs(i32 addrspace(1)* %out1,
                           i32 addrspace(1)* %out2,
                           i32 addrspace(1)* %out3,
                           i32 addrspace(1)* %out4,
@@ -14,4 +21,102 @@
   store i32 %four, i32 addrspace(1)* %out4
   ret void
 }
-attributes #0 = {"amdgpu-num-sgpr"="18"}
+
+; private resource: 4
+; scratch wave offset: 1
+; workgroup ids: 3
+; dispatch id: 2
+; queue ptr: 2
+; flat scratch init: 2
+; ---------------------
+; total: 14
+
+; + reserved vcc, flat_scratch = 18
+
+; Because we can't handle re-using the last few input registers as the
+; special vcc etc. registers (as well as decide to not use the unused
+; features when the number of registers is frozen), this ends up using
+; more than expected.
+
+; ALL-LABEL: {{^}}max_12_sgprs_14_input_sgprs:
+; TOSGPR: SGPRBlocks: 2
+; TOSGPR: NumSGPRsForWavesPerEU: 18
+
+; TOSMEM: s_mov_b64 s[6:7], s[2:3]
+; TOSMEM: s_mov_b32 s9, s13
+; TOSMEM: s_mov_b64 s[4:5], s[0:1]
+
+; TOSMEM: SGPRBlocks: 2
+; TOSMEM: NumSGPRsForWavesPerEU: 18
+define void @max_12_sgprs_14_input_sgprs(i32 addrspace(1)* %out1,
+                                         i32 addrspace(1)* %out2,
+                                         i32 addrspace(1)* %out3,
+                                         i32 addrspace(1)* %out4,
+                                         i32 %one, i32 %two, i32 %three, i32 %four) #2 {
+  store volatile i32 0, i32* undef
+  %x.0 = call i32 @llvm.amdgcn.workgroup.id.x()
+  store volatile i32 %x.0, i32 addrspace(1)* undef
+  %x.1 = call i32 @llvm.amdgcn.workgroup.id.y()
+  store volatile i32 %x.0, i32 addrspace(1)* undef
+  %x.2 = call i32 @llvm.amdgcn.workgroup.id.z()
+  store volatile i32 %x.0, i32 addrspace(1)* undef
+  %x.3 = call i64 @llvm.amdgcn.dispatch.id()
+  store volatile i64 %x.3, i64 addrspace(1)* undef
+  %x.4 = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
+  store volatile i8 addrspace(2)* %x.4, i8 addrspace(2)* addrspace(1)* undef
+  %x.5 = call i8 addrspace(2)* @llvm.amdgcn.queue.ptr()
+  store volatile i8 addrspace(2)* %x.5, i8 addrspace(2)* addrspace(1)* undef
+
+  store i32 %one, i32 addrspace(1)* %out1
+  store i32 %two, i32 addrspace(1)* %out2
+  store i32 %three, i32 addrspace(1)* %out3
+  store i32 %four, i32 addrspace(1)* %out4
+  ret void
+}
+
+; ALL-LABEL: max_12_sgprs_12_input_sgprs{{$}}
+; ; Make sure copies for input buffer are not clobbered. This requires
+; ; swapping the order the registers are copied from what normally
+; ; happens.
+
+; TOSMEM: s_mov_b64 s[6:7], s[2:3]
+; TOSMEM: s_mov_b64 s[4:5], s[0:1]
+; TOSMEM: s_mov_b32 s3, s11
+
+; ALL: SGPRBlocks: 1
+; ALL: NumSGPRsForWavesPerEU: 16
+define void @max_12_sgprs_12_input_sgprs(i32 addrspace(1)* %out1,
+                                         i32 addrspace(1)* %out2,
+                                         i32 addrspace(1)* %out3,
+                                         i32 addrspace(1)* %out4,
+                                         i32 %one, i32 %two, i32 %three, i32 %four) #2 {
+  store volatile i32 0, i32* undef
+  %x.0 = call i32 @llvm.amdgcn.workgroup.id.x()
+  store volatile i32 %x.0, i32 addrspace(1)* undef
+  %x.1 = call i32 @llvm.amdgcn.workgroup.id.y()
+  store volatile i32 %x.0, i32 addrspace(1)* undef
+  %x.2 = call i32 @llvm.amdgcn.workgroup.id.z()
+  store volatile i32 %x.0, i32 addrspace(1)* undef
+  %x.3 = call i64 @llvm.amdgcn.dispatch.id()
+  store volatile i64 %x.3, i64 addrspace(1)* undef
+  %x.4 = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
+  store volatile i8 addrspace(2)* %x.4, i8 addrspace(2)* addrspace(1)* undef
+
+  store i32 %one, i32 addrspace(1)* %out1
+  store i32 %two, i32 addrspace(1)* %out2
+  store i32 %three, i32 addrspace(1)* %out3
+  store i32 %four, i32 addrspace(1)* %out4
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workgroup.id.x() #1
+declare i32 @llvm.amdgcn.workgroup.id.y() #1
+declare i32 @llvm.amdgcn.workgroup.id.z() #1
+declare i64 @llvm.amdgcn.dispatch.id() #1
+declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #1
+declare i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #1
+
+attributes #0 = { nounwind "amdgpu-num-sgpr"="14" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind "amdgpu-num-sgpr"="12" }
+attributes #3 = { nounwind "amdgpu-num-sgpr"="11" }
Index: test/CodeGen/AMDGPU/private-access-no-objects.ll
===================================================================
--- test/CodeGen/AMDGPU/private-access-no-objects.ll
+++ test/CodeGen/AMDGPU/private-access-no-objects.ll
@@ -1,6 +1,17 @@
-; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=OPTNONE %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=OPT %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=OPT %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=iceland -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=OPT %s
+; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=OPTNONE %s
+
+; There are no stack objects, but still a private memory access. The
+; private access registers need to be correctly initialized anyway, and
+; shifted down to the end of the used registers.
 
 ; GCN-LABEL: {{^}}store_to_undef:
+; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1]
+; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3]
+; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}}
+; OPT: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}}
 ; -O0 should assume spilling, so the input scratch resource descriptor
 ; should be used directly without any copies.
@@ -13,18 +24,30 @@
 }
 
 ; GCN-LABEL: {{^}}store_to_inttoptr:
+; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1]
+; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3]
+; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}}
+; OPT: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}}
 define void @store_to_inttoptr() #0 {
   store volatile i32 0, i32* inttoptr (i32 123 to i32*)
   ret void
 }
 
 ; GCN-LABEL: {{^}}load_from_undef:
+; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1]
+; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3]
+; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}}
+; OPT: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}}
 define void @load_from_undef() #0 {
   %ld = load volatile i32, i32* undef
   ret void
 }
 
 ; GCN-LABEL: {{^}}load_from_inttoptr:
+; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1]
+; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3]
+; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}}
+; OPT: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}}
 define void @load_from_inttoptr() #0 {
   %ld = load volatile i32, i32* inttoptr (i32 123 to i32*)
   ret void
Index: test/CodeGen/AMDGPU/si-sgpr-spill.ll
===================================================================
--- test/CodeGen/AMDGPU/si-sgpr-spill.ll
+++ test/CodeGen/AMDGPU/si-sgpr-spill.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck %s
 
 ; These tests check that the compiler won't crash when it needs to spill
 ; SGPRs.
Index: test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll
===================================================================
--- test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll
+++ test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck --check-prefix=TONGA %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=TONGA %s
 
 ; On Tonga and Iceland, limited SGPR availability means care must be taken to
 ; allocate scratch registers correctly. Check that this test compiles without
Index: test/CodeGen/AMDGPU/spill-m0.ll
===================================================================
--- test/CodeGen/AMDGPU/spill-m0.ll
+++ test/CodeGen/AMDGPU/spill-m0.ll
@@ -6,6 +6,8 @@
 ; XXX - Why does it like to use vcc?
 
 ; GCN-LABEL: {{^}}spill_m0:
+; TOSMEM: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
+
 ; GCN: s_cmp_lg_u32
 
 ; TOVGPR: s_mov_b32 vcc_hi, m0
Index: test/CodeGen/AMDGPU/wqm.ll
===================================================================
--- test/CodeGen/AMDGPU/wqm.ll
+++ test/CodeGen/AMDGPU/wqm.ll
@@ -459,7 +459,7 @@
   br i1 %cc, label %if, label %else
 
 if:
-  store volatile <4 x float> %dtex, <4 x float>* undef
+  store volatile <4 x float> %dtex, <4 x float> addrspace(1)* undef
   unreachable
 
 else: