Index: lib/Target/AMDGPU/SIFrameLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIFrameLowering.cpp
+++ lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -21,19 +21,8 @@
 static bool hasOnlySGPRSpills(const SIMachineFunctionInfo *FuncInfo,
                               const MachineFrameInfo *FrameInfo) {
-  if (!FuncInfo->hasSpilledSGPRs())
-    return false;
-
-  if (FuncInfo->hasSpilledVGPRs())
-    return false;
-
-  for (int I = FrameInfo->getObjectIndexBegin(),
-         E = FrameInfo->getObjectIndexEnd(); I != E; ++I) {
-    if (!FrameInfo->isSpillSlotObjectIndex(I))
-      return false;
-  }
-
-  return true;
+  return FuncInfo->hasSpilledSGPRs() &&
+    (!FuncInfo->hasSpilledVGPRs() && !FuncInfo->hasNonSpillStackObjects());
 }
 
 static ArrayRef<MCPhysReg> getAllSGPR128() {
@@ -67,6 +56,8 @@
     static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
   const SIRegisterInfo *TRI = &TII->getRegisterInfo();
   const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  MachineBasicBlock::iterator I = MBB.begin();
 
   // We need to insert initialization of the scratch resource descriptor.
   unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
@@ -84,6 +75,44 @@
       MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
   }
 
+  if (MFI->hasFlatScratchInit() /*&& MFI->hasFlatInstructions()*/) {
+    // We don't need this if we only have spills since there is no user facing
+    // scratch.
+
+    // TODO: If we know we don't have flat instructions earlier, we can omit
+    // this from the input registers.
+    //
+    // TODO: We only need to know if we access scratch space through a flat
+    // pointer. Because we only detect if flat instructions are used at all,
+    // this will be used more often than necessary on VI.
+
+    DebugLoc DL;
+
+    unsigned FlatScratchInitReg
+      = TRI->getPreloadedValue(MF, SIRegisterInfo::FLAT_SCRATCH_INIT);
+
+    MRI.addLiveIn(FlatScratchInitReg);
+    MBB.addLiveIn(FlatScratchInitReg);
+
+    // Copy the size in bytes.
+    unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::FLAT_SCR_LO)
+      .addReg(FlatScrInitHi, RegState::Kill);
+
+    unsigned FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
+
+    // Add wave offset in bytes to private base offset.
+    // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
+      .addReg(FlatScrInitLo)
+      .addReg(ScratchWaveOffsetReg);
+
+    // Convert offset to 256-byte units.
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32), AMDGPU::FLAT_SCR_HI)
+      .addReg(FlatScrInitLo, RegState::Kill)
+      .addImm(8);
+  }
+
   // If we reserved the original input registers, we don't need to copy to the
   // reserved registers.
   if (ScratchRsrcReg == PreloadedPrivateBufferReg) {
@@ -96,7 +125,6 @@
 
   // We added live-ins during argument lowering, but since they were not used
   // they were deleted. We're adding the uses now, so add them back.
-  MachineRegisterInfo &MRI = MF.getRegInfo();
   MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
   MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
 
@@ -160,7 +188,6 @@
   assert(!TRI->isSubRegister(ScratchRsrcReg, ScratchWaveOffsetReg));
 
   const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
-  MachineBasicBlock::iterator I = MBB.begin();
   DebugLoc DL;
 
   if (PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) {
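The prologue block added above is the functional core of the patch: flat_scratch is now derived from the preloaded FLAT_SCRATCH_INIT SGPR pair plus the per-wave scratch offset, rather than from compile-time constants. The following is a minimal standalone C++ model of the value the three emitted instructions compute, assuming the sub0/sub1 roles described in the comments above (base offset in sub0, per-thread scratch size in bytes in sub1); it is an illustration written for this note, not code from the patch.

#include <cassert>
#include <cstdint>

// Models the new prologue sequence:
//   s_mov_b32  flat_scratch_lo, <sub1>        ; per-thread size in bytes
//   s_add_u32  <sub0>, <sub0>, <wave offset>  ; byte offset of this wave
//   s_lshr_b32 flat_scratch_hi, <sub0>, 8     ; convert to 256-byte units
struct FlatScratch {
  uint32_t Lo; // per-thread scratch size in bytes
  uint32_t Hi; // scratch base offset in 256-byte units
};

FlatScratch computeFlatScratch(uint32_t BaseOffset,  // FLAT_SCRATCH_INIT sub0
                               uint32_t SizeInBytes, // FLAT_SCRATCH_INIT sub1
                               uint32_t WaveOffset) {
  FlatScratch FS;
  FS.Lo = SizeInBytes;
  FS.Hi = (BaseOffset + WaveOffset) >> 8;
  return FS;
}

int main() {
  // 1 KiB per-thread scratch, wave starting 512 bytes into the segment:
  FlatScratch FS = computeFlatScratch(0, 1024, 512);
  assert(FS.Lo == 1024 && FS.Hi == 2); // 512 bytes = 2 * 256-byte units
  return 0;
}

This corresponds to the instruction sequence the updated private-memory.ll test checks further below: s_mov_b32 flat_scratch_lo, s7; s_add_u32 s6, s6, s9; s_lshr_b32 flat_scratch_hi, s6, 8.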
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -695,6 +695,12 @@
     CCInfo.AllocateReg(InputPtrReg);
   }
 
+  if (Info->hasFlatScratchInit()) {
+    unsigned FlatScratchInitReg = Info->addFlatScratchInit(*TRI);
+    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SReg_64RegClass);
+    CCInfo.AllocateReg(FlatScratchInitReg);
+  }
+
   AnalyzeFormalArguments(CCInfo, Splits);
 
   SmallVector<SDValue, 16> Chains;
@@ -822,8 +828,11 @@
 
   // Now that we've figured out where the scratch register inputs are, see if
   // should reserve the arguments and use them directly.
-
   bool HasStackObjects = MF.getFrameInfo()->hasStackObjects();
+  // Record that we know we have non-spill stack objects so we don't need to
+  // check all stack objects later.
+  if (HasStackObjects)
+    Info->setHasNonSpillStackObjects(true);
 
   if (ST.isAmdHsaOS()) {
     // TODO: Assume we will spill without optimizations.
Index: lib/Target/AMDGPU/SILowerControlFlow.cpp
===================================================================
--- lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -564,43 +564,11 @@
               AMDGPU::EXEC).addReg(AMDGPU::EXEC);
   }
 
-  // FIXME: This seems inappropriate to do here.
   if (NeedFlat && MFI->IsKernel) {
-    // Insert the prologue initializing the SGPRs pointing to the scratch space
-    // for flat accesses.
-    const MachineFrameInfo *FrameInfo = MF.getFrameInfo();
-
     // TODO: What to use with function calls?
-
-    // FIXME: This is reporting stack size that is used in a scratch buffer
-    // rather than registers as well.
-    uint64_t StackSizeBytes = FrameInfo->getStackSize();
-
-    int IndirectBegin
-      = static_cast<const AMDGPUInstrInfo *>(TII)->getIndirectIndexBegin(MF);
-    // Convert register index to 256-byte unit.
-    uint64_t StackOffset = IndirectBegin < 0 ? 0 : (4 * IndirectBegin / 256);
-
-    assert((StackSizeBytes < 0xffff) && StackOffset < 0xffff &&
-           "Stack limits should be smaller than 16-bits");
-
-    // Initialize the flat scratch register pair.
-    // TODO: Can we use one s_mov_b64 here?
-
-    // Offset is in units of 256-bytes.
-    MachineBasicBlock &MBB = MF.front();
-    DebugLoc NoDL;
-    MachineBasicBlock::iterator Start = MBB.getFirstNonPHI();
-    const MCInstrDesc &SMovK = TII->get(AMDGPU::S_MOVK_I32);
-
-    assert(isInt<16>(StackOffset) && isInt<16>(StackSizeBytes));
-
-    BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_LO)
-      .addImm(StackOffset);
-
-    // Documentation says size is "per-thread scratch size in bytes"
-    BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_HI)
-      .addImm(StackSizeBytes);
+    // We will need to initialize the flat scratch register pair.
+    if (NeedFlat)
+      MFI->setHasFlatInstructions(true);
   }
 
   return true;
 }
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.h
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -73,6 +73,8 @@
 private:
   bool HasSpilledSGPRs;
   bool HasSpilledVGPRs;
+  bool HasNonSpillStackObjects;
+  bool HasFlatInstructions;
 
   // Feature bits required for inputs passed in user SGPRs.
   bool PrivateSegmentBuffer : 1;
@@ -129,6 +131,7 @@
   unsigned addDispatchPtr(const SIRegisterInfo &TRI);
   unsigned addQueuePtr(const SIRegisterInfo &TRI);
   unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI);
+  unsigned addFlatScratchInit(const SIRegisterInfo &TRI);
 
   // Add system SGPRs.
   unsigned addWorkGroupIDX() {
@@ -277,6 +280,22 @@
     HasSpilledVGPRs = Spill;
   }
 
+  bool hasNonSpillStackObjects() const {
+    return HasNonSpillStackObjects;
+  }
+
+  void setHasNonSpillStackObjects(bool StackObject = true) {
+    HasNonSpillStackObjects = StackObject;
+  }
+
+  bool hasFlatInstructions() const {
+    return HasFlatInstructions;
+  }
+
+  void setHasFlatInstructions(bool UseFlat = true) {
+    HasFlatInstructions = UseFlat;
+  }
+
   unsigned getPSInputAddr() const {
     return PSInputAddr;
   }
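As a usage note: addFlatScratchInit follows the same pattern as the existing addDispatchPtr/addQueuePtr/addKernargSegmentPtr helpers, reserving an aligned SGPR pair and bumping NumUserSGPRs by 2. The fragment below restates the SIISelLowering.cpp hunk from earlier in this patch to show how the new input is wired up during argument lowering; it is the patch's own code repeated for reference, not an addition.

if (Info->hasFlatScratchInit()) {
  unsigned FlatScratchInitReg = Info->addFlatScratchInit(*TRI);
  MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SReg_64RegClass);
  CCInfo.AllocateReg(FlatScratchInitReg);
}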
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -54,6 +54,8 @@
     NumSystemSGPRs(0),
     HasSpilledSGPRs(false),
     HasSpilledVGPRs(false),
+    HasNonSpillStackObjects(false),
+    HasFlatInstructions(false),
     PrivateSegmentBuffer(false),
     DispatchPtr(false),
     QueuePtr(false),
@@ -93,6 +95,11 @@
   if (F->hasFnAttribute("amdgpu-work-item-id-z"))
     WorkItemIDZ = true;
 
+  // X, XY, and XYZ are the only supported combinations, so make sure Y is
+  // enabled if Z is.
+  if (WorkItemIDZ)
+    WorkItemIDY = true;
+
   bool MaySpill = ST.isVGPRSpillingEnabled(this);
   bool HasStackObjects = FrameInfo->hasStackObjects();
 
@@ -107,10 +114,12 @@
       DispatchPtr = true;
   }
 
-  // X, XY, and XYZ are the only supported combinations, so make sure Y is
-  // enabled if Z is.
-  if (WorkItemIDZ)
-    WorkItemIDY = true;
+  // We don't need to worry about accessing spills with flat instructions.
+  // TODO: On VI where we must use flat for global, we should be able to omit
+  // this if it is never used for generic access.
+  if (HasStackObjects && ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS &&
+      ST.isAmdHsaOS())
+    FlatScratchInit = true;
 }
 
 unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
@@ -142,6 +151,13 @@
   return KernargSegmentPtrUserSGPR;
 }
 
+unsigned SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) {
+  FlatScratchInitUserSGPR = TRI.getMatchingSuperReg(
+    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+  NumUserSGPRs += 2;
+  return FlatScratchInitUserSGPR;
+}
+
 SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg(
   MachineFunction *MF,
   unsigned FrameIndex,
Index: lib/Target/AMDGPU/SIRegisterInfo.h
===================================================================
--- lib/Target/AMDGPU/SIRegisterInfo.h
+++ lib/Target/AMDGPU/SIRegisterInfo.h
@@ -117,10 +117,12 @@
   enum PreloadedValue {
     // SGPRS:
-    PRIVATE_SEGMENT_BUFFER = 0,
+    PRIVATE_SEGMENT_BUFFER =  0,
     DISPATCH_PTR        =  1,
     QUEUE_PTR           =  2,
     KERNARG_SEGMENT_PTR =  3,
+    DISPATCH_ID         =  4,
+    FLAT_SCRATCH_INIT   =  5,
     WORKGROUP_ID_X      = 10,
     WORKGROUP_ID_Y      = 11,
     WORKGROUP_ID_Z      = 12,
Index: lib/Target/AMDGPU/SIRegisterInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -624,6 +624,11 @@
   case SIRegisterInfo::KERNARG_SEGMENT_PTR:
     assert(MFI->hasKernargSegmentPtr());
     return MFI->KernargSegmentPtrUserSGPR;
+  case SIRegisterInfo::DISPATCH_ID:
+    llvm_unreachable("unimplemented");
+  case SIRegisterInfo::FLAT_SCRATCH_INIT:
+    assert(MFI->hasFlatScratchInit());
+    return MFI->FlatScratchInitUserSGPR;
   case SIRegisterInfo::DISPATCH_PTR:
     assert(MFI->hasDispatchPtr());
     return MFI->DispatchPtrUserSGPR;
Index: test/CodeGen/AMDGPU/cgp-addressing-modes.ll
===================================================================
--- test/CodeGen/AMDGPU/cgp-addressing-modes.ll
+++ test/CodeGen/AMDGPU/cgp-addressing-modes.ll
@@ -190,8 +190,6 @@
 }
 
 ; GCN-LABEL: {{^}}test_sink_global_vreg_sreg_i32:
-; VI-DAG: s_movk_i32 flat_scratch_lo, 0x0
-; VI-DAG: s_movk_i32 flat_scratch_hi, 0x0
 ; GCN: s_and_saveexec_b64
 ; CI: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; VI: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
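Two of the test updates that follow exercise the new FlatScratchInit predicate from opposite sides: ci-use-flat-for-global.ll (next) has no stack objects, so the input stays disabled; large-alloca-compute.ll (further below) has a large alloca, so it is enabled. Here is a standalone restatement of the condition, runnable in isolation; the Generation enumerators are illustrative placeholders written for this note, not LLVM's actual enum.

#include <cassert>

enum Generation { SOUTHERN_ISLANDS, SEA_ISLANDS, VOLCANIC_ISLANDS };

// Condition added in SIMachineFunctionInfo.cpp above: request the flat
// scratch init input only for HSA targets with flat support (CI and later)
// that actually have stack objects.
bool needsFlatScratchInit(bool HasStackObjects, Generation Gen,
                          bool IsAmdHsaOS) {
  return HasStackObjects && Gen >= SEA_ISLANDS && IsAmdHsaOS;
}

int main() {
  assert(!needsFlatScratchInit(false, SEA_ISLANDS, true)); // ci-use-flat-for-global.ll
  assert(needsFlatScratchInit(true, SEA_ISLANDS, true));   // large-alloca-compute.ll
  assert(!needsFlatScratchInit(true, SEA_ISLANDS, false)); // non-HSA: unchanged
  return 0;
}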
Index: test/CodeGen/AMDGPU/ci-use-flat-for-global.ll
===================================================================
--- test/CodeGen/AMDGPU/ci-use-flat-for-global.ll
+++ test/CodeGen/AMDGPU/ci-use-flat-for-global.ll
@@ -1,11 +1,22 @@
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck -check-prefix=HSA-DEFAULT %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global | FileCheck -check-prefix=HSA-NODEFAULT %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=kaveri | FileCheck -check-prefix=NOHSA-DEFAULT %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=kaveri -mattr=+flat-for-global | FileCheck -check-prefix=NOHSA-NODEFAULT %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=+flat-for-global < %s | FileCheck -check-prefix=HSA -check-prefix=HSA-DEFAULT -check-prefix=ALL %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global < %s | FileCheck -check-prefix=HSA -check-prefix=HSA-NODEFAULT -check-prefix=ALL %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=kaveri -mattr=-flat-for-global < %s | FileCheck -check-prefix=NOHSA-DEFAULT -check-prefix=ALL %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=kaveri -mattr=+flat-for-global < %s | FileCheck -check-prefix=NOHSA-NODEFAULT -check-prefix=ALL %s
+
+; There are no stack objects even though flat is used by default, so
+; flat_scratch_init should be disabled.
+
+; ALL-LABEL: {{^}}test:
+; HSA: .amd_kernel_code_t
+; HSA: enable_sgpr_flat_scratch_init = 0
+; HSA: .end_amd_kernel_code_t
+
+; ALL-NOT: flat_scr
 
 ; HSA-DEFAULT: flat_store_dword
 ; HSA-NODEFAULT: buffer_store_dword
+
 ; NOHSA-DEFAULT: buffer_store_dword
 ; NOHSA-NODEFAULT: flat_store_dword
 define void @test(i32 addrspace(1)* %out) {
Index: test/CodeGen/AMDGPU/global_atomics.ll
===================================================================
--- test/CodeGen/AMDGPU/global_atomics.ll
+++ test/CodeGen/AMDGPU/global_atomics.ll
@@ -24,8 +24,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_add_i32_addr64_offset:
 ; SI: buffer_atomic_add v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
 define void @atomic_add_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
@@ -38,8 +36,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_add_i32_ret_addr64_offset:
 ; SI: buffer_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
 define void @atomic_add_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
@@ -71,8 +67,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_add_i32_addr64:
 ; SI: buffer_atomic_add v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
 define void @atomic_add_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
@@ -83,8 +77,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_add_i32_ret_addr64:
 ; SI: buffer_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
 define void @atomic_add_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
@@ -117,8 +109,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_and_i32_addr64_offset:
 ; SI: buffer_atomic_and v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
 define void @atomic_and_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
@@ -130,8 +120,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_and_i32_ret_addr64_offset:
 ; SI: buffer_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_and [[RET:v[0-9]]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
 define void @atomic_and_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
@@ -163,8 +151,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_and_i32_addr64:
 ; SI: buffer_atomic_and v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
 define void @atomic_and_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
@@ -175,8 +161,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_and_i32_ret_addr64:
 ; SI: buffer_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
 define void @atomic_and_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
@@ -209,8 +193,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_sub_i32_addr64_offset:
 ; SI: buffer_atomic_sub v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
 define void @atomic_sub_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
@@ -222,8 +204,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_sub_i32_ret_addr64_offset:
 ; SI: buffer_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
 define void @atomic_sub_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
@@ -255,8 +235,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_sub_i32_addr64:
 ; SI: buffer_atomic_sub v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
 define void @atomic_sub_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
@@ -267,8 +245,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_sub_i32_ret_addr64:
 ; SI: buffer_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
 define void @atomic_sub_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
@@ -301,8 +277,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_max_i32_addr64_offset:
 ; SI: buffer_atomic_smax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
 define void @atomic_max_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
@@ -314,8 +288,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_max_i32_ret_addr64_offset:
 ; SI: buffer_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
 define void @atomic_max_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
@@ -347,8 +319,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_max_i32_addr64:
 ; SI: buffer_atomic_smax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
 define void @atomic_max_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
@@ -359,8 +329,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_max_i32_ret_addr64:
 ; SI: buffer_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
 define void @atomic_max_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
@@ -393,8 +361,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_umax_i32_addr64_offset:
 ; SI: buffer_atomic_umax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
 define void @atomic_umax_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
@@ -406,8 +372,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_umax_i32_ret_addr64_offset:
 ; SI: buffer_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
 define void @atomic_umax_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
@@ -439,8 +403,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_umax_i32_addr64:
 ; SI: buffer_atomic_umax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
 define void @atomic_umax_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
@@ -451,8 +413,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_umax_i32_ret_addr64:
 ; SI: buffer_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
 define void @atomic_umax_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
@@ -485,8 +445,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_min_i32_addr64_offset:
 ; SI: buffer_atomic_smin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
 define void @atomic_min_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
@@ -498,8 +456,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_min_i32_ret_addr64_offset:
 ; SI: buffer_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
 define void @atomic_min_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
@@ -531,8 +487,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_min_i32_addr64:
 ; SI: buffer_atomic_smin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
 define void @atomic_min_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
@@ -543,8 +497,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_min_i32_ret_addr64:
 ; SI: buffer_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
 define void @atomic_min_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
@@ -577,8 +529,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_umin_i32_addr64_offset:
 ; SI: buffer_atomic_umin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
 define void @atomic_umin_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
@@ -590,8 +540,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_umin_i32_ret_addr64_offset:
 ; SI: buffer_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
 define void @atomic_umin_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
@@ -623,8 +571,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_umin_i32_addr64:
 ; SI: buffer_atomic_umin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
 define void @atomic_umin_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
@@ -635,8 +581,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_umin_i32_ret_addr64:
 ; SI: buffer_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
 define void @atomic_umin_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
@@ -669,8 +613,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_or_i32_addr64_offset:
 ; SI: buffer_atomic_or v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
 define void @atomic_or_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
@@ -682,8 +624,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_or_i32_ret_addr64_offset:
 ; SI: buffer_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
 define void @atomic_or_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
@@ -715,8 +655,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_or_i32_addr64:
 ; SI: buffer_atomic_or v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
 define void @atomic_or_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
@@ -727,8 +665,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_or_i32_ret_addr64:
 ; SI: buffer_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
 define void @atomic_or_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
@@ -771,8 +707,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_xchg_i32_ret_addr64_offset:
 ; SI: buffer_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
 define void @atomic_xchg_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
@@ -804,8 +738,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_xchg_i32_addr64:
 ; SI: buffer_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
 define void @atomic_xchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
@@ -816,8 +748,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_xchg_i32_ret_addr64:
 ; SI: buffer_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
 define void @atomic_xchg_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
@@ -850,8 +780,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_xor_i32_addr64_offset:
 ; SI: buffer_atomic_xor v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
 define void @atomic_xor_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
@@ -863,8 +791,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_xor_i32_ret_addr64_offset:
 ; SI: buffer_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
 define void @atomic_xor_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
@@ -896,8 +822,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_xor_i32_addr64:
 ; SI: buffer_atomic_xor v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
 define void @atomic_xor_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
@@ -908,8 +832,6 @@
 
 ; FUNC-LABEL: {{^}}atomic_xor_i32_ret_addr64:
 ; SI: buffer_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
-; VI: s_movk_i32 flat_scratch_lo, 0x0
-; VI: s_movk_i32 flat_scratch_hi, 0x0
 ; VI: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
 define void @atomic_xor_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
Index: test/CodeGen/AMDGPU/hsa.ll
===================================================================
--- test/CodeGen/AMDGPU/hsa.ll
+++ test/CodeGen/AMDGPU/hsa.ll
@@ -28,7 +28,7 @@
 
 ; ELF: Symbol {
 ; ELF: Name: simple
-; ELF: Size: 296
+; ELF: Size: 288
 ; ELF: Type: AMDGPU_HSA_KERNEL (0xA)
 ; ELF: }
Index: test/CodeGen/AMDGPU/large-alloca-compute.ll
===================================================================
--- test/CodeGen/AMDGPU/large-alloca-compute.ll
+++ test/CodeGen/AMDGPU/large-alloca-compute.ll
@@ -17,7 +17,7 @@
 
 ; GCNHSA: .amd_kernel_code_t
 ; GCNHSA: compute_pgm_rsrc2_scratch_en = 1
-; GCNHSA: compute_pgm_rsrc2_user_sgpr = 6
+; GCNHSA: compute_pgm_rsrc2_user_sgpr = 8
 ; GCNHSA: compute_pgm_rsrc2_tgid_x_en = 1
 ; GCNHSA: compute_pgm_rsrc2_tgid_y_en = 0
 ; GCNHSA: compute_pgm_rsrc2_tgid_z_en = 0
@@ -29,7 +29,7 @@
 ; GCNHSA: enable_sgpr_queue_ptr = 0
 ; GCNHSA: enable_sgpr_kernarg_segment_ptr = 1
 ; GCNHSA: enable_sgpr_dispatch_id = 0
-; GCNHSA: enable_sgpr_flat_scratch_init = 0
+; GCNHSA: enable_sgpr_flat_scratch_init = 1
 ; GCNHSA: enable_sgpr_private_segment_size = 0
 ; GCNHSA: enable_sgpr_grid_workgroup_count_x = 0
 ; GCNHSA: enable_sgpr_grid_workgroup_count_y = 0
@@ -39,8 +39,8 @@
 
 ; GCNHSA: .end_amd_kernel_code_t
 
-; GCNHSA: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s7 offen
-; GCNHSA: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s7 offen
+; GCNHSA: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s9 offen
+; GCNHSA: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s9 offen
 
 ; Scratch size = alloca size + emergency stack slot
 ; ALL: ; ScratchSize: 32772
Index: test/CodeGen/AMDGPU/private-memory.ll
===================================================================
--- test/CodeGen/AMDGPU/private-memory.ll
+++ test/CodeGen/AMDGPU/private-memory.ll
@@ -41,6 +41,10 @@
 ; HSA-ALLOCA: workitem_private_segment_byte_size = 24
 ; HSA-ALLOCA: .end_amd_kernel_code_t
+
+; HSA-ALLOCA: s_mov_b32 flat_scratch_lo, s7
+; HSA-ALLOCA: s_add_u32 s6, s6, s9
+; HSA-ALLOCA: s_lshr_b32 flat_scratch_hi, s6, 8
 
 ; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x70,0xe0
 ; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x70,0xe0
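The user_sgpr = 6 to 8 and s7 to s9 updates in large-alloca-compute.ll above follow mechanically from SGPR assignment order once the two flat-scratch-init registers join the user SGPRs. Below is a small standalone sketch of that arithmetic; it is an illustration based on the kernel-code flags checked in that test (private segment buffer, kernarg segment pointer, flat scratch init, tgid_x enabled), not LLVM code.

#include <cstdio>

int main() {
  // User SGPRs enabled in large-alloca-compute.ll's kernel code header:
  unsigned NumUserSGPRs = 0;
  NumUserSGPRs += 4; // enable_sgpr_private_segment_buffer = 1 (128-bit resource)
  NumUserSGPRs += 2; // enable_sgpr_kernarg_segment_ptr = 1 (64-bit pointer)
  NumUserSGPRs += 2; // enable_sgpr_flat_scratch_init = 1 (new with this patch)

  // System SGPRs follow the user SGPRs: tgid_x (enabled), then the scratch
  // wave offset used by the buffer_store/buffer_load checks.
  unsigned TgidX = NumUserSGPRs;          // s8
  unsigned ScratchWaveOffset = TgidX + 1; // s9; with 6 user SGPRs this was s7
  printf("user_sgpr = %u, scratch wave offset = s%u\n",
         NumUserSGPRs, ScratchWaveOffset);
  return 0;
}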
Index: test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
===================================================================
--- test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
+++ test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
@@ -1,7 +1,7 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
-; XUN: llc -march=amdgcn -mcpu=hawaii -mtriple=amdgcn-unknown-amdhsa -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CIHSA %s
-; XUN: llc -march=amdgcn -mcpu=fiji -mtriple=amdgcn-unknown-amdhsa -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VIHSA %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=SIMESA %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=VIMESA %s
+; RUN: llc -march=amdgcn -mcpu=hawaii -mtriple=amdgcn-unknown-amdhsa -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CIHSA -check-prefix=HSA %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mtriple=amdgcn-unknown-amdhsa -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VIHSA -check-prefix=HSA %s
 
 ; This ends up using all 256 registers and requires register
 ; scavenging which will fail to find an unused register.
@@ -11,18 +11,20 @@
 
 ; FIXME: The same register is initialized to 0 for every spill.
 
-declare i32 @llvm.r600.read.tgid.x() #1
-declare i32 @llvm.r600.read.tgid.y() #1
-declare i32 @llvm.r600.read.tgid.z() #1
-
 ; GCN-LABEL: {{^}}spill_vgpr_compute:
-; GCN: s_mov_b32 s16, s3
-; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GCN-NEXT: s_mov_b32 s14, -1
-; SI-NEXT: s_mov_b32 s15, 0x80f000
-; VI-NEXT: s_mov_b32 s15, 0x800000
+
+; HSA: enable_sgpr_private_segment_buffer = 1
+; HSA: enable_sgpr_flat_scratch_init = 0
+; HSA: workitem_private_segment_byte_size = 1024
+
+; GCN-NOT: flat_scr
+
+; GCNMESA: s_mov_b32 s16, s3
+; GCNMESA: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GCNMESA-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GCNMESA-NEXT: s_mov_b32 s14, -1
+; SIMESA-NEXT: s_mov_b32 s15, 0x80f000
+; VIMESA-NEXT: s_mov_b32 s15, 0x800000
 
 ; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s16 offset:{{[0-9]+}} ; 4-byte Folded Spill