diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1284,6 +1284,10 @@
 def EnableLateCFGStructurize : Predicate<
   "EnableLateStructurizeCFG">;
 
+def EnableFlatScratch : Predicate<"Subtarget->enableFlatScratch()">;
+
+def DisableFlatScratch : Predicate<"!Subtarget->enableFlatScratch()">;
+
 // Include AMDGPU TD files
 include "SISchedule.td"
 include "GCNProcessors.td"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -240,6 +240,8 @@
                          SDValue &Offset) const;
   bool SelectGlobalSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
                         SDValue &VOffset, SDValue &Offset) const;
+  bool SelectScratchSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
+                          SDValue &Offset) const;
 
   bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset,
                         bool &Imm) const;
@@ -1672,9 +1674,11 @@
                                           SDValue &Offset) const {
   int64_t OffsetVal = 0;
 
+  unsigned AS = findMemSDNode(N)->getAddressSpace();
+
   if (Subtarget->hasFlatInstOffsets() &&
       (!Subtarget->hasFlatSegmentOffsetBug() ||
-       findMemSDNode(N)->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS)) {
+       AS != AMDGPUAS::FLAT_ADDRESS)) {
     SDValue N0, N1;
     if (CurDAG->isBaseWithConstantOffset(Addr)) {
       N0 = Addr.getOperand(0);
@@ -1686,7 +1690,6 @@
       uint64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
 
       const SIInstrInfo *TII = Subtarget->getInstrInfo();
-      unsigned AS = findMemSDNode(N)->getAddressSpace();
       if (TII->isLegalFLATOffset(COffsetVal, AS, IsSigned)) {
         Addr = N0;
         OffsetVal = COffsetVal;
@@ -1719,39 +1722,52 @@
       OffsetVal = ImmField;
 
-      // TODO: Should this try to use a scalar add pseudo if the base address
-      // is uniform and saddr is usable?
-      SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
-      SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
-
-      SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
-                                            MVT::i32, N0, Sub0);
-      SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
-                                            MVT::i32, N0, Sub1);
-
       SDValue AddOffsetLo = getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
-      SDValue AddOffsetHi =
-          getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
-
-      SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
       SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
 
-      SDNode *Add =
-          CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
-                                 {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
-
-      SDNode *Addc = CurDAG->getMachineNode(
-          AMDGPU::V_ADDC_U32_e64, DL, VTs,
-          {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
-
-      SDValue RegSequenceArgs[] = {
-          CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
-          SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
-
-      Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
-                                            MVT::i64, RegSequenceArgs),
-                     0);
+      if (Addr.getValueType().getSizeInBits() == 32) {
+        SmallVector<SDValue, 3> Opnds;
+        Opnds.push_back(N0);
+        Opnds.push_back(AddOffsetLo);
+        unsigned AddOp = AMDGPU::V_ADD_CO_U32_e32;
+        if (Subtarget->hasAddNoCarry()) {
+          AddOp = AMDGPU::V_ADD_U32_e64;
+          Opnds.push_back(Clamp);
+        }
+        Addr = SDValue(CurDAG->getMachineNode(AddOp, DL, MVT::i32, Opnds), 0);
+      } else {
+        // TODO: Should this try to use a scalar add pseudo if the base address
+        // is uniform and saddr is usable?
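+        // The 64-bit path below materializes the remainder with a carry
+        // chain, roughly (illustrative mnemonics, not emitted verbatim):
+        //   v_add_co_u32  v_lo, vcc, lo32(remainder), base_lo
+        //   v_addc_u32    v_hi, vcc, hi32(remainder), base_hi, vcc
+        // and then rebuilds the 64-bit address with a REG_SEQUENCE.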
+        SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
+        SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
+
+        SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+                                              DL, MVT::i32, N0, Sub0);
+        SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+                                              DL, MVT::i32, N0, Sub1);
+
+        SDValue AddOffsetHi =
+            getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
+
+        SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
+
+        SDNode *Add =
+            CurDAG->getMachineNode(AMDGPU::V_ADD_CO_U32_e64, DL, VTs,
+                                   {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
+
+        SDNode *Addc = CurDAG->getMachineNode(
+            AMDGPU::V_ADDC_U32_e64, DL, VTs,
+            {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
+
+        SDValue RegSequenceArgs[] = {
+            CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
+            SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
+
+        Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
+                                              MVT::i64, RegSequenceArgs),
+                       0);
+      }
     }
   }
 }
 
@@ -1824,6 +1840,64 @@
   return true;
 }
 
+// Match (32-bit SGPR base) + sext(imm offset)
+bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *N,
+                                            SDValue Addr,
+                                            SDValue &SAddr,
+                                            SDValue &Offset) const {
+  if (Addr->isDivergent())
+    return false;
+
+  SAddr = Addr;
+  int64_t COffsetVal = 0;
+
+  if (CurDAG->isBaseWithConstantOffset(Addr)) {
+    COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
+    SAddr = Addr.getOperand(0);
+  }
+
+  if (auto FI = dyn_cast<FrameIndexSDNode>(SAddr)) {
+    SAddr = CurDAG->getTargetFrameIndex(FI->getIndex(), FI->getValueType(0));
+  } else if (SAddr.getOpcode() == ISD::ADD &&
+             isa<FrameIndexSDNode>(SAddr.getOperand(0))) {
+    // Materialize this into a scalar move for scalar address to avoid
+    // readfirstlane.
+    auto FI = cast<FrameIndexSDNode>(SAddr.getOperand(0));
+    SDValue TFI = CurDAG->getTargetFrameIndex(FI->getIndex(),
+                                              FI->getValueType(0));
+    SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_U32, SDLoc(SAddr),
+                                           MVT::i32, TFI, SAddr.getOperand(1)),
+                    0);
+  }
+
+  const SIInstrInfo *TII = Subtarget->getInstrInfo();
+
+  if (!TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::PRIVATE_ADDRESS, true)) {
+    int64_t RemainderOffset = COffsetVal;
+    int64_t ImmField = 0;
+    const unsigned NumBits = TII->getNumFlatOffsetBits(true);
+    // Use signed division by a power of two to truncate towards 0.
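+    // For example, with NumBits == 13 the divisor D is 1 << 12 == 4096:
+    // an offset of 9000 splits into RemainderOffset = 8192 (added to SAddr
+    // below) and ImmField = 808, while -9000 splits into -8192 and -808.
+    // Either way |ImmField| < D, so it fits the signed immediate field.
+    // (Values are illustrative; the real width comes from
+    // getNumFlatOffsetBits.)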
+    int64_t D = 1LL << (NumBits - 1);
+    RemainderOffset = (COffsetVal / D) * D;
+    ImmField = COffsetVal - RemainderOffset;
+
+    assert(TII->isLegalFLATOffset(ImmField, AMDGPUAS::PRIVATE_ADDRESS, true));
+    assert(RemainderOffset + ImmField == COffsetVal);
+
+    COffsetVal = ImmField;
+
+    SDLoc DL(N);
+    SDValue AddOffset =
+        getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
+    SAddr = SDValue(CurDAG->getMachineNode(AMDGPU::S_ADD_U32, DL, MVT::i32,
+                                           SAddr, AddOffset), 0);
+  }
+
+  Offset = CurDAG->getTargetConstant(COffsetVal, SDLoc(), MVT::i16);
+
+  return true;
+}
+
 bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
                                           SDValue &Offset, bool &Imm) const {
   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -959,6 +959,8 @@
     return true;
   }
 
+  bool enableFlatScratch() const;
+
   void overrideSchedPolicy(MachineSchedPolicy &Policy,
                            unsigned NumRegionInstrs) const override;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -50,6 +50,11 @@
   cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
   cl::init(false));
 
+static cl::opt<bool> EnableFlatScratch(
+  "amdgpu-enable-flat-scratch",
+  cl::desc("Use flat scratch instructions"),
+  cl::init(false));
+
 GCNSubtarget::~GCNSubtarget() = default;
 
 R600Subtarget &
@@ -286,6 +291,10 @@
       *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
 }
 
+bool GCNSubtarget::enableFlatScratch() const {
+  return EnableFlatScratch && hasFlatScratchInsts();
+}
+
 unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
   if (getGeneration() < GFX10)
     return 1;
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1592,6 +1592,7 @@
 >;
 }
 
+let OtherPredicates = [DisableFlatScratch] in {
 defm : MUBUFScratchLoadPat ;
 defm : MUBUFScratchLoadPat ;
 defm : MUBUFScratchLoadPat ;
@@ -1610,7 +1611,7 @@
 defm : MUBUFScratchLoadPat ;
 defm : MUBUFScratchLoadPat ;
 
-let OtherPredicates = [D16PreservesUnusedBits] in {
+let OtherPredicates = [D16PreservesUnusedBits, DisableFlatScratch] in {
 defm : MUBUFScratchLoadPat_D16;
 defm : MUBUFScratchLoadPat_D16;
 defm : MUBUFScratchLoadPat_D16;
@@ -1626,6 +1627,8 @@
 defm : MUBUFScratchLoadPat_D16;
 }
 
+} // End OtherPredicates = [DisableFlatScratch]
+
 multiclass MUBUFStore_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Instr_OFFSET,
                                       ValueType vt, PatFrag atomic_st> {
   // Store follows atomic op convention so address is first
@@ -1676,6 +1679,7 @@
 >;
 }
 
+let OtherPredicates = [DisableFlatScratch] in {
 defm : MUBUFScratchStorePat ;
 defm : MUBUFScratchStorePat ;
 defm : MUBUFScratchStorePat ;
@@ -1690,7 +1694,7 @@
 defm : MUBUFScratchStorePat ;
 
-let OtherPredicates = [D16PreservesUnusedBits] in {
+let OtherPredicates = [D16PreservesUnusedBits, DisableFlatScratch] in {
 // Hiding the extract high pattern in the PatFrag seems to not
 // automatically increase the complexity.
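// (The AddedComplexity = 1 below keeps these D16 hi-half store patterns ahead
// of the plain truncating-store patterns; with flat scratch enabled the
// equivalent ScratchFLAT* D16 patterns in FLATInstructions.td apply instead.)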
 let AddedComplexity = 1 in {
@@ -1698,6 +1702,7 @@
 defm : MUBUFScratchStorePat ;
 }
 }
+} // End OtherPredicates = [DisableFlatScratch]
 
 //===----------------------------------------------------------------------===//
 // MTBUF Patterns
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -8,8 +8,10 @@
 def FLATOffset : ComplexPattern<i64, 2, "SelectFlatOffset<false>", [], [SDNPWantRoot], -10>;
 def FLATOffsetSigned : ComplexPattern<i64, 2, "SelectFlatOffset<true>", [], [SDNPWantRoot], -10>;
+def ScratchOffset : ComplexPattern<i32, 2, "SelectFlatOffset<true>", [], [SDNPWantRoot], -10>;
 
 def GlobalSAddr : ComplexPattern<i64, 3, "SelectGlobalSAddr", [], [SDNPWantRoot], -10>;
+def ScratchSAddr : ComplexPattern<i32, 2, "SelectScratchSAddr", [], [SDNPWantRoot], -10>;
 
 //===----------------------------------------------------------------------===//
 // FLAT classes
@@ -233,6 +235,11 @@
   let maybeAtomic = 1;
 }
 
+class FlatScratchInst <string sv_op, string mode> {
+  string SVOp = sv_op;
+  string Mode = mode;
+}
+
 multiclass FLAT_Scratch_Load_Pseudo <string opName, RegisterClass regClass, bit HasTiedOutput = 0> {
   let is_flat_scratch = 1 in {
-    def "" : FLAT_Scratch_Load_Pseudo<opName, regClass, HasTiedOutput>;
-    def _SADDR : FLAT_Scratch_Load_Pseudo<opName, regClass, HasTiedOutput, 1>;
+    def "" : FLAT_Scratch_Load_Pseudo<opName, regClass, HasTiedOutput>,
+             FlatScratchInst<opName, "SV">;
+    def _SADDR : FLAT_Scratch_Load_Pseudo<opName, regClass, HasTiedOutput, 1>,
+                 FlatScratchInst<opName, "SS">;
 
     let SubtargetPredicate = HasFlatScratchSTMode in
-    def _ST : FLAT_Scratch_Load_Pseudo<opName, regClass, HasTiedOutput>;
+    def _ST : FLAT_Scratch_Load_Pseudo<opName, regClass, HasTiedOutput>,
+              FlatScratchInst<opName, "ST">;
   }
 }
 
 multiclass FLAT_Scratch_Store_Pseudo <string opName, RegisterClass regClass> {
   let is_flat_scratch = 1 in {
-    def "" : FLAT_Scratch_Store_Pseudo<opName, regClass>;
-    def _SADDR : FLAT_Scratch_Store_Pseudo<opName, regClass, 1>;
+    def "" : FLAT_Scratch_Store_Pseudo<opName, regClass>,
+             FlatScratchInst<opName, "SV">;
+    def _SADDR : FLAT_Scratch_Store_Pseudo<opName, regClass, 1>,
+                 FlatScratchInst<opName, "SS">;
 
     let SubtargetPredicate = HasFlatScratchSTMode in
-    def _ST : FLAT_Scratch_Store_Pseudo<opName, regClass>;
+    def _ST : FLAT_Scratch_Store_Pseudo<opName, regClass>,
+              FlatScratchInst<opName, "ST">;
   }
 }
 
@@ -852,6 +865,37 @@
   (inst $vaddr, $data, $offset)
 >;
 
+class ScratchLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+  (vt (node (ScratchOffset (i32 VGPR_32:$vaddr), i16:$offset))),
+  (inst $vaddr, $offset)
+>;
+
+class ScratchLoadSignedPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+  (node (ScratchOffset (i32 VGPR_32:$vaddr), i16:$offset), vt:$in),
+  (inst $vaddr, $offset, 0, 0, 0, $in)
+>;
+
+class ScratchStoreSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+  (node vt:$data, (ScratchOffset (i32 VGPR_32:$vaddr), i16:$offset)),
+  (inst getVregSrcForVT<vt>.ret:$data, $vaddr, $offset)
+>;
+
+class ScratchLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+  (vt (node (ScratchSAddr (i32 SGPR_32:$saddr), i16:$offset))),
+  (inst $saddr, $offset)
+>;
+
+class ScratchLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+  (vt (node (ScratchSAddr (i32 SGPR_32:$saddr), i16:$offset), vt:$in)),
+  (inst $saddr, $offset, 0, 0, 0, $in)
+>;
+
+class ScratchStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+  (node vt:$data, (ScratchSAddr (i32 SGPR_32:$saddr), i16:$offset)),
+  (inst getVregSrcForVT<vt>.ret:$data, $saddr, $offset)
+>;
+
 let OtherPredicates = [HasFlatAddressSpace] in {
 
 def : FlatLoadPat ;
@@ -1009,6 +1053,37 @@
   }
 }
 
+multiclass ScratchFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
+  def : ScratchLoadSignedPat <inst, node, vt> {
+    let AddedComplexity = 25;
+  }
+
+  def : ScratchLoadSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+    let AddedComplexity = 26;
+  }
+}
+
+multiclass ScratchFLATStorePats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
+  def : ScratchStoreSignedPat <inst, node, vt> {
+    let AddedComplexity = 25;
+  }
+
+  def : ScratchStoreSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+    let AddedComplexity = 26;
+  }
+}
+
+multiclass ScratchFLATLoadPats_D16<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
+  def : ScratchLoadSignedPat_D16 <inst, node, vt> {
+    let AddedComplexity = 25;
+  }
+
+  def : ScratchLoadSaddrPat_D16<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+    let AddedComplexity = 26;
+  }
+}
+
 let OtherPredicates = [HasFlatGlobalInsts] in {
 
 defm : GlobalFLATLoadPats ;
@@ -1109,6 +1184,62 @@
 } // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10
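+// Each ScratchFLAT* multiclass above instantiates two patterns: the VGPR
+// address form at AddedComplexity = 25 and the _SADDR form at 26, so a
+// uniform (SGPR) address prefers the scratch_*_saddr encoding.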
 
+let OtherPredicates = [HasFlatScratchInsts, EnableFlatScratch] in {
+
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_UBYTE, extloadi8_private, i32>;
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_UBYTE, zextloadi8_private, i32>;
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_SBYTE, sextloadi8_private, i32>;
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_USHORT, extloadi16_private, i32>;
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_USHORT, zextloadi16_private, i32>;
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_SSHORT, sextloadi16_private, i32>;
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_UBYTE, extloadi8_private, i16>;
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_UBYTE, zextloadi8_private, i16>;
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_SBYTE, sextloadi8_private, i16>;
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_USHORT, load_private, i16>;
+
+foreach vt = Reg32Types.types in {
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_DWORD, load_private, vt>;
+defm : ScratchFLATStorePats <SCRATCH_STORE_DWORD, store_private, vt>;
+}
+
+foreach vt = VReg_64.RegTypes in {
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_DWORDX2, load_private, vt>;
+defm : ScratchFLATStorePats <SCRATCH_STORE_DWORDX2, store_private, vt>;
+}
+
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_DWORDX3, load_private, v3i32>;
+
+foreach vt = VReg_128.RegTypes in {
+defm : ScratchFLATLoadPats <SCRATCH_LOAD_DWORDX4, load_private, vt>;
+defm : ScratchFLATStorePats <SCRATCH_STORE_DWORDX4, store_private, vt>;
+}
+
+defm : ScratchFLATStorePats <SCRATCH_STORE_BYTE, truncstorei8_private, i32>;
+defm : ScratchFLATStorePats <SCRATCH_STORE_SHORT, truncstorei16_private, i32>;
+defm : ScratchFLATStorePats <SCRATCH_STORE_BYTE, truncstorei8_private, i16>;
+defm : ScratchFLATStorePats <SCRATCH_STORE_SHORT, store_private, i16>;
+defm : ScratchFLATStorePats <SCRATCH_STORE_DWORDX3, store_private, v3i32>;
+
+let OtherPredicates = [D16PreservesUnusedBits, HasFlatScratchInsts, EnableFlatScratch] in {
+defm : ScratchFLATStorePats <SCRATCH_STORE_SHORT_D16_HI, truncstorei16_hi16_private, i32>;
+defm : ScratchFLATStorePats <SCRATCH_STORE_BYTE_D16_HI, truncstorei8_hi16_private, i32>;
+
+defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_private, v2i16>;
+defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_UBYTE_D16_HI, az_extloadi8_d16_hi_private, v2f16>;
+defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_private, v2i16>;
+defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SBYTE_D16_HI, sextloadi8_d16_hi_private, v2f16>;
+defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SHORT_D16_HI, load_d16_hi_private, v2i16>;
+defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SHORT_D16_HI, load_d16_hi_private, v2f16>;
+
+defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_UBYTE_D16, az_extloadi8_d16_lo_private, v2i16>;
+defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_UBYTE_D16, az_extloadi8_d16_lo_private, v2f16>;
+defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SBYTE_D16, sextloadi8_d16_lo_private, v2i16>;
+defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SBYTE_D16, sextloadi8_d16_lo_private, v2f16>;
+defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SHORT_D16, load_d16_lo_private, v2i16>;
+defm : ScratchFLATLoadPats_D16 <SCRATCH_LOAD_SHORT_D16, load_d16_lo_private, v2f16>;
+}
+
+} // End OtherPredicates = [HasFlatScratchInsts,EnableFlatScratch]
 
 //===----------------------------------------------------------------------===//
 // Target
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -173,7 +173,7 @@
                        int OpNo, const MachineOperand &OpToFold) {
   return OpToFold.isFI() &&
-    (TII->isMUBUF(UseMI) || TII->isFLATScratch(UseMI)) &&
+    TII->isMUBUF(UseMI) &&
     OpNo == AMDGPU::getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::vaddr);
 }
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -134,7 +134,8 @@
 // We need to specially emit stack operations here because a different frame
 // register is used than in the rest of the function, as getFrameRegister would
 // use.
-static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
+static void buildPrologSpill(const GCNSubtarget &ST, LivePhysRegs &LiveRegs,
+                             MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator I,
                              const SIInstrInfo *TII, Register SpillReg,
                              Register ScratchRsrcReg, Register SPReg, int FI) {
@@ -147,7 +148,19 @@
       MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore, 4,
       MFI.getObjectAlign(FI));
 
-  if (SIInstrInfo::isLegalMUBUFImmOffset(Offset)) {
+  if (ST.enableFlatScratch()) {
+    if (TII->isLegalFLATOffset(Offset, AMDGPUAS::PRIVATE_ADDRESS, true)) {
+      BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::SCRATCH_STORE_DWORD_SADDR))
+        .addReg(SpillReg, RegState::Kill)
+        .addReg(SPReg)
+        .addImm(Offset)
+        .addImm(0) // glc
+        .addImm(0) // slc
+        .addImm(0) // dlc
+        .addMemOperand(MMO);
+      return;
+    }
+  } else if (SIInstrInfo::isLegalMUBUFImmOffset(Offset)) {
     BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFSET))
       .addReg(SpillReg, RegState::Kill)
       .addReg(ScratchRsrcReg)
@@ -166,29 +179,48 @@
   // offset in the spill.
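  // (With flat scratch enabled, the fallback below computes the address with
  // an s_add_u32 into a scavenged SGPR and emits scratch_store_dword_saddr;
  // the MUBUF path instead materializes the offset in a VGPR for
  // buffer_store_dword ... offen. See the two branches that follow.)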
LiveRegs.addReg(SpillReg); - MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister( - MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass); + if (ST.enableFlatScratch()) { + MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister( + MF->getRegInfo(), LiveRegs, AMDGPU::SReg_32_XM0RegClass); - BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg) - .addImm(Offset); + BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_ADD_U32), OffsetReg) + .addReg(SPReg) + .addImm(Offset); - BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFEN)) - .addReg(SpillReg, RegState::Kill) - .addReg(OffsetReg, RegState::Kill) - .addReg(ScratchRsrcReg) - .addReg(SPReg) - .addImm(0) - .addImm(0) // glc - .addImm(0) // slc - .addImm(0) // tfe - .addImm(0) // dlc - .addImm(0) // swz - .addMemOperand(MMO); + BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::SCRATCH_STORE_DWORD_SADDR)) + .addReg(SpillReg, RegState::Kill) + .addReg(OffsetReg, RegState::Kill) + .addImm(0) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0) // dlc + .addMemOperand(MMO); + } else { + MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister( + MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass); + + BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg) + .addImm(Offset); + + BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFEN)) + .addReg(SpillReg, RegState::Kill) + .addReg(OffsetReg, RegState::Kill) + .addReg(ScratchRsrcReg) + .addReg(SPReg) + .addImm(0) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0) // tfe + .addImm(0) // dlc + .addImm(0) // swz + .addMemOperand(MMO); + } LiveRegs.removeReg(SpillReg); } -static void buildEpilogReload(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB, +static void buildEpilogReload(const GCNSubtarget &ST, LivePhysRegs &LiveRegs, + MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const SIInstrInfo *TII, Register SpillReg, Register ScratchRsrcReg, Register SPReg, int FI) { @@ -200,6 +232,35 @@ MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad, 4, MFI.getObjectAlign(FI)); + if (ST.enableFlatScratch()) { + if (TII->isLegalFLATOffset(Offset, AMDGPUAS::PRIVATE_ADDRESS, true)) { + BuildMI(MBB, I, DebugLoc(), + TII->get(AMDGPU::SCRATCH_LOAD_DWORD_SADDR), SpillReg) + .addReg(SPReg) + .addImm(Offset) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0) // dlc + .addMemOperand(MMO); + return; + } + MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister( + MF->getRegInfo(), LiveRegs, AMDGPU::SReg_32_XM0RegClass); + + BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_ADD_U32), OffsetReg) + .addReg(SPReg) + .addImm(Offset); + BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::SCRATCH_LOAD_DWORD_SADDR), + SpillReg) + .addReg(OffsetReg, RegState::Kill) + .addImm(0) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0) // dlc + .addMemOperand(MMO); + return; + } + if (SIInstrInfo::isLegalMUBUFImmOffset(Offset)) { BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFSET), SpillReg) @@ -256,6 +317,7 @@ Register FlatScratchInitReg = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT); + assert(FlatScratchInitReg); MachineRegisterInfo &MRI = MF.getRegInfo(); MRI.addLiveIn(FlatScratchInitReg); @@ -365,6 +427,10 @@ return ScratchRsrcReg; } +static unsigned getScratchScaleFactor(const GCNSubtarget &ST) { + return ST.enableFlatScratch() ? 
1 : ST.getWavefrontSize(); +} + void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported"); @@ -461,7 +527,7 @@ Register SPReg = MFI->getStackPtrOffsetReg(); assert(SPReg != AMDGPU::SP_REG); BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg) - .addImm(MF.getFrameInfo().getStackSize() * ST.getWavefrontSize()); + .addImm(MF.getFrameInfo().getStackSize() * getScratchScaleFactor(ST)); } if (hasFP(MF)) { @@ -780,7 +846,7 @@ if (!ScratchExecCopy) ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true); - buildPrologSpill(LiveRegs, MBB, MBBI, TII, Reg.VGPR, + buildPrologSpill(ST, LiveRegs, MBB, MBBI, TII, Reg.VGPR, FuncInfo->getScratchRSrcReg(), StackPtrReg, Reg.FI.getValue()); @@ -798,7 +864,7 @@ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) .addReg(FramePtrReg); - buildPrologSpill(LiveRegs, MBB, MBBI, TII, TmpVGPR, + buildPrologSpill(ST, LiveRegs, MBB, MBBI, TII, TmpVGPR, FuncInfo->getScratchRSrcReg(), StackPtrReg, FuncInfo->FramePointerSaveIndex.getValue()); } @@ -815,7 +881,7 @@ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR) .addReg(BasePtrReg); - buildPrologSpill(LiveRegs, MBB, MBBI, TII, TmpVGPR, + buildPrologSpill(ST, LiveRegs, MBB, MBBI, TII, TmpVGPR, FuncInfo->getScratchRSrcReg(), StackPtrReg, *FuncInfo->BasePointerSaveIndex); } @@ -888,11 +954,11 @@ // s_and_b32 s32, tmp_reg, 0b111...0000 BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), ScratchSPReg) .addReg(StackPtrReg) - .addImm((Alignment - 1) * ST.getWavefrontSize()) + .addImm((Alignment - 1) * getScratchScaleFactor(ST)) .setMIFlag(MachineInstr::FrameSetup); BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg) .addReg(ScratchSPReg, RegState::Kill) - .addImm(-Alignment * ST.getWavefrontSize()) + .addImm(-Alignment * getScratchScaleFactor(ST)) .setMIFlag(MachineInstr::FrameSetup); FuncInfo->setIsStackRealigned(true); } else if ((HasFP = hasFP(MF))) { @@ -914,7 +980,7 @@ if (HasFP && RoundedSize != 0) { BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg) .addReg(StackPtrReg) - .addImm(RoundedSize * ST.getWavefrontSize()) + .addImm(RoundedSize * getScratchScaleFactor(ST)) .setMIFlag(MachineInstr::FrameSetup); } @@ -976,7 +1042,7 @@ if (RoundedSize != 0 && hasFP(MF)) { BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg) .addReg(StackPtrReg) - .addImm(RoundedSize * ST.getWavefrontSize()) + .addImm(RoundedSize * getScratchScaleFactor(ST)) .setMIFlag(MachineInstr::FrameDestroy); } @@ -1002,7 +1068,7 @@ MCPhysReg TempVGPR = findScratchNonCalleeSaveRegister( MRI, LiveRegs, AMDGPU::VGPR_32RegClass); - buildEpilogReload(LiveRegs, MBB, MBBI, TII, TempVGPR, + buildEpilogReload(ST, LiveRegs, MBB, MBBI, TII, TempVGPR, FuncInfo->getScratchRSrcReg(), StackPtrReg, FI); BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), FramePtrReg) .addReg(TempVGPR, RegState::Kill); @@ -1028,7 +1094,7 @@ MCPhysReg TempVGPR = findScratchNonCalleeSaveRegister( MRI, LiveRegs, AMDGPU::VGPR_32RegClass); - buildEpilogReload(LiveRegs, MBB, MBBI, TII, TempVGPR, + buildEpilogReload(ST, LiveRegs, MBB, MBBI, TII, TempVGPR, FuncInfo->getScratchRSrcReg(), StackPtrReg, BasePtrFI); BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), BasePtrReg) .addReg(TempVGPR, RegState::Kill); @@ -1053,7 +1119,7 @@ if (!ScratchExecCopy) ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false); - buildEpilogReload(LiveRegs, MBB, MBBI, TII, Reg.VGPR, + 
buildEpilogReload(ST, LiveRegs, MBB, MBBI, TII, Reg.VGPR, FuncInfo->getScratchRSrcReg(), StackPtrReg, Reg.FI.getValue()); } @@ -1264,7 +1330,7 @@ unsigned Op = IsDestroy ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32; BuildMI(MBB, I, DL, TII->get(Op), SPReg) .addReg(SPReg) - .addImm(Amount * ST.getWavefrontSize()); + .addImm(Amount * getScratchScaleFactor(ST)); } else if (CalleePopAmount != 0) { llvm_unreachable("is this used?"); } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -2298,7 +2298,8 @@ } assert(!Info->hasDispatchPtr() && - !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() && + !Info->hasKernargSegmentPtr() && + (!Info->hasFlatScratchInit() || Subtarget->enableFlatScratch()) && !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() && !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() && !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() && diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -507,11 +507,20 @@ return (Flags & SIInstrFlags::FLAT) && !(Flags & SIInstrFlags::LGKM_CNT); } + bool isSegmentSpecificFLAT(uint16_t Opcode) const { + auto Flags = get(Opcode).TSFlags; + return (Flags & SIInstrFlags::FLAT) && !(Flags & SIInstrFlags::LGKM_CNT); + } + // FIXME: Make this more precise static bool isFLATScratch(const MachineInstr &MI) { return isSegmentSpecificFLAT(MI); } + bool isFLATScratch(uint16_t Opcode) const { + return isSegmentSpecificFLAT(Opcode); + } + // Any FLAT encoded instruction, including global_* and scratch_*. bool isFLAT(uint16_t Opcode) const { return get(Opcode).TSFlags & SIInstrFlags::FLAT; @@ -1147,6 +1156,9 @@ LLVM_READONLY int getVCMPXNoSDstOp(uint16_t Opcode); + LLVM_READONLY + int getFlatScratchInstSTfromSS(uint16_t Opcode); + const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL; const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19); const uint64_t RSRC_INDEX_STRIDE_SHIFT = (32 + 21); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -2564,6 +2564,16 @@ let ValueCols = [["1"]]; } +// Maps flat scratch opcodes by addressing modes +def getFlatScratchInstSTfromSS : InstrMapping { + let FilterClass = "FlatScratchInst"; + let RowFields = ["SVOp"]; + let ColFields = ["Mode"]; + let KeyCol = ["SS"]; + let ValueCols = [["ST"]]; +} + + include "SIInstructions.td" include "DSInstructions.td" diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -167,11 +167,12 @@ if (UseFixedABI || F.hasFnAttribute("amdgpu-kernarg-segment-ptr")) KernargSegmentPtr = true; - if (ST.hasFlatAddressSpace() && isEntryFunction() && isAmdHsaOrMesa) { + if (ST.hasFlatAddressSpace() && isEntryFunction() && + (isAmdHsaOrMesa || ST.enableFlatScratch())) { // TODO: This could be refined a lot. The attribute is a poor way of // detecting calls or stack objects that may require it before argument // lowering. 
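    // (With enableFlatScratch() the flat scratch init is required
    // unconditionally in entry functions: every private load or store becomes
    // a scratch_* instruction using the flat scratch base rather than a MUBUF
    // resource descriptor.)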
-    if (HasCalls || HasStackObjects)
+    if (HasCalls || HasStackObjects || ST.enableFlatScratch())
       FlatScratchInit = true;
   }
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -89,7 +89,7 @@
                          const MachineFunction &MF) const override;
   bool requiresVirtualBaseRegisters(const MachineFunction &Fn) const override;
 
-  int64_t getMUBUFInstrOffset(const MachineInstr *MI) const;
+  int64_t getScratchInstrOffset(const MachineInstr *MI) const;
 
   int64_t getFrameIndexInstrOffset(const MachineInstr *MI,
                                    int Idx) const override;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -388,8 +388,8 @@
   return true;
 }
 
-int64_t SIRegisterInfo::getMUBUFInstrOffset(const MachineInstr *MI) const {
-  assert(SIInstrInfo::isMUBUF(*MI));
+int64_t SIRegisterInfo::getScratchInstrOffset(const MachineInstr *MI) const {
+  assert(SIInstrInfo::isMUBUF(*MI) || SIInstrInfo::isFLATScratch(*MI));
 
   int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                           AMDGPU::OpName::offset);
@@ -398,23 +398,29 @@
 
 int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI,
                                                  int Idx) const {
-  if (!SIInstrInfo::isMUBUF(*MI))
+  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
     return 0;
 
-  assert(Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
-                                           AMDGPU::OpName::vaddr) &&
+  assert((Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+                                            AMDGPU::OpName::vaddr) ||
+          (Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
+                                             AMDGPU::OpName::saddr))) &&
          "Should never see frame index on non-address operand");
 
-  return getMUBUFInstrOffset(MI);
+  return getScratchInstrOffset(MI);
 }
 
 bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
   if (!MI->mayLoadOrStore())
     return false;
 
-  int64_t FullOffset = Offset + getMUBUFInstrOffset(MI);
+  int64_t FullOffset = Offset + getScratchInstrOffset(MI);
 
-  return !SIInstrInfo::isLegalMUBUFImmOffset(FullOffset);
+  if (SIInstrInfo::isMUBUF(*MI))
+    return !SIInstrInfo::isLegalMUBUFImmOffset(FullOffset);
+
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  return !TII->isLegalFLATOffset(FullOffset, AMDGPUAS::PRIVATE_ADDRESS, true);
 }
 
 void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
@@ -429,9 +435,11 @@
   MachineFunction *MF = MBB->getParent();
   const SIInstrInfo *TII = ST.getInstrInfo();
 
+  unsigned MovOpc = ST.enableFlatScratch() ? AMDGPU::S_MOV_B32
+                                           : AMDGPU::V_MOV_B32_e32;
   if (Offset == 0) {
-    BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), BaseReg)
+    BuildMI(*MBB, Ins, DL, TII->get(MovOpc), BaseReg)
       .addFrameIndex(FrameIdx);
     return;
   }
 
   MachineRegisterInfo &MRI = MF->getRegInfo();
   Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
 
-  Register FIReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  Register FIReg = MRI.createVirtualRegister(
+      ST.enableFlatScratch() ? &AMDGPU::SReg_32_XM0RegClass
+                             : &AMDGPU::VGPR_32RegClass);
 
   BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
     .addImm(Offset);
-  BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), FIReg)
+  BuildMI(*MBB, Ins, DL, TII->get(MovOpc), FIReg)
     .addFrameIndex(FrameIdx);
 
+  if (ST.enableFlatScratch()) {
+    BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_U32), BaseReg)
+      .addReg(OffsetReg, RegState::Kill)
+      .addReg(FIReg);
+    return;
+  }
+
   TII->getAddNoCarry(*MBB, Ins, DL, BaseReg)
     .addReg(OffsetReg, RegState::Kill)
     .addReg(FIReg)
@@ -455,6 +472,7 @@
 void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
                                        int64_t Offset) const {
   const SIInstrInfo *TII = ST.getInstrInfo();
+  bool IsFlat = TII->isFLATScratch(MI);
 
 #ifndef NDEBUG
   // FIXME: Is it possible to be storing a frame index to itself?
@@ -469,12 +487,25 @@
   }
 #endif
 
-  MachineOperand *FIOp = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
+  MachineOperand *FIOp =
+      TII->getNamedOperand(MI, IsFlat ? AMDGPU::OpName::saddr
+                                      : AMDGPU::OpName::vaddr);
+
 #ifndef NDEBUG
   MachineBasicBlock *MBB = MI.getParent();
   MachineFunction *MF = MBB->getParent();
+#endif
+
   assert(FIOp && FIOp->isFI() && "frame index must be address operand");
-  assert(TII->isMUBUF(MI));
+  assert(TII->isMUBUF(MI) || TII->isFLATScratch(MI));
+
+  MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
+  int64_t NewOffset = OffsetOp->getImm() + Offset;
+
+  if (IsFlat) {
+    assert(TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS, true) &&
+           "offset should be legal");
+    FIOp->ChangeToRegister(BaseReg, false);
+    OffsetOp->setImm(NewOffset);
+    return;
+  }
 
+#ifndef NDEBUG
   MachineOperand *SOffset = TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
   assert((SOffset->isReg() &&
           SOffset->getReg() ==
              MF->getInfo<SIMachineFunctionInfo>()->getStackPtrOffsetReg()) ||
          (SOffset->isImm() && SOffset->getImm() == 0));
 #endif
 
-  MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
-  int64_t NewOffset = OffsetOp->getImm() + Offset;
   assert(SIInstrInfo::isLegalMUBUFImmOffset(NewOffset) &&
          "offset should be legal");
@@ -495,12 +524,16 @@
 bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
                                         Register BaseReg,
                                         int64_t Offset) const {
-  if (!SIInstrInfo::isMUBUF(*MI))
+  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
     return false;
 
-  int64_t NewOffset = Offset + getMUBUFInstrOffset(MI);
+  int64_t NewOffset = Offset + getScratchInstrOffset(MI);
 
-  return SIInstrInfo::isLegalMUBUFImmOffset(NewOffset);
+  if (SIInstrInfo::isMUBUF(*MI))
+    return SIInstrInfo::isLegalMUBUFImmOffset(NewOffset);
+
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  return TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS, true);
 }
 
 const TargetRegisterClass *SIRegisterInfo::getPointerRegClass(
@@ -722,9 +755,10 @@
   const MachineFrameInfo &MFI = MF->getFrameInfo();
   const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
 
-  const MCInstrDesc &Desc = TII->get(LoadStoreOp);
+  const MCInstrDesc *Desc = &TII->get(LoadStoreOp);
   const DebugLoc &DL = MI->getDebugLoc();
-  bool IsStore = Desc.mayStore();
+  bool IsStore = Desc->mayStore();
+  bool IsFlat = TII->isFLATScratch(LoadStoreOp);
 
   bool Scavenged = false;
   MCRegister SOffset = ScratchOffsetReg;
@@ -734,6 +768,7 @@
   unsigned NumSubRegs = AMDGPU::getRegBitWidth(RC->getID()) / (EltSize * CHAR_BIT);
   unsigned Size = NumSubRegs * EltSize;
   int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
+  int64_t MaxOffset = Offset + Size - EltSize;
   int64_t ScratchOffsetRegDelta = 0;
 
   Align Alignment = MFI.getObjectAlign(Index);
@@ -741,13 +776,17 @@
   assert((Offset % EltSize) == 0 &&
"unexpected VGPR spill offset"); - if (!SIInstrInfo::isLegalMUBUFImmOffset(Offset + Size - EltSize)) { + bool IsOffsetLegal = IsFlat + ? TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS, true) + : SIInstrInfo::isLegalMUBUFImmOffset(MaxOffset); + if (!IsOffsetLegal || (IsFlat && !SOffset && !ST.hasFlatScratchSTMode())) { SOffset = MCRegister(); // We currently only support spilling VGPRs to EltSize boundaries, meaning // we can simplify the adjustment of Offset here to just scale with // WavefrontSize. - Offset *= ST.getWavefrontSize(); + if (!IsFlat) + Offset *= ST.getWavefrontSize(); // We don't have access to the register scavenger if this function is called // during PEI::scavengeFrameVirtualRegs(). @@ -785,8 +824,18 @@ Offset = 0; } + if (IsFlat && SOffset == AMDGPU::NoRegister) { + assert(AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0 + && "Unexpected vaddr for flat scratch with a FI operand"); + + assert(ST.hasFlatScratchSTMode()); + LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp); + Desc = &TII->get(LoadStoreOp); + } + Register TmpReg; + // FIXME: Flat scratch does not have to be limited to a dword per store. for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += EltSize) { Register SubReg = NumSubRegs == 1 ? Register(ValueReg) @@ -831,22 +880,26 @@ MF->getMachineMemOperand(PInfo, MMO->getFlags(), EltSize, commonAlignment(Alignment, EltSize * i)); - MIB = BuildMI(*MBB, MI, DL, Desc) + MIB = BuildMI(*MBB, MI, DL, *Desc) .addReg(SubReg, - getDefRegState(!IsStore) | getKillRegState(IsKill)) - .addReg(ScratchRsrcReg); + getDefRegState(!IsStore) | getKillRegState(IsKill)); + if (!IsFlat) + MIB.addReg(ScratchRsrcReg); + if (SOffset == AMDGPU::NoRegister) { - MIB.addImm(0); + if (!IsFlat) + MIB.addImm(0); } else { MIB.addReg(SOffset, SOffsetRegState); } MIB.addImm(Offset) .addImm(0) // glc .addImm(0) // slc - .addImm(0) // tfe - .addImm(0) // dlc - .addImm(0) // swz - .addMemOperand(NewMMO); + .addImm(0); // tfe for MUBUF or dlc for FLAT + if (!IsFlat) + MIB.addImm(0) // dlc + .addImm(0); // swz + MIB.addMemOperand(NewMMO); if (!IsAGPR && NeedSuperRegDef) MIB.addReg(ValueReg, RegState::ImplicitDefine); @@ -947,14 +1000,18 @@ EltSize, Alignment); if (IsLoad) { - buildSpillLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET, + unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR + : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; + buildSpillLoadStore(MI, Opc, Index, VGPR, false, MFI->getScratchRSrcReg(), FrameReg, Offset * EltSize, MMO, RS); } else { - buildSpillLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET, Index, VGPR, + unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR + : AMDGPU::BUFFER_STORE_DWORD_OFFSET; + buildSpillLoadStore(MI, Opc, Index, VGPR, IsKill, MFI->getScratchRSrcReg(), FrameReg, Offset * EltSize, MMO, RS); // This only ever adds one VGPR spill @@ -1294,7 +1351,9 @@ assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == MFI->getStackPtrOffsetReg()); - buildSpillLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET, + unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR + : AMDGPU::BUFFER_STORE_DWORD_OFFSET; + buildSpillLoadStore(MI, Opc, Index, VData->getReg(), VData->isKill(), TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(), @@ -1328,7 +1387,9 @@ assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == MFI->getStackPtrOffsetReg()); - buildSpillLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET, + unsigned Opc = ST.enableFlatScratch() ? 
AMDGPU::SCRATCH_LOAD_DWORD_SADDR + : AMDGPU::BUFFER_LOAD_DWORD_OFFSET; + buildSpillLoadStore(MI, Opc, Index, VData->getReg(), VData->isKill(), TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc)->getReg(), @@ -1342,6 +1403,113 @@ default: { const DebugLoc &DL = MI->getDebugLoc(); + + int64_t Offset = FrameInfo.getObjectOffset(Index); + if (ST.enableFlatScratch()) { + if (TII->isFLATScratch(*MI)) { + // The offset is always swizzled, just replace it + if (FrameReg) + FIOp.ChangeToRegister(FrameReg, false); + + if (!Offset) + return; + + MachineOperand *OffsetOp = + TII->getNamedOperand(*MI, AMDGPU::OpName::offset); + int64_t NewOffset = Offset + OffsetOp->getImm(); + if (TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS, + true)) { + OffsetOp->setImm(NewOffset); + if (FrameReg) + return; + Offset = 0; + } + + assert(!TII->getNamedOperand(*MI, AMDGPU::OpName::vaddr) && + "Unexpected vaddr for flat scratch with a FI operand"); + + // On GFX10 we have ST mode to use no registers for an address. + // Otherwise we need to materialize 0 into an SGPR. + if (!Offset && ST.hasFlatScratchSTMode()) { + unsigned Opc = MI->getOpcode(); + unsigned NewOpc = AMDGPU::getFlatScratchInstSTfromSS(Opc); + MI->RemoveOperand( + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr)); + MI->setDesc(TII->get(NewOpc)); + return; + } + } + + if (!FrameReg) { + FIOp.ChangeToImmediate(Offset); + if (TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) + return; + } + + // We need to use register here. Check if we can use an SGPR or need + // a VGPR. + FIOp.ChangeToRegister(AMDGPU::M0, false); + bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, &FIOp); + + if (!Offset && FrameReg && UseSGPR) { + FIOp.setReg(FrameReg); + return; + } + + const TargetRegisterClass *RC = UseSGPR ? &AMDGPU::SReg_32_XM0RegClass + : &AMDGPU::VGPR_32RegClass; + + Register TmpReg = RS->scavengeRegister(RC, MI, 0, !UseSGPR); + FIOp.setReg(TmpReg); + FIOp.setIsKill(true); + + if ((!FrameReg || !Offset) && TmpReg) { + unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; + auto MIB = BuildMI(*MBB, MI, DL, TII->get(Opc), TmpReg); + if (FrameReg) + MIB.addReg(FrameReg); + else + MIB.addImm(Offset); + + return; + } + + Register TmpSReg = + UseSGPR ? TmpReg + : RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, + !UseSGPR); + + // TODO: for flat scratch another attempt can be made with a VGPR index + // if no SGPRs can be scavenged. + if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR)) + report_fatal_error("Cannot scavenge register in FI elimination!"); + + if (!TmpSReg) { + // Use frame register and restore it after. + TmpSReg = FrameReg; + FIOp.setReg(FrameReg); + FIOp.setIsKill(false); + } + + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), TmpSReg) + .addReg(FrameReg) + .addImm(Offset); + + if (!UseSGPR) + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg) + .addReg(TmpSReg, RegState::Kill); + + if (TmpSReg == FrameReg) { + // Undo frame register modification. + BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_SUB_U32), + FrameReg) + .addReg(FrameReg) + .addImm(Offset); + } + + return; + } + bool IsMUBUF = TII->isMUBUF(*MI); if (!IsMUBUF && !MFI->isEntryFunction()) { @@ -1471,7 +1639,6 @@ // If the offset is simply too big, don't convert to a scratch wave offset // relative index. 
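  // (For comparison, the flat-scratch path above resolves a large frame
  // offset with a scalar add, e.g. (register numbers illustrative):
  //   s_add_u32 s4, s32, 0x1008
  //   scratch_store_dword off, v0, s4
  // while the MUBUF path below may need a VGPR index register.)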
- int64_t Offset = FrameInfo.getObjectOffset(Index); FIOp.ChangeToImmediate(Offset); if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) { Register TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); diff --git a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll --- a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll @@ -1,6 +1,7 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MUBUF %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MUBUF %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,MUBUF %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FLATSCR %s declare hidden void @external_void_func_void() #0 @@ -22,7 +23,8 @@ } ; GCN-LABEL: {{^}}test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void: -; GCN: buffer_store_dword +; MUBUF: buffer_store_dword +; FLATSCR: scratch_store_dword ; GCN: v_writelane_b32 v40, s33, 4 ; GCN: v_writelane_b32 v40, s34, 0 ; GCN: v_writelane_b32 v40, s35, 1 @@ -39,7 +41,8 @@ ; GCN: v_readlane_b32 s34, v40, 0 ; GCN: v_readlane_b32 s33, v40, 4 -; GCN: buffer_load_dword +; MUBUF: buffer_load_dword +; FLATSCR: scratch_load_dword ; GCN: s_setpc_b64 define void @test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void() #0 { call void @external_void_func_void() @@ -49,16 +52,19 @@ } ; GCN-LABEL: {{^}}test_func_call_external_void_funcx2: -; GCN: buffer_store_dword v40 +; MUBUF: buffer_store_dword v40 +; FLATSCR: scratch_store_dword off, v40 ; GCN: v_writelane_b32 v40, s33, 4 ; GCN: s_mov_b32 s33, s32 -; GCN: s_add_u32 s32, s32, 0x400 +; MUBUF: s_add_u32 s32, s32, 0x400 +; FLATSCR: s_add_u32 s32, s32, 16 ; GCN: s_swappc_b64 ; GCN-NEXT: s_swappc_b64 ; GCN: v_readlane_b32 s33, v40, 4 -; GCN: buffer_load_dword v40, +; MUBUF: buffer_load_dword v40 +; FLATSCR: scratch_load_dword v40 define void @test_func_call_external_void_funcx2() #0 { call void @external_void_func_void() call void @external_void_func_void() diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll --- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll @@ -1,5 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CI %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,MUBUF %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,MUBUF %s +; RUN: llc -march=amdgcn 
-mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-flat-scratch < %s | FileCheck -enable-var-scope -check-prefixes=GCN,FLATSCR %s ; GCN-LABEL: {{^}}callee_no_stack: ; GCN: ; %bb.0: @@ -32,7 +33,8 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt ; GCN-NEXT: v_mov_b32_e32 v0, 0{{$}} -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32{{$}} +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32{{$}} +; FLATSCR-NEXT: scratch_store_dword off, v0, s32 ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @callee_with_stack() #0 { @@ -48,10 +50,13 @@ ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_mov_b32 s4, s33 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_add_u32 s32, s32, 0x200 +; MUBUF-NEXT: s_add_u32 s32, s32, 0x200 +; FLATSCR-NEXT: s_add_u32 s32, s32, 8 ; GCN-NEXT: v_mov_b32_e32 v0, 0{{$}} -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4{{$}} -; GCN-NEXT: s_sub_u32 s32, s32, 0x200 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4{{$}} +; FLATSCR-NEXT: scratch_store_dword off, v0, s33 offset:4{{$}} +; MUBUF-NEXT: s_sub_u32 s32, s32, 0x200 +; FLATSCR-NEXT: s_sub_u32 s32, s32, 8 ; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 @@ -65,7 +70,8 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt ; GCN-NEXT: v_mov_b32_e32 v0, 0{{$}} -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32{{$}} +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32{{$}} +; FLATSCR-NEXT: scratch_store_dword off, v0, s32{{$}} ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 define void @callee_with_stack_no_fp_elim_non_leaf() #2 { @@ -78,26 +84,31 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt ; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] ; GCN: v_writelane_b32 [[CSR_VGPR]], s33, 2 ; GCN-DAG: s_mov_b32 s33, s32 -; GCN-DAG: s_add_u32 s32, s32, 0x400{{$}} +; MUBUF-DAG: s_add_u32 s32, s32, 0x400{{$}} +; FLATSCR-DAG: s_add_u32 s32, s32, 16{{$}} ; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s30, ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, -; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s33{{$}} +; MUBUF-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s33{{$}} +; FLATSCR-DAG: scratch_store_dword off, [[ZERO]], s33{{$}} ; GCN: s_swappc_b64 ; GCN-DAG: v_readlane_b32 s5, [[CSR_VGPR]] ; GCN-DAG: v_readlane_b32 s4, [[CSR_VGPR]] -; GCN: s_sub_u32 s32, s32, 0x400{{$}} +; MUBUF: s_sub_u32 s32, s32, 0x400{{$}} +; FLATSCR: s_sub_u32 s32, s32, 16{{$}} ; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], 2 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -118,9 +129,11 @@ ; GCN-LABEL: {{^}}callee_no_stack_with_call: ; GCN: s_waitcnt ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 ; 4-byte Folded Spill +; MUBUF-NEXT: buffer_store_dword 
[[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 ; 4-byte Folded Spill +; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN-DAG: s_add_u32 s32, s32, 0x400 +; MUBUF-DAG: s_add_u32 s32, s32, 0x400 +; FLATSCR-DAG: s_add_u32 s32, s32, 16 ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s33, [[FP_SPILL_LANE:[0-9]+]] ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s30, 0 @@ -130,10 +143,12 @@ ; GCN-DAG: v_readlane_b32 s4, v40, 0 ; GCN-DAG: v_readlane_b32 s5, v40, 1 -; GCN: s_sub_u32 s32, s32, 0x400 +; MUBUF: s_sub_u32 s32, s32, 0x400 +; FLATSCR: s_sub_u32 s32, s32, 16 ; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], [[FP_SPILL_LANE]] ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 ; 4-byte Folded Reload +; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 ; 4-byte Folded Reload +; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 @@ -149,7 +164,8 @@ ; ; GCN-LABEL: {{^}}callee_func_sgpr_spill_no_calls: ; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 ; 4-byte Folded Spill +; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 ; 4-byte Folded Spill +; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] ; GCN: v_writelane_b32 [[CSR_VGPR]], s ; GCN: v_writelane_b32 [[CSR_VGPR]], s @@ -159,7 +175,8 @@ ; GCN: v_readlane_b32 s{{[0-9]+}}, [[CSR_VGPR]] ; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 ; 4-byte Folded Reload +; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 ; 4-byte Folded Reload +; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -208,16 +225,21 @@ ; GCN-NEXT:s_mov_b32 [[FP_COPY:s[0-9]+]], s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 -; GCN-DAG: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s33 offset:8 +; MUBUF-DAG: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s33 offset:8 +; FLATSCR-DAG: scratch_store_dword off, [[ZERO]], s33 offset:8 ; GCN: ;;#ASMSTART ; GCN-NEXT: ; clobber v41 ; GCN-NEXT: ;;#ASMEND -; GCN: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN: s_add_u32 s32, s32, 0x300 -; GCN-NEXT: s_sub_u32 s32, s32, 0x300 +; MUBUF: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; FLATSCR: scratch_load_dword v41, off, s33 ; 4-byte Folded Reload +; MUBUF: s_add_u32 s32, s32, 0x300 +; MUBUF-NEXT: s_sub_u32 s32, s32, 0x300 +; FLATSCR: s_add_u32 s32, s32, 12 +; FLATSCR-NEXT: s_sub_u32 s32, s32, 12 ; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 @@ -233,14 +255,18 @@ ; GCN: s_waitcnt ; GCN-NEXT: v_writelane_b32 v1, s33, 63 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; MUBUF: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; 
FLATSCR: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill ; GCN-COUNT-63: v_writelane_b32 v1 -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:8 +; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:8 +; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s33 offset:8 ; GCN: ;;#ASMSTART ; GCN-COUNT-63: v_readlane_b32 s{{[0-9]+}}, v1 -; GCN: s_add_u32 s32, s32, 0x300 -; GCN-NEXT: s_sub_u32 s32, s32, 0x300 +; MUBUF: s_add_u32 s32, s32, 0x300 +; MUBUF-NEXT: s_sub_u32 s32, s32, 0x300 +; FLATSCR: s_add_u32 s32, s32, 12 +; FLATSCR-NEXT: s_sub_u32 s32, s32, 12 ; GCN-NEXT: v_readlane_b32 s33, v1, 63 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 @@ -265,16 +291,21 @@ ; GCN: s_waitcnt ; GCN-NEXT: s_mov_b32 [[FP_COPY:s[0-9]+]], s33 ; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; MUBUF-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; FLATSCR-NEXT: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill ; GCN-COUNT-64: v_writelane_b32 v1, -; GCN: buffer_store_dword +; MUBUF: buffer_store_dword +; FLATSCR: scratch_store_dword ; GCN: ;;#ASMSTART ; GCN-COUNT-64: v_readlane_b32 s{{[0-9]+}}, v1 -; GCN: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN: s_add_u32 s32, s32, 0x300 -; GCN-NEXT: s_sub_u32 s32, s32, 0x300 +; MUBUF: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; FLATSCR: scratch_load_dword v41, off, s33 ; 4-byte Folded Reload +; MUBUF: s_add_u32 s32, s32, 0x300 +; MUBUF-NEXT: s_sub_u32 s32, s32, 0x300 +; FLATSCR: s_add_u32 s32, s32, 12 +; FLATSCR-NEXT: s_sub_u32 s32, s32, 12 ; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 @@ -296,13 +327,18 @@ ; GCN-LABEL: {{^}}realign_stack_no_fp_elim: ; GCN: s_waitcnt -; GCN-NEXT: s_add_u32 [[SCRATCH:s[0-9]+]], s32, 0x7ffc0 -; GCN-NEXT: s_mov_b32 s4, s33 -; GCN-NEXT: s_and_b32 s33, [[SCRATCH]], 0xfff80000 -; GCN-NEXT: s_add_u32 s32, s32, 0x100000 -; GCN-NEXT: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 -; GCN-NEXT: buffer_store_dword [[ZERO]], off, s[0:3], s33 -; GCN-NEXT: s_sub_u32 s32, s32, 0x100000 +; MUBUF-NEXT: s_add_u32 [[SCRATCH:s[0-9]+]], s32, 0x7ffc0 +; FLATSCR-NEXT: s_add_u32 [[SCRATCH:s[0-9]+]], s32, 0x1fff +; GCN-NEXT: s_mov_b32 s4, s33 +; MUBUF-NEXT: s_and_b32 s33, [[SCRATCH]], 0xfff80000 +; FLATSCR-NEXT: s_and_b32 s33, [[SCRATCH]], 0xffffe000 +; MUBUF-NEXT: s_add_u32 s32, s32, 0x100000 +; FLATSCR-NEXT: s_add_u32 s32, s32, 0x4000 +; GCN-NEXT: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 +; MUBUF-NEXT: buffer_store_dword [[ZERO]], off, s[0:3], s33 +; FLATSCR-NEXT: scratch_store_dword off, [[ZERO]], s33 +; MUBUF-NEXT: s_sub_u32 s32, s32, 0x100000 +; FLATSCR-NEXT: s_sub_u32 s32, s32, 0x4000 ; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 @@ -319,12 +355,15 @@ ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 ; GCN: v_writelane_b32 v1, s31, 1 -; GCN: buffer_store_dword [[ZERO]], off, s[0:3], s33 offset:4 +; MUBUF: buffer_store_dword [[ZERO]], off, s[0:3], s33 offset:4 +; FLATSCR: scratch_store_dword off, [[ZERO]], s33 offset:4 ; GCN: ;;#ASMSTART ; GCN: v_readlane_b32 s4, v1, 0 -; GCN-NEXT: s_add_u32 s32, s32, 0x200 -; GCN-NEXT: v_readlane_b32 s5, v1, 1 -; GCN-NEXT: s_sub_u32 s32, s32, 0x200 +; MUBUF-NEXT: s_add_u32 s32, s32, 0x200 +; FLATSCR-NEXT: s_add_u32 s32, s32, 8 +; GCN-NEXT: v_readlane_b32 s5, v1, 1 +; MUBUF-NEXT: s_sub_u32 s32, s32, 0x200 +; FLATSCR-NEXT: s_sub_u32 s32, s32, 8 ; GCN-NEXT: v_readlane_b32 
s33, v1, 2 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[4:5] @@ -346,24 +385,29 @@ ; GCN-LABEL: {{^}}no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr: ; GCN: s_waitcnt ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s32 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] ; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s33, 2 ; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s30, 0 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, 1 -; GCN-DAG: buffer_store_dword -; GCN: s_add_u32 s32, s32, 0x300{{$}} +; MUBUF-DAG: buffer_store_dword +; FLATSCR-DAG: scratch_store_dword +; MUBUF: s_add_u32 s32, s32, 0x300{{$}} +; FLATSCR: s_add_u32 s32, s32, 12{{$}} ; GCN: ;;#ASMSTART -; GCN: v_readlane_b32 s4, [[CSR_VGPR]], 0 -; GCN-NEXT: v_readlane_b32 s5, [[CSR_VGPR]], 1 -; GCN-NEXT: s_sub_u32 s32, s32, 0x300{{$}} +; GCN: v_readlane_b32 s4, [[CSR_VGPR]], 0 +; GCN-NEXT: v_readlane_b32 s5, [[CSR_VGPR]], 1 +; MUBUF-NEXT: s_sub_u32 s32, s32, 0x300{{$}} +; FLATSCR-NEXT: s_sub_u32 s32, s32, 12{{$}} ; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], 2 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s32 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 @@ -392,25 +436,32 @@ ; GCN-LABEL: {{^}}scratch_reg_needed_mubuf_offset: ; GCN: s_waitcnt ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: v_mov_b32_e32 [[SCRATCH_VGPR:v[0-9]+]], 0x1008 -; GCN-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], [[SCRATCH_VGPR]], s[0:3], s32 offen ; 4-byte Folded Spill +; MUBUF-NEXT: v_mov_b32_e32 [[SCRATCH_VGPR:v[0-9]+]], 0x1008 +; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], [[SCRATCH_VGPR]], s[0:3], s32 offen ; 4-byte Folded Spill +; FLATSCR-NEXT: s_add_u32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x1008 +; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], [[SCRATCH_SGPR]] ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] ; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s33, 2 -; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s30, 0 -; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, 1 -; GCN-DAG: s_add_u32 s32, s32, 0x40300{{$}} -; GCN-DAG: buffer_store_dword +; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s30, 0 +; GCN-DAG: s_mov_b32 s33, s32 +; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, 1 +; MUBUF-DAG: s_add_u32 s32, s32, 0x40300{{$}} +; FLATSCR-DAG: s_add_u32 s32, s32, 0x100c{{$}} +; MUBUF-DAG: buffer_store_dword +; FLATSCR-DAG: scratch_store_dword ; GCN: ;;#ASMSTART ; GCN: v_readlane_b32 s4, [[CSR_VGPR]], 0 ; GCN-NEXT: v_readlane_b32 s5, [[CSR_VGPR]], 1 -; GCN-NEXT: s_sub_u32 s32, s32, 0x40300{{$}} +; MUBUF-NEXT: s_sub_u32 s32, s32, 0x40300{{$}} +; FLATSCR-NEXT: s_sub_u32 s32, s32, 0x100c{{$}} ; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], 2 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN-NEXT: v_mov_b32_e32 [[SCRATCH_VGPR:v[0-9]+]], 0x1008 -; GCN-NEXT: 
buffer_load_dword [[CSR_VGPR]], [[SCRATCH_VGPR]], s[0:3], s32 offen ; 4-byte Folded Reload +; MUBUF-NEXT: v_mov_b32_e32 [[SCRATCH_VGPR:v[0-9]+]], 0x1008 +; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], [[SCRATCH_VGPR]], s[0:3], s32 offen ; 4-byte Folded Reload +; FLATSCR-NEXT: s_add_u32 [[SCRATCH_SGPR:s[0-9]+]], s32, 0x1008 +; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, [[SCRATCH_SGPR]] ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 @@ -447,10 +498,13 @@ ; GCN-LABEL: {{^}}ipra_call_with_stack: ; GCN: s_mov_b32 [[FP_COPY:s[0-9]+]], s33 ; GCN: s_mov_b32 s33, s32 -; GCN: s_add_u32 s32, s32, 0x400 -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33{{$}} -; GCN: s_swappc_b64 -; GCN: s_sub_u32 s32, s32, 0x400 +; MUBUF: s_add_u32 s32, s32, 0x400 +; FLATSCR: s_add_u32 s32, s32, 16 +; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33{{$}} +; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s33{{$}} +; GCN: s_swappc_b64 +; MUBUF: s_sub_u32 s32, s32, 0x400 +; FLATSCR: s_sub_u32 s32, s32, 16 ; GCN: s_mov_b32 s33, [[FP_COPY:s[0-9]+]] define void @ipra_call_with_stack() #0 { %alloca = alloca i32, addrspace(5) @@ -463,11 +517,13 @@ ; GCN-LABEL: {{^}}callee_need_to_spill_fp_to_memory: ; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GCN: v_mov_b32_e32 [[TMP_VGPR1:v[0-9]+]], s33 -; GCN: buffer_store_dword [[TMP_VGPR1]], off, s[0:3], s32 offset:4 +; MUBUF: buffer_store_dword [[TMP_VGPR1]], off, s[0:3], s32 offset:4 +; FLATSCR: scratch_store_dword off, [[TMP_VGPR1]], s32 offset:4 ; GCN: s_mov_b64 exec, [[COPY_EXEC1]] ; GCN: s_mov_b32 s33, s32 ; GCN: s_or_saveexec_b64 [[COPY_EXEC2:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN: buffer_load_dword [[TMP_VGPR2:v[0-9]+]], off, s[0:3], s32 offset:4 +; MUBUF: buffer_load_dword [[TMP_VGPR2:v[0-9]+]], off, s[0:3], s32 offset:4 +; FLATSCR: scratch_load_dword [[TMP_VGPR2:v[0-9]+]], off, s32 offset:4 ; GCN: s_waitcnt vmcnt(0) ; GCN: v_readfirstlane_b32 s33, [[TMP_VGPR2]] ; GCN: s_mov_b64 exec, [[COPY_EXEC2]] @@ -494,13 +550,15 @@ ; GCN-LABEL: {{^}}callee_need_to_spill_fp_to_memory_full_reserved_vgpr: ; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GCN: v_mov_b32_e32 [[TMP_VGPR1:v[0-9]+]], s33 -; GCN: buffer_store_dword [[TMP_VGPR1]], off, s[0:3], s32 offset:[[OFF:[0-9]+]] +; MUBUF: buffer_store_dword [[TMP_VGPR1]], off, s[0:3], s32 offset:[[OFF:[0-9]+]] +; FLATSCR: scratch_store_dword off, [[TMP_VGPR1]], s32 offset:[[OFF:[0-9]+]] ; GCN: s_mov_b64 exec, [[COPY_EXEC1]] ; GCN-NOT: v_writelane_b32 v40, s33 ; GCN: s_mov_b32 s33, s32 ; GCN-NOT: v_readlane_b32 s33, v40 ; GCN: s_or_saveexec_b64 [[COPY_EXEC2:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; GCN: buffer_load_dword [[TMP_VGPR2:v[0-9]+]], off, s[0:3], s32 offset:[[OFF]] +; MUBUF: buffer_load_dword [[TMP_VGPR2:v[0-9]+]], off, s[0:3], s32 offset:[[OFF]] +; FLATSCR: scratch_load_dword [[TMP_VGPR2:v[0-9]+]], off, s32 offset:[[OFF]] ; GCN: v_readfirstlane_b32 s33, [[TMP_VGPR2]] ; GCN: s_mov_b64 exec, [[COPY_EXEC2]] ; GCN: s_setpc_b64 @@ -529,10 +587,13 @@ ; scratch VGPR to hold the offset. 
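A relationship worth making explicit, since it governs every paired MUBUF/FLATSCR immediate in the stack adjustments above: the MUBUF form keeps the stack pointer in swizzled units (per-lane bytes scaled by the wavefront size, 64 on these wave64 targets), while the flat-scratch form keeps it in plain per-lane bytes, so each MUBUF constant is exactly 64x its FLATSCR counterpart. Below is a minimal sketch checking that rule against the constants in these tests; the pairs are read off the checks above, and the 64x scaling itself is an inference from those values rather than something the patch states.

wave_size = 64  # wave64 targets used by these run lines
# (MUBUF immediate, FLATSCR immediate) pairs taken from the checks above
pairs = [
    (0x300, 12),         # s_add_u32/s_sub_u32 s32 around a 12-byte frame
    (0x200, 8),          # 8-byte frame variant
    (0x400, 16),         # ipra_call_with_stack
    (0x40300, 0x100c),   # scratch_reg_needed_mubuf_offset
    (0x100000, 0x4000),  # realign_stack_no_fp_elim
]
for mubuf_imm, flatscr_imm in pairs:
    assert mubuf_imm == flatscr_imm * wave_size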
; GCN-LABEL: {{^}}spill_fp_to_memory_scratch_reg_needed_mubuf_offset ; GCN: s_or_saveexec_b64 s[4:5], -1 -; GCN: v_mov_b32_e32 v0, s33 +; MUBUF: v_mov_b32_e32 v0, s33 ; GCN-NOT: v_mov_b32_e32 v0, 0x1008 -; GCN-NEXT: v_mov_b32_e32 v1, 0x1008 -; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], s32 offen +; MUBUF-NEXT: v_mov_b32_e32 v1, 0x1008 +; MUBUF-NEXT: buffer_store_dword v0, v1, s[0:3], s32 offen ; 4-byte Folded Spill +; FLATSCR-NEXT: s_add_u32 [[SOFF:s[0-9]+]], s32, 0x1008 +; FLATSCR-NEXT: v_mov_b32_e32 v0, s33 +; FLATSCR-NEXT: scratch_store_dword off, v0, [[SOFF]] ; 4-byte Folded Spill define void @spill_fp_to_memory_scratch_reg_needed_mubuf_offset([4096 x i8] addrspace(5)* byval align 4 %arg) #3 { %alloca = alloca i32, addrspace(5) store volatile i32 0, i32 addrspace(5)* %alloca diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll --- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll @@ -1,15 +1,27 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,FLATSCR %s define <2 x half> @chain_hi_to_lo_private() { -; GCN-LABEL: chain_hi_to_lo_private: -; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:2 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], 0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: chain_hi_to_lo_private: +; GFX900: ; %bb.0: ; %bb +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:2 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], 0 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: chain_hi_to_lo_private: +; FLATSCR: ; %bb.0: ; %bb +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s4, 2 +; FLATSCR-NEXT: scratch_load_ushort v0, off, s4 +; FLATSCR-NEXT: s_mov_b32 s4, 0 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_load_short_d16_hi v0, off, s4 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] bb: %gep_lo = getelementptr inbounds half, half addrspace(5)* null, i64 1 %load_lo = load half, half addrspace(5)* %gep_lo @@ -23,14 +35,23 @@ } define <2 x half> @chain_hi_to_lo_private_different_bases(half addrspace(5)* %base_lo, half addrspace(5)* %base_hi) { -; GCN-LABEL: chain_hi_to_lo_private_different_bases: -; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_ushort v0, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_short_d16_hi v0, v1, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: chain_hi_to_lo_private_different_bases: +; GFX900: ; %bb.0: ; %bb +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: buffer_load_ushort v0, v0, s[0:3], 0 offen +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_load_short_d16_hi v0, v1, s[0:3], 0 offen +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: chain_hi_to_lo_private_different_bases: +; FLATSCR: ; %bb.0: 
; %bb +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: scratch_load_ushort v0, v0, off +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_load_short_d16_hi v0, v1, off +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] bb: %load_lo = load half, half addrspace(5)* %base_lo %load_hi = load half, half addrspace(5)* %base_hi @@ -42,14 +63,23 @@ } define <2 x half> @chain_hi_to_lo_arithmatic(half addrspace(5)* %base, half %in) { -; GCN-LABEL: chain_hi_to_lo_arithmatic: -; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_add_f16_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, v1 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: chain_hi_to_lo_arithmatic: +; GFX900: ; %bb.0: ; %bb +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: v_add_f16_e32 v1, 1.0, v1 +; GFX900-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], 0 offen +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, v1 +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: chain_hi_to_lo_arithmatic: +; FLATSCR: ; %bb.0: ; %bb +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: v_add_f16_e32 v1, 1.0, v1 +; FLATSCR-NEXT: scratch_load_short_d16_hi v1, v0, off +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: v_mov_b32_e32 v0, v1 +; FLATSCR-NEXT: s_setpc_b64 s[30:31] bb: %arith_lo = fadd half %in, 1.0 %load_hi = load half, half addrspace(5)* %base @@ -191,38 +221,75 @@ ; Make sure we don't lose any of the private stores. define amdgpu_kernel void @vload2_private(i16 addrspace(1)* nocapture readonly %in, <2 x i16> addrspace(1)* nocapture %out) #0 { -; GCN-LABEL: vload2_private: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 -; GCN-NEXT: s_add_u32 s0, s0, s9 -; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: global_load_ushort v2, v[0:1], off -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4 -; GCN-NEXT: global_load_ushort v2, v[0:1], off offset:2 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:6 -; GCN-NEXT: global_load_ushort v2, v[0:1], off offset:4 -; GCN-NEXT: v_mov_b32_e32 v0, s6 -; GCN-NEXT: v_mov_b32_e32 v1, s7 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:8 -; GCN-NEXT: buffer_load_ushort v2, off, s[0:3], 0 offset:4 -; GCN-NEXT: buffer_load_ushort v4, off, s[0:3], 0 offset:6 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v3, v4 -; GCN-NEXT: buffer_load_short_d16_hi v3, off, s[0:3], 0 offset:8 -; GCN-NEXT: v_lshl_or_b32 v2, v4, 16, v2 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: global_store_dwordx2 v[0:1], v[2:3], off -; GCN-NEXT: s_endpgm +; GFX900-LABEL: vload2_private: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GFX900-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GFX900-NEXT: s_add_u32 s0, s0, s9 +; GFX900-NEXT: s_addc_u32 s1, s1, 0 +; GFX900-NEXT: s_waitcnt lgkmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, s4 +; GFX900-NEXT: v_mov_b32_e32 v1, s5 +; GFX900-NEXT: 
global_load_ushort v2, v[0:1], off +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4 +; GFX900-NEXT: global_load_ushort v2, v[0:1], off offset:2 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:6 +; GFX900-NEXT: global_load_ushort v2, v[0:1], off offset:4 +; GFX900-NEXT: v_mov_b32_e32 v0, s6 +; GFX900-NEXT: v_mov_b32_e32 v1, s7 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:8 +; GFX900-NEXT: buffer_load_ushort v2, off, s[0:3], 0 offset:4 +; GFX900-NEXT: buffer_load_ushort v4, off, s[0:3], 0 offset:6 +; GFX900-NEXT: s_waitcnt vmcnt(1) +; GFX900-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v3, v4 +; GFX900-NEXT: buffer_load_short_d16_hi v3, off, s[0:3], 0 offset:8 +; GFX900-NEXT: v_lshl_or_b32 v2, v4, 16, v2 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; GFX900-NEXT: s_endpgm +; +; FLATSCR-LABEL: vload2_private: +; FLATSCR: ; %bb.0: ; %entry +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; FLATSCR-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 +; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; FLATSCR-NEXT: v_mov_b32_e32 v0, s4 +; FLATSCR-NEXT: v_mov_b32_e32 v1, s5 +; FLATSCR-NEXT: global_load_ushort v2, v[0:1], off +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_short off, v2, vcc_hi offset:4 +; FLATSCR-NEXT: global_load_ushort v2, v[0:1], off offset:2 +; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_short off, v2, vcc_hi offset:6 +; FLATSCR-NEXT: global_load_ushort v2, v[0:1], off offset:4 +; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 +; FLATSCR-NEXT: v_mov_b32_e32 v0, s6 +; FLATSCR-NEXT: v_mov_b32_e32 v1, s7 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_short off, v2, vcc_hi offset:8 +; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 +; FLATSCR-NEXT: scratch_load_ushort v2, off, vcc_hi offset:4 +; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 +; FLATSCR-NEXT: scratch_load_ushort v4, off, vcc_hi offset:6 +; FLATSCR-NEXT: s_mov_b32 vcc_hi, 0 +; FLATSCR-NEXT: s_waitcnt vmcnt(1) +; FLATSCR-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: v_mov_b32_e32 v3, v4 +; FLATSCR-NEXT: scratch_load_short_d16_hi v3, off, vcc_hi offset:8 +; FLATSCR-NEXT: v_lshl_or_b32 v2, v4, 16, v2 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; FLATSCR-NEXT: s_endpgm entry: %loc = alloca [3 x i16], align 2, addrspace(5) %loc.0.sroa_cast1 = bitcast [3 x i16] addrspace(5)* %loc to i8 addrspace(5)* @@ -297,16 +364,27 @@ } define <2 x i16> @chain_hi_to_lo_private_other_dep(i16 addrspace(5)* %ptr) { -; GCN-LABEL: chain_hi_to_lo_private_other_dep: -; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], 0 offen -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0] -; GCN-NEXT: buffer_load_short_d16 v1, v0, s[0:3], 0 offen offset:2 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, v1 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX900-LABEL: chain_hi_to_lo_private_other_dep: +; GFX900: ; %bb.0: ; %bb +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], 0 offen +; GFX900-NEXT: s_waitcnt vmcnt(0) +; 
GFX900-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
+; GFX900-NEXT: buffer_load_short_d16 v1, v0, s[0:3], 0 offen offset:2
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, v1
+; GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; FLATSCR-LABEL: chain_hi_to_lo_private_other_dep:
+; FLATSCR: ; %bb.0: ; %bb
+; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; FLATSCR-NEXT: scratch_load_short_d16_hi v1, v0, off
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: v_pk_sub_u16 v1, v1, -12 op_sel_hi:[1,0]
+; FLATSCR-NEXT: scratch_load_short_d16 v1, v0, off offset:2
+; FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; FLATSCR-NEXT: v_mov_b32_e32 v0, v1
+; FLATSCR-NEXT: s_setpc_b64 s[30:31]
 bb:
 %gep_lo = getelementptr inbounds i16, i16 addrspace(5)* %ptr, i64 1
 %load_lo = load i16, i16 addrspace(5)* %gep_lo
diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
--- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
+++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll
@@ -2,6 +2,7 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -mattr=-unaligned-scratch-access < %s | FileCheck -check-prefixes=GCN,GFX7-ALIGNED %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -mattr=+unaligned-scratch-access < %s | FileCheck -check-prefixes=GCN,GFX7-UNALIGNED %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+unaligned-scratch-access < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+unaligned-scratch-access -amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,GFX9-FLATSCR %s
 ; Should not merge this to a dword load
 define i32 @private_load_2xi16_align2(i16 addrspace(5)* %p) #0 {
@@ -35,6 +36,15 @@
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-FLATSCR-LABEL: private_load_2xi16_align2:
+; GFX9-FLATSCR: ; %bb.0:
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT: scratch_load_ushort v1, v0, off
+; GFX9-FLATSCR-NEXT: scratch_load_ushort v0, v0, off offset:2
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT: v_lshl_or_b32 v0, v0, 16, v1
+; GFX9-FLATSCR-NEXT: s_setpc_b64 s[30:31]
 %gep.p = getelementptr i16, i16 addrspace(5)* %p, i64 1
 %p.0 = load i16, i16 addrspace(5)* %p, align 2
 %p.1 = load i16, i16 addrspace(5)* %gep.p, align 2
@@ -78,6 +88,16 @@
 ; GFX9-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen offset:2
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-FLATSCR-LABEL: private_store_2xi16_align2:
+; GFX9-FLATSCR: ; %bb.0:
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 1
+; GFX9-FLATSCR-NEXT: scratch_store_short v1, v0, off
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 2
+; GFX9-FLATSCR-NEXT: scratch_store_short v1, v0, off offset:2
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT: s_setpc_b64 s[30:31]
 %gep.r = getelementptr i16, i16 addrspace(5)* %r, i64 1
 store i16 1, i16 addrspace(5)* %r, align 2
 store i16 2, i16 addrspace(5)* %gep.r, align 2
@@ -124,6 +144,17 @@
 ; GFX9-NEXT: v_bfi_b32 v1, v1, 0, v0
 ; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-FLATSCR-LABEL: private_load_2xi16_align1:
+; GFX9-FLATSCR: ; %bb.0:
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v0, off
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xffff
+; GFX9-FLATSCR-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT: v_bfi_b32 v1, v1, 0, v0
+; GFX9-FLATSCR-NEXT: v_and_or_b32 v0, v0, s4, v1
+; GFX9-FLATSCR-NEXT: s_setpc_b64 s[30:31]
 %gep.p = getelementptr i16, i16 addrspace(5)* %p, i64 1
 %p.0 = load i16, i16 addrspace(5)* %p, align 1
 %p.1 = load i16, i16 addrspace(5)* %gep.p, align 1
@@ -167,6 +198,14 @@
 ; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-FLATSCR-LABEL: private_store_2xi16_align1:
+; GFX9-FLATSCR: ; %bb.0:
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0x20001
+; GFX9-FLATSCR-NEXT: scratch_store_dword v1, v0, off
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT: s_setpc_b64 s[30:31]
 %gep.r = getelementptr i16, i16 addrspace(5)* %r, i64 1
 store i16 1, i16 addrspace(5)* %r, align 1
 store i16 2, i16 addrspace(5)* %gep.r, align 1
@@ -206,6 +245,17 @@
 ; GFX9-NEXT: v_bfi_b32 v1, v1, 0, v0
 ; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v1
 ; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-FLATSCR-LABEL: private_load_2xi16_align4:
+; GFX9-FLATSCR: ; %bb.0:
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v0, off
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0xffff
+; GFX9-FLATSCR-NEXT: s_mov_b32 s4, 0xffff
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT: v_bfi_b32 v1, v1, 0, v0
+; GFX9-FLATSCR-NEXT: v_and_or_b32 v0, v0, s4, v1
+; GFX9-FLATSCR-NEXT: s_setpc_b64 s[30:31]
 %gep.p = getelementptr i16, i16 addrspace(5)* %p, i64 1
 %p.0 = load i16, i16 addrspace(5)* %p, align 4
 %p.1 = load i16, i16 addrspace(5)* %gep.p, align 2
@@ -228,13 +278,37 @@
 ; GFX7-NEXT: flat_store_dword v[0:1], v2
 ; GFX7-NEXT: s_endpgm
 ;
-; GCN-LABEL: private_store_2xi16_align4:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, 0x20001
-; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; GFX7-ALIGNED-LABEL: private_store_2xi16_align4:
+; GFX7-ALIGNED: ; %bb.0:
+; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, 0x20001
+; GFX7-ALIGNED-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0)
+; GFX7-ALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-UNALIGNED-LABEL: private_store_2xi16_align4:
+; GFX7-UNALIGNED: ; %bb.0:
+; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x20001
+; GFX7-UNALIGNED-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0)
+; GFX7-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: private_store_2xi16_align4:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v0, 0x20001
+; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-FLATSCR-LABEL: private_store_2xi16_align4:
+; GFX9-FLATSCR: ; %bb.0:
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, 0x20001
+; GFX9-FLATSCR-NEXT: scratch_store_dword v1, v0, off
+; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
+; GFX9-FLATSCR-NEXT: s_setpc_b64 s[30:31]
 %gep.r = getelementptr i16, i16 addrspace(5)* %r, i64 1
 store i16 1, i16 addrspace(5)* %r,
align 4 store i16 2, i16 addrspace(5)* %gep.r, align 2 diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -0,0 +1,1295 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s + +define amdgpu_kernel void @zero_init_kernel() { +; GFX9-LABEL: zero_init_kernel: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:76 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:72 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:68 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:64 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:60 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:56 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:52 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:48 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:44 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:40 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:36 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:32 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:28 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:24 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:20 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:16 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: zero_init_kernel: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s0, s0, s3 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: scratch_store_dword off, v0, off offset:76 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:72 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:68 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:64 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:60 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:56 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:52 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:48 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:44 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:40 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:36 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:32 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:28 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:24 +; GFX10-NEXT: 
scratch_store_dword off, v0, off offset:20 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:16 +; GFX10-NEXT: s_endpgm + %alloca = alloca [32 x i16], align 2, addrspace(5) + %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* + call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) + ret void +} + +define void @zero_init_foo() { +; GFX9-LABEL: zero_init_foo: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:60 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:56 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:52 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:48 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:44 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:40 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:36 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:32 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:28 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:24 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:20 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:16 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:12 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:8 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:4 +; GFX9-NEXT: scratch_store_dword off, v0, s32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: zero_init_foo: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:60 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:56 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:52 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:48 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:44 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:40 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:36 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:32 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:28 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:24 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:20 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:16 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:12 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:8 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:4 +; GFX10-NEXT: scratch_store_dword off, v0, s32 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %alloca = alloca [32 x i16], align 2, addrspace(5) + %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* + call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) + ret void +} + +define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) { +; GFX9-LABEL: store_load_sindex_kernel: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 15 +; GFX9-NEXT: s_lshl_b32 s1, s0, 2 +; GFX9-NEXT: s_and_b32 s0, s0, 15 +; GFX9-NEXT: s_lshl_b32 s0, s0, 2 +; GFX9-NEXT: s_add_u32 s1, 4, s1 +; GFX9-NEXT: scratch_store_dword off, v0, s1 +; GFX9-NEXT: s_add_u32 s0, 4, s0 +; GFX9-NEXT: 
scratch_load_dword v0, off, s0 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: store_load_sindex_kernel: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_add_u32 s2, s2, s5 +; GFX10-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v0, 15 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_and_b32 s1, s0, 15 +; GFX10-NEXT: s_lshl_b32 s0, s0, 2 +; GFX10-NEXT: s_lshl_b32 s1, s1, 2 +; GFX10-NEXT: s_add_u32 s0, 4, s0 +; GFX10-NEXT: s_add_u32 s1, 4, s1 +; GFX10-NEXT: scratch_store_dword off, v0, s0 +; GFX10-NEXT: scratch_load_dword v0, off, s1 +; GFX10-NEXT: s_endpgm +bb: + %i = alloca [32 x float], align 4, addrspace(5) + %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* + %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx + %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* + store volatile i32 15, i32 addrspace(5)* %i8, align 4 + %i9 = and i32 %idx, 15 + %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 + %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* + %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + ret void +} + +define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) { +; GFX9-LABEL: store_load_sindex_foo: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 +; GFX9-NEXT: s_lshl_b32 s0, s2, 2 +; GFX9-NEXT: s_add_u32 s0, 4, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, 15 +; GFX9-NEXT: scratch_store_dword off, v0, s0 +; GFX9-NEXT: s_and_b32 s0, s2, 15 +; GFX9-NEXT: s_lshl_b32 s0, s0, 2 +; GFX9-NEXT: s_add_u32 s0, 4, s0 +; GFX9-NEXT: scratch_load_dword v0, off, s0 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: store_load_sindex_foo: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_add_u32 s0, s0, s3 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; GFX10-NEXT: s_and_b32 s0, s2, 15 +; GFX10-NEXT: v_mov_b32_e32 v0, 15 +; GFX10-NEXT: s_lshl_b32 s1, s2, 2 +; GFX10-NEXT: s_lshl_b32 s0, s0, 2 +; GFX10-NEXT: s_add_u32 s1, 4, s1 +; GFX10-NEXT: s_add_u32 s0, 4, s0 +; GFX10-NEXT: scratch_store_dword off, v0, s1 +; GFX10-NEXT: scratch_load_dword v0, off, s0 +; GFX10-NEXT: s_endpgm +bb: + %i = alloca [32 x float], align 4, addrspace(5) + %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* + %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx + %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* + store volatile i32 15, i32 addrspace(5)* %i8, align 4 + %i9 = and i32 %idx, 15 + %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 + %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* + %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + ret void +} + +define amdgpu_kernel void @store_load_vindex_kernel() { +; GFX9-LABEL: store_load_vindex_kernel: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 4 +; GFX9-NEXT: v_add_u32_e32 v2, v1, v0 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-NEXT: scratch_store_dword v2, v3, off +; GFX9-NEXT: v_sub_u32_e32 v0, v1, v0 +; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 +; GFX9-NEXT: 
s_endpgm +; +; GFX10-LABEL: store_load_vindex_kernel: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_add_u32 s0, s0, s3 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; GFX10-NEXT: v_mov_b32_e32 v1, 4 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v3, 15 +; GFX10-NEXT: v_add_nc_u32_e32 v2, v1, v0 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX10-NEXT: scratch_store_dword v2, v3, off +; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 +; GFX10-NEXT: s_endpgm +bb: + %i = alloca [32 x float], align 4, addrspace(5) + %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* + %i2 = tail call i32 @llvm.amdgcn.workitem.id.x() + %i3 = zext i32 %i2 to i64 + %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2 + %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* + store volatile i32 15, i32 addrspace(5)* %i8, align 4 + %i9 = sub nsw i32 31, %i2 + %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 + %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* + %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + ret void +} + +define void @store_load_vindex_foo(i32 %idx) { +; GFX9-LABEL: store_load_vindex_foo: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s32 +; GFX9-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1 +; GFX9-NEXT: v_and_b32_e32 v0, v0, v3 +; GFX9-NEXT: scratch_store_dword v2, v3, off +; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 +; GFX9-NEXT: scratch_load_dword v0, v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: store_load_vindex_foo: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v1, 15 +; GFX10-NEXT: v_mov_b32_e32 v2, s32 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_and_b32_e32 v3, v0, v1 +; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, v2 +; GFX10-NEXT: v_lshl_add_u32 v2, v3, 2, v2 +; GFX10-NEXT: scratch_store_dword v0, v1, off +; GFX10-NEXT: scratch_load_dword v0, v2, off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +bb: + %i = alloca [32 x float], align 4, addrspace(5) + %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* + %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx + %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* + store volatile i32 15, i32 addrspace(5)* %i8, align 4 + %i9 = and i32 %idx, 15 + %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 + %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* + %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + ret void +} + +define void @private_ptr_foo(float addrspace(5)* nocapture %arg) { +; GFX9-LABEL: private_ptr_foo: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, 0x41200000 +; GFX9-NEXT: scratch_store_dword v0, v1, off offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: private_ptr_foo: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x41200000 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; 
GFX10-NEXT: scratch_store_dword v0, v1, off offset:4 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr inbounds float, float addrspace(5)* %arg, i32 1 + store float 1.000000e+01, float addrspace(5)* %gep, align 4 + ret void +} + +define amdgpu_kernel void @zero_init_small_offset_kernel() { +; GFX9-LABEL: zero_init_small_offset_kernel: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:284 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:280 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:276 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:272 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:300 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:296 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:292 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:288 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:316 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:312 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:308 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:304 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:332 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:328 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:324 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:320 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: zero_init_small_offset_kernel: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s0, s0, s3 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: scratch_store_dword off, v0, off offset:284 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:280 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:276 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:272 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:300 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:296 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:292 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:288 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:316 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:312 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:308 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:304 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:332 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:328 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:324 +; GFX10-NEXT: scratch_store_dword off, v0, 
off offset:320 +; GFX10-NEXT: s_endpgm + %padding = alloca [64 x i32], align 4, addrspace(5) + %alloca = alloca [32 x i16], align 2, addrspace(5) + %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef + %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 + %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* + call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) + ret void +} + +define void @zero_init_small_offset_foo() { +; GFX9-LABEL: zero_init_small_offset_foo: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: scratch_load_dword v0, off, s32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:268 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:264 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:260 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:256 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:284 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:280 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:276 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:272 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:300 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:296 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:292 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:288 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:316 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:312 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:308 +; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:304 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: zero_init_small_offset_foo: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: scratch_load_dword v0, off, s32 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:268 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:264 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:260 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:256 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:284 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:280 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:276 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:272 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:300 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:296 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:292 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:288 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:316 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:312 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:308 +; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:304 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %padding = alloca [64 x i32], align 4, addrspace(5) + %alloca = alloca [32 x i16], align 2, addrspace(5) + %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef + %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 + %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* + call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 
0, i64 64, i1 false) + ret void +} + +define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) { +; GFX9-LABEL: store_load_sindex_small_offset_kernel: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 +; GFX9-NEXT: s_lshl_b32 s1, s0, 2 +; GFX9-NEXT: s_and_b32 s0, s0, 15 +; GFX9-NEXT: s_lshl_b32 s0, s0, 2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 15 +; GFX9-NEXT: s_add_u32 s1, 0x104, s1 +; GFX9-NEXT: scratch_store_dword off, v0, s1 +; GFX9-NEXT: s_add_u32 s0, 0x104, s0 +; GFX9-NEXT: scratch_load_dword v0, off, s0 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: store_load_sindex_small_offset_kernel: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_add_u32 s2, s2, s5 +; GFX10-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 15 +; GFX10-NEXT: s_and_b32 s1, s0, 15 +; GFX10-NEXT: s_lshl_b32 s0, s0, 2 +; GFX10-NEXT: s_lshl_b32 s1, s1, 2 +; GFX10-NEXT: s_add_u32 s0, 0x104, s0 +; GFX10-NEXT: s_add_u32 s1, 0x104, s1 +; GFX10-NEXT: scratch_store_dword off, v0, s0 +; GFX10-NEXT: scratch_load_dword v0, off, s1 +; GFX10-NEXT: s_endpgm +bb: + %padding = alloca [64 x i32], align 4, addrspace(5) + %i = alloca [32 x float], align 4, addrspace(5) + %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef + %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 + %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* + %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx + %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* + store volatile i32 15, i32 addrspace(5)* %i8, align 4 + %i9 = and i32 %idx, 15 + %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 + %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* + %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + ret void +} + +define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) { +; GFX9-LABEL: store_load_sindex_small_offset_foo: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: s_lshl_b32 s0, s2, 2 +; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 +; GFX9-NEXT: s_add_u32 s0, 0x104, s0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 15 +; GFX9-NEXT: scratch_store_dword off, v0, s0 +; GFX9-NEXT: s_and_b32 s0, s2, 15 +; GFX9-NEXT: s_lshl_b32 s0, s0, 2 +; GFX9-NEXT: s_add_u32 s0, 0x104, s0 +; GFX9-NEXT: scratch_load_dword v0, off, s0 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: store_load_sindex_small_offset_foo: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_add_u32 s0, s0, s3 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 +; GFX10-NEXT: s_and_b32 s0, s2, 15 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 15 +; 
GFX10-NEXT: s_lshl_b32 s1, s2, 2 +; GFX10-NEXT: s_lshl_b32 s0, s0, 2 +; GFX10-NEXT: s_add_u32 s1, 0x104, s1 +; GFX10-NEXT: s_add_u32 s0, 0x104, s0 +; GFX10-NEXT: scratch_store_dword off, v0, s1 +; GFX10-NEXT: scratch_load_dword v0, off, s0 +; GFX10-NEXT: s_endpgm +bb: + %padding = alloca [64 x i32], align 4, addrspace(5) + %i = alloca [32 x float], align 4, addrspace(5) + %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef + %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 + %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* + %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx + %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* + store volatile i32 15, i32 addrspace(5)* %i8, align 4 + %i9 = and i32 %idx, 15 + %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 + %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* + %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + ret void +} + +define amdgpu_kernel void @store_load_vindex_small_offset_kernel() { +; GFX9-LABEL: store_load_vindex_small_offset_kernel: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, 0x104 +; GFX9-NEXT: v_add_u32_e32 v2, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-NEXT: scratch_store_dword v2, v3, off +; GFX9-NEXT: v_sub_u32_e32 v0, v1, v0 +; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: store_load_vindex_small_offset_kernel: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_add_u32 s0, s0, s3 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x104 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v3, 15 +; GFX10-NEXT: v_add_nc_u32_e32 v2, v1, v0 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX10-NEXT: scratch_load_dword v1, off, off offset:4 +; GFX10-NEXT: scratch_store_dword v2, v3, off +; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 +; GFX10-NEXT: s_endpgm +bb: + %padding = alloca [64 x i32], align 4, addrspace(5) + %i = alloca [32 x float], align 4, addrspace(5) + %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef + %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 + %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* + %i2 = tail call i32 @llvm.amdgcn.workitem.id.x() + %i3 = zext i32 %i2 to i64 + %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2 + %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* + store volatile i32 15, i32 addrspace(5)* %i8, align 4 + %i9 = sub nsw i32 31, %i2 + %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 + %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* + %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + ret void +} + +define void @store_load_vindex_small_offset_foo(i32 %idx) { +; GFX9-LABEL: store_load_vindex_small_offset_foo: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: scratch_load_dword v1, off, s32 +; 
GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x100 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, vcc_hi +; GFX9-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1 +; GFX9-NEXT: v_and_b32_e32 v0, v0, v3 +; GFX9-NEXT: scratch_store_dword v2, v3, off +; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 +; GFX9-NEXT: scratch_load_dword v0, v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: store_load_vindex_small_offset_foo: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v1, 15 +; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x100 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_mov_b32_e32 v2, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v3, v0, v1 +; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, v2 +; GFX10-NEXT: v_lshl_add_u32 v2, v3, 2, v2 +; GFX10-NEXT: scratch_load_dword v3, off, s32 +; GFX10-NEXT: scratch_store_dword v0, v1, off +; GFX10-NEXT: scratch_load_dword v0, v2, off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +bb: + %padding = alloca [64 x i32], align 4, addrspace(5) + %i = alloca [32 x float], align 4, addrspace(5) + %pad_gep = getelementptr inbounds [64 x i32], [64 x i32] addrspace(5)* %padding, i32 0, i32 undef + %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 + %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* + %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx + %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* + store volatile i32 15, i32 addrspace(5)* %i8, align 4 + %i9 = and i32 %idx, 15 + %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 + %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* + %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + ret void +} + +define amdgpu_kernel void @zero_init_large_offset_kernel() { +; GFX9-LABEL: zero_init_large_offset_kernel: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:12 +; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:8 +; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:4 +; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi +; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:28 +; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:24 +; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:20 +; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:16 +; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:44 +; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:40 +; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:36 +; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:32 +; GFX9-NEXT: s_movk_i32 
vcc_hi, 0x4010 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:60 +; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:56 +; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:52 +; GFX9-NEXT: s_movk_i32 vcc_hi, 0x4010 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:48 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: zero_init_large_offset_kernel: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s0, s0, s3 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:12 +; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:8 +; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:4 +; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo +; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:28 +; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:24 +; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:20 +; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:16 +; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:44 +; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:40 +; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:36 +; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:32 +; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:60 +; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:56 +; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:52 +; GFX10-NEXT: s_movk_i32 vcc_lo, 0x4010 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:48 +; GFX10-NEXT: s_endpgm + %padding = alloca [4096 x i32], align 4, addrspace(5) + %alloca = alloca [32 x i16], align 2, addrspace(5) + %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef + %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 + %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* + call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) + ret void +} + +define void @zero_init_large_offset_foo() { +; GFX9-LABEL: zero_init_large_offset_foo: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: scratch_load_dword v0, off, s32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:12 +; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:8 +; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:4 +; GFX9-NEXT: s_add_u32 
vcc_hi, s32, 0x4000 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi +; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:28 +; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:24 +; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:20 +; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:16 +; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:44 +; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:40 +; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:36 +; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:32 +; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:60 +; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:56 +; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:52 +; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:48 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: zero_init_large_offset_foo: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: scratch_load_dword v0, off, s32 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:12 +; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:8 +; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:4 +; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo +; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:28 +; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:24 +; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:20 +; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:16 +; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:44 +; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:40 +; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:36 +; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:32 +; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:60 +; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:56 +; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:52 +; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: scratch_store_dword off, v0, vcc_lo offset:48 +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] + %padding = alloca [4096 x i32], align 4, addrspace(5) + %alloca = alloca [32 x i16], 
align 2, addrspace(5) + %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef + %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 + %cast = bitcast [32 x i16] addrspace(5)* %alloca to i8 addrspace(5)* + call void @llvm.memset.p5i8.i64(i8 addrspace(5)* align 2 dereferenceable(64) %cast, i8 0, i64 64, i1 false) + ret void +} + +define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) { +; GFX9-LABEL: store_load_sindex_large_offset_kernel: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 +; GFX9-NEXT: s_lshl_b32 s1, s0, 2 +; GFX9-NEXT: s_and_b32 s0, s0, 15 +; GFX9-NEXT: s_lshl_b32 s0, s0, 2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 15 +; GFX9-NEXT: s_add_u32 s1, 0x4004, s1 +; GFX9-NEXT: scratch_store_dword off, v0, s1 +; GFX9-NEXT: s_add_u32 s0, 0x4004, s0 +; GFX9-NEXT: scratch_load_dword v0, off, s0 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: store_load_sindex_large_offset_kernel: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_add_u32 s2, s2, s5 +; GFX10-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 15 +; GFX10-NEXT: s_and_b32 s1, s0, 15 +; GFX10-NEXT: s_lshl_b32 s0, s0, 2 +; GFX10-NEXT: s_lshl_b32 s1, s1, 2 +; GFX10-NEXT: s_add_u32 s0, 0x4004, s0 +; GFX10-NEXT: s_add_u32 s1, 0x4004, s1 +; GFX10-NEXT: scratch_store_dword off, v0, s0 +; GFX10-NEXT: scratch_load_dword v0, off, s1 +; GFX10-NEXT: s_endpgm +bb: + %padding = alloca [4096 x i32], align 4, addrspace(5) + %i = alloca [32 x float], align 4, addrspace(5) + %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef + %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 + %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* + %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx + %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* + store volatile i32 15, i32 addrspace(5)* %i8, align 4 + %i9 = and i32 %idx, 15 + %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 + %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* + %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + ret void +} + +define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) { +; GFX9-LABEL: store_load_sindex_large_offset_foo: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: s_lshl_b32 s0, s2, 2 +; GFX9-NEXT: scratch_load_dword v0, off, vcc_hi offset:4 +; GFX9-NEXT: s_add_u32 s0, 0x4004, s0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, 15 +; GFX9-NEXT: scratch_store_dword off, v0, s0 +; GFX9-NEXT: s_and_b32 s0, s2, 15 +; GFX9-NEXT: s_lshl_b32 s0, s0, 2 +; GFX9-NEXT: s_add_u32 s0, 0x4004, s0 +; GFX9-NEXT: scratch_load_dword v0, off, s0 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: store_load_sindex_large_offset_foo: +; GFX10: ; 
%bb.0: ; %bb +; GFX10-NEXT: s_add_u32 s0, s0, s3 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 +; GFX10-NEXT: s_and_b32 s0, s2, 15 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, 15 +; GFX10-NEXT: s_lshl_b32 s1, s2, 2 +; GFX10-NEXT: s_lshl_b32 s0, s0, 2 +; GFX10-NEXT: s_add_u32 s1, 0x4004, s1 +; GFX10-NEXT: s_add_u32 s0, 0x4004, s0 +; GFX10-NEXT: scratch_store_dword off, v0, s1 +; GFX10-NEXT: scratch_load_dword v0, off, s0 +; GFX10-NEXT: s_endpgm +bb: + %padding = alloca [4096 x i32], align 4, addrspace(5) + %i = alloca [32 x float], align 4, addrspace(5) + %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef + %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 + %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* + %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx + %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* + store volatile i32 15, i32 addrspace(5)* %i8, align 4 + %i9 = and i32 %idx, 15 + %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 + %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* + %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + ret void +} + +define amdgpu_kernel void @store_load_vindex_large_offset_kernel() { +; GFX9-LABEL: store_load_vindex_large_offset_kernel: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_load_dword v1, off, vcc_hi offset:4 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, 0x4004 +; GFX9-NEXT: v_add_u32_e32 v2, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-NEXT: scratch_store_dword v2, v3, off +; GFX9-NEXT: v_sub_u32_e32 v0, v1, v0 +; GFX9-NEXT: scratch_load_dword v0, v0, off offset:124 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: store_load_vindex_large_offset_kernel: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_add_u32 s0, s0, s3 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; GFX10-NEXT: v_mov_b32_e32 v1, 0x4004 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_mov_b32_e32 v3, 15 +; GFX10-NEXT: v_add_nc_u32_e32 v2, v1, v0 +; GFX10-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX10-NEXT: scratch_load_dword v1, off, off offset:4 +; GFX10-NEXT: scratch_store_dword v2, v3, off +; GFX10-NEXT: scratch_load_dword v0, v0, off offset:124 +; GFX10-NEXT: s_endpgm +bb: + %padding = alloca [4096 x i32], align 4, addrspace(5) + %i = alloca [32 x float], align 4, addrspace(5) + %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef + %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 + %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* + %i2 = tail call i32 @llvm.amdgcn.workitem.id.x() + %i3 = zext i32 %i2 to i64 + %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i2 + %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* + store volatile i32 15, i32 addrspace(5)* %i8, align 4 + %i9 = sub nsw i32 31, %i2 + %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 + 
%i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* + %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + ret void +} + +define void @store_load_vindex_large_offset_foo(i32 %idx) { +; GFX9-LABEL: store_load_vindex_large_offset_foo: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: scratch_load_dword v1, off, s32 +; GFX9-NEXT: s_add_u32 vcc_hi, s32, 0x4000 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, vcc_hi +; GFX9-NEXT: v_mov_b32_e32 v3, 15 +; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1 +; GFX9-NEXT: v_and_b32_e32 v0, v0, v3 +; GFX9-NEXT: scratch_store_dword v2, v3, off +; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 +; GFX9-NEXT: scratch_load_dword v0, v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: store_load_vindex_large_offset_foo: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v1, 15 +; GFX10-NEXT: s_add_u32 vcc_lo, s32, 0x4000 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_mov_b32_e32 v2, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v3, v0, v1 +; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, v2 +; GFX10-NEXT: v_lshl_add_u32 v2, v3, 2, v2 +; GFX10-NEXT: scratch_load_dword v3, off, s32 +; GFX10-NEXT: scratch_store_dword v0, v1, off +; GFX10-NEXT: scratch_load_dword v0, v2, off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +bb: + %padding = alloca [4096 x i32], align 4, addrspace(5) + %i = alloca [32 x float], align 4, addrspace(5) + %pad_gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %padding, i32 0, i32 undef + %pad_load = load volatile i32, i32 addrspace(5)* %pad_gep, align 4 + %i1 = bitcast [32 x float] addrspace(5)* %i to i8 addrspace(5)* + %i7 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %idx + %i8 = bitcast float addrspace(5)* %i7 to i32 addrspace(5)* + store volatile i32 15, i32 addrspace(5)* %i8, align 4 + %i9 = and i32 %idx, 15 + %i10 = getelementptr inbounds [32 x float], [32 x float] addrspace(5)* %i, i32 0, i32 %i9 + %i11 = bitcast float addrspace(5)* %i10 to i32 addrspace(5)* + %i12 = load volatile i32, i32 addrspace(5)* %i11, align 4 + ret void +} + +define amdgpu_kernel void @store_load_large_imm_offset_kernel() { +; GFX9-LABEL: store_load_large_imm_offset_kernel: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0 +; GFX9-NEXT: s_movk_i32 s0, 0x3000 +; GFX9-NEXT: v_mov_b32_e32 v0, 13 +; GFX9-NEXT: s_mov_b32 vcc_hi, 0 +; GFX9-NEXT: scratch_store_dword off, v0, vcc_hi offset:4 +; GFX9-NEXT: s_add_u32 s0, 4, s0 +; GFX9-NEXT: v_mov_b32_e32 v0, 15 +; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:3712 +; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:3712 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: store_load_large_imm_offset_kernel: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_add_u32 s0, s0, s3 +; GFX10-NEXT: s_addc_u32 s1, s1, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 +; GFX10-NEXT: v_mov_b32_e32 v0, 13 +; GFX10-NEXT: v_mov_b32_e32 v1, 15 +; GFX10-NEXT: s_movk_i32 s0, 0x3800 +; GFX10-NEXT: s_add_u32 s0, 4, s0 +; GFX10-NEXT: scratch_store_dword off, v0, off offset:4 +; GFX10-NEXT: scratch_store_dword off, v1, s0 offset:1664 +; GFX10-NEXT: scratch_load_dword v0, off, s0 offset:1664 +; GFX10-NEXT: 
s_endpgm +bb: + %i = alloca [4096 x i32], align 4, addrspace(5) + %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef + store volatile i32 13, i32 addrspace(5)* %i1, align 4 + %i7 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000 + store volatile i32 15, i32 addrspace(5)* %i7, align 4 + %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000 + %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4 + ret void +} + +define void @store_load_large_imm_offset_foo() { +; GFX9-LABEL: store_load_large_imm_offset_foo: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_movk_i32 s4, 0x3000 +; GFX9-NEXT: v_mov_b32_e32 v0, 13 +; GFX9-NEXT: scratch_store_dword off, v0, s32 +; GFX9-NEXT: s_add_u32 s4, s32, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, 15 +; GFX9-NEXT: scratch_store_dword off, v0, s4 offset:3712 +; GFX9-NEXT: scratch_load_dword v0, off, s4 offset:3712 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: store_load_large_imm_offset_foo: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v0, 13 +; GFX10-NEXT: v_mov_b32_e32 v1, 15 +; GFX10-NEXT: s_movk_i32 s4, 0x3800 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_add_u32 s4, s32, s4 +; GFX10-NEXT: scratch_store_dword off, v0, s32 +; GFX10-NEXT: scratch_store_dword off, v1, s4 offset:1664 +; GFX10-NEXT: scratch_load_dword v0, off, s4 offset:1664 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +bb: + %i = alloca [4096 x i32], align 4, addrspace(5) + %i1 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 undef + store volatile i32 13, i32 addrspace(5)* %i1, align 4 + %i7 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000 + store volatile i32 15, i32 addrspace(5)* %i7, align 4 + %i10 = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(5)* %i, i32 0, i32 4000 + %i12 = load volatile i32, i32 addrspace(5)* %i10, align 4 + ret void +} + +define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) { +; GFX9-LABEL: store_load_vidx_sidx_offset: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, 4 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0 +; GFX9-NEXT: v_add_u32_e32 v0, s0, v0 +; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, 15 +; GFX9-NEXT: scratch_store_dword v0, v1, off offset:1024 +; GFX9-NEXT: scratch_load_dword v0, v0, off offset:1024 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: store_load_vidx_sidx_offset: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_add_u32 s2, s2, s5 +; GFX10-NEXT: s_addc_u32 s3, s3, 0 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v1, 15 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0 +; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, 4 +; GFX10-NEXT: scratch_store_dword v0, v1, off offset:1024 +; GFX10-NEXT: scratch_load_dword v0, v0, off offset:1024 +; GFX10-NEXT: s_endpgm +bb: + %alloca = alloca [32 x i32], align 4, addrspace(5) + %vidx = tail call i32 @llvm.amdgcn.workitem.id.x() + 
%add1 = add nsw i32 %sidx, %vidx + %add2 = add nsw i32 %add1, 256 + %gep = getelementptr inbounds [32 x i32], [32 x i32] addrspace(5)* %alloca, i32 0, i32 %add2 + store volatile i32 15, i32 addrspace(5)* %gep, align 4 + %load = load volatile i32, i32 addrspace(5)* %gep, align 4 + ret void +} + +; FIXME: Multi-DWORD scratch should be supported +define void @store_load_i64_aligned(i64 addrspace(5)* nocapture %arg) { +; GFX9-LABEL: store_load_i64_aligned: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: scratch_store_dword v0, v1, off offset:4 +; GFX9-NEXT: v_mov_b32_e32 v1, 15 +; GFX9-NEXT: scratch_store_dword v0, v1, off +; GFX9-NEXT: scratch_load_dword v1, v0, off offset:4 +; GFX9-NEXT: scratch_load_dword v0, v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: store_load_i64_aligned: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: scratch_store_dword v0, v1, off offset:4 +; GFX10-NEXT: scratch_store_dword v0, v2, off +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: scratch_load_dword v1, v0, off offset:4 +; GFX10-NEXT: scratch_load_dword v0, v0, off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +bb: + store volatile i64 15, i64 addrspace(5)* %arg, align 8 + %load = load volatile i64, i64 addrspace(5)* %arg, align 8 + ret void +} + +; FIXME: Multi-DWORD unaligned scratch should be supported +define void @store_load_i64_unaligned(i64 addrspace(5)* nocapture %arg) { +; GFX9-LABEL: store_load_i64_unaligned: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: scratch_store_byte v0, v1, off offset:7 +; GFX9-NEXT: scratch_store_byte v0, v1, off offset:6 +; GFX9-NEXT: scratch_store_byte v0, v1, off offset:5 +; GFX9-NEXT: scratch_store_byte v0, v1, off offset:4 +; GFX9-NEXT: scratch_store_byte v0, v1, off offset:3 +; GFX9-NEXT: scratch_store_byte v0, v1, off offset:2 +; GFX9-NEXT: scratch_store_byte v0, v1, off offset:1 +; GFX9-NEXT: v_mov_b32_e32 v1, 15 +; GFX9-NEXT: scratch_store_byte v0, v1, off +; GFX9-NEXT: scratch_load_ubyte v1, v0, off offset:6 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: scratch_load_ubyte v1, v0, off offset:7 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: scratch_load_ubyte v1, v0, off offset:4 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: scratch_load_ubyte v1, v0, off offset:5 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: scratch_load_ubyte v1, v0, off offset:2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: scratch_load_ubyte v1, v0, off offset:3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: scratch_load_ubyte v1, v0, off +; GFX9-NEXT: scratch_load_ubyte v0, v0, off offset:1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: store_load_i64_unaligned: +; GFX10: ; %bb.0: ; %bb +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 15 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: scratch_store_byte v0, v1, off offset:7 +; GFX10-NEXT: scratch_store_byte v0, v1, off offset:6 +; GFX10-NEXT: scratch_store_byte v0, v1, off offset:5 +; GFX10-NEXT: scratch_store_byte v0, v1, off 
offset:4 +; GFX10-NEXT: scratch_store_byte v0, v1, off offset:3 +; GFX10-NEXT: scratch_store_byte v0, v1, off offset:2 +; GFX10-NEXT: scratch_store_byte v0, v1, off offset:1 +; GFX10-NEXT: scratch_store_byte v0, v2, off +; GFX10-NEXT: scratch_load_ubyte v1, v0, off offset:6 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: scratch_load_ubyte v1, v0, off offset:7 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: scratch_load_ubyte v1, v0, off offset:4 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: scratch_load_ubyte v1, v0, off offset:5 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: scratch_load_ubyte v1, v0, off offset:2 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: scratch_load_ubyte v1, v0, off offset:3 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: scratch_load_ubyte v1, v0, off +; GFX10-NEXT: scratch_load_ubyte v0, v0, off offset:1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +bb: + store volatile i64 15, i64 addrspace(5)* %arg, align 1 + %load = load volatile i64, i64 addrspace(5)* %arg, align 1 + ret void +} + +declare void @llvm.memset.p5i8.i64(i8 addrspace(5)* nocapture writeonly, i8, i64, i1 immarg) +declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll --- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll +++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll @@ -1,5 +1,6 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI,MUBUF %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-MUBUF,MUBUF %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-sroa=0 -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-FLATSCR %s ; Test that non-entry function frame indices are expanded properly to ; give an index relative to the scratch wave offset register @@ -9,9 +10,13 @@ ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CI-NEXT: v_lshr_b32_e64 v0, s32, 6 -; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s32 +; GFX9-MUBUF-NEXT: v_lshrrev_b32_e64 v0, 6, s32 + +; GFX9-FLATSCR: v_mov_b32_e32 v0, s32 +; GFX9-FLATSCR-NOT: v_lshrrev_b32_e64 + +; MUBUF-NOT: v_mov -; GCN-NOT: v_mov ; GCN: ds_write_b32 v0, v0 define void @func_mov_fi_i32() #0 { %alloca = alloca i32, addrspace(5) @@ -30,11 +35,14 @@ ; CI-NEXT: v_add_i32_e{{32|64}} v0, {{s\[[0-9]+:[0-9]+\]|vcc}}, 4, [[SCALED]] ; CI-NEXT: ds_write_b32 v0, v0 -; GFX9: v_lshrrev_b32_e64 v0, 6, s32 -; GFX9-NEXT: ds_write_b32 v0, v0 -; GFX9-NEXT: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s32 -; GFX9-NEXT: v_add_u32_e32 v0, 4, [[SCALED]] -; GFX9-NEXT: ds_write_b32 v0, v0 +; GFX9-MUBUF-NEXT: v_lshrrev_b32_e64 v0, 6, s32 +; GFX9-FLATSCR: v_mov_b32_e32 v0, s32 +; GFX9-FLATSCR: s_add_u32 [[ADD:[^,]+]], s32, 4 +; GFX9-NEXT: ds_write_b32 v0, v0 +; GFX9-MUBUF-NEXT: v_lshrrev_b32_e64 
[[SCALED:v[0-9]+]], 6, s32 +; GFX9-MUBUF-NEXT: v_add_u32_e32 v0, 4, [[SCALED]] +; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, [[ADD]] +; GFX9-NEXT: ds_write_b32 v0, v0 define void @func_mov_fi_i32_offset() #0 { %alloca0 = alloca i32, addrspace(5) %alloca1 = alloca i32, addrspace(5) @@ -52,8 +60,11 @@ ; CI: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s32, 6 ; CI-NEXT: v_add_i32_e32 v0, vcc, 4, [[SCALED]] -; GFX9: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s32 -; GFX9-NEXT: v_add_u32_e32 v0, 4, [[SCALED]] +; GFX9-MUBUF: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s32 +; GFX9-MUBUF-NEXT: v_add_u32_e32 v0, 4, [[SCALED]] + +; GFX9-FLATSCR: v_mov_b32_e32 [[ADD:v[0-9]+]], s32 +; GFX9-FLATSCR-NEXT: v_add_u32_e32 v0, 4, [[ADD]] ; GCN-NOT: v_mov ; GCN: ds_write_b32 v0, v0 @@ -71,7 +82,8 @@ ; CI: v_lshr_b32_e64 v0, s32, 6 -; GFX9: v_lshrrev_b32_e64 v0, 6, s32 +; GFX9-MUBUF: v_lshrrev_b32_e64 v0, 6, s32 +; GFX9-FLATSCR: v_mov_b32_e32 v0, s32 ; GCN-NEXT: v_mul_u32_u24_e32 v0, 9, v0 ; GCN-NOT: v_mov @@ -86,7 +98,8 @@ ; GCN-LABEL: {{^}}func_store_private_arg_i32_ptr: ; GCN: v_mov_b32_e32 v1, 15{{$}} -; GCN: buffer_store_dword v1, v0, s[0:3], 0 offen{{$}} +; MUBUF: buffer_store_dword v1, v0, s[0:3], 0 offen{{$}} +; GFX9-FLATSCR: scratch_store_dword v0, v1, off{{$}} define void @func_store_private_arg_i32_ptr(i32 addrspace(5)* %ptr) #0 { store volatile i32 15, i32 addrspace(5)* %ptr ret void @@ -94,7 +107,8 @@ ; GCN-LABEL: {{^}}func_load_private_arg_i32_ptr: ; GCN: s_waitcnt -; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen{{$}} +; MUBUF-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen{{$}} +; GFX9-FLATSCR-NEXT: scratch_load_dword v0, v0, off{{$}} define void @func_load_private_arg_i32_ptr(i32 addrspace(5)* %ptr) #0 { %val = load volatile i32, i32 addrspace(5)* %ptr ret void @@ -106,8 +120,11 @@ ; CI: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], s32, 6 ; CI-NEXT: v_or_b32_e32 v0, 4, [[SHIFT]] -; GFX9: v_lshrrev_b32_e64 [[SHIFT:v[0-9]+]], 6, s32 -; GFX9-NEXT: v_or_b32_e32 v0, 4, [[SHIFT]] +; GFX9-MUBUF: v_lshrrev_b32_e64 [[SHIFT:v[0-9]+]], 6, s32 +; GFX9-MUBUF-NEXT: v_or_b32_e32 v0, 4, [[SHIFT]] + +; GFX9-FLATSCR: v_mov_b32_e32 [[SP:v[0-9]+]], s32 +; GFX9-FLATSCR-NEXT: v_or_b32_e32 v0, 4, [[SP]] ; GCN-NOT: v_mov ; GCN: ds_write_b32 v0, v0 @@ -121,8 +138,10 @@ ; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr_value: ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_ubyte v0, off, s[0:3], s32 -; GCN_NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 +; MUBUF-NEXT: buffer_load_ubyte v0, off, s[0:3], s32 +; MUBUF-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 +; GFX9-FLATSCR-NEXT: scratch_load_ubyte v0, off, s32 +; GFX9-FLATSCR-NEXT: scratch_load_dword v1, off, s32 offset:4 define void @void_func_byval_struct_i8_i32_ptr_value({ i8, i32 } addrspace(5)* byval %arg0) #0 { %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %arg0, i32 0, i32 0 %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 } addrspace(5)* %arg0, i32 0, i32 1 @@ -137,15 +156,17 @@ ; CI: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], s32, 6 -; GFX9: v_lshrrev_b32_e64 [[SHIFT:v[0-9]+]], 6, s32 +; GFX9-MUBUF: v_lshrrev_b32_e64 [[SP:v[0-9]+]], 6, s32 +; GFX9-FLATSCR: v_mov_b32_e32 [[SP:v[0-9]+]], s32 ; GCN: s_and_saveexec_b64 ; CI: v_add_i32_e32 [[GEP:v[0-9]+]], vcc, 4, [[SHIFT]] ; CI: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4{{$}} -; GFX9: v_add_u32_e32 [[GEP:v[0-9]+]], 4, [[SHIFT]] -; GFX9: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4{{$}} +; GFX9: v_add_u32_e32 [[GEP:v[0-9]+]], 4, [[SP]] +; GFX9-MUBUF: 
buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4{{$}} +; GFX9-FLATSCR: scratch_load_dword v{{[0-9]+}}, [[SP]], off offset:4{{$}} ; GCN: ds_write_b32 v{{[0-9]+}}, [[GEP]] define void @void_func_byval_struct_i8_i32_ptr_nonentry_block({ i8, i32 } addrspace(5)* byval %arg0, i32 %arg2) #0 { @@ -170,8 +191,11 @@ ; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s32, 6 ; CI: v_add_i32_e32 [[VZ:v[0-9]+]], vcc, [[K]], [[SCALED]] -; GFX9-DAG: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s32 -; GFX9: v_add_u32_e32 [[VZ:v[0-9]+]], 0x200, [[SCALED]] +; GFX9-MUBUF-DAG: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s32 +; GFX9-MUBUF: v_add_u32_e32 [[VZ:v[0-9]+]], 0x200, [[SCALED]] + +; GFX9-FLATSCR-DAG: s_add_u32 [[SZ:[^,]+]], s32, 0x200 +; GFX9-FLATSCR: v_mov_b32_e32 [[VZ:v[0-9]+]], [[SZ]] ; GCN: v_mul_u32_u24_e32 [[VZ]], 9, [[VZ]] ; GCN: ds_write_b32 v0, [[VZ]] @@ -193,8 +217,11 @@ ; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s32, 6 ; CI: v_add_i32_e64 [[VZ:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, [[OFFSET]], [[SCALED]] -; GFX9-DAG: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s32 -; GFX9: v_add_u32_e32 [[VZ:v[0-9]+]], 0x200, [[SCALED]] +; GFX9-MUBUF-DAG: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s32 +; GFX9-MUBUF: v_add_u32_e32 [[VZ:v[0-9]+]], 0x200, [[SCALED]] + +; GFX9-FLATSCR-DAG: s_add_u32 [[SZ:[^,]+]], s32, 0x200 +; GFX9-FLATSCR: v_mov_b32_e32 [[VZ:v[0-9]+]], [[SZ]] ; GCN: v_mul_u32_u24_e32 [[VZ]], 9, [[VZ]] ; GCN: ds_write_b32 v0, [[VZ]] @@ -219,10 +246,14 @@ ; GCN-LABEL: {{^}}undefined_stack_store_reg: ; GCN: s_and_saveexec_b64 -; GCN: buffer_store_dword v0, off, s[0:3], s33 offset: -; GCN: buffer_store_dword v0, off, s[0:3], s33 offset: -; GCN: buffer_store_dword v0, off, s[0:3], s33 offset: -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset: +; MUBUF: buffer_store_dword v0, off, s[0:3], s33 offset: +; MUBUF: buffer_store_dword v0, off, s[0:3], s33 offset: +; MUBUF: buffer_store_dword v0, off, s[0:3], s33 offset: +; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset: +; FLATSCR: scratch_store_dword v0, off, s33 offset: +; FLATSCR: scratch_store_dword v0, off, s33 offset: +; FLATSCR: scratch_store_dword v0, off, s33 offset: +; FLATSCR: scratch_store_dword v{{[0-9]+}}, off, s33 offset: define void @undefined_stack_store_reg(float %arg, i32 %arg1) #0 { bb: %tmp = alloca <4 x float>, align 16, addrspace(5) @@ -243,13 +274,17 @@ ; GCN-LABEL: {{^}}alloca_ptr_nonentry_block: ; GCN: s_and_saveexec_b64 -; GCN: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4 +; MUBUF: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4 +; FLATSCR: scratch_load_dword v{{[0-9]+}}, off, s32 offset:4 ; CI: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], s32, 6 ; CI-NEXT: v_or_b32_e32 [[PTR:v[0-9]+]], 4, [[SHIFT]] -; GFX9: v_lshrrev_b32_e64 [[SHIFT:v[0-9]+]], 6, s32 -; GFX9-NEXT: v_or_b32_e32 [[PTR:v[0-9]+]], 4, [[SHIFT]] +; GFX9-MUBUF: v_lshrrev_b32_e64 [[SHIFT:v[0-9]+]], 6, s32 +; GFX9-MUBUF-NEXT: v_or_b32_e32 [[PTR:v[0-9]+]], 4, [[SHIFT]] + +; GFX9-FLATSCR: v_mov_b32_e32 [[SP:v[0-9]+]], s32 +; GFX9-FLATSCR-NEXT: v_or_b32_e32 [[PTR:v[0-9]+]], 4, [[SP]] ; GCN: ds_write_b32 v{{[0-9]+}}, [[PTR]] define void @alloca_ptr_nonentry_block(i32 %arg0) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/load-hi16.ll b/llvm/test/CodeGen/AMDGPU/load-hi16.ll --- a/llvm/test/CodeGen/AMDGPU/load-hi16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-hi16.ll @@ -1,6 +1,7 @@ -; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900 %s +; RUN: llc -march=amdgcn 
-mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900,GFX900-MUBUF %s ; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX906,NO-D16-HI %s ; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX803,NO-D16-HI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900,GFX900-FLATSCR %s ; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_multi_use_lo: ; GFX900: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -493,7 +494,8 @@ ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg: ; GCN: s_waitcnt -; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:4094{{$}} +; GFX900-MUBUF: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:4094{{$}} +; GFX900-FLATSCR: scratch_load_short_d16_hi v0, off, s32 offset:4094{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX900-NEXT: s_waitcnt @@ -512,7 +514,8 @@ ; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg: ; GCN: s_waitcnt -; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:4094{{$}} +; GFX900-MUBUF: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:4094{{$}} +; GFX900-FLATSCR: scratch_load_short_d16_hi v0, off, s32 offset:4094{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX900-NEXT: s_waitcnt @@ -531,7 +534,9 @@ ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff: ; GCN: s_waitcnt -; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], 0 offset:4094{{$}} +; GFX900-MUBUF: buffer_load_short_d16_hi v0, off, s[0:3], 0 offset:4094{{$}} +; GFX900-FLATSCR: s_movk_i32 [[SOFF:[^,]+]], 0xffe +; GFX900-FLATSCR: scratch_load_short_d16_hi v0, off, [[SOFF]]{{$}} ; GFX900: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX900-NEXT: s_waitcnt @@ -549,7 +554,9 @@ ; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:4094{{$}} +; GFX900-MUBUF-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:4094{{$}} +; GFX900-FLATSCR-NEXT: s_movk_i32 [[SOFF:[^,]+]], 0xffe +; GFX900-FLATSCR-NEXT: scratch_load_short_d16_hi v1, off, [[SOFF]]{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 ; GFX900-NEXT: s_waitcnt @@ -567,7 +574,8 @@ ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_zexti8: ; GCN: s_waitcnt -; GFX900: buffer_load_ubyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}} +; GFX900-MUBUF: buffer_load_ubyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}} +; GFX900-FLATSCR: scratch_load_ubyte_d16_hi v0, off, s32 offset:4095{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX900-NEXT: s_waitcnt @@ -587,7 +595,8 @@ ; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_zexti8: ; GCN: s_waitcnt -; GFX900: buffer_load_ubyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}} +; GFX900-MUBUF: buffer_load_ubyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}} +; GFX900-FLATSCR: scratch_load_ubyte_d16_hi v0, off, s32 offset:4095{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX900-NEXT: s_waitcnt @@ -608,7 +617,8 @@ ; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_sexti8: ; GCN: s_waitcnt -; GFX900: 
buffer_load_sbyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}} +; GFX900-MUBUF: buffer_load_sbyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}} +; GFX900-FLATSCR: scratch_load_sbyte_d16_hi v0, off, s32 offset:4095{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX900-NEXT: s_waitcnt @@ -629,7 +639,8 @@ ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_sexti8: ; GCN: s_waitcnt -; GFX900: buffer_load_sbyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}} +; GFX900-MUBUF: buffer_load_sbyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}} +; GFX900-FLATSCR: scratch_load_sbyte_d16_hi v0, off, s32 offset:4095{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX900-NEXT: s_waitcnt @@ -649,7 +660,9 @@ ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_zexti8: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}} +; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}} +; GFX900-FLATSCR-NEXT: s_movk_i32 [[SOFF:[^,]+]], 0xffe +; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16_hi v1, off, [[SOFF]]{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 ; GFX900-NEXT: s_waitcnt @@ -668,7 +681,9 @@ ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_sexti8: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_load_sbyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}} +; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}} +; GFX900-FLATSCR-NEXT: s_movk_i32 [[SOFF:[^,]+]], 0xffe +; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16_hi v1, off, [[SOFF]]{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 ; GFX900-NEXT: s_waitcnt @@ -687,7 +702,9 @@ ; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff_zexti8: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}} +; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}} +; GFX900-FLATSCR-NEXT: s_movk_i32 [[SOFF:[^,]+]], 0xffe +; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16_hi v1, off, [[SOFF]]{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 ; GFX900-NEXT: s_waitcnt @@ -787,8 +804,10 @@ ; to offset variant. 
; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_to_offset: -; GFX900: buffer_store_dword -; GFX900-NEXT: buffer_load_short_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4094 +; GFX900-MUBUF: buffer_store_dword +; GFX900-MUBUF-NEXT: buffer_load_short_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4094 +; GFX900-FLATSCR: scratch_store_dword +; GFX900-FLATSCR-NEXT: scratch_load_short_d16_hi v{{[0-9]+}}, off, s32 offset:4094 define void @load_private_hi_v2i16_reglo_vreg_to_offset(i16 %reg) #0 { entry: %obj0 = alloca [10 x i32], align 4, addrspace(5) @@ -804,8 +823,10 @@ } ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_sexti8_to_offset: -; GFX900: buffer_store_dword -; GFX900-NEXT: buffer_load_sbyte_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4095 +; GFX900-MUBUF: buffer_store_dword +; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4095 +; GFX900-FLATSCR: scratch_store_dword +; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16_hi v{{[0-9]+}}, off, s32 offset:4095 define void @load_private_hi_v2i16_reglo_vreg_sexti8_to_offset(i16 %reg) #0 { entry: %obj0 = alloca [10 x i32], align 4, addrspace(5) @@ -822,8 +843,10 @@ } ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_zexti8_to_offset: -; GFX900: buffer_store_dword -; GFX900-NEXT: buffer_load_ubyte_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4095 +; GFX900-MUBUF: buffer_store_dword +; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4095 +; GFX900-FLATSCR: scratch_store_dword +; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16_hi v{{[0-9]+}}, off, s32 offset:4095 define void @load_private_hi_v2i16_reglo_vreg_zexti8_to_offset(i16 %reg) #0 { entry: %obj0 = alloca [10 x i32], align 4, addrspace(5) @@ -974,9 +997,11 @@ ; FIXME: Is there a cost to using the extload over not? 
; GCN-LABEL: {{^}}load_private_v2i16_split: ; GCN: s_waitcnt -; GFX900: buffer_load_ushort v0, off, s[0:3], s32{{$}} +; GFX900-MUBUF: buffer_load_ushort v0, off, s[0:3], s32{{$}} +; GFX900-FLATSCR: scratch_load_ushort v0, off, s32{{$}} ; GFX900-NEXT: s_waitcnt -; GFX900-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:2 +; GFX900-MUBUF-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:2 +; GFX900-FLATSCR-NEXT: scratch_load_short_d16_hi v0, off, s32 offset:2 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 define <2 x i16> @load_private_v2i16_split(i16 addrspace(5)* byval %in) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/load-lo16.ll b/llvm/test/CodeGen/AMDGPU/load-lo16.ll --- a/llvm/test/CodeGen/AMDGPU/load-lo16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-lo16.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900,GFX900-MUBUF %s ; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca,+sram-ecc -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX906,NO-D16-HI %s ; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX803,NO-D16-HI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs --amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,GFX900,GFX900-FLATSCR %s define <2 x i16> @load_local_lo_v2i16_undeflo(i16 addrspace(3)* %in) #0 { ; GFX900-LABEL: load_local_lo_v2i16_undeflo: @@ -1177,14 +1178,14 @@ } define void @load_private_lo_v2i16_reglo_vreg(i16 addrspace(5)* byval %in, i32 %reg) #0 { -; GFX900-LABEL: load_private_lo_v2i16_reglo_vreg: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg: ; GFX906: ; %bb.0: ; %entry @@ -1207,6 +1208,15 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v0, off, s32 offset:4094 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x i16> %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2047 @@ -1217,16 +1227,16 @@ } define void @load_private_lo_v2i16_reghi_vreg(i16 addrspace(5)* 
byval %in, i16 %reg) #0 { -; GFX900-LABEL: load_private_lo_v2i16_reghi_vreg: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX900-NEXT: v_lshl_or_b32 v0, v0, 16, v1 -; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reghi_vreg: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX900-MUBUF-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2i16_reghi_vreg: ; GFX906: ; %bb.0: ; %entry @@ -1249,6 +1259,17 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reghi_vreg: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: scratch_load_ushort v1, off, s32 offset:4094 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX900-FLATSCR-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2047 %load = load i16, i16 addrspace(5)* %gep @@ -1259,14 +1280,14 @@ } define void @load_private_lo_v2f16_reglo_vreg(half addrspace(5)* byval %in, i32 %reg) #0 { -; GFX900-LABEL: load_private_lo_v2f16_reglo_vreg: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_private_lo_v2f16_reglo_vreg: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2f16_reglo_vreg: ; GFX906: ; %bb.0: ; %entry @@ -1290,6 +1311,15 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_private_lo_v2f16_reglo_vreg: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v0, off, s32 offset:4094 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x half> %gep = getelementptr inbounds half, half addrspace(5)* %in, i64 2047 @@ -1300,14 +1330,14 @@ } define void 
@load_private_lo_v2i16_reglo_vreg_nooff(i16 addrspace(5)* %in, i32 %reg) #0 { -; GFX900-LABEL: load_private_lo_v2i16_reglo_vreg_nooff: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: global_store_dword v[0:1], v1, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_nooff: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v1, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_nooff: ; GFX906: ; %bb.0: ; %entry @@ -1330,6 +1360,16 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_nooff: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: s_movk_i32 s4, 0xffe +; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s4 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x i16> %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 4094 to i16 addrspace(5)*) @@ -1339,14 +1379,14 @@ } define void @load_private_lo_v2i16_reghi_vreg_nooff(i16 addrspace(5)* %in, i32 %reg) #0 { -; GFX900-LABEL: load_private_lo_v2i16_reghi_vreg_nooff: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: global_store_dword v[0:1], v1, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reghi_vreg_nooff: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v1, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2i16_reghi_vreg_nooff: ; GFX906: ; %bb.0: ; %entry @@ -1369,6 +1409,16 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reghi_vreg_nooff: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: s_movk_i32 s4, 0xffe +; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s4 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x i16> %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 4094 to i16 addrspace(5)*) @@ -1378,14 +1428,14 @@ } define void @load_private_lo_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, i32 %reg) #0 { -; GFX900-LABEL: 
load_private_lo_v2f16_reglo_vreg_nooff: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: global_store_dword v[0:1], v1, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_private_lo_v2f16_reglo_vreg_nooff: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v1, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2f16_reglo_vreg_nooff: ; GFX906: ; %bb.0: ; %entry @@ -1409,6 +1459,16 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_private_lo_v2f16_reglo_vreg_nooff: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: s_movk_i32 s4, 0xffe +; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v1, off, s4 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x half> %load = load volatile half, half addrspace(5)* inttoptr (i32 4094 to half addrspace(5)*) @@ -1418,14 +1478,14 @@ } define void @load_private_lo_v2i16_reglo_vreg_zexti8(i8 addrspace(5)* byval %in, i32 %reg) #0 { -; GFX900-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8: ; GFX906: ; %bb.0: ; %entry @@ -1449,6 +1509,15 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v0, off, s32 offset:4095 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x i16> %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095 @@ -1460,14 +1529,14 @@ } define void @load_private_lo_v2i16_reglo_vreg_sexti8(i8 addrspace(5)* byval %in, i32 %reg) #0 { -; GFX900-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: 
buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8: ; GFX906: ; %bb.0: ; %entry @@ -1490,6 +1559,15 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16 v0, off, s32 offset:4095 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x i16> %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095 @@ -1501,14 +1579,14 @@ } define void @load_private_lo_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, i32 %reg) #0 { -; GFX900-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], 0 offset:4094 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: global_store_dword v[0:1], v1, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], 0 offset:4094 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v1, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8: ; GFX906: ; %bb.0: ; %entry @@ -1532,6 +1610,16 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: s_movk_i32 s4, 0xffe +; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v1, off, s4 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x i16> %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*) @@ -1542,14 +1630,14 @@ } define void @load_private_lo_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in, i32 %reg) #0 { -; GFX900-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: buffer_load_sbyte_d16 v1, off, s[0:3], 0 offset:4094 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: 
global_store_dword v[0:1], v1, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16 v1, off, s[0:3], 0 offset:4094 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v1, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8: ; GFX906: ; %bb.0: ; %entry @@ -1572,6 +1660,16 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: s_movk_i32 s4, 0xffe +; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16 v1, off, s4 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x i16> %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*) @@ -1582,14 +1680,14 @@ } define void @load_private_lo_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, i32 %reg) #0 { -; GFX900-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], 0 offset:4094 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: global_store_dword v[0:1], v1, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], 0 offset:4094 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v1, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8: ; GFX906: ; %bb.0: ; %entry @@ -1614,6 +1712,16 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: s_movk_i32 s4, 0xffe +; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v1, off, s4 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v1, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %reg.bc = bitcast i32 %reg to <2 x half> %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*) @@ -1801,16 +1909,16 @@ } define void @load_private_lo_v2i16_reglo_vreg_to_offset(i32 %reg) #0 { -; GFX900-LABEL: load_private_lo_v2i16_reglo_vreg_to_offset: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v1, 0x7b -; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s32 -; GFX900-NEXT: buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094 -; GFX900-NEXT: s_waitcnt 
vmcnt(0) -; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_to_offset: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX900-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; GFX900-MUBUF-NEXT: buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_to_offset: ; GFX906: ; %bb.0: ; %entry @@ -1837,6 +1945,17 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_to_offset: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX900-FLATSCR-NEXT: scratch_store_dword off, v1, s32 +; GFX900-FLATSCR-NEXT: scratch_load_short_d16 v0, off, s32 offset:4094 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %obj0 = alloca [10 x i32], align 4, addrspace(5) %obj1 = alloca [4096 x i16], align 2, addrspace(5) @@ -1851,16 +1970,16 @@ } define void @load_private_lo_v2i16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 { -; GFX900-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8_to_offset: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v1, 0x7b -; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s32 -; GFX900-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8_to_offset: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX900-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8_to_offset: ; GFX906: ; %bb.0: ; %entry @@ -1887,6 +2006,17 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_sexti8_to_offset: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX900-FLATSCR-NEXT: scratch_store_dword off, v1, s32 +; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16 v0, off, s32 offset:4095 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %obj0 = alloca [10 x i32], align 4, addrspace(5) %obj1 = alloca [4096 x i8], align 2, addrspace(5) @@ -1902,16 
+2032,16 @@ } define void @load_private_lo_v2i16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 { -; GFX900-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8_to_offset: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v1, 0x7b -; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s32 -; GFX900-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8_to_offset: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX900-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8_to_offset: ; GFX906: ; %bb.0: ; %entry @@ -1939,6 +2069,17 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_private_lo_v2i16_reglo_vreg_zexti8_to_offset: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX900-FLATSCR-NEXT: scratch_store_dword off, v1, s32 +; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v0, off, s32 offset:4095 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %obj0 = alloca [10 x i32], align 4, addrspace(5) %obj1 = alloca [4096 x i8], align 2, addrspace(5) @@ -1954,16 +2095,16 @@ } define void @load_private_lo_v2f16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 { -; GFX900-LABEL: load_private_lo_v2f16_reglo_vreg_sexti8_to_offset: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v1, 0x7b -; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s32 -; GFX900-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_private_lo_v2f16_reglo_vreg_sexti8_to_offset: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX900-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; GFX900-MUBUF-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2f16_reglo_vreg_sexti8_to_offset: ; GFX906: ; %bb.0: ; %entry @@ -1991,6 +2132,17 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_private_lo_v2f16_reglo_vreg_sexti8_to_offset: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: 
v_mov_b32_e32 v1, 0x7b +; GFX900-FLATSCR-NEXT: scratch_store_dword off, v1, s32 +; GFX900-FLATSCR-NEXT: scratch_load_sbyte_d16 v0, off, s32 offset:4095 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %obj0 = alloca [10 x i32], align 4, addrspace(5) %obj1 = alloca [4096 x i8], align 2, addrspace(5) @@ -2007,16 +2159,16 @@ } define void @load_private_lo_v2f16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 { -; GFX900-LABEL: load_private_lo_v2f16_reglo_vreg_zexti8_to_offset: -; GFX900: ; %bb.0: ; %entry -; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: v_mov_b32_e32 v1, 0x7b -; GFX900-NEXT: buffer_store_dword v1, off, s[0:3], s32 -; GFX900-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: global_store_dword v[0:1], v0, off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: s_setpc_b64 s[30:31] +; GFX900-MUBUF-LABEL: load_private_lo_v2f16_reglo_vreg_zexti8_to_offset: +; GFX900-MUBUF: ; %bb.0: ; %entry +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-MUBUF-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX900-MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 +; GFX900-MUBUF-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095 +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: global_store_dword v[0:1], v0, off +; GFX900-MUBUF-NEXT: s_waitcnt vmcnt(0) +; GFX900-MUBUF-NEXT: s_setpc_b64 s[30:31] ; ; GFX906-LABEL: load_private_lo_v2f16_reglo_vreg_zexti8_to_offset: ; GFX906: ; %bb.0: ; %entry @@ -2045,6 +2197,17 @@ ; GFX803-NEXT: flat_store_dword v[0:1], v0 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: s_setpc_b64 s[30:31] +; +; GFX900-FLATSCR-LABEL: load_private_lo_v2f16_reglo_vreg_zexti8_to_offset: +; GFX900-FLATSCR: ; %bb.0: ; %entry +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v1, 0x7b +; GFX900-FLATSCR-NEXT: scratch_store_dword off, v1, s32 +; GFX900-FLATSCR-NEXT: scratch_load_ubyte_d16 v0, off, s32 offset:4095 +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %obj0 = alloca [10 x i32], align 4, addrspace(5) %obj1 = alloca [4096 x i8], align 2, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll --- a/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll +++ b/llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 < %s | FileCheck -check-prefixes=GCN,GFX9,MUBUF %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 --amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,GFX9,FLATSCR %s ; Make sure the correct frame offset is used with the local ; frame area. @@ -16,42 +17,78 @@ ; correct FP offset.
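The FLATSCR checks below show what SelectScratchSAddr does when a constant offset does not fit the signed immediate field: 0x20d0 is split into a remainder of 0x2000, materialized with s_movk_i32/s_add_u32, plus a 208-byte immediate carried on the scratch_load itself. A minimal standalone sketch of that split follows, assuming 13 signed offset bits (a +/-4 KiB range, consistent with the offset:4095 immediates elsewhere in these tests); the in-tree code queries the width via TII->getNumFlatOffsetBits(), and splitScratchOffset is an illustrative name, not the in-tree helper:

```cpp
// Sketch of the flat-scratch offset split, under the stated assumptions.
#include <cassert>
#include <cstdint>

constexpr unsigned NumBits = 13; // assumed signed offset width on gfx9

// Split a constant offset into a remainder (to be materialized into an
// SGPR) and a field that fits the instruction's signed immediate. Signed
// division by a power of two truncates towards zero, so ImmField keeps
// the sign of the original offset and stays within NumBits.
void splitScratchOffset(int64_t COffsetVal, int64_t &ImmField,
                        int64_t &RemainderOffset) {
  const int64_t D = int64_t(1) << (NumBits - 1); // 0x1000
  RemainderOffset = (COffsetVal / D) * D;
  ImmField = COffsetVal - RemainderOffset;
}

int main() {
  int64_t Imm = 0, Rem = 0;
  splitScratchOffset(0x20d0, Imm, Rem);
  // Matches the FLATSCR checks below: s_movk_i32 s6, 0x2000 followed by
  // scratch_load_dword ... offset:208.
  assert(Rem == 0x2000 && Imm == 208);
  return 0;
}
```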
define amdgpu_kernel void @local_stack_offset_uses_sp(i64 addrspace(1)* %out, i8 addrspace(1)* %in) { -; GCN-LABEL: local_stack_offset_uses_sp: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; GCN-NEXT: s_add_u32 s0, s0, s9 -; GCN-NEXT: v_mov_b32_e32 v1, 0x3000 -; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: v_add_u32_e32 v0, 64, v1 -; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: v_mov_b32_e32 v3, 0x2000 -; GCN-NEXT: s_mov_b32 s6, 0 -; GCN-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; GCN-NEXT: BB0_1: ; %loadstoreloop -; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_add_u32_e32 v3, s6, v1 -; GCN-NEXT: s_add_i32 s6, s6, 1 -; GCN-NEXT: s_cmpk_lt_u32 s6, 0x2120 -; GCN-NEXT: buffer_store_byte v2, v3, s[0:3], 0 offen -; GCN-NEXT: s_cbranch_scc1 BB0_1 -; GCN-NEXT: ; %bb.2: ; %split -; GCN-NEXT: v_mov_b32_e32 v1, 0x3000 -; GCN-NEXT: v_add_u32_e32 v1, 0x20d0, v1 -; GCN-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 -; GCN-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_add_co_u32_e32 v0, vcc, v2, v3 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc -; GCN-NEXT: v_mov_b32_e32 v3, s5 -; GCN-NEXT: global_store_dwordx2 v[2:3], v[0:1], off -; GCN-NEXT: s_endpgm +; MUBUF-LABEL: local_stack_offset_uses_sp: +; MUBUF: ; %bb.0: ; %entry +; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; MUBUF-NEXT: s_add_u32 s0, s0, s9 +; MUBUF-NEXT: v_mov_b32_e32 v1, 0x3000 +; MUBUF-NEXT: s_addc_u32 s1, s1, 0 +; MUBUF-NEXT: v_add_u32_e32 v0, 64, v1 +; MUBUF-NEXT: v_mov_b32_e32 v2, 0 +; MUBUF-NEXT: v_mov_b32_e32 v3, 0x2000 +; MUBUF-NEXT: s_mov_b32 s6, 0 +; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; MUBUF-NEXT: BB0_1: ; %loadstoreloop +; MUBUF-NEXT: ; =>This Inner Loop Header: Depth=1 +; MUBUF-NEXT: v_add_u32_e32 v3, s6, v1 +; MUBUF-NEXT: s_add_i32 s6, s6, 1 +; MUBUF-NEXT: s_cmpk_lt_u32 s6, 0x2120 +; MUBUF-NEXT: buffer_store_byte v2, v3, s[0:3], 0 offen +; MUBUF-NEXT: s_cbranch_scc1 BB0_1 +; MUBUF-NEXT: ; %bb.2: ; %split +; MUBUF-NEXT: v_mov_b32_e32 v1, 0x3000 +; MUBUF-NEXT: v_add_u32_e32 v1, 0x20d0, v1 +; MUBUF-NEXT: buffer_load_dword v2, v1, s[0:3], 0 offen +; MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:4 +; MUBUF-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen +; MUBUF-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:4 +; MUBUF-NEXT: s_waitcnt vmcnt(1) +; MUBUF-NEXT: v_add_co_u32_e32 v0, vcc, v2, v3 +; MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; MUBUF-NEXT: v_mov_b32_e32 v2, s4 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v4, vcc +; MUBUF-NEXT: v_mov_b32_e32 v3, s5 +; MUBUF-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; MUBUF-NEXT: s_endpgm +; +; FLATSCR-LABEL: local_stack_offset_uses_sp: +; FLATSCR: ; %bb.0: ; %entry +; FLATSCR-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 +; FLATSCR-NEXT: s_movk_i32 vcc_hi, 0x2000 +; FLATSCR-NEXT: s_mov_b32 s6, 0 +; FLATSCR-NEXT: scratch_store_dword off, v0, vcc_hi +; 
FLATSCR-NEXT: BB0_1: ; %loadstoreloop +; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1 +; FLATSCR-NEXT: s_add_u32 s7, 0x3000, s6 +; FLATSCR-NEXT: s_add_i32 s6, s6, 1 +; FLATSCR-NEXT: s_cmpk_lt_u32 s6, 0x2120 +; FLATSCR-NEXT: scratch_store_byte off, v0, s7 +; FLATSCR-NEXT: s_cbranch_scc1 BB0_1 +; FLATSCR-NEXT: ; %bb.2: ; %split +; FLATSCR-NEXT: s_movk_i32 s6, 0x20d0 +; FLATSCR-NEXT: s_add_u32 s6, 0x3000, s6 +; FLATSCR-NEXT: scratch_load_dword v1, off, s6 offset:4 +; FLATSCR-NEXT: s_movk_i32 s6, 0x2000 +; FLATSCR-NEXT: s_add_u32 s6, 0x3000, s6 +; FLATSCR-NEXT: scratch_load_dword v0, off, s6 offset:208 +; FLATSCR-NEXT: s_movk_i32 s6, 0x3000 +; FLATSCR-NEXT: scratch_load_dword v2, off, s6 offset:68 +; FLATSCR-NEXT: s_movk_i32 s6, 0x3000 +; FLATSCR-NEXT: scratch_load_dword v3, off, s6 offset:64 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3 +; FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc +; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; FLATSCR-NEXT: v_mov_b32_e32 v2, s4 +; FLATSCR-NEXT: v_mov_b32_e32 v3, s5 +; FLATSCR-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; FLATSCR-NEXT: s_endpgm entry: %pin.low = alloca i32, align 8192, addrspace(5) %local.area = alloca [1060 x i64], align 4096, addrspace(5) @@ -68,43 +105,83 @@ } define void @func_local_stack_offset_uses_sp(i64 addrspace(1)* %out, i8 addrspace(1)* %in) { -; GCN-LABEL: func_local_stack_offset_uses_sp: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_add_u32 s4, s32, 0x7ffc0 -; GCN-NEXT: s_mov_b32 s5, s33 -; GCN-NEXT: s_and_b32 s33, s4, 0xfff80000 -; GCN-NEXT: v_lshrrev_b32_e64 v3, 6, s33 -; GCN-NEXT: v_add_u32_e32 v3, 0x1000, v3 -; GCN-NEXT: v_mov_b32_e32 v4, 0 -; GCN-NEXT: v_add_u32_e32 v2, 64, v3 -; GCN-NEXT: s_mov_b32 s4, 0 -; GCN-NEXT: s_add_u32 s32, s32, 0x180000 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 -; GCN-NEXT: BB1_1: ; %loadstoreloop -; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: v_add_u32_e32 v5, s4, v3 -; GCN-NEXT: s_add_i32 s4, s4, 1 -; GCN-NEXT: s_cmpk_lt_u32 s4, 0x2120 -; GCN-NEXT: buffer_store_byte v4, v5, s[0:3], 0 offen -; GCN-NEXT: s_cbranch_scc1 BB1_1 -; GCN-NEXT: ; %bb.2: ; %split -; GCN-NEXT: v_lshrrev_b32_e64 v3, 6, s33 -; GCN-NEXT: v_add_u32_e32 v3, 0x1000, v3 -; GCN-NEXT: v_add_u32_e32 v3, 0x20d0, v3 -; GCN-NEXT: buffer_load_dword v4, v3, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen offset:4 -; GCN-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen -; GCN-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:4 -; GCN-NEXT: s_sub_u32 s32, s32, 0x180000 -; GCN-NEXT: s_mov_b32 s33, s5 -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_add_co_u32_e32 v2, vcc, v4, v5 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc -; GCN-NEXT: global_store_dwordx2 v[0:1], v[2:3], off -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; MUBUF-LABEL: func_local_stack_offset_uses_sp: +; MUBUF: ; %bb.0: ; %entry +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_add_u32 s4, s32, 0x7ffc0 +; MUBUF-NEXT: s_mov_b32 s5, s33 +; MUBUF-NEXT: s_and_b32 s33, s4, 0xfff80000 +; MUBUF-NEXT: v_lshrrev_b32_e64 v3, 6, s33 +; MUBUF-NEXT: v_add_u32_e32 v3, 0x1000, v3 +; MUBUF-NEXT: v_mov_b32_e32 v4, 0 +; MUBUF-NEXT: v_add_u32_e32 v2, 64, v3 +; MUBUF-NEXT: s_mov_b32 s4, 0 +; MUBUF-NEXT: s_add_u32 s32, s32, 0x180000 +; MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], s33 +; MUBUF-NEXT: BB1_1: ; %loadstoreloop +; MUBUF-NEXT: ; =>This Inner Loop 
Header: Depth=1 +; MUBUF-NEXT: v_add_u32_e32 v5, s4, v3 +; MUBUF-NEXT: s_add_i32 s4, s4, 1 +; MUBUF-NEXT: s_cmpk_lt_u32 s4, 0x2120 +; MUBUF-NEXT: buffer_store_byte v4, v5, s[0:3], 0 offen +; MUBUF-NEXT: s_cbranch_scc1 BB1_1 +; MUBUF-NEXT: ; %bb.2: ; %split +; MUBUF-NEXT: v_lshrrev_b32_e64 v3, 6, s33 +; MUBUF-NEXT: v_add_u32_e32 v3, 0x1000, v3 +; MUBUF-NEXT: v_add_u32_e32 v3, 0x20d0, v3 +; MUBUF-NEXT: buffer_load_dword v4, v3, s[0:3], 0 offen +; MUBUF-NEXT: buffer_load_dword v3, v3, s[0:3], 0 offen offset:4 +; MUBUF-NEXT: buffer_load_dword v5, v2, s[0:3], 0 offen +; MUBUF-NEXT: buffer_load_dword v6, v2, s[0:3], 0 offen offset:4 +; MUBUF-NEXT: s_sub_u32 s32, s32, 0x180000 +; MUBUF-NEXT: s_mov_b32 s33, s5 +; MUBUF-NEXT: s_waitcnt vmcnt(1) +; MUBUF-NEXT: v_add_co_u32_e32 v2, vcc, v4, v5 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc +; MUBUF-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: func_local_stack_offset_uses_sp: +; FLATSCR: ; %bb.0: ; %entry +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_add_u32 s4, s32, 0x1fff +; FLATSCR-NEXT: s_mov_b32 s6, s33 +; FLATSCR-NEXT: s_and_b32 s33, s4, 0xffffe000 +; FLATSCR-NEXT: v_mov_b32_e32 v2, 0 +; FLATSCR-NEXT: s_mov_b32 s4, 0 +; FLATSCR-NEXT: s_add_u32 s32, s32, 0x6000 +; FLATSCR-NEXT: scratch_store_dword off, v2, s33 +; FLATSCR-NEXT: BB1_1: ; %loadstoreloop +; FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1 +; FLATSCR-NEXT: s_add_u32 vcc_hi, s33, 0x1000 +; FLATSCR-NEXT: s_add_u32 s5, vcc_hi, s4 +; FLATSCR-NEXT: s_add_i32 s4, s4, 1 +; FLATSCR-NEXT: s_cmpk_lt_u32 s4, 0x2120 +; FLATSCR-NEXT: scratch_store_byte off, v2, s5 +; FLATSCR-NEXT: s_cbranch_scc1 BB1_1 +; FLATSCR-NEXT: ; %bb.2: ; %split +; FLATSCR-NEXT: s_movk_i32 s4, 0x20d0 +; FLATSCR-NEXT: s_add_u32 s5, s33, 0x1000 +; FLATSCR-NEXT: s_add_u32 s4, s5, s4 +; FLATSCR-NEXT: scratch_load_dword v3, off, s4 offset:4 +; FLATSCR-NEXT: s_movk_i32 s4, 0x2000 +; FLATSCR-NEXT: s_add_u32 s5, s33, 0x1000 +; FLATSCR-NEXT: s_add_u32 s4, s5, s4 +; FLATSCR-NEXT: scratch_load_dword v2, off, s4 offset:208 +; FLATSCR-NEXT: s_add_u32 s4, s33, 0x1000 +; FLATSCR-NEXT: scratch_load_dword v4, off, s4 offset:68 +; FLATSCR-NEXT: s_add_u32 s4, s33, 0x1000 +; FLATSCR-NEXT: scratch_load_dword v5, off, s4 offset:64 +; FLATSCR-NEXT: s_sub_u32 s32, s32, 0x6000 +; FLATSCR-NEXT: s_mov_b32 s33, s6 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5 +; FLATSCR-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc +; FLATSCR-NEXT: global_store_dwordx2 v[0:1], v[2:3], off +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %pin.low = alloca i32, align 8192, addrspace(5) %local.area = alloca [1060 x i64], align 4096, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll b/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll --- a/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll +++ b/llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll @@ -1,31 +1,56 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s -check-prefix=MUBUF +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-enable-flat-scratch < %s | FileCheck %s -check-prefix=FLATSCR ; Make sure there's no assertion from passing a 0 alignment value define void @memcpy_fixed_align(i8 
addrspace(5)* %dst, i8 addrspace(1)* %src) { -; CHECK-LABEL: memcpy_fixed_align: -; CHECK: ; %bb.0: -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: global_load_dword v0, v[1:2], off offset:36 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:36 -; CHECK-NEXT: global_load_dword v0, v[1:2], off offset:32 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 -; CHECK-NEXT: global_load_dwordx4 v[3:6], v[1:2], off offset:16 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 -; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 -; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 -; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 -; CHECK-NEXT: global_load_dwordx4 v[0:3], v[1:2], off -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 -; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 -; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: s_setpc_b64 s[30:31] +; MUBUF-LABEL: memcpy_fixed_align: +; MUBUF: ; %bb.0: +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: global_load_dword v0, v[1:2], off offset:36 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:36 +; MUBUF-NEXT: global_load_dword v0, v[1:2], off offset:32 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 +; MUBUF-NEXT: global_load_dwordx4 v[3:6], v[1:2], off offset:16 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 +; MUBUF-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 +; MUBUF-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 +; MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 +; MUBUF-NEXT: global_load_dwordx4 v[0:3], v[1:2], off +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 +; MUBUF-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 +; MUBUF-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 +; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: memcpy_fixed_align: +; FLATSCR: ; %bb.0: +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: global_load_dword v0, v[1:2], off offset:36 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dword off, v0, s32 offset:36 +; FLATSCR-NEXT: global_load_dword v0, v[1:2], off offset:32 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dword off, v0, s32 offset:32 +; FLATSCR-NEXT: global_load_dwordx4 v[3:6], v[1:2], off offset:16 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dword off, v6, s32 offset:28 +; FLATSCR-NEXT: scratch_store_dword off, v5, s32 offset:24 +; FLATSCR-NEXT: scratch_store_dword off, v4, s32 offset:20 +; FLATSCR-NEXT: scratch_store_dword off, v3, s32 offset:16 +; FLATSCR-NEXT: global_load_dwordx4 v[0:3], v[1:2], off +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dword off, v3, s32 offset:12 +; FLATSCR-NEXT: scratch_store_dword off, v2, s32 offset:8 +; FLATSCR-NEXT: scratch_store_dword off, v1, s32 offset:4 +; FLATSCR-NEXT: scratch_store_dword off, v0, s32 +; FLATSCR-NEXT: s_waitcnt 
vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] %alloca = alloca [40 x i8], addrspace(5) %cast = bitcast [40 x i8] addrspace(5)* %alloca to i8 addrspace(5)* call void @llvm.memcpy.p5i8.p1i8.i64(i8 addrspace(5)* align 4 dereferenceable(40) %cast, i8 addrspace(1)* align 4 dereferenceable(40) %src, i64 40, i1 false) diff --git a/llvm/test/CodeGen/AMDGPU/multi-dword-vgpr-spill.ll b/llvm/test/CodeGen/AMDGPU/multi-dword-vgpr-spill.ll --- a/llvm/test/CodeGen/AMDGPU/multi-dword-vgpr-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/multi-dword-vgpr-spill.ll @@ -1,12 +1,17 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s - -; CHECK-LABEL: spill_v2i32: -; CHECK-DAG: buffer_store_dword v{{.*}} offset:16 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:20 ; 4-byte Folded Spill -; CHECK: ;;#ASMSTART -; CHECK-NEXT: ;;#ASMEND -; CHECK-DAG: buffer_load_dword v{{.*}} offset:16 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:20 ; 4-byte Folded Reload +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 < %s | FileCheck %s -check-prefixes=GCN,MUBUF +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -stress-regalloc=8 -amdgpu-enable-flat-scratch < %s | FileCheck %s -check-prefixes=GCN,FLATSCR + +; GCN-LABEL: spill_v2i32: +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:16 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:20 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:16 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:20 ; 4-byte Folded Spill +; GCN: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:16 ; 4-byte Folded Reload +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:20 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:16 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:20 ; 4-byte Folded Reload define void @spill_v2i32() { entry: @@ -24,13 +29,17 @@ ret void } -; CHECK-LABEL: spill_v2f32: -; CHECK-DAG: buffer_store_dword v{{.*}} offset:16 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:20 ; 4-byte Folded Spill -; CHECK: ;;#ASMSTART -; CHECK-NEXT: ;;#ASMEND -; CHECK-DAG: buffer_load_dword v{{.*}} offset:16 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:20 ; 4-byte Folded Reload +; GCN-LABEL: spill_v2f32: +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:16 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:20 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:16 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:20 ; 4-byte Folded Spill +; GCN: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:16 ; 4-byte Folded Reload +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:20 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:16 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:20 ; 4-byte Folded Reload define void @spill_v2f32() { entry: @@ -48,15 +57,21 @@ ret void } -; CHECK-LABEL: spill_v3i32: -; CHECK-DAG: buffer_store_dword v{{.*}} offset:32 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:36 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:40 ; 4-byte Folded Spill -; CHECK: ;;#ASMSTART -; CHECK-NEXT: 
;;#ASMEND -; CHECK-DAG: buffer_load_dword v{{.*}} offset:32 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:36 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:40 ; 4-byte Folded Reload +; GCN-LABEL: spill_v3i32: +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:32 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:36 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:40 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:32 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:36 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:40 ; 4-byte Folded Spill +; GCN: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:32 ; 4-byte Folded Reload +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:36 ; 4-byte Folded Reload +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:40 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:32 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:36 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:40 ; 4-byte Folded Reload define void @spill_v3i32() { entry: @@ -74,15 +89,21 @@ ret void } -; CHECK-LABEL: spill_v3f32: -; CHECK-DAG: buffer_store_dword v{{.*}} offset:32 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:36 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:40 ; 4-byte Folded Spill -; CHECK: ;;#ASMSTART -; CHECK-NEXT: ;;#ASMEND -; CHECK-DAG: buffer_load_dword v{{.*}} offset:32 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:36 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:40 ; 4-byte Folded Reload +; GCN-LABEL: spill_v3f32: +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:32 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:36 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:40 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:32 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:36 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:40 ; 4-byte Folded Spill +; GCN: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:32 ; 4-byte Folded Reload +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:36 ; 4-byte Folded Reload +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:40 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:32 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:36 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:40 ; 4-byte Folded Reload define void @spill_v3f32() { entry: @@ -100,17 +121,25 @@ ret void } -; CHECK-LABEL: spill_v4i32: -; CHECK-DAG: buffer_store_dword v{{.*}} offset:32 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:36 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:40 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:44 ; 4-byte Folded Spill -; CHECK: ;;#ASMSTART -; CHECK-NEXT: ;;#ASMEND -; CHECK-DAG: buffer_load_dword v{{.*}} offset:32 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:36 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:40 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:44 ; 4-byte Folded Reload +; GCN-LABEL: spill_v4i32: +; MUBUF-DAG: 
buffer_store_dword v{{.*}} offset:32 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:36 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:40 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:44 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:32 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:36 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:40 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:44 ; 4-byte Folded Spill +; GCN: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:32 ; 4-byte Folded Reload +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:36 ; 4-byte Folded Reload +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:40 ; 4-byte Folded Reload +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:44 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:32 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:36 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:40 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:44 ; 4-byte Folded Reload define void @spill_v4i32() { entry: @@ -128,17 +157,25 @@ ret void } -; CHECK-LABEL: spill_v4f32: -; CHECK-DAG: buffer_store_dword v{{.*}} offset:32 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:36 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:40 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:44 ; 4-byte Folded Spill -; CHECK: ;;#ASMSTART -; CHECK-NEXT: ;;#ASMEND -; CHECK-DAG: buffer_load_dword v{{.*}} offset:32 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:36 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:40 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:44 ; 4-byte Folded Reload +; GCN-LABEL: spill_v4f32: +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:32 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:36 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:40 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:44 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:32 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:36 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:40 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:44 ; 4-byte Folded Spill +; GCN: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:32 ; 4-byte Folded Reload +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:36 ; 4-byte Folded Reload +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:40 ; 4-byte Folded Reload +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:44 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:32 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:36 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:40 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:44 ; 4-byte Folded Reload define void @spill_v4f32() { entry: @@ -156,17 +193,25 @@ ret void } -; CHECK-LABEL: spill_v5i32: -; CHECK-DAG: buffer_store_dword v{{.*}} offset:64 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:68 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:72 ; 
4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:76 ; 4-byte Folded Spill -; CHECK: ;;#ASMSTART -; CHECK-NEXT: ;;#ASMEND -; CHECK-DAG: buffer_load_dword v{{.*}} offset:64 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:68 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:72 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:76 ; 4-byte Folded Reload +; GCN-LABEL: spill_v5i32: +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:64 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:68 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:72 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:76 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:64 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:68 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:72 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:76 ; 4-byte Folded Spill +; GCN: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:64 ; 4-byte Folded Reload +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:68 ; 4-byte Folded Reload +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:72 ; 4-byte Folded Reload +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:76 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:64 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:68 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:72 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:76 ; 4-byte Folded Reload define void @spill_v5i32() { entry: %alloca = alloca <5 x i32>, i32 2, align 4, addrspace(5) @@ -183,17 +228,25 @@ ret void } -; CHECK-LABEL: spill_v5f32: -; CHECK-DAG: buffer_store_dword v{{.*}} offset:64 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:68 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:72 ; 4-byte Folded Spill -; CHECK-DAG: buffer_store_dword v{{.*}} offset:76 ; 4-byte Folded Spill -; CHECK: ;;#ASMSTART -; CHECK-NEXT: ;;#ASMEND -; CHECK-DAG: buffer_load_dword v{{.*}} offset:64 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:68 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:72 ; 4-byte Folded Reload -; CHECK-DAG: buffer_load_dword v{{.*}} offset:76 ; 4-byte Folded Reload +; GCN-LABEL: spill_v5f32: +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:64 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:68 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:72 ; 4-byte Folded Spill +; MUBUF-DAG: buffer_store_dword v{{.*}} offset:76 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:64 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:68 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:72 ; 4-byte Folded Spill +; FLATSCR-DAG: scratch_store_dword off, v{{.*}} offset:76 ; 4-byte Folded Spill +; GCN: ;;#ASMSTART +; GCN-NEXT: ;;#ASMEND +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:64 ; 4-byte Folded Reload +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:68 ; 4-byte Folded Reload +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:72 ; 4-byte Folded Reload +; MUBUF-DAG: buffer_load_dword v{{.*}} offset:76 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:64 ; 4-byte Folded Reload 
+; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:68 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:72 ; 4-byte Folded Reload +; FLATSCR-DAG: scratch_load_dword v{{.*}} offset:76 ; 4-byte Folded Reload define void @spill_v5f32() { entry: %alloca = alloca <5 x i32>, i32 2, align 4, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll --- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,DEFAULTSIZE %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-assume-dynamic-stack-object-size=1024 < %s | FileCheck -check-prefixes=GCN,ASSUME1024 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,DEFAULTSIZE,MUBUF %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-assume-dynamic-stack-object-size=1024 < %s | FileCheck -check-prefixes=GCN,ASSUME1024,MUBUF %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,DEFAULTSIZE,FLATSCR %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-flat-scratch -amdgpu-assume-dynamic-stack-object-size=1024 < %s | FileCheck -check-prefixes=GCN,ASSUME1024,FLATSCR %s ; FIXME: Generated test checks do not check metadata at the end of the ; function, so this also includes manually added checks. @@ -11,44 +13,82 @@ ; FIXME: FunctionLoweringInfo unhelpfully doesn't preserve an ; alignment less than the stack alignment. 
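One difference worth noting before the checks: MUBUF keeps the SGPR stack pointer in wave-scaled units (byte offsets multiplied by the wave size), while flat scratch addresses bytes directly, so every frame-size and realignment constant in the FLATSCR output is the MUBUF constant divided by the wave size. A small sketch relating the constants that appear in these tests, assuming the 64-lane wave of the gfx900 target named in the RUN lines:

```cpp
// Relates the MUBUF (wave-scaled) and FLATSCR (byte) stack constants
// seen in the checks below. Assumes a 64-lane wave.
#include <cassert>
#include <cstdint>

int main() {
  constexpr uint32_t WaveSize = 64;

  // align4 tests: FLATSCR sets up a 16-byte frame (s32 = 16 for the
  // kernel, s32 += 16 for the function); MUBUF uses the scaled 0x400.
  assert(16u * WaveSize == 0x400u);

  // align64 function: FLATSCR bumps s32 by 0x80 bytes, MUBUF by 0x2000.
  assert(0x80u * WaveSize == 0x2000u);

  // 64-byte realignment: FLATSCR ands the FP with ~63 (0xffffffc0),
  // MUBUF with the wave-scaled equivalent (0xfffff000).
  assert(~(WaveSize - 1) == 0xffffffc0u);
  assert(~(64u * WaveSize - 1) == 0xfffff000u);
  return 0;
}
```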
define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align4(i32 addrspace(1)* %out, i32 %arg.cond0, i32 %arg.cond1, i32 %in) { -; GCN-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align4: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; GCN-NEXT: s_add_u32 s0, s0, s9 -; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 -; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_movk_i32 s32, 0x400 -; GCN-NEXT: s_mov_b32 s33, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s8, 0 -; GCN-NEXT: s_cbranch_scc1 BB0_3 -; GCN-NEXT: ; %bb.1: ; %bb.0 -; GCN-NEXT: s_cmp_lg_u32 s9, 0 -; GCN-NEXT: s_cbranch_scc1 BB0_3 -; GCN-NEXT: ; %bb.2: ; %bb.1 -; GCN-NEXT: s_add_i32 s6, s32, 0x1000 -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NEXT: s_lshl_b32 s7, s10, 2 -; GCN-NEXT: s_mov_b32 s32, s6 -; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GCN-NEXT: v_mov_b32_e32 v1, 1 -; GCN-NEXT: s_add_i32 s6, s6, s7 -; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 -; GCN-NEXT: v_mov_b32_e32 v1, s6 -; GCN-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_u32_e32 v2, v1, v0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: global_store_dword v[0:1], v2, off -; GCN-NEXT: BB0_3: ; %bb.2 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: global_store_dword v[0:1], v0, off -; GCN-NEXT: s_endpgm +; MUBUF-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align4: +; MUBUF: ; %bb.0: ; %entry +; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; MUBUF-NEXT: s_add_u32 s0, s0, s9 +; MUBUF-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 +; MUBUF-NEXT: s_addc_u32 s1, s1, 0 +; MUBUF-NEXT: s_movk_i32 s32, 0x400 +; MUBUF-NEXT: s_mov_b32 s33, 0 +; MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; MUBUF-NEXT: s_cmp_lg_u32 s8, 0 +; MUBUF-NEXT: s_cbranch_scc1 BB0_3 +; MUBUF-NEXT: ; %bb.1: ; %bb.0 +; MUBUF-NEXT: s_cmp_lg_u32 s9, 0 +; MUBUF-NEXT: s_cbranch_scc1 BB0_3 +; MUBUF-NEXT: ; %bb.2: ; %bb.1 +; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000 +; MUBUF-NEXT: v_mov_b32_e32 v1, 0 +; MUBUF-NEXT: v_mov_b32_e32 v2, s6 +; MUBUF-NEXT: s_lshl_b32 s7, s10, 2 +; MUBUF-NEXT: s_mov_b32 s32, s6 +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; MUBUF-NEXT: v_mov_b32_e32 v1, 1 +; MUBUF-NEXT: s_add_i32 s6, s6, s7 +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; MUBUF-NEXT: v_mov_b32_e32 v1, s6 +; MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen +; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: v_add_u32_e32 v2, v1, v0 +; MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; MUBUF-NEXT: v_mov_b32_e32 v0, s4 +; MUBUF-NEXT: v_mov_b32_e32 v1, s5 +; MUBUF-NEXT: global_store_dword v[0:1], v2, off +; MUBUF-NEXT: BB0_3: ; %bb.2 +; MUBUF-NEXT: v_mov_b32_e32 v0, 0 +; MUBUF-NEXT: global_store_dword v[0:1], v0, off +; MUBUF-NEXT: s_endpgm +; +; FLATSCR-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align4: +; FLATSCR: ; %bb.0: ; %entry +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; FLATSCR-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; FLATSCR-NEXT: s_mov_b32 s32, 16 +; FLATSCR-NEXT: s_mov_b32 s33, 0 +; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; FLATSCR-NEXT: s_cmp_lg_u32 s8, 0 +; 
FLATSCR-NEXT: s_cbranch_scc1 BB0_3 +; FLATSCR-NEXT: ; %bb.1: ; %bb.0 +; FLATSCR-NEXT: s_cmp_lg_u32 s9, 0 +; FLATSCR-NEXT: s_cbranch_scc1 BB0_3 +; FLATSCR-NEXT: ; %bb.2: ; %bb.1 +; FLATSCR-NEXT: s_mov_b32 s6, s32 +; FLATSCR-NEXT: s_movk_i32 s7, 0x1000 +; FLATSCR-NEXT: s_add_i32 s8, s6, s7 +; FLATSCR-NEXT: s_add_u32 s6, s6, s7 +; FLATSCR-NEXT: v_mov_b32_e32 v1, 0 +; FLATSCR-NEXT: scratch_store_dword off, v1, s6 +; FLATSCR-NEXT: v_mov_b32_e32 v1, 1 +; FLATSCR-NEXT: s_lshl_b32 s6, s10, 2 +; FLATSCR-NEXT: s_mov_b32 s32, s8 +; FLATSCR-NEXT: scratch_store_dword off, v1, s8 offset:4 +; FLATSCR-NEXT: s_add_i32 s8, s8, s6 +; FLATSCR-NEXT: scratch_load_dword v1, off, s8 +; FLATSCR-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: v_add_u32_e32 v2, v1, v0 +; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; FLATSCR-NEXT: v_mov_b32_e32 v0, s4 +; FLATSCR-NEXT: v_mov_b32_e32 v1, s5 +; FLATSCR-NEXT: global_store_dword v[0:1], v2, off +; FLATSCR-NEXT: BB0_3: ; %bb.2 +; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 +; FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; FLATSCR-NEXT: s_endpgm entry: %cond0 = icmp eq i32 %arg.cond0, 0 @@ -83,42 +123,75 @@ ; ASSUME1024: ; ScratchSize: 1040 define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reached_align64(i32 addrspace(1)* %out, i32 %arg.cond, i32 %in) { -; GCN-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 -; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 -; GCN-NEXT: s_add_u32 s0, s0, s9 -; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: s_movk_i32 s32, 0x1000 -; GCN-NEXT: s_mov_b32 s33, 0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s6, 0 -; GCN-NEXT: s_cbranch_scc1 BB1_2 -; GCN-NEXT: ; %bb.1: ; %bb.0 -; GCN-NEXT: s_add_i32 s6, s32, 0x1000 -; GCN-NEXT: s_and_b32 s6, s6, 0xfffff000 -; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_mov_b32_e32 v2, s6 -; GCN-NEXT: s_lshl_b32 s7, s7, 2 -; GCN-NEXT: s_mov_b32 s32, s6 -; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen -; GCN-NEXT: v_mov_b32_e32 v1, 1 -; GCN-NEXT: s_add_i32 s6, s6, s7 -; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 -; GCN-NEXT: v_mov_b32_e32 v1, s6 -; GCN-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen -; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_u32_e32 v2, v1, v0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: global_store_dword v[0:1], v2, off -; GCN-NEXT: BB1_2: ; %bb.1 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: global_store_dword v[0:1], v0, off -; GCN-NEXT: s_endpgm +; MUBUF-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64: +; MUBUF: ; %bb.0: ; %entry +; MUBUF-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; MUBUF-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; MUBUF-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 +; MUBUF-NEXT: s_add_u32 s0, s0, s9 +; MUBUF-NEXT: s_addc_u32 s1, s1, 0 +; MUBUF-NEXT: s_movk_i32 s32, 0x1000 +; MUBUF-NEXT: s_mov_b32 s33, 0 +; MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; MUBUF-NEXT: s_cmp_lg_u32 s6, 0 +; MUBUF-NEXT: s_cbranch_scc1 BB1_2 +; MUBUF-NEXT: ; %bb.1: ; %bb.0 +; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000 +; MUBUF-NEXT: s_and_b32 s6, s6, 0xfffff000 +; MUBUF-NEXT: v_mov_b32_e32 v1, 0 +; MUBUF-NEXT: v_mov_b32_e32 v2, s6 +; MUBUF-NEXT: s_lshl_b32 s7, s7, 2 +; MUBUF-NEXT: s_mov_b32 s32, s6 +; MUBUF-NEXT: buffer_store_dword v1, v2, 
s[0:3], 0 offen +; MUBUF-NEXT: v_mov_b32_e32 v1, 1 +; MUBUF-NEXT: s_add_i32 s6, s6, s7 +; MUBUF-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; MUBUF-NEXT: v_mov_b32_e32 v1, s6 +; MUBUF-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen +; MUBUF-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: v_add_u32_e32 v2, v1, v0 +; MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; MUBUF-NEXT: v_mov_b32_e32 v0, s4 +; MUBUF-NEXT: v_mov_b32_e32 v1, s5 +; MUBUF-NEXT: global_store_dword v[0:1], v2, off +; MUBUF-NEXT: BB1_2: ; %bb.1 +; MUBUF-NEXT: v_mov_b32_e32 v0, 0 +; MUBUF-NEXT: global_store_dword v[0:1], v0, off +; MUBUF-NEXT: s_endpgm +; +; FLATSCR-LABEL: kernel_non_entry_block_static_alloca_uniformly_reached_align64: +; FLATSCR: ; %bb.0: ; %entry +; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s6, s9 +; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; FLATSCR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x8 +; FLATSCR-NEXT: s_mov_b32 s32, 64 +; FLATSCR-NEXT: s_mov_b32 s33, 0 +; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; FLATSCR-NEXT: s_cmp_lg_u32 s6, 0 +; FLATSCR-NEXT: s_cbranch_scc1 BB1_2 +; FLATSCR-NEXT: ; %bb.1: ; %bb.0 +; FLATSCR-NEXT: s_add_i32 s6, s32, 0x1000 +; FLATSCR-NEXT: s_and_b32 s6, s6, 0xfffff000 +; FLATSCR-NEXT: v_mov_b32_e32 v1, 0 +; FLATSCR-NEXT: scratch_store_dword off, v1, s6 +; FLATSCR-NEXT: v_mov_b32_e32 v1, 1 +; FLATSCR-NEXT: s_lshl_b32 s7, s7, 2 +; FLATSCR-NEXT: s_mov_b32 s32, s6 +; FLATSCR-NEXT: scratch_store_dword off, v1, s6 offset:4 +; FLATSCR-NEXT: s_add_i32 s6, s6, s7 +; FLATSCR-NEXT: scratch_load_dword v1, off, s6 +; FLATSCR-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: v_add_u32_e32 v2, v1, v0 +; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; FLATSCR-NEXT: v_mov_b32_e32 v0, s4 +; FLATSCR-NEXT: v_mov_b32_e32 v1, s5 +; FLATSCR-NEXT: global_store_dword v[0:1], v2, off +; FLATSCR-NEXT: BB1_2: ; %bb.1 +; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 +; FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; FLATSCR-NEXT: s_endpgm entry: %cond = icmp eq i32 %arg.cond, 0 br i1 %cond, label %bb.0, label %bb.1 @@ -149,41 +222,79 @@ define void @func_non_entry_block_static_alloca_align4(i32 addrspace(1)* %out, i32 %arg.cond0, i32 %arg.cond1, i32 %in) { -; GCN-LABEL: func_non_entry_block_static_alloca_align4: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s7, s33 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_add_u32 s32, s32, 0x400 -; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execz BB2_3 -; GCN-NEXT: ; %bb.1: ; %bb.0 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 -; GCN-NEXT: s_and_b64 exec, exec, vcc -; GCN-NEXT: s_cbranch_execz BB2_3 -; GCN-NEXT: ; %bb.2: ; %bb.1 -; GCN-NEXT: s_add_i32 s6, s32, 0x1000 -; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: v_mov_b32_e32 v3, s6 -; GCN-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen -; GCN-NEXT: v_mov_b32_e32 v2, 1 -; GCN-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:4 -; GCN-NEXT: v_lshl_add_u32 v2, v4, 2, s6 -; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen -; GCN-NEXT: v_and_b32_e32 v3, 0x3ff, v5 -; GCN-NEXT: s_mov_b32 s32, s6 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_u32_e32 v2, v2, v3 -; GCN-NEXT: global_store_dword v[0:1], v2, off -; GCN-NEXT: BB2_3: ; %bb.2 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: global_store_dword v[0:1], v0, off -; GCN-NEXT: s_sub_u32 s32, s32, 0x400 -; GCN-NEXT: s_mov_b32 s33, s7 -; 
GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; MUBUF-LABEL: func_non_entry_block_static_alloca_align4: +; MUBUF: ; %bb.0: ; %entry +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_mov_b32 s7, s33 +; MUBUF-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; MUBUF-NEXT: s_mov_b32 s33, s32 +; MUBUF-NEXT: s_add_u32 s32, s32, 0x400 +; MUBUF-NEXT: s_and_saveexec_b64 s[4:5], vcc +; MUBUF-NEXT: s_cbranch_execz BB2_3 +; MUBUF-NEXT: ; %bb.1: ; %bb.0 +; MUBUF-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; MUBUF-NEXT: s_and_b64 exec, exec, vcc +; MUBUF-NEXT: s_cbranch_execz BB2_3 +; MUBUF-NEXT: ; %bb.2: ; %bb.1 +; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000 +; MUBUF-NEXT: v_mov_b32_e32 v2, 0 +; MUBUF-NEXT: v_mov_b32_e32 v3, s6 +; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen +; MUBUF-NEXT: v_mov_b32_e32 v2, 1 +; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen offset:4 +; MUBUF-NEXT: v_lshl_add_u32 v2, v4, 2, s6 +; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen +; MUBUF-NEXT: v_and_b32_e32 v3, 0x3ff, v5 +; MUBUF-NEXT: s_mov_b32 s32, s6 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: v_add_u32_e32 v2, v2, v3 +; MUBUF-NEXT: global_store_dword v[0:1], v2, off +; MUBUF-NEXT: BB2_3: ; %bb.2 +; MUBUF-NEXT: s_or_b64 exec, exec, s[4:5] +; MUBUF-NEXT: v_mov_b32_e32 v0, 0 +; MUBUF-NEXT: global_store_dword v[0:1], v0, off +; MUBUF-NEXT: s_sub_u32 s32, s32, 0x400 +; MUBUF-NEXT: s_mov_b32 s33, s7 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: func_non_entry_block_static_alloca_align4: +; FLATSCR: ; %bb.0: ; %entry +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_mov_b32 s9, s33 +; FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; FLATSCR-NEXT: s_mov_b32 s33, s32 +; FLATSCR-NEXT: s_add_u32 s32, s32, 16 +; FLATSCR-NEXT: s_and_saveexec_b64 s[4:5], vcc +; FLATSCR-NEXT: s_cbranch_execz BB2_3 +; FLATSCR-NEXT: ; %bb.1: ; %bb.0 +; FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; FLATSCR-NEXT: s_and_b64 exec, exec, vcc +; FLATSCR-NEXT: s_cbranch_execz BB2_3 +; FLATSCR-NEXT: ; %bb.2: ; %bb.1 +; FLATSCR-NEXT: s_mov_b32 s6, s32 +; FLATSCR-NEXT: s_movk_i32 s7, 0x1000 +; FLATSCR-NEXT: s_add_i32 s8, s6, s7 +; FLATSCR-NEXT: s_add_u32 s6, s6, s7 +; FLATSCR-NEXT: v_mov_b32_e32 v2, 0 +; FLATSCR-NEXT: scratch_store_dword off, v2, s6 +; FLATSCR-NEXT: v_mov_b32_e32 v2, 1 +; FLATSCR-NEXT: scratch_store_dword off, v2, s8 offset:4 +; FLATSCR-NEXT: v_lshl_add_u32 v2, v4, 2, s8 +; FLATSCR-NEXT: scratch_load_dword v2, v2, off +; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v5 +; FLATSCR-NEXT: s_mov_b32 s32, s8 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: v_add_u32_e32 v2, v2, v3 +; FLATSCR-NEXT: global_store_dword v[0:1], v2, off +; FLATSCR-NEXT: BB2_3: ; %bb.2 +; FLATSCR-NEXT: s_or_b64 exec, exec, s[4:5] +; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 +; FLATSCR-NEXT: global_store_dword v[0:1], v0, off +; FLATSCR-NEXT: s_sub_u32 s32, s32, 16 +; FLATSCR-NEXT: s_mov_b32 s33, s9 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %cond0 = icmp eq i32 %arg.cond0, 0 @@ -213,39 +324,72 @@ } define void @func_non_entry_block_static_alloca_align64(i32 addrspace(1)* %out, i32 %arg.cond, i32 %in) { -; GCN-LABEL: func_non_entry_block_static_alloca_align64: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_add_u32 s4, s32, 0xfc0 -; GCN-NEXT: s_mov_b32 s7, s33 -; GCN-NEXT: s_and_b32 s33, s4, 0xfffff000 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GCN-NEXT: s_add_u32 s32, s32, 0x2000 -; GCN-NEXT: 
s_and_saveexec_b64 s[4:5], vcc -; GCN-NEXT: s_cbranch_execz BB3_2 -; GCN-NEXT: ; %bb.1: ; %bb.0 -; GCN-NEXT: s_add_i32 s6, s32, 0x1000 -; GCN-NEXT: s_and_b32 s6, s6, 0xfffff000 -; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: v_mov_b32_e32 v5, s6 -; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen -; GCN-NEXT: v_mov_b32_e32 v2, 1 -; GCN-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen offset:4 -; GCN-NEXT: v_lshl_add_u32 v2, v3, 2, s6 -; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen -; GCN-NEXT: v_and_b32_e32 v3, 0x3ff, v4 -; GCN-NEXT: s_mov_b32 s32, s6 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_u32_e32 v2, v2, v3 -; GCN-NEXT: global_store_dword v[0:1], v2, off -; GCN-NEXT: BB3_2: ; %bb.1 -; GCN-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: global_store_dword v[0:1], v0, off -; GCN-NEXT: s_sub_u32 s32, s32, 0x2000 -; GCN-NEXT: s_mov_b32 s33, s7 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] +; MUBUF-LABEL: func_non_entry_block_static_alloca_align64: +; MUBUF: ; %bb.0: ; %entry +; MUBUF-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; MUBUF-NEXT: s_add_u32 s4, s32, 0xfc0 +; MUBUF-NEXT: s_mov_b32 s7, s33 +; MUBUF-NEXT: s_and_b32 s33, s4, 0xfffff000 +; MUBUF-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; MUBUF-NEXT: s_add_u32 s32, s32, 0x2000 +; MUBUF-NEXT: s_and_saveexec_b64 s[4:5], vcc +; MUBUF-NEXT: s_cbranch_execz BB3_2 +; MUBUF-NEXT: ; %bb.1: ; %bb.0 +; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000 +; MUBUF-NEXT: s_and_b32 s6, s6, 0xfffff000 +; MUBUF-NEXT: v_mov_b32_e32 v2, 0 +; MUBUF-NEXT: v_mov_b32_e32 v5, s6 +; MUBUF-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen +; MUBUF-NEXT: v_mov_b32_e32 v2, 1 +; MUBUF-NEXT: buffer_store_dword v2, v5, s[0:3], 0 offen offset:4 +; MUBUF-NEXT: v_lshl_add_u32 v2, v3, 2, s6 +; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen +; MUBUF-NEXT: v_and_b32_e32 v3, 0x3ff, v4 +; MUBUF-NEXT: s_mov_b32 s32, s6 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: v_add_u32_e32 v2, v2, v3 +; MUBUF-NEXT: global_store_dword v[0:1], v2, off +; MUBUF-NEXT: BB3_2: ; %bb.1 +; MUBUF-NEXT: s_or_b64 exec, exec, s[4:5] +; MUBUF-NEXT: v_mov_b32_e32 v0, 0 +; MUBUF-NEXT: global_store_dword v[0:1], v0, off +; MUBUF-NEXT: s_sub_u32 s32, s32, 0x2000 +; MUBUF-NEXT: s_mov_b32 s33, s7 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: s_setpc_b64 s[30:31] +; +; FLATSCR-LABEL: func_non_entry_block_static_alloca_align64: +; FLATSCR: ; %bb.0: ; %entry +; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; FLATSCR-NEXT: s_add_u32 s4, s32, 63 +; FLATSCR-NEXT: s_mov_b32 s7, s33 +; FLATSCR-NEXT: s_and_b32 s33, s4, 0xffffffc0 +; FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; FLATSCR-NEXT: s_add_u32 s32, s32, 0x80 +; FLATSCR-NEXT: s_and_saveexec_b64 s[4:5], vcc +; FLATSCR-NEXT: s_cbranch_execz BB3_2 +; FLATSCR-NEXT: ; %bb.1: ; %bb.0 +; FLATSCR-NEXT: s_add_i32 s6, s32, 0x1000 +; FLATSCR-NEXT: s_and_b32 s6, s6, 0xfffff000 +; FLATSCR-NEXT: v_mov_b32_e32 v2, 0 +; FLATSCR-NEXT: scratch_store_dword off, v2, s6 +; FLATSCR-NEXT: v_mov_b32_e32 v2, 1 +; FLATSCR-NEXT: scratch_store_dword off, v2, s6 offset:4 +; FLATSCR-NEXT: v_lshl_add_u32 v2, v3, 2, s6 +; FLATSCR-NEXT: scratch_load_dword v2, v2, off +; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v4 +; FLATSCR-NEXT: s_mov_b32 s32, s6 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: v_add_u32_e32 v2, v2, v3 +; FLATSCR-NEXT: global_store_dword v[0:1], v2, off +; FLATSCR-NEXT: BB3_2: ; %bb.1 +; FLATSCR-NEXT: s_or_b64 exec, exec, s[4:5] +; FLATSCR-NEXT: v_mov_b32_e32 v0, 0 +; FLATSCR-NEXT: 
global_store_dword v[0:1], v0, off +; FLATSCR-NEXT: s_sub_u32 s32, s32, 0x80 +; FLATSCR-NEXT: s_mov_b32 s33, s7 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_setpc_b64 s[30:31] entry: %cond = icmp eq i32 %arg.cond, 0 br i1 %cond, label %bb.0, label %bb.1 diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir --- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck --check-prefix=MUBUF %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -run-pass=prologepilog -amdgpu-enable-flat-scratch %s -o - | FileCheck --check-prefix=FLATSCR %s # Test what happens when an SGPR is unavailable for the unused add. The non-inline constant needs to be folded into the add instruction and not materialized in a register. @@ -21,19 +22,32 @@ bb.0: liveins: $vgpr1 - ; CHECK-LABEL: name: scavenge_sgpr_pei_no_sgprs - ; CHECK: liveins: $sgpr27, $vgpr1 - ; CHECK: $sgpr27 = frame-setup COPY $sgpr33 - ; CHECK: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc - ; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc - ; CHECK: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc - ; CHECK: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc - ; CHECK: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec - ; CHECK: $vgpr2 = V_ADD_U32_e32 8192, killed $vgpr2, implicit $exec - ; CHECK: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 - ; CHECK: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1572864, implicit-def $scc - ; CHECK: $sgpr33 = frame-setup COPY $sgpr27 - ; CHECK: S_ENDPGM 0, implicit $vcc + ; MUBUF-LABEL: name: scavenge_sgpr_pei_no_sgprs + ; MUBUF: liveins: $sgpr27, $vgpr1 + ; MUBUF: $sgpr27 = frame-setup COPY $sgpr33 + ; MUBUF: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc + ; MUBUF: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc + ; MUBUF: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc + ; MUBUF: S_NOP 0, implicit-def $sgpr4, 
implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc + ; MUBUF: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec + ; MUBUF: $vgpr2 = V_ADD_U32_e32 8192, killed $vgpr2, implicit $exec + ; MUBUF: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 + ; MUBUF: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1572864, implicit-def $scc + ; MUBUF: $sgpr33 = frame-setup COPY $sgpr27 + ; MUBUF: S_ENDPGM 0, implicit $vcc + ; FLATSCR-LABEL: name: scavenge_sgpr_pei_no_sgprs + ; FLATSCR: liveins: $sgpr27, $vgpr1 + ; FLATSCR: $sgpr27 = frame-setup COPY $sgpr33 + ; FLATSCR: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 8191, implicit-def $scc + ; FLATSCR: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294959104, implicit-def $scc + ; FLATSCR: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 24576, implicit-def $scc + ; FLATSCR: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc + ; FLATSCR: $sgpr33 = S_ADD_U32 $sgpr33, 8192, implicit-def $scc + ; FLATSCR: $vgpr0 = V_OR_B32_e32 $sgpr33, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 + ; FLATSCR: $sgpr33 = S_SUB_U32 $sgpr33, 8192, implicit-def $scc + ; FLATSCR: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 24576, implicit-def $scc + ; FLATSCR: $sgpr33 = frame-setup COPY $sgpr27 + ; FLATSCR: S_ENDPGM 0, implicit $vcc S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, 
implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc $vgpr0 = V_OR_B32_e32 %stack.1, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 S_ENDPGM 0, implicit $vcc diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir --- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir @@ -1,6 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=GFX8 %s # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=GFX9 %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -run-pass=prologepilog -amdgpu-enable-flat-scratch %s -o - | FileCheck -check-prefix=GFX9-FLATSCR %s # Test case where spilling a VGPR to an emergency slot is needed during frame index elimination. 
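# A reading of the checks below (commentary, not additional test input): with
# MUBUF, the stack offsets held in $sgpr32/$sgpr33 are wave-scaled, so
# resolving a frame index needs per-lane VGPR math, and with every VGPR live
# the scavenger has to spill one to an emergency slot and reload it (the GFX9
# checks reload $vgpr3 from %stack.3 with BUFFER_LOAD_DWORD_OFFSET). With flat
# scratch the frame pointer is already a per-thread byte offset, so a single
# scalar add into a scavenged SGPR suffices and the emergency VGPR spill
# disappears:
#
#   GFX9 (MUBUF):  $vgpr3 = BUFFER_LOAD_DWORD_OFFSET ... (load 4 from %stack.3)
#   GFX9-FLATSCR:  $vcc_hi = S_ADD_U32 $sgpr33, 8192, implicit-def $scc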
@@ -55,6 +56,17 @@ ; GFX9: $sgpr4 = S_ADD_U32 $sgpr33, 524544, implicit-def $scc ; GFX9: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.3, addrspace 5) ; GFX9: S_ENDPGM 0, csr_amdgpu_allvgprs + ; GFX9-FLATSCR-LABEL: name: pei_scavenge_vgpr_spill + ; GFX9-FLATSCR: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr2 + ; GFX9-FLATSCR: $vgpr2 = V_WRITELANE_B32_vi $sgpr33, 0, undef $vgpr2 + ; GFX9-FLATSCR: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 8191, implicit-def $scc + ; GFX9-FLATSCR: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294959104, implicit-def $scc + ; GFX9-FLATSCR: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 24576, implicit-def $scc + ; GFX9-FLATSCR: $vcc_hi = S_ADD_U32 $sgpr33, 8192, implicit-def $scc + ; GFX9-FLATSCR: $vgpr0 = V_OR_B32_e32 killed $vcc_hi, $vgpr1, implicit $exec + ; GFX9-FLATSCR: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 24576, implicit-def $scc + ; GFX9-FLATSCR: $sgpr33 = V_READLANE_B32_vi $vgpr2, 0 + ; GFX9-FLATSCR: S_ENDPGM 0, csr_amdgpu_allvgprs $vgpr0 = V_OR_B32_e32 %stack.1, $vgpr1, implicit $exec S_ENDPGM 0, csr_amdgpu_allvgprs ... 
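A quick standalone sketch of what the new -amdgpu-enable-flat-scratch RUN lines
exercise across these tests (a hypothetical example, not part of the patch):
private addrspace(5) accesses lower to buffer_* instructions against the
SCRATCH_RSRC descriptor by default, and to scratch_* flat-scratch instructions
under the new flag (assuming the alloca survives promotion; the in-tree tests
defeat that with divergent indexing or -mattr=-promote-alloca):

  ; flat-scratch-sketch.ll (hypothetical)
  define amdgpu_kernel void @store_private(i32 %v) {
    %p = alloca i32, align 4, addrspace(5)
    store volatile i32 %v, i32 addrspace(5)* %p
    ret void
  }

  ; llc -march=amdgcn -mcpu=gfx900 flat-scratch-sketch.ll
  ;   -> buffer_store_dword ..., s[0:3], ...
  ; llc -march=amdgcn -mcpu=gfx900 -amdgpu-enable-flat-scratch flat-scratch-sketch.ll
  ;   -> scratch_store_dword ...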
diff --git a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll --- a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll +++ b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll @@ -1,9 +1,11 @@ -; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=verde -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,SI,SIVI %s -; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx803 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,VI,SIVI %s -; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX9_10 %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=verde -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,SI,SIVI,MUBUF %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx803 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,VI,SIVI,MUBUF %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX9_10,MUBUF,GFX9-MUBUF,GFX9_10-MUBUF %s ; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -filetype=obj -amdgpu-use-divergent-register-indexing < %s | llvm-readobj -r - | FileCheck --check-prefix=RELS %s -; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10_W32,GFX9_10 %s -; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global,+wavefrontsize64 -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10_W64,GFX9_10 %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10_W32,GFX9_10,MUBUF,GFX10_W32-MUBUF,GFX9_10-MUBUF %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global,+wavefrontsize64 -amdgpu-use-divergent-register-indexing -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10_W64,GFX9_10,MUBUF,GFX10_W64-MUBUF,GFX9_10-MUBUF %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9,GFX9_10,FLATSCR,GFX9-FLATSCR %s +; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx1030 -mattr=-flat-for-global -amdgpu-use-divergent-register-indexing -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10_W32,GFX9_10,FLATSCR,GFX10-FLATSCR,GFX9_10-FLATSCR %s ; RELS: R_AMDGPU_ABS32_LO SCRATCH_RSRC_DWORD0 0x0 ; RELS: R_AMDGPU_ABS32_LO SCRATCH_RSRC_DWORD1 0x0 @@ -14,14 +16,33 @@ ; ; GCN-LABEL: {{^}}ps_main: -; GCN-DAG: s_mov_b32 s0, SCRATCH_RSRC_DWORD0 -; GCN-DAG: s_mov_b32 s1, SCRATCH_RSRC_DWORD1 -; GCN-DAG: s_mov_b32 s2, -1 +; GFX9-FLATSCR: s_add_u32 flat_scratch_lo, s0, s2 +; GFX9-FLATSCR: s_addc_u32 flat_scratch_hi, s1, 0 + +; GFX10-FLATSCR: s_add_u32 s0, s0, s2 +; GFX10-FLATSCR: s_addc_u32 s1, s1, 0 +; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 + +; 
MUBUF-DAG: s_mov_b32 s0, SCRATCH_RSRC_DWORD0 +; MUBUF-DAG: s_mov_b32 s1, SCRATCH_RSRC_DWORD1 +; MUBUF-DAG: s_mov_b32 s2, -1 ; SI-DAG: s_mov_b32 s3, 0xe8f000 ; VI-DAG: s_mov_b32 s3, 0xe80000 -; GFX9-DAG: s_mov_b32 s3, 0xe00000 -; GFX10_W32-DAG: s_mov_b32 s3, 0x31c16000 -; GFX10_W64-DAG: s_mov_b32 s3, 0x31e16000 +; GFX9-MUBUF-DAG: s_mov_b32 s3, 0xe00000 +; GFX10_W32-MUBUF-DAG: s_mov_b32 s3, 0x31c16000 +; GFX10_W64-MUBUF-DAG: s_mov_b32 s3, 0x31e16000 + +; FLATSCR-NOT: SCRATCH_RSRC_DWORD + +; GFX9-FLATSCR: s_mov_b32 [[SP:[^,]+]], 0 +; GFX9-FLATSCR: scratch_store_dword off, v2, [[SP]] offset: +; GFX9-FLATSCR: s_mov_b32 [[SP:[^,]+]], 0 +; GFX9-FLATSCR: scratch_store_dword off, v2, [[SP]] offset: + +; GFX10-FLATSCR: scratch_store_dword off, v2, off offset: +; GFX10-FLATSCR: scratch_store_dword off, v2, off offset: + ; GCN-DAG: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0 ; GCN-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, [[BYTES]] ; GCN-NOT: s_mov_b32 s0 @@ -29,8 +50,10 @@ ; GCN-DAG: v_add{{_|_nc_}}{{i|u}}32_e32 [[HI_OFF:v[0-9]+]],{{.*}} 0x280, [[CLAMP_IDX]] ; GCN-DAG: v_add{{_|_nc_}}{{i|u}}32_e32 [[LO_OFF:v[0-9]+]],{{.*}} {{v2|0x80}}, [[CLAMP_IDX]] -; GCN: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen -; GCN: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen +; MUBUF: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen +; MUBUF: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen +; FLATSCR: scratch_load_dword {{v[0-9]+}}, [[LO_OFF]], off +; FLATSCR: scratch_load_dword {{v[0-9]+}}, [[HI_OFF]], off define amdgpu_ps float @ps_main(i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -39,10 +62,30 @@ } ; GCN-LABEL: {{^}}vs_main: -; GCN-DAG: s_mov_b32 s0, SCRATCH_RSRC_DWORD0 +; GFX9-FLATSCR: s_add_u32 flat_scratch_lo, s0, s2 +; GFX9-FLATSCR: s_addc_u32 flat_scratch_hi, s1, 0 + +; GFX10-FLATSCR: s_add_u32 s0, s0, s2 +; GFX10-FLATSCR: s_addc_u32 s1, s1, 0 +; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 + +; MUBUF-DAG: s_mov_b32 s0, SCRATCH_RSRC_DWORD0 ; GCN-NOT: s_mov_b32 s0 -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen + +; FLATSCR-NOT: SCRATCH_RSRC_DWORD + +; MUBUF: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen +; MUBUF: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen + +; GFX9-FLATSCR: s_mov_b32 [[SP:[^,]+]], 0 +; GFX9-FLATSCR: scratch_store_dword off, v2, [[SP]] offset: +; GFX9-FLATSCR: s_mov_b32 [[SP:[^,]+]], 0 +; GFX9-FLATSCR: scratch_store_dword off, v2, [[SP]] offset: + +; FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off +; FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off + define amdgpu_vs float @vs_main(i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -51,9 +94,23 @@ } ; GCN-LABEL: {{^}}cs_main: -; GCN-DAG: s_mov_b32 s0, SCRATCH_RSRC_DWORD0 -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen +; GFX9-FLATSCR: s_add_u32 flat_scratch_lo, s0, s2 +; GFX9-FLATSCR: s_addc_u32 flat_scratch_hi, s1, 0 + +; GFX10-FLATSCR: s_add_u32 s0, s0, s2 +; GFX10-FLATSCR: s_addc_u32 s1, s1, 0 +; GFX10-FLATSCR: s_setreg_b32 
hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 + +; MUBUF-DAG: s_mov_b32 s0, SCRATCH_RSRC_DWORD0 + +; FLATSCR-NOT: SCRATCH_RSRC_DWORD + +; MUBUF: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen +; MUBUF: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen + +; FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off +; FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off define amdgpu_cs float @cs_main(i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -62,15 +119,27 @@ } ; GCN-LABEL: {{^}}hs_main: +; GFX9-FLATSCR: s_add_u32 flat_scratch_lo, s0, s5 +; GFX9-FLATSCR: s_addc_u32 flat_scratch_hi, s1, 0 + +; GFX10-FLATSCR: s_add_u32 s0, s0, s5 +; GFX10-FLATSCR: s_addc_u32 s1, s1, 0 +; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 + ; SIVI: s_mov_b32 s0, SCRATCH_RSRC_DWORD0 ; SIVI-NOT: s_mov_b32 s0 ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen -; GFX9_10: s_mov_b32 s0, SCRATCH_RSRC_DWORD0 -; GFX9_10-NOT: s_mov_b32 s5 -; GFX9_10: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen -; GFX9_10: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen +; GFX9_10-MUBUF: s_mov_b32 s0, SCRATCH_RSRC_DWORD0 +; GFX9_10-NOT: s_mov_b32 s5 +; GFX9_10-MUBUF: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen +; GFX9_10-MUBUF: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen + +; FLATSCR-NOT: SCRATCH_RSRC_DWORD +; FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off +; FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off define amdgpu_hs float @hs_main(i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -79,13 +148,25 @@ } ; GCN-LABEL: {{^}}gs_main: +; GFX9-FLATSCR: s_add_u32 flat_scratch_lo, s0, s5 +; GFX9-FLATSCR: s_addc_u32 flat_scratch_hi, s1, 0 + +; GFX10-FLATSCR: s_add_u32 s0, s0, s5 +; GFX10-FLATSCR: s_addc_u32 s1, s1, 0 +; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 + ; SIVI: s_mov_b32 s0, SCRATCH_RSRC_DWORD0 ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen -; GFX9_10: s_mov_b32 s0, SCRATCH_RSRC_DWORD0 -; GFX9_10: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen -; GFX9_10: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen +; GFX9_10-MUBUF: s_mov_b32 s0, SCRATCH_RSRC_DWORD0 +; GFX9_10-MUBUF: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen +; GFX9_10-MUBUF: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen + +; FLATSCR-NOT: SCRATCH_RSRC_DWORD +; FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off +; FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off define amdgpu_gs float @gs_main(i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -99,17 +180,29 @@ ; (i.e. SI_RETURN_TO_EPILOG) can access the scratch wave offset. 
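; Commentary on the two init sequences the FLATSCR prefixes keep matching (my
; reading of the codegen, not extra test input): gfx9 still exposes
; FLAT_SCRATCH as the addressable flat_scratch_lo/flat_scratch_hi SGPR pair,
; so the wave offset is folded in with s_add_u32/s_addc_u32 directly on that
; pair; gfx10 dropped the SGPR alias, so the add is done in s[0:1] and the
; result is written to the hardware register via s_setreg_b32
; hwreg(HW_REG_FLAT_SCR_LO) / hwreg(HW_REG_FLAT_SCR_HI).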
; GCN-LABEL: {{^}}hs_ir_uses_scratch_offset: -; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-FLATSCR: s_add_u32 flat_scratch_lo, s0, s5 +; GFX9-FLATSCR: s_addc_u32 flat_scratch_hi, s1, 0 + +; GFX10-FLATSCR: s_add_u32 s0, s0, s5 +; GFX10-FLATSCR: s_addc_u32 s1, s1, 0 +; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 + +; MUBUF: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; FLATSCR-NOT: SCRATCH_RSRC_DWORD ; SIVI-NOT: s_mov_b32 s6 ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen ; GFX9_10-NOT: s_mov_b32 s5 -; GFX9_10-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen -; GFX9_10-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen +; GFX9_10-MUBUF-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen +; GFX9_10-MUBUF-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen + +; MUBUF-DAG: s_mov_b32 s2, s5 -; GCN-DAG: s_mov_b32 s2, s5 +; FLATSCR-DAG: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off +; FLATSCR-DAG: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg %swo, i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -120,15 +213,27 @@ } ; GCN-LABEL: {{^}}gs_ir_uses_scratch_offset: -; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-FLATSCR: s_add_u32 flat_scratch_lo, s0, s5 +; GFX9-FLATSCR: s_addc_u32 flat_scratch_hi, s1, 0 + +; GFX10-FLATSCR: s_add_u32 s0, s0, s5 +; GFX10-FLATSCR: s_addc_u32 s1, s1, 0 +; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0 +; GFX10-FLATSCR: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1 + +; MUBUF: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; FLATSCR-NOT: SCRATCH_RSRC_DWORD ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen ; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen -; GFX9_10-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen -; GFX9_10-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen +; GFX9_10-MUBUF-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen +; GFX9_10-MUBUF-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen + +; MUBUF-DAG: s_mov_b32 s2, s5 -; GCN-DAG: s_mov_b32 s2, s5 +; FLATSCR-DAG: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off +; FLATSCR-DAG: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg %swo, i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill.mir --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill.mir @@ -1,21 +1,28 @@ -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=CHECK -check-prefix=GCN64 %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=CHECK -check-prefix=GCN32 %s +# RUN: llc 
-mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefixes=CHECK,GCN64,MUBUF %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefixes=CHECK,GCN32,MUBUF %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-flat-scratch -run-pass=prologepilog %s -o - | FileCheck -check-prefixes=CHECK,GCN64,FLATSCR %s # CHECK-LABEL: name: check_spill +# FLATSCR: $sgpr33 = S_MOV_B32 0 +# FLATSCR: $flat_scr_lo = S_ADD_U32 $sgpr0, $sgpr11, implicit-def $scc +# FLATSCR: $flat_scr_hi = S_ADDC_U32 $sgpr1, 0, implicit-def $scc, implicit $scc + # S32 with kill # CHECK: V_WRITELANE # CHECK: $sgpr12 = S_MOV_B32 $exec_lo # CHECK: $exec_lo = S_MOV_B32 1 -# CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 4 +# MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 4 +# FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr{{[0-9]+}}, $sgpr33, 4 # CHECK: $exec_lo = S_MOV_B32 killed $sgpr12 # S32 without kill # CHECK: V_WRITELANE # CHECK: $sgpr12 = S_MOV_B32 $exec_lo # CHECK: $exec_lo = S_MOV_B32 1 -# CHECK: BUFFER_STORE_DWORD_OFFSET $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 4 +# MUBUF: BUFFER_STORE_DWORD_OFFSET $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 4 +# FLATSCR: SCRATCH_STORE_DWORD_SADDR $vgpr{{[0-9]+}}, $sgpr33, 4 # CHECK: $sgpr12 = V_READLANE # S64 with kill @@ -25,7 +32,8 @@ # GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec # GCN32: $exec_lo = S_MOV_B32 3 # GCN64: $exec = S_MOV_B64 3 -# CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 8 +# MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 8 +# FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr{{[0-9]+}}, $sgpr33, 8 # GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 # GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 @@ -36,7 +44,8 @@ # GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec # GCN32: $exec_lo = S_MOV_B32 3 # GCN64: $exec = S_MOV_B64 3 -# CHECK: BUFFER_STORE_DWORD_OFFSET $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 8 +# MUBUF: BUFFER_STORE_DWORD_OFFSET $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 8 +# FLATSCR: SCRATCH_STORE_DWORD_SADDR $vgpr{{[0-9]+}}, $sgpr33, 8 # GCN32: $exec_lo = S_MOV_B32 $sgpr12 # GCN64: $exec = S_MOV_B64 $sgpr12_sgpr13 # GCN64: $sgpr13 = V_READLANE @@ -50,7 +59,8 @@ # GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec # GCN32: $exec_lo = S_MOV_B32 7 # GCN64: $exec = S_MOV_B64 7 -# CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 16 +# MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 16 +# FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr{{[0-9]+}}, $sgpr33, 16 # GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 # GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 @@ -63,7 +73,8 @@ # GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec # GCN32: $exec_lo = S_MOV_B32 15 # GCN64: $exec = S_MOV_B64 15 -# CHECK: BUFFER_STORE_DWORD_OFFSET killed $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 28 +# MUBUF: BUFFER_STORE_DWORD_OFFSET killed $vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 28 +# FLATSCR: SCRATCH_STORE_DWORD_SADDR killed $vgpr{{[0-9]+}}, $sgpr33, 28 # GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 # GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 @@ -77,7 +88,8 @@ # GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec # GCN32: $exec_lo = S_MOV_B32 31 # GCN64: $exec = S_MOV_B64 31 -# CHECK: 
BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 44 +# MUBUF: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 44 +# FLATSCR: SCRATCH_STORE_DWORD_SADDR {{(killed )?}}$vgpr{{[0-9]+}}, $sgpr33, 44 # GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 # GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 @@ -94,7 +106,8 @@ # GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec # GCN32: $exec_lo = S_MOV_B32 255 # GCN64: $exec = S_MOV_B64 255 -# CHECK: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 64 +# MUBUF: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 64 +# FLATSCR: SCRATCH_STORE_DWORD_SADDR {{(killed )?}}$vgpr{{[0-9]+}}, $sgpr33, 64 # GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 # GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 @@ -119,7 +132,8 @@ # GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec # GCN32: $exec_lo = S_MOV_B32 65535 # GCN64: $exec = S_MOV_B64 65535 -# CHECK: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 96 +# MUBUF: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 96 +# FLATSCR: SCRATCH_STORE_DWORD_SADDR {{(killed )?}}$vgpr{{[0-9]+}}, $sgpr33, 96 # GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 # GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 @@ -160,7 +174,8 @@ # GCN64: $sgpr64_sgpr65 = S_MOV_B64 $exec # GCN32: $exec_lo = S_MOV_B32 4294967295 # GCN64: $exec = S_MOV_B64 4294967295 -# CHECK: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 160 +# MUBUF: BUFFER_STORE_DWORD_OFFSET {{(killed )?}}$vgpr{{[0-9]+}}, ${{(sgpr[0-9_]+)*}}, $sgpr33, 160 +# FLATSCR: SCRATCH_STORE_DWORD_SADDR {{(killed )?}}$vgpr{{[0-9]+}}, $sgpr33, 160 # GCN32: $exec_lo = S_MOV_B32 killed $sgpr64 # GCN64: $exec = S_MOV_B64 killed $sgpr64_sgpr65 @@ -203,11 +218,12 @@ stackPtrOffsetReg: '$sgpr32' frameOffsetReg: '$sgpr33' argumentInfo: - privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } - dispatchPtr: { reg: '$sgpr4_sgpr5' } - kernargSegmentPtr: { reg: '$sgpr6_sgpr7' } - workGroupIDX: { reg: '$sgpr8' } - privateSegmentWaveByteOffset: { reg: '$sgpr9' } + flatScratchInit: { reg: '$sgpr0_sgpr1' } + dispatchPtr: { reg: '$sgpr2_sgpr3' } + privateSegmentBuffer: { reg: '$sgpr4_sgpr5_sgpr6_sgpr7' } + kernargSegmentPtr: { reg: '$sgpr8_sgpr9' } + workGroupIDX: { reg: '$sgpr10' } + privateSegmentWaveByteOffset: { reg: '$sgpr11' } body: | bb.0: liveins: $sgpr8, $sgpr4_sgpr5, $sgpr6_sgpr7 @@ -245,10 +261,15 @@ # CHECK-LABEL: name: check_reload +# FLATSCR: $sgpr33 = S_MOV_B32 0 +# FLATSCR: $flat_scr_lo = S_ADD_U32 $sgpr0, $sgpr11, implicit-def $scc +# FLATSCR: $flat_scr_hi = S_ADDC_U32 $sgpr1, 0, implicit-def $scc, implicit $scc + # S32 # CHECK: $sgpr12 = S_MOV_B32 $exec_lo # CHECK: $exec_lo = S_MOV_B32 1 -# CHECK: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 4 +# MUBUF: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 4 +# FLATSCR: SCRATCH_LOAD_DWORD_SADDR $sgpr33, 4 # CHECK: $exec_lo = S_MOV_B32 killed $sgpr12 # CHECK: $sgpr12 = V_READLANE @@ -257,7 +278,8 @@ # GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec # GCN32: $exec_lo = S_MOV_B32 3 # GCN64: $exec = S_MOV_B64 3 -# CHECK: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 8 +# MUBUF: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 8 +# FLATSCR: SCRATCH_LOAD_DWORD_SADDR $sgpr33, 8 # GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 # GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 # CHECK: $sgpr12 = V_READLANE @@ 
-268,7 +290,8 @@ # GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec # GCN32: $exec_lo = S_MOV_B32 7 # GCN64: $exec = S_MOV_B64 7 -# CHECK: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 16 +# MUBUF: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 16 +# FLATSCR: SCRATCH_LOAD_DWORD_SADDR $sgpr33, 16 # GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 # GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 # CHECK: $sgpr12 = V_READLANE @@ -280,7 +303,8 @@ # GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec # GCN32: $exec_lo = S_MOV_B32 15 # GCN64: $exec = S_MOV_B64 15 -# CHECK: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 28 +# MUBUF: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 28 +# FLATSCR: SCRATCH_LOAD_DWORD_SADDR $sgpr33, 28 # GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 # GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 # CHECK: $sgpr12 = V_READLANE @@ -293,7 +317,8 @@ # GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec # GCN32: $exec_lo = S_MOV_B32 31 # GCN64: $exec = S_MOV_B64 31 -# CHECK: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 44 +# MUBUF: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 44 +# FLATSCR: SCRATCH_LOAD_DWORD_SADDR $sgpr33, 44 # GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 # GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 # CHECK: $sgpr12 = V_READLANE @@ -307,7 +332,8 @@ # GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec # GCN32: $exec_lo = S_MOV_B32 255 # GCN64: $exec = S_MOV_B64 255 -# CHECK: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 64 +# MUBUF: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 64 +# FLATSCR: SCRATCH_LOAD_DWORD_SADDR $sgpr33, 64 # GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 # GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 # CHECK: $sgpr12 = V_READLANE @@ -324,7 +350,8 @@ # GCN64: $sgpr12_sgpr13 = S_MOV_B64 $exec # GCN32: $exec_lo = S_MOV_B32 65535 # GCN64: $exec = S_MOV_B64 65535 -# CHECK: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 96 +# MUBUF: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 96 +# FLATSCR: SCRATCH_LOAD_DWORD_SADDR $sgpr33, 96 # GCN32: $exec_lo = S_MOV_B32 killed $sgpr12 # GCN64: $exec = S_MOV_B64 killed $sgpr12_sgpr13 # CHECK: $sgpr12 = V_READLANE @@ -349,7 +376,8 @@ # GCN64: $sgpr64_sgpr65 = S_MOV_B64 $exec # GCN32: $exec_lo = S_MOV_B32 4294967295 # GCN64: $exec = S_MOV_B64 4294967295 -# CHECK: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 160 +# MUBUF: BUFFER_LOAD_DWORD_OFFSET ${{(sgpr[0-9_]+)*}}, $sgpr33, 160 +# FLATSCR: SCRATCH_LOAD_DWORD_SADDR $sgpr33, 160 # GCN32: $exec_lo = S_MOV_B32 killed $sgpr64 # GCN64: $exec = S_MOV_B64 killed $sgpr64_sgpr65 # CHECK: $sgpr64 = V_READLANE @@ -412,11 +440,12 @@ stackPtrOffsetReg: '$sgpr32' frameOffsetReg: '$sgpr33' argumentInfo: - privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } - dispatchPtr: { reg: '$sgpr4_sgpr5' } - kernargSegmentPtr: { reg: '$sgpr6_sgpr7' } - workGroupIDX: { reg: '$sgpr8' } - privateSegmentWaveByteOffset: { reg: '$sgpr9' } + flatScratchInit: { reg: '$sgpr0_sgpr1' } + dispatchPtr: { reg: '$sgpr2_sgpr3' } + privateSegmentBuffer: { reg: '$sgpr4_sgpr5_sgpr6_sgpr7' } + kernargSegmentPtr: { reg: '$sgpr8_sgpr9' } + workGroupIDX: { reg: '$sgpr10' } + privateSegmentWaveByteOffset: { reg: '$sgpr11' } body: | bb.0: liveins: $sgpr8, $sgpr4_sgpr5, $sgpr6_sgpr7 diff --git a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll --- a/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-scavenge-offset.ll @@ -1,5 +1,7 @@ ; RUN: llc -march=amdgcn 
-mcpu=verde -enable-misched=0 -post-RA-scheduler=0 -amdgpu-spill-sgpr-to-vgpr=0 < %s | FileCheck -check-prefixes=CHECK,GFX6 %s ; RUN: llc -regalloc=basic -march=amdgcn -mcpu=tonga -enable-misched=0 -post-RA-scheduler=0 -amdgpu-spill-sgpr-to-vgpr=0 < %s | FileCheck -check-prefixes=CHECK,GFX7 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -enable-misched=0 -post-RA-scheduler=0 -amdgpu-spill-sgpr-to-vgpr=0 -amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=CHECK,GFX9-FLATSCR,FLATSCR %s +; RUN: llc -march=amdgcn -mcpu=gfx1030 -enable-misched=0 -post-RA-scheduler=0 -amdgpu-spill-sgpr-to-vgpr=0 -amdgpu-enable-flat-scratch < %s | FileCheck -check-prefixes=CHECK,GFX10-FLATSCR,FLATSCR %s ; ; There is something about Tonga that causes this test to spend a lot of time ; in the default register allocator. @@ -11,6 +13,16 @@ ; Just test that it compiles successfully. ; CHECK-LABEL: test + +; GFX9-FLATSCR: s_mov_b32 [[SOFF1:s[0-9]+]], 4{{$}} +; GFX9-FLATSCR-DAG: scratch_store_dword off, v{{[0-9]+}}, [[SOFF1]] ; 4-byte Folded Spill +; GFX9-FLATSCR-DAG: scratch_store_dword off, v{{[0-9]+}}, [[SOFF1]] offset:{{[0-9]+}} ; 4-byte Folded Spill +; GFX9-FLATSCR: s_movk_i32 [[SOFF2:s[0-9]+]], 0x{{[0-9a-f]+}}{{$}} +; GFX9-FLATSCR-DAG: scratch_load_dword v{{[0-9]+}}, off, [[SOFF2]] ; 4-byte Folded Reload +; GFX9-FLATSCR-DAG: scratch_load_dword v{{[0-9]+}}, off, [[SOFF2]] offset:{{[0-9]+}} ; 4-byte Folded Reload + +; GFX10-FLATSCR: scratch_store_dword off, v{{[0-9]+}}, off offset:{{[0-9]+}} ; 4-byte Folded Spill +; GFX10-FLATSCR: scratch_load_dword v{{[0-9]+}}, off, off offset:{{[0-9]+}} ; 4-byte Folded Reload define amdgpu_kernel void @test(<1280 x i32> addrspace(1)* %out, <1280 x i32> addrspace(1)* %in) { entry: %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) @@ -35,11 +47,17 @@ } ; CHECK-LABEL: test_limited_sgpr -; GFX6: s_add_u32 s32, s32, 0x[[OFFSET:[0-9]+]] +; GFX6: s_add_u32 s32, s32, 0x[[OFFSET:[0-9a-f]+]] ; GFX6-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[{{[0-9:]+}}], s32 -; GFX6-NEXT: s_sub_u32 s32, s32, 0x[[OFFSET:[0-9]+]] +; GFX6-NEXT: s_sub_u32 s32, s32, 0x[[OFFSET:[0-9a-f]+]] ; GFX6: NumSgprs: 48 ; GFX6: ScratchSize: 8608 + +; FLATSCR: s_movk_i32 [[SOFF1:s[0-9]+]], 0x +; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: scratch_store_dword off, v{{[0-9]+}}, [[SOFF1]] ; 4-byte Folded Spill +; FLATSCR: s_movk_i32 [[SOFF2:s[0-9]+]], 0x +; FLATSCR: scratch_load_dword v{{[0-9]+}}, off, [[SOFF2]] ; 4-byte Folded Reload define amdgpu_kernel void @test_limited_sgpr(<64 x i32> addrspace(1)* %out, <64 x i32> addrspace(1)* %in) #0 { entry: %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll --- a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefix=GCN %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefix=MUBUF %s +; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -amdgpu-enable-flat-scratch -verify-machineinstrs | FileCheck -check-prefix=FLATSCR %s ; FIXME: The MUBUF loads in this test output are incorrect, their SOffset ; should use the frame offset register, not the ABI stack pointer register. 
We @@ -13,44 +14,89 @@ ; An assert was hit when frame offset register was used to address FrameIndex. define amdgpu_kernel void @kernel_background_evaluate(float addrspace(5)* %kg, <4 x i32> addrspace(1)* %input, <4 x float> addrspace(1)* %output, i32 %i) { -; GCN-LABEL: kernel_background_evaluate: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s0, s[0:1], 0x24 -; GCN-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 -; GCN-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 -; GCN-NEXT: s_mov_b32 s38, -1 -; GCN-NEXT: s_mov_b32 s39, 0x31c16000 -; GCN-NEXT: s_add_u32 s36, s36, s3 -; GCN-NEXT: s_addc_u32 s37, s37, 0 -; GCN-NEXT: v_mov_b32_e32 v1, 0x2000 -; GCN-NEXT: v_mov_b32_e32 v2, 0x4000 -; GCN-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NEXT: v_mov_b32_e32 v4, 0x400000 -; GCN-NEXT: s_mov_b32 s32, 0xc0000 -; GCN-NEXT: v_add_nc_u32_e64 v40, 4, 0x4000 -; GCN-NEXT: ; implicit-def: $vcc_hi -; GCN-NEXT: s_getpc_b64 s[4:5] -; GCN-NEXT: s_add_u32 s4, s4, svm_eval_nodes@rel32@lo+4 -; GCN-NEXT: s_addc_u32 s5, s5, svm_eval_nodes@rel32@hi+12 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: s_mov_b64 s[0:1], s[36:37] -; GCN-NEXT: s_mov_b64 s[2:3], s[38:39] -; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GCN-NEXT: s_and_saveexec_b32 s0, vcc_lo -; GCN-NEXT: s_cbranch_execz BB0_2 -; GCN-NEXT: ; %bb.1: ; %if.then4.i -; GCN-NEXT: s_clause 0x1 -; GCN-NEXT: buffer_load_dword v0, v40, s[36:39], 0 offen -; GCN-NEXT: buffer_load_dword v1, v40, s[36:39], 0 offen offset:4 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_add_nc_u32_e32 v0, v1, v0 -; GCN-NEXT: v_mul_lo_u32 v0, 0x41c64e6d, v0 -; GCN-NEXT: v_add_nc_u32_e32 v0, 0x3039, v0 -; GCN-NEXT: buffer_store_dword v0, v0, s[36:39], 0 offen -; GCN-NEXT: BB0_2: ; %shader_eval_surface.exit -; GCN-NEXT: s_endpgm +; MUBUF-LABEL: kernel_background_evaluate: +; MUBUF: ; %bb.0: ; %entry +; MUBUF-NEXT: s_load_dword s0, s[0:1], 0x24 +; MUBUF-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; MUBUF-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; MUBUF-NEXT: s_mov_b32 s38, -1 +; MUBUF-NEXT: s_mov_b32 s39, 0x31c16000 +; MUBUF-NEXT: s_add_u32 s36, s36, s3 +; MUBUF-NEXT: s_addc_u32 s37, s37, 0 +; MUBUF-NEXT: v_mov_b32_e32 v1, 0x2000 +; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000 +; MUBUF-NEXT: v_mov_b32_e32 v3, 0 +; MUBUF-NEXT: v_mov_b32_e32 v4, 0x400000 +; MUBUF-NEXT: s_mov_b32 s32, 0xc0000 +; MUBUF-NEXT: v_add_nc_u32_e64 v40, 4, 0x4000 +; MUBUF-NEXT: ; implicit-def: $vcc_hi +; MUBUF-NEXT: s_getpc_b64 s[4:5] +; MUBUF-NEXT: s_add_u32 s4, s4, svm_eval_nodes@rel32@lo+4 +; MUBUF-NEXT: s_addc_u32 s5, s5, svm_eval_nodes@rel32@hi+12 +; MUBUF-NEXT: s_waitcnt lgkmcnt(0) +; MUBUF-NEXT: v_mov_b32_e32 v0, s0 +; MUBUF-NEXT: s_mov_b64 s[0:1], s[36:37] +; MUBUF-NEXT: s_mov_b64 s[2:3], s[38:39] +; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5] +; MUBUF-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; MUBUF-NEXT: s_and_saveexec_b32 s0, vcc_lo +; MUBUF-NEXT: s_cbranch_execz BB0_2 +; MUBUF-NEXT: ; %bb.1: ; %if.then4.i +; MUBUF-NEXT: s_clause 0x1 +; MUBUF-NEXT: buffer_load_dword v0, v40, s[36:39], 0 offen +; MUBUF-NEXT: buffer_load_dword v1, v40, s[36:39], 0 offen offset:4 +; MUBUF-NEXT: s_waitcnt vmcnt(0) +; MUBUF-NEXT: v_add_nc_u32_e32 v0, v1, v0 +; MUBUF-NEXT: v_mul_lo_u32 v0, 0x41c64e6d, v0 +; MUBUF-NEXT: v_add_nc_u32_e32 v0, 0x3039, v0 +; MUBUF-NEXT: buffer_store_dword v0, v0, s[36:39], 0 offen +; MUBUF-NEXT: BB0_2: ; %shader_eval_surface.exit +; MUBUF-NEXT: s_endpgm +; +; FLATSCR-LABEL: kernel_background_evaluate: +; FLATSCR: ; %bb.0: ; %entry +; FLATSCR-NEXT: s_add_u32 s2, 
s2, s5 +; FLATSCR-NEXT: s_movk_i32 s32, 0x6000 +; FLATSCR-NEXT: s_addc_u32 s3, s3, 0 +; FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 +; FLATSCR-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 +; FLATSCR-NEXT: s_load_dword s0, s[0:1], 0x24 +; FLATSCR-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; FLATSCR-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; FLATSCR-NEXT: s_mov_b32 s38, -1 +; FLATSCR-NEXT: s_mov_b32 s39, 0x31c16000 +; FLATSCR-NEXT: s_add_u32 s36, s36, s5 +; FLATSCR-NEXT: s_addc_u32 s37, s37, 0 +; FLATSCR-NEXT: v_mov_b32_e32 v1, 0x2000 +; FLATSCR-NEXT: v_mov_b32_e32 v2, 0x4000 +; FLATSCR-NEXT: v_mov_b32_e32 v3, 0 +; FLATSCR-NEXT: v_mov_b32_e32 v4, 0x400000 +; FLATSCR-NEXT: ; implicit-def: $vcc_hi +; FLATSCR-NEXT: s_getpc_b64 s[4:5] +; FLATSCR-NEXT: s_add_u32 s4, s4, svm_eval_nodes@rel32@lo+4 +; FLATSCR-NEXT: s_addc_u32 s5, s5, svm_eval_nodes@rel32@hi+12 +; FLATSCR-NEXT: s_waitcnt lgkmcnt(0) +; FLATSCR-NEXT: v_mov_b32_e32 v0, s0 +; FLATSCR-NEXT: s_mov_b64 s[0:1], s[36:37] +; FLATSCR-NEXT: s_mov_b64 s[2:3], s[38:39] +; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[4:5] +; FLATSCR-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; FLATSCR-NEXT: s_and_saveexec_b32 s0, vcc_lo +; FLATSCR-NEXT: s_cbranch_execz BB0_2 +; FLATSCR-NEXT: ; %bb.1: ; %if.then4.i +; FLATSCR-NEXT: s_movk_i32 vcc_lo, 0x4000 +; FLATSCR-NEXT: s_nop 1 +; FLATSCR-NEXT: scratch_load_dword v0, off, vcc_lo offset:4 +; FLATSCR-NEXT: s_waitcnt_depctr 0xffe3 +; FLATSCR-NEXT: s_movk_i32 vcc_lo, 0x4000 +; FLATSCR-NEXT: scratch_load_dword v1, off, vcc_lo offset:8 +; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: v_add_nc_u32_e32 v0, v1, v0 +; FLATSCR-NEXT: v_mul_lo_u32 v0, 0x41c64e6d, v0 +; FLATSCR-NEXT: v_add_nc_u32_e32 v0, 0x3039, v0 +; FLATSCR-NEXT: scratch_store_dword off, v0, s0 +; FLATSCR-NEXT: BB0_2: ; %shader_eval_surface.exit +; FLATSCR-NEXT: s_endpgm entry: %sd = alloca < 1339 x i32>, align 8192, addrspace(5) %state = alloca <4 x i32>, align 16, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/store-hi16.ll b/llvm/test/CodeGen/AMDGPU/store-hi16.ll --- a/llvm/test/CodeGen/AMDGPU/store-hi16.ll +++ b/llvm/test/CodeGen/AMDGPU/store-hi16.ll @@ -1,6 +1,7 @@ -; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX900,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX900,GFX9,GFX900-MUBUF %s ; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca,+sram-ecc -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX906,GFX9,NO-D16-HI %s ; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX803,NO-D16-HI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -amdgpu-enable-flat-scratch -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX900,GFX9,GFX900-FLATSCR %s ; GCN-LABEL: {{^}}store_global_hi_v2i16: ; GCN: s_waitcnt @@ -389,7 +390,8 @@ ; GCN-LABEL: {{^}}store_private_hi_v2i16: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen{{$}} +; GFX900-MUBUF-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen{{$}} +; GFX900-FLATSCR-NEXT: scratch_store_short_d16_hi v0, v1, off ; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1 ; NO-D16-HI: 
buffer_store_short v1, v0, s[0:3], 0 offen{{$}} @@ -408,7 +410,8 @@ ; GCN-LABEL: {{^}}store_private_hi_v2f16: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen{{$}} +; GFX900-MUBUF-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen{{$}} +; GFX900-FLATSCR-NEXT: scratch_store_short_d16_hi v0, v1, off{{$}} ; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1 ; NO-D16-HI: buffer_store_short v1, v0, s[0:3], 0 offen{{$}} @@ -427,7 +430,8 @@ ; GCN-LABEL: {{^}}store_private_hi_i32_shift: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen{{$}} +; GFX900-MUBUF-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen{{$}} +; GFX900-FLATSCR-NEXT: scratch_store_short_d16_hi v0, v1, off{{$}} ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; NO-D16-HI-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen{{$}} @@ -445,7 +449,8 @@ ; GCN-LABEL: {{^}}store_private_hi_v2i16_i8: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], 0 offen{{$}} +; GFX900-MUBUF-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], 0 offen{{$}} +; GFX900-FLATSCR-NEXT: scratch_store_byte_d16_hi v0, v1, off{{$}} ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; NO-D16-HI-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen{{$}} @@ -464,7 +469,8 @@ ; GCN-LABEL: {{^}}store_private_hi_i8_shift: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], 0 offen{{$}} +; GFX900-MUBUF-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], 0 offen{{$}} +; GFX900-FLATSCR-NEXT: scratch_store_byte_d16_hi v0, v1, off{{$}} ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; NO-D16-HI-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen{{$}} @@ -481,7 +487,8 @@ ; GCN-LABEL: {{^}}store_private_hi_v2i16_max_offset: ; GCN: s_waitcnt -; GFX900: buffer_store_short_d16_hi v0, off, s[0:3], s32 offset:4094{{$}} +; GFX900-MUBUF: buffer_store_short_d16_hi v0, off, s[0:3], s32 offset:4094{{$}} +; GFX900-FLATSCR: scratch_store_short_d16_hi off, v0, s32 offset:4094{{$}} ; NO-D16-HI: v_lshrrev_b32_e32 v0, 16, v0 ; NO-D16-HI-NEXT: buffer_store_short v0, off, s[0:3], s32 offset:4094{{$}} @@ -502,7 +509,9 @@ ; GCN-LABEL: {{^}}store_private_hi_v2i16_nooff: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], 0{{$}} +; GFX900-MUBUF-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], 0{{$}} +; GFX900-FLATSCR-NEXT: s_mov_b32 [[SOFF:s[0-9]+]], 0 +; GFX900-FLATSCR-NEXT: scratch_store_short_d16_hi off, v0, [[SOFF]]{{$}} ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; NO-D16-HI-NEXT: buffer_store_short v0, off, s[0:3], 0{{$}} @@ -522,7 +531,9 @@ ; GCN-LABEL: {{^}}store_private_hi_v2i16_i8_nooff: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_store_byte_d16_hi v0, off, s[0:3], 0{{$}} +; GFX900-MUBUF-NEXT: buffer_store_byte_d16_hi v0, off, s[0:3], 0{{$}} +; GFX900-FLATSCR-NEXT: s_mov_b32 [[SOFF:s[0-9]+]], 0 +; GFX900-FLATSCR-NEXT: scratch_store_byte_d16_hi off, v0, [[SOFF]]{{$}} ; NO-D16-HI: v_lshrrev_b32_e32 v0, 16, v0 ; NO-D16-HI: buffer_store_byte v0, off, s[0:3], 0{{$}} @@ -634,8 +645,10 @@ ; GCN-LABEL: {{^}}store_private_hi_v2i16_to_offset: ; GCN: s_waitcnt -; GFX900: buffer_store_dword -; GFX900-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s32 offset:4094 +; GFX900-MUBUF: buffer_store_dword +; GFX900-MUBUF-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s32 offset:4094 +; GFX900-FLATSCR: scratch_store_dword +; GFX900-FLATSCR-NEXT: scratch_store_short_d16_hi off, v0, s32 offset:4094 define void @store_private_hi_v2i16_to_offset(i32 %arg) #0 { entry: %obj0 = alloca [10 x i32], align 4, 
addrspace(5) @@ -651,8 +664,10 @@ ; GCN-LABEL: {{^}}store_private_hi_v2i16_i8_to_offset: ; GCN: s_waitcnt -; GFX900: buffer_store_dword -; GFX900-NEXT: buffer_store_byte_d16_hi v0, off, s[0:3], s32 offset:4095 +; GFX900-MUBUF: buffer_store_dword +; GFX900-MUBUF-NEXT: buffer_store_byte_d16_hi v0, off, s[0:3], s32 offset:4095 +; GFX900-FLATSCR: scratch_store_dword +; GFX900-FLATSCR-NEXT: scratch_store_byte_d16_hi off, v0, s32 offset:4095 define void @store_private_hi_v2i16_i8_to_offset(i32 %arg) #0 { entry: %obj0 = alloca [10 x i32], align 4, addrspace(5)