Index: lib/Target/R600/AMDGPU.h =================================================================== --- lib/Target/R600/AMDGPU.h +++ lib/Target/R600/AMDGPU.h @@ -76,33 +76,34 @@ GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0). CONSTANT_ADDRESS = 2, ///< Address space for constant memory LOCAL_ADDRESS = 3, ///< Address space for local memory. - REGION_ADDRESS = 4, ///< Address space for region memory. - ADDRESS_NONE = 5, ///< Address space for unknown memory. - PARAM_D_ADDRESS = 6, ///< Address space for direct addressible parameter memory (CONST0) - PARAM_I_ADDRESS = 7, ///< Address space for indirect addressible parameter memory (VTX1) + FLAT_ADDRESS = 4, ///< Address space for flat accesses to local, private or global. + REGION_ADDRESS = 5, ///< Address space for region memory. + ADDRESS_NONE = 6, ///< Address space for unknown memory. + PARAM_D_ADDRESS = 7, ///< Address space for direct addressible parameter memory (CONST0) + PARAM_I_ADDRESS = 8, ///< Address space for indirect addressible parameter memory (VTX1) // Do not re-order the CONSTANT_BUFFER_* enums. Several places depend on this // order to be able to dynamically index a constant buffer, for example: // // ConstantBufferAS = CONSTANT_BUFFER_0 + CBIdx - CONSTANT_BUFFER_0 = 8, - CONSTANT_BUFFER_1 = 9, - CONSTANT_BUFFER_2 = 10, - CONSTANT_BUFFER_3 = 11, - CONSTANT_BUFFER_4 = 12, - CONSTANT_BUFFER_5 = 13, - CONSTANT_BUFFER_6 = 14, - CONSTANT_BUFFER_7 = 15, - CONSTANT_BUFFER_8 = 16, - CONSTANT_BUFFER_9 = 17, - CONSTANT_BUFFER_10 = 18, - CONSTANT_BUFFER_11 = 19, - CONSTANT_BUFFER_12 = 20, - CONSTANT_BUFFER_13 = 21, - CONSTANT_BUFFER_14 = 22, - CONSTANT_BUFFER_15 = 23, - LAST_ADDRESS = 24 + CONSTANT_BUFFER_0 = 9, + CONSTANT_BUFFER_1 = 10, + CONSTANT_BUFFER_2 = 11, + CONSTANT_BUFFER_3 = 12, + CONSTANT_BUFFER_4 = 13, + CONSTANT_BUFFER_5 = 14, + CONSTANT_BUFFER_6 = 15, + CONSTANT_BUFFER_7 = 16, + CONSTANT_BUFFER_8 = 17, + CONSTANT_BUFFER_9 = 18, + CONSTANT_BUFFER_10 = 19, + CONSTANT_BUFFER_11 = 20, + CONSTANT_BUFFER_12 = 21, + CONSTANT_BUFFER_13 = 22, + CONSTANT_BUFFER_14 = 23, + CONSTANT_BUFFER_15 = 24, + LAST_ADDRESS = 25 }; } // namespace AMDGPUAS Index: lib/Target/R600/AMDGPU.td =================================================================== --- lib/Target/R600/AMDGPU.td +++ lib/Target/R600/AMDGPU.td @@ -68,6 +68,11 @@ "true", "GPU has CF_ALU bug">; +def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space", + "FlatAddressSpace", + "true", + "Support flat address space">; + class SubtargetFeatureFetchLimit : SubtargetFeature <"fetch"#Value, "TexVTXClauseSize", @@ -108,7 +113,7 @@ [Feature64BitPtr, FeatureFP64]>; def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS", - [Feature64BitPtr, FeatureFP64]>; + [Feature64BitPtr, FeatureFP64, FeatureFlatAddressSpace]>; //===----------------------------------------------------------------------===// def AMDGPUInstrInfo : InstrInfo { Index: lib/Target/R600/AMDGPUAsmPrinter.h =================================================================== --- lib/Target/R600/AMDGPUAsmPrinter.h +++ lib/Target/R600/AMDGPUAsmPrinter.h @@ -24,15 +24,19 @@ class AMDGPUAsmPrinter : public AsmPrinter { private: struct SIProgramInfo { - SIProgramInfo() : NumSGPR(0), NumVGPR(0) {} + SIProgramInfo() : NumSGPR(0), + NumVGPR(0), + VCCUsed(false), + FlatUsed(false) {} unsigned NumSGPR; unsigned NumVGPR; + bool VCCUsed; + bool FlatUsed; }; void getSIProgramInfo(SIProgramInfo &Out, MachineFunction &MF) const; - void findNumUsedRegistersSI(MachineFunction &MF, - unsigned 
&NumSGPR, - unsigned &NumVGPR) const; + void findUsedRegistersSI(MachineFunction &MF, + SIProgramInfo &Out) const; /// \brief Emit register usage information so that the GPU driver /// can correctly setup the GPU state. Index: lib/Target/R600/AMDGPUAsmPrinter.cpp =================================================================== --- lib/Target/R600/AMDGPUAsmPrinter.cpp +++ lib/Target/R600/AMDGPUAsmPrinter.cpp @@ -66,7 +66,7 @@ const AMDGPUSubtarget &STM = TM.getSubtarget(); SIProgramInfo KernelInfo; if (STM.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) { - findNumUsedRegistersSI(MF, KernelInfo.NumSGPR, KernelInfo.NumVGPR); + findUsedRegistersSI(MF, KernelInfo); EmitProgramInfoSI(MF, KernelInfo); } else { EmitProgramInfoR600(MF); @@ -186,14 +186,14 @@ } } -void AMDGPUAsmPrinter::findNumUsedRegistersSI(MachineFunction &MF, - unsigned &NumSGPR, - unsigned &NumVGPR) const { +void AMDGPUAsmPrinter::findUsedRegistersSI(MachineFunction &MF, + SIProgramInfo &Out) const { unsigned MaxSGPR = 0; unsigned MaxVGPR = 0; bool VCCUsed = false; - const SIRegisterInfo * RI = - static_cast(TM.getRegisterInfo()); + bool FlatUsed = false; + const SIRegisterInfo *RI + = static_cast(TM.getRegisterInfo()); for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); BB != BB_E; ++BB) { @@ -215,6 +215,10 @@ if (reg == AMDGPU::VCC) { VCCUsed = true; continue; + } else if (reg == AMDGPU::FLAT_SCRATCH_SIZE || + reg == AMDGPU::FLAT_SCRATCH_OFFSET) { + FlatUsed = true; + continue; } switch (reg) { @@ -275,13 +279,18 @@ if (VCCUsed) MaxSGPR += 2; - NumSGPR = MaxSGPR; - NumVGPR = MaxVGPR; + if (FlatUsed) + MaxSGPR += 2; + + Out.NumSGPR = MaxSGPR; + Out.NumVGPR = MaxVGPR; + Out.VCCUsed = VCCUsed; + Out.FlatUsed = FlatUsed; } void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &Out, MachineFunction &MF) const { - findNumUsedRegistersSI(MF, Out.NumSGPR, Out.NumVGPR); + findUsedRegistersSI(MF, Out); } void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF, @@ -316,6 +325,7 @@ if (MFI->ShaderType == ShaderType::COMPUTE) { OutStreamer.EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4); OutStreamer.EmitIntValue(S_00B84C_LDS_SIZE(LDSBlocks), 4); + // TODO: Should probably note flat usage somewhere } if (MFI->ShaderType == ShaderType::PIXEL) { OutStreamer.EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4); Index: lib/Target/R600/AMDGPUISelDAGToDAG.cpp =================================================================== --- lib/Target/R600/AMDGPUISelDAGToDAG.cpp +++ lib/Target/R600/AMDGPUISelDAGToDAG.cpp @@ -61,10 +61,12 @@ SDValue SimplifyI24(SDValue &Op); bool SelectI24(SDValue Addr, SDValue &Op); bool SelectU24(SDValue Addr, SDValue &Op); + SDNode *SelectAddrSpaceCast(SDNode *N); static bool checkType(const Value *ptr, unsigned int addrspace); static bool isGlobalStore(const StoreSDNode *N); + static bool isFlatStore(const StoreSDNode *N); static bool isPrivateStore(const StoreSDNode *N); static bool isLocalStore(const StoreSDNode *N); static bool isRegionStore(const StoreSDNode *N); @@ -72,6 +74,7 @@ bool isCPLoad(const LoadSDNode *N) const; bool isConstantLoad(const LoadSDNode *N, int cbID) const; bool isGlobalLoad(const LoadSDNode *N) const; + bool isFlatLoad(const LoadSDNode *N) const; bool isParamLoad(const LoadSDNode *N) const; bool isPrivateLoad(const LoadSDNode *N) const; bool isLocalLoad(const LoadSDNode *N) const; @@ -343,6 +346,9 @@ CurDAG->getVTList(MVT::Other), Ops); } + + case ISD::ADDRSPACECAST: + return SelectAddrSpaceCast(N); } return SelectCode(N); } @@ -370,6 +376,10 @@ return 
checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS); } +bool AMDGPUDAGToDAGISel::isFlatStore(const StoreSDNode *N) { + return checkType(N->getSrcValue(), AMDGPUAS::FLAT_ADDRESS); +} + bool AMDGPUDAGToDAGISel::isRegionStore(const StoreSDNode *N) { return checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS); } @@ -400,6 +410,10 @@ return checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS); } +bool AMDGPUDAGToDAGISel::isFlatLoad(const LoadSDNode *N) const { + return checkType(N->getSrcValue(), AMDGPUAS::FLAT_ADDRESS); +} + bool AMDGPUDAGToDAGISel::isRegionLoad(const LoadSDNode *N) const { return checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS); } @@ -428,6 +442,7 @@ } if (!checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS) && !checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS) + && !checkType(N->getSrcValue(), AMDGPUAS::FLAT_ADDRESS) && !checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS) && !checkType(N->getSrcValue(), AMDGPUAS::CONSTANT_ADDRESS) && !checkType(N->getSrcValue(), AMDGPUAS::PARAM_D_ADDRESS) @@ -558,6 +573,58 @@ return false; } +SDNode *AMDGPUDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { + AddrSpaceCastSDNode *ASC = cast(N); + SDLoc DL(N); + + assert(Subtarget.hasFlatAddressSpace() && + "addrspacecast only supported with flat address space!"); + + assert((ASC->getSrcAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS && + ASC->getDestAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS) && + "Cannot cast address space to / from constant address!"); + + assert((ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS || + ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) && + "Can only cast to / from flat address space!"); + + // The flat instructions read the address as the index of the VGPR holding the + // address, so casting should just be reinterpreting the base VGPR, so just + // insert trunc / bitcast / zext. + + SDValue Src = ASC->getOperand(0); + EVT DestVT = ASC->getValueType(0); + EVT SrcVT = Src.getValueType(); + + unsigned SrcSize = SrcVT.getSizeInBits(); + unsigned DestSize = DestVT.getSizeInBits(); + + if (SrcSize > DestSize) { + assert(SrcSize == 64 && DestSize == 32); + return CurDAG->getMachineNode( + TargetOpcode::EXTRACT_SUBREG, + DL, + DestVT, + Src, + CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32)); + } + + + if (DestSize > SrcSize) { + assert(SrcSize == 32 && DestSize == 64); + return CurDAG->getMachineNode( + TargetOpcode::SUBREG_TO_REG, + DL, + DestVT, + CurDAG->getTargetConstant(0, MVT::i32), + Src, + CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32)); + } + + assert(SrcSize == 64 && DestSize == 64); + return CurDAG->getNode(ISD::BITCAST, DL, DestVT, Src).getNode(); +} + void AMDGPUDAGToDAGISel::PostprocessISelDAG() { const AMDGPUTargetLowering& Lowering = (*(const AMDGPUTargetLowering*)getTargetLowering()); Index: lib/Target/R600/AMDGPUInstrInfo.h =================================================================== --- lib/Target/R600/AMDGPUInstrInfo.h +++ lib/Target/R600/AMDGPUInstrInfo.h @@ -100,6 +100,7 @@ MachineInstr *MI, const SmallVectorImpl &Ops, MachineInstr *LoadMI) const; +public: /// \returns the smallest register index that will be accessed by an indirect /// read or write or -1 if indirect addressing is not used by this program. virtual int getIndirectIndexBegin(const MachineFunction &MF) const; @@ -108,7 +109,6 @@ /// read or write or -1 if indirect addressing is not used by this program. 
virtual int getIndirectIndexEnd(const MachineFunction &MF) const; -public: bool canFoldMemoryOperand(const MachineInstr *MI, const SmallVectorImpl &Ops) const; bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, Index: lib/Target/R600/AMDGPUInstructions.td =================================================================== --- lib/Target/R600/AMDGPUInstructions.td +++ lib/Target/R600/AMDGPUInstructions.td @@ -133,6 +133,14 @@ return isGlobalLoad(dyn_cast(N)); }]>; +def az_extloadi8_flat : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{ + return isFlatLoad(dyn_cast(N)); +}]>; + +def sextloadi8_flat : PatFrag<(ops node:$ptr), (sextloadi8 node:$ptr), [{ + return isFlatLoad(dyn_cast(N)); +}]>; + def az_extloadi8_constant : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr), [{ return isConstantLoad(dyn_cast(N), -1); }]>; @@ -161,6 +169,14 @@ return isGlobalLoad(dyn_cast(N)); }]>; +def az_extloadi16_flat : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{ + return isFlatLoad(dyn_cast(N)); +}]>; + +def sextloadi16_flat : PatFrag<(ops node:$ptr), (sextloadi16 node:$ptr), [{ + return isFlatLoad(dyn_cast(N)); +}]>; + def az_extloadi16_constant : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr), [{ return isConstantLoad(dyn_cast(N), -1); }]>; @@ -186,6 +202,11 @@ return isGlobalLoad(dyn_cast(N)); }]>; +def az_extloadi32_flat : PatFrag<(ops node:$ptr), + (az_extloadi32 node:$ptr), [{ + return isFlatLoad(dyn_cast(N)); +}]>; + def az_extloadi32_constant : PatFrag<(ops node:$ptr), (az_extloadi32 node:$ptr), [{ return isConstantLoad(dyn_cast(N), -1); @@ -201,6 +222,16 @@ return isGlobalStore(dyn_cast(N)); }]>; +def truncstorei8_flat : PatFrag<(ops node:$val, node:$ptr), + (truncstorei8 node:$val, node:$ptr), [{ + return isFlatStore(dyn_cast(N)); +}]>; + +def truncstorei16_flat : PatFrag<(ops node:$val, node:$ptr), + (truncstorei16 node:$val, node:$ptr), [{ + return isFlatStore(dyn_cast(N)); +}]>; + def local_store : PatFrag<(ops node:$val, node:$ptr), (store node:$val, node:$ptr), [{ return isLocalStore(dyn_cast(N)); @@ -235,6 +266,11 @@ return dyn_cast(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; }]>; +def mskor_flat : PatFrag<(ops node:$val, node:$ptr), + (AMDGPUstore_mskor node:$val, node:$ptr), [{ + return dyn_cast(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS; +}]>; + class Constants { int TWO_PI = 0x40c90fdb; int PI = 0x40490fdb; Index: lib/Target/R600/AMDGPUMachineFunction.h =================================================================== --- lib/Target/R600/AMDGPUMachineFunction.h +++ lib/Target/R600/AMDGPUMachineFunction.h @@ -28,6 +28,8 @@ std::map LocalMemoryObjects; /// Number of bytes in the LDS that are being used. 
unsigned LDSSize; + unsigned ScratchSize; + bool IsKernel; }; } Index: lib/Target/R600/AMDGPUMachineFunction.cpp =================================================================== --- lib/Target/R600/AMDGPUMachineFunction.cpp +++ lib/Target/R600/AMDGPUMachineFunction.cpp @@ -10,9 +10,11 @@ void AMDGPUMachineFunction::anchor() {} AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) : - MachineFunctionInfo() { - ShaderType = ShaderType::COMPUTE; - LDSSize = 0; + MachineFunctionInfo(), + ShaderType(ShaderType::COMPUTE), + LDSSize(0), + ScratchSize(0), + IsKernel(true) { AttributeSet Set = MF.getFunction()->getAttributes(); Attribute A = Set.getAttribute(AttributeSet::FunctionIndex, ShaderTypeAttribute); Index: lib/Target/R600/AMDGPUSubtarget.h =================================================================== --- lib/Target/R600/AMDGPUSubtarget.h +++ lib/Target/R600/AMDGPUSubtarget.h @@ -49,6 +49,7 @@ enum Generation Gen; bool FP64; bool CaymanISA; + bool FlatAddressSpace; bool EnableIRStructurizer; bool EnableIfCvt; unsigned WavefrontSize; @@ -68,6 +69,9 @@ enum Generation getGeneration() const; bool hasHWFP64() const; bool hasCaymanISA() const; + bool hasFlatAddressSpace() const { + return FlatAddressSpace; + } bool IsIRStructurizerEnabled() const; bool isIfCvtEnabled() const; unsigned getWavefrontSize() const; Index: lib/Target/R600/AMDGPUSubtarget.cpp =================================================================== --- lib/Target/R600/AMDGPUSubtarget.cpp +++ lib/Target/R600/AMDGPUSubtarget.cpp @@ -36,6 +36,7 @@ Gen = AMDGPUSubtarget::R600; FP64 = false; CaymanISA = false; + FlatAddressSpace = false; EnableIRStructurizer = true; EnableIfCvt = true; WavefrontSize = 0; Index: lib/Target/R600/AMDGPUTargetMachine.cpp =================================================================== --- lib/Target/R600/AMDGPUTargetMachine.cpp +++ lib/Target/R600/AMDGPUTargetMachine.cpp @@ -53,8 +53,9 @@ std::string Ret = "e-p:32:32"; if (ST.is64bit()) { - // 32-bit private, local, and region pointers. 64-bit global and constant. - Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:64:64"; + // 32-bit private, local, and region pointers. 64-bit global, flat and + // constant. 
+ Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p6:64:64"; } Ret += "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256" Index: lib/Target/R600/AMDILInstrInfo.td =================================================================== --- lib/Target/R600/AMDILInstrInfo.td +++ lib/Target/R600/AMDILInstrInfo.td @@ -74,6 +74,12 @@ return isGlobalStore(dyn_cast(N)); }]>; +def flat_store : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return isFlatStore(dyn_cast(N)); +}]>; + + //===----------------------------------------------------------------------===// // Load pattern fragments //===----------------------------------------------------------------------===// @@ -81,6 +87,10 @@ def global_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ return isGlobalLoad(dyn_cast(N)); }]>; +// Flat address space loads +def flat_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return isFlatLoad(dyn_cast(N)); +}]>; // Constant address space loads def constant_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ return isConstantLoad(dyn_cast(N), -1); Index: lib/Target/R600/SIInstrFormats.td =================================================================== --- lib/Target/R600/SIInstrFormats.td +++ lib/Target/R600/SIInstrFormats.td @@ -446,6 +446,36 @@ let MIMG = 1; } +class FLAT op, dag outs, dag ins, string asm, list pattern> : + Enc64 { + bits<8> addr; + bits<8> data; + bits<8> vdst; + bits<1> slc; + bits<1> glc; + bits<1> tfe; + + // 15-0 is reserved. + let Inst{16} = glc; + let Inst{17} = slc; + let Inst{24-18} = op; + let Inst{31-26} = 0x37; // Encoding. + let Inst{39-32} = addr; + let Inst{47-40} = data; + // 54-48 is reserved. + let Inst{55} = tfe; + let Inst{63-56} = vdst; + + // Internally, FLAT instruction are executed as both an LDS and a + // Buffer instruction; so, they increment both VM_CNT and LGKM_CNT + // and are not considered done until both have been decremented. + let VM_CNT = 1; + let EXP_CNT = 1; // XXX - Need this? + let LGKM_CNT = 1; + + let neverHasSideEffects = 1; +} + def EXP : Enc64< (outs), (ins i32imm:$en, i32imm:$tgt, i32imm:$compr, i32imm:$done, i32imm:$vm, Index: lib/Target/R600/SIInstrInfo.cpp =================================================================== --- lib/Target/R600/SIInstrInfo.cpp +++ lib/Target/R600/SIInstrInfo.cpp @@ -383,6 +383,11 @@ if (!MO.isImplicit() && MO.getReg() == AMDGPU::EXEC) ++ConstantBusCount; + // XXX - I'm sort of guessing about this. 
+ if (!MO.isImplicit() && (MO.getReg() == AMDGPU::FLAT_SCRATCH_SIZE || + MO.getReg() == AMDGPU::FLAT_SCRATCH_OFFSET)) + ++ConstantBusCount; + // SGPRs use the constant bus if (MO.getReg() == AMDGPU::M0 || MO.getReg() == AMDGPU::VCC || (!MO.isImplicit() && Index: lib/Target/R600/SIInstrInfo.td =================================================================== --- lib/Target/R600/SIInstrInfo.td +++ lib/Target/R600/SIInstrInfo.td @@ -132,6 +132,8 @@ def SIOperand { int ZERO = 0x80; int VCC = 0x6A; + int FLAT_SCRATCH_OFFSET = 0x68; + int FLAT_SCRATCH_SIZE = 0x69; } include "SIInstrFormats.td" @@ -490,6 +492,31 @@ let soffset = 128; // ZERO } +class FLAT_Load_Helper op, string asm, RegisterClass regClass> : + FLAT { + let glc = 0; + let slc = 0; + let tfe = 0; + let mayLoad = 1; + let Uses = [EXEC, M0, FLAT_SCRATCH_OFFSET, FLAT_SCRATCH_SIZE]; +} + +class FLAT_Store_Helper op, string name, RegisterClass vdataClass> : + FLAT { + + let mayLoad = 0; + let mayStore = 1; + + // Encoding + let glc = 0; + let slc = 0; + let tfe = 0; +} + class MTBUF_Load_Helper op, string asm, RegisterClass regClass> : MTBUF < op, (outs regClass:$dst), Index: lib/Target/R600/SIInstructions.td =================================================================== --- lib/Target/R600/SIInstructions.td +++ lib/Target/R600/SIInstructions.td @@ -29,6 +29,11 @@ def isSI : Predicate<"Subtarget.getGeneration() " ">= AMDGPUSubtarget::SOUTHERN_ISLANDS">; +def isCI : Predicate<"Subtarget.getGeneration() " + ">= AMDGPUSubtarget::SEA_ISLANDS">; +def HasFlatAddressSpace : Predicate<"Subtarget.hasFlatAddressSpace()">; + + def WAIT_FLAG : InstFlag<"printWaitFlag">; let Predicates = [isSI] in { @@ -491,6 +496,78 @@ def TBUFFER_STORE_FORMAT_XYZ : MTBUF_Store_Helper <0x00000006, "TBUFFER_STORE_FORMAT_XYZ", VReg_128>; def TBUFFER_STORE_FORMAT_XYZW : MTBUF_Store_Helper <0x00000007, "TBUFFER_STORE_FORMAT_XYZW", VReg_128>; +let Predicates = [HasFlatAddressSpace] in { +def FLAT_LOAD_UBYTE : FLAT_Load_Helper <0x00000008, "FLAT_LOAD_UBYTE", VReg_32>; +def FLAT_LOAD_SBYTE : FLAT_Load_Helper <0x00000009, "FLAT_LOAD_SBYTE", VReg_32>; +def FLAT_LOAD_USHORT : FLAT_Load_Helper <0x0000000a, "FLAT_LOAD_USHORT", VReg_32>; +def FLAT_LOAD_SSHORT : FLAT_Load_Helper <0x0000000b, "FLAT_LOAD_SSHORT", VReg_32>; +def FLAT_LOAD_DWORD : FLAT_Load_Helper <0x0000000c, "FLAT_LOAD_DWORD", VReg_32>; +def FLAT_LOAD_DWORDX2 : FLAT_Load_Helper <0x0000000d, "FLAT_LOAD_DWORDX2", VReg_64>; +def FLAT_LOAD_DWORDX4 : FLAT_Load_Helper <0x0000000e, "FLAT_LOAD_DWORDX4", VReg_128>; +def FLAT_LOAD_DWORDX3 : FLAT_Load_Helper <0x00000010, "FLAT_LOAD_DWORDX3", VReg_96>; + +def FLAT_STORE_BYTE : FLAT_Store_Helper < + 0x00000018, "FLAT_STORE_BYTE", VReg_32 +>; + +def FLAT_STORE_SHORT : FLAT_Store_Helper < + 0x0000001a, "FLAT_STORE_SHORT", VReg_32 +>; + +def FLAT_STORE_DWORD : FLAT_Store_Helper < + 0x0000001c, "FLAT_STORE_DWORD", VReg_32 +>; + +def FLAT_STORE_DWORDX2 : FLAT_Store_Helper < + 0x0000001d, "FLAT_STORE_DWORDX2", VReg_64 +>; + +def FLAT_STORE_DWORDX4 : FLAT_Store_Helper < + 0x0000001e, "FLAT_STORE_DWORDX4", VReg_128 +>; + +def FLAT_STORE_DWORDX3 : FLAT_Store_Helper < + 0x0000001e, "FLAT_STORE_DWORDX3", VReg_96 +>; + +//def FLAT_ATOMIC_SWAP : FLAT_ <0x00000030, "FLAT_ATOMIC_SWAP", []>; +//def FLAT_ATOMIC_CMPSWAP : FLAT_ <0x00000031, "FLAT_ATOMIC_CMPSWAP", []>; +//def FLAT_ATOMIC_ADD : FLAT_ <0x00000032, "FLAT_ATOMIC_ADD", []>; +//def FLAT_ATOMIC_SUB : FLAT_ <0x00000033, "FLAT_ATOMIC_SUB", []>; +//def FLAT_ATOMIC_RSUB : FLAT_ <0x00000034, "FLAT_ATOMIC_RSUB", []>; +//def 
FLAT_ATOMIC_SMIN : FLAT_ <0x00000035, "FLAT_ATOMIC_SMIN", []>; +//def FLAT_ATOMIC_UMIN : FLAT_ <0x00000036, "FLAT_ATOMIC_UMIN", []>; +//def FLAT_ATOMIC_SMAX : FLAT_ <0x00000037, "FLAT_ATOMIC_SMAX", []>; +//def FLAT_ATOMIC_UMAX : FLAT_ <0x00000038, "FLAT_ATOMIC_UMAX", []>; +//def FLAT_ATOMIC_AND : FLAT_ <0x00000039, "FLAT_ATOMIC_AND", []>; +//def FLAT_ATOMIC_OR : FLAT_ <0x0000003a, "FLAT_ATOMIC_OR", []>; +//def FLAT_ATOMIC_XOR : FLAT_ <0x0000003b, "FLAT_ATOMIC_XOR", []>; +//def FLAT_ATOMIC_INC : FLAT_ <0x0000003c, "FLAT_ATOMIC_INC", []>; +//def FLAT_ATOMIC_DEC : FLAT_ <0x0000003d, "FLAT_ATOMIC_DEC", []>; +//def FLAT_ATOMIC_FCMPSWAP : FLAT_ <0x0000003e, "FLAT_ATOMIC_FCMPSWAP", []>; +//def FLAT_ATOMIC_FMIN : FLAT_ <0x0000003f, "FLAT_ATOMIC_FMIN", []>; +//def FLAT_ATOMIC_FMAX : FLAT_ <0x00000040, "FLAT_ATOMIC_FMAX", []>; +//def FLAT_ATOMIC_SWAP_X2 : FLAT_X2 <0x00000050, "FLAT_ATOMIC_SWAP_X2", []>; +//def FLAT_ATOMIC_CMPSWAP_X2 : FLAT_X2 <0x00000051, "FLAT_ATOMIC_CMPSWAP_X2", []>; +//def FLAT_ATOMIC_ADD_X2 : FLAT_X2 <0x00000052, "FLAT_ATOMIC_ADD_X2", []>; +//def FLAT_ATOMIC_SUB_X2 : FLAT_X2 <0x00000053, "FLAT_ATOMIC_SUB_X2", []>; +//def FLAT_ATOMIC_RSUB_X2 : FLAT_X2 <0x00000054, "FLAT_ATOMIC_RSUB_X2", []>; +//def FLAT_ATOMIC_SMIN_X2 : FLAT_X2 <0x00000055, "FLAT_ATOMIC_SMIN_X2", []>; +//def FLAT_ATOMIC_UMIN_X2 : FLAT_X2 <0x00000056, "FLAT_ATOMIC_UMIN_X2", []>; +//def FLAT_ATOMIC_SMAX_X2 : FLAT_X2 <0x00000057, "FLAT_ATOMIC_SMAX_X2", []>; +//def FLAT_ATOMIC_UMAX_X2 : FLAT_X2 <0x00000058, "FLAT_ATOMIC_UMAX_X2", []>; +//def FLAT_ATOMIC_AND_X2 : FLAT_X2 <0x00000059, "FLAT_ATOMIC_AND_X2", []>; +//def FLAT_ATOMIC_OR_X2 : FLAT_X2 <0x0000005a, "FLAT_ATOMIC_OR_X2", []>; +//def FLAT_ATOMIC_XOR_X2 : FLAT_X2 <0x0000005b, "FLAT_ATOMIC_XOR_X2", []>; +//def FLAT_ATOMIC_INC_X2 : FLAT_X2 <0x0000005c, "FLAT_ATOMIC_INC_X2", []>; +//def FLAT_ATOMIC_DEC_X2 : FLAT_X2 <0x0000005d, "FLAT_ATOMIC_DEC_X2", []>; +//def FLAT_ATOMIC_FCMPSWAP_X2 : FLAT_X2 <0x0000005e, "FLAT_ATOMIC_FCMPSWAP_X2", []>; +//def FLAT_ATOMIC_FMIN_X2 : FLAT_X2 <0x0000005f, "FLAT_ATOMIC_FMIN_X2", []>; +//def FLAT_ATOMIC_FMAX_X2 : FLAT_X2 <0x00000060, "FLAT_ATOMIC_FMAX_X2", []>; + +} // End HasFlatAddressSpace predicate + + let mayLoad = 1 in { // We are using the SGPR_32 and not the SReg_32 register class for 32-bit @@ -2084,6 +2161,39 @@ def : MTBUF_StoreResource ; def : MTBUF_StoreResource ; +//===----------------------------------------------------------------------===// +// Flat Patterns +//===----------------------------------------------------------------------===// + +class FLATLoad_Pattern : + Pat <(vt (flat_ld i64:$ptr)), + (Instr_ADDR64 $ptr) +>; + +def : FLATLoad_Pattern ; +def : FLATLoad_Pattern ; +def : FLATLoad_Pattern ; +def : FLATLoad_Pattern ; +def : FLATLoad_Pattern ; +def : FLATLoad_Pattern ; +def : FLATLoad_Pattern ; +def : FLATLoad_Pattern ; +def : FLATLoad_Pattern ; + +class FLATStore_Pattern : + Pat <(st vt:$value, i64:$ptr), + (Instr $value, $ptr) + >; + +def : FLATStore_Pattern ; +def : FLATStore_Pattern ; +def : FLATStore_Pattern ; +def : FLATStore_Pattern ; +def : FLATStore_Pattern ; +def : FLATStore_Pattern ; + + /********** ====================== **********/ /********** Indirect adressing **********/ /********** ====================== **********/ Index: lib/Target/R600/SILowerControlFlow.cpp =================================================================== --- lib/Target/R600/SILowerControlFlow.cpp +++ lib/Target/R600/SILowerControlFlow.cpp @@ -51,6 +51,7 @@ #include "AMDGPU.h" #include "SIInstrInfo.h" #include 
"SIMachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" @@ -415,6 +416,7 @@ bool HaveKill = false; bool NeedM0 = false; bool NeedWQM = false; + bool NeedFlat = false; unsigned Depth = 0; for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); @@ -500,6 +502,24 @@ NeedWQM = true; break; + case AMDGPU::FLAT_LOAD_DWORD: + case AMDGPU::FLAT_LOAD_DWORDX2: + case AMDGPU::FLAT_LOAD_DWORDX3: + case AMDGPU::FLAT_LOAD_DWORDX4: + case AMDGPU::FLAT_LOAD_SBYTE: + case AMDGPU::FLAT_LOAD_SSHORT: + case AMDGPU::FLAT_LOAD_UBYTE: + case AMDGPU::FLAT_LOAD_USHORT: + case AMDGPU::FLAT_STORE_BYTE: + case AMDGPU::FLAT_STORE_DWORD: + case AMDGPU::FLAT_STORE_DWORDX2: + case AMDGPU::FLAT_STORE_DWORDX3: + case AMDGPU::FLAT_STORE_DWORDX4: + case AMDGPU::FLAT_STORE_SHORT: + // TODO: atomics and other flat instructions + NeedFlat = true; + break; + } } } @@ -518,5 +538,39 @@ AMDGPU::EXEC).addReg(AMDGPU::EXEC); } + // FIXME: This seems inappropriate to do here. + if (NeedFlat && MFI->IsKernel) { + // Insert the prologue initializing the SGPRs pointing to the scratch space + // for flat accesses. + const MachineFrameInfo *FrameInfo = MF.getFrameInfo(); + + // TODO: What to use with function calls? + unsigned StackSizeBytes = FrameInfo->getStackSize(); + + int IndirectBegin = static_cast(TII)->getIndirectIndexBegin(MF); + + // Convert register index to 256-byte unit. + // XXX - Does it mean bits? 256-bytes seems wrong. + unsigned StackOffset = IndirectBegin < 0 ? 0 : (4 * IndirectBegin / 256); + + assert((StackSizeBytes < 0xffff) && StackOffset < 0xffff && + "Stack limits should be smaller than 16-bits"); + + // Initialize the flat scratch register pair. + + // Offset is in units of 256-bytes. + MachineBasicBlock &MBB = MF.front(); + BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), + TII->get(AMDGPU::S_MOVK_I32), + AMDGPU::FLAT_SCRATCH_OFFSET).addImm(StackOffset); + + // XXX - Documentation says size is "per-thread scratch size in bytes", but + // that's crazy. Maybe it means per wave? + BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), + TII->get(AMDGPU::S_MOVK_I32), + AMDGPU::FLAT_SCRATCH_SIZE).addImm(StackSizeBytes); + + } + return true; } Index: lib/Target/R600/SIRegisterInfo.td =================================================================== --- lib/Target/R600/SIRegisterInfo.td +++ lib/Target/R600/SIRegisterInfo.td @@ -17,11 +17,20 @@ } // Special Registers + +// Pair to indicate location of scratch space for flat accesses. +// Offset is in units of 256-bytes, +def FLAT_SCRATCH_OFFSET : SIReg <"FLAT_SCRATCH_OFFSET", 104>; + +// Size is the per-thread scratch size, in bytes. +def FLAT_SCRATCH_SIZE : SIReg <"FLAT_SCRATCH_SIZE", 105>; + def VCC : SIReg<"VCC", 106>; def EXEC : SIReg<"EXEC", 126>; def SCC : SIReg<"SCC", 253>; def M0 : SIReg <"M0", 124>; + // SGPR registers foreach Index = 0-101 in { def SGPR#Index : SIReg <"SGPR"#Index, Index>; Index: test/CodeGen/R600/flat-address-space.ll =================================================================== --- /dev/null +++ test/CodeGen/R600/flat-address-space.ll @@ -0,0 +1,182 @@ +; RUN: llc -O0 -march=r600 -mcpu=bonaire < %s | FileCheck %s + +; Disable optimizations in case there are optimizations added that +; specialize away generic pointer accesses. 
+
+
+; CHECK-LABEL: @branch_use_flat_i32:
+; CHECK: ; BB#3: ; %global
+
+; CHECK: V_MOV_B32_e32 v[[LO_VREG:[0-9]+]], {{s[0-9]+}}
+; CHECK: V_MOV_B32_e32 v[[HI_VREG:[0-9]+]], {{s[0-9]+}}
+
+; CHECK: ; BB#2: ; %local
+
+; CHECK: V_MOV_B32_e32 v[[LO_VREG]], {{s[0-9]+}}
+; CHECK: V_MOV_B32_e32 v[[HI_VREG]], {{s[0-9]+}}
+
+; CHECK: FLAT_STORE_DWORD {{v[0-9]+}}, v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
+define void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 {
+entry:
+  %cmp = icmp ne i32 %c, 0
+  br i1 %cmp, label %local, label %global
+
+local:
+  %flat_local = addrspacecast i32 addrspace(3)* %lptr to i32 addrspace(4)*
+  br label %end
+
+global:
+  %flat_global = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)*
+  br label %end
+
+end:
+  %fptr = phi i32 addrspace(4)* [ %flat_local, %local ], [ %flat_global, %global ]
+  store i32 %x, i32 addrspace(4)* %fptr, align 4
+;  %val = load i32 addrspace(4)* %fptr, align 4
+;  store i32 %val, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+
+
+; These testcases might become useless when there are optimizations to
+; remove generic pointers.
+
+; CHECK-LABEL: @store_flat_i32:
+; CHECK: V_MOV_B32_e32 v[[DATA:[0-9]+]], {{s[0-9]+}}
+; CHECK: V_MOV_B32_e32 v[[LO_VREG:[0-9]+]], {{s[0-9]+}}
+; CHECK: V_MOV_B32_e32 v[[HI_VREG:[0-9]+]], {{s[0-9]+}}
+; CHECK: FLAT_STORE_DWORD v[[DATA]], v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
+define void @store_flat_i32(i32 addrspace(1)* %gptr, i32 %x) #0 {
+  %fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)*
+  store i32 %x, i32 addrspace(4)* %fptr, align 4
+  ret void
+}
+
+; CHECK-LABEL: @store_flat_i64:
+; CHECK: FLAT_STORE_DWORDX2
+define void @store_flat_i64(i64 addrspace(1)* %gptr, i64 %x) #0 {
+  %fptr = addrspacecast i64 addrspace(1)* %gptr to i64 addrspace(4)*
+  store i64 %x, i64 addrspace(4)* %fptr, align 8
+  ret void
+}
+
+; CHECK-LABEL: @store_flat_v4i32:
+; CHECK: FLAT_STORE_DWORDX4
+define void @store_flat_v4i32(<4 x i32> addrspace(1)* %gptr, <4 x i32> %x) #0 {
+  %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32> addrspace(4)*
+  store <4 x i32> %x, <4 x i32> addrspace(4)* %fptr, align 16
+  ret void
+}
+
+; CHECK-LABEL: @store_flat_trunc_i16:
+; CHECK: FLAT_STORE_SHORT
+define void @store_flat_trunc_i16(i16 addrspace(1)* %gptr, i32 %x) #0 {
+  %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)*
+  %y = trunc i32 %x to i16
+  store i16 %y, i16 addrspace(4)* %fptr, align 2
+  ret void
+}
+
+; CHECK-LABEL: @store_flat_trunc_i8:
+; CHECK: FLAT_STORE_BYTE
+define void @store_flat_trunc_i8(i8 addrspace(1)* %gptr, i32 %x) #0 {
+  %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)*
+  %y = trunc i32 %x to i8
+  store i8 %y, i8 addrspace(4)* %fptr, align 2
+  ret void
+}
+
+
+
+; CHECK-LABEL: @load_flat_i32:
+; CHECK: FLAT_LOAD_DWORD
+define void @load_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %gptr) #0 {
+  %fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)*
+  %fload = load i32 addrspace(4)* %fptr, align 4
+  store i32 %fload, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: @load_flat_i64:
+; CHECK: FLAT_LOAD_DWORDX2
+define void @load_flat_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %gptr) #0 {
+  %fptr = addrspacecast i64 addrspace(1)* %gptr to i64 addrspace(4)*
+  %fload = load i64 addrspace(4)* %fptr, align 4
+  store i64 %fload, i64 addrspace(1)* %out, align 8
+  ret void
+}
+
+; CHECK-LABEL: @load_flat_v4i32:
+; CHECK: FLAT_LOAD_DWORDX4
+define void @load_flat_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %gptr) #0 {
+  %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32> addrspace(4)*
+  %fload = load <4 x i32> addrspace(4)* %fptr, align 4
+  store <4 x i32> %fload, <4 x i32> addrspace(1)* %out, align 8
+  ret void
+}
+
+; CHECK-LABEL: @sextload_flat_i8:
+; CHECK: FLAT_LOAD_SBYTE
+define void @sextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 {
+  %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)*
+  %fload = load i8 addrspace(4)* %fptr, align 4
+  %ext = sext i8 %fload to i32
+  store i32 %ext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: @zextload_flat_i8:
+; CHECK: FLAT_LOAD_UBYTE
+define void @zextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 {
+  %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)*
+  %fload = load i8 addrspace(4)* %fptr, align 4
+  %ext = zext i8 %fload to i32
+  store i32 %ext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: @sextload_flat_i16:
+; CHECK: FLAT_LOAD_SSHORT
+define void @sextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 {
+  %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)*
+  %fload = load i16 addrspace(4)* %fptr, align 4
+  %ext = sext i16 %fload to i32
+  store i32 %ext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: @zextload_flat_i16:
+; CHECK: FLAT_LOAD_USHORT
+define void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 {
+  %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)*
+  %fload = load i16 addrspace(4)* %fptr, align 4
+  %ext = zext i16 %fload to i32
+  store i32 %ext, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+declare void @llvm.AMDGPU.barrier.local() #1
+
+
+; Check for prologue initializing special SGPRs pointing to scratch.
+; CHECK-LABEL: @store_flat_scratch:
+; CHECK: S_MOVK_I32 FLAT_SCRATCH_SIZE, 40
+; CHECK: S_MOVK_I32 FLAT_SCRATCH_OFFSET,
+; CHECK: FLAT_STORE_DWORD
+; CHECK: S_BARRIER
+; CHECK: FLAT_LOAD_DWORD
+define void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32 %x) #0 {
+  %alloca = alloca i32, i32 9, align 4
+  %pptr = getelementptr i32* %alloca, i32 %x
+  %fptr = addrspacecast i32* %pptr to i32 addrspace(4)*
+  store i32 %x, i32 addrspace(4)* %fptr
+  ; Dummy call
+  call void @llvm.AMDGPU.barrier.local() #1
+  %reload = load i32 addrspace(4)* %fptr, align 4
+  store i32 %reload, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind noduplicate }
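
For anyone trying the patch out, here is a minimal sketch (not part of the patch or its tests) of IR that exercises the new flat_load / flat_store fragments without an addrspacecast: the pointers are declared directly in addrspace(4), which this series maps to AMDGPUAS::FLAT_ADDRESS. The function name is made up; the expectation, on a SEA_ISLANDS part such as bonaire, is that the load and store select to FLAT_LOAD_DWORD and FLAT_STORE_DWORD.

; Illustrative only -- assumes addrspace(4) stays mapped to FLAT_ADDRESS as above.
; Run with something like: llc -march=r600 -mcpu=bonaire < flat-copy.ll
define void @copy_flat_i32(i32 addrspace(4)* noalias %out, i32 addrspace(4)* noalias %in) #0 {
  %val = load i32 addrspace(4)* %in, align 4        ; expected: FLAT_LOAD_DWORD
  store i32 %val, i32 addrspace(4)* %out, align 4   ; expected: FLAT_STORE_DWORD
  ret void
}

attributes #0 = { nounwind }

Since the new datalayout string declares p4 as a 64-bit pointer, address arithmetic on these flat pointers is done in 64 bits, matching the 64-bit addresses the FLAT instructions consume.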