Index: lib/Target/AMDGPU/AMDGPUCallLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUCallLowering.h
+++ lib/Target/AMDGPU/AMDGPUCallLowering.h
@@ -22,6 +22,13 @@
 class AMDGPUTargetLowering;
 
 class AMDGPUCallLowering: public CallLowering {
+
+  unsigned lowerParameterPtr(MachineIRBuilder &MIRBuilder, Type *ParamTy,
+                             unsigned Offset) const;
+
+  void lowerParameter(MachineIRBuilder &MIRBuilder, Type *ParamTy,
+                      unsigned Offset, unsigned DstReg) const;
+
  public:
   AMDGPUCallLowering(const AMDGPUTargetLowering &TLI);
Index: lib/Target/AMDGPU/AMDGPUCallLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -14,8 +14,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPUCallLowering.h"
+#include "AMDGPU.h"
 #include "AMDGPUISelLowering.h"
-
+#include "AMDGPUSubtarget.h"
+#include "SIISelLowering.h"
+#include "SIRegisterInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -31,12 +36,135 @@
 bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
                                      const Value *Val, unsigned VReg) const {
+  MIRBuilder.buildInstr(AMDGPU::S_ENDPGM);
   return true;
 }
 
+unsigned AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder,
+                                               Type *ParamTy,
+                                               unsigned Offset) const {
+
+  MachineFunction &MF = MIRBuilder.getMF();
+  const SIRegisterInfo *TRI = MF.getSubtarget<SISubtarget>().getRegisterInfo();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const Function &F = *MF.getFunction();
+  const DataLayout &DL = F.getParent()->getDataLayout();
+  PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUAS::CONSTANT_ADDRESS);
+  LLT PtrType(*PtrTy, DL);
+  unsigned DstReg = MRI.createGenericVirtualRegister(PtrType);
+  unsigned KernArgSegmentPtr =
+      TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
+  unsigned KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr);
+
+  unsigned OffsetReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
+  MIRBuilder.buildConstant(OffsetReg, Offset);
+
+  MIRBuilder.buildGEP(DstReg, KernArgSegmentVReg, OffsetReg);
+
+  return DstReg;
+}
+
+void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &MIRBuilder,
+                                        Type *ParamTy, unsigned Offset,
+                                        unsigned DstReg) const {
+  MachineFunction &MF = MIRBuilder.getMF();
+  const Function &F = *MF.getFunction();
+  const DataLayout &DL = F.getParent()->getDataLayout();
+  PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUAS::CONSTANT_ADDRESS);
+  MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
+  unsigned TypeSize = DL.getTypeStoreSize(ParamTy);
+  unsigned Align = DL.getABITypeAlignment(ParamTy);
+  unsigned PtrReg = lowerParameterPtr(MIRBuilder, ParamTy, Offset);
+
+  MachineMemOperand *MMO =
+      MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad |
+                                       MachineMemOperand::MONonTemporal |
+                                       MachineMemOperand::MOInvariant,
+                              TypeSize, Align);
+
+  // FIXME: We need to handle sign/zero extend.
+  MIRBuilder.buildLoad(DstReg, PtrReg, *MMO);
+}
+
 bool AMDGPUCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
                                               const Function &F,
                                               ArrayRef<unsigned> VRegs) const {
-  // TODO: Implement once there are generic loads/stores.
+
+  MachineFunction &MF = MIRBuilder.getMF();
+  const SISubtarget *Subtarget =
+      static_cast<const SISubtarget *>(&MF.getSubtarget());
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+  const SIRegisterInfo *TRI = MF.getSubtarget<SISubtarget>().getRegisterInfo();
+  const DataLayout &DL = F.getParent()->getDataLayout();
+
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
+
+  // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
+  if (Info->hasPrivateSegmentBuffer()) {
+    unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI);
+    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SReg_128RegClass);
+    CCInfo.AllocateReg(PrivateSegmentBufferReg);
+  }
+
+  if (Info->hasDispatchPtr()) {
+    unsigned DispatchPtrReg = Info->addDispatchPtr(*TRI);
+    // FIXME: Need to add reg as live-in.
+    CCInfo.AllocateReg(DispatchPtrReg);
+  }
+
+  if (Info->hasQueuePtr()) {
+    unsigned QueuePtrReg = Info->addQueuePtr(*TRI);
+    // FIXME: Need to add reg as live-in.
+    CCInfo.AllocateReg(QueuePtrReg);
+  }
+
+  if (Info->hasKernargSegmentPtr()) {
+    unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI);
+    const LLT P2 = LLT::pointer(2, 64);
+    unsigned VReg = MRI.createGenericVirtualRegister(P2);
+    MRI.addLiveIn(InputPtrReg, VReg);
+    MIRBuilder.getMBB().addLiveIn(InputPtrReg);
+    MIRBuilder.buildCopy(VReg, InputPtrReg);
+    CCInfo.AllocateReg(InputPtrReg);
+  }
+
+  if (Info->hasDispatchID()) {
+    unsigned DispatchIDReg = Info->addDispatchID(*TRI);
+    // FIXME: Need to add reg as live-in.
+    CCInfo.AllocateReg(DispatchIDReg);
+  }
+
+  if (Info->hasFlatScratchInit()) {
+    unsigned FlatScratchInitReg = Info->addFlatScratchInit(*TRI);
+    // FIXME: Need to add reg as live-in.
+    CCInfo.AllocateReg(FlatScratchInitReg);
+  }
+
+  unsigned NumArgs = F.arg_size();
+  Function::const_arg_iterator CurOrigArg = F.arg_begin();
+  const AMDGPUTargetLowering &TLI = *getTLI<AMDGPUTargetLowering>();
+  for (unsigned i = 0; i != NumArgs; ++i, ++CurOrigArg) {
+    MVT ValVT = TLI.getValueType(DL, CurOrigArg->getType()).getSimpleVT();
+    ISD::ArgFlagsTy Flags;
+    Flags.setOrigAlign(DL.getABITypeAlignment(CurOrigArg->getType()));
+    CCAssignFn *AssignFn =
+        TLI.CCAssignFnForCall(F.getCallingConv(), /*IsVarArg=*/false);
+    bool Res =
+        AssignFn(i, ValVT, ValVT, CCValAssign::Full, Flags, CCInfo);
+    assert(!Res && "Call operand has unhandled type");
+    (void)Res;
+  }
+
+  Function::const_arg_iterator Arg = F.arg_begin();
+  for (unsigned i = 0; i != NumArgs; ++i, ++Arg) {
+    // FIXME: We should be getting DebugInfo from the arguments somehow.
+    CCValAssign &VA = ArgLocs[i];
+    lowerParameter(MIRBuilder, Arg->getType(),
+                   VA.getLocMemOffset() +
+                   Subtarget->getExplicitKernelArgOffset(), VRegs[i]);
+  }
+
   return true;
 }
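Review note: each kernel argument ends up as a G_GEP off the kernarg segment pointer followed by a G_LOAD, at the CC-assigned offset plus the subtarget's explicit kernarg offset. A minimal standalone sketch of that offset arithmetic (the 36-byte non-HSA offset and the two 4-byte i32 arguments are illustrative assumptions, not values taken from this patch):

#include <cstdint>

// Hypothetical stand-in for Subtarget->getExplicitKernelArgOffset().
constexpr uint64_t ExplicitKernArgOffset = 36; // assumed non-HSA value

// Byte offset of the I-th of several 4-byte, 4-byte-aligned arguments,
// mirroring VA.getLocMemOffset() + ExplicitKernArgOffset above.
constexpr uint64_t kernArgByteOffset(unsigned I) {
  return ExplicitKernArgOffset + 4 * I;
}

static_assert(kernArgByteOffset(0) == 36, "first i32 argument");
static_assert(kernArgByteOffset(1) == 40, "second i32 argument");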
Index: lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
===================================================================
--- /dev/null
+++ lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
@@ -0,0 +1,67 @@
+//===- AMDGPUGenRegisterBankInfo.def -----------------------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file defines all the static objects used by AMDGPURegisterBankInfo.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+#error "You shouldn't build this"
+#endif
+
+namespace llvm {
+namespace AMDGPU {
+
+RegisterBank SGPRRegBank;
+RegisterBank VGPRRegBank;
+
+RegisterBank *RegBanks[] = {&SGPRRegBank, &VGPRRegBank};
+
+enum PartialMappingIdx {
+  None = -1,
+  PM_SGPR32 = 0,
+  PM_SGPR64 = 1,
+  PM_VGPR32 = 2,
+  PM_VGPR64 = 3
+};
+
+const RegisterBankInfo::PartialMapping PartMappings[] {
+  // StartIdx, Length, RegBank
+  {0, 32, SGPRRegBank},
+  {0, 64, SGPRRegBank},
+  {0, 32, VGPRRegBank},
+  {0, 64, VGPRRegBank}
+};
+
+const RegisterBankInfo::ValueMapping ValMappings[] {
+  // SGPR 32-bit
+  {&PartMappings[0], 1},
+  // SGPR 64-bit
+  {&PartMappings[1], 1},
+  // VGPR 32-bit
+  {&PartMappings[2], 1},
+  // VGPR 64-bit
+  {&PartMappings[3], 1}
+};
+
+enum ValueMappingIdx {
+  SGPRStartIdx = 0,
+  VGPRStartIdx = 2
+};
+
+const RegisterBankInfo::ValueMapping *getValueMapping(unsigned BankID,
+                                                      unsigned Size) {
+  assert(Size % 32 == 0);
+  unsigned Idx = BankID == AMDGPU::SGPRRegBankID ? SGPRStartIdx : VGPRStartIdx;
+  Idx += (Size / 32) - 1;
+  return &ValMappings[Idx];
+}
+
+} // End AMDGPU namespace.
+} // End llvm namespace.
Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1141,8 +1141,8 @@
   SDLoc SL(ByteOffsetNode);
   AMDGPUSubtarget::Generation Gen = Subtarget->getGeneration();
   int64_t ByteOffset = C->getSExtValue();
-  int64_t EncodedOffset = Gen < AMDGPUSubtarget::VOLCANIC_ISLANDS ?
-      ByteOffset >> 2 : ByteOffset;
+  int64_t EncodedOffset =
+      SIInstrInfo::getSMRDEncodedOffset(*Subtarget, ByteOffset);
 
   if (isLegalSMRDImmOffset(Subtarget, EncodedOffset)) {
     Offset = CurDAG->getTargetConstant(EncodedOffset, SL, MVT::i32);
Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -16,6 +16,7 @@
 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUISELLOWERING_H
 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUISELLOWERING_H
 
+#include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/Target/TargetLowering.h"
 
 namespace llvm {
@@ -211,6 +212,8 @@
   /// type of implicit parameter.
   uint32_t getImplicitParameterOffset(const AMDGPUMachineFunction *MFI,
                                       const ImplicitParameter Param) const;
+
+  CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const;
 };
 
 namespace AMDGPUISD {
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -646,6 +646,11 @@
 // TargetLowering Callbacks
 //===---------------------------------------------------------------------===//
 
+CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
+                                                    bool IsVarArg) const {
+  return CC_AMDGPU;
+}
+
 /// The SelectionDAGBuilder will automatically promote function arguments
 /// with illegal types. However, this does not work for the AMDGPU targets
 /// since the function arguments are stored in memory as these illegal types.
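Review note: the SMRD change centralizes the offset encoding rule that the selector below reuses: before Volcanic Islands the immediate is a dword offset (ByteOffset >> 2) in an 8-bit field; on VI and newer it is a byte offset in a 20-bit field. A standalone sketch re-deriving the boundaries exercised by the tests at the end of this patch (plain C++, not the LLVM API; the real helpers use isUInt<8>/isUInt<20>):

#include <cassert>
#include <cstdint>

enum Generation { SOUTHERN_ISLANDS, SEA_ISLANDS, VOLCANIC_ISLANDS };

// Mirrors SIInstrInfo::getSMRDEncodedOffset: pre-VI encodes dwords,
// VI+ encodes bytes.
int64_t encodeSMRDOffset(Generation Gen, int64_t ByteOffset) {
  return Gen < VOLCANIC_ISLANDS ? ByteOffset >> 2 : ByteOffset;
}

// Mirrors SIInstrInfo::isLegalSMRDImmOffset: 8-bit field pre-VI, 20-bit on VI+.
bool isLegalSMRDImmOffset(Generation Gen, int64_t ByteOffset) {
  int64_t Enc = encodeSMRDOffset(Gen, ByteOffset);
  return Gen < VOLCANIC_ISLANDS ? Enc >= 0 && Enc < (1 << 8)
                                : Enc >= 0 && Enc < (1 << 20);
}

int main() {
  assert(isLegalSMRDImmOffset(SOUTHERN_ISLANDS, 1020));     // 255 dwords: max on SI
  assert(!isLegalSMRDImmOffset(SOUTHERN_ISLANDS, 1024));    // 256 dwords: overflow
  assert(isLegalSMRDImmOffset(VOLCANIC_ISLANDS, 1048572));  // max 20-bit byte offset
  assert(!isLegalSMRDImmOffset(VOLCANIC_ISLANDS, 1048576)); // overflow on VI
}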
Index: lib/Target/AMDGPU/AMDGPUInstructionSelector.h
===================================================================
--- /dev/null
+++ lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -0,0 +1,61 @@
+//===- AMDGPUInstructionSelector --------------------------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares the targeting of the InstructionSelector class for
+/// AMDGPU.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRUCTIONSELECTOR_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRUCTIONSELECTOR_H
+
+#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
+#include "llvm/CodeGen/MachineInstr.h"
+
+namespace llvm {
+
+class AMDGPUInstrInfo;
+class AMDGPURegisterBankInfo;
+class SIInstrInfo;
+class SIRegisterInfo;
+class SISubtarget;
+
+class AMDGPUInstructionSelector : public InstructionSelector {
+public:
+  AMDGPUInstructionSelector(const SISubtarget &STI,
+                            const AMDGPURegisterBankInfo &RBI);
+
+  bool select(MachineInstr &I) const override;
+
+  struct GEPInfo {
+    const MachineInstr &GEP;
+    SmallVector<unsigned, 2> SgprParts;
+    SmallVector<unsigned, 2> VgprParts;
+    int64_t Imm;
+    GEPInfo(const MachineInstr &GEP) : GEP(GEP), Imm(0) { }
+  };
+
+private:
+  MachineOperand getSubOperand64(MachineOperand &MO, unsigned SubIdx) const;
+  bool selectG_CONSTANT(MachineInstr &I) const;
+  bool selectG_ADD(MachineInstr &I) const;
+  bool selectG_GEP(MachineInstr &I) const;
+  bool hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const;
+  void getAddrModeInfo(const MachineInstr &Load, const MachineRegisterInfo &MRI,
+                       SmallVectorImpl<GEPInfo> &AddrInfo) const;
+  bool selectSMRD(MachineInstr &I, ArrayRef<GEPInfo> AddrInfo) const;
+  bool selectG_LOAD(MachineInstr &I) const;
+  bool selectG_STORE(MachineInstr &I) const;
+
+  const SIInstrInfo &TII;
+  const SIRegisterInfo &TRI;
+  const AMDGPURegisterBankInfo &RBI;
+};
+
+} // End llvm namespace.
+#endif
Index: lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
===================================================================
--- /dev/null
+++ lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -0,0 +1,406 @@
+//===- AMDGPUInstructionSelector.cpp ----------------------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the InstructionSelector class for
+/// AMDGPU.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUInstructionSelector.h"
+#include "AMDGPUInstrInfo.h"
+#include "AMDGPURegisterBankInfo.h"
+#include "AMDGPURegisterInfo.h"
+#include "AMDGPUSubtarget.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "amdgpu-isel"
+
+using namespace llvm;
+
+AMDGPUInstructionSelector::AMDGPUInstructionSelector(
+    const SISubtarget &STI, const AMDGPURegisterBankInfo &RBI)
+    : InstructionSelector(), TII(*STI.getInstrInfo()),
+      TRI(*STI.getRegisterInfo()),
+      RBI(RBI) {}
+
+MachineOperand
+AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
+                                           unsigned SubIdx) const {
+
+  MachineInstr *MI = MO.getParent();
+  MachineBasicBlock *BB = MO.getParent()->getParent();
+  MachineFunction *MF = BB->getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+
+  if (MO.isReg()) {
+    unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
+    unsigned Reg = MO.getReg();
+    BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
+        .addReg(Reg, 0, ComposedSubIdx);
+
+    return MachineOperand::CreateReg(DstReg, MO.isDef(), MO.isImplicit(),
+                                     MO.isKill(), MO.isDead(), MO.isUndef(),
+                                     MO.isEarlyClobber(), 0,
+                                     MO.isDebug(), MO.isInternalRead());
+  }
+
+  assert(MO.isImm());
+
+  APInt Imm(64, MO.getImm());
+
+  switch (SubIdx) {
+  default:
+    llvm_unreachable("do not know how to split immediate with this sub index.");
+  case AMDGPU::sub0:
+    return MachineOperand::CreateImm(Imm.getLoBits(32).getSExtValue());
+  case AMDGPU::sub1:
+    return MachineOperand::CreateImm(Imm.getHiBits(32).getSExtValue());
+  }
+}
+
+bool AMDGPUInstructionSelector::selectG_ADD(MachineInstr &I) const {
+  MachineBasicBlock *BB = I.getParent();
+  MachineFunction *MF = BB->getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  unsigned Size = RBI.getSizeInBits(I.getOperand(0).getReg(), MRI, TRI);
+  unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+  unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+  if (Size != 64)
+    return false;
+
+  DebugLoc DL = I.getDebugLoc();
+
+  BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
+      .addOperand(getSubOperand64(I.getOperand(1), AMDGPU::sub0))
+      .addOperand(getSubOperand64(I.getOperand(2), AMDGPU::sub0));
+
+  BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
+      .addOperand(getSubOperand64(I.getOperand(1), AMDGPU::sub1))
+      .addOperand(getSubOperand64(I.getOperand(2), AMDGPU::sub1));
+
+  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), I.getOperand(0).getReg())
+      .addReg(DstLo)
+      .addImm(AMDGPU::sub0)
+      .addReg(DstHi)
+      .addImm(AMDGPU::sub1);
+
+  for (MachineOperand &MO : I.explicit_operands()) {
+    if (!MO.isReg() || TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
+      continue;
+    RBI.constrainGenericRegister(MO.getReg(), AMDGPU::SReg_64RegClass, MRI);
+  }
+
+  I.eraseFromParent();
+  return true;
+}
+
+bool AMDGPUInstructionSelector::selectG_GEP(MachineInstr &I) const {
+  return selectG_ADD(I);
+}
+
+bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const {
+  MachineBasicBlock *BB = I.getParent();
+  DebugLoc DL = I.getDebugLoc();
+
+  // FIXME: Select the store instruction based on the address space.
+  MachineInstr *Flat = BuildMI(*BB, &I, DL, TII.get(AMDGPU::FLAT_STORE_DWORD))
+      .addOperand(I.getOperand(1))
+      .addOperand(I.getOperand(0))
+      .addImm(0)
+      .addImm(0)
+      .addImm(0);
+
+  // Now that we selected an opcode, we need to constrain the register
+  // operands to use appropriate classes.
+  bool Ret = constrainSelectedInstRegOperands(*Flat, TII, TRI, RBI);
+
+  I.eraseFromParent();
+  return Ret;
+}
+
+bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
+  MachineBasicBlock *BB = I.getParent();
+  MachineFunction *MF = BB->getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  unsigned DstReg = I.getOperand(0).getReg();
+  unsigned Size = RBI.getSizeInBits(DstReg, MRI, TRI);
+
+  if (Size == 32) {
+    I.setDesc(TII.get(AMDGPU::S_MOV_B32));
+    return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+  }
+
+  assert(Size == 64);
+
+  DebugLoc DL = I.getDebugLoc();
+  unsigned LoReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+  unsigned HiReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+  APInt Imm(64, I.getOperand(1).getImm());
+
+  BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), LoReg)
+      .addImm(Imm.trunc(32).getZExtValue());
+
+  BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg)
+      .addImm(Imm.ashr(32).getZExtValue());
+
+  BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
+      .addReg(LoReg)
+      .addImm(AMDGPU::sub0)
+      .addReg(HiReg)
+      .addImm(AMDGPU::sub1);
+
+  // We can't call constrainSelectedInstRegOperands here, because it doesn't
+  // work for target-independent opcodes.
+  I.eraseFromParent();
+  return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, MRI);
+}
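Review note: selectG_CONSTANT splits a 64-bit immediate into two S_MOV_B32s and reassembles the halves with REG_SEQUENCE. A standalone sketch of the lo/hi split that the APInt trunc(32)/ashr(32) calls perform (the sample value is one of the offsets used in the SMRD tests below):

#include <cassert>
#include <cstdint>

// Low and high halves as selectG_CONSTANT extracts them.
uint32_t lo32(uint64_t Imm) { return static_cast<uint32_t>(Imm); }
uint32_t hi32(uint64_t Imm) { return static_cast<uint32_t>(Imm >> 32); }

int main() {
  // 17179869184 = 0x4'0000'0000; the tests expect K_LO = 0 and K_HI = 4.
  assert(lo32(17179869184ull) == 0);
  assert(hi32(17179869184ull) == 4);
}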
+
+static bool isConstant(const MachineInstr &MI) {
+  return MI.getOpcode() == TargetOpcode::G_CONSTANT;
+}
+
+void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
+    const MachineRegisterInfo &MRI, SmallVectorImpl<GEPInfo> &AddrInfo) const {
+
+  const MachineInstr *PtrMI = MRI.getUniqueVRegDef(Load.getOperand(1).getReg());
+
+  assert(PtrMI);
+
+  if (PtrMI->getOpcode() != TargetOpcode::G_GEP)
+    return;
+
+  GEPInfo GEPInfo(*PtrMI);
+
+  for (unsigned i = 1, e = 3; i < e; ++i) {
+    const MachineOperand &GEPOp = PtrMI->getOperand(i);
+    const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
+    assert(OpDef);
+    if (isConstant(*OpDef)) {
+      // FIXME: Is it possible to have multiple Imm parts?  Maybe if we
+      // are lacking other optimizations.
+      assert(GEPInfo.Imm == 0);
+      GEPInfo.Imm = OpDef->getOperand(1).getImm();
+      continue;
+    }
+    const RegisterBank *OpBank = RBI.getRegBank(GEPOp.getReg(), MRI, TRI);
+    if (OpBank->getID() == AMDGPU::SGPRRegBankID)
+      GEPInfo.SgprParts.push_back(GEPOp.getReg());
+    else
+      GEPInfo.VgprParts.push_back(GEPOp.getReg());
+  }
+
+  AddrInfo.push_back(GEPInfo);
+  getAddrModeInfo(*PtrMI, MRI, AddrInfo);
+}
+
+static bool isInstrUniform(const MachineInstr &MI) {
+  if (!MI.hasOneMemOperand())
+    return false;
+
+  const MachineMemOperand *MMO = *MI.memoperands_begin();
+  const Value *Ptr = MMO->getValue();
+
+  // UndefValue means this is a load of a kernel input.  These are uniform.
+  // Sometimes LDS instructions have constant pointers.
+  // If Ptr is null, then that means this mem operand contains a
+  // PseudoSourceValue like GOT.
+  if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
+      isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
+    return true;
+
+  const Instruction *I = dyn_cast<Instruction>(Ptr);
+  return I && I->getMetadata("amdgpu.uniform");
+}
+
+static unsigned getSmrdOpcode(unsigned BaseOpcode, unsigned LoadSize) {
+
+  if (LoadSize == 32)
+    return BaseOpcode;
+
+  switch (BaseOpcode) {
+  case AMDGPU::S_LOAD_DWORD_IMM:
+    switch (LoadSize) {
+    case 64:  return AMDGPU::S_LOAD_DWORDX2_IMM;
+    case 128: return AMDGPU::S_LOAD_DWORDX4_IMM;
+    case 256: return AMDGPU::S_LOAD_DWORDX8_IMM;
+    case 512: return AMDGPU::S_LOAD_DWORDX16_IMM;
+    }
+    break;
+  case AMDGPU::S_LOAD_DWORD_IMM_ci:
+    switch (LoadSize) {
+    case 64:  return AMDGPU::S_LOAD_DWORDX2_IMM_ci;
+    case 128: return AMDGPU::S_LOAD_DWORDX4_IMM_ci;
+    case 256: return AMDGPU::S_LOAD_DWORDX8_IMM_ci;
+    case 512: return AMDGPU::S_LOAD_DWORDX16_IMM_ci;
+    }
+    break;
+  case AMDGPU::S_LOAD_DWORD_SGPR:
+    switch (LoadSize) {
+    case 64:  return AMDGPU::S_LOAD_DWORDX2_SGPR;
+    case 128: return AMDGPU::S_LOAD_DWORDX4_SGPR;
+    case 256: return AMDGPU::S_LOAD_DWORDX8_SGPR;
+    case 512: return AMDGPU::S_LOAD_DWORDX16_SGPR;
+    }
+    break;
+  }
+  llvm_unreachable("Invalid base smrd opcode or size");
+}
+
+bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
+  for (const GEPInfo &GEPInfo : AddrInfo) {
+    if (!GEPInfo.VgprParts.empty())
+      return true;
+  }
+  return false;
+}
+
+bool AMDGPUInstructionSelector::selectSMRD(MachineInstr &I,
+                                           ArrayRef<GEPInfo> AddrInfo) const {
+
+  if (!I.hasOneMemOperand())
+    return false;
+
+  if ((*I.memoperands_begin())->getAddrSpace() != AMDGPUAS::CONSTANT_ADDRESS)
+    return false;
+
+  if (!isInstrUniform(I))
+    return false;
+
+  if (hasVgprParts(AddrInfo))
+    return false;
+
+  MachineBasicBlock *BB = I.getParent();
+  MachineFunction *MF = BB->getParent();
+  const SISubtarget &Subtarget = MF->getSubtarget<SISubtarget>();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  unsigned DstReg = I.getOperand(0).getReg();
+  const DebugLoc &DL = I.getDebugLoc();
+  unsigned Opcode;
+  unsigned LoadSize = RBI.getSizeInBits(DstReg, MRI, TRI);
+
+  if (!AddrInfo.empty() && AddrInfo[0].SgprParts.size() == 1) {
+
+    const GEPInfo &GEPInfo = AddrInfo[0];
+
+    unsigned PtrReg = GEPInfo.SgprParts[0];
+    int64_t EncodedImm = SIInstrInfo::getSMRDEncodedOffset(Subtarget,
+                                                           GEPInfo.Imm);
+    if (SIInstrInfo::isLegalSMRDImmOffset(Subtarget, GEPInfo.Imm)) {
+      Opcode = getSmrdOpcode(AMDGPU::S_LOAD_DWORD_IMM, LoadSize);
+
+      MachineInstr *SMRD = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg)
+                               .addReg(PtrReg)
+                               .addImm(EncodedImm)
+                               .addImm(0); // glc
+      return constrainSelectedInstRegOperands(*SMRD, TII, TRI, RBI);
+    }
+
+    if (Subtarget.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS &&
+        isUInt<32>(EncodedImm)) {
+      Opcode = getSmrdOpcode(AMDGPU::S_LOAD_DWORD_IMM_ci, LoadSize);
+      MachineInstr *SMRD = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg)
+                               .addReg(PtrReg)
+                               .addImm(EncodedImm)
+                               .addImm(0); // glc
+      return constrainSelectedInstRegOperands(*SMRD, TII, TRI, RBI);
+    }
+
+    if (isUInt<32>(GEPInfo.Imm)) {
+      Opcode = getSmrdOpcode(AMDGPU::S_LOAD_DWORD_SGPR, LoadSize);
+      unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+      BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), OffsetReg)
+          .addImm(GEPInfo.Imm);
+
+      MachineInstr *SMRD = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg)
+                               .addReg(PtrReg)
+                               .addReg(OffsetReg)
+                               .addImm(0); // glc
+      return constrainSelectedInstRegOperands(*SMRD, TII, TRI, RBI);
+    }
+  }
+
+  unsigned PtrReg = I.getOperand(1).getReg();
+  Opcode = getSmrdOpcode(AMDGPU::S_LOAD_DWORD_IMM, LoadSize);
+  MachineInstr *SMRD = BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg)
+                           .addReg(PtrReg)
+                           .addImm(0)
+                           .addImm(0); // glc
+  return constrainSelectedInstRegOperands(*SMRD, TII, TRI, RBI);
+}
+
+bool AMDGPUInstructionSelector::selectG_LOAD(MachineInstr &I) const {
+  MachineBasicBlock *BB = I.getParent();
+  MachineFunction *MF = BB->getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  DebugLoc DL = I.getDebugLoc();
+  unsigned DstReg = I.getOperand(0).getReg();
+  unsigned PtrReg = I.getOperand(1).getReg();
+  unsigned LoadSize = RBI.getSizeInBits(DstReg, MRI, TRI);
+  unsigned Opcode;
+
+  SmallVector<GEPInfo, 4> AddrInfo;
+
+  getAddrModeInfo(I, MRI, AddrInfo);
+
+  if (selectSMRD(I, AddrInfo)) {
+    I.eraseFromParent();
+    return true;
+  }
+
+  switch (LoadSize) {
+  default:
+    llvm_unreachable("Load size not supported");
+  case 32:
+    Opcode = AMDGPU::FLAT_LOAD_DWORD;
+    break;
+  case 64:
+    Opcode = AMDGPU::FLAT_LOAD_DWORDX2;
+    break;
+  }
+
+  MachineInstr *Flat = BuildMI(*BB, &I, DL, TII.get(Opcode))
+                           .addOperand(I.getOperand(0))
+                           .addReg(PtrReg)
+                           .addImm(0)
+                           .addImm(0)
+                           .addImm(0);
+
+  bool Ret = constrainSelectedInstRegOperands(*Flat, TII, TRI, RBI);
+  I.eraseFromParent();
+  return Ret;
+}
+
+bool AMDGPUInstructionSelector::select(MachineInstr &I) const {
+
+  if (!isPreISelGenericOpcode(I.getOpcode()))
+    return true;
+
+  switch (I.getOpcode()) {
+  default:
+    break;
+  case TargetOpcode::G_ADD:
+    return selectG_ADD(I);
+  case TargetOpcode::G_CONSTANT:
+    return selectG_CONSTANT(I);
+  case TargetOpcode::G_GEP:
+    return selectG_GEP(I);
+  case TargetOpcode::G_LOAD:
+    return selectG_LOAD(I);
+  case TargetOpcode::G_STORE:
+    return selectG_STORE(I);
+  }
+  return false;
+}
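Review note: selectG_ADD lowers a 64-bit scalar add as S_ADD_U32 on the low halves (carry out in SCC) followed by S_ADDC_U32 on the high halves (carry in from SCC). A standalone sketch of that arithmetic, assuming nothing beyond the carry semantics just described:

#include <cassert>
#include <cstdint>

// Models S_ADD_U32 (low halves, carry out) + S_ADDC_U32 (high halves + carry).
uint64_t add64ViaHalves(uint64_t A, uint64_t B) {
  uint32_t ALo = static_cast<uint32_t>(A), AHi = static_cast<uint32_t>(A >> 32);
  uint32_t BLo = static_cast<uint32_t>(B), BHi = static_cast<uint32_t>(B >> 32);

  uint32_t Lo = ALo + BLo;
  uint32_t SCC = Lo < ALo;       // carry out of the low add
  uint32_t Hi = AHi + BHi + SCC; // S_ADDC_U32 consumes SCC

  return (static_cast<uint64_t>(Hi) << 32) | Lo;
}

int main() {
  assert(add64ViaHalves(0xffffffffull, 1) == 0x100000000ull); // carry ripples
  assert(add64ViaHalves(0x123456789aull, 0x0f00000066ull) ==
         0x123456789aull + 0x0f00000066ull);
}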
Index: lib/Target/AMDGPU/AMDGPULegalizerInfo.h
===================================================================
--- /dev/null
+++ lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -0,0 +1,30 @@
+//===- AMDGPULegalizerInfo ---------------------------------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares the targeting of the MachineLegalizer class for
+/// AMDGPU.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINELEGALIZER_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINELEGALIZER_H
+
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+
+namespace llvm {
+
+class LLVMContext;
+
+/// This class provides the information for legalizing operations on the
+/// target.
+class AMDGPULegalizerInfo : public LegalizerInfo {
+public:
+  AMDGPULegalizerInfo();
+};
+} // End llvm namespace.
+#endif
Index: lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
===================================================================
--- /dev/null
+++ lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -0,0 +1,57 @@
+//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the MachineLegalizer class for
+/// AMDGPU.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPULegalizerInfo.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/Target/TargetOpcodes.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+#error "You shouldn't build this"
+#endif
+
+AMDGPULegalizerInfo::AMDGPULegalizerInfo() {
+  using namespace TargetOpcode;
+
+  const LLT S32 = LLT::scalar(32);
+  const LLT S64 = LLT::scalar(64);
+  const LLT P1 = LLT::pointer(1, 64);
+  const LLT P2 = LLT::pointer(2, 64);
+
+  setAction({G_CONSTANT, S64}, Legal);
+
+  setAction({G_GEP, P1}, Legal);
+  setAction({G_GEP, 1, P1}, Legal);
+  setAction({G_GEP, P2}, Legal);
+  setAction({G_GEP, 1, P2}, Legal);
+  setAction({G_GEP, 1, S64}, Legal);
+  setAction({G_GEP, 2, S64}, Legal);
+
+  setAction({G_LOAD, P1}, Legal);
+  setAction({G_LOAD, P2}, Legal);
+  setAction({G_LOAD, S32}, Legal);
+  setAction({G_LOAD, 1, S64}, Legal);
+  setAction({G_LOAD, 1, P1}, Legal);
+  setAction({G_LOAD, 1, P2}, Legal);
+
+  setAction({G_STORE, S32}, Legal);
+  setAction({G_STORE, 1, S64}, Legal);
+  setAction({G_STORE, 1, P1}, Legal);
+
+  computeTables();
+}
Index: lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
===================================================================
--- /dev/null
+++ lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
@@ -0,0 +1,75 @@
+//===- AMDGPURegisterBankInfo -----------------------------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares the targeting of the RegisterBankInfo class for AMDGPU.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERBANKINFO_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERBANKINFO_H
+
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+
+namespace llvm {
+
+class SIRegisterInfo;
+class TargetRegisterInfo;
+
+namespace AMDGPU {
+enum {
+  SGPRRegBankID = 0,
+  VGPRRegBankID = 1,
+  NumRegisterBanks
+};
+} // End AMDGPU namespace.
+
+/// This class provides the information for the target register banks.
+class AMDGPURegisterBankInfo : public RegisterBankInfo {
+  const SIRegisterInfo *TRI;
+
+  /// See RegisterBankInfo::applyMapping.
+  void applyMappingImpl(const OperandsMapper &OpdMapper) const override;
+
+  RegisterBankInfo::InstructionMapping
+  getInstrMappingForLoad(const MachineInstr &MI) const;
+
+public:
+  AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI);
+
+  /// Get the cost of a copy from \p B to \p A, or put differently,
+  /// get the cost of A = COPY B. Since register banks may cover
+  /// different sizes, \p Size specifies what will be the size in bits
+  /// that will be copied around.
+  ///
+  /// \note Since this is a copy, both registers have the same size.
+  unsigned copyCost(const RegisterBank &A, const RegisterBank &B,
+                    unsigned Size) const override;
+
+  /// Get a register bank that covers \p RC.
+  ///
+  /// \pre \p RC is a user-defined register class (as opposed to one
+  /// generated by TableGen).
+  ///
+  /// \note The mapping RC -> RegBank could be built while adding the
+  /// coverage for the register banks. However, we do not do it, because,
+  /// at least for now, we only need this information for register classes
+  /// that are used in the description of instructions. In other words,
+  /// there are just a handful of them and we do not want to waste space.
+  ///
+  /// \todo This should be TableGen'ed.
+  const RegisterBank &
+  getRegBankFromRegClass(const TargetRegisterClass &RC) const override;
+
+  /// Get the alternative mappings for \p MI, i.e. mappings other than the
+  /// one returned by getInstrMapping.
+  InstructionMappings
+  getInstrAlternativeMappings(const MachineInstr &MI) const override;
+
+  InstructionMapping getInstrMapping(const MachineInstr &MI) const override;
+};
+} // End llvm namespace.
+#endif
Index: lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
===================================================================
--- /dev/null
+++ lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -0,0 +1,236 @@
+//===- AMDGPURegisterBankInfo.cpp -------------------------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the RegisterBankInfo class for
+/// AMDGPU.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPURegisterBankInfo.h"
+#include "AMDGPUInstrInfo.h"
+#include "SIRegisterInfo.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+// This file will be TableGen'ed at some point.
+#include "AMDGPUGenRegisterBankInfo.def" + +using namespace llvm; + +#ifndef LLVM_BUILD_GLOBAL_ISEL +#error "You shouldn't build this" +#endif + +AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI) + : RegisterBankInfo(AMDGPU::RegBanks, AMDGPU::NumRegisterBanks), + TRI(static_cast(&TRI)) { + + // HACK: Until this is fully tablegen'd + static bool AlreadyInit = false; + if (AlreadyInit) + return; + + AlreadyInit = true; + + createRegisterBank(AMDGPU::SGPRRegBankID, "SGPR"); + addRegBankCoverage(AMDGPU::SGPRRegBankID, AMDGPU::SGPR_32RegClassID, TRI); + addRegBankCoverage(AMDGPU::SGPRRegBankID, AMDGPU::SReg_64RegClassID, TRI); + addRegBankCoverage(AMDGPU::SGPRRegBankID, AMDGPU::SGPR_64RegClassID, TRI); + const RegisterBank &RBSGPR = getRegBank(AMDGPU::SGPRRegBankID); + assert(RBSGPR.getSize() == 64); + assert(&RBSGPR == &AMDGPU::SGPRRegBank); + + createRegisterBank(AMDGPU::VGPRRegBankID, "VGPR"); + addRegBankCoverage(AMDGPU::VGPRRegBankID, AMDGPU::VGPR_32RegClassID, TRI); + addRegBankCoverage(AMDGPU::VGPRRegBankID, AMDGPU::VReg_64RegClassID, TRI); + addRegBankCoverage(AMDGPU::VGPRRegBankID, AMDGPU::VReg_96RegClassID, TRI); + const RegisterBank &RBVGPR = getRegBank(AMDGPU::VGPRRegBankID); + assert(RBVGPR.getSize() == 96); + assert(&RBVGPR == &AMDGPU::VGPRRegBank); + +} + +unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &A, + const RegisterBank &B, + unsigned Size) const { + return RegisterBankInfo::copyCost(A, B, Size); +} + +const RegisterBank &AMDGPURegisterBankInfo::getRegBankFromRegClass( + const TargetRegisterClass &RC) const { + + if (TRI->isSGPRClass(&RC)) + return getRegBank(AMDGPU::SGPRRegBankID); + + return getRegBank(AMDGPU::VGPRRegBankID); +} + +RegisterBankInfo::InstructionMappings +AMDGPURegisterBankInfo::getInstrAlternativeMappings( + const MachineInstr &MI) const { + + const MachineFunction &MF = *MI.getParent()->getParent(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + + unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI); + + InstructionMappings AltMappings; + switch (MI.getOpcode()) { + case TargetOpcode::G_LOAD: { + // FIXME: Should we be hard coding the size for these mappings? + InstructionMapping SSMapping(1, 1, + getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size), + AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}), + 2); // Num Operands + AltMappings.emplace_back(std::move(SSMapping)); + + InstructionMapping VVMapping(2, 1, + getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size), + AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64)}), + 2); // Num Operands + AltMappings.emplace_back(std::move(VVMapping)); + + // FIXME: Should this be the pointer-size (64-bits) or the size of the + // register that will hold the bufffer resourc (128-bits). 
+    InstructionMapping VSMapping(3, 1,
+        getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
+                            AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}),
+        2); // Num Operands
+    AltMappings.emplace_back(std::move(VSMapping));
+
+    return AltMappings;
+  }
+  default:
+    break;
+  }
+  return RegisterBankInfo::getInstrAlternativeMappings(MI);
+}
+
+void AMDGPURegisterBankInfo::applyMappingImpl(
+    const OperandsMapper &OpdMapper) const {
+  return applyDefaultMapping(OpdMapper);
+}
+
+static bool isInstrUniform(const MachineInstr &MI) {
+  if (!MI.hasOneMemOperand())
+    return false;
+
+  const MachineMemOperand *MMO = *MI.memoperands_begin();
+  return AMDGPU::isUniformMMO(MMO);
+}
+
+RegisterBankInfo::InstructionMapping
+AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
+
+  const MachineFunction &MF = *MI.getParent()->getParent();
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  RegisterBankInfo::InstructionMapping Mapping =
+      InstructionMapping{1, 1, nullptr, MI.getNumOperands()};
+  SmallVector<const ValueMapping *, 8> OpdsMapping(MI.getNumOperands());
+  unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
+
+  const ValueMapping *ValMapping;
+  const ValueMapping *PtrMapping;
+
+  if (isInstrUniform(MI)) {
+    // We have a uniform instruction so we want to use an SMRD load.
+    ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
+    // FIXME: Don't hard code pointer size.
+    PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
+  } else {
+    ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+    // FIXME: Don't hard code pointer size.
+    // FIXME: What would happen if we used SGPRRegBankID here?
+    PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);
+  }
+
+  OpdsMapping[0] = ValMapping;
+  OpdsMapping[1] = PtrMapping;
+  Mapping.setOperandsMapping(getOperandsMapping(OpdsMapping));
+  return Mapping;
+
+  // FIXME: Do we want to add a mapping for FLAT load, or should we just
+  // handle that during instruction selection?
+}
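Review note: the load mapping is driven entirely by divergence: uniform loads get SGPR banks (an SMRD-style load), divergent ones get VGPR banks (a FLAT-style load). A standalone sketch of that decision table (enum and struct names are illustrative, not the LLVM ones):

#include <cassert>

enum Bank { SGPR, VGPR };

struct LoadMapping { Bank Value; Bank Ptr; };

// Mirrors getInstrMappingForLoad: one bank pair per divergence state.
LoadMapping mapLoad(bool IsUniform) {
  return IsUniform ? LoadMapping{SGPR, SGPR} : LoadMapping{VGPR, VGPR};
}

int main() {
  assert(mapLoad(true).Value == SGPR); // uniform -> SMRD-style load
  assert(mapLoad(false).Ptr == VGPR);  // divergent -> FLAT-style load
}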
+
+RegisterBankInfo::InstructionMapping
+AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
+  RegisterBankInfo::InstructionMapping Mapping = getInstrMappingImpl(MI);
+
+  if (Mapping.isValid())
+    return Mapping;
+
+  const MachineFunction &MF = *MI.getParent()->getParent();
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  Mapping = InstructionMapping{1, 1, nullptr, MI.getNumOperands()};
+  SmallVector<const ValueMapping *, 8> OpdsMapping(MI.getNumOperands());
+
+  switch (MI.getOpcode()) {
+  default:
+    break;
+  case AMDGPU::G_CONSTANT: {
+    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+    OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
+    Mapping.setOperandsMapping(getOperandsMapping(OpdsMapping));
+    return Mapping;
+  }
+  case AMDGPU::G_GEP: {
+    for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+      if (!MI.getOperand(i).isReg())
+        continue;
+
+      unsigned Size = MRI.getType(MI.getOperand(i).getReg()).getSizeInBits();
+      OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
+    }
+    Mapping.setOperandsMapping(getOperandsMapping(OpdsMapping));
+    return Mapping;
+  }
+  case AMDGPU::G_STORE: {
+    assert(MI.getOperand(0).isReg());
+    unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+    // FIXME: We need to specify a different reg bank once scalar stores
+    // are supported.
+    const ValueMapping *ValMapping =
+        AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+    // FIXME: Depending on the type of store, the pointer could be in
+    // the SGPR reg bank.
+    // FIXME: Pointer size should be based on the address space.
+    const ValueMapping *PtrMapping =
+        AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);
+
+    OpdsMapping[0] = ValMapping;
+    OpdsMapping[1] = PtrMapping;
+    Mapping.setOperandsMapping(getOperandsMapping(OpdsMapping));
+    return Mapping;
+  }
+
+  case AMDGPU::G_LOAD:
+    return getInstrMappingForLoad(MI);
+  }
+
+  unsigned BankID = AMDGPU::SGPRRegBankID;
+
+  Mapping = InstructionMapping{1, 1, nullptr, MI.getNumOperands()};
+  unsigned Size = 0;
+  for (unsigned Idx = 0; Idx < MI.getNumOperands(); ++Idx) {
+    // If the operand is not a register, default to the size of the previous
+    // operand.
+    // FIXME: Can't we pull the types from the MachineInstr rather than the
+    // operands?
+    if (MI.getOperand(Idx).isReg())
+      Size = getSizeInBits(MI.getOperand(Idx).getReg(), MRI, *TRI);
+    OpdsMapping[Idx] = AMDGPU::getValueMapping(BankID, Size);
+  }
+  Mapping.setOperandsMapping(getOperandsMapping(OpdsMapping));
+
+  return Mapping;
+}
Index: lib/Target/AMDGPU/AMDGPUSubtarget.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -496,6 +496,21 @@
     return GISel->getCallLowering();
   }
 
+  const InstructionSelector *getInstructionSelector() const override {
+    assert(GISel && "Access to GlobalISel APIs not set");
+    return GISel->getInstructionSelector();
+  }
+
+  const LegalizerInfo *getLegalizerInfo() const {
+    assert(GISel && "Access to GlobalISel APIs not set");
+    return GISel->getLegalizerInfo();
+  }
+
+  const RegisterBankInfo *getRegBankInfo() const override {
+    assert(GISel && "Access to GlobalISel APIs not set");
+    return GISel->getRegBankInfo();
+  }
+
   const SIRegisterInfo *getRegisterInfo() const override {
     return &InstrInfo.getRegisterInfo();
   }
Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -16,6 +16,9 @@
 #include "AMDGPUTargetMachine.h"
 #include "AMDGPU.h"
 #include "AMDGPUCallLowering.h"
+#include "AMDGPUInstructionSelector.h"
+#include "AMDGPULegalizerInfo.h"
+#include "AMDGPURegisterBankInfo.h"
 #include "AMDGPUTargetObjectFile.h"
 #include "AMDGPUTargetTransformInfo.h"
 #include "GCNSchedStrategy.h"
@@ -25,7 +28,12 @@
 #include "SIISelLowering.h"
 #include "SIInstrInfo.h"
 #include "SIMachineScheduler.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
 #include "llvm/CodeGen/GlobalISel/IRTranslator.h"
+#include "llvm/CodeGen/GlobalISel/Legalizer.h"
+#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/Support/TargetRegistry.h"
@@ -221,9 +229,21 @@
 namespace {
 struct SIGISelActualAccessor : public GISelAccessor {
   std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
+  std::unique_ptr<InstructionSelector> InstSelector;
+  std::unique_ptr<LegalizerInfo> Legalizer;
+  std::unique_ptr<RegisterBankInfo> RegBankInfo;
   const AMDGPUCallLowering *getCallLowering() const override {
     return CallLoweringInfo.get();
   }
+  const InstructionSelector *getInstructionSelector() const override {
+    return InstSelector.get();
+  }
+  const class LegalizerInfo *getLegalizerInfo() const override {
+    return Legalizer.get();
+  }
+  const RegisterBankInfo *getRegBankInfo() const override {
+    return RegBankInfo.get();
+  }
 };
 } // End anonymous namespace.
 #endif
@@ -256,6 +276,11 @@
   SIGISelActualAccessor *GISel = new SIGISelActualAccessor();
   GISel->CallLoweringInfo.reset(
       new AMDGPUCallLowering(*I->getTargetLowering()));
+  GISel->Legalizer.reset(new AMDGPULegalizerInfo());
+
+  GISel->RegBankInfo.reset(new AMDGPURegisterBankInfo(*I->getRegisterInfo()));
+  GISel->InstSelector.reset(new AMDGPUInstructionSelector(*I,
+      *static_cast<AMDGPURegisterBankInfo *>(GISel->RegBankInfo.get())));
 #endif
   I->setGISelAccessor(*GISel);
@@ -546,16 +571,20 @@
 }
 
 bool GCNPassConfig::addLegalizeMachineIR() {
+  addPass(new Legalizer());
   return false;
 }
 
 bool GCNPassConfig::addRegBankSelect() {
+  addPass(new RegBankSelect());
   return false;
 }
 
 bool GCNPassConfig::addGlobalInstructionSelect() {
+  addPass(new InstructionSelect());
   return false;
 }
+
 #endif
 
 void GCNPassConfig::addPreRegAlloc() {
Index: lib/Target/AMDGPU/CMakeLists.txt
===================================================================
--- lib/Target/AMDGPU/CMakeLists.txt
+++ lib/Target/AMDGPU/CMakeLists.txt
@@ -39,14 +39,17 @@
   AMDGPUTargetObjectFile.cpp
   AMDGPUIntrinsicInfo.cpp
   AMDGPUISelDAGToDAG.cpp
+  AMDGPULegalizerInfo.cpp
   AMDGPUMCInstLower.cpp
   AMDGPUMachineFunction.cpp
   AMDGPUOpenCLImageTypeLoweringPass.cpp
+  AMDGPURegisterBankInfo.cpp
   AMDGPUSubtarget.cpp
   AMDGPUTargetMachine.cpp
   AMDGPUTargetTransformInfo.cpp
   AMDGPUISelLowering.cpp
   AMDGPUInstrInfo.cpp
+  AMDGPUInstructionSelector.cpp
   AMDGPUPromoteAlloca.cpp
   AMDGPURegisterInfo.cpp
   GCNHazardRecognizer.cpp
Index: lib/Target/AMDGPU/LLVMBuild.txt
===================================================================
--- lib/Target/AMDGPU/LLVMBuild.txt
+++ lib/Target/AMDGPU/LLVMBuild.txt
@@ -30,5 +30,5 @@
 type = Library
 name = AMDGPUCodeGen
 parent = AMDGPU
-required_libraries = Analysis AsmPrinter CodeGen Core IPO MC AMDGPUAsmPrinter AMDGPUDesc AMDGPUInfo AMDGPUUtils Scalar SelectionDAG Support Target TransformUtils Vectorize
+required_libraries = Analysis AsmPrinter CodeGen Core IPO MC AMDGPUAsmPrinter AMDGPUDesc AMDGPUInfo AMDGPUUtils Scalar SelectionDAG Support Target TransformUtils Vectorize GlobalISel
 add_to_library_groups = AMDGPU
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -611,18 +611,8 @@
 bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
   const MemSDNode *MemNode = cast<MemSDNode>(N);
-  const Value *Ptr = MemNode->getMemOperand()->getValue();
-
-  // UndefValue means this is a load of a kernel input. These are uniform.
-  // Sometimes LDS instructions have constant pointers.
-  // If Ptr is null, then that means this mem operand contains a
-  // PseudoSourceValue like GOT.
-  if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
-      isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
-    return true;
-
-  const Instruction *I = dyn_cast<Instruction>(Ptr);
-  return I && I->getMetadata("amdgpu.uniform");
+  return AMDGPU::isUniformMMO(MemNode->getMemOperand());
 }
 
 TargetLoweringBase::LegalizeTypeAction
Index: lib/Target/AMDGPU/SIInstrInfo.h
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.h
+++ lib/Target/AMDGPU/SIInstrInfo.h
@@ -424,6 +424,12 @@
     return MI.getDesc().TSFlags & SIInstrFlags::SOPK_ZEXT;
   }
 
+  static int64_t getSMRDEncodedOffset(const AMDGPUSubtarget &ST,
+                                      int64_t ByteOffset);
+
+  static bool isLegalSMRDImmOffset(const AMDGPUSubtarget &ST,
+                                   int64_t ByteOffset);
+
   bool sopkIsZext(uint16_t Opcode) const {
     return get(Opcode).TSFlags & SIInstrFlags::SOPK_ZEXT;
   }
Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2529,6 +2529,21 @@
   return DstReg;
 }
 
+int64_t SIInstrInfo::getSMRDEncodedOffset(const AMDGPUSubtarget &ST,
+                                          int64_t ByteOffset) {
+  if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
+    return ByteOffset >> 2;
+
+  return ByteOffset;
+}
+
+bool SIInstrInfo::isLegalSMRDImmOffset(const AMDGPUSubtarget &ST,
+                                       int64_t ByteOffset) {
+  int64_t EncodedOffset = getSMRDEncodedOffset(ST, ByteOffset);
+  return ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS ?
+      isUInt<8>(EncodedOffset) : isUInt<20>(EncodedOffset);
+}
+
 void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
                                        MachineInstr &MI) const {
Index: lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
===================================================================
--- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -22,6 +22,7 @@
 class FeatureBitset;
 class Function;
 class GlobalValue;
+class MachineMemOperand;
 class MCContext;
 class MCInstrDesc;
 class MCRegisterClass;
@@ -171,6 +172,8 @@
 bool isInlinableLiteral64(int64_t Literal, bool IsVI);
 bool isInlinableLiteral32(int32_t Literal, bool IsVI);
 
+bool isUniformMMO(const MachineMemOperand *MMO);
+
 } // end namespace AMDGPU
 } // end namespace llvm
Index: lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
===================================================================
--- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -9,6 +9,8 @@
 #include "AMDGPUBaseInfo.h"
 #include "AMDGPU.h"
 #include "SIDefines.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalValue.h"
@@ -428,6 +430,19 @@
   return false;
 }
 
+bool isUniformMMO(const MachineMemOperand *MMO) {
+  const Value *Ptr = MMO->getValue();
+  // UndefValue means this is a load of a kernel input. These are uniform.
+  // Sometimes LDS instructions have constant pointers.
+  // If Ptr is null, then that means this mem operand contains a
+  // PseudoSourceValue like GOT.
+  if (!Ptr || isa<UndefValue>(Ptr) || isa<Argument>(Ptr) ||
+      isa<Constant>(Ptr) || isa<GlobalValue>(Ptr))
+    return true;
+
+  const Instruction *I = dyn_cast<Instruction>(Ptr);
+  return I && I->getMetadata("amdgpu.uniform");
+}
 } // End namespace AMDGPU
 } // End namespace llvm
Index: test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir
@@ -0,0 +1,29 @@
+# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s -check-prefixes=GCN
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s -check-prefixes=GCN
+
+--- |
+  define void @global_addrspace(i32 addrspace(1)* %global0) { ret void }
+...
+---
+
+name: global_addrspace
+legalized: true
+regBankSelected: true
+
+registers:
+  - { id: 0, class: vgpr }
+  - { id: 1, class: vgpr }
+
+# GCN: global_addrspace
+# GCN: [[PTR:%[0-9]+]] = COPY %vgpr0_vgpr1
+# GCN: FLAT_LOAD_DWORD [[PTR]], 0, 0, 0
+
+body: |
+  bb.0:
+    liveins: %vgpr0_vgpr1
+
+    %0(p1) = COPY %vgpr0_vgpr1
+    %1(s32) = G_LOAD %0 :: (load 4 from %ir.global0)
+
+...
+---
Index: test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir
@@ -0,0 +1,171 @@
+# RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s -check-prefixes=GCN,SI,SICI,SIVI
+# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s -check-prefixes=GCN,CI,SICI
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s -check-prefixes=GCN,VI,SIVI
+
+--- |
+  define void @smrd_imm(i32 addrspace(2)* %const0) { ret void }
+...
+---
+
+name: smrd_imm
+legalized: true
+regBankSelected: true
+
+registers:
+  - { id: 0, class: sgpr }
+  - { id: 1, class: sgpr }
+  - { id: 2, class: sgpr }
+  - { id: 3, class: sgpr }
+  - { id: 4, class: sgpr }
+  - { id: 5, class: sgpr }
+  - { id: 6, class: sgpr }
+  - { id: 7, class: sgpr }
+  - { id: 8, class: sgpr }
+  - { id: 9, class: sgpr }
+  - { id: 10, class: sgpr }
+  - { id: 11, class: sgpr }
+  - { id: 12, class: sgpr }
+  - { id: 13, class: sgpr }
+  - { id: 14, class: sgpr }
+  - { id: 15, class: sgpr }
+  - { id: 16, class: sgpr }
+  - { id: 17, class: sgpr }
+  - { id: 18, class: sgpr }
+  - { id: 19, class: sgpr }
+  - { id: 20, class: sgpr }
+  - { id: 21, class: sgpr }
+  - { id: 22, class: sgpr }
+  - { id: 23, class: sgpr }
+  - { id: 24, class: sgpr }
+  - { id: 25, class: sgpr }
+  - { id: 26, class: sgpr }
+  - { id: 27, class: sgpr }
+
+# GCN: body:
+# GCN: [[PTR:%[0-9]+]] = COPY %sgpr0_sgpr1
+
+# Immediate offset:
+# SICI: S_LOAD_DWORD_IMM [[PTR]], 1, 0
+# VI: S_LOAD_DWORD_IMM [[PTR]], 4, 0
+
+# Max immediate offset for SI
+# SICI: S_LOAD_DWORD_IMM [[PTR]], 255, 0
+# VI: S_LOAD_DWORD_IMM [[PTR]], 1020, 0
+
+# Immediate overflow for SI
+# FIXME: The immediate gets selected twice, once into the
+# S_LOAD_DWORD instruction and once just as a normal constant.
+# SI: S_MOV_B32 1024
+# SI: [[K1024:%[0-9]+]] = S_MOV_B32 1024
+# SI: S_LOAD_DWORD_SGPR [[PTR]], [[K1024]], 0
+# CI: S_LOAD_DWORD_IMM_ci [[PTR]], 256, 0
+# VI: S_LOAD_DWORD_IMM [[PTR]], 1024, 0
+
+# Max immediate offset for VI
+# SI: S_MOV_B32 1048572
+# SI: [[K1048572:%[0-9]+]] = S_MOV_B32 1048572
+# CI: S_LOAD_DWORD_IMM_ci [[PTR]], 262143
+# VI: S_LOAD_DWORD_IMM [[PTR]], 1048572
+
+#
+# Immediate overflow for VI
+# FIXME: The immediate gets selected twice, once into the
+# S_LOAD_DWORD instruction and once just as a normal constant.
+# SIVI: S_MOV_B32 1048576
+# SIVI: [[K1048576:%[0-9]+]] = S_MOV_B32 1048576
+# SIVI: S_LOAD_DWORD_SGPR [[PTR]], [[K1048576]], 0
+# CI: S_LOAD_DWORD_IMM_ci [[PTR]], 262144, 0
+
+# Max immediate for CI
+# SIVI: [[K_LO:%[0-9]+]] = S_MOV_B32 4294967292
+# SIVI: [[K_HI:%[0-9]+]] = S_MOV_B32 3
+# SIVI: [[K:%[0-9]+]] = REG_SEQUENCE [[K_LO]], 1, [[K_HI]], 2
+# SIVI: [[K_SUB0:%[0-9]+]] = COPY [[K]].sub0
+# SIVI: [[PTR_LO:%[0-9]+]] = COPY [[PTR]].sub0
+# SIVI: [[ADD_PTR_LO:%[0-9]+]] = S_ADD_U32 [[PTR_LO]], [[K_SUB0]]
+# SIVI: [[K_SUB1:%[0-9]+]] = COPY [[K]].sub1
+# SIVI: [[PTR_HI:%[0-9]+]] = COPY [[PTR]].sub1
+# SIVI: [[ADD_PTR_HI:%[0-9]+]] = S_ADDC_U32 [[PTR_HI]], [[K_SUB1]]
+# SIVI: [[ADD_PTR:%[0-9]+]] = REG_SEQUENCE [[ADD_PTR_LO]], 1, [[ADD_PTR_HI]], 2
+# SIVI: S_LOAD_DWORD_IMM [[ADD_PTR]], 0, 0
+# CI: S_LOAD_DWORD_IMM_ci [[PTR]], 4294967295, 0
+
+# Immediate overflow for CI
+# GCN: [[K_LO:%[0-9]+]] = S_MOV_B32 0
+# GCN: [[K_HI:%[0-9]+]] = S_MOV_B32 4
+# GCN: [[K:%[0-9]+]] = REG_SEQUENCE [[K_LO]], 1, [[K_HI]], 2
+# GCN: [[K_SUB0:%[0-9]+]] = COPY [[K]].sub0
+# GCN: [[PTR_LO:%[0-9]+]] = COPY [[PTR]].sub0
+# GCN: [[ADD_PTR_LO:%[0-9]+]] = S_ADD_U32 [[PTR_LO]], [[K_SUB0]]
+# GCN: [[K_SUB1:%[0-9]+]] = COPY [[K]].sub1
+# GCN: [[PTR_HI:%[0-9]+]] = COPY [[PTR]].sub1
+# GCN: [[ADD_PTR_HI:%[0-9]+]] = S_ADDC_U32 [[PTR_HI]], [[K_SUB1]]
+# GCN: [[ADD_PTR:%[0-9]+]] = REG_SEQUENCE [[ADD_PTR_LO]], 1, [[ADD_PTR_HI]], 2
+# GCN: S_LOAD_DWORD_IMM [[ADD_PTR]], 0, 0
+
+# Max 32-bit byte offset
+# FIXME: The immediate gets selected twice, once into the
+# S_LOAD_DWORD instruction and once just as a normal constant.
+# SIVI: S_MOV_B32 4294967292
+# SIVI: [[K4294967292:%[0-9]+]] = S_MOV_B32 4294967292
+# SIVI: S_LOAD_DWORD_SGPR [[PTR]], [[K4294967292]], 0
+# CI: S_LOAD_DWORD_IMM_ci [[PTR]], 1073741823, 0
+
+# Overflow 32-bit byte offset
+# SIVI: [[K_LO:%[0-9]+]] = S_MOV_B32 0
+# SIVI: [[K_HI:%[0-9]+]] = S_MOV_B32 1
+# SIVI: [[K:%[0-9]+]] = REG_SEQUENCE [[K_LO]], 1, [[K_HI]], 2
+# SIVI: [[K_SUB0:%[0-9]+]] = COPY [[K]].sub0
+# SIVI: [[PTR_LO:%[0-9]+]] = COPY [[PTR]].sub0
+# SIVI: [[ADD_PTR_LO:%[0-9]+]] = S_ADD_U32 [[PTR_LO]], [[K_SUB0]]
+# SIVI: [[K_SUB1:%[0-9]+]] = COPY [[K]].sub1
+# SIVI: [[PTR_HI:%[0-9]+]] = COPY [[PTR]].sub1
+# SIVI: [[ADD_PTR_HI:%[0-9]+]] = S_ADDC_U32 [[PTR_HI]], [[K_SUB1]]
+# SIVI: [[ADD_PTR:%[0-9]+]] = REG_SEQUENCE [[ADD_PTR_LO]], 1, [[ADD_PTR_HI]], 2
+# SIVI: S_LOAD_DWORD_IMM [[ADD_PTR]], 0, 0
+# CI: S_LOAD_DWORD_IMM_ci [[PTR]], 1073741824, 0
+
+body: |
+  bb.0:
+    liveins: %sgpr0_sgpr1
+
+    %0(p2) = COPY %sgpr0_sgpr1
+
+    %1(s64) = G_CONSTANT 4
+    %2(p2) = G_GEP %0, %1
+    %3(s32) = G_LOAD %2 :: (load 4 from %ir.const0)
+
+    %4(s64) = G_CONSTANT 1020
+    %5(p2) = G_GEP %0, %4
+    %6(s32) = G_LOAD %5 :: (load 4 from %ir.const0)
+
+    %7(s64) = G_CONSTANT 1024
+    %8(p2) = G_GEP %0, %7
+    %9(s32) = G_LOAD %8 :: (load 4 from %ir.const0)
+
+    %10(s64) = G_CONSTANT 1048572
+    %11(p2) = G_GEP %0, %10
+    %12(s32) = G_LOAD %11 :: (load 4 from %ir.const0)
+
+    %13(s64) = G_CONSTANT 1048576
+    %14(p2) = G_GEP %0, %13
+    %15(s32) = G_LOAD %14 :: (load 4 from %ir.const0)
+
+    %16(s64) = G_CONSTANT 17179869180
+    %17(p2) = G_GEP %0, %16
+    %18(s32) = G_LOAD %17 :: (load 4 from %ir.const0)
+
+    %19(s64) = G_CONSTANT 17179869184
+    %20(p2) = G_GEP %0, %19
+    %21(s32) = G_LOAD %20 :: (load 4 from %ir.const0)
+
+    %22(s64) = G_CONSTANT 4294967292
+    %23(p2) = G_GEP %0, %22
+    %24(s32) = G_LOAD %23 :: (load 4 from %ir.const0)
+
+    %25(s64) = G_CONSTANT 4294967296
+    %26(p2) = G_GEP %0, %25
+    %27(s32) = G_LOAD %26 :: (load 4 from %ir.const0)
+
+...
+---
Index: test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir
@@ -0,0 +1,31 @@
+# RUN: llc -march=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s -check-prefixes=GCN
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s -check-prefixes=GCN
+
+--- |
+  define void @global_addrspace(i32 addrspace(1)* %global0) { ret void }
+...
+---
+
+name: global_addrspace
+legalized: true
+regBankSelected: true
+
+registers:
+  - { id: 0, class: vgpr }
+  - { id: 1, class: vgpr }
+
+# GCN: global_addrspace
+# GCN: [[PTR:%[0-9]+]] = COPY %vgpr0_vgpr1
+# GCN: [[VAL:%[0-9]+]] = COPY %vgpr2
+# GCN: FLAT_STORE_DWORD [[PTR]], [[VAL]], 0, 0, 0
+
+body: |
+  bb.0:
+    liveins: %vgpr0_vgpr1, %vgpr2
+
+    %0(p1) = COPY %vgpr0_vgpr1
+    %1(s32) = COPY %vgpr2
+    G_STORE %1, %0 :: (store 4 into %ir.global0)
+
+...
+---
Index: test/CodeGen/AMDGPU/GlobalISel/shader-epilogs.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/GlobalISel/shader-epilogs.ll
@@ -0,0 +1,9 @@
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs -global-isel | FileCheck --check-prefix=GCN %s
+
+; GCN-LABEL: vs_epilog
+; GCN: s_endpgm
+
+define amdgpu_vs void @vs_epilog() {
+main_body:
+  ret void
+}
Index: test/CodeGen/AMDGPU/GlobalISel/smrd.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/GlobalISel/smrd.ll
@@ -0,0 +1,86 @@
+; RUN: llc < %s -march=amdgcn -mcpu=SI -show-mc-encoding -verify-machineinstrs -global-isel | FileCheck --check-prefix=SI --check-prefix=GCN --check-prefix=SIVI %s
+; RUN: llc < %s -march=amdgcn -mcpu=bonaire -show-mc-encoding -verify-machineinstrs -global-isel | FileCheck --check-prefix=CI --check-prefix=GCN %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs -global-isel | FileCheck --check-prefix=VI --check-prefix=GCN --check-prefix=SIVI %s
+
+; SMRD load with an immediate offset.
+; GCN-LABEL: {{^}}smrd0:
+; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01
+; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4
+define void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+entry:
+  %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 1
+  %1 = load i32, i32 addrspace(2)* %0
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; SMRD load with the largest possible immediate offset.
+; GCN-LABEL: {{^}}smrd1:
+; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff,0x{{[0-9]+[137]}}
+; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
+define void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+entry:
+  %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 255
+  %1 = load i32, i32 addrspace(2)* %0
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; SMRD load with an offset greater than the largest possible immediate.
+; GCN-LABEL: {{^}}smrd2:
+; SI: s_movk_i32 s[[OFFSET:[0-9]]], 0x400
+; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]]
+; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100
+; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
+; GCN: s_endpgm
+define void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+entry:
+  %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 256
+  %1 = load i32, i32 addrspace(2)* %0
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; SMRD load with a 64-bit offset
+; GCN-LABEL: {{^}}smrd3:
+; FIXME: There are too many copies here because we don't fold immediates
+; through REG_SEQUENCE
+; XSI: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0xb ; encoding: [0x0b
+; TODO: Add VI checks
+; XGCN: s_endpgm
+define void @smrd3(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+entry:
+  %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 4294967296 ; 2 ^ 32
+  %1 = load i32, i32 addrspace(2)* %0
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; SMRD load with the largest possible immediate offset on VI
+; GCN-LABEL: {{^}}smrd4:
+; SI: s_mov_b32 [[OFFSET:s[0-9]+]], 0xffffc
+; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
+; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff
+; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc
+define void @smrd4(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+entry:
+  %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 262143
+  %1 = load i32, i32 addrspace(2)* %0
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
+
+; SMRD load with an offset greater than the largest possible immediate on VI
+; GCN-LABEL: {{^}}smrd5:
+; SIVI: s_mov_b32 [[OFFSET:s[0-9]+]], 0x100000
+; SIVI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
+; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000
+; GCN: s_endpgm
+define void @smrd5(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+entry:
+  %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 262144
+  %1 = load i32, i32 addrspace(2)* %0
+  store i32 %1, i32 addrspace(1)* %out
+  ret void
+}
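Review note: the smrd tests probe the encoding boundaries from the SIInstrInfo change; each byte offset is the getelementptr dword index times 4. A standalone check of that arithmetic (plain C++, nothing LLVM-specific):

#include <cstdint>

// Byte offsets exercised by smrd0-smrd5: GEP index * 4 bytes per i32.
static_assert(255 * 4 == 1020, "max 8-bit dword offset on SI");
static_assert(256 * 4 == 1024, "first offset needing an SGPR offset on SI");
static_assert(262143 * 4 == 1048572, "max 20-bit byte offset on VI (0xffffc)");
static_assert(262144 * 4 == 1048576, "first offset overflowing VI's field");

int main() { return 0; }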