Index: lib/Target/AMDGPU/AMDGPU.h
===================================================================
--- lib/Target/AMDGPU/AMDGPU.h
+++ lib/Target/AMDGPU/AMDGPU.h
@@ -41,6 +41,7 @@
 FunctionPass *createSIFoldOperandsPass();
 FunctionPass *createSIPeepholeSDWAPass();
 FunctionPass *createSILowerI1CopiesPass();
+FunctionPass *createSIFixupVectorISelPass();
 FunctionPass *createSIShrinkInstructionsPass();
 FunctionPass *createSILoadStoreOptimizerPass();
 FunctionPass *createSIWholeQuadModePass();
@@ -118,6 +119,9 @@
 void initializeSIFixVGPRCopiesPass(PassRegistry &);
 extern char &SIFixVGPRCopiesID;
 
+void initializeSIFixupVectorISelPass(PassRegistry &);
+extern char &SIFixupVectorISelID;
+
 void initializeSILowerI1CopiesPass(PassRegistry &);
 extern char &SILowerI1CopiesID;
 
Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -161,6 +161,7 @@
   initializeSILowerI1CopiesPass(*PR);
   initializeSIFixSGPRCopiesPass(*PR);
   initializeSIFixVGPRCopiesPass(*PR);
+  initializeSIFixupVectorISelPass(*PR);
   initializeSIFoldOperandsPass(*PR);
   initializeSIPeepholeSDWAPass(*PR);
   initializeSIShrinkInstructionsPass(*PR);
@@ -813,6 +814,7 @@
   AMDGPUPassConfig::addInstSelector();
   addPass(createSILowerI1CopiesPass());
   addPass(&SIFixSGPRCopiesID);
+  addPass(createSIFixupVectorISelPass());
   return false;
 }
 
Index: lib/Target/AMDGPU/CMakeLists.txt
===================================================================
--- lib/Target/AMDGPU/CMakeLists.txt
+++ lib/Target/AMDGPU/CMakeLists.txt
@@ -95,6 +95,7 @@
   SIAnnotateControlFlow.cpp
   SIDebuggerInsertNops.cpp
   SIFixSGPRCopies.cpp
+  SIFixupVectorISel.cpp
   SIFixVGPRCopies.cpp
   SIFixWWMLiveness.cpp
   SIFoldOperands.cpp
Index: lib/Target/AMDGPU/FLATInstructions.td
===================================================================
--- lib/Target/AMDGPU/FLATInstructions.td
+++ lib/Target/AMDGPU/FLATInstructions.td
@@ -121,6 +121,11 @@
   let Inst{63-56} = !if(ps.has_vdst, vdst, ?);
 }
 
+class GlobalSaddrTable <bit is_saddr, string Name = ""> {
+  bit IsSaddr = is_saddr;
+  string SaddrOp = Name;
+}
+
 // TODO: Is exec allowed for saddr? The disabled value 0x7f is the
 // same encoding value as exec_hi, so it isn't possible to use that if
 // saddr is 32-bit (which isn't handled here yet).
@@ -171,15 +176,19 @@
 multiclass FLAT_Global_Load_Pseudo<string opName, RegisterClass regClass, bit HasTiedInput = 0> {
   let is_flat_global = 1 in {
-    def "" : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1>;
-    def _SADDR : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1>;
+    def "" : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1>,
+      GlobalSaddrTable<0, opName>;
+    def _SADDR : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1>,
+      GlobalSaddrTable<1, opName>;
   }
 }
 
 multiclass FLAT_Global_Store_Pseudo<string opName, RegisterClass regClass> {
   let is_flat_global = 1 in {
-    def "" : FLAT_Store_Pseudo<opName, regClass, 1>;
-    def _SADDR : FLAT_Store_Pseudo<opName, regClass, 1, 1>;
+    def "" : FLAT_Store_Pseudo<opName, regClass, 1>,
+      GlobalSaddrTable<0, opName>;
+    def _SADDR : FLAT_Store_Pseudo<opName, regClass, 1, 1>,
+      GlobalSaddrTable<1, opName>;
   }
 }
 
@@ -287,6 +296,7 @@
     (outs),
     (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, SLC:$slc),
     " $vaddr, $vdata, off$offset$slc">,
+    GlobalSaddrTable<0, opName>,
     AtomicNoRet {
     let has_saddr = 1;
     let PseudoInstr = NAME;
@@ -298,6 +308,7 @@
     " $vdst, $vaddr, $vdata, off$offset glc$slc",
     [(set vt:$vdst, (atomic (FLATSignedAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$vdata))]>,
+    GlobalSaddrTable<0, opName#"_rtn">,
    AtomicNoRet {
     let has_saddr = 1;
   }
@@ -306,6 +317,7 @@
     (outs),
     (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, offset_s13:$offset, SLC:$slc),
     " $vaddr, $vdata, $saddr$offset$slc">,
+    GlobalSaddrTable<1, opName>,
     AtomicNoRet {
     let has_saddr = 1;
     let enabled_saddr = 1;
@@ -316,6 +328,7 @@
     (outs vdst_rc:$vdst),
     (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, offset_s13:$offset, SLC:$slc),
     " $vdst, $vaddr, $vdata, $saddr$offset glc$slc">,
+    GlobalSaddrTable<1, opName#"_rtn">,
     AtomicNoRet {
     let has_saddr = 1;
     let enabled_saddr = 1;
Index: lib/Target/AMDGPU/SIFixupVectorISel.cpp
===================================================================
--- /dev/null
+++ lib/Target/AMDGPU/SIFixupVectorISel.cpp
@@ -0,0 +1,212 @@
+//===-- SIFixupVectorISel.cpp - Fixup post ISel vector issues -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+/// SIFixupVectorISel pass cleans up post-ISel vector issues.
+/// Currently this will convert GLOBAL_{LOAD|STORE}_*
+/// and GLOBAL_ATOMIC_* instructions into their _SADDR variants,
+/// feeding the sreg into the saddr field of the new instruction.
+/// We currently handle a REG_SEQUENCE feeding the vaddr
+/// and decompose it into a base and index.
+///
+/// Transform:
+/// %17:vgpr_32, %19:sreg_64_xexec = V_ADD_I32_e64 %21:sgpr_32, %22:vgpr_32
+/// %18:vgpr_32, %20:sreg_64_xexec = V_ADDC_U32_e64 %25:vgpr_32,
+///                                    %24:vgpr_32, %19:sreg_64_xexec
+/// %16:vreg_64 = REG_SEQUENCE %17:vgpr_32, %sub0, %18:vgpr_32, %sub1
+/// %11:vreg_64 = COPY %16:vreg_64
+/// %10:vgpr_32 = GLOBAL_LOAD_DWORD killed %11:vreg_64, 16, 0, 0
+/// Into:
+/// %4:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1:sgpr_64, 36, 0
+/// %14:vreg_64 = REG_SEQUENCE %6:vgpr_32, %sub0, %15:vgpr_32, %sub1
+/// %10:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %14:vreg_64, %4:sreg_64_xexec,16...
+///
+//===----------------------------------------------------------------------===//
+//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+#define DEBUG_TYPE "si-fixup-vector-isel"
+
+using namespace llvm;
+
+STATISTIC(NumSGPRGlobalOccurs, "Number of global ld/st opportunities");
+STATISTIC(NumSGPRGlobalSaddrs, "Number of global sgpr instructions converted");
+
+namespace {
+
+class SIFixupVectorISel : public MachineFunctionPass {
+public:
+  static char ID;
+
+public:
+  SIFixupVectorISel() : MachineFunctionPass(ID) {
+    initializeSIFixupVectorISelPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS(SIFixupVectorISel, DEBUG_TYPE,
+                "SI Fixup Vector ISel", false, false)
+
+char SIFixupVectorISel::ID = 0;
+
+char &llvm::SIFixupVectorISelID = SIFixupVectorISel::ID;
+
+FunctionPass *llvm::createSIFixupVectorISelPass() {
+  return new SIFixupVectorISel();
+}
+
+static bool findSRegBaseAndIndex(MachineOperand *Op,
+                                 unsigned &BaseReg,
+                                 unsigned &IndexReg,
+                                 MachineRegisterInfo &MRI,
+                                 const SIRegisterInfo *TRI) {
+  SmallVector<MachineOperand *, 8> Worklist;
+  Worklist.push_back(Op);
+  while (!Worklist.empty()) {
+    MachineOperand *WOp = Worklist.pop_back_val();
+    if (!WOp->isReg() ||
+        !TargetRegisterInfo::isVirtualRegister(WOp->getReg()))
+      continue;
+    MachineInstr *DefInst = MRI.getUniqueVRegDef(WOp->getReg());
+    switch (DefInst->getOpcode()) {
+    default:
+      continue;
+    case AMDGPU::COPY:
+      Worklist.push_back(&DefInst->getOperand(1));
+      break;
+    case AMDGPU::REG_SEQUENCE:
+      if (DefInst->getNumOperands() != 5)
+        continue;
+      Worklist.push_back(&DefInst->getOperand(1));
+      Worklist.push_back(&DefInst->getOperand(3));
+      break;
+    case AMDGPU::V_ADD_I32_e64:
+      if (DefInst->getOperand(2).getSubReg() != AMDGPU::NoSubRegister)
+        continue;
+      BaseReg = DefInst->getOperand(2).getReg();
+      if (DefInst->getOperand(3).getSubReg() != AMDGPU::NoSubRegister)
+        continue;
+      IndexReg = DefInst->getOperand(3).getReg();
+      // Chase the IndexReg.
+      MachineInstr *MI = MRI.getUniqueVRegDef(IndexReg);
+      if (!MI || !MI->isCopy())
+        continue;
+      IndexReg = MI->getOperand(1).getReg();
+      // Chase the BaseReg.
+      MI = MRI.getUniqueVRegDef(BaseReg);
+      if (!MI || !MI->isCopy())
+        continue;
+      BaseReg = MI->getOperand(1).getReg();
+      // Make sure Base is SReg and Index is VReg.
+      if (!TRI->isSGPRReg(MRI, BaseReg))
+        return false;
+      if (!TRI->hasVGPRs(MRI.getRegClass(IndexReg)))
+        return false;
+      // Clear any kill flags on the Index and Base regs; they are used later.
+      MRI.clearKillFlags(IndexReg);
+      MRI.clearKillFlags(BaseReg);
+      return true;
+    }
+  }
+  return false;
+}
+
+// Identify Global LOAD|STORE/ATOMIC and try to convert to _SADDR.
+static bool fixupGlobalSaddr(MachineBasicBlock &MBB,
+                             MachineFunction &MF,
+                             MachineRegisterInfo &MRI,
+                             const GCNSubtarget &ST,
+                             const SIInstrInfo *TII,
+                             const SIRegisterInfo *TRI) {
+  bool FuncModified = false;
+  MachineBasicBlock::iterator I, Next;
+  for (I = MBB.begin(); I != MBB.end(); I = Next) {
+    Next = std::next(I);
+    MachineInstr &MI = *I;
+    int NewOpcd = AMDGPU::getGlobalSaddrOp(MI.getOpcode());
+    if (NewOpcd < 0)
+      continue;
+    // Update our statistics on opportunities seen.
+    ++NumSGPRGlobalOccurs;
+    LLVM_DEBUG(dbgs() << "Global Mem opp " << MI << '\n');
+    // Need a Base and Index or we can't transform to _SADDR.
+    unsigned BaseReg = 0;
+    unsigned IndexReg = 0;
+    MachineOperand *Op = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
+    if (!findSRegBaseAndIndex(Op, BaseReg, IndexReg, MRI, TRI))
+      continue;
+    ++NumSGPRGlobalSaddrs;
+    FuncModified = true;
+    // Create the new _SADDR Memory instruction.
+    bool HasVdst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst) != nullptr;
+    MachineOperand *VData = TII->getNamedOperand(MI, AMDGPU::OpName::vdata);
+    MachineInstr *NewGlob = nullptr;
+    if (HasVdst)
+      NewGlob = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpcd),
+                        MI.getOperand(0).getReg());
+    else
+      // No vdst field.
+      NewGlob = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpcd));
+    NewGlob->addOperand(MF, MachineOperand::CreateReg(IndexReg, false));
+    if (VData)
+      NewGlob->addOperand(MF, *VData);
+    NewGlob->addOperand(MF, MachineOperand::CreateReg(BaseReg, false));
+    NewGlob->addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::offset));
+
+    MachineOperand *Glc = TII->getNamedOperand(MI, AMDGPU::OpName::glc);
+    // Atomics don't have a GLC, so omit the field if not there.
+    if (Glc)
+      NewGlob->addOperand(MF, *Glc);
+    NewGlob->addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::slc));
+    // _D16 instructions have a vdst_in operand; copy it in.
+    MachineOperand *VDstInOp = TII->getNamedOperand(MI,
+                                      AMDGPU::OpName::vdst_in);
+    if (VDstInOp)
+      NewGlob->addOperand(MF, *VDstInOp);
+    NewGlob->copyImplicitOps(MF, MI);
+    NewGlob->cloneMemRefs(MF, MI);
+    // Remove the old Global Memop instruction.
+    MI.eraseFromParent();
+    LLVM_DEBUG(dbgs() << "New Global Mem " << *NewGlob << '\n');
+  }
+  return FuncModified;
+}
+
+bool SIFixupVectorISel::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(MF.getFunction()))
+    return false;
+
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+  bool FuncModified = false;
+  for (MachineBasicBlock &MBB : MF)
+    // Clean up missed saddr opportunities from ISel.
+    FuncModified |= fixupGlobalSaddr(MBB, MF, MRI, ST, TII, TRI);
+  return FuncModified;
+}
Index: lib/Target/AMDGPU/SIInstrInfo.h
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.h
+++ lib/Target/AMDGPU/SIInstrInfo.h
@@ -954,6 +954,9 @@
   LLVM_READONLY
   int getSOPKOp(uint16_t Opcode);
 
+  LLVM_READONLY
+  int getGlobalSaddrOp(uint16_t Opcode);
+
   const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
   const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19);
   const uint64_t RSRC_INDEX_STRIDE_SHIFT = (32 + 21);
Index: lib/Target/AMDGPU/SIInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.td
+++ lib/Target/AMDGPU/SIInstrInfo.td
@@ -2017,6 +2017,15 @@
   let ValueCols = [["0"]];
 }
 
+// Maps a GLOBAL to its SADDR form.
+def getGlobalSaddrOp : InstrMapping { + let FilterClass = "GlobalSaddrTable"; + let RowFields = ["SaddrOp"]; + let ColFields = ["IsSaddr"]; + let KeyCol = ["0"]; + let ValueCols = [["1"]]; +} + include "SIInstructions.td" include "DSInstructions.td" Index: test/CodeGen/AMDGPU/conv2d-saddr.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/conv2d-saddr.ll @@ -0,0 +1,47 @@ +; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=SADDRGFX9 %s + +; SADDRGFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}{{$}} +; SADDRGFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:-16{{$}} +; SADDRGFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}} +; SADDRGFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:4{{$}} + +define hidden amdgpu_kernel void @simpleSaddrs(i32 addrspace(1)* nocapture %dst_image, i32 addrspace(1)* nocapture readonly %src_image, i32 addrspace(1)* nocapture readonly %conv_kernel) local_unnamed_addr #0 { +entry: + %id = call i32 @llvm.amdgcn.workitem.id.x() + %idx = zext i32 %id to i64 + %gep = getelementptr i32, i32 addrspace(1)* %src_image, i64 %idx + %ptr0 = getelementptr inbounds i32, i32 addrspace(1)* %gep, i64 1 + %load0 = load i32, i32 addrspace(1)* %ptr0 + %ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %gep, i64 2 + %load1 = load i32, i32 addrspace(1)* %ptr1 + %ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %gep, i64 3 + %load2 = load i32, i32 addrspace(1)* %ptr2 + %ptr3 = getelementptr inbounds i32, i32 addrspace(1)* %gep, i64 4 + %load3 = load i32, i32 addrspace(1)* %ptr3 + %ptr4 = getelementptr inbounds i32, i32 addrspace(1)* %gep, i64 -4 + %load4 = load i32, i32 addrspace(1)* %ptr4 + %ptr5 = getelementptr inbounds i32, i32 addrspace(1)* %gep, i64 -3 + %load5 = load i32, i32 addrspace(1)* %ptr5 + %ptr6 = getelementptr inbounds i32, i32 addrspace(1)* %gep, i64 -2 + %load6 = load i32, i32 addrspace(1)* %ptr6 + %ptr7 = getelementptr inbounds i32, i32 addrspace(1)* %gep, i64 -1 + %load7 = load i32, i32 addrspace(1)* %ptr7 + %ptr8 = getelementptr inbounds i32, i32 addrspace(1)* %gep, i64 0 + %load8 = load i32, i32 addrspace(1)* %ptr8 + %add0 = add i32 %load1, %load0 + %add1 = add i32 %load3, %load2 + %add2 = add i32 %load5, %load4 + %add3 = add i32 %load7, %load6 + %add4 = add i32 %add0, %load8 + %add5 = add i32 %add2, %add1 + %add6 = add i32 %add4, %add3 + %add7 = add i32 %add6, %add5 + %gep9 = getelementptr i32, i32 addrspace(1)* %dst_image, i64 %idx + %ptr9 = getelementptr inbounds i32, i32 addrspace(1)* %gep9, i64 1 + store volatile i32 %add7, i32 addrspace(1)* %ptr9 + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #1 +attributes #0 = { convergent nounwind } +attributes #1 = { nounwind readnone speculatable } Index: test/CodeGen/AMDGPU/ds_write2.ll =================================================================== --- test/CodeGen/AMDGPU/ds_write2.ll +++ test/CodeGen/AMDGPU/ds_write2.ll @@ -31,8 +31,8 @@ ; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}} -; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], 
{{v\[[0-9]+:[0-9]+\]}}, off offset:4 +; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:4 ; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} ; GCN: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8 @@ -177,8 +177,8 @@ ; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}} -; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off offset:4 +; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}{{$}} +; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:4 ; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} ; GCN: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255 @@ -362,8 +362,8 @@ ; CI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; CI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 -; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}} -; GFX9-DAG: global_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off offset:8 +; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}{{$}} +; GFX9-DAG: global_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:8 ; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}} Index: test/CodeGen/AMDGPU/ds_write2st64.ll =================================================================== --- test/CodeGen/AMDGPU/ds_write2st64.ll +++ test/CodeGen/AMDGPU/ds_write2st64.ll @@ -30,8 +30,8 @@ ; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}} -; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off offset:4 +; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}{{$}} +; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:4 ; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} @@ -59,8 +59,8 @@ ; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}} -; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off offset:4 +; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}{{$}} +; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:4 ; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 2, v{{[0-9]+}} ; GCN: v_add_{{i|u}}32_e32 [[VPTR:v[0-9]+]], {{(vcc, )?}}s{{[0-9]+}}, [[SHL]] @@ -87,8 +87,8 @@ ; CI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], 
{{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; CI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 -; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}} -; GFX9-DAG: global_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off offset:8 +; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}{{$}} +; GFX9-DAG: global_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:8 ; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 3, v{{[0-9]+}} ; GCN: v_add_{{i|u}}32_e32 [[VPTR:v[0-9]+]], {{(vcc, )?}}s{{[0-9]+}}, [[SHL]] Index: test/CodeGen/AMDGPU/global-load_stores.mir =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/global-load_stores.mir @@ -0,0 +1,86 @@ +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass si-fixup-vector-isel %s -o - | FileCheck -check-prefix=GCN %s + +# Coverage tests for GLOBAL_* to their _SADDR equivalent. + +# GCN-LABEL: name: global_loads_stores +# GCN: GLOBAL_LOAD_DWORD_SADDR +# GCN: GLOBAL_STORE_DWORD_SADDR +# GCN: GLOBAL_LOAD_DWORDX2_SADDR +# GCN: GLOBAL_STORE_DWORDX2_SADDR +# GCN: GLOBAL_LOAD_DWORDX3_SADDR +# GCN: GLOBAL_STORE_DWORDX3_SADDR +# GCN: GLOBAL_LOAD_DWORDX4_SADDR +# GCN: GLOBAL_STORE_DWORDX4_SADDR +# GCN: GLOBAL_LOAD_SSHORT_SADDR +# GCN: GLOBAL_STORE_SHORT_SADDR +# GCN: GLOBAL_LOAD_USHORT_SADDR +# GCN: GLOBAL_STORE_SHORT_SADDR +# GCN: GLOBAL_LOAD_UBYTE_SADDR +# GCN: GLOBAL_STORE_BYTE_SADDR +# GCN: GLOBAL_LOAD_SBYTE_SADDR +# GCN: GLOBAL_STORE_BYTE_SADDR +# GCN: GLOBAL_LOAD_SBYTE_D16_SADDR +# GCN: GLOBAL_STORE_BYTE_D16_HI_SADDR +# GCN: GLOBAL_LOAD_UBYTE_D16_SADDR +# GCN: GLOBAL_STORE_BYTE_D16_HI_SADDR +# GCN: GLOBAL_LOAD_SBYTE_D16_HI_SADDR +# GCN: GLOBAL_STORE_BYTE_D16_HI_SADDR +# GCN: GLOBAL_LOAD_UBYTE_D16_HI_SADDR +# GCN: GLOBAL_STORE_BYTE_D16_HI_SADDR +# GCN: GLOBAL_LOAD_SHORT_D16_HI_SADDR +# GCN: GLOBAL_STORE_SHORT_D16_HI_SADDR +# GCN: GLOBAL_LOAD_SHORT_D16_SADDR +# GCN: GLOBAL_STORE_SHORT_D16_HI_SADDR + +name: global_loads_stores +body: | + bb.0: + liveins: $vgpr0, $sgpr0_sgpr1 + + %1:sgpr_64 = COPY $sgpr0_sgpr1 + %0:vgpr_32 = COPY $vgpr0 + %4:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1, 36, 0 :: (dereferenceable invariant load 8 ) + %5:sreg_32_xm0 = S_MOV_B32 2 + %6:vgpr_32 = V_LSHLREV_B32_e64 killed %5, %0, implicit $exec + %7:sreg_32_xm0 = S_MOV_B32 0 + %15:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + %14:vreg_64 = REG_SEQUENCE killed %6, %subreg.sub0, killed %15, %subreg.sub1 + %21:sgpr_32 = COPY %4.sub0 + %22:vgpr_32 = COPY %14.sub0 + %23:sgpr_32 = COPY %4.sub1 + %24:vgpr_32 = COPY %14.sub1 + %17:vgpr_32, %19:sreg_64_xexec = V_ADD_I32_e64 %21, %22, implicit $exec + %25:vgpr_32 = COPY %23 + %18:vgpr_32, dead %20:sreg_64_xexec = V_ADDC_U32_e64 %25, %24, killed %19, implicit $exec + %16:vreg_64 = REG_SEQUENCE %17, %subreg.sub0, %18, %subreg.sub1 + %11:vreg_64 = COPY %16 + %10:vgpr_32 = GLOBAL_LOAD_DWORD %11, 16, 0, 0, implicit $exec :: (load 4) + GLOBAL_STORE_DWORD %11, %10, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + %40:vreg_64 = GLOBAL_LOAD_DWORDX2 %11, 16, 0, 0, implicit $exec :: (load 4) + GLOBAL_STORE_DWORDX2 %11, %40, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + %41:vreg_96 = GLOBAL_LOAD_DWORDX3 %11, 16, 0, 0, implicit $exec :: (load 4) + GLOBAL_STORE_DWORDX3 %11, %41, 0, 0, 0, 
implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + %42:vreg_128 = GLOBAL_LOAD_DWORDX4 %11, 16, 0, 0, implicit $exec :: (load 4) + GLOBAL_STORE_DWORDX4 %11, %42, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + %43:vgpr_32 = GLOBAL_LOAD_SSHORT %11, 16, 0, 0, implicit $exec :: (load 4) + GLOBAL_STORE_SHORT %11, %43, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + %44:vgpr_32 = GLOBAL_LOAD_USHORT %11, 16, 0, 0, implicit $exec :: (load 4) + GLOBAL_STORE_SHORT %11, %44, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + %45:vgpr_32 = GLOBAL_LOAD_UBYTE %11, 16, 0, 0, implicit $exec :: (load 4) + GLOBAL_STORE_BYTE %11, %45, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + %46:vgpr_32 = GLOBAL_LOAD_SBYTE %11, 16, 0, 0, implicit $exec :: (load 4) + GLOBAL_STORE_BYTE %11, %46, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + %47:vgpr_32 = GLOBAL_LOAD_SBYTE_D16 %11, 16, 0, 0, %46, implicit $exec :: (load 4) + GLOBAL_STORE_BYTE_D16_HI %11, %47, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + %48:vgpr_32 = GLOBAL_LOAD_UBYTE_D16 %11, 16, 0, 0, %46, implicit $exec :: (load 4) + GLOBAL_STORE_BYTE_D16_HI %11, %48, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + %49:vgpr_32 = GLOBAL_LOAD_SBYTE_D16_HI %11, 16, 0, 0, %46, implicit $exec :: (load 4) + GLOBAL_STORE_BYTE_D16_HI %11, %49, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + %50:vgpr_32 = GLOBAL_LOAD_UBYTE_D16_HI %11, 16, 0, 0, %46, implicit $exec :: (load 4) + GLOBAL_STORE_BYTE_D16_HI %11, %50, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + %51:vgpr_32 = GLOBAL_LOAD_SHORT_D16_HI %11, 16, 0, 0, %46, implicit $exec :: (load 4) + GLOBAL_STORE_SHORT_D16_HI %11, %51, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + %52:vgpr_32 = GLOBAL_LOAD_SHORT_D16 %11, 16, 0, 0, %46, implicit $exec :: (load 4) + GLOBAL_STORE_SHORT_D16_HI %11, %52, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + S_ENDPGM +... 
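The conversions checked in the MIR test above are driven entirely by the TableGen-generated opcode table behind getGlobalSaddrOp. As a minimal C++ sketch of querying that mapping from target code (assuming the usual lib/Target/AMDGPU includes; the hasGlobalSaddrForm helper name is illustrative and not part of this patch):

  // Sketch only: the generated AMDGPU::getGlobalSaddrOp mapping returns the
  // _SADDR opcode for a GLOBAL_* instruction, or -1 when no such form exists.
  #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
  #include "SIInstrInfo.h"

  static bool hasGlobalSaddrForm(unsigned Opcode) {
    return AMDGPU::getGlobalSaddrOp(Opcode) >= 0;
  }

Any opcode not listed in a GlobalSaddrTable row (for example the FLAT_* forms) simply yields -1 and is left untouched by the pass.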
Index: test/CodeGen/AMDGPU/global-saddr-atomics.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/global-saddr-atomics.ll @@ -0,0 +1,359 @@ +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s + +; GFX9-LABEL: {{^}}atomic_xor_i32_offset: +; GFX9: global_atomic_xor v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:16 glc{{$}} +; GFX9: global_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}} + +define amdgpu_kernel void @atomic_xor_i32_offset(i32 addrspace(1)* %ptr, i32 %in) { + %id = call i32 @llvm.amdgcn.workitem.id.x() + %idx = zext i32 %id to i64 + %gep0 = getelementptr i32, i32 addrspace(1)* %ptr, i64 %idx + %gep1 = getelementptr inbounds i32, i32 addrspace(1)* %gep0, i64 4 + %val = atomicrmw volatile xor i32 addrspace(1)* %gep1, i32 %in seq_cst + + store volatile i32 %val, i32 addrspace(1)* undef + %val1 = atomicrmw volatile xor i32 addrspace(1)* %gep1, i32 %in seq_cst + ret void +} + +; GFX9-LABEL: {{^}}atomic_xor_i64_offset: +; GFX9: global_atomic_xor_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:32 glc{{$}} +; GFX9: global_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} + +define amdgpu_kernel void @atomic_xor_i64_offset(i64 addrspace(1)* %ptr, i64 %in) { + %id = call i32 @llvm.amdgcn.workitem.id.x() + %idx = zext i32 %id to i64 + %gep0 = getelementptr i64, i64 addrspace(1)* %ptr, i64 %idx + %gep1 = getelementptr inbounds i64, i64 addrspace(1)* %gep0, i64 4 + %val = atomicrmw volatile xor i64 addrspace(1)* %gep1, i64 %in seq_cst + + store volatile i64 %val, i64 addrspace(1)* undef + %val1 = atomicrmw volatile xor i64 addrspace(1)* %gep1, i64 %in seq_cst + ret void +} + +; GFX9-LABEL: {{^}}atomic_smin_i32_offset: +; GFX9: global_atomic_smin v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:16 glc{{$}} +; GFX9: global_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}} + +define amdgpu_kernel void @atomic_smin_i32_offset(i32 addrspace(1)* %ptr, i32 %in) { + %id = call i32 @llvm.amdgcn.workitem.id.x() + %idx = zext i32 %id to i64 + %gep0 = getelementptr i32, i32 addrspace(1)* %ptr, i64 %idx + %gep1 = getelementptr inbounds i32, i32 addrspace(1)* %gep0, i64 4 + %val = atomicrmw volatile min i32 addrspace(1)* %gep1, i32 %in seq_cst + + store volatile i32 %val, i32 addrspace(1)* undef + %val1 = atomicrmw volatile min i32 addrspace(1)* %gep1, i32 %in seq_cst + ret void +} + +; GFX9-LABEL: {{^}}atomic_smin_i64_offset: +; GFX9: global_atomic_smin_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:32 glc{{$}} +; GFX9: global_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} + +define amdgpu_kernel void @atomic_smin_i64_offset(i64 addrspace(1)* %ptr, i64 %in) { + %id = call i32 @llvm.amdgcn.workitem.id.x() + %idx = zext i32 %id to i64 + %gep0 = getelementptr i64, i64 addrspace(1)* %ptr, i64 %idx + %gep1 = getelementptr inbounds i64, i64 addrspace(1)* %gep0, i64 4 + %val = atomicrmw volatile min i64 addrspace(1)* %gep1, i64 %in seq_cst + + store volatile i64 %val, i64 addrspace(1)* undef + %val1 = atomicrmw volatile min i64 addrspace(1)* %gep1, i64 %in seq_cst + ret void +} + +; GFX9-LABEL: {{^}}atomic_and_i32_offset: +; GFX9: global_atomic_and 
v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:16 glc{{$}} +; GFX9: global_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}} + +define amdgpu_kernel void @atomic_and_i32_offset(i32 addrspace(1)* %ptr, i32 %in) { + %id = call i32 @llvm.amdgcn.workitem.id.x() + %idx = zext i32 %id to i64 + %gep0 = getelementptr i32, i32 addrspace(1)* %ptr, i64 %idx + %gep1 = getelementptr inbounds i32, i32 addrspace(1)* %gep0, i64 4 + %val = atomicrmw volatile and i32 addrspace(1)* %gep1, i32 %in seq_cst + + store volatile i32 %val, i32 addrspace(1)* undef + %val1 = atomicrmw volatile and i32 addrspace(1)* %gep1, i32 %in seq_cst + ret void +} + +; GFX9-LABEL: {{^}}atomic_and_i64_offset: +; GFX9: global_atomic_and_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:32 glc{{$}} +; GFX9: global_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} + +define amdgpu_kernel void @atomic_and_i64_offset(i64 addrspace(1)* %ptr, i64 %in) { + %id = call i32 @llvm.amdgcn.workitem.id.x() + %idx = zext i32 %id to i64 + %gep0 = getelementptr i64, i64 addrspace(1)* %ptr, i64 %idx + %gep1 = getelementptr inbounds i64, i64 addrspace(1)* %gep0, i64 4 + %val = atomicrmw volatile and i64 addrspace(1)* %gep1, i64 %in seq_cst + + store volatile i64 %val, i64 addrspace(1)* undef + %val1 = atomicrmw volatile and i64 addrspace(1)* %gep1, i64 %in seq_cst + ret void +} + +; GFX9-LABEL: {{^}}atomic_swap_i32_offset: +; GFX9: global_atomic_swap v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:16 glc{{$}} +; GFX9: global_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}} + +define amdgpu_kernel void @atomic_swap_i32_offset(i32 addrspace(1)* %ptr, i32 %in) { + %id = call i32 @llvm.amdgcn.workitem.id.x() + %idx = zext i32 %id to i64 + %gep0 = getelementptr i32, i32 addrspace(1)* %ptr, i64 %idx + %gep1 = getelementptr inbounds i32, i32 addrspace(1)* %gep0, i64 4 + %val = atomicrmw volatile xchg i32 addrspace(1)* %gep1, i32 %in seq_cst + + store volatile i32 %val, i32 addrspace(1)* undef + %val1 = atomicrmw volatile xchg i32 addrspace(1)* %gep1, i32 %in seq_cst + ret void +} + +; GFX9-LABEL: {{^}}atomic_swap_i64_offset: +; GFX9: global_atomic_swap_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:32 glc{{$}} +; GFX9: global_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} + +define amdgpu_kernel void @atomic_swap_i64_offset(i64 addrspace(1)* %ptr, i64 %in) { + %id = call i32 @llvm.amdgcn.workitem.id.x() + %idx = zext i32 %id to i64 + %gep0 = getelementptr i64, i64 addrspace(1)* %ptr, i64 %idx + %gep1 = getelementptr inbounds i64, i64 addrspace(1)* %gep0, i64 4 + %val = atomicrmw volatile xchg i64 addrspace(1)* %gep1, i64 %in seq_cst + + store volatile i64 %val, i64 addrspace(1)* undef + %val1 = atomicrmw volatile xchg i64 addrspace(1)* %gep1, i64 %in seq_cst + ret void +} + +; GFX9-LABEL: {{^}}atomic_smax_i32_offset: +; GFX9: global_atomic_smax v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:16 glc{{$}} +; GFX9: global_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}} + +define amdgpu_kernel void @atomic_smax_i32_offset(i32 addrspace(1)* %ptr, i32 %in) { + %id = call i32 @llvm.amdgcn.workitem.id.x() + %idx = zext i32 %id to i64 + %gep0 = 
getelementptr i32, i32 addrspace(1)* %ptr, i64 %idx + %gep1 = getelementptr inbounds i32, i32 addrspace(1)* %gep0, i64 4 + %val = atomicrmw volatile max i32 addrspace(1)* %gep1, i32 %in seq_cst + + store volatile i32 %val, i32 addrspace(1)* undef + %val1 = atomicrmw volatile max i32 addrspace(1)* %gep1, i32 %in seq_cst + ret void +} + +; GFX9-LABEL: {{^}}atomic_smax_i64_offset: +; GFX9: global_atomic_smax_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:32 glc{{$}} +; GFX9: global_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} + +define amdgpu_kernel void @atomic_smax_i64_offset(i64 addrspace(1)* %ptr, i64 %in) { + %id = call i32 @llvm.amdgcn.workitem.id.x() + %idx = zext i32 %id to i64 + %gep0 = getelementptr i64, i64 addrspace(1)* %ptr, i64 %idx + %gep1 = getelementptr inbounds i64, i64 addrspace(1)* %gep0, i64 4 + %val = atomicrmw volatile max i64 addrspace(1)* %gep1, i64 %in seq_cst + + store volatile i64 %val, i64 addrspace(1)* undef + %val1 = atomicrmw volatile max i64 addrspace(1)* %gep1, i64 %in seq_cst + ret void +} + +; GFX9-LABEL: {{^}}atomic_umin_i32_offset: +; GFX9: global_atomic_umin v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:16 glc{{$}} +; GFX9: global_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}} + +define amdgpu_kernel void @atomic_umin_i32_offset(i32 addrspace(1)* %ptr, i32 %in) { + %id = call i32 @llvm.amdgcn.workitem.id.x() + %idx = zext i32 %id to i64 + %gep0 = getelementptr i32, i32 addrspace(1)* %ptr, i64 %idx + %gep1 = getelementptr inbounds i32, i32 addrspace(1)* %gep0, i64 4 + %val = atomicrmw volatile umin i32 addrspace(1)* %gep1, i32 %in seq_cst + + store volatile i32 %val, i32 addrspace(1)* undef + %val1 = atomicrmw volatile umin i32 addrspace(1)* %gep1, i32 %in seq_cst + ret void +} + +; GFX9-LABEL: {{^}}atomic_umin_i64_offset: +; GFX9: global_atomic_umin_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:32 glc{{$}} +; GFX9: global_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} + +define amdgpu_kernel void @atomic_umin_i64_offset(i64 addrspace(1)* %ptr, i64 %in) { + %id = call i32 @llvm.amdgcn.workitem.id.x() + %idx = zext i32 %id to i64 + %gep0 = getelementptr i64, i64 addrspace(1)* %ptr, i64 %idx + %gep1 = getelementptr inbounds i64, i64 addrspace(1)* %gep0, i64 4 + %val = atomicrmw volatile umin i64 addrspace(1)* %gep1, i64 %in seq_cst + + store volatile i64 %val, i64 addrspace(1)* undef + %val1 = atomicrmw volatile umin i64 addrspace(1)* %gep1, i64 %in seq_cst + ret void +} + +; GFX9-LABEL: {{^}}atomic_umax_i32_offset: +; GFX9: global_atomic_umax v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:16 glc{{$}} +; GFX9: global_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}} + +define amdgpu_kernel void @atomic_umax_i32_offset(i32 addrspace(1)* %ptr, i32 %in) { + %id = call i32 @llvm.amdgcn.workitem.id.x() + %idx = zext i32 %id to i64 + %gep0 = getelementptr i32, i32 addrspace(1)* %ptr, i64 %idx + %gep1 = getelementptr inbounds i32, i32 addrspace(1)* %gep0, i64 4 + %val = atomicrmw volatile umax i32 addrspace(1)* %gep1, i32 %in seq_cst + + store volatile i32 %val, i32 addrspace(1)* undef + %val1 = atomicrmw volatile umax i32 addrspace(1)* %gep1, i32 %in seq_cst + ret void +} + +; GFX9-LABEL: 
{{^}}atomic_umax_i64_offset: +; GFX9: global_atomic_umax_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:32 glc{{$}} +; GFX9: global_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} + +define amdgpu_kernel void @atomic_umax_i64_offset(i64 addrspace(1)* %ptr, i64 %in) { + %id = call i32 @llvm.amdgcn.workitem.id.x() + %idx = zext i32 %id to i64 + %gep0 = getelementptr i64, i64 addrspace(1)* %ptr, i64 %idx + %gep1 = getelementptr inbounds i64, i64 addrspace(1)* %gep0, i64 4 + %val = atomicrmw volatile umax i64 addrspace(1)* %gep1, i64 %in seq_cst + + store volatile i64 %val, i64 addrspace(1)* undef + %val1 = atomicrmw volatile umax i64 addrspace(1)* %gep1, i64 %in seq_cst + ret void +} + +; GFX9-LABEL: {{^}}atomic_or_i32_offset: +; GFX9: global_atomic_or v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:16 glc{{$}} +; GFX9: global_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}} + +define amdgpu_kernel void @atomic_or_i32_offset(i32 addrspace(1)* %ptr, i32 %in) { + %id = call i32 @llvm.amdgcn.workitem.id.x() + %idx = zext i32 %id to i64 + %gep0 = getelementptr i32, i32 addrspace(1)* %ptr, i64 %idx + %gep1 = getelementptr inbounds i32, i32 addrspace(1)* %gep0, i64 4 + %val = atomicrmw volatile or i32 addrspace(1)* %gep1, i32 %in seq_cst + + store volatile i32 %val, i32 addrspace(1)* undef + %val1 = atomicrmw volatile or i32 addrspace(1)* %gep1, i32 %in seq_cst + ret void +} + +; GFX9-LABEL: {{^}}atomic_or_i64_offset: +; GFX9: global_atomic_or_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:32 glc{{$}} +; GFX9: global_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} + +define amdgpu_kernel void @atomic_or_i64_offset(i64 addrspace(1)* %ptr, i64 %in) { + %id = call i32 @llvm.amdgcn.workitem.id.x() + %idx = zext i32 %id to i64 + %gep0 = getelementptr i64, i64 addrspace(1)* %ptr, i64 %idx + %gep1 = getelementptr inbounds i64, i64 addrspace(1)* %gep0, i64 4 + %val = atomicrmw volatile or i64 addrspace(1)* %gep1, i64 %in seq_cst + + store volatile i64 %val, i64 addrspace(1)* undef + %val1 = atomicrmw volatile or i64 addrspace(1)* %gep1, i64 %in seq_cst + ret void +} + +; GFX9-LABEL: {{^}}atomic_add_i32_offset: +; GFX9: global_atomic_add v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:16 glc{{$}} +; GFX9: global_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}} + +define amdgpu_kernel void @atomic_add_i32_offset(i32 addrspace(1)* %ptr, i32 %in) { + %id = call i32 @llvm.amdgcn.workitem.id.x() + %idx = zext i32 %id to i64 + %gep0 = getelementptr i32, i32 addrspace(1)* %ptr, i64 %idx + %gep1 = getelementptr inbounds i32, i32 addrspace(1)* %gep0, i64 4 + %val = atomicrmw volatile add i32 addrspace(1)* %gep1, i32 %in seq_cst + + store volatile i32 %val, i32 addrspace(1)* undef + %val1 = atomicrmw volatile add i32 addrspace(1)* %gep1, i32 %in seq_cst + ret void +} + +; GFX9-LABEL: {{^}}atomic_add_i64_offset: +; GFX9: global_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:32 glc{{$}} +; GFX9: global_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} + +define amdgpu_kernel void @atomic_add_i64_offset(i64 addrspace(1)* %ptr, i64 %in) { + %id = call 
i32 @llvm.amdgcn.workitem.id.x() + %idx = zext i32 %id to i64 + %gep0 = getelementptr i64, i64 addrspace(1)* %ptr, i64 %idx + %gep1 = getelementptr inbounds i64, i64 addrspace(1)* %gep0, i64 4 + %val = atomicrmw volatile add i64 addrspace(1)* %gep1, i64 %in seq_cst + + store volatile i64 %val, i64 addrspace(1)* undef + %val1 = atomicrmw volatile add i64 addrspace(1)* %gep1, i64 %in seq_cst + ret void +} + +; GFX9-LABEL: {{^}}atomic_sub_i32_offset: +; GFX9: global_atomic_sub v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:16 glc{{$}} +; GFX9: global_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}} + +define amdgpu_kernel void @atomic_sub_i32_offset(i32 addrspace(1)* %ptr, i32 %in) { + %id = call i32 @llvm.amdgcn.workitem.id.x() + %idx = zext i32 %id to i64 + %gep0 = getelementptr i32, i32 addrspace(1)* %ptr, i64 %idx + %gep1 = getelementptr inbounds i32, i32 addrspace(1)* %gep0, i64 4 + %val = atomicrmw volatile sub i32 addrspace(1)* %gep1, i32 %in seq_cst + + store volatile i32 %val, i32 addrspace(1)* undef + %val1 = atomicrmw volatile sub i32 addrspace(1)* %gep1, i32 %in seq_cst + ret void +} + +; GFX9-LABEL: {{^}}atomic_sub_i64_offset: +; GFX9: global_atomic_sub_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:32 glc{{$}} +; GFX9: global_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} + +define amdgpu_kernel void @atomic_sub_i64_offset(i64 addrspace(1)* %ptr, i64 %in) { + %id = call i32 @llvm.amdgcn.workitem.id.x() + %idx = zext i32 %id to i64 + %gep0 = getelementptr i64, i64 addrspace(1)* %ptr, i64 %idx + %gep1 = getelementptr inbounds i64, i64 addrspace(1)* %gep0, i64 4 + %val = atomicrmw volatile sub i64 addrspace(1)* %gep1, i64 %in seq_cst + + store volatile i64 %val, i64 addrspace(1)* undef + %val1 = atomicrmw volatile sub i64 addrspace(1)* %gep1, i64 %in seq_cst + ret void +} + +; GFX9-LABEL: {{^}}atomic_cmpxchg_i32_addr64_offset: +; GFX9: global_atomic_cmpswap v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:16 glc{{$}} +; GFX9: global_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:16{{$}} +define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(i32 addrspace(1)* %out +, i32 addrspace(1)* %out2, i32%in, i32 %index, i32 %old) { +entry: + %id = call i32 @llvm.amdgcn.workitem.id.x() + %idx = zext i32 %id to i64 + %gep0 = getelementptr i32, i32 addrspace(1)* %out, i64 %idx + %ptr = getelementptr inbounds i32, i32 addrspace(1)* %gep0, i64 4 + %val = cmpxchg volatile i32 addrspace(1)* %ptr, i32 %old, i32 %in seq_cst seq_cst + %extract0 = extractvalue { i32, i1 } %val, 0 + store i32 %extract0, i32 addrspace(1)* %out2 + %val2 = cmpxchg volatile i32 addrspace(1)* %ptr, i32 %old, i32 %in seq_cst seq_cst + ret void +} + +; GFX9-LABEL: {{^}}atomic_cmpxchg_i64_addr64_offset: +; GFX9: global_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32 glc{{$}} +; GFX9: global_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} +define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(i64 addrspace(1)* %out +, i64 addrspace(1)* %out2, i64 %in, i64 %index, i64 %old) { +entry: + %id = call i32 @llvm.amdgcn.workitem.id.x() + %idx = zext i32 %id to i64 + %gep0 = getelementptr i64, i64 addrspace(1)* %out, i64 %idx + %ptr = getelementptr 
inbounds i64, i64 addrspace(1)* %gep0, i64 4 + %val = cmpxchg volatile i64 addrspace(1)* %ptr, i64 %old, i64 %in seq_cst seq_cst + %extract0 = extractvalue { i64, i1 } %val, 0 + store i64 %extract0, i64 addrspace(1)* %out2 + %val2 = cmpxchg volatile i64 addrspace(1)* %ptr, i64 %old, i64 %in seq_cst seq_cst + ret void +} + + +declare i32 @llvm.amdgcn.workitem.id.x() + Index: test/CodeGen/AMDGPU/global-saddr-misc.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/global-saddr-misc.ll @@ -0,0 +1,14 @@ +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=SADDRGFX9 %s + +; SADDRGFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off{{$}} +; SADDRGFX9-NEXT: s_waitcnt +; SADDRGFX9-NOT: global_load_dword + +define amdgpu_cs void @_amdgpu_cs_main(<3 x i32> inreg %arg) { +bb: + %tmp = extractelement <3 x i32> %arg, i32 1 + %tmp1 = inttoptr i32 %tmp to <4 x i32> addrspace(1)* + %tmp2 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp1, align 16 + store volatile <4 x i32> %tmp2, <4 x i32> addrspace(1)* undef + ret void +} Index: test/CodeGen/AMDGPU/global-saddr-offsets.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/global-saddr-offsets.ll @@ -0,0 +1,60 @@ +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s + +; GFX9: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:4092{{$}} +; GFX9: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off{{$}} +; GFX9: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:2044{{$}} +define amdgpu_kernel void @test_offsets(i32 addrspace(1)* %ptr) { + %id = call i32 @llvm.amdgcn.workitem.id.x() + %idx = zext i32 %id to i64 + %gep0 = getelementptr i32, i32 addrspace(1)* %ptr, i64 %idx + %gep1 = getelementptr inbounds i32, i32 addrspace(1)* %gep0, i64 1023 + %load = load i32, i32 addrspace(1)* %gep1 + %gep2 = getelementptr inbounds i32, i32 addrspace(1)* %gep0, i64 2047 + %load1 = load i32, i32 addrspace(1)* %gep2 + %gep3 = getelementptr inbounds i32, i32 addrspace(1)* %gep0, i64 511 + %load2 = load i32, i32 addrspace(1)* %gep3 + %add1 = add i32 %load, %load1 + %add = add i32 %add1, %load2 + store volatile i32 %add, i32 addrspace(1)* undef + ret void +} + +; GFX9: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:-4096{{$}} +; GFX9: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off{{$}} +; GFX9: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:-2048{{$}} +define amdgpu_kernel void @test_offsets_neg(i32 addrspace(1)* %ptr) { + %id = call i32 @llvm.amdgcn.workitem.id.x() + %idx = zext i32 %id to i64 + %gep0 = getelementptr i32, i32 addrspace(1)* %ptr, i64 %idx + %gep1 = getelementptr inbounds i32, i32 addrspace(1)* %gep0, i64 -1024 + %load = load i32, i32 addrspace(1)* %gep1 + %gep2 = getelementptr inbounds i32, i32 addrspace(1)* %gep0, i64 -2048 + %load1 = load i32, i32 addrspace(1)* %gep2 + %gep3 = getelementptr inbounds i32, i32 addrspace(1)* %gep0, i64 -512 + %load2 = load i32, i32 addrspace(1)* %gep3 + %add1 = add i32 %load, %load1 + %add = add i32 %add1, %load2 + store volatile i32 %add, i32 addrspace(1)* undef + ret void +} + + +; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:2048{{$}} +; GFX9: global_load_dword 
v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:2056{{$}} +define amdgpu_kernel void @test_offsets_adjoin(i32 addrspace(1)* %ptr) { + %id = call i32 @llvm.amdgcn.workitem.id.x() + %idx = zext i32 %id to i64 + %gep0 = getelementptr i32, i32 addrspace(1)* %ptr, i64 %idx + %gep1 = getelementptr inbounds i32, i32 addrspace(1)* %gep0, i64 514 + %load = load i32, i32 addrspace(1)* %gep1 + %gep2 = getelementptr inbounds i32, i32 addrspace(1)* %gep0, i64 512 + %load1 = load i32, i32 addrspace(1)* %gep2 + %gep3 = getelementptr inbounds i32, i32 addrspace(1)* %gep0, i64 513 + %load2 = load i32, i32 addrspace(1)* %gep3 + %add1 = add i32 %load, %load1 + %add = add i32 %add1, %load2 + store volatile i32 %add, i32 addrspace(1)* undef + ret void +} +declare i32 @llvm.amdgcn.workitem.id.x() + Index: test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll =================================================================== --- test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -444,40 +444,12 @@ ret void } -; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr: -; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}} -; GCN-DAG: s_movk_i32 [[K:s[0-9]+]], 0x3e7 - -; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]] -; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]] - -; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]] -; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]] - -; CI-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]] -; CI-DAG: v_lshl_b32_e32 [[MASK:v[0-9]+]], 0xffff, [[SCALED_IDX]] - -; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VEC]] -; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define amdgpu_kernel void @v_insertelement_v2i16_dynamic_vgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 { - %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 - %tid.ext = sext i32 %tid to i64 - %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext - %idx.gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext - %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext - %idx = load i32, i32 addrspace(1)* %idx.gep - %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep - %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx - store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep - ret void -} - ; GCN-LABEL: {{^}}v_insertelement_v2f16_dynamic_vgpr: ; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}} ; GCN-DAG: s_movk_i32 [[K:s[0-9]+]], 0x1234 -; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]] -; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]] +; GCN-DAG: {{flat|global}}_load_dword [[IDX:v[0-9]+]] +; GCN-DAG: {{flat|global}}_load_dword [[VEC:v[0-9]+]] ; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]] ; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]] Index: test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-nosaddr.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-nosaddr.ll @@ -0,0 +1,36 @@ +; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,VI,GFX89 %s +; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck 
-enable-var-scope -check-prefixes=GCN,CIVI,CI %s + +; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr: +; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}} +; GCN-DAG: s_movk_i32 [[K:s[0-9]+]], 0x3e7 + +; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]] +; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]] + +; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]] +; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]] + +; CI-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]] +; CI-DAG: v_lshl_b32_e32 [[MASK:v[0-9]+]], 0xffff, [[SCALED_IDX]] + +; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VEC]] +; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define amdgpu_kernel void @v_insertelement_v2i16_dynamic_vgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext + %idx.gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext + %idx = load i32, i32 addrspace(1)* %idx.gep + %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep + %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx + store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep + ret void +} + + +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } Index: test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll @@ -0,0 +1,36 @@ +; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global,-fp64-fp16-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s + +; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr: + +; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]] +; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]] + +; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}} +; GCN-DAG: s_movk_i32 [[K:s[0-9]+]], 0x3e7 + +; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]] +; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]] + +; CI-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]] +; CI-DAG: v_lshl_b32_e32 [[MASK:v[0-9]+]], 0xffff, [[SCALED_IDX]] + +; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VEC]] +; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define amdgpu_kernel void @v_insertelement_v2i16_dynamic_vgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext + %idx.gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext + %idx = load i32, i32 addrspace(1)* %idx.gep + %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep + %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx + store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep + ret void +} + + +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } 
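The offset-folding behaviour exercised earlier in global-saddr-offsets.ll follows from the signed 13-bit immediate encoded by the offset_s13 operand on the GFX9 _SADDR forms. A minimal sketch of that legality check, stated here as an assumption drawn from those tests rather than code from this patch (the helper name is hypothetical):

  // GFX9 global _SADDR instructions take a signed 13-bit immediate offset
  // (offset_s13). Offsets outside [-4096, 4095] cannot be folded, so the
  // load/store keeps a recomputed 64-bit vaddr instead.
  static bool isLegalGlobalSaddrImmOffset(int64_t Offset) {
    return Offset >= -4096 && Offset <= 4095;
  }

This is why the 4092 and -4096 byte offsets in those tests fold into the instruction while the 8188 and -8192 byte offsets fall back to the off form.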
Index: test/CodeGen/AMDGPU/madak.ll =================================================================== --- test/CodeGen/AMDGPU/madak.ll +++ test/CodeGen/AMDGPU/madak.ll @@ -8,8 +8,10 @@ ; GCN-LABEL: {{^}}madak_f32: ; GFX6: buffer_load_dword [[VA:v[0-9]+]] ; GFX6: buffer_load_dword [[VB:v[0-9]+]] -; GFX8_9: {{flat|global}}_load_dword [[VB:v[0-9]+]] -; GFX8_9: {{flat|global}}_load_dword [[VA:v[0-9]+]] +; GFX8: {{flat|global}}_load_dword [[VB:v[0-9]+]] +; GFX8: {{flat|global}}_load_dword [[VA:v[0-9]+]] +; GFX9: {{flat|global}}_load_dword [[VA:v[0-9]+]] +; GFX9: {{flat|global}}_load_dword [[VB:v[0-9]+]] ; GCN: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 define amdgpu_kernel void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -88,8 +90,10 @@ ; GCN-LABEL: {{^}}madak_inline_imm_f32: ; GFX6: buffer_load_dword [[VA:v[0-9]+]] ; GFX6: buffer_load_dword [[VB:v[0-9]+]] -; GFX8_9: {{flat|global}}_load_dword [[VB:v[0-9]+]] -; GFX8_9: {{flat|global}}_load_dword [[VA:v[0-9]+]] +; GFX8: {{flat|global}}_load_dword [[VB:v[0-9]+]] +; GFX8: {{flat|global}}_load_dword [[VA:v[0-9]+]] +; GFX9: {{flat|global}}_load_dword [[VA:v[0-9]+]] +; GFX9: {{flat|global}}_load_dword [[VB:v[0-9]+]] ; GCN: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0 define amdgpu_kernel void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone Index: test/CodeGen/AMDGPU/memory-legalizer-load.ll =================================================================== --- test/CodeGen/AMDGPU/memory-legalizer-load.ll +++ test/CodeGen/AMDGPU/memory-legalizer-load.ll @@ -319,7 +319,7 @@ ; GCN-LABEL: {{^}}nontemporal_global_1: ; GFX8: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}} -; GFX9: global_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], off glc slc{{$}} +; GFX9: global_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}} define amdgpu_kernel void @nontemporal_global_1( i32 addrspace(1)* %in, i32* %out) { entry: Index: test/CodeGen/AMDGPU/memory-legalizer-store.ll =================================================================== --- test/CodeGen/AMDGPU/memory-legalizer-store.ll +++ test/CodeGen/AMDGPU/memory-legalizer-store.ll @@ -240,7 +240,7 @@ ; GCN-LABEL: {{^}}nontemporal_global_1: ; GFX8: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc slc{{$}} -; GFX9: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off glc slc{{$}} +; GFX9: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}} define amdgpu_kernel void @nontemporal_global_1( i32* %in, i32 addrspace(1)* %out) { entry: Index: test/CodeGen/AMDGPU/memory_clause.ll =================================================================== --- test/CodeGen/AMDGPU/memory_clause.ll +++ test/CodeGen/AMDGPU/memory_clause.ll @@ -105,7 +105,7 @@ } ; GCN-LABEL: {{^}}vector_clause_indirect: -; GCN: global_load_dwordx2 [[ADDR:v\[[0-9:]+\]]], v[{{[0-9:]+}}], off +; GCN: global_load_dwordx2 [[ADDR:v\[[0-9:]+\]]], v[{{[0-9:]+}}], s[{{[0-9:]+}}] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_nop 0 Index: test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll =================================================================== --- test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll 
+++ test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll @@ -265,16 +265,15 @@ ; CI: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:36{{$}} ; CI-NEXT: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:52{{$}} +; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}{{$}} +; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:20 +; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:12 +; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:28 +; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:44 -; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:12 -; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:28 -; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:44 +; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:36 +; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:52 -; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, off{{$}} -; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, off offset:20 - -; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, off offset:36 -; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, off offset:52 define amdgpu_kernel void @reorder_global_offsets_addr64_soffset0(i32 addrspace(1)* noalias nocapture %ptr.base) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %id.ext = sext i32 %id to i64