Index: lib/Target/AMDGPU/AMDGPU.h =================================================================== --- lib/Target/AMDGPU/AMDGPU.h +++ lib/Target/AMDGPU/AMDGPU.h @@ -41,6 +41,7 @@ FunctionPass *createSIFoldOperandsPass(); FunctionPass *createSIPeepholeSDWAPass(); FunctionPass *createSILowerI1CopiesPass(); +FunctionPass *createSIFixupVectorISelPass(); FunctionPass *createSIShrinkInstructionsPass(); FunctionPass *createSILoadStoreOptimizerPass(); FunctionPass *createSIWholeQuadModePass(); @@ -114,6 +115,9 @@ void initializeSIFixVGPRCopiesPass(PassRegistry &); extern char &SIFixVGPRCopiesID; +void initializeSIFixupVectorISelPass(PassRegistry &); +extern char &SIFixupVectorISelID; + void initializeSILowerI1CopiesPass(PassRegistry &); extern char &SILowerI1CopiesID; Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -154,6 +154,7 @@ initializeSILowerI1CopiesPass(*PR); initializeSIFixSGPRCopiesPass(*PR); initializeSIFixVGPRCopiesPass(*PR); + initializeSIFixupVectorISelPass(*PR); initializeSIFoldOperandsPass(*PR); initializeSIPeepholeSDWAPass(*PR); initializeSIShrinkInstructionsPass(*PR); @@ -801,6 +802,7 @@ AMDGPUPassConfig::addInstSelector(); addPass(createSILowerI1CopiesPass()); addPass(&SIFixSGPRCopiesID); + addPass(createSIFixupVectorISelPass()); return false; } Index: lib/Target/AMDGPU/CMakeLists.txt =================================================================== --- lib/Target/AMDGPU/CMakeLists.txt +++ lib/Target/AMDGPU/CMakeLists.txt @@ -94,6 +94,7 @@ SIAnnotateControlFlow.cpp SIDebuggerInsertNops.cpp SIFixSGPRCopies.cpp + SIFixupVectorISel.cpp SIFixVGPRCopies.cpp SIFixWWMLiveness.cpp SIFoldOperands.cpp Index: lib/Target/AMDGPU/FLATInstructions.td =================================================================== --- lib/Target/AMDGPU/FLATInstructions.td +++ lib/Target/AMDGPU/FLATInstructions.td @@ 
-121,6 +121,11 @@
   let Inst{63-56} = !if(ps.has_vdst, vdst, ?);
 }
 
+class GlobalSaddrTable <bit is_saddr, string Name = ""> {
+  bit IsSaddr = is_saddr;
+  string SaddrOp = Name;
+}
+
 // TODO: Is exec allowed for saddr? The disabled value 0x7f is the
 // same encoding value as exec_hi, so it isn't possible to use that if
 // saddr is 32-bit (which isn't handled here yet).
@@ -171,15 +176,15 @@
 multiclass FLAT_Global_Load_Pseudo<string opName, RegisterClass regClass> {
   let is_flat_global = 1 in {
-    def "" : FLAT_Load_Pseudo<opName, regClass, 1>;
-    def _SADDR : FLAT_Load_Pseudo<opName, regClass, 1, 1>;
+    def "" : FLAT_Load_Pseudo<opName, regClass, 1>, GlobalSaddrTable<0, opName>;
+    def _SADDR : FLAT_Load_Pseudo<opName, regClass, 1, 1>, GlobalSaddrTable<1, opName>;
   }
 }
 
 multiclass FLAT_Global_Store_Pseudo<string opName, RegisterClass regClass> {
   let is_flat_global = 1 in {
-    def "" : FLAT_Store_Pseudo<opName, regClass, 1>;
-    def _SADDR : FLAT_Store_Pseudo<opName, regClass, 1, 1>;
+    def "" : FLAT_Store_Pseudo<opName, regClass, 1>, GlobalSaddrTable<0, opName>;
+    def _SADDR : FLAT_Store_Pseudo<opName, regClass, 1, 1>, GlobalSaddrTable<1, opName>;
   }
 }
 
@@ -287,6 +292,7 @@
     (outs),
     (ins VReg_64:$vaddr, data_rc:$vdata, offset_s13:$offset, SLC:$slc),
     " $vaddr, $vdata, off$offset$slc">,
+    GlobalSaddrTable<0, opName>,
     AtomicNoRet {
   let has_saddr = 1;
   let PseudoInstr = NAME;
@@ -298,6 +304,7 @@
     " $vdst, $vaddr, $vdata, off$offset glc$slc",
     [(set vt:$vdst,
       (atomic (FLATSignedAtomic i64:$vaddr, i16:$offset, i1:$slc), data_vt:$vdata))]>,
+    GlobalSaddrTable<0, opName#"_rtn">,
     AtomicNoRet {
   let has_saddr = 1;
 }
@@ -306,6 +313,7 @@
     (outs),
     (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, offset_s13:$offset, SLC:$slc),
     " $vaddr, $vdata, $saddr$offset$slc">,
+    GlobalSaddrTable<1, opName>,
     AtomicNoRet {
   let has_saddr = 1;
   let enabled_saddr = 1;
@@ -316,6 +324,7 @@
     (outs vdst_rc:$vdst),
     (ins VReg_64:$vaddr, data_rc:$vdata, SReg_64:$saddr, offset_s13:$offset, SLC:$slc),
     " $vdst, $vaddr, $vdata, $saddr$offset glc$slc">,
+    GlobalSaddrTable<1, opName#"_rtn">,
     AtomicNoRet {
   let has_saddr = 1;
   let enabled_saddr = 1;
Index: lib/Target/AMDGPU/SIFixupVectorISel.cpp
===================================================================
--- /dev/null
+++
lib/Target/AMDGPU/SIFixupVectorISel.cpp @@ -0,0 +1,223 @@ +//===-- SIFixupVectorISel.cpp - Fixup post ISel vector issues -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +/// SIFixupVectorISel pass cleans up post ISEL Vector issues. +/// Currently this will convert GLOBAL_{LOAD|STORE}_* +/// and GLOBAL_Atomic_* instructions into their _SADDR variants, +/// feeding the sreg into the saddr field of the new instruction. +/// We currently handle a REG_SEQUENCE feeding the vaddr +/// and decompose it into a base and index. +/// +/// Transform: +/// %17:vgpr_32, %19:sreg_64_xexec = V_ADD_I32_e64 %21:sgpr_32, %22:vgpr_32 +/// %18:vgpr_32, %20:sreg_64_xexec = V_ADDC_U32_e64 %25:vgpr_32, +/// %24:vgpr_32, %19:sreg_64_xexec +/// %16:vreg_64 = REG_SEQUENCE %17:vgpr_32, %sub0, %18:vgpr_32, %sub1 +/// %11:vreg_64 = COPY %16:vreg_64 +/// %10:vgpr_32 = GLOBAL_LOAD_DWORD killed %11:vreg_64, 16, 0, 0 +/// Into: +/// %4:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1:sgpr_64, 36, 0 +/// %14:vreg_64 = REG_SEQUENCE %6:vgpr_32, %sub0, %15:vgpr_32, %sub1 +/// %10:vgpr_32 = GLOBAL_LOAD_DWORD_SADDR %14:vreg_64, %4:sreg_64_xexec,16... +/// +//===----------------------------------------------------------------------===// +// + +#include "AMDGPU.h" +#include "AMDGPUSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/TargetMachine.h" +#define DEBUG_TYPE "si-fixup-vector-isel" + +using namespace llvm; + + +// Opt used by a few lit tests to preserve older CHECK patterns. 
+static cl::opt<bool> EnableSIGlobalSAddr(
+  "amdgpu-enable-si-global-saddr",
+  cl::desc("Enable use of SGPR regs for global load/store instructions"),
+  cl::init(true));
+
+STATISTIC(NumSGPRGlobalOccurs, "Number of global ld/st opportunities");
+STATISTIC(NumSGPRGlobalSaddrs, "Number of global sgpr instructions converted");
+
+namespace {
+
+class SIFixupVectorISel : public MachineFunctionPass {
+public:
+  static char ID;
+
+public:
+  SIFixupVectorISel() : MachineFunctionPass(ID) {
+    initializeSIFixupVectorISelPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS(SIFixupVectorISel, DEBUG_TYPE,
+                "SI Fixup Vector ISel", false, false)
+
+char SIFixupVectorISel::ID = 0;
+
+char &llvm::SIFixupVectorISelID = SIFixupVectorISel::ID;
+
+FunctionPass *llvm::createSIFixupVectorISelPass() {
+  return new SIFixupVectorISel();
+}
+
+static bool findSRegBaseAndIndex(MachineOperand *Op,
+                                 unsigned &BaseReg,
+                                 unsigned &IndexReg,
+                                 MachineRegisterInfo &MRI,
+                                 const SIRegisterInfo *TRI) {
+  SmallVector<MachineOperand *, 8> Worklist;
+  Worklist.push_back(Op);
+  while (!Worklist.empty()) {
+    MachineOperand *WOp = Worklist.pop_back_val();
+    if (!WOp->isReg() ||
+        !TargetRegisterInfo::isVirtualRegister(WOp->getReg()))
+      continue;
+    MachineInstr *DefInst = MRI.getUniqueVRegDef(WOp->getReg());
+    switch (DefInst->getOpcode()) {
+    default:
+      continue;
+    case AMDGPU::COPY:
+      Worklist.push_back(&DefInst->getOperand(1));
+      break;
+    case AMDGPU::REG_SEQUENCE:
+      Worklist.push_back(&DefInst->getOperand(1));
+      Worklist.push_back(&DefInst->getOperand(3));
+      break;
+    case AMDGPU::V_ADD_I32_e64:
+      BaseReg = DefInst->getOperand(2).getReg();
+      if (!TargetRegisterInfo::isVirtualRegister(BaseReg))
+        continue;
+      IndexReg = DefInst->getOperand(3).getReg();
+      if
(!TargetRegisterInfo::isVirtualRegister(IndexReg)) + continue; + // Chase the IndexReg. + MachineInstr * MI = MRI.getUniqueVRegDef(IndexReg); + if (!(MI && MI->isCopy())) + continue; + IndexReg = MI->getOperand(1).getReg(); + // Chase the BaseReg. + MI = MRI.getUniqueVRegDef(BaseReg); + if (!(MI && MI->isCopy())) + continue; + BaseReg = MI->getOperand(1).getReg(); + // Make sure Base is SReg and Index is VReg. + if (!TRI->isSGPRReg(MRI, BaseReg)) + return false; + if (!TRI->hasVGPRs(MRI.getRegClass(IndexReg))) + return false; + // clear any killed flags on Index and Base regs, used later. + MRI.clearKillFlags(IndexReg); + MRI.clearKillFlags(BaseReg); + return true; + } + } + return false; +} + +// Identify Global LOAD|STORE/ATOMIC and try to convert to _SADDR. +static bool fixupGlobalSaddr(MachineBasicBlock &MBB, + MachineFunction &MF, + MachineRegisterInfo &MRI, + const GCNSubtarget &ST, + const SIInstrInfo *TII, + const SIRegisterInfo *TRI) { + if (!EnableSIGlobalSAddr) + return false; + + bool FuncModified = false; + MachineBasicBlock::iterator I, Next; + for (I = MBB.begin(); I != MBB.end(); I = Next) { + Next = std::next(I); + MachineInstr &MI = *I; + int NewOpcd = AMDGPU::getGlobalSaddrOp(MI.getOpcode()); + if (NewOpcd < 0) + continue; + // Update our statistics on opportunities seen. + ++NumSGPRGlobalOccurs; + LLVM_DEBUG(dbgs() << "Global Mem opp " << MI << '\n'); + // Need a Base and Index or we cant transform to _SADDR. + unsigned BaseReg = 0; + unsigned IndexReg = 0; + MachineOperand *Op = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr); + if (!findSRegBaseAndIndex(Op, BaseReg, IndexReg, MRI, TRI)) + continue; + ++NumSGPRGlobalSaddrs; + FuncModified = true; + // Create the new _SADDR Memory instruction. 
+    bool HasVdst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst) != nullptr;
+    MachineOperand *VData = TII->getNamedOperand(MI, AMDGPU::OpName::vdata);
+    MachineInstr *NewGlob = nullptr;
+    if (HasVdst)
+      NewGlob = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpcd),
+                        MI.getOperand(0).getReg());
+    else
+      // No vdst field.
+      NewGlob = BuildMI(MBB, I, MI.getDebugLoc(), TII->get(NewOpcd));
+    NewGlob->addOperand(MF, MachineOperand::CreateReg(IndexReg, false));
+    if (VData)
+      NewGlob->addOperand(MF, *VData);
+    NewGlob->addOperand(MF, MachineOperand::CreateReg(BaseReg, false));
+    NewGlob->addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::offset));
+
+    MachineOperand *Glc = TII->getNamedOperand(MI, AMDGPU::OpName::glc);
+    // Atomics don't have a GLC operand, so omit the field if not there.
+    if (Glc)
+      NewGlob->addOperand(MF, *Glc);
+    NewGlob->addOperand(*TII->getNamedOperand(MI, AMDGPU::OpName::slc));
+    // _D16 instructions have a vdst_in operand, copy it in.
+    MachineOperand *VDstInOp = TII->getNamedOperand(MI,
+                                      AMDGPU::OpName::vdst_in);
+    if (VDstInOp)
+      NewGlob->addOperand(MF, *VDstInOp);
+    NewGlob->copyImplicitOps(MF, MI);
+    NewGlob->cloneMemRefs(MF, MI);
+    // Remove the old Global Memop instruction.
+    MI.eraseFromParent();
+    LLVM_DEBUG(dbgs() << "New Global Mem " << *NewGlob << '\n');
+  }
+  return FuncModified;
+}
+
+bool SIFixupVectorISel::runOnMachineFunction(MachineFunction &MF) {
+  // This pass does not run at -O0, avoid putting correctness patches here.
+  if (skipFunction(MF.getFunction()))
+    return false;
+
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+
+  bool FuncModified = false;
+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+       BI != BE; ++BI) {
+    // Cleanup missed Saddr opportunities from ISel.
+ FuncModified |= fixupGlobalSaddr(*BI, MF, MRI, ST, TII, TRI); + } + return FuncModified; +} Index: lib/Target/AMDGPU/SIInstrInfo.h =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.h +++ lib/Target/AMDGPU/SIInstrInfo.h @@ -944,6 +944,9 @@ LLVM_READONLY int getSOPKOp(uint16_t Opcode); + LLVM_READONLY + int getGlobalSaddrOp(uint16_t Opcode); + const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL; const uint64_t RSRC_ELEMENT_SIZE_SHIFT = (32 + 19); const uint64_t RSRC_INDEX_STRIDE_SHIFT = (32 + 21); Index: lib/Target/AMDGPU/SIInstrInfo.td =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.td +++ lib/Target/AMDGPU/SIInstrInfo.td @@ -2009,6 +2009,15 @@ let ValueCols = [["0"]]; } +// Maps a GLOBAL to its SADDR form. +def getGlobalSaddrOp : InstrMapping { + let FilterClass = "GlobalSaddrTable"; + let RowFields = ["SaddrOp"]; + let ColFields = ["IsSaddr"]; + let KeyCol = ["0"]; + let ValueCols = [["1"]]; +} + include "SIInstructions.td" include "DSInstructions.td" Index: test/CodeGen/AMDGPU/cgp-addressing-modes.ll =================================================================== --- test/CodeGen/AMDGPU/cgp-addressing-modes.ll +++ test/CodeGen/AMDGPU/cgp-addressing-modes.ll @@ -5,7 +5,8 @@ ; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SICIVI %s ; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=SICIVI %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-scalarize-global-loads=false -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=SICIVI %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false 
-amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false -amdgpu-sroa=0 -amdgpu-enable-si-global-saddr=0 < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -amdgpu-scalarize-global-loads=false -amdgpu-sroa=0 -amdgpu-enable-si-global-saddr=1 < %s | FileCheck -check-prefix=GCN -check-prefix=SADDRGFX9 %s
 
 target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
@@ -45,6 +46,7 @@
 ; GCN: s_and_saveexec_b64
 ; SICIVI: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
 ; GFX9: global_load_sbyte {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
+; SADDRGFX9: global_load_sbyte {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
 ; GCN: {{^}}BB1_2:
 ; GCN: s_or_b64 exec
 define amdgpu_kernel void @test_sink_global_small_max_i32_ds_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
Index: test/CodeGen/AMDGPU/conv2d-saddr.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/conv2d-saddr.ll
@@ -0,0 +1,171 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=SADDRGFX9 %s
+
+; Test generation of SADDR variants of instructions; this is a simple conv2d.
+ +; SADDRGFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} +; SADDRGFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} +; SADDRGFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} +; SADDRGFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} +; SADDRGFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} +; SADDRGFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} +; SADDRGFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} + +target triple = "amdgcn-unknown-amdhsa" + +; Function Attrs: convergent nounwind +define hidden amdgpu_kernel void @simpleConv2d(i32 addrspace(1)* nocapture %dst_image, i32 addrspace(1)* nocapture readonly %src_image, i32 addrspace(1)* nocapture readonly %conv_kernel) local_unnamed_addr #0 !kernel_arg_addr_space !5 !kernel_arg_access_qual !6 !kernel_arg_type !7 !kernel_arg_base_type !7 !kernel_arg_type_qual !8 { +entry: + %0 = tail call i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #2 + %1 = tail call i32 @llvm.amdgcn.workitem.id.x() #3, !range !9 + %2 = tail call i32 @llvm.amdgcn.workgroup.id.x() #3 + %3 = getelementptr inbounds i8, i8 addrspace(4)* %0, i64 4 + %4 = bitcast i8 addrspace(4)* %3 to i16 addrspace(4)* + %5 = load i16, i16 addrspace(4)* %4, align 4, !tbaa !10 + %6 = zext i16 %5 to i32 + %7 = mul i32 %2, %6 + %8 = add i32 %7, %1 + %9 = tail call i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #2 + %10 = bitcast i8 addrspace(4)* %9 to i64 addrspace(4)* + %11 = load i64, i64 addrspace(4)* %10, align 8, !tbaa !19 + %12 = zext i32 %8 to i64 + %13 = add i64 %11, %12 + %conv = trunc i64 %13 to i32 + %14 = tail call i32 @llvm.amdgcn.workitem.id.y() #3, !range !9 + %15 = tail call i32 @llvm.amdgcn.workgroup.id.y() #3 + %16 = getelementptr inbounds i8, i8 addrspace(4)* %0, i64 6 + %17 = 
bitcast i8 addrspace(4)* %16 to i16 addrspace(4)* + %18 = load i16, i16 addrspace(4)* %17, align 2, !tbaa !20 + %19 = zext i16 %18 to i32 + %20 = mul i32 %15, %19 + %21 = add i32 %20, %14 + %22 = getelementptr inbounds i8, i8 addrspace(4)* %9, i64 8 + %23 = bitcast i8 addrspace(4)* %22 to i64 addrspace(4)* + %24 = load i64, i64 addrspace(4)* %23, align 8, !tbaa !19 + %25 = zext i32 %21 to i64 + %26 = add i64 %24, %25 + %conv2 = trunc i64 %26 to i32 + %27 = mul i32 %conv, 3 + %mul = add i32 %27, -3 + %add9 = add i32 %mul, %conv2 + %add10 = add i32 %add9, -1 + %idxprom = sext i32 %add10 to i64 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %src_image, i64 %idxprom + %28 = load i32, i32 addrspace(1)* %arrayidx, align 4, !tbaa !21 + %29 = load i32, i32 addrspace(1)* %conv_kernel, align 4, !tbaa !21 + %mul17 = mul nsw i32 %29, %28 + %idxprom.1 = sext i32 %add9 to i64 + %arrayidx.1 = getelementptr inbounds i32, i32 addrspace(1)* %src_image, i64 %idxprom.1 + %30 = load i32, i32 addrspace(1)* %arrayidx.1, align 4, !tbaa !21 + %arrayidx16.1 = getelementptr inbounds i32, i32 addrspace(1)* %conv_kernel, i64 3 + %31 = load i32, i32 addrspace(1)* %arrayidx16.1, align 4, !tbaa !21 + %mul17.1 = mul nsw i32 %31, %30 + %add18.1 = add nsw i32 %mul17.1, %mul17 + %add10.2 = add i32 %add9, 1 + %idxprom.2 = sext i32 %add10.2 to i64 + %arrayidx.2 = getelementptr inbounds i32, i32 addrspace(1)* %src_image, i64 %idxprom.2 + %32 = load i32, i32 addrspace(1)* %arrayidx.2, align 4, !tbaa !21 + %arrayidx16.2 = getelementptr inbounds i32, i32 addrspace(1)* %conv_kernel, i64 6 + %33 = load i32, i32 addrspace(1)* %arrayidx16.2, align 4, !tbaa !21 + %mul17.2 = mul nsw i32 %33, %32 + %add18.2 = add nsw i32 %mul17.2, %add18.1 + %mul.1 = mul nsw i32 %conv, 3 + %add9.1 = add i32 %mul.1, %conv2 + %add10.1 = add i32 %add9.1, -1 + %idxprom.145 = sext i32 %add10.1 to i64 + %arrayidx.146 = getelementptr inbounds i32, i32 addrspace(1)* %src_image, i64 %idxprom.145 + %34 = load i32, i32 
addrspace(1)* %arrayidx.146, align 4, !tbaa !21 + %arrayidx16.148 = getelementptr inbounds i32, i32 addrspace(1)* %conv_kernel, i64 1 + %35 = load i32, i32 addrspace(1)* %arrayidx16.148, align 4, !tbaa !21 + %mul17.149 = mul nsw i32 %35, %34 + %add18.150 = add nsw i32 %mul17.149, %add18.2 + %idxprom.1.1 = sext i32 %add9.1 to i64 + %arrayidx.1.1 = getelementptr inbounds i32, i32 addrspace(1)* %src_image, i64 %idxprom.1.1 + %36 = load i32, i32 addrspace(1)* %arrayidx.1.1, align 4, !tbaa !21 + %arrayidx16.1.1 = getelementptr inbounds i32, i32 addrspace(1)* %conv_kernel, i64 4 + %37 = load i32, i32 addrspace(1)* %arrayidx16.1.1, align 4, !tbaa !21 + %mul17.1.1 = mul nsw i32 %37, %36 + %add18.1.1 = add nsw i32 %mul17.1.1, %add18.150 + %add10.2.1 = add i32 %add9.1, 1 + %idxprom.2.1 = sext i32 %add10.2.1 to i64 + %arrayidx.2.1 = getelementptr inbounds i32, i32 addrspace(1)* %src_image, i64 %idxprom.2.1 + %38 = load i32, i32 addrspace(1)* %arrayidx.2.1, align 4, !tbaa !21 + %arrayidx16.2.1 = getelementptr inbounds i32, i32 addrspace(1)* %conv_kernel, i64 7 + %39 = load i32, i32 addrspace(1)* %arrayidx16.2.1, align 4, !tbaa !21 + %mul17.2.1 = mul nsw i32 %39, %38 + %add18.2.1 = add nsw i32 %mul17.2.1, %add18.1.1 + %40 = mul i32 %conv, 3 + %mul.2 = add i32 %40, 3 + %add9.2 = add i32 %mul.2, %conv2 + %add10.251 = add i32 %add9.2, -1 + %idxprom.252 = sext i32 %add10.251 to i64 + %arrayidx.253 = getelementptr inbounds i32, i32 addrspace(1)* %src_image, i64 %idxprom.252 + %41 = load i32, i32 addrspace(1)* %arrayidx.253, align 4, !tbaa !21 + %arrayidx16.255 = getelementptr inbounds i32, i32 addrspace(1)* %conv_kernel, i64 2 + %42 = load i32, i32 addrspace(1)* %arrayidx16.255, align 4, !tbaa !21 + %mul17.256 = mul nsw i32 %42, %41 + %add18.257 = add nsw i32 %mul17.256, %add18.2.1 + %idxprom.1.2 = sext i32 %add9.2 to i64 + %arrayidx.1.2 = getelementptr inbounds i32, i32 addrspace(1)* %src_image, i64 %idxprom.1.2 + %43 = load i32, i32 addrspace(1)* %arrayidx.1.2, align 4, !tbaa !21 
+ %arrayidx16.1.2 = getelementptr inbounds i32, i32 addrspace(1)* %conv_kernel, i64 5 + %44 = load i32, i32 addrspace(1)* %arrayidx16.1.2, align 4, !tbaa !21 + %mul17.1.2 = mul nsw i32 %44, %43 + %add18.1.2 = add nsw i32 %mul17.1.2, %add18.257 + %add10.2.2 = add i32 %add9.2, 1 + %idxprom.2.2 = sext i32 %add10.2.2 to i64 + %arrayidx.2.2 = getelementptr inbounds i32, i32 addrspace(1)* %src_image, i64 %idxprom.2.2 + %45 = load i32, i32 addrspace(1)* %arrayidx.2.2, align 4, !tbaa !21 + %arrayidx16.2.2 = getelementptr inbounds i32, i32 addrspace(1)* %conv_kernel, i64 8 + %46 = load i32, i32 addrspace(1)* %arrayidx16.2.2, align 4, !tbaa !21 + %mul17.2.2 = mul nsw i32 %46, %45 + %add18.2.2 = add nsw i32 %mul17.2.2, %add18.1.2 + %mul22 = mul i64 %13, 3 + %add23 = add i64 %26, %mul22 + %sext = shl i64 %add23, 32 + %idxprom24 = ashr exact i64 %sext, 32 + %arrayidx25 = getelementptr inbounds i32, i32 addrspace(1)* %dst_image, i64 %idxprom24 + store i32 %add18.2.2, i32 addrspace(1)* %arrayidx25, align 4, !tbaa !21 + ret void +} + +; Function Attrs: nounwind readnone speculatable +declare i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() #1 + +; Function Attrs: nounwind readnone speculatable +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +; Function Attrs: nounwind readnone speculatable +declare i32 @llvm.amdgcn.workgroup.id.x() #1 + +; Function Attrs: nounwind readnone speculatable +declare i8 addrspace(4)* @llvm.amdgcn.implicitarg.ptr() #1 + +; Function Attrs: nounwind readnone speculatable +declare i32 @llvm.amdgcn.workitem.id.y() #1 + +; Function Attrs: nounwind readnone speculatable +declare i32 @llvm.amdgcn.workgroup.id.y() #1 + +attributes #0 = { convergent nounwind "amdgpu-implicitarg-num-bytes"="48" "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" 
"no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="gfx900" "target-features"="+16-bit-insts,+ci-insts,+dpp,+fp32-denormals,+fp64-fp16-denormals,+gfx9-insts,+s-memrealtime,+vi-insts" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone speculatable } +attributes #2 = { convergent nounwind readnone } +attributes #3 = { nounwind } + + +!5 = !{i32 1, i32 1, i32 1} +!6 = !{!"none", !"none", !"none"} +!7 = !{!"int*", !"int*", !"int*"} +!8 = !{!"", !"", !""} +!9 = !{i32 0, i32 1024} +!10 = !{!11, !12, i64 4} +!11 = !{!"hsa_kernel_dispatch_packet_s", !12, i64 0, !12, i64 2, !12, i64 4, !12, i64 6, !12, i64 8, !12, i64 10, !15, i64 12, !15, i64 16, !15, i64 20, !15, i64 24, !15, i64 28, !16, i64 32, !17, i64 40, !16, i64 48, !18, i64 56} +!12 = !{!"short", !13, i64 0} +!13 = !{!"omnipotent char", !14, i64 0} +!14 = !{!"Simple C/C++ TBAA"} +!15 = !{!"int", !13, i64 0} +!16 = !{!"long", !13, i64 0} +!17 = !{!"any pointer", !13, i64 0} +!18 = !{!"hsa_signal_s", !16, i64 0} +!19 = !{!16, !16, i64 0} +!20 = !{!11, !12, i64 6} +!21 = !{!15, !15, i64 0} Index: test/CodeGen/AMDGPU/ds_write2.ll =================================================================== --- test/CodeGen/AMDGPU/ds_write2.ll +++ test/CodeGen/AMDGPU/ds_write2.ll @@ -1,5 +1,6 @@ ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,CI %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+flat-for-global < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+flat-for-global -amdgpu-enable-si-global-saddr=0 < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 
-verify-machineinstrs -mattr=+load-store-opt,+flat-for-global -amdgpu-enable-si-global-saddr=1 < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,SADDRGFX9 %s @lds = addrspace(3) global [512 x float] undef, align 4 @lds.f64 = addrspace(3) global [512 x double] undef, align 8 @@ -33,6 +34,8 @@ ; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}} ; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off offset:4 +; SADDRGFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} +; SADDRGFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:4 ; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} ; GCN: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8 @@ -107,6 +110,8 @@ ; GFX9: global_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}} ; GFX9: global_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}} +; SADDRGFX9: global_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}} +; SADDRGFX9: global_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}} ; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8 ; GCN: s_endpgm @@ -179,6 +184,8 @@ ; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}} ; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off offset:4 +; SADDRGFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}{{$}} +; SADDRGFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:4 ; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} ; GCN: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255 @@ -364,6 +371,8 @@ ; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}} ; GFX9-DAG: global_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off offset:8 +; SADDRGFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], 
{{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}{{$}} +; SADDRGFX9-DAG: global_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:8 ; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}} Index: test/CodeGen/AMDGPU/ds_write2st64.ll =================================================================== --- test/CodeGen/AMDGPU/ds_write2st64.ll +++ test/CodeGen/AMDGPU/ds_write2st64.ll @@ -1,5 +1,6 @@ ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt -amdgpu-enable-si-global-saddr=0 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt -amdgpu-enable-si-global-saddr=1 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SADDRGFX9 %s @lds = addrspace(3) global [512 x float] undef, align 4 @@ -32,6 +33,8 @@ ; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}} ; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off offset:4 +; SADDRGFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}{{$}} +; SADDRGFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:4 ; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} @@ -61,6 +64,8 @@ ; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}} ; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off offset:4 +; SADDRGFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}{{$}} +; SADDRGFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, 
{{s\[[0-9]+:[0-9]+\]}} offset:4 ; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 2, v{{[0-9]+}} ; GCN: v_add_{{i|u}}32_e32 [[VPTR:v[0-9]+]], {{(vcc, )?}}s{{[0-9]+}}, [[SHL]] @@ -89,6 +94,8 @@ ; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}} ; GFX9-DAG: global_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off offset:8 +; SADDRGFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}{{$}} +; SADDRGFX9-DAG: global_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} offset:8 ; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 3, v{{[0-9]+}} ; GCN: v_add_{{i|u}}32_e32 [[VPTR:v[0-9]+]], {{(vcc, )?}}s{{[0-9]+}}, [[SHL]] Index: test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll =================================================================== --- test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -1,4 +1,5 @@ -; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global,-fp64-fp16-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s +; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global,-fp64-fp16-denormals -amdgpu-enable-si-global-saddr=0 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s +; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global,-fp64-fp16-denormals -amdgpu-enable-si-global-saddr=1 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s ; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,VI,GFX89 %s ; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-amdgpu-aa=0 -mattr=+flat-for-global < 
%s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,CI %s @@ -444,40 +445,12 @@ ret void } -; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr: -; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}} -; GCN-DAG: s_movk_i32 [[K:s[0-9]+]], 0x3e7 - -; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]] -; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]] - -; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]] -; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]] - -; CI-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]] -; CI-DAG: v_lshl_b32_e32 [[MASK:v[0-9]+]], 0xffff, [[SCALED_IDX]] - -; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VEC]] -; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define amdgpu_kernel void @v_insertelement_v2i16_dynamic_vgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 { - %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 - %tid.ext = sext i32 %tid to i64 - %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext - %idx.gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext - %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext - %idx = load i32, i32 addrspace(1)* %idx.gep - %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep - %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx - store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep - ret void -} - ; GCN-LABEL: {{^}}v_insertelement_v2f16_dynamic_vgpr: ; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}} ; GCN-DAG: s_movk_i32 [[K:s[0-9]+]], 0x1234 -; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]] -; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]] +; GCN-DAG: {{flat|global}}_load_dword [[IDX:v[0-9]+]] +; GCN-DAG: {{flat|global}}_load_dword [[VEC:v[0-9]+]] ; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]] ; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], 
[[SCALED_IDX]], [[MASKK]] Index: test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-nosaddr.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-nosaddr.ll @@ -0,0 +1,37 @@ +; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global,-fp64-fp16-denormals -amdgpu-enable-si-global-saddr=0 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s +; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,VI,GFX89 %s +; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,CI %s + +; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr: +; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}} +; GCN-DAG: s_movk_i32 [[K:s[0-9]+]], 0x3e7 + +; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]] +; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]] + +; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]] +; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]] + +; CI-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]] +; CI-DAG: v_lshl_b32_e32 [[MASK:v[0-9]+]], 0xffff, [[SCALED_IDX]] + +; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VEC]] +; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define amdgpu_kernel void @v_insertelement_v2i16_dynamic_vgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext + %idx.gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds <2 x 
i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext + %idx = load i32, i32 addrspace(1)* %idx.gep + %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep + %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx + store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep + ret void +} + + +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } Index: test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll @@ -0,0 +1,36 @@ +; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global,-fp64-fp16-denormals -amdgpu-enable-si-global-saddr=1 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s + +; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr: + +; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]] +; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]] + +; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}} +; GCN-DAG: s_movk_i32 [[K:s[0-9]+]], 0x3e7 + +; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]] +; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]] + +; CI-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]] +; CI-DAG: v_lshl_b32_e32 [[MASK:v[0-9]+]], 0xffff, [[SCALED_IDX]] + +; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VEC]] +; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +define amdgpu_kernel void @v_insertelement_v2i16_dynamic_vgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 + %tid.ext = sext i32 %tid to i64 + %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext + %idx.gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds 
<2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext + %idx = load i32, i32 addrspace(1)* %idx.gep + %vec = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep + %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx + store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out.gep + ret void +} + + +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } Index: test/CodeGen/AMDGPU/madak.ll =================================================================== --- test/CodeGen/AMDGPU/madak.ll +++ test/CodeGen/AMDGPU/madak.ll @@ -1,6 +1,7 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX6 %s ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8_9 %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GFX8_9 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-si-global-saddr=0 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX8_9 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-si-global-saddr=1 < %s | FileCheck -check-prefixes=FAILGCN,GFX9,FAILGFX8_9 %s declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone declare float @llvm.fabs.f32(float) nounwind readnone @@ -213,6 +214,7 @@ ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[MADAK]], [[VGPR]] ; GFX6: buffer_store_dword [[MUL]] ; GFX8_9: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[MUL]] +; FAILGFX8_9: {{flat|global}}_store_dword v[{{[0-9:]+}}], define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], float %sgpr0, float %sgpr1) #0 { bb: %tmp = icmp eq i32 %arg1, 0 Index: test/CodeGen/AMDGPU/memory-legalizer-load.ll =================================================================== --- test/CodeGen/AMDGPU/memory-legalizer-load.ll +++ test/CodeGen/AMDGPU/memory-legalizer-load.ll @@ -319,7 +319,7 @@ ; GCN-LABEL: {{^}}nontemporal_global_1: ; GFX8: 
flat_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}} -; GFX9: global_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], off glc slc{{$}} +; GFX9: global_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}} define amdgpu_kernel void @nontemporal_global_1( i32 addrspace(1)* %in, i32* %out) { entry: Index: test/CodeGen/AMDGPU/memory-legalizer-store.ll =================================================================== --- test/CodeGen/AMDGPU/memory-legalizer-store.ll +++ test/CodeGen/AMDGPU/memory-legalizer-store.ll @@ -240,7 +240,7 @@ ; GCN-LABEL: {{^}}nontemporal_global_1: ; GFX8: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc slc{{$}} -; GFX9: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off glc slc{{$}} +; GFX9: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}} define amdgpu_kernel void @nontemporal_global_1( i32* %in, i32 addrspace(1)* %out) { entry: Index: test/CodeGen/AMDGPU/memory_clause.ll =================================================================== --- test/CodeGen/AMDGPU/memory_clause.ll +++ test/CodeGen/AMDGPU/memory_clause.ll @@ -1,4 +1,5 @@ -; RUN: llc -march=amdgcn -mcpu=gfx902 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx902 -verify-machineinstrs -amdgpu-enable-si-global-saddr=0 < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx902 -verify-machineinstrs -amdgpu-enable-si-global-saddr=1 < %s | FileCheck -check-prefix=SADDRGCN %s ; GCN-LABEL: {{^}}vector_clause: ; GCN: global_load_dwordx4 @@ -111,6 +112,13 @@ ; GCN-NEXT: s_nop 0 ; GCN-NEXT: global_load_dwordx4 v[{{[0-9:]+}}], [[ADDR]], off ; GCN-NEXT: global_load_dwordx4 v[{{[0-9:]+}}], [[ADDR]], off offset:16 +; SADDRGCN-LABEL: {{^}}vector_clause_indirect: +; SADDRGCN: global_load_dwordx2 [[ADDR:v\[[0-9:]+\]]], v[{{[0-9:]+}}], s[{{[0-9:]+}}] +; SADDRGCN-NEXT: s_nop 0 +; SADDRGCN-NEXT: s_waitcnt vmcnt(0) +; 
SADDRGCN-NEXT: s_nop 0 +; SADDRGCN-NEXT: global_load_dwordx4 v[{{[0-9:]+}}], [[ADDR]], off +; SADDRGCN-NEXT: global_load_dwordx4 v[{{[0-9:]+}}], [[ADDR]], off offset:16 define amdgpu_kernel void @vector_clause_indirect(i64 addrspace(1)* noalias nocapture readonly %arg, <4 x i32> addrspace(1)* noalias nocapture readnone %arg1, <4 x i32> addrspace(1)* noalias nocapture %arg2) { bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() Index: test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll =================================================================== --- test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll +++ test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll @@ -1,5 +1,6 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi -amdgpu-enable-si-global-saddr=0 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi -amdgpu-enable-si-global-saddr=1 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SADDRGFX9 %s declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) declare void @llvm.amdgcn.tbuffer.store.v4i32(<4 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i1, i1) @@ -275,6 +276,14 @@ ; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, off offset:36 ; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, off 
offset:52 +; SADDRGFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:20 +; SADDRGFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:12 +; SADDRGFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:28 +; SADDRGFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} offset:44 + +; SADDRGFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:36 +; SADDRGFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:52 + define amdgpu_kernel void @reorder_global_offsets_addr64_soffset0(i32 addrspace(1)* noalias nocapture %ptr.base) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() %id.ext = sext i32 %id to i64