Index: include/llvm/CodeGen/AsmPrinter.h
===================================================================
--- include/llvm/CodeGen/AsmPrinter.h
+++ include/llvm/CodeGen/AsmPrinter.h
@@ -620,6 +620,13 @@
   virtual void emitInlineAsmEnd(const MCSubtargetInfo &StartInfo,
                                 const MCSubtargetInfo *EndInfo) const;
 
+  /// This emits visibility information about symbol, if this is supported by
+  /// the target.
+  void EmitVisibility(MCSymbol *Sym, unsigned Visibility,
+                      bool IsDefinition = true) const;
+
+  void EmitLinkage(const GlobalValue *GV, MCSymbol *GVSym) const;
+
 private:
   /// Private state for PrintSpecial()
   // Assign a unique ID to this machine instruction.
@@ -650,13 +657,6 @@
   // Internal Implementation Details
   //===------------------------------------------------------------------===//
 
-  /// This emits visibility information about symbol, if this is supported by
-  /// the target.
-  void EmitVisibility(MCSymbol *Sym, unsigned Visibility,
-                      bool IsDefinition = true) const;
-
-  void EmitLinkage(const GlobalValue *GV, MCSymbol *GVSym) const;
-
   void EmitJumpTableEntry(const MachineJumpTableInfo *MJTI,
                           const MachineBasicBlock *MBB, unsigned uid) const;
   void EmitLLVMUsedList(const ConstantArray *InitList);
Index: include/llvm/CodeGen/MachineOperand.h
===================================================================
--- include/llvm/CodeGen/MachineOperand.h
+++ include/llvm/CodeGen/MachineOperand.h
@@ -713,6 +713,10 @@
   /// ChangeToES - Replace this operand with a new external symbol operand.
   void ChangeToES(const char *SymName, unsigned char TargetFlags = 0);
 
+  /// ChangeToGA - Replace this operand with a new global address operand.
+  void ChangeToGA(const GlobalValue *GV, int64_t Offset,
+                  unsigned char TargetFlags = 0);
+
   /// ChangeToMCSymbol - Replace this operand with a new MC symbol operand.
   void ChangeToMCSymbol(MCSymbol *Sym);
 
Index: lib/CodeGen/MachineOperand.cpp
===================================================================
--- lib/CodeGen/MachineOperand.cpp
+++ lib/CodeGen/MachineOperand.cpp
@@ -181,6 +181,19 @@
   setTargetFlags(TargetFlags);
 }
 
+void MachineOperand::ChangeToGA(const GlobalValue *GV, int64_t Offset,
+                                unsigned char TargetFlags) {
+  assert((!isReg() || !isTied()) &&
+         "Cannot change a tied operand into a global address");
+
+  removeRegFromUses();
+
+  OpKind = MO_GlobalAddress;
+  Contents.OffsetedInfo.Val.GV = GV;
+  setOffset(Offset);
+  setTargetFlags(TargetFlags);
+}
+
 void MachineOperand::ChangeToMCSymbol(MCSymbol *Sym) {
   assert((!isReg() || !isTied()) &&
          "Cannot change a tied operand into an MCSymbol");
Index: lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -285,10 +285,39 @@
 }
 
 void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
+  if (GV->getType()->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+    const Triple::OSType OS = TM.getTargetTriple().getOS();
+
+    // LDS variables aren't emitted in HSA or PAL yet.
+    if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
+      return;
+
+    if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())) {
+      OutContext.reportError({},
+                             Twine(GV->getName()) +
+                                 ": unsupported initializer for address space");
+      return;
+    }
+
+    MCSymbol *GVSym = getSymbol(GV);
+
+    GVSym->redefineIfPossible();
+    if (GVSym->isDefined() || GVSym->isVariable())
+      report_fatal_error("symbol '" + Twine(GVSym->getName()) +
+                         "' is already defined");
 
-  // Group segment variables aren't emitted in HSA.
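+    // Emit the symbol together with its explicit size and alignment so that
+    // LDS layout can be performed at link time; a global without a stated
+    // alignment is conservatively given 4 bytes below.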
-  if (AMDGPU::isGroupSegment(GV))
+    const DataLayout &DL = GV->getParent()->getDataLayout();
+    uint64_t Size = DL.getTypeAllocSize(GV->getType()->getElementType());
+    unsigned Align = GV->getAlignment();
+    if (!Align)
+      Align = 4;
+
+    EmitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
+    EmitLinkage(GV, GVSym);
+    OutStreamer->emitELFSize(GVSym, MCConstantExpr::create(Size, OutContext));
+    getTargetStreamer()->emitAMDGPULDS(GVSym, Align);
     return;
+  }
 
   AsmPrinter::EmitGlobalVariable(GV);
 }
Index: lib/Target/AMDGPU/SIFoldOperands.cpp
===================================================================
--- lib/Target/AMDGPU/SIFoldOperands.cpp
+++ lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -50,7 +50,7 @@
     } else if (FoldOp->isFI()) {
       FrameIndexToFold = FoldOp->getIndex();
     } else {
-      assert(FoldOp->isReg());
+      assert(FoldOp->isReg() || FoldOp->isGlobal());
       OpToFold = FoldOp;
     }
   }
@@ -67,6 +67,8 @@
     return Kind == MachineOperand::MO_Register;
   }
 
+  bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; }
+
   bool isCommuted() const {
     return Commuted;
   }
@@ -252,6 +254,12 @@
 
   assert(!Fold.needsShrink() && "not handled");
 
+  if (Fold.isGlobal()) {
+    Old.ChangeToGA(Fold.OpToFold->getGlobal(), Fold.OpToFold->getOffset(),
+                   Fold.OpToFold->getTargetFlags());
+    return true;
+  }
+
   if (Fold.isFI()) {
     Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
     return true;
@@ -443,8 +451,7 @@
     return;
   }
 
-
-  bool FoldingImm = OpToFold.isImm();
+  bool FoldingImm = OpToFold.isImm() || OpToFold.isGlobal();
 
   if (FoldingImm && UseMI->isCopy()) {
     unsigned DestReg = UseMI->getOperand(0).getReg();
@@ -805,7 +812,7 @@
   SmallVector<FoldCandidate, 4> FoldList;
   MachineOperand &Dst = MI.getOperand(0);
 
-  bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
+  bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
   if (FoldingImm) {
     unsigned NumLiteralUses = 0;
     MachineOperand *NonInlineUse = nullptr;
@@ -1151,7 +1158,8 @@
   }
 
   MachineOperand &OpToFold = MI.getOperand(1);
-  bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
+  bool FoldingImm =
+      OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
 
   // FIXME: We could also be folding things like TargetIndexes.
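+  // Global address operands can be folded like immediates from here on: once
+  // lowered they are 32-bit literals carrying @abs32@ relocations, so they
+  // occupy the same operand slots as plain immediates.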
   if (!FoldingImm && !OpToFold.isReg())
Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -349,6 +349,10 @@
 
   void finalizeLowering(MachineFunction &MF) const override;
 
+  void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known,
+                                     const APInt &DemandedElts,
+                                     const SelectionDAG &DAG,
+                                     unsigned Depth = 0) const override;
   void computeKnownBitsForFrameIndex(const SDValue Op, KnownBits &Known,
                                      const APInt &DemandedElts,
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3437,6 +3437,8 @@
   }
   case AMDGPU::GET_GROUPSTATICSIZE: {
+    assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
+           getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
     DebugLoc DL = MI.getDebugLoc();
     BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
         .add(MI.getOperand(0))
@@ -4548,7 +4550,9 @@
                                              SelectionDAG &DAG) const {
   GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
   const GlobalValue *GV = GSD->getGlobal();
-  if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+  if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
+       (getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
+        getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL)) ||
       GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
       GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
     return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
@@ -4557,6 +4561,15 @@
   EVT PtrVT = Op.getValueType();
 
   // FIXME: Should not make address space based decisions here.
+
+  if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+    SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
+                                            SIInstrInfo::MO_ABS32_LO);
+    GA = DAG.getNode(ISD::AssertZext, DL, MVT::i32, GA,
+                     DAG.getValueType(MVT::i16));
+    return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
+  }
+
   if (shouldEmitFixup(GV))
     return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
   else if (shouldEmitPCReloc(GV))
@@ -5479,6 +5492,18 @@
   case Intrinsic::amdgcn_fmad_ftz:
     return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
                        Op.getOperand(2), Op.getOperand(3));
+  case Intrinsic::amdgcn_groupstaticsize: {
+    Triple::OSType OS = getTargetMachine().getTargetTriple().getOS();
+    if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
+      return Op;
+
+    const Module *M = MF.getFunction().getParent();
+    const GlobalValue *GV =
+        M->getNamedValue(Intrinsic::getName(Intrinsic::amdgcn_groupstaticsize));
+    SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
+                                            SIInstrInfo::MO_ABS32_LO);
+    return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
+  }
   default:
     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
             AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
@@ -9874,6 +9899,27 @@
   TargetLoweringBase::finalizeLowering(MF);
 }
 
+void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
+                                                     KnownBits &Known,
+                                                     const APInt &DemandedElts,
+                                                     const SelectionDAG &DAG,
+                                                     unsigned Depth) const {
+  if (Op->isMachineOpcode()) {
+    switch (Op->getMachineOpcode()) {
+    case AMDGPU::S_MOV_B32:
+    case AMDGPU::V_MOV_B32_e32:
+      Known = DAG.computeKnownBits(Op->getOperand(0), DemandedElts, Depth + 1);
+      break;
+    default:
+      break;
+    }
+    return;
+  }
+
+  AMDGPUTargetLowering::computeKnownBitsForTargetNode(Op, Known, DemandedElts,
+                                                      DAG, Depth);
+}
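+// The S_MOV_B32 / V_MOV_B32_e32 cases above forward the moved operand's known
+// bits, so the AssertZext range attached to LDS @abs32@lo addresses (see
+// LowerGlobalAddress) stays visible through the wrapping mov and DS offset
+// folding can still fire.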
+
 void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op,
                                                      KnownBits &Known,
                                                      const APInt &DemandedElts,
Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2598,7 +2598,7 @@
                                         const MachineOperand &MO) const {
   const MCOperandInfo &OpInfo = get(MI.getOpcode()).OpInfo[OpNo];
 
-  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
+  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
 
   if (OpInfo.OperandType == MCOI::OPERAND_IMMEDIATE)
     return true;
@@ -3471,7 +3471,7 @@
     return isLegalRegOperand(MRI, OpInfo, MO);
 
   // Handle non-register types that are treated like immediates.
-  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
+  assert(MO.isImm() || MO.isTargetIndex() || MO.isFI() || MO.isGlobal());
 
   return true;
 }
@@ -3512,7 +3512,7 @@
   }
 
   // Handle non-register types that are treated like immediates.
-  assert(MO->isImm() || MO->isTargetIndex() || MO->isFI());
+  assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
 
   if (!DefinedRC) {
     // This operand expects an immediate.
Index: test/CodeGen/AMDGPU/32-bit-local-address-space.ll
===================================================================
--- test/CodeGen/AMDGPU/32-bit-local-address-space.ll
+++ test/CodeGen/AMDGPU/32-bit-local-address-space.ll
@@ -81,8 +81,8 @@
 @g_lds = addrspace(3) global float undef, align 4
 
 ; FUNC-LABEL: {{^}}infer_ptr_alignment_global_offset:
-; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0
-; SI: ds_read_b32 v{{[0-9]+}}, [[REG]]
+; SI: v_mov_b32_e32 [[PTR:v[0-9]+]], g_lds@abs32@lo
+; SI: ds_read_b32 v{{[0-9]+}}, [[PTR]]
 define amdgpu_kernel void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %tid) {
   %val = load float, float addrspace(3)* @g_lds
   store float %val, float addrspace(1)* %out
Index: test/CodeGen/AMDGPU/constant-fold-mi-operands.ll
===================================================================
--- test/CodeGen/AMDGPU/constant-fold-mi-operands.ll
+++ test/CodeGen/AMDGPU/constant-fold-mi-operands.ll
@@ -1,90 +1,6 @@
 ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
-; GCN-LABEL: {{^}}fold_mi_v_and_0:
-; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
-; GCN-NOT: [[RESULT]]
-; GCN: buffer_store_dword [[RESULT]]
-define amdgpu_kernel void @fold_mi_v_and_0(i32 addrspace(1)* %out) {
-  %x = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
-  %size = call i32 @llvm.amdgcn.groupstaticsize()
-  %and = and i32 %size, %x
-  store i32 %and, i32 addrspace(1)* %out
-  ret void
-}
-
-; GCN-LABEL: {{^}}fold_mi_s_and_0:
-; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
-; GCN-NOT: [[RESULT]]
-; GCN: buffer_store_dword [[RESULT]]
-define amdgpu_kernel void @fold_mi_s_and_0(i32 addrspace(1)* %out, i32 %x) #0 {
-  %size = call i32 @llvm.amdgcn.groupstaticsize()
-  %and = and i32 %size, %x
-  store i32 %and, i32 addrspace(1)* %out
-  ret void
-}
-
-; GCN-LABEL: {{^}}fold_mi_v_or_0:
-; GCN: v_mbcnt_lo_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]]
-; GCN-NOT: [[RESULT]]
-; GCN: buffer_store_dword [[RESULT]]
-define amdgpu_kernel void @fold_mi_v_or_0(i32 addrspace(1)* %out) {
-  %x = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
-  %size = call i32 @llvm.amdgcn.groupstaticsize()
-  %or = or i32 %size, %x
-  store i32 %or, i32 addrspace(1)* %out
-  ret void
-}
-
-; GCN-LABEL: {{^}}fold_mi_s_or_0:
-; GCN: s_load_dword [[SVAL:s[0-9]+]]
-; GCN-NOT: [[SVAL]]
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
-; GCN-NOT: [[VVAL]]
-; GCN: buffer_store_dword [[VVAL]]
-define amdgpu_kernel void @fold_mi_s_or_0(i32 addrspace(1)* %out, i32 %x) #0 {
-  %size = call i32 @llvm.amdgcn.groupstaticsize()
-  %or = or i32 %size, %x
-  store i32 %or, i32 addrspace(1)* %out
-  ret void
-}
-
-; GCN-LABEL: {{^}}fold_mi_v_xor_0:
-; GCN: v_mbcnt_lo_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]]
-; GCN-NOT: [[RESULT]]
-; GCN: buffer_store_dword [[RESULT]]
-define amdgpu_kernel void @fold_mi_v_xor_0(i32 addrspace(1)* %out) {
-  %x = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
-  %size = call i32 @llvm.amdgcn.groupstaticsize()
-  %xor = xor i32 %size, %x
-  store i32 %xor, i32 addrspace(1)* %out
-  ret void
-}
-
-; GCN-LABEL: {{^}}fold_mi_s_xor_0:
-; GCN: s_load_dword [[SVAL:s[0-9]+]]
-; GCN-NOT: [[SVAL]]
-; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
-; GCN-NOT: [[VVAL]]
-; GCN: buffer_store_dword [[VVAL]]
-define amdgpu_kernel void @fold_mi_s_xor_0(i32 addrspace(1)* %out, i32 %x) #0 {
-  %size = call i32 @llvm.amdgcn.groupstaticsize()
-  %xor = xor i32 %size, %x
-  store i32 %xor, i32 addrspace(1)* %out
-  ret void
-}
-
-; GCN-LABEL: {{^}}fold_mi_s_not_0:
-; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], -1{{$}}
-; GCN-NOT: [[RESULT]]
-; GCN: buffer_store_dword [[RESULT]]
-define amdgpu_kernel void @fold_mi_s_not_0(i32 addrspace(1)* %out, i32 %x) #0 {
-  %size = call i32 @llvm.amdgcn.groupstaticsize()
-  %xor = xor i32 %size, -1
-  store i32 %xor, i32 addrspace(1)* %out
-  ret void
-}
-
 ; GCN-LABEL: {{^}}fold_mi_v_not_0:
 ; GCN: v_bcnt_u32_b32{{(_e64)*}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, 0{{$}}
 ; GCN: v_bcnt_u32_b32{{(_e32)*(_e64)*}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, v[[RESULT_LO]]{{$}}
@@ -138,7 +54,6 @@
 
 declare i64 @llvm.ctpop.i64(i64) #1
 declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
-declare i32 @llvm.amdgcn.groupstaticsize() #1
 
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
Index: test/CodeGen/AMDGPU/ds-sub-offset.ll
===================================================================
--- test/CodeGen/AMDGPU/ds-sub-offset.ll
+++ test/CodeGen/AMDGPU/ds-sub-offset.ll
@@ -7,8 +7,8 @@
 
 ; GCN-LABEL: {{^}}write_ds_sub0_offset0_global:
 ; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 2, v0
-; CI: v_sub_i32_e32 [[BASEPTR:v[0-9]+]], vcc, 0, [[SHL]]
-; GFX9: v_sub_u32_e32 [[BASEPTR:v[0-9]+]], 0, [[SHL]]
+; CI: v_sub_i32_e32 [[BASEPTR:v[0-9]+]], vcc, lds.obj@abs32@lo, [[SHL]]
+; GFX9: v_sub_u32_e32 [[BASEPTR:v[0-9]+]], lds.obj@abs32@lo, [[SHL]]
 ; GCN: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b
 ; GCN: ds_write_b32 [[BASEPTR]], [[VAL]] offset:12
 define amdgpu_kernel void @write_ds_sub0_offset0_global() #0 {
Index: test/CodeGen/AMDGPU/ds_read2.ll
===================================================================
--- test/CodeGen/AMDGPU/ds_read2.ll
+++ test/CodeGen/AMDGPU/ds_read2.ll
@@ -355,7 +355,8 @@
 ; CI-DAG: s_mov_b32 m0
 ; GFX9-NOT: m0
 
-; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, {{v[0-9]+}}
+; GCN-DAG: v_lshlrev_b32_e32 [[VOFS:v[0-9]+]], 3, {{v[0-9]+}}
+; GCN-DAG: v_add_{{[iu]}}32_e32 [[VPTR:v[0-9]+]], {{(vcc, )?}}lds.f64@abs32@lo, [[VOFS]]
 ; GCN: ds_read2_b64 v{{\[}}[[LO_VREG:[0-9]+]]:[[HI_VREG:[0-9]+]]{{\]}}, [[VPTR]] offset1:8
 ; GCN: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
@@ -441,8 +442,8 @@
 ; CI-DAG: s_mov_b32 m0
 ; GFX9-NOT: m0
 
-; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1
+; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], foo@abs32@lo{{$}}
+; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset1:1
 define amdgpu_kernel void @load_constant_adjacent_offsets(i32 addrspace(1)* %out) {
   %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
   %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
@@ -455,8 +456,8 @@
 ; CI-DAG: s_mov_b32 m0
 ; GFX9-NOT: m0
 
-; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:2
+; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], foo@abs32@lo{{$}}
+; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset1:2
 define amdgpu_kernel void @load_constant_disjoint_offsets(i32 addrspace(1)* %out) {
   %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
   %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
@@ -471,9 +472,9 @@
 ; CI-DAG: s_mov_b32 m0
 ; GFX9-NOT: m0
 
-; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1
-; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset0:2 offset1:3
+; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], bar@abs32@lo{{$}}
+; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset1:1
+; GCN: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:2 offset1:3
 define amdgpu_kernel void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) {
   %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
   %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
@@ -488,10 +489,13 @@
 ; CI-DAG: s_mov_b32 m0
 ; GFX9-NOT: m0
 
-; GCN-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}}
-; GCN-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000
-; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE0]] offset1:1
-; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE1]] offset1:1
+; GCN-DAG: s_mov_b32 [[SBASE0:s[0-9]+]], bar.large@abs32@lo
+; GCN-DAG: s_add_i32 [[SBASE1:s[0-9]+]], [[SBASE0]], 0x4000{{$}}
+; GCN-DAG: s_addk_i32 [[SBASE0]], 0x7ff8{{$}}
+; GCN-DAG: v_mov_b32_e32 [[VBASE0:v[0-9]+]], [[SBASE0]]
+; GCN-DAG: v_mov_b32_e32 [[VBASE1:v[0-9]+]], [[SBASE1]]
+; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VBASE0]] offset1:1
+; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VBASE1]] offset1:1
 ; GCN: s_endpgm
 define amdgpu_kernel void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) {
   %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
Index: test/CodeGen/AMDGPU/ds_write2.ll
===================================================================
--- test/CodeGen/AMDGPU/ds_write2.ll
+++ test/CodeGen/AMDGPU/ds_write2.ll
@@ -103,10 +103,17 @@
 ; CI: buffer_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}}
 ; CI-DAG: s_mov_b32 m0
 
-; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
-
-; GFX9: global_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}}
-; GFX9: global_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}}
+; CI-DAG: v_lshlrev_b32_e32 [[VOFS:v[0-9]+]], 2, v{{[0-9]+}}
+; CI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]], vcc, lds@abs32@lo, [[VOFS]]
+;
+; TODO: This should be an s_mov_b32. The v_mov_b32 gets introduced by an
+; early legalization of the constant bus constraint on the v_lshl_add_u32,
+; and then SIFoldOperands folds in an unlucky order.
+; GFX9-DAG: v_mov_b32_e32 [[VBASE:v[0-9]+]], lds@abs32@lo
+; GFX9-DAG: v_lshl_add_u32 [[VPTR:v[0-9]+]], {{v[0-9]+}}, 2, [[VBASE]]
+
+; GFX9-DAG: global_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:{{[0-9]+\]}}
+; GFX9-DAG: global_load_dwordx2 v{{\[[0-9]+}}:[[VAL1:[0-9]+]]{{\]}}
 
 ; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
 ; GCN: s_endpgm
@@ -131,7 +138,12 @@
 ; GFX9-NOT: m0
 
 ; GCN-DAG: {{buffer|global}}_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
-; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
+
+; CI-DAG: v_lshlrev_b32_e32 [[VOFS:v[0-9]+]], 2, v{{[0-9]+}}
+; CI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]], vcc, lds@abs32@lo, [[VOFS]]
+; GFX9-DAG: v_mov_b32_e32 [[VBASE:v[0-9]+]], lds@abs32@lo
+; GFX9-DAG: v_lshl_add_u32 [[VPTR:v[0-9]+]], v{{[0-9]+}}, 2, [[VBASE]]
+
 ; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
 ; GCN: s_endpgm
 define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
@@ -153,7 +165,12 @@
 ; GFX9-NOT: m0
 
 ; GCN-DAG: {{buffer|global}}_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
-; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
+
+; CI-DAG: v_lshlrev_b32_e32 [[VOFS:v[0-9]+]], 2, v{{[0-9]+}}
+; CI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]], vcc, lds@abs32@lo, [[VOFS]]
+; GFX9-DAG: v_mov_b32_e32 [[VBASE:v[0-9]+]], lds@abs32@lo
+; GFX9-DAG: v_lshl_add_u32 [[VPTR:v[0-9]+]], v{{[0-9]+}}, 2, [[VBASE]]
+
 ; GCN: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
 ; GCN: s_endpgm
 define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 {
@@ -389,8 +406,8 @@
 ; CI-DAG: s_mov_b32 m0
 ; GFX9-NOT: m0
 
-; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; GCN: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
+; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], foo@abs32@lo{{$}}
+; GCN: ds_write2_b32 [[PTR]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
 define amdgpu_kernel void @store_constant_adjacent_offsets() {
   store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
   store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
@@ -402,8 +419,8 @@
 ; GFX9-NOT: m0
 
 ; GCN-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b{{$}}
-; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; GCN: ds_write2_b32 [[ZERO]], [[VAL]], [[VAL]] offset1:2
+; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], foo@abs32@lo{{$}}
+; GCN: ds_write2_b32 [[PTR]], [[VAL]], [[VAL]] offset1:2
 define amdgpu_kernel void @store_constant_disjoint_offsets() {
   store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
   store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
@@ -416,9 +433,9 @@
 ; CI-DAG: s_mov_b32 m0
 ; GFX9-NOT: m0
 
-; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; GCN-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
-; GCN-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
+; GCN-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], bar@abs32@lo{{$}}
+; GCN-DAG: ds_write2_b32 [[PTR]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
+; GCN-DAG: ds_write2_b32 [[PTR]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
 ; GCN: s_endpgm
 define amdgpu_kernel void @store_misaligned64_constant_offsets() {
   store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
@@ -432,10 +449,13 @@
 ; CI-DAG: s_mov_b32 m0
 ; GFX9-NOT: m0
 
-; GCN-DAG: v_mov_b32_e32 [[BASE0:v[0-9]+]], 0x7ff8{{$}}
-; GCN-DAG: v_mov_b32_e32 [[BASE1:v[0-9]+]], 0x4000{{$}}
-; GCN-DAG: ds_write2_b32 [[BASE0]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
-; GCN-DAG: ds_write2_b32 [[BASE1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
+; GCN-DAG: s_mov_b32 [[SBASE0:s[0-9]+]], bar.large@abs32@lo
+; GCN-DAG: s_add_i32 [[SBASE1:s[0-9]+]], [[SBASE0]], 0x4000{{$}}
+; GCN-DAG: s_addk_i32 [[SBASE0]], 0x7ff8{{$}}
+; GCN-DAG: v_mov_b32_e32 [[VBASE0:v[0-9]+]], [[SBASE0]]{{$}}
+; GCN-DAG: v_mov_b32_e32 [[VBASE1:v[0-9]+]], [[SBASE1]]{{$}}
+; GCN-DAG: ds_write2_b32 [[VBASE0]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
+; GCN-DAG: ds_write2_b32 [[VBASE1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
 ; GCN: s_endpgm
 define amdgpu_kernel void @store_misaligned64_constant_large_offsets() {
   store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
Index: test/CodeGen/AMDGPU/lds-initializer.ll
===================================================================
--- test/CodeGen/AMDGPU/lds-initializer.ll
+++ test/CodeGen/AMDGPU/lds-initializer.ll
@@ -1,7 +1,7 @@
 ; RUN: not llc -march=amdgcn -mcpu=tahiti < %s 2>&1 | FileCheck %s
 ; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck %s
 
-; CHECK: in function load_init_lds_global{{.*}}: unsupported initializer for address space
+; CHECK: lds: unsupported initializer for address space
 
 @lds = addrspace(3) global [8 x i32] [i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8]
Index: test/CodeGen/AMDGPU/lds-relocs.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/lds-relocs.ll
@@ -0,0 +1,67 @@
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -filetype=obj < %s | llvm-readobj -r -t | FileCheck -check-prefixes=ELF %s
+
+@lds.external = external unnamed_addr addrspace(3) global [0 x i32]
+@lds.defined = unnamed_addr addrspace(3) global [8 x i32] undef, align 8
+
+; ELF: Relocations [
+; ELF-NEXT: Section (3) .rel.text {
+; ELF-NEXT: 0x{{[0-9a-f]*}} R_AMDGPU_ABS32 lds.external 0x0
+; ELF-NEXT: 0x{{[0-9a-f]*}} R_AMDGPU_ABS32 lds.defined 0x0
+; ELF-NEXT: }
+; ELF-NEXT: ]
+
+; ELF: Symbol {
+; ELF: Name: lds.defined
+; ELF-NEXT: Value: 0x0
+; ELF-NEXT: Size: 32
+; ELF-NEXT: Binding: Global (0x1)
+; ELF-NEXT: Type: AMDGPU_LDS (0xD)
+; ELF-NEXT: Align: 8
+; ELF-NEXT: Other: 24
+; ELF-NEXT: Section: Undefined (0x0)
+; ELF-NEXT: }
+
+; ELF: Symbol {
+; ELF: Name: lds.external
+; ELF-NEXT: Value: 0x0
+; ELF-NEXT: Size: 0
+; ELF-NEXT: Binding: Global (0x1)
+; ELF-NEXT: Type: AMDGPU_LDS (0xD)
+; ELF-NEXT: Align: 4
+; ELF-NEXT: Other: 16
+; ELF-NEXT: Section: Undefined (0x0)
+; ELF-NEXT: }
+
+; GCN-LABEL: {{^}}test_basic:
+; GCN: v_mov_b32_e32 v1, lds.external@abs32@lo ; encoding: [0xff,0x02,0x02,0x7e,A,A,A,A]
+; GCN-NEXT: ; fixup A - offset: 4, value: lds.external@abs32@lo, kind: FK_Data_4{{$}}
+;
+; GCN: s_add_i32 s0, lds.defined@abs32@lo, s0 ; encoding: [0xff,0x00,0x00,0x81,A,A,A,A]
+; GCN-NEXT: ; fixup A - offset: 4, value: lds.defined@abs32@lo, kind: FK_Data_4{{$}}
+;
+; GCN: .globl lds.external
+; GCN: .size lds.external, 0
+; GCN: .amdgpu_lds lds.external, 4
+; GCN: .globl lds.defined
+; GCN: .size lds.defined, 32
+; GCN: .amdgpu_lds lds.defined, 8
+define amdgpu_gs float @test_basic(i32 inreg %wave, i32 %arg1) #0 {
+main_body:
+  %gep0 = getelementptr [0 x i32], [0 x i32] addrspace(3)* @lds.external, i32 0, i32 %arg1
+  %tmp = load i32, i32 addrspace(3)* %gep0
+
+  %mask = call i64 @llvm.amdgcn.icmp.i64.i32(i32 %tmp, i32 0, i32 0)
+  %mask.32 = trunc i64 %mask to i32
+  %gep1 = getelementptr [8 x i32], [8 x i32] addrspace(3)* @lds.defined, i32 0, i32 %wave
+  store i32 %mask.32, i32 addrspace(3)* %gep1
+
+  %r = bitcast i32 %tmp to float
+  ret float %r
+}
+
+; Function Attrs: convergent nounwind readnone
+declare i64 @llvm.amdgcn.icmp.i64.i32(i32, i32, i32) #4
+
+attributes #0 = { "no-signed-zeros-fp-math"="true" }
+attributes #4 = { convergent nounwind readnone }
Index: test/CodeGen/AMDGPU/lds-size.ll
===================================================================
--- test/CodeGen/AMDGPU/lds-size.ll
+++ test/CodeGen/AMDGPU/lds-size.ll
@@ -1,4 +1,3 @@
-; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=ALL -check-prefix=GCN %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefix=ALL -check-prefix=HSA %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=ALL -check-prefix=EG %s
 
Index: test/CodeGen/AMDGPU/lds-zero-initializer.ll
===================================================================
--- test/CodeGen/AMDGPU/lds-zero-initializer.ll
+++ test/CodeGen/AMDGPU/lds-zero-initializer.ll
@@ -1,7 +1,7 @@
 ; RUN: not llc -march=amdgcn -mcpu=tahiti < %s 2>&1 | FileCheck %s
 ; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck %s
 
-; CHECK: in function load_zeroinit_lds_global{{.*}}: unsupported initializer for address space
+; CHECK: lds: unsupported initializer for address space
 
 @lds = addrspace(3) global [256 x i32] zeroinitializer
Index: test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
@@ -268,7 +268,11 @@
 ; CIVI-DAG: s_mov_b32 m0
 ; GFX9-NOT: m0
 
-; GCN-DAG: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; CIVI-DAG: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, {{v[0-9]+}}
+; CIVI-DAG: v_add_{{[ui]}}32_e32 [[PTR:v[0-9]+]], vcc, lds0@abs32@lo, [[OFS]]
+; GFX9-DAG: s_mov_b32 [[BASE:s[0-9]+]], lds0@abs32@lo
+; GFX9-DAG: v_lshl_add_u32 [[PTR:v[0-9]+]], {{v[0-9]+}}, 2, [[BASE]]
+
 ; GCN: ds_dec_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
 define amdgpu_kernel void @atomic_dec_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -412,7 +416,11 @@
 ; CIVI-DAG: s_mov_b32 m0
 ; GFX9-NOT: m0
 
-; GCN-DAG: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 3, {{v[0-9]+}}
+; CIVI-DAG: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 3, {{v[0-9]+}}
+; CIVI-DAG: v_add_{{[ui]}}32_e32 [[PTR:v[0-9]+]], vcc, lds1@abs32@lo, [[OFS]]
+; GFX9-DAG: v_mov_b32_e32 [[BASE:v[0-9]+]], lds1@abs32@lo
+; GFX9-DAG: v_lshl_add_u32 [[PTR:v[0-9]+]], {{v[0-9]+}}, 3, [[BASE]]
+
 ; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]], v{{\[[0-9]+:[0-9]+\]}} offset:16
 define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
Index: test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
@@ -131,7 +131,10 @@
 @lds0 = addrspace(3) global [512 x i32] undef, align 4
 
 ; GCN-LABEL: {{^}}atomic_inc_shl_base_lds_0_i32:
-; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; CIVI-DAG: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, {{v[0-9]+}}
+; CIVI-DAG: v_add_{{[ui]}}32_e32 [[PTR:v[0-9]+]], vcc, lds0@abs32@lo, [[OFS]]
+; GFX9-DAG: s_mov_b32 [[BASE:s[0-9]+]], lds0@abs32@lo
+; GFX9-DAG: v_lshl_add_u32 [[PTR:v[0-9]+]], {{v[0-9]+}}, 2, [[BASE]]
 ; GCN: ds_inc_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
 define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -325,7 +328,10 @@
 @lds1 = addrspace(3) global [512 x i64] undef, align 8
 
 ; GCN-LABEL: {{^}}atomic_inc_shl_base_lds_0_i64:
-; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 3, {{v[0-9]+}}
+; CIVI-DAG: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 3, {{v[0-9]+}}
+; CIVI-DAG: v_add_{{[ui]}}32_e32 [[PTR:v[0-9]+]], vcc, lds1@abs32@lo, [[OFS]]
+; GFX9-DAG: v_mov_b32_e32 [[BASE:v[0-9]+]], lds1@abs32@lo
+; GFX9-DAG: v_lshl_add_u32 [[PTR:v[0-9]+]], {{v[0-9]+}}, 3, [[BASE]]
 ; GCN: ds_inc_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]], v{{\[[0-9]+:[0-9]+\]}} offset:16
 define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
Index: test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,NOHSA %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,HSA %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=CHECK,HSA %s
 
 @lds0 = addrspace(3) global [512 x float] undef, align 4
 @lds1 = addrspace(3) global [256 x float] undef, align 4
@@ -8,7 +8,8 @@
 @large = addrspace(3) global [4096 x i32] undef, align 4
 
 ; CHECK-LABEL: {{^}}groupstaticsize_test0:
-; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0x800{{$}}
+; NOHSA: v_mov_b32_e32 v{{[0-9]+}}, llvm.amdgcn.groupstaticsize@abs32@lo
+; HSA: v_mov_b32_e32 v{{[0-9]+}}, 0x800{{$}}
 define amdgpu_kernel void @groupstaticsize_test0(float addrspace(1)* %out, i32 addrspace(1)* %lds_size) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 64
@@ -22,7 +23,8 @@
 }
 
 ; CHECK-LABEL: {{^}}groupstaticsize_test1:
-; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0xc00{{$}}
+; NOHSA: v_mov_b32_e32 v{{[0-9]+}}, llvm.amdgcn.groupstaticsize@abs32@lo
+; HSA: v_mov_b32_e32 v{{[0-9]+}}, 0xc00{{$}}
 define amdgpu_kernel void @groupstaticsize_test1(float addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %lds_size) {
 entry:
   %static_lds_size = call i32 @llvm.amdgcn.groupstaticsize() #1
@@ -50,7 +52,8 @@
 
 ; Exceeds 16-bit simm limit of s_movk_i32
 ; CHECK-LABEL: {{^}}large_groupstaticsize:
-; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4000{{$}}
+; NOHSA: v_mov_b32_e32 v{{[0-9]+}}, llvm.amdgcn.groupstaticsize@abs32@lo
+; HSA: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4000{{$}}
 define amdgpu_kernel void @large_groupstaticsize(i32 addrspace(1)* %size, i32 %idx) #0 {
   %gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(3)* @large, i32 0, i32 %idx
   store volatile i32 0, i32 addrspace(3)* %gep
Index: test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
===================================================================
--- test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
+++ test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll
@@ -17,8 +17,9 @@
 ; VI-LABEL: {{^}}dpp_test1:
 ; VI: v_add_u32_e32 [[REG:v[0-9]+]], vcc, v{{[0-9]+}}, v{{[0-9]+}}
 ; VI-NOOPT: v_mov_b32_e32 v{{[0-9]+}}, 0
-; VI-NEXT: s_nop 0
-; VI-NEXT: s_nop 0
+; VI-OPT: s_nop 1
+; VI-NOOPT: s_nop 0
+; VI-NOOPT: s_nop 0
 ; VI-NEXT: v_mov_b32_dpp {{v[0-9]+}}, [[REG]] quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf
 @0 = internal unnamed_addr addrspace(3) global [448 x i32] undef, align 4
 define weak_odr amdgpu_kernel void @dpp_test1(i32* %arg) local_unnamed_addr {
Index: test/CodeGen/AMDGPU/local-memory.amdgcn.ll
===================================================================
--- test/CodeGen/AMDGPU/local-memory.amdgcn.ll
+++ test/CodeGen/AMDGPU/local-memory.amdgcn.ll
@@ -3,12 +3,6 @@
 
 @local_memory.local_mem = internal unnamed_addr addrspace(3) global [128 x i32] undef, align 4
 
-; Check that the LDS size emitted correctly
-; SI: .long 47180
-; SI-NEXT: .long 65668
-; CI: .long 47180
-; CI-NEXT: .long 32900
-
 ; GCN-LABEL: {{^}}local_memory:
 
 ; GCN-NOT: s_wqm_b64
@@ -36,27 +30,28 @@
 @local_memory_two_objects.local_mem0 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
 @local_memory_two_objects.local_mem1 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
 
-; Check that the LDS size emitted correctly
-; EG: .long 166120
-; EG-NEXT: .long 8
-; GCN: .long 47180
-; GCN-NEXT: .long 32900
-
 ; GCN-LABEL: {{^}}local_memory_two_objects:
-; GCN: v_lshlrev_b32_e32 [[ADDRW:v[0-9]+]], 2, v0
-; CI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4
-; SI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4
+; GCN: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, v0
+; GCN-DAG: s_mov_b32 [[SBASE0:s[0-9]+]], local_memory_two_objects.local_mem0@abs32@lo
+; GCN-DAG: s_mov_b32 [[SBASE1:s[0-9]+]], local_memory_two_objects.local_mem1@abs32@lo
+; GCN-DAG: v_add_i32_e32 [[VPTR0:v[0-9]+]], vcc, [[SBASE0]], [[OFS]]
+; GCN-DAG: v_add_i32_e32 [[VPTR1:v[0-9]+]], vcc, [[SBASE1]], [[OFS]]
+; GCN-DAG: ds_write_b32 [[VPTR0]], {{v[0-9]+}}
+; GCN-DAG: ds_write_b32 [[VPTR1]], {{v[0-9]+}}
 
 ; GCN: s_barrier
 
-; SI-DAG: v_sub_i32_e32 [[SUB0:v[0-9]+]], vcc, 28, [[ADDRW]]
-; SI-DAG: v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, 12, [[ADDRW]]
+; GCN-DAG: v_sub_i32_e32 [[SUB0:v[0-9]+]], vcc, [[SBASE0]], [[OFS]]
+; GCN-DAG: v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, [[SBASE1]], [[OFS]]
+
+; SI-DAG: v_add_i32_e32 [[RPTR0:v[0-9]+]], vcc, 12, [[SUB0]]
+; SI-DAG: v_add_i32_e32 [[RPTR1:v[0-9]+]], vcc, 12, [[SUB1]]
+; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[RPTR0]]
+; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[RPTR1]]
 
-; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB0]]
-; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB1]]
+; CI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB0]] offset:12
+; CI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB1]] offset:12
 
-; CI: v_sub_i32_e32 [[SUB:v[0-9]+]], vcc, 0, [[ADDRW]]
-; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, [[SUB]] offset0:3 offset1:7
 define amdgpu_kernel void @local_memory_two_objects(i32 addrspace(1)* %out) #0 {
 entry:
   %x.i = call i32 @llvm.amdgcn.workitem.id.x()
Index: test/CodeGen/AMDGPU/local-memory.ll
===================================================================
--- test/CodeGen/AMDGPU/local-memory.ll
+++ test/CodeGen/AMDGPU/local-memory.ll
@@ -10,8 +10,8 @@
 ; not an immediate.
 
 ; FUNC-LABEL: {{^}}load_i32_local_const_ptr:
-; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0
-; GCN: ds_read_b32 v{{[0-9]+}}, v[[ZERO]] offset:4
+; GCN: v_mov_b32_e32 v[[PTR:[0-9]+]], lds@abs32@lo
+; GCN: ds_read_b32 v{{[0-9]+}}, v[[PTR]] offset:4
 ; R600: LDS_READ_RET
 define amdgpu_kernel void @load_i32_local_const_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %in) #0 {
Index: test/CodeGen/AMDGPU/merge-store-crash.ll
===================================================================
--- test/CodeGen/AMDGPU/merge-store-crash.ll
+++ test/CodeGen/AMDGPU/merge-store-crash.ll
@@ -7,7 +7,8 @@
 @tess_lds = external addrspace(3) global [8192 x i32]
 
 ; CHECK-LABEL: {{^}}main:
-; CHECK: ds_write2_b32
+; CHECK: ds_write_b32
+; CHECK: ds_write_b32
 ; CHECK: v_mov_b32_e32 v1, v0
 ; CHECK: tbuffer_store_format_xyzw v[0:3],
 define amdgpu_vs void @main(i32 inreg %arg) {
Index: test/CodeGen/AMDGPU/over-max-lds-size.ll
===================================================================
--- test/CodeGen/AMDGPU/over-max-lds-size.ll
+++ /dev/null
@@ -1,14 +0,0 @@
-; RUN: not llc -march=amdgcn -mcpu=tahiti < %s 2>&1 | FileCheck -check-prefix=ERROR %s
-; RUN: not llc -march=amdgcn -mcpu=hawaii < %s 2>&1 | FileCheck -check-prefix=ERROR %s
-; RUN: not llc -march=amdgcn -mcpu=fiji < %s 2>&1 | FileCheck -check-prefix=ERROR %s
-
-; ERROR: error: local memory limit exceeded (400000) in use_huge_lds
-
-@huge = internal unnamed_addr addrspace(3) global [100000 x i32] undef, align 4
-
-define amdgpu_kernel void @use_huge_lds() {
-entry:
-  %v0 = getelementptr inbounds [100000 x i32], [100000 x i32] addrspace(3)* @huge, i32 0, i32 0
-  store i32 0, i32 addrspace(3)* %v0
-  ret void
-}
Index: test/CodeGen/AMDGPU/promote-alloca-globals.ll
===================================================================
--- test/CodeGen/AMDGPU/promote-alloca-globals.ll
+++ test/CodeGen/AMDGPU/promote-alloca-globals.ll
@@ -8,7 +8,10 @@
 ; IR-LABEL: define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
 ; IR: alloca [10 x i32]
 ; ASM-LABEL: {{^}}promote_alloca_size_256:
-; ASM: ; LDSByteSize: 60000 bytes/workgroup (compile time only)
+; ASM: .size global_array0, 30000
+; ASM: .amdgpu_lds global_array0, 4
+; ASM: .size global_array1, 30000
+; ASM: .amdgpu_lds global_array1, 4
 
 define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
 entry:
Index: test/CodeGen/AMDGPU/s_addk_i32.ll
===================================================================
--- test/CodeGen/AMDGPU/s_addk_i32.ll
+++ test/CodeGen/AMDGPU/s_addk_i32.ll
@@ -101,18 +101,5 @@
   ret void
 }
 
-@lds = addrspace(3) global [512 x i32] undef, align 4
-
-; SI-LABEL: {{^}}commute_s_addk_i32:
-; SI: s_addk_i32 s{{[0-9]+}}, 0x800{{$}}
-define amdgpu_kernel void @commute_s_addk_i32(i32 addrspace(1)* %out, i32 %b) #0 {
-  %size = call i32 @llvm.amdgcn.groupstaticsize()
-  %add = add i32 %size, %b
-  call void asm sideeffect "; foo $0, $1", "v,s"([512 x i32] addrspace(3)* @lds, i32 %add)
-  ret void
-}
-
-declare i32 @llvm.amdgcn.groupstaticsize() #1
-
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
Index: test/CodeGen/AMDGPU/s_mulk_i32.ll
===================================================================
--- test/CodeGen/AMDGPU/s_mulk_i32.ll
+++ test/CodeGen/AMDGPU/s_mulk_i32.ll
@@ -40,18 +40,5 @@
   ret void
 }
 
-@lds = addrspace(3) global [512 x i32] undef, align 4
-
-; SI-LABEL: {{^}}commute_s_mulk_i32:
-; SI: s_mulk_i32 s{{[0-9]+}}, 0x800{{$}}
-define amdgpu_kernel void @commute_s_mulk_i32(i32 addrspace(1)* %out, i32 %b) #0 {
-  %size = call i32 @llvm.amdgcn.groupstaticsize()
-  %add = mul i32 %size, %b
-  call void asm sideeffect "; foo $0, $1", "v,s"([512 x i32] addrspace(3)* @lds, i32 %add)
-  ret void
-}
-
-declare i32 @llvm.amdgcn.groupstaticsize() #1
-
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
Index: test/CodeGen/AMDGPU/shl_add_ptr.ll
===================================================================
--- test/CodeGen/AMDGPU/shl_add_ptr.ll
+++ test/CodeGen/AMDGPU/shl_add_ptr.ll
@@ -33,7 +33,11 @@
 ; remaining add use goes through the normal shl + add constant fold.
 
 ; GCN-LABEL: {{^}}load_shl_base_lds_1:
-; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; GCN: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, {{v[0-9]+}}
+
+; TODO: integrate into the ds_read_b32 offset using a 16-bit relocation
+; GCN: v_add_{{[iu]}}32_e32 [[PTR:v[0-9]+]], vcc, lds0@abs32@lo, [[OFS]]
+
 ; GCN: ds_read_b32 [[RESULT:v[0-9]+]], [[PTR]] offset:8
 ; GCN: v_add_{{[iu]}}32_e32 [[ADDUSE:v[0-9]+]], vcc, 8, v{{[0-9]+}}
 ; GCN-DAG: buffer_store_dword [[RESULT]]
@@ -68,10 +72,18 @@
 
 ; The two globals are placed adjacent in memory, so the same base
 ; pointer can be used with an offset into the second one.
+; TODO: Recover the optimization of using ds_read2st64_b32 using alignment hints
+
 ; GCN-LABEL: {{^}}load_shl_base_lds_2:
-; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
+; GCN: v_lshlrev_b32_e32 [[OFS:v[0-9]+]], 2, {{v[0-9]+}}
+; GCN-DAG: v_add_{{[iu]}}32_e32 [[PTR0:v[0-9]+]], vcc, lds0@abs32@lo, [[OFS]]
+; GCN-DAG: v_add_{{[iu]}}32_e32 [[PTR1:v[0-9]+]], vcc, lds1@abs32@lo, [[OFS]]
 ; GCN: s_mov_b32 m0, -1
-; GCN-NEXT: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9
+
+; GCN-DAG: ds_read_b32 {{v[0-9]+}}, [[PTR0]] offset:256
+; GCN-DAG: ds_read_b32 {{v[0-9]+}}, [[PTR1]] offset:256
+; TODO: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9
+
 ; GCN: s_endpgm
 define amdgpu_kernel void @load_shl_base_lds_2(float addrspace(1)* %out) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
Index: test/CodeGen/AMDGPU/si-sgpr-spill.ll
===================================================================
--- test/CodeGen/AMDGPU/si-sgpr-spill.ll
+++ test/CodeGen/AMDGPU/si-sgpr-spill.ll
@@ -4,16 +4,11 @@
 ; These tests check that the compiler won't crash when it needs to spill
 ; SGPRs.
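+; The external LDS object was replaced by a pointer derived from inttoptr
+; below: with LDS relocations, an external addrspace(3) global would emit an
+; @abs32@ relocation instead of folding to a known offset, which is beside
+; the point of this SGPR spilling test.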
 
-@ddxy_lds = external addrspace(3) global [64 x i32]
-
 ; GCN-LABEL: {{^}}main:
 ; GCN: s_wqm
 
 ; Make sure not emitting unused scratch resource descriptor setup
 ; GCN-NOT: s_mov_b32
-; GCN-NOT: s_mov_b32
-; GCN-NOT: s_mov_b32
-; GCN-NOT: s_mov_b32
 
 ; GCN: s_mov_b32 m0
 
@@ -26,6 +21,7 @@
 
 ; TOVGPR: ScratchSize: 0{{$}}
 define amdgpu_ps void @main([17 x <4 x i32>] addrspace(4)* byval %arg, [32 x <4 x i32>] addrspace(4)* byval %arg1, [16 x <8 x i32>] addrspace(4)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) {
 main_body:
+  %lds = inttoptr i32 0 to [64 x i32] addrspace(3)*
   %tmp = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(4)* %arg, i64 0, i32 0
   %tmp21 = load <4 x i32>, <4 x i32> addrspace(4)* %tmp, !tbaa !0
   %tmp22 = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %tmp21, i32 96, i32 0)
@@ -203,18 +199,18 @@
   %p2.i6 = call float @llvm.amdgcn.interp.p2(float %p1.i5, float %j.f.i4, i32 2, i32 5, i32 %arg4) #0
   %mbcnt.lo.0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
   %tmp109 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.0)
-  %tmp110 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp109
+  %tmp110 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp109
   %tmp111 = bitcast float %p2.i to i32
   store i32 %tmp111, i32 addrspace(3)* %tmp110
   %tmp112 = bitcast float %p2.i96 to i32
   store i32 %tmp112, i32 addrspace(3)* %tmp110
   %mbcnt.lo.1 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
   %tmp113 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.1)
-  %tmp114 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp113
+  %tmp114 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp113
   %tmp115 = and i32 %tmp113, -4
-  %tmp116 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp115
+  %tmp116 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp115
   %tmp117 = add i32 %tmp115, 1
-  %tmp118 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp117
+  %tmp118 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp117
   %tmp119 = bitcast float %p2.i to i32
   store i32 %tmp119, i32 addrspace(3)* %tmp114
   %tmp120 = load i32, i32 addrspace(3)* %tmp116
@@ -241,7 +237,7 @@
   %tmp140 = fmul float %tmp59, %p2.i96
   %mbcnt.lo.2 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
   %tmp141 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.2)
-  %tmp142 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp141
+  %tmp142 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp141
   %tmp143 = bitcast float %tmp137 to i32
   store i32 %tmp143, i32 addrspace(3)* %tmp142
   %tmp144 = bitcast float %tmp138 to i32
@@ -252,11 +248,11 @@
   store i32 %tmp146, i32 addrspace(3)* %tmp142
   %mbcnt.lo.3 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
   %tmp147 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.3)
-  %tmp148 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp147
+  %tmp148 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp147
   %tmp149 = and i32 %tmp147, -4
-  %tmp150 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp149
+  %tmp150 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp149
   %tmp151 = add i32 %tmp149, 2
-  %tmp152 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp151
+  %tmp152 = getelementptr [64 x i32], [64 x i32] addrspace(3)* %lds, i32 0, i32 %tmp151
   %tmp153 = bitcast float %tmp137 to i32
   store i32 %tmp153, i32 addrspace(3)* %tmp148
   %tmp154 = load i32, i32 addrspace(3)* %tmp150
Index: test/CodeGen/AMDGPU/sopk-compares.ll
===================================================================
--- test/CodeGen/AMDGPU/sopk-compares.ll
+++ test/CodeGen/AMDGPU/sopk-compares.ll
@@ -1,12 +1,6 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
-; Since this intrinsic is exposed as a constant after isel, use it to
-; defeat the DAG's compare with constant canonicalizations.
-declare i32 @llvm.amdgcn.groupstaticsize() #1
-
-@lds = addrspace(3) global [512 x i32] undef, align 4
-
 ; GCN-LABEL: {{^}}br_scc_eq_i32_inline_imm:
 ; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 4{{$}}
 define amdgpu_kernel void @br_scc_eq_i32_inline_imm(i32 %cond, i32 addrspace(1)* %out) #0 {
@@ -232,23 +226,6 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}br_scc_sge_i32:
-; GCN: s_cmpk_ge_i32 s{{[0-9]+}}, 0x800{{$}}
-define amdgpu_kernel void @br_scc_sge_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
-entry:
-  %size = call i32 @llvm.amdgcn.groupstaticsize()
-  %cmp0 = icmp sge i32 %cond, %size
-  br i1 %cmp0, label %endif, label %if
-
-if:
-  call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
-  br label %endif
-
-endif:
-  store volatile i32 1, i32 addrspace(1)* %out
-  ret void
-}
-
 ; GCN-LABEL: {{^}}br_scc_slt_i32:
 ; GCN: s_cmpk_lt_i32 s{{[0-9]+}}, 0x41{{$}}
 define amdgpu_kernel void @br_scc_slt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
@@ -265,57 +242,6 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}br_scc_sle_i32:
-; GCN: s_cmpk_le_i32 s{{[0-9]+}}, 0x800{{$}}
-define amdgpu_kernel void @br_scc_sle_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
-entry:
-  %size = call i32 @llvm.amdgcn.groupstaticsize()
-  %cmp0 = icmp sle i32 %cond, %size
-  br i1 %cmp0, label %endif, label %if
-
-if:
-  call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
-  br label %endif
-
-endif:
-  store volatile i32 1, i32 addrspace(1)* %out
-  ret void
-}
-
-; GCN-LABEL: {{^}}br_scc_ugt_i32:
-; GCN: s_cmpk_gt_u32 s{{[0-9]+}}, 0x800{{$}}
-define amdgpu_kernel void @br_scc_ugt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
-entry:
-  %size = call i32 @llvm.amdgcn.groupstaticsize()
-  %cmp0 = icmp ugt i32 %cond, %size
-  br i1 %cmp0, label %endif, label %if
-
-if:
-  call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
-  br label %endif
-
-endif:
-  store volatile i32 1, i32 addrspace(1)* %out
-  ret void
-}
-
-; GCN-LABEL: {{^}}br_scc_uge_i32:
-; GCN: s_cmpk_ge_u32 s{{[0-9]+}}, 0x800{{$}}
-define amdgpu_kernel void @br_scc_uge_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
-entry:
-  %size = call i32 @llvm.amdgcn.groupstaticsize()
-  %cmp0 = icmp uge i32 %cond, %size
-  br i1 %cmp0, label %endif, label %if
-
-if:
-  call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
-  br label %endif
-
-endif:
-  store volatile i32 1, i32 addrspace(1)* %out
-  ret void
-}
-
 ; GCN-LABEL: {{^}}br_scc_ult_i32:
 ; GCN: s_cmpk_lt_u32 s{{[0-9]+}}, 0x41{{$}}
 define amdgpu_kernel void @br_scc_ult_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
@@ -364,211 +290,6 @@
   ret void
 }
 
-; GCN-LABEL: {{^}}br_scc_ule_i32:
-; GCN: s_cmpk_le_u32 s{{[0-9]+}}, 0x800{{$}}
-define amdgpu_kernel void @br_scc_ule_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
-entry:
-  %size = call i32 @llvm.amdgcn.groupstaticsize()
-  %cmp0 = icmp ule i32 %cond, %size
-  br i1 %cmp0, label %endif, label %if
-
-if:
-  call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
-  br label %endif
-
-endif:
-  store volatile i32 1, i32 addrspace(1)* %out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_br_scc_eq_i32:
-; GCN: s_cmpk_eq_i32 s{{[0-9]+}}, 0x800{{$}}
-define amdgpu_kernel void @commute_br_scc_eq_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
-entry:
-  %size = call i32 @llvm.amdgcn.groupstaticsize()
-  %cmp0 = icmp eq i32 %size, %cond
-  br i1 %cmp0, label %endif, label %if
-
-if:
-  call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
-  br label %endif
-
-endif:
-  store volatile i32 1, i32 addrspace(1)* %out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_br_scc_ne_i32:
-; GCN: s_cmpk_lg_i32 s{{[0-9]+}}, 0x800{{$}}
-define amdgpu_kernel void @commute_br_scc_ne_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
-entry:
-  %size = call i32 @llvm.amdgcn.groupstaticsize()
-  %cmp0 = icmp ne i32 %size, %cond
-  br i1 %cmp0, label %endif, label %if
-
-if:
-  call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
-  br label %endif
-
-endif:
-  store volatile i32 1, i32 addrspace(1)* %out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_br_scc_sgt_i32:
-; GCN: s_cmpk_lt_i32 s{{[0-9]+}}, 0x800{{$}}
-define amdgpu_kernel void @commute_br_scc_sgt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
-entry:
-  %size = call i32 @llvm.amdgcn.groupstaticsize()
-  %cmp0 = icmp sgt i32 %size, %cond
-  br i1 %cmp0, label %endif, label %if
-
-if:
-  call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
-  br label %endif
-
-endif:
-  store volatile i32 1, i32 addrspace(1)* %out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_br_scc_sge_i32:
-; GCN: s_cmpk_le_i32 s{{[0-9]+}}, 0x800{{$}}
-define amdgpu_kernel void @commute_br_scc_sge_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
-entry:
-  %size = call i32 @llvm.amdgcn.groupstaticsize()
-  %cmp0 = icmp sge i32 %size, %cond
-  br i1 %cmp0, label %endif, label %if
-
-if:
-  call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
-  br label %endif
-
-endif:
-  store volatile i32 1, i32 addrspace(1)* %out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_br_scc_slt_i32:
-; GCN: s_cmpk_gt_i32 s{{[0-9]+}}, 0x800{{$}}
-define amdgpu_kernel void @commute_br_scc_slt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
-entry:
-  %size = call i32 @llvm.amdgcn.groupstaticsize()
-  %cmp0 = icmp slt i32 %size, %cond
-  br i1 %cmp0, label %endif, label %if
-
-if:
-  call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
-  br label %endif
-
-endif:
-  store volatile i32 1, i32 addrspace(1)* %out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_br_scc_sle_i32:
-; GCN: s_cmpk_ge_i32 s{{[0-9]+}}, 0x800{{$}}
-define amdgpu_kernel void @commute_br_scc_sle_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
-entry:
-  %size = call i32 @llvm.amdgcn.groupstaticsize()
-  %cmp0 = icmp sle i32 %size, %cond
-  br i1 %cmp0, label %endif, label %if
-
-if:
-  call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
-  br label %endif
-
-endif:
-  store volatile i32 1, i32 addrspace(1)* %out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_br_scc_ugt_i32:
-; GCN: s_cmpk_lt_u32 s{{[0-9]+}}, 0x800{{$}}
-define amdgpu_kernel void @commute_br_scc_ugt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
-entry:
-  %size = call i32 @llvm.amdgcn.groupstaticsize()
-  %cmp0 = icmp ugt i32 %size, %cond
-  br i1 %cmp0, label %endif, label %if
-
-if:
-  call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
-  br label %endif
-
-endif:
-  store volatile i32 1, i32 addrspace(1)* %out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_br_scc_uge_i32:
-; GCN: s_cmpk_le_u32 s{{[0-9]+}}, 0x800{{$}}
-define amdgpu_kernel void @commute_br_scc_uge_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
-entry:
-  %size = call i32 @llvm.amdgcn.groupstaticsize()
-  %cmp0 = icmp uge i32 %size, %cond
-  br i1 %cmp0, label %endif, label %if
-
-if:
-  call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
-  br label %endif
-
-endif:
-  store volatile i32 1, i32 addrspace(1)* %out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_br_scc_ult_i32:
-; GCN: s_cmpk_gt_u32 s{{[0-9]+}}, 0x800{{$}}
-define amdgpu_kernel void @commute_br_scc_ult_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
-entry:
-  %size = call i32 @llvm.amdgcn.groupstaticsize()
-  %cmp0 = icmp ult i32 %size, %cond
-  br i1 %cmp0, label %endif, label %if
-
-if:
-  call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
-  br label %endif
-
-endif:
-  store volatile i32 1, i32 addrspace(1)* %out
-  ret void
-}
-
-; GCN-LABEL: {{^}}commute_br_scc_ule_i32:
-; GCN: s_cmpk_ge_u32 s{{[0-9]+}}, 0x800{{$}}
-define amdgpu_kernel void @commute_br_scc_ule_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
-entry:
-  %size = call i32 @llvm.amdgcn.groupstaticsize()
-  %cmp0 = icmp ule i32 %size, %cond
-  br i1 %cmp0, label %endif, label %if
-
-if:
-  call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
-  br label %endif
-
-endif:
-  store volatile i32 1, i32 addrspace(1)* %out
-  ret void
-}
-
-; GCN-LABEL: {{^}}br_scc_ult_i32_non_u16:
-; GCN: s_cmp_lt_u32 s2, 0xfffff7ff
-define amdgpu_kernel void @br_scc_ult_i32_non_u16(i32 %cond, i32 addrspace(1)* %out) #0 {
-entry:
-  %size = call i32 @llvm.amdgcn.groupstaticsize()
-  %not.size = xor i32 %size, -1
-  %cmp0 = icmp ult i32 %cond, %not.size
-  br i1 %cmp0, label %endif, label %if
-
-if:
-  call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
-  br label %endif
-
-endif:
-  store volatile i32 1, i32 addrspace(1)* %out
-  ret void
-}
-
 ; GCN-LABEL: {{^}}br_scc_eq_i64_inline_imm:
 ; VI: s_cmp_eq_u64 s{{\[[0-9]+:[0-9]+\]}}, 4
Index: test/CodeGen/AMDGPU/sub.i16.ll
===================================================================
--- test/CodeGen/AMDGPU/sub.i16.ll
+++ test/CodeGen/AMDGPU/sub.i16.ll
@@ -144,26 +144,7 @@
   ret void
 }
 
-@lds = addrspace(3) global [512 x i32] undef, align 4
-
-; GCN-LABEL: {{^}}v_test_sub_i16_constant_commute:
-; VI: v_subrev_u16_e32 v{{[0-9]+}}, 0x800, v{{[0-9]+}}
-; CI: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 0x800, v{{[0-9]+}}
-define amdgpu_kernel void @v_test_sub_i16_constant_commute(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
-  %size = call i32 @llvm.amdgcn.groupstaticsize()
-  %size.trunc = trunc i32 %size to i16
-  call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
-  %tid = call i32 @llvm.amdgcn.workitem.id.x()
-  %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
-  %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
-  %a = load volatile i16, i16 addrspace(1)* %gep.in0
-  %add = sub i16 %a, %size.trunc
-  store i16 %add, i16 addrspace(1)* %out
-  ret void
-}
-
 declare i32 @llvm.amdgcn.workitem.id.x() #0
-declare i32 @llvm.amdgcn.groupstaticsize() #0
 
 attributes #0 = { nounwind readnone }
 attributes #1 = { nounwind }
Index: test/CodeGen/AMDGPU/target-cpu.ll
===================================================================
--- test/CodeGen/AMDGPU/target-cpu.ll
+++ test/CodeGen/AMDGPU/target-cpu.ll
@@ -78,7 +78,6 @@
 
 ; CHECK-LABEL: {{^}}promote_alloca_enabled:
 ; CHECK: ds_read_b32
-; CHECK: ; LDSByteSize: 5120
 define amdgpu_kernel void @promote_alloca_enabled(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #5 {
 entry:
   %stack = alloca [5 x i32], align 4, addrspace(5)
Index: test/CodeGen/MIR/AMDGPU/machine-function-info.ll
===================================================================
--- test/CodeGen/MIR/AMDGPU/machine-function-info.ll
+++ test/CodeGen/MIR/AMDGPU/machine-function-info.ll
@@ -10,7 +10,7 @@
 ; CHECK: machineFunctionInfo:
 ; CHECK-NEXT: explicitKernArgSize: 128
 ; CHECK-NEXT: maxKernArgAlign: 64
-; CHECK-NEXT: ldsSize: 2048
+; CHECK-NEXT: ldsSize: 0
 ; CHECK-NEXT: isEntryFunction: true
 ; CHECK-NEXT: noSignedZerosFPMath: false
 ; CHECK-NEXT: memoryBound: false