Index: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -1305,7 +1305,8 @@
   if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
       G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
-    if (!MFI->isModuleEntryFunction()) {
+    if (!MFI->isModuleEntryFunction() &&
+        !GV->getName().equals("llvm.amdgcn.module.lds")) {
       SDLoc DL(Op);
       const Function &Fn = DAG.getMachineFunction().getFunction();
       DiagnosticInfoUnsupported BadLDSDecl(
Index: llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2286,7 +2286,8 @@
   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
-    if (!MFI->isModuleEntryFunction()) {
+    if (!MFI->isModuleEntryFunction() &&
+        !GV->getName().equals("llvm.amdgcn.module.lds")) {
       const Function &Fn = MF.getFunction();
       DiagnosticInfoUnsupported BadLDSDecl(
           Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
Index: llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -212,8 +212,6 @@
   Align MaxAlign =
       AMDGPU::getAlign(DL, LocalVars[0]); // was sorted on alignment

-  Constant *InstanceAddress = Constant::getIntegerValue(
-      PointerType::get(LDSTy, AMDGPUAS::LOCAL_ADDRESS), APInt(32, 0));

   GlobalVariable *SGV = new GlobalVariable(
       M, LDSTy, false, GlobalValue::InternalLinkage, UndefValue::get(LDSTy),
@@ -236,7 +234,7 @@
     GlobalVariable *GV = LocalVars[I];
     Constant *GEPIdx[] = {ConstantInt::get(I32, 0), ConstantInt::get(I32, I)};
     GV->replaceAllUsesWith(
-        ConstantExpr::getGetElementPtr(LDSTy, InstanceAddress, GEPIdx));
+        ConstantExpr::getGetElementPtr(LDSTy, SGV, GEPIdx));
     GV->eraseFromParent();
   }
Index: llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -64,7 +64,7 @@
 void AMDGPUMachineFunction::allocateModuleLDSGlobal(const Module *M) {
   if (isModuleEntryFunction()) {
-    GlobalVariable *GV = M->getGlobalVariable("llvm.amdgcn.module.lds");
+    const GlobalVariable *GV = M->getNamedGlobal("llvm.amdgcn.module.lds");
     if (GV) {
       unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *GV);
       (void)Offset;
Index: llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
+++ llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
@@ -19,7 +19,7 @@
 ; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 4

 ; CHECK-LABEL: @get_func()
-; CHECK: %0 = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* null, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* null, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
+; CHECK: %0 = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
 define i32 @get_func() local_unnamed_addr #0 {
 entry:
   %0 = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @func to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @func to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
@@ -27,7 +27,7 @@
 }

 ; CHECK-LABEL: @set_func(i32 %x)
-; CHECK: store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* null to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* null to i32*) to i64)) to i32*), align 4
+; CHECK: store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
 define void @set_func(i32 %x) local_unnamed_addr #1 {
 entry:
   store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
@@ -36,9 +36,9 @@

 ; CHECK-LABEL: @timestwo()
 ; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
-; CHECK: %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* null to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
+; CHECK: %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
 ; CHECK: %mul = mul i32 %ld, 2
-; CHECK: store i32 %mul, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* null to i32*) to i64)) to i32*), align 4
+; CHECK: store i32 %mul, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
 define amdgpu_kernel void @timestwo() {
   %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
   %mul = mul i32 %ld, 2
Index: llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect.ll
+++ llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect.ll
@@ -3,9 +3,9 @@
 ; CHECK: %llvm.amdgcn.module.lds.t = type { double, float }

-; CHECK: @function_indirect = addrspace(1) global float* addrspacecast (float addrspace(3)* getelementptr (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* null, i32 0, i32 1) to float*), align 8
+; CHECK: @function_indirect = addrspace(1) global float* addrspacecast (float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to float*), align 8

-; CHECK: @kernel_indirect = addrspace(1) global double* addrspacecast (double addrspace(3)* null to double*), align 8
+; CHECK: @kernel_indirect = addrspace(1) global double* addrspacecast (double addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0) to double*), align 8

 ; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 8
Index: llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll
@@ -0,0 +1,47 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck -check-prefix=OPT %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; Check that module LDS is allocated at address 0 and kernel starts its
+; allocation past module LDS.
+
+@lds.size.1.align.1 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 1
+@lds.size.16.align.16 = internal unnamed_addr addrspace(3) global [16 x i8] undef, align 16
+
+; GCN-LABEL: {{^}}k0:
+; GCN-DAG: v_mov_b32_e32 [[NULL:v[0-9]+]], 0
+; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
+; GCN: ds_write_b8 [[NULL]], [[ONE]]
+; GCN: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
+; GCN: ds_write_b8 [[NULL]], [[TWO]] offset:16
+define amdgpu_kernel void @k0() {
+; OPT-LABEL: @k0(
+; OPT-NEXT: call void @llvm.donothing() [ "ExplicitUse"([[LLVM_AMDGCN_MODULE_LDS_T:%.*]] addrspace(3)* @llvm.amdgcn.module.lds) ]
+; OPT-NEXT: [[LDS_SIZE_1_ALIGN_1_BC:%.*]] = bitcast [1 x i8] addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T]], [[LLVM_AMDGCN_MODULE_LDS_T]] addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0) to i8 addrspace(3)*
+; OPT-NEXT: store i8 1, i8 addrspace(3)* [[LDS_SIZE_1_ALIGN_1_BC]], align 1
+; OPT-NEXT: [[LDS_SIZE_16_ALIGN_16_BC:%.*]] = bitcast [16 x i8] addrspace(3)* @lds.size.16.align.16 to i8 addrspace(3)*
+; OPT-NEXT: store i8 2, i8 addrspace(3)* [[LDS_SIZE_16_ALIGN_16_BC]], align 16
+; OPT-NEXT: ret void
+;
+  %lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* @lds.size.1.align.1 to i8 addrspace(3)*
+  store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1
+  %lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* @lds.size.16.align.16 to i8 addrspace(3)*
+  store i8 2, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16
+  ret void
+}
+
+; GCN-LABEL: {{^}}f0:
+; GCN-DAG: v_mov_b32_e32 [[NULL:v[0-9]+]], 0
+; GCN-DAG: v_mov_b32_e32 [[TREE:v[0-9]+]], 3
+; GCN: ds_write_b8 [[NULL]], [[TREE]]
+define void @f0() {
+; OPT-LABEL: @f0(
+; OPT-NEXT: [[LDS_SIZE_1_ALIGN_1_BC:%.*]] = bitcast [1 x i8] addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], [[LLVM_AMDGCN_MODULE_LDS_T]] addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0) to i8 addrspace(3)*
+; OPT-NEXT: store i8 3, i8 addrspace(3)* [[LDS_SIZE_1_ALIGN_1_BC]], align 1
+; OPT-NEXT: ret void
+;
+  %lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* @lds.size.1.align.1 to i8 addrspace(3)*
+  store i8 3, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1
+  ret void
+}
Index: llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll
+++ llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll
@@ -29,7 +29,7 @@
 @llvm.compiler.used = appending global [2 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (float addrspace(3)* @tolower to i8 addrspace(3)*) to i8*), i8* addrspacecast (i8 addrspace(1)* bitcast (i64 addrspace(1)* @ignored to i8 addrspace(1)*) to i8*)], section "llvm.metadata"

 ; CHECK-LABEL: @func()
-; CHECK: %dec = atomicrmw fsub float addrspace(3)* null, float 1.0
+; CHECK: %dec = atomicrmw fsub float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), float 1.000000e+00 monotonic, align 4
 define void @func() {
   %dec = atomicrmw fsub float addrspace(3)* @tolower, float 1.0 monotonic
   %unused0 = atomicrmw add i64 addrspace(1)* @ignored, i64 1 monotonic
Index: llvm/test/CodeGen/AMDGPU/lower-module-lds.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/lower-module-lds.ll
+++ llvm/test/CodeGen/AMDGPU/lower-module-lds.ll
@@ -21,12 +21,12 @@
 ; Instance of new type, aligned to max of element alignment
 ; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 8

-; Use in func rewritten to access struct at address zero, which prints as null
+; Use in func rewritten to access struct at address zero
 ; CHECK-LABEL: @func()
-; CHECK: %dec = atomicrmw fsub float addrspace(3)* null, float 1.0
-; CHECK: %val0 = load i32, i32 addrspace(3)* getelementptr (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* null, i32 0, i32 2), align 4
+; CHECK: %dec = atomicrmw fsub float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), float 1.0
+; CHECK: %val0 = load i32, i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 2), align 4
 ; CHECK: %val1 = add i32 %val0, 4
-; CHECK: store i32 %val1, i32 addrspace(3)* getelementptr (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* null, i32 0, i32 2), align 4
+; CHECK: store i32 %val1, i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 2), align 4
 ; CHECK: %unused0 = atomicrmw add i64 addrspace(3)* @with_init, i64 1 monotonic
 define void @func() {
   %dec = atomicrmw fsub float addrspace(3)* @var0, float 1.0 monotonic
@@ -41,7 +41,7 @@
 ; CHECK-LABEL: @kern_call()
 ; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
 ; CHECK: call void @func()
-; CHECK: %dec = atomicrmw fsub float addrspace(3)* null, float 2.0
+; CHECK: %dec = atomicrmw fsub float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), float 2.000000e+00 monotonic, align 4
 define amdgpu_kernel void @kern_call() {
   call void @func()
   %dec = atomicrmw fsub float addrspace(3)* @var0, float 2.0 monotonic