diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -30,6 +30,7 @@
 #include "Utils/AMDGPUBaseInfo.h"
 #include "Utils/AMDGPUMemoryUtils.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/CallGraph.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/IRBuilder.h"
@@ -163,9 +164,10 @@
   }

   bool runOnModule(Module &M) override {
+    CallGraph CG = CallGraph(M);
     UsedList = getUsedList(M);
     bool Changed = superAlignLDSGlobals(M);
-    Changed |= processUsedLDS(M);
+    Changed |= processUsedLDS(CG, M);

     for (Function &F : M.functions()) {
       if (F.isDeclaration())
@@ -174,7 +176,7 @@
       // Only lower compute kernels' LDS.
       if (!AMDGPU::isKernel(F.getCallingConv()))
         continue;
-      Changed |= processUsedLDS(M, &F);
+      Changed |= processUsedLDS(CG, M, &F);
     }

     UsedList.clear();
@@ -226,7 +228,7 @@
     return Changed;
   }

-  bool processUsedLDS(Module &M, Function *F = nullptr) {
+  bool processUsedLDS(CallGraph const &CG, Module &M, Function *F = nullptr) {
     LLVMContext &Ctx = M.getContext();
     const DataLayout &DL = M.getDataLayout();

@@ -374,7 +376,20 @@
     IRBuilder<> Builder(Ctx);
     for (Function &Func : M.functions()) {
       if (!Func.isDeclaration() && AMDGPU::isKernelCC(&Func)) {
-        markUsedByKernel(Builder, &Func, SGV);
+        const CallGraphNode *N = CG[&Func];
+        const bool CalleesRequireModuleLDS = N->size() > 0;
+
+        if (CalleesRequireModuleLDS) {
+          // If a function this kernel might call requires module LDS,
+          // annotate the kernel to let later passes know it will allocate
+          // this structure, even if not apparent from the IR.
+          markUsedByKernel(Builder, &Func, SGV);
+        } else {
+          // However if we are certain this kernel cannot call a function that
+          // requires module LDS, annotate the kernel so the backend can elide
+          // the allocation without repeating callgraph walks.
+          Func.addFnAttr("amdgpu-elide-module-lds", "true");
+        }
       }
     }
   }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -56,6 +56,9 @@
   // Kernel may need limited waves per EU for better performance.
   bool WaveLimiter = false;

+  // This kernel calls no functions that require the module LDS struct.
+  bool CanElideModuleLDS = false;
+
 public:
   AMDGPUMachineFunction(const MachineFunction &MF);

@@ -91,6 +94,8 @@
     return WaveLimiter;
   }

+  bool canElideModuleLDS() const { return CanElideModuleLDS; }
+
   unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV);

   void allocateModuleLDSGlobal(const Module *M);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -32,6 +32,9 @@
   Attribute WaveLimitAttr = F.getFnAttribute("amdgpu-wave-limiter");
   WaveLimiter = WaveLimitAttr.getValueAsBool();

+  const char *Attr = "amdgpu-elide-module-lds";
+  CanElideModuleLDS = F.getFnAttribute(Attr).getValueAsBool();
+
   CallingConv::ID CC = F.getCallingConv();
   if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL)
     ExplicitKernArgSize = ST.getExplicitKernArgSize(F, MaxKernArgAlign);
@@ -64,7 +67,7 @@
 void AMDGPUMachineFunction::allocateModuleLDSGlobal(const Module *M) {
   if (isModuleEntryFunction()) {
     const GlobalVariable *GV = M->getNamedGlobal("llvm.amdgcn.module.lds");
-    if (GV) {
+    if (GV && !canElideModuleLDS()) {
       unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *GV);
       (void)Offset;
       assert(Offset == 0 &&
diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll b/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll
@@ -21,9 +21,8 @@
 ; CHECK: @llvm.amdgcn.kernel..lds = internal addrspace(3) global %llvm.amdgcn.kernel..lds.t undef, align 2
 ; CHECK: @llvm.amdgcn.kernel..lds.1 = internal addrspace(3) global %llvm.amdgcn.kernel..lds.t.0 undef, align 4
 ;.
-define amdgpu_kernel void @k0() {
+define amdgpu_kernel void @k0() #0 {
 ; CHECK-LABEL: @k0(
-; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
 ; CHECK-NEXT: %lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i8 addrspace(3)*
 ; CHECK-NEXT: store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 8
 ; CHECK-NEXT: %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 2) to i8 addrspace(3)*
@@ -49,9 +48,8 @@
   ret void
 }

-define amdgpu_kernel void @k1() {
+define amdgpu_kernel void @k1() #0 {
 ; CHECK-LABEL: @k1(
-; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
 ; CHECK-NEXT: %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 2) to i8 addrspace(3)*
 ; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 4
 ; CHECK-NEXT: %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 1) to i8 addrspace(3)*
@@ -72,9 +70,8 @@
   ret void
 }

-define amdgpu_kernel void @0() {
+define amdgpu_kernel void @0() #0 {
 ; CHECK-LABEL: @0(
-; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
 ; CHECK-NEXT: %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel..lds.t, %llvm.amdgcn.kernel..lds.t addrspace(3)* @llvm.amdgcn.kernel..lds, i32 0, i32 0) to i8 addrspace(3)*
 ; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2
 ; CHECK-NEXT: ret void
@@ -85,9 +82,8 @@
   ret void
 }

-define amdgpu_kernel void @1() {
+define amdgpu_kernel void @1() #0 {
 ; CHECK-LABEL: @1(
-; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
 ; CHECK-NEXT: %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel..lds.t.0, %llvm.amdgcn.kernel..lds.t.0 addrspace(3)* @llvm.amdgcn.kernel..lds.1, i32 0, i32 0) to i8 addrspace(3)*
 ; CHECK-NEXT: store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4
 ; CHECK-NEXT: ret void
@@ -114,6 +110,6 @@
   ret void
 }

-;.
-; CHECK: attributes #0 = { nofree nosync nounwind readnone willreturn }
-;.
+
+attributes #0 = { "amdgpu-elide-module-lds"="true" }
+; CHECK: attributes #0 = { "amdgpu-elide-module-lds"="true" }
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
@@ -35,8 +35,8 @@
   ret void
 }

-; CHECK-LABEL: @timestwo()
-; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
+; CHECK-LABEL: @timestwo() #0
+; CHECK-NOT: call void @llvm.donothing()
 ; CHECK: %1 = bitcast float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.timestwo.lds.t, %llvm.amdgcn.kernel.timestwo.lds.t addrspace(3)* @llvm.amdgcn.kernel.timestwo.lds, i32 0, i32 0) to i32 addrspace(3)*
 ; CHECK: %2 = addrspacecast i32 addrspace(3)* %1 to i32*
 ; CHECK: %3 = ptrtoint i32* %2 to i64
@@ -56,3 +56,6 @@
   store i32 %mul, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
   ret void
 }
+
+attributes #0 = { "amdgpu-elide-module-lds"="true" }
+; CHECK: attributes #0 = { "amdgpu-elide-module-lds"="true" }
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-offsets.ll
@@ -4,7 +4,7 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

 ; Check that module LDS is allocated at address 0 and kernel starts its
-; allocation past module LDS.
+; allocation past module LDS when a call is present.
 @lds.size.1.align.1 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 1
 @lds.size.16.align.16 = internal unnamed_addr addrspace(3) global [16 x i8] undef, align 16
@@ -22,12 +22,14 @@
 ; OPT-NEXT: store i8 1, i8 addrspace(3)* [[LDS_SIZE_1_ALIGN_1_BC]], align 1
 ; OPT-NEXT: [[LDS_SIZE_16_ALIGN_16_BC:%.*]] = bitcast [16 x i8] addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K0_LDS_T:%.*]], [[LLVM_AMDGCN_KERNEL_K0_LDS_T]] addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 0) to i8 addrspace(3)*
 ; OPT-NEXT: store i8 2, i8 addrspace(3)* [[LDS_SIZE_16_ALIGN_16_BC]], align 16
+; OPT-NEXT: call void @f0()
 ; OPT-NEXT: ret void
 ;
   %lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* @lds.size.1.align.1 to i8 addrspace(3)*
   store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1
   %lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* @lds.size.16.align.16 to i8 addrspace(3)*
   store i8 2, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16
+  call void @f0()
   ret void
 }

@@ -36,7 +38,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[TREE:v[0-9]+]], 3
 ; GCN: ds_write_b8 [[NULL]], [[TREE]]
 define void @f0() {
-; OPT-LABEL: @f0(
+; OPT-LABEL: @f0() {
 ; OPT-NEXT: [[LDS_SIZE_1_ALIGN_1_BC:%.*]] = bitcast [1 x i8] addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], [[LLVM_AMDGCN_MODULE_LDS_T]] addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0) to i8 addrspace(3)*
 ; OPT-NEXT: store i8 3, i8 addrspace(3)* [[LDS_SIZE_1_ALIGN_1_BC]], align 1
 ; OPT-NEXT: ret void
@@ -45,3 +47,35 @@
   store i8 3, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1
   ret void
 }
+
+; Without the function call, no module.lds.t allocation at address zero is needed,
+; so the two variables are allocated in an unspecified order. This is a weakness
+; in the current codegen: the variable was moved into module.lds.t because another
+; function required it there, and module.lds.t is then allocated like any other
+; variable. Coincidentally, codegen currently puts the higher-alignment variable
+; at zero. A later patch will avoid moving variables into module.lds.t for kernels
+; that do not require it, leaving the variables to be optimally allocated in the kernel-specific struct.
+; GCN-LABEL: {{^}}k1:
+; GCN-DAG: v_mov_b32_e32 [[NULL:v[0-9]+]], 0
+; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
+; GCN: ds_write_b8 [[NULL]], [[ONE]] offset:16
+; GCN: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
+; GCN: ds_write_b8 [[NULL]], [[TWO]]
+define amdgpu_kernel void @k1() #0 {
+; OPT-LABEL: @k1(
+; OPT-NOT: call void @llvm.donothing()
+; OPT-NEXT: [[LDS_SIZE_1_ALIGN_1_BC:%.*]] = bitcast [1 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0) to i8 addrspace(3)*
+; OPT-NEXT: store i8 1, i8 addrspace(3)* [[LDS_SIZE_1_ALIGN_1_BC]], align 1
+; OPT-NEXT: [[LDS_SIZE_16_ALIGN_16_BC:%.*]] = bitcast [16 x i8] addrspace(3)* getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_K1_LDS_T:%.*]], [[LLVM_AMDGCN_KERNEL_K1_LDS_T]] addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 0) to i8 addrspace(3)*
+; OPT-NEXT: store i8 2, i8 addrspace(3)* [[LDS_SIZE_16_ALIGN_16_BC]], align 16
+; OPT-NEXT: ret void
+;
+  %lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* @lds.size.1.align.1 to i8 addrspace(3)*
+  store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1
+  %lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* @lds.size.16.align.16 to i8 addrspace(3)*
+  store i8 2, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16
+  ret void
+}
+
+attributes #0 = { "amdgpu-elide-module-lds"="true" }
+; CHECK: attributes #0 = { "amdgpu-elide-module-lds"="true" }
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll
@@ -48,13 +48,16 @@
   ret void
 }

-; This kernel does not need to alloc the LDS block as it makes no calls
+; This kernel does not alloc the LDS block as it makes no calls
 ; CHECK-LABEL: @kern_empty()
-; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
-define spir_kernel void @kern_empty() {
+; CHECK-NOT: call void @llvm.donothing()
+define spir_kernel void @kern_empty() #0 {
   ret void
 }

 ; Make sure we don't crash trying to insert code into a kernel
 ; declaration.
 declare amdgpu_kernel void @kernel_declaration()
+
+attributes #0 = { "amdgpu-elide-module-lds"="true" }
+; CHECK: attributes #0 = { "amdgpu-elide-module-lds"="true" }
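
For illustration, here is a minimal sketch of the new behaviour on a hypothetical module (written for this note, not taken from the tests above; the names @var, @helper and @kern are invented). A variable used by a non-kernel function is moved into @llvm.amdgcn.module.lds by the amdgpu-lower-module-lds pass, and a kernel whose call graph node is empty is annotated for elision instead of receiving the llvm.donothing marker:

; Input to the amdgpu-lower-module-lds pass.
@var = internal addrspace(3) global i8 undef, align 1

; A non-kernel function using LDS forces @var into the module LDS struct.
define void @helper() {
  store i8 1, i8 addrspace(3)* @var, align 1
  ret void
}

; This kernel has no callees, so CalleesRequireModuleLDS is false for it.
define amdgpu_kernel void @kern() {
  ret void
}

; Expected output shape (illustrative, not autogenerated checks): @kern gains
; the elide attribute rather than the
; llvm.donothing() [ "ExplicitUse"(...) ] marker, and allocateModuleLDSGlobal
; in the backend then skips the zero-offset allocation for it:
;
;   define amdgpu_kernel void @kern() #0 {
;     ret void
;   }
;   attributes #0 = { "amdgpu-elide-module-lds"="true" }

A kernel that did call @helper would instead keep the llvm.donothing marker, so later passes still see that the module LDS struct is allocated at address zero for that kernel.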