diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -71,6 +71,7 @@
 FunctionPass *createAMDGPUPropagateAttributesEarlyPass(const TargetMachine *);
 ModulePass *createAMDGPUPropagateAttributesLatePass(const TargetMachine *);
 FunctionPass *createAMDGPURewriteOutArgumentsPass();
+ModulePass *createAMDGPULowerModuleLDSPass();
 FunctionPass *createSIModeRegisterPass();
 
 struct AMDGPUSimplifyLibCallsPass : PassInfoMixin<AMDGPUSimplifyLibCallsPass> {
@@ -145,6 +146,13 @@
   TargetMachine &TM;
 };
 
+void initializeAMDGPULowerModuleLDSPass(PassRegistry &);
+extern char &AMDGPULowerModuleLDSID;
+
+struct AMDGPULowerModuleLDSPass : PassInfoMixin<AMDGPULowerModuleLDSPass> {
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+};
+
 void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &);
 extern char &AMDGPURewriteOutArgumentsID;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -500,9 +500,10 @@
   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
   const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
   const SITargetLowering &TLI = *getTLI<SITargetLowering>();
-  const DataLayout &DL = F.getParent()->getDataLayout();
+  Info->allocateModuleLDSGlobal(F.getParent());
+
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
@@ -591,6 +592,7 @@
   const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
   const DataLayout &DL = F.getParent()->getDataLayout();
 
+  Info->allocateModuleLDSGlobal(F.getParent());
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -0,0 +1,380 @@
+//===-- AMDGPULowerModuleLDSPass.cpp ------------------------------*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass eliminates LDS uses from non-kernel functions.
+//
+// The strategy is to create a new struct with a field for each LDS variable
+// and allocate that struct at the same address for every kernel. Uses of the
+// original LDS variables are then replaced with compile-time offsets from that
+// known address. AMDGPUMachineFunction allocates the LDS global.
+//
+// Local variables with constant annotation or non-undef initializer are passed
+// through unchanged for simplification or error diagnostics in later passes.
+//
+// To reduce the memory overhead, variables that are only used by kernels are
+// excluded from this transform. The analysis to determine whether a variable
+// is only used by a kernel is cheap and conservative, so this may allocate
+// a variable in every kernel when it was not strictly necessary to do so.
+//
+// A possible future refinement is to specialise the structure per-kernel, so
+// that fields can be elided based on more expensive analysis.
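+//
+// Illustrative example (the tests in this patch show the exact output): two
+// float variables used from non-kernel functions become
+//   %llvm.amdgcn.module.lds.t = type { float, float }
+// placed at LDS address zero, and each use of the original variables is
+// rewritten to a constantexpr gep into that struct.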
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+#include <algorithm>
+#include <vector>
+
+#define DEBUG_TYPE "amdgpu-lower-module-lds"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPULowerModuleLDS : public ModulePass {
+
+  static bool isKernelCC(Function *Func) {
+    return AMDGPU::isModuleEntryFunctionCC(Func->getCallingConv());
+  }
+
+  static Align getAlign(DataLayout const &DL, const GlobalVariable *GV) {
+    return DL.getValueOrABITypeAlignment(GV->getPointerAlignment(DL),
+                                         GV->getValueType());
+  }
+
+  static bool
+  userRequiresLowering(const SmallPtrSetImpl<GlobalValue *> &UsedList,
+                       User *InitialUser) {
+    // Any LDS variable can be lowered by moving into the created struct.
+    // Each variable so lowered is allocated in every kernel, so variables
+    // whose users are all known to be safe to lower without the transform
+    // are left unchanged.
+    SmallPtrSet<User *, 8> Visited;
+    SmallVector<User *, 16> Stack;
+    Stack.push_back(InitialUser);
+
+    while (!Stack.empty()) {
+      User *V = Stack.pop_back_val();
+      Visited.insert(V);
+
+      if (auto *G = dyn_cast<GlobalValue>(V->stripPointerCasts())) {
+        if (UsedList.contains(G)) {
+          continue;
+        }
+      }
+
+      if (auto *I = dyn_cast<Instruction>(V)) {
+        if (isKernelCC(I->getFunction())) {
+          continue;
+        }
+      }
+
+      if (auto *E = dyn_cast<ConstantExpr>(V)) {
+        for (Value::user_iterator EU = E->user_begin(); EU != E->user_end();
+             ++EU) {
+          if (Visited.insert(*EU).second) {
+            Stack.push_back(*EU);
+          }
+        }
+        continue;
+      }
+
+      // Unknown user, conservatively lower the variable.
+      return true;
+    }
+
+    return false;
+  }
+
+  static std::vector<GlobalVariable *>
+  findVariablesToLower(Module &M,
+                       const SmallPtrSetImpl<GlobalValue *> &UsedList) {
+    std::vector<GlobalVariable *> LocalVars;
+    for (auto &GV : M.globals()) {
+      if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
+        continue;
+      }
+      if (!GV.hasInitializer()) {
+        // addrspace(3) without initializer implies cuda/hip extern __shared__.
+        // The semantics for such a variable appear to be that all extern
+        // __shared__ variables alias one another, in which case this transform
+        // is not required.
+        continue;
+      }
+      if (!isa<UndefValue>(GV.getInitializer())) {
+        // Initializers are unimplemented for local address space.
+        // Leave such variables in place for consistent error reporting.
+        continue;
+      }
+      if (GV.isConstant()) {
+        // A constant undef variable can't be written to, and any load is
+        // undef, so it should be eliminated by the optimizer. It could be
+        // dropped by the back end if not. This pass skips over it.
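+        // (For example, @const_undef = addrspace(3) constant i32 undef from
+        // the lower-module-lds-inactive.ll test in this patch is left in
+        // place.)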
+ continue; + } + if (std::none_of(GV.user_begin(), GV.user_end(), [&](User *U) { + return userRequiresLowering(UsedList, U); + })) { + continue; + } + LocalVars.push_back(&GV); + } + return LocalVars; + } + + static void removeFromUsedList(Module &M, StringRef Name, + SmallPtrSetImpl &ToRemove) { + GlobalVariable *GV = M.getGlobalVariable(Name); + if (!GV || ToRemove.empty()) { + return; + } + + SmallVector Init; + auto *CA = cast(GV->getInitializer()); + for (auto &Op : CA->operands()) { + // ModuleUtils::appendToUsed only inserts Constants + Constant *C = cast(Op); + if (!ToRemove.contains(C->stripPointerCasts())) { + Init.push_back(C); + } + } + + if (Init.size() == CA->getNumOperands()) { + return; // none to remove + } + + GV->eraseFromParent(); + + if (!Init.empty()) { + ArrayType *ATy = + ArrayType::get(Type::getInt8PtrTy(M.getContext()), Init.size()); + GV = + new llvm::GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage, + ConstantArray::get(ATy, Init), Name); + GV->setSection("llvm.metadata"); + } + } + + static void + removeFromUsedLists(Module &M, + const std::vector &LocalVars) { + SmallPtrSet LocalVarsSet; + for (size_t I = 0; I < LocalVars.size(); I++) { + if (Constant *C = dyn_cast(LocalVars[I]->stripPointerCasts())) { + LocalVarsSet.insert(C); + } + } + removeFromUsedList(M, "llvm.used", LocalVarsSet); + removeFromUsedList(M, "llvm.compiler.used", LocalVarsSet); + } + + static void markUsedByKernel(IRBuilder<> &Builder, Function *Func, + GlobalVariable *SGV) { + // The llvm.amdgcn.module.lds instance is implicitly used by all kernels + // that might call a function which accesses a field within it. This is + // presently approximated to 'all kernels' if there are any such functions + // in the module. This implicit use is reified as an explicit use here so + // that later passes, specifically PromoteAlloca, account for the required + // memory without any knowledge of this transform. + + // An operand bundle on llvm.donothing works because the call instruction + // survives until after the last pass that needs to account for LDS. It is + // better than inline asm as the latter survives until the end of codegen. A + // totally robust solution would be a function with the same semantics as + // llvm.donothing that takes a pointer to the instance and is lowered to a + // no-op after LDS is allocated, but that is not presently necessary. 
+
+    LLVMContext &Ctx = Func->getContext();
+
+    Builder.SetInsertPoint(Func->getEntryBlock().getFirstNonPHI());
+
+    FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx), {});
+
+    Function *Decl =
+        Intrinsic::getDeclaration(Func->getParent(), Intrinsic::donothing, {});
+
+    Value *UseInstance[1] = {Builder.CreateInBoundsGEP(
+        SGV->getValueType(), SGV, ConstantInt::get(Type::getInt32Ty(Ctx), 0))};
+
+    Builder.CreateCall(FTy, Decl, {},
+                       {OperandBundleDefT<Value *>("ExplicitUse", UseInstance)},
+                       "");
+  }
+
+  static SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M) {
+    SmallPtrSet<GlobalValue *, 32> UsedList;
+
+    SmallVector<GlobalValue *, 32> TmpVec;
+    collectUsedGlobalVariables(M, TmpVec, true);
+    UsedList.insert(TmpVec.begin(), TmpVec.end());
+
+    TmpVec.clear();
+    collectUsedGlobalVariables(M, TmpVec, false);
+    UsedList.insert(TmpVec.begin(), TmpVec.end());
+
+    return UsedList;
+  }
+
+public:
+  static char ID;
+
+  AMDGPULowerModuleLDS() : ModulePass(ID) {
+    initializeAMDGPULowerModuleLDSPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnModule(Module &M) override {
+    LLVMContext &Ctx = M.getContext();
+    const DataLayout &DL = M.getDataLayout();
+    SmallPtrSet<GlobalValue *, 32> UsedList = getUsedList(M);
+
+    // Find variables to move into new struct instance
+    std::vector<GlobalVariable *> FoundLocalVars =
+        findVariablesToLower(M, UsedList);
+
+    if (FoundLocalVars.empty()) {
+      // No variables to rewrite, no changes made.
+      return false;
+    }
+
+    // Sort by alignment, descending, to minimise padding.
+    // On ties, sort by size, descending, then by name, lexicographical.
+    llvm::stable_sort(
+        FoundLocalVars,
+        [&](const GlobalVariable *LHS, const GlobalVariable *RHS) -> bool {
+          Align ALHS = getAlign(DL, LHS);
+          Align ARHS = getAlign(DL, RHS);
+          if (ALHS != ARHS) {
+            return ALHS > ARHS;
+          }
+
+          TypeSize SLHS = DL.getTypeAllocSize(LHS->getValueType());
+          TypeSize SRHS = DL.getTypeAllocSize(RHS->getValueType());
+          if (SLHS != SRHS) {
+            return SLHS > SRHS;
+          }
+
+          // By variable name on tie for predictable order in test cases.
+          return LHS->getName() < RHS->getName();
+        });
+
+    std::vector<GlobalVariable *> LocalVars;
+    LocalVars.reserve(FoundLocalVars.size()); // will be at least this large
+    {
+      // This usually won't need to insert any padding; the allocation could
+      // perhaps be avoided in that case.
+      uint64_t CurrentOffset = 0;
+      for (size_t I = 0; I < FoundLocalVars.size(); I++) {
+        GlobalVariable *FGV = FoundLocalVars[I];
+        Align DataAlign = getAlign(DL, FGV);
+
+        uint64_t DataAlignV = DataAlign.value();
+        if (uint64_t Rem = CurrentOffset % DataAlignV) {
+          uint64_t Padding = DataAlignV - Rem;
+
+          // Append an array of padding bytes to meet the requested alignment.
+          // Note (o + (a - (o % a)) ) % a == 0
+          //      (offset + Padding  ) % align == 0
+
+          Type *ATy = ArrayType::get(Type::getInt8Ty(Ctx), Padding);
+          LocalVars.push_back(new GlobalVariable(
+              M, ATy, false, GlobalValue::InternalLinkage, UndefValue::get(ATy),
+              "", nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
+              false));
+          CurrentOffset += Padding;
+        }
+
+        LocalVars.push_back(FGV);
+        CurrentOffset += DL.getTypeAllocSize(FGV->getValueType());
+      }
+    }
+
+    std::vector<Type *> LocalVarTypes;
+    LocalVarTypes.reserve(LocalVars.size());
+    std::transform(
+        LocalVars.cbegin(), LocalVars.cend(), std::back_inserter(LocalVarTypes),
+        [](const GlobalVariable *V) -> Type * { return V->getValueType(); });
+
+    StructType *LDSTy = StructType::create(
+        Ctx, LocalVarTypes, llvm::StringRef("llvm.amdgcn.module.lds.t"));
+
+    Align MaxAlign = getAlign(DL, LocalVars[0]); // was sorted on alignment
+    Constant *InstanceAddress = Constant::getIntegerValue(
+        PointerType::get(LDSTy, AMDGPUAS::LOCAL_ADDRESS), APInt(32, 0));
+
+    GlobalVariable *SGV = new GlobalVariable(
+        M, LDSTy, false, GlobalValue::InternalLinkage, UndefValue::get(LDSTy),
+        "llvm.amdgcn.module.lds", nullptr, GlobalValue::NotThreadLocal,
+        AMDGPUAS::LOCAL_ADDRESS, false);
+    SGV->setAlignment(MaxAlign);
+    appendToCompilerUsed(
+        M, {static_cast<Constant *>(
+               ConstantExpr::getPointerBitCastOrAddrSpaceCast(
+                   cast<Constant>(SGV), Type::getInt8PtrTy(Ctx)))});
+
+    // The verifier rejects used lists containing an inttoptr of a constant,
+    // so remove the variables from these lists before replaceAllUsesWith.
+    removeFromUsedLists(M, LocalVars);
+
+    // Replace uses of the ith variable with a constantexpr to the ith field of
+    // the instance that will be allocated by AMDGPUMachineFunction.
+    Type *I32 = Type::getInt32Ty(Ctx);
+    for (size_t I = 0; I < LocalVars.size(); I++) {
+      GlobalVariable *GV = LocalVars[I];
+      Constant *GEPIdx[] = {ConstantInt::get(I32, 0), ConstantInt::get(I32, I)};
+      GV->replaceAllUsesWith(
+          ConstantExpr::getGetElementPtr(LDSTy, InstanceAddress, GEPIdx));
+      GV->eraseFromParent();
+    }
+
+    // Mark kernels with a call that reads the address of the allocated
+    // structure. This is not necessary for lowering. It lets other passes,
+    // specifically PromoteAlloca, accurately calculate how much LDS will be
+    // used by the kernel after lowering.
+    {
+      IRBuilder<> Builder(Ctx);
+      SmallPtrSet<Function *, 32> Kernels;
+      for (auto &I : M.functions()) {
+        Function *Func = &I;
+        if (isKernelCC(Func) && !Kernels.contains(Func)) {
+          markUsedByKernel(Builder, Func, SGV);
+          Kernels.insert(Func);
+        }
+      }
+    }
+    return true;
+  }
+};
+
+} // namespace
+char AMDGPULowerModuleLDS::ID = 0;
+
+char &llvm::AMDGPULowerModuleLDSID = AMDGPULowerModuleLDS::ID;
+
+INITIALIZE_PASS(AMDGPULowerModuleLDS, DEBUG_TYPE,
+                "Lower uses of LDS variables from non-kernel functions", false,
+                false)
+
+ModulePass *llvm::createAMDGPULowerModuleLDSPass() {
+  return new AMDGPULowerModuleLDS();
+}
+
+PreservedAnalyses AMDGPULowerModuleLDSPass::run(Module &M,
+                                                ModuleAnalysisManager &) {
+  return AMDGPULowerModuleLDS().runOnModule(M) ? PreservedAnalyses::none()
+                                               : PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -94,6 +94,7 @@
   }
 
   unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV);
+  void allocateModuleLDSGlobal(const Module *M);
 
   Align getDynLDSAlign() const { return DynLDSAlign; }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -64,6 +64,18 @@
   return Offset;
 }
 
+void AMDGPUMachineFunction::allocateModuleLDSGlobal(const Module *M) {
+  if (isModuleEntryFunction()) {
+    GlobalVariable *GV = M->getGlobalVariable("llvm.amdgcn.module.lds");
+    if (GV) {
+      unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *GV);
+      (void)Offset;
+      assert(Offset == 0 &&
+             "Module LDS expected to be allocated before other LDS");
+    }
+  }
+}
+
 void AMDGPUMachineFunction::setDynLDSAlign(const DataLayout &DL,
                                            const GlobalVariable &GV) {
   assert(DL.getTypeAllocSize(GV.getValueType()).isZero());
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -126,8 +126,13 @@
 char AMDGPUPromoteAlloca::ID = 0;
 char AMDGPUPromoteAllocaToVector::ID = 0;
 
-INITIALIZE_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
-                "AMDGPU promote alloca to vector or LDS", false, false)
+INITIALIZE_PASS_BEGIN(AMDGPUPromoteAlloca, DEBUG_TYPE,
+                      "AMDGPU promote alloca to vector or LDS", false, false)
+// Move LDS uses from functions to kernels before promote alloca for accurate
+// estimation of LDS available
+INITIALIZE_PASS_DEPENDENCY(AMDGPULowerModuleLDS)
+INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE,
+                    "AMDGPU promote alloca to vector or LDS", false, false)
 
 INITIALIZE_PASS(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
                 "AMDGPU promote alloca to vector", false, false)
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -193,6 +193,11 @@
     cl::desc("Enable workarounds for the StructurizeCFG pass"),
     cl::init(true), cl::Hidden);
 
+static cl::opt<bool>
+    DisableLowerModuleLDS("amdgpu-disable-lower-module-lds", cl::Hidden,
+                          cl::desc("Disable lower module lds pass"),
+                          cl::init(false));
+
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   // Register the target
   RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -235,6 +240,7 @@
   initializeAMDGPULateCodeGenPreparePass(*PR);
   initializeAMDGPUPropagateAttributesEarlyPass(*PR);
   initializeAMDGPUPropagateAttributesLatePass(*PR);
+  initializeAMDGPULowerModuleLDSPass(*PR);
   initializeAMDGPURewriteOutArgumentsPass(*PR);
   initializeAMDGPUUnifyMetadataPass(*PR);
   initializeSIAnnotateControlFlowPass(*PR);
@@ -506,6 +512,10 @@
       PM.addPass(AMDGPUAlwaysInlinePass());
       return true;
     }
+    if (PassName == "amdgpu-lower-module-lds") {
+      PM.addPass(AMDGPULowerModuleLDSPass());
+      return true;
+    }
     return false;
   });
 PB.registerPipelineParsingCallback(
@@ -535,7 +545,6 @@
       PM.addPass(AMDGPUPropagateAttributesEarlyPass(*this));
       return true;
     }
-
     return false;
   });
@@ -884,6 +893,10 @@
   // Replace OpenCL enqueued block function pointers with global variables.
   addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());
 
+  // This can increase LDS used by a kernel, so it runs before PromoteAlloca.
+  if (!DisableLowerModuleLDS)
+    addPass(createAMDGPULowerModuleLDSPass());
+
   if (TM.getOptLevel() > CodeGenOpt::None) {
     addPass(createInferAddressSpacesPass());
     addPass(createAMDGPUPromoteAlloca());
 
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -67,6 +67,7 @@
   AMDGPULowerIntrinsics.cpp
   AMDGPULowerKernelArguments.cpp
   AMDGPULowerKernelAttributes.cpp
+  AMDGPULowerModuleLDSPass.cpp
   AMDGPUMachineCFGStructurizer.cpp
   AMDGPUMachineFunction.cpp
   AMDGPUMachineModuleInfo.cpp
 
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2263,6 +2263,8 @@
     return DAG.getEntryNode();
   }
 
+  Info->allocateModuleLDSGlobal(Fn.getParent());
+
   SmallVector<ISD::InputArg, 16> Splits;
   SmallVector<CCValAssign, 16> ArgLocs;
   BitVector Skipped(Ins.size());
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -o - %s 2> %t | FileCheck --check-prefix=GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -amdgpu-disable-lower-module-lds=true -o - %s 2> %t | FileCheck --check-prefix=GFX8 %s
 ; RUN: FileCheck -check-prefix=ERR %s < %t
 
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s 2> %t | FileCheck --check-prefix=GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-disable-lower-module-lds=true -o - %s 2> %t | FileCheck --check-prefix=GFX9 %s
 ; RUN: FileCheck -check-prefix=ERR %s < %t
 
 @lds = internal addrspace(3) global float undef, align 4
 
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-initializer-unsupported.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-initializer-unsupported.ll
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast-initializer-unsupported.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-initializer-unsupported.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash llc -march=amdgcn -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR %s
+; RUN: not --crash llc -march=amdgcn -verify-machineinstrs -amdgpu-disable-lower-module-lds=true < %s 2>&1 | FileCheck -check-prefix=ERROR %s
 
 ; ERROR: LLVM ERROR: Unsupported expression in static initializer: addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*)
 
diff --git a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
--- a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -o - %s 2> %t | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -o - -amdgpu-disable-lower-module-lds=true %s 2> %t | FileCheck -check-prefixes=GCN,GFX8 %s
 ; RUN: FileCheck -check-prefix=ERR %s < %t
 
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s 2> %t | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - -amdgpu-disable-lower-module-lds=true %s 2> %t | FileCheck -check-prefixes=GCN,GFX9 %s
 ; RUN: FileCheck -check-prefix=ERR %s < %t
 
 @lds = internal addrspace(3) global float undef, align 4
 
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
@@ -0,0 +1,47 @@
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
+
+; CHECK: %llvm.amdgcn.module.lds.t = type { float, float }
+
+@func = addrspace(3) global float undef, align 4
+
+; @kern is only used from a kernel so it is left unchanged
+; CHECK: @kern = addrspace(3) global float undef, align 4
+@kern = addrspace(3) global float undef, align 4
+
+; @func is only used from a non-kernel function so is rewritten
+; CHECK-NOT: @func
+; @both is used from a non-kernel function so is rewritten
+; CHECK-NOT: @both
+; sorted both < func, so @both at null and @func at 4
+@both = addrspace(3) global float undef, align 4
+
+; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 4
+
+; CHECK-LABEL: @get_func()
+; CHECK: %0 = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* null, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* null, i32 0, i32 1) to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
+define i32 @get_func() local_unnamed_addr #0 {
+entry:
+  %0 = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @func to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @func to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
+  ret i32 %0
+}
+
+; CHECK-LABEL: @set_func(i32 %x)
+; CHECK: store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* null to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* null to i32*) to i64)) to i32*), align 4
+define void @set_func(i32 %x) local_unnamed_addr #1 {
+entry:
+  store i32 %x, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
+  ret void
+}
+
+; CHECK-LABEL: @timestwo()
+; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
+; CHECK: %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* null to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
+; CHECK: %mul = mul i32 %ld, 2
+; CHECK: store i32 %mul, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* null to i32*) to i64)) to i32*), align 4
+define amdgpu_kernel void @timestwo() {
+  %ld = load i32, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
+  %mul = mul i32 %ld, 2
+  store i32 %mul, i32* inttoptr (i64 add (i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @kern to i32 addrspace(3)*) to i32*) to i64), i64 ptrtoint (i32* addrspacecast (i32 addrspace(3)* bitcast (float addrspace(3)* @both to i32 addrspace(3)*) to i32*) to i64)) to i32*), align 4
+  ret void
+}
 
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll
@@ -0,0 +1,68 @@
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
+
+; Variables that are not lowered by this pass are left unchanged
+; CHECK-NOT: asm
+; CHECK-NOT: llvm.amdgcn.module.lds
+; CHECK-NOT: llvm.amdgcn.module.lds.t
+
+; var1, var2 would be transformed were they used from a non-kernel function
+; CHECK: @var1 = addrspace(3) global i32 undef
+; CHECK: @var2 = addrspace(3) global float undef
+@var1 = addrspace(3) global i32 undef
+@var2 = addrspace(3) global float undef
+
+; constant variables are left to the optimizer / error diagnostics
+; CHECK: @const_undef = addrspace(3) constant i32 undef
+; CHECK: @const_with_init = addrspace(3) constant i64 8
+@const_undef = addrspace(3) constant i32 undef
+@const_with_init = addrspace(3) constant i64 8
+
+; External and constant are both left to the optimizer / error diagnostics
+; CHECK: @extern = external addrspace(3) global i32
+@extern = external addrspace(3) global i32
+
+; Use of an addrspace(3) variable with an initializer is skipped,
+; so as to preserve the unimplemented error from llc
+; CHECK: @with_init = addrspace(3) global i64 0
+@with_init = addrspace(3) global i64 0
+
+; Only local addrspace variables are transformed
+; CHECK: @addr4 = addrspace(4) global i64 undef
+@addr4 = addrspace(4) global i64 undef
+
+; Assign to self is treated as any other initializer, i.e. ignored by this pass
+; CHECK: @toself = addrspace(3) global float addrspace(3)* bitcast (float addrspace(3)* addrspace(3)* @toself to float addrspace(3)*), align 8
+@toself = addrspace(3) global float addrspace(3)* bitcast (float addrspace(3)* addrspace(3)* @toself to float addrspace(3)*), align 8
+
+; Use by .used lists doesn't trigger lowering
+; CHECK: @llvm.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* @var1 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
+@llvm.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* @var1 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
+
+; CHECK: @llvm.compiler.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (float addrspace(3)* @var2 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
+@llvm.compiler.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (float addrspace(3)* @var2 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
+
+; Access from a function would cause lowering for non-excluded cases
+; CHECK-LABEL: @use_variables()
+; CHECK: %c0 = load i32, i32 addrspace(3)* @const_undef, align 4
+; CHECK: %c1 = load i64, i64 addrspace(3)* @const_with_init, align 4
+; CHECK: %v0 = atomicrmw add i64 addrspace(3)* @with_init, i64 1 seq_cst
+; CHECK: %v1 = cmpxchg i32 addrspace(3)* @extern, i32 4, i32 %c0 acq_rel monotonic
+; CHECK: %v2 = atomicrmw add i64 addrspace(4)* @addr4, i64 %c1 monotonic
+define void @use_variables() {
+  %c0 = load i32, i32 addrspace(3)* @const_undef, align 4
+  %c1 = load i64, i64 addrspace(3)* @const_with_init, align 4
+  %v0 = atomicrmw add i64 addrspace(3)* @with_init, i64 1 seq_cst
+  %v1 = cmpxchg i32 addrspace(3)* @extern, i32 4, i32 %c0 acq_rel monotonic
+  %v2 = atomicrmw add i64 addrspace(4)* @addr4, i64 %c1 monotonic
+  ret void
+}
+
+; Use by kernel doesn't trigger lowering
+; CHECK-LABEL: @kern_use()
+; CHECK: %inc = atomicrmw add i32 addrspace(3)* @var1, i32 1 monotonic
+define amdgpu_kernel void @kern_use() {
+  %inc = atomicrmw add i32 addrspace(3)* @var1, i32 1 monotonic
+  call void @use_variables()
+  ret void
+}
 
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect.ll
@@ -0,0 +1,39 @@
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
+
+; CHECK: %llvm.amdgcn.module.lds.t = type { double, float }
+
+; CHECK: @function_indirect = addrspace(1) global float* addrspacecast (float addrspace(3)* getelementptr (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* null, i32 0, i32 1) to float*), align 8
+
+; CHECK: @kernel_indirect = addrspace(1) global double* addrspacecast (double addrspace(3)* null to double*), align 8
+
+; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 8
+
+@function_target = addrspace(3) global float undef, align 4
+@function_indirect = addrspace(1) global float* addrspacecast (float addrspace(3)* @function_target to float*), align 8
+
+@kernel_target = addrspace(3) global double undef, align 8
+@kernel_indirect = addrspace(1) global double* addrspacecast (double addrspace(3)* @kernel_target to double*), align 8
+
+; CHECK-LABEL: @function(float %x)
+; CHECK: %0 = load float*, float* addrspace(1)* @function_indirect, align 8
+define void @function(float %x) local_unnamed_addr #5 {
+entry:
+  %0 = load float*, float* addrspace(1)* @function_indirect, align 8
+  store float %x, float* %0, align 4
+  ret void
+}
+
+; CHECK-LABEL: @kernel(double %x)
+; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
+; CHECK: %0 = load double*, double* addrspace(1)* @kernel_indirect, align 8
+define amdgpu_kernel void @kernel(double %x) local_unnamed_addr #5 {
+entry:
+  %0 = load double*, double* addrspace(1)* @kernel_indirect, align 8
+  store double %x, double* %0, align 8
+  ret void
+}
+
+
+
+
 
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll
@@ -0,0 +1,37 @@
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
+
+; Check the new struct is added to llvm.compiler.used and that the replaced
+; variable is removed
+
+; CHECK: %llvm.amdgcn.module.lds.t = type { float }
+; CHECK: @ignored = addrspace(1) global i64 0
+; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 8
+
+; CHECK-NOT: @tolower
+
+@tolower = addrspace(3) global float undef, align 8
+
+; A variable that is unchanged by the pass
+@ignored = addrspace(1) global i64 0
+
+; Start with one value to replace and one to ignore in the .used lists
+
+; @ignored still in list, @tolower removed
+; CHECK: @llvm.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(1)* bitcast (i64 addrspace(1)* @ignored to i8 addrspace(1)*) to i8*)], section "llvm.metadata"
+
+@llvm.used = appending global [2 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (float addrspace(3)* @tolower to i8 addrspace(3)*) to i8*), i8* addrspacecast (i8 addrspace(1)* bitcast (i64 addrspace(1)* @ignored to i8 addrspace(1)*) to i8*)], section "llvm.metadata"
+
+; @ignored still in list, @tolower removed, llvm.amdgcn.module.lds appended
+; CHECK: @llvm.compiler.used = appending global [2 x i8*] [i8* addrspacecast (i8 addrspace(1)* bitcast (i64 addrspace(1)* @ignored to i8 addrspace(1)*) to i8*), i8* addrspacecast (i8 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
+
+@llvm.compiler.used = appending global [2 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (float addrspace(3)* @tolower to i8 addrspace(3)*) to i8*), i8* addrspacecast (i8 addrspace(1)* bitcast (i64 addrspace(1)* @ignored to i8 addrspace(1)*) to i8*)], section "llvm.metadata"
+
+; CHECK-LABEL: @func()
+; CHECK: %dec = atomicrmw fsub float addrspace(3)* null, float 1.0
+define void @func() {
+  %dec = atomicrmw fsub float addrspace(3)* @tolower, float 1.0 monotonic
+  %unused0 = atomicrmw add i64 addrspace(1)* @ignored, i64 1 monotonic
+  ret void
+}
 
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll
@@ -0,0 +1,56 @@
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
+
+; Padding is needed to meet alignment, so references to @var1 are replaced with gep ptr, 0, 2
+; No i64 field, as addrspace(3) variables with initializers are ignored. Likewise no addrspace(4) field.
+; CHECK: %llvm.amdgcn.module.lds.t = type { float, [4 x i8], i32 }
+
+; Variables removed by the pass
+; CHECK-NOT: @var0
+; CHECK-NOT: @var1
+
+@var0 = addrspace(3) global float undef, align 8
+@var1 = addrspace(3) global i32 undef, align 8
+
+@ptr = addrspace(1) global i32 addrspace(3)* @var1, align 4
+
+; A variable that is unchanged by the pass
+; CHECK: @with_init = addrspace(3) global i64 0
+@with_init = addrspace(3) global i64 0
+
+; Instance of the new type, aligned to the max of the element alignments
+; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 8
+
+; Use in func rewritten to access the struct at address zero, which prints as null
+; CHECK-LABEL: @func()
+; CHECK: %dec = atomicrmw fsub float addrspace(3)* null, float 1.0
+; CHECK: %val0 = load i32, i32 addrspace(3)* getelementptr (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* null, i32 0, i32 2), align 4
+; CHECK: %val1 = add i32 %val0, 4
+; CHECK: store i32 %val1, i32 addrspace(3)* getelementptr (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* null, i32 0, i32 2), align 4
+; CHECK: %unused0 = atomicrmw add i64 addrspace(3)* @with_init, i64 1 monotonic
+define void @func() {
+  %dec = atomicrmw fsub float addrspace(3)* @var0, float 1.0 monotonic
+  %val0 = load i32, i32 addrspace(3)* @var1, align 4
+  %val1 = add i32 %val0, 4
+  store i32 %val1, i32 addrspace(3)* @var1, align 4
+  %unused0 = atomicrmw add i64 addrspace(3)* @with_init, i64 1 monotonic
+  ret void
+}
+
+; This kernel calls a function that uses LDS so needs the block
+; CHECK-LABEL: @kern_call()
+; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
+; CHECK: call void @func()
+; CHECK: %dec = atomicrmw fsub float addrspace(3)* null, float 2.0
+define amdgpu_kernel void @kern_call() {
+  call void @func()
+  %dec = atomicrmw fsub float addrspace(3)* @var0, float 2.0 monotonic
+  ret void
+}
+
+; This kernel makes no calls, so it does not strictly need the LDS block; the
+; pass conservatively marks every kernel in the module
+; CHECK-LABEL: @kern_empty()
+; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
+define spir_kernel void @kern_empty() {
+  ret void
+}
 
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll
@@ -1,5 +1,5 @@
 ; RUN: opt -S -disable-promote-alloca-to-vector -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-promote-alloca < %s | FileCheck -check-prefix=IR %s
-; RUN: llc -disable-promote-alloca-to-vector -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=ASM %s
+; RUN: llc -disable-promote-alloca-to-vector -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-disable-lower-module-lds=true < %s | FileCheck -check-prefix=ASM %s
 
 target datalayout = "A5"
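
Reading aid (not part of the patch): a minimal before/after sketch of the transform, distilled from the lower-module-lds.ll test above. The variable and function names are hypothetical.

; Before: a non-kernel function uses two LDS variables.
@a = addrspace(3) global float undef, align 8
@b = addrspace(3) global i32 undef, align 8

define void @f() {
  store float 0.0, float addrspace(3)* @a
  store i32 1, i32 addrspace(3)* @b
  ret void
}

define amdgpu_kernel void @k() {
  call void @f()
  ret void
}

; After: both variables live in one struct at LDS address zero. @a sorts first
; (equal alignment and size, "a" < "b"), the i32 field needs 4 padding bytes to
; stay 8-byte aligned, and the field-0 gep constant-folds to null.
%llvm.amdgcn.module.lds.t = type { float, [4 x i8], i32 }
@llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 8

define void @f() {
  store float 0.0, float addrspace(3)* null
  store i32 1, i32 addrspace(3)* getelementptr (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* null, i32 0, i32 2)
  ret void
}

define amdgpu_kernel void @k() {
  call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
  call void @f()
  ret void
}

declare void @llvm.donothing()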