diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -72,6 +72,7 @@
 FunctionPass *createAMDGPUPropagateAttributesEarlyPass(const TargetMachine *);
 ModulePass *createAMDGPUPropagateAttributesLatePass(const TargetMachine *);
 FunctionPass *createAMDGPURewriteOutArgumentsPass();
+ModulePass *createAMDGPULowerFunctionLDSPass();
 FunctionPass *createSIModeRegisterPass();
 
 struct AMDGPUSimplifyLibCallsPass : PassInfoMixin<AMDGPUSimplifyLibCallsPass> {
@@ -146,6 +147,13 @@
   TargetMachine &TM;
 };
 
+void initializeAMDGPULowerFunctionLDSPass(PassRegistry &);
+extern char &AMDGPULowerFunctionLDSID;
+
+struct AMDGPULowerFunctionLDSPass : PassInfoMixin<AMDGPULowerFunctionLDSPass> {
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+};
+
 void initializeAMDGPURewriteOutArgumentsPass(PassRegistry &);
 extern char &AMDGPURewriteOutArgumentsID;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerFunctionLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerFunctionLDSPass.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerFunctionLDSPass.cpp
@@ -0,0 +1,340 @@
+//===-- AMDGPULowerFunctionLDSPass.cpp ------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass eliminates LDS uses from non-kernel functions.
+//
+// The strategy is to create a new struct with a field for each LDS variable
+// and allocate that struct at the same address for every kernel. Uses of the
+// original LDS variables are then replaced with compile-time offsets from
+// that known address. AMDGPUMachineFunction allocates the LDS global.
+//
+// Local variables with constant annotation or non-undef initializer are
+// passed through unchanged for simplification or error diagnostics in later
+// passes.
+//
+// To reduce the memory overhead, variables that are only used by kernels are
+// excluded from this transform. The analysis to determine whether a variable
+// is only used by kernels is cheap and conservative, so this may allocate a
+// variable in every kernel even when that was not strictly necessary.
+//
+// A possible future refinement is to specialise the structure per-kernel, so
+// that fields can be elided based on more expensive analysis.
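+//
+// For illustration (a sketch of the rewrite, mirroring the
+// lower-function-lds.ll test added in this patch), a module such as
+//
+//   @var = addrspace(3) global float undef
+//   define void @f() {
+//     %v = atomicrmw fsub float addrspace(3)* @var, float 1.0 monotonic
+//     ret void
+//   }
+//
+// is rewritten so that @var becomes a field of %llvm.amdgcn.function.lds.t,
+// the atomicrmw operates on a constantexpr gep into the struct instance at
+// the known address, and each kernel is marked so that the block is
+// allocated for it.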
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+#include <algorithm>
+#include <vector>
+
+#define DEBUG_TYPE "amdgpu-lower-function-lds"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPULowerFunctionLDS : public ModulePass {
+
+  static bool isKernelCC(Function *Func) {
+    return AMDGPU::isModuleEntryFunctionCC(Func->getCallingConv());
+  }
+
+  static Align getAlign(DataLayout const &DL, const GlobalVariable *GV) {
+    return DL.getValueOrABITypeAlignment(GV->getPointerAlignment(DL),
+                                         GV->getValueType());
+  }
+
+  static bool
+  useByKernelOrUsedList(const SmallPtrSetImpl<GlobalValue *> &UsedList,
+                        const Use &U, uint64_t Depth = 0) {
+    // Variables used by kernels or llvm.used are not moved into the new struct
+    User *V = U.getUser();
+
+    if (Depth > 4) {
+      return false;
+    }
+    if (auto *I = dyn_cast<Instruction>(V)) {
+      if (isKernelCC(I->getFunction())) {
+        return true;
+      }
+    }
+    if (auto *G = dyn_cast<GlobalValue>(V->stripPointerCasts())) {
+      if (UsedList.contains(G)) {
+        return true;
+      }
+    }
+    if (auto *E = dyn_cast<ConstantExpr>(V)) {
+      return std::all_of(E->use_begin(), E->use_end(), [&](const Use &E) {
+        return useByKernelOrUsedList(UsedList, E, Depth + 1);
+      });
+    }
+    return false;
+  }
+
+  static std::vector<GlobalVariable *>
+  findVariablesToLower(Module &M,
+                       const SmallPtrSetImpl<GlobalValue *> &UsedList) {
+    std::vector<GlobalVariable *> LocalVars;
+    for (auto &GV : M.globals()) {
+      if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
+        continue;
+      }
+      if (!GV.hasInitializer()) {
+        // addrspace(3) without initializer implies cuda/hip extern __shared__
+        // the semantics for such a variable appears to be that all extern
+        // __shared__ variables alias one another, in which case this
+        // transform is not required
+        continue;
+      }
+      if (!isa<UndefValue>(GV.getInitializer())) {
+        // Initializers are unimplemented for local address space.
+        // Leave such variables in place for consistent error reporting.
+        continue;
+      }
+      if (GV.isConstant()) {
+        // A constant undef variable can't be written to, and any load is
+        // undef, so it should be eliminated by the optimizer. It could be
+        // dropped by the back end if not. This pass skips over it.
+ continue; + } + if (std::all_of(GV.use_begin(), GV.use_end(), [&](const Use &U) { + return useByKernelOrUsedList(UsedList, U); + })) { + continue; + } + LocalVars.push_back(&GV); + } + return LocalVars; + } + + static void removeFromUsedList(Module &M, StringRef Name, + SmallPtrSetImpl &ToRemove) { + GlobalVariable *GV = M.getGlobalVariable(Name); + if (!GV || ToRemove.empty()) { + return; + } + + SmallVector Init; + auto *CA = cast(GV->getInitializer()); + for (auto &Op : CA->operands()) { + // ModuleUtils::appendToUsed only inserts Constants + Constant *C = cast(Op); + if (!ToRemove.contains(C->stripPointerCasts())) { + Init.push_back(C); + } + } + + if (Init.size() == CA->getNumOperands()) { + return; // none to remove + } + + GV->eraseFromParent(); + + if (!Init.empty()) { + ArrayType *ATy = + ArrayType::get(Type::getInt8PtrTy(M.getContext()), Init.size()); + GV = + new llvm::GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage, + ConstantArray::get(ATy, Init), Name); + GV->setSection("llvm.metadata"); + } + } + + static void + removeFromUsedLists(Module &M, + const std::vector &LocalVars) { + SmallPtrSet LocalVarsSet; + for (size_t I = 0; I < LocalVars.size(); I++) { + if (Constant *C = dyn_cast(LocalVars[I]->stripPointerCasts())) { + LocalVarsSet.insert(C); + } + } + removeFromUsedList(M, "llvm.used", LocalVarsSet); + removeFromUsedList(M, "llvm.compiler.used", LocalVarsSet); + } + + static void markUsedByKernel(IRBuilder<> &Builder, Function *Func, + GlobalVariable *SGV) { + LLVMContext &Ctx = Func->getContext(); + Type *Ty = SGV->getValueType(); + + Builder.SetInsertPoint(Func->getEntryBlock().getFirstNonPHI()); + + FunctionType *FTy = FunctionType::get( + Type::getVoidTy(Ctx), {PointerType::get(Ty, AMDGPUAS::LOCAL_ADDRESS)}, + false); + + // Could use an intrinsic which is erased by the backend after LDS lowering + InlineAsm *IA = + InlineAsm::get(FTy, "// Alloc function lds block", "s", true); + Builder.CreateCall( + IA, {Builder.CreateInBoundsGEP( + Ty, SGV, ConstantInt::get(Type::getInt32Ty(Ctx), 0))}); + } + +public: + static char ID; + + AMDGPULowerFunctionLDS() : ModulePass(ID) { + initializeAMDGPULowerFunctionLDSPass(*PassRegistry::getPassRegistry()); + } + + bool runOnModule(Module &M) override { + LLVMContext &Ctx = M.getContext(); + DataLayout const &DL = M.getDataLayout(); + + SmallPtrSet UsedList; + collectUsedGlobalVariables(M, UsedList, true); + collectUsedGlobalVariables(M, UsedList, false); + + // Find variables to move into new struct instance + std::vector FoundLocalVars = + findVariablesToLower(M, UsedList); + + if (FoundLocalVars.empty()) { + // No variables to rewrite, no changes made. + return false; + } + + // Sort by alignment, descending, to minimise padding. + // On ties, sort by size, descending, then by name, lexicographical. + llvm::sort( + FoundLocalVars.begin(), FoundLocalVars.end(), + [&](const GlobalVariable *LHS, const GlobalVariable *RHS) -> bool { + Align ALHS = getAlign(DL, LHS); + Align ARHS = getAlign(DL, RHS); + if (ALHS != ARHS) { + return ALHS > ARHS; + } + + TypeSize SLHS = DL.getTypeAllocSize(LHS->getValueType()); + TypeSize SRHS = DL.getTypeAllocSize(RHS->getValueType()); + if (SLHS != SRHS) { + return SLHS > SRHS; + } + + // By variable name on tie for predictable order in test cases. 
+          return LHS->getName() < RHS->getName();
+        });
+
+    std::vector<GlobalVariable *> LocalVars;
+    LocalVars.reserve(FoundLocalVars.size()); // will be at least this large
+    {
+      // This usually won't need to insert any padding, perhaps avoid the alloc
+      uint64_t CurrentOffset = 0;
+      for (size_t I = 0; I < FoundLocalVars.size(); I++) {
+        GlobalVariable *FGV = FoundLocalVars[I];
+        Align DataAlign = getAlign(DL, FGV);
+
+        uint64_t DataAlignV = DataAlign.value();
+        if (uint64_t Rem = CurrentOffset % DataAlignV) {
+          uint64_t Padding = DataAlignV - Rem;
+
+          // Append an array of padding bytes to meet alignment requested
+          // Note (o +      (a - (o % a)) ) % a == 0
+          //      (offset + Padding       ) % align == 0
+
+          Type *ATy = ArrayType::get(Type::getInt8Ty(Ctx), Padding);
+          LocalVars.push_back(new GlobalVariable(
+              M, ATy, false, GlobalValue::InternalLinkage, UndefValue::get(ATy),
+              "", nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
+              false));
+          CurrentOffset += Padding;
+        }
+
+        LocalVars.push_back(FGV);
+        CurrentOffset += DL.getTypeAllocSize(FGV->getValueType());
+      }
+    }
+
+    std::vector<Type *> LocalVarTypes;
+    LocalVarTypes.reserve(LocalVars.size());
+    std::transform(
+        LocalVars.cbegin(), LocalVars.cend(), std::back_inserter(LocalVarTypes),
+        [](const GlobalVariable *V) -> Type * { return V->getValueType(); });
+
+    StructType *LDSTy = StructType::create(
+        Ctx, LocalVarTypes, llvm::StringRef("llvm.amdgcn.function.lds.t"));
+
+    Align MaxAlign = getAlign(DL, LocalVars[0]); // was sorted on alignment
+    Constant *InstanceAddress = Constant::getIntegerValue(
+        PointerType::get(LDSTy, AMDGPUAS::LOCAL_ADDRESS), APInt(32, 0));
+
+    GlobalVariable *SGV = new GlobalVariable(
+        M, LDSTy, false, GlobalValue::InternalLinkage, UndefValue::get(LDSTy),
+        "llvm.amdgcn.function.lds", nullptr, GlobalValue::NotThreadLocal,
+        AMDGPUAS::LOCAL_ADDRESS, false);
+    SGV->setAlignment(MaxAlign);
+    appendToCompilerUsed(
+        M, {static_cast<GlobalValue *>(
+               ConstantExpr::getPointerBitCastOrAddrSpaceCast(
+                   cast<Constant>(SGV), Type::getInt8PtrTy(Ctx)))});
+
+    // The verifier rejects used lists containing an inttoptr of a constant
+    // so remove the variables from these lists before replaceAllUsesWith
+    removeFromUsedLists(M, LocalVars);
+
+    // Replace uses of ith variable with a constantexpr to the ith field of the
+    // instance that will be allocated by AMDGPUMachineFunction
+    Type *I32 = Type::getInt32Ty(Ctx);
+    for (size_t I = 0; I < LocalVars.size(); I++) {
+      GlobalVariable *GV = LocalVars[I];
+      Constant *GEPIdx[] = {ConstantInt::get(I32, 0), ConstantInt::get(I32, I)};
+      GV->replaceAllUsesWith(
+          ConstantExpr::getGetElementPtr(LDSTy, InstanceAddress, GEPIdx));
+      GV->eraseFromParent();
+    }
+
+    // Mark kernels with asm that reads the address of the allocated structure
+    // This is not necessary for lowering. This lets other passes, specifically
+    // PromoteAlloca, accurately calculate how much LDS will be used by the
+    // kernel after lowering.
+    {
+      IRBuilder<> Builder(Ctx);
+      SmallPtrSet<Function *, 32> Kernels;
+      for (auto &I : M.functions()) {
+        Function *Func = &I;
+        if (isKernelCC(Func)) {
+          if (!Kernels.contains(Func)) {
+            markUsedByKernel(Builder, Func, SGV);
+            Kernels.insert(Func);
+          }
+        }
+      }
+    }
+    return true;
+  }
+};
+
+} // namespace
+char AMDGPULowerFunctionLDS::ID = 0;
+
+char &llvm::AMDGPULowerFunctionLDSID = AMDGPULowerFunctionLDS::ID;
+
+INITIALIZE_PASS(AMDGPULowerFunctionLDS, DEBUG_TYPE,
+                "Lower uses of LDS variables from non-kernel functions", false,
+                false)
+
+ModulePass *llvm::createAMDGPULowerFunctionLDSPass() {
+  return new AMDGPULowerFunctionLDS();
+}
+
+PreservedAnalyses AMDGPULowerFunctionLDSPass::run(Module &M,
+                                                  ModuleAnalysisManager &) {
+  return AMDGPULowerFunctionLDS().runOnModule(M) ? PreservedAnalyses::none()
+                                                 : PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -38,6 +38,12 @@
   CallingConv::ID CC = F.getCallingConv();
   if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL)
     ExplicitKernArgSize = ST.getExplicitKernArgSize(F, MaxKernArgAlign);
+
+  const Module *M = F.getParent();
+  GlobalVariable *GV = M->getGlobalVariable("llvm.amdgcn.function.lds");
+  if (GV) {
+    allocateLDSGlobal(M->getDataLayout(), *GV);
+  }
 }
 
 unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -126,8 +126,13 @@
 char AMDGPUPromoteAlloca::ID = 0;
 char AMDGPUPromoteAllocaToVector::ID = 0;
 
-INITIALIZE_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
-                "AMDGPU promote alloca to vector or LDS", false, false)
+INITIALIZE_PASS_BEGIN(AMDGPUPromoteAlloca, DEBUG_TYPE,
+                      "AMDGPU promote alloca to vector or LDS", false, false)
+// Move LDS uses from functions to kernels before promote alloca for accurate
+// estimation of LDS available
+INITIALIZE_PASS_DEPENDENCY(AMDGPULowerFunctionLDS)
+INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE,
+                    "AMDGPU promote alloca to vector or LDS", false, false)
 
 INITIALIZE_PASS(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
                 "AMDGPU promote alloca to vector", false, false)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -193,6 +193,11 @@
     cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true),
     cl::Hidden);
 
+static cl::opt<bool>
+    DisableLowerFunctionLDS("amdgpu-disable-lower-function-lds", cl::Hidden,
+                            cl::desc("Disable lower function lds pass"),
+                            cl::init(false));
+
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   // Register the target
   RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -234,6 +239,7 @@
   initializeAMDGPULateCodeGenPreparePass(*PR);
   initializeAMDGPUPropagateAttributesEarlyPass(*PR);
   initializeAMDGPUPropagateAttributesLatePass(*PR);
+  initializeAMDGPULowerFunctionLDSPass(*PR);
   initializeAMDGPURewriteOutArgumentsPass(*PR);
   initializeAMDGPUUnifyMetadataPass(*PR);
   initializeSIAnnotateControlFlowPass(*PR);
@@ -470,6 +476,10 @@
         // and before other cleanup optimizations.
         PM.add(createAMDGPULowerKernelAttributesPass());
 
+        if (!DisableLowerFunctionLDS) {
+          PM.add(createAMDGPULowerFunctionLDSPass());
+        }
+
         // Promote alloca to vector before SROA and loop unroll. If we manage
         // to eliminate allocas before unroll we may choose to unroll less.
         if (EnableOpt)
@@ -502,6 +512,12 @@
           PM.addPass(AMDGPUAlwaysInlinePass());
           return true;
         }
+        if (PassName == "amdgpu-lower-function-lds") {
+          if (!DisableLowerFunctionLDS) {
+            PM.addPass(AMDGPULowerFunctionLDSPass());
+          }
+          return true;
+        }
         return false;
       });
   PB.registerPipelineParsingCallback(
@@ -531,7 +547,6 @@
           PM.addPass(AMDGPUPropagateAttributesEarlyPass(*this));
           return true;
         }
-
         return false;
       });
 
@@ -876,6 +891,11 @@
   // Replace OpenCL enqueued block function pointers with global variables.
   addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());
 
+  // Can increase LDS used by kernel so runs before PromoteAlloca
+  if (!DisableLowerFunctionLDS) {
+    addPass(createAMDGPULowerFunctionLDSPass());
+  }
+
   if (TM.getOptLevel() > CodeGenOpt::None) {
     addPass(createInferAddressSpacesPass());
     addPass(createAMDGPUPromoteAlloca());
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -72,6 +72,7 @@
   AMDGPUMachineModuleInfo.cpp
   AMDGPUMacroFusion.cpp
   AMDGPUMCInstLower.cpp
+  AMDGPULowerFunctionLDSPass.cpp
   AMDGPUOpenCLEnqueuedBlockLowering.cpp
   AMDGPUPostLegalizerCombiner.cpp
   AMDGPUPreLegalizerCombiner.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -o - %s 2> %t | FileCheck --check-prefix=GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -amdgpu-disable-lower-function-lds=true -o - %s 2> %t | FileCheck --check-prefix=GFX8 %s
 ; RUN: FileCheck -check-prefix=ERR %s < %t
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s 2> %t | FileCheck --check-prefix=GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-disable-lower-function-lds=true -o - %s 2> %t | FileCheck --check-prefix=GFX9 %s
 ; RUN: FileCheck -check-prefix=ERR %s < %t
 
 @lds = internal addrspace(3) global float undef, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-initializer-unsupported.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-initializer-unsupported.ll
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast-initializer-unsupported.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-initializer-unsupported.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash llc -march=amdgcn -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR %s
+; RUN: not --crash llc -march=amdgcn -verify-machineinstrs -amdgpu-disable-lower-function-lds=true < %s 2>&1 | FileCheck -check-prefix=ERROR %s
 
 ; ERROR: LLVM ERROR: Unsupported expression in static initializer: addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*)
diff --git a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
--- a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -o - %s 2> %t | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -o - -amdgpu-disable-lower-function-lds=true %s 2> %t | FileCheck -check-prefixes=GCN,GFX8 %s
 ; RUN: FileCheck -check-prefix=ERR %s < %t
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s 2> %t | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - -amdgpu-disable-lower-function-lds=true %s 2> %t | FileCheck -check-prefixes=GCN,GFX9 %s
 ; RUN: FileCheck -check-prefix=ERR %s < %t
 
 @lds = internal addrspace(3) global float undef, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/lower-function-lds-inactive.ll b/llvm/test/CodeGen/AMDGPU/lower-function-lds-inactive.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-function-lds-inactive.ll
@@ -0,0 +1,47 @@
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-function-lds < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-function-lds < %s | FileCheck %s
+
+; Variables that are not lowered by this pass are left unchanged
+; CHECK-NOT: asm
+; CHECK-NOT: llvm.amdgcn.function.lds
+; CHECK-NOT: llvm.amdgcn.function.lds.t
+
+; var1, var2 would be transformed were they used from a non-kernel function
+@var1 = addrspace(3) global i32 undef
+@var2 = addrspace(3) global float undef
+
+; constant variables are left to the optimizer / error diagnostics
+@const_undef = addrspace(3) constant i32 undef
+@const_with_init = addrspace(3) constant i64 8
+
+; Use of an addrspace(3) variable with an initializer is skipped,
+; so as to preserve the unimplemented error from llc
+@with_init = addrspace(3) global i64 0
+
+; External and constant are both left to the optimizer / error diagnostics
+@extern = external addrspace(3) global i32
+
+; Only local addrspace variables are transformed
+@addr4 = addrspace(4) global i64 undef
+
+; Use by .used lists doesn't trigger lowering
+@llvm.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* @var1 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
+@llvm.compiler.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (float addrspace(3)* @var2 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
+
+; Access from a function would cause lowering for non-excluded cases
+define void @use_variables() {
+  %c0 = load i32, i32 addrspace(3)* @const_undef, align 4
+  %c1 = load i64, i64 addrspace(3)* @const_with_init, align 4
+  %v0 = atomicrmw add i64 addrspace(3)* @with_init, i64 1 monotonic
+  %v2 = atomicrmw add i32 addrspace(3)* @extern, i32 %c0 monotonic
+  %v1 = atomicrmw add i64 addrspace(4)* @addr4, i64 %c1 monotonic
+  ret void
+}
+
+; Use by kernel doesn't trigger lowering
+define amdgpu_kernel void @kern_use() {
+  %inc = atomicrmw add i32 addrspace(3)* @var1, i32 1 monotonic
+  call void @use_variables()
+  ret void
+}
+
diff --git a/llvm/test/CodeGen/AMDGPU/lower-function-lds-used-list.ll b/llvm/test/CodeGen/AMDGPU/lower-function-lds-used-list.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-function-lds-used-list.ll
@@ -0,0 +1,37 @@
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-function-lds < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-function-lds < %s | FileCheck %s
+
+; Check new struct is added to compiler.used and that the replaced variable is removed
+
+; CHECK: %llvm.amdgcn.function.lds.t = type { float }
+; CHECK: @ignored = addrspace(1) global i64 0
+; CHECK: @llvm.amdgcn.function.lds = internal addrspace(3) global %llvm.amdgcn.function.lds.t undef, align 8
+
+; CHECK-NOT: @tolower
+
+@tolower = addrspace(3) global float undef, align 8
+
+; A variable that is unchanged by pass
+@ignored = addrspace(1) global i64 0
+
+; Start with one value to replace and one to ignore in each used list
+
+; @ignored still in list, @tolower removed
+; CHECK: @llvm.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(1)* bitcast (i64 addrspace(1)* @ignored to i8 addrspace(1)*) to i8*)], section "llvm.metadata"
+
+@llvm.used = appending global [2 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (float addrspace(3)* @tolower to i8 addrspace(3)*) to i8*), i8* addrspacecast (i8 addrspace(1)* bitcast (i64 addrspace(1)* @ignored to i8 addrspace(1)*) to i8*)], section "llvm.metadata"
+
+; @ignored still in list, @tolower removed, llvm.amdgcn.function.lds appended
+; CHECK: @llvm.compiler.used = appending global [2 x i8*] [i8* addrspacecast (i8 addrspace(1)* bitcast (i64 addrspace(1)* @ignored to i8 addrspace(1)*) to i8*), i8* addrspacecast (i8 addrspace(3)* bitcast (%llvm.amdgcn.function.lds.t addrspace(3)* @llvm.amdgcn.function.lds to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
+
+@llvm.compiler.used = appending global [2 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (float addrspace(3)* @tolower to i8 addrspace(3)*) to i8*), i8* addrspacecast (i8 addrspace(1)* bitcast (i64 addrspace(1)* @ignored to i8 addrspace(1)*) to i8*)], section "llvm.metadata"
+
+; CHECK-LABEL: @func()
+; CHECK: %dec = atomicrmw fsub float addrspace(3)* null, float 1.0
+define void @func() {
+  %dec = atomicrmw fsub float addrspace(3)* @tolower, float 1.0 monotonic
+  %unused0 = atomicrmw add i64 addrspace(1)* @ignored, i64 1 monotonic
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/lower-function-lds.ll b/llvm/test/CodeGen/AMDGPU/lower-function-lds.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-function-lds.ll
@@ -0,0 +1,54 @@
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-function-lds < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-function-lds < %s | FileCheck %s
+
+; Padding to meet alignment, so references to @var1 replaced with gep ptr, 0, 2
+; No i64 as addrspace(3) types with initializers are ignored. Likewise no addrspace(4).
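+; Illustrative layout (derived from the sort/padding logic in the pass, for
+; this input): @var0 (float, align 8) is placed at offset 0; @var1 (i32,
+; align 8) must land at offset 8, so 4 bytes of padding are inserted between.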
+; CHECK: %llvm.amdgcn.function.lds.t = type { float, [4 x i8], i32 }
+
+; Variables removed by pass
+; CHECK-NOT: @var0
+; CHECK-NOT: @var1
+
+@var0 = addrspace(3) global float undef, align 8
+@var1 = addrspace(3) global i32 undef, align 8
+
+; A variable that is unchanged by pass
+; CHECK: @with_init = addrspace(3) global i64 0
+@with_init = addrspace(3) global i64 0
+
+; Instance of new type, aligned to max of element alignment
+; CHECK: @llvm.amdgcn.function.lds = internal addrspace(3) global %llvm.amdgcn.function.lds.t undef, align 8
+
+; Use in func rewritten to access struct at address zero, which prints as null
+; CHECK-LABEL: @func()
+; CHECK: %dec = atomicrmw fsub float addrspace(3)* null, float 1.0
+; CHECK: %val0 = load i32, i32 addrspace(3)* getelementptr (%llvm.amdgcn.function.lds.t, %llvm.amdgcn.function.lds.t addrspace(3)* null, i32 0, i32 2), align 4
+; CHECK: %val1 = add i32 %val0, 4
+; CHECK: store i32 %val1, i32 addrspace(3)* getelementptr (%llvm.amdgcn.function.lds.t, %llvm.amdgcn.function.lds.t addrspace(3)* null, i32 0, i32 2), align 4
+; CHECK: %unused0 = atomicrmw add i64 addrspace(3)* @with_init, i64 1 monotonic
+define void @func() {
+  %dec = atomicrmw fsub float addrspace(3)* @var0, float 1.0 monotonic
+  %val0 = load i32, i32 addrspace(3)* @var1, align 4
+  %val1 = add i32 %val0, 4
+  store i32 %val1, i32 addrspace(3)* @var1, align 4
+  %unused0 = atomicrmw add i64 addrspace(3)* @with_init, i64 1 monotonic
+  ret void
+}
+
+; This kernel calls a function that uses LDS so needs the block
+; CHECK-LABEL: @kern_call()
+; CHECK: call void asm sideeffect "// Alloc function lds block", "s"(%llvm.amdgcn.function.lds.t addrspace(3)* @llvm.amdgcn.function.lds)
+; CHECK: call void @func()
+; CHECK: %dec = atomicrmw fsub float addrspace(3)* null, float 2.0
+define amdgpu_kernel void @kern_call() {
+  call void @func()
+  %dec = atomicrmw fsub float addrspace(3)* @var0, float 2.0 monotonic
+  ret void
+}
+
+; This kernel makes no calls so does not need the LDS block, but the pass
+; conservatively marks every kernel
+; CHECK-LABEL: @kern_empty()
+; CHECK: call void asm sideeffect "// Alloc function lds block", "s"(%llvm.amdgcn.function.lds.t addrspace(3)* @llvm.amdgcn.function.lds)
+define spir_kernel void @kern_empty() {
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll
@@ -1,5 +1,5 @@
 ; RUN: opt -S -disable-promote-alloca-to-vector -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-promote-alloca < %s | FileCheck -check-prefix=IR %s
-; RUN: llc -disable-promote-alloca-to-vector -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=ASM %s
+; RUN: llc -disable-promote-alloca-to-vector -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-disable-lower-function-lds=true < %s | FileCheck -check-prefix=ASM %s
 
 target datalayout = "A5"