diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -340,6 +340,15 @@
 void initializeGCNNSAReassignPass(PassRegistry &);
 extern char &GCNNSAReassignID;
 
+ModulePass *createAMDGPULowerFunctionLocalLDSPass();
+void initializeAMDGPULowerFunctionLocalLDSPass(PassRegistry &);
+extern char &AMDGPULowerFunctionLocalLDSID;
+struct AMDGPULowerFunctionLocalLDSPass // PassInfoMixin-based pass wrapper.
+    : PassInfoMixin<AMDGPULowerFunctionLocalLDSPass> {
+  AMDGPULowerFunctionLocalLDSPass() = default;
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+};
+
 namespace AMDGPU {
 enum TargetIndex {
   TI_CONSTDATA_START,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
@@ -118,13 +118,15 @@
   // should only appear when IPO passes manages to move LDs defined in a kernel
   // into a single user function.
 
-  for (GlobalVariable &GV : M.globals()) {
-    // TODO: Region address
-    unsigned AS = GV.getAddressSpace();
-    if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS)
-      continue;
-
-    recursivelyVisitUsers(GV, FuncsToAlwaysInline);
+  if (!AMDGPUTargetMachine::EnableFunctionLocalLDSLowering) {
+    for (GlobalVariable &GV : M.globals()) {
+      // TODO: Region address
+      unsigned AS = GV.getAddressSpace();
+      if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS)
+        continue;
+
+      recursivelyVisitUsers(GV, FuncsToAlwaysInline);
+    }
   }
 
   if (!AMDGPUTargetMachine::EnableFunctionCalls || StressCalls) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerFunctionLocalLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerFunctionLocalLDS.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerFunctionLocalLDS.cpp
@@ -0,0 +1,84 @@
+//===-- AMDGPULowerFunctionLocalLDS.cpp -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// Pass skeleton for lowering LDS defined in non-kernel device functions.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ValueMap.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include <map>
+#include <set>
+
+#define DEBUG_TYPE "amdgpu-lower-function-local-lds"
+
+using namespace llvm;
+
+namespace {
+
+// Pass-manager-independent implementation; both pass wrappers delegate here.
+class LowerFunctionLocalLDSImpl {
+  Module &M; // Module to lower; non-const because lowering must rewrite it.
+
+public:
+  // Binds the implementation to the Module it will operate on.
+  explicit LowerFunctionLocalLDSImpl(Module &M) : M(M) {}
+
+  // Entry point function; the result is what runOnModule() reports back.
+  bool lower();
+};
+
+// Entry point function. Currently a stub that performs no lowering.
+bool LowerFunctionLocalLDSImpl::lower() {
+  // TODO: Implement the lowering; until then nothing is changed.
+  return false;
+}
+
+class AMDGPULowerFunctionLocalLDS : public ModulePass { // ModulePass wrapper.
+public:
+  static char ID; // Exported as llvm::AMDGPULowerFunctionLocalLDSID.
+
+  AMDGPULowerFunctionLocalLDS() : ModulePass(ID) {
+    initializeAMDGPULowerFunctionLocalLDSPass(*PassRegistry::getPassRegistry());
+  }
+  // Builds a LowerFunctionLocalLDSImpl for M and returns its lower() result.
+  bool runOnModule(Module &M) override;
+};
+
+} // namespace
+
+char AMDGPULowerFunctionLocalLDS::ID = 0;
+char &llvm::AMDGPULowerFunctionLocalLDSID = AMDGPULowerFunctionLocalLDS::ID;
+
+INITIALIZE_PASS(AMDGPULowerFunctionLocalLDS, "amdgpu-lower-function-local-lds",
+                "Lower LDS Defined Within AMDGPU Non-kernel Device Function",
+                false /*only look at the cfg*/, false /*analysis pass*/)
+
+bool AMDGPULowerFunctionLocalLDS::runOnModule(Module &M) {
+  // Forward to the implementation shared with the other pass wrapper.
+  return LowerFunctionLocalLDSImpl{M}.lower();
+}
+
+ModulePass *llvm::createAMDGPULowerFunctionLocalLDSPass() {
+  return new AMDGPULowerFunctionLocalLDS(); // ModulePass-based wrapper.
+}
+
+PreservedAnalyses
+AMDGPULowerFunctionLocalLDSPass::run(Module &M, ModuleAnalysisManager &AM) {
+  // Analyses are only preserved when lowering left the module untouched.
+  return LowerFunctionLocalLDSImpl{M}.lower() ? PreservedAnalyses::none()
+                                              : PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -34,6 +34,7 @@
   static bool EnableLateStructurizeCFG;
   static bool EnableFunctionCalls;
   static bool EnableFixedFunctionABI;
+  static bool EnableFunctionLocalLDSLowering;
 
   AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
                       StringRef FS, TargetOptions Options,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -193,6 +193,13 @@
     cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true),
     cl::Hidden);
 
+static cl::opt<bool, true> EnableFunctionLocalLDSLowering(
+    "amdgpu-enable-function-local-lds-lowering",
+    cl::desc(
+        "Enable lowering of LDS defined within non-kernel device function"),
+    cl::location(AMDGPUTargetMachine::EnableFunctionLocalLDSLowering),
+    cl::init(false), cl::Hidden);
+
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   // Register the target
   RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -260,6 +267,7 @@
   initializeGCNRegBankReassignPass(*PR);
   initializeGCNNSAReassignPass(*PR);
   initializeSIAddIMGInitPass(*PR);
+  initializeAMDGPULowerFunctionLocalLDSPass(*PR);
 }
 
 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -389,6 +397,7 @@
 bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
 bool AMDGPUTargetMachine::EnableFunctionCalls = false;
 bool AMDGPUTargetMachine::EnableFixedFunctionABI = false;
+bool AMDGPUTargetMachine::EnableFunctionLocalLDSLowering = false;
 
 AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
 
@@ -502,6 +511,10 @@
           PM.addPass(AMDGPUAlwaysInlinePass());
           return true;
         }
+        if (PassName == "amdgpu-lower-function-local-lds") {
+          PM.addPass(AMDGPULowerFunctionLocalLDSPass());
+          return true;
+        }
         return false;
       });
   PB.registerPipelineParsingCallback(
@@ -845,6 +858,12 @@
   disablePass(&FuncletLayoutID);
   disablePass(&PatchableFunctionID);
 
+  // We expect to run this pass as the first AMDGPU IR pass so that new
+  // instructions added by this pass can undergo further transformations in
+  // subsequent passes.
+  if (EnableFunctionLocalLDSLowering)
+    addPass(createAMDGPULowerFunctionLocalLDSPass());
+
   addPass(createAMDGPUPrintfRuntimeBinding());
 
   // This must occur before inlining, as the inliner will not look through
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -50,6 +50,7 @@
   AMDGPUAtomicOptimizer.cpp
   AMDGPUCallLowering.cpp
   AMDGPUCodeGenPrepare.cpp
+  AMDGPULowerFunctionLocalLDS.cpp
   AMDGPUExportClustering.cpp
   AMDGPUFixFunctionBitcasts.cpp
   AMDGPUFrameLowering.cpp