diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -336,6 +336,14 @@ void initializeGCNNSAReassignPass(PassRegistry &); extern char &GCNNSAReassignID; +ModulePass *createAMDGPULowerLDSGlobalPass(); +void initializeAMDGPULowerLDSGlobalPass(PassRegistry &); +extern char &AMDGPULowerLDSGlobalID; +struct AMDGPULowerLDSGlobalPass : PassInfoMixin { + AMDGPULowerLDSGlobalPass() {} + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); +}; + namespace AMDGPU { enum TargetIndex { TI_CONSTDATA_START, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp @@ -117,13 +117,15 @@ // should only appear when IPO passes manages to move LDs defined in a kernel // into a single user function. - for (GlobalVariable &GV : M.globals()) { - // TODO: Region address - unsigned AS = GV.getAddressSpace(); - if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS) - continue; - - recursivelyVisitUsers(GV, FuncsToAlwaysInline); + if (!AMDGPUTargetMachine::EnableLDSGlobalLowering) { + for (GlobalVariable &GV : M.globals()) { + // TODO: Region address + unsigned AS = GV.getAddressSpace(); + if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS) + continue; + + recursivelyVisitUsers(GV, FuncsToAlwaysInline); + } } if (!AMDGPUTargetMachine::EnableFunctionCalls || StressCalls) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerLDSGlobal.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerLDSGlobal.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerLDSGlobal.cpp @@ -0,0 +1,1056 @@ +//===-- AMDGPULowerLDSGlobal.cpp ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// TODO +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/CallGraph.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/ValueMap.h" +#include "llvm/InitializePasses.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include +#include + +#define DEBUG_TYPE "amdgpu-lower-lds-global" + +using namespace llvm; + +// Helper function around `ValueMap` to detect if an element exists within it. +template +static bool contains(R &&VMap, const E &Element) { + return VMap.find(Element) != VMap.end(); +} + +// Helper function to get the required alignment for the input LDS variable +// `GV`. +static Align getAlign(GlobalVariable *GV) { + return GV->getAlign().getValueOr( + GV->getParent()->getDataLayout().getPreferredAlign(GV)); +} + +namespace { + +class LowerLDSGlobalImpl { + Module &M; + LLVMContext &Ctx; + const DataLayout &DL; + CallGraph CG; + Twine PrefixStr; + + // Holds kernels defined within the module `M`. + SmallPtrSet Kernels; + + // Holds LDS globals defined within the module `M`. + SmallPtrSet LDSGlobals; + + // Holds call graph nodes associated with the functions whose addresses are + // taken within the module. + SmallPtrSet AddressTakenSet; + + // Associates LDS global to a list of functions which references that LDS. + ValueMap> LDSGlobalToAccessors; + + // Associates function to a list of LDS globals which are referenced within + // that function. 
+  ValueMap> AccessorToLDSGlobals;
+
+  // Associates kernel to a list of functions which are reachable from that
+  // kernel.
+  ValueMap> KernelToCallees;
+
+  // Associates kernel to a list of LDS globals which are referenced along the
+  // call graph paths associated with that kernel.
+  ValueMap> KernelToLDSGlobals;
+
+  // Associates kernel to a newly created per kernel LDS layout.
+  ValueMap KernelToLDSLayout;
+
+  // Associates kernel to a map which maps LDS globals to corresponding offsets.
+  ValueMap>
+      KernelToLDSToOffset;
+
+  // Associates kernel to instruction insertion point within that kernel.
+  ValueMap KernelToInstInsertPt;
+
+  // Holds pointers to LDS layouts.
+  GlobalVariable *LDSLayouts;
+
+  // Holds 2D LDS offset table.
+  GlobalVariable *LDSOffsetTable;
+
+  // Associates kernel to unique integer id.
+  std::map KernelToID;
+
+  // Associates LDS global to unique integer id.
+  std::map LDSToID;
+
+  // Associates an integer id uniquely to kernel.
+  std::map IDToKernel;
+
+  // Associates an integer id uniquely to LDS global.
+  std::map IDToLDS;
+
+public:
+  explicit LowerLDSGlobalImpl(Module &M)
+      : M(M), Ctx(M.getContext()), DL(M.getDataLayout()), CG(CallGraph(M)),
+        PrefixStr(Twine("lds.lower.")) {
+    // Collect the functions whose address is taken within the module.
+    collectAddressTakenFunctions();
+  }
+
+  // Entry-point function.
+  bool lower();
+
+private:
+  //===--------------------------------------------------------------------===//
+  // Methods which aid in creating new instructions which access LDS layouts.
+  //===--------------------------------------------------------------------===//
+
+  // Convert `ConstantExpr CE` to `Instruction`, and update users of `CE` to use
+  // `Instruction`.
+  Instruction *replaceConstExprByInst(ConstantExpr *CE);
+
+  // Within instruction `I` replace the use(s) of `LDS` by `LayoutAccessInst`.
+ void replaceInstWhichUsesLDS(Instruction *I, GlobalVariable *LDS, + Value *LayoutAccessInst, + SmallPtrSetImpl &ToBeErasedInsts); + + // Replace all the users of original LDS global `LDS` with new ones which + // access corresponding LDS layouts. + void replaceAllUsersOfLDS( + GlobalVariable *LDS, + ValueMap &FunctionToLDSLayoutAccessInst); + + // Add instructions to access LDS layouts corresponding to LDS global `LDS`. + bool accessLDSLayouts( + GlobalVariable *LDS, + ValueMap &FunctionToLDSLayoutAccessInst); + + // Add instructions within kernels to initialize LDS layouts array to hold the + // starting address of LDS layouts. + void initializeArrayOfLDSLayouts(); + + // Insert various required instructions within kernels and within callees + // which aid accessing of newly inserted LDS layouts. + void insertNewInstructions(); + + //===--------------------------------------------------------------------===// + // Methods which aid in creating new globals like kernel specific LDS layouts, + // 2D LDS offset table, etc. + //===--------------------------------------------------------------------===// + + // Create 2D LDS offset table which will be referenced at run time to access + // the LDS specific offset within kernel specific LDS layout. + void constructLDSOffsetTable(); + + // Create 1D array which holds LDS layouts which will be referenced at run + // time to access kernel specific LDS layout. + void constructArrayOfLDSLayouts(); + + // Create a new LDS layout corresponding to kernel `K` whose size will be of + // `LayoutSize`. + GlobalVariable *constructLDSLayout(Function *K, uint32_t LayoutSize); + + // Given a set of LDS globals, associate each LDS from the set to a + // corresponding offset within the kernel specific LDS layout. + uint32_t computeOffsets(SmallPtrSetImpl &LDSSet, + std::map &LDSToOffSet); + + // Create unified LDS layouts for each kernel. 
+ void constructLDSLayouts(); + + // Insert various required globals (including per kernel specific unified LDS + // layouts) into the module. + void insertNewGlobals(); + + //===--------------------------------------------------------------------===// + // Methods which aid in preparing for lowering. + //===--------------------------------------------------------------------===// + + // Collect all call graph paths from kernel `K` to callee `Callee`. + void collectCallGraphPaths(Function *K, Function *Callee, + SmallVectorImpl> &CGPaths); + + // TODO: What is the neat way to implement this functionality? + void propogateKernelId(); + + // Associate kernel and LDS global to unique ids and vice versa. + void associateKernelAndLDSGlobalToUniqueId(); + + // Associate kernels and LDS globals with unique integer id, and implement a + // mechanism to access the kernel ids within callees at run time. + void prepareForLowering(); + + //===--------------------------------------------------------------------===// + // Methods which aid in creating the various `map` data structures. + //===--------------------------------------------------------------------===// + + // Associate each kernel K with LDS globals which are being accessed by K + // and/or by the callees of K. + void createKernelToLDSGlobalsMap(); + + // Traverse through the call graph nodes associated with the callees of + // current caller, and push them into stack. + void pushCallGraphNodes(CallGraphNode *CGNode, + SmallVectorImpl &CGNodeStack, + SmallVectorImpl &CallBaseStack); + + // The call site associated with `CGNode` is a "direct call site", and is + // associated with a single callee, say,`Callee` represented by `CGNode`. Add + // `Callee` to `CalleeSet`, and push callees of `Callee` to `CGNodeStack` to + // further explore DFS search. 
+ void collectCalleeAssociatedWithDirectCallSite( + CallGraphNode *CGNode, SmallVectorImpl &CGNodeStack, + SmallVectorImpl &CallBaseStack, + SmallPtrSetImpl &CalleeSet); + + // The call site `CB` is an "indirect call site". Resolve `CB` to a set of + // potential callees. + void AssociateIndirectCallSiteWithPotentialCallees( + CallGraphNode *CGNode, CallBase *CB, + SmallVectorImpl &CGNodeStack, + SmallVectorImpl &CallBaseStack, + SmallPtrSetImpl &CalleeSet); + + // Collect callee(s) associated with call site `CB`. If `CB` is a `direct` + // call site, then there is only one callee associated with it, collect it. If + // `CB` is an `indirect` call site, then potentially there could be more than + // one callee associated with it, collect all of them. + bool collectCallees(CallGraphNode *CGNode, CallBase *CB, + SmallVectorImpl &CGNodeStack, + SmallVectorImpl &CallBaseStack, + SmallPtrSet &VisitedIndirectCallSites, + SmallPtrSetImpl &CalleeSet); + + // Traverse `CallGraph` starting from the `CallGraphNode` associated with each + // kernel `K` in DFS manner and collect all the callees which are reachable + // from K (including indirectly called callees). + void createKernelToCalleesMap(); + + // Associate each kernel/function with the LDS globals which are being + // accessed within them. + void createAccessorToLDSGlobalsMap(); + + // For each `LDS`, recursively visit its user list and find all those + // kernels/functions within which the `LDS` is being accessed. + void createLDSGlobalToAccessorsMap(); + + // For each kernel `K`, collect LDS globals which are being accessed during + // the execution of `K`. + bool collectPerKernelAccessibleLDSGlobals(); + + //===--------------------------------------------------------------------===// + // Methods which aid in creating the various `set` data structures. + //===--------------------------------------------------------------------===// + + // Collect all the amdgpu kernels defined within the current module. 
+ bool collectKernels(); + + // Collect all the (static) LDS globals defined within the current module. + bool collectLDSGlobals(); + + // Collect functions whose address is taken within the module. + void collectAddressTakenFunctions(); +}; + +// Convert `ConstantExpr CE` to `Instruction`, and update users of `CE` to use +// `Instruction`. +Instruction *LowerLDSGlobalImpl::replaceConstExprByInst(ConstantExpr *CE) { + // FIXME: This looks like a hack to me, but, is there any better way of + // handling `ConstantExprs`? I have no idea at the moment, need to revisit it + // later. + for (auto *U : CE->users()) { + auto *I = dyn_cast(U); + + if (!I) + I = replaceConstExprByInst(dyn_cast(U)); + + if (I) { + auto *NI = CE->getAsInstruction(); + NI->insertBefore(I); + unsigned Ind = 0; + for (Use &UU : I->operands()) { + Value *V = UU.get(); + if (V == CE) + I->setOperand(Ind, NI); + ++Ind; + } + return NI; + } + } + + return nullptr; +} + +// Within instruction `I` replace the use(s) of `LDS` by `LayoutAccessInst`. +void LowerLDSGlobalImpl::replaceInstWhichUsesLDS( + Instruction *I, GlobalVariable *LDS, Value *LayoutAccessInst, + SmallPtrSetImpl &ToBeErasedInsts) { + // Create clone of `I`, say, it is `NewI`. Within `NewI`, replace the use(s) + // of `LDS` by `LayoutAccessor`. + // + // FIXME: Instruction cloning is not required, fix it. + Instruction *NewI = I->clone(); + unsigned Ind = 0; + for (Use &UU : NewI->operands()) { + Value *V = UU.get(); + if (V == LDS) + NewI->setOperand(Ind, LayoutAccessInst); + ++Ind; + } + + // Insert `NewI` just before `I`, replace all uses of `I` by `NewI` and mark + // `I` as `to be erased` instruction. + NewI->insertBefore(I); + NewI->copyMetadata(*I); + I->replaceAllUsesWith(NewI); + ToBeErasedInsts.insert(I); +} + +// Replace all the users of original LDS global `LDS` with new ones which access +// corresponding LDS layouts. 
+void LowerLDSGlobalImpl::replaceAllUsersOfLDS( + GlobalVariable *LDS, + ValueMap &FunctionToLDSLayoutAccessInst) { + // Keep track of all the erased to be instructions. + SmallPtrSet ToBeErasedInsts; + + // Traverse through each use `U` of `LDS`, create a new one which replaces `U` + // and accordingly replace `U`. + for (auto *U : LDS->users()) { + // `U` may be using `LDS`, but 'U` itself is not used anywhere, ignore `U`. + if (!U->getNumUses()) + continue; + + if (auto *I = dyn_cast(U)) { + // User is an Instruction, accordingly handle it. + auto *LayoutAccessInst = + FunctionToLDSLayoutAccessInst[I->getParent()->getParent()]; + replaceInstWhichUsesLDS(I, LDS, LayoutAccessInst, ToBeErasedInsts); + } else if (auto *CE = dyn_cast(U)) { + // User is a ConstantExpr, accordingly handle it. + auto *I = replaceConstExprByInst(CE); + auto *LayoutAccessInst = + FunctionToLDSLayoutAccessInst[I->getParent()->getParent()]; + replaceInstWhichUsesLDS(I, LDS, LayoutAccessInst, ToBeErasedInsts); + CE->removeDeadConstantUsers(); + } else + llvm_unreachable("Not Implemented."); // TODO: What else is missing? + } + + // Erase all the instructions which are got replaced by new ones. + for (auto *I : ToBeErasedInsts) + I->eraseFromParent(); +} + +// Add instructions to access LDS layouts corresponding to LDS global `LDS`. +bool LowerLDSGlobalImpl::accessLDSLayouts( + GlobalVariable *LDS, + ValueMap &FunctionToLDSLayoutAccessInst) { + // LDS global is not used anywhere? ignore it. + if (!contains(LDSGlobalToAccessors, LDS)) + return false; + + for (auto *F : LDSGlobalToAccessors[LDS]) { + // Get instruction insertion point. + Instruction *EI = AMDGPU::isModuleEntryFunctionCC(F->getCallingConv()) + ? KernelToInstInsertPt[F] + : &(*F->getEntryBlock().getFirstInsertionPt()); + + // Get indices to access LDS offset table. + // FIXME: explicit cast + Value *KI = AMDGPU::isModuleEntryFunctionCC(F->getCallingConv()) + ? 
(Value *)Constant::getIntegerValue(
+                        Type::getInt32Ty(Ctx), APInt(32, KernelToID[F]))
+                    : (Value *)F->getArg(F->arg_size() - 1);
+    Value *LI = Constant::getIntegerValue(Type::getInt32Ty(Ctx),
+                                          APInt(32, LDSToID[LDS]));
+
+    // Insert GEP instruction to access the LDS offset table at the address
+    // ((LDSOffsetTable + KI) + LI) which is of type `i32*`.
+    Value *Indices1[] = {Constant::getNullValue(Type::getInt32Ty(Ctx)), KI, LI};
+    Instruction *GEPI1 = GetElementPtrInst::CreateInBounds(
+        LDSOffsetTable->getValueType(), LDSOffsetTable, Indices1,
+        PrefixStr + Twine("gep."), EI);
+
+    // Insert LOAD instruction to load the `offset` value from LDS offset table
+    // at the address ((LDSOffsetTable + KI) + LI) which is of type `i32`.
+    // FIXME: getPointerElementType() should not be used.
+    Instruction *LoadI = new LoadInst(GEPI1->getType()->getPointerElementType(),
+                                      GEPI1, PrefixStr + Twine("load."), EI);
+
+    // Insert GEP instruction to access the starting address of LDS layout
+    // corresponding to kernel represented by the kernel id `KI` which is of
+    // type `i8*`.
+    Value *Indices2[] = {Constant::getNullValue(Type::getInt32Ty(Ctx)), KI};
+    Instruction *GEPI2 = GetElementPtrInst::CreateInBounds(
+        LDSLayouts->getValueType(), LDSLayouts, Indices2,
+        PrefixStr + Twine("gep."), EI);
+
+    // Insert GEP instruction to access the base address corresponding to LDS
+    // global `LDS` within above LDS layout which is of type 'i8*`.
+    // FIXME: getPointerElementType() should not be used.
+    Instruction *GEPI3 = GetElementPtrInst::CreateInBounds(
+        GEPI2->getType()->getPointerElementType(), GEPI2, LoadI,
+        PrefixStr + Twine("gep."), EI);
+
+    // Insert BITCAST instruction to cast `above base address` from 'i8*` to
+    // `ldstype*` where `ldstype` is the type of original LDS global `LDS`.
+    Instruction *CastI =
+        new BitCastInst(GEPI3, LDS->getType(), PrefixStr + Twine("cast."), EI);
+
+    // Save the above instruction `CastI` which replaces all uses of `LDS`
+    // within `F`.
+ FunctionToLDSLayoutAccessInst[F] = CastI; + } + + return true; +} + +// Add instructions within kernels to initialize LDS layouts array to hold the +// starting address of LDS layouts. +void LowerLDSGlobalImpl::initializeArrayOfLDSLayouts() { + for (auto *K : Kernels) { + // Kernel `K` does not associate with LDS layout? skip it. + if (!contains(KernelToLDSLayout, K)) + continue; + + // Insert instructions at the beginning of the entry basic block of the + // kernel `K` which initialize the global array `LDSLayouts` to hold the + // starting address of the LDS layout associated with `K` at LDSLayouts[ID] + // where ID is kernel id of `K`. + Instruction *EI = &(*(K->getEntryBlock().getFirstInsertionPt())); + + // Insert GEP instruction which access the address `LDSLayout + 0`, say, the + // result is `GEP1` which is of type `i8*`. + GlobalVariable *LDSLayout = KernelToLDSLayout[K]; + Value *Indices1[] = { + Constant::getNullValue(Type::getInt32Ty(Ctx)), + Constant::getIntegerValue(Type::getInt32Ty(Ctx), APInt(32, 0))}; + Instruction *GEP1 = GetElementPtrInst::CreateInBounds( + LDSLayout->getValueType(), const_cast(LDSLayout), + Indices1, PrefixStr + Twine("gep."), EI); + + // Insert GEP instruction which access the address `LDSLayouts + ID`, say, + // the result is `GEP2` which is of type `i8**`. + Value *Indices2[] = {Constant::getNullValue(Type::getInt32Ty(Ctx)), + Constant::getIntegerValue(Type::getInt32Ty(Ctx), + APInt(32, KernelToID[K]))}; + Instruction *GEP2 = GetElementPtrInst::CreateInBounds( + LDSLayouts->getValueType(), const_cast(LDSLayouts), + Indices2, PrefixStr + Twine("gep."), EI); + + // Insert STORE instruction which stores `GEP1` at `GEP2`. + new StoreInst(GEP1, GEP2, EI); + + // Save the instruction insertion point which will be later required when + // it is necessary to insert LDS layout access instructions within kernel. 
+ KernelToInstInsertPt[K] = EI; + } +} + +// Insert various required instructions within kernels and within callees which +// aid in accessing of newly inserted LDS layouts. +void LowerLDSGlobalImpl::insertNewInstructions() { + // Add instructions within kernels to initialize LDS layouts array to hold the + // starting address of LDS layouts. + initializeArrayOfLDSLayouts(); + + // Add instructions to access LDS layouts and accordingly replace original + // instructions which use original LDS globals. + for (auto *LDS : LDSGlobals) { + ValueMap FunctionToLDSLayoutAccessInst; + + // LDS global is not used anywhere? ignore it. + if (!accessLDSLayouts(LDS, FunctionToLDSLayoutAccessInst)) + continue; + + // Replace all the users of LDS global `LDS` with new ones which access + // corresponding LDS layouts. + replaceAllUsersOfLDS(LDS, FunctionToLDSLayoutAccessInst); + } +} + +// Create 2D LDS offset table which will be referenced at run time to access the +// LDS specific offset within kernel specific LDS layout. +void LowerLDSGlobalImpl::constructLDSOffsetTable() { + // Get type of 2D LDS offset table. + auto *EleTy = Type::getInt32Ty(Ctx); + auto *Arr1DTy = ArrayType::get(EleTy, LDSGlobals.size()); + auto *Arr2DTy = ArrayType::get(Arr1DTy, Kernels.size()); + + // Create offset initialization list. + SmallVector Init2DValues; + + for (unsigned K = 0; K < Kernels.size(); ++K) { + SmallVector Init1DValues; + auto *Kernel = IDToKernel[K]; + + for (unsigned L = 0; L < LDSGlobals.size(); ++L) { + auto *LDS = IDToLDS[L]; + auto Offset = contains(KernelToLDSToOffset, Kernel) + ? contains(KernelToLDSToOffset[Kernel], LDS) + ? 
KernelToLDSToOffset[Kernel][LDS] + : -1 + : -1; + auto *C = Constant::getIntegerValue(EleTy, APInt(32, Offset)); + Init1DValues.push_back(C); + } + + auto *Const1D = ConstantArray::get(Arr1DTy, Init1DValues); + Init2DValues.push_back(Const1D); + } + + auto *Const2D = ConstantArray::get(Arr2DTy, Init2DValues); + + // Create 2D LDS offset table which is initialized with offsets. + LDSOffsetTable = new GlobalVariable( + M, Arr2DTy, false, GlobalValue::InternalLinkage, Const2D, + Twine("__llvm.amdgcn.lds.offset.table__"), nullptr, + GlobalVariable::NotThreadLocal, AMDGPUAS::CONSTANT_ADDRESS); + + // Set proper alignment. + LDSOffsetTable->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + LDSOffsetTable->setAlignment(getAlign(LDSOffsetTable)); +} + +// Create 1D array which holds LDS layouts which will be referenced at run time +// to access kernel specific LDS layout. +void LowerLDSGlobalImpl::constructArrayOfLDSLayouts() { + auto *EleTy = + PointerType::get(IntegerType::get(Ctx, 8), AMDGPUAS::LOCAL_ADDRESS); + auto *ArrTy = ArrayType::get(EleTy, Kernels.size()); + + LDSLayouts = new GlobalVariable( + M, ArrTy, false, GlobalValue::InternalLinkage, UndefValue::get(ArrTy), + Twine("__llvm.amdgcn.unified.lds.layouts__"), nullptr, + GlobalVariable::NotThreadLocal, AMDGPUAS::CONSTANT_ADDRESS); + + LDSLayouts->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + LDSLayouts->setAlignment(getAlign(LDSLayouts)); +} + +// Create a new LDS layout corresponding to kernel `K` whose size will be of +// `LayoutSize`. 
+GlobalVariable *LowerLDSGlobalImpl::constructLDSLayout(Function *K, + uint32_t LayoutSize) { + auto LayoutName = Twine("__llvm.amdgcn.unified.lds.layout.") + + Twine(KernelToID[K]) + Twine("__"); + auto *LayoutType = ArrayType::get(IntegerType::get(Ctx, 8), LayoutSize); + + auto *LDSLayout = new GlobalVariable( + M, LayoutType, false, GlobalValue::InternalLinkage, + UndefValue::get(LayoutType), LayoutName, nullptr, + GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS); + + LDSLayout->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + LDSLayout->setAlignment(getAlign(LDSLayout)); + + return LDSLayout; +} + +// Given a set of LDS globals, associate each LDS from the set to a +// corresponding offset within the kernel specific LDS layout. +uint32_t LowerLDSGlobalImpl::computeOffsets( + SmallPtrSetImpl &LDSSet, + std::map &LDSToOffSet) { + // Sort `LDSSet` by alignment in descending order to minimize the padding + // required. On ties, sort by type allocation size, again in descending order, + // and finally, by name in lexicographical order (in ascending order) when + // type allocation sizes are same. + std::vector SortedLDSSet(LDSSet.begin(), LDSSet.end()); + llvm::stable_sort(SortedLDSSet, + [&](GlobalVariable *LHS, GlobalVariable *RHS) -> bool { + auto ALHS = getAlign(LHS); + auto ARHS = getAlign(RHS); + if (ALHS != ARHS) + return ALHS > ARHS; + + auto SLHS = DL.getTypeAllocSize(LHS->getValueType()); + auto SRHS = DL.getTypeAllocSize(RHS->getValueType()); + if (SLHS != SRHS) + return SLHS > SRHS; + + return LHS->getName() < RHS->getName(); + }); + + // Compute `offset` for each LDS global from `SortedLDSSet`. + // FIXME: Alignment is not yet handled properly. + uint32_t CurOffset = 0; + for (auto *LDS : SortedLDSSet) { + LDSToOffSet[LDS] = CurOffset; + CurOffset += DL.getTypeAllocSize(LDS->getValueType()).getFixedValue(); + } + + return CurOffset; +} + +// Create unified LDS layouts for each kernel. 
+void LowerLDSGlobalImpl::constructLDSLayouts() { + // Create a new LDS layout corresponding to each kernel `K`, and associate + // each kernel to a map which maps LDS globals to corresponding offsets. + for (auto *K : Kernels) { + // Kernel `K` does not associate with any LDS global? skip it. + if (!contains(KernelToLDSGlobals, K)) + continue; + + // For the set of LDS globals associated with kernel `K`, map each LDS + // from the set to a corresponding offset within the kernel specific LDS + // layout, and also get the grand total size of LDS layout. + std::map LDSToOffSet; + auto LayoutSize = computeOffsets(KernelToLDSGlobals[K], LDSToOffSet); + KernelToLDSToOffset[K] = LDSToOffSet; + + // Create a new LDS layout corresponding to kernel `K` whose size will be of + // `LayoutSize`. + KernelToLDSLayout[K] = constructLDSLayout(K, LayoutSize); + } +} + +// Insert various required globals (including per kernel specific unified LDS +// layouts) into the module. +void LowerLDSGlobalImpl::insertNewGlobals() { + // Create unified LDS layouts for each kernel. + constructLDSLayouts(); + + // Create 1D array which holds LDS layouts which will be referenced at run + // time to access kernel specific LDS layout. + constructArrayOfLDSLayouts(); + + // Create 2D LDS offset table which will be referenced at run time to access + // the LDS specific offset within kernel specific LDS layout. + constructLDSOffsetTable(); +} + +// TODO: What is the neat way to implement this functionality? +void LowerLDSGlobalImpl::propogateKernelId() { + // TODO +} + +// Associate kernel and LDS global to unique ids and vice versa. +void LowerLDSGlobalImpl::associateKernelAndLDSGlobalToUniqueId() { + // Associate each kernel to unique integer and vice versa. + uint32_t KNum = 0; + for (auto *K : Kernels) { + KernelToID[K] = KNum; + IDToKernel[KNum] = K; + ++KNum; + } + + // Associate each LDS to unique integer and vice versa. 
+ uint32_t LNum = 0; + for (auto *LDSGlobal : LDSGlobals) { + LDSToID[LDSGlobal] = LNum; + IDToLDS[LNum] = LDSGlobal; + ++LNum; + } +} + +// Associate kernels and LDS globals with unique integer id, and implement a +// mechanism to access the kernel ids within callees at run time. +void LowerLDSGlobalImpl::prepareForLowering() { + // Associate kernel and LDS global to unique ids and vice versa. + associateKernelAndLDSGlobalToUniqueId(); + + // TODO: What is the neat way to implement this functionality? + propogateKernelId(); +} + +// Associate each kernel K with LDS globals which are being accessed by K and/or +// by the callees of K. +void LowerLDSGlobalImpl::createKernelToLDSGlobalsMap() { + for (auto *K : Kernels) { + SmallPtrSet LDSSet; + + // Collect all those LDS globals which are being accessed by kernel K + // itself. + if (contains(AccessorToLDSGlobals, K)) + LDSSet.insert(AccessorToLDSGlobals[K].begin(), + AccessorToLDSGlobals[K].end()); + + // Collect all those LDS globals which are being accessed by the callees of + // kernel K. + for (auto *Callee : KernelToCallees[K]) { + if (contains(AccessorToLDSGlobals, Callee)) + LDSSet.insert(AccessorToLDSGlobals[Callee].begin(), + AccessorToLDSGlobals[Callee].end()); + } + + if (!LDSSet.empty()) + KernelToLDSGlobals[K] = LDSSet; + } +} + +// Traverse through the call graph nodes associated with the callees of current +// caller, and push them into stack. +void LowerLDSGlobalImpl::pushCallGraphNodes( + CallGraphNode *CGNode, SmallVectorImpl &CGNodeStack, + SmallVectorImpl &CallBaseStack) { + for (auto GI = CGNode->begin(), GE = CGNode->end(); GI != GE; ++GI) { + auto *CGN = GI->second; + auto *CB = cast(GI->first.getValue()); + CGNodeStack.push_back(CGN); + CallBaseStack.push_back(CB); + } +} + +// The call site associated with `CGNode` is a "direct call site", and is +// associated with a single callee, say,`Callee` represented by `CGNode`. 
Add +// `Callee` to `CalleeSet`, and push callees of `Callee` to `CGNodeStack` to +// further explore DFS search. +void LowerLDSGlobalImpl::collectCalleeAssociatedWithDirectCallSite( + CallGraphNode *CGNode, SmallVectorImpl &CGNodeStack, + SmallVectorImpl &CallBaseStack, + SmallPtrSetImpl &CalleeSet) { + auto *Callee = CGNode->getFunction(); + assert(Callee && "Expected a valid callee associated with call site"); + assert(!Callee->isDeclaration() && + "Expected callee defintion since this pass assumes that caller and " + "callee appear in same translation unit"); + CalleeSet.insert(Callee); + pushCallGraphNodes(CGNode, CGNodeStack, CallBaseStack); +} + +// The call site `CB` is an "indirect call site". Resolve `CB` to a set of +// potential callees. +void LowerLDSGlobalImpl::AssociateIndirectCallSiteWithPotentialCallees( + CallGraphNode *CGNode, CallBase *CB, + SmallVectorImpl &CGNodeStack, + SmallVectorImpl &CallBaseStack, + SmallPtrSetImpl &CalleeSet) { + if (auto *MD = CB->getMetadata(LLVMContext::MD_callees)) { + // The metadata "!callee" is available at the indirect call site `CB`, which + // means, all the potential target callees for the call site `CB` is + // successfully resolved at compile time. So, push them into stack so that + // they will be handled just like direct callees when they are eventually + // poped out. + for (const auto &Op : MD->operands()) { + CGNodeStack.push_back(CG[mdconst::extract_or_null(Op)]); + CallBaseStack.push_back(CB); + } + } else { + // The metadata "!callee" is *NOT* available at the indirect call site `CB`, + // which means, `CB` has *NO* information about potential target callees. + // The simplest possible *SAFE* assumption that we can make here is to + // consider all those "address taken" functions whose singature matches with + // that of the call site `CB`, and assume that all these signature matched + // "address taken" functions are possible potential callees. 
Thus, push all
+    // the signature matched "address taken" functions into stack so that they
+    // will be handled just like direct callees when they are eventually popped
+    // out.
+    auto *CBFTy = CB->getFunctionType();
+    for (auto *CGN : AddressTakenSet) {
+      auto *F = CGN->getFunction();
+      assert(F && "Expected a valid address taken function");
+      auto *ADFTy = F->getFunctionType();
+      if (ADFTy == CBFTy) {
+        CGNodeStack.push_back(CGN);
+        CallBaseStack.push_back(CB);
+      }
+    }
+  }
+}
+
+// Collect callee(s) associated with call site `CB`. If `CB` is a `direct` call
+// site, then there is only one callee associated with it, collect it. If `CB`
+// is an `indirect` call site, then potentially there could be more than one
+// callee associated with it, collect all of them.
+bool LowerLDSGlobalImpl::collectCallees(
+    CallGraphNode *CGNode, CallBase *CB,
+    SmallVectorImpl &CGNodeStack,
+    SmallVectorImpl &CallBaseStack,
+    SmallPtrSet &VisitedIndirectCallSites,
+    SmallPtrSetImpl &CalleeSet) {
+  // Return `true` if the callees are actually collected.
+  bool CollectedCollees = true;
+
+  if (!CB->getCalledFunction()) {
+    // Call site `CB` is an indirect call site. But, if the `CGNode` has a
+    // function definition, say, `Callee`, associated with it, which means, we
+    // have already had visited `CB` earlier, and we had resolved it to a set of
+    // potential callees, and `Callee` is one among them. Collect `Callee` just
+    // the way direct callee is collected. Otherwise, `CB` is encountered for
+    // first time. Resolve it to a set of potential callees before collecting
+    // them.
+    if (!VisitedIndirectCallSites.contains(CB)) {
+      // Indirect call site `CB` is encountered first time, resolve it to a set
+      // of potential callees.
+      AssociateIndirectCallSiteWithPotentialCallees(CGNode, CB, CGNodeStack,
+                                                    CallBaseStack, CalleeSet);
+
+      // Mark that `CB` is visited.
+ VisitedIndirectCallSites.insert(CB); + + // We are only resolved indirect call site `CB` to a set of potential + // callees here, and they are not collected yet. + CollectedCollees = false; + } else { + // Indirect call site `CB` is already been resolved to a set of potential + // callees during its first visit. The callee represented by `CGNode` is + // one among them. Collect it just the way direct callee is collected. + collectCalleeAssociatedWithDirectCallSite(CGNode, CGNodeStack, + CallBaseStack, CalleeSet); + } + } else { + // Call site `CB` is a direct call site. Collect a single callee which is + // associated with it. + collectCalleeAssociatedWithDirectCallSite(CGNode, CGNodeStack, + CallBaseStack, CalleeSet); + } + + return CollectedCollees; +} + +// Traverse `CallGraph` starting from the `CallGraphNode` associated with each +// kernel `K` in DFS manner and collect all the callees which are reachable from +// K (including indirectly called callees). +// +// FIXME: Can we "really" get rid of this function, and is it that llvm +// "CallGraph" infra structure already provides this functionality, especially, +// by including indirect callees within SCC? +void LowerLDSGlobalImpl::createKernelToCalleesMap() { + for (auto *K : Kernels) { + auto *KernCGNode = CG[K]; + SmallVector CGNodeStack; + SmallVector CallBaseStack; + SmallPtrSet VisitedCGNodes; + SmallPtrSet VisitedIndirectCallSites; + SmallPtrSet CalleeSet; + + // Push the `CallGraphNode` associated with all the callees of the kernel`K` + // into into `CGNodeStack`, and the corresponding call sites into + // `CallBaseStack`. + pushCallGraphNodes(KernCGNode, CGNodeStack, CallBaseStack); + + // Continue DFS search until no more call graph nodes to handle. 
+ while (!CGNodeStack.empty()) { + assert(CGNodeStack.size() == CallBaseStack.size() && + "Stack holding CallBase pointers is currupted"); + auto *CGNode = CGNodeStack.pop_back_val(); + auto *CB = CallBaseStack.pop_back_val(); + + // `CGNode` is already visited and handled, ignore it and proceed to next + // one. + if (!VisitedCGNodes.insert(CGNode).second) + continue; + + // Collect callee(s) associated with call site `CB`. If `CB` is a `direct` + // call site, then there is only one callee associated with it, collect + // it. If `CB` is an `indirect` call site, then potentially there could be + // more than one callee associated with it, collect all of them. + auto CallSiteHandledIsIndirect = + collectCallees(CGNode, CB, CGNodeStack, CallBaseStack, + VisitedIndirectCallSites, CalleeSet); + + // The call site `CB` which is being handled just now is an indirect call + // site. Since the indirect call site does not bind to any particular set + // of callees, and since the `CGNode` is one and the same for all the + // function pointers which have same signature, we should *NOT* assume + // that `CGNode` is visited unlike in case of direct call site. + if (CallSiteHandledIsIndirect) + VisitedCGNodes.erase(CGNode); + } + + assert(CallBaseStack.empty() && + "Stack holding CallBase pointers is currupted"); + + KernelToCallees[K] = CalleeSet; + } +} + +// Associate each kernel/function with the LDS globals which are being accessed +// within them. 
+void LowerLDSGlobalImpl::createAccessorToLDSGlobalsMap() { + for (auto LI = LDSGlobalToAccessors.begin(), LE = LDSGlobalToAccessors.end(); + LI != LE; ++LI) { + auto *LDS = LI->first; + for (auto *A : LI->second) { + if (!contains(AccessorToLDSGlobals, A)) { + SmallPtrSet LDSSet; + LDSSet.insert(LDS); + AccessorToLDSGlobals[A] = LDSSet; + } else + AccessorToLDSGlobals[A].insert(LDS); + } + } +} + +// For each `LDS`, recursively visit its user list and find all those +// kernels/functions within which the `LDS` is being accessed. +void LowerLDSGlobalImpl::createLDSGlobalToAccessorsMap() { + for (auto *LDS : LDSGlobals) { + assert(!LDS->user_empty() && + "LDS user list cannot be empty since it must have been successfully " + "defined within either kernel or function"); + + SmallPtrSet LDSAccessors; + SmallVector UserStack(LDS->users()); + SmallPtrSet VisitedUsers; + + while (!UserStack.empty()) { + auto *U = UserStack.pop_back_val(); + if (!VisitedUsers.insert(U).second) + continue; + + auto *I = dyn_cast(U); + + // If `U` is not an `Instruction`, then it should be a `Constant` which is + // nested within an `Instruction`. Push-back users of `U`, and continue + // further exploring the stack until an `Instruction` is found. + if (!I) { + assert(isa(U) && "Expected a constant expression"); + append_range(UserStack, U->users()); + continue; + } + + // We have successfully found a kernel/function within which the `LDS` is + // being accessed, insert it into `LDSAccessors` set. + LDSAccessors.insert(I->getParent()->getParent()); + } + + LDSGlobalToAccessors[LDS] = LDSAccessors; + } +} + +// For each kernel `K`, collect LDS globals which are being accessed during the +// execution of `K`. +bool LowerLDSGlobalImpl::collectPerKernelAccessibleLDSGlobals() { + // Associate each LDS with the kernels/functions within which the LDS is being + // accessed. 
createLDSGlobalToAccessorsMap();
+
+  // Associate each kernel/function with the LDS globals which are being
+  // accessed within them.
+  createAccessorToLDSGlobalsMap();
+
+  // Associate each kernel K with callees which are reachable from K (including
+  // indirectly called callees).
+  createKernelToCalleesMap();
+
+  // Associate each kernel K with LDS globals which are being accessed by K
+  // and/or by the callees of K.
+  createKernelToLDSGlobalsMap();
+
+  // If *none* of the kernels is associated with any LDS globals, then there is
+  // nothing to do.
+  return !KernelToLDSGlobals.empty();
+}
+
+// Collect all the amdgpu kernels defined within the current module.
+bool LowerLDSGlobalImpl::collectKernels() {
+  for (auto &F : M.functions()) {
+    if (!F.isDeclaration() &&
+        AMDGPU::isModuleEntryFunctionCC(F.getCallingConv()))
+      Kernels.insert(&F);
+  }
+
+  return !Kernels.empty();
+}
+
+// Collect all the (static) LDS globals defined within the current module.
+bool LowerLDSGlobalImpl::collectLDSGlobals() {
+  for (auto &GV : M.globals()) {
+    if (GV.getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
+        !GV.isDeclaration() && !GV.getType()->isEmptyTy())
+      LDSGlobals.insert(&GV);
+  }
+
+  return !LDSGlobals.empty();
+}
+
+// Collect functions whose address is taken within the module.
+void LowerLDSGlobalImpl::collectAddressTakenFunctions() {
+  auto *ExternalCallingNode = CG.getExternalCallingNode();
+  assert(ExternalCallingNode &&
+         "Call graph node associated with kernel/function definition cannot be "
+         "null");
+
+  for (auto GI = ExternalCallingNode->begin(), GE = ExternalCallingNode->end();
+       GI != GE; ++GI) {
+    auto *CGN = GI->second;
+    assert(CGN &&
+           "Call graph node associated with kernel/function definition cannot "
+           "be null");
+    auto *F = CGN->getFunction();
+    // FIXME: Anything else that needs to be excluded?
+    if (!F || F->isDeclaration() ||
+        AMDGPU::isModuleEntryFunctionCC(F->getCallingConv()))
+      continue;
+    AddressTakenSet.insert(CGN);
+  }
+}
+
+// Entry-point function.
+bool LowerLDSGlobalImpl::lower() { + // If there are *no* LDS globals defined within the module, or if there are + // *no* kernels defined within the module, or if there exist *no* kernel + // *execution* which accesses LDS globals at run time, then nothing to do. + if (!collectLDSGlobals() || !collectKernels() || + !collectPerKernelAccessibleLDSGlobals()) + return false; + + // Associate kernels and LDS globals with unique integer id, and implement a + // mechanism to access the kernel ids within callees at run time. + prepareForLowering(); + + // Insert various required globals (including per kernel specific unified LDS + // layouts) into the module. + insertNewGlobals(); + + // Insert various required instructions within kernels and within callees + // which aid accessing of newly inserted LDS layouts. + insertNewInstructions(); + + return true; +} + +class AMDGPULowerLDSGlobal : public ModulePass { +public: + static char ID; + + AMDGPULowerLDSGlobal() : ModulePass(ID) { + initializeAMDGPULowerLDSGlobalPass(*PassRegistry::getPassRegistry()); + } + + bool runOnModule(Module &M) override; +}; + +} // namespace + +char AMDGPULowerLDSGlobal::ID = 0; +char &llvm::AMDGPULowerLDSGlobalID = AMDGPULowerLDSGlobal::ID; + +INITIALIZE_PASS(AMDGPULowerLDSGlobal, "amdgpu-lower-lds-global", + "Lower LDS Global Variables", false /*only look at the cfg*/, + false /*analysis pass*/) + +bool AMDGPULowerLDSGlobal::runOnModule(Module &M) { + LowerLDSGlobalImpl LDSLowerer{M}; + return LDSLowerer.lower(); +} + +ModulePass *llvm::createAMDGPULowerLDSGlobalPass() { + return new AMDGPULowerLDSGlobal(); +} + +PreservedAnalyses AMDGPULowerLDSGlobalPass::run(Module &M, + ModuleAnalysisManager &AM) { + LowerLDSGlobalImpl LDSLowerer{M}; + LDSLowerer.lower(); + return PreservedAnalyses::all(); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ 
b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -35,6 +35,7 @@ static bool EnableLateStructurizeCFG; static bool EnableFunctionCalls; static bool EnableFixedFunctionABI; + static bool EnableLDSGlobalLowering; AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, TargetOptions Options, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -193,6 +193,12 @@ cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true), cl::Hidden); +static cl::opt EnableLDSGlobalLowering( + "amdgpu-enable-lds-global-lowering", + cl::desc("Enable LDS global variable lowering pass"), + cl::location(AMDGPUTargetMachine::EnableLDSGlobalLowering), cl::init(true), + cl::Hidden); + extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { // Register the target RegisterTargetMachine X(getTheAMDGPUTarget()); @@ -259,6 +265,7 @@ initializeGCNRegBankReassignPass(*PR); initializeGCNNSAReassignPass(*PR); initializeSIAddIMGInitPass(*PR); + initializeAMDGPULowerLDSGlobalPass(*PR); } static std::unique_ptr createTLOF(const Triple &TT) { @@ -388,6 +395,7 @@ bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false; bool AMDGPUTargetMachine::EnableFunctionCalls = false; bool AMDGPUTargetMachine::EnableFixedFunctionABI = false; +bool AMDGPUTargetMachine::EnableLDSGlobalLowering = false; AMDGPUTargetMachine::~AMDGPUTargetMachine() = default; @@ -501,6 +509,10 @@ PM.addPass(AMDGPUAlwaysInlinePass()); return true; } + if (PassName == "amdgpu-lower-lds-global") { + PM.addPass(AMDGPULowerLDSGlobalPass()); + return true; + } return false; }); PB.registerPipelineParsingCallback( @@ -847,6 +859,12 @@ disablePass(&FuncletLayoutID); disablePass(&PatchableFunctionID); + // We expect to run this pass as a first AMDGPU IR pass so that new + // instructions being added in this pass can possibly 
undergo further + // transformations via subsequent passes. + if (EnableLDSGlobalLowering) + addPass(createAMDGPULowerLDSGlobalPass()); + addPass(createAMDGPUPrintfRuntimeBinding()); // This must occur before inlining, as the inliner will not look through diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -67,6 +67,7 @@ AMDGPULowerIntrinsics.cpp AMDGPULowerKernelArguments.cpp AMDGPULowerKernelAttributes.cpp + AMDGPULowerLDSGlobal.cpp AMDGPUMachineCFGStructurizer.cpp AMDGPUMachineFunction.cpp AMDGPUMachineModuleInfo.cpp