diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -341,6 +341,15 @@
 void initializeGCNNSAReassignPass(PassRegistry &);
 extern char &GCNNSAReassignID;
 
+ModulePass *createAMDGPUReplaceLDSUseWithPointerPass();
+void initializeAMDGPUReplaceLDSUseWithPointerPass(PassRegistry &);
+extern char &AMDGPUReplaceLDSUseWithPointerID;
+struct AMDGPUReplaceLDSUseWithPointerPass
+    : PassInfoMixin<AMDGPUReplaceLDSUseWithPointerPass> {
+  AMDGPUReplaceLDSUseWithPointerPass() {}
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+};
+
 namespace AMDGPU {
 enum TargetIndex {
   TI_CONSTDATA_START,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
@@ -116,14 +116,21 @@
   // OpenCL doesn't allow declaring LDS in non-kernels, so in practice this
   // should only appear when IPO passes manages to move LDs defined in a kernel
   // into a single user function.
-
-  for (GlobalVariable &GV : M.globals()) {
-    // TODO: Region address
-    unsigned AS = GV.getAddressSpace();
-    if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS)
-      continue;
-
-    recursivelyVisitUsers(GV, FuncsToAlwaysInline);
+  //
+  // Now that LDS uses within non-kernel functions are handled by the
+  // `LowerModuleLDS` pass, we no longer need to forcefully inline non-kernel
+  // functions just because they use LDS. Do forceful inlining only when the
+  // `LowerModuleLDS` pass is disabled. It is enabled by default.
+
+  if (!AMDGPUTargetMachine::EnableLowerModuleLDS) {
+    for (GlobalVariable &GV : M.globals()) {
+      // TODO: Region address
+      unsigned AS = GV.getAddressSpace();
+      if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS)
+        continue;
+
+      recursivelyVisitUsers(GV, FuncsToAlwaysInline);
+    }
   }
 
   if (!AMDGPUTargetMachine::EnableFunctionCalls || StressCalls) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -28,6 +28,7 @@
 #include "AMDGPU.h"
 #include "Utils/AMDGPUBaseInfo.h"
+#include "Utils/AMDGPUGeneralUtils.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
@@ -53,87 +54,13 @@
   return AMDGPU::isModuleEntryFunctionCC(Func->getCallingConv());
 }
 
-static Align getAlign(DataLayout const &DL, const GlobalVariable *GV) {
-  return DL.getValueOrABITypeAlignment(GV->getPointerAlignment(DL),
-                                       GV->getValueType());
-}
-
-static bool
-userRequiresLowering(const SmallPtrSetImpl<GlobalValue *> &UsedList,
-                     User *InitialUser) {
-  // Any LDS variable can be lowered by moving into the created struct
-  // Each variable so lowered is allocated in every kernel, so variables
-  // whose users are all known to be safe to lower without the transform
-  // are left unchanged.
-  SmallPtrSet<User *, 8> Visited;
-  SmallVector<User *, 16> Stack;
-  Stack.push_back(InitialUser);
-
-  while (!Stack.empty()) {
-    User *V = Stack.pop_back_val();
-    Visited.insert(V);
-
-    if (auto *G = dyn_cast<GlobalValue>(V->stripPointerCasts())) {
-      if (UsedList.contains(G)) {
-        continue;
-      }
-    }
-
-    if (auto *I = dyn_cast<Instruction>(V)) {
-      if (isKernelCC(I->getFunction())) {
-        continue;
-      }
-    }
-
-    if (auto *E = dyn_cast<ConstantExpr>(V)) {
-      for (Value::user_iterator EU = E->user_begin(); EU != E->user_end();
-           ++EU) {
-        if (Visited.insert(*EU).second) {
-          Stack.push_back(*EU);
-        }
-      }
-      continue;
-    }
-
-    // Unknown user, conservatively lower the variable
-    return true;
-  }
-
-  return false;
-}
-
 static std::vector<GlobalVariable *>
 findVariablesToLower(Module &M,
                      const SmallPtrSetImpl<GlobalValue *> &UsedList) {
   std::vector<GlobalVariable *> LocalVars;
   for (auto &GV : M.globals()) {
-    if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
-      continue;
-    }
-    if (!GV.hasInitializer()) {
-      // addrspace(3) without initializer implies cuda/hip extern __shared__
-      // the semantics for such a variable appears to be that all extern
-      // __shared__ variables alias one another, in which case this transform
-      // is not required
-      continue;
-    }
-    if (!isa<UndefValue>(GV.getInitializer())) {
-      // Initializers are unimplemented for local address space.
-      // Leave such variables in place for consistent error reporting.
-      continue;
-    }
-    if (GV.isConstant()) {
-      // A constant undef variable can't be written to, and any load is
-      // undef, so it should be eliminated by the optimizer. It could be
-      // dropped by the back end if not. This pass skips over it.
-      continue;
-    }
-    if (std::none_of(GV.user_begin(), GV.user_end(), [&](User *U) {
-          return userRequiresLowering(UsedList, U);
-        })) {
-      continue;
-    }
-    LocalVars.push_back(&GV);
+    if (isLDSLoweringRequired(&GV, UsedList, /*IsLDSLoweringPass=*/true))
+      LocalVars.push_back(&GV);
   }
   return LocalVars;
 }
@@ -217,20 +144,6 @@
                        "");
 }
 
-  static SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M) {
-    SmallPtrSet<GlobalValue *, 32> UsedList;
-
-    SmallVector<GlobalValue *, 32> TmpVec;
-    collectUsedGlobalVariables(M, TmpVec, true);
-    UsedList.insert(TmpVec.begin(), TmpVec.end());
-
-    TmpVec.clear();
-    collectUsedGlobalVariables(M, TmpVec, false);
-    UsedList.insert(TmpVec.begin(), TmpVec.end());
-
-    return UsedList;
-  }
-
 public:
   static char ID;
@@ -365,9 +278,16 @@
 char &llvm::AMDGPULowerModuleLDSID = AMDGPULowerModuleLDS::ID;
 
-INITIALIZE_PASS(AMDGPULowerModuleLDS, DEBUG_TYPE,
-                "Lower uses of LDS variables from non-kernel functions", false,
-                false)
+INITIALIZE_PASS_BEGIN(AMDGPULowerModuleLDS, DEBUG_TYPE,
+                      "Lower uses of LDS variables from non-kernel functions",
+                      false, false)
+// Before running the current LDS lowering pass, replace LDS uses within
+// non-kernel functions by pointers, so that the current pass minimizes
+// unnecessary per-kernel allocation of LDS memory.
+INITIALIZE_PASS_DEPENDENCY(AMDGPUReplaceLDSUseWithPointer)
+INITIALIZE_PASS_END(AMDGPULowerModuleLDS, DEBUG_TYPE,
+                    "Lower uses of LDS variables from non-kernel functions",
+                    false, false)
 
 ModulePass *llvm::createAMDGPULowerModuleLDSPass() {
   return new AMDGPULowerModuleLDS();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp
@@ -0,0 +1,758 @@
+//===-- AMDGPUReplaceLDSUseWithPointer.cpp --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// One of the memory types supported within the AMD GPU memory hierarchy is
+// `shared memory`, also called `Local Data Share`, or LDS for short. LDS is
+// the second fastest memory in the AMD GPU memory hierarchy (the register
+// file being the fastest). Being faster also means LDS memory is
+// comparatively costlier, and hence is a limited memory resource.
+//
+// Being global scoped, an LDS variable is accessible within kernel functions
+// and non-kernel functions, but two different kernel execution paths, say
+// called from two kernels K1 and K2, cannot access the same instance of an LDS
+// variable, say L. Both K1 and K2 have to own their own instances of L. This
+// poses some challenges, especially for lowering the LDS variables used within
+// non-kernel functions.
+//
+// So, the pass - "Lower Module LDS" lowers the LDS globals by packing them
+// within a struct type, and by creating an instance of that struct type within
+// every kernel at address zero. Though the pass - "Lower Module LDS" makes
+// some effort to minimize unnecessary LDS allocation, it is limited by the
+// fundamental assumptions upon which the pass is implemented.
+//
+// The current pass acts as a helping aid to the pass - "Lower Module LDS",
+// with the intention of minimizing unnecessary LDS allocation as much as
+// possible.
+//
+// The main idea behind the current pass is:
+//
+// (1) To identify the LDS globals used within non-kernel function scope and
+//     global scope,
+// (2) To push the use of all the above identified LDS globals to kernel
+//     function scope by initializing their addresses to newly created LDS
+//     global pointer variables (within kernel functions),
+// (3) To replace the uses of original LDS globals within non-kernel functions
+//     by their pointer counterparts.
+// (4) This way, the transformation makes sure that the pass "Lower Module LDS"
+//     packs only pointer variables within the struct type, and hence
+//     significantly minimizes unnecessary LDS allocation, especially when the
+//     original LDS globals are big arrays (the common LDS use case).
+//
+// NOTE: The pass - "Lower Module LDS" now has a tight dependency on the
+//       current pass, and the current pass should always be run before the
+//       pass "Lower Module LDS". Running the pass "Lower Module LDS" alone may
+//       lead to surprising results.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "Utils/AMDGPUGeneralUtils.h"
+#include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ValueMap.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include <map>
+#include <string>
+#include <vector>
+
+#define DEBUG_TYPE "amdgpu-replace-lds-use-with-pointer"
+
+using namespace llvm;
+
+namespace {
+
+// Error kinds for handling the errors within the context of the current pass.
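+// LLEK_NoCalleeDefinitionError is reported when a function reachable from a
+// kernel lacks a definition within the module; LLEK_InternalError covers any
+// unexpected IR construct that the pass does not know how to handle.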
+enum ReplaceLDSErrorKind : uint32_t {
+  LLEK_EndOfList = 0u,
+  LLEK_InternalError = 2u,
+  LLEK_NoCalleeDefinitionError = 3u
+};
+
+} // namespace
+
+// Report an error within the context of the current pass based on the error
+// kind.
+static void reportReplaceLDSError(ReplaceLDSErrorKind EK, Value *V = nullptr) {
+  std::string ErrStr("The pass \"Replace LDS Use With Pointer\" ");
+
+  switch (EK) {
+  default:
+  case LLEK_InternalError: {
+    ErrStr = ErrStr + std::string("has encountered an internal error.");
+    break;
+  }
+  case LLEK_NoCalleeDefinitionError: {
+    ErrStr =
+        ErrStr +
+        std::string("assumes that the definitions of both caller and callee "
+                    "appear within the same module. But the definition of "
+                    "the callee \"") +
+        V->getName().str() + std::string("\" is not available.");
+    break;
+  }
+  }
+
+  report_fatal_error(ErrStr);
+}
+
+// Helper function around `ValueMap` to detect if an element exists within it.
+template <typename R, typename E>
+static bool contains(R &&VMap, const E &Element) {
+  return VMap.find(Element) != VMap.end();
+}
+
+// Within the user `U`, replace the use(s) of `OldValue` by `NewValue`.
+static void updateUserOperand(User *U, Value *OldValue, Value *NewValue) {
+  unsigned Ind = 0;
+  for (Use &UU : U->operands()) {
+    if (UU.get() == OldValue)
+      U->setOperand(Ind, NewValue);
+    ++Ind;
+  }
+}
+
+// Convert `ConstantExpr CE` to a corresponding set of instructions, and update
+// users of `CE` to use the corresponding instructions.
+static Instruction *
+replaceConstExprByInst(ConstantExpr *CE,
+                       SmallPtrSetImpl<Instruction *> &Insts) {
+  Instruction *NI = nullptr;
+
+  SmallVector<User *, 8> CEUsers;
+  append_range(CEUsers, CE->users());
+
+  for (auto *U : CEUsers) {
+    auto *I = dyn_cast<Instruction>(U);
+    if (!I) {
+      auto *CE2 = dyn_cast<ConstantExpr>(U);
+      assert(CE2 && "Constant expression expected.");
+      I = replaceConstExprByInst(CE2, Insts);
+    }
+
+    NI = CE->getAsInstruction();
+    NI->insertBefore(I);
+    updateUserOperand(I, CE, NI);
+    CE->removeDeadConstantUsers();
+    Insts.insert(NI);
+  }
+
+  assert(NI && "Instruction expected.");
+
+  return NI;
+}
+
+// `U` should be either an `Instruction` OR a `ConstantExpr`. If it is an
+// `Instruction`, return it; if it is a `ConstantExpr`, break it into a set of
+// instructions and return them.
+static void getInstructions(User *U, SmallPtrSetImpl<Instruction *> &Insts) {
+  if (auto *I = dyn_cast<Instruction>(U)) {
+    // Return instruction `I`.
+    Insts.insert(I);
+  } else if (auto *CE = dyn_cast<ConstantExpr>(U)) {
+    // Break the constant expression `CE` into a set of instructions.
+    replaceConstExprByInst(CE, Insts);
+  } else {
+    // Unexpected control flow - what else is missing?
+    reportReplaceLDSError(LLEK_InternalError);
+  }
+}
+
+// Return true if the user `U` is a global variable.
+static bool isUserGlobalVariable(User *U) {
+  SmallVector<User *, 8> UserStack;
+  SmallPtrSet<User *, 8> VisitedUsers;
+
+  UserStack.push_back(U);
+
+  while (!UserStack.empty()) {
+    auto *U = UserStack.pop_back_val();
+
+    if (!VisitedUsers.insert(U).second)
+      continue;
+
+    if (isa<GlobalVariable>(U))
+      return true;
+
+    if (isa<Constant>(U)) {
+      append_range(UserStack, U->users());
+      continue;
+    }
+
+    if (isa<Instruction>(U))
+      return false;
+  }
+
+  return false;
+}
+
+// Collect functions whose address is taken within the module.
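+// The external calling node of the call graph has call edges to every function
+// that could be called from outside the module, which includes all
+// address-taken functions; the loop below filters that list down to the
+// candidates of interest.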
+static void collectAddressTakenFunctions(
+    CallGraph &CG, SmallPtrSetImpl<CallGraphNode *> &AddressTakenSet) {
+  auto *ExternalCallingNode = CG.getExternalCallingNode();
+  for (auto GI = ExternalCallingNode->begin(), GE = ExternalCallingNode->end();
+       GI != GE; ++GI) {
+    auto *CGN = GI->second;
+    auto *F = CGN->getFunction();
+    // Note that we intentionally also collect "declaration only" address
+    // taken functions here; later, an error will be thrown when we check for
+    // the definitions of callees, since this pass assumes that both caller
+    // and callee appear within the same module.
+    // FIXME: Anything else need to be excluded?
+    if (!F || AMDGPU::isModuleEntryFunctionCC(F->getCallingConv()))
+      continue;
+    AddressTakenSet.insert(CGN);
+  }
+}
+
+namespace {
+
+class ReplaceLDSUseImpl {
+  Module &M;
+  LLVMContext &Ctx;
+  const DataLayout &DL;
+
+  // Holds all kernels defined within the module `M`.
+  SmallPtrSet<Function *, 8> Kernels;
+
+  // Holds all LDS globals defined within the module `M`.
+  SmallPtrSet<GlobalVariable *, 8> LDSGlobals;
+
+  // Holds all those LDS globals which are used as initializers within some
+  // other global variable definitions.
+  SmallPtrSet<GlobalVariable *, 8> LDSGlobalsAsInitializers;
+
+  // Associates each LDS global with the list of functions which reference
+  // that LDS.
+  ValueMap<GlobalVariable *, SmallPtrSet<Function *, 8>> LDSGlobalToAccessors;
+
+  // Associates each function with the list of LDS globals which are referenced
+  // within that function.
+  ValueMap<Function *, SmallPtrSet<GlobalVariable *, 8>> AccessorToLDSGlobals;
+
+  // Associates each kernel with the list of non-kernel functions which are
+  // reachable from that kernel.
+  ValueMap<Function *, SmallPtrSet<Function *, 8>> KernelToCallees;
+
+  // Associates each kernel with the list of LDS globals which are referenced
+  // along the run time kernel execution paths (within non-kernel functions)
+  // associated with that kernel.
+  ValueMap<Function *, SmallPtrSet<GlobalVariable *, 8>> KernelToLDSGlobals;
+
+  // Associates each LDS global with a unique pointer which points to that LDS
+  // global.
+  ValueMap<GlobalVariable *, GlobalVariable *> LDSToPointer;
+
+  // Associates each non-kernel function with a map from LDS globals to
+  // int-to-ptr instructions.
+  std::map<Function *, std::map<GlobalVariable *, Value *>> FunctionToLDSToInst;
+
+public:
+  explicit ReplaceLDSUseImpl(Module &M)
+      : M(M), Ctx(M.getContext()), DL(M.getDataLayout()) {}
+
+  // Entry-point function.
+  bool replace();
+
+private:
+  //===--------------------------------------------------------------------===//
+  // Methods which aid in creating new global LDS pointers which point to
+  // original LDS globals which are referenced within non-kernel functions.
+  //===--------------------------------------------------------------------===//
+
+  // Construct an `IntToPtr` instruction which replaces `LDS` within `F`.
+  Value *getIntToPtrInst(Function *F, GlobalVariable *LDS,
+                         GlobalVariable *LDSPointer);
+
+  // Replace all uses of original LDS globals within all non-kernel functions
+  // by their respective LDS pointers.
+  void replaceUsesOfLDSGlobalsByPointers();
+
+  // Insert global LDS pointers (which point to original LDS globals which are
+  // referenced within non-kernel functions) and initialize them within kernels
+  // to point to the respective LDS globals.
+  void insertAndInitializeLDSPointers();
+
+  //===--------------------------------------------------------------------===//
+  // Methods which aid in creating the various `map` data structures.
+  //===--------------------------------------------------------------------===//
+
+  // Associate each kernel K with LDS globals which are being accessed by K
+  // and/or by the callees of K.
+  void createKernelToLDSGlobalsMap();
+
+  // Collect all call graph nodes which are reachable from the node `CGN`.
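+  // The implementation walks the strongly connected components of the call
+  // graph via scc_iterator; every node of every SCC rooted at `CGN` is, by
+  // construction, reachable from `CGN`.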
+  void
+  collectReachableCallGraphNodes(CallGraphNode *CGN,
+                                 SetVector<CallGraphNode *> &ReachableCGNodes);
+
+  // Resolve all indirect call sites within the call graph node `CGN`.
+  void
+  resolveIndirectCallSites(CallGraphNode *CGN, CallGraph &CG,
+                           SmallPtrSetImpl<CallGraphNode *> &AddressTakenSet,
+                           SetVector<CallGraphNode *> &ReachableCGNodes);
+
+  // Traverse the `CallGraph` starting from the `CallGraphNode` associated with
+  // each kernel `K` and collect all callees which are reachable from K
+  // (including indirectly called callees).
+  void createKernelToCalleesMap();
+
+  // Associate each kernel/function with the LDS globals which are being
+  // accessed within them.
+  void createAccessorToLDSGlobalsMap();
+
+  // For each `LDS`, recursively visit its user list and find all those
+  // kernels/functions within which the `LDS` is being accessed.
+  void createLDSGlobalToAccessorsMap();
+
+  // For each kernel `K`, collect LDS globals which are being accessed during
+  // the execution of `K`.
+  bool collectPerKernelAccessibleLDSGlobals();
+
+  //===--------------------------------------------------------------------===//
+  // Methods which aid in creating the various `set` data structures.
+  //===--------------------------------------------------------------------===//
+
+  // Collect all the LDS globals defined within the current module which
+  // require pointer replacement.
+  bool collectLDSGlobals();
+
+  // Collect all the amdgpu kernels defined within the current module.
+  bool collectKernels();
+};
+
+// Construct an `IntToPtr` instruction which replaces `LDS` within `F`.
+Value *ReplaceLDSUseImpl::getIntToPtrInst(Function *F, GlobalVariable *LDS,
+                                          GlobalVariable *LDSPointer) {
+  // Create an entry for `F` within `FunctionToLDSToInst`.
+  if (!contains(FunctionToLDSToInst, F))
+    FunctionToLDSToInst[F] = std::map<GlobalVariable *, Value *>();
+
+  // `IntToPtr` instruction to be constructed.
+  Value *IToP = nullptr;
+
+  auto &LDSToInst = FunctionToLDSToInst[F];
+  if (!contains(LDSToInst, LDS)) {
+    // Get the instruction insertion point at the beginning of the entry
+    // block of the current non-kernel function.
+    auto *EI = &(*(F->getEntryBlock().getFirstInsertionPt()));
+    IRBuilder<> Builder(EI);
+
+    // Insert Load and IntToPtr instructions.
+    IToP = Builder.CreateIntToPtr(
+        Builder.CreateLoad(LDSPointer->getValueType(), LDSPointer),
+        LDS->getType());
+    LDSToInst[LDS] = IToP;
+  } else
+    IToP = LDSToInst[LDS];
+
+  return IToP;
+}
+
+// Replace all uses of original LDS globals within all non-kernel functions by
+// their respective LDS pointers.
+void ReplaceLDSUseImpl::replaceUsesOfLDSGlobalsByPointers() {
+  for (auto LI = LDSToPointer.begin(), LE = LDSToPointer.end(); LI != LE;
+       ++LI) {
+    auto *LDS = LI->first;
+    auto *LDSPointer = LI->second;
+
+    SmallVector<User *, 8> LDSUsers(LDS->users());
+    for (auto *U : LDSUsers) {
+      // `U` is a global variable (from a different address space) which got
+      // initialized with `LDS`. No need to handle it.
+      if (isUserGlobalVariable(U))
+        continue;
+
+      // `U` is from within some function. The replacements for `LDS` within
+      // `U` are instructions, and instructions cannot be embedded within
+      // constant expressions. Hence, if `U` is a constant expression, first
+      // break it into the corresponding instructions.
+      SmallPtrSet<Instruction *, 8> Insts;
+      getInstructions(U, Insts);
+
+      for (auto *I : Insts) {
+        // Get the function to which `I` belongs.
+        auto *F = I->getParent()->getParent();
+
+        // Ignore uses within kernels.
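+        // Kernels can keep addressing the original LDS global directly; only
+        // non-kernel function bodies are switched over to the pointer.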
+        if (AMDGPU::isModuleEntryFunctionCC(F->getCallingConv()))
+          continue;
+
+        // Construct an `IntToPtr` instruction which replaces `LDS` within `F`.
+        auto *IToP = getIntToPtrInst(F, LDS, LDSPointer);
+
+        // Replace the uses of `LDS` within `I` by `IToP`.
+        updateUserOperand(I, LDS, IToP);
+      }
+    }
+  }
+}
+
+// Insert global LDS pointers (which point to original LDS globals which are
+// referenced within non-kernel functions) and initialize them within kernels
+// to point to the respective LDS globals.
+void ReplaceLDSUseImpl::insertAndInitializeLDSPointers() {
+  unsigned LID = 0;
+
+  for (auto KI = KernelToLDSGlobals.begin(), KE = KernelToLDSGlobals.end();
+       KI != KE; ++KI) {
+    // Get the instruction insertion point at the beginning of the entry block
+    // of the current kernel.
+    auto *EI = &(*(KI->first->getEntryBlock().getFirstInsertionPt()));
+    IRBuilder<> Builder(EI);
+
+    // Insert and initialize LDS pointers for all LDS globals which are
+    // associated with the current kernel.
+    for (auto *LDS : KI->second) {
+      GlobalVariable *LDSPointer = nullptr;
+
+      if (!contains(LDSToPointer, LDS)) {
+        // `LDS` is encountered for the first time; create an LDS pointer
+        // which is supposed to point to `LDS`.
+        ++LID;
+        auto *I16Ty = Type::getInt16Ty(Ctx);
+        LDSPointer = new GlobalVariable(
+            M, I16Ty, false, GlobalValue::InternalLinkage,
+            UndefValue::get(I16Ty),
+            Twine("llvm.amdgcn.lds.pointer.") + Twine(LID), nullptr,
+            GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);
+        LDSPointer->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+        LDSPointer->setAlignment(getAlign(M.getDataLayout(), LDSPointer));
+        LDSToPointer[LDS] = LDSPointer;
+      } else {
+        // An LDS pointer which points to `LDS` is already created; get it.
+        LDSPointer = LDSToPointer[LDS];
+      }
+
+      // Insert instructions at `EI` in order to initialize `LDSPointer` to
+      // point to `LDS`.
+      Builder.CreateStore(Builder.CreatePtrToInt(LDS, Type::getInt16Ty(Ctx)),
+                          LDSPointer);
+    }
+  }
+}
+
+// Associate each kernel K with LDS globals which are being accessed by K
+// and/or by the callees of K.
+void ReplaceLDSUseImpl::createKernelToLDSGlobalsMap() {
+  for (auto *K : Kernels) {
+    SmallPtrSet<GlobalVariable *, 8> LDSSet(LDSGlobalsAsInitializers.begin(),
+                                            LDSGlobalsAsInitializers.end());
+
+    // Collect all those LDS globals which are being accessed by the callees of
+    // kernel K.
+    if (contains(KernelToCallees, K)) {
+      for (auto *Callee : KernelToCallees[K]) {
+        if (contains(AccessorToLDSGlobals, Callee))
+          LDSSet.insert(AccessorToLDSGlobals[Callee].begin(),
+                        AccessorToLDSGlobals[Callee].end());
+      }
+    }
+
+    if (!LDSSet.empty())
+      KernelToLDSGlobals[K] = LDSSet;
+  }
+}
+
+// Collect all call graph nodes which are reachable from the node `CGN`.
+void ReplaceLDSUseImpl::collectReachableCallGraphNodes(
+    CallGraphNode *CGN, SetVector<CallGraphNode *> &ReachableCGNodes) {
+  for (scc_iterator<CallGraphNode *> I = scc_begin(CGN); !I.isAtEnd(); ++I) {
+    const std::vector<CallGraphNode *> &SCC = *I;
+    assert(!SCC.empty() && "SCC with no functions?");
+    for (auto *CGNode : SCC)
+      ReachableCGNodes.insert(CGNode);
+  }
+}
+
+// Resolve all indirect call sites within the call graph node `CGN`.
+void ReplaceLDSUseImpl::resolveIndirectCallSites(
+    CallGraphNode *CGN, CallGraph &CG,
+    SmallPtrSetImpl<CallGraphNode *> &AddressTakenSet,
+    SetVector<CallGraphNode *> &ReachableCGNodes) {
+  for (auto GI = CGN->begin(), GE = CGN->end(); GI != GE; ++GI) {
+    auto *CB = cast<CallBase>(GI->first.getValue());
+
+    // If the call site `CB` is not an indirect call site, ignore it and go to
+    // the next one; otherwise, resolve the indirect call site `CB` to a set of
+    // potential callees.
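+    // Two cases arise for an indirect call site: either `!callees` metadata
+    // pins down the exact set of potential targets, or every address-taken
+    // function with a matching signature must conservatively be assumed
+    // callable.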
+    if (!CB->isIndirectCall())
+      continue;
+
+    // "Inline asm call sites" cannot be handled. Ignore them.
+    if (CB->isInlineAsm())
+      continue;
+
+    // `CB` is an indirect call; handle it.
+    //
+    if (auto *MD = CB->getMetadata(LLVMContext::MD_callees)) {
+      // The metadata "!callees" is available at the indirect call site `CB`,
+      // which means all the potential target callees for the call site `CB`
+      // are successfully resolved at compile time. Collect them.
+      for (const auto &Op : MD->operands()) {
+        auto *GCN = CG[mdconst::extract_or_null<Function>(Op)];
+        collectReachableCallGraphNodes(GCN, ReachableCGNodes);
+      }
+    } else {
+      // The metadata "!callees" is *NOT* available at the indirect call site
+      // `CB`, which means `CB` has *NO* information about potential target
+      // callees. The simplest possible *SAFE* assumption that we can make here
+      // is to consider all those "address taken" functions whose signature
+      // matches that of the call site `CB`, and assume that all these
+      // signature-matched "address taken" functions are possible potential
+      // callees. So, collect all these signature-matched "address taken"
+      // functions.
+      auto *CBFTy = CB->getFunctionType();
+      for (auto *CGN : AddressTakenSet) {
+        if (CGN->getFunction()->getFunctionType() == CBFTy)
+          collectReachableCallGraphNodes(CGN, ReachableCGNodes);
+      }
+    }
+  }
+}
+
+// Traverse the `CallGraph` starting from the `CallGraphNode` associated with
+// each kernel `K` and collect all the callees which are reachable from K
+// (including indirectly called callees).
+void ReplaceLDSUseImpl::createKernelToCalleesMap() {
+  // Create the call graph `CG` of the module `M`, collect all the address
+  // taken functions, and explore `CG` to collect all the reachable callees
+  // (including indirectly called callees) from all kernels.
+  CallGraph CG = CallGraph(M);
+
+  // Holds call graph nodes associated with the functions whose addresses are
+  // taken within the module.
+  SmallPtrSet<CallGraphNode *, 8> AddressTakenSet;
+
+  // Collect all address taken functions within the module `M`.
+  collectAddressTakenFunctions(CG, AddressTakenSet);
+
+  for (auto *K : Kernels) {
+    // Get the `CallGraphNode` representing kernel `K`.
+    auto *KernCGNode = CG[K];
+
+    // Collect all call graph nodes which are reachable from `KernCGNode`.
+    SetVector<CallGraphNode *> ReachableCGNodes;
+    collectReachableCallGraphNodes(KernCGNode, ReachableCGNodes);
+
+    // Remove the `CallGraphNode` representing kernel `K` from the reachable
+    // node set.
+    ReachableCGNodes.remove(KernCGNode);
+
+    // Collect all callees (including potential indirect callees) which are
+    // reachable from kernel `K`. First, resolve all indirect call sites within
+    // kernel `K`, and then `recursively` within all callees reachable from
+    // kernel `K`.
+    SmallPtrSet<Function *, 8> ReachableCallees;
+    SmallPtrSet<CallGraphNode *, 8> VisitedCGNodes;
+
+    resolveIndirectCallSites(KernCGNode, CG, AddressTakenSet,
+                             ReachableCGNodes);
+
+    while (!ReachableCGNodes.empty()) {
+      auto *CGN = ReachableCGNodes.pop_back_val();
+
+      // If `CGN` is already handled, OR if there is no function associated
+      // with `CGN`, then ignore it.
+      if (!VisitedCGNodes.insert(CGN).second || !CGN->getFunction())
+        continue;
+
+      auto *F = CGN->getFunction();
+
+      // This pass expects both caller and callee to appear in the same module.
+      // Report an error if `F` is a non-kernel function and is not a
+      // definition.
+      if (!AMDGPU::isModuleEntryFunctionCC(F->getCallingConv()) &&
+          F->isDeclaration())
+        reportReplaceLDSError(LLEK_NoCalleeDefinitionError, F);
+
+      // The callee associated with `CGN` is reachable from kernel `K`.
+      ReachableCallees.insert(F);
+
+      // Resolve all indirect call sites within the callee `F`.
+      resolveIndirectCallSites(CGN, CG, AddressTakenSet, ReachableCGNodes);
+    }
+
+    KernelToCallees[K] = ReachableCallees;
+  }
+}
+
+// Associate each kernel/function with the LDS globals which are being accessed
+// within them.
+void ReplaceLDSUseImpl::createAccessorToLDSGlobalsMap() {
+  for (auto LI = LDSGlobalToAccessors.begin(), LE = LDSGlobalToAccessors.end();
+       LI != LE; ++LI) {
+    for (auto *A : LI->second) {
+      if (!contains(AccessorToLDSGlobals, A)) {
+        SmallPtrSet<GlobalVariable *, 8> LDSSet;
+        LDSSet.insert(LI->first);
+        AccessorToLDSGlobals[A] = LDSSet;
+      } else
+        AccessorToLDSGlobals[A].insert(LI->first);
+    }
+  }
+}
+
+// For each `LDS`, recursively visit its user list and find all those
+// kernels/functions within which the `LDS` is being accessed.
+void ReplaceLDSUseImpl::createLDSGlobalToAccessorsMap() {
+  for (auto *LDS : LDSGlobals) {
+    SmallPtrSet<Function *, 8> LDSAccessors;
+    SmallVector<User *, 8> UserStack(LDS->users());
+    SmallPtrSet<User *, 8> VisitedUsers;
+
+    while (!UserStack.empty()) {
+      auto *U = UserStack.pop_back_val();
+
+      // Is `U` already visited? Continue to the next one.
+      if (!VisitedUsers.insert(U).second)
+        continue;
+
+      // `U` is a global variable (from a different address space) which is
+      // initialized with `LDS`. Ignore `U`.
+      if (isa<GlobalVariable>(U)) {
+        LDSGlobalsAsInitializers.insert(LDS);
+        continue;
+      }
+
+      // `U` is a `Constant`. Push back the users of `U`, and continue further
+      // exploring the stack until an `Instruction` is found.
+      if (isa<Constant>(U)) {
+        append_range(UserStack, U->users());
+        continue;
+      }
+
+      // `U` should be an instruction. Otherwise something is wrong.
+      auto *I = dyn_cast<Instruction>(U);
+      if (!I)
+        reportReplaceLDSError(LLEK_InternalError);
+
+      // We have successfully found a kernel/function within which the `LDS` is
+      // being accessed; insert it into the `LDSAccessors` set.
+      LDSAccessors.insert(I->getParent()->getParent());
+    }
+
+    LDSGlobalToAccessors[LDS] = LDSAccessors;
+  }
+}
+
+// For each kernel `K`, collect LDS globals which are being accessed during the
+// execution of `K`.
+bool ReplaceLDSUseImpl::collectPerKernelAccessibleLDSGlobals() {
+  // Associate each LDS with the kernels/functions within which the LDS is
+  // being accessed.
+  createLDSGlobalToAccessorsMap();
+
+  // Associate each kernel/function with the LDS globals which are being
+  // accessed within them.
+  createAccessorToLDSGlobalsMap();
+
+  // Associate each kernel K with the callees which are reachable from K
+  // (including indirectly called callees).
+  createKernelToCalleesMap();
+
+  // Associate each kernel K with LDS globals which are being accessed by K
+  // and/or by the callees of K.
+  createKernelToLDSGlobalsMap();
+
+  // If *none* of the kernels is associated with any LDS global which needs
+  // pointer replacement, then there is nothing to do.
+  return !KernelToLDSGlobals.empty();
+}
+
+// Collect all the (static) LDS globals defined within the current module which
+// require pointer replacement.
+bool ReplaceLDSUseImpl::collectLDSGlobals() {
+  SmallPtrSet<GlobalValue *, 32> UsedList = getUsedList(M);
+  for (auto &GV : M.globals()) {
+    if (isLDSLoweringRequired(&GV, UsedList))
+      LDSGlobals.insert(&GV);
+  }
+
+  return !LDSGlobals.empty();
+}
+
+// Collect all the amdgpu kernels defined within the current module.
+bool ReplaceLDSUseImpl::collectKernels() {
+  for (auto &F : M.functions()) {
+    // Collect `F` if it is a definition of an entry point function.
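+    // AMDGPU::isModuleEntryFunctionCC() is true for amdgpu_kernel and the
+    // other entry-point calling conventions (e.g. spir_kernel).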
+    if (!F.isDeclaration() &&
+        AMDGPU::isModuleEntryFunctionCC(F.getCallingConv()))
+      Kernels.insert(&F);
+  }
+
+  return !Kernels.empty();
+}
+
+// Entry-point function.
+bool ReplaceLDSUseImpl::replace() {
+  // If there are *no* kernels defined within the module, or if there are *no*
+  // LDS globals defined within the module, then there is nothing to do.
+  if (!collectKernels() || !collectLDSGlobals())
+    return false;
+
+  // There are kernels and LDS globals defined within the module, but, if none
+  // of the LDS globals are being accessed within non-kernel functions along
+  // the run time kernel execution paths, then there is nothing to do.
+  if (!collectPerKernelAccessibleLDSGlobals())
+    return false;
+
+  // Insert global LDS pointers (which point to original LDS globals which are
+  // referenced within non-kernel functions) and initialize them within kernels
+  // to point to the respective LDS globals.
+  insertAndInitializeLDSPointers();
+
+  // Replace all uses of original LDS globals within all non-kernel functions
+  // by their respective LDS pointers.
+  replaceUsesOfLDSGlobalsByPointers();
+
+  return true;
+}
+
+class AMDGPUReplaceLDSUseWithPointer : public ModulePass {
+public:
+  static char ID;
+
+  AMDGPUReplaceLDSUseWithPointer() : ModulePass(ID) {
+    initializeAMDGPUReplaceLDSUseWithPointerPass(
+        *PassRegistry::getPassRegistry());
+  }
+
+  bool runOnModule(Module &M) override;
+};
+
+} // namespace
+
+char AMDGPUReplaceLDSUseWithPointer::ID = 0;
+char &llvm::AMDGPUReplaceLDSUseWithPointerID =
+    AMDGPUReplaceLDSUseWithPointer::ID;
+
+INITIALIZE_PASS(AMDGPUReplaceLDSUseWithPointer, DEBUG_TYPE,
+                "Replace non-kernel use of LDS with pointer",
+                false /*only look at the cfg*/, false /*analysis pass*/)
+
+bool AMDGPUReplaceLDSUseWithPointer::runOnModule(Module &M) {
+  ReplaceLDSUseImpl LDSReplacer{M};
+  return LDSReplacer.replace();
+}
+
+ModulePass *llvm::createAMDGPUReplaceLDSUseWithPointerPass() {
+  return new AMDGPUReplaceLDSUseWithPointer();
+}
+
+PreservedAnalyses
+AMDGPUReplaceLDSUseWithPointerPass::run(Module &M, ModuleAnalysisManager &AM) {
+  ReplaceLDSUseImpl LDSReplacer{M};
+  LDSReplacer.replace();
+  return PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -35,6 +35,7 @@
   static bool EnableLateStructurizeCFG;
   static bool EnableFunctionCalls;
   static bool EnableFixedFunctionABI;
+  static bool EnableLowerModuleLDS;
 
   AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
                       StringRef FS, TargetOptions Options,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -193,10 +193,10 @@
     cl::desc("Enable workarounds for the StructurizeCFG pass"),
     cl::init(true), cl::Hidden);
 
-static cl::opt<bool>
-    DisableLowerModuleLDS("amdgpu-disable-lower-module-lds", cl::Hidden,
-                          cl::desc("Disable lower module lds pass"),
-                          cl::init(false));
+static cl::opt<bool, true> EnableLowerModuleLDS(
+    "amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"),
+    cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true),
+    cl::Hidden);
 
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   // Register the target
@@ -265,6 +265,7 @@
   initializeGCNRegBankReassignPass(*PR);
   initializeGCNNSAReassignPass(*PR);
   initializeSIAddIMGInitPass(*PR);
+  initializeAMDGPUReplaceLDSUseWithPointerPass(*PR);
 }
 
 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -398,6 +399,7 @@
 bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
 bool AMDGPUTargetMachine::EnableFunctionCalls = false;
 bool AMDGPUTargetMachine::EnableFixedFunctionABI = false;
+bool AMDGPUTargetMachine::EnableLowerModuleLDS = false;
 
 AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
 
@@ -512,6 +514,7 @@
       return true;
     }
     if (PassName == "amdgpu-lower-module-lds") {
+      PM.addPass(AMDGPUReplaceLDSUseWithPointerPass());
       PM.addPass(AMDGPULowerModuleLDSPass());
       return true;
     }
@@ -872,7 +875,6 @@
   addPass(createAtomicExpandPass());
 
-
   addPass(createAMDGPULowerIntrinsicsPass());
 
   // Function calls are not supported, so make sure we inline everything.
@@ -893,8 +895,10 @@
   addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());
 
   // Can increase LDS used by kernel so runs before PromoteAlloca
-  if (!DisableLowerModuleLDS)
+  if (EnableLowerModuleLDS) {
+    addPass(createAMDGPUReplaceLDSUseWithPointerPass());
     addPass(createAMDGPULowerModuleLDSPass());
+  }
 
   if (TM.getOptLevel() > CodeGenOpt::None) {
     addPass(createInferAddressSpacesPass());
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -81,6 +81,7 @@
   AMDGPUPropagateAttributes.cpp
   AMDGPURegBankCombiner.cpp
   AMDGPURegisterBankInfo.cpp
+  AMDGPUReplaceLDSUseWithPointer.cpp
   AMDGPURewriteOutArguments.cpp
   AMDGPUSubtarget.cpp
   AMDGPUTargetMachine.cpp
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUGeneralUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUGeneralUtils.h
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUGeneralUtils.h
@@ -0,0 +1,43 @@
+//===- AMDGPUGeneralUtils.h - general helper functions -*- C++ -*----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// AMDGPU target related general helper utility functions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUGENERALUTILS_H
+#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUGENERALUTILS_H
+
+#include "AMDGPU.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+namespace llvm {
+
+// Check if `GV` is an LDS global, and lowering is required for it.
+bool isLDSLoweringRequired(GlobalVariable *GV,
+                           const SmallPtrSetImpl<GlobalValue *> &UsedList,
+                           bool IsLDSLoweringPass = false);
+
+// Get the required alignment for the global variable `GV`.
+Align getAlign(const DataLayout &DL, const GlobalVariable *GV);
+
+// Get the set of all used global values in the module `M`.
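+// That is, the values pinned by the @llvm.used and @llvm.compiler.used lists.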
+SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M);
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUGENERALUTILS_H
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUGeneralUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUGeneralUtils.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUGeneralUtils.cpp
@@ -0,0 +1,168 @@
+//===- AMDGPUGeneralUtils.cpp ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// AMDGPU target related general helper utility functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUGeneralUtils.h"
+
+using namespace llvm;
+
+namespace llvm {
+
+// Check if we can skip the lowering for the current LDS global `GV`.
+static bool skipLowering(GlobalVariable *GV,
+                         const SmallPtrSetImpl<GlobalValue *> &UsedList,
+                         bool IsLDSLoweringPass) {
+  bool UsedAsInitializer = false;
+  bool UsedAsNonLLVMUsedInitializer = false;
+  bool UsedWithinKernelFunction = false;
+  bool UsedWithinNonKernelFunction = false;
+  SmallPtrSet<User *, 8> VisitedUsers;
+
+  // If there are no users for `GV`, skip lowering for `GV`.
+  SmallVector<User *, 8> UserStack(GV->users());
+  if (UserStack.empty())
+    return true;
+
+  while (!UserStack.empty()) {
+    auto *U = UserStack.pop_back_val();
+
+    // Is `U` already visited? Continue to the next one.
+    if (!VisitedUsers.insert(U).second)
+      continue;
+
+    if (isa<GlobalVariable>(U)) {
+      // `U` is a global variable, and `GV` is used as its initializer.
+      UsedAsInitializer = true;
+      if (!UsedList.contains(GV)) {
+        // Used as an initializer of normal globals, apart from "llvm.used"
+        // or "llvm.compiler.used".
+        UsedAsNonLLVMUsedInitializer = true;
+      }
+      continue;
+    }
+
+    if (isa<Constant>(U)) {
+      // `U` is a `Constant`. Push back the users of `U`, and continue further
+      // exploring the stack.
+      append_range(UserStack, U->users());
+      continue;
+    }
+
+    // `U` should be an instruction belonging to some function.
+    auto *I = dyn_cast<Instruction>(U);
+    assert(I && "Instruction expected.");
+    auto *F = I->getFunction();
+    if (AMDGPU::isModuleEntryFunctionCC(F->getCallingConv())) {
+      UsedWithinKernelFunction = true;
+    } else {
+      UsedWithinNonKernelFunction = true;
+    }
+  }
+
+  if (UsedWithinNonKernelFunction) {
+    // `GV` is used within a non-kernel function; it requires lowering.
+    return false;
+  }
+
+  if (UsedAsInitializer) {
+    // `GV` is used as an initializer of some global variable.
+    if (UsedAsNonLLVMUsedInitializer) {
+      // `GV` is used as a global variable initializer of normal globals.
+      if (!IsLDSLoweringPass) {
+        // This is the "LDS replace with pointer" pass; let this pass make sure
+        // that a pointer variable is created for `GV`, and that the pointer
+        // variable is initialized with `GV` within all kernels.
+        return false;
+      } else {
+        // The "LDS replace with pointer" pass makes sure that a pointer
+        // variable is created for `GV` and is initialized with `GV` within all
+        // kernels, which means that a per-kernel-specific `GV` will be
+        // created, and hence the "LDS lowering" pass need not touch it.
+        return true;
+      }
+    } else {
+      // `GV` is used as a global variable initializer of "llvm.used" or
+      // "llvm.compiler.used". Ignore lowering.
+      return true;
+    }
+  }
+
+  if (UsedWithinKernelFunction) {
+    // `GV` is *only* used within kernels; it does not require lowering.
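+    // (A kernel can address its own instance of `GV` directly, so no pointer
+    // indirection is needed.)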
+    return true;
+  }
+
+  // Ideally control should not reach here. If it does, then we need to take a
+  // re-look at the above logic.
+  assert(false && "Internal error.");
+  return true;
+}
+
+// Check if `GV` is an LDS global, and lowering is required for it.
+bool isLDSLoweringRequired(GlobalVariable *GV,
+                           const SmallPtrSetImpl<GlobalValue *> &UsedList,
+                           bool IsLDSLoweringPass) {
+  if (GV->getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
+    // Ignore address spaces other than 3.
+    return false;
+  }
+
+  if (!GV->hasInitializer()) {
+    // addrspace(3) without an initializer implies cuda/hip extern __shared__;
+    // the semantics for such a variable appears to be that all extern
+    // __shared__ variables alias one another, in which case this transform is
+    // not required.
+    return false;
+  }
+
+  if (!isa<UndefValue>(GV->getInitializer())) {
+    // Initializers are unimplemented for local address space. Leave such
+    // variables in place for consistent error reporting.
+    return false;
+  }
+
+  if (GV->isConstant()) {
+    // A constant undef variable can't be written to, and any load is undef, so
+    // it should be eliminated by the optimizer. It could be dropped by the
+    // back end if not. This pass skips over it.
+    return false;
+  }
+
+  if (skipLowering(GV, UsedList, IsLDSLoweringPass)) {
+    // We can safely ignore the users of GV, hence lowering of GV is not
+    // required.
+    return false;
+  }
+
+  return true;
+}
+
+// Get the required alignment for the global variable `GV`.
+Align getAlign(const DataLayout &DL, const GlobalVariable *GV) {
+  return DL.getValueOrABITypeAlignment(GV->getPointerAlignment(DL),
+                                       GV->getValueType());
+}
+
+// Get the set of all used global values in the module `M`.
+SmallPtrSet<GlobalValue *, 32> getUsedList(Module &M) {
+  SmallPtrSet<GlobalValue *, 32> UsedList;
+
+  SmallVector<GlobalValue *, 32> TmpVec;
+  collectUsedGlobalVariables(M, TmpVec, true);
+  UsedList.insert(TmpVec.begin(), TmpVec.end());
+
+  TmpVec.clear();
+  collectUsedGlobalVariables(M, TmpVec, false);
+  UsedList.insert(TmpVec.begin(), TmpVec.end());
+
+  return UsedList;
+}
+
+} // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt b/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt
--- a/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/Utils/CMakeLists.txt
@@ -2,6 +2,7 @@
   AMDGPUBaseInfo.cpp
   AMDKernelCodeTUtils.cpp
   AMDGPUAsmUtils.cpp
+  AMDGPUGeneralUtils.cpp
   AMDGPUPALMetadata.cpp
 
 LINK_COMPONENTS
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-non-entry-func.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -amdgpu-disable-lower-module-lds=true -o - %s 2> %t | FileCheck --check-prefix=GFX8 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -amdgpu-enable-lower-module-lds=false -o - %s 2> %t | FileCheck --check-prefix=GFX8 %s
 ; RUN: FileCheck -check-prefix=ERR %s < %t
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-disable-lower-module-lds=true -o - %s 2> %t | FileCheck --check-prefix=GFX9 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-enable-lower-module-lds=false -o - %s 2> %t | FileCheck --check-prefix=GFX9 %s
 ; RUN: FileCheck -check-prefix=ERR %s < %t
 
 @lds = internal addrspace(3) global float undef, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-initializer-unsupported.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-initializer-unsupported.ll
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast-initializer-unsupported.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-initializer-unsupported.ll
@@ -1,4 +1,4 @@
-; RUN: not --crash llc -march=amdgcn -verify-machineinstrs -amdgpu-disable-lower-module-lds=true < %s 2>&1 | FileCheck -check-prefix=ERROR %s
+; RUN: not --crash llc -march=amdgcn -verify-machineinstrs -amdgpu-enable-lower-module-lds=false < %s 2>&1 | FileCheck -check-prefix=ERROR %s
 
 ; ERROR: LLVM ERROR: Unsupported expression in static initializer: addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*)
diff --git a/llvm/test/CodeGen/AMDGPU/force-alwaysinline-lds-global-address-codegen.ll b/llvm/test/CodeGen/AMDGPU/force-alwaysinline-lds-global-address-codegen.ll
--- a/llvm/test/CodeGen/AMDGPU/force-alwaysinline-lds-global-address-codegen.ll
+++ b/llvm/test/CodeGen/AMDGPU/force-alwaysinline-lds-global-address-codegen.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -amdgpu-function-calls -amdgpu-stress-function-calls < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -amdgpu-stress-function-calls < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -amdgpu-function-calls -amdgpu-stress-function-calls -amdgpu-enable-lower-module-lds=false < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -amdgpu-stress-function-calls -amdgpu-enable-lower-module-lds=false < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -amdgpu-enable-lower-module-lds=false < %s | FileCheck -check-prefix=GCN %s
 
 @lds0 = addrspace(3) global i32 undef, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/force-alwaysinline-lds-global-address.ll b/llvm/test/CodeGen/AMDGPU/force-alwaysinline-lds-global-address.ll
--- a/llvm/test/CodeGen/AMDGPU/force-alwaysinline-lds-global-address.ll
+++ b/llvm/test/CodeGen/AMDGPU/force-alwaysinline-lds-global-address.ll
@@ -1,7 +1,7 @@
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-always-inline %s | FileCheck --check-prefix=ALL %s
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-always-inline %s | FileCheck --check-prefix=ALL %s
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-stress-function-calls -amdgpu-always-inline %s | FileCheck --check-prefix=ALL %s
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-stress-function-calls -passes=amdgpu-always-inline %s | FileCheck --check-prefix=ALL %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-always-inline -amdgpu-enable-lower-module-lds=false %s | FileCheck --check-prefix=ALL %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-always-inline -amdgpu-enable-lower-module-lds=false %s | FileCheck --check-prefix=ALL %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-stress-function-calls -amdgpu-always-inline -amdgpu-enable-lower-module-lds=false %s | FileCheck --check-prefix=ALL %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-stress-function-calls -passes=amdgpu-always-inline -amdgpu-enable-lower-module-lds=false %s | FileCheck --check-prefix=ALL %s
 
 target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
diff --git a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
--- a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -o - -amdgpu-disable-lower-module-lds=true %s 2> %t | FileCheck -check-prefixes=GCN,GFX8 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -o - -amdgpu-enable-lower-module-lds=false %s 2> %t | FileCheck -check-prefixes=GCN,GFX8 %s
 ; RUN: FileCheck -check-prefix=ERR %s < %t
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - -amdgpu-disable-lower-module-lds=true %s 2> %t | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - -amdgpu-enable-lower-module-lds=false %s 2> %t | FileCheck -check-prefixes=GCN,GFX9 %s
 ; RUN: FileCheck -check-prefix=ERR %s < %t
 
 @lds = internal addrspace(3) global float undef, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-constantexpr.ll
@@ -1,6 +1,10 @@
-; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
+; The pass `amdgpu-lower-module-lds` should be run with its prerequisite pass `amdgpu-replace-lds-use-with-pointer`.
 ; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
 
+; The LDS globals @func and @both are used within non-kernel functions, but those functions are not called from any
+; kernel. Hence the pass `amdgpu-replace-lds-use-with-pointer` cannot help here, and the pass `amdgpu-lower-module-lds`
+; ends up creating `float` members. This is only true in this test case; in reality, those uncalled functions, and
+; hence the LDS globals referenced within them, would have been eliminated as unused globals.
 ; CHECK: %llvm.amdgcn.module.lds.t = type { float, float }
 
 @func = addrspace(3) global float undef, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-inactive.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
+; The pass `amdgpu-lower-module-lds` should be run with its prerequisite pass `amdgpu-replace-lds-use-with-pointer`.
 ; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
 
 ; Variables that are not lowered by this pass are left unchanged
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-indirect.ll
@@ -1,22 +1,33 @@
-; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
+; The pass `amdgpu-lower-module-lds` should be run with its prerequisite pass `amdgpu-replace-lds-use-with-pointer`.
 ; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
 
-; CHECK: %llvm.amdgcn.module.lds.t = type { double, float }
-
-; CHECK: @function_indirect = addrspace(1) global float* addrspacecast (float addrspace(3)* getelementptr (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* null, i32 0, i32 1) to float*), align 8
-
-; CHECK: @kernel_indirect = addrspace(1) global double* addrspacecast (double addrspace(3)* null to double*), align 8
-
-; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 8
-
+; The original LDS globals `@function_target` and `@kernel_target` are used as initializers of the globals
+; `@function_indirect` and `@kernel_indirect`, and they are not referenced directly anywhere else. The pass
+; `amdgpu-replace-lds-use-with-pointer` makes sure that they are referenced within all kernels by assigning
+; their addresses to the respective pointers, so the global initialization of `@function_indirect` and
+; `@kernel_indirect` is taken care of. Hence, the pass `amdgpu-lower-module-lds` does not do any further
+; lowering here.
+; CHECK-NOT: %llvm.amdgcn.module.lds.t
+
+; Original globals left unchanged.
+; CHECK: @function_target = addrspace(3) global float undef, align 4
+; CHECK: @function_indirect = addrspace(1) global float* addrspacecast (float addrspace(3)* @function_target to float*), align 8
+; CHECK: @kernel_target = addrspace(3) global double undef, align 8
+; CHECK: @kernel_indirect = addrspace(1) global double* addrspacecast (double addrspace(3)* @kernel_target to double*), align 8
 @function_target = addrspace(3) global float undef, align 4
 @function_indirect = addrspace(1) global float* addrspacecast (float addrspace(3)* @function_target to float*), align 8
-
 @kernel_target = addrspace(3) global double undef, align 8
 @kernel_indirect = addrspace(1) global double* addrspacecast (double addrspace(3)* @kernel_target to double*), align 8
 
-; CHECK-LABEL: @function(float %x)
+; New pointers introduced by the pass `amdgpu-replace-lds-use-with-pointer`.
+; CHECK: @llvm.amdgcn.lds.pointer.1 = internal unnamed_addr addrspace(3) global i16 undef, align 2
+; CHECK: @llvm.amdgcn.lds.pointer.2 = internal unnamed_addr addrspace(3) global i16 undef, align 2
+
+; No changes to the function @function.
+; CHECK-LABEL: entry:
 ; CHECK: %0 = load float*, float* addrspace(1)* @function_indirect, align 8
+; CHECK: store float %x, float* %0, align 4
+; CHECK: ret void
 define void @function(float %x) local_unnamed_addr #5 {
 entry:
   %0 = load float*, float* addrspace(1)* @function_indirect, align 8
@@ -24,16 +35,19 @@
   ret void
 }
 
-; CHECK-LABEL: @kernel(double %x)
-; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
-; CHECK: %0 = load double*, double* addrspace(1)* @kernel_indirect, align 8
+; The LDS globals @function_target and @kernel_target are referenced within the kernel by initializing the
+; respective pointers with their addresses.
+; CHECK-LABEL: entry:
+; CHECK: %{{[0-9]+}} = ptrtoint {{[a-z]+}} addrspace(3)* @{{[a-z]+}}_target to i16
+; CHECK: store i16 %{{[0-9]+}}, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[12]}}, align 2
+; CHECK: %{{[0-9]+}} = ptrtoint {{[a-z]+}} addrspace(3)* @{{[a-z]+}}_target to i16
+; CHECK: store i16 %1, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[12]}}, align 2
+; CHECK: %{{[0-9]+}} = load double*, double* addrspace(1)* @kernel_indirect, align 8
+; CHECK: store double %x, double* %{{[0-9]+}}, align 8
+; CHECK: ret void
 define amdgpu_kernel void @kernel(double %x) local_unnamed_addr #5 {
 entry:
   %0 = load double*, double* addrspace(1)* @kernel_indirect, align 8
   store double %x, double* %0, align 8
   ret void
 }
-
-
-
-
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-inline-asm-call.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-inline-asm-call.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-inline-asm-call.ll
@@ -0,0 +1,31 @@
+; The pass `amdgpu-lower-module-lds` should be run with its prerequisite pass `amdgpu-replace-lds-use-with-pointer`.
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
+
+; The pass `amdgpu-replace-lds-use-with-pointer` cannot handle inline asm calls, and hence the
+; pass `amdgpu-lower-module-lds` needs to handle the LDS global @func.
+; CHECK: %llvm.amdgcn.module.lds.t = type { i32 }
+
+; @func is only used from a non-kernel function so is rewritten
+; CHECK-NOT: @func
+@func = addrspace(3) global i32 undef, align 4
+
+; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 4
+; CHECK: @llvm.compiler.used = appending global [1 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
+
+; CHECK-LABEL: @function()
+; CHECK: %0 = load i32, i32 addrspace(3)* null, align 4
+; CHECK: ret i32 %0
+define i32 @function() local_unnamed_addr {
+entry:
+  %0 = load i32, i32 addrspace(3)* @func, align 4
+  ret i32 %0
+}
+
+; CHECK-LABEL: @kernel()
+; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
+; CHECK: call void asm sideeffect "", "~{v23}"()
+; CHECK: ret void
+define amdgpu_kernel void @kernel() {
+  call void asm sideeffect "", "~{v23}"()
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-used-list.ll
@@ -1,8 +1,10 @@
-; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
+; The pass `amdgpu-lower-module-lds` should be run with its prerequisite pass `amdgpu-replace-lds-use-with-pointer`.
 ; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
 
-; Check new struct is added to compiler.used and that the replaced variable is removed
-
+; The LDS global @tolower is used within a non-kernel function, but that function is not called from any
+; kernel. Hence the pass `amdgpu-replace-lds-use-with-pointer` cannot help here, and the pass
+; `amdgpu-lower-module-lds` ends up creating a `float` member. This is only true in this test case; in
+; reality, this uncalled function, and hence the LDS global referenced within it, would have been
+; eliminated as unused globals.
 ; CHECK: %llvm.amdgcn.module.lds.t = type { float }
 ; CHECK: @ignored = addrspace(1) global i64 0
 ; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds.ll
@@ -1,33 +1,43 @@
-; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
+; The pass `amdgpu-lower-module-lds` should be run with its prerequisite pass `amdgpu-replace-lds-use-with-pointer`.
 ; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
 
-; Padding to meet alignment, so references to @var1 replaced with gep ptr, 0, 2
-; No i64 as addrspace(3) types with initializers are ignored. Likewise no addrspace(4).
-; CHECK: %llvm.amdgcn.module.lds.t = type { float, [4 x i8], i32 }
+; The LDS globals @var0 and @var1 are replaced by pointers, hence the two i16 members.
+; CHECK: %llvm.amdgcn.module.lds.t = type { i16, i16 } -; Variables removed by pass -; CHECK-NOT: @var0 -; CHECK-NOT: @var1 +; Original LDS globals +; CHECK: @var0 = addrspace(3) global float undef, align 8 +; CHECK: @var1 = addrspace(3) global i32 undef, align 8 @var0 = addrspace(3) global float undef, align 8 @var1 = addrspace(3) global i32 undef, align 8 -@ptr = addrspace(1) global i32 addrspace(3)* @var1, align 4 +; The initializer use of @var1 is left untouched +; CHECK: @ptr = addrspace(1) global i32 addrspace(3)* @var1, align 4 +@ptr = addrspace(1) global i32 addrspace(3)* @var1, align 4 -; A variable that is unchanged by pass +; A variable that is untouched by the pass because LDS variables with initializers are ignored ; CHECK: @with_init = addrspace(3) global i64 0 @with_init = addrspace(3) global i64 0 +; The two i16 pointers introduced by the pass `amdgpu-replace-lds-use-with-pointer` are removed by the pass `amdgpu-lower-module-lds`. +; CHECK-NOT: @llvm.amdgcn.lds.pointer.1 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; CHECK-NOT: @llvm.amdgcn.lds.pointer.2 = internal unnamed_addr addrspace(3) global i16 undef, align 2 + ; Instance of new type, aligned to max of element alignment -; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 8 +; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t undef, align 2 ; Use in func rewritten to access struct at address zero, which prints as null ; CHECK-LABEL: @func() -; CHECK: %dec = atomicrmw fsub float addrspace(3)* null, float 1.0 -; CHECK: %val0 = load i32, i32 addrspace(3)* getelementptr (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* null, i32 0, i32 2), align 4 +; CHECK: %1 = load i16, i16 addrspace(3)* +; CHECK: %2 = inttoptr i16 %1 to {{[a-z0-9]+}} addrspace(3)* +; CHECK: %3 = load i16, i16 addrspace(3)* +; CHECK: %4 = inttoptr i16 %3 to {{[a-z0-9]+}} addrspace(3)* +; CHECK: %dec = atomicrmw fsub float addrspace(3)* %{{[0-9]+}}, float 1.000000e+00 monotonic, align 4 +; CHECK: %val0 = load i32, i32 addrspace(3)* %{{[0-9]+}}, align 4 ; CHECK: %val1 = add i32 %val0, 4 -; CHECK: store i32 %val1, i32 addrspace(3)* getelementptr (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* null, i32 0, i32 2), align 4 +; CHECK: store i32 %val1, i32 addrspace(3)* %{{[0-9]+}}, align 4 +; CHECK: %unused0 = atomicrmw add i64 addrspace(3)* @with_init, i64 1 monotonic, align 8 +; CHECK: ret void define void @func() { %dec = atomicrmw fsub float addrspace(3)* @var0, float 1.0 monotonic %val0 = load i32, i32 addrspace(3)* @var1, align 4 @@ -40,17 +50,25 @@ ; This kernel calls a function that uses LDS so needs the block ; CHECK-LABEL: @kern_call() ; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ] +; CHECK: %1 = ptrtoint {{[a-z0-9]+}} addrspace(3)* @var{{[01]}} to i16 +; CHECK: store i16 %1, i16 addrspace(3)* +; CHECK: %2 = ptrtoint {{[a-z0-9]+}} addrspace(3)* @var{{[01]}} to i16 +; CHECK: store i16 %2, i16 addrspace(3)* ; CHECK: call void @func() -; CHECK: %dec = atomicrmw fsub float addrspace(3)* null, float 2.0 +; CHECK: %dec = atomicrmw fsub float addrspace(3)* @var0, float 2.000000e+00 monotonic, align 4 +; CHECK: ret void define amdgpu_kernel void @kern_call() { call void @func() %dec = atomicrmw fsub float addrspace(3)* @var0, float 2.0 monotonic ret void } -; This 
kernel does not need to alloc the LDS block as it makes no calls +; Though the kernel makes no calls, it still needs to alloc the LDS block because @var1 is used as an initializer. ; CHECK-LABEL: @kern_empty() ; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ] +; CHECK: %1 = ptrtoint i32 addrspace(3)* @var1 to i16 +; CHECK: store i16 %1, i16 addrspace(3)* +; CHECK: ret void define spir_kernel void @kern_empty() { ret void } diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-to-lds-constantexpr-use.ll @@ -1,5 +1,5 @@ ; RUN: opt -S -disable-promote-alloca-to-vector -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-promote-alloca < %s | FileCheck -check-prefix=IR %s -; RUN: llc -disable-promote-alloca-to-vector -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-disable-lower-module-lds=true < %s | FileCheck -check-prefix=ASM %s +; RUN: llc -disable-promote-alloca-to-vector -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-enable-lower-module-lds=false < %s | FileCheck -check-prefix=ASM %s target datalayout = "A5" diff --git a/llvm/test/CodeGen/AMDGPU/replace_lds_report_error_no_func_def.ll b/llvm/test/CodeGen/AMDGPU/replace_lds_report_error_no_func_def.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/replace_lds_report_error_no_func_def.ll @@ -0,0 +1,20 @@ +; RUN: not --crash opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-replace-lds-use-with-pointer -S < %s 2>&1 | FileCheck -check-prefix=ERROR %s + +; ERROR: LLVM ERROR: The pass "Replace LDS Use With Pointer" assumes that the definitions of both caller and callee appear within same module. But, definition for the callee "callee_1" not available. 
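+
+; The pass resolves which kernels transitively reach each LDS-using non-kernel function by walking call
+; chains within the module, so a callee that is only declared here (@callee_1 below) cannot be traversed,
+; and the pass reports a fatal error rather than proceeding with an incomplete call graph.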
+ +@lds_global_1 = internal addrspace(3) global [1 x i32] undef, align 16 + +declare hidden void @callee_1() local_unnamed_addr + +define internal void @callee_2() { +entry: + %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_global_1, i32 0, i32 0 + ret void +} + +define protected amdgpu_kernel void @kernel_1() { +entry: + call void @callee_1() + call void @callee_2() + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/replace_lds_test_direct_call_diamond_shape.ll b/llvm/test/CodeGen/AMDGPU/replace_lds_test_direct_call_diamond_shape.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/replace_lds_test_direct_call_diamond_shape.ll @@ -0,0 +1,63 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-replace-lds-use-with-pointer -S < %s | FileCheck -check-prefixes=LDS,POINTER,GCN %s + +; LDS: @lds_global_1 = internal addrspace(3) global [1 x i32] undef, align 4 +; POINTER: @llvm.amdgcn.lds.pointer.1 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +@lds_global_1 = internal addrspace(3) global [1 x i32] undef, align 4 + +define internal void @function_4() { +; GCN-LABEL: entry: +; GCN: %0 = load i16, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.1, align 2 +; GCN-NEXT: %1 = inttoptr i16 %0 to [1 x i32] addrspace(3)* +; GCN-NEXT: %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* %1, i32 0, i32 0 +; GCN-NEXT: ret void +entry: + %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_global_1, i32 0, i32 0 + ret void +} + +define internal void @function_3() { +; GCN-LABEL: entry: +; GCN: call void @function_4() +; GCN-NEXT: ret void +entry: + call void @function_4() + ret void +} + +define internal void @function_2() { +; GCN-LABEL: entry: +; GCN: call void @function_4() +; GCN-NEXT: ret void +entry: + call void @function_4() + ret void +} + +define internal void @function_1() { +; GCN-LABEL: entry: +; GCN: call void @function_2() +; GCN: call void @function_3() +; GCN-NEXT: ret void +entry: + call void @function_2() + call void @function_3() + ret void +} + +define protected amdgpu_kernel void @kernel_1() { +; GCN-LABEL: entry: +; GCN: %0 = ptrtoint [1 x i32] addrspace(3)* @lds_global_1 to i16 +; GCN-NEXT: store i16 %0, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.1, align 2 +; GCN-NEXT: call void @function_1() +; GCN-NEXT: ret void +entry: + call void @function_1() + ret void +} + +define protected amdgpu_kernel void @kernel_2() { +; GCN-LABEL: entry: +; GCN: ret void +entry: + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/replace_lds_test_direct_call_misc.ll b/llvm/test/CodeGen/AMDGPU/replace_lds_test_direct_call_misc.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/replace_lds_test_direct_call_misc.ll @@ -0,0 +1,89 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-replace-lds-use-with-pointer -S < %s | FileCheck -check-prefixes=LDS,POINTER,GCN %s + +; LDS: @lds_global_1 = internal addrspace(3) global [1 x i32] undef, align 4 +; LDS: @lds_global_2 = internal addrspace(3) global [1 x i32] undef, align 4 +; LDS: @lds_global_3 = internal addrspace(3) global [1 x i32] undef, align 4 +; POINTER: @llvm.amdgcn.lds.pointer.1 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; POINTER: @llvm.amdgcn.lds.pointer.2 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; POINTER: @llvm.amdgcn.lds.pointer.3 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +@lds_global_1 = internal addrspace(3) global [1 x i32] undef, align 4 +@lds_global_2 = internal 
addrspace(3) global [1 x i32] undef, align 4 +@lds_global_3 = internal addrspace(3) global [1 x i32] undef, align 4 + +define internal void @function_3() { +; GCN-LABEL: entry: +; GCN: %0 = load i16, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %1 = inttoptr i16 %0 to [1 x i32] addrspace(3)* +; GCN-NEXT: %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* %1, i32 0, i32 0 +; GCN-NEXT: ret void +entry: + %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_global_3, i32 0, i32 0 + ret void +} + +define internal void @function_2() { +; GCN-LABEL: entry: +; GCN: %0 = load i16, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %1 = inttoptr i16 %0 to [1 x i32] addrspace(3)* +; GCN-NEXT: %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* %1, i32 0, i32 0 +; GCN-NEXT: ret void +entry: + %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_global_2, i32 0, i32 0 + ret void +} + +define internal void @function_1() { +; GCN-LABEL: entry: +; GCN: %0 = load i16, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %1 = inttoptr i16 %0 to [1 x i32] addrspace(3)* +; GCN-NEXT: %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* %1, i32 0, i32 0 +; GCN-NEXT: ret void +entry: + %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_global_1, i32 0, i32 0 + ret void +} + +define protected amdgpu_kernel void @kernel_3() { +; GCN-LABEL: entry: +; GCN: %0 = ptrtoint [1 x i32] addrspace(3)* @lds_global_{{[13]}} to i16 +; GCN-NEXT: store i16 %0, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %1 = ptrtoint [1 x i32] addrspace(3)* @lds_global_{{[13]}} to i16 +; GCN-NEXT: store i16 %1, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: call void @function_3() +; GCN-NEXT: call void @function_1() +; GCN-NEXT: ret void +entry: + call void @function_3() + call void @function_1() + ret void +} + +define protected amdgpu_kernel void @kernel_2() { +; GCN-LABEL: entry: +; GCN: %0 = ptrtoint [1 x i32] addrspace(3)* @lds_global_{{[23]}} to i16 +; GCN-NEXT: store i16 %0, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %1 = ptrtoint [1 x i32] addrspace(3)* @lds_global_{{[23]}} to i16 +; GCN-NEXT: store i16 %1, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: call void @function_2() +; GCN-NEXT: call void @function_3() +; GCN-NEXT: ret void +entry: + call void @function_2() + call void @function_3() + ret void +} + +define protected amdgpu_kernel void @kernel_1() { +; GCN-LABEL: entry: +; GCN: %0 = ptrtoint [1 x i32] addrspace(3)* @lds_global_{{[12]}} to i16 +; GCN-NEXT: store i16 %0, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %1 = ptrtoint [1 x i32] addrspace(3)* @lds_global_{{[12]}} to i16 +; GCN-NEXT: store i16 %1, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: call void @function_1() +; GCN-NEXT: call void @function_2() +; GCN-NEXT: ret void +entry: + call void @function_1() + call void @function_2() + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/replace_lds_test_ignored_lds.ll b/llvm/test/CodeGen/AMDGPU/replace_lds_test_ignored_lds.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/replace_lds_test_ignored_lds.ll @@ -0,0 +1,80 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-replace-lds-use-with-pointer -S < %s | FileCheck 
-check-prefixes=LDS,POINTER,GCN %s + +; Ignore LDS lds_global_1 because it is dynamic LDS. +; Ignore LDS lds_global_2 because it is used only within a kernel. +; Ignore LDS lds_global_3 because it is used within a non-kernel function that is never called. +; Ignore LDS lds_global_4 because it is used within a non-kernel function that is not reachable from any kernel due to an inline asm call. + +; LDS: @lds_global_1 = external addrspace(3) global [0 x i32], align 4 +; LDS: @lds_global_2 = internal addrspace(3) global [1 x i32] undef, align 4 +; LDS: @lds_global_3 = internal addrspace(3) global [1 x i32] undef, align 4 +; LDS: @lds_global_4 = internal addrspace(3) global [1 x i32] undef, align 4 +; POINTER-NOT: @llvm.amdgcn.lds.pointer.1 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; POINTER-NOT: @llvm.amdgcn.lds.pointer.2 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; POINTER-NOT: @llvm.amdgcn.lds.pointer.3 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; POINTER-NOT: @llvm.amdgcn.lds.pointer.4 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +@lds_global_1 = external addrspace(3) global [0 x i32], align 4 +@lds_global_2 = internal addrspace(3) global [1 x i32] undef, align 4 +@lds_global_3 = internal addrspace(3) global [1 x i32] undef, align 4 +@lds_global_4 = internal addrspace(3) global [1 x i32] undef, align 4 + +define internal void @function_1() { +; GCN-LABEL: entry: +; GCN: %gep = getelementptr inbounds [0 x i32], [0 x i32] addrspace(3)* @lds_global_1, i32 0, i32 0 +; GCN-NEXT: ret void +entry: + %gep = getelementptr inbounds [0 x i32], [0 x i32] addrspace(3)* @lds_global_1, i32 0, i32 0 + ret void +} + +define protected amdgpu_kernel void @kernel_1() { +; GCN-LABEL: entry: +; GCN: call void @function_1() +; GCN-NEXT: ret void +entry: + call void @function_1() + ret void +} + +define protected amdgpu_kernel void @kernel_2() { +; GCN-LABEL: entry: +; GCN: %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_global_2, i32 0, i32 0 +; GCN-NEXT: ret void +entry: + %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_global_2, i32 0, i32 0 + ret void +} + +define internal void @function_3() { +; GCN-LABEL: entry: +; GCN: %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_global_3, i32 0, i32 0 +; GCN-NEXT: ret void +entry: + %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_global_3, i32 0, i32 0 + ret void +} + +define protected amdgpu_kernel void @kernel_3() { +; GCN-LABEL: entry: +; GCN: ret void +entry: + ret void +} + +define internal void @function_4() { +; GCN-LABEL: entry: +; GCN: %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_global_4, i32 0, i32 0 +; GCN-NEXT: ret void +entry: + %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_global_4, i32 0, i32 0 + ret void +} + +define protected amdgpu_kernel void @kernel_4() { +; GCN-LABEL: entry: +; GCN: call void asm sideeffect "", "~{v23}"() +; GCN-NEXT: ret void +entry: + call void asm sideeffect "", "~{v23}"() + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/replace_lds_test_indirect_call_diamond_shape.ll b/llvm/test/CodeGen/AMDGPU/replace_lds_test_indirect_call_diamond_shape.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/replace_lds_test_indirect_call_diamond_shape.ll @@ -0,0 +1,81 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-replace-lds-use-with-pointer -S < %s | FileCheck -check-prefixes=FPTR,LDS,POINTER,GCN %s + +; FPTR: @gv.fptr4 = 
internal local_unnamed_addr externally_initialized global void ()* @function_4, align 8 +; FPTR: @gv.fptr3 = internal local_unnamed_addr externally_initialized global void ()* @function_3, align 8 +; FPTR: @gv.fptr2 = internal local_unnamed_addr externally_initialized global void ()* @function_2, align 8 +; FPTR: @gv.fptr1 = internal local_unnamed_addr externally_initialized global void ()* @function_1, align 8 +; LDS: @lds_global_1 = internal addrspace(3) global [1 x i32] undef, align 4 +; POINTER: @llvm.amdgcn.lds.pointer.1 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +@gv.fptr4 = internal local_unnamed_addr externally_initialized global void ()* @function_4, align 8 +@gv.fptr3 = internal local_unnamed_addr externally_initialized global void ()* @function_3, align 8 +@gv.fptr2 = internal local_unnamed_addr externally_initialized global void ()* @function_2, align 8 +@gv.fptr1 = internal local_unnamed_addr externally_initialized global void ()* @function_1, align 8 +@lds_global_1 = internal addrspace(3) global [1 x i32] undef, align 4 + +define internal void @function_4() { +; GCN-LABEL: entry: +; GCN: %0 = load i16, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.1, align 2 +; GCN-NEXT: %1 = inttoptr i16 %0 to [1 x i32] addrspace(3)* +; GCN-NEXT: %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* %1, i32 0, i32 0 +; GCN-NEXT: ret void +entry: + %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_global_1, i32 0, i32 0 + ret void +} + +define internal void @function_3() { +; GCN-LABEL: entry: +; GCN: %fptr4 = load void ()*, void ()** @gv.fptr4, align 8 +; GCN-NEXT: call void %fptr4() +; GCN-NEXT: ret void +entry: + %fptr4 = load void ()*, void ()** @gv.fptr4, align 8 + call void %fptr4() + ret void +} + +define internal void @function_2() { +; GCN-LABEL: entry: +; GCN: %fptr4 = load void ()*, void ()** @gv.fptr4, align 8 +; GCN-NEXT: call void %fptr4() +; GCN-NEXT: ret void +entry: + %fptr4 = load void ()*, void ()** @gv.fptr4, align 8 + call void %fptr4() + ret void +} + +define internal void @function_1() { +; GCN-LABEL: entry: +; GCN: %fptr2 = load void ()*, void ()** @gv.fptr2, align 8 +; GCN-NEXT: %fptr3 = load void ()*, void ()** @gv.fptr3, align 8 +; GCN-NEXT: call void %fptr2() +; GCN-NEXT: call void %fptr3() +; GCN-NEXT: ret void +entry: + %fptr2 = load void ()*, void ()** @gv.fptr2, align 8 + %fptr3 = load void ()*, void ()** @gv.fptr3, align 8 + call void %fptr2() + call void %fptr3() + ret void +} + +define protected amdgpu_kernel void @kernel_1() { +; GCN-LABEL: entry: +; GCN: %0 = ptrtoint [1 x i32] addrspace(3)* @lds_global_1 to i16 +; GCN-NEXT: store i16 %0, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.1, align 2 +; GCN-NEXT: %fptr1 = load void ()*, void ()** @gv.fptr1, align 8 +; GCN-NEXT: call void %fptr1() +; GCN-NEXT: ret void +entry: + %fptr1 = load void ()*, void ()** @gv.fptr1, align 8 + call void %fptr1() + ret void +} + +define protected amdgpu_kernel void @kernel_2() { +; GCN-LABEL: entry: +; GCN: ret void +entry: + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/replace_lds_test_indirect_call_misc.ll b/llvm/test/CodeGen/AMDGPU/replace_lds_test_indirect_call_misc.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/replace_lds_test_indirect_call_misc.ll @@ -0,0 +1,113 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-replace-lds-use-with-pointer -S < %s | FileCheck -check-prefixes=FPTR,LDS,POINTER,GCN %s + +; FPTR: @gv.fptr3 = internal local_unnamed_addr externally_initialized global void ()* 
@function_3, align 8 +; FPTR: @gv.fptr2 = internal local_unnamed_addr externally_initialized global void ()* @function_2, align 8 +; FPTR: @gv.fptr1 = internal local_unnamed_addr externally_initialized global void ()* @function_1, align 8 +; LDS: @lds_global_1 = internal addrspace(3) global [1 x i32] undef, align 4 +; LDS: @lds_global_2 = internal addrspace(3) global [1 x i32] undef, align 4 +; LDS: @lds_global_3 = internal addrspace(3) global [1 x i32] undef, align 4 +; POINTER: @llvm.amdgcn.lds.pointer.1 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; POINTER: @llvm.amdgcn.lds.pointer.2 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; POINTER: @llvm.amdgcn.lds.pointer.3 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +@gv.fptr3 = internal local_unnamed_addr externally_initialized global void ()* @function_3, align 8 +@gv.fptr2 = internal local_unnamed_addr externally_initialized global void ()* @function_2, align 8 +@gv.fptr1 = internal local_unnamed_addr externally_initialized global void ()* @function_1, align 8 +@lds_global_1 = internal addrspace(3) global [1 x i32] undef, align 4 +@lds_global_2 = internal addrspace(3) global [1 x i32] undef, align 4 +@lds_global_3 = internal addrspace(3) global [1 x i32] undef, align 4 + +define internal void @function_3() { +; GCN-LABEL: entry: +; GCN: %0 = load i16, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %1 = inttoptr i16 %0 to [1 x i32] addrspace(3)* +; GCN-NEXT: %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* %1, i32 0, i32 0 +; GCN-NEXT: ret void +entry: + %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_global_3, i32 0, i32 0 + ret void +} + +define internal void @function_2() { +; GCN-LABEL: entry: +; GCN: %0 = load i16, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %1 = inttoptr i16 %0 to [1 x i32] addrspace(3)* +; GCN-NEXT: %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* %1, i32 0, i32 0 +; GCN-NEXT: ret void +entry: + %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_global_2, i32 0, i32 0 + ret void +} + +define internal void @function_1() { +; GCN-LABEL: entry: +; GCN: %0 = load i16, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %1 = inttoptr i16 %0 to [1 x i32] addrspace(3)* +; GCN-NEXT: %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* %1, i32 0, i32 0 +; GCN-NEXT: ret void +entry: + %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_global_1, i32 0, i32 0 + ret void +} + +define protected amdgpu_kernel void @kernel_3() { +; GCN-LABEL: entry: +; GCN: %0 = ptrtoint [1 x i32] addrspace(3)* @lds_global_{{[1-3]}} to i16 +; GCN-NEXT: store i16 %0, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %1 = ptrtoint [1 x i32] addrspace(3)* @lds_global_{{[1-3]}} to i16 +; GCN-NEXT: store i16 %1, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %2 = ptrtoint [1 x i32] addrspace(3)* @lds_global_{{[1-3]}} to i16 +; GCN-NEXT: store i16 %2, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %fptr3 = load void ()*, void ()** @gv.fptr3, align 8 +; GCN-NEXT: %fptr1 = load void ()*, void ()** @gv.fptr1, align 8 +; GCN-NEXT: call void %fptr3() +; GCN-NEXT: call void %fptr1() +; GCN-NEXT: ret void +entry: + %fptr3 = load void ()*, void ()** @gv.fptr3, align 8 + %fptr1 = load void ()*, void ()** @gv.fptr1, align 8 + call void 
%fptr3() + call void %fptr1() + ret void +} + +define protected amdgpu_kernel void @kernel_2() { +; GCN-LABEL: entry: +; GCN: %0 = ptrtoint [1 x i32] addrspace(3)* @lds_global_{{[1-3]}} to i16 +; GCN-NEXT: store i16 %0, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %1 = ptrtoint [1 x i32] addrspace(3)* @lds_global_{{[1-3]}} to i16 +; GCN-NEXT: store i16 %1, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %2 = ptrtoint [1 x i32] addrspace(3)* @lds_global_{{[1-3]}} to i16 +; GCN-NEXT: store i16 %2, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %fptr2 = load void ()*, void ()** @gv.fptr2, align 8 +; GCN-NEXT: %fptr3 = load void ()*, void ()** @gv.fptr3, align 8 +; GCN-NEXT: call void %fptr2() +; GCN-NEXT: call void %fptr3() +; GCN-NEXT: ret void +entry: + %fptr2 = load void ()*, void ()** @gv.fptr2, align 8 + %fptr3 = load void ()*, void ()** @gv.fptr3, align 8 + call void %fptr2() + call void %fptr3() + ret void +} + +define protected amdgpu_kernel void @kernel_1() { +; GCN-LABEL: entry: +; GCN: %0 = ptrtoint [1 x i32] addrspace(3)* @lds_global_{{[1-3]}} to i16 +; GCN-NEXT: store i16 %0, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %1 = ptrtoint [1 x i32] addrspace(3)* @lds_global_{{[1-3]}} to i16 +; GCN-NEXT: store i16 %1, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %2 = ptrtoint [1 x i32] addrspace(3)* @lds_global_{{[1-3]}} to i16 +; GCN-NEXT: store i16 %2, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %fptr1 = load void ()*, void ()** @gv.fptr1, align 8 +; GCN-NEXT: %fptr2 = load void ()*, void ()** @gv.fptr2, align 8 +; GCN-NEXT: call void %fptr1() +; GCN-NEXT: call void %fptr2() +; GCN-NEXT: ret void +entry: + %fptr1 = load void ()*, void ()** @gv.fptr1, align 8 + %fptr2 = load void ()*, void ()** @gv.fptr2, align 8 + call void %fptr1() + call void %fptr2() + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/replace_lds_test_indirect_call_misc2.ll b/llvm/test/CodeGen/AMDGPU/replace_lds_test_indirect_call_misc2.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/replace_lds_test_indirect_call_misc2.ll @@ -0,0 +1,128 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-replace-lds-use-with-pointer -S < %s | FileCheck -check-prefixes=LDS,POINTER,GCN %s + +; LDS: @lds_global_1 = internal addrspace(3) global [1 x i32] undef, align 4 +; LDS: @lds_global_2 = internal addrspace(3) global [1 x i32] undef, align 4 +; LDS: @lds_global_3 = internal addrspace(3) global [1 x i32] undef, align 4 +; POINTER: @llvm.amdgcn.lds.pointer.1 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; POINTER: @llvm.amdgcn.lds.pointer.2 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; POINTER: @llvm.amdgcn.lds.pointer.3 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +@lds_global_1 = internal addrspace(3) global [1 x i32] undef, align 4 +@lds_global_2 = internal addrspace(3) global [1 x i32] undef, align 4 +@lds_global_3 = internal addrspace(3) global [1 x i32] undef, align 4 + +define internal void ()* @return_function_3() { +; GCN-LABEL: entry: +; GCN: ret void ()* @function_3 +entry: + ret void ()* @function_3 +} + +define internal void ()* @return_function_2() { +; GCN-LABEL: entry: +; GCN: ret void ()* @function_2 +entry: + ret void ()* @function_2 +} + +define internal void ()* @return_function_1() { +; GCN-LABEL: entry: +; GCN: ret void ()* @function_1 +entry: + 
ret void ()* @function_1 +} + +define internal void @function_3() { +; GCN-LABEL: entry: +; GCN: %0 = load i16, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %1 = inttoptr i16 %0 to [1 x i32] addrspace(3)* +; GCN-NEXT: %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* %1, i32 0, i32 0 +; GCN-NEXT: ret void +entry: + %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_global_3, i32 0, i32 0 + ret void +} + +define internal void @function_2() { +; GCN-LABEL: entry: +; GCN: %0 = load i16, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %1 = inttoptr i16 %0 to [1 x i32] addrspace(3)* +; GCN-NEXT: %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* %1, i32 0, i32 0 +; GCN-NEXT: ret void +entry: + %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_global_2, i32 0, i32 0 + ret void +} + +define internal void @function_1() { +; GCN-LABEL: entry: +; GCN: %0 = load i16, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %1 = inttoptr i16 %0 to [1 x i32] addrspace(3)* +; GCN-NEXT: %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* %1, i32 0, i32 0 +; GCN-NEXT: ret void +entry: + %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_global_1, i32 0, i32 0 + ret void +} + +define protected amdgpu_kernel void @kernel_3() { +; GCN-LABEL: entry: +; GCN: %0 = ptrtoint [1 x i32] addrspace(3)* @lds_global_{{[1-3]}} to i16 +; GCN-NEXT: store i16 %0, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %1 = ptrtoint [1 x i32] addrspace(3)* @lds_global_{{[1-3]}} to i16 +; GCN-NEXT: store i16 %1, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %2 = ptrtoint [1 x i32] addrspace(3)* @lds_global_{{[1-3]}} to i16 +; GCN-NEXT: store i16 %2, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %fptr3 = call void ()* @return_function_3() +; GCN-NEXT: %fptr1 = call void ()* @return_function_1() +; GCN-NEXT: call void %fptr3() +; GCN-NEXT: call void %fptr1() +; GCN-NEXT: ret void +entry: + %fptr3 = call void ()* @return_function_3() + %fptr1 = call void ()* @return_function_1() + call void %fptr3() + call void %fptr1() + ret void +} + +define protected amdgpu_kernel void @kernel_2() { +; GCN-LABEL: entry: +; GCN: %0 = ptrtoint [1 x i32] addrspace(3)* @lds_global_{{[1-3]}} to i16 +; GCN-NEXT: store i16 %0, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %1 = ptrtoint [1 x i32] addrspace(3)* @lds_global_{{[1-3]}} to i16 +; GCN-NEXT: store i16 %1, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %2 = ptrtoint [1 x i32] addrspace(3)* @lds_global_{{[1-3]}} to i16 +; GCN-NEXT: store i16 %2, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %fptr2 = call void ()* @return_function_2() +; GCN-NEXT: %fptr3 = call void ()* @return_function_3() +; GCN-NEXT: call void %fptr2() +; GCN-NEXT: call void %fptr3() +; GCN-NEXT: ret void +entry: + %fptr2 = call void ()* @return_function_2() + %fptr3 = call void ()* @return_function_3() + call void %fptr2() + call void %fptr3() + ret void +} + +define protected amdgpu_kernel void @kernel_1() { +; GCN-LABEL: entry: +; GCN: %0 = ptrtoint [1 x i32] addrspace(3)* @lds_global_{{[1-3]}} to i16 +; GCN-NEXT: store i16 %0, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %1 = ptrtoint [1 x i32] addrspace(3)* @lds_global_{{[1-3]}} to i16 +; 
GCN-NEXT: store i16 %1, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %2 = ptrtoint [1 x i32] addrspace(3)* @lds_global_{{[1-3]}} to i16 +; GCN-NEXT: store i16 %2, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %fptr1 = call void ()* @return_function_1() +; GCN-NEXT: %fptr2 = call void ()* @return_function_2() +; GCN-NEXT: call void %fptr1() +; GCN-NEXT: call void %fptr2() +; GCN-NEXT: ret void +entry: + %fptr1 = call void ()* @return_function_1() + %fptr2 = call void ()* @return_function_2() + call void %fptr1() + call void %fptr2() + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/replace_lds_test_indirect_call_no_addr_taken.ll b/llvm/test/CodeGen/AMDGPU/replace_lds_test_indirect_call_no_addr_taken.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/replace_lds_test_indirect_call_no_addr_taken.ll @@ -0,0 +1,77 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-replace-lds-use-with-pointer -S < %s | FileCheck -check-prefixes=LDS,POINTER,GCN %s + +; LDS: @lds_global_1 = internal addrspace(3) global [1 x i32] undef, align 4 +; LDS: @lds_global_2 = internal addrspace(3) global [1 x i32] undef, align 4 +; LDS: @lds_global_3 = internal addrspace(3) global [1 x i32] undef, align 4 +; POINTER-NOT: @llvm.amdgcn.lds.pointer.1 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; POINTER-NOT: @llvm.amdgcn.lds.pointer.2 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; POINTER-NOT: @llvm.amdgcn.lds.pointer.3 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +@lds_global_1 = internal addrspace(3) global [1 x i32] undef, align 4 +@lds_global_2 = internal addrspace(3) global [1 x i32] undef, align 4 +@lds_global_3 = internal addrspace(3) global [1 x i32] undef, align 4 + +define internal void @function_3() { +; GCN-LABEL: entry: +; GCN: %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_global_3, i32 0, i32 0 +; GCN-NEXT: ret void +entry: + %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_global_3, i32 0, i32 0 + ret void +} + +define internal void @function_2() { +; GCN-LABEL: entry: +; GCN: %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_global_2, i32 0, i32 0 +; GCN-NEXT: ret void +entry: + %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_global_2, i32 0, i32 0 + ret void +} + +define internal void @function_1() { +; GCN-LABEL: entry: +; GCN: %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_global_1, i32 0, i32 0 +; GCN-NEXT: ret void +entry: + %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_global_1, i32 0, i32 0 + ret void +} + +define protected amdgpu_kernel void @kernel_3() { +; GCN-LABEL: entry: +; GCN: %alloca = alloca void ()* +; GCN-NEXT: %fptr = load void ()*, void ()** %alloca +; GCN-NEXT: call void %fptr() +; GCN-NEXT: ret void +entry: + %alloca = alloca void ()* + %fptr = load void ()*, void ()** %alloca + call void %fptr() + ret void +} + +define protected amdgpu_kernel void @kernel_2() { +; GCN-LABEL: entry: +; GCN: %alloca = alloca void ()* +; GCN-NEXT: %fptr = load void ()*, void ()** %alloca +; GCN-NEXT: call void %fptr() +; GCN-NEXT: ret void +entry: + %alloca = alloca void ()* + %fptr = load void ()*, void ()** %alloca + call void %fptr() + ret void +} + +define protected amdgpu_kernel void @kernel_1() { +; GCN-LABEL: entry: +; GCN: %alloca = alloca void ()* +; GCN-NEXT: %fptr = load void ()*, void ()** %alloca +; GCN-NEXT: call 
void %fptr() +; GCN-NEXT: ret void +entry: + %alloca = alloca void ()* + %fptr = load void ()*, void ()** %alloca + call void %fptr() + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/replace_lds_test_indirect_call_no_init.ll b/llvm/test/CodeGen/AMDGPU/replace_lds_test_indirect_call_no_init.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/replace_lds_test_indirect_call_no_init.ll @@ -0,0 +1,69 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-replace-lds-use-with-pointer -S < %s | FileCheck -check-prefixes=FPTR,LDS,POINTER,GCN %s + +; FPTR: @gv.fptr3 = internal local_unnamed_addr externally_initialized global void ()* @function_3, align 8 +; FPTR: @gv.fptr2 = internal local_unnamed_addr externally_initialized global void ()* @function_2, align 8 +; FPTR: @gv.fptr1 = internal local_unnamed_addr externally_initialized global void ()* @function_1, align 8 +; LDS: @lds_global_1 = internal addrspace(3) global [1 x i32] undef, align 4 +; LDS: @lds_global_2 = internal addrspace(3) global [1 x i32] undef, align 4 +; LDS: @lds_global_3 = internal addrspace(3) global [1 x i32] undef, align 4 +; POINTER: @llvm.amdgcn.lds.pointer.1 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; POINTER: @llvm.amdgcn.lds.pointer.2 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; POINTER: @llvm.amdgcn.lds.pointer.3 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +@gv.fptr3 = internal local_unnamed_addr externally_initialized global void ()* @function_3, align 8 +@gv.fptr2 = internal local_unnamed_addr externally_initialized global void ()* @function_2, align 8 +@gv.fptr1 = internal local_unnamed_addr externally_initialized global void ()* @function_1, align 8 +@lds_global_1 = internal addrspace(3) global [1 x i32] undef, align 4 +@lds_global_2 = internal addrspace(3) global [1 x i32] undef, align 4 +@lds_global_3 = internal addrspace(3) global [1 x i32] undef, align 4 + +define internal void @function_3() { +; GCN-LABEL: entry: +; GCN: %0 = load i16, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %1 = inttoptr i16 %0 to [1 x i32] addrspace(3)* +; GCN-NEXT: %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* %1, i32 0, i32 0 +; GCN-NEXT: ret void +entry: + %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_global_3, i32 0, i32 0 + ret void +} + +define internal void @function_2() { +; GCN-LABEL: entry: +; GCN: %0 = load i16, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %1 = inttoptr i16 %0 to [1 x i32] addrspace(3)* +; GCN-NEXT: %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* %1, i32 0, i32 0 +; GCN-NEXT: ret void +entry: + %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_global_2, i32 0, i32 0 + ret void +} + +define internal void @function_1() { +; GCN-LABEL: entry: +; GCN: %0 = load i16, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %1 = inttoptr i16 %0 to [1 x i32] addrspace(3)* +; GCN-NEXT: %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* %1, i32 0, i32 0 +; GCN-NEXT: ret void +entry: + %gep = getelementptr inbounds [1 x i32], [1 x i32] addrspace(3)* @lds_global_1, i32 0, i32 0 + ret void +} + +define protected amdgpu_kernel void @kernel_1() { +; GCN-LABEL: entry: +; GCN: %0 = ptrtoint [1 x i32] addrspace(3)* @lds_global_{{[1-3]}} to i16 +; GCN-NEXT: store i16 %0, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %1 = ptrtoint [1 x i32] 
addrspace(3)* @lds_global_{{[1-3]}} to i16 +; GCN-NEXT: store i16 %1, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %2 = ptrtoint [1 x i32] addrspace(3)* @lds_global_{{[1-3]}} to i16 +; GCN-NEXT: store i16 %2, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %alloca = alloca void ()* +; GCN-NEXT: %fptr = load void ()*, void ()** %alloca +; GCN-NEXT: call void %fptr() +; GCN-NEXT: ret void +entry: + %alloca = alloca void ()* + %fptr = load void ()*, void ()** %alloca + call void %fptr() + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/replace_lds_test_llvm_insts.ll b/llvm/test/CodeGen/AMDGPU/replace_lds_test_llvm_insts.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/replace_lds_test_llvm_insts.ll @@ -0,0 +1,32 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-replace-lds-use-with-pointer -S < %s | FileCheck -check-prefixes=LDS,POINTER,GCN %s + +; LDS: @smem_ptr = hidden addrspace(3) global i32* undef, align 8 +; POINTER: @llvm.amdgcn.lds.pointer.1 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +@smem_ptr = hidden addrspace(3) global i32* undef, align 8 + +define internal void @function_1() { +; GCN-LABEL: entry: +; GCN: %0 = load i16, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.1, align 2 +; GCN-NEXT: %1 = inttoptr i16 %0 to i32* addrspace(3)* +; GCN-NEXT: %2 = addrspacecast i32* addrspace(3)* %1 to i32** +; GCN-NEXT: %ptr = load i32*, i32** %2, align 8 +; GCN-NEXT: %res1 = atomicrmw add i32* %ptr, i32 8 acquire, align 4 +; GCN-NEXT: %res2 = cmpxchg i32* %ptr, i32 8, i32 16 acq_rel monotonic, align 4 +; GCN-NEXT: ret void +entry: + %ptr = load i32*, i32** addrspacecast (i32* addrspace(3)* @smem_ptr to i32**), align 8 + %res1 = atomicrmw add i32* %ptr, i32 8 acquire + %res2 = cmpxchg i32* %ptr, i32 8, i32 16 acq_rel monotonic + ret void +} + +define protected amdgpu_kernel void @kernel_1() { +; GCN-LABEL: entry: +; GCN: %0 = ptrtoint i32* addrspace(3)* @smem_ptr to i16 +; GCN-NEXT: store i16 %0, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.1, align 2 +; GCN-NEXT: call void @function_1() +; GCN-NEXT: ret void +entry: + call void @function_1() + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/replace_lds_test_types_misc.ll b/llvm/test/CodeGen/AMDGPU/replace_lds_test_types_misc.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/replace_lds_test_types_misc.ll @@ -0,0 +1,39 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-replace-lds-use-with-pointer -S < %s | FileCheck -check-prefixes=LDS,POINTER,GCN %s + +; LDS: @lds_global_1 = internal addrspace(3) global [65 x i32] undef, align 16 +; LDS: @lds_global_2 = internal addrspace(3) global [65 x i16] undef, align 16 +; POINTER: @llvm.amdgcn.lds.pointer.1 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; POINTER: @llvm.amdgcn.lds.pointer.2 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +@lds_global_1 = internal addrspace(3) global [65 x i32] undef, align 16 +@lds_global_2 = internal addrspace(3) global [65 x i16] undef, align 16 + +define internal void @function_2() { +; GCN-LABEL: entry: +; GCN: %0 = load i16, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %1 = inttoptr i16 %0 to [65 x i16] addrspace(3)* +; GCN-NEXT: %gep = getelementptr inbounds [65 x i16], [65 x i16] addrspace(3)* %1, i32 0, i32 0 +; GCN-NEXT: ret void +entry: + %gep = getelementptr inbounds [65 x i16], [65 x i16] addrspace(3)* @lds_global_2, i32 0, i32 0 + ret void +} + +define 
internal void @function_1() { +; GCN-LABEL: entry: +; GCN: %0 = load i16, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN-NEXT: %1 = inttoptr i16 %0 to [65 x i32] addrspace(3)* +; GCN-NEXT: %gep = getelementptr inbounds [65 x i32], [65 x i32] addrspace(3)* %1, i32 0, i32 0 +; GCN-NEXT: ret void +entry: + %gep = getelementptr inbounds [65 x i32], [65 x i32] addrspace(3)* @lds_global_1, i32 0, i32 0 + ret void +} + +; GCN: store i16 %{{[0-9]+}}, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN: store i16 %{{[0-9]+}}, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +define protected amdgpu_kernel void @kernel_1() { +entry: + call void @function_1() + call void @function_2() + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/replace_lds_test_types_pointers.ll b/llvm/test/CodeGen/AMDGPU/replace_lds_test_types_pointers.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/replace_lds_test_types_pointers.ll @@ -0,0 +1,55 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-replace-lds-use-with-pointer -S < %s | FileCheck -check-prefixes=LDS,POINTER,GCN %s + +; LDS: @smem_ptr = hidden addrspace(3) global i32* undef, align 8 +; POINTER: @llvm.amdgcn.lds.pointer.1 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +@smem_ptr = hidden addrspace(3) global i32* undef, align 8 + +define internal void @function_2() { +; GCN-LABEL: entry: +; GCN: %0 = load i16, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.1, align 2 +; GCN-NEXT: %1 = inttoptr i16 %0 to i32* addrspace(3)* +; GCN-NEXT: %2 = addrspacecast i32* addrspace(3)* %1 to i32** +; GCN-NEXT: %3 = load i32*, i32** %2, align 8 +; GCN-NEXT: %4 = addrspacecast i32* addrspace(3)* %1 to i32** +; GCN-NEXT: store i32* %3, i32** %4, align 8 +; GCN-NEXT: ret void +entry: + %0 = load i32*, i32** addrspacecast (i32* addrspace(3)* @smem_ptr to i32**), align 8 + store i32* %0, i32** addrspacecast (i32* addrspace(3)* @smem_ptr to i32**), align 8 + ret void +} + +define internal void @function_1() { +; GCN-LABEL: entry: +; GCN: %0 = load i16, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.1, align 2 +; GCN-NEXT: %1 = inttoptr i16 %0 to i32* addrspace(3)* +; GCN-NEXT: %2 = addrspacecast i32* addrspace(3)* %1 to i32** +; GCN-NEXT: %3 = load i32*, i32** %2, align 8 +; GCN-NEXT: %4 = addrspacecast i32* addrspace(3)* %1 to i32** +; GCN-NEXT: store i32* %3, i32** %4, align 8 +; GCN-NEXT: ret void +entry: + %0 = load i32*, i32** addrspacecast (i32* addrspace(3)* @smem_ptr to i32**), align 8 + store i32* %0, i32** addrspacecast (i32* addrspace(3)* @smem_ptr to i32**), align 8 + ret void +} + +define protected amdgpu_kernel void @kernel_1() { +; GCN-LABEL: entry: +; GCN: %0 = ptrtoint i32* addrspace(3)* @smem_ptr to i16 +; GCN-NEXT: store i16 %0, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.1, align 2 +; GCN-NEXT: call void @function_1() +; GCN-NEXT: call void @function_2() +; GCN-NEXT: ret void +entry: + call void @function_1() + call void @function_2() + ret void +} + +define protected amdgpu_kernel void @kernel_2() { +; GCN-LABEL: entry: +; GCN: ret void +entry: + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/replace_lds_test_types_pointers_misc.ll b/llvm/test/CodeGen/AMDGPU/replace_lds_test_types_pointers_misc.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/replace_lds_test_types_pointers_misc.ll @@ -0,0 +1,65 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-replace-lds-use-with-pointer -S < %s | FileCheck -check-prefixes=LDS,POINTER,GCN %s + +; 
LDS: @smem = hidden addrspace(3) global i32 undef, align 4 +; LDS: @smem_ptr = hidden addrspace(3) global i32* undef, align 8 +; LDS: @smem_ptr_ptr = hidden local_unnamed_addr addrspace(3) global i32** undef, align 8 +; LDS: @smem_arr = hidden addrspace(3) global [1 x i32] undef, align 4 +; LDS: @smem_ptr2 = hidden local_unnamed_addr addrspace(3) global i32* undef, align 8 +; POINTER: @llvm.amdgcn.lds.pointer.1 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; POINTER: @llvm.amdgcn.lds.pointer.2 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; POINTER: @llvm.amdgcn.lds.pointer.3 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; POINTER: @llvm.amdgcn.lds.pointer.4 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +; POINTER: @llvm.amdgcn.lds.pointer.5 = internal unnamed_addr addrspace(3) global i16 undef, align 2 +@smem = hidden addrspace(3) global i32 undef, align 4 +@smem_ptr = hidden addrspace(3) global i32* undef, align 8 +@smem_ptr_ptr = hidden local_unnamed_addr addrspace(3) global i32** undef, align 8 +@smem_arr = hidden addrspace(3) global [1 x i32] undef, align 4 +@smem_ptr2 = hidden local_unnamed_addr addrspace(3) global i32* undef, align 8 + +; GCN: %{{[0-9]+}} = load i16, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN: %{{[0-9]+}} = load i16, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN: %{{[0-9]+}} = load i16, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +define internal void @function_2() { +entry: + store i32* addrspacecast (i32 addrspace(3)* @smem to i32*), i32* addrspace(3)* @smem_ptr, align 8 + store i32** addrspacecast (i32* addrspace(3)* @smem_ptr to i32**), i32** addrspace(3)* @smem_ptr_ptr, align 8 + %0 = load i32, i32 addrspace(3)* @smem, align 4 + ret void +} + +; GCN: %{{[0-9]+}} = load i16, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN: %{{[0-9]+}} = load i16, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN: %{{[0-9]+}} = load i16, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +define internal void @function_1() { +entry: + store i32* addrspacecast (i32 addrspace(3)* @smem to i32*), i32* addrspace(3)* @smem_ptr, align 8 + store i32** addrspacecast (i32* addrspace(3)* @smem_ptr to i32**), i32** addrspace(3)* @smem_ptr_ptr, align 8 + %0 = load i32, i32 addrspace(3)* @smem, align 4 + ret void +} + +; GCN: store i16 %{{[0-9]+}}, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN: store i16 %{{[0-9]+}}, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN: store i16 %{{[0-9]+}}, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +define protected amdgpu_kernel void @kernel_1() { +entry: + call void @function_1() + call void @function_2() + ret void +} + +; GCN: %{{[0-9]+}} = load i16, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN: %{{[0-9]+}} = load i16, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +define internal void @function_3() { +entry: + store i32* addrspacecast (i32 addrspace(3)* getelementptr inbounds ([1 x i32], [1 x i32] addrspace(3)* @smem_arr, i32 0, i32 0) to i32*), i32* addrspace(3)* @smem_ptr2, align 8 + ret void +} + +; GCN: store i16 %{{[0-9]+}}, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +; GCN: store i16 %{{[0-9]+}}, i16 addrspace(3)* @llvm.amdgcn.lds.pointer.{{[0-9]+}}, align 2 +define protected amdgpu_kernel void @kernel_2() { +entry: + call void 
@function_3() + ret void +}