diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -271,6 +271,10 @@
 void initializeGCNNSAReassignPass(PassRegistry &);
 extern char &GCNNSAReassignID;
 
+ModulePass *createAMDGPUDeviceScopeSharedVariablePass();
+void initializeAMDGPUDeviceScopeSharedVariablePass(PassRegistry &);
+extern char &AMDGPUDeviceScopeSharedVariableID;
+
 namespace AMDGPU {
 enum TargetIndex {
   TI_CONSTDATA_START,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
@@ -123,13 +123,15 @@
   // should only appear when IPO passes manages to move LDs defined in a kernel
   // into a single user function.
 
-  for (GlobalVariable &GV : M.globals()) {
-    // TODO: Region address
-    unsigned AS = GV.getAddressSpace();
-    if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS)
-      continue;
-
-    recursivelyVisitUsers(GV, FuncsToAlwaysInline);
+  if (!AMDGPUTargetMachine::EnableDeviceScopeSharedVariable) {
+    for (GlobalVariable &GV : M.globals()) {
+      // TODO: Region address
+      unsigned AS = GV.getAddressSpace();
+      if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS)
+        continue;
+
+      recursivelyVisitUsers(GV, FuncsToAlwaysInline);
+    }
   }
 
   if (!AMDGPUTargetMachine::EnableFunctionCalls || StressCalls) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUDeviceScopeSharedVariable.cpp b/llvm/lib/Target/AMDGPU/AMDGPUDeviceScopeSharedVariable.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUDeviceScopeSharedVariable.cpp
@@ -0,0 +1,1230 @@
+//===-- AMDGPUDeviceScopeSharedVariable.cpp -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// [1]. What is this module pass about?
+//
+// This module pass implements an "indirect" method of supporting "shared"
+// variables that are defined within "device" functions. Practically speaking,
+// it is a workaround for supporting device scope shared variables, because we
+// could not find any other "neat" way of supporting them, for the following
+// main reason:
+//
+// A shared variable is a "block" scoped variable, but its lifetime is that of
+// the "work-group" to which it belongs. Supporting it directly would require
+// keeping track of multiple copies of shared variables belonging to different
+// work-groups from different running kernels, which is too costly and too
+// complex to implement given the "scarcity" of shared memory and the unusual
+// properties of shared variables.
+//
+// [2]. What does this pass do?
+//
+// Though the programmer defines device scope shared variables within device
+// functions, this module pass:
+//
+// A. internally moves all those device scope shared variables into the
+//    associated kernel(s),
+// B. inserts new arguments along the call graph paths from the kernel(s) to
+//    the device function(s), and
+// C. implements the necessary program transformations.
+//
+// [3]. How does the implementation of the pass look in brief?
+//
+// At a very high level, the implementation of this pass can be described as
+// follows:
+//
+// A. For every kernel, traverse its call graph and collect the
+//    direct shared variables which are defined within the kernel and the
+//    indirect shared variables which are defined within the device functions
+//    appearing in the kernel's call graph.
+// B. Create a single big shared memory layout within the kernel by combining
+//    all the direct and indirect shared variables collected above.
+// C. Map each direct and indirect shared variable to its "offset" in this
+//    big shared memory layout.
+// D. Pass the "offsets" of the (indirect) shared variables as new function
+//    arguments along the call graph paths from the kernel to the device
+//    functions within which the original shared variables are defined.
+// E. Replace all references to the original shared variables with their
+//    offset-based counterparts.
+// F. Finally, remove all the original shared variables.
+//
+// [4]. What are the positive consequences of this pass on the applications?
+//
+// Does one really exist?
+//
+// [5]. What are the negative consequences of this pass on the applications?
+//
+// A. Since we add new arguments to the functions along the call graph paths
+//    from kernels to device functions, there is a possibility of increased
+//    register pressure, which may affect performance.
+// B. Since we create a single big shared memory layout within each kernel, we
+//    end up duplicating the shared memory of indirect shared variables across
+//    kernels; however, this is still less costly than direct support.
+// C. This implementation is admittedly a very careful hack, and hence any bug
+//    in the implementation may have an adverse effect on the running
+//    application.
+//
+// [6]. What are some important current limitations of the pass?
+//
+// A. At present, shared variables of integer and floating-point types, and
+//    shared variable arrays of integer and floating-point types, are
+//    supported; other aggregate types like structs are not yet supported.
+// B. The implementation assumes that there are no recursive calls to device
+//    functions (either direct or indirect), and hence the call graphs
+//    associated with kernels are acyclic.
+// C. All TODOs need to be revisited sooner rather than later.
+//
+// [7]. An example.
+//
+// Before the pass:
+//
+// __device__ void foo() {
+//   __shared__ char smc[10];
+//   __shared__ int smi[10];
+//   __shared__ float smf[10];
+// }
+// __global__ void kernel() {
+//   foo();
+// }
+//
+// After the pass:
+//
+// __device__ void foo(char *smc, int *smi, float *smf) {
+// }
+// __global__ void kernel() {
+//   __shared__ char sm[90]; // assuming char occupies 1 byte, int occupies
+//                           // 4 bytes, and float occupies 4 bytes.
+//   foo((char*)sm, (int*)(sm + 10), (float*)(sm + 50));
+// }
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ValueMap.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include <map>
+#include <set>
+#include <utility>
+
+#define DEBUG_TYPE "amdgpu-device-scope-shared-variable"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUDeviceScopeSharedVariable : public ModulePass {
+public:
+  static char ID;
+
+  AMDGPUDeviceScopeSharedVariable() : ModulePass(ID) {
+    initializeAMDGPUDeviceScopeSharedVariablePass(
+        *PassRegistry::getPassRegistry());
+  }
+
+  bool runOnModule(Module &M) override;
+};
+
+} // namespace
+
+char AMDGPUDeviceScopeSharedVariable::ID = 0;
+
+char &llvm::AMDGPUDeviceScopeSharedVariableID =
+    AMDGPUDeviceScopeSharedVariable::ID;
+
+ModulePass *llvm::createAMDGPUDeviceScopeSharedVariablePass() {
+  return new AMDGPUDeviceScopeSharedVariable();
+}
+
+INITIALIZE_PASS(AMDGPUDeviceScopeSharedVariable,
+                "implement-amdgpu-device-scope-shared-variable",
+                "Implement AMDGPU Device Scope Shared Variable",
+                false /*only look at the cfg*/, false /*analysis pass*/)
+
+static void updateKernelToCallieList(
+    ValueMap<Function *, std::set<Function *>> &KernelToCallie,
+    ValueMap<Function *, Function *> &OldCallieToNewCallie) {
+  for (auto KI = KernelToCallie.begin(), KE = KernelToCallie.end(); KI != KE;
+       ++KI) {
+    auto *K = KI->first;
+    auto OldCallieList = KI->second;
+    std::set<Function *> NewCallieList;
+    for (auto *OldCallie : OldCallieList) {
+      if (OldCallieToNewCallie.find(OldCallie) != OldCallieToNewCallie.end())
+        NewCallieList.insert(OldCallieToNewCallie[OldCallie]);
+      else
+        NewCallieList.insert(OldCallie);
+    }
+    KernelToCallie[K] = NewCallieList;
+  }
+}
+
+static void
+updateLDSToFunctionMap(ValueMap<GlobalVariable *, Function *> &LDSToFunction,
+                       ValueMap<Function *, Function *> &OldCallieToNewCallie) {
+  for (auto LI = LDSToFunction.begin(), LE = LDSToFunction.end(); LI != LE;
+       ++LI) {
+    auto *LDS = LI->first;
+    auto *OldF = LI->second;
+    if (OldCallieToNewCallie.find(OldF) != OldCallieToNewCallie.end())
+      LDSToFunction[LDS] = OldCallieToNewCallie[OldF];
+  }
+}
+
+static void createFunctionToLDSMap(
+    ValueMap<GlobalVariable *, Function *> &LDSToFunction,
+    ValueMap<Function *, SetVector<GlobalVariable *>> &FunctionToLDS) {
+  for (auto LI = LDSToFunction.begin(), LE = LDSToFunction.end(); LI != LE;
+       ++LI) {
+    auto *LDSGlobal = LI->first;
+    auto *F = LI->second;
+    auto FI = FunctionToLDS.find(F);
+    if (FI == FunctionToLDS.end()) {
+      SetVector<GlobalVariable *> LDSSet;
+      LDSSet.insert(LDSGlobal);
+      FunctionToLDS[F] = LDSSet;
+    } else
+      FunctionToLDS[F].insert(LDSGlobal);
+  }
+}
+
+static void pairUpKernelWithLDSList(
+    Function *K, ValueMap<Function *, std::set<Function *>> &KernelToCallie,
+    ValueMap<Function *, SetVector<GlobalVariable *>> &FunctionToLDS,
+    ValueMap<Function *, SetVector<GlobalVariable *>> &KernelToDirectLDS,
+    ValueMap<Function *, SetVector<GlobalVariable *>> &KernelToIndirectLDS) {
+  // If direct LDS globals exist within the kernel, collect them.
+  if (FunctionToLDS.find(K) != FunctionToLDS.end())
+    KernelToDirectLDS[K] = FunctionToLDS[K];
+
+  // Collect all the indirect LDS globals defined within the callie(s) of the
+  // kernel.
+  SetVector<GlobalVariable *> IndirectLDSSet;
+  auto Callies = KernelToCallie[K];
+  for (auto *Callie : Callies) {
+    if (FunctionToLDS.find(Callie) == FunctionToLDS.end())
+      continue;
+    SetVector<GlobalVariable *> CallieLDSList = FunctionToLDS[Callie];
+    for (auto *CallieLDS : CallieLDSList)
+      IndirectLDSSet.insert(CallieLDS);
+  }
+  if (!IndirectLDSSet.empty())
+    KernelToIndirectLDS[K] = IndirectLDSSet;
+}
+
+static void
+pairUpKernelWithCallieList(
+    Module &M, Function *K,
+    ValueMap<Function *, SetVector<GlobalVariable *>> &FunctionToLDS,
+    ValueMap<Function *, std::set<Function *>> &KernelToCallie) {
+  // Get the call graph node associated with the current kernel, traverse the
+  // call graph associated with it in DFS manner, and collect all the
+  // associated callies which define LDS global(s).
+  auto CG = CallGraph(M);
+  auto *KernCGNode = CG[K];
+  SmallVector<CallGraphNode *, 8> CGNodeStack;
+  SetVector<CallGraphNode *> Visited;
+
+#ifndef NDEBUG
+  assert(KernCGNode && "Call graph node associated with kernel definition "
+                       "cannot be null\n");
+#endif
+
+  for (auto KI = KernCGNode->begin(), KE = KernCGNode->end(); KI != KE; ++KI) {
+    auto *CGN = KI->second;
+#ifndef NDEBUG
+    assert(CGN && "Call graph node associated with function definition cannot"
+                  " be null\n");
+#endif
+    CGNodeStack.push_back(CGN);
+  }
+
+  std::set<Function *> CallieSet;
+  while (!CGNodeStack.empty()) {
+    auto *CGNode = CGNodeStack.pop_back_val();
+    if (!Visited.insert(CGNode))
+      continue;
+
+    auto *F = CGNode->getFunction();
+    if (!F || F->isDeclaration()) {
+#ifndef NDEBUG
+      assert(CGNode->empty() && "Call graph node associated with function "
+                                "declaration should not have callie list\n");
+#endif
+      continue;
+    }
+
+    if (FunctionToLDS.find(F) != FunctionToLDS.end())
+      CallieSet.insert(F);
+
+    for (auto CI = CGNode->begin(), CE = CGNode->end(); CI != CE; ++CI) {
+      auto *CGN = CI->second;
+#ifndef NDEBUG
+      assert(CGN && "Call graph node associated with function definition "
+                    "cannot be null\n");
+#endif
+      CGNodeStack.push_back(CGN);
+    }
+  }
+
+  KernelToCallie[K] = CallieSet;
+}
+
+static void pairUpLDSGlobalWithItsAssociatedFunction(
+    GlobalVariable *LDSGlobal,
+    ValueMap<GlobalVariable *, Function *> &LDSToFunction) {
+  // Recursively visit the user list of the current LDS global, and find the
+  // enclosing function where the LDS global is defined; the enclosing
+  // function should always be successfully found.
+  //
+  // TODO: Is there any other efficient way to find the associated functions
+  // of LDS globals?
+#ifndef NDEBUG
+  assert(!LDSGlobal->user_empty() &&
+         "LDS Global user list cannot be empty since it must have been defined "
+         "within either kernel or device function");
+#endif
+  SmallVector<User *, 8> UserStack;
+  SetVector<User *> Visited;
+
+  for (auto *U : LDSGlobal->users())
+    UserStack.push_back(U);
+
+  while (!UserStack.empty()) {
+    auto *U = UserStack.pop_back_val();
+    if (!Visited.insert(U))
+      continue;
+
+    if (auto *I = dyn_cast<Instruction>(U)) {
+      auto *F = I->getParent()->getParent();
+      if (F) {
+        LDSToFunction[LDSGlobal] = F;
+        return;
+      }
+      continue;
+    }
+
+    for (auto *UU : U->users())
+      UserStack.push_back(UU);
+  }
+#ifndef NDEBUG
+  assert(false && "Control is not expected to reach this point");
+#endif
+}
+
+static unsigned getFixedSizeOfTypeInBits(Module &M, Type *Ty) {
+  return M.getDataLayout().getTypeSizeInBits(Ty).getFixedSize();
+}
+
+static unsigned getFixedSizeOfTypeInBytes(Module &M, Type *Ty) {
+  return getFixedSizeOfTypeInBits(M, Ty) / 8;
+}
+
+static void
+getLDSGlobalSizeInBytes(Module &M, GlobalVariable *LDSGlobal,
+                        ValueMap<GlobalVariable *, uint64_t> &LDSToSize) {
+  LDSToSize[LDSGlobal] =
+      getFixedSizeOfTypeInBytes(M, LDSGlobal->getValueType());
+}
+
+static void
+eraseOldCallies(ValueMap<Function *, Function *> &OldCallieToNewCallie) {
+  // TODO: Maybe we can come up with a more efficient implementation to erase
+  // old callies from the module. It depends on how many callies we end up
+  // erasing in a real-world HIP application. Maybe not many; hence, as of
+  // now, we employ the simplest method of repeatedly visiting the old callies
+  // and removing each of them once its number of uses becomes 0.
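+  //
+  // For instance, if an old callie `f` calls another old callie `g`, then `g`
+  // still has a use (the call inside `f`) until `f` itself is erased, so a
+  // single sweep over the map is not enough; we keep sweeping until one full
+  // pass erases nothing.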
+  bool Loopover = true;
+  while (Loopover) {
+    Loopover = false;
+    for (auto OI = OldCallieToNewCallie.begin(),
+              OE = OldCallieToNewCallie.end();
+         OI != OE; ++OI) {
+      auto *OldCallie = OI->first;
+      if (OldCallie->getNumUses())
+        continue;
+      OldCallieToNewCallie.erase(OI);
+      OldCallie->eraseFromParent();
+      Loopover = true;
+      // Erasing invalidates the iterator; restart the sweep from the top.
+      break;
+    }
+  }
+}
+
+// TODO: This really looks like a horrible hack to me, but is there any better
+// way of handling `ConstantExpr`s? I have no idea at the moment; need to
+// revisit it later.
+static Instruction *replaceConstExprByInst(ConstantExpr *CE) {
+  for (auto *U : CE->users()) {
+    auto *I = dyn_cast<Instruction>(U);
+
+    if (!I)
+      if (auto *NestedCE = dyn_cast<ConstantExpr>(U))
+        I = replaceConstExprByInst(NestedCE);
+
+    if (I) {
+      auto *NI = CE->getAsInstruction();
+      NI->insertBefore(I);
+      unsigned Ind = 0;
+      for (Use &UU : I->operands()) {
+        Value *V = UU.get();
+        if (V == CE) {
+          I->setOperand(Ind, NI);
+          break;
+        }
+        ++Ind;
+      }
+      return NI;
+    }
+  }
+
+  return nullptr;
+}
+
+static void replaceInstWhichUsesLDS(Module &M, GlobalVariable *LDS,
+                                    Value *BasePtr, Function *F, Instruction *I,
+                                    SetVector<Instruction *> &ToBeErasedInsts) {
+  // Assert that the function associated with `I` is nothing but the one
+  // where the LDS global is actually defined.
+#ifndef NDEBUG
+  assert(I->getParent()->getParent() == F &&
+         "The reference to LDS should only exist within the function where it "
+         "is actually defined\n");
+#endif
+
+  // Suffix the names of the instructions with unique integer values.
+  static uint64_t Suffix = 0;
+  ++Suffix;
+
+  // The new instruction which replaces `I`.
+  Instruction *NewI = nullptr;
+
+  switch (I->getOpcode()) {
+  case Instruction::GetElementPtr: {
+    auto *GEPInst = dyn_cast<GetElementPtrInst>(I);
+
+    // Get the number of indices. We expect at least two, including the
+    // leading index which addresses the original pointer operand.
+    auto NumIndices = GEPInst->getNumIndices();
+#ifndef NDEBUG
+    assert(NumIndices >= 2 && "Expected two or more GEP indices\n");
+#endif
+
+    // New index operand to be inserted into the new GEP instruction. The new
+    // GEP instruction accesses the memory at `BasePtr + Offset`.
+    Value *Offset = nullptr;
+    if (NumIndices > 2) {
+      // `GEPInst` is a nested pointer access into a higher-dimensional
+      // aggregate type; we need to flatten it for the new GEP instruction to
+      // be inserted.
+      //
+      // TODO: At present we only handle the `Array` aggregate type.
+      SmallVector<Value *, 8> MulInstList;
+      SmallVector<uint64_t, 8> IdxList;
+      auto *SETy = GEPInst->getSourceElementType();
+
+      // The following loop iterates at least twice.
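+      //
+      // For example (a sketch mirroring the 2-D array test in this patch):
+      // for `getelementptr [256 x [4 x i32]], ..., i32 0, i32 %i, i32 2`, the
+      // innermost dimension holds 4 elements, so the flattened offset into
+      // the new one-dimensional base pointer is computed as `%i * 4 + 2`.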
+      IdxList.push_back(1);
+      for (unsigned i = 2; i <= NumIndices; ++i) {
+        auto *IdxOffset = GEPInst->getOperand(i);
+
+        IdxList.push_back(i);
+        auto *IdxTy = GetElementPtrInst::getIndexedType(SETy, IdxList);
+        uint64_t NumIdxTyElements = 0;
+        if (auto *ArrTy = dyn_cast<ArrayType>(IdxTy))
+          NumIdxTyElements = ArrTy->getNumElements();
+
+        Value *Mul = nullptr;
+        if (NumIdxTyElements) {
+          APInt NumIdxTyElementsConst = {
+              getFixedSizeOfTypeInBits(M, IdxOffset->getType()),
+              NumIdxTyElements, false};
+          Mul = BinaryOperator::Create(
+              Instruction::Mul, IdxOffset,
+              ConstantInt::get(IdxOffset->getType(), NumIdxTyElementsConst),
+              Twine(""), I);
+        } else
+          Mul = IdxOffset;
+
+        MulInstList.push_back(Mul);
+      }
+
+      auto MI = MulInstList.begin();
+      auto ME = MulInstList.end();
+      Value *Add = BinaryOperator::Create(Instruction::Add, *MI, *(MI + 1),
+                                          Twine(""), I);
+      for (MI = MI + 2; MI != ME; ++MI)
+        Add = BinaryOperator::Create(Instruction::Add, Add, *MI, Twine(""), I);
+
+      Offset = Add;
+    } else
+      Offset = GEPInst->getOperand(NumIndices);
+
+    // New GEP instruction which replaces `GEPInst`.
+    NewI = GetElementPtrInst::CreateInBounds(
+        GEPInst->getResultElementType(), BasePtr, Offset,
+        Twine(BasePtr->getName()) + Twine(".ptr.arith.") + Twine(Suffix), I);
+    break;
+  }
+  case Instruction::Load: {
+    auto *LInst = dyn_cast<LoadInst>(I);
+
+    // New LOAD instruction which replaces `LInst`.
+    NewI = new LoadInst(
+        LInst->getType(), BasePtr,
+        Twine(BasePtr->getName()) + Twine(".load.") + Twine(Suffix), I);
+    break;
+  }
+  case Instruction::Store: {
+    auto *SInst = dyn_cast<StoreInst>(I);
+
+    // New STORE instruction which replaces `SInst`.
+    NewI = new StoreInst(SInst->getValueOperand(), BasePtr, I);
+    break;
+  }
+  case Instruction::PtrToInt: {
+    auto *PToIInst = dyn_cast<PtrToIntInst>(I);
+
+    // New pointer-to-int cast instruction which replaces `PToIInst`.
+    NewI = new PtrToIntInst(
+        BasePtr, PToIInst->getType(),
+        Twine(BasePtr->getName()) + Twine(".ptoi.") + Twine(Suffix), I);
+    break;
+  }
+  case Instruction::BitCast: {
+    auto *BCInst = dyn_cast<BitCastInst>(I);
+
+    // New bitcast instruction which replaces `BCInst`.
+    NewI = new BitCastInst(
+        BasePtr, BCInst->getDestTy(),
+        Twine(BasePtr->getName()) + Twine(".bitcast.") + Twine(Suffix), I);
+    break;
+  }
+  case Instruction::PHI: {
+    auto *PhiInst = dyn_cast<PHINode>(I);
+
+    // New PHI value to be replaced with.
+    Value *NewPhiValue = BasePtr;
+    if (PhiInst->getType() != BasePtr->getType()) {
+      Instruction *InsertBefore = nullptr;
+      if (auto *II = dyn_cast<Instruction>(BasePtr))
+        InsertBefore = II->getNextNode();
+      else
+        InsertBefore = &*F->getEntryBlock().getFirstInsertionPt();
+
+      NewPhiValue = new BitCastInst(BasePtr, PhiInst->getType(),
+                                    Twine(BasePtr->getName()) +
+                                        Twine(".bitcast.") + Twine(Suffix),
+                                    InsertBefore);
+    }
+
+    // New PHI instruction which replaces `PhiInst`.
+    auto *NewPhiInst = PHINode::Create(
+        PhiInst->getType(), 0,
+        Twine(BasePtr->getName()) + Twine(".phi.") + Twine(Suffix), I);
+
+    // Add PHI values to the new PHI instruction.
+    for (unsigned i = 0; i < PhiInst->getNumIncomingValues(); ++i) {
+      auto *V = PhiInst->getIncomingValue(i);
+      auto *IBB = PhiInst->getIncomingBlock(i);
+      if (V == LDS)
+        NewPhiInst->addIncoming(NewPhiValue, IBB);
+      else
+        NewPhiInst->addIncoming(V, IBB);
+    }
+
+    NewI = NewPhiInst;
+    break;
+  }
+  default:
+    llvm_unreachable("Not Implemented."); // TODO: What else is missing?
+  }
+
+  // Replace `I` with `NewI`, and mark `I` as a to-be-erased instruction.
+#ifndef NDEBUG
+  assert(NewI && "Valid instruction expected");
+#endif
+  NewI->copyMetadata(*I);
+  I->replaceAllUsesWith(NewI);
+  ToBeErasedInsts.insert(I);
+}
+
+static void updateFunctionAssociatedWithLDS(Module &M, GlobalVariable *LDS,
+                                            Value *BasePtr, Function *F) {
+  // Keep track of all the to-be-erased instructions.
+  SetVector<Instruction *> ToBeErasedInsts;
+
+  // Traverse through each use of `LDS`, create a new value for each use case,
+  // and replace the old one with it accordingly.
+  for (auto *U : LDS->users()) {
+    // `U` may be using `LDS`, but if `U` itself is not used anywhere, ignore
+    // `U`.
+    if (!U->getNumUses())
+      continue;
+
+    // Cast away const-ness from `U`.
+    User *UU = const_cast<User *>(U);
+
+    if (auto *I = dyn_cast<Instruction>(UU)) {
+      replaceInstWhichUsesLDS(M, LDS, BasePtr, F, I, ToBeErasedInsts);
+    } else if (auto *CE = dyn_cast<ConstantExpr>(UU)) {
+      // TODO: Performance issues because of converting `ConstantExpr` to
+      // `Instruction`?
+      auto *I = replaceConstExprByInst(CE);
+      replaceInstWhichUsesLDS(M, LDS, BasePtr, F, I, ToBeErasedInsts);
+      CE->removeDeadConstantUsers();
+    } else
+      llvm_unreachable("Not Implemented."); // TODO: What else is missing?
+  }
+
+  // Erase all the instructions which got replaced by new ones.
+  for (auto *I : ToBeErasedInsts)
+    I->eraseFromParent();
+}
+
+static void getNewArgumentList(
+    GlobalVariable *LDS,
+    std::map<Function *, std::map<GlobalVariable *, Instruction *>>
+        &KernelToIndirectBasePtrInst,
+    Function *Caller, CallInst *CI, SmallVectorImpl<Value *> &NewArgs) {
+  Value *NewArg = nullptr;
+
+  if (Caller->getCallingConv() == CallingConv::AMDGPU_KERNEL) {
+    std::map<GlobalVariable *, Instruction *> LDSToIndirectBasePtrInst =
+        KernelToIndirectBasePtrInst[Caller];
+    NewArg = LDSToIndirectBasePtrInst[LDS];
+  } else
+    NewArg = Caller->getArg(Caller->arg_size() - 1);
+
+  for (auto AI = CI->arg_begin(), AE = CI->arg_end(); AI != AE; ++AI)
+    NewArgs.push_back(*AI);
+  NewArgs.push_back(NewArg);
+}
+
+static bool
+isNewClonedFunction(Function *F,
+                    ValueMap<Function *, Function *> &OldCallieToNewCallie) {
+  for (auto OI = OldCallieToNewCallie.begin(), OE = OldCallieToNewCallie.end();
+       OI != OE; ++OI) {
+    auto *NF = OI->second;
+    if (F == NF)
+      return true;
+  }
+  return false;
+}
+
+static void
+updateCallSites(GlobalVariable *LDS, Function *OldCallie, Function *NewCallie,
+                std::map<Function *, std::map<GlobalVariable *, Instruction *>>
+                    &KernelToIndirectBasePtrInst,
+                ValueMap<Function *, Function *> &OldCallieToNewCallie) {
+  // Update the call sites of the current callie.
+  for (auto *U : OldCallie->users()) {
+    // Get the call instruction.
+    auto *CI = dyn_cast<CallInst>(U);
+#ifndef NDEBUG
+    assert(CI && "Valid call instruction expected");
+#endif
+
+    // We are only interested in the call sites within kernels or within new
+    // cloned functions.
+    auto *Caller = CI->getParent()->getParent();
+    if (Caller->getCallingConv() != CallingConv::AMDGPU_KERNEL &&
+        !isNewClonedFunction(Caller, OldCallieToNewCallie))
+      continue;
+
+    // Get the new argument list which can be used to create the new call
+    // instruction.
+    SmallVector<Value *, 8> NewArgs;
+    getNewArgumentList(LDS, KernelToIndirectBasePtrInst, Caller, CI, NewArgs);
+
+    // Insert the new call instruction `NewCI` just before the existing call
+    // instruction `CI`.
+    auto *NewCI = CallInst::Create(NewCallie->getFunctionType(), NewCallie,
+                                   NewArgs, Twine(""), CI);
+    // TODO: Why is copyMetadata() not copying the metadata? I see metadata
+    // associated with CI, but it is not copied to NewCI. CI->hasMetadata() is
+    // false, why?
+    NewCI->copyMetadata(*CI);
+    NewCI->setTailCall(CI->isTailCall());
+    NewCI->setCallingConv(CI->getCallingConv());
+
+    // Now that the new call instruction is in place, delete the old one.
+    CI->replaceAllUsesWith(NewCI);
+    CI->eraseFromParent();
+  }
+}
+
+static void
+updateCallSites(GlobalVariable *LDS,
+                std::map<Function *, std::map<GlobalVariable *, Instruction *>>
+                    &KernelToIndirectBasePtrInst,
+                ValueMap<Function *, Function *> &OldCallieToNewCallie) {
+  // Update the call sites of all the callies.
+  for (auto OI = OldCallieToNewCallie.begin(), OE = OldCallieToNewCallie.end();
+       OI != OE; ++OI) {
+    auto *OldCallie = OI->first;
+    auto *NewCallie = OI->second;
+    updateCallSites(LDS, OldCallie, NewCallie, KernelToIndirectBasePtrInst,
+                    OldCallieToNewCallie);
+  }
+}
+
+static Function *cloneCallie(Module &M, Type *NewParamType, Function *Callie) {
+  // Create a new function type by adding `NewParamType` to the end of the
+  // existing parameter list.
+  SmallVector<Type *, 8> NewParams;
+  auto *FnTy = Callie->getFunctionType();
+  for (auto PI = FnTy->param_begin(), PE = FnTy->param_end(); PI != PE; ++PI)
+    NewParams.push_back(*PI);
+  NewParams.push_back(NewParamType);
+  auto *NewFnTy =
+      FunctionType::get(FnTy->getReturnType(), NewParams, FnTy->isVarArg());
+
+  // Create a copy of `Callie`, but with the new function type.
+  auto *NewCallie =
+      Function::Create(NewFnTy, Callie->getLinkage(), Callie->getAddressSpace(),
+                       Callie->getName() + Twine(".c"));
+
+  ValueToValueMapTy VMap;
+  auto *NewCallieArgIt = NewCallie->arg_begin();
+  for (auto &Arg : Callie->args()) {
+    auto ArgName = Arg.getName();
+    NewCallieArgIt->setName(ArgName);
+    VMap[&Arg] = &(*NewCallieArgIt++);
+  }
+
+  // TODO: Should ModuleLevelChanges be set to true or false?
+  SmallVector<ReturnInst *, 8> Returns;
+  CloneFunctionInto(NewCallie, Callie, VMap, /*ModuleLevelChanges=*/false,
+                    Returns);
+
+  // Copy all metadata.
+  SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
+  Callie->getAllMetadata(MDs);
+  for (auto MDIt : MDs)
+    NewCallie->addMetadata(MDIt.first, *MDIt.second);
+
+  // Insert `NewCallie` just before `Callie` within the module.
+  M.getFunctionList().insert(Callie->getIterator(), NewCallie);
+
+  // Swap names so that the new function retains the original name. Copy the
+  // name into a std::string first, since takeName() changes the storage that
+  // a StringRef would point into.
+  std::string NewName = NewCallie->getName().str();
+  NewCallie->takeName(Callie);
+  Callie->setName(NewName);
+
+  return NewCallie;
+}
+
+static void CollectCallGraphPathsBetweenKernelAndCallie(
+    Module &M, Function *K, Function *Callie,
+    SmallVectorImpl<SetVector<Function *>> &CGPaths) {
+  // Traverse the call graph associated with the kernel in DFS manner and
+  // collect all the paths from the kernel to the callie.
+  //
+  // TODO: Note that this algorithm will not work if there exist recursive
+  // calls, and the current assumption here is that the call graph is acyclic.
+  // We need to revisit this to handle call graphs which could contain cycles.
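+  //
+  // For example, for a kernel K with call chains K -> f1 -> f3 and
+  // K -> f2 -> f3, and with `Callie` being f3, the two collected paths are
+  // [K, f1, f3] and [K, f2, f3].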
+  auto CG = CallGraph(M);
+  auto *KernCGNode = CG[K];
+#ifndef NDEBUG
+  assert(KernCGNode && "Call graph node associated with kernel definition "
+                       "cannot be null\n");
+#endif
+
+  SmallVector<SetVector<CallGraphNode *>, 8> Stack;
+  SetVector<CallGraphNode *> Path;
+  Path.insert(KernCGNode);
+  Stack.push_back(Path);
+
+  while (!Stack.empty()) {
+    auto TopPath = Stack.pop_back_val();
+    auto *CGNode = TopPath.back();
+    auto *F = CGNode->getFunction();
+    if (F == Callie) {
+      SetVector<Function *> FPath;
+      for (auto *CGN : TopPath)
+        FPath.insert(CGN->getFunction());
+      CGPaths.push_back(FPath);
+      continue;
+    }
+
+    for (auto NI = CGNode->begin(), NE = CGNode->end(); NI != NE; ++NI) {
+      auto *CGN = NI->second;
+#ifndef NDEBUG
+      assert(CGN && "Call graph node associated with function definition "
+                    "cannot be null\n");
+#endif
+      SetVector<CallGraphNode *> ClonedPath(TopPath.begin(), TopPath.end());
+      ClonedPath.insert(CGN);
+      Stack.push_back(ClonedPath);
+    }
+  }
+}
+
+static void createCloneOfCalliesWithNewParameter(
+    Module &M, GlobalVariable *LDS, ValueMap<Function *, Function *> &KToC,
+    Type *BasePtrType,
+    ValueMap<Function *, Function *> &OldCallieToNewCallie) {
+  // Update the callies to accept the new parameter, which is of type
+  // `BasePtrType`, by creating clones of them. Here is a brief sketch of the
+  // functionality of this function:
+  //
+  // 1. Collect all the call graph paths between the kernels and the callies.
+  // 2. Traverse all the call graph paths from the kernels to the callies.
+  // 3. For each device function encountered while traversing, create a clone
+  //    of it by adding a new parameter of `BasePtr`'s type to its parameter
+  //    list, but retain the original device function for the moment.
+  SmallVector<SetVector<Function *>, 8> CGPaths;
+  for (auto KI = KToC.begin(), KE = KToC.end(); KI != KE; ++KI)
+    CollectCallGraphPathsBetweenKernelAndCallie(M, KI->first, KI->second,
+                                                CGPaths);
+
+  for (auto CGPath : CGPaths) {
+    // TODO: We could in fact assert that the length of `CGPath` is at least
+    // two, but we are okay for now.
+    if (CGPath.size() < 2)
+      continue;
+
+    // We are interested in cloning only the device functions in the call
+    // graph path, hence we start from the second node in the path.
+    auto PI = CGPath.begin() + 1;
+    auto PE = CGPath.end();
+    for (; PI != PE; ++PI) {
+      auto *CurCallie = *PI;
+
+      if (OldCallieToNewCallie.find(CurCallie) != OldCallieToNewCallie.end())
+        continue;
+
+      // `CurCallie` is encountered for the first time for the LDS in
+      // question; create a clone of it, and save it.
+      auto *NewCallie = cloneCallie(M, BasePtrType, CurCallie);
+      OldCallieToNewCallie[CurCallie] = NewCallie;
+    }
+  }
+}
+
+static Type *getBasePtrAccessInstType(
+    GlobalVariable *LDS,
+    std::map<Function *, std::map<GlobalVariable *, Instruction *>>
+        &KernelToIndirectBasePtrInst) {
+  for (auto KI = KernelToIndirectBasePtrInst.begin(),
+            KE = KernelToIndirectBasePtrInst.end();
+       KI != KE; ++KI)
+    for (auto BPI = KI->second.begin(), BPE = KI->second.end(); BPI != BPE;
+         ++BPI)
+      if (LDS == BPI->first)
+        return BPI->second->getType();
+
+#ifndef NDEBUG
+  assert(false && "Control is not expected to reach this point");
+#endif
+  return nullptr;
+}
+
+static void
+getKernelToCallieMap(GlobalVariable *LDS,
+                     ValueMap<GlobalVariable *, Function *> &LDSToFunction,
+                     ValueMap<Function *, std::set<Function *>> &KernelToCallie,
+                     ValueMap<Function *, Function *> &KToC) {
+  // Pair up each kernel with the end-callie which defines the current LDS.
+  auto *Callie = LDSToFunction[LDS];
+  for (auto KI = KernelToCallie.begin(), KE = KernelToCallie.end(); KI != KE;
+       ++KI) {
+    auto *K = KI->first;
+    auto Callies = KI->second;
+    if (Callies.find(Callie) != Callies.end())
+      KToC[K] = Callie;
+  }
+}
+
+static bool handleIndirectLDSGlobals(
+    Module &M, ValueMap<GlobalVariable *, Function *> &LDSToFunction,
+    ValueMap<Function *, std::set<Function *>> &KernelToCallie,
+    ValueMap<Function *, SetVector<GlobalVariable *>> &KernelToIndirectLDS,
+    std::map<Function *, std::map<GlobalVariable *, Instruction *>>
+        &KernelToIndirectBasePtrInst) {
+  // Construct a grand list of all the indirect LDS globals.
+  SetVector<GlobalVariable *> GrandLDSList;
+  for (auto KI = KernelToIndirectLDS.begin(), KE = KernelToIndirectLDS.end();
+       KI != KE; ++KI)
+    for (auto *LDS : KI->second)
+      GrandLDSList.insert(LDS);
+
+  // Process each indirect LDS global one by one.
+  for (auto *LDS : GrandLDSList) {
+    // For the current indirect LDS, pair up kernels and the end-callie.
+    ValueMap<Function *, Function *> KToC;
+    getKernelToCallieMap(LDS, LDSToFunction, KernelToCallie, KToC);
+
+    // For the current indirect LDS, get the type of the base pointer access
+    // instruction.
+    auto *BasePtrType =
+        getBasePtrAccessInstType(LDS, KernelToIndirectBasePtrInst);
+
+    // Create clones of all those functions which exist within the call graph
+    // paths from the kernels to the end-callie, so that they accept the new
+    // argument associated with the current indirect LDS global.
+    ValueMap<Function *, Function *> OldCallieToNewCallie;
+    createCloneOfCalliesWithNewParameter(M, LDS, KToC, BasePtrType,
+                                         OldCallieToNewCallie);
+
+    // Update all the required data structures to point to the new cloned
+    // functions in place of their old counterparts.
+    updateLDSToFunctionMap(LDSToFunction, OldCallieToNewCallie);
+    updateKernelToCallieList(KernelToCallie, OldCallieToNewCallie);
+
+    // Update all the call sites of the old functions which were cloned, so
+    // that calls are made to the new cloned functions instead of the old
+    // ones.
+    updateCallSites(LDS, KernelToIndirectBasePtrInst, OldCallieToNewCallie);
+
+    // By now, all the old functions are dead with no references being made to
+    // them; erase them from the module.
+    eraseOldCallies(OldCallieToNewCallie);
+
+#ifndef NDEBUG
+    assert(OldCallieToNewCallie.empty() &&
+           "None of the old callies should be alive by now\n");
+#endif
+
+    // Finally, update the end-callie where the current LDS was originally
+    // defined, so that all the references to this LDS within this end-callie
+    // are appropriately replaced.
+    auto *F = LDSToFunction[LDS];
+    updateFunctionAssociatedWithLDS(M, LDS, F->getArg(F->arg_size() - 1), F);
+  }
+
+  return true;
+}
+
+static bool handleDirectLDSGlobals(
+    Module &M, std::map<Function *, std::map<GlobalVariable *, Instruction *>>
+                   &KernelToDirectBasePtrInst) {
+  // Go through the kernels one by one, and handle all the direct LDS globals
+  // defined within each of them.
+  for (auto KI = KernelToDirectBasePtrInst.begin(),
+            KE = KernelToDirectBasePtrInst.end();
+       KI != KE; ++KI) {
+    auto *K = KI->first;
+    auto DirectLDSToBasePtrInst = KI->second;
+
+    for (auto LI = DirectLDSToBasePtrInst.begin(),
+              LE = DirectLDSToBasePtrInst.end();
+         LI != LE; ++LI) {
+      auto *LDS = LI->first;
+      auto *BasePtr = LI->second;
+      updateFunctionAssociatedWithLDS(M, LDS, BasePtr, K);
+    }
+  }
+
+  return true;
+}
+
+static Type *getLDSBaseType(Type *LDSTy) {
+  if (auto *ArrTy = dyn_cast<ArrayType>(LDSTy))
+    return getLDSBaseType(ArrTy->getElementType());
+  return LDSTy;
+}
+
+static Instruction *insertBasePointerAccessInstructionsWithinKernel(
+    Module &M, Function *K, GlobalVariable *LDS, GlobalVariable *NewLDS,
+    uint64_t Offset) {
+  // Insert instructions as below at the beginning of the entry basic block of
+  // the kernel:
+  //
+  // 1. Insert a GEP instruction which accesses the address `NewLDS + Offset`;
+  //    say the result is `GEPInst`, which is of type `char*`.
+  // 2. Insert a type cast instruction which casts `GEPInst` from `char*` to
+  //    `basetype*`, where `basetype` is the base type of `LDS`; say the
+  //    result is `CastInst`.
+  // 3. Return `CastInst`.
+
+  // Suffix the names of the instructions with unique integer values.
+  static int Suffix = 0;
+  ++Suffix;
+
+  // Insert a GEP instruction which points to the `offset` corresponding to
+  // the current LDS.
+  //
+  // Get the first insertion point of the entry basic block.
+  auto BI = K->getEntryBlock().getFirstInsertionPt();
+#ifndef NDEBUG
+  assert(BI != K->getEntryBlock().end() &&
+         "Entry basic block of the kernel cannot be empty, otherwise control "
+         "would not reach this point\n");
+#endif
+  auto &EI = *BI;
+
+  // Prepare indices for the GEP instruction to be inserted.
+  Value *Indices[] = {Constant::getNullValue(Type::getInt32Ty(M.getContext())),
+                      Constant::getIntegerValue(
+                          Type::getInt64Ty(M.getContext()), APInt(64, Offset))};
+
+  // Insert the GEP instruction at the first insertion point of the entry
+  // basic block.
+  Instruction *GEPInst = GetElementPtrInst::CreateInBounds(
+      NewLDS->getValueType(), const_cast<GlobalVariable *>(NewLDS), Indices,
+      Twine("dssv.gep.") + Twine(Suffix), const_cast<Instruction *>(&EI));
+
+  // Insert a type-cast instruction just after the above inserted GEP
+  // instruction, which casts the GEP result from `char*` to `basetype*`,
+  // where `basetype` is the base type of the LDS global.
+  //
+  // TODO: At present we only handle the `Array` aggregate type.
+  auto *LDSBaseType = getLDSBaseType(LDS->getValueType());
+
+  // TODO: The only base types supported at present are integer types and
+  // floating-point types.
+  if (!LDSBaseType->isIntegerTy() && !LDSBaseType->isFloatingPointTy())
+    llvm_unreachable("Not Implemented.");
+
+  auto *ToBeCastedType = PointerType::get(LDSBaseType, AMDGPUAS::LOCAL_ADDRESS);
+  Instruction *CastInst = new BitCastInst(GEPInst, ToBeCastedType,
+                                          Twine("dssv.cast.") + Twine(Suffix),
+                                          const_cast<Instruction *>(&EI));
+
+  // Return the type-casted instruction.
+  return CastInst;
+}
+
+static GlobalVariable *
+createSingleContiguousLayout(Module &M, Function *K,
+                             uint64_t &TotalLDSSizeInBytes) {
+  // Insert a new LDS global which is nothing but a single contiguous shared
+  // memory layout representing all the LDS globals associated with the
+  // kernel, including those directly defined within the kernel and those
+  // indirectly defined within the callies.
+  //
+  // The size of this new contiguous LDS global layout is equal to the sum of
+  // the sizes of all the associated LDS globals.
+  // TODO: Is the name of this new LDS global fine, or does it need to be
+  // changed?
+  auto *NewLDSTy =
+      ArrayType::get(IntegerType::get(M.getContext(), 8), TotalLDSSizeInBytes);
+  auto *NewLDS = new GlobalVariable(
+      M, NewLDSTy, false, GlobalValue::InternalLinkage,
+      UndefValue::get(NewLDSTy),
+      Twine(K->getName()) + Twine(".Single.LDS.Layout"), nullptr,
+      GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);
+  NewLDS->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+  NewLDS->setAlignment(MaybeAlign(M.getDataLayout().getPreferredAlign(NewLDS)));
+
+  return NewLDS;
+}
+
+static void
+computeTotalLDSSizeInBytes(ValueMap<GlobalVariable *, uint64_t> &LDSToSize,
+                           SetVector<GlobalVariable *> &DirectLDSList,
+                           SetVector<GlobalVariable *> &IndirectLDSList,
+                           ValueMap<GlobalVariable *, uint64_t> &LDSToOffset,
+                           uint64_t &TotalLDSSizeInBytes) {
+  // For the current kernel, compute the total size of all its LDS globals,
+  // and also the offset of each of them within the new LDS global.
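+  //
+  // For example (a sketch): with one direct LDS global of type [256 x i32]
+  // (1024 bytes) and one indirect LDS global of type [256 x i8] (256 bytes),
+  // the direct global gets offset 0, the indirect one gets offset 1024, and
+  // TotalLDSSizeInBytes becomes 1280.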
+  TotalLDSSizeInBytes = 0;
+  for (auto *LDS : DirectLDSList) {
+    LDSToOffset[LDS] = TotalLDSSizeInBytes;
+    TotalLDSSizeInBytes += LDSToSize[LDS];
+  }
+  for (auto *LDS : IndirectLDSList) {
+    LDSToOffset[LDS] = TotalLDSSizeInBytes;
+    TotalLDSSizeInBytes += LDSToSize[LDS];
+  }
+}
+
+static bool prepareKernelsForHandlingLDSGlobals(
+    Module &M, SetVector<Function *> &Kernels,
+    ValueMap<Function *, SetVector<GlobalVariable *>> &KernelToDirectLDS,
+    ValueMap<Function *, SetVector<GlobalVariable *>> &KernelToIndirectLDS,
+    ValueMap<GlobalVariable *, uint64_t> &LDSToSize,
+    std::map<Function *, std::map<GlobalVariable *, Instruction *>>
+        &KernelToDirectBasePtrInst,
+    std::map<Function *, std::map<GlobalVariable *, Instruction *>>
+        &KernelToIndirectBasePtrInst) {
+  bool Change = false;
+
+  // For each LDS global, insert a base pointer access instruction within the
+  // associated kernel(s).
+  for (auto *K : Kernels) {
+    // Copy both the direct and indirect LDS lists for the current kernel.
+    SetVector<GlobalVariable *> DirectLDSList;
+    SetVector<GlobalVariable *> IndirectLDSList;
+    if (KernelToDirectLDS.find(K) != KernelToDirectLDS.end())
+      DirectLDSList = KernelToDirectLDS[K];
+    if (KernelToIndirectLDS.find(K) != KernelToIndirectLDS.end())
+      IndirectLDSList = KernelToIndirectLDS[K];
+
+    // If there are no indirect LDS globals to process, skip this kernel and
+    // go to the next one.
+    if (IndirectLDSList.empty())
+      continue;
+
+    // We are going to process indirect LDS globals for at least one kernel,
+    // and hence we are going to make module-level changes.
+    Change = true;
+
+    // Create a single contiguous LDS layout for the current kernel.
+    uint64_t TotalLDSSizeInBytes;
+    ValueMap<GlobalVariable *, uint64_t> LDSToOffset;
+    computeTotalLDSSizeInBytes(LDSToSize, DirectLDSList, IndirectLDSList,
+                               LDSToOffset, TotalLDSSizeInBytes);
+    auto *NewLDS = createSingleContiguousLayout(M, K, TotalLDSSizeInBytes);
+
+    // For each LDS global (both direct and indirect ones), insert base
+    // pointer access instructions within the kernel.
+    std::map<GlobalVariable *, Instruction *> DirectLDSToBasePtrInst;
+    std::map<GlobalVariable *, Instruction *> IndirectLDSToBasePtrInst;
+    for (auto *LDS : DirectLDSList)
+      DirectLDSToBasePtrInst[LDS] =
+          insertBasePointerAccessInstructionsWithinKernel(M, K, LDS, NewLDS,
+                                                          LDSToOffset[LDS]);
+    for (auto *LDS : IndirectLDSList)
+      IndirectLDSToBasePtrInst[LDS] =
+          insertBasePointerAccessInstructionsWithinKernel(M, K, LDS, NewLDS,
+                                                          LDSToOffset[LDS]);
+
+    KernelToDirectBasePtrInst[K] = DirectLDSToBasePtrInst;
+    KernelToIndirectBasePtrInst[K] = IndirectLDSToBasePtrInst;
+  }
+
+  return Change;
+}
+
+static bool handleDeviceScopeSharedVariables(
+    Module &M, SetVector<Function *> &Kernels,
+    SetVector<GlobalVariable *> &LDSGlobals,
+    ValueMap<GlobalVariable *, Function *> &LDSToFunction,
+    ValueMap<Function *, std::set<Function *>> &KernelToCallie,
+    ValueMap<Function *, SetVector<GlobalVariable *>> &KernelToDirectLDS,
+    ValueMap<Function *, SetVector<GlobalVariable *>> &KernelToIndirectLDS,
+    ValueMap<GlobalVariable *, uint64_t> &LDSToSize) {
+  bool Change = false;
+  std::map<Function *, std::map<GlobalVariable *, Instruction *>>
+      KernelToDirectBasePtrInst;
+  std::map<Function *, std::map<GlobalVariable *, Instruction *>>
+      KernelToIndirectBasePtrInst;
+
+  // 1. Create a single contiguous LDS global layout for each kernel.
+  // 2. Compute the base pointer offset for each LDS within the above single
+  //    contiguous LDS global layout, and insert it within the associated
+  //    kernel(s).
+  Change = prepareKernelsForHandlingLDSGlobals(
+      M, Kernels, KernelToDirectLDS, KernelToIndirectLDS, LDSToSize,
+      KernelToDirectBasePtrInst, KernelToIndirectBasePtrInst);
+
+  // None of the kernels have any LDS globals (direct and/or indirect ones)
+  // associated with them. Nothing to do; no changes are made to the module.
+  if (!Change)
+    return false;
+
+  // Handle all the indirect LDS globals defined within device functions.
+  if (!KernelToIndirectLDS.empty())
+    Change = handleIndirectLDSGlobals(M, LDSToFunction, KernelToCallie,
+                                      KernelToIndirectLDS,
+                                      KernelToIndirectBasePtrInst);
+
+  // Handle all the direct LDS globals defined within kernels.
+  if (!KernelToDirectLDS.empty())
+    Change = handleDirectLDSGlobals(M, KernelToDirectBasePtrInst);
+
+  // Now, finally, erase all the original LDS globals from the module.
+  for (auto *LDS : LDSGlobals)
+    LDS->eraseFromParent();
+
+  return Change;
+}
+
+static bool
+handleDeviceScopeSharedVariables(Module &M,
+                                 SetVector<GlobalVariable *> &LDSGlobals,
+                                 SetVector<Function *> &Kernels) {
+  // Pair up each LDS global with the enclosing function where the LDS global
+  // is defined.
+  ValueMap<GlobalVariable *, Function *> LDSToFunction;
+  for (auto *LDSGlobal : LDSGlobals)
+    pairUpLDSGlobalWithItsAssociatedFunction(LDSGlobal, LDSToFunction);
+
+  // Create the reverse map from enclosing function to LDS global list.
+  ValueMap<Function *, SetVector<GlobalVariable *>> FunctionToLDS;
+  createFunctionToLDSMap(LDSToFunction, FunctionToLDS);
+
+  // Pair up each kernel with the list of callies which define LDS globals.
+  ValueMap<Function *, std::set<Function *>> KernelToCallie;
+  for (auto *K : Kernels)
+    pairUpKernelWithCallieList(M, K, FunctionToLDS, KernelToCallie);
+
+  // Pair up the kernels with all the LDS globals: both direct LDS globals
+  // (those directly defined within the kernels), and indirect LDS globals
+  // (those indirectly defined within the callies).
+  ValueMap<Function *, SetVector<GlobalVariable *>> KernelToDirectLDS;
+  ValueMap<Function *, SetVector<GlobalVariable *>> KernelToIndirectLDS;
+  for (auto *K : Kernels)
+    pairUpKernelWithLDSList(K, KernelToCallie, FunctionToLDS, KernelToDirectLDS,
+                            KernelToIndirectLDS);
+
+  // Get the size of each LDS global in bytes.
+  ValueMap<GlobalVariable *, uint64_t> LDSToSize;
+  for (auto *LDSGlobal : LDSGlobals)
+    getLDSGlobalSizeInBytes(M, LDSGlobal, LDSToSize);
+
+  return handleDeviceScopeSharedVariables(M, Kernels, LDSGlobals, LDSToFunction,
+                                          KernelToCallie, KernelToDirectLDS,
+                                          KernelToIndirectLDS, LDSToSize);
+}
+
+static bool handleDeviceScopeSharedVariables(Module &M) {
+  // Collect all the (static) LDS globals defined within the current module.
+  SetVector<GlobalVariable *> LDSGlobals;
+  for (auto &GV : M.globals())
+    if (GV.getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
+        !GV.hasExternalLinkage())
+      LDSGlobals.insert(&GV);
+
+  if (LDSGlobals.empty()) {
+    LLVM_DEBUG(dbgs() << "No LDS globals defined in the module " << M.getName()
+                      << ", skipping handling of device scope shared variables"
+                      << "\n");
+    return false;
+  }
+
+  // Collect all the amdgpu kernels defined within the current module.
+  SetVector<Function *> Kernels;
+  for (auto &F : M.functions())
+    if ((F.getCallingConv() == CallingConv::AMDGPU_KERNEL) &&
+        !F.isDeclaration())
+      Kernels.insert(&F);
+
+  if (Kernels.empty()) {
+    LLVM_DEBUG(dbgs() << "No kernels defined in the module " << M.getName()
+                      << ", skipping handling of device scope shared variables"
+                      << "\n");
+    return false;
+  }
+
+  return handleDeviceScopeSharedVariables(M, LDSGlobals, Kernels);
+}
+
+bool AMDGPUDeviceScopeSharedVariable::runOnModule(Module &M) {
+  LLVM_DEBUG(dbgs() << "===== Handling device scope shared variables in the "
+                       "module "
+                    << M.getName() << "\n");
+
+  // TODO: We only want to handle HIP kernels, and not kernels from other
+  // programming languages, like OpenCL, OpenMP, etc. Do we need to add a
+  // condition here for that, and skip running the pass for non-HIP kernels?
+  if (skipModule(M)) {
+    LLVM_DEBUG(dbgs() << "Skipping handling of device scope shared variables "
+                         "in the module "
+                      << M.getName() << "\n");
+    return false;
+  }
+
+  bool Changed = handleDeviceScopeSharedVariables(M);
+
+  LLVM_DEBUG(dbgs() << "===== Done with handling device scope shared variables "
+                       "in the module "
+                    << M.getName() << "\n");
+
+  return Changed;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -40,6 +40,7 @@
   static bool EnableLateStructurizeCFG;
   static bool EnableFunctionCalls;
   static bool EnableFixedFunctionABI;
+  static bool EnableDeviceScopeSharedVariable;
 
   AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
                       StringRef FS, TargetOptions Options,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -198,6 +198,12 @@
     cl::desc("Enable workarounds for the StructurizeCFG pass"),
     cl::init(true), cl::Hidden);
 
+static cl::opt<bool, true> EnableDeviceScopeSharedVariable(
+    "amdgpu-enable-device-scope-shared-variable",
+    cl::desc("Support amdgpu device scope shared variables"),
+    cl::location(AMDGPUTargetMachine::EnableDeviceScopeSharedVariable),
+    cl::init(false), cl::Hidden);
+
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   // Register the target
   RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -265,6 +271,7 @@
   initializeGCNRegBankReassignPass(*PR);
   initializeGCNNSAReassignPass(*PR);
   initializeSIAddIMGInitPass(*PR);
+  initializeAMDGPUDeviceScopeSharedVariablePass(*PR);
 }
 
 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -394,6 +401,7 @@
 bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
 bool AMDGPUTargetMachine::EnableFunctionCalls = false;
 bool AMDGPUTargetMachine::EnableFixedFunctionABI = false;
+bool AMDGPUTargetMachine::EnableDeviceScopeSharedVariable = false;
 
 AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
 
@@ -699,6 +707,12 @@
   disablePass(&FuncletLayoutID);
   disablePass(&PatchableFunctionID);
 
+  // We expect to run this pass as the first AMDGPU IR pass so that the new
+  // instructions added by this pass can undergo further transformations via
+  // subsequent passes.
+ if (EnableDeviceScopeSharedVariable) + addPass(createAMDGPUDeviceScopeSharedVariablePass()); + addPass(createAMDGPUPrintfRuntimeBinding()); // This must occur before inlining, as the inliner will not look through diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -50,6 +50,7 @@ AMDGPUAtomicOptimizer.cpp AMDGPUCallLowering.cpp AMDGPUCodeGenPrepare.cpp + AMDGPUDeviceScopeSharedVariable.cpp AMDGPUExportClustering.cpp AMDGPUFixFunctionBitcasts.cpp AMDGPUFrameLowering.cpp diff --git a/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-deep-function-calls.ll b/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-deep-function-calls.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-deep-function-calls.ll @@ -0,0 +1,163 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -implement-amdgpu-device-scope-shared-variable -S < %s | FileCheck -check-prefixes=NEW-LDS,OLD-LDS,NEW-PARAM,GCN %s + +; OLD-LDS-NOT: @_ZZ22function_four_with_ldsPiS_E13smem_dev_four +; OLD-LDS-NOT: @_ZZ19kernel_two_with_ldsPiS_E13smem_kern_two +; OLD-LDS-NOT: @_ZZ19kernel_one_with_ldsPiS_E13smem_kern_one +; NEW-LDS: @_Z19kernel_two_with_ldsPiS_.Single.LDS.Layout +; NEW-LDS: @_Z19kernel_one_with_ldsPiS_.Single.LDS.Layout +; OLD-LDS-NOT: @_ZZ22function_four_with_ldsPiS_E13smem_dev_four +; OLD-LDS-NOT: @_ZZ19kernel_two_with_ldsPiS_E13smem_kern_two +; OLD-LDS-NOT: @_ZZ19kernel_one_with_ldsPiS_E13smem_kern_one +@_ZZ22function_four_with_ldsPiS_E13smem_dev_four = internal unnamed_addr addrspace(3) global [256 x i32] undef, align 16 +@_ZZ19kernel_two_with_ldsPiS_E13smem_kern_two = internal unnamed_addr addrspace(3) global [256 x i32] undef, align 16 +@_ZZ19kernel_one_with_ldsPiS_E13smem_kern_one = internal unnamed_addr addrspace(3) global [256 x i32] undef, align 16 + +; NEW-PARAM: i32 addrspace(3)* %0 +define internal fastcc void @_Z22function_four_with_ldsPiS_(i32* nocapture readonly %i_arg, i32* nocapture %o_arg) unnamed_addr { +; GCN-LABEL: entry: +; GCN: %1 = tail call i32 @llvm.amdgcn.workitem.id.x() +; GCN-NEXT: %idxprom = zext i32 %1 to i64 +; GCN-NEXT: %arrayidx = getelementptr inbounds i32, i32* %i_arg, i64 %idxprom +; GCN-NEXT: %2 = load i32, i32* %arrayidx, align 4 +; GCN-NEXT: %.ptr.arith.2 = getelementptr inbounds i32, i32 addrspace(3)* %0, i32 %1 +; GCN-NEXT: store i32 %2, i32 addrspace(3)* %.ptr.arith.2, align 4 +; GCN-NEXT: %add = add nuw nsw i32 %1, 1 +; GCN-NEXT: %.ptr.arith.1 = getelementptr inbounds i32, i32 addrspace(3)* %0, i32 %add +; GCN-NEXT: %3 = load i32, i32 addrspace(3)* %.ptr.arith.1, align 4 +; GCN-NEXT: %mul = mul nsw i32 %3, %2 +; GCN-NEXT: %arrayidx12 = getelementptr inbounds i32, i32* %o_arg, i64 %idxprom +; GCN-NEXT: store i32 %mul, i32* %arrayidx12, align 4 +; GCN-NEXT: ret void +entry: + %0 = tail call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = zext i32 %0 to i64 + %arrayidx = getelementptr inbounds i32, i32* %i_arg, i64 %idxprom + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx31 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ22function_four_with_ldsPiS_E13smem_dev_four, i32 0, i32 %0 + store i32 %1, i32 addrspace(3)* %arrayidx31, align 4 + %add = add nuw nsw i32 %0, 1 + %arrayidx93 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ22function_four_with_ldsPiS_E13smem_dev_four, i32 0, i32 %add + %2 = load i32, i32 addrspace(3)* %arrayidx93, align 4 + %mul = mul nsw i32 %2, %1 + %arrayidx12 = 
getelementptr inbounds i32, i32* %o_arg, i64 %idxprom + store i32 %mul, i32* %arrayidx12, align 4 + ret void +} + +; NEW-PARAM: i32 addrspace(3)* %0 +define internal fastcc void @_Z26function_three_with_no_ldsPiS_(i32* nocapture readonly %i_arg, i32* nocapture %o_arg) unnamed_addr { +; GCN-LABEL: entry: +; GCN: tail call fastcc void @_Z22function_four_with_ldsPiS_(i32* %i_arg, i32* %o_arg, i32 addrspace(3)* %0) +; GCN-NEXT: ret void +entry: + tail call fastcc void @_Z22function_four_with_ldsPiS_(i32* %i_arg, i32* %o_arg) + ret void +} + +; NEW-PARAM: i32 addrspace(3)* %0 +define internal fastcc void @_Z24function_two_with_no_ldsPiS_(i32* nocapture readonly %i_arg, i32* nocapture %o_arg) unnamed_addr { +; GCN-LABEL: entry: +; GCN: tail call fastcc void @_Z26function_three_with_no_ldsPiS_(i32* %i_arg, i32* %o_arg, i32 addrspace(3)* %0) +; GCN-NEXT: ret void +entry: + tail call fastcc void @_Z26function_three_with_no_ldsPiS_(i32* %i_arg, i32* %o_arg) + ret void +} + +; NEW-PARAM: i32 addrspace(3)* %0 +define internal fastcc void @_Z24function_one_with_no_ldsPiS_(i32* nocapture readonly %i_arg, i32* nocapture %o_arg) unnamed_addr { +; GCN-LABEL: entry: +; GCN: tail call fastcc void @_Z26function_three_with_no_ldsPiS_(i32* %i_arg, i32* %o_arg, i32 addrspace(3)* %0) +; GCN-NEXT: ret void +entry: + tail call fastcc void @_Z26function_three_with_no_ldsPiS_(i32* %i_arg, i32* %o_arg) + ret void +} + +define protected amdgpu_kernel void @_Z19kernel_two_with_ldsPiS_(i32 addrspace(1)* nocapture readonly %i_arg.coerce, i32 addrspace(1)* nocapture %o_arg.coerce) local_unnamed_addr { +; GCN-LABEL: entry: +; GCN: %dssv.gep.2 = getelementptr inbounds [2048 x i8], [2048 x i8] addrspace(3)* @_Z19kernel_two_with_ldsPiS_.Single.LDS.Layout, i32 0, i64 1024 +; GCN-NEXT: %dssv.cast.2 = bitcast i8 addrspace(3)* %dssv.gep.2 to i32 addrspace(3)* +; GCN-NEXT: %dssv.gep.1 = getelementptr inbounds [2048 x i8], [2048 x i8] addrspace(3)* @_Z19kernel_two_with_ldsPiS_.Single.LDS.Layout, i32 0, i64 0 +; GCN-NEXT: %dssv.cast.1 = bitcast i8 addrspace(3)* %dssv.gep.1 to i32 addrspace(3)* +; GCN-NEXT: %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32* +; GCN-NEXT: %1 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32* +; GCN-NEXT: %2 = tail call i32 @llvm.amdgcn.workitem.id.x() +; GCN-NEXT: %idxprom = zext i32 %2 to i64 +; GCN-NEXT: %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %i_arg.coerce, i64 %idxprom +; GCN-NEXT: %3 = load i32, i32 addrspace(1)* %arrayidx6, align 4 +; GCN-NEXT: %dssv.cast.1.ptr.arith.4 = getelementptr inbounds i32, i32 addrspace(3)* %dssv.cast.1, i32 %2 +; GCN-NEXT: store i32 %3, i32 addrspace(3)* %dssv.cast.1.ptr.arith.4, align 4 +; GCN-NEXT: %add = add nuw nsw i32 %2, 1 +; GCN-NEXT: %dssv.cast.1.ptr.arith.3 = getelementptr inbounds i32, i32 addrspace(3)* %dssv.cast.1, i32 %add +; GCN-NEXT: %4 = load i32, i32 addrspace(3)* %dssv.cast.1.ptr.arith.3, align 4 +; GCN-NEXT: %mul = mul nsw i32 %4, %3 +; GCN-NEXT: %arrayidx1410 = getelementptr inbounds i32, i32 addrspace(1)* %o_arg.coerce, i64 %idxprom +; GCN-NEXT: store i32 %mul, i32 addrspace(1)* %arrayidx1410, align 4 +; GCN-NEXT: tail call fastcc void @_Z24function_one_with_no_ldsPiS_(i32* %0, i32* %1, i32 addrspace(3)* %dssv.cast.2) +; GCN-NEXT: tail call fastcc void @_Z24function_two_with_no_ldsPiS_(i32* %0, i32* %1, i32 addrspace(3)* %dssv.cast.2) +; GCN-NEXT: ret void +entry: + %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32* + %1 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32* + %2 = tail call i32 
@llvm.amdgcn.workitem.id.x() + %idxprom = zext i32 %2 to i64 + %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %i_arg.coerce, i64 %idxprom + %3 = load i32, i32 addrspace(1)* %arrayidx6, align 4 + %arrayidx57 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ19kernel_two_with_ldsPiS_E13smem_kern_two, i32 0, i32 %2 + store i32 %3, i32 addrspace(3)* %arrayidx57, align 4 + %add = add nuw nsw i32 %2, 1 + %arrayidx119 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ19kernel_two_with_ldsPiS_E13smem_kern_two, i32 0, i32 %add + %4 = load i32, i32 addrspace(3)* %arrayidx119, align 4 + %mul = mul nsw i32 %4, %3 + %arrayidx1410 = getelementptr inbounds i32, i32 addrspace(1)* %o_arg.coerce, i64 %idxprom + store i32 %mul, i32 addrspace(1)* %arrayidx1410, align 4 + tail call fastcc void @_Z24function_one_with_no_ldsPiS_(i32* %0, i32* %1) + tail call fastcc void @_Z24function_two_with_no_ldsPiS_(i32* %0, i32* %1) + ret void +} + +define protected amdgpu_kernel void @_Z19kernel_one_with_ldsPiS_(i32 addrspace(1)* nocapture readonly %i_arg.coerce, i32 addrspace(1)* nocapture %o_arg.coerce) local_unnamed_addr { +; GCN-LABEL: entry: +; GCN: %dssv.gep.4 = getelementptr inbounds [2048 x i8], [2048 x i8] addrspace(3)* @_Z19kernel_one_with_ldsPiS_.Single.LDS.Layout, i32 0, i64 1024 +; GCN-NEXT: %dssv.cast.4 = bitcast i8 addrspace(3)* %dssv.gep.4 to i32 addrspace(3)* +; GCN-NEXT: %dssv.gep.3 = getelementptr inbounds [2048 x i8], [2048 x i8] addrspace(3)* @_Z19kernel_one_with_ldsPiS_.Single.LDS.Layout, i32 0, i64 0 +; GCN-NEXT: %dssv.cast.3 = bitcast i8 addrspace(3)* %dssv.gep.3 to i32 addrspace(3)* +; GCN-NEXT: %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32* +; GCN-NEXT: %1 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32* +; GCN-NEXT: %2 = tail call i32 @llvm.amdgcn.workitem.id.x() +; GCN-NEXT: %idxprom = zext i32 %2 to i64 +; GCN-NEXT: %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %i_arg.coerce, i64 %idxprom +; GCN-NEXT: %3 = load i32, i32 addrspace(1)* %arrayidx6, align 4 +; GCN-NEXT: %dssv.cast.3.ptr.arith.6 = getelementptr inbounds i32, i32 addrspace(3)* %dssv.cast.3, i32 %2 +; GCN-NEXT: store i32 %3, i32 addrspace(3)* %dssv.cast.3.ptr.arith.6, align 4 +; GCN-NEXT: %add = add nuw nsw i32 %2, 1 +; GCN-NEXT: %dssv.cast.3.ptr.arith.5 = getelementptr inbounds i32, i32 addrspace(3)* %dssv.cast.3, i32 %add +; GCN-NEXT: %4 = load i32, i32 addrspace(3)* %dssv.cast.3.ptr.arith.5, align 4 +; GCN-NEXT: %mul = mul nsw i32 %4, %3 +; GCN-NEXT: %arrayidx1410 = getelementptr inbounds i32, i32 addrspace(1)* %o_arg.coerce, i64 %idxprom +; GCN-NEXT: store i32 %mul, i32 addrspace(1)* %arrayidx1410, align 4 +; GCN-NEXT: tail call fastcc void @_Z24function_one_with_no_ldsPiS_(i32* %0, i32* %1, i32 addrspace(3)* %dssv.cast.4) +; GCN-NEXT: tail call fastcc void @_Z24function_two_with_no_ldsPiS_(i32* %0, i32* %1, i32 addrspace(3)* %dssv.cast.4) +; GCN-NEXT: ret void +entry: + %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32* + %1 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32* + %2 = tail call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = zext i32 %2 to i64 + %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %i_arg.coerce, i64 %idxprom + %3 = load i32, i32 addrspace(1)* %arrayidx6, align 4 + %arrayidx57 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ19kernel_one_with_ldsPiS_E13smem_kern_one, i32 0, i32 %2 + store i32 %3, i32 addrspace(3)* %arrayidx57, align 4 + %add = add nuw nsw i32 %2, 1 + %arrayidx119 = 
getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ19kernel_one_with_ldsPiS_E13smem_kern_one, i32 0, i32 %add + %4 = load i32, i32 addrspace(3)* %arrayidx119, align 4 + %mul = mul nsw i32 %4, %3 + %arrayidx1410 = getelementptr inbounds i32, i32 addrspace(1)* %o_arg.coerce, i64 %idxprom + store i32 %mul, i32 addrspace(1)* %arrayidx1410, align 4 + tail call fastcc void @_Z24function_one_with_no_ldsPiS_(i32* %0, i32* %1) + tail call fastcc void @_Z24function_two_with_no_ldsPiS_(i32* %0, i32* %1) + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-lds-2d-array.ll b/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-lds-2d-array.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-lds-2d-array.ll @@ -0,0 +1,59 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -implement-amdgpu-device-scope-shared-variable -S < %s | FileCheck -check-prefixes=NEW-LDS,OLD-LDS,NEW-PARAM,GCN %s + +; OLD-LDS-NOT: @_ZZ17function_with_ldsPiS_E4smem +; NEW-LDS: @_Z6kernelPiS_.Single.LDS.Layout +; OLD-LDS-NOT: @_ZZ17function_with_ldsPiS_E4smem +@_ZZ17function_with_ldsPiS_E4smem = internal unnamed_addr addrspace(3) global [256 x [4 x i32]] undef, align 16 + +; NEW-PARAM: i32 addrspace(3)* %0 +define internal fastcc void @_Z17function_with_ldsPiS_(i32* nocapture readonly %i_arg, i32* nocapture %o_arg) unnamed_addr { +; GCN-LABEL: entry: +; GCN: %1 = tail call i32 @llvm.amdgcn.workitem.id.x() +; GCN-NEXT: %idxprom = zext i32 %1 to i64 +; GCN-NEXT: %arrayidx = getelementptr inbounds i32, i32* %i_arg, i64 %idxprom +; GCN-NEXT: %2 = load i32, i32* %arrayidx, align 4 +; GCN-NEXT: %3 = mul i32 %1, 4 +; GCN-NEXT: %4 = add i32 %3, 2 +; GCN-NEXT: %.ptr.arith.2 = getelementptr inbounds i32, i32 addrspace(3)* %0, i32 %4 +; GCN-NEXT: store i32 %2, i32 addrspace(3)* %.ptr.arith.2, align 8 +; GCN-NEXT: %add = add nuw nsw i32 %1, 1 +; GCN-NEXT: %5 = mul i32 %add, 4 +; GCN-NEXT: %6 = add i32 %5, 2 +; GCN-NEXT: %.ptr.arith.1 = getelementptr inbounds i32, i32 addrspace(3)* %0, i32 %6 +; GCN-NEXT: %7 = load i32, i32 addrspace(3)* %.ptr.arith.1, align 8 +; GCN-NEXT: %mul = mul nsw i32 %7, %2 +; GCN-NEXT: %arrayidx15 = getelementptr inbounds i32, i32* %o_arg, i64 %idxprom +; GCN-NEXT: store i32 %mul, i32* %arrayidx15, align 4 +; GCN-NEXT: ret void +entry: + %0 = tail call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = zext i32 %0 to i64 + %arrayidx = getelementptr inbounds i32, i32* %i_arg, i64 %idxprom + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx42 = getelementptr inbounds [256 x [4 x i32]], [256 x [4 x i32]] addrspace(3)* @_ZZ17function_with_ldsPiS_E4smem, i32 0, i32 %0, i32 2 + store i32 %1, i32 addrspace(3)* %arrayidx42, align 8 + %add = add nuw nsw i32 %0, 1 + %arrayidx126 = getelementptr inbounds [256 x [4 x i32]], [256 x [4 x i32]] addrspace(3)* @_ZZ17function_with_ldsPiS_E4smem, i32 0, i32 %add, i32 2 + %2 = load i32, i32 addrspace(3)* %arrayidx126, align 8 + %mul = mul nsw i32 %2, %1 + %arrayidx15 = getelementptr inbounds i32, i32* %o_arg, i64 %idxprom + store i32 %mul, i32* %arrayidx15, align 4 + ret void +} + +define protected amdgpu_kernel void @_Z6kernelPiS_(i32 addrspace(1)* nocapture readonly %i_arg.coerce, i32 addrspace(1)* nocapture %o_arg.coerce) local_unnamed_addr { +; GCN-LABEL: entry: +; GCN: %dssv.gep.1 = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(3)* @_Z6kernelPiS_.Single.LDS.Layout, i32 0, i64 0 +; GCN-NEXT: %dssv.cast.1 = bitcast i8 addrspace(3)* %dssv.gep.1 to i32 addrspace(3)* 
+; GCN-NEXT: %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32*
+; GCN-NEXT: %1 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32*
+; GCN-NEXT: tail call fastcc void @_Z17function_with_ldsPiS_(i32* %0, i32* %1, i32 addrspace(3)* %dssv.cast.1)
+; GCN-NEXT: ret void
+entry:
+ %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32*
+ %1 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32*
+ tail call fastcc void @_Z17function_with_ldsPiS_(i32* %0, i32* %1)
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-lds-with-different-data-types.ll b/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-lds-with-different-data-types.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-lds-with-different-data-types.ll
@@ -0,0 +1,111 @@
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -implement-amdgpu-device-scope-shared-variable -S < %s | FileCheck -check-prefixes=NEW-LDS,OLD-LDS,NEW-PARAM,GCN %s
+
+; OLD-LDS-NOT: @_ZZ17function_with_ldsPcS_PiS0_PfS1_E6c_smem
+; OLD-LDS-NOT: @_ZZ17function_with_ldsPcS_PiS0_PfS1_E6i_smem
+; OLD-LDS-NOT: @_ZZ17function_with_ldsPcS_PiS0_PfS1_E6f_smem
+; NEW-LDS: @_Z6kernelPcS_PiS0_PfS1_.Single.LDS.Layout
+; OLD-LDS-NOT: @_ZZ17function_with_ldsPcS_PiS0_PfS1_E6c_smem
+; OLD-LDS-NOT: @_ZZ17function_with_ldsPcS_PiS0_PfS1_E6i_smem
+; OLD-LDS-NOT: @_ZZ17function_with_ldsPcS_PiS0_PfS1_E6f_smem
+@_ZZ17function_with_ldsPcS_PiS0_PfS1_E6c_smem = internal unnamed_addr addrspace(3) global [256 x i8] undef, align 16
+@_ZZ17function_with_ldsPcS_PiS0_PfS1_E6i_smem = internal unnamed_addr addrspace(3) global [256 x i32] undef, align 16
+@_ZZ17function_with_ldsPcS_PiS0_PfS1_E6f_smem = internal unnamed_addr addrspace(3) global [256 x float] undef, align 16
+
+; NEW-PARAM: float addrspace(3)*
+; NEW-PARAM: i32 addrspace(3)*
+; NEW-PARAM: i8 addrspace(3)*
+define internal fastcc void @_Z17function_with_ldsPcS_PiS0_PfS1_(i8* nocapture readonly %ci_arg, i8* nocapture %co_arg, i32* nocapture readonly %ii_arg, i32* nocapture %io_arg, float* nocapture readonly %fi_arg, float* nocapture %fo_arg) unnamed_addr {
+; GCN-LABEL: entry:
+; GCN: %3 = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT: %idxprom = zext i32 %3 to i64
+; GCN-NEXT: %arrayidx = getelementptr inbounds i8, i8* %ci_arg, i64 %idxprom
+; GCN-NEXT: %4 = load i8, i8* %arrayidx, align 1
+; GCN-NEXT: %.ptr.arith.6 = getelementptr inbounds i8, i8 addrspace(3)* %2, i32 %3
+; GCN-NEXT: store i8 %4, i8 addrspace(3)* %.ptr.arith.6, align 1
+; GCN-NEXT: %add = add nuw nsw i32 %3, 1
+; GCN-NEXT: %.ptr.arith.5 = getelementptr inbounds i8, i8 addrspace(3)* %2, i32 %add
+; GCN-NEXT: %5 = load i8, i8 addrspace(3)* %.ptr.arith.5, align 1
+; GCN-NEXT: %mul = mul i8 %5, %4
+; GCN-NEXT: %arrayidx14 = getelementptr inbounds i8, i8* %co_arg, i64 %idxprom
+; GCN-NEXT: store i8 %mul, i8* %arrayidx14, align 1
+; GCN-NEXT: %arrayidx17 = getelementptr inbounds i32, i32* %ii_arg, i64 %idxprom
+; GCN-NEXT: %6 = load i32, i32* %arrayidx17, align 4
+; GCN-NEXT: %.ptr.arith.4 = getelementptr inbounds i32, i32 addrspace(3)* %1, i32 %3
+; GCN-NEXT: store i32 %6, i32 addrspace(3)* %.ptr.arith.4, align 4
+; GCN-NEXT: %.ptr.arith.3 = getelementptr inbounds i32, i32 addrspace(3)* %1, i32 %add
+; GCN-NEXT: %7 = load i32, i32 addrspace(3)* %.ptr.arith.3, align 4
+; GCN-NEXT: %mul28 = mul nsw i32 %7, %6
+; GCN-NEXT: %arrayidx31 = getelementptr inbounds i32, i32* %io_arg, i64 %idxprom
+; GCN-NEXT: store i32 %mul28, i32* %arrayidx31, align 4
+; GCN-NEXT: %arrayidx34 = getelementptr inbounds float, float* %fi_arg, i64 %idxprom
+; GCN-NEXT: %8 = load float, float* %arrayidx34, align 4
+; GCN-NEXT: %.ptr.arith.2 = getelementptr inbounds float, float addrspace(3)* %0, i32 %3
+; GCN-NEXT: store float %8, float addrspace(3)* %.ptr.arith.2, align 4
+; GCN-NEXT: %.ptr.arith.1 = getelementptr inbounds float, float addrspace(3)* %0, i32 %add
+; GCN-NEXT: %9 = load float, float addrspace(3)* %.ptr.arith.1, align 4
+; GCN-NEXT: %mul45 = fmul contract float %8, %9
+; GCN-NEXT: %arrayidx48 = getelementptr inbounds float, float* %fo_arg, i64 %idxprom
+; GCN-NEXT: store float %mul45, float* %arrayidx48, align 4
+; GCN-NEXT: ret void
+entry:
+ %0 = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %idxprom = zext i32 %0 to i64
+ %arrayidx = getelementptr inbounds i8, i8* %ci_arg, i64 %idxprom
+ %1 = load i8, i8* %arrayidx, align 1
+ %arrayidx32 = getelementptr inbounds [256 x i8], [256 x i8] addrspace(3)* @_ZZ17function_with_ldsPcS_PiS0_PfS1_E6c_smem, i32 0, i32 %0
+ store i8 %1, i8 addrspace(3)* %arrayidx32, align 1
+ %add = add nuw nsw i32 %0, 1
+ %arrayidx94 = getelementptr inbounds [256 x i8], [256 x i8] addrspace(3)* @_ZZ17function_with_ldsPcS_PiS0_PfS1_E6c_smem, i32 0, i32 %add
+ %2 = load i8, i8 addrspace(3)* %arrayidx94, align 1
+ %mul = mul i8 %2, %1
+ %arrayidx14 = getelementptr inbounds i8, i8* %co_arg, i64 %idxprom
+ store i8 %mul, i8* %arrayidx14, align 1
+ %arrayidx17 = getelementptr inbounds i32, i32* %ii_arg, i64 %idxprom
+ %3 = load i32, i32* %arrayidx17, align 4
+ %arrayidx205 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ17function_with_ldsPcS_PiS0_PfS1_E6i_smem, i32 0, i32 %0
+ store i32 %3, i32 addrspace(3)* %arrayidx205, align 4
+ %arrayidx277 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ17function_with_ldsPcS_PiS0_PfS1_E6i_smem, i32 0, i32 %add
+ %4 = load i32, i32 addrspace(3)* %arrayidx277, align 4
+ %mul28 = mul nsw i32 %4, %3
+ %arrayidx31 = getelementptr inbounds i32, i32* %io_arg, i64 %idxprom
+ store i32 %mul28, i32* %arrayidx31, align 4
+ %arrayidx34 = getelementptr inbounds float, float* %fi_arg, i64 %idxprom
+ %5 = load float, float* %arrayidx34, align 4
+ %arrayidx378 = getelementptr inbounds [256 x float], [256 x float] addrspace(3)* @_ZZ17function_with_ldsPcS_PiS0_PfS1_E6f_smem, i32 0, i32 %0
+ store float %5, float addrspace(3)* %arrayidx378, align 4
+ %arrayidx4410 = getelementptr inbounds [256 x float], [256 x float] addrspace(3)* @_ZZ17function_with_ldsPcS_PiS0_PfS1_E6f_smem, i32 0, i32 %add
+ %6 = load float, float addrspace(3)* %arrayidx4410, align 4
+ %mul45 = fmul contract float %5, %6
+ %arrayidx48 = getelementptr inbounds float, float* %fo_arg, i64 %idxprom
+ store float %mul45, float* %arrayidx48, align 4
+ ret void
+}
+
+define protected amdgpu_kernel void @_Z6kernelPcS_PiS0_PfS1_(i8 addrspace(1)* nocapture readonly %ci_arg.coerce, i8 addrspace(1)* nocapture %co_arg.coerce, i32 addrspace(1)* nocapture readonly %ii_arg.coerce, i32 addrspace(1)* nocapture %io_arg.coerce, float addrspace(1)* nocapture readonly %fi_arg.coerce, float addrspace(1)* nocapture %fo_arg.coerce) local_unnamed_addr {
+; GCN-LABEL: entry:
+; GCN: %dssv.gep.3 = getelementptr inbounds [2304 x i8], [2304 x i8] addrspace(3)* @_Z6kernelPcS_PiS0_PfS1_.Single.LDS.Layout, i32 0, i64 2048
+; GCN-NEXT: %dssv.cast.3 = bitcast i8 addrspace(3)* %dssv.gep.3 to i8 addrspace(3)*
+; GCN-NEXT: %dssv.gep.2 = getelementptr inbounds [2304 x i8], [2304 x i8] addrspace(3)* @_Z6kernelPcS_PiS0_PfS1_.Single.LDS.Layout, i32 0, i64 1024
+; GCN-NEXT: %dssv.cast.2 = bitcast i8 addrspace(3)* %dssv.gep.2 to i32 addrspace(3)*
+; GCN-NEXT: %dssv.gep.1 = getelementptr inbounds [2304 x i8], [2304 x i8] addrspace(3)* @_Z6kernelPcS_PiS0_PfS1_.Single.LDS.Layout, i32 0, i64 0
+; GCN-NEXT: %dssv.cast.1 = bitcast i8 addrspace(3)* %dssv.gep.1 to float addrspace(3)*
+; GCN-NEXT: %0 = addrspacecast i8 addrspace(1)* %ci_arg.coerce to i8*
+; GCN-NEXT: %1 = addrspacecast i8 addrspace(1)* %co_arg.coerce to i8*
+; GCN-NEXT: %2 = addrspacecast i32 addrspace(1)* %ii_arg.coerce to i32*
+; GCN-NEXT: %3 = addrspacecast i32 addrspace(1)* %io_arg.coerce to i32*
+; GCN-NEXT: %4 = addrspacecast float addrspace(1)* %fi_arg.coerce to float*
+; GCN-NEXT: %5 = addrspacecast float addrspace(1)* %fo_arg.coerce to float*
+; GCN-NEXT: tail call fastcc void @_Z17function_with_ldsPcS_PiS0_PfS1_(i8* %0, i8* %1, i32* %2, i32* %3, float* %4, float* %5, float addrspace(3)* %dssv.cast.1, i32 addrspace(3)* %dssv.cast.2, i8 addrspace(3)* %dssv.cast.3)
+; GCN-NEXT: ret void
+entry:
+ %0 = addrspacecast i8 addrspace(1)* %ci_arg.coerce to i8*
+ %1 = addrspacecast i8 addrspace(1)* %co_arg.coerce to i8*
+ %2 = addrspacecast i32 addrspace(1)* %ii_arg.coerce to i32*
+ %3 = addrspacecast i32 addrspace(1)* %io_arg.coerce to i32*
+ %4 = addrspacecast float addrspace(1)* %fi_arg.coerce to float*
+ %5 = addrspacecast float addrspace(1)* %fo_arg.coerce to float*
+ tail call fastcc void @_Z17function_with_ldsPcS_PiS0_PfS1_(i8* %0, i8* %1, i32* %2, i32* %3, float* %4, float* %5)
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-lds-within-function.ll b/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-lds-within-function.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-lds-within-function.ll
@@ -0,0 +1,56 @@
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -implement-amdgpu-device-scope-shared-variable -S < %s | FileCheck -check-prefixes=NEW-LDS,OLD-LDS,NEW-PARAM,GCN %s
+
+; OLD-LDS-NOT: @_ZZ17function_with_ldsPiS_E4smem
+; NEW-LDS: @_Z6kernelPiS_.Single.LDS.Layout
+; OLD-LDS-NOT: @_ZZ17function_with_ldsPiS_E4smem
+@_ZZ17function_with_ldsPiS_E4smem = internal unnamed_addr addrspace(3) global [256 x i32] undef, align 16
+
+; NEW-PARAM: i32 addrspace(3)* %0
+define internal fastcc void @_Z17function_with_ldsPiS_(i32* nocapture readonly %i_arg, i32* nocapture %o_arg) unnamed_addr {
+; GCN-LABEL: entry:
+; GCN: %1 = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT: %idxprom = zext i32 %1 to i64
+; GCN-NEXT: %arrayidx = getelementptr inbounds i32, i32* %i_arg, i64 %idxprom
+; GCN-NEXT: %2 = load i32, i32* %arrayidx, align 4
+; GCN-NEXT: %.ptr.arith.2 = getelementptr inbounds i32, i32 addrspace(3)* %0, i32 %1
+; GCN-NEXT: store i32 %2, i32 addrspace(3)* %.ptr.arith.2, align 4
+; GCN-NEXT: %add = add nuw nsw i32 %1, 1
+; GCN-NEXT: %.ptr.arith.1 = getelementptr inbounds i32, i32 addrspace(3)* %0, i32 %add
+; GCN-NEXT: %3 = load i32, i32 addrspace(3)* %.ptr.arith.1, align 4
+; GCN-NEXT: %mul = mul nsw i32 %3, %2
+; GCN-NEXT: %arrayidx12 = getelementptr inbounds i32, i32* %o_arg, i64 %idxprom
+; GCN-NEXT: store i32 %mul, i32* %arrayidx12, align 4
+; GCN-NEXT: ret void
+entry:
+ %0 = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %idxprom = zext i32 %0 to i64
+ %arrayidx = getelementptr inbounds i32, i32* %i_arg, i64 %idxprom
+ %1 = load i32, i32* %arrayidx, align 4
+ %arrayidx31 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ17function_with_ldsPiS_E4smem, i32 0, i32 %0
+ store i32 %1, i32 addrspace(3)* %arrayidx31, align 4
+ %add = add nuw nsw i32 %0, 1
+ %arrayidx93 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ17function_with_ldsPiS_E4smem, i32 0, i32 %add
+ %2 = load i32, i32 addrspace(3)* %arrayidx93, align 4
+ %mul = mul nsw i32 %2, %1
+ %arrayidx12 = getelementptr inbounds i32, i32* %o_arg, i64 %idxprom
+ store i32 %mul, i32* %arrayidx12, align 4
+ ret void
+}
+
+define protected amdgpu_kernel void @_Z6kernelPiS_(i32 addrspace(1)* nocapture readonly %i_arg.coerce, i32 addrspace(1)* nocapture %o_arg.coerce) local_unnamed_addr {
+; GCN-LABEL: entry:
+; GCN: %dssv.gep.1 = getelementptr inbounds [1024 x i8], [1024 x i8] addrspace(3)* @_Z6kernelPiS_.Single.LDS.Layout, i32 0, i64 0
+; GCN-NEXT: %dssv.cast.1 = bitcast i8 addrspace(3)* %dssv.gep.1 to i32 addrspace(3)*
+; GCN-NEXT: %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32*
+; GCN-NEXT: %1 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32*
+; GCN-NEXT: tail call fastcc void @_Z17function_with_ldsPiS_(i32* %0, i32* %1, i32 addrspace(3)* %dssv.cast.1)
+; GCN-NEXT: ret void
+entry:
+ %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32*
+ %1 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32*
+ tail call fastcc void @_Z17function_with_ldsPiS_(i32* %0, i32* %1)
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+
diff --git a/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-lds-within-kernel-and-function.ll b/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-lds-within-kernel-and-function.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-lds-within-kernel-and-function.ll
@@ -0,0 +1,84 @@
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -implement-amdgpu-device-scope-shared-variable -S < %s | FileCheck -check-prefixes=NEW-LDS,OLD-LDS,NEW-PARAM,GCN %s
+
+; OLD-LDS-NOT: @_ZZ20function_with_no_ldsPiS_E8smem_dev
+; OLD-LDS-NOT: @_ZZ15kernel_with_ldsPiS_E9smem_kern
+; NEW-LDS: @_Z15kernel_with_ldsPiS_.Single.LDS.Layout
+; OLD-LDS-NOT: @_ZZ20function_with_no_ldsPiS_E8smem_dev
+; OLD-LDS-NOT: @_ZZ15kernel_with_ldsPiS_E9smem_kern
+@_ZZ20function_with_no_ldsPiS_E8smem_dev = internal unnamed_addr addrspace(3) global [256 x i32] undef, align 16
+@_ZZ15kernel_with_ldsPiS_E9smem_kern = internal unnamed_addr addrspace(3) global [256 x i32] undef, align 16
+
+; NEW-PARAM: i32 addrspace(3)* %0
+define internal fastcc void @_Z20function_with_no_ldsPiS_(i32* nocapture readonly %i_arg, i32* nocapture %o_arg) unnamed_addr {
+; GCN-LABEL: entry:
+; GCN: %1 = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT: %idxprom = zext i32 %1 to i64
+; GCN-NEXT: %arrayidx = getelementptr inbounds i32, i32* %i_arg, i64 %idxprom
+; GCN-NEXT: %2 = load i32, i32* %arrayidx, align 4
+; GCN-NEXT: %.ptr.arith.2 = getelementptr inbounds i32, i32 addrspace(3)* %0, i32 %1
+; GCN-NEXT: store i32 %2, i32 addrspace(3)* %.ptr.arith.2, align 4
+; GCN-NEXT: %add = add nuw nsw i32 %1, 1
+; GCN-NEXT: %.ptr.arith.1 = getelementptr inbounds i32, i32 addrspace(3)* %0, i32 %add
+; GCN-NEXT: %3 = load i32, i32 addrspace(3)* %.ptr.arith.1, align 4
+; GCN-NEXT: %mul = mul nsw i32 %3, %2
+; GCN-NEXT: %arrayidx12 = getelementptr inbounds i32, i32* %o_arg, i64 %idxprom
+; GCN-NEXT: store i32 %mul, i32* %arrayidx12, align 4
+; GCN-NEXT: ret void
+entry:
+ %0 = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %idxprom = zext i32 %0 to i64
+ %arrayidx = getelementptr inbounds i32, i32* %i_arg, i64 %idxprom
+ %1 = load i32, i32* %arrayidx, align 4
+ %arrayidx31 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ20function_with_no_ldsPiS_E8smem_dev, i32 0, i32 %0
+ store i32 %1, i32 addrspace(3)* %arrayidx31, align 4
+ %add = add nuw nsw i32 %0, 1
+ %arrayidx93 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ20function_with_no_ldsPiS_E8smem_dev, i32 0, i32 %add
+ %2 = load i32, i32 addrspace(3)* %arrayidx93, align 4
+ %mul = mul nsw i32 %2, %1
+ %arrayidx12 = getelementptr inbounds i32, i32* %o_arg, i64 %idxprom
+ store i32 %mul, i32* %arrayidx12, align 4
+ ret void
+}
+
+define protected amdgpu_kernel void @_Z15kernel_with_ldsPiS_(i32 addrspace(1)* nocapture readonly %i_arg.coerce, i32 addrspace(1)* nocapture %o_arg.coerce) local_unnamed_addr {
+; GCN-LABEL: entry:
+; GCN: %dssv.gep.2 = getelementptr inbounds [2048 x i8], [2048 x i8] addrspace(3)* @_Z15kernel_with_ldsPiS_.Single.LDS.Layout, i32 0, i64 1024
+; GCN-NEXT: %dssv.cast.2 = bitcast i8 addrspace(3)* %dssv.gep.2 to i32 addrspace(3)*
+; GCN-NEXT: %dssv.gep.1 = getelementptr inbounds [2048 x i8], [2048 x i8] addrspace(3)* @_Z15kernel_with_ldsPiS_.Single.LDS.Layout, i32 0, i64 0
+; GCN-NEXT: %dssv.cast.1 = bitcast i8 addrspace(3)* %dssv.gep.1 to i32 addrspace(3)*
+; GCN-NEXT: %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32*
+; GCN-NEXT: %1 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32*
+; GCN-NEXT: %2 = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT: %idxprom = zext i32 %2 to i64
+; GCN-NEXT: %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %i_arg.coerce, i64 %idxprom
+; GCN-NEXT: %3 = load i32, i32 addrspace(1)* %arrayidx3, align 4
+; GCN-NEXT: %dssv.cast.1.ptr.arith.4 = getelementptr inbounds i32, i32 addrspace(3)* %dssv.cast.1, i32 %2
+; GCN-NEXT: store i32 %3, i32 addrspace(3)* %dssv.cast.1.ptr.arith.4, align 4
+; GCN-NEXT: %add = add nuw nsw i32 %2, 1
+; GCN-NEXT: %dssv.cast.1.ptr.arith.3 = getelementptr inbounds i32, i32 addrspace(3)* %dssv.cast.1, i32 %add
+; GCN-NEXT: %4 = load i32, i32 addrspace(3)* %dssv.cast.1.ptr.arith.3, align 4
+; GCN-NEXT: %mul = mul nsw i32 %4, %3
+; GCN-NEXT: %arrayidx147 = getelementptr inbounds i32, i32 addrspace(1)* %o_arg.coerce, i64 %idxprom
+; GCN-NEXT: store i32 %mul, i32 addrspace(1)* %arrayidx147, align 4
+; GCN-NEXT: tail call fastcc void @_Z20function_with_no_ldsPiS_(i32* %0, i32* %1, i32 addrspace(3)* %dssv.cast.2)
+; GCN-NEXT: ret void
+entry:
+ %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32*
+ %1 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32*
+ %2 = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %idxprom = zext i32 %2 to i64
+ %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %i_arg.coerce, i64 %idxprom
+ %3 = load i32, i32 addrspace(1)* %arrayidx3, align 4
+ %arrayidx54 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ15kernel_with_ldsPiS_E9smem_kern, i32 0, i32 %2
+ store i32 %3, i32 addrspace(3)* %arrayidx54, align 4
+ %add = add nuw nsw i32 %2, 1
+ %arrayidx116 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ15kernel_with_ldsPiS_E9smem_kern, i32 0, i32 %add
+ %4 = load i32, i32 addrspace(3)* %arrayidx116, align 4
+ %mul = mul nsw i32 %4, %3
+ %arrayidx147 = getelementptr inbounds i32, i32 addrspace(1)* %o_arg.coerce, i64 %idxprom
+ store i32 %mul, i32 addrspace(1)* %arrayidx147, align 4
+ tail call fastcc void @_Z20function_with_no_ldsPiS_(i32* %0, i32* %1)
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-lds-within-kernel.ll b/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-lds-within-kernel.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-lds-within-kernel.ll
@@ -0,0 +1,65 @@
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -implement-amdgpu-device-scope-shared-variable -S < %s | FileCheck -check-prefixes=NEW-LDS,OLD-LDS,NEW-PARAM,GCN %s
+
+; NEW-LDS-NOT: @_Z15kernel_with_ldsPiS_.Single.LDS.Layout
+; OLD-LDS: @_ZZ15kernel_with_ldsPiS_E4smem
+; NEW-LDS-NOT: @_Z15kernel_with_ldsPiS_.Single.LDS.Layout
+@_ZZ15kernel_with_ldsPiS_E4smem = internal unnamed_addr addrspace(3) global [256 x i32] undef, align 16
+
+; NEW-PARAM-NOT: i32 addrspace(3)* %0
+define internal fastcc void @_Z20function_with_no_ldsPiS_(i32* nocapture readonly %i_arg, i32* nocapture %o_arg) unnamed_addr {
+; GCN-LABEL: entry:
+; GCN: %0 = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT: %idxprom = zext i32 %0 to i64
+; GCN-NEXT: %arrayidx = getelementptr inbounds i32, i32* %i_arg, i64 %idxprom
+; GCN-NEXT: %1 = load i32, i32* %arrayidx, align 4
+; GCN-NEXT: %arrayidx3 = getelementptr inbounds i32, i32* %o_arg, i64 %idxprom
+; GCN-NEXT: store i32 %1, i32* %arrayidx3, align 4
+; GCN-NEXT: ret void
+entry:
+ %0 = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %idxprom = zext i32 %0 to i64
+ %arrayidx = getelementptr inbounds i32, i32* %i_arg, i64 %idxprom
+ %1 = load i32, i32* %arrayidx, align 4
+ %arrayidx3 = getelementptr inbounds i32, i32* %o_arg, i64 %idxprom
+ store i32 %1, i32* %arrayidx3, align 4
+ ret void
+}
+
+define protected amdgpu_kernel void @_Z15kernel_with_ldsPiS_(i32 addrspace(1)* nocapture readonly %i_arg.coerce, i32 addrspace(1)* nocapture %o_arg.coerce) local_unnamed_addr {
+; GCN-LABEL: entry:
+; GCN: %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32*
+; GCN-NEXT: %1 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32*
+; GCN-NEXT: %2 = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT: %idxprom = zext i32 %2 to i64
+; GCN-NEXT: %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %i_arg.coerce, i64 %idxprom
+; GCN-NEXT: %3 = load i32, i32 addrspace(1)* %arrayidx3, align 4
+; GCN-NEXT: %arrayidx54 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ15kernel_with_ldsPiS_E4smem, i32 0, i32 %2
+; GCN-NEXT: store i32 %3, i32 addrspace(3)* %arrayidx54, align 4
+; GCN-NEXT: %add = add nuw nsw i32 %2, 1
+; GCN-NEXT: %arrayidx116 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ15kernel_with_ldsPiS_E4smem, i32 0, i32 %add
+; GCN-NEXT: %4 = load i32, i32 addrspace(3)* %arrayidx116, align 4
+; GCN-NEXT: %mul = mul nsw i32 %4, %3
+; GCN-NEXT: %arrayidx147 = getelementptr inbounds i32, i32 addrspace(1)* %o_arg.coerce, i64 %idxprom
+; GCN-NEXT: store i32 %mul, i32 addrspace(1)* %arrayidx147, align 4
+; GCN-NEXT: tail call fastcc void @_Z20function_with_no_ldsPiS_(i32* %0, i32* %1)
+; GCN-NEXT: ret void
+entry:
+ %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32*
+ %1 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32*
+ %2 = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %idxprom = zext i32 %2 to i64
+ %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %i_arg.coerce, i64 %idxprom
+ %3 = load i32, i32 addrspace(1)* %arrayidx3, align 4
+ %arrayidx54 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ15kernel_with_ldsPiS_E4smem, i32 0, i32 %2
+ store i32 %3, i32 addrspace(3)* %arrayidx54, align 4
+ %add = add nuw nsw i32 %2, 1
+ %arrayidx116 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ15kernel_with_ldsPiS_E4smem, i32 0, i32 %add
+ %4 = load i32, i32 addrspace(3)* %arrayidx116, align 4
+ %mul = mul nsw i32 %4, %3
+ %arrayidx147 = getelementptr inbounds i32, i32 addrspace(1)* %o_arg.coerce, i64 %idxprom
+ store i32 %mul, i32 addrspace(1)* %arrayidx147, align 4
+ tail call fastcc void @_Z20function_with_no_ldsPiS_(i32* %0, i32* %1)
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-two-lds-arguments.ll b/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-two-lds-arguments.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-two-lds-arguments.ll
@@ -0,0 +1,184 @@
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -implement-amdgpu-device-scope-shared-variable -S < %s | FileCheck -check-prefixes=NEW-LDS,OLD-LDS,NEW-PARAM,GCN %s
+
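+; This test exercises two LDS variables defined within a single device
+; function (@_Z22function_four_with_ldsPiS_) that is reachable from two
+; different kernels only through LDS-free wrapper functions. Each kernel
+; is expected to get its own 3072-byte *.Single.LDS.Layout, and two new
+; i32 addrspace(3)* arguments should be threaded along every call graph
+; path from the kernels down to the function that defines the LDS.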
+; OLD-LDS-NOT: @_ZZ22function_four_with_ldsPiS_E14smem_dev_four1
+; OLD-LDS-NOT: @_ZZ22function_four_with_ldsPiS_E14smem_dev_four2
+; OLD-LDS-NOT: @_ZZ19kernel_two_with_ldsPiS_E13smem_kern_two
+; OLD-LDS-NOT: @_ZZ19kernel_one_with_ldsPiS_E13smem_kern_one
+; NEW-LDS: @_Z19kernel_two_with_ldsPiS_.Single.LDS.Layout
+; NEW-LDS: @_Z19kernel_one_with_ldsPiS_.Single.LDS.Layout
+; OLD-LDS-NOT: @_ZZ22function_four_with_ldsPiS_E14smem_dev_four1
+; OLD-LDS-NOT: @_ZZ22function_four_with_ldsPiS_E14smem_dev_four2
+; OLD-LDS-NOT: @_ZZ19kernel_two_with_ldsPiS_E13smem_kern_two
+; OLD-LDS-NOT: @_ZZ19kernel_one_with_ldsPiS_E13smem_kern_one
+@_ZZ22function_four_with_ldsPiS_E14smem_dev_four1 = internal unnamed_addr addrspace(3) global [256 x i32] undef, align 16
+@_ZZ22function_four_with_ldsPiS_E14smem_dev_four2 = internal unnamed_addr addrspace(3) global [256 x i32] undef, align 16
+@_ZZ19kernel_two_with_ldsPiS_E13smem_kern_two = internal unnamed_addr addrspace(3) global [256 x i32] undef, align 16
+@_ZZ19kernel_one_with_ldsPiS_E13smem_kern_one = internal unnamed_addr addrspace(3) global [256 x i32] undef, align 16
+
+; NEW-PARAM: i32 addrspace(3)* %0
+; NEW-PARAM: i32 addrspace(3)* %1
+define internal fastcc void @_Z22function_four_with_ldsPiS_(i32* nocapture readonly %i_arg, i32* nocapture %o_arg) unnamed_addr {
+; GCN-LABEL: entry:
+; GCN: %2 = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT: %idxprom = zext i32 %2 to i64
+; GCN-NEXT: %arrayidx = getelementptr inbounds i32, i32* %i_arg, i64 %idxprom
+; GCN-NEXT: %3 = load i32, i32* %arrayidx, align 4
+; GCN-NEXT: %.ptr.arith.4 = getelementptr inbounds i32, i32 addrspace(3)* %1, i32 %2
+; GCN-NEXT: store i32 %3, i32 addrspace(3)* %.ptr.arith.4, align 4
+; GCN-NEXT: %add = add nuw nsw i32 %2, 1
+; GCN-NEXT: %.ptr.arith.3 = getelementptr inbounds i32, i32 addrspace(3)* %1, i32 %add
+; GCN-NEXT: %4 = load i32, i32 addrspace(3)* %.ptr.arith.3, align 4
+; GCN-NEXT: %add10 = add nsw i32 %3, %4
+; GCN-NEXT: %.ptr.arith.2 = getelementptr inbounds i32, i32 addrspace(3)* %0, i32 %2
+; GCN-NEXT: store i32 %add10, i32 addrspace(3)* %.ptr.arith.2, align 4
+; GCN-NEXT: %.ptr.arith.1 = getelementptr inbounds i32, i32 addrspace(3)* %0, i32 %add
+; GCN-NEXT: %5 = load i32, i32 addrspace(3)* %.ptr.arith.1, align 4
+; GCN-NEXT: %mul = mul nsw i32 %5, %3
+; GCN-NEXT: %arrayidx23 = getelementptr inbounds i32, i32* %o_arg, i64 %idxprom
+; GCN-NEXT: store i32 %mul, i32* %arrayidx23, align 4
+; GCN-NEXT: ret void
+entry:
+ %0 = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %idxprom = zext i32 %0 to i64
+ %arrayidx = getelementptr inbounds i32, i32* %i_arg, i64 %idxprom
+ %1 = load i32, i32* %arrayidx, align 4
+ %arrayidx32 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ22function_four_with_ldsPiS_E14smem_dev_four1, i32 0, i32 %0
+ store i32 %1, i32 addrspace(3)* %arrayidx32, align 4
+ %add = add nuw nsw i32 %0, 1
+ %arrayidx63 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ22function_four_with_ldsPiS_E14smem_dev_four1, i32 0, i32 %add
+ %2 = load i32, i32 addrspace(3)* %arrayidx63, align 4
+ %add10 = add nsw i32 %1, %2
+ %arrayidx134 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ22function_four_with_ldsPiS_E14smem_dev_four2, i32 0, i32 %0
+ store i32 %add10, i32 addrspace(3)* %arrayidx134, align 4
+ %arrayidx206 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ22function_four_with_ldsPiS_E14smem_dev_four2, i32 0, i32 %add
+ %3 = load i32, i32 addrspace(3)* %arrayidx206, align 4
+ %mul = mul nsw i32 %3, %1
+ %arrayidx23 = getelementptr inbounds i32, i32* %o_arg, i64 %idxprom
+ store i32 %mul, i32* %arrayidx23, align 4
+ ret void
+}
+
+; NEW-PARAM: i32 addrspace(3)* %0
+; NEW-PARAM: i32 addrspace(3)* %1
+define internal fastcc void @_Z26function_three_with_no_ldsPiS_(i32* nocapture readonly %i_arg, i32* nocapture %o_arg) unnamed_addr {
+; GCN-LABEL: entry:
+; GCN: tail call fastcc void @_Z22function_four_with_ldsPiS_(i32* %i_arg, i32* %o_arg, i32 addrspace(3)* %0, i32 addrspace(3)* %1)
+; GCN-NEXT: ret void
+entry:
+ tail call fastcc void @_Z22function_four_with_ldsPiS_(i32* %i_arg, i32* %o_arg)
+ ret void
+}
+
+; NEW-PARAM: i32 addrspace(3)* %0
+; NEW-PARAM: i32 addrspace(3)* %1
+define internal fastcc void @_Z24function_two_with_no_ldsPiS_(i32* nocapture readonly %i_arg, i32* nocapture %o_arg) unnamed_addr {
+; GCN-LABEL: entry:
+; GCN: tail call fastcc void @_Z26function_three_with_no_ldsPiS_(i32* %i_arg, i32* %o_arg, i32 addrspace(3)* %0, i32 addrspace(3)* %1)
+; GCN-NEXT: ret void
+entry:
+ tail call fastcc void @_Z26function_three_with_no_ldsPiS_(i32* %i_arg, i32* %o_arg)
+ ret void
+}
+
+; NEW-PARAM: i32 addrspace(3)* %0
+; NEW-PARAM: i32 addrspace(3)* %1
+define internal fastcc void @_Z24function_one_with_no_ldsPiS_(i32* nocapture readonly %i_arg, i32* nocapture %o_arg) unnamed_addr {
+; GCN-LABEL: entry:
+; GCN: tail call fastcc void @_Z26function_three_with_no_ldsPiS_(i32* %i_arg, i32* %o_arg, i32 addrspace(3)* %0, i32 addrspace(3)* %1)
+; GCN-NEXT: ret void
+entry:
+ tail call fastcc void @_Z26function_three_with_no_ldsPiS_(i32* %i_arg, i32* %o_arg)
+ ret void
+}
+
+define protected amdgpu_kernel void @_Z19kernel_two_with_ldsPiS_(i32 addrspace(1)* nocapture readonly %i_arg.coerce, i32 addrspace(1)* nocapture %o_arg.coerce) local_unnamed_addr {
+; GCN-LABEL: entry:
+; GCN: %dssv.gep.3 = getelementptr inbounds [3072 x i8], [3072 x i8] addrspace(3)* @_Z19kernel_two_with_ldsPiS_.Single.LDS.Layout, i32 0, i64 2048
+; GCN-NEXT: %dssv.cast.3 = bitcast i8 addrspace(3)* %dssv.gep.3 to i32 addrspace(3)*
+; GCN-NEXT: %dssv.gep.2 = getelementptr inbounds [3072 x i8], [3072 x i8] addrspace(3)* @_Z19kernel_two_with_ldsPiS_.Single.LDS.Layout, i32 0, i64 1024
+; GCN-NEXT: %dssv.cast.2 = bitcast i8 addrspace(3)* %dssv.gep.2 to i32 addrspace(3)*
+; GCN-NEXT: %dssv.gep.1 = getelementptr inbounds [3072 x i8], [3072 x i8] addrspace(3)* @_Z19kernel_two_with_ldsPiS_.Single.LDS.Layout, i32 0, i64 0
+; GCN-NEXT: %dssv.cast.1 = bitcast i8 addrspace(3)* %dssv.gep.1 to i32 addrspace(3)*
+; GCN-NEXT: %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32*
+; GCN-NEXT: %1 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32*
+; GCN-NEXT: %2 = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT: %idxprom = zext i32 %2 to i64
+; GCN-NEXT: %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %i_arg.coerce, i64 %idxprom
+; GCN-NEXT: %3 = load i32, i32 addrspace(1)* %arrayidx6, align 4
+; GCN-NEXT: %dssv.cast.1.ptr.arith.6 = getelementptr inbounds i32, i32 addrspace(3)* %dssv.cast.1, i32 %2
+; GCN-NEXT: store i32 %3, i32 addrspace(3)* %dssv.cast.1.ptr.arith.6, align 4
+; GCN-NEXT: %add = add nuw nsw i32 %2, 1
+; GCN-NEXT: %dssv.cast.1.ptr.arith.5 = getelementptr inbounds i32, i32 addrspace(3)* %dssv.cast.1, i32 %add
+; GCN-NEXT: %4 = load i32, i32 addrspace(3)* %dssv.cast.1.ptr.arith.5, align 4
+; GCN-NEXT: %mul = mul nsw i32 %4, %3
+; GCN-NEXT: %arrayidx1410 = getelementptr inbounds i32, i32 addrspace(1)* %o_arg.coerce, i64 %idxprom
+; GCN-NEXT: store i32 %mul, i32 addrspace(1)* %arrayidx1410, align 4
+; GCN-NEXT: tail call fastcc void @_Z24function_one_with_no_ldsPiS_(i32* %0, i32* %1, i32 addrspace(3)* %dssv.cast.2, i32 addrspace(3)* %dssv.cast.3)
+; GCN-NEXT: tail call fastcc void @_Z24function_two_with_no_ldsPiS_(i32* %0, i32* %1, i32 addrspace(3)* %dssv.cast.2, i32 addrspace(3)* %dssv.cast.3)
+; GCN-NEXT: ret void
+entry:
+ %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32*
+ %1 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32*
+ %2 = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %idxprom = zext i32 %2 to i64
+ %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %i_arg.coerce, i64 %idxprom
+ %3 = load i32, i32 addrspace(1)* %arrayidx6, align 4
+ %arrayidx57 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ19kernel_two_with_ldsPiS_E13smem_kern_two, i32 0, i32 %2
+ store i32 %3, i32 addrspace(3)* %arrayidx57, align 4
+ %add = add nuw nsw i32 %2, 1
+ %arrayidx119 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ19kernel_two_with_ldsPiS_E13smem_kern_two, i32 0, i32 %add
+ %4 = load i32, i32 addrspace(3)* %arrayidx119, align 4
+ %mul = mul nsw i32 %4, %3
+ %arrayidx1410 = getelementptr inbounds i32, i32 addrspace(1)* %o_arg.coerce, i64 %idxprom
+ store i32 %mul, i32 addrspace(1)* %arrayidx1410, align 4
+ tail call fastcc void @_Z24function_one_with_no_ldsPiS_(i32* %0, i32* %1)
+ tail call fastcc void @_Z24function_two_with_no_ldsPiS_(i32* %0, i32* %1)
+ ret void
+}
+
+define protected amdgpu_kernel void @_Z19kernel_one_with_ldsPiS_(i32 addrspace(1)* nocapture readonly %i_arg.coerce, i32 addrspace(1)* nocapture %o_arg.coerce) local_unnamed_addr {
+; GCN-LABEL: entry:
+; GCN: %dssv.gep.6 = getelementptr inbounds [3072 x i8], [3072 x i8] addrspace(3)* @_Z19kernel_one_with_ldsPiS_.Single.LDS.Layout, i32 0, i64 2048
+; GCN-NEXT: %dssv.cast.6 = bitcast i8 addrspace(3)* %dssv.gep.6 to i32 addrspace(3)*
+; GCN-NEXT: %dssv.gep.5 = getelementptr inbounds [3072 x i8], [3072 x i8] addrspace(3)* @_Z19kernel_one_with_ldsPiS_.Single.LDS.Layout, i32 0, i64 1024
+; GCN-NEXT: %dssv.cast.5 = bitcast i8 addrspace(3)* %dssv.gep.5 to i32 addrspace(3)*
+; GCN-NEXT: %dssv.gep.4 = getelementptr inbounds [3072 x i8], [3072 x i8] addrspace(3)* @_Z19kernel_one_with_ldsPiS_.Single.LDS.Layout, i32 0, i64 0
+; GCN-NEXT: %dssv.cast.4 = bitcast i8 addrspace(3)* %dssv.gep.4 to i32 addrspace(3)*
+; GCN-NEXT: %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32*
+; GCN-NEXT: %1 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32*
+; GCN-NEXT: %2 = tail call i32 @llvm.amdgcn.workitem.id.x()
+; GCN-NEXT: %idxprom = zext i32 %2 to i64
+; GCN-NEXT: %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %i_arg.coerce, i64 %idxprom
+; GCN-NEXT: %3 = load i32, i32 addrspace(1)* %arrayidx6, align 4
+; GCN-NEXT: %dssv.cast.4.ptr.arith.8 = getelementptr inbounds i32, i32 addrspace(3)* %dssv.cast.4, i32 %2
+; GCN-NEXT: store i32 %3, i32 addrspace(3)* %dssv.cast.4.ptr.arith.8, align 4
+; GCN-NEXT: %add = add nuw nsw i32 %2, 1
+; GCN-NEXT: %dssv.cast.4.ptr.arith.7 = getelementptr inbounds i32, i32 addrspace(3)* %dssv.cast.4, i32 %add
+; GCN-NEXT: %4 = load i32, i32 addrspace(3)* %dssv.cast.4.ptr.arith.7, align 4
+; GCN-NEXT: %mul = mul nsw i32 %4, %3
+; GCN-NEXT: %arrayidx1410 = getelementptr inbounds i32, i32 addrspace(1)* %o_arg.coerce, i64 %idxprom
+; GCN-NEXT: store i32 %mul, i32 addrspace(1)* %arrayidx1410, align 4
+; GCN-NEXT: tail call fastcc void @_Z24function_one_with_no_ldsPiS_(i32* %0, i32* %1, i32 addrspace(3)* %dssv.cast.5, i32 addrspace(3)* %dssv.cast.6)
+; GCN-NEXT: tail call fastcc void @_Z24function_two_with_no_ldsPiS_(i32* %0, i32* %1, i32 addrspace(3)* %dssv.cast.5, i32 addrspace(3)* %dssv.cast.6)
+; GCN-NEXT: ret void
+entry:
+ %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32*
+ %1 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32*
+ %2 = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %idxprom = zext i32 %2 to i64
+ %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %i_arg.coerce, i64 %idxprom
+ %3 = load i32, i32 addrspace(1)* %arrayidx6, align 4
+ %arrayidx57 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ19kernel_one_with_ldsPiS_E13smem_kern_one, i32 0, i32 %2
+ store i32 %3, i32 addrspace(3)* %arrayidx57, align 4
+ %add = add nuw nsw i32 %2, 1
+ %arrayidx119 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ19kernel_one_with_ldsPiS_E13smem_kern_one, i32 0, i32 %add
+ %4 = load i32, i32 addrspace(3)* %arrayidx119, align 4
+ %mul = mul nsw i32 %4, %3
+ %arrayidx1410 = getelementptr inbounds i32, i32 addrspace(1)* %o_arg.coerce, i64 %idxprom
+ store i32 %mul, i32 addrspace(1)* %arrayidx1410, align 4
+ tail call fastcc void @_Z24function_one_with_no_ldsPiS_(i32* %0, i32* %1)
+ tail call fastcc void @_Z24function_two_with_no_ldsPiS_(i32* %0, i32* %1)
+ ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
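+
+; Layout sketch (informal, inferred from the checks above): byte offsets 0,
+; 1024 and 2048 of each kernel's 3072-byte *.Single.LDS.Layout appear to
+; hold, respectively, the kernel's own [256 x i32] array and the two
+; [256 x i32] arrays hoisted out of @_Z22function_four_with_ldsPiS_.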