diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -271,6 +271,10 @@
 void initializeGCNNSAReassignPass(PassRegistry &);
 extern char &GCNNSAReassignID;
 
+ModulePass *createAMDGPUDeviceScopeSharedVariablePass();
+void initializeAMDGPUDeviceScopeSharedVariablePass(PassRegistry &);
+extern char &AMDGPUDeviceScopeSharedVariableID;
+
 namespace AMDGPU {
 enum TargetIndex {
   TI_CONSTDATA_START,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
@@ -123,13 +123,15 @@
   // should only appear when IPO passes manages to move LDs defined in a kernel
   // into a single user function.
 
-  for (GlobalVariable &GV : M.globals()) {
-    // TODO: Region address
-    unsigned AS = GV.getAddressSpace();
-    if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS)
-      continue;
-
-    recursivelyVisitUsers(GV, FuncsToAlwaysInline);
+  if (!AMDGPUTargetMachine::EnableDeviceScopeSharedVariable) {
+    for (GlobalVariable &GV : M.globals()) {
+      // TODO: Region address
+      unsigned AS = GV.getAddressSpace();
+      if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS)
+        continue;
+
+      recursivelyVisitUsers(GV, FuncsToAlwaysInline);
+    }
   }
 
   if (!AMDGPUTargetMachine::EnableFunctionCalls || StressCalls) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUDeviceScopeSharedVariable.cpp b/llvm/lib/Target/AMDGPU/AMDGPUDeviceScopeSharedVariable.cpp
new file
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUDeviceScopeSharedVariable.cpp
@@ -0,0 +1,1548 @@
+//===-- AMDGPUDeviceScopeSharedVariable.cpp -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// [1]. What is this module pass about?
+//
+// This module pass implements an "indirect" method for supporting "shared"
+// variables which are defined within "device" functions. Programmatically
+// speaking, it is a "hacked" way of supporting device scoped shared variables.
+// It is a hacked way because direct support for device scoped shared
+// variables has become very complicated, mainly for the following reason:
+//
+// A shared variable is a "block" scoped variable, but its lifetime is the same
+// as that of the "work-group" to which it belongs. This necessitates keeping
+// track of multiple copies of shared variables related to different
+// work-groups from different running kernels, which is too costly and too
+// complex to implement given the "scarcity" of shared memory and the "strange
+// properties" of shared variables.
+//
+// [2]. What does this pass do?
+//
+// Though programmers define device scoped shared variables within device
+// functions, this module pass:
+//
+// A. Internally pushes the definitions of all those device scoped shared
+//    variables into the associated kernel(s). By associated kernels, we mean
+//    all those kernels from which there exist call graph paths to the device
+//    functions within which the device scoped shared variables are originally
+//    defined.
+// B. Implements the necessary program transformations to make sure that the
+//    new, big device scoped shared variable layout defined within the kernel
+//    is accessible within the device functions where those original device
+//    scoped shared variables were defined.
+//
+// [3]. How does the implementation of the pass look in brief?
+//
+// At a very high level, the implementation of this pass can be described as
+// below:
+//
+// A. For every kernel, traverse its call graph and collect all those device
+//    functions within which device scoped shared variables are defined, along
+//    with the corresponding device scoped shared variables.
+// B. Create a single big shared memory layout within the kernel by combining
+//    all those device scoped shared variables collected above.
+// C. Map each corresponding device scoped shared variable to its "offset"
+//    within the above defined big shared memory layout, per kernel.
+// D. Associate each kernel with a unique non-negative integer starting from 0,
+//    which we call the `kernel number`, and similarly associate each device
+//    scoped shared variable with a unique non-negative integer, again starting
+//    from 0, which we call the `lds number`.
+// E. Create a 2D offset table (within constant memory) where each row
+//    represents a different kernel and each column represents a different
+//    device scoped shared variable, and fill this table with the offsets
+//    computed in item C above.
+// F. Within each kernel, type-cast the corresponding single big shared memory
+//    layout to `char*`, and pass this type-casted pointer and the kernel
+//    number as new function arguments along the call graph path(s) so that
+//    they are accessible within those device functions where the original
+//    device scoped shared variables were defined.
+// G. Within device functions, replace all references to the original device
+//    scoped shared variables by their offset counterparts.
+// H. Finally, remove all the original device scoped shared variables.
+//
+// [4]. What are the positive consequences of this pass on the applications?
+//
+// Does one really exist?
+//
+// [5]. What are the negative consequences of this pass on the applications?
+//
+// A. Since we add new arguments to the functions along the call graph paths
+//    from kernels to device functions, there is a possibility of increased
+//    register pressure, which may affect performance.
+// B. Since we create single big shared memory layouts within kernels, we end
+//    up duplicating the shared memory of indirect shared variables within
+//    kernels; however, this is still less costly compared to direct support.
+// C. This implementation is indeed a very careful hack, and hence any bug in
+//    the implementation may have an adverse effect on the running application.
+//
+// [6]. What are some important current limitations of the pass?
+//
+// A. The implementation assumes that there are no recursive calls to device
+//    functions (either direct or indirect), and hence that the call graphs
+//    associated with kernels are acyclic. The presence of recursive calls may
+//    lead to undefined behaviour of the running application, or compilation
+//    may itself crash.
+// B. The handling of the presence of any indirect call(s) completely depends
+//    on the pass `called-value-propagation`, which inserts metadata at
+//    indirect call sites. This metadata indicates the set of functions the
+//    call site could possibly target at runtime.
+//    If, for some reason, this pass fails to attach the above metadata, it
+//    may lead to undefined behaviour of the running application, or
+//    compilation may itself crash.
+// C. All TODOs need to be revisited sooner rather than later.
+//
+// [7]. An Example.
+//
+// Before Pass:
+//
+// __device__ void foo()
+// {
+//   __shared__ char smc[10];
+//   __shared__ int smi[10];
+//   __shared__ float smf[10];
+//
+//   smc[1] = 1;
+//   smi[2] = 2;
+//   smf[3] = 3.0;
+// }
+//
+// __global__ void kernel()
+// {
+//   foo();
+// }
+//
+// After Pass:
+//
+// int offset_table[1][3] = {0, 10, 50};
+//
+// __device__ void foo(char *sm, int knum)
+// {
+//   int c_offset = offset_table[knum][0];
+//   int i_offset = offset_table[knum][1];
+//   int f_offset = offset_table[knum][2];
+//
+//   char *c_addr = (char*)(sm + c_offset);
+//   int *i_addr = (int*)(sm + i_offset);
+//   float *f_addr = (float*)(sm + f_offset);
+//
+//   c_addr[1] = 1;
+//   i_addr[2] = 2;
+//   f_addr[3] = 3.0;
+// }
+//
+// __global__ void kernel()
+// {
+//   __shared__ char sm[90]; // assuming char occupies 1 byte, int
+//                           // occupies 4 bytes, and float occupies 4
+//                           // bytes.
+//   foo(sm, 0);             // 0 is kernel number.
+// }
+//
+// NOTE: This pass is disabled by default, and enabled with the AMDGPU back-end
+// option `--amdgpu-enable-device-scope-shared-variable=true`.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ValueMap.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include <map>
+#include <set>
+
+#define DEBUG_TYPE "amdgpu-device-scope-shared-variable"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUDeviceScopeSharedVariable : public ModulePass {
+public:
+  static char ID;
+
+  AMDGPUDeviceScopeSharedVariable() : ModulePass(ID) {
+    initializeAMDGPUDeviceScopeSharedVariablePass(
+        *PassRegistry::getPassRegistry());
+  }
+
+  bool runOnModule(Module &M) override;
+};
+
+} // namespace
+
+char AMDGPUDeviceScopeSharedVariable::ID = 0;
+
+char &llvm::AMDGPUDeviceScopeSharedVariableID =
+    AMDGPUDeviceScopeSharedVariable::ID;
+
+ModulePass *llvm::createAMDGPUDeviceScopeSharedVariablePass() {
+  return new AMDGPUDeviceScopeSharedVariable();
+}
+
+INITIALIZE_PASS(AMDGPUDeviceScopeSharedVariable,
+                "implement-amdgpu-device-scope-shared-variable",
+                "Implement AMDGPU Device Scope Shared Variable",
+                false /*only look at the cfg*/, false /*analysis pass*/)
+
+static unsigned getTypeStoreSizeInBits(Module &M, Type *Ty) {
+  return M.getDataLayout().getTypeSizeInBits(Ty).getFixedSize();
+}
+
+static unsigned getTypeStoreSizeInBytes(Module &M, Type *Ty) {
+  return getTypeStoreSizeInBits(M, Ty) / 8;
+}
+
+// TODO: This really looks like a horrible hack, but is there any better way
+// of handling `ConstantExprs`? No better idea comes to mind at the moment;
+// this needs to be revisited later.
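+// The helper below works around this: it recursively walks the users of a
+// `ConstantExpr` which refers to an LDS global until it reaches a real
+// `Instruction`, materializes the constant expression as an equivalent
+// instruction via `getAsInstruction()`, inserts that instruction just before
+// the user, and redirects the user's operands from the constant expression to
+// the newly materialized instruction.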
+static Instruction *replaceConstExprByInst(ConstantExpr *CE) { + for (auto *U : CE->users()) { + auto *I = dyn_cast(U); + + if (!I) + I = replaceConstExprByInst(dyn_cast(U)); + + if (I) { + auto *NI = CE->getAsInstruction(); + NI->insertBefore(I); + unsigned Ind = 0; + for (Use &UU : I->operands()) { + Value *V = UU.get(); + if (V == CE) + I->setOperand(Ind, NI); + ++Ind; + } + return NI; + } + } + + return nullptr; +} + +static void replaceInstWhichUsesLDS(Module &M, GlobalVariable *LDS, + Value *LDSAccessInst, Function *F, + Instruction *I, + SetVector &ToBeErasedInsts) { + // The function associated with the `I` shoud be the one where `LDS` is + // actually defined. +#ifndef NDEBUG + assert(I->getParent()->getParent() == F && + "The reference to LDS should only exists within the function where it " + "is actually defined\n"); +#endif + + // Create clone of `I`, say, it is `NewI`. Within `NewI`, replace the use(s) + // of `LDS` by `LDSAccessInst`. + Instruction *NewI = I->clone(); + unsigned Ind = 0; + for (Use &UU : NewI->operands()) { + Value *V = UU.get(); + if (V == LDS) + NewI->setOperand(Ind, LDSAccessInst); + ++Ind; + } + + // Insert `NewI` just before `I`, replace all uses of `I` by `NewI` and mark + // `I` as `to be erased`instruction. + NewI->insertBefore(I); + NewI->copyMetadata(*I); + I->replaceAllUsesWith(NewI); + ToBeErasedInsts.insert(I); +} + +static void updateAllUsersOfOriginalLDS(Module &M, GlobalVariable *LDS, + Function *F, + Instruction *LDSAccessInst) { + // Keep track of all the erased to be instructions. + SetVector ToBeErasedInsts; + + // Traverse through each `use` of `LDS`, create a new to be replaced value + // for each use case, and accordingly replace it with new one. + for (auto *U : LDS->users()) { + // `U` may be using `LDS`, but 'U` itself is not used anywhere, ignore `U`. + if (!U->getNumUses()) + continue; + + // Cast away const-ness from `U`. + User *UU = const_cast(U); + + if (auto *I = dyn_cast(UU)) { + replaceInstWhichUsesLDS(M, LDS, LDSAccessInst, F, I, ToBeErasedInsts); + } else if (auto *CE = dyn_cast(UU)) { + // TODO: Performance issues because of converting `ConstExpr` to + // `Instruction`? + auto *I = replaceConstExprByInst(CE); + replaceInstWhichUsesLDS(M, LDS, LDSAccessInst, F, I, ToBeErasedInsts); + CE->removeDeadConstantUsers(); + } else + llvm_unreachable("Not Implemented."); // TODO: What else is missing? + } + + // Erase all the instructions which are got replaced by new ones. + for (auto *I : ToBeErasedInsts) + I->eraseFromParent(); +} + +static Instruction *insertInstructionsToAccessNewLDSLayout( + Module &M, GlobalVariable *LDS, Function *F, + std::map &LDSToID, + GlobalVariable *LDSOffsetTable) { + // Suffix the names of the instructions with unique integer values. + static int Suffix = 0; + ++Suffix; + + // Get the first insert point of the entry basic block of current kernel. + auto BI = F->getEntryBlock().getFirstInsertionPt(); +#ifndef NDEBUG + assert(BI != F->getEntryBlock().end() && + "Entry basic block of the function cannot be empty, otherwise control " + "would not reach this point\n"); +#endif + auto &EI = *BI; + + // Get LDS offset table indicies. + auto *KIndex = F->getArg(F->arg_size() - 1); + auto *LIndex = Constant::getIntegerValue(Type::getInt64Ty(M.getContext()), + APInt(64, LDSToID[LDS])); + + // Insert GEP instruction which access the address + // `((LDSOffsetTable + KIndex) + LIndex)`, say, the result is `GEPInst` which + // is of type `Int64*`. 
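+  // In the lit test added with this patch this comes out as, e.g.:
+  //   %dssv.gep1.1 = getelementptr inbounds [2 x [1 x i64]],
+  //       [2 x [1 x i64]] addrspace(4)* @__LDSGlobalsOffsetTable__,
+  //       i32 0, i64 %1, i64 0
+  //   %dssv.load.1 = load i64, i64 addrspace(4)* %dssv.gep1.1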
+ Value *Indices[] = {Constant::getNullValue(Type::getInt32Ty(M.getContext())), + KIndex, LIndex}; + auto *GEPInst = GetElementPtrInst::CreateInBounds( + LDSOffsetTable->getValueType(), LDSOffsetTable, Indices, + Twine("dssv.gep1.") + Twine(Suffix), const_cast(&EI)); + + // Insert LOAD instruction which loads `offset` value from LDS offset table. + auto *LInst = new LoadInst(GEPInst->getType()->getPointerElementType(), + GEPInst, Twine("dssv.load.") + Twine(Suffix), + const_cast(&EI)); + + // Get the base pointer of new LDS layout. + auto *BasePtr = F->getArg(F->arg_size() - 2); + + // Insert GEP instruction which access the address `BasePtr + LInst`, say, the + // result is `GEPInst2` which if of type 'char*`. + Instruction *GEPInst2 = GetElementPtrInst::CreateInBounds( + BasePtr->getType()->getPointerElementType(), BasePtr, LInst, + Twine("dssv.gep2.") + Twine(Suffix), const_cast(&EI)); + + // Insert type-cast instruction just after above inserted GEP instruction + // which type-casts GEP instruction from `char*` to `ldstype*` where `ldstype` + // is the type of original LDS global. + Instruction *CastInst = new BitCastInst(GEPInst2, LDS->getType(), + Twine("dssv.cast.") + Twine(Suffix), + const_cast(&EI)); + + return CastInst; +} + +static void updateFunctionAssociatedWithLDS( + Module &M, GlobalVariable *LDS, + ValueMap &LDSToFunction, + std::map &LDSToID, + GlobalVariable *LDSOffsetTable) { + // Get end-callee associated with `LDS`. + auto *F = LDSToFunction[LDS]; + + // Insert necessary instruction(s) within `F` in order to access new LDS + // layout. + auto *LDSAccessInst = insertInstructionsToAccessNewLDSLayout( + M, LDS, F, LDSToID, LDSOffsetTable); + + // Finally update all the users of original LDS to refer to new LDS layout via + // above inserted LDS access instruction. + updateAllUsersOfOriginalLDS(M, LDS, F, LDSAccessInst); +} + +static void +eraseOldCallees(ValueMap &OldCalleeToNewCallee) { + // TODO: May be we can come-up with a more efficient implmentation to erase + // old callees from the module. It depends on how many callees that we land-up + // erasing in a real world hip application. May be not many, hence, as of now, + // we have just employed a simplest method of repeatedly visiting the old + // callees and removing each of them, once their number of uses become 0. + bool Loopover = true; + while (Loopover) { + Loopover = false; + for (auto OI = OldCalleeToNewCallee.begin(), + OE = OldCalleeToNewCallee.end(); + OI != OE; ++OI) { + auto *OldCallee = OI->first; + if (OldCallee->getNumUses()) + continue; + OldCalleeToNewCallee.erase(OI); + OldCallee->eraseFromParent(); + Loopover = true; + } + } +} + +static void +replaceNonCallInsts(ValueMap &OldCalleeToNewCallee, + SetVector &NonCallInsts) { + for (auto *I : NonCallInsts) { + Instruction *NewI = I->clone(); + Type *NewTy = nullptr; + + for (auto OI = OldCalleeToNewCallee.begin(), + OE = OldCalleeToNewCallee.end(); + OI != OE; ++OI) { + auto *OldCallee = OI->first; + auto *NewCallee = OI->second; + unsigned Ind = 0; + for (Use &UU : NewI->operands()) { + Value *V = UU.get(); + if (V == OldCallee) { + NewI->setOperand(Ind, NewCallee); + if (!NewTy) // TODO: Can we fix this horrible hack? 
+ NewTy = NewCallee->getType(); + } + ++Ind; + } + } + + NewI->insertBefore(I); + NewI->copyMetadata(*I); + I->replaceAllUsesWith(NewI); + I->eraseFromParent(); + NewI->mutateType(NewTy); + } +} + +static void +updateNonCallInsts(ValueMap &OldCalleeToNewCallee, + SetVector &NonCallInsts) { + replaceNonCallInsts(OldCalleeToNewCallee, NonCallInsts); +} + +static void +eraseOldCallSites(std::map &OldToNewCallSite) { + for (auto KI = OldToNewCallSite.begin(), KE = OldToNewCallSite.end(); + KI != KE; ++KI) { + auto *OldCI = KI->first; + auto *NewCI = KI->second; + OldCI->replaceAllUsesWith(NewCI); + OldCI->eraseFromParent(); + } +} + +static void getNewIndirectCallTargets( + Module &M, CallInst *CI, + ValueMap &OldCalleeToNewCallee, + std::vector &NewIndirectCallTargets) { +#ifndef NDEBUG + assert(CI->isIndirectCall() && "Indirect call expected\n"); +#endif + + auto *MD = CI->getMetadata(LLVMContext::MD_callees); +#ifndef NDEBUG + assert(MD && "Metadata about indirect call targets expected\n"); +#endif + + for (auto &Op : MD->operands()) { + auto *OldCallee = mdconst::extract_or_null(Op); + auto OI = OldCalleeToNewCallee.find(OldCallee); +#ifndef NDEBUG + assert(OI != OldCalleeToNewCallee.end() && "Old callee is expected\n"); +#endif + NewIndirectCallTargets.push_back(OI->second); + } +} + +static void copyDataFromOldToNewCallSite(CallInst *OldCI, CallInst *NewCI) { + // TODO: Why copyMetadata() not copying meta data. I see metadat associated + // with CI, but it is not copied to NewCI. CI->hasMetadata() is false, why? + NewCI->copyMetadata(*OldCI); + NewCI->setTailCall(OldCI->isTailCall()); + NewCI->setCallingConv(OldCI->getCallingConv()); +} + +static FunctionType *getClonedFunctionType(Module &M, Function *Callee) { + // Create a new function type by adding new parameters to the end of existing + // parameter list. 
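+  // The cloned signature becomes `(<original params>..., i8 addrspace(3)*,
+  // i64)`: the `i8 addrspace(3)*` parameter carries the base pointer of the
+  // kernel-specific LDS layout, and the trailing `i64` carries the kernel
+  // number used to index into the LDS offset table.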
+ auto *BasePtrTy = Type::getInt8PtrTy(M.getContext(), AMDGPUAS::LOCAL_ADDRESS); + auto *KernNumTy = Type::getInt64Ty(M.getContext()); + + SmallVector NewParams; + auto *FnTy = Callee->getFunctionType(); + for (auto PI = FnTy->param_begin(), PE = FnTy->param_end(); PI != PE; ++PI) + NewParams.push_back(*PI); + + NewParams.push_back(BasePtrTy); + NewParams.push_back(KernNumTy); + + auto *NewFnTy = + FunctionType::get(FnTy->getReturnType(), NewParams, FnTy->isVarArg()); + + return NewFnTy; +} + +static void +getNewArgumentList(Module &M, std::map &KernelToID, + std::map &KernelToBasePtrInst, + CallInst *CI, SmallVectorImpl &NewArgs) { + Value *BasePtrArg = nullptr; + Value *KernNumArg = nullptr; + + auto *Caller = CI->getParent()->getParent(); + if (Caller->getCallingConv() == CallingConv::AMDGPU_KERNEL) { + BasePtrArg = KernelToBasePtrInst[Caller]; + auto KernelNum = KernelToID[Caller]; + KernNumArg = Constant::getIntegerValue(Type::getInt64Ty(M.getContext()), + APInt(64, KernelNum)); + } else { + auto ArgSize = Caller->arg_size(); + BasePtrArg = Caller->getArg(ArgSize - 2); + KernNumArg = Caller->getArg(ArgSize - 1); + } + + for (auto AI = CI->arg_begin(), AE = CI->arg_end(); AI != AE; ++AI) + NewArgs.push_back(*AI); + NewArgs.push_back(BasePtrArg); + NewArgs.push_back(KernNumArg); +} + +static void replaceIndirectCallSites( + Module &M, std::map &KernelToID, + std::map &KernelToBasePtrInst, + ValueMap &OldCalleeToNewCallee, + ValueMap &IndirectCallSiteToCallee) { + MDBuilder MDB(M.getContext()); + std::map OldToNewCallSite; + for (auto SI = IndirectCallSiteToCallee.begin(), + SE = IndirectCallSiteToCallee.end(); + SI != SE; ++SI) { + auto *CI = SI->first; + auto *Callee = SI->second; + + // Get new argument list which can be used to insert new call instruction. + SmallVector NewArgs; + getNewArgumentList(M, KernelToID, KernelToBasePtrInst, CI, NewArgs); + + // Get required function type. + auto *NewFnTy = getClonedFunctionType(M, Callee); + + // Insert new call instruction `NewCI` just before the existing call + // instruction `CI`. + auto *NewCI = CallInst::Create(NewFnTy, CI->getCalledOperand(), NewArgs, + Twine(""), CI); + + copyDataFromOldToNewCallSite(CI, NewCI); + + // Create `!callee` metadata which targets newly cloned functions and set it + // to `NewCI`. + std::vector NewIndirectCallTargets; + getNewIndirectCallTargets(M, CI, OldCalleeToNewCallee, + NewIndirectCallTargets); + auto *NewCallees = MDB.createCallees(NewIndirectCallTargets); + NewCI->setMetadata(LLVMContext::MD_callees, NewCallees); + + OldToNewCallSite[CI] = NewCI; + } + + eraseOldCallSites(OldToNewCallSite); +} + +static bool isCallerNewClonedFunction( + Function *F, ValueMap &NewCalleeToOldCallee) { + if (NewCalleeToOldCallee.find(F) != NewCalleeToOldCallee.end()) + return true; + + return false; +} + +static bool +isApplicableInst(Instruction *I, + ValueMap &NewCalleeToOldCallee) { + // We are only interested in the inctructions within kernel or within new + // cloned functions. 
+#ifndef NDEBUG + assert(I && "Valid instruction expected\n"); +#endif + auto *Caller = I->getParent()->getParent(); + if (Caller->getCallingConv() == CallingConv::AMDGPU_KERNEL || + isCallerNewClonedFunction(Caller, NewCalleeToOldCallee)) + return true; + + return false; +} + +static void insertToIndirectCallSiteSet( + CallInst *CI, ValueMap &OldCalleeToNewCallee, + ValueMap &NewCalleeToOldCallee, + ValueMap &IndirectCallSiteToCallee) { + auto *MD = CI->getMetadata(LLVMContext::MD_callees); + if (!MD) + return; + + // TODO: Further clean-up the logic? + for (auto &Op : MD->operands()) { + auto *Callee = mdconst::extract_or_null(Op); + if (Callee && !Callee->isDeclaration() && + OldCalleeToNewCallee.find(Callee) != OldCalleeToNewCallee.end() && + isApplicableInst(CI, NewCalleeToOldCallee) && + IndirectCallSiteToCallee.find(CI) == IndirectCallSiteToCallee.end()) { + IndirectCallSiteToCallee[CI] = Callee; + return; + } + } +} + +static void collectIndirectCallSites( + Module &M, ValueMap &OldCalleeToNewCallee, + ValueMap &NewCalleeToOldCallee, + SetVector &NonCallInsts, + ValueMap &IndirectCallSiteToCallee) { + SmallVector ValueStack; + SetVector Visited; + + for (auto *I : NonCallInsts) + ValueStack.push_back(I); + + while (!ValueStack.empty()) { + auto *V = ValueStack.pop_back_val(); + + if (!Visited.insert(V)) + continue; + + auto *CI = dyn_cast(V); + if (CI && CI->isIndirectCall()) { + insertToIndirectCallSiteSet(CI, OldCalleeToNewCallee, + NewCalleeToOldCallee, + IndirectCallSiteToCallee); + continue; + } + + for (auto *U : V->users()) + ValueStack.push_back(U); + } +} + +static void updateIndirectCallSites( + Module &M, std::map &KernelToID, + std::map &KernelToBasePtrInst, + ValueMap &OldCalleeToNewCallee, + ValueMap &NewCalleeToOldCallee, + SetVector &NonCallInsts) { + ValueMap IndirectCallSiteToCallee; + collectIndirectCallSites(M, OldCalleeToNewCallee, NewCalleeToOldCallee, + NonCallInsts, IndirectCallSiteToCallee); + replaceIndirectCallSites(M, KernelToID, KernelToBasePtrInst, + OldCalleeToNewCallee, IndirectCallSiteToCallee); +} + +static void +replaceDirectCallSites(Module &M, std::map &KernelToID, + std::map &KernelToBasePtrInst, + Function *NewCallee, + SetVector &DirectCallSites) { + std::map OldToNewCallSite; + for (auto *CI : DirectCallSites) { + // Get new argument list which can be used to insert new call instruction. + SmallVector NewArgs; + getNewArgumentList(M, KernelToID, KernelToBasePtrInst, CI, NewArgs); + + // Insert new call instruction `NewCI` just before the existing call + // instruction `CI`. + auto *NewCI = CallInst::Create(NewCallee->getFunctionType(), NewCallee, + NewArgs, Twine(""), CI); + + copyDataFromOldToNewCallSite(CI, NewCI); + + OldToNewCallSite[CI] = NewCI; + } + + eraseOldCallSites(OldToNewCallSite); +} + +static void +collectDirectCallSites(Function *OldCallee, + ValueMap &NewCalleeToOldCallee, + SetVector &DirectCallSites) { + for (auto *U : OldCallee->users()) { + auto *CI = dyn_cast(U); + if (CI && isApplicableInst(CI, NewCalleeToOldCallee)) + DirectCallSites.insert(CI); + } +} + +static void +updateDirectCallSites(Module &M, std::map &KernelToID, + std::map &KernelToBasePtrInst, + ValueMap &OldCalleeToNewCallee, + ValueMap &NewCalleeToOldCallee) { + // Update all direct call sites of all callees. 
+ for (auto OI = OldCalleeToNewCallee.begin(), OE = OldCalleeToNewCallee.end(); + OI != OE; ++OI) { + auto *OldCallee = OI->first; + auto *NewCallee = OI->second; + SetVector DirectCallSites; + collectDirectCallSites(OldCallee, NewCalleeToOldCallee, DirectCallSites); + replaceDirectCallSites(M, KernelToID, KernelToBasePtrInst, NewCallee, + DirectCallSites); + } +} + +static void +collectNonCallInsts(Function *OldCallee, + ValueMap &NewCalleeToOldCallee, + SetVector &NonCallInsts) { + for (auto *U : OldCallee->users()) { + auto *CI = dyn_cast(U); + if (!CI && isApplicableInst(dyn_cast(U), NewCalleeToOldCallee)) + NonCallInsts.insert(dyn_cast(U)); + } +} + +static void +updateCalleeUsers(Module &M, std::map &KernelToID, + std::map &KernelToBasePtrInst, + ValueMap &OldCalleeToNewCallee) { + // Create mapping from new callees to old callees. + ValueMap NewCalleeToOldCallee; + for (auto OI = OldCalleeToNewCallee.begin(), OE = OldCalleeToNewCallee.end(); + OI != OE; ++OI) + NewCalleeToOldCallee[OI->second] = OI->first; + + // Collect all non-call users of all callees, usually associated with indirect + // calls. + SetVector NonCallInsts; + for (auto OI = OldCalleeToNewCallee.begin(), OE = OldCalleeToNewCallee.end(); + OI != OE; ++OI) + collectNonCallInsts(OI->first, NewCalleeToOldCallee, NonCallInsts); + + // Update all direct call sites of all callees. + updateDirectCallSites(M, KernelToID, KernelToBasePtrInst, + OldCalleeToNewCallee, NewCalleeToOldCallee); + + // Update all indirect call sites of all callees. + updateIndirectCallSites(M, KernelToID, KernelToBasePtrInst, + OldCalleeToNewCallee, NewCalleeToOldCallee, + NonCallInsts); + + // Update all non-call user instructions which refer to old callees like + // address taken instructions for indirect calls. + updateNonCallInsts(OldCalleeToNewCallee, NonCallInsts); +} + +static void updateKernelToCalleeList( + ValueMap> &KernelToCallee, + ValueMap &OldCalleeToNewCallee) { + for (auto KI = KernelToCallee.begin(), KE = KernelToCallee.end(); KI != KE; + ++KI) { + auto *K = KI->first; + auto OldCalleeList = KI->second; + std::set NewCalleeList; + for (auto *OldCallee : OldCalleeList) { + if (OldCalleeToNewCallee.find(OldCallee) != OldCalleeToNewCallee.end()) + NewCalleeList.insert(OldCalleeToNewCallee[OldCallee]); + else + NewCalleeList.insert(OldCallee); + } + KernelToCallee[K] = NewCalleeList; + } +} + +static void +updateLDSToFunctionMap(ValueMap &LDSToFunction, + ValueMap &OldCalleeToNewCallee) { + for (auto LI = LDSToFunction.begin(), LE = LDSToFunction.end(); LI != LE; + ++LI) { + auto *LDS = LI->first; + auto *OldF = LI->second; + if (OldCalleeToNewCallee.find(OldF) != OldCalleeToNewCallee.end()) + LDSToFunction[LDS] = OldCalleeToNewCallee[OldF]; + } +} + +static Function *cloneCallee(Module &M, Function *Callee) { + // Create a new function type of to be cloned function. + auto *NewFnTy = getClonedFunctionType(M, Callee); + + // Create a copy of the `Callee`, but with new function type. + auto *NewCallee = + Function::Create(NewFnTy, Callee->getLinkage(), Callee->getAddressSpace(), + Callee->getName() + Twine(".c")); + + ValueToValueMapTy VMap; + auto *NewCalleeArgIt = NewCallee->arg_begin(); + for (auto &Arg : Callee->args()) { + auto ArgName = Arg.getName(); + NewCalleeArgIt->setName(ArgName); + VMap[&Arg] = &(*NewCalleeArgIt++); + } + + // TODO: ModuleLevelChanges should be set to true or false? 
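+  // For now `false` is used: the clone is created within the same module as
+  // the original callee, so no module-level remapping should be needed here.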
+ SmallVector Returns; + CloneFunctionInto(NewCallee, Callee, VMap, /*ModuleLevelChanges=*/false, + Returns); + + // Copy all metadata + SmallVector, 1> MDs; + Callee->getAllMetadata(MDs); + for (auto MDIt : MDs) + NewCallee->addMetadata(MDIt.first, *MDIt.second); + + // Insert `NewCallee` just before `Callee` within the module. + M.getFunctionList().insert(Callee->getIterator(), NewCallee); + + // Swap names so that new function retains original name. + auto NewName = NewCallee->getName(); + NewCallee->takeName(Callee); + Callee->setName(NewName); + + return NewCallee; +} + +static void traverseCGPathsAndCloneCallees( + Module &M, ValueMap &OldCalleeToNewCallee, + SmallVectorImpl> &CGPaths) { + // 1. Traverse all the call graph paths from kernels to callees. + // 2. For each device function encoutered while traversing, create a clone of + // it, by adding parameters to it's parameter list, but also retain the + // original device function for the moment. + for (auto CGPath : CGPaths) { + // TODO: We can in-fact assert that length of `CGPath` is atleast two. But, + // we are okay for now. + if (CGPath.size() < 2) + continue; + + // We are interested in cloning of only device functions in the call graph + // path, hence we start from second node in the call graph path. + auto PI = CGPath.begin() + 1; + auto PE = CGPath.end(); + for (; PI != PE; ++PI) { + auto *CurCallee = *PI; +#ifndef NDEBUG + assert(CurCallee && "Valid function expected\n"); +#endif + + if (OldCalleeToNewCallee.find(CurCallee) != OldCalleeToNewCallee.end()) + continue; + + // The `CurCallee` is encountered first time, create a clone of it. + auto *NewCallee = cloneCallee(M, CurCallee); + OldCalleeToNewCallee[CurCallee] = NewCallee; + } + } +} + +static void +pushCallGraphPaths(CallGraphNode *CGNode, SetVector &TopCGPath, + SmallVectorImpl> &CGPathStack, + std::map &CGNodeToCallInst) { + for (auto NI = CGNode->begin(), NE = CGNode->end(); NI != NE; ++NI) { + auto *CI = dyn_cast(NI->first.getValue()); + auto *CGN = NI->second; +#ifndef NDEBUG + assert(CI && "Call instruction associated with call graph node cannot be" + " null\n"); + assert(CGN && "Call graph node associated with function definition cannot" + " be null\n"); +#endif + SetVector ClonedCGPath(TopCGPath.begin(), TopCGPath.end()); + ClonedCGPath.insert(CGN); + CGPathStack.push_back(ClonedCGPath); + CGNodeToCallInst[CGN] = CI; + } +} + +static void +insertToCGPathSet(CallGraphNode *CGNode, Function *EndCallee, + SetVector &TopCGPath, + SmallVectorImpl> &CGPathStack, + std::map &CGNodeToCallInst, + SmallVectorImpl> &CGPaths, + bool IndirectCallHandling = false) { + auto *Callee = CGNode->getFunction(); + if (Callee && !Callee->isDeclaration()) { + if (Callee == EndCallee) { + SetVector FPath; + for (auto *CGN : TopCGPath) + FPath.insert(CGN->getFunction()); + CGPaths.push_back(FPath); + return; + } + if (IndirectCallHandling) + TopCGPath.insert(CGNode); + pushCallGraphPaths(CGNode, TopCGPath, CGPathStack, CGNodeToCallInst); + } +} + +static void collectCallGraphPathsBetweenKernelAndCallee( + Module &M, Function *K, Function *Callee, + SmallVectorImpl> &CGPaths) { + // Traverse the call graph associated with the kernel in DFS manner and + // collect all the paths from kernel to callee. + // + // TODO: Note that this algorithm will not work if there exist recursive + // calls, and the current assumption here is that the call graph is acyclic. + // We need to visit it back again to handle call graph which could contain + // cycles. 
+ auto CG = CallGraph(M); + auto *KernCGNode = CG[K]; +#ifndef NDEBUG + assert(KernCGNode && "Call graph node associated with kernel definition " + "cannot be null\n"); +#endif + + SetVector CGPath; + CGPath.insert(KernCGNode); + SmallVector, 8> CGPathStack; + std::map CGNodeToCallInst; + + pushCallGraphPaths(KernCGNode, CGPath, CGPathStack, CGNodeToCallInst); + + while (!CGPathStack.empty()) { + auto TopCGPath = CGPathStack.pop_back_val(); + auto *CGNode = TopCGPath.back(); + + if (CGNode->getFunction()) { + // Direct calls + insertToCGPathSet(CGNode, Callee, TopCGPath, CGPathStack, + CGNodeToCallInst, CGPaths); + } else { + // Indirect calls + TopCGPath.remove(CGNode); + auto *CI = CGNodeToCallInst[CGNode]; + if (auto *MD = CI->getMetadata(LLVMContext::MD_callees)) { + auto CloneOfTopCGPath = TopCGPath; + for (auto &Op : MD->operands()) + insertToCGPathSet(CG[mdconst::extract_or_null(Op)], Callee, + CloneOfTopCGPath, CGPathStack, CGNodeToCallInst, + CGPaths, true); + } + } + } +} + +static void createCloneOfCalleesWithNewParams( + Module &M, ValueMap> &KernelToCallee, + ValueMap &OldCalleeToNewCallee) { + // Construct clones of all callees which accept two additional new parameters: + // One is LDS layout base pointer which is of type `char*`, and other one is + // kernel number which is of type `Int64`. + // + // Collect all call graph paths between kernels and callees. + SmallVector, 8> CGPaths; + for (auto KI = KernelToCallee.begin(), KE = KernelToCallee.end(); KI != KE; + ++KI) + for (auto *Callee : KI->second) + collectCallGraphPathsBetweenKernelAndCallee(M, KI->first, Callee, + CGPaths); + + // Traverse all call graph paths from kernels to callees, and create clone of + // all callees along the paths. + traverseCGPathsAndCloneCallees(M, OldCalleeToNewCallee, CGPaths); +} + +static void +processCallees(Module &M, ValueMap &LDSToFunction, + ValueMap> &KernelToCallee, + std::map &KernelToID, + std::map &KernelToBasePtrInst) { + // Create clones of callees which accept new parameters. + ValueMap OldCalleeToNewCallee; + createCloneOfCalleesWithNewParams(M, KernelToCallee, OldCalleeToNewCallee); + + // Update all the required data structures to point to new cloned functions in + // place of their old counterparts. + updateLDSToFunctionMap(LDSToFunction, OldCalleeToNewCallee); + updateKernelToCalleeList(KernelToCallee, OldCalleeToNewCallee); + + // Appropriately update all users of all old callees to refer to corresponding + // new callees. + updateCalleeUsers(M, KernelToID, KernelToBasePtrInst, OldCalleeToNewCallee); + + // By now, all old functions are dead without any reference being made to + // them, erase them now from the module. + eraseOldCallees(OldCalleeToNewCallee); + +#ifndef NDEBUG + assert(OldCalleeToNewCallee.empty() && + "All the old callees should have dead by now\n"); +#endif +} + +static Instruction * +insertBasePointerAccessInstructionWithinKernel(Module &M, Function *K, + GlobalVariable *NewLDS) { + // Suffix the names of the instructions with unique integer values. + static int Suffix = 0; + ++Suffix; + + // Get the first insert point of the entry basic block of current kernel. + auto BI = K->getEntryBlock().getFirstInsertionPt(); +#ifndef NDEBUG + assert(BI != K->getEntryBlock().end() && + "Entry basic block of the kernel cannot be empty, otherwise control " + "would not reach this point\n"); +#endif + auto &EI = *BI; + + // Insert GEP instruction which access the address `NewLDS + 0`, say, the + // result is `GEPInst` which is of type `char*`. 
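+  // In the lit test added with this patch this comes out as, e.g.:
+  //   %dssv.gep.1 = getelementptr inbounds [1024 x i8],
+  //       [1024 x i8] addrspace(3)* @<kernel>.Unified.Device.Scope.LDS.Layout,
+  //       i32 0, i32 0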
+ Value *Indices[] = {Constant::getNullValue(Type::getInt32Ty(M.getContext())), + Constant::getIntegerValue( + Type::getInt64Ty(M.getContext()), APInt(32, 0))}; + + Instruction *GEPInst = GetElementPtrInst::CreateInBounds( + NewLDS->getValueType(), const_cast(NewLDS), Indices, + Twine("dssv.gep.") + Twine(Suffix), const_cast(&EI)); + + return GEPInst; +} + +static GlobalVariable *constructLDSOffsetTable( + Module &M, SetVector &Kernels, + SetVector &LDSGlobals, + std::map> + &KernelToLDSOffset, + std::map &KernelToID, + std::map &LDSToID, + std::map &IDToKernel, + std::map &IDToLDS) { + // Get type of LDS offset table. + auto *EleTy = Type::getInt64Ty(M.getContext()); + auto *Arr1DTy = ArrayType::get(EleTy, LDSGlobals.size()); + auto *Arr2DTy = ArrayType::get(Arr1DTy, Kernels.size()); + + // Create offset initialization list. + SmallVector Init2DValues; + for (unsigned K = 0; K < Kernels.size(); ++K) { + auto *Kernel = IDToKernel[K]; +#ifndef NDEBUG + assert(KernelToLDSOffset.find(Kernel) != KernelToLDSOffset.end() && + "Expected LDS offset list\n"); +#endif + auto &LDSToOffset = KernelToLDSOffset[Kernel]; + SmallVector Init1DValues; + for (unsigned L = 0; L < LDSGlobals.size(); ++L) { + auto *LDS = IDToLDS[L]; + auto Offset = + LDSToOffset.find(LDS) != LDSToOffset.end() ? LDSToOffset[LDS] : -1; + auto *C = Constant::getIntegerValue(EleTy, APInt(64, Offset)); + Init1DValues.push_back(C); + } + auto *Const1D = ConstantArray::get(Arr1DTy, Init1DValues); + Init2DValues.push_back(Const1D); + } + auto *Const2D = ConstantArray::get(Arr2DTy, Init2DValues); + + // Create LDS offset table with offset initialization. + auto *LDSOffsetTable = new GlobalVariable( + M, Arr2DTy, false, GlobalValue::InternalLinkage, Const2D, + Twine("__LDSGlobalsOffsetTable__"), nullptr, + GlobalVariable::NotThreadLocal, AMDGPUAS::CONSTANT_ADDRESS); + + // Set proper alignment. + LDSOffsetTable->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + LDSOffsetTable->setAlignment( + MaybeAlign(M.getDataLayout().getPreferredAlign(LDSOffsetTable))); + + return LDSOffsetTable; +} + +static GlobalVariable * +createSingleContiguousLayout(Module &M, Function *K, + uint64_t &TotalLDSSizeInBytes) { + // Create a new LDS global which is nothing but a single contigeous shared + // memory layout representing all the LDS globals which are defined within + // callees, and the size of this new contigeous LDS global layout is equal to + // the sum of the sizes of all these associated LDS globals. + auto *NewLDSTy = + ArrayType::get(IntegerType::get(M.getContext(), 8), TotalLDSSizeInBytes); + auto *NewLDS = new GlobalVariable( + M, NewLDSTy, false, GlobalValue::InternalLinkage, + UndefValue::get(NewLDSTy), + Twine(K->getName()) + Twine(".Unified.Device.Scope.LDS.Layout"), nullptr, + GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS); + NewLDS->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + NewLDS->setAlignment(MaybeAlign(M.getDataLayout().getPreferredAlign(NewLDS))); + + return NewLDS; +} + +static void +computeTotalLDSSizeInBytes(ValueMap &LDSToSize, + SetVector &KernelLDSList, + std::map &LDSToOffset, + uint64_t &TotalLDSSizeInBytes) { + // For the current kernel, compute the total size of all LDS globals, and also + // offsets associated with them within in the new LDS global. 
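+  // Offsets are assigned by packing the variables back to back in the order
+  // they appear in `KernelLDSList`; no padding is inserted beyond their store
+  // sizes, so the total layout size is simply the sum of the individual sizes.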
+ TotalLDSSizeInBytes = 0; + for (auto *LDS : KernelLDSList) { + LDSToOffset[LDS] = TotalLDSSizeInBytes; + TotalLDSSizeInBytes += LDSToSize[LDS]; + } +} + +static void constructKernelSpecificLDSLayouts( + Module &M, SetVector &Kernels, + SetVector &LDSGlobals, + ValueMap> &KernelToLDS, + ValueMap &LDSToSize, + std::map &KernelToID, + std::map> + &KernelToLDSOffset, + ValueMap &KernelToNewLDSLayout) { + // Traverse through each kernel, and construct corresponding new LDS layout. + for (auto *K : Kernels) { + // Get LDS list for current kernel. + SetVector &KernelLDSList = KernelToLDS[K]; +#ifndef NDEBUG + assert(!KernelLDSList.empty() && "Expected non-empty LDS set\n"); +#endif + + // Compute total aggregate size of all LDS globals associated with current + // kernel along with their offsets within new LDS layout. + uint64_t TotalLDSSizeInBytes; + std::map LDSToOffset; + computeTotalLDSSizeInBytes(LDSToSize, KernelLDSList, LDSToOffset, + TotalLDSSizeInBytes); + + // Create a single contigeous LDS layout for current kernel. + auto *NewLDS = createSingleContiguousLayout(M, K, TotalLDSSizeInBytes); + + // Save offsets and new LDS layout. + KernelToLDSOffset[K] = LDSToOffset; + KernelToNewLDSLayout[K] = NewLDS; + } +} + +static void MapKernelsAndLDSGlobalsToUniqueId( + SetVector &Kernels, SetVector &LDSGlobals, + std::map &KernelToID, + std::map &LDSToID, + std::map &IDToKernel, + std::map &IDToLDS) { + // Map each kernel to unique integer and vice versa. + uint64_t KNum = 0; + for (auto *K : Kernels) { + KernelToID[K] = KNum; + IDToKernel[KNum] = K; + ++KNum; + } + + // Map each LDS to unique integer and vice versa. + uint64_t LNum = 0; + for (auto *LDSGlobal : LDSGlobals) { + LDSToID[LDSGlobal] = LNum; + IDToLDS[LNum] = LDSGlobal; + ++LNum; + } +} + +static void processDeviceScopeSharedVariables( + Module &M, SetVector &Kernels, + SetVector &LDSGlobals, + ValueMap &LDSToFunction, + ValueMap> &KernelToCallee, + ValueMap> &KernelToLDS, + ValueMap &LDSToSize) { + // Map each kernel and LDS global to unique integer and vice versa. + std::map KernelToID; + std::map LDSToID; + std::map IDToKernel; + std::map IDToLDS; + MapKernelsAndLDSGlobalsToUniqueId(Kernels, LDSGlobals, KernelToID, LDSToID, + IDToKernel, IDToLDS); + + // Create a single contigeous LDS global layout for each kernel. + std::map> KernelToLDSOffset; + ValueMap KernelToNewLDSLayout; + constructKernelSpecificLDSLayouts(M, Kernels, LDSGlobals, KernelToLDS, + LDSToSize, KernelToID, KernelToLDSOffset, + KernelToNewLDSLayout); + + // Create 2D LDS offset table which will be referred at runtime to access the + // kernel specific LDS layout offset value which correspond to original device + // scope LDS. + auto *LDSOffsetTable = + constructLDSOffsetTable(M, Kernels, LDSGlobals, KernelToLDSOffset, + KernelToID, LDSToID, IDToKernel, IDToLDS); + + // Insert GEP instruction - `LDS_LAYOUT_START_ADDR + 0` within each kernel + // which will be of type `char*`. + std::map KernelToBasePtrInst; + for (auto *K : Kernels) + KernelToBasePtrInst[K] = insertBasePointerAccessInstructionWithinKernel( + M, K, KernelToNewLDSLayout[K]); + + // Create clones of callees to accept new parameters and accordingly update + // corresponding call sites. + processCallees(M, LDSToFunction, KernelToCallee, KernelToID, + KernelToBasePtrInst); + + // Update end-callees where the LDS globals was originally defined so that all + // the references to original LDS globals within end-callees are appropriately + // replaced. 
And finally erase original LDS globals from the module. + for (auto *LDS : LDSGlobals) { + updateFunctionAssociatedWithLDS(M, LDS, LDSToFunction, LDSToID, + LDSOffsetTable); + LDS->eraseFromParent(); + } +} + +static void +getLDSGlobalSizeInBytes(Module &M, GlobalVariable *LDSGlobal, + ValueMap &LDSToSize) { + LDSToSize[LDSGlobal] = getTypeStoreSizeInBytes(M, LDSGlobal->getValueType()); +} + +static void +filterKernels(SetVector &Kernels, + ValueMap> &KernelToLDS) { + // Collect all the kernels which can be removed. + SetVector ToBeRemovedKernels; + for (auto *K : Kernels) + if (KernelToLDS.find(K) == KernelToLDS.end()) + ToBeRemovedKernels.insert(K); + + // Remove all those kernels which do not have any device scope variables + // associated with them. + for (auto *K : ToBeRemovedKernels) + Kernels.remove(K); +} + +static void pairUpKernelWithLDSList( + Function *K, ValueMap> &KernelToCallee, + ValueMap> &FunctionToLDS, + ValueMap> &KernelToLDS) { + // Collect all the LDS globals defined within the end-callees associated with + // the current kernel. + SetVector LDSSet; + auto Callees = KernelToCallee[K]; + for (auto *Callee : Callees) { + if (FunctionToLDS.find(Callee) == FunctionToLDS.end()) + continue; + SetVector CalleeLDSSet = FunctionToLDS[Callee]; + for (auto *CalleeLDS : CalleeLDSSet) + LDSSet.insert(CalleeLDS); + } + if (!LDSSet.empty()) + KernelToLDS[K] = LDSSet; +} + +static void filterDeviceScopeLDSGlobals( + SetVector &LDSGlobals, + ValueMap &LDSToFunction) { + // Filter out all LDS globals which are defined within kernels since we don`t + // need to handle them. + SetVector ToBeRemovedLDSList; + for (auto *LDS : LDSGlobals) + if (LDSToFunction.find(LDS) == LDSToFunction.end()) + ToBeRemovedLDSList.insert(LDS); + for (auto *LDS : ToBeRemovedLDSList) + LDSGlobals.remove(LDS); +} + +static void filterDeviceFunctions( + SetVector &LDSGlobals, + ValueMap &LDSToFunction, + ValueMap> &FunctionToLDS, + ValueMap> &KernelToCallee) { + // There might exist device functions with LDS globals defined within them, + // but without a call graph path from any of the kernels. Filter out such + // device functions and associated LDS globals. + // + // Collect all actives end-callees. + std::set ActiveEndCallees; + for (auto KI = KernelToCallee.begin(), KE = KernelToCallee.end(); KI != KE; + ++KI) { + auto &CalleeSet = KI->second; + for (auto *Callee : CalleeSet) + ActiveEndCallees.insert(Callee); + } + + // Filter `FunctionToLDS` data structure. + std::set ToBeRemovedFunctions; + for (auto FI = FunctionToLDS.begin(), FE = FunctionToLDS.end(); FI != FE; + ++FI) + if (ActiveEndCallees.find(FI->first) == ActiveEndCallees.end()) + ToBeRemovedFunctions.insert(FI->first); + for (auto *F : ToBeRemovedFunctions) + FunctionToLDS.erase(F); + + // Filter `LDSToFunction` data structure. + std::set ToBeRemovedLDSGlobals; + for (auto LI = LDSToFunction.begin(), LE = LDSToFunction.end(); LI != LE; + ++LI) + if (FunctionToLDS.find(LI->second) == FunctionToLDS.end()) + ToBeRemovedLDSGlobals.insert(LI->first); + for (auto *LDS : ToBeRemovedLDSGlobals) + LDSToFunction.erase(LDS); + + // Filter `LDSGlobals` data structure. 
+ filterDeviceScopeLDSGlobals(LDSGlobals, LDSToFunction); +} + +static void +pushCallGraphNodes(CallGraphNode *CGNode, + SmallVectorImpl &CGNodeStack, + std::map &CGNodeToCallInst) { +#ifndef NDEBUG + assert(CGNode && "Call graph node associated with function definition cannot" + " be null\n"); +#endif + + for (auto GI = CGNode->begin(), GE = CGNode->end(); GI != GE; ++GI) { + auto *CI = dyn_cast(GI->first.getValue()); + auto *CGN = GI->second; +#ifndef NDEBUG + assert(CI && "Call instruction associated with call graph node cannot be" + " null\n"); + assert(CGN && "Call graph node associated with function definition cannot" + " be null\n"); +#endif + CGNodeStack.push_back(CGN); + CGNodeToCallInst[CGN] = CI; + } +} + +static void insertToCalleeSet( + CallGraphNode *CGNode, + ValueMap> &FunctionToLDS, + SmallVectorImpl &CGNodeStack, + std::map &CGNodeToCallInst, + std::set &CalleeSet) { + auto *Callee = CGNode->getFunction(); + if (Callee && !Callee->isDeclaration()) { + if (FunctionToLDS.find(Callee) != FunctionToLDS.end()) + CalleeSet.insert(Callee); + pushCallGraphNodes(CGNode, CGNodeStack, CGNodeToCallInst); + } +} + +static void pairUpKernelWithCalleeList( + Module &M, Function *K, + ValueMap> &FunctionToLDS, + ValueMap> &KernelToCallee) { + // Get the call graph node associated with current kernel, traverse the call + // graph associated with it in DFS manner and collect all the associated + // callees which define LDS global(s). + auto CG = CallGraph(M); + auto *KernCGNode = CG[K]; + SmallVector CGNodeStack; + SetVector Visited; + std::map CGNodeToCallInst; + std::set CalleeSet; + + pushCallGraphNodes(KernCGNode, CGNodeStack, CGNodeToCallInst); + + while (!CGNodeStack.empty()) { + auto *CGNode = CGNodeStack.pop_back_val(); + if (!Visited.insert(CGNode)) + continue; + + if (CGNode->getFunction()) { + // Direct calls + insertToCalleeSet(CGNode, FunctionToLDS, CGNodeStack, CGNodeToCallInst, + CalleeSet); + } else { + // Indirect calls + auto *CI = CGNodeToCallInst[CGNode]; + if (auto *MD = CI->getMetadata(LLVMContext::MD_callees)) { + for (auto &Op : MD->operands()) + insertToCalleeSet(CG[mdconst::extract_or_null(Op)], + FunctionToLDS, CGNodeStack, CGNodeToCallInst, + CalleeSet); + } + + // Indirect call does not bind to any particular function, and the + // `CallGraphNode` is same for all the function pointers which have same + // signature. + Visited.remove(CGNode); + } + } + + KernelToCallee[K] = CalleeSet; +} + +static void createFunctionToLDSMap( + ValueMap &LDSToFunction, + ValueMap> &FunctionToLDS) { + for (auto LI = LDSToFunction.begin(), LE = LDSToFunction.end(); LI != LE; + ++LI) { + auto *LDSGlobal = LI->first; + auto *F = LI->second; + auto FI = FunctionToLDS.find(F); + if (FI == FunctionToLDS.end()) { + SetVector LDSSet; + LDSSet.insert(LDSGlobal); + FunctionToLDS[F] = LDSSet; + } else + FunctionToLDS[F].insert(LDSGlobal); + } +} + +static void pairUpLDSGlobalWithItsAssociatedFunction( + GlobalVariable *LDSGlobal, + ValueMap &LDSToFunction) { + // Recursively visit user list of current LDS global and find the function + // within which the `LDSGlobal` is defined, and this function should always be + // successfully found. 
+#ifndef NDEBUG + assert(!LDSGlobal->user_empty() && + "LDS Global user list cannot be empty since it must have been defined " + "within either kernel or device function"); +#endif + SmallVector UserStack; + SetVector Visited; + + for (auto *U : LDSGlobal->users()) + UserStack.push_back(U); + + while (!UserStack.empty()) { + auto *U = UserStack.pop_back_val(); + if (!Visited.insert(U)) + continue; + + if (auto *I = dyn_cast(U)) { + auto *F = I->getParent()->getParent(); + if (!F) + continue; + + // We are only interested in device scope shared variables. + if (F->getCallingConv() != CallingConv::AMDGPU_KERNEL) + LDSToFunction[LDSGlobal] = F; + + return; + } + + for (auto *UU : U->users()) + UserStack.push_back(UU); + } + + llvm_unreachable("Control is not expected to reach this point"); +} + +static bool +handleDeviceScopeSharedVariables(Module &M, + SetVector &LDSGlobals, + SetVector &Kernels) { + // Pair up each LDS global with the function within which the LDS global is + // defined. + ValueMap LDSToFunction; + for (auto *LDSGlobal : LDSGlobals) + pairUpLDSGlobalWithItsAssociatedFunction(LDSGlobal, LDSToFunction); + + // Filter out all LDS globals which are defined within kernels since we don`t + // need to handle them. + filterDeviceScopeLDSGlobals(LDSGlobals, LDSToFunction); + + // Create reverse map from function to LDS globals. + ValueMap> FunctionToLDS; + createFunctionToLDSMap(LDSToFunction, FunctionToLDS); + + // Pair up kernels with end-callees which define LDS globals and there exist + // call graph paths from kernels to end-callees. + ValueMap> KernelToCallee; + for (auto *K : Kernels) + pairUpKernelWithCalleeList(M, K, FunctionToLDS, KernelToCallee); + + // There might exist device functions with LDS globals defined within them, + // but without a call graph path from any of the kernels. Filter out such + // device functions and associated LDS globals. + filterDeviceFunctions(LDSGlobals, LDSToFunction, FunctionToLDS, + KernelToCallee); + + // Pair up kernels with device scope LDS globals. + ValueMap> KernelToLDS; + for (auto *K : Kernels) + pairUpKernelWithLDSList(K, KernelToCallee, FunctionToLDS, KernelToLDS); + + // Consider only kernels which have device scope shared variables associated + // with them. + filterKernels(Kernels, KernelToLDS); + + // If we do not have any kernel which is associated with device scope shared + // variables, then there is nothing to do, and there is no any module level + // changes to be done, just return `false`. + if (Kernels.empty()) + return false; + + // Get the size of each LDS global in bytes. + ValueMap LDSToSize; + for (auto *LDSGlobal : LDSGlobals) + getLDSGlobalSizeInBytes(M, LDSGlobal, LDSToSize); + + // Perform all the necessary processing required. + processDeviceScopeSharedVariables(M, Kernels, LDSGlobals, LDSToFunction, + KernelToCallee, KernelToLDS, LDSToSize); + + // Module level changes are done, return `true`. + return true; +} + +static bool handleDeviceScopeSharedVariables(Module &M) { + // Collect all the (static) LDS globals defined within the current module. + SetVector LDSGlobals; + for (auto &GV : M.globals()) + if (GV.getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && + !GV.hasExternalLinkage()) + LDSGlobals.insert(&GV); + + if (LDSGlobals.empty()) { + LLVM_DEBUG(dbgs() << "No LDS globals defined in the module " << M.getName() + << ", skipping handling device of scope shared variables" + << "\n"); + return false; + } + + // Collect all the amdgpu kernels defined within the current module. 
+ SetVector Kernels; + for (auto &F : M.functions()) + if ((F.getCallingConv() == CallingConv::AMDGPU_KERNEL) && + !F.isDeclaration()) + Kernels.insert(&F); + + if (Kernels.empty()) { + LLVM_DEBUG(dbgs() << "No kernels defined in the module " << M.getName() + << ", skipping handling of device scope shared variables" + << "\n"); + return false; + } + + return handleDeviceScopeSharedVariables(M, LDSGlobals, Kernels); +} + +bool AMDGPUDeviceScopeSharedVariable::runOnModule(Module &M) { + LLVM_DEBUG(dbgs() << "===== Handling device scope shared variables in the " + "module " + << M.getName() << "\n"); + + // TODO: We only want to handle HIP kernels, and no kernels from from other + // programming languages, like OpenCL, OpenMP, etc. Do we need to add a + // condition here for it, and skip running the pass for non-HIP kernels? + if (skipModule(M)) { + LLVM_DEBUG(dbgs() << "Skipping handling of device scope shared variables " + "in the module " + << M.getName() << "\n"); + return false; + } + + bool Changed = handleDeviceScopeSharedVariables(M); + + LLVM_DEBUG(dbgs() << "===== Done with handling device scope shared variables " + "in the module " + << M.getName() << "\n"); + + return Changed; +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h @@ -40,6 +40,7 @@ static bool EnableLateStructurizeCFG; static bool EnableFunctionCalls; static bool EnableFixedFunctionABI; + static bool EnableDeviceScopeSharedVariable; AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, TargetOptions Options, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -198,6 +198,12 @@ cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true), cl::Hidden); +static cl::opt EnableDeviceScopeSharedVariable( + "amdgpu-enable-device-scope-shared-variable", + cl::desc("Support amdgpu device scope shared variables"), + cl::location(AMDGPUTargetMachine::EnableDeviceScopeSharedVariable), + cl::init(false), cl::Hidden); + extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { // Register the target RegisterTargetMachine X(getTheAMDGPUTarget()); @@ -265,6 +271,7 @@ initializeGCNRegBankReassignPass(*PR); initializeGCNNSAReassignPass(*PR); initializeSIAddIMGInitPass(*PR); + initializeAMDGPUDeviceScopeSharedVariablePass(*PR); } static std::unique_ptr createTLOF(const Triple &TT) { @@ -394,6 +401,7 @@ bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false; bool AMDGPUTargetMachine::EnableFunctionCalls = false; bool AMDGPUTargetMachine::EnableFixedFunctionABI = false; +bool AMDGPUTargetMachine::EnableDeviceScopeSharedVariable = false; AMDGPUTargetMachine::~AMDGPUTargetMachine() = default; @@ -718,6 +726,12 @@ disablePass(&FuncletLayoutID); disablePass(&PatchableFunctionID); + // We expect to run this pass as a first AMDGPU IR pass so that new + // instructions being added in this pass can possibly undergo further + // transformations via subsequent passes. 
+ if (EnableDeviceScopeSharedVariable) + addPass(createAMDGPUDeviceScopeSharedVariablePass()); + addPass(createAMDGPUPrintfRuntimeBinding()); // This must occur before inlining, as the inliner will not look through diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -50,6 +50,7 @@ AMDGPUAtomicOptimizer.cpp AMDGPUCallLowering.cpp AMDGPUCodeGenPrepare.cpp + AMDGPUDeviceScopeSharedVariable.cpp AMDGPUExportClustering.cpp AMDGPUFixFunctionBitcasts.cpp AMDGPUFrameLowering.cpp diff --git a/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-deep-function-calls.ll b/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-deep-function-calls.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-deep-function-calls.ll @@ -0,0 +1,166 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -implement-amdgpu-device-scope-shared-variable -S < %s | FileCheck -check-prefixes=NEW-LDS,OLD-LDS,OFFSET-TABLE,NEW-PARAM,GCN %s + +; OLD-LDS-NOT: @_ZZ22function_four_with_ldsPiS_E17smem_in_func_four +; OLD-LDS: @_ZZ19kernel_two_with_ldsPiS_E13smem_kern_two +; OLD-LDS: @_ZZ19kernel_one_with_ldsPiS_E13smem_kern_one +; NEW-LDS: @_Z19kernel_two_with_ldsPiS_.Unified.Device.Scope.LDS.Layout +; NEW-LDS: @_Z19kernel_one_with_ldsPiS_.Unified.Device.Scope.LDS.Layout +; OLD-LDS-NOT: @_ZZ22function_four_with_ldsPiS_E17smem_in_func_four +@_ZZ22function_four_with_ldsPiS_E17smem_in_func_four = internal unnamed_addr addrspace(3) global [256 x i32] undef, align 16 +@_ZZ19kernel_two_with_ldsPiS_E13smem_kern_two = internal unnamed_addr addrspace(3) global [256 x i32] undef, align 16 +@_ZZ19kernel_one_with_ldsPiS_E13smem_kern_one = internal unnamed_addr addrspace(3) global [256 x i32] undef, align 16 + +; OFFSET-TABLE: @__LDSGlobalsOffsetTable__ + +; NEW-PARAM: i8 addrspace(3)* %0 +; NEW-PARAM: i64 %1 +define internal fastcc void @_Z22function_four_with_ldsPiS_(i32* nocapture readonly %i_arg, i32* nocapture %o_arg) unnamed_addr { +; GCN-LABEL: entry: +; GCN: %dssv.gep1.1 = getelementptr inbounds [2 x [1 x i64]], [2 x [1 x i64]] addrspace(4)* @__LDSGlobalsOffsetTable__, i32 0, i64 %1, i64 0 +; GCN-NEXT: %dssv.load.1 = load i64, i64 addrspace(4)* %dssv.gep1.1, align 4 +; GCN-NEXT: %dssv.gep2.1 = getelementptr inbounds i8, i8 addrspace(3)* %0, i64 %dssv.load.1 +; GCN-NEXT: %dssv.cast.1 = bitcast i8 addrspace(3)* %dssv.gep2.1 to [256 x i32] addrspace(3)* +; GCN-NEXT: %2 = tail call i32 @llvm.amdgcn.workitem.id.x() +; GCN-NEXT: %idxprom = zext i32 %2 to i64 +; GCN-NEXT: %arrayidx = getelementptr inbounds i32, i32* %i_arg, i64 %idxprom +; GCN-NEXT: %3 = load i32, i32* %arrayidx, align 4 +; GCN-NEXT: %4 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* %dssv.cast.1, i32 0, i32 %2 +; GCN-NEXT: store i32 %3, i32 addrspace(3)* %4, align 4 +; GCN-NEXT: %add = add nuw nsw i32 %2, 1 +; GCN-NEXT: %5 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* %dssv.cast.1, i32 0, i32 %add +; GCN-NEXT: %6 = load i32, i32 addrspace(3)* %5, align 4 +; GCN-NEXT: %mul = mul nsw i32 %6, %3 +; GCN-NEXT: %arrayidx12 = getelementptr inbounds i32, i32* %o_arg, i64 %idxprom +; GCN-NEXT: store i32 %mul, i32* %arrayidx12, align 4 +; GCN-NEXT: ret void +entry: + %0 = tail call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = zext i32 %0 to i64 + %arrayidx = getelementptr inbounds i32, i32* %i_arg, i64 %idxprom + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx31 = getelementptr inbounds [256 x 
i32], [256 x i32] addrspace(3)* @_ZZ22function_four_with_ldsPiS_E17smem_in_func_four, i32 0, i32 %0 + store i32 %1, i32 addrspace(3)* %arrayidx31, align 4 + %add = add nuw nsw i32 %0, 1 + %arrayidx93 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ22function_four_with_ldsPiS_E17smem_in_func_four, i32 0, i32 %add + %2 = load i32, i32 addrspace(3)* %arrayidx93, align 4 + %mul = mul nsw i32 %2, %1 + %arrayidx12 = getelementptr inbounds i32, i32* %o_arg, i64 %idxprom + store i32 %mul, i32* %arrayidx12, align 4 + ret void +} + +; NEW-PARAM: i8 addrspace(3)* %0 +; NEW-PARAM: i64 %1 +define internal fastcc void @_Z26function_three_with_no_ldsPiS_(i32* nocapture readonly %i_arg, i32* nocapture %o_arg) unnamed_addr { +; GCN-LABEL: entry: +; GCN: tail call fastcc void @_Z22function_four_with_ldsPiS_(i32* %i_arg, i32* %o_arg, i8 addrspace(3)* %0, i64 %1) +; GCN-NEXT: ret void +entry: + tail call fastcc void @_Z22function_four_with_ldsPiS_(i32* %i_arg, i32* %o_arg) + ret void +} + +; NEW-PARAM: i8 addrspace(3)* %0 +; NEW-PARAM: i64 %1 +define internal fastcc void @_Z24function_two_with_no_ldsPiS_(i32* nocapture readonly %i_arg, i32* nocapture %o_arg) unnamed_addr { +; GCN-LABEL: entry: +; GCN: tail call fastcc void @_Z26function_three_with_no_ldsPiS_(i32* %i_arg, i32* %o_arg, i8 addrspace(3)* %0, i64 %1) +; GCN-NEXT: ret void +entry: + tail call fastcc void @_Z26function_three_with_no_ldsPiS_(i32* %i_arg, i32* %o_arg) + ret void +} + +; NEW-PARAM: i8 addrspace(3)* %0 +; NEW-PARAM: i64 %1 +define internal fastcc void @_Z24function_one_with_no_ldsPiS_(i32* nocapture readonly %i_arg, i32* nocapture %o_arg) unnamed_addr { +; GCN-LABEL: entry: +; GCN: tail call fastcc void @_Z26function_three_with_no_ldsPiS_(i32* %i_arg, i32* %o_arg, i8 addrspace(3)* %0, i64 %1) +; GCN-NEXT: ret void +entry: + tail call fastcc void @_Z26function_three_with_no_ldsPiS_(i32* %i_arg, i32* %o_arg) + ret void +} + +define protected amdgpu_kernel void @_Z19kernel_two_with_ldsPiS_(i32 addrspace(1)* nocapture readonly %i_arg.coerce, i32 addrspace(1)* nocapture %o_arg.coerce) local_unnamed_addr { +; GCN-LABEL: entry: +; GCN: %dssv.gep.{{[1-2]}} = getelementptr inbounds [1024 x i8], [1024 x i8] addrspace(3)* @_Z19kernel_two_with_ldsPiS_.Unified.Device.Scope.LDS.Layout, i32 0, i32 0 +; GCN-NEXT: %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32* +; GCN-NEXT: %1 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32* +; GCN-NEXT: %2 = tail call i32 @llvm.amdgcn.workitem.id.x() +; GCN-NEXT: %idxprom = zext i32 %2 to i64 +; GCN-NEXT: %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %i_arg.coerce, i64 %idxprom +; GCN-NEXT: %3 = load i32, i32 addrspace(1)* %arrayidx6, align 4 +; GCN-NEXT: %arrayidx57 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ19kernel_two_with_ldsPiS_E13smem_kern_two, i32 0, i32 %2 +; GCN-NEXT: store i32 %3, i32 addrspace(3)* %arrayidx57, align 4 +; GCN-NEXT: %add = add nuw nsw i32 %2, 1 +; GCN-NEXT: %arrayidx119 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ19kernel_two_with_ldsPiS_E13smem_kern_two, i32 0, i32 %add +; GCN-NEXT: %4 = load i32, i32 addrspace(3)* %arrayidx119, align 4 +; GCN-NEXT: %mul = mul nsw i32 %4, %3 +; GCN-NEXT: %arrayidx1410 = getelementptr inbounds i32, i32 addrspace(1)* %o_arg.coerce, i64 %idxprom +; GCN-NEXT: store i32 %mul, i32 addrspace(1)* %arrayidx1410, align 4 +; GCN-NEXT: tail call fastcc void @_Z24function_one_with_no_ldsPiS_(i32* %0, i32* %1, i8 addrspace(3)* %dssv.gep.{{[1-2]}}, i64 {{[0-1]}}) +; 
GCN-NEXT: tail call fastcc void @_Z24function_two_with_no_ldsPiS_(i32* %0, i32* %1, i8 addrspace(3)* %dssv.gep.{{[1-2]}}, i64 {{[0-1]}}) +; GCN-NEXT: ret void +entry: + %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32* + %1 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32* + %2 = tail call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = zext i32 %2 to i64 + %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %i_arg.coerce, i64 %idxprom + %3 = load i32, i32 addrspace(1)* %arrayidx6, align 4 + %arrayidx57 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ19kernel_two_with_ldsPiS_E13smem_kern_two, i32 0, i32 %2 + store i32 %3, i32 addrspace(3)* %arrayidx57, align 4 + %add = add nuw nsw i32 %2, 1 + %arrayidx119 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ19kernel_two_with_ldsPiS_E13smem_kern_two, i32 0, i32 %add + %4 = load i32, i32 addrspace(3)* %arrayidx119, align 4 + %mul = mul nsw i32 %4, %3 + %arrayidx1410 = getelementptr inbounds i32, i32 addrspace(1)* %o_arg.coerce, i64 %idxprom + store i32 %mul, i32 addrspace(1)* %arrayidx1410, align 4 + tail call fastcc void @_Z24function_one_with_no_ldsPiS_(i32* %0, i32* %1) + tail call fastcc void @_Z24function_two_with_no_ldsPiS_(i32* %0, i32* %1) + ret void +} + +define protected amdgpu_kernel void @_Z19kernel_one_with_ldsPiS_(i32 addrspace(1)* nocapture readonly %i_arg.coerce, i32 addrspace(1)* nocapture %o_arg.coerce) local_unnamed_addr { +; GCN-LABEL: entry: +; GCN: %dssv.gep.{{[1-2]}} = getelementptr inbounds [1024 x i8], [1024 x i8] addrspace(3)* @_Z19kernel_one_with_ldsPiS_.Unified.Device.Scope.LDS.Layout, i32 0, i32 0 +; GCN-NEXT: %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32* +; GCN-NEXT: %1 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32* +; GCN-NEXT: %2 = tail call i32 @llvm.amdgcn.workitem.id.x() +; GCN-NEXT: %idxprom = zext i32 %2 to i64 +; GCN-NEXT: %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %i_arg.coerce, i64 %idxprom +; GCN-NEXT: %3 = load i32, i32 addrspace(1)* %arrayidx6, align 4 +; GCN-NEXT: %arrayidx57 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ19kernel_one_with_ldsPiS_E13smem_kern_one, i32 0, i32 %2 +; GCN-NEXT: store i32 %3, i32 addrspace(3)* %arrayidx57, align 4 +; GCN-NEXT: %add = add nuw nsw i32 %2, 1 +; GCN-NEXT: %arrayidx119 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ19kernel_one_with_ldsPiS_E13smem_kern_one, i32 0, i32 %add +; GCN-NEXT: %4 = load i32, i32 addrspace(3)* %arrayidx119, align 4 +; GCN-NEXT: %mul = mul nsw i32 %4, %3 +; GCN-NEXT: %arrayidx1410 = getelementptr inbounds i32, i32 addrspace(1)* %o_arg.coerce, i64 %idxprom +; GCN-NEXT: store i32 %mul, i32 addrspace(1)* %arrayidx1410, align 4 +; GCN-NEXT: tail call fastcc void @_Z24function_one_with_no_ldsPiS_(i32* %0, i32* %1, i8 addrspace(3)* %dssv.gep.{{[1-2]}}, i64 {{[0-1]}}) +; GCN-NEXT: tail call fastcc void @_Z24function_two_with_no_ldsPiS_(i32* %0, i32* %1, i8 addrspace(3)* %dssv.gep.{{[1-2]}}, i64 {{[0-1]}}) +; GCN-NEXT: ret void +entry: + %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32* + %1 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32* + %2 = tail call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = zext i32 %2 to i64 + %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %i_arg.coerce, i64 %idxprom + %3 = load i32, i32 addrspace(1)* %arrayidx6, align 4 + %arrayidx57 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ19kernel_one_with_ldsPiS_E13smem_kern_one, i32 0, i32 
%2 + store i32 %3, i32 addrspace(3)* %arrayidx57, align 4 + %add = add nuw nsw i32 %2, 1 + %arrayidx119 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ19kernel_one_with_ldsPiS_E13smem_kern_one, i32 0, i32 %add + %4 = load i32, i32 addrspace(3)* %arrayidx119, align 4 + %mul = mul nsw i32 %4, %3 + %arrayidx1410 = getelementptr inbounds i32, i32 addrspace(1)* %o_arg.coerce, i64 %idxprom + store i32 %mul, i32 addrspace(1)* %arrayidx1410, align 4 + tail call fastcc void @_Z24function_one_with_no_ldsPiS_(i32* %0, i32* %1) + tail call fastcc void @_Z24function_two_with_no_ldsPiS_(i32* %0, i32* %1) + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() + diff --git a/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-indirect-call.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-indirect-call.ll @@ -0,0 +1,154 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -implement-amdgpu-device-scope-shared-variable -S < %s | FileCheck -check-prefixes=NEW-LDS,OLD-LDS,OFFSET-TABLE,NEW-PARAM,GCN,METADATA %s + +; OLD-LDS-NOT: @_ZZ21function_six_with_ldsPiS_E16smem_in_func_six +; NEW-LDS: @_Z19kernel_two_with_ldsPiS_.Unified.Device.Scope.LDS.Layout +; NEW-LDS: @_Z19kernel_one_with_ldsPiS_.Unified.Device.Scope.LDS.Layout +; OLD-LDS-NOT: @_ZZ21function_six_with_ldsPiS_E16smem_in_func_six +@_ZZ21function_six_with_ldsPiS_E16smem_in_func_six = internal unnamed_addr addrspace(3) global [256 x i32] undef, align 16 + +; OFFSET-TABLE: @__LDSGlobalsOffsetTable__ + +; NEW-PARAM: i8 addrspace(3)* %0 +; NEW-PARAM: i64 %1 +define internal fastcc void @_Z21function_six_with_ldsPiS_(i32* nocapture readonly %i_arg, i32* nocapture %o_arg) unnamed_addr { +; GCN-LABEL: entry: +; GCN: %dssv.gep1.1 = getelementptr inbounds [2 x [1 x i64]], [2 x [1 x i64]] addrspace(4)* @__LDSGlobalsOffsetTable__, i32 0, i64 %1, i64 0 +; GCN-NEXT: %dssv.load.1 = load i64, i64 addrspace(4)* %dssv.gep1.1, align 4 +; GCN-NEXT: %dssv.gep2.1 = getelementptr inbounds i8, i8 addrspace(3)* %0, i64 %dssv.load.1 +; GCN-NEXT: %dssv.cast.1 = bitcast i8 addrspace(3)* %dssv.gep2.1 to [256 x i32] addrspace(3)* +; GCN-NEXT: %2 = tail call i32 @llvm.amdgcn.workitem.id.x() +; GCN-NEXT: %idxprom = zext i32 %2 to i64 +; GCN-NEXT: %arrayidx = getelementptr inbounds i32, i32* %i_arg, i64 %idxprom +; GCN-NEXT: %3 = load i32, i32* %arrayidx, align 4 +; GCN-NEXT: %4 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* %dssv.cast.1, i32 0, i32 %2 +; GCN-NEXT: store i32 %3, i32 addrspace(3)* %4, align 4 +; GCN-NEXT: %add = add nuw nsw i32 %2, 1 +; GCN-NEXT: %5 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* %dssv.cast.1, i32 0, i32 %add +; GCN-NEXT: %6 = load i32, i32 addrspace(3)* %5, align 4 +; GCN-NEXT: %mul = mul nsw i32 %6, %3 +; GCN-NEXT: %arrayidx12 = getelementptr inbounds i32, i32* %o_arg, i64 %idxprom +; GCN-NEXT: store i32 %mul, i32* %arrayidx12, align 4 +; GCN-NEXT: ret void +entry: + %0 = tail call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = zext i32 %0 to i64 + %arrayidx = getelementptr inbounds i32, i32* %i_arg, i64 %idxprom + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx31 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ21function_six_with_ldsPiS_E16smem_in_func_six, i32 0, i32 %0 + store i32 %1, i32 addrspace(3)* %arrayidx31, align 4 + %add = add nuw nsw i32 %0, 1 + %arrayidx93 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* 
@_ZZ21function_six_with_ldsPiS_E16smem_in_func_six, i32 0, i32 %add + %2 = load i32, i32 addrspace(3)* %arrayidx93, align 4 + %mul = mul nsw i32 %2, %1 + %arrayidx12 = getelementptr inbounds i32, i32* %o_arg, i64 %idxprom + store i32 %mul, i32* %arrayidx12, align 4 + ret void +} + +; NEW-PARAM: i8 addrspace(3)* %0 +; NEW-PARAM: i64 %1 +define internal void @_Z25function_five_with_no_ldsPiS_(i32* nocapture readonly %i_arg, i32* nocapture %o_arg) unnamed_addr { +; GCN-LABEL: entry: +; GCN: tail call fastcc void @_Z21function_six_with_ldsPiS_(i32* %i_arg, i32* %o_arg, i8 addrspace(3)* %0, i64 %1) +; GCN-NEXT: ret void +entry: + tail call fastcc void @_Z21function_six_with_ldsPiS_(i32* %i_arg, i32* %o_arg) + ret void +} + +; NEW-PARAM: i8 addrspace(3)* %0 +; NEW-PARAM: i64 %1 +define internal void @_Z25function_four_with_no_ldsPiS_(i32* nocapture readonly %i_arg, i32* nocapture %o_arg) unnamed_addr { +; GCN-LABEL: entry: +; GCN: tail call fastcc void @_Z21function_six_with_ldsPiS_(i32* %i_arg, i32* %o_arg, i8 addrspace(3)* %0, i64 %1) +; GCN-NEXT: ret void +entry: + tail call fastcc void @_Z21function_six_with_ldsPiS_(i32* %i_arg, i32* %o_arg) + ret void +} + +; NEW-PARAM: i8 addrspace(3)* %0 +; NEW-PARAM: i64 %1 +define internal fastcc void @_Z26function_three_with_no_ldsPiS_(i32* %i_arg, i32* %o_arg) unnamed_addr { +; GCN-LABEL: entry: +; GCN: %2 = load i32, i32* %i_arg, align 4 +; GCN-NEXT: %cmp = icmp eq i32 %2, 0 +; GCN-NEXT: %3 = select i1 %cmp, void (i32*, i32*, i8 addrspace(3)*, i64)* @_Z25function_four_with_no_ldsPiS_, void (i32*, i32*, i8 addrspace(3)*, i64)* @_Z25function_five_with_no_ldsPiS_ +; GCN-NEXT: tail call void %3(i32* %i_arg, i32* %o_arg, i8 addrspace(3)* %0, i64 %1), !callees !0 +; GCN-NEXT: ret void +entry: + %0 = load i32, i32* %i_arg, align 4 + %cmp = icmp eq i32 %0, 0 + %func_three_fptr.0 = select i1 %cmp, void (i32*, i32*)* @_Z25function_four_with_no_ldsPiS_, void (i32*, i32*)* @_Z25function_five_with_no_ldsPiS_ + tail call void %func_three_fptr.0(i32* nonnull %i_arg, i32* %o_arg), !callees !10 + ret void +} + +; NEW-PARAM: i8 addrspace(3)* %0 +; NEW-PARAM: i64 %1 +define internal void @_Z24function_two_with_no_ldsPiS_(i32* %i_arg, i32* %o_arg) unnamed_addr { +; GCN-LABEL: entry: +; GCN: tail call fastcc void @_Z26function_three_with_no_ldsPiS_(i32* %i_arg, i32* %o_arg, i8 addrspace(3)* %0, i64 %1) +; GCN-NEXT: ret void +entry: + tail call fastcc void @_Z26function_three_with_no_ldsPiS_(i32* %i_arg, i32* %o_arg) + ret void +} + +; NEW-PARAM: i8 addrspace(3)* %0 +; NEW-PARAM: i64 %1 +define internal void @_Z24function_one_with_no_ldsPiS_(i32* %i_arg, i32* %o_arg) unnamed_addr { +; GCN-LABEL: entry: +; GCN: tail call fastcc void @_Z26function_three_with_no_ldsPiS_(i32* %i_arg, i32* %o_arg, i8 addrspace(3)* %0, i64 %1) +; GCN-NEXT: ret void +entry: + tail call fastcc void @_Z26function_three_with_no_ldsPiS_(i32* %i_arg, i32* %o_arg) + ret void +} + +define protected amdgpu_kernel void @_Z19kernel_two_with_ldsPiS_(i32 addrspace(1)* %i_arg.coerce, i32 addrspace(1)* %o_arg.coerce) local_unnamed_addr { +; GCN-LABEL: entry: +; GCN: %dssv.gep.{{[1-2]}} = getelementptr inbounds [1024 x i8], [1024 x i8] addrspace(3)* @_Z19kernel_two_with_ldsPiS_.Unified.Device.Scope.LDS.Layout, i32 0, i32 0 +; GCN-NEXT: %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32* +; GCN-NEXT: %1 = load i32, i32 addrspace(1)* %i_arg.coerce, align 4 +; GCN-NEXT: %cmp = icmp eq i32 %1, 0 +; GCN-NEXT: %2 = select i1 %cmp, void (i32*, i32*, i8 addrspace(3)*, i64)* @_Z24function_one_with_no_ldsPiS_, 
void (i32*, i32*, i8 addrspace(3)*, i64)* @_Z24function_two_with_no_ldsPiS_ +; GCN-NEXT: %3 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32* +; GCN-NEXT: tail call void %2(i32* %0, i32* %3, i8 addrspace(3)* %dssv.gep.{{[1-2]}}, i64 {{[0-1]}}), !callees !1 +; GCN-NEXT: ret void +entry: + %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32* + %1 = load i32, i32 addrspace(1)* %i_arg.coerce, align 4 + %cmp = icmp eq i32 %1, 0 + %kern_two_fptr.0 = select i1 %cmp, void (i32*, i32*)* @_Z24function_one_with_no_ldsPiS_, void (i32*, i32*)* @_Z24function_two_with_no_ldsPiS_ + %2 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32* + tail call void %kern_two_fptr.0(i32* nonnull %0, i32* %2), !callees !11 + ret void +} + +define protected amdgpu_kernel void @_Z19kernel_one_with_ldsPiS_(i32 addrspace(1)* %i_arg.coerce, i32 addrspace(1)* %o_arg.coerce) local_unnamed_addr { +; GCN-LABEL: entry: +; GCN: %dssv.gep.{{[1-2]}} = getelementptr inbounds [1024 x i8], [1024 x i8] addrspace(3)* @_Z19kernel_one_with_ldsPiS_.Unified.Device.Scope.LDS.Layout, i32 0, i32 0 +; GCN-NEXT: %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32* +; GCN-NEXT: %1 = load i32, i32 addrspace(1)* %i_arg.coerce, align 4 +; GCN-NEXT: %cmp = icmp eq i32 %1, 0 +; GCN-NEXT: %2 = select i1 %cmp, void (i32*, i32*, i8 addrspace(3)*, i64)* @_Z24function_one_with_no_ldsPiS_, void (i32*, i32*, i8 addrspace(3)*, i64)* @_Z24function_two_with_no_ldsPiS_ +; GCN-NEXT: %3 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32* +; GCN-NEXT: tail call void %2(i32* %0, i32* %3, i8 addrspace(3)* %dssv.gep.{{[1-2]}}, i64 {{[0-1]}}), !callees !1 +; GCN-NEXT: ret void +entry: + %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32* + %1 = load i32, i32 addrspace(1)* %i_arg.coerce, align 4 + %cmp = icmp eq i32 %1, 0 + %kern_one_fptr.0 = select i1 %cmp, void (i32*, i32*)* @_Z24function_one_with_no_ldsPiS_, void (i32*, i32*)* @_Z24function_two_with_no_ldsPiS_ + %2 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32* + tail call void %kern_one_fptr.0(i32* nonnull %0, i32* %2), !callees !11 + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() + +;METADATA: !0 = !{void (i32*, i32*, i8 addrspace(3)*, i64)* @_Z25function_five_with_no_ldsPiS_, void (i32*, i32*, i8 addrspace(3)*, i64)* @_Z25function_four_with_no_ldsPiS_} +;METADATA: !1 = !{void (i32*, i32*, i8 addrspace(3)*, i64)* @_Z24function_one_with_no_ldsPiS_, void (i32*, i32*, i8 addrspace(3)*, i64)* @_Z24function_two_with_no_ldsPiS_} +!10 = !{void (i32*, i32*)* @_Z25function_five_with_no_ldsPiS_, void (i32*, i32*)* @_Z25function_four_with_no_ldsPiS_} +!11 = !{void (i32*, i32*)* @_Z24function_one_with_no_ldsPiS_, void (i32*, i32*)* @_Z24function_two_with_no_ldsPiS_} diff --git a/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-lds-2d-array.ll b/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-lds-2d-array.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-lds-2d-array.ll @@ -0,0 +1,61 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -implement-amdgpu-device-scope-shared-variable -S < %s | FileCheck -check-prefixes=NEW-LDS,OLD-LDS,OFFSET-TABLE,NEW-PARAM,GCN %s + +; OLD-LDS-NOT: @_ZZ17function_with_ldsPiS_E4smem +; NEW-LDS: @_Z18kernel_with_no_ldsPiS_.Unified.Device.Scope.LDS.Layout +; OLD-LDS-NOT: @_ZZ17function_with_ldsPiS_E4smem +@_ZZ17function_with_ldsPiS_E4smem = internal unnamed_addr addrspace(3) global [256 x [4 x i32]] undef, align 16 + +; OFFSET-TABLE: @__LDSGlobalsOffsetTable__ + +; NEW-PARAM: i8 addrspace(3)* %0 +; NEW-PARAM: 
i64 %1 +define internal fastcc void @_Z17function_with_ldsPiS_(i32* nocapture readonly %i_arg, i32* nocapture %o_arg) unnamed_addr { +; GCN-LABEL: entry: +; GCN: %dssv.gep1.1 = getelementptr inbounds [1 x [1 x i64]], [1 x [1 x i64]] addrspace(4)* @__LDSGlobalsOffsetTable__, i32 0, i64 %1, i64 0 +; GCN-NEXT: %dssv.load.1 = load i64, i64 addrspace(4)* %dssv.gep1.1, align 4 +; GCN-NEXT: %dssv.gep2.1 = getelementptr inbounds i8, i8 addrspace(3)* %0, i64 %dssv.load.1 +; GCN-NEXT: %dssv.cast.1 = bitcast i8 addrspace(3)* %dssv.gep2.1 to [256 x [4 x i32]] addrspace(3)* +; GCN-NEXT: %2 = tail call i32 @llvm.amdgcn.workitem.id.x() +; GCN-NEXT: %idxprom = zext i32 %2 to i64 +; GCN-NEXT: %arrayidx = getelementptr inbounds i32, i32* %i_arg, i64 %idxprom +; GCN-NEXT: %3 = load i32, i32* %arrayidx, align 4 +; GCN-NEXT: %4 = getelementptr inbounds [256 x [4 x i32]], [256 x [4 x i32]] addrspace(3)* %dssv.cast.1, i32 0, i32 %2, i32 2 +; GCN-NEXT: store i32 %3, i32 addrspace(3)* %4, align 8 +; GCN-NEXT: %add = add nuw nsw i32 %2, 1 +; GCN-NEXT: %5 = getelementptr inbounds [256 x [4 x i32]], [256 x [4 x i32]] addrspace(3)* %dssv.cast.1, i32 0, i32 %add, i32 2 +; GCN-NEXT: %6 = load i32, i32 addrspace(3)* %5, align 8 +; GCN-NEXT: %mul = mul nsw i32 %6, %3 +; GCN-NEXT: %arrayidx15 = getelementptr inbounds i32, i32* %o_arg, i64 %idxprom +; GCN-NEXT: store i32 %mul, i32* %arrayidx15, align 4 +; GCN-NEXT: ret void +entry: + %0 = tail call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = zext i32 %0 to i64 + %arrayidx = getelementptr inbounds i32, i32* %i_arg, i64 %idxprom + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx42 = getelementptr inbounds [256 x [4 x i32]], [256 x [4 x i32]] addrspace(3)* @_ZZ17function_with_ldsPiS_E4smem, i32 0, i32 %0, i32 2 + store i32 %1, i32 addrspace(3)* %arrayidx42, align 8 + %add = add nuw nsw i32 %0, 1 + %arrayidx126 = getelementptr inbounds [256 x [4 x i32]], [256 x [4 x i32]] addrspace(3)* @_ZZ17function_with_ldsPiS_E4smem, i32 0, i32 %add, i32 2 + %2 = load i32, i32 addrspace(3)* %arrayidx126, align 8 + %mul = mul nsw i32 %2, %1 + %arrayidx15 = getelementptr inbounds i32, i32* %o_arg, i64 %idxprom + store i32 %mul, i32* %arrayidx15, align 4 + ret void +} + +define protected amdgpu_kernel void @_Z18kernel_with_no_ldsPiS_(i32 addrspace(1)* nocapture readonly %i_arg.coerce, i32 addrspace(1)* nocapture %o_arg.coerce) local_unnamed_addr { +; GCN-LABEL: entry: +; GCN: %dssv.gep.1 = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(3)* @_Z18kernel_with_no_ldsPiS_.Unified.Device.Scope.LDS.Layout, i32 0, i32 0 +; GCN-NEXT: %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32* +; GCN-NEXT: %1 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32* +; GCN-NEXT: tail call fastcc void @_Z17function_with_ldsPiS_(i32* %0, i32* %1, i8 addrspace(3)* %dssv.gep.1, i64 0) +; GCN-NEXT: ret void +entry: + %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32* + %1 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32* + tail call fastcc void @_Z17function_with_ldsPiS_(i32* %0, i32* %1) + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-lds-with-different-data-types.ll b/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-lds-with-different-data-types.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-lds-with-different-data-types.ll @@ -0,0 +1,119 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -implement-amdgpu-device-scope-shared-variable -S < %s | FileCheck 
-check-prefixes=NEW-LDS,OLD-LDS,OFFSET-TABLE,NEW-PARAM,GCN %s + +; OLD-LDS-NOT: @_ZZ17function_with_ldsPcS_PiS0_PfS1_E6c_smem +; OLD-LDS-NOT: @_ZZ17function_with_ldsPcS_PiS0_PfS1_E6i_smem +; OLD-LDS-NOT: @_ZZ17function_with_ldsPcS_PiS0_PfS1_E6f_smem +; NEW-LDS: @_Z18kernel_with_no_ldsPcS_PiS0_PfS1_.Unified.Device.Scope.LDS.Layout +; OLD-LDS-NOT: @_ZZ17function_with_ldsPcS_PiS0_PfS1_E6c_smem +; OLD-LDS-NOT: @_ZZ17function_with_ldsPcS_PiS0_PfS1_E6i_smem +; OLD-LDS-NOT: @_ZZ17function_with_ldsPcS_PiS0_PfS1_E6f_smem +@_ZZ17function_with_ldsPcS_PiS0_PfS1_E6c_smem = internal unnamed_addr addrspace(3) global [256 x i8] undef, align 16 +@_ZZ17function_with_ldsPcS_PiS0_PfS1_E6i_smem = internal unnamed_addr addrspace(3) global [256 x i32] undef, align 16 +@_ZZ17function_with_ldsPcS_PiS0_PfS1_E6f_smem = internal unnamed_addr addrspace(3) global [256 x float] undef, align 16 + +; OFFSET-TABLE: @__LDSGlobalsOffsetTable__ + +; NEW-PARAM: i8 addrspace(3)* %0 +; NEW-PARAM: i64 %1 +define internal fastcc void @_Z17function_with_ldsPcS_PiS0_PfS1_(i8* nocapture readonly %ci_arg, i8* nocapture %co_arg, i32* nocapture readonly %ii_arg, i32* nocapture %io_arg, float* nocapture readonly %fi_arg, float* nocapture %fo_arg) unnamed_addr { +; GCN-LABEL: entry: +; GCN: %dssv.gep1.3 = getelementptr inbounds [1 x [3 x i64]], [1 x [3 x i64]] addrspace(4)* @__LDSGlobalsOffsetTable__, i32 0, i64 %1, i64 {{[0-2]}} +; GCN-NEXT: %dssv.load.3 = load i64, i64 addrspace(4)* %dssv.gep1.3, align 4 +; GCN-NEXT: %dssv.gep2.3 = getelementptr inbounds i8, i8 addrspace(3)* %0, i64 %dssv.load.3 +; GCN-NEXT: %dssv.cast.3 = bitcast i8 addrspace(3)* %dssv.gep2.3 to [256 x {{[a-z]+[0-9]*}}] addrspace(3)* +; GCN-NEXT: %dssv.gep1.2 = getelementptr inbounds [1 x [3 x i64]], [1 x [3 x i64]] addrspace(4)* @__LDSGlobalsOffsetTable__, i32 0, i64 %1, i64 {{[0-2]}} +; GCN-NEXT: %dssv.load.2 = load i64, i64 addrspace(4)* %dssv.gep1.2, align 4 +; GCN-NEXT: %dssv.gep2.2 = getelementptr inbounds i8, i8 addrspace(3)* %0, i64 %dssv.load.2 +; GCN-NEXT: %dssv.cast.2 = bitcast i8 addrspace(3)* %dssv.gep2.2 to [256 x i32] addrspace(3)* +; GCN-NEXT: %dssv.gep1.1 = getelementptr inbounds [1 x [3 x i64]], [1 x [3 x {{[a-z]+[0-9]*}}]] addrspace(4)* @__LDSGlobalsOffsetTable__, i32 0, i64 %1, i64 {{[0-2]}} +; GCN-NEXT: %dssv.load.1 = load i64, i64 addrspace(4)* %dssv.gep1.1, align 4 +; GCN-NEXT: %dssv.gep2.1 = getelementptr inbounds i8, i8 addrspace(3)* %0, i64 %dssv.load.1 +; GCN-NEXT: %dssv.cast.1 = bitcast i8 addrspace(3)* %dssv.gep2.1 to [256 x {{[a-z]+[0-9]*}}] addrspace(3)* +; GCN-NEXT: %2 = tail call i32 @llvm.amdgcn.workitem.id.x() +; GCN-NEXT: %idxprom = zext i32 %2 to i64 +; GCN-NEXT: %arrayidx = getelementptr inbounds i8, i8* %ci_arg, i64 %idxprom +; GCN-NEXT: %3 = load i8, i8* %arrayidx, align 1 +; GCN-NEXT: %4 = getelementptr inbounds [256 x i8], [256 x i8] addrspace(3)* %dssv.cast.{{[1-3]}}, i32 0, i32 %2 +; GCN-NEXT: store i8 %3, i8 addrspace(3)* %4, align 1 +; GCN-NEXT: %add = add nuw nsw i32 %2, 1 +; GCN-NEXT: %5 = getelementptr inbounds [256 x i8], [256 x i8] addrspace(3)* %dssv.cast.{{[1-3]}}, i32 0, i32 %add +; GCN-NEXT: %6 = load i8, i8 addrspace(3)* %5, align 1 +; GCN-NEXT: %mul = mul i8 %6, %3 +; GCN-NEXT: %arrayidx14 = getelementptr inbounds i8, i8* %co_arg, i64 %idxprom +; GCN-NEXT: store i8 %mul, i8* %arrayidx14, align 1 +; GCN-NEXT: %arrayidx17 = getelementptr inbounds i32, i32* %ii_arg, i64 %idxprom +; GCN-NEXT: %7 = load i32, i32* %arrayidx17, align 4 +; GCN-NEXT: %8 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* 
%dssv.cast.{{[1-3]}}, i32 0, i32 %2 +; GCN-NEXT: store i32 %7, i32 addrspace(3)* %8, align 4 +; GCN-NEXT: %9 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* %dssv.cast.{{[1-3]}}, i32 0, i32 %add +; GCN-NEXT: %10 = load i32, i32 addrspace(3)* %9, align 4 +; GCN-NEXT: %mul28 = mul nsw i32 %10, %7 +; GCN-NEXT: %arrayidx31 = getelementptr inbounds i32, i32* %io_arg, i64 %idxprom +; GCN-NEXT: store i32 %mul28, i32* %arrayidx31, align 4 +; GCN-NEXT: %arrayidx34 = getelementptr inbounds float, float* %fi_arg, i64 %idxprom +; GCN-NEXT: %11 = load float, float* %arrayidx34, align 4 +; GCN-NEXT: %12 = getelementptr inbounds [256 x float], [256 x float] addrspace(3)* %dssv.cast.{{[1-3]}}, i32 0, i32 %2 +; GCN-NEXT: store float %11, float addrspace(3)* %12, align 4 +; GCN-NEXT: %13 = getelementptr inbounds [256 x float], [256 x float] addrspace(3)* %dssv.cast.{{[1-3]}}, i32 0, i32 %add +; GCN-NEXT: %14 = load float, float addrspace(3)* %13, align 4 +; GCN-NEXT: %mul45 = fmul contract float %11, %14 +; GCN-NEXT: %arrayidx48 = getelementptr inbounds float, float* %fo_arg, i64 %idxprom +; GCN-NEXT: store float %mul45, float* %arrayidx48, align 4 +; GCN-NEXT: ret void +entry: + %0 = tail call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = zext i32 %0 to i64 + %arrayidx = getelementptr inbounds i8, i8* %ci_arg, i64 %idxprom + %1 = load i8, i8* %arrayidx, align 1 + %arrayidx32 = getelementptr inbounds [256 x i8], [256 x i8] addrspace(3)* @_ZZ17function_with_ldsPcS_PiS0_PfS1_E6c_smem, i32 0, i32 %0 + store i8 %1, i8 addrspace(3)* %arrayidx32, align 1 + %add = add nuw nsw i32 %0, 1 + %arrayidx94 = getelementptr inbounds [256 x i8], [256 x i8] addrspace(3)* @_ZZ17function_with_ldsPcS_PiS0_PfS1_E6c_smem, i32 0, i32 %add + %2 = load i8, i8 addrspace(3)* %arrayidx94, align 1 + %mul = mul i8 %2, %1 + %arrayidx14 = getelementptr inbounds i8, i8* %co_arg, i64 %idxprom + store i8 %mul, i8* %arrayidx14, align 1 + %arrayidx17 = getelementptr inbounds i32, i32* %ii_arg, i64 %idxprom + %3 = load i32, i32* %arrayidx17, align 4 + %arrayidx205 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ17function_with_ldsPcS_PiS0_PfS1_E6i_smem, i32 0, i32 %0 + store i32 %3, i32 addrspace(3)* %arrayidx205, align 4 + %arrayidx277 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ17function_with_ldsPcS_PiS0_PfS1_E6i_smem, i32 0, i32 %add + %4 = load i32, i32 addrspace(3)* %arrayidx277, align 4 + %mul28 = mul nsw i32 %4, %3 + %arrayidx31 = getelementptr inbounds i32, i32* %io_arg, i64 %idxprom + store i32 %mul28, i32* %arrayidx31, align 4 + %arrayidx34 = getelementptr inbounds float, float* %fi_arg, i64 %idxprom + %5 = load float, float* %arrayidx34, align 4 + %arrayidx378 = getelementptr inbounds [256 x float], [256 x float] addrspace(3)* @_ZZ17function_with_ldsPcS_PiS0_PfS1_E6f_smem, i32 0, i32 %0 + store float %5, float addrspace(3)* %arrayidx378, align 4 + %arrayidx4410 = getelementptr inbounds [256 x float], [256 x float] addrspace(3)* @_ZZ17function_with_ldsPcS_PiS0_PfS1_E6f_smem, i32 0, i32 %add + %6 = load float, float addrspace(3)* %arrayidx4410, align 4 + %mul45 = fmul contract float %5, %6 + %arrayidx48 = getelementptr inbounds float, float* %fo_arg, i64 %idxprom + store float %mul45, float* %arrayidx48, align 4 + ret void +} + +define protected amdgpu_kernel void @_Z18kernel_with_no_ldsPcS_PiS0_PfS1_(i8 addrspace(1)* nocapture readonly %ci_arg.coerce, i8 addrspace(1)* nocapture %co_arg.coerce, i32 addrspace(1)* nocapture readonly %ii_arg.coerce, i32 addrspace(1)* nocapture 
%io_arg.coerce, float addrspace(1)* nocapture readonly %fi_arg.coerce, float addrspace(1)* nocapture %fo_arg.coerce) local_unnamed_addr { +; GCN-LABEL: entry: +; GCN: %dssv.gep.1 = getelementptr inbounds [2304 x i8], [2304 x i8] addrspace(3)* @_Z18kernel_with_no_ldsPcS_PiS0_PfS1_.Unified.Device.Scope.LDS.Layout, i32 0, i32 0 +; GCN-NEXT: %0 = addrspacecast i8 addrspace(1)* %ci_arg.coerce to i8* +; GCN-NEXT: %1 = addrspacecast i8 addrspace(1)* %co_arg.coerce to i8* +; GCN-NEXT: %2 = addrspacecast i32 addrspace(1)* %ii_arg.coerce to i32* +; GCN-NEXT: %3 = addrspacecast i32 addrspace(1)* %io_arg.coerce to i32* +; GCN-NEXT: %4 = addrspacecast float addrspace(1)* %fi_arg.coerce to float* +; GCN-NEXT: %5 = addrspacecast float addrspace(1)* %fo_arg.coerce to float* +; GCN-NEXT: tail call fastcc void @_Z17function_with_ldsPcS_PiS0_PfS1_(i8* %0, i8* %1, i32* %2, i32* %3, float* %4, float* %5, i8 addrspace(3)* %dssv.gep.1, i64 0) +; GCN-NEXT: ret void +entry: + %0 = addrspacecast i8 addrspace(1)* %ci_arg.coerce to i8* + %1 = addrspacecast i8 addrspace(1)* %co_arg.coerce to i8* + %2 = addrspacecast i32 addrspace(1)* %ii_arg.coerce to i32* + %3 = addrspacecast i32 addrspace(1)* %io_arg.coerce to i32* + %4 = addrspacecast float addrspace(1)* %fi_arg.coerce to float* + %5 = addrspacecast float addrspace(1)* %fo_arg.coerce to float* + tail call fastcc void @_Z17function_with_ldsPcS_PiS0_PfS1_(i8* %0, i8* %1, i32* %2, i32* %3, float* %4, float* %5) + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-lds-within-function.ll b/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-lds-within-function.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-lds-within-function.ll @@ -0,0 +1,61 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -implement-amdgpu-device-scope-shared-variable -S < %s | FileCheck -check-prefixes=NEW-LDS,OLD-LDS,OFFSET-TABLE,NEW-PARAM,GCN %s + +; OLD-LDS-NOT: @_ZZ17function_with_ldsPiS_E4smem +; NEW-LDS: @_Z18kernel_with_no_ldsPiS_.Unified.Device.Scope.LDS.Layout +; OLD-LDS-NOT: @_ZZ17function_with_ldsPiS_E4smem +@_ZZ17function_with_ldsPiS_E4smem = internal unnamed_addr addrspace(3) global [256 x i32] undef, align 16 + +; OFFSET-TABLE: @__LDSGlobalsOffsetTable__ + +; NEW-PARAM: i8 addrspace(3)* %0 +; NEW-PARAM: i64 %1 +define internal fastcc void @_Z17function_with_ldsPiS_(i32* nocapture readonly %i_arg, i32* nocapture %o_arg) unnamed_addr { +; GCN-LABEL: entry: +; GCN: %dssv.gep1.1 = getelementptr inbounds [1 x [1 x i64]], [1 x [1 x i64]] addrspace(4)* @__LDSGlobalsOffsetTable__, i32 0, i64 %1, i64 0 +; GCN-NEXT: %dssv.load.1 = load i64, i64 addrspace(4)* %dssv.gep1.1, align 4 +; GCN-NEXT: %dssv.gep2.1 = getelementptr inbounds i8, i8 addrspace(3)* %0, i64 %dssv.load.1 +; GCN-NEXT: %dssv.cast.1 = bitcast i8 addrspace(3)* %dssv.gep2.1 to [256 x i32] addrspace(3)* +; GCN-NEXT: %2 = tail call i32 @llvm.amdgcn.workitem.id.x() +; GCN-NEXT: %idxprom = zext i32 %2 to i64 +; GCN-NEXT: %arrayidx = getelementptr inbounds i32, i32* %i_arg, i64 %idxprom +; GCN-NEXT: %3 = load i32, i32* %arrayidx, align 4 +; GCN-NEXT: %4 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* %dssv.cast.1, i32 0, i32 %2 +; GCN-NEXT: store i32 %3, i32 addrspace(3)* %4, align 4 +; GCN-NEXT: %add = add nuw nsw i32 %2, 1 +; GCN-NEXT: %5 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* %dssv.cast.1, i32 0, i32 %add +; GCN-NEXT: %6 = load i32, i32 addrspace(3)* %5, align 4 +; GCN-NEXT: %mul = mul 
nsw i32 %6, %3 +; GCN-NEXT: %arrayidx12 = getelementptr inbounds i32, i32* %o_arg, i64 %idxprom +; GCN-NEXT: store i32 %mul, i32* %arrayidx12, align 4 +; GCN-NEXT: ret void +entry: + %0 = tail call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = zext i32 %0 to i64 + %arrayidx = getelementptr inbounds i32, i32* %i_arg, i64 %idxprom + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx31 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ17function_with_ldsPiS_E4smem, i32 0, i32 %0 + store i32 %1, i32 addrspace(3)* %arrayidx31, align 4 + %add = add nuw nsw i32 %0, 1 + %arrayidx93 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ17function_with_ldsPiS_E4smem, i32 0, i32 %add + %2 = load i32, i32 addrspace(3)* %arrayidx93, align 4 + %mul = mul nsw i32 %2, %1 + %arrayidx12 = getelementptr inbounds i32, i32* %o_arg, i64 %idxprom + store i32 %mul, i32* %arrayidx12, align 4 + ret void +} + +define protected amdgpu_kernel void @_Z18kernel_with_no_ldsPiS_(i32 addrspace(1)* nocapture readonly %i_arg.coerce, i32 addrspace(1)* nocapture %o_arg.coerce) local_unnamed_addr { +; GCN-LABEL: entry: +; GCN: %dssv.gep.1 = getelementptr inbounds [1024 x i8], [1024 x i8] addrspace(3)* @_Z18kernel_with_no_ldsPiS_.Unified.Device.Scope.LDS.Layout, i32 0, i32 0 +; GCN-NEXT: %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32* +; GCN-NEXT: %1 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32* +; GCN-NEXT: tail call fastcc void @_Z17function_with_ldsPiS_(i32* %0, i32* %1, i8 addrspace(3)* %dssv.gep.1, i64 0) +; GCN-NEXT: ret void +entry: + %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32* + %1 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32* + tail call fastcc void @_Z17function_with_ldsPiS_(i32* %0, i32* %1) + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-lds-within-kernel-and-function.ll b/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-lds-within-kernel-and-function.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-lds-within-kernel-and-function.ll @@ -0,0 +1,87 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -implement-amdgpu-device-scope-shared-variable -S < %s | FileCheck -check-prefixes=NEW-LDS,OLD-LDS,OFFSET-TABLE,NEW-PARAM,GCN %s + +; OLD-LDS-NOT: @_ZZ17function_with_ldsPiS_E8smem_dev +; OLD-LDS: @_ZZ15kernel_with_ldsPiS_E9smem_kern +; NEW-LDS: @_Z15kernel_with_ldsPiS_.Unified.Device.Scope.LDS.Layout +; OLD-LDS-NOT: @_ZZ17function_with_ldsPiS_E8smem_dev +@_ZZ17function_with_ldsPiS_E8smem_dev = internal unnamed_addr addrspace(3) global [256 x i32] undef, align 16 +@_ZZ15kernel_with_ldsPiS_E9smem_kern = internal unnamed_addr addrspace(3) global [256 x i32] undef, align 16 + +; OFFSET-TABLE: @__LDSGlobalsOffsetTable__ + +; NEW-PARAM: i8 addrspace(3)* %0 +; NEW-PARAM: i64 %1 +define internal fastcc void @_Z17function_with_ldsPiS_(i32* nocapture readonly %i_arg, i32* nocapture %o_arg) unnamed_addr { +; GCN-LABEL: entry: +; GCN: %dssv.gep1.1 = getelementptr inbounds [1 x [1 x i64]], [1 x [1 x i64]] addrspace(4)* @__LDSGlobalsOffsetTable__, i32 0, i64 %1, i64 0 +; GCN-NEXT: %dssv.load.1 = load i64, i64 addrspace(4)* %dssv.gep1.1, align 4 +; GCN-NEXT: %dssv.gep2.1 = getelementptr inbounds i8, i8 addrspace(3)* %0, i64 %dssv.load.1 +; GCN-NEXT: %dssv.cast.1 = bitcast i8 addrspace(3)* %dssv.gep2.1 to [256 x i32] addrspace(3)* +; GCN-NEXT: %2 = tail call i32 @llvm.amdgcn.workitem.id.x() +; GCN-NEXT: %idxprom = zext i32 %2 to i64 +; GCN-NEXT: 
%arrayidx = getelementptr inbounds i32, i32* %i_arg, i64 %idxprom +; GCN-NEXT: %3 = load i32, i32* %arrayidx, align 4 +; GCN-NEXT: %4 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* %dssv.cast.1, i32 0, i32 %2 +; GCN-NEXT: store i32 %3, i32 addrspace(3)* %4, align 4 +; GCN-NEXT: %add = add nuw nsw i32 %2, 1 +; GCN-NEXT: %5 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* %dssv.cast.1, i32 0, i32 %add +; GCN-NEXT: %6 = load i32, i32 addrspace(3)* %5, align 4 +; GCN-NEXT: %mul = mul nsw i32 %6, %3 +; GCN-NEXT: %arrayidx12 = getelementptr inbounds i32, i32* %o_arg, i64 %idxprom +; GCN-NEXT: store i32 %mul, i32* %arrayidx12, align 4 +; GCN-NEXT: ret void +entry: + %0 = tail call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = zext i32 %0 to i64 + %arrayidx = getelementptr inbounds i32, i32* %i_arg, i64 %idxprom + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx31 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ17function_with_ldsPiS_E8smem_dev, i32 0, i32 %0 + store i32 %1, i32 addrspace(3)* %arrayidx31, align 4 + %add = add nuw nsw i32 %0, 1 + %arrayidx93 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ17function_with_ldsPiS_E8smem_dev, i32 0, i32 %add + %2 = load i32, i32 addrspace(3)* %arrayidx93, align 4 + %mul = mul nsw i32 %2, %1 + %arrayidx12 = getelementptr inbounds i32, i32* %o_arg, i64 %idxprom + store i32 %mul, i32* %arrayidx12, align 4 + ret void +} + +define protected amdgpu_kernel void @_Z15kernel_with_ldsPiS_(i32 addrspace(1)* nocapture readonly %i_arg.coerce, i32 addrspace(1)* nocapture %o_arg.coerce) local_unnamed_addr { +; GCN-LABEL: entry: +; GCN: %dssv.gep.1 = getelementptr inbounds [1024 x i8], [1024 x i8] addrspace(3)* @_Z15kernel_with_ldsPiS_.Unified.Device.Scope.LDS.Layout, i32 0, i32 0 +; GCN-NEXT: %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32* +; GCN-NEXT: %1 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32* +; GCN-NEXT: %2 = tail call i32 @llvm.amdgcn.workitem.id.x() +; GCN-NEXT: %idxprom = zext i32 %2 to i64 +; GCN-NEXT: %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %i_arg.coerce, i64 %idxprom +; GCN-NEXT: %3 = load i32, i32 addrspace(1)* %arrayidx3, align 4 +; GCN-NEXT: %arrayidx54 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ15kernel_with_ldsPiS_E9smem_kern, i32 0, i32 %2 +; GCN-NEXT: store i32 %3, i32 addrspace(3)* %arrayidx54, align 4 +; GCN-NEXT: %add = add nuw nsw i32 %2, 1 +; GCN-NEXT: %arrayidx116 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ15kernel_with_ldsPiS_E9smem_kern, i32 0, i32 %add +; GCN-NEXT: %4 = load i32, i32 addrspace(3)* %arrayidx116, align 4 +; GCN-NEXT: %mul = mul nsw i32 %4, %3 +; GCN-NEXT: %arrayidx147 = getelementptr inbounds i32, i32 addrspace(1)* %o_arg.coerce, i64 %idxprom +; GCN-NEXT: store i32 %mul, i32 addrspace(1)* %arrayidx147, align 4 +; GCN-NEXT: tail call fastcc void @_Z17function_with_ldsPiS_(i32* %0, i32* %1, i8 addrspace(3)* %dssv.gep.1, i64 0) +; GCN-NEXT: ret void +entry: + %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32* + %1 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32* + %2 = tail call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = zext i32 %2 to i64 + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %i_arg.coerce, i64 %idxprom + %3 = load i32, i32 addrspace(1)* %arrayidx3, align 4 + %arrayidx54 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ15kernel_with_ldsPiS_E9smem_kern, i32 0, i32 %2 + store i32 %3, i32 addrspace(3)* 
%arrayidx54, align 4 + %add = add nuw nsw i32 %2, 1 + %arrayidx116 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ15kernel_with_ldsPiS_E9smem_kern, i32 0, i32 %add + %4 = load i32, i32 addrspace(3)* %arrayidx116, align 4 + %mul = mul nsw i32 %4, %3 + %arrayidx147 = getelementptr inbounds i32, i32 addrspace(1)* %o_arg.coerce, i64 %idxprom + store i32 %mul, i32 addrspace(1)* %arrayidx147, align 4 + tail call fastcc void @_Z17function_with_ldsPiS_(i32* %0, i32* %1) + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-lds-within-kernel.ll b/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-lds-within-kernel.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-lds-within-kernel.ll @@ -0,0 +1,68 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -implement-amdgpu-device-scope-shared-variable -S < %s | FileCheck -check-prefixes=NEW-LDS,OLD-LDS,OFFSET-TABLE,NEW-PARAM,GCN %s + +; NEW-LDS-NOT: @_Z15kernel_with_ldsPiS_.Unified.Device.Scope.LDS.Layout +; OLD-LDS: @_ZZ15kernel_with_ldsPiS_E4smem +; NEW-LDS-NOT: @_Z15kernel_with_ldsPiS_.Unified.Device.Scope.LDS.Layout +@_ZZ15kernel_with_ldsPiS_E4smem = internal unnamed_addr addrspace(3) global [256 x i32] undef, align 16 + +; OFFSET-TABLE-NOT: @__LDSGlobalsOffsetTable__ + +; NEW-PARAM-NOT: i8 addrspace(3)* %0 +; NEW-PARAM-NOT: i64 %1 +define internal fastcc void @_Z20function_with_no_ldsPiS_(i32* nocapture readonly %i_arg, i32* nocapture %o_arg) unnamed_addr { +; GCN-LABEL: entry: +; GCN: %0 = tail call i32 @llvm.amdgcn.workitem.id.x() +; GCN-NEXT: %idxprom = zext i32 %0 to i64 +; GCN-NEXT: %arrayidx = getelementptr inbounds i32, i32* %i_arg, i64 %idxprom +; GCN-NEXT: %1 = load i32, i32* %arrayidx, align 4 +; GCN-NEXT: %arrayidx3 = getelementptr inbounds i32, i32* %o_arg, i64 %idxprom +; GCN-NEXT: store i32 %1, i32* %arrayidx3, align 4 +; GCN-NEXT: ret void +entry: + %0 = tail call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = zext i32 %0 to i64 + %arrayidx = getelementptr inbounds i32, i32* %i_arg, i64 %idxprom + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx3 = getelementptr inbounds i32, i32* %o_arg, i64 %idxprom + store i32 %1, i32* %arrayidx3, align 4 + ret void +} + +define protected amdgpu_kernel void @_Z15kernel_with_ldsPiS_(i32 addrspace(1)* nocapture readonly %i_arg.coerce, i32 addrspace(1)* nocapture %o_arg.coerce) local_unnamed_addr { +; GCN-LABEL: entry: +; GCN: %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32* +; GCN-NEXT: %1 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32* +; GCN-NEXT: %2 = tail call i32 @llvm.amdgcn.workitem.id.x() +; GCN-NEXT: %idxprom = zext i32 %2 to i64 +; GCN-NEXT: %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %i_arg.coerce, i64 %idxprom +; GCN-NEXT: %3 = load i32, i32 addrspace(1)* %arrayidx3, align 4 +; GCN-NEXT: %arrayidx54 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ15kernel_with_ldsPiS_E4smem, i32 0, i32 %2 +; GCN-NEXT: store i32 %3, i32 addrspace(3)* %arrayidx54, align 4 +; GCN-NEXT: %add = add nuw nsw i32 %2, 1 +; GCN-NEXT: %arrayidx116 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ15kernel_with_ldsPiS_E4smem, i32 0, i32 %add +; GCN-NEXT: %4 = load i32, i32 addrspace(3)* %arrayidx116, align 4 +; GCN-NEXT: %mul = mul nsw i32 %4, %3 +; GCN-NEXT: %arrayidx147 = getelementptr inbounds i32, i32 addrspace(1)* %o_arg.coerce, i64 %idxprom +; GCN-NEXT: store i32 %mul, i32 addrspace(1)* %arrayidx147, align 4 +; 
GCN-NEXT: tail call fastcc void @_Z20function_with_no_ldsPiS_(i32* %0, i32* %1) +; GCN-NEXT: ret void +entry: + %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32* + %1 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32* + %2 = tail call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = zext i32 %2 to i64 + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %i_arg.coerce, i64 %idxprom + %3 = load i32, i32 addrspace(1)* %arrayidx3, align 4 + %arrayidx54 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ15kernel_with_ldsPiS_E4smem, i32 0, i32 %2 + store i32 %3, i32 addrspace(3)* %arrayidx54, align 4 + %add = add nuw nsw i32 %2, 1 + %arrayidx116 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ15kernel_with_ldsPiS_E4smem, i32 0, i32 %add + %4 = load i32, i32 addrspace(3)* %arrayidx116, align 4 + %mul = mul nsw i32 %4, %3 + %arrayidx147 = getelementptr inbounds i32, i32 addrspace(1)* %o_arg.coerce, i64 %idxprom + store i32 %mul, i32 addrspace(1)* %arrayidx147, align 4 + tail call fastcc void @_Z20function_with_no_ldsPiS_(i32* %0, i32* %1) + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-two-lds-arguments.ll b/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-two-lds-arguments.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/device-scope-lds-test-two-lds-arguments.ll @@ -0,0 +1,182 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -implement-amdgpu-device-scope-shared-variable -S < %s | FileCheck -check-prefixes=NEW-LDS,OLD-LDS,OFFSET-TABLE,NEW-PARAM,GCN %s + +; OLD-LDS-NOT: @_ZZ22function_four_with_ldsPiS_E19smem_1_in_func_four +; OLD-LDS-NOT: @_ZZ22function_four_with_ldsPiS_E19smem_2_in_func_four +; OLD-LDS: @_ZZ19kernel_two_with_ldsPiS_E13smem_kern_two +; OLD-LDS: @_ZZ19kernel_one_with_ldsPiS_E13smem_kern_one +; NEW-LDS: @_Z19kernel_two_with_ldsPiS_.Unified.Device.Scope.LDS.Layout +; NEW-LDS: @_Z19kernel_one_with_ldsPiS_.Unified.Device.Scope.LDS.Layout +; OLD-LDS-NOT: @_ZZ22function_four_with_ldsPiS_E19smem_1_in_func_four +; OLD-LDS-NOT: @_ZZ22function_four_with_ldsPiS_E19smem_2_in_func_four +@_ZZ22function_four_with_ldsPiS_E19smem_1_in_func_four = internal unnamed_addr addrspace(3) global [256 x i32] undef, align 16 +@_ZZ22function_four_with_ldsPiS_E19smem_2_in_func_four = internal unnamed_addr addrspace(3) global [256 x i32] undef, align 16 +@_ZZ19kernel_two_with_ldsPiS_E13smem_kern_two = internal unnamed_addr addrspace(3) global [256 x i32] undef, align 16 +@_ZZ19kernel_one_with_ldsPiS_E13smem_kern_one = internal unnamed_addr addrspace(3) global [256 x i32] undef, align 16 + +; OFFSET-TABLE: @__LDSGlobalsOffsetTable__ + +; NEW-PARAM: i8 addrspace(3)* %0 +; NEW-PARAM: i64 %1 +define internal fastcc void @_Z22function_four_with_ldsPiS_(i32* nocapture readonly %i_arg, i32* nocapture %o_arg) unnamed_addr { +; GCN-LABEL: entry: +; GCN: %dssv.gep1.2 = getelementptr inbounds [2 x [2 x i64]], [2 x [2 x i64]] addrspace(4)* @__LDSGlobalsOffsetTable__, i32 0, i64 %1, i64 {{[0-1]}} +; GCN-NEXT: %dssv.load.2 = load i64, i64 addrspace(4)* %dssv.gep1.2, align 4 +; GCN-NEXT: %dssv.gep2.2 = getelementptr inbounds i8, i8 addrspace(3)* %0, i64 %dssv.load.2 +; GCN-NEXT: %dssv.cast.2 = bitcast i8 addrspace(3)* %dssv.gep2.2 to [256 x i32] addrspace(3)* +; GCN-NEXT: %dssv.gep1.1 = getelementptr inbounds [2 x [2 x i64]], [2 x [2 x i64]] addrspace(4)* @__LDSGlobalsOffsetTable__, i32 0, i64 %1, i64 {{[0-1]}} +; GCN-NEXT: %dssv.load.1 = load i64, i64 addrspace(4)* 
%dssv.gep1.1, align 4 +; GCN-NEXT: %dssv.gep2.1 = getelementptr inbounds i8, i8 addrspace(3)* %0, i64 %dssv.load.1 +; GCN-NEXT: %dssv.cast.1 = bitcast i8 addrspace(3)* %dssv.gep2.1 to [256 x i32] addrspace(3)* +; GCN-NEXT: %2 = tail call i32 @llvm.amdgcn.workitem.id.x() +; GCN-NEXT: %idxprom = zext i32 %2 to i64 +; GCN-NEXT: %arrayidx = getelementptr inbounds i32, i32* %i_arg, i64 %idxprom +; GCN-NEXT: %3 = load i32, i32* %arrayidx, align 4 +; GCN-NEXT: %4 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* %dssv.cast.{{[1-2]}}, i32 0, i32 %2 +; GCN-NEXT: store i32 %3, i32 addrspace(3)* %4, align 4 +; GCN-NEXT: %add = add nuw nsw i32 %2, 1 +; GCN-NEXT: %5 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* %dssv.cast.{{[1-2]}}, i32 0, i32 %add +; GCN-NEXT: %6 = load i32, i32 addrspace(3)* %5, align 4 +; GCN-NEXT: %add10 = add nsw i32 %3, %6 +; GCN-NEXT: %7 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* %dssv.cast.{{[1-2]}}, i32 0, i32 %2 +; GCN-NEXT: store i32 %add10, i32 addrspace(3)* %7, align 4 +; GCN-NEXT: %8 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* %dssv.cast.{{[1-2]}}, i32 0, i32 %add +; GCN-NEXT: %9 = load i32, i32 addrspace(3)* %8, align 4 +; GCN-NEXT: %mul = mul nsw i32 %9, %3 +; GCN-NEXT: %arrayidx23 = getelementptr inbounds i32, i32* %o_arg, i64 %idxprom +; GCN-NEXT: store i32 %mul, i32* %arrayidx23, align 4 +; GCN-NEXT: ret void +entry: + %0 = tail call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = zext i32 %0 to i64 + %arrayidx = getelementptr inbounds i32, i32* %i_arg, i64 %idxprom + %1 = load i32, i32* %arrayidx, align 4 + %arrayidx32 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ22function_four_with_ldsPiS_E19smem_1_in_func_four, i32 0, i32 %0 + store i32 %1, i32 addrspace(3)* %arrayidx32, align 4 + %add = add nuw nsw i32 %0, 1 + %arrayidx63 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ22function_four_with_ldsPiS_E19smem_1_in_func_four, i32 0, i32 %add + %2 = load i32, i32 addrspace(3)* %arrayidx63, align 4 + %add10 = add nsw i32 %1, %2 + %arrayidx134 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ22function_four_with_ldsPiS_E19smem_2_in_func_four, i32 0, i32 %0 + store i32 %add10, i32 addrspace(3)* %arrayidx134, align 4 + %arrayidx206 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ22function_four_with_ldsPiS_E19smem_2_in_func_four, i32 0, i32 %add + %3 = load i32, i32 addrspace(3)* %arrayidx206, align 4 + %mul = mul nsw i32 %3, %1 + %arrayidx23 = getelementptr inbounds i32, i32* %o_arg, i64 %idxprom + store i32 %mul, i32* %arrayidx23, align 4 + ret void +} + +; NEW-PARAM: i8 addrspace(3)* %0 +; NEW-PARAM: i64 %1 +define internal fastcc void @_Z26function_three_with_no_ldsPiS_(i32* nocapture readonly %i_arg, i32* nocapture %o_arg) unnamed_addr { +; GCN-LABEL: entry: +; GCN: tail call fastcc void @_Z22function_four_with_ldsPiS_(i32* %i_arg, i32* %o_arg, i8 addrspace(3)* %0, i64 %1) +; GCN-NEXT: ret void +entry: + tail call fastcc void @_Z22function_four_with_ldsPiS_(i32* %i_arg, i32* %o_arg) + ret void +} + +; NEW-PARAM: i8 addrspace(3)* %0 +; NEW-PARAM: i64 %1 +define internal fastcc void @_Z24function_two_with_no_ldsPiS_(i32* nocapture readonly %i_arg, i32* nocapture %o_arg) unnamed_addr { +; GCN-LABEL: entry: +; GCN: tail call fastcc void @_Z26function_three_with_no_ldsPiS_(i32* %i_arg, i32* %o_arg, i8 addrspace(3)* %0, i64 %1) +; GCN-NEXT: ret void +entry: + tail call fastcc void @_Z26function_three_with_no_ldsPiS_(i32* 
%i_arg, i32* %o_arg) + ret void +} + +; NEW-PARAM: i8 addrspace(3)* %0 +; NEW-PARAM: i64 %1 +define internal fastcc void @_Z24function_one_with_no_ldsPiS_(i32* nocapture readonly %i_arg, i32* nocapture %o_arg) unnamed_addr { +; GCN-LABEL: entry: +; GCN: tail call fastcc void @_Z26function_three_with_no_ldsPiS_(i32* %i_arg, i32* %o_arg, i8 addrspace(3)* %0, i64 %1) +; GCN-NEXT: ret void +entry: + tail call fastcc void @_Z26function_three_with_no_ldsPiS_(i32* %i_arg, i32* %o_arg) + ret void +} + +define protected amdgpu_kernel void @_Z19kernel_two_with_ldsPiS_(i32 addrspace(1)* nocapture readonly %i_arg.coerce, i32 addrspace(1)* nocapture %o_arg.coerce) local_unnamed_addr { +; GCN-LABEL: entry: +; GCN: %dssv.gep.{{[1-2]}} = getelementptr inbounds [2048 x i8], [2048 x i8] addrspace(3)* @_Z19kernel_two_with_ldsPiS_.Unified.Device.Scope.LDS.Layout, i32 0, i32 0 +; GCN-NEXT: %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32* +; GCN-NEXT: %1 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32* +; GCN-NEXT: %2 = tail call i32 @llvm.amdgcn.workitem.id.x() +; GCN-NEXT: %idxprom = zext i32 %2 to i64 +; GCN-NEXT: %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %i_arg.coerce, i64 %idxprom +; GCN-NEXT: %3 = load i32, i32 addrspace(1)* %arrayidx6, align 4 +; GCN-NEXT: %arrayidx57 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ19kernel_two_with_ldsPiS_E13smem_kern_two, i32 0, i32 %2 +; GCN-NEXT: store i32 %3, i32 addrspace(3)* %arrayidx57, align 4 +; GCN-NEXT: %add = add nuw nsw i32 %2, 1 +; GCN-NEXT: %arrayidx119 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ19kernel_two_with_ldsPiS_E13smem_kern_two, i32 0, i32 %add +; GCN-NEXT: %4 = load i32, i32 addrspace(3)* %arrayidx119, align 4 +; GCN-NEXT: %mul = mul nsw i32 %4, %3 +; GCN-NEXT: %arrayidx1410 = getelementptr inbounds i32, i32 addrspace(1)* %o_arg.coerce, i64 %idxprom +; GCN-NEXT: store i32 %mul, i32 addrspace(1)* %arrayidx1410, align 4 +; GCN-NEXT: tail call fastcc void @_Z24function_one_with_no_ldsPiS_(i32* %0, i32* %1, i8 addrspace(3)* %dssv.gep.{{[1-2]}}, i64 {{[0-1]}}) +; GCN-NEXT: tail call fastcc void @_Z24function_two_with_no_ldsPiS_(i32* %0, i32* %1, i8 addrspace(3)* %dssv.gep.{{[1-2]}}, i64 {{[0-1]}}) +; GCN-NEXT: ret void +entry: + %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32* + %1 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32* + %2 = tail call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = zext i32 %2 to i64 + %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %i_arg.coerce, i64 %idxprom + %3 = load i32, i32 addrspace(1)* %arrayidx6, align 4 + %arrayidx57 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ19kernel_two_with_ldsPiS_E13smem_kern_two, i32 0, i32 %2 + store i32 %3, i32 addrspace(3)* %arrayidx57, align 4 + %add = add nuw nsw i32 %2, 1 + %arrayidx119 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ19kernel_two_with_ldsPiS_E13smem_kern_two, i32 0, i32 %add + %4 = load i32, i32 addrspace(3)* %arrayidx119, align 4 + %mul = mul nsw i32 %4, %3 + %arrayidx1410 = getelementptr inbounds i32, i32 addrspace(1)* %o_arg.coerce, i64 %idxprom + store i32 %mul, i32 addrspace(1)* %arrayidx1410, align 4 + tail call fastcc void @_Z24function_one_with_no_ldsPiS_(i32* %0, i32* %1) + tail call fastcc void @_Z24function_two_with_no_ldsPiS_(i32* %0, i32* %1) + ret void +} + +define protected amdgpu_kernel void @_Z19kernel_one_with_ldsPiS_(i32 addrspace(1)* nocapture readonly %i_arg.coerce, i32 addrspace(1)* nocapture 
%o_arg.coerce) local_unnamed_addr { +; GCN-LABEL: entry: +; GCN: %dssv.gep.{{[1-2]}} = getelementptr inbounds [2048 x i8], [2048 x i8] addrspace(3)* @_Z19kernel_one_with_ldsPiS_.Unified.Device.Scope.LDS.Layout, i32 0, i32 0 +; GCN-NEXT: %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32* +; GCN-NEXT: %1 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32* +; GCN-NEXT: %2 = tail call i32 @llvm.amdgcn.workitem.id.x() +; GCN-NEXT: %idxprom = zext i32 %2 to i64 +; GCN-NEXT: %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %i_arg.coerce, i64 %idxprom +; GCN-NEXT: %3 = load i32, i32 addrspace(1)* %arrayidx6, align 4 +; GCN-NEXT: %arrayidx57 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ19kernel_one_with_ldsPiS_E13smem_kern_one, i32 0, i32 %2 +; GCN-NEXT: store i32 %3, i32 addrspace(3)* %arrayidx57, align 4 +; GCN-NEXT: %add = add nuw nsw i32 %2, 1 +; GCN-NEXT: %arrayidx119 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ19kernel_one_with_ldsPiS_E13smem_kern_one, i32 0, i32 %add +; GCN-NEXT: %4 = load i32, i32 addrspace(3)* %arrayidx119, align 4 +; GCN-NEXT: %mul = mul nsw i32 %4, %3 +; GCN-NEXT: %arrayidx1410 = getelementptr inbounds i32, i32 addrspace(1)* %o_arg.coerce, i64 %idxprom +; GCN-NEXT: store i32 %mul, i32 addrspace(1)* %arrayidx1410, align 4 +; GCN-NEXT: tail call fastcc void @_Z24function_one_with_no_ldsPiS_(i32* %0, i32* %1, i8 addrspace(3)* %dssv.gep.{{[1-2]}}, i64 {{[0-1]}}) +; GCN-NEXT: tail call fastcc void @_Z24function_two_with_no_ldsPiS_(i32* %0, i32* %1, i8 addrspace(3)* %dssv.gep.{{[1-2]}}, i64 {{[0-1]}}) +; GCN-NEXT: ret void +entry: + %0 = addrspacecast i32 addrspace(1)* %i_arg.coerce to i32* + %1 = addrspacecast i32 addrspace(1)* %o_arg.coerce to i32* + %2 = tail call i32 @llvm.amdgcn.workitem.id.x() + %idxprom = zext i32 %2 to i64 + %arrayidx6 = getelementptr inbounds i32, i32 addrspace(1)* %i_arg.coerce, i64 %idxprom + %3 = load i32, i32 addrspace(1)* %arrayidx6, align 4 + %arrayidx57 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ19kernel_one_with_ldsPiS_E13smem_kern_one, i32 0, i32 %2 + store i32 %3, i32 addrspace(3)* %arrayidx57, align 4 + %add = add nuw nsw i32 %2, 1 + %arrayidx119 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @_ZZ19kernel_one_with_ldsPiS_E13smem_kern_one, i32 0, i32 %add + %4 = load i32, i32 addrspace(3)* %arrayidx119, align 4 + %mul = mul nsw i32 %4, %3 + %arrayidx1410 = getelementptr inbounds i32, i32 addrspace(1)* %o_arg.coerce, i64 %idxprom + store i32 %mul, i32 addrspace(1)* %arrayidx1410, align 4 + tail call fastcc void @_Z24function_one_with_no_ldsPiS_(i32* %0, i32* %1) + tail call fastcc void @_Z24function_two_with_no_ldsPiS_(i32* %0, i32* %1) + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x()