diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -271,6 +271,10 @@
 void initializeGCNNSAReassignPass(PassRegistry &);
 extern char &GCNNSAReassignID;
 
+ModulePass *createAMDGPUDeviceScopeSharedVariablePass();
+void initializeAMDGPUDeviceScopeSharedVariablePass(PassRegistry &);
+extern char &AMDGPUDeviceScopeSharedVariableID;
+
 namespace AMDGPU {
 enum TargetIndex {
   TI_CONSTDATA_START,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
@@ -123,13 +123,15 @@
   // should only appear when IPO passes manage to move LDS globals defined in
   // a kernel into a single user function.
 
-  for (GlobalVariable &GV : M.globals()) {
-    // TODO: Region address
-    unsigned AS = GV.getAddressSpace();
-    if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS)
-      continue;
-
-    recursivelyVisitUsers(GV, FuncsToAlwaysInline);
+  if (!AMDGPUTargetMachine::EnableDeviceScopeSharedVariable) {
+    for (GlobalVariable &GV : M.globals()) {
+      // TODO: Region address
+      unsigned AS = GV.getAddressSpace();
+      if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS)
+        continue;
+
+      recursivelyVisitUsers(GV, FuncsToAlwaysInline);
+    }
   }
 
   if (!AMDGPUTargetMachine::EnableFunctionCalls || StressCalls) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUDeviceScopeSharedVariable.cpp b/llvm/lib/Target/AMDGPU/AMDGPUDeviceScopeSharedVariable.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUDeviceScopeSharedVariable.cpp
@@ -0,0 +1,742 @@
+//===-- AMDGPUDeviceScopeSharedVariable.cpp -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements device scope shared variables: for each kernel, all
+// the LDS globals used by the kernel, either directly within the kernel or
+// indirectly within its callees, are packed into a single per-kernel LDS
+// layout. At kernel entry, a base pointer into that layout is computed for
+// every original LDS global; direct uses within the kernel are rewritten in
+// terms of the base pointer, and LDS-using callees are cloned to take the
+// base pointer as an extra trailing parameter.
+//
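+// For illustration only, a hypothetical HIP input this pass is meant to
+// handle (all names are made up):
+//
+//   __device__ void bar(int i, int v) {
+//     __shared__ int SharedB[64]; // indirect LDS for kernel foo
+//     SharedB[i] = v;
+//   }
+//
+//   __global__ void foo(int *out, int i) {
+//     __shared__ int SharedA[64]; // direct LDS of kernel foo
+//     SharedA[i] = i;
+//     bar(i, SharedA[i]);
+//     out[i] = SharedA[i];
+//   }
+//
+// Conceptually, SharedA and SharedB would be packed into a single 512-byte
+// layout @foo_LDSLayout, and bar would be cloned with a trailing parameter
+// carrying SharedB's base pointer.
+//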
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ValueMap.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include <utility>
+
+#define DEBUG_TYPE "amdgpu-device-scope-shared-variable"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUDeviceScopeSharedVariable : public ModulePass {
+public:
+  static char ID;
+
+  AMDGPUDeviceScopeSharedVariable() : ModulePass(ID) {
+    initializeAMDGPUDeviceScopeSharedVariablePass(
+        *PassRegistry::getPassRegistry());
+  }
+
+  bool runOnModule(Module &M) override;
+};
+
+} // namespace
+
+char AMDGPUDeviceScopeSharedVariable::ID = 0;
+
+char &llvm::AMDGPUDeviceScopeSharedVariableID =
+    AMDGPUDeviceScopeSharedVariable::ID;
+
+ModulePass *llvm::createAMDGPUDeviceScopeSharedVariablePass() {
+  return new AMDGPUDeviceScopeSharedVariable();
+}
+
+INITIALIZE_PASS_BEGIN(AMDGPUDeviceScopeSharedVariable,
+                      "implement-amdgpu-device-scope-shared-variable",
+                      "Implement AMDGPU Device Scope Shared Variable",
+                      false /*only look at the cfg*/, false /*analysis pass*/)
+INITIALIZE_PASS_DEPENDENCY(AMDGPUAlwaysInline)
+INITIALIZE_PASS_DEPENDENCY(SimpleInliner)
+INITIALIZE_PASS_END(AMDGPUDeviceScopeSharedVariable,
+                    "implement-amdgpu-device-scope-shared-variable",
+                    "Implement AMDGPU Device Scope Shared Variable",
+                    false /*only look at the cfg*/, false /*analysis pass*/)
+
+static void createFunctionToLDSMap(
+    ValueMap<GlobalVariable *, Function *> &LDSToFunction,
+    ValueMap<Function *, SetVector<GlobalVariable *>> &FunctionToLDS) {
+  for (auto it = LDSToFunction.begin(); it != LDSToFunction.end(); ++it) {
+    GlobalVariable *LDSGlobal = it->first;
+    Function *EnclosingFunction = it->second;
+    auto rit = FunctionToLDS.find(EnclosingFunction);
+    if (rit == FunctionToLDS.end()) {
+      SetVector<GlobalVariable *> LDSSet;
+      LDSSet.insert(LDSGlobal);
+      FunctionToLDS[EnclosingFunction] = LDSSet;
+    } else
+      FunctionToLDS[EnclosingFunction].insert(LDSGlobal);
+  }
+}
+
+static void pairUpKernelWithLDSList(
+    Function *K, ValueMap<Function *, SetVector<Function *>> &KernelToCallee,
+    ValueMap<Function *, SetVector<GlobalVariable *>> &FunctionToLDS,
+    ValueMap<Function *, SetVector<GlobalVariable *>> &KernelToDirectLDS,
+    ValueMap<Function *, SetVector<GlobalVariable *>> &KernelToIndirectLDS) {
+  // If direct LDS globals exist within the kernel, collect them.
+  if (FunctionToLDS.find(K) != FunctionToLDS.end())
+    KernelToDirectLDS[K] = FunctionToLDS[K];
+
+  // Collect all the indirect LDS globals defined within the callee(s) of the
+  // kernel.
+  SetVector<GlobalVariable *> IndirectLDSSet;
+  SetVector<Function *> Callees = KernelToCallee[K];
+  for (Function *Callee : Callees) {
+    if (FunctionToLDS.find(Callee) == FunctionToLDS.end())
+      continue;
+    SetVector<GlobalVariable *> CalleeLDSList = FunctionToLDS[Callee];
+    for (GlobalVariable *CalleeLDS : CalleeLDSList)
+      IndirectLDSSet.insert(CalleeLDS);
+  }
+  KernelToIndirectLDS[K] = IndirectLDSSet;
+}
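+// Tying the maps together, for the hypothetical input from the file header
+// comment (SharedA defined in kernel foo, SharedB defined in device function
+// bar which foo calls), the pairing above would produce roughly:
+//
+//   KernelToDirectLDS[foo]   = { SharedA }
+//   KernelToIndirectLDS[foo] = { SharedB }
+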
+static void pairUpKernelWithCalleeList(
+    Module &M, Function *K,
+    ValueMap<Function *, SetVector<GlobalVariable *>> &FunctionToLDS,
+    ValueMap<Function *, SetVector<Function *>> &KernelToCallee) {
+  // Get the call graph node associated with the current kernel, traverse the
+  // call graph from it in DFS manner, and collect all the associated callees
+  // which define LDS global(s).
+  CallGraph CG = CallGraph(M);
+  CallGraphNode *KernCGNode = CG[K];
+  SmallVector<CallGraphNode *, 8> CGNodeStack;
+  SetVector<CallGraphNode *> Visited;
+
+#ifndef NDEBUG
+  assert(KernCGNode && "Call graph node associated with kernel definition "
+                       "cannot be null\n");
+#endif
+
+  for (auto it = KernCGNode->begin(); it != KernCGNode->end(); ++it) {
+    CallGraphNode *CGN = it->second;
+#ifndef NDEBUG
+    assert(CGN && "Call graph node associated with function definition cannot"
+                  " be null\n");
+#endif
+    CGNodeStack.push_back(CGN);
+  }
+
+  SetVector<Function *> CalleeSet;
+  while (!CGNodeStack.empty()) {
+    CallGraphNode *CGNode = CGNodeStack.pop_back_val();
+    if (!Visited.insert(CGNode))
+      continue;
+
+    Function *F = CGNode->getFunction();
+    if (!F || F->isDeclaration()) {
+#ifndef NDEBUG
+      assert(CGNode->empty() && "Call graph node associated with function "
+                                "declaration should not have a callee list\n");
+#endif
+      continue;
+    }
+
+    auto fit = FunctionToLDS.find(F);
+    if (fit != FunctionToLDS.end())
+      CalleeSet.insert(F);
+
+    for (auto it = CGNode->begin(); it != CGNode->end(); ++it) {
+      CallGraphNode *CGN = it->second;
+#ifndef NDEBUG
+      assert(CGN && "Call graph node associated with function definition "
+                    "cannot be null\n");
+#endif
+      CGNodeStack.push_back(CGN);
+    }
+  }
+
+  KernelToCallee[K] = CalleeSet;
+}
+
+static void pairUpLDSGlobalWithEnclosingFunction(
+    GlobalVariable *LDSGlobal,
+    ValueMap<GlobalVariable *, Function *> &LDSToFunction) {
+  // Recursively visit the user list of the current LDS global, and find the
+  // enclosing function where the LDS global is defined; the enclosing
+  // function should always be successfully found.
+  //
+  // TODO: Is there any other efficient way to find the enclosing functions of
+  // LDS globals?
+#ifndef NDEBUG
+  assert(!LDSGlobal->user_empty() &&
+         "LDS Global user list cannot be empty since it must have been defined "
+         "within either kernel or device function");
+#endif
+  SmallVector<User *, 8> UserStack;
+  SetVector<User *> Visited;
+
+  for (User *U : LDSGlobal->users())
+    UserStack.push_back(U);
+
+  while (!UserStack.empty()) {
+    User *U = UserStack.pop_back_val();
+    if (!Visited.insert(U))
+      continue;
+
+    if (Instruction *I = dyn_cast<Instruction>(U)) {
+      Function *F = I->getParent()->getParent();
+      if (F) {
+        LDSToFunction[LDSGlobal] = F;
+        return;
+      }
+      continue;
+    }
+
+    for (User *UU : U->users())
+      UserStack.push_back(UU);
+  }
+#ifndef NDEBUG
+  assert(false && "Control is not expected to reach this point");
+#endif
+}
+
+static void
+getLDSGlobalSizeInBytes(Module &M, GlobalVariable *LDSGlobal,
+                        ValueMap<GlobalVariable *, uint64_t> &LDSToSize) {
+  Type *Ty = LDSGlobal->getValueType();
+  const DataLayout &DL = M.getDataLayout();
+  uint64_t SizeInBytes = DL.getTypeSizeInBits(Ty).getFixedSize() / 8;
+  LDSToSize[LDSGlobal] = SizeInBytes;
+}
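+// A sketch of the rewrite performed below for direct uses (illustrative
+// only; value names are made up). Given a use of an LDS global within the
+// kernel such as:
+//
+//   %gep = getelementptr ... @SharedA, i32 0, i32 %i
+//
+// and a base pointer %dssv.cast.N computed at kernel entry for SharedA's
+// slot in the packed layout, the GEP is re-created against the base pointer:
+//
+//   %dssv.cast.N.ptr.arith.N = getelementptr ... %dssv.cast.N, i32 %i
+//
+// Loads and stores through the LDS global are likewise re-created against
+// the base pointer.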
+static void handleDirectLDSGlobalWithinKernel(Module &M, Function *K,
+                                              GlobalVariable *LDS,
+                                              Instruction *BasePtr) {
+  // Suffix the names of the instructions with unique integer values.
+  static int Suffix = 0;
+  ++Suffix;
+
+  // Take a snapshot of the users of `LDS` first, since the loop below erases
+  // user instructions and would otherwise invalidate the use list iterator.
+  SmallVector<User *, 8> Users(LDS->user_begin(), LDS->user_end());
+
+  // Traverse each `use` of `LDS`, create a proper `ToBeReplacedInst` for it,
+  // and accordingly replace it.
+  for (User *U : Users) {
+    Instruction *UserInst = dyn_cast<Instruction>(U);
+    if (!UserInst)
+      continue;
+
+    Instruction *ToBeReplacedInst = nullptr;
+
+    if (GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(UserInst)) {
+      // User instruction is a GEP instruction; replace it as below:
+      // 1. Extract the last operand of `GEPInst`, say `Offset`.
+      // 2. Create the pointer arithmetic instruction `BasePtr + Offset`.
+#ifndef NDEBUG
+      assert(GEPInst->hasIndices() && "Expected one or more GEP indices\n");
+#endif
+      Value *Offset = GEPInst->getOperand(GEPInst->getNumIndices());
+      ToBeReplacedInst = GetElementPtrInst::CreateInBounds(
+          GEPInst->getResultElementType(), BasePtr, Offset,
+          Twine(BasePtr->getName()) + Twine(".ptr.arith.") + Twine(Suffix),
+          UserInst);
+    } else if (LoadInst *LInst = dyn_cast<LoadInst>(UserInst)) {
+      // User instruction is a LOAD instruction; replace the pointer operand
+      // of the LOAD instruction by `BasePtr`.
+      ToBeReplacedInst = new LoadInst(LInst->getType(), BasePtr,
+                                      Twine(BasePtr->getName()) +
+                                          Twine(".load.") + Twine(Suffix),
+                                      UserInst);
+    } else if (StoreInst *SInst = dyn_cast<StoreInst>(UserInst)) {
+      // User instruction is a STORE instruction; replace the pointer operand
+      // of the STORE instruction by `BasePtr`.
+      ToBeReplacedInst =
+          new StoreInst(SInst->getValueOperand(), BasePtr, UserInst);
+    } else {
+      // TODO: Do we need to specially handle any other kind of instructions
+      // apart from GEP, LOAD, and STORE?
+#ifndef NDEBUG
+      assert(false && "Not implemented\n");
+#endif
+    }
+
+    // Replace `UserInst` by `ToBeReplacedInst` and erase `UserInst`.
+#ifndef NDEBUG
+    assert(ToBeReplacedInst && "To be replaced instruction cannot be null\n");
+#endif
+    ToBeReplacedInst->copyMetadata(*UserInst);
+    UserInst->replaceAllUsesWith(ToBeReplacedInst);
+    UserInst->eraseFromParent();
+  }
+}
+
+static void
+getClonedArgumentList(Function *K, Function *CurCaller, Instruction *BasePtr,
+                      CallInst *CI,
+                      ValueMap<Function *, Function *> &OldCalleeToNewCallee,
+                      SmallVectorImpl<Value *> &NewArgs) {
+  for (auto it = CI->arg_begin(); it != CI->arg_end(); ++it)
+    NewArgs.push_back(*it);
+
+  Value *NewArg = nullptr;
+  if (CurCaller != K) {
+    Function *NewCurCaller = OldCalleeToNewCallee[CurCaller];
+#ifndef NDEBUG
+    assert(NewCurCaller && "Proper new caller should exist\n");
+#endif
+
+    NewArg = NewCurCaller->getArg(NewCurCaller->arg_size() - 1);
+#ifndef NDEBUG
+    assert(NewArg && "Proper new parameter within new caller should exist\n");
+#endif
+  } else
+    NewArg = BasePtr;
+
+  NewArgs.push_back(NewArg);
+}
+
+static CallInst *
+getCallInstruction(Optional<WeakTrackingVH> &O, Function *K,
+                   Function *CurCaller,
+                   ValueMap<Function *, Function *> &OldCalleeToNewCallee) {
+#ifndef NDEBUG
+  assert(O.hasValue() && "Valid call instruction should exist\n");
+#endif
+
+  CallInst *CI = dyn_cast<CallInst>(O.getValue());
+#ifndef NDEBUG
+  assert(CI && "Valid call instruction should exist\n");
+  assert(CI->getParent()->getParent() == CurCaller && "Not a valid caller\n");
+#endif
+
+  // At this point, `CI` is a call instruction from `CurCaller`. If the
+  // `CurCaller` is kernel `K` itself, then return the same call instruction.
+  // Otherwise, we need to find the replica of `CI` within the new clone of
+  // `CurCaller`, and return it.
+  if (CurCaller == K)
+    return CI;
+
+  // Find the replica of `CI` within the new clone of `CurCaller`, and return
+  // it.
+  // TODO: Yet to implement it.
+  // Function *NewCaller = OldCalleeToNewCallee[CurCaller];
+  return nullptr;
+}
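+// An illustrative sketch of what updateCurrentCaller below produces
+// (hypothetical names): given an original call in the kernel
+//
+//   call void @bar(i32 %i, i32 %v)
+//
+// a new call to the clone is inserted just before it, passing the base
+// pointer as a trailing argument:
+//
+//   call void @bar.clone(i32 %i, i32 %v, i8 addrspace(3)* %base)
+//
+// The original call is expected to become dead once all the callees are
+// cloned, and is left for later cleanup.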
+static void
+updateCurrentCaller(Module &M, Function *K, Instruction *BasePtr,
+                    CallGraphNode *CurCallerCGNode, Function *CurCallee,
+                    ValueMap<Function *, Function *> &OldCalleeToNewCallee) {
+  // Update the current caller to have new call instruction(s) to the new
+  // callee.
+  Function *CurCaller = CurCallerCGNode->getFunction();
+  for (auto it = CurCallerCGNode->begin(); it != CurCallerCGNode->end();
+       ++it) {
+    Function *Callee = it->second->getFunction();
+
+    // Not the callee in question? Ignore it.
+    if (Callee != CurCallee)
+      continue;
+
+    // We have found a call site which has a call to `CurCallee`. Get the call
+    // instruction within the current caller which calls the current callee.
+    // Note: If the caller is a device function, then we should get the call
+    // instruction from the new clone of it, `not` from the original caller.
+    CallInst *CI =
+        getCallInstruction(it->first, K, CurCaller, OldCalleeToNewCallee);
+
+    // Get the new argument list which is required to insert the new call
+    // instruction.
+    SmallVector<Value *, 8> NewArgs;
+    getClonedArgumentList(K, CurCaller, BasePtr, CI, OldCalleeToNewCallee,
+                          NewArgs);
+
+    // Insert the new call instruction `NewCI` just before the existing call
+    // instruction `CI`.
+    Function *NewCallee = OldCalleeToNewCallee[CurCallee];
+#ifndef NDEBUG
+    assert(NewCallee && "Valid new callee should exist\n");
+#endif
+    CallInst *NewCI = CallInst::Create(NewCallee->getFunctionType(), NewCallee,
+                                       NewArgs, Twine("hsm-call"), CI);
+    NewCI->copyMetadata(*CI);
+  }
+}
+
+static Function *cloneFunction(Function *F, Type *BasePtrType) {
+  // Create a new function type by appending the `BasePtr` argument type to
+  // the existing argument list.
+  SmallVector<Type *, 8> NewParams;
+  FunctionType *FnTy = F->getFunctionType();
+  for (auto it = FnTy->param_begin(); it != FnTy->param_end(); ++it)
+    NewParams.push_back(*it);
+  NewParams.push_back(BasePtrType);
+  FunctionType *NewFnTy =
+      FunctionType::get(FnTy->getReturnType(), NewParams, FnTy->isVarArg());
+
+  // Create a copy of the current function with the new function type.
+  Function *NewF = Function::Create(NewFnTy, F->getLinkage(),
+                                    F->getAddressSpace(), F->getName());
+  ValueToValueMapTy VMap;
+  auto *NewFArgIt = NewF->arg_begin();
+  for (auto &Arg : F->args()) {
+    auto ArgName = Arg.getName();
+    NewFArgIt->setName(ArgName);
+    VMap[&Arg] = &(*NewFArgIt++);
+  }
+  // TODO: Should ModuleLevelChanges be set to true or false?
+  SmallVector<ReturnInst *, 8> Returns;
+  CloneFunctionInto(NewF, F, VMap, /*ModuleLevelChanges=*/false, Returns);
+
+  // Copy all metadata.
+  SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
+  F->getAllMetadata(MDs);
+  for (auto MDIt : MDs)
+    NewF->addMetadata(MDIt.first, *MDIt.second);
+
+  return NewF;
+}
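+// For illustration (hypothetical types): cloning a callee of type
+// `void (i32, i32)` for a base pointer of type `i8 addrspace(3)*` yields a
+// clone of type `void (i32, i32, i8 addrspace(3)*)`; the trailing parameter
+// carries the callee's LDS base pointer from the kernel down the call chain.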
+static void createCloneOfCalleesWithNewParameter(
+    Module &M, Function *K, Function *Callee, Instruction *BasePtr,
+    SmallVectorImpl<SetVector<CallGraphNode *>> &CGPaths,
+    ValueMap<Function *, Function *> &OldCalleeToNewCallee) {
+  for (auto CGPath : CGPaths) {
+    // TODO: We can in fact assert that the length of `CGPath` is at least
+    // two. But we are okay for now.
+    if (CGPath.size() < 2)
+      continue;
+
+    // The first function in the call graph path is always the kernel.
+    CallGraphNode *CurCallerCGNode = *CGPath.begin();
+#ifndef NDEBUG
+    assert(CurCallerCGNode->getFunction() == K &&
+           "Should be kernel function\n");
+#endif
+
+    // We are interested in cloning only the device functions in the call
+    // graph path, hence we start from the second node in the path.
+    auto it = CGPath.begin() + 1;
+    for (; it != CGPath.end(); ++it) {
+      CallGraphNode *CurCalleeCGNode = *it;
+      Function *CurCallee = CurCalleeCGNode->getFunction();
+
+      // If `CurCallee` is encountered for the first time, create a clone of
+      // it, as below:
+      // 1. Clone `CurCallee` as `NewCallee`. This new clone has a new
+      //    trailing parameter of the same type as the `BasePtr` type.
+      // 2. Insert `NewCallee` into the module just before `CurCallee`, but
+      //    keep `CurCallee` until it is all set to be removed from the
+      //    module.
+      if (OldCalleeToNewCallee.find(CurCallee) == OldCalleeToNewCallee.end()) {
+        Function *NewCallee = cloneFunction(CurCallee, BasePtr->getType());
+        M.getFunctionList().insert(CurCallee->getIterator(), NewCallee);
+        OldCalleeToNewCallee[CurCallee] = NewCallee;
+      }
+
+      // Update the current caller to have a new call instruction to the new
+      // callee.
+      updateCurrentCaller(M, K, BasePtr, CurCallerCGNode, CurCallee,
+                          OldCalleeToNewCallee);
+
+      // The current callee becomes the next caller in the call graph path.
+      CurCallerCGNode = CurCalleeCGNode;
+    }
+  }
+}
+
+static void collectCallGraphPathsBetweenKernelAndCallee(
+    Module &M, Function *K, Function *Callee,
+    SmallVectorImpl<SetVector<CallGraphNode *>> &CGPaths) {
+  // Traverse the call graph associated with the kernel in DFS manner and
+  // collect all the paths from the kernel to the callee.
+  //
+  // TODO: Note that this algorithm will not work if there exist recursive
+  // calls; the current assumption here is that the call graph is acyclic. We
+  // need to revisit it to handle call graphs which could contain cycles.
+  CallGraph CG = CallGraph(M);
+  CallGraphNode *KernCGNode = CG[K];
+#ifndef NDEBUG
+  assert(KernCGNode && "Call graph node associated with kernel definition "
+                       "cannot be null\n");
+#endif
+
+  SmallVector<SetVector<CallGraphNode *>, 16> Stack;
+  SetVector<CallGraphNode *> Path;
+  Path.insert(KernCGNode);
+  Stack.push_back(Path);
+
+  while (!Stack.empty()) {
+    SetVector<CallGraphNode *> TopPath = Stack.pop_back_val();
+    CallGraphNode *CGNode = TopPath.back();
+    Function *F = CGNode->getFunction();
+    if (F == Callee) {
+      CGPaths.push_back(TopPath);
+      continue;
+    }
+
+    for (auto it = CGNode->begin(); it != CGNode->end(); ++it) {
+      CallGraphNode *CGN = it->second;
+#ifndef NDEBUG
+      assert(CGN && "Call graph node associated with function definition "
+                    "cannot be null\n");
+#endif
+      SetVector<CallGraphNode *> ClonedPath(TopPath.begin(), TopPath.end());
+      ClonedPath.insert(CGN);
+      Stack.push_back(ClonedPath);
+    }
+  }
+}
+
+static void handleIndirectLDSGlobalWithinCallee(
+    Module &M, Function *K, Function *Callee, Instruction *BasePtr,
+    ValueMap<Function *, Function *> &OldCalleeToNewCallee) {
+  // Collect all the call graph paths between the kernel and the callee.
+  SmallVector<SetVector<CallGraphNode *>, 8> CGPaths;
+  collectCallGraphPathsBetweenKernelAndCallee(M, K, Callee, CGPaths);
+
+  // Update the callees to accept the new parameter, whose type matches the
+  // `BasePtr` type, by creating clones of them.
+  createCloneOfCalleesWithNewParameter(M, K, Callee, BasePtr, CGPaths,
+                                       OldCalleeToNewCallee);
+}
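+// A sketch of the kernel-entry code emitted below (illustrative only; the
+// value names follow the `dssv.` scheme used in the code). For an LDS global
+// of type [64 x i32] placed at byte offset 256 of a 512-byte packed layout:
+//
+//   %dssv.gep.N  = getelementptr inbounds [512 x i8],
+//                  [512 x i8] addrspace(3)* @foo_LDSLayout, i32 0, i64 256
+//   %dssv.cast.N = bitcast i8 addrspace(3)* %dssv.gep.N
+//                  to i32 addrspace(3)*
+//
+// %dssv.cast.N then serves as the base pointer for rewriting the uses of
+// that LDS global.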
+static Instruction *insertBasePointerAccessInstructionsWithinKernel(
+    Module &M, Function *K, GlobalVariable *LDS, GlobalVariable *NewLDS,
+    uint64_t Offset) {
+  // Insert instructions as below at the beginning of the entry basic block of
+  // the kernel:
+  // 1. Insert a GEP instruction which accesses the address `NewLDS + Offset`;
+  //    say the result is `GEPInst`, which is of type `char*`.
+  // 2. Insert a type cast instruction which casts `GEPInst` from `char*` to
+  //    `basetype*`, where `basetype` is the base type of `LDS`; say the
+  //    result is `CastInst`.
+  // 3. Return `CastInst`.
+
+  // Suffix the names of the instructions with unique integer values.
+  static int Suffix = 0;
+  ++Suffix;
+
+  // Insert the GEP instruction.
+  BasicBlock::const_iterator iit = K->getEntryBlock().getFirstInsertionPt();
+#ifndef NDEBUG
+  assert(iit != K->getEntryBlock().end() &&
+         "Entry basic block of the kernel cannot be empty, otherwise control "
+         "would not reach this point\n");
+#endif
+  const Instruction &EI = *iit;
+  Value *Indices[] = {Constant::getNullValue(Type::getInt32Ty(M.getContext())),
+                      Constant::getIntegerValue(
+                          Type::getInt64Ty(M.getContext()), APInt(64, Offset))};
+  Instruction *GEPInst = GetElementPtrInst::CreateInBounds(
+      NewLDS->getValueType(), const_cast<GlobalVariable *>(NewLDS), Indices,
+      Twine("dssv.gep.") + Twine(Suffix), const_cast<Instruction *>(&EI));
+
+  // Insert the type cast instruction.
+  // TODO: Do we need to handle any other aggregate type apart from array
+  // type? And what about two- and higher-dimensional arrays?
+  Type *LDSValueType = LDS->getValueType();
+  if (ArrayType *AT = dyn_cast<ArrayType>(LDSValueType))
+    LDSValueType = AT->getElementType();
+  PointerType *ToBeCastedType =
+      PointerType::get(LDSValueType, AMDGPUAS::LOCAL_ADDRESS);
+  Instruction *CastInst = new BitCastInst(GEPInst, ToBeCastedType,
+                                          Twine("dssv.cast.") + Twine(Suffix),
+                                          const_cast<Instruction *>(&EI));
+
+  // Return the type-casted instruction.
+  return CastInst;
+}
+
+static bool handleDeviceScopeSharedVariablesForCurKernel(
+    Module &M, Function *K,
+    ValueMap<GlobalVariable *, Function *> &LDSToFunction,
+    ValueMap<GlobalVariable *, uint64_t> &LDSToSize,
+    SetVector<GlobalVariable *> &DirectLDSList,
+    SetVector<GlobalVariable *> &IndirectLDSList,
+    ValueMap<Function *, Function *> &OldCalleeToNewCallee) {
+  // Compute the total size of all the LDS globals, and also the offsets
+  // associated with them within the new LDS global which will be created in a
+  // moment to replace all these LDS globals.
+  uint64_t TotalLDSSizeInBytes = 0;
+  ValueMap<GlobalVariable *, uint64_t> LDSToOffset;
+  for (GlobalVariable *LDS : DirectLDSList) {
+    LDSToOffset[LDS] = TotalLDSSizeInBytes;
+    TotalLDSSizeInBytes += LDSToSize[LDS];
+  }
+  for (GlobalVariable *LDS : IndirectLDSList) {
+    LDSToOffset[LDS] = TotalLDSSizeInBytes;
+    TotalLDSSizeInBytes += LDSToSize[LDS];
+  }
+
+  // Insert a new LDS global which is nothing but a single contiguous shared
+  // memory layout representing all the LDS globals associated with the
+  // kernel, which includes those directly defined within the kernel and those
+  // indirectly defined within its callees.
+  //
+  // The size of this new contiguous LDS global layout is equal to the sum of
+  // the sizes of all the associated LDS globals.
+  // TODO: What about the name of this new LDS global? Is it fine, or does it
+  // need to be changed?
+  Type *NewLDSTy =
+      ArrayType::get(IntegerType::get(M.getContext(), 8), TotalLDSSizeInBytes);
+  GlobalVariable *NewLDS = new GlobalVariable(
+      M, NewLDSTy, false, GlobalValue::InternalLinkage,
+      UndefValue::get(NewLDSTy), Twine(K->getName()) + Twine("_LDSLayout"),
+      nullptr, GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);
+  NewLDS->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+  NewLDS->setAlignment(MaybeAlign(M.getDataLayout().getPreferredAlign(NewLDS)));
+
+  // Now that we have all the necessary information available, the next step
+  // is to properly replace the original LDS globals by their offset
+  // counterparts.
+  //
+  // First, for each LDS global (both direct and indirect ones), insert the
+  // base pointer access instructions within the kernel.
+  ValueMap<GlobalVariable *, Instruction *> DirectLDSToBasePtrInst;
+  ValueMap<GlobalVariable *, Instruction *> IndirectLDSToBasePtrInst;
+  for (GlobalVariable *LDS : DirectLDSList) {
+    Instruction *BasePtr = insertBasePointerAccessInstructionsWithinKernel(
+        M, K, LDS, NewLDS, LDSToOffset[LDS]);
+    DirectLDSToBasePtrInst[LDS] = BasePtr;
+  }
+  for (GlobalVariable *LDS : IndirectLDSList) {
+    Instruction *BasePtr = insertBasePointerAccessInstructionsWithinKernel(
+        M, K, LDS, NewLDS, LDSToOffset[LDS]);
+    IndirectLDSToBasePtrInst[LDS] = BasePtr;
+  }
+
+  // Next, handle all the indirect globals associated with the current kernel.
+  for (auto it = IndirectLDSToBasePtrInst.begin();
+       it != IndirectLDSToBasePtrInst.end(); ++it) {
+    handleIndirectLDSGlobalWithinCallee(M, K, LDSToFunction[it->first],
+                                        it->second, OldCalleeToNewCallee);
+  }
+
+  // Finally, handle all the direct globals associated with the current
+  // kernel.
+  for (auto it = DirectLDSToBasePtrInst.begin();
+       it != DirectLDSToBasePtrInst.end(); ++it) {
+    handleDirectLDSGlobalWithinKernel(M, K, it->first, it->second);
+  }
+
+  return true;
+}
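+// For illustration (hypothetical sizes): with a direct [64 x i32] SharedA
+// and an indirect [64 x i32] SharedB, the packing above yields
+//
+//   LDSToOffset[SharedA] = 0,   LDSToOffset[SharedB] = 256
+//   @foo_LDSLayout = internal addrspace(3) global [512 x i8] undef
+//
+// i.e. one 512-byte layout replaces both globals for kernel foo.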
+static bool handleDeviceScopeSharedVariables(
+    Module &M, SetVector<Function *> &Kernels,
+    ValueMap<GlobalVariable *, Function *> &LDSToFunction,
+    ValueMap<Function *, SetVector<GlobalVariable *>> &KernelToDirectLDS,
+    ValueMap<Function *, SetVector<GlobalVariable *>> &KernelToIndirectLDS,
+    ValueMap<GlobalVariable *, uint64_t> &LDSToSize) {
+  bool Changed = false;
+
+  // Process the LDS globals associated with each kernel.
+  ValueMap<Function *, Function *> OldCalleeToNewCallee;
+  for (Function *K : Kernels) {
+    SetVector<GlobalVariable *> DirectLDSList;
+    if (KernelToDirectLDS.find(K) != KernelToDirectLDS.end())
+      DirectLDSList = KernelToDirectLDS[K];
+
+    SetVector<GlobalVariable *> IndirectLDSList;
+    if (KernelToIndirectLDS.find(K) != KernelToIndirectLDS.end())
+      IndirectLDSList = KernelToIndirectLDS[K];
+
+    // No LDS globals to process? Ignore the kernel, go to the next one.
+    if (DirectLDSList.empty() && IndirectLDSList.empty())
+      continue;
+
+    // Process the LDS globals.
+    Changed |= handleDeviceScopeSharedVariablesForCurKernel(
+        M, K, LDSToFunction, LDSToSize, DirectLDSList, IndirectLDSList,
+        OldCalleeToNewCallee);
+  }
+
+  return Changed;
+}
+
+static bool
+handleDeviceScopeSharedVariables(Module &M,
+                                 SetVector<GlobalVariable *> &LDSGlobals,
+                                 SetVector<Function *> &Kernels) {
+  // Pair up each LDS global with the enclosing function where the LDS global
+  // is defined.
+  ValueMap<GlobalVariable *, Function *> LDSToFunction;
+  for (GlobalVariable *LDSGlobal : LDSGlobals)
+    pairUpLDSGlobalWithEnclosingFunction(LDSGlobal, LDSToFunction);
+
+  // Create the reverse map from enclosing function to LDS global list.
+  ValueMap<Function *, SetVector<GlobalVariable *>> FunctionToLDS;
+  createFunctionToLDSMap(LDSToFunction, FunctionToLDS);
+
+  // Pair up kernels with the callee list which defines LDS globals.
+  ValueMap<Function *, SetVector<Function *>> KernelToCallee;
+  for (Function *K : Kernels)
+    pairUpKernelWithCalleeList(M, K, FunctionToLDS, KernelToCallee);
+
+  // Pair up kernels with all the LDS globals: both direct LDS globals (those
+  // directly defined within the kernels), and indirect LDS globals (those
+  // indirectly defined within the callees).
+  ValueMap<Function *, SetVector<GlobalVariable *>> KernelToDirectLDS;
+  ValueMap<Function *, SetVector<GlobalVariable *>> KernelToIndirectLDS;
+  for (Function *K : Kernels)
+    pairUpKernelWithLDSList(K, KernelToCallee, FunctionToLDS,
+                            KernelToDirectLDS, KernelToIndirectLDS);
+
+  // Get the size of each LDS global in bytes.
+  ValueMap<GlobalVariable *, uint64_t> LDSToSize;
+  for (GlobalVariable *LDSGlobal : LDSGlobals)
+    getLDSGlobalSizeInBytes(M, LDSGlobal, LDSToSize);
+
+  return handleDeviceScopeSharedVariables(M, Kernels, LDSToFunction,
+                                          KernelToDirectLDS,
+                                          KernelToIndirectLDS, LDSToSize);
+}
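+// For illustration (hypothetical module): given kernels @foo and @baz and
+// internal LDS globals @SharedA (used by @foo) and @SharedC (used by @baz),
+// the driver below collects { @SharedA, @SharedC } and { @foo, @baz }, and
+// each kernel ends up with its own packed layout (@foo_LDSLayout,
+// @baz_LDSLayout).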
+static bool handleDeviceScopeSharedVariables(Module &M) {
+  // Collect all the (static) LDS globals defined within the current module.
+  SetVector<GlobalVariable *> LDSGlobals;
+  for (GlobalVariable &GV : M.globals())
+    if (GV.getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
+        GV.hasInternalLinkage())
+      LDSGlobals.insert(&GV);
+
+  if (LDSGlobals.empty()) {
+    LLVM_DEBUG(dbgs() << "No LDS globals defined in the module " << M.getName()
+                      << ", skipping handling of device scope shared variables"
+                      << "\n");
+    return false;
+  }
+
+  // Collect all the amdgpu kernels defined within the current module.
+  SetVector<Function *> Kernels;
+  for (Function &F : M.functions()) {
+    if ((F.getCallingConv() == CallingConv::AMDGPU_KERNEL) &&
+        !F.isDeclaration())
+      Kernels.insert(&F);
+  }
+
+  if (Kernels.empty()) {
+    LLVM_DEBUG(dbgs() << "No kernels defined in the module " << M.getName()
+                      << ", skipping handling of device scope shared variables"
+                      << "\n");
+    return false;
+  }
+
+  return handleDeviceScopeSharedVariables(M, LDSGlobals, Kernels);
+}
+
+bool AMDGPUDeviceScopeSharedVariable::runOnModule(Module &M) {
+  LLVM_DEBUG(dbgs() << "===== Handling device scope shared variables in the "
+                       "module "
+                    << M.getName() << "\n");
+
+  // TODO: We only want to handle HIP kernels, and no kernels from other
+  // programming languages, like OpenCL, OpenMP, etc. Do we need to add a
+  // condition here for it, and skip running the pass for non-HIP kernels?
+  if (skipModule(M)) {
+    LLVM_DEBUG(dbgs() << "Skipping handling of device scope shared variables "
+                         "in the module "
+                      << M.getName() << "\n");
+    return false;
+  }
+
+  bool Changed = handleDeviceScopeSharedVariables(M);
+
+  LLVM_DEBUG(dbgs() << "===== Done with handling device scope shared "
+                       "variables in the module "
+                    << M.getName() << "\n");
+
+  return Changed;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -40,6 +40,7 @@
   static bool EnableLateStructurizeCFG;
   static bool EnableFunctionCalls;
   static bool EnableFixedFunctionABI;
+  static bool EnableDeviceScopeSharedVariable;
 
   AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
                       StringRef FS, TargetOptions Options,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -198,6 +198,12 @@
     cl::desc("Enable workarounds for the StructurizeCFG pass"),
     cl::init(true), cl::Hidden);
 
+static cl::opt<bool, true> EnableDeviceScopeSharedVariable(
+    "amdgpu-enable-device-scope-shared-variable",
+    cl::desc("Support amdgpu device scope shared variables"),
+    cl::location(AMDGPUTargetMachine::EnableDeviceScopeSharedVariable),
+    cl::init(false), cl::Hidden);
+
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   // Register the target
   RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -265,6 +271,7 @@
   initializeGCNRegBankReassignPass(*PR);
   initializeGCNNSAReassignPass(*PR);
   initializeSIAddIMGInitPass(*PR);
+  initializeAMDGPUDeviceScopeSharedVariablePass(*PR);
 }
 
 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -394,6 +401,7 @@
 bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
 bool AMDGPUTargetMachine::EnableFunctionCalls = false;
 bool AMDGPUTargetMachine::EnableFixedFunctionABI = false;
+bool AMDGPUTargetMachine::EnableDeviceScopeSharedVariable = false;
 
 AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
 
@@ -771,6 +779,11 @@
   // but EarlyCSE can do neither of them.
   if (getOptLevel() != CodeGenOpt::None && EnableScalarIRPasses)
     addEarlyCSEOrGVNPass();
+
+  // This pass is expected to run as the last IR pass, so make sure it is
+  // added last.
+  if (EnableDeviceScopeSharedVariable)
+    addPass(createAMDGPUDeviceScopeSharedVariablePass());
 }
 
 void AMDGPUPassConfig::addCodeGenPrepare() {
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -48,6 +48,7 @@
   AMDGPUAtomicOptimizer.cpp
   AMDGPUCallLowering.cpp
   AMDGPUCodeGenPrepare.cpp
+  AMDGPUDeviceScopeSharedVariable.cpp
   AMDGPUExportClustering.cpp
   AMDGPUFixFunctionBitcasts.cpp
   AMDGPUFrameLowering.cpp