diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -267,6 +267,10 @@
 void initializeGCNNSAReassignPass(PassRegistry &);
 extern char &GCNNSAReassignID;
 
+ModulePass *createAMDGPUDeviceScopeSharedVariablePass();
+void initializeAMDGPUDeviceScopeSharedVariablePass(PassRegistry &);
+extern char &AMDGPUDeviceScopeSharedVariableID;
+
 namespace AMDGPU {
 enum TargetIndex {
   TI_CONSTDATA_START,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
@@ -123,13 +123,15 @@
   // should only appear when IPO passes manage to move LDS defined in a kernel
   // into a single user function.
 
-  for (GlobalVariable &GV : M.globals()) {
-    // TODO: Region address
-    unsigned AS = GV.getAddressSpace();
-    if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS)
-      continue;
-
-    recursivelyVisitUsers(GV, FuncsToAlwaysInline);
+  if (!AMDGPUTargetMachine::EnableDeviceScopeSharedVariable) {
+    for (GlobalVariable &GV : M.globals()) {
+      // TODO: Region address
+      unsigned AS = GV.getAddressSpace();
+      if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS)
+        continue;
+
+      recursivelyVisitUsers(GV, FuncsToAlwaysInline);
+    }
   }
 
   if (!AMDGPUTargetMachine::EnableFunctionCalls || StressCalls) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUDeviceScopeSharedVariable.cpp b/llvm/lib/Target/AMDGPU/AMDGPUDeviceScopeSharedVariable.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUDeviceScopeSharedVariable.cpp
@@ -0,0 +1,320 @@
+//===-- AMDGPUDeviceScopeSharedVariable.cpp -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO:
+//
+//===----------------------------------------------------------------------===//
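+
+// Illustrative background (editor's sketch, not part of the original patch):
+// in HIP, a "device scope shared variable" is a __shared__ variable defined
+// inside a device function rather than inside the kernel itself, e.g.:
+//
+//   __device__ void foo() {
+//     __shared__ int SharedMem[64]; // lowered to an LDS global owned by foo
+//   }
+//
+//   __global__ void kern() { foo(); } // kernel reaches the LDS indirectly
+//
+// Such globals are currently supported by force-inlining their enclosing
+// functions (see AMDGPUAlwaysInlinePass.cpp). Under
+// -amdgpu-enable-device-scope-shared-variable that inlining is skipped, and
+// this pass is expected to handle the globals instead.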
+
+#include "AMDGPU.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ValueMap.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+
+#define DEBUG_TYPE "amdgpu-device-scope-shared-variable"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUDeviceScopeSharedVariable : public ModulePass {
+public:
+  static char ID;
+
+  AMDGPUDeviceScopeSharedVariable() : ModulePass(ID) {
+    initializeAMDGPUDeviceScopeSharedVariablePass(
+        *PassRegistry::getPassRegistry());
+  }
+
+  bool runOnModule(Module &M) override;
+};
+
+} // namespace
+
+char AMDGPUDeviceScopeSharedVariable::ID = 0;
+
+char &llvm::AMDGPUDeviceScopeSharedVariableID =
+    AMDGPUDeviceScopeSharedVariable::ID;
+
+ModulePass *llvm::createAMDGPUDeviceScopeSharedVariablePass() {
+  return new AMDGPUDeviceScopeSharedVariable();
+}
+
+INITIALIZE_PASS_BEGIN(AMDGPUDeviceScopeSharedVariable,
+                      "amdgpu-device-scope-shared-variable",
+                      "Handle AMDGPU Device Scope Shared Variables",
+                      false /*only looks at the cfg*/, false /*analysis pass*/)
+INITIALIZE_PASS_DEPENDENCY(AMDGPUAlwaysInline)
+INITIALIZE_PASS_DEPENDENCY(SimpleInliner)
+INITIALIZE_PASS_END(AMDGPUDeviceScopeSharedVariable,
+                    "amdgpu-device-scope-shared-variable",
+                    "Handle AMDGPU Device Scope Shared Variables",
+                    false /*only looks at the cfg*/, false /*analysis pass*/)
+
+// Build the reverse map from each enclosing function to the set of LDS
+// globals defined within it.
+static void createFunctionToLDSMap(
+    ValueMap<const GlobalVariable *, const Function *> &LDSToFunction,
+    ValueMap<const Function *, SetVector<const GlobalVariable *>>
+        &FunctionToLDS) {
+  for (const auto &Entry : LDSToFunction) {
+    const GlobalVariable *LDSGlobal = Entry.first;
+    const Function *EnclosingFunction = Entry.second;
+    // operator[] default-constructs an empty set on first access.
+    FunctionToLDS[EnclosingFunction].insert(LDSGlobal);
+  }
+}
+
+static void pairUpKernelWithLDSList(
+    const Function *K,
+    ValueMap<const Function *, SetVector<const Function *>> &KernelToCallee,
+    ValueMap<const Function *, SetVector<const GlobalVariable *>>
+        &FunctionToLDS,
+    ValueMap<const Function *, SetVector<const GlobalVariable *>>
+        &KernelToExtendedLDS) {
+  // Set which holds all the LDS globals that are defined either directly
+  // within the kernel or indirectly within its callee(s).
+  SetVector<const GlobalVariable *> ExtendedLDSSet;
+
+  // Collect all the LDS globals defined directly within the kernel.
+  if (FunctionToLDS.find(K) != FunctionToLDS.end())
+    ExtendedLDSSet = FunctionToLDS[K];
+
+  // Collect all the LDS globals defined within the callee(s) of the kernel.
+  for (const Function *Callee : KernelToCallee[K]) {
+    auto It = FunctionToLDS.find(Callee);
+    if (It == FunctionToLDS.end())
+      continue;
+    for (const GlobalVariable *CalleeLDS : It->second)
+      ExtendedLDSSet.insert(CalleeLDS);
+  }
+
+  KernelToExtendedLDS[K] = ExtendedLDSSet;
+}
+
+static void pairUpKernelWithCalleeList(
+    Module &M, const Function *K,
+    ValueMap<const Function *, SetVector<const GlobalVariable *>>
+        &FunctionToLDS,
+    ValueMap<const Function *, SetVector<const Function *>> &KernelToCallee) {
+  // Get the call graph node associated with the current kernel, traverse the
+  // call graph rooted at it in DFS manner, and collect all the callees which
+  // define LDS global(s).
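+  //
+  // Illustrative example (editor's note, not from the original patch): if
+  // kernel K calls device function F, F calls G, and only G defines an LDS
+  // global, then the DFS below records {G} as K's callee set of interest.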
"Call graph node associated with kernel definition " + "cannot be null\n"); +#endif + + for (auto it = KernCGNode->begin(); it != KernCGNode->end(); ++it) { + const CallGraphNode *CGN = it->second; +#ifndef NDEBUG + assert(CGN && "Call graph node associated with function definition cannot" + " be null\n"); +#endif + CGNodeStack.push_back(CGN); + } + + SetVector CallieSet; + while (!CGNodeStack.empty()) { + const CallGraphNode *CGNode = CGNodeStack.pop_back_val(); + if (!Visited.insert(CGNode)) + continue; + + Function *F = CGNode->getFunction(); + if (!F || F->isDeclaration()) { +#ifndef NDEBUG + assert(CGNode->empty() && "Call graph node associated with function " + "declaration should not have callie list\n"); +#endif + continue; + } + + auto fit = FunctionToLDS.find(F); + if (fit != FunctionToLDS.end()) + CallieSet.insert(F); + + for (auto it = CGNode->begin(); it != CGNode->end(); ++it) { + const CallGraphNode *CGN = it->second; +#ifndef NDEBUG + assert(CGN && "Call graph node associated with function definition cannot" + " be null\n"); +#endif + CGNodeStack.push_back(CGN); + } + } + + KernelToCallie[K] = CallieSet; +} + +static void pairUpLDSGlobalWithEnclosingFunction( + const GlobalVariable *LDSGlobal, + ValueMap &LDSToFunction) { + // Recursively visit the user list of current LDS global, and find the + // enclosing function where the LDS global is defined, and the enclosing + // function should always be successfully found. + // + // TODO: Is there any other efficient way to find the enclosing functions of + // LDS globals? +#ifndef NDEBUG + assert(!LDSGlobal->user_empty() && + "LDS Global user list cannot be empty since it must have been defined " + "within either kernel or device function"); +#endif + SmallVector UserStack; + SetVector Visited; + + for (const User *U : LDSGlobal->users()) + UserStack.push_back(U); + + while (!UserStack.empty()) { + const User *U = UserStack.pop_back_val(); + if (!Visited.insert(U)) + continue; + + if (const Instruction *I = dyn_cast(U)) { + const Function *F = I->getParent()->getParent(); + if (F) { + LDSToFunction[LDSGlobal] = F; + return; + } + continue; + } + + for (const User *UU : U->users()) + UserStack.push_back(UU); + } +#ifndef NDEBUG + assert(false && "Control is not expected to reach this point"); +#endif +} + +static void +getLDSGlobalSizeInBytes(Module &M, const GlobalVariable *LDSGlobal, + ValueMap &LDSToSize) { + Type *Ty = LDSGlobal->getValueType(); + const DataLayout &DL = M.getDataLayout(); + uint64_t SizeInBytes = DL.getTypeSizeInBits(Ty).getFixedSize() / 8; + LDSToSize[LDSGlobal] = SizeInBytes; +} + +static bool handleDeviceScopeSharedVariables( + Module &M, + ValueMap &LDSToFunction, + ValueMap> + &FunctionToLDS, + ValueMap> &KernelToCallie, + ValueMap> + &KernelToExtendedLDS, + ValueMap &LDSToSize) { + return false; +} + +static bool handleDeviceScopeSharedVariables( + Module &M, const SetVector &LDSGlobals, + const SetVector &Kernels) { + // Pair up each LDS global with the enclosing function where the LDS global is + // defined + ValueMap LDSToFunction; + for (const GlobalVariable *LDSGlobal : LDSGlobals) + pairUpLDSGlobalWithEnclosingFunction(LDSGlobal, LDSToFunction); + + // Create reverse map from enclosing function to LDS global list + ValueMap> FunctionToLDS; + createFunctionToLDSMap(LDSToFunction, FunctionToLDS); + + // Pair up kernels with callie list which define LDS globals + ValueMap> KernelToCallie; + for (const Function *K : Kernels) + pairUpKernelWithCallieList(M, K, FunctionToLDS, KernelToCallie); + + // 
+  Type *Ty = LDSGlobal->getValueType();
+  const DataLayout &DL = M.getDataLayout();
+  uint64_t SizeInBytes = DL.getTypeSizeInBits(Ty).getFixedSize() / 8;
+  LDSToSize[LDSGlobal] = SizeInBytes;
+}
+
+static bool handleDeviceScopeSharedVariables(
+    Module &M,
+    ValueMap<const GlobalVariable *, const Function *> &LDSToFunction,
+    ValueMap<const Function *, SetVector<const GlobalVariable *>>
+        &FunctionToLDS,
+    ValueMap<const Function *, SetVector<const Function *>> &KernelToCallee,
+    ValueMap<const Function *, SetVector<const GlobalVariable *>>
+        &KernelToExtendedLDS,
+    ValueMap<const GlobalVariable *, uint64_t> &LDSToSize) {
+  // TODO: The actual transformation is not implemented yet; this overload
+  // currently only receives the analysis results collected above.
+  return false;
+}
+
+static bool handleDeviceScopeSharedVariables(
+    Module &M, const SetVector<const GlobalVariable *> &LDSGlobals,
+    const SetVector<const Function *> &Kernels) {
+  // Pair up each LDS global with the enclosing function where the LDS global
+  // is defined.
+  ValueMap<const GlobalVariable *, const Function *> LDSToFunction;
+  for (const GlobalVariable *LDSGlobal : LDSGlobals)
+    pairUpLDSGlobalWithEnclosingFunction(LDSGlobal, LDSToFunction);
+
+  // Create the reverse map from enclosing function to LDS global list.
+  ValueMap<const Function *, SetVector<const GlobalVariable *>> FunctionToLDS;
+  createFunctionToLDSMap(LDSToFunction, FunctionToLDS);
+
+  // Pair up each kernel with the list of callees which define LDS globals.
+  ValueMap<const Function *, SetVector<const Function *>> KernelToCallee;
+  for (const Function *K : Kernels)
+    pairUpKernelWithCalleeList(M, K, FunctionToLDS, KernelToCallee);
+
+  // Pair up each kernel with all the associated LDS globals, including those
+  // directly defined within the kernel and those indirectly defined within
+  // its callees.
+  ValueMap<const Function *, SetVector<const GlobalVariable *>>
+      KernelToExtendedLDS;
+  for (const Function *K : Kernels)
+    pairUpKernelWithLDSList(K, KernelToCallee, FunctionToLDS,
+                            KernelToExtendedLDS);
+
+  // Get the size of each LDS global in bytes.
+  ValueMap<const GlobalVariable *, uint64_t> LDSToSize;
+  for (const GlobalVariable *LDSGlobal : LDSGlobals)
+    getLDSGlobalSizeInBytes(M, LDSGlobal, LDSToSize);
+
+  return handleDeviceScopeSharedVariables(M, LDSToFunction, FunctionToLDS,
+                                          KernelToCallee, KernelToExtendedLDS,
+                                          LDSToSize);
+}
+
+static bool handleDeviceScopeSharedVariables(Module &M) {
+  // Collect all the (static) LDS globals defined within the current module.
+  SetVector<const GlobalVariable *> LDSGlobals;
+  for (GlobalVariable &GV : M.globals())
+    if (GV.getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
+        GV.hasInternalLinkage())
+      LDSGlobals.insert(&GV);
+
+  if (LDSGlobals.empty()) {
+    LLVM_DEBUG(dbgs() << "No LDS globals defined in the module " << M.getName()
+                      << ", skipping handling of device scope shared variables"
+                      << "\n");
+    return false;
+  }
+
+  // Collect all the amdgpu kernels defined within the current module.
+  SetVector<const Function *> Kernels;
+  for (Function &F : M.functions())
+    if (F.getCallingConv() == CallingConv::AMDGPU_KERNEL && !F.isDeclaration())
+      Kernels.insert(&F);
+
+  if (Kernels.empty()) {
+    LLVM_DEBUG(dbgs() << "No kernels defined in the module " << M.getName()
+                      << ", skipping handling of device scope shared variables"
+                      << "\n");
+    return false;
+  }
+
+  return handleDeviceScopeSharedVariables(M, LDSGlobals, Kernels);
+}
+
+bool AMDGPUDeviceScopeSharedVariable::runOnModule(Module &M) {
+  LLVM_DEBUG(dbgs() << "===== Handling device scope shared variables in the "
+                       "module "
+                    << M.getName() << "\n");
+
+  // TODO: We only want to handle HIP kernels, and no kernels from other
+  // programming languages, like OpenCL, OpenMP, etc. Do we need to add a
+  // condition here for it, and skip running the pass for non-HIP kernels?
+  if (skipModule(M)) {
+    LLVM_DEBUG(dbgs() << "Skipping handling of device scope shared variables "
+                         "in the module "
+                      << M.getName() << "\n");
+    return false;
+  }
+
+  bool Changed = handleDeviceScopeSharedVariables(M);
+
+  LLVM_DEBUG(dbgs() << "===== Done with handling device scope shared "
+                       "variables in the module "
+                    << M.getName() << "\n");
+
+  return Changed;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -40,6 +40,7 @@
   static bool EnableLateStructurizeCFG;
   static bool EnableFunctionCalls;
   static bool EnableFixedFunctionABI;
+  static bool EnableDeviceScopeSharedVariable;
 
   AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
                       StringRef FS, TargetOptions Options,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -198,6 +198,12 @@
     cl::desc("Enable workarounds for the StructurizeCFG pass"),
     cl::init(true), cl::Hidden);
 
+static cl::opt<bool, true> EnableDeviceScopeSharedVariable(
+    "amdgpu-enable-device-scope-shared-variable",
+    cl::desc("Support amdgpu device scope shared variables"),
+    cl::location(AMDGPUTargetMachine::EnableDeviceScopeSharedVariable),
+    cl::init(false), cl::Hidden);
+
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   // Register the target
   RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -264,6 +270,7 @@
   initializeGCNRegBankReassignPass(*PR);
   initializeGCNNSAReassignPass(*PR);
   initializeSIAddIMGInitPass(*PR);
+  initializeAMDGPUDeviceScopeSharedVariablePass(*PR);
 }
 
 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -393,6 +400,7 @@
 bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
 bool AMDGPUTargetMachine::EnableFunctionCalls = false;
 bool AMDGPUTargetMachine::EnableFixedFunctionABI = false;
+bool AMDGPUTargetMachine::EnableDeviceScopeSharedVariable = false;
 
 AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
 
@@ -770,6 +778,11 @@
   // but EarlyCSE can do neither of them.
   if (getOptLevel() != CodeGenOpt::None && EnableScalarIRPasses)
     addEarlyCSEOrGVNPass();
+
+  // This pass is expected to run as the last IR pass, so keep it at the end
+  // of addIRPasses.
+  if (EnableDeviceScopeSharedVariable)
+    addPass(createAMDGPUDeviceScopeSharedVariablePass());
 }
 
 void AMDGPUPassConfig::addCodeGenPrepare() {
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -48,6 +48,7 @@
   AMDGPUAtomicOptimizer.cpp
   AMDGPUCallLowering.cpp
   AMDGPUCodeGenPrepare.cpp
+  AMDGPUDeviceScopeSharedVariable.cpp
   AMDGPUExportClustering.cpp
   AMDGPUFixFunctionBitcasts.cpp
   AMDGPUFrameLowering.cpp
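
Note (editor's addition, illustrative only): the pass is off by default. With this
patch applied, it can be exercised through the new hidden flag, e.g.

  llc -march=amdgcn -amdgpu-enable-device-scope-shared-variable input.ll

which makes AMDGPUAlwaysInline skip the LDS-driven force-inlining and schedules
this pass at the end of addIRPasses. In its current form the pass only builds the
bookkeeping maps (LDS global to enclosing function, kernel to LDS-defining
callees, LDS global to size) and makes no IR changes; the lowering itself is
still marked TODO.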