Index: include/llvm/CodeGen/TargetPassConfig.h
===================================================================
--- include/llvm/CodeGen/TargetPassConfig.h
+++ include/llvm/CodeGen/TargetPassConfig.h
@@ -355,6 +355,10 @@
   /// instructions in SSA form.
   virtual void addMachineSSAOptimization();

+  /// Add passes that lower variables to a particular memory type. Unlike
+  /// the machine SSA optimizations, this lowering is required for
+  /// correctness and therefore also runs when optimizations are disabled.
+  virtual void addMachineSSALowering() {}
+
   /// Add passes that optimize instruction level parallelism for out-of-order
   /// targets. These passes are run while the machine code is still in SSA
   /// form, so they can use MachineTraceMetrics to control their heuristics.
Index: lib/CodeGen/TargetPassConfig.cpp
===================================================================
--- lib/CodeGen/TargetPassConfig.cpp
+++ lib/CodeGen/TargetPassConfig.cpp
@@ -813,6 +813,11 @@
   if (getOptLevel() != CodeGenOpt::None) {
     addMachineSSAOptimization();
   } else {
+    // Ensure lowering to the appropriate memory type occurs even when no
+    // optimizations are enabled. This type of lowering is required for
+    // correctness by the NVPTX backend.
+    addMachineSSALowering();
+
    // If the target requests it, assign local variables to stack slots relative
    // to one another and simplify frame index references where possible.
    addPass(&LocalStackSlotAllocationID, false);
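To make the correctness requirement concrete, here is a reduced sketch of the kind of kernel this hook exists for (mirroring the tests added at the end of this patch; names are illustrative). The alloca's address escapes into a callee that may run on other threads of the team, so even at -O0 the variable must be placed in the shared depot rather than on the per-thread local stack:

  ; Sketch: an escaping alloca that needs shared lowering even at -O0.
  target triple = "nvptx64-unknown-unknown"

  define void @kernel() #0 {
    %shared_args = alloca i8**, align 8   ; address escapes to @callee
    call void @callee(i8*** %shared_args) ; callee may run on other threads
    ret void
  }

  declare void @callee(i8***)

  attributes #0 = {"has-nvptx-shared-depot"}

  !nvvm.annotations = !{!0}
  !0 = !{void ()* @kernel, !"kernel", i32 1}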
Index: lib/Target/NVPTX/CMakeLists.txt
===================================================================
--- lib/Target/NVPTX/CMakeLists.txt
+++ lib/Target/NVPTX/CMakeLists.txt
@@ -24,11 +24,13 @@
   NVPTXPeephole.cpp
   NVPTXMCExpr.cpp
   NVPTXPrologEpilogPass.cpp
+  NVPTXLowerSharedFrameIndicesPass.cpp
   NVPTXRegisterInfo.cpp
   NVPTXReplaceImageHandles.cpp
   NVPTXSubtarget.cpp
   NVPTXTargetMachine.cpp
   NVPTXTargetTransformInfo.cpp
+  NVPTXFunctionDataSharing.cpp
   NVPTXUtilities.cpp
   NVVMIntrRange.cpp
   NVVMReflect.cpp
Index: lib/Target/NVPTX/NVPTX.h
===================================================================
--- lib/Target/NVPTX/NVPTX.h
+++ lib/Target/NVPTX/NVPTX.h
@@ -48,10 +48,12 @@
 FunctionPass *createNVVMIntrRangePass(unsigned int SmVersion);
 FunctionPass *createNVVMReflectPass();
 MachineFunctionPass *createNVPTXPrologEpilogPass();
+MachineFunctionPass *createNVPTXLowerSharedFrameIndicesPass();
 MachineFunctionPass *createNVPTXReplaceImageHandlesPass();
 FunctionPass *createNVPTXImageOptimizerPass();
 FunctionPass *createNVPTXLowerArgsPass(const NVPTXTargetMachine *TM);
 BasicBlockPass *createNVPTXLowerAllocaPass();
+FunctionPass *createNVPTXFunctionDataSharingPass(const NVPTXTargetMachine *TM);
 MachineFunctionPass *createNVPTXPeephole();

 Target &getTheNVPTXTarget32();
Index: lib/Target/NVPTX/NVPTXAsmPrinter.cpp
===================================================================
--- lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -92,6 +92,7 @@
 using namespace llvm;

 #define DEPOTNAME "__local_depot"
+#define SHARED_DEPOTNAME "__shared_depot"

 static cl::opt<bool>
 EmitLineNumbers("nvptx-emit-line-numbers", cl::Hidden,
@@ -1721,6 +1722,10 @@
   // virtual register number starting from 1 with that class.
   const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
   //unsigned numRegClasses = TRI->getNumRegClasses();
+  bool IsKernelFunction = isKernelFunction(MF.getFunction());
+
+  bool GenerateSharedDepot =
+      MF.getFunction().hasFnAttribute("has-nvptx-shared-depot");

   // Emit the Fake Stack Object
   const MachineFrameInfo &MFI = MF.getFrameInfo();
@@ -1728,12 +1733,25 @@
   if (NumBytes) {
     O << "\t.local .align " << MFI.getMaxAlignment() << " .b8 \t" << DEPOTNAME
       << getFunctionNumber() << "[" << NumBytes << "];\n";
+    if (IsKernelFunction && GenerateSharedDepot) {
+      O << "\t.shared .align " << MFI.getMaxAlignment()
+        << " .b8 \t" << SHARED_DEPOTNAME << getFunctionNumber()
+        << "[" << NumBytes << "];\n";
+    }
     if (static_cast<const NVPTXTargetMachine &>(MF.getTarget()).is64Bit()) {
       O << "\t.reg .b64 \t%SP;\n";
       O << "\t.reg .b64 \t%SPL;\n";
+      if (IsKernelFunction && GenerateSharedDepot) {
+        O << "\t.reg .b64 \t%SPS;\n";
+        O << "\t.reg .b64 \t%SPSH;\n";
+      }
     } else {
       O << "\t.reg .b32 \t%SP;\n";
       O << "\t.reg .b32 \t%SPL;\n";
+      if (IsKernelFunction && GenerateSharedDepot) {
+        O << "\t.reg .b32 \t%SPS;\n";
+        O << "\t.reg .b32 \t%SPSH;\n";
+      }
     }
   }

@@ -2362,6 +2380,8 @@
     if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
       if (MO.getReg() == NVPTX::VRDepot)
         O << DEPOTNAME << getFunctionNumber();
+      else if (MO.getReg() == NVPTX::VRSharedDepot)
+        O << SHARED_DEPOTNAME << getFunctionNumber();
       else
         O << NVPTXInstPrinter::getRegisterName(MO.getReg());
     } else {
Index: lib/Target/NVPTX/NVPTXFrameLowering.cpp
===================================================================
--- lib/Target/NVPTX/NVPTXFrameLowering.cpp
+++ lib/Target/NVPTX/NVPTXFrameLowering.cpp
@@ -16,6 +16,7 @@
 #include "NVPTXRegisterInfo.h"
 #include "NVPTXSubtarget.h"
 #include "NVPTXTargetMachine.h"
+#include "NVPTXUtilities.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -61,6 +62,37 @@
     BuildMI(MBB, MI, dl,
             MF.getSubtarget().getInstrInfo()->get(MovDepotOpcode),
             NVPTX::VRFrameLocal)
         .addImm(MF.getFunctionNumber());
+
+    bool SharedStackPointerInit =
+        MF.getFunction().hasFnAttribute("has-nvptx-shared-depot");
+
+    // Only emit a shared depot for the main kernel function. The other
+    // device functions need to get a handle on this shared depot by
+    // interacting with a runtime library.
+    //
+    // Clang can trigger this at any point via the has-nvptx-shared-depot
+    // function attribute.
+    //
+    // Currently this situation arises as a consequence of OpenMP semantics.
+    // The interaction in that case involves the OpenMP runtime.
+    if (isKernelFunction(MF.getFunction()) && SharedStackPointerInit) {
+      // Emits:
+      //   mov %SPSH, __shared_depot<N>;
+      //   cvta.shared %SPS, %SPSH;
+      // For the time being, emit this even if it ends up unused.
+      unsigned CvtaSharedOpcode =
+          Is64Bit ? NVPTX::cvta_shared_yes_64 : NVPTX::cvta_shared_yes;
+      unsigned MovSharedDepotOpcode =
+          Is64Bit ? NVPTX::MOV_SHARED_DEPOT_ADDR_64 : NVPTX::MOV_SHARED_DEPOT_ADDR;
+      MI = BuildMI(MBB, MI, dl,
+                   MF.getSubtarget().getInstrInfo()->get(CvtaSharedOpcode),
+                   NVPTX::VRShared)
+               .addReg(NVPTX::VRFrameShared);
+      BuildMI(MBB, MI, dl,
+              MF.getSubtarget().getInstrInfo()->get(MovSharedDepotOpcode),
+              NVPTX::VRFrameShared)
+          .addImm(MF.getFunctionNumber());
+    }
   }
 }
Index: lib/Target/NVPTX/NVPTXFunctionDataSharing.h
===================================================================
--- /dev/null
+++ lib/Target/NVPTX/NVPTXFunctionDataSharing.h
@@ -0,0 +1,37 @@
+//===-- NVPTXFunctionDataSharing.h - NVPTX data sharing pass ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Declares the NVPTX pass that marks escaping allocas as shared. See
+// NVPTXFunctionDataSharing.cpp for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXFUNCTIONDATASHARING_H
+#define LLVM_LIB_TARGET_NVPTX_NVPTXFUNCTIONDATASHARING_H
+
+#include "NVPTXTargetMachine.h"
+#include "llvm/Pass.h"
+
+namespace llvm {
+
+class NVPTXFunctionDataSharing : public FunctionPass {
+  bool runOnFunction(Function &F) override;
+  bool runOnKernelFunction(Function &F);
+  bool runOnDeviceFunction(Function &F);
+
+public:
+  static char ID; // Pass identification, replacement for typeid
+  NVPTXFunctionDataSharing(const NVPTXTargetMachine *TM = nullptr)
+      : FunctionPass(ID), TM(TM) {}
+  StringRef getPassName() const override {
+    return "Function level data sharing pass.";
+  }
+
+private:
+  const NVPTXTargetMachine *TM;
+};
+} // End llvm namespace
+
+#endif
\ No newline at end of file
Index: lib/Target/NVPTX/NVPTXFunctionDataSharing.cpp
===================================================================
--- /dev/null
+++ lib/Target/NVPTX/NVPTXFunctionDataSharing.cpp
@@ -0,0 +1,127 @@
+//===-- NVPTXFunctionDataSharing.cpp - Mark pointers as shared ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// For each alloca instruction whose address escapes, add a pair of address
+// space casts through the shared address space. For example,
+//
+//   %A = alloca i32
+//   store i32 0, i32* %A ; emits st.u32
+//
+// will be transformed to
+//
+//   %A = alloca i32
+//   %A1 = addrspacecast i32* %A to i32 addrspace(3)*
+//   %A2 = addrspacecast i32 addrspace(3)* %A1 to i32*
+//   store i32 0, i32* %A2 ; emits st.u32
+//
+// We then rely on NVPTXInferAddressSpaces to fold the second cast into the
+// users, yielding
+//
+//   store i32 0, i32 addrspace(3)* %A1 ; emits st.shared.u32
+//
+// This pass is invoked for -O0 only.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "NVPTXUtilities.h"
+#include "NVPTXTargetMachine.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+namespace llvm {
+void initializeNVPTXFunctionDataSharingPass(PassRegistry &);
+}
+
+namespace {
+class NVPTXFunctionDataSharing : public FunctionPass {
+  bool runOnFunction(Function &F) override;
+  bool runOnKernelFunction(Function &F);
+  bool runOnDeviceFunction(Function &F);
+
+public:
+  static char ID; // Pass identification, replacement for typeid
+  NVPTXFunctionDataSharing(const NVPTXTargetMachine *TM = nullptr)
+      : FunctionPass(ID) {}
+  StringRef getPassName() const override {
+    return "Function level data sharing pass.";
+  }
+};
+} // namespace
+
+char NVPTXFunctionDataSharing::ID = 1;
+
+INITIALIZE_PASS(NVPTXFunctionDataSharing, "nvptx-function-data-sharing",
+                "Function Data Sharing (NVPTX)", false, false)
+
+static void markPointerAsShared(Value *Ptr) {
+  if (Ptr->getType()->getPointerAddressSpace() == ADDRESS_SPACE_SHARED)
+    return;
+
+  // Decide where to emit the addrspacecast pair: insert right after Ptr if
+  // Ptr is an instruction.
+  BasicBlock::iterator InsertPt =
+      std::next(cast<Instruction>(Ptr)->getIterator());
+  assert(InsertPt != InsertPt->getParent()->end() &&
+         "We don't call this function with Ptr being a terminator.");
+
+  auto *PtrInShared = new AddrSpaceCastInst(
+      Ptr, PointerType::get(Ptr->getType()->getPointerElementType(),
+                            ADDRESS_SPACE_SHARED),
+      Ptr->getName(), &*InsertPt);
+  auto *PtrInGeneric = new AddrSpaceCastInst(PtrInShared, Ptr->getType(),
+                                             Ptr->getName(), &*InsertPt);
+  // Replace all uses of Ptr with PtrInGeneric, except PtrInShared itself.
+  Ptr->replaceAllUsesWith(PtrInGeneric);
+  PtrInShared->setOperand(0, Ptr);
+}
+
+// =============================================================================
+// Main function for this pass.
+// =============================================================================
+bool NVPTXFunctionDataSharing::runOnKernelFunction(Function &F) {
+  bool Modified = false;
+
+  // Skip pass if no data sharing is required.
+  if (!F.hasFnAttribute("has-nvptx-shared-depot"))
+    return Modified;
+
+  for (auto &B : F) {
+    for (auto &I : B) {
+      auto *AI = dyn_cast<AllocaInst>(&I);
+      if (!AI)
+        continue;
+      if (AI->getType()->isPointerTy() && ptrIsStored(AI)) {
+        markPointerAsShared(AI);
+        Modified = true;
+      }
+    }
+  }
+
+  return Modified;
+}
+
+// Device functions are left unchanged: they obtain a handle on the kernel's
+// shared depot through the runtime instead.
+bool NVPTXFunctionDataSharing::runOnDeviceFunction(Function &F) {
+  return false;
+}
+
+bool NVPTXFunctionDataSharing::runOnFunction(Function &F) {
+  return isKernelFunction(F) ? runOnKernelFunction(F) : runOnDeviceFunction(F);
+}
+
+FunctionPass *
+llvm::createNVPTXFunctionDataSharingPass(const NVPTXTargetMachine *TM) {
+  return new NVPTXFunctionDataSharing(TM);
}
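In short, the pass rewrites only those allocas whose address may be captured; everything else is left to the usual local lowering. A sketch of its effect on hypothetical input (value names are illustrative; the actual names are derived from the alloca's name):

Before:

  %Esc = alloca i32
  %Tmp = alloca i32
  call void @use(i32* %Esc)   ; may capture %Esc
  store i32 1, i32* %Tmp      ; does not capture %Tmp

After the pass (NVPTXInferAddressSpaces not yet run):

  %Esc = alloca i32
  %Esc1 = addrspacecast i32* %Esc to i32 addrspace(3)*
  %Esc2 = addrspacecast i32 addrspace(3)* %Esc1 to i32*
  %Tmp = alloca i32
  call void @use(i32* %Esc2)
  store i32 1, i32* %Tmp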
Index: lib/Target/NVPTX/NVPTXInstrInfo.td
===================================================================
--- lib/Target/NVPTX/NVPTXInstrInfo.td
+++ lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1583,6 +1583,10 @@
                                "mov.u32 \t$d, __local_depot$num;", []>;
   def MOV_DEPOT_ADDR_64 : NVPTXInst<(outs Int64Regs:$d), (ins i32imm:$num),
                                "mov.u64 \t$d, __local_depot$num;", []>;
+  def MOV_SHARED_DEPOT_ADDR : NVPTXInst<(outs Int32Regs:$d), (ins i32imm:$num),
+                               "mov.u32 \t$d, __shared_depot$num;", []>;
+  def MOV_SHARED_DEPOT_ADDR_64 : NVPTXInst<(outs Int64Regs:$d), (ins i32imm:$num),
+                               "mov.u64 \t$d, __shared_depot$num;", []>;
 }
Index: lib/Target/NVPTX/NVPTXLowerAlloca.cpp
===================================================================
--- lib/Target/NVPTX/NVPTXLowerAlloca.cpp
+++ lib/Target/NVPTX/NVPTXLowerAlloca.cpp
@@ -23,6 +23,9 @@
 // And we will rely on NVPTXInferAddressSpaces to combine the last two
 // instructions.
 //
+// In the case of OpenMP shared variables, perform the same transformation as
+// for local variables but using the shared address space.
+//
 //===----------------------------------------------------------------------===//

 #include "NVPTX.h"
@@ -71,13 +74,36 @@
       Changed = true;
       auto PTy = dyn_cast<PointerType>(allocaInst->getType());
       auto ETy = PTy->getElementType();
-      auto LocalAddrTy = PointerType::get(ETy, ADDRESS_SPACE_LOCAL);
-      auto NewASCToLocal = new AddrSpaceCastInst(allocaInst, LocalAddrTy, "");
-      auto GenericAddrTy = PointerType::get(ETy, ADDRESS_SPACE_GENERIC);
+
+      // In the CUDA case this is always a local address. When offloading to a
+      // device with OpenMP, it may instead be an address in the device's
+      // shared memory.
+      auto *AddrTy = PointerType::get(ETy, ADDRESS_SPACE_LOCAL);
+      bool PtrIsStored = ptrIsStored(allocaInst);
+      bool RequiresSharedMemory =
+          BB.getParent()->hasFnAttribute("has-nvptx-shared-depot");
+
+      // Handle shared arguments: currently shared arguments are declared as
+      // allocas during LLVM IR code generation and lowered to shared memory.
+      if (PtrIsStored && RequiresSharedMemory)
+        AddrTy = PointerType::get(ETy, ADDRESS_SPACE_SHARED);
+
+      auto NewASCToLocal = new AddrSpaceCastInst(allocaInst, AddrTy, "");
+      auto *GenericAddrTy = PointerType::get(ETy, ADDRESS_SPACE_GENERIC);
       auto NewASCToGeneric = new AddrSpaceCastInst(NewASCToLocal,
                                                    GenericAddrTy, "");
       NewASCToLocal->insertAfter(allocaInst);
       NewASCToGeneric->insertAfter(NewASCToLocal);
+
+      // If the alloca is shared, rewriting individual loads, stores and GEPs
+      // below is not sufficient for correctness: route every remaining use
+      // through the generic pointer so that no use can observe the original
+      // address.
+      if (PtrIsStored && RequiresSharedMemory) {
+        allocaInst->replaceAllUsesWith(NewASCToGeneric);
+        NewASCToLocal->setOperand(0, allocaInst);
+        continue;
+      }
+
       for (Value::use_iterator UI = allocaInst->use_begin(),
                                 UE = allocaInst->use_end();
            UI != UE; ) {
@@ -93,9 +119,15 @@
           continue;
         }
         auto SI = dyn_cast<StoreInst>(AllocaUse.getUser());
-        if (SI && SI->getPointerOperand() == allocaInst && !SI->isVolatile()) {
-          SI->setOperand(SI->getPointerOperandIndex(), NewASCToGeneric);
-          continue;
+        if (SI && !SI->isVolatile()) {
+          unsigned Idx;
+          if (SI->getPointerOperand() == allocaInst)
+            Idx = SI->getPointerOperandIndex();
+          else if (SI->getValueOperand() == allocaInst)
+            Idx = 0;
+          else
+            continue;
+          SI->setOperand(Idx, NewASCToGeneric);
+          continue;
         }
         auto GI = dyn_cast<GetElementPtrInst>(AllocaUse.getUser());
         if (GI && GI->getPointerOperand() == allocaInst) {
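The upshot of the two paths above, as a sketch on hypothetical input (kernel carries has-nvptx-shared-depot; value numbering is illustrative): a captured alloca is cast through the shared address space and all of its uses are routed through the generic pointer, while a private alloca keeps the existing local-space treatment, with only recognized users such as loads, stores and GEPs rewritten:

  %Esc = alloca i32                                      ; captured
  %1 = addrspacecast i32* %Esc to i32 addrspace(3)*      ; shared
  %2 = addrspacecast i32 addrspace(3)* %1 to i32*
  %Tmp = alloca i32                                      ; private
  %3 = addrspacecast i32* %Tmp to i32 addrspace(5)*      ; local
  %4 = addrspacecast i32 addrspace(5)* %3 to i32*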
Index: lib/Target/NVPTX/NVPTXLowerSharedFrameIndicesPass.cpp
===================================================================
--- /dev/null
+++ lib/Target/NVPTX/NVPTXLowerSharedFrameIndicesPass.cpp
@@ -0,0 +1,291 @@
+//===-- NVPTXLowerSharedFrameIndicesPass.cpp - NVPTX lowering ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers frame indices that must reference the shared memory depot
+// to the shared frame virtual register VRShared. The lowering is only
+// performed for functions that carry the has-nvptx-shared-depot attribute.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "NVPTXUtilities.h"
+#include "NVPTXRegisterInfo.h"
+#include "NVPTXSubtarget.h"
+#include "NVPTXTargetMachine.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "nvptx-lower-shared-frame-indices"
+
+namespace {
+class NVPTXLowerSharedFrameIndicesPass : public MachineFunctionPass {
+public:
+  static char ID;
+  NVPTXLowerSharedFrameIndicesPass() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+  void calculateSharedFrameObjectOffsets(MachineFunction &Fn);
+};
+} // namespace
+
+MachineFunctionPass *llvm::createNVPTXLowerSharedFrameIndicesPass() {
+  return new NVPTXLowerSharedFrameIndicesPass();
+}
+
+char NVPTXLowerSharedFrameIndicesPass::ID = 0;
+
+/// Returns true if the register defined by this instruction is converted to a
+/// shared address somewhere else in the function.
+static bool isSharedFrame(MachineBasicBlock::iterator II,
+                          MachineFunction &MF) {
+  MachineInstr &CurrentMI = *II;
+
+  if (!CurrentMI.getOperand(0).isReg())
+    return false;
+
+  unsigned AllocRegisterNumber = CurrentMI.getOperand(0).getReg();
+
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      if (MI.getOpcode() == NVPTX::cvta_to_shared_yes_64 ||
+          MI.getOpcode() == NVPTX::cvta_to_shared_yes) {
+        if (AllocRegisterNumber == MI.getOperand(1).getReg())
+          return true;
+      }
+    }
+  }
+  return false;
+}
+
+bool NVPTXLowerSharedFrameIndicesPass::runOnMachineFunction(
+    MachineFunction &MF) {
+  bool Modified = false;
+
+  // Skip this pass for device functions: only the kernel owns a shared depot.
+  if (!isKernelFunction(MF.getFunction()))
+    return Modified;
+
+  // Skip this pass if no data sharing is required.
+  if (!MF.getFunction().hasFnAttribute("has-nvptx-shared-depot"))
+    return Modified;
+
+  SmallVector<int, 16> SharedFrameIndices;
+
+  calculateSharedFrameObjectOffsets(MF);
+
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+        if (!MI.getOperand(i).isFI())
+          continue;
+
+        // A frame index operand is always followed by its offset operand.
+        if (i + 1 >= MI.getNumOperands())
+          continue;
+
+        bool IsSharedFrame = false;
+        int FrameIndex = MI.getOperand(i).getIndex();
+
+        for (int SFI : SharedFrameIndices)
+          if (FrameIndex == SFI)
+            IsSharedFrame = true;
+
+        if (!IsSharedFrame && isSharedFrame(MI, MF)) {
+          SharedFrameIndices.push_back(FrameIndex);
+          IsSharedFrame = true;
+        }
+
+        if (IsSharedFrame) {
+          // Change the frame index to use the shared stack: for shared data
+          // use the appropriate virtual register, VRShared.
+          int Offset = MF.getFrameInfo().getObjectOffset(FrameIndex) +
+                       MI.getOperand(i + 1).getImm();
+
+          MI.getOperand(i).ChangeToRegister(NVPTX::VRShared, false);
+          MI.getOperand(i + 1).ChangeToImmediate(Offset);
+          Modified = true;
+        }
+      }
+    }
+  }
+
+  return Modified;
+}
+
+/// AdjustStackOffset - Helper function used to adjust the stack frame offset.
+static inline void
+AdjustStackOffset(MachineFrameInfo &MFI, int FrameIdx,
+                  bool StackGrowsDown, int64_t &Offset,
+                  unsigned &MaxAlign) {
+  // If the stack grows down, add the object size to find the lowest address.
+  if (StackGrowsDown)
+    Offset += MFI.getObjectSize(FrameIdx);
+
+  unsigned Align = MFI.getObjectAlignment(FrameIdx);
+
+  // If the alignment of this object is greater than that of the stack, then
+  // increase the stack alignment to match.
+  MaxAlign = std::max(MaxAlign, Align);
+
+  // Adjust to alignment boundary.
+  Offset = (Offset + Align - 1) / Align * Align;
+
+  if (StackGrowsDown) {
+    DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << -Offset << "]\n");
+    MFI.setObjectOffset(FrameIdx, -Offset); // Set the computed offset
+  } else {
+    DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << Offset << "]\n");
+    MFI.setObjectOffset(FrameIdx, Offset);
+    Offset += MFI.getObjectSize(FrameIdx);
+  }
+}
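+
+// Worked example for AdjustStackOffset above (illustrative values): with
+// StackGrowsDown, an 8-byte object with 8-byte alignment arriving at a
+// running Offset of 12 first grows the offset past the object
+// (12 + 8 = 20), rounds it up to the alignment boundary
+// ((20 + 8 - 1) / 8 * 8 = 24), and records the object at SP[-24]; the
+// running offset then continues from 24.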
+
+/// This function computes the offset inside the shared stack.
+///
+/// TODO: For simplicity the shared offsets currently coincide with the local
+/// stack frame offsets -- the shared and local frames are laid out
+/// identically.
+void
+NVPTXLowerSharedFrameIndicesPass::calculateSharedFrameObjectOffsets(
+    MachineFunction &Fn) {
+  const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering();
+  const TargetRegisterInfo *RegInfo = Fn.getSubtarget().getRegisterInfo();
+
+  bool StackGrowsDown =
+      TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown;
+
+  // Loop over all of the stack objects, assigning sequential addresses...
+  MachineFrameInfo &MFI = Fn.getFrameInfo();
+
+  // Start at the beginning of the local area.
+  // The Offset is the distance from the stack top in the direction
+  // of stack growth -- so it's always nonnegative.
+  int LocalAreaOffset = TFI.getOffsetOfLocalArea();
+  if (StackGrowsDown)
+    LocalAreaOffset = -LocalAreaOffset;
+  assert(LocalAreaOffset >= 0 &&
+         "Local area offset should be in direction of stack growth");
+  int64_t Offset = LocalAreaOffset;
+
+  // If there are fixed sized objects that are preallocated in the local area,
+  // non-fixed objects can't be allocated right at the start of local area.
+  // We currently don't support filling in holes in between fixed sized
+  // objects, so we adjust 'Offset' to point to the end of last fixed sized
+  // preallocated object.
+  for (int i = MFI.getObjectIndexBegin(); i != 0; ++i) {
+    int64_t FixedOff;
+    if (StackGrowsDown) {
+      // The maximum distance from the stack pointer is at lower address of
+      // the object -- which is given by offset. For down growing stack
+      // the offset is negative, so we negate the offset to get the distance.
+      FixedOff = -MFI.getObjectOffset(i);
+    } else {
+      // The maximum distance from the start pointer is at the upper
+      // address of the object.
+      FixedOff = MFI.getObjectOffset(i) + MFI.getObjectSize(i);
+    }
+    if (FixedOff > Offset) Offset = FixedOff;
+  }
+
+  // NOTE: We do not have a call stack.
+
+  unsigned MaxAlign = MFI.getMaxAlignment();
+
+  // No scavenger.
+
+  // FIXME: Once this is working, the enable flag will change to a target
+  // check for whether the frame is large enough to want to use virtual
+  // frame index registers. Functions which don't want/need this optimization
+  // will continue to use the existing code path.
+  if (MFI.getUseLocalStackAllocationBlock()) {
+    unsigned Align = MFI.getLocalFrameMaxAlign();
+
+    // Adjust to alignment boundary.
+    Offset = (Offset + Align - 1) / Align * Align;
+
+    DEBUG(dbgs() << "Local frame base offset: " << Offset << "\n");
+
+    // Resolve offsets for objects in the local block.
+    for (unsigned i = 0, e = MFI.getLocalFrameObjectCount(); i != e; ++i) {
+      std::pair<int, int64_t> Entry = MFI.getLocalFrameObjectMap(i);
+      int64_t FIOffset = (StackGrowsDown ? -Offset : Offset) + Entry.second;
+      DEBUG(dbgs() << "alloc FI(" << Entry.first << ") at SP[" << FIOffset
+                   << "]\n");
+      MFI.setObjectOffset(Entry.first, FIOffset);
+    }
+    // Allocate the local block.
+    Offset += MFI.getLocalFrameSize();
+
+    MaxAlign = std::max(Align, MaxAlign);
+  }
+
+  // No stack protector.
+
+  // Then assign frame offsets to stack objects that are not used to spill
+  // callee saved registers.
+  for (unsigned i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i) {
+    if (MFI.isObjectPreAllocated(i) && MFI.getUseLocalStackAllocationBlock())
+      continue;
+    if (MFI.isDeadObjectIndex(i))
+      continue;
+
+    AdjustStackOffset(MFI, i, StackGrowsDown, Offset, MaxAlign);
+  }
+
+  // No scavenger.
+
+  if (!TFI.targetHandlesStackFrameRounding()) {
+    // If we have reserved argument space for call sites in the function
+    // immediately on entry to the current function, count it as part of the
+    // overall stack size.
+    if (MFI.adjustsStack() && TFI.hasReservedCallFrame(Fn))
+      Offset += MFI.getMaxCallFrameSize();
+
+    // Round up the size to a multiple of the alignment. If the function has
+    // any calls or alloca's, align to the target's StackAlignment value to
+    // ensure that the callee's frame or the alloca data is suitably aligned;
+    // otherwise, for leaf functions, align to the TransientStackAlignment
+    // value.
+    unsigned StackAlign;
+    if (MFI.adjustsStack() || MFI.hasVarSizedObjects() ||
+        (RegInfo->needsStackRealignment(Fn) && MFI.getObjectIndexEnd() != 0))
+      StackAlign = TFI.getStackAlignment();
+    else
+      StackAlign = TFI.getTransientStackAlignment();
+
+    // If the frame pointer is eliminated, all frame offsets will be relative
+    // to SP not FP. Align to MaxAlign so this works.
+    StackAlign = std::max(StackAlign, MaxAlign);
+    unsigned AlignMask = StackAlign - 1;
+    Offset = (Offset + AlignMask) & ~uint64_t(AlignMask);
+  }
+
+  // Update frame info to pretend that this is part of the stack...
+  int64_t StackSize = Offset - LocalAreaOffset;
+  MFI.setStackSize(StackSize);
+}
Index: lib/Target/NVPTX/NVPTXRegisterInfo.h
===================================================================
--- lib/Target/NVPTX/NVPTXRegisterInfo.h
+++ lib/Target/NVPTX/NVPTXRegisterInfo.h
@@ -45,6 +45,8 @@

   unsigned getFrameRegister(const MachineFunction &MF) const override;

+  unsigned getSharedFrameRegister(const MachineFunction &MF) const;
+
   ManagedStringPool *getStrPool() const {
     return const_cast<ManagedStringPool *>(&ManagedStrPool);
   }
Index: lib/Target/NVPTX/NVPTXRegisterInfo.cpp
===================================================================
--- lib/Target/NVPTX/NVPTXRegisterInfo.cpp
+++ lib/Target/NVPTX/NVPTXRegisterInfo.cpp
@@ -130,3 +130,8 @@
 unsigned NVPTXRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
   return NVPTX::VRFrame;
 }
+
+unsigned NVPTXRegisterInfo::getSharedFrameRegister(
+    const MachineFunction &MF) const {
+  return NVPTX::VRShared;
+}
Index: lib/Target/NVPTX/NVPTXRegisterInfo.td
===================================================================
--- lib/Target/NVPTX/NVPTXRegisterInfo.td
+++ lib/Target/NVPTX/NVPTXRegisterInfo.td
@@ -25,9 +25,12 @@
 // Special Registers used as stack pointer
 def VRFrame         : NVPTXReg<"%SP">;
 def VRFrameLocal    : NVPTXReg<"%SPL">;
+def VRShared        : NVPTXReg<"%SPS">;
+def VRFrameShared   : NVPTXReg<"%SPSH">;

 // Special Registers used as the stack
 def VRDepot         : NVPTXReg<"%Depot">;
+def VRSharedDepot   : NVPTXReg<"%SharedDepot">;

 // We use virtual registers, but define a few physical registers here to keep
 // SDAG and the MachineInstr layers happy.
@@ -69,5 +72,5 @@
 def Float64ArgRegs : NVPTXRegClass<[f64], 64, (add (sequence "da%u", 0, 4))>;

 // Read NVPTXRegisterInfo.cpp to see how VRFrame and VRDepot are used.
-def SpecialRegs : NVPTXRegClass<[i32], 32, (add VRFrame, VRFrameLocal, VRDepot,
+def SpecialRegs : NVPTXRegClass<[i32], 32, (add VRFrame, VRFrameLocal, VRDepot,
+                                            VRShared, VRFrameShared, VRSharedDepot,
                                             (sequence "ENVREG%u", 0, 31))>;
Index: lib/Target/NVPTX/NVPTXTargetMachine.cpp
===================================================================
--- lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -54,6 +54,7 @@
 void initializeNVPTXLowerAggrCopiesPass(PassRegistry &);
 void initializeNVPTXLowerArgsPass(PassRegistry &);
 void initializeNVPTXLowerAllocaPass(PassRegistry &);
+void initializeNVPTXFunctionDataSharingPass(PassRegistry &);

 } // end namespace llvm

@@ -72,6 +73,7 @@
   initializeNVPTXAssignValidGlobalNamesPass(PR);
   initializeNVPTXLowerArgsPass(PR);
   initializeNVPTXLowerAllocaPass(PR);
+  initializeNVPTXFunctionDataSharingPass(PR);
   initializeNVPTXLowerAggrCopiesPass(PR);
 }

@@ -148,6 +150,7 @@
   bool addInstSelector() override;
   void addPostRegAlloc() override;
   void addMachineSSAOptimization() override;
+  void addMachineSSALowering() override;

   FunctionPass *createTargetRegisterAllocator(bool) override;
   void addFastRegAlloc(FunctionPass *RegAllocPass) override;
@@ -248,10 +251,16 @@
   // before the address space inference passes.
   addPass(createNVPTXLowerArgsPass(&getNVPTXTargetMachine()));
   if (getOptLevel() != CodeGenOpt::None) {
+    // Add the address space inference passes.
     addAddressSpaceInferencePasses();
     if (!DisableLoadStoreVectorizer)
       addPass(createLoadStoreVectorizerPass());
     addStraightLineScalarOptimizationPasses();
+  } else {
+    // When the shared depot is generated, certain alloca instructions must
+    // be lowered to the appropriate memory type for correctness, even when
+    // no optimizations are run.
+    addPass(createNVPTXFunctionDataSharingPass(&getNVPTXTargetMachine()));
   }

   // === LSR and other generic IR passes ===
@@ -329,6 +338,11 @@
   printAndVerify("After StackSlotColoring");
 }

+void NVPTXPassConfig::addMachineSSALowering() {
+  // Lower shared frame indices.
+  addPass(createNVPTXLowerSharedFrameIndicesPass(), false);
+}
+
 void NVPTXPassConfig::addMachineSSAOptimization() {
   // Pre-ra tail duplication.
   if (addPass(&EarlyTailDuplicateID))
@@ -338,6 +352,11 @@
   // instructions dead.
   addPass(&OptimizePHIsID);

+  // To keep the SSA optimizations from treating shared and local frame
+  // indices alike, lower the shared frame indices before those
+  // optimizations are applied.
+  addMachineSSALowering();
+
   // This pass merges large allocas. StackSlotColoring is a different pass
   // which merges spill slots.
   addPass(&StackColoringID);
Index: lib/Target/NVPTX/NVPTXUtilities.h
===================================================================
--- lib/Target/NVPTX/NVPTXUtilities.h
+++ lib/Target/NVPTX/NVPTXUtilities.h
@@ -14,6 +14,8 @@
 #ifndef LLVM_LIB_TARGET_NVPTX_NVPTXUTILITIES_H
 #define LLVM_LIB_TARGET_NVPTX_NVPTXUTILITIES_H

+#include "NVPTXTargetMachine.h"
+#include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/IntrinsicInst.h"
@@ -60,6 +62,8 @@
 bool getAlign(const Function &, unsigned index, unsigned &);
 bool getAlign(const CallInst &, unsigned index, unsigned &);

+bool ptrIsStored(Value *Ptr);
+
 }

 #endif
Index: lib/Target/NVPTX/NVPTXUtilities.cpp
===================================================================
--- lib/Target/NVPTX/NVPTXUtilities.cpp
+++ lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -12,6 +12,7 @@

 #include "NVPTXUtilities.h"
 #include "NVPTX.h"
+#include "llvm/Analysis/CaptureTracking.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalVariable.h"
@@ -28,6 +29,8 @@

 namespace llvm {

+#define DEBUG_TYPE "nvptx-utilities"
+
 namespace {
 typedef std::map<std::string, std::vector<unsigned> > key_val_pair_t;
 typedef std::map<const GlobalValue *, key_val_pair_t> global_val_annot_t;
@@ -314,4 +317,10 @@
   return false;
 }

+/// Returns true if any instruction may store the address held by this
+/// pointer, i.e. the pointer itself may be captured in memory.
+bool ptrIsStored(Value *Ptr) {
+  return PointerMayBeCaptured(Ptr, /* ReturnCaptures */ false,
+                              /* StoreCaptures */ true);
+}
+
 } // namespace llvm
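The capture query is deliberately conservative: storing the pointer's own value anywhere, or passing it to an unknown callee, counts as captured; merely loading from or storing through the pointer does not. A hypothetical IR sketch of the distinction (names illustrative):

  define void @example(i8** %slot) {
    %p = alloca i8           ; ptrIsStored(%p) is true: %p itself is stored
    %q = alloca i8           ; ptrIsStored(%q) is false: only stored through
    store i8* %p, i8** %slot
    store i8 0, i8* %q
    ret void
  }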
Index: test/CodeGen/NVPTX/insert-shared-depot.ll
===================================================================
--- /dev/null
+++ test/CodeGen/NVPTX/insert-shared-depot.ll
@@ -0,0 +1,42 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX32
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64
+
+; PTX32: {{.*}}kernel()
+; PTX64: {{.*}}kernel()
+
+; PTX32: .local .align 8{{.*}}.b8{{.*}}__local_depot0
+; PTX64: .local .align 8{{.*}}.b8{{.*}}__local_depot0
+
+; PTX32: .shared .align 8{{.*}}.b8{{.*}}__shared_depot0
+; PTX64: .shared .align 8{{.*}}.b8{{.*}}__shared_depot0
+
+; PTX32: .reg .b32{{.*}}%SPS;
+; PTX64: .reg .b64{{.*}}%SPS;
+
+; PTX32: .reg .b32{{.*}}%SPSH;
+; PTX64: .reg .b64{{.*}}%SPSH;
+
+; PTX32: mov.u32{{.*}}%SPSH, __shared_depot0;
+; PTX64: mov.u64{{.*}}%SPSH, __shared_depot0;
+
+; PTX32: cvta.shared.u32{{.*}}%SPS, %SPSH;
+; PTX64: cvta.shared.u64{{.*}}%SPS, %SPSH;
+
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-unknown-unknown"
+
+define void @kernel() #0 {
+  %A = alloca i32, align 4
+  %shared_args = alloca i8**, align 8
+  call void @callee(i8*** %shared_args)
+  store i32 10, i32* %A
+  ret void
+}
+
+declare void @callee(i8***)
+
+attributes #0 = {"has-nvptx-shared-depot"}
+
+!nvvm.annotations = !{!0}
+!0 = !{void ()* @kernel, !"kernel", i32 1}
Index: test/CodeGen/NVPTX/lower-alloca-shared.ll
===================================================================
--- /dev/null
+++ test/CodeGen/NVPTX/lower-alloca-shared.ll
@@ -0,0 +1,31 @@
+; RUN: opt < %s -S -nvptx-lower-alloca -infer-address-spaces | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s --check-prefix PTX
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64-unknown-unknown"
+
+define void @kernel() #0 {
+; CHECK-LABEL: @kernel(
+; PTX-LABEL: .visible .entry kernel(
+  %A = alloca i32
+; CHECK: addrspacecast i32* %A to i32 addrspace(3)*
+; CHECK: addrspacecast i32 addrspace(3)* %1 to i32*
+; CHECK: store i32 0, i32 addrspace(3)* {{%.+}}
+; PTX: add.u64 {{%rd[0-9]+}}, %SPS, 0;
+; PTX: cvta.to.shared.u64 {{%rd[0-9]+}}, {{%rd[0-9]+}};
+; PTX: st.shared.u32 [{{%rd[0-9]+}}], {{%r[0-9]+}}
+  %shared_args = alloca i32**
+  call void @callee(i32*** %shared_args)
+  %1 = load i32**, i32*** %shared_args
+  %2 = getelementptr inbounds i32*, i32** %1, i64 0
+  store i32* %A, i32** %2
+  store i32 0, i32* %A
+  ret void
+}
+
+declare void @callee(i32***)
+
+attributes #0 = {"has-nvptx-shared-depot"}
+
+!nvvm.annotations = !{!0}
+!0 = !{void ()* @kernel, !"kernel", i32 1}
Index: test/CodeGen/NVPTX/no-shared-depot.ll
===================================================================
--- /dev/null
+++ test/CodeGen/NVPTX/no-shared-depot.ll
@@ -0,0 +1,40 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s --check-prefix=PTX32
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX64
+
+; PTX32: {{.*}}kernel()
+; PTX64: {{.*}}kernel()
+
+; PTX32: .local .align 8{{.*}}.b8{{.*}}__local_depot0
+; PTX64: .local .align 8{{.*}}.b8{{.*}}__local_depot0
+
+; PTX32-NOT: .shared .align 8{{.*}}.b8{{.*}}__shared_depot0
+; PTX64-NOT: .shared .align 8{{.*}}.b8{{.*}}__shared_depot0
+
+; PTX32-NOT: .reg .b32{{.*}}%SPS;
+; PTX64-NOT: .reg .b64{{.*}}%SPS;
+
+; PTX32-NOT: .reg .b32{{.*}}%SPSH;
+; PTX64-NOT: .reg .b64{{.*}}%SPSH;
+
+; PTX32-NOT: mov.u32{{.*}}%SPSH, __shared_depot0;
+; PTX64-NOT: mov.u64{{.*}}%SPSH, __shared_depot0;
+
+; PTX32-NOT: cvta.shared.u32{{.*}}%SPS, %SPSH;
+; PTX64-NOT: cvta.shared.u64{{.*}}%SPS, %SPSH;
+
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-unknown-unknown"
+
+; The kernel lacks the has-nvptx-shared-depot attribute, so no shared depot
+; may be emitted.
+define void @kernel() {
+  %A = alloca i32, align 4
+  %shared_args = alloca i8**, align 8
+  call void @callee(i8*** %shared_args)
+  store i32 10, i32* %A
+  ret void
+}
+
+declare void @callee(i8***)
+
+!nvvm.annotations = !{!0}
+!0 = !{void ()* @kernel, !"kernel", i32 1}
Index: test/CodeGen/NVPTX/nvptx-function-data-sharing.ll
===================================================================
--- /dev/null
+++ test/CodeGen/NVPTX/nvptx-function-data-sharing.ll
@@ -0,0 +1,31 @@
+; RUN: opt < %s -S -nvptx-function-data-sharing -infer-address-spaces | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s --check-prefix PTX
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64-unknown-unknown"
+
+define void @kernel() #0 {
+; CHECK-LABEL: @kernel(
+; PTX-LABEL: .visible .entry kernel(
+  %A = alloca i32
+; CHECK: addrspacecast i32* %A to i32 addrspace(3)*
+; CHECK: addrspacecast i32 addrspace(3)* %A1 to i32*
+; CHECK: store i32 0, i32 addrspace(3)* {{%.+}}
+; PTX: add.u64 {{%rd[0-9]+}}, %SPS, 0;
+; PTX: cvta.to.shared.u64 {{%rd[0-9]+}}, {{%rd[0-9]+}};
+; PTX: st.shared.u32 [{{%rd[0-9]+}}], {{%r[0-9]+}}
+  %shared_args = alloca i32**
+  call void @callee(i32*** %shared_args)
+  %1 = load i32**, i32*** %shared_args
+  %2 = getelementptr inbounds i32*, i32** %1, i64 0
+  store i32* %A, i32** %2
+  store i32 0, i32* %A
+  ret void
+}
+
+declare void @callee(i32***)
+
+attributes #0 = {"has-nvptx-shared-depot"}
+
+!nvvm.annotations = !{!0}
+!0 = !{void ()* @kernel, !"kernel", i32 1}