diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
@@ -439,16 +439,11 @@
   /// The data for the single globalized variable.
   struct MappedVarData {
+    MappedVarData() = default;
     /// Corresponding field in the global record.
-    const FieldDecl *FD = nullptr;
+    llvm::Value *GlobalizedVal = nullptr;
     /// Corresponding address.
     Address PrivateAddr = Address::invalid();
-    /// true, if only one element is required (for latprivates in SPMD mode),
-    /// false, if need to create based on the warp-size.
-    bool IsOnePerTeam = false;
-    MappedVarData() = delete;
-    MappedVarData(const FieldDecl *FD, bool IsOnePerTeam = false)
-        : FD(FD), IsOnePerTeam(IsOnePerTeam) {}
   };
   /// The map of local variables to their addresses in the global memory.
   using DeclToAddrMapTy = llvm::MapVector<const Decl *, MappedVarData>;
@@ -460,29 +455,12 @@
     EscapedParamsTy EscapedParameters;
     llvm::SmallVector<const ValueDecl *, 4> EscapedVariableLengthDecls;
     llvm::SmallVector<llvm::Value *, 4> EscapedVariableLengthDeclsAddrs;
-    const RecordDecl *GlobalRecord = nullptr;
-    llvm::Optional<const RecordDecl *> SecondaryGlobalRecord = llvm::None;
-    llvm::Value *GlobalRecordAddr = nullptr;
     llvm::Value *IsInSPMDModeFlag = nullptr;
     std::unique_ptr<CodeGenFunction::OMPMapVars> MappedParams;
   };
   /// Maps the function to the list of the globalized variables with their
   /// addresses.
   llvm::SmallDenseMap<llvm::Function *, FunctionData> FunctionGlobalizedDecls;
-  /// List of records for the globalized variables in target/teams/distribute
-  /// contexts. Inner records are going to be joined into the single record,
-  /// while those resulting records are going to be joined into the single
-  /// union. This resulting union (one per CU) is the entry point for the static
-  /// memory management runtime functions.
-  struct GlobalPtrSizeRecsTy {
-    llvm::GlobalVariable *UseSharedMemory = nullptr;
-    llvm::GlobalVariable *RecSize = nullptr;
-    llvm::GlobalVariable *Buffer = nullptr;
-    SourceLocation Loc;
-    llvm::SmallVector<const RecordDecl *, 2> Records;
-    unsigned RegionCounter = 0;
-  };
-  llvm::SmallVector<GlobalPtrSizeRecsTy, 8> GlobalizedRecords;
  llvm::GlobalVariable *KernelTeamsReductionPtr = nullptr;
   /// List of the records with the list of fields for the reductions across the
   /// teams. Used to build the intermediate buffer for the fast teams
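The header change collapses the record-based bookkeeping into a per-variable pair: the raw pointer returned by the allocation call (GlobalizedVal) and the typed address the region body uses (PrivateAddr). A minimal host-side sketch of the contract the codegen changes below rely on — the __kmpc_alloc_shared/__kmpc_free_shared names match the runtime entry points this patch emits calls to, while the malloc-backed bodies are hypothetical stand-ins so the sketch compiles and runs anywhere:

#include <cstddef>
#include <cstdlib>

// Hypothetical host stand-ins; the real implementations live in the OpenMP
// device runtime, not in this patch.
extern "C" void *__kmpc_alloc_shared(std::size_t Bytes) {
  return std::malloc(Bytes);
}
extern "C" void __kmpc_free_shared(void *Ptr) { std::free(Ptr); }

// Mirrors the simplified MappedVarData: one allocation per escaped variable.
struct MappedVarDataSketch {
  void *GlobalizedVal = nullptr; // pointer returned by __kmpc_alloc_shared
  int *PrivateAddr = nullptr;    // typed address handed to the region body
};

int main() {
  MappedVarDataSketch Rec;                                 // prolog:
  Rec.GlobalizedVal = __kmpc_alloc_shared(sizeof(int));    //   allocate,
  Rec.PrivateAddr = static_cast<int *>(Rec.GlobalizedVal); //   then cast
  *Rec.PrivateAddr = 42;                 // region body works on the shared copy
  __kmpc_free_shared(Rec.GlobalizedVal); // epilog: matching free
  return 0;
}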
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -1096,17 +1096,6 @@
   } Action(EST, WST);
   CodeGen.setAction(Action);
   IsInTTDRegion = true;
-  // Reserve place for the globalized memory.
-  GlobalizedRecords.emplace_back();
-  if (!KernelStaticGlobalized) {
-    KernelStaticGlobalized = new llvm::GlobalVariable(
-        CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
-        llvm::GlobalValue::InternalLinkage,
-        llvm::UndefValue::get(CGM.VoidPtrTy),
-        "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
-        llvm::GlobalValue::NotThreadLocal,
-        CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
-  }
   emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                    IsOffloadEntry, CodeGen);
   IsInTTDRegion = false;
@@ -1156,10 +1145,6 @@
                           CGM.getModule(), OMPRTL___kmpc_kernel_init),
                       Args);
-  // For data sharing, we need to initialize the stack.
-  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
-      CGM.getModule(), OMPRTL___kmpc_data_sharing_init_stack));
-
   emitGenericVarsProlog(CGF, WST.Loc);
 }
@@ -1228,17 +1213,6 @@
   } Action(*this, EST, D);
   CodeGen.setAction(Action);
   IsInTTDRegion = true;
-  // Reserve place for the globalized memory.
-  GlobalizedRecords.emplace_back();
-  if (!KernelStaticGlobalized) {
-    KernelStaticGlobalized = new llvm::GlobalVariable(
-        CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
-        llvm::GlobalValue::InternalLinkage,
-        llvm::UndefValue::get(CGM.VoidPtrTy),
-        "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
-        llvm::GlobalValue::NotThreadLocal,
-        CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
-  }
   emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                    IsOffloadEntry, CodeGen);
   IsInTTDRegion = false;
@@ -1260,12 +1234,6 @@
                           CGM.getModule(), OMPRTL___kmpc_spmd_kernel_init),
                       Args);
-  if (RequiresFullRuntime) {
-    // For data sharing, we need to initialize the stack.
-    CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
-        CGM.getModule(), OMPRTL___kmpc_data_sharing_init_stack_spmd));
-  }
-
   CGF.EmitBranch(ExecuteBB);

   CGF.EmitBlock(ExecuteBB);
@@ -1671,7 +1639,6 @@
       static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
   if (GlobalizedRD) {
     auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
-    I->getSecond().GlobalRecord = GlobalizedRD;
     I->getSecond().MappedParams =
         std::make_unique<CodeGenFunction::OMPMapVars>();
     DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
@@ -1679,8 +1646,7 @@
       assert(Pair.getFirst()->isCanonicalDecl() &&
              "Expected canonical declaration");
       Data.insert(std::make_pair(Pair.getFirst(),
-                                 MappedVarData(Pair.getSecond(),
-                                               /*IsOnePerTeam=*/true)));
+                                 MappedVarData()));
     }
   }
   Rt.emitGenericVarsProlog(CGF, Loc);
@@ -1709,282 +1675,69 @@
   const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
   if (I == FunctionGlobalizedDecls.end())
     return;
-  if (const RecordDecl *GlobalizedVarsRecord = I->getSecond().GlobalRecord) {
-    QualType GlobalRecTy =
-        CGM.getContext().getRecordType(GlobalizedVarsRecord);
-    QualType SecGlobalRecTy;
-    // Recover pointer to this function's global record. The runtime will
-    // handle the specifics of the allocation of the memory.
-    // Use actual memory size of the record including the padding
-    // for alignment purposes.
-    unsigned Alignment =
-        CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity();
-    unsigned GlobalRecordSize =
-        CGM.getContext().getTypeSizeInChars(GlobalRecTy).getQuantity();
-    GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
-
-    llvm::PointerType *GlobalRecPtrTy =
-        CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo();
-    llvm::Value *GlobalRecCastAddr;
-    llvm::Value *IsTTD = nullptr;
-    if (!IsInTTDRegion &&
-        (WithSPMDCheck ||
-         getExecutionMode() == CGOpenMPRuntimeGPU::EM_Unknown)) {
-      llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
-      llvm::BasicBlock *SPMDBB = CGF.createBasicBlock(".spmd");
-      llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
-      if (I->getSecond().SecondaryGlobalRecord.hasValue()) {
-        llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
-        llvm::Value *ThreadID = getThreadID(CGF, Loc);
-        llvm::Value *PL = CGF.EmitRuntimeCall(
-            OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
-                                                  OMPRTL___kmpc_parallel_level),
-            {RTLoc, ThreadID});
-        IsTTD = Bld.CreateIsNull(PL);
-      }
-      llvm::Value *IsSPMD = Bld.CreateIsNotNull(
-          CGF.EmitNounwindRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
-              CGM.getModule(), OMPRTL___kmpc_is_spmd_exec_mode)));
-      Bld.CreateCondBr(IsSPMD, SPMDBB, NonSPMDBB);
-      // There is no need to emit line number for unconditional branch.
-      (void)ApplyDebugLocation::CreateEmpty(CGF);
-      CGF.EmitBlock(SPMDBB);
-      Address RecPtr = Address(llvm::ConstantPointerNull::get(GlobalRecPtrTy),
-                               CharUnits::fromQuantity(Alignment));
-      CGF.EmitBranch(ExitBB);
-      // There is no need to emit line number for unconditional branch.
-      (void)ApplyDebugLocation::CreateEmpty(CGF);
-      CGF.EmitBlock(NonSPMDBB);
-      llvm::Value *Size = llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize);
-      if (const RecordDecl *SecGlobalizedVarsRecord =
-              I->getSecond().SecondaryGlobalRecord.getValueOr(nullptr)) {
-        SecGlobalRecTy =
-            CGM.getContext().getRecordType(SecGlobalizedVarsRecord);
-
-        // Recover pointer to this function's global record. The runtime will
-        // handle the specifics of the allocation of the memory.
-        // Use actual memory size of the record including the padding
-        // for alignment purposes.
-        unsigned Alignment =
-            CGM.getContext().getTypeAlignInChars(SecGlobalRecTy).getQuantity();
-        unsigned GlobalRecordSize =
-            CGM.getContext().getTypeSizeInChars(SecGlobalRecTy).getQuantity();
-        GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
-        Size = Bld.CreateSelect(
-            IsTTD, llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize), Size);
-      }
-      // TODO: allow the usage of shared memory to be controlled by
-      // the user, for now, default to global.
-      llvm::Value *GlobalRecordSizeArg[] = {
-          Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
-      llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
-          OMPBuilder.getOrCreateRuntimeFunction(
-              CGM.getModule(), OMPRTL___kmpc_data_sharing_coalesced_push_stack),
-          GlobalRecordSizeArg);
-      GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
-          GlobalRecValue, GlobalRecPtrTy);
-      CGF.EmitBlock(ExitBB);
-      auto *Phi = Bld.CreatePHI(GlobalRecPtrTy,
-                                /*NumReservedValues=*/2, "_select_stack");
-      Phi->addIncoming(RecPtr.getPointer(), SPMDBB);
-      Phi->addIncoming(GlobalRecCastAddr, NonSPMDBB);
-      GlobalRecCastAddr = Phi;
-      I->getSecond().GlobalRecordAddr = Phi;
-      I->getSecond().IsInSPMDModeFlag = IsSPMD;
-    } else if (!CGM.getLangOpts().OpenMPCUDATargetParallel && IsInTTDRegion) {
-      assert(GlobalizedRecords.back().Records.size() < 2 &&
-             "Expected less than 2 globalized records: one for target and one "
-             "for teams.");
-      unsigned Offset = 0;
-      for (const RecordDecl *RD : GlobalizedRecords.back().Records) {
-        QualType RDTy = CGM.getContext().getRecordType(RD);
-        unsigned Alignment =
-            CGM.getContext().getTypeAlignInChars(RDTy).getQuantity();
-        unsigned Size = CGM.getContext().getTypeSizeInChars(RDTy).getQuantity();
-        Offset =
-            llvm::alignTo(llvm::alignTo(Offset, Alignment) + Size, Alignment);
-      }
-      unsigned Alignment =
-          CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity();
-      Offset = llvm::alignTo(Offset, Alignment);
-      GlobalizedRecords.back().Records.push_back(GlobalizedVarsRecord);
-      ++GlobalizedRecords.back().RegionCounter;
-      if (GlobalizedRecords.back().Records.size() == 1) {
-        assert(KernelStaticGlobalized &&
-               "Kernel static pointer must be initialized already.");
-        auto *UseSharedMemory = new llvm::GlobalVariable(
-            CGM.getModule(), CGM.Int16Ty, /*isConstant=*/true,
-            llvm::GlobalValue::InternalLinkage, nullptr,
-            "_openmp_static_kernel$is_shared");
-        UseSharedMemory->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
-        QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth(
-            /*DestWidth=*/16, /*Signed=*/0);
-        llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar(
-            Address(UseSharedMemory,
-                    CGM.getContext().getTypeAlignInChars(Int16Ty)),
-            /*Volatile=*/false, Int16Ty, Loc);
-        auto *StaticGlobalized = new llvm::GlobalVariable(
-            CGM.getModule(), CGM.Int8Ty, /*isConstant=*/false,
-            llvm::GlobalValue::CommonLinkage, nullptr);
-        auto *RecSize = new llvm::GlobalVariable(
-            CGM.getModule(), CGM.SizeTy, /*isConstant=*/true,
-            llvm::GlobalValue::InternalLinkage, nullptr,
-            "_openmp_static_kernel$size");
-        RecSize->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
-        llvm::Value *Ld = CGF.EmitLoadOfScalar(
-            Address(RecSize, CGM.getSizeAlign()), /*Volatile=*/false,
-            CGM.getContext().getSizeType(), Loc);
-        llvm::Value *ResAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
-            KernelStaticGlobalized, CGM.VoidPtrPtrTy);
-        llvm::Value *GlobalRecordSizeArg[] = {
-            llvm::ConstantInt::get(
-                CGM.Int16Ty,
-                getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD ? 1 : 0),
-            StaticGlobalized, Ld, IsInSharedMemory, ResAddr};
-        CGF.EmitRuntimeCall(
-            OMPBuilder.getOrCreateRuntimeFunction(
-                CGM.getModule(), OMPRTL___kmpc_get_team_static_memory),
-            GlobalRecordSizeArg);
-        GlobalizedRecords.back().Buffer = StaticGlobalized;
-        GlobalizedRecords.back().RecSize = RecSize;
-        GlobalizedRecords.back().UseSharedMemory = UseSharedMemory;
-        GlobalizedRecords.back().Loc = Loc;
-      }
-      assert(KernelStaticGlobalized && "Global address must be set already.");
-      Address FrameAddr = CGF.EmitLoadOfPointer(
-          Address(KernelStaticGlobalized, CGM.getPointerAlign()),
-          CGM.getContext()
-              .getPointerType(CGM.getContext().VoidPtrTy)
-              .castAs<PointerType>());
-      llvm::Value *GlobalRecValue =
-          Bld.CreateConstInBoundsGEP(FrameAddr, Offset).getPointer();
-      I->getSecond().GlobalRecordAddr = GlobalRecValue;
-      I->getSecond().IsInSPMDModeFlag = nullptr;
-      GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
-          GlobalRecValue, CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo());
-    } else {
-      // TODO: allow the usage of shared memory to be controlled by
-      // the user, for now, default to global.
-      bool UseSharedMemory =
-          IsInTTDRegion && GlobalRecordSize <= SharedMemorySize;
-      llvm::Value *GlobalRecordSizeArg[] = {
-          llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize),
-          CGF.Builder.getInt16(UseSharedMemory ? 1 : 0)};
-      llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
-          OMPBuilder.getOrCreateRuntimeFunction(
-              CGM.getModule(),
-              IsInTTDRegion ? OMPRTL___kmpc_data_sharing_push_stack
-                            : OMPRTL___kmpc_data_sharing_coalesced_push_stack),
-          GlobalRecordSizeArg);
-      GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
-          GlobalRecValue, GlobalRecPtrTy);
-      I->getSecond().GlobalRecordAddr = GlobalRecValue;
-      I->getSecond().IsInSPMDModeFlag = nullptr;
-    }
-    LValue Base =
-        CGF.MakeNaturalAlignPointeeAddrLValue(GlobalRecCastAddr, GlobalRecTy);
-
-    // Emit the "global alloca" which is a GEP from the global declaration
-    // record using the pointer returned by the runtime.
-    LValue SecBase;
-    decltype(I->getSecond().LocalVarData)::const_iterator SecIt;
-    if (IsTTD) {
-      SecIt = I->getSecond().SecondaryLocalVarData->begin();
-      llvm::PointerType *SecGlobalRecPtrTy =
-          CGF.ConvertTypeForMem(SecGlobalRecTy)->getPointerTo();
-      SecBase = CGF.MakeNaturalAlignPointeeAddrLValue(
-          Bld.CreatePointerBitCastOrAddrSpaceCast(
-              I->getSecond().GlobalRecordAddr, SecGlobalRecPtrTy),
-          SecGlobalRecTy);
+  for (auto &Rec : I->getSecond().LocalVarData) {
+    const auto *VD = cast<VarDecl>(Rec.first);
+    bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
+    QualType VarTy = VD->getType();
+
+    // Get the local allocation of a firstprivate variable before sharing
+    llvm::Value *ParValue;
+    if (EscapedParam) {
+      LValue ParLVal =
+          CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
+      ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
    }
-    for (auto &Rec : I->getSecond().LocalVarData) {
-      bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
-      llvm::Value *ParValue;
-      if (EscapedParam) {
-        const auto *VD = cast<VarDecl>(Rec.first);
-        LValue ParLVal =
-            CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
-        ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
-      }
-      LValue VarAddr = CGF.EmitLValueForField(Base, Rec.second.FD);
-      // Emit VarAddr basing on lane-id if required.
-      QualType VarTy;
-      if (Rec.second.IsOnePerTeam) {
-        VarTy = Rec.second.FD->getType();
-      } else {
-        Address Addr = VarAddr.getAddress(CGF);
-        llvm::Value *Ptr = CGF.Builder.CreateInBoundsGEP(
-            Addr.getElementType(), Addr.getPointer(),
-            {Bld.getInt32(0), getNVPTXLaneID(CGF)});
-        VarTy =
-            Rec.second.FD->getType()->castAsArrayTypeUnsafe()->getElementType();
-        VarAddr = CGF.MakeAddrLValue(
-            Address(Ptr, CGM.getContext().getDeclAlign(Rec.first)), VarTy,
-            AlignmentSource::Decl);
-      }
-      Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
-      if (!IsInTTDRegion &&
-          (WithSPMDCheck ||
-           getExecutionMode() == CGOpenMPRuntimeGPU::EM_Unknown)) {
-        assert(I->getSecond().IsInSPMDModeFlag &&
-               "Expected unknown execution mode or required SPMD check.");
-        if (IsTTD) {
-          assert(SecIt->second.IsOnePerTeam &&
-                 "Secondary glob data must be one per team.");
-          LValue SecVarAddr = CGF.EmitLValueForField(SecBase, SecIt->second.FD);
-          VarAddr.setAddress(
-              Address(Bld.CreateSelect(IsTTD, SecVarAddr.getPointer(CGF),
-                                       VarAddr.getPointer(CGF)),
-                      VarAddr.getAlignment()));
-          Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
-        }
-        Address GlobalPtr = Rec.second.PrivateAddr;
-        Address LocalAddr = CGF.CreateMemTemp(VarTy, Rec.second.FD->getName());
-        Rec.second.PrivateAddr = Address(
-            Bld.CreateSelect(I->getSecond().IsInSPMDModeFlag,
-                             LocalAddr.getPointer(), GlobalPtr.getPointer()),
-            LocalAddr.getAlignment());
-      }
-      if (EscapedParam) {
-        const auto *VD = cast<VarDecl>(Rec.first);
-        CGF.EmitStoreOfScalar(ParValue, VarAddr);
-        I->getSecond().MappedParams->setVarAddr(CGF, VD,
-                                                VarAddr.getAddress(CGF));
-      }
-      if (IsTTD)
-        ++SecIt;
+
+    // Allocate space for the variable to be globalized
+    llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
+    llvm::Instruction *VoidPtr =
+        CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+                                CGM.getModule(), OMPRTL___kmpc_alloc_shared),
+                            AllocArgs, VD->getName());
+
+    // Cast the void pointer and get the address of the globalized variable.
+    llvm::PointerType *VarPtrTy = CGF.ConvertTypeForMem(VarTy)->getPointerTo();
+    llvm::Value *CastedVoidPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
+        VoidPtr, VarPtrTy, VD->getName() + "_on_stack");
+    LValue VarAddr = CGF.MakeNaturalAlignAddrLValue(CastedVoidPtr, VarTy);
+    Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
+    Rec.second.GlobalizedVal = VoidPtr;
+
+    // Assign the local allocation to the newly globalized location.
+    if (EscapedParam) {
+      CGF.EmitStoreOfScalar(ParValue, VarAddr);
+      I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress(CGF));
    }
+    if (auto *DI = CGF.getDebugInfo())
+      VoidPtr->setDebugLoc(DI->SourceLocToDebugLoc(VD->getLocation()));
  }
-  for (const ValueDecl *VD : I->getSecond().EscapedVariableLengthDecls) {
-    // Recover pointer to this function's global record. The runtime will
-    // handle the specifics of the allocation of the memory.
-    // Use actual memory size of the record including the padding
+  for (const auto *VD : I->getSecond().EscapedVariableLengthDecls) {
+    // Use actual memory size of the VLA object including the padding
     // for alignment purposes.
-    CGBuilderTy &Bld = CGF.Builder;
     llvm::Value *Size = CGF.getTypeSize(VD->getType());
     CharUnits Align = CGM.getContext().getDeclAlign(VD);
     Size = Bld.CreateNUWAdd(
         Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
     llvm::Value *AlignVal =
         llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
+    Size = Bld.CreateUDiv(Size, AlignVal);
     Size = Bld.CreateNUWMul(Size, AlignVal);
-    // TODO: allow the usage of shared memory to be controlled by
-    // the user, for now, default to global.
-    llvm::Value *GlobalRecordSizeArg[] = {
-        Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
-    llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
-        OMPBuilder.getOrCreateRuntimeFunction(
-            CGM.getModule(), OMPRTL___kmpc_data_sharing_coalesced_push_stack),
-        GlobalRecordSizeArg);
-    llvm::Value *GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
-        GlobalRecValue, CGF.ConvertTypeForMem(VD->getType())->getPointerTo());
-    LValue Base = CGF.MakeAddrLValue(GlobalRecCastAddr, VD->getType(),
+
+    // Allocate space for this VLA object to be globalized.
+    llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
+    llvm::Instruction *VoidPtr =
+        CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+                                CGM.getModule(), OMPRTL___kmpc_alloc_shared),
+                            AllocArgs, VD->getName());
+
+    I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(VoidPtr);
+    LValue Base = CGF.MakeAddrLValue(VoidPtr, VD->getType(),
                                      CGM.getContext().getDeclAlign(VD),
                                      AlignmentSource::Decl);
     I->getSecond().MappedParams->setVarAddr(CGF, cast<VarDecl>(VD),
                                             Base.getAddress(CGF));
-    I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(GlobalRecValue);
   }
   I->getSecond().MappedParams->apply(CGF);
 }
@@ -1997,60 +1750,20 @@
   const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
   if (I != FunctionGlobalizedDecls.end()) {
-    I->getSecond().MappedParams->restore(CGF);
-    if (!CGF.HaveInsertPoint())
-      return;
+    // Deallocate the memory for each globalized VLA object
     for (llvm::Value *Addr :
          llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
-      CGF.EmitRuntimeCall(
-          OMPBuilder.getOrCreateRuntimeFunction(
-              CGM.getModule(), OMPRTL___kmpc_data_sharing_pop_stack),
-          Addr);
+      CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+                              CGM.getModule(), OMPRTL___kmpc_free_shared),
+                          Addr);
     }
-    if (I->getSecond().GlobalRecordAddr) {
-      if (!IsInTTDRegion &&
-          (WithSPMDCheck ||
-           getExecutionMode() == CGOpenMPRuntimeGPU::EM_Unknown)) {
-        CGBuilderTy &Bld = CGF.Builder;
-        llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
-        llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
-        Bld.CreateCondBr(I->getSecond().IsInSPMDModeFlag, ExitBB, NonSPMDBB);
-        // There is no need to emit line number for unconditional branch.
-        (void)ApplyDebugLocation::CreateEmpty(CGF);
-        CGF.EmitBlock(NonSPMDBB);
-        CGF.EmitRuntimeCall(
-            OMPBuilder.getOrCreateRuntimeFunction(
-                CGM.getModule(), OMPRTL___kmpc_data_sharing_pop_stack),
-            CGF.EmitCastToVoidPtr(I->getSecond().GlobalRecordAddr));
-        CGF.EmitBlock(ExitBB);
-      } else if (!CGM.getLangOpts().OpenMPCUDATargetParallel && IsInTTDRegion) {
-        assert(GlobalizedRecords.back().RegionCounter > 0 &&
-               "region counter must be > 0.");
-        --GlobalizedRecords.back().RegionCounter;
-        // Emit the restore function only in the target region.
-        if (GlobalizedRecords.back().RegionCounter == 0) {
-          QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth(
-              /*DestWidth=*/16, /*Signed=*/0);
-          llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar(
-              Address(GlobalizedRecords.back().UseSharedMemory,
-                      CGM.getContext().getTypeAlignInChars(Int16Ty)),
-              /*Volatile=*/false, Int16Ty, GlobalizedRecords.back().Loc);
-          llvm::Value *Args[] = {
-              llvm::ConstantInt::get(
-                  CGM.Int16Ty,
-                  getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD ? 1 : 0),
-              IsInSharedMemory};
-          CGF.EmitRuntimeCall(
-              OMPBuilder.getOrCreateRuntimeFunction(
-                  CGM.getModule(), OMPRTL___kmpc_restore_team_static_memory),
-              Args);
-        }
-      } else {
-        CGF.EmitRuntimeCall(
-            OMPBuilder.getOrCreateRuntimeFunction(
-                CGM.getModule(), OMPRTL___kmpc_data_sharing_pop_stack),
-            I->getSecond().GlobalRecordAddr);
-      }
+    // Deallocate the memory for each globalized value
+    for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) {
+      I->getSecond().MappedParams->restore(CGF);
+
+      CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+                              CGM.getModule(), OMPRTL___kmpc_free_shared),
+                          {Rec.second.GlobalizedVal});
     }
   }
 }
@@ -4333,6 +4046,7 @@
   }
   if (!Body)
     return;
+
   CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
   VarChecker.Visit(Body);
   const RecordDecl *GlobalizedVarsRecord =
@@ -4346,7 +4060,6 @@
   auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
   I->getSecond().MappedParams =
       std::make_unique<CodeGenFunction::OMPMapVars>();
-  I->getSecond().GlobalRecord = GlobalizedVarsRecord;
   I->getSecond().EscapedParameters.insert(
       VarChecker.getEscapedParameters().begin(),
       VarChecker.getEscapedParameters().end());
@@ -4355,21 +4068,16 @@
   DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
   for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
     assert(VD->isCanonicalDecl() && "Expected canonical declaration");
-    const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
-    Data.insert(std::make_pair(VD, MappedVarData(FD, IsInTTDRegion)));
+    Data.insert(std::make_pair(VD, MappedVarData()));
   }
   if (!IsInTTDRegion && !NeedToDelayGlobalization && !IsInParallelRegion) {
     CheckVarsEscapingDeclContext VarChecker(CGF, llvm::None);
     VarChecker.Visit(Body);
-    I->getSecond().SecondaryGlobalRecord =
-        VarChecker.getGlobalizedRecord(/*IsInTTDRegion=*/true);
     I->getSecond().SecondaryLocalVarData.emplace();
     DeclToAddrMapTy &Data = I->getSecond().SecondaryLocalVarData.getValue();
     for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
       assert(VD->isCanonicalDecl() && "Expected canonical declaration");
-      const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
-      Data.insert(
-          std::make_pair(VD, MappedVarData(FD, /*IsInTTDRegion=*/true)));
+      Data.insert(std::make_pair(VD, MappedVarData()));
    }
  }
  if (!NeedToDelayGlobalization) {
@@ -4660,185 +4368,8 @@
   CGOpenMPRuntime::processRequiresDirective(D);
 }

-/// Get number of SMs and number of blocks per SM.
-static std::pair<unsigned, unsigned> getSMsBlocksPerSM(CodeGenModule &CGM) {
-  std::pair<unsigned, unsigned> Data;
-  if (CGM.getLangOpts().OpenMPCUDANumSMs)
-    Data.first = CGM.getLangOpts().OpenMPCUDANumSMs;
-  if (CGM.getLangOpts().OpenMPCUDABlocksPerSM)
-    Data.second = CGM.getLangOpts().OpenMPCUDABlocksPerSM;
-  if (Data.first && Data.second)
-    return Data;
-  switch (getCudaArch(CGM)) {
-  case CudaArch::SM_20:
-  case CudaArch::SM_21:
-  case CudaArch::SM_30:
-  case CudaArch::SM_32:
-  case CudaArch::SM_35:
-  case CudaArch::SM_37:
-  case CudaArch::SM_50:
-  case CudaArch::SM_52:
-  case CudaArch::SM_53:
-    return {16, 16};
-  case CudaArch::SM_60:
-  case CudaArch::SM_61:
-  case CudaArch::SM_62:
-    return {56, 32};
-  case CudaArch::SM_70:
-  case CudaArch::SM_72:
-  case CudaArch::SM_75:
-  case CudaArch::SM_80:
-  case CudaArch::SM_86:
-    return {84, 32};
-  case CudaArch::GFX600:
-  case CudaArch::GFX601:
-  case CudaArch::GFX602:
-  case CudaArch::GFX700:
-  case CudaArch::GFX701:
-  case CudaArch::GFX702:
-  case CudaArch::GFX703:
-  case CudaArch::GFX704:
-  case CudaArch::GFX705:
-  case CudaArch::GFX801:
-  case CudaArch::GFX802:
-  case CudaArch::GFX803:
-  case CudaArch::GFX805:
-  case CudaArch::GFX810:
-  case CudaArch::GFX900:
-  case CudaArch::GFX902:
-  case CudaArch::GFX904:
-  case CudaArch::GFX906:
-  case CudaArch::GFX908:
-  case CudaArch::GFX909:
-  case CudaArch::GFX90a:
-  case CudaArch::GFX90c:
-  case CudaArch::GFX1010:
-  case CudaArch::GFX1011:
-  case CudaArch::GFX1012:
-  case CudaArch::GFX1030:
-  case CudaArch::GFX1031:
-  case CudaArch::GFX1032:
-  case CudaArch::GFX1033:
-  case CudaArch::UNUSED:
-  case CudaArch::UNKNOWN:
-    break;
-  case CudaArch::LAST:
-    llvm_unreachable("Unexpected Cuda arch.");
-  }
-  llvm_unreachable("Unexpected NVPTX target without ptx feature.");
-}
-
 void CGOpenMPRuntimeGPU::clear() {
-  if (!GlobalizedRecords.empty() &&
-      !CGM.getLangOpts().OpenMPCUDATargetParallel) {
-    ASTContext &C = CGM.getContext();
-    llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> GlobalRecs;
-    llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> SharedRecs;
-    RecordDecl *StaticRD = C.buildImplicitRecord(
-        "_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union);
-    StaticRD->startDefinition();
-    RecordDecl *SharedStaticRD = C.buildImplicitRecord(
-        "_shared_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union);
-    SharedStaticRD->startDefinition();
-    for (const GlobalPtrSizeRecsTy &Records : GlobalizedRecords) {
-      if (Records.Records.empty())
-        continue;
-      unsigned Size = 0;
-      unsigned RecAlignment = 0;
-      for (const RecordDecl *RD : Records.Records) {
-        QualType RDTy = C.getRecordType(RD);
-        unsigned Alignment = C.getTypeAlignInChars(RDTy).getQuantity();
-        RecAlignment = std::max(RecAlignment, Alignment);
-        unsigned RecSize = C.getTypeSizeInChars(RDTy).getQuantity();
-        Size =
-            llvm::alignTo(llvm::alignTo(Size, Alignment) + RecSize, Alignment);
-      }
-      Size = llvm::alignTo(Size, RecAlignment);
-      llvm::APInt ArySize(/*numBits=*/64, Size);
-      QualType SubTy = C.getConstantArrayType(
-          C.CharTy, ArySize, nullptr, ArrayType::Normal, /*IndexTypeQuals=*/0);
-      const bool UseSharedMemory = Size <= SharedMemorySize;
-      auto *Field =
-          FieldDecl::Create(C, UseSharedMemory ? SharedStaticRD : StaticRD,
-                            SourceLocation(), SourceLocation(), nullptr, SubTy,
-                            C.getTrivialTypeSourceInfo(SubTy, SourceLocation()),
-                            /*BW=*/nullptr, /*Mutable=*/false,
-                            /*InitStyle=*/ICIS_NoInit);
-      Field->setAccess(AS_public);
-      if (UseSharedMemory) {
-        SharedStaticRD->addDecl(Field);
-        SharedRecs.push_back(&Records);
-      } else {
-        StaticRD->addDecl(Field);
-        GlobalRecs.push_back(&Records);
-      }
-      Records.RecSize->setInitializer(llvm::ConstantInt::get(CGM.SizeTy, Size));
-      Records.UseSharedMemory->setInitializer(
-          llvm::ConstantInt::get(CGM.Int16Ty, UseSharedMemory ? 1 : 0));
-    }
-    // Allocate SharedMemorySize buffer for the shared memory.
-    // FIXME: nvlink does not handle weak linkage correctly (object with the
-    // different size are reported as erroneous).
-    // Restore this code as sson as nvlink is fixed.
-    if (!SharedStaticRD->field_empty()) {
-      llvm::APInt ArySize(/*numBits=*/64, SharedMemorySize);
-      QualType SubTy = C.getConstantArrayType(
-          C.CharTy, ArySize, nullptr, ArrayType::Normal, /*IndexTypeQuals=*/0);
-      auto *Field = FieldDecl::Create(
-          C, SharedStaticRD, SourceLocation(), SourceLocation(), nullptr, SubTy,
-          C.getTrivialTypeSourceInfo(SubTy, SourceLocation()),
-          /*BW=*/nullptr, /*Mutable=*/false,
-          /*InitStyle=*/ICIS_NoInit);
-      Field->setAccess(AS_public);
-      SharedStaticRD->addDecl(Field);
-    }
-    SharedStaticRD->completeDefinition();
-    if (!SharedStaticRD->field_empty()) {
-      QualType StaticTy = C.getRecordType(SharedStaticRD);
-      llvm::Type *LLVMStaticTy = CGM.getTypes().ConvertTypeForMem(StaticTy);
-      auto *GV = new llvm::GlobalVariable(
-          CGM.getModule(), LLVMStaticTy,
-          /*isConstant=*/false, llvm::GlobalValue::WeakAnyLinkage,
-          llvm::UndefValue::get(LLVMStaticTy),
-          "_openmp_shared_static_glob_rd_$_", /*InsertBefore=*/nullptr,
-          llvm::GlobalValue::NotThreadLocal,
-          C.getTargetAddressSpace(LangAS::cuda_shared));
-      auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
-          GV, CGM.VoidPtrTy);
-      for (const GlobalPtrSizeRecsTy *Rec : SharedRecs) {
-        Rec->Buffer->replaceAllUsesWith(Replacement);
-        Rec->Buffer->eraseFromParent();
-      }
-    }
-    StaticRD->completeDefinition();
-    if (!StaticRD->field_empty()) {
-      QualType StaticTy = C.getRecordType(StaticRD);
-      std::pair<unsigned, unsigned> SMsBlockPerSM = getSMsBlocksPerSM(CGM);
-      llvm::APInt Size1(32, SMsBlockPerSM.second);
-      QualType Arr1Ty =
-          C.getConstantArrayType(StaticTy, Size1, nullptr, ArrayType::Normal,
-                                 /*IndexTypeQuals=*/0);
-      llvm::APInt Size2(32, SMsBlockPerSM.first);
-      QualType Arr2Ty =
-          C.getConstantArrayType(Arr1Ty, Size2, nullptr, ArrayType::Normal,
-                                 /*IndexTypeQuals=*/0);
-      llvm::Type *LLVMArr2Ty = CGM.getTypes().ConvertTypeForMem(Arr2Ty);
-      // FIXME: nvlink does not handle weak linkage correctly (object with the
-      // different size are reported as erroneous).
-      // Restore CommonLinkage as soon as nvlink is fixed.
-      auto *GV = new llvm::GlobalVariable(
-          CGM.getModule(), LLVMArr2Ty,
-          /*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
-          llvm::Constant::getNullValue(LLVMArr2Ty),
-          "_openmp_static_glob_rd_$_");
-      auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
-          GV, CGM.VoidPtrTy);
-      for (const GlobalPtrSizeRecsTy *Rec : GlobalRecs) {
-        Rec->Buffer->replaceAllUsesWith(Replacement);
-        Rec->Buffer->eraseFromParent();
-      }
-    }
-  }
+
   if (!TeamsReductions.empty()) {
     ASTContext &C = CGM.getContext();
     RecordDecl *StaticRD = C.buildImplicitRecord(
diff --git a/clang/test/OpenMP/assumes_include_nvptx.cpp b/clang/test/OpenMP/assumes_include_nvptx.cpp
--- a/clang/test/OpenMP/assumes_include_nvptx.cpp
+++ b/clang/test/OpenMP/assumes_include_nvptx.cpp
@@ -19,8 +19,6 @@
 // CHECK-DAG: declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() [[attr1]]
 // CHECK: declare void @__kmpc_kernel_init(i32, i16)
 // CHECK-NOT: #
-// CHECK: declare void @__kmpc_data_sharing_init_stack()
-// CHECK-NOT: #
 // CHECK: declare float @_Z3sinf(float) [[attr2:#[0-9]*]]
 // CHECK: declare void @__kmpc_kernel_deinit(i16)
 // CHECK-NOT: #
diff --git a/clang/test/OpenMP/declare_target_codegen_globalization.cpp b/clang/test/OpenMP/declare_target_codegen_globalization.cpp
--- a/clang/test/OpenMP/declare_target_codegen_globalization.cpp
+++ b/clang/test/OpenMP/declare_target_codegen_globalization.cpp
@@ -1,3 +1,4 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-function-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+_(.*)_l[0-9]+"
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-nvidia-cuda -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s
 // expected-no-diagnostics
@@ -9,8 +10,6 @@
   return foo(a);
 }

-// CHECK: define weak void @__omp_offloading_{{.*}}maini1{{.*}}_l[[@LINE+5]](i32* nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %{{.*}})
-// CHECK-NOT: @__kmpc_data_sharing_coalesced_push_stack
 int maini1() {
   int a;
@@ -23,36 +22,67 @@
 }

 // parallel region
-// CHECK: define {{.*}}void @{{.*}}(i32* noalias {{.*}}, i32* noalias {{.*}}, i32* nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) %{{.*}})
-// CHECK-NOT: call i8* @__kmpc_data_sharing_coalesced_push_stack(
-// CHECK: [[B_ADDR:%.+]] = alloca i32,
-// CHECK: call {{.*}}[[FOO:@.*foo.*]](i32* nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) [[B_ADDR]])
-// CHECK: call {{.*}}[[BAR:@.*bar.*]]()
-// CHECK-NOT: call void @__kmpc_data_sharing_pop_stack(
-// CHECK: ret void
-
-// CHECK: define {{.*}}[[FOO]](i32* nonnull align {{[0-9]+}} dereferenceable{{.*}})
-// CHECK-NOT: @__kmpc_data_sharing_coalesced_push_stack
-
-// CHECK: define {{.*}}[[BAR]]()
-// CHECK: alloca i32,
-// CHECK: [[A_LOCAL_ADDR:%.+]] = alloca i32,
-// CHECK: [[RES:%.+]] = call i8 @__kmpc_is_spmd_exec_mode()
-// CHECK: [[IS_SPMD:%.+]] = icmp ne i8 [[RES]], 0
-// CHECK: br i1 [[IS_SPMD]], label
-// CHECK: br label
-// CHECK: [[RES:%.+]] = call i8* @__kmpc_data_sharing_coalesced_push_stack(i64 128, i16 0)
-// CHECK: [[GLOBALS:%.+]] = bitcast i8* [[RES]] to [[GLOBAL_ST:%.+]]*
-// CHECK: br label
-// CHECK: [[ITEMS:%.+]] = phi [[GLOBAL_ST]]* [ null, {{.+}} ], [ [[GLOBALS]], {{.+}} ]
-// CHECK: [[A_ADDR:%.+]] = getelementptr inbounds [[GLOBAL_ST]], [[GLOBAL_ST]]* [[ITEMS]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
-// CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CHECK: [[LID:%.+]] = and i32 [[TID]], 31
-// CHECK: [[A_GLOBAL_ADDR:%.+]] = getelementptr inbounds [32 x i32], [32 x i32]* [[A_ADDR]], i32 0, i32 [[LID]]
-// CHECK: [[A_ADDR:%.+]] = select i1 [[IS_SPMD]], i32* [[A_LOCAL_ADDR]], i32* [[A_GLOBAL_ADDR]]
-// CHECK: call {{.*}}[[FOO]](i32* nonnull align {{[0-9]+}} dereferenceable{{.*}} [[A_ADDR]])
-// CHECK: br i1 [[IS_SPMD]], label
-// CHECK: [[BC:%.+]] = bitcast [[GLOBAL_ST]]* [[ITEMS]] to i8*
-// CHECK: call void @__kmpc_data_sharing_pop_stack(i8* [[BC]])
-// CHECK: br label
-// CHECK: ret i32
+// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z6maini1v_l[0-9]+}}
+// CHECK-SAME: (i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
+// CHECK-NEXT: [[DOTBOUND_ZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store i32 0, i32* [[DOTBOUND_ZERO_ADDR]], align 4
+// CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
+// CHECK-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
+// CHECK-NEXT: br label [[DOTEXECUTE:%.*]]
+// CHECK: .execute:
+// CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
+// CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4
+// CHECK-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTBOUND_ZERO_ADDR]], i32* [[TMP0]]) #[[ATTR3:[0-9]+]]
+// CHECK-NEXT: br label [[DOTOMP_DEINIT:%.*]]
+// CHECK: .omp.deinit:
+// CHECK-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
+// CHECK-NEXT: br label [[DOTEXIT:%.*]]
+// CHECK: .exit:
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@__omp_outlined__
+// CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT: [[B:%.*]] = alloca i32, align 4
+// CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
+// CHECK-NEXT: [[CALL:%.*]] = call i32 @_Z3fooRi(i32* nonnull align 4 dereferenceable(4) [[B]]) #[[ATTR4:[0-9]+]]
+// CHECK-NEXT: [[CALL1:%.*]] = call i32 @_Z3barv() #[[ATTR4]]
+// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[CALL]], [[CALL1]]
+// CHECK-NEXT: store i32 [[ADD]], i32* [[TMP0]], align 4
+// CHECK-NEXT: ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@_Z3fooRi
+// CHECK-SAME: (i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR2:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8
+// CHECK-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
+// CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
+// CHECK-NEXT: ret i32 [[TMP1]]
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@_Z3barv
+// CHECK-SAME: () #[[ATTR2]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[A:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
+// CHECK-NEXT: [[A_ON_STACK:%.*]] = bitcast i8* [[A]] to i32*
+// CHECK-NEXT: [[CALL:%.*]] = call i32 @_Z3fooRi(i32* nonnull align 4 dereferenceable(4) [[A_ON_STACK]]) #[[ATTR4]]
+// CHECK-NEXT: call void @__kmpc_free_shared(i8* [[A]])
+// CHECK-NEXT: ret i32 [[CALL]]
+//
diff --git a/clang/test/OpenMP/nvptx_data_sharing.cpp b/clang/test/OpenMP/nvptx_data_sharing.cpp
--- a/clang/test/OpenMP/nvptx_data_sharing.cpp
+++ b/clang/test/OpenMP/nvptx_data_sharing.cpp
@@ -1,9 +1,9 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-function-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+_(.*)_l[0-9]+"
 // Test device global memory data sharing codegen.
 ///==========================================================================///

 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CK1 --check-prefix SEQ
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CK1 --check-prefix PAR
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s
 // expected-no-diagnostics
@@ -27,93 +27,75 @@
 }
 }
 }
-// SEQ: [[MEM_TY:%.+]] = type { [128 x i8] }
-// SEQ-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] undef
-// SEQ-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* undef
-// SEQ-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i64 8
-// SEQ-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1
-
-/// ========= In the worker function ========= ///
-// CK1: {{.*}}define internal void @__omp_offloading{{.*}}test_ds{{.*}}_worker()
-// CK1: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
-// CK1-NOT: call void @__kmpc_data_sharing_init_stack
-
-/// ========= In the kernel function ========= ///
-
-// CK1: {{.*}}define weak void @__omp_offloading{{.*}}test_ds{{.*}}()
-// CK1: [[SHAREDARGS1:%.+]] = alloca i8**
-// CK1: [[SHAREDARGS2:%.+]] = alloca i8**
-// CK1: call void @__kmpc_kernel_init
-// CK1: call void @__kmpc_data_sharing_init_stack
-// SEQ: [[SHARED_MEM_FLAG:%.+]] = load i16, i16* [[KERNEL_SHARED]],
-// SEQ: [[SIZE:%.+]] = load i64, i64* [[KERNEL_SIZE]],
-// SEQ: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i64 [[SIZE]], i16 [[SHARED_MEM_FLAG]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
-// SEQ: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]],
-// SEQ: [[GLOBALSTACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i64 0
-// PAR: [[GLOBALSTACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} 8, i16 1)
-// CK1: [[GLOBALSTACK2:%.+]] = bitcast i8* [[GLOBALSTACK]] to %struct._globalized_locals_ty*
-// CK1: [[A:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[GLOBALSTACK2]], i32 0, i32 0
-// CK1: [[B:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[GLOBALSTACK2]], i32 0, i32 1
-// CK1: store i32 10, i32* [[A]]
-// CK1: call void @__kmpc_kernel_prepare_parallel({{.*}})
-// CK1: call void @__kmpc_begin_sharing_variables(i8*** [[SHAREDARGS1]], i64 1)
-// CK1: [[SHARGSTMP1:%.+]] = load i8**, i8*** [[SHAREDARGS1]]
-// CK1: [[SHARGSTMP2:%.+]] = getelementptr inbounds i8*, i8** [[SHARGSTMP1]], i64 0
-// CK1: [[SHAREDVAR:%.+]] = bitcast i32* [[A]] to i8*
-// CK1: store i8* [[SHAREDVAR]], i8** [[SHARGSTMP2]]
-// CK1: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
-// CK1: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
-// CK1: call void @__kmpc_end_sharing_variables()
-// CK1: store i32 100, i32* [[B]]
-// CK1: call void @__kmpc_kernel_prepare_parallel({{.*}})
-// CK1: call void @__kmpc_begin_sharing_variables(i8*** [[SHAREDARGS2]], i64 2)
-// CK1: [[SHARGSTMP3:%.+]] = load i8**, i8*** [[SHAREDARGS2]]
-// CK1: [[SHARGSTMP4:%.+]] = getelementptr inbounds i8*, i8** [[SHARGSTMP3]], i64 0
-// CK1: [[SHAREDVAR1:%.+]] = bitcast i32* [[B]] to i8*
-// CK1: store i8* [[SHAREDVAR1]], i8** [[SHARGSTMP4]]
-// CK1: [[SHARGSTMP12:%.+]] = getelementptr inbounds i8*, i8** [[SHARGSTMP3]], i64 1
-// CK1: [[SHAREDVAR2:%.+]] = bitcast i32* [[A]] to i8*
-// CK1: store i8* [[SHAREDVAR2]], i8** [[SHARGSTMP12]]
-// CK1: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
-// CK1: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
-// CK1: call void @__kmpc_end_sharing_variables()
-// SEQ: [[SHARED_MEM_FLAG:%.+]] = load i16, i16* [[KERNEL_SHARED]],
-// SEQ: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[SHARED_MEM_FLAG]])
-// PAR: call void @__kmpc_data_sharing_pop_stack(i8* [[GLOBALSTACK]])
-// CK1: call void @__kmpc_kernel_deinit(i16 1)
-
-/// ========= In the data sharing wrapper function ========= ///
-
-// CK1: {{.*}}define internal void @__omp_outlined{{.*}}wrapper({{.*}})
-// CK1: [[SHAREDARGS4:%.+]] = alloca i8**
-// CK1: call void @__kmpc_get_shared_variables(i8*** [[SHAREDARGS4]])
-// CK1: [[SHARGSTMP13:%.+]] = load i8**, i8*** [[SHAREDARGS4]]
-// CK1: [[SHARGSTMP14:%.+]] = getelementptr inbounds i8*, i8** [[SHARGSTMP13]], i64 0
-// CK1: [[SHARGSTMP15:%.+]] = bitcast i8** [[SHARGSTMP14]] to i32**
-// CK1: [[SHARGSTMP16:%.+]] = load i32*, i32** [[SHARGSTMP15]]
-// CK1: call void @__omp_outlined__{{.*}}({{.*}}, i32* [[SHARGSTMP16]])
-
-/// outlined function for the second parallel region ///
-
-// CK1: define internal void @{{.+}}(i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* nonnull align {{[0-9]+}} dereferenceable{{.+}}, i32* nonnull align {{[0-9]+}} dereferenceable{{.+}})
-// CK1-NOT: call i8* @__kmpc_data_sharing_push_stack(
-// CK1: [[C_ADDR:%.+]] = alloca i32,
-// CK1: store i32* [[C_ADDR]], i32** %
-// CK1i-NOT: call void @__kmpc_data_sharing_pop_stack(
-
-/// ========= In the data sharing wrapper function ========= ///
-
-// CK1: {{.*}}define internal void @__omp_outlined{{.*}}wrapper({{.*}})
-// CK1: [[SHAREDARGS3:%.+]] = alloca i8**
-// CK1: call void @__kmpc_get_shared_variables(i8*** [[SHAREDARGS3]])
-// CK1: [[SHARGSTMP5:%.+]] = load i8**, i8*** [[SHAREDARGS3]]
-// CK1: [[SHARGSTMP6:%.+]] = getelementptr inbounds i8*, i8** [[SHARGSTMP5]], i64 0
-// CK1: [[SHARGSTMP7:%.+]] = bitcast i8** [[SHARGSTMP6]] to i32**
-// CK1: [[SHARGSTMP8:%.+]] = load i32*, i32** [[SHARGSTMP7]]
-// CK1: [[SHARGSTMP9:%.+]] = getelementptr inbounds i8*, i8** [[SHARGSTMP5]], i64 1
-// CK1: [[SHARGSTMP10:%.+]] = bitcast i8** [[SHARGSTMP9]] to i32**
-// CK1: [[SHARGSTMP11:%.+]] = load i32*, i32** [[SHARGSTMP10]]
-// CK1: call void @__omp_outlined__{{.*}}({{.*}}, i32* [[SHARGSTMP8]], i32* [[SHARGSTMP11]])
+// CHECK: entry:
+// CHECK: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
+// CHECK: [[SHARED_ARG_REFS:%.*]] = alloca i8**, align 8
+// CHECK: [[C:%.*]] = alloca i32, align 4
+// CHECK: [[DOTZERO_ADDR7:%.*]] = alloca i32, align 4
+// CHECK: [[SHARED_ARG_REFS8:%.*]] = alloca i8**, align 8
+// CHECK: store i32 0, i32* [[DOTZERO_ADDR7]], align 4
+// CHECK: store i32 0, i32* [[DOTZERO_ADDR]], align 4
+// CHECK: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
+// CHECK: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
+// CHECK: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
+// CHECK: .worker:
+// CHECK: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z7test_dsv_l[0-9]+}}_worker() #[[ATTR4:[0-9]+]]
+// CHECK: br label [[DOTEXIT:%.*]]
+// CHECK: .mastercheck:
+// CHECK: [[NVPTX_NUM_THREADS1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK: [[NVPTX_WARP_SIZE2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE2]], 1
+// CHECK: [[TMP2:%.*]] = xor i32 [[TMP1]], -1
+// CHECK: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS1]], 1
+// CHECK: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP2]]
+// CHECK: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID3]], [[MASTER_TID]]
+// CHECK: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
+// CHECK: .master:
+// CHECK: [[NVPTX_WARP_SIZE4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+// CHECK: [[NVPTX_NUM_THREADS5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+// CHECK: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE4]]
+// CHECK: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
+// CHECK: [[A:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
+// CHECK: [[A_ON_STACK:%.*]] = bitcast i8* [[A]] to i32*
+// CHECK: [[B:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
+// CHECK: [[B_ON_STACK:%.*]] = bitcast i8* [[B]] to i32*
+// CHECK: store i32 10, i32* [[A_ON_STACK]], align 4
+// CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*))
+// CHECK: call void @__kmpc_begin_sharing_variables(i8*** [[SHARED_ARG_REFS]], i64 1)
+// CHECK: [[TMP5:%.*]] = load i8**, i8*** [[SHARED_ARG_REFS]], align 8
+// CHECK: [[TMP6:%.*]] = getelementptr inbounds i8*, i8** [[TMP5]], i64 0
+// CHECK: [[TMP7:%.*]] = bitcast i32* [[A_ON_STACK]] to i8*
+// CHECK: store i8* [[TMP7]], i8** [[TMP6]], align 8
+// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK: call void @__kmpc_end_sharing_variables()
+// CHECK: store i32 100, i32* [[B_ON_STACK]], align 4
+// CHECK: store i32 1000, i32* [[C]], align 4
+// CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*))
+// CHECK: call void @__kmpc_begin_sharing_variables(i8*** [[SHARED_ARG_REFS8]], i64 2)
+// CHECK: [[TMP8:%.*]] = load i8**, i8*** [[SHARED_ARG_REFS8]], align 8
+// CHECK: [[TMP9:%.*]] = getelementptr inbounds i8*, i8** [[TMP8]], i64 0
+// CHECK: [[TMP10:%.*]] = bitcast i32* [[B_ON_STACK]] to i8*
+// CHECK: store i8* [[TMP10]], i8** [[TMP9]], align 8
+// CHECK: [[TMP11:%.*]] = getelementptr inbounds i8*, i8** [[TMP8]], i64 1
+// CHECK: [[TMP12:%.*]] = bitcast i32* [[A_ON_STACK]] to i8*
+// CHECK: store i8* [[TMP12]], i8** [[TMP11]], align 8
+// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK: call void @__kmpc_end_sharing_variables()
+// CHECK: call void @__kmpc_free_shared(i8* [[B]])
+// CHECK: call void @__kmpc_free_shared(i8* [[A]])
+// CHECK: br label [[DOTTERMINATION_NOTIFIER:%.*]]
+// CHECK: .termination.notifier:
+// CHECK: call void @__kmpc_kernel_deinit(i16 1)
+// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+// CHECK: br label [[DOTEXIT]]
+// CHECK: .exit:
+// CHECK: ret void

 #endif
-
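Taken together, the updated checks pin down the intended lowering: one __kmpc_alloc_shared per escaped variable in the target prolog, uses through the *_on_stack bitcast, and matching __kmpc_free_shared calls in reverse order in the epilog. A hypothetical standalone reduction of the pattern this test exercises (modeled on the test, not the test source itself):

void test_ds_sketch() {
#pragma omp target
  {
    int a = 10;
#pragma omp parallel
    a += 1;       // 'a' escapes: backed by @__kmpc_alloc_shared(i64 4)
    int b = 100;
    int c = 1000; // 'c' never escapes: remains an ordinary alloca
#pragma omp parallel
    b += a;       // 'b' escapes too: a second @__kmpc_alloc_shared(i64 4)
    c += b;
  } // epilog frees 'b' first, then 'a' (reverse allocation order)
}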
diff --git a/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp b/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp
--- a/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp
@@ -1,21 +1,15 @@
 // Test target codegen - host bc file has to be created first.
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix SEQ
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix PAR
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK
 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR
-// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix SEQ
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix PAR
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK
 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ
-// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR
-// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK
+// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK
 // expected-no-diagnostics
 #ifndef HEADER
@@ -34,36 +28,17 @@
   return 0;
 }

-// SEQ: [[MEM_TY:%.+]] = type { [128 x i8] }
-// SEQ-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] undef
-// SEQ-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* undef
-// SEQ-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 40
-// SEQ-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1
-// CHECK-DAG: @__omp_offloading_{{.*}}_main_[[LINE:l.+]]_exec_mode = weak constant i8 0
+// CHECK: @__omp_offloading_{{.*}}_main_[[LINE:l.+]]_exec_mode = weak constant i8 0

 // CHECK: define weak void @__omp_offloading_{{.*}}_main_[[LINE]]([10 x i32]* nonnull align 4 dereferenceable(40) %{{.+}}, [10 x i32]* nonnull align 4 dereferenceable(40) %{{.+}}, i32* nonnull align 4 dereferenceable(4) %{{.+}}, i{{64|32}} %{{.+}}, [10 x i32]* nonnull align 4 dereferenceable(40) %{{.+}})
-// SEQ: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]],
-// SEQ: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]],
-// SEQ: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
-// SEQ: [[PTR:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]],
-// SEQ: [[GEP:%.+]] = getelementptr inbounds i8, i8* [[PTR]], i{{64|32}} 0
-// PAR: [[GEP:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} 40, i16 1)
-// CHECK: [[STACK:%.+]] = bitcast i8* [[GEP]] to %struct._globalized_locals_ty*
-// CHECK: getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[STACK]], i{{32|64}} 0, i{{32|64}} 0
-// CHECK-NOT: getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[STACK]],
+// CHECK: [[CVOIDPTR:%.+]] = call i8* @__kmpc_alloc_shared(i{{32|64}} 40)
+// CHECK: [[CSTACK:%.+]] = bitcast i8* [[CVOIDPTR]] to [10 x i{{32|64}}]*
 // CHECK: call void @__kmpc_for_static_init_4(
 // CHECK: call void [[PARALLEL:@.+]](
 // CHECK: call void @__kmpc_for_static_fini(%struct.ident_t* @
-// SEQ: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]],
-// SEQ: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[SHARED]])
-// PAR: call void @__kmpc_data_sharing_pop_stack(i8* [[GEP]])
-
-// CHECK: define internal void [[PARALLEL]](
-// CHECK-NOT: call i8* @__kmpc_data_sharing_push_stack(
-
-// CHECK-NOT: call void @__kmpc_data_sharing_pop_stack(
+// CHECK: call void @__kmpc_free_shared(i8* [[CVOIDPTR]])

 #endif
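The 40-byte allocation checked above comes straight from getTypeSize on the [10 x i32] array; for true variable-length objects the prolog first rounds the dynamic size up to the declared alignment (the CreateNUWAdd/CreateUDiv/CreateNUWMul sequence in the CGOpenMPRuntimeGPU.cpp hunk above). A worked sketch of that arithmetic with illustrative values:

#include <cassert>
#include <cstdint>

// Same shape as the emitted IR: add Align-1, divide, re-multiply.
std::uint64_t alignUpSketch(std::uint64_t Size, std::uint64_t Align) {
  Size += Align - 1;   // Bld.CreateNUWAdd(Size, Align - 1)
  Size /= Align;       // Bld.CreateUDiv(Size, AlignVal)
  return Size * Align; // Bld.CreateNUWMul(Size, AlignVal)
}

int main() {
  assert(alignUpSketch(40, 8) == 40); // already aligned: unchanged
  assert(alignUpSketch(10, 8) == 16); // rounded up to the next multiple of 8
  return 0;
}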
// CLASS: [[L:%.+]] = load [[CAP1]]*, [[CAP1]]** [[L_ADDR:%.+]], @@ -72,15 +71,14 @@ } } s; -// FUN: define internal void @__omp_offloading_{{.+}}_main_l124_worker() -// FUN: define weak void @__omp_offloading_{{.+}}_main_l124(i64 %{{.+}}, i32* nonnull align 4 dereferenceable(4) %{{.+}}, i32* nonnull align 4 dereferenceable(4) %{{.+}}, i32* %{{.+}}, i32* nonnull align 4 dereferenceable(4) %{{.+}}, [[CAP2]]* nonnull align 8 dereferenceable(40) %{{.+}}) +// FUN: define internal void @__omp_offloading_{{.+}}_main_{{.*}}_worker() +// FUN: define weak void @__omp_offloading_{{.+}}_main_{{.*}}(i64 %{{.+}}, i32* nonnull align 4 dereferenceable(4) %{{.+}}, i32* nonnull align 4 dereferenceable(4) %{{.+}}, i32* %{{.+}}, i32* nonnull align 4 dereferenceable(4) %{{.+}}, [[CAP2]]* nonnull align 8 dereferenceable(40) %{{.+}}) // FUN-NOT: getelementptr // FUN: br i1 % -// FUN: call void @__omp_offloading_{{.*}}_{{.*}}main{{.*}}_l124_worker() +// FUN: call void @__omp_offloading_{{.*}}_{{.*}}main{{.*}}_{{.*}}_worker() // FUN: br label % // FUN: br i1 % // FUN: call void @__kmpc_kernel_init( -// FUN: call void @__kmpc_data_sharing_init_stack() // FUN: call void @llvm.memcpy. // FUN: [[L:%.+]] = load [[CAP2]]*, [[CAP2]]** [[L_ADDR:%.+]], // FUN: [[ARGC_CAP:%.+]] = getelementptr inbounds [[CAP2]], [[CAP2]]* [[L]], i32 0, i32 0 @@ -97,7 +95,7 @@ // FUN: call i64 [[LAMBDA2:@.+main.+]]([[CAP2]]* {{[^,]*}} [[L]]) // FUN: ret void -// FUN: define weak void @__omp_offloading_{{.+}}_main_l126(i32* nonnull align 4 dereferenceable(4) %{{.+}}, i32* nonnull align 4 dereferenceable(4) %{{.+}} i32* nonnull align 4 dereferenceable(4) %{{.+}}, i32* %{{.+}}, i32* nonnull align 4 dereferenceable(4) %{{.+}}, [[CAP2]]* nonnull align 8 dereferenceable(40) %{{.+}}) +// FUN: define weak void @__omp_offloading_{{.+}}_main_{{.*}}(i32* nonnull align 4 dereferenceable(4) %{{.+}}, i32* nonnull align 4 dereferenceable(4) %{{.+}} i32* nonnull align 4 dereferenceable(4) %{{.+}}, i32* %{{.+}}, i32* nonnull align 4 dereferenceable(4) %{{.+}}, [[CAP2]]* nonnull align 8 dereferenceable(40) %{{.+}}) // FUN-NOT: getelementptr // FUN: call void @llvm.memcpy. // FUN: [[L:%.+]] = load [[CAP2]]*, [[CAP2]]** [[L_ADDR:%.+]], diff --git a/clang/test/OpenMP/nvptx_parallel_codegen.cpp b/clang/test/OpenMP/nvptx_parallel_codegen.cpp --- a/clang/test/OpenMP/nvptx_parallel_codegen.cpp +++ b/clang/test/OpenMP/nvptx_parallel_codegen.cpp @@ -1,12 +1,9 @@ // Test target codegen - host bc file has to be created first. 
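Every device-side hunk in this patch performs the same substitution: the record-based globalization (a runtime-provided buffer bitcast to %struct._globalized_locals_ty plus one getelementptr per escaped variable) becomes one paired allocation per escaped variable. A C-like sketch of the sequence the new CHECK lines match, with the runtime prototypes transcribed from the call sites (illustrative only; the tests themselves match the emitted LLVM IR):

extern "C" void *__kmpc_alloc_shared(unsigned long Size);
extern "C" void __kmpc_free_shared(void *Ptr);

// One escaped 4-byte local, in the order the updated checks expect:
void globalize(int a) {
  void *ptr = __kmpc_alloc_shared(4);  // replaces __kmpc_get_team_static_memory
                                       // and __kmpc_data_sharing_push_stack
  int *a_on_stack = (int *)ptr;        // plain bitcast; the per-variable GEP into
                                       // %struct._globalized_locals_ty is gone
  *a_on_stack = a;                     // seed with the incoming value
  /* ... region body reads and writes a_on_stack ... */
  __kmpc_free_shared(ptr);             // replaces __kmpc_restore_team_static_memory
                                       // and __kmpc_data_sharing_pop_stack
}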
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR -// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 // expected-no-diagnostics #ifndef HEADER #define HEADER @@ -75,15 +72,9 @@ return a; } -// SEQ: [[MEM_TY:%.+]] = type { [128 x i8] } -// SEQ-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] undef -// SEQ-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* undef -// SEQ-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4 -// SEQ-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1 +// CHECK-NOT: define {{.*}}void 
{{@__omp_offloading_.+template.+l17}}_worker() -// CHECK-NOT: define {{.*}}void {{@__omp_offloading_.+template.+l20}}_worker() - -// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l29}}_worker() +// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l26}}_worker() // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8, // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*, // CHECK: store i8* null, i8** [[OMP_WORK_FN]], @@ -136,7 +127,7 @@ // CHECK: [[EXIT]] // CHECK: ret void -// CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l29]](i[[SZ:32|64]] +// CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l26]](i[[SZ:32|64]] // Create local storage for each capture. // CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]], // CHECK-DAG: store i[[SZ]] [[ARG_A:%.+]], i[[SZ]]* [[LOCAL_A]] @@ -202,7 +193,7 @@ // CHECK: store i[[SZ]] 44, i[[SZ]]* %a, // CHECK: ret void -// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l46}}_worker() +// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l43}}_worker() // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8, // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*, // CHECK: store i8* null, i8** [[OMP_WORK_FN]], @@ -246,7 +237,7 @@ // CHECK: [[EXIT]] // CHECK: ret void -// CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l46]](i[[SZ:32|64]] +// CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l43]](i[[SZ:32|64]] // Create local storage for each capture. // CHECK: [[LOCAL_N:%.+]] = alloca i[[SZ]], // CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]], @@ -326,25 +317,17 @@ // CHECK: declare void @__kmpc_barrier(%struct.ident_t*, i32) #[[#CONVERGENT:]] -// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l58}}_worker() -// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l58}}( +// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l55}}_worker() +// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l55}}( // CHECK-32: [[A_ADDR:%.+]] = alloca i32, // CHECK-64: [[A_ADDR:%.+]] = alloca i64, // CHECK-64: [[CONV:%.+]] = bitcast i64* [[A_ADDR]] to i32* -// SEQ: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], -// SEQ: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]], -// SEQ: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[IS_SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) -// SEQ: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], -// SEQ: [[STACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i{{64|32}} 0 -// PAR: [[STACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} 4, i16 1) -// CHECK: [[BC:%.+]] = bitcast i8* [[STACK]] to %struct._globalized_locals_ty* // CHECK-32: [[A:%.+]] = load i32, i32* [[A_ADDR]], // CHECK-64: [[A:%.+]] = load i32, i32* [[CONV]], -// CHECK: [[GLOBAL_A_ADDR:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[BC]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 -// CHECK: store i32 [[A]], i32* [[GLOBAL_A_ADDR]], -// SEQ: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], -// SEQ: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[IS_SHARED]]) -// PAR: call void @__kmpc_data_sharing_pop_stack(i8* [[STACK]]) +// CHECK: [[STACK:%.+]] = call i8* @__kmpc_alloc_shared(i{{32|64}} 4) +// CHECK: [[BC:%.+]] = bitcast i8* [[STACK]] to i32* +// CHECK: store i32 
[[A]], i32* [[BC]], +// CHECK: call void @__kmpc_free_shared(i8* [[STACK]]) // CHECK-LABEL: define internal void @{{.+}}(i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* nonnull align {{[0-9]+}} dereferenceable{{.*}}) // CHECK: [[CC:%.+]] = alloca i32, diff --git a/clang/test/OpenMP/nvptx_parallel_for_codegen.cpp b/clang/test/OpenMP/nvptx_parallel_for_codegen.cpp --- a/clang/test/OpenMP/nvptx_parallel_for_codegen.cpp +++ b/clang/test/OpenMP/nvptx_parallel_for_codegen.cpp @@ -1,7 +1,6 @@ // Test target codegen - host bc file has to be created first. // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK // expected-no-diagnostics #ifndef HEADER #define HEADER @@ -31,35 +30,21 @@ return a; } -// SEQ: [[MEM_TY:%.+]] = type { [128 x i8] } -// SEQ-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] undef -// SEQ-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* undef -// SEQ-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4 -// SEQ-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1 - -// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l13}}_worker() +// CHECK-LABEL: define {{.*}} void @__omp_offloading_{{.*}}_worker() // CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) // CHECK: call i1 @__kmpc_kernel_parallel( // CHECK: call void @__omp_outlined___wrapper( -// CHECK: define weak void @__omp_offloading_{{.*}}l13( -// CHECK: call void @__omp_offloading_{{.*}}l13_worker() +// CHECK: define weak void @__omp_offloading_{{.*}}( +// CHECK: call void @__omp_offloading_{{.*}}_worker() // CHECK: call void @__kmpc_kernel_init( -// CHECK: call void @__kmpc_data_sharing_init_stack() -// SEQ: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], -// SEQ: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]], -// SEQ: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i64 %7, i16 %6, i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) -// SEQ: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], -// SEQ: [[STACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i{{64|32}} 0 -// PAR: [[STACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} 4, i16 1) +// CHECK: [[STACK:%.+]] = call i8* @__kmpc_alloc_shared(i{{32|64}} 4) // CHECK: call void @__kmpc_kernel_prepare_parallel( // CHECK: call void @__kmpc_begin_sharing_variables({{.*}}, i64 2) // CHECK: call void 
@__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) // CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) // CHECK: call void @__kmpc_end_sharing_variables() -// SEQ: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], -// SEQ: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[IS_SHARED]]) -// PAR: call void @__kmpc_data_sharing_pop_stack(i8* [[STACK]]) +// CHECK: call void @__kmpc_free_shared(i8* [[STACK]]) // CHECK: call void @__kmpc_kernel_deinit(i16 1) // CHECK: define internal void @__omp_outlined__( diff --git a/clang/test/OpenMP/nvptx_target_codegen.cpp b/clang/test/OpenMP/nvptx_target_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_codegen.cpp @@ -36,7 +36,7 @@ // CHECK: store i32** [[PTR2_REF]], i32*** [[PTR2_REF_PTR:%.+]], // CHECK: [[PTR2_REF:%.+]] = load i32**, i32*** [[PTR2_REF_PTR]], // CHECK: call void @__kmpc_spmd_kernel_init( -// CHECK: call void @__kmpc_data_sharing_init_stack_spmd() +// // CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @{{.+}}) // CHECK: store i32 [[GTID]], i32* [[THREADID:%.+]], // CHECK: call void @{{.+}}(i32* [[THREADID]], i32* %{{.+}}, i32** [[PTR1_ADDR]], i32** [[PTR2_REF]]) @@ -571,66 +571,38 @@ // CHECK: ret void // CHECK: define{{ hidden | }}i32 [[BAZ]](i32 [[F:%.*]], double* nonnull align {{[0-9]+}} dereferenceable{{.*}}) -// CHECK: alloca i32, -// CHECK: [[LOCAL_F_PTR:%.+]] = alloca i32, -// CHECK: [[ZERO_ADDR:%.+]] = alloca i32, -// CHECK: [[BND_ZERO_ADDR:%.+]] = alloca i32, -// CHECK: store i32 0, i32* [[BND_ZERO_ADDR]] -// CHECK: store i32 0, i32* [[ZERO_ADDR]] -// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* [[UNKNOWN]] -// CHECK: [[PAR_LEVEL:%.+]] = call i16 @__kmpc_parallel_level(%struct.ident_t* [[UNKNOWN]], i32 [[GTID]]) -// CHECK: [[IS_TTD:%.+]] = icmp eq i16 %1, 0 -// CHECK: [[RES:%.+]] = call i8 @__kmpc_is_spmd_exec_mode() -// CHECK: [[IS_SPMD:%.+]] = icmp ne i8 [[RES]], 0 -// CHECK: br i1 [[IS_SPMD]], label -// CHECK: br label -// CHECK: [[SIZE:%.+]] = select i1 [[IS_TTD]], i{{64|32}} 4, i{{64|32}} 128 -// CHECK: [[PTR:%.+]] = call i8* @__kmpc_data_sharing_coalesced_push_stack(i{{64|32}} [[SIZE]], i16 0) -// CHECK: [[REC_ADDR:%.+]] = bitcast i8* [[PTR]] to [[GLOBAL_ST:%.+]]* -// CHECK: br label -// CHECK: [[ITEMS:%.+]] = phi [[GLOBAL_ST]]* [ null, {{.+}} ], [ [[REC_ADDR]], {{.+}} ] -// CHECK: [[TTD_ITEMS:%.+]] = bitcast [[GLOBAL_ST]]* [[ITEMS]] to [[SEC_GLOBAL_ST:%.+]]* -// CHECK: [[F_PTR_ARR:%.+]] = getelementptr inbounds [[GLOBAL_ST]], [[GLOBAL_ST]]* [[ITEMS]], i32 0, i32 0 -// CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK: [[LID:%.+]] = and i32 [[TID]], 31 -// CHECK: [[GLOBAL_F_PTR_PAR:%.+]] = getelementptr inbounds [32 x i32], [32 x i32]* [[F_PTR_ARR]], i32 0, i32 [[LID]] -// CHECK: [[GLOBAL_F_PTR_TTD:%.+]] = getelementptr inbounds [[SEC_GLOBAL_ST]], [[SEC_GLOBAL_ST]]* [[TTD_ITEMS]], i32 0, i32 0 -// CHECK: [[GLOBAL_F_PTR:%.+]] = select i1 [[IS_TTD]], i32* [[GLOBAL_F_PTR_TTD]], i32* [[GLOBAL_F_PTR_PAR]] -// CHECK: [[F_PTR:%.+]] = select i1 [[IS_SPMD]], i32* [[LOCAL_F_PTR]], i32* [[GLOBAL_F_PTR]] -// CHECK: store i32 %{{.+}}, i32* [[F_PTR]], - -// CHECK: [[RES:%.+]] = call i8 @__kmpc_is_spmd_exec_mode() -// CHECK: icmp ne i8 [[RES]], 0 -// CHECK: br i1 - -// CHECK: [[RES:%.+]] = call i16 @__kmpc_parallel_level(%struct.ident_t* [[UNKNOWN]], i32 [[GTID]]) -// CHECK: icmp ne i16 [[RES]], 0 -// CHECK: br i1 - -// CHECK: call void 
@__kmpc_serialized_parallel(%struct.ident_t* [[UNKNOWN]], i32 [[GTID]]) -// CHECK: call void [[OUTLINED:@.+]](i32* [[ZERO_ADDR]], i32* [[BND_ZERO_ADDR]], i32* [[F_PTR]], double* %{{.+}}) -// CHECK: call void @__kmpc_end_serialized_parallel(%struct.ident_t* [[UNKNOWN]], i32 [[GTID]]) -// CHECK: br label - -// CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* @{{.+}} to i8*)) -// CHECK: call void @__kmpc_begin_sharing_variables(i8*** [[SHARED_PTR:%.+]], i{{64|32}} 2) -// CHECK: [[SHARED:%.+]] = load i8**, i8*** [[SHARED_PTR]], -// CHECK: [[REF:%.+]] = getelementptr inbounds i8*, i8** [[SHARED]], i{{64|32}} 0 -// CHECK: [[F_REF:%.+]] = bitcast i32* [[F_PTR]] to i8* -// CHECK: store i8* [[F_REF]], i8** [[REF]], -// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) -// CHECK: call void @__kmpc_end_sharing_variables() -// CHECK: br label - -// CHECK: [[RES:%.+]] = load i32, i32* [[F_PTR]], -// CHECK: store i32 [[RES]], i32* [[RET:%.+]], -// CHECK: br i1 [[IS_SPMD]], label -// CHECK: [[BC:%.+]] = bitcast [[GLOBAL_ST]]* [[ITEMS]] to i8* -// CHECK: call void @__kmpc_data_sharing_pop_stack(i8* [[BC]]) -// CHECK: br label -// CHECK: [[RES:%.+]] = load i32, i32* [[RET]], -// CHECK: ret i32 [[RES]] +// CHECK: [[ZERO_ADDR:%.+]] = alloca i32 +// CHECK: [[BND_ZERO_ADDR:%.+]] = alloca i32 +// CHECK: [[SHARED_ARG:%.+]] = alloca i8** +// CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num({{.*}}) +// CHECK: [[F_PTR:%.+]] = call i8* @__kmpc_alloc_shared(i{{32|64}} 4) +// CHECK: [[F_ON_STACK:%.+]] = bitcast i8* [[F_PTR]] to i32* +// CHECK: store i32 [[F]], i32* [[F_ON_STACK]] +// CHECK: [[SPMD_EXEC:%.+]] = call i8 @__kmpc_is_spmd_exec_mode() +// CHECK: br i1 %{{.+}} label [[SEQ:%.+]], label [[PARCHECK:%.+]] +// +// CHECK: [[PAR_LEVEL:%.+]] = call i16 @__kmpc_parallel_level({{.*}}) +// CHECK: br i1 %{{.+}} label [[SEQ:%.+]], label [[MASTER:%.+]] +// +// CHECK: call void @__kmpc_serialized_parallel({{.*}}) +// CHECK: call void @__omp_outlined__1(i32* [[ZERO_ADDR]], i32* [[BND_ZERO_ADDR]], i32* [[F_ON_STACK]], double* %{{.+}}) +// CHECK: call void @__kmpc_end_serialized_parallel({{.*}}) +// CHECK: br label +// +// CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* @{{.+}} to i8*)) +// CHECK: call void @__kmpc_begin_sharing_variables(i8*** [[SHARED_ARG]], i{{32|64}} 2) +// CHECK: [[SHARED:%.+]] = load i8**, i8*** %shared_arg_refs +// CHECK: [[REF:%.+]] = getelementptr inbounds i8*, i8** [[SHARED]], i{{32|64}} 0 +// CHECK: [[F_REF:%.+]] = bitcast i32* %f_on_stack to i8* +// CHECK: store i8* [[F_REF]], i8** [[REF]] +// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +// CHECK: call void @__kmpc_end_sharing_variables() +// CHECK: br label +// +// CHECK: [[RET:%.+]] = load i32, i32* [[F_ON_STACK]] +// CHECK: call void @__kmpc_free_shared(i8* [[F_PTR]]) +// CHECK: ret i32 [[RET]] // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l331}}_worker() // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8, diff --git a/clang/test/OpenMP/nvptx_target_parallel_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_parallel_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_parallel_codegen.cpp @@ -57,12 +57,11 @@ // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l33}}( // CHECK: 
[[AA_ADDR:%.+]] = alloca i16*, align -// CHECK-NOT: call i8* @__kmpc_data_sharing_push_stack +// CHECK-NOT: call i8* @__kmpc_alloc_shared // CHECK: store i16* {{%.+}}, i16** [[AA_ADDR]], align // CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align // CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 1) -// CHECK: call void @__kmpc_data_sharing_init_stack_spmd // CHECK: br label {{%?}}[[EXEC:.+]] // // CHECK: [[EXEC]] @@ -100,7 +99,6 @@ // CHECK: [[B:%.+]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align // CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 1) -// CHECK: call void @__kmpc_data_sharing_init_stack_spmd // CHECK: br label {{%?}}[[EXEC:.+]] // // CHECK: [[EXEC]] diff --git a/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp @@ -54,7 +54,6 @@ // CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align // CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 1) -// CHECK: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @{{.+}}) // CHECK: store i32 [[GTID]], i32* [[THREADID:%.+]], // CHECK: call void [[OUTLINED:@.+]](i32* [[THREADID]], i32* %{{.+}}, i16* [[AA]]) @@ -85,7 +84,6 @@ // CHECK: [[B:%.+]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align // CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 1) -// CHECK: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK: [[GTID:%.+]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @{{.+}}) // CHECK: store i32 [[GTID]], i32* [[THREADID:%.+]], // CHECK: call void [[OUTLINED:@.+]](i32* [[THREADID]], i32* %{{.+}}, i32* [[A]], i16* [[AA]], [10 x i32]* [[B]]) diff --git a/clang/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_parallel_proc_bind_codegen.cpp @@ -55,7 +55,6 @@ // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l29}}( // CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1) -// CHECK: call void @__kmpc_data_sharing_init_stack_spmd // CHECK: br label {{%?}}[[EXEC:.+]] // // CHECK: [[EXEC]] @@ -73,7 +72,6 @@ // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l33}}( // CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1) -// CHECK: call void @__kmpc_data_sharing_init_stack_spmd // CHECK: br label {{%?}}[[EXEC:.+]] // // CHECK: [[EXEC]] @@ -91,7 +89,6 @@ // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l38}}( // CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1) -// CHECK: call void @__kmpc_data_sharing_init_stack_spmd // CHECK: br label {{%?}}[[EXEC:.+]] // // CHECK: [[EXEC]] diff --git a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp +++ 
b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp @@ -55,7 +55,6 @@ // CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l27}}( // // CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1) -// CHECK: call void @__kmpc_data_sharing_init_stack_spmd // CHECK: br label {{%?}}[[EXECUTE:.+]] // // CHECK: [[EXECUTE]] @@ -239,7 +238,6 @@ // CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l32}}( // // CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1) -// CHECK: call void @__kmpc_data_sharing_init_stack_spmd // CHECK: br label {{%?}}[[EXECUTE:.+]] // // CHECK: [[EXECUTE]] @@ -501,7 +499,6 @@ // CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l38}}( // // CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1) -// CHECK: call void @__kmpc_data_sharing_init_stack_spmd // CHECK: br label {{%?}}[[EXECUTE:.+]] // // CHECK: [[EXECUTE]] diff --git a/clang/test/OpenMP/nvptx_target_teams_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_teams_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_teams_codegen.cpp @@ -233,12 +233,11 @@ // CHECK: define weak void @__omp_offloading_{{.*}}ftemplate{{.*}}_l37( // CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1) -// CHECK: call void @__kmpc_data_sharing_init_stack_spmd -// CHECK-NOT: call i8* @__kmpc_data_sharing_push_stack( +// CHECK-NOT: call i8* @__kmpc_alloc_shared( // CHECK-NOT: call void @__kmpc_serialized_parallel( // CHECK: call void [[L0:@.+]](i32* %{{.+}}, i32* %{{.+}}, i[[SZ]] %{{.+}}) // CHECK-NOT: call void @__kmpc_end_serialized_parallel( -// CHECK-NOT: call void @__kmpc_data_sharing_pop_stack( +// CHECK-NOT: call void @__kmpc_free_shared( // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) // CHECK: ret diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_teams_distribute_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_teams_distribute_codegen.cpp @@ -1,21 +1,13 @@ // Test target codegen - host bc file has to be created first. 
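Note how the RUN lines in each updated test collapse: previously every device compile ran twice, once in the default configuration checked under the SEQ prefix (team static memory) and once with -fopenmp-cuda-parallel-target-regions checked under PAR (the data-sharing stack). The new allocator has a single code path, so one RUN invocation per triple suffices and both prefixes disappear. For reference, the entry points involved, written as C declarations transcribed from the deleted and added call sites (parameter names invented for readability; the device runtime's own header is authoritative):

// Removed by this patch:
extern "C" void  __kmpc_data_sharing_init_stack();      // generic-mode prolog
extern "C" void  __kmpc_data_sharing_init_stack_spmd(); // SPMD-mode prolog
extern "C" void *__kmpc_data_sharing_push_stack(unsigned long Size, short UseSharedMem);
extern "C" void  __kmpc_data_sharing_pop_stack(void *Ptr);
extern "C" void  __kmpc_get_team_static_memory(short IsSPMD, void *Buf,
                                               unsigned long Size, short IsShared,
                                               void **ResultPtr);
extern "C" void  __kmpc_restore_team_static_memory(short IsSPMD, short IsShared);

// Their single replacement pair:
extern "C" void *__kmpc_alloc_shared(unsigned long Size);
extern "C" void  __kmpc_free_shared(void *Ptr);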
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR -// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 // expected-no-diagnostics #ifndef HEADER #define HEADER -// SEQ: [[MEM_TY:%.+]] = type { [128 x i8] } -// SEQ-DAG: {{@__omp_offloading_.+}}_l23_exec_mode = weak constant i8 1 -// SEQ-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4 -// SEQ-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1 - template tx ftemplate(int n) { int i; @@ -38,10 +30,10 @@ return a; } - // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l23}}_worker() + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+}}_worker() // CHECK: ret void - // CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l23}}() + // CHECK: define {{.*}}void 
{{@__omp_offloading_.+template.+}}() // CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() // CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() @@ -51,7 +43,7 @@ // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]] // // CHECK: [[WORKER]] - // CHECK: {{call|invoke}} void {{@__omp_offloading_.+template.+l23}}_worker() + // CHECK: {{call|invoke}} void {{@__omp_offloading_.+template.+}}_worker() // CHECK: br label {{%?}}[[EXIT:.+]] // // CHECK: [[CHECK_MASTER]] @@ -78,21 +70,15 @@ // CHECK: ret void // CHECK: define internal void [[PARALLEL]](i32* noalias %{{.+}}, i32* noalias %{{.+}}) - // SEQ: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], - // SEQ: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]], - // SEQ: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* @{{.+}}, i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[SHARED]], i8** addrspacecast (i8* addrspace(3)* [[BUF:@.+]] to i8**)) - // SEQ: [[PTR:%.+]] = load i8*, i8* addrspace(3)* [[BUF]], - // SEQ: [[ADDR:%.+]] = getelementptr inbounds i8, i8* [[PTR]], i{{64|32}} 0 - // PAR: [[ADDR:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} 4, i16 1) - // CHECK: [[RD:%.+]] = bitcast i8* [[ADDR]] to [[GLOB_TY:%.+]]* - // CHECK: [[I_ADDR:%.+]] = getelementptr inbounds [[GLOB_TY]], [[GLOB_TY]]* [[RD]], i32 0, i32 0 + // CHECK: [[ADDR:%.+]] = call i8* @__kmpc_alloc_shared(i{{32|64}} 4) + // CHECK: [[RD:%.+]] = bitcast i8* [[ADDR]] to i32* // // CHECK: call void @__kmpc_for_static_init_4( // CHECK: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* @{{.+}} to i8*)) // CHECK: call void @__kmpc_begin_sharing_variables(i8*** [[SHARED_VARS_PTR:%.+]], i{{64|32}} 1) // CHECK: [[SHARED_VARS_BUF:%.+]] = load i8**, i8*** [[SHARED_VARS_PTR]], // CHECK: [[VARS_BUF:%.+]] = getelementptr inbounds i8*, i8** [[SHARED_VARS_BUF]], i{{64|32}} 0 - // CHECK: [[I_ADDR_BC:%.+]] = bitcast i32* [[I_ADDR]] to i8* + // CHECK: [[I_ADDR_BC:%.+]] = bitcast i32* [[RD]] to i8* // CHECK: store i8* [[I_ADDR_BC]], i8** [[VARS_BUF]], // CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) // CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp @@ -1,37 +1,29 @@ // Test target codegen - host bc file has to be created first. 
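A second mechanical change runs through these hunks: offload kernel names embed the source line of their target directive (for example __omp_offloading_..._main_l124), and deleting RUN lines and prolog code shifted those line numbers. Rather than re-pinning each one, the tests widen the patterns to wildcards, as in the nvptx_lambda_capturing.cpp hunks above (check prefix generalized here); the _exec_mode globals in the hunks that follow get the same treatment:

// Before: breaks whenever anything above the target directive moves.
// CHECK: define weak void @__omp_offloading_{{.+}}_main_l124(
// After: accept any line suffix; the surrounding checks still pin the kernel.
// CHECK: define weak void @__omp_offloading_{{.+}}_main_{{.*}}(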
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix CHECK-DIV64 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix CHECK-DIV64 --check-prefix PAR -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -fopenmp-optimistic-collapse -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-DIV32 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -fopenmp-optimistic-collapse -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-DIV32 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix CHECK-DIV64 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -fopenmp-optimistic-collapse -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-DIV32 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ 
-triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix CHECK-DIV64 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix CHECK-DIV64 --check-prefix PAR -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -fopenmp-optimistic-collapse -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-DIV32 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -fopenmp-optimistic-collapse -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-DIV32 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix CHECK-DIV64 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -fopenmp-optimistic-collapse -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-DIV32 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple 
nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 // expected-no-diagnostics #ifndef HEADER #define HEADER // Check that the execution mode of all 5 target regions on the gpu is set to SPMD Mode. -// CHECK-DAG: {{@__omp_offloading_.+l50}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l56}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l61}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l66}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l74}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l81}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+}}_exec_mode = weak constant i8 0 #define N 1000 #define M 10 @@ -92,33 +84,19 @@ return a; } -// SEQ-DAG: [[MEM_TY:%.+]] = type { [128 x i8] } -// SEQ-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] undef -// SEQ-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* undef -// SEQ-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4 -// SEQ-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1 - -// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l50( +// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}( // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0) // CHECK: call void [[PARALLEL:@.+]]( // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) // CHECK: define internal void [[PARALLEL]]( -// SEQ: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], -// SEQ: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]], -// SEQ: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr 
inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) -// SEQ: [[TEAM_ALLOC:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], -// SEQ: [[ADDR:%.+]] = getelementptr inbounds i8, i8* [[TEAM_ALLOC]], i{{64|32}} 0 -// PAR: [[ADDR:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} 4, i16 1) -// CHECK: [[BC:%.+]] = bitcast i8* [[ADDR]] to [[REC:%.+]]* -// CHECK: getelementptr inbounds [[REC]], [[REC]]* [[BC]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 +// CHECK: [[ADDR:%.+]] = call i8* @__kmpc_alloc_shared(i{{32|64}} 4) +// CHECK: [[L_ON_STACK:%.+]] = bitcast i8* [[ADDR]] to i32* // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91, // CHECK: {{call|invoke}} void [[OUTL1:@.+]]( // CHECK: call void @__kmpc_for_static_fini( -// SEQ: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], -// SEQ: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[SHARED]]) -// PAR: call void @__kmpc_data_sharing_pop_stack(i8* [[ADDR]]) +// CHECK: call void @__kmpc_free_shared(i8* [[ADDR]]) // CHECK: ret void // CHECK: define internal void [[OUTL1]]( @@ -251,13 +229,13 @@ // CHECK: call void @__kmpc_for_static_fini( // CHECK: ret void -// CHECK: define weak void @__omp_offloading_{{.*}}_l74(i[[SZ:64|32]] %{{[^,]+}}, [10 x [10 x i32]]* nonnull align {{[0-9]+}} dereferenceable{{.*}}) +// CHECK: define weak void @__omp_offloading_{{.*}}(i[[SZ:64|32]] %{{[^,]+}}, [10 x [10 x i32]]* nonnull align {{[0-9]+}} dereferenceable{{.*}}) // CHECK: call void [[OUTLINED:@__omp_outlined.*]](i32* %{{.+}}, i32* %{{.+}}, i[[SZ]] %{{.*}}, i[[SZ]] %{{.*}}, i[[SZ]] %{{.*}}, [10 x [10 x i32]]* %{{.*}}) // CHECK: define internal void [[OUTLINED]](i32* noalias %{{.*}}, i32* noalias %{{.*}} i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, [10 x [10 x i32]]* nonnull align {{[0-9]+}} dereferenceable{{.*}}) // CHECK-DIV64: div i64 // CHECK-DIV32-NO: div i64 -// CHECK: define weak void @__omp_offloading_{{.*}}_l81(i[[SZ:64|32]] %{{[^,]+}}, [1000 x i32]* nonnull align {{[0-9]+}} dereferenceable{{.*}}, i32* %{{[^)]+}}) +// CHECK: define weak void @__omp_offloading_{{.*}}(i[[SZ:64|32]] %{{[^,]+}}, [1000 x i32]* nonnull align {{[0-9]+}} dereferenceable{{.*}}, i32* %{{[^)]+}}) // CHECK: call void [[OUTLINED:@__omp_outlined.*]](i32* %{{.+}}, i32* %{{.+}}, i[[SZ]] %{{.*}}, i[[SZ]] %{{.*}}, i[[SZ]] %{{.*}}, [1000 x i32]* %{{.*}}, i32* %{{.*}}) // CHECK: define internal void [[OUTLINED]](i32* noalias %{{.*}}, i32* noalias %{{.*}} i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, [1000 x i32]* nonnull align {{[0-9]+}} dereferenceable{{.*}}, i32* %{{.*}}) diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp @@ -1,31 +1,25 @@ // Test target codegen - host bc file has to be created first. 
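The 4-byte allocations these SPMD tests look for come from scalars that must outlive a single thread, such as a lastprivate loop variable that every team writes and the last iteration publishes. A hypothetical source of that shape (identifiers invented; the actual test bodies sit outside these hunks):

int kernel(int n) {
  int l = 0;
#pragma omp target teams distribute parallel for lastprivate(l)
  for (int i = 0; i < n; ++i)
    l = i; // 'l' escapes its defining thread, so the outlined parallel
           // function reaches it through a __kmpc_alloc_shared(4) block
  return l;
}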
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple 
nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 // RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 // expected-no-diagnostics #ifndef HEADER #define HEADER // Check that the execution mode of all 4 target regions on the gpu is set to SPMD Mode. 
-// CHECK-DAG: {{@__omp_offloading_.+l43}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l49}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l54}}_exec_mode = weak constant i8 0 -// CHECK-DAG: {{@__omp_offloading_.+l59}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+}}_exec_mode = weak constant i8 0 #define N 1000 #define M 10 @@ -75,31 +69,17 @@ return a; } -// SEQ-DAG: [[MEM_TY:%.+]] = type { [128 x i8] } -// SEQ-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] undef -// SEQ-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* undef -// SEQ-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4 -// SEQ-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1 - -// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l43( +// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}( // CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0) // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0) -// SEQ: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], -// SEQ: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]], -// SEQ: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) -// SEQ: [[TEAM_ALLOC:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], -// SEQ: [[PTR:%.+]] = getelementptr inbounds i8, i8* [[TEAM_ALLOC]], i{{64|32}} 0 -// PAR: [[PTR:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} 4, i16 1) -// CHECK: [[BC:%.+]] = bitcast i8* [[PTR]] to [[REC:%.+]]* -// CHECK: getelementptr inbounds [[REC]], [[REC]]* [[BC]], i{{[0-9]+}} 0, i{{[0-9]+}} 0 +// CHECK: [[PTR:%.+]] = call i8* @__kmpc_alloc_shared(i{{32|64}} 4) +// CHECK: [[L_ON_STACK:%.+]] = bitcast i8* [[PTR]] to i32* // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91, // CHECK: {{call|invoke}} void [[OUTL1:@.+]]( // CHECK: call void @__kmpc_for_static_fini( -// SEQ: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]], -// SEQ: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[SHARED]]) -// PAR: call void @__kmpc_data_sharing_pop_stack(i8* [[PTR]]) +// CHECK: call void @__kmpc_free_shared(i8* [[PTR]]) // CHECK: ret void // CHECK: define internal void [[OUTL1]]( diff --git a/clang/test/OpenMP/nvptx_teams_codegen.cpp b/clang/test/OpenMP/nvptx_teams_codegen.cpp --- a/clang/test/OpenMP/nvptx_teams_codegen.cpp +++ b/clang/test/OpenMP/nvptx_teams_codegen.cpp @@ -1,10 +1,8 @@ // Test target codegen - host bc file has to be created first. 
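The nvptx_teams_codegen.cpp hunks below apply the rewrite to kernel arguments captured into a teams region: the incoming value is loaded from its parameter alloca, stored into the fresh allocation, and the outlined teams function receives the globalized address. A C-like sketch of the CK1 call sequence (not literal compiler output; the epilog free falls outside the excerpted checks):

extern "C" void *__kmpc_alloc_shared(unsigned long Size);
extern "C" void __kmpc_free_shared(void *Ptr);

void kernel(int argc) {
  void *stack = __kmpc_alloc_shared(4);   // was get_team_static_memory/push_stack
  int *argc_on_stack = (int *)stack;
  *argc_on_stack = argc;                  // seed the globalized copy
  // ... outlined teams region is called with argc_on_stack ...
  __kmpc_free_shared(stack);              // paired release at kernel epilog
}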
// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64 --check-prefix SEQ -// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CK1 --check-prefix CK1-64 --check-prefix PAR +// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64 // RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-32 --check-prefix SEQ -// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CK1 --check-prefix CK1-32 --check-prefix PAR +// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-32 // expected-no-diagnostics #ifndef HEADER #define HEADER @@ -29,31 +27,18 @@ return tmain(argv); } -// SEQ: [[MEM_TY:%.+]] = type { [128 x i8] } -// SEQ-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] undef -// SEQ-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* undef -// SEQ-DAG: [[KERNEL_SIZE1:@.+]] = internal unnamed_addr constant i{{64|32}} 4 -// SEQ-DAG: [[KERNEL_SIZE2:@.+]] = internal unnamed_addr constant i{{64|32}} {{8|4}} -// SEQ-DAG: [[KERNEL_SHARED1:@.+]] = internal unnamed_addr constant i16 1 -// SEQ-DAG: [[KERNEL_SHARED2:@.+]] = internal unnamed_addr constant i16 1 - // only nvptx side: do not outline teams region and do not call fork_teams // CK1: define {{.*}}void @{{[^,]+}}(i{{[0-9]+}} [[ARGC:%.+]]) // CK1: [[ARGCADDR:%.+]] = alloca i{{[0-9]+}}, // CK1: store {{.+}} 0, {{.+}}, -// CK1: store i{{[0-9]+}} [[ARGC]], i{{[0-9]+}}* [[ARGCADDR]], -// CK1-64: [[CONV:%.+]] = bitcast i{{[0-9]+}}* [[ARGCADDR]] to i{{[0-9]+}}* -// SEQ: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED1]], -// SEQ: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE1]], -// SEQ: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[IS_SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) -// SEQ: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], -// SEQ: [[GLOBALSTACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], 
i{{64|32}} 0 -// PAR: [[GLOBALSTACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} 4, i16 1) -// CK1-64: [[ARG:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[CONV]] -// CK1-32: [[ARG:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[ARGCADDR]] -// CK1: [[ARGCADDR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0 -// CK1: store i{{[0-9]+}} [[ARG]], i{{[0-9]+}}* [[ARGCADDR]], -// CK1: call void [[OUTLINED:@.+]](i32* %{{.+}}, i32* %{{.+}}, i32* [[ARGCADDR]]) +// CK1: store i{{32|64}} [[ARGC]], i{{32|64}}* [[ARGCADDR]], +// CK1-64: [[CONV:%.+]] = bitcast i{{32|64}}* [[ARGCADDR]] to i{{32|64}}* +// CK1-64: [[ARG:%.+]] = load i{{32|64}}, i{{32|64}}* [[CONV]] +// CK1-32: [[ARG:%.+]] = load i{{32|64}}, i{{32|64}}* [[ARGCADDR]] +// CK1: [[GLOBALSTACK:%.+]] = call i8* @__kmpc_alloc_shared(i{{32|64}} 4) +// CK1: [[ARGC_ON_STACK:%.+]] = bitcast i8* [[GLOBALSTACK]] to i{{32|64}}* +// CK1: store i{{32|64}} [[ARG]], i{{32|64}}* [[ARGC_ON_STACK]], +// CK1: call void [[OUTLINED:@.+]](i32* %{{.+}}, i32* %{{.+}}, i32* [[ARGC_ON_STACK]]) // CK1: ret void // CK1-NEXT: } @@ -65,16 +50,11 @@ // CK1: define {{.*}}void @{{[^,]+}}(i{{.+}}** [[ARGC:%.+]]) // CK1: [[ARGCADDR:%.+]] = alloca i{{.+}}**, // CK1: store i{{.+}}** [[ARGC]], i{{.+}}*** [[ARGCADDR]] -// SEQ: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED2]], -// SEQ: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE2]], -// SEQ: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[IS_SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) -// SEQ: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], -// SEQ: [[GLOBALSTACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i{{64|32}} 0 -// PAR: [[GLOBALSTACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} {{4|8}}, i16 1) // CK1: [[ARG:%.+]] = load i{{[0-9]+}}**, i{{[0-9]+}}*** [[ARGCADDR]] -// CK1: [[ARGCADDR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0 -// CK1: store i{{[0-9]+}}** [[ARG]], i{{[0-9]+}}*** [[ARGCADDR]], -// CK1: call void [[OUTLINED:@.+]](i32* %{{.+}}, i32* %{{.+}}, i8*** [[ARGCADDR]]) +// CK1: [[GLOBALSTACK:%.+]] = call i8* @__kmpc_alloc_shared(i{{32|64}} {{4|8}}) +// CK1: [[ARGC_ON_STACK:%.+]] = bitcast i8* [[GLOBALSTACK]] to i{{[0-9]+}}* +// CK1: store i{{[0-9]+}}** [[ARG]], i{{[0-9]+}}*** [[ARGC_ON_STACK]], +// CK1: call void [[OUTLINED:@.+]](i32* %{{.+}}, i32* %{{.+}}, i8*** [[ARGC_ON_STACK]]) // CK1: ret void // CK1-NEXT: } @@ -87,11 +67,9 @@ // Test target codegen - host bc file has to be created first. 
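Before the CK2 variant, a plausible reduced form of what the CK1 checks are following (hypothetical source, not the test file itself): a scalar referenced inside a teams region escapes the initial thread's private stack, so device codegen routes it through __kmpc_alloc_shared, which is the ARGC_ON_STACK pointer matched above.

int globalized_argc(int argc) {
#pragma omp target
#pragma omp teams
  // 'argc' is used across the teams region, so on the device it lives in
  // memory obtained from __kmpc_alloc_shared instead of the thread stack.
  argc = 0;
  return argc;
}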
// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-64 --check-prefix SEQ2 -// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CK2 --check-prefix CK2-64 --check-prefix PAR2 +// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-64 // RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-32 --check-prefix SEQ2 -// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CK2 --check-prefix CK2-32 --check-prefix PAR2 +// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-32 // expected-no-diagnostics #ifdef CK2 @@ -118,14 +96,6 @@ return tmain(argv); } -// SEQ2: [[MEM_TY:%.+]] = type { [128 x i8] } -// SEQ2-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] undef -// SEQ2-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* undef -// SEQ2-DAG: [[KERNEL_SIZE1:@.+]] = internal unnamed_addr constant i{{64|32}} 4 -// SEQ2-DAG: [[KERNEL_SIZE2:@.+]] = internal unnamed_addr constant i{{64|32}} {{8|4}} -// SEQ2-DAG: [[KERNEL_SHARED1:@.+]] = internal unnamed_addr constant i16 1 -// SEQ2-DAG: [[KERNEL_SHARED2:@.+]] = internal unnamed_addr constant i16 1 - // CK2: define {{.*}}void @{{[^,]+}}(i{{[0-9]+}} [[A_IN:%.+]], i{{[0-9]+}} [[B_IN:%.+]], i{{[0-9]+}} [[ARGC_IN:.+]]) // CK2: [[AADDR:%.+]] = alloca i{{[0-9]+}}, // CK2: [[BADDR:%.+]] = alloca i{{[0-9]+}}, @@ -136,18 +106,13 @@ // CK2-64: [[ACONV:%.+]] = bitcast i64* [[AADDR]] to i32* // CK2-64: [[BCONV:%.+]] = bitcast i64* [[BADDR]] to i32* // CK2-64: [[CONV:%.+]] = bitcast i64* [[ARGCADDR]] to i32* -// SEQ2: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED1]], -// SEQ2: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE1]], -// SEQ2: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[IS_SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) -// SEQ2: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], -// SEQ2: [[GLOBALSTACK:%.+]] = getelementptr 
inbounds i8, i8* [[KERNEL_RD]], i{{64|32}} 0 -// PAR2: [[GLOBALSTACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} 4, i16 1) // CK2-64: [[ARG:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[CONV]] // CK2-32: [[ARG:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[ARGCADDR]] -// CK2: [[ARGCADDR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0 -// CK2: store i{{[0-9]+}} [[ARG]], i{{[0-9]+}}* [[ARGCADDR]], +// CK2: [[GLOBALSTACK:%.+]] = call i8* @__kmpc_alloc_shared(i{{32|64}} 4) +// CK2: [[ARGC_ON_STACK:%.+]] = bitcast i8* [[GLOBALSTACK]] to i{{32|64}}* +// CK2: store i{{[0-9]+}} [[ARG]], i{{[0-9]+}}* [[ARGC_ON_STACK]], // CK2: {{%.+}} = call i32 @__kmpc_global_thread_num( -// CK2: call void [[OUTLINED:@.+]](i32* %{{.+}}, i32* %{{.+}}, i32* [[ARGCADDR]]) +// CK2: call void [[OUTLINED:@.+]](i32* %{{.+}}, i32* %{{.+}}, i32* [[ARGC_ON_STACK]]) // CK2: ret // CK2: define internal void [[OUTLINED]]( @@ -162,17 +127,12 @@ // CK2: store i{{[0-9]+}} [[A_IN]], i{{[0-9]+}}* [[AADDR]], // CK2: store i{{[0-9]+}} [[B_IN]], i{{[0-9]+}}* [[BADDR]], // CK2: store i{{[0-9]+}}** [[ARGC]], i{{[0-9]+}}*** [[ARGCADDR]], -// SEQ2: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED2]], -// SEQ2: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE2]], -// SEQ2: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[IS_SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) -// SEQ2: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], -// SEQ2: [[GLOBALSTACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i{{64|32}} 0 -// PAR2: [[GLOBALSTACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} {{4|8}}, i16 1) // CK2: [[ARG:%.+]] = load i{{[0-9]+}}**, i{{[0-9]+}}*** [[ARGCADDR]] -// CK2: [[ARGCADDR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0 -// CK2: store i{{[0-9]+}}** [[ARG]], i{{[0-9]+}}*** [[ARGCADDR]], +// CK2: [[GLOBALSTACK:%.+]] = call i8* @__kmpc_alloc_shared(i{{32|64}} {{4|8}}) +// CK2: [[ARGC_ON_STACK:%.+]] = bitcast i8* [[GLOBALSTACK]] to i{{[0-9]+}}* +// CK2: store i{{[0-9]+}}** [[ARG]], i{{[0-9]+}}*** [[ARGC_ON_STACK]], // CK2: {{%.+}} = call i32 @__kmpc_global_thread_num( -// CK2: call void [[OUTLINED:@.+]](i32* %{{.+}}, i32* %{{.+}}, i8*** [[ARGCADDR]]) +// CK2: call void [[OUTLINED:@.+]](i32* %{{.+}}, i32* %{{.+}}, i8*** [[ARGC_ON_STACK]]) // CK2: ret void // CK2: define internal void [[OUTLINED]]( diff --git a/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp b/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp --- a/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp +++ b/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp @@ -1,12 +1,9 @@ // Test target codegen - host bc file has to be created first. 
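The size operands in the checks above are simply the store sizes of the globalized objects, which is why the int cases pin a fixed 4 while the pointer cases accept {{4|8}} across the nvptx and nvptx64 RUN lines. A compile-time restatement of that arithmetic (a sketch; the names are illustrative):

#include <cstddef>

// i32 payloads are 4 bytes on both device triples.
static_assert(sizeof(int) == 4, "i32 globalization size is fixed");

// Pointer payloads follow the triple: 4 bytes on nvptx, 8 on nvptx64,
// which is the {{4|8}} passed to __kmpc_alloc_shared above.
constexpr std::size_t PointerPayload = sizeof(char **);
static_assert(PointerPayload == 4 || PointerPayload == 8,
              "expected a 32- or 64-bit device pointer");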
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -fopenmp-cuda-teams-reduction-recs-num=2048 -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR -// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -fopenmp-cuda-teams-reduction-recs-num=2048 -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -fopenmp-cuda-teams-reduction-recs-num=2048 -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 // expected-no-diagnostics #ifndef HEADER #define HEADER @@ -15,21 +12,14 @@ // CHECK-DAG: [[TEAM2_REDUCE_TY:%.+]] = type { [{{1024|2048}} x i8], [{{1024|2048}} x float] } // CHECK-DAG: [[TEAM3_REDUCE_TY:%.+]] = type { [{{1024|2048}} x i32], [{{1024|2048}} x i16] } // CHECK-DAG: [[TEAMS_REDUCE_UNION_TY:%.+]] = type { [[TEAM1_REDUCE_TY]] } -// SEQ-DAG: [[MAP_TY:%.+]] = type { [128 x i8] } - -// SEQ-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* undef -// SEQ-DAG: [[KERNEL_SHARED1:@.+]] = internal unnamed_addr constant i16 1 -// SEQ-DAG: 
[[KERNEL_SHARED2:@.+]] = internal unnamed_addr constant i16 1 -// SEQ-DAG: [[KERNEL_SIZE1:@.+]] = internal unnamed_addr constant i{{64|32}} {{16|8}} -// SEQ-DAG: [[KERNEL_SIZE2:@.+]] = internal unnamed_addr constant i{{64|32}} 16 // Check for the data transfer medium in shared memory to transfer the reduction list to the first warp. // CHECK-DAG: [[TRANSFER_STORAGE:@.+]] = weak addrspace([[SHARED_ADDRSPACE:[0-9]+]]) global [32 x i32] // Check that the execution mode of 2 target regions is set to Non-SPMD and the 3rd is in SPMD. -// CHECK-DAG: {{@__omp_offloading_.+l44}}_exec_mode = weak constant i8 1 -// CHECK-DAG: {{@__omp_offloading_.+l50}}_exec_mode = weak constant i8 1 -// CHECK-DAG: {{@__omp_offloading_.+l57}}_exec_mode = weak constant i8 0 +// CHECK-DAG: {{@__omp_offloading_.+l34}}_exec_mode = weak constant i8 1 +// CHECK-DAG: {{@__omp_offloading_.+l40}}_exec_mode = weak constant i8 1 +// CHECK-DAG: {{@__omp_offloading_.+l47}}_exec_mode = weak constant i8 0 // CHECK-DAG: [[TEAMS_RED_BUFFER:@.+]] = internal global [[TEAMS_REDUCE_UNION_TY]] zeroinitializer @@ -73,9 +63,9 @@ return a; } - // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l44}}_worker() + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l34}}_worker() - // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+template.+l44]]( + // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+template.+l34]]( // // CHECK: {{call|invoke}} void [[T1]]_worker() // @@ -340,9 +330,9 @@ // CHECK: call void [[REDUCTION_FUNC]](i8* [[RL_BC]], i8* [[LOCAL_RL_BC]]) // CHECK: ret void - // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l50}}_worker() + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l40}}_worker() - // CHECK: define {{.*}}void [[T2:@__omp_offloading_.+template.+l50]]( + // CHECK: define {{.*}}void [[T2:@__omp_offloading_.+template.+l40]]( // // CHECK: {{call|invoke}} void [[T2]]_worker() @@ -707,10 +697,9 @@ // CHECK: call void [[REDUCTION_FUNC]](i8* [[RL_BC]], i8* [[LOCAL_RL_BC]]) // CHECK: ret void - // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l57}}( + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l47}}( // // CHECK: call void @__kmpc_spmd_kernel_init( - // CHECK: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) // CHECK-NOT: call void @{{__kmpc_get_team_static_memory|__kmpc_data_sharing_push_stack}} diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def --- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def +++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def @@ -425,12 +425,9 @@ GlobalListPtr, GlobalListPtr, GlobalListPtr, GlobalListPtr) __OMP_RTL(__kmpc_shuffle_int64, false, Int64, Int64, Int16, Int16) -__OMP_RTL(__kmpc_data_sharing_init_stack, false, Void, ) -__OMP_RTL(__kmpc_data_sharing_init_stack_spmd, false, Void, ) -__OMP_RTL(__kmpc_data_sharing_coalesced_push_stack, false, VoidPtr, SizeTy, Int16) -__OMP_RTL(__kmpc_data_sharing_push_stack, false, VoidPtr, SizeTy, Int16) -__OMP_RTL(__kmpc_data_sharing_pop_stack, false, Void, VoidPtr) +__OMP_RTL(__kmpc_alloc_shared, false, VoidPtr, SizeTy) +__OMP_RTL(__kmpc_free_shared, false, Void, VoidPtr) __OMP_RTL(__kmpc_begin_sharing_variables, false, Void, VoidPtrPtrPtr, SizeTy) __OMP_RTL(__kmpc_end_sharing_variables, false, Void, ) __OMP_RTL(__kmpc_get_shared_variables, false, Void, VoidPtrPtrPtr) diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp 
b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -1112,9 +1112,8 @@ } void analysisGlobalization() { - RuntimeFunction GlobalizationRuntimeIDs[] = { - OMPRTL___kmpc_data_sharing_coalesced_push_stack, - OMPRTL___kmpc_data_sharing_push_stack}; + RuntimeFunction GlobalizationRuntimeIDs[] = {OMPRTL___kmpc_alloc_shared, + OMPRTL___kmpc_free_shared}; for (const auto GlobalizationCallID : GlobalizationRuntimeIDs) { auto &RFI = OMPInfoCache.RFIs[GlobalizationCallID]; diff --git a/llvm/test/Transforms/OpenMP/globalization_remarks.ll b/llvm/test/Transforms/OpenMP/globalization_remarks.ll --- a/llvm/test/Transforms/OpenMP/globalization_remarks.ll +++ b/llvm/test/Transforms/OpenMP/globalization_remarks.ll @@ -1,145 +1,41 @@ +; RUN: opt -openmpopt -pass-remarks-analysis=openmp-opt -disable-output < %s 2>&1 | FileCheck %s ; RUN: opt -passes=openmpopt -pass-remarks-analysis=openmp-opt -disable-output < %s 2>&1 | FileCheck %s -; ModuleID = 'declare_target_codegen_globalization.cpp' -source_filename = "declare_target_codegen_globalization.cpp" target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" -target triple = "nvptx64-nvidia-cuda" +target triple = "nvptx64" -%struct.ident_t = type { i32, i32, i32, i32, i8* } -%struct._globalized_locals_ty = type { [32 x i32] } +; CHECK: remark: globalization_remarks.c:5:7: Found thread data sharing on the GPU. Expect degraded performance due to data globalization. -@0 = private unnamed_addr constant [56 x i8] c";declare_target_codegen_globalization.cpp;maini1;17;1;;\00", align 1 -@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 1, i32 0, i8* getelementptr inbounds ([56 x i8], [56 x i8]* @0, i32 0, i32 0) }, align 8 -@__omp_offloading_801_3022563__Z6maini1v_l17_exec_mode = weak constant i8 0 -@llvm.compiler.used = appending global [1 x i8*] [i8* @__omp_offloading_801_3022563__Z6maini1v_l17_exec_mode], section "llvm.metadata" +@S = external local_unnamed_addr global i8* -; CHECK: remark: declare_target_codegen_globalization.cpp:17:1: Found thread data sharing on the GPU. Expect degraded performance due to data globalization. -; CHECK: remark: declare_target_codegen_globalization.cpp:10:1: Found thread data sharing on the GPU. Expect degraded performance due to data globalization. 
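The rewritten remark test pins the diagnostic to globalization_remarks.c:5:7. That source file is not part of the patch; the following hypothetical reconstruction, laid out so the escaping variable sits at line 5, column 7, shows the kind of code that produces the IR below:

int *sink;                       // stand-in for the test's @S global
void share(int *p) { sink = p; } // taking the address lets 'x' escape

void foo() {
  int x;                         // line 5, column 7: the globalized variable
  share(&x);                     // the escape the remark reports
}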
- -; Function Attrs: norecurse nounwind -define weak void @__omp_offloading_801_3022563__Z6maini1v_l17(i32* nonnull align 4 dereferenceable(4) %a) local_unnamed_addr #0 !dbg !10 { +define void @foo() !dbg !6 { entry: - %nvptx_num_threads = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !dbg !12, !range !13 - tail call void @__kmpc_spmd_kernel_init(i32 %nvptx_num_threads, i16 1, i16 0) #4, !dbg !12 - tail call void @__kmpc_data_sharing_init_stack_spmd() #4, !dbg !12 - %0 = tail call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1) - %1 = tail call i8 @__kmpc_is_spmd_exec_mode() #4 - %.not.i.i = icmp eq i8 %1, 0 - br i1 %.not.i.i, label %.non-spmd2.i.i, label %__omp_outlined__.exit - -.non-spmd2.i.i: ; preds = %entry - %2 = tail call i8* @__kmpc_data_sharing_coalesced_push_stack(i64 128, i16 0) #4, !dbg !12 - tail call void @__kmpc_data_sharing_pop_stack(i8* %2) #4, !dbg !14 - br label %__omp_outlined__.exit, !dbg !14 - -__omp_outlined__.exit: ; preds = %entry, %.non-spmd2.i.i - tail call void @__kmpc_spmd_kernel_deinit_v2(i16 1) #4, !dbg !19 - ret void, !dbg !20 + %0 = call i8* @__kmpc_alloc_shared(i64 4), !dbg !8 + %x_on_stack = bitcast i8* %0 to i32* + %1 = bitcast i32* %x_on_stack to i8* + call void @share(i8* %1) + call void @__kmpc_free_shared(i8* %0) + ret void } -; Function Attrs: nounwind readnone -declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #1 - -declare void @__kmpc_spmd_kernel_init(i32, i16, i16) local_unnamed_addr - -declare void @__kmpc_data_sharing_init_stack_spmd() local_unnamed_addr - -; Function Attrs: norecurse nounwind readonly -define hidden i32 @_Z3fooRi(i32* nocapture nonnull readonly align 4 dereferenceable(4) %a) local_unnamed_addr #2 !dbg !21 { +define void @share(i8* %x) { entry: - %0 = load i32, i32* %a, align 4, !dbg !22, !tbaa !23 - ret i32 %0, !dbg !27 + store i8* %x, i8** @S + ret void } -; Function Attrs: nounwind -define hidden i32 @_Z3barv() local_unnamed_addr #3 !dbg !15 { -entry: - %a1 = alloca i32, align 4 - %0 = tail call i8 @__kmpc_is_spmd_exec_mode() #4 - %.not = icmp eq i8 %0, 0 - br i1 %.not, label %.non-spmd, label %.exit - -.non-spmd: ; preds = %entry - %1 = tail call i8* @__kmpc_data_sharing_push_stack(i64 128, i16 0) #4, !dbg !31 - %2 = bitcast i8* %1 to %struct._globalized_locals_ty* - br label %.exit +declare i8* @__kmpc_alloc_shared(i64) -.exit: ; preds = %entry, %.non-spmd - %_select_stack = phi %struct._globalized_locals_ty* [ %2, %.non-spmd ], [ null, %entry ] - %nvptx_tid = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !28 - %nvptx_lane_id = and i32 %nvptx_tid, 31 - %3 = zext i32 %nvptx_lane_id to i64 - %4 = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* %_select_stack, i64 0, i32 0, i64 %3 - %5 = select i1 %.not, i32* %4, i32* %a1 - %6 = load i32, i32* %5, align 4, !dbg !29, !tbaa !23 - br i1 %.not, label %.non-spmd2, label %.exit3, !dbg !31 +declare void @__kmpc_free_shared(i8*) -.non-spmd2: ; preds = %.exit - %7 = bitcast %struct._globalized_locals_ty* %_select_stack to i8*, !dbg !31 - tail call void @__kmpc_data_sharing_pop_stack(i8* %7) #4, !dbg !31 - br label %.exit3, !dbg !31 - -.exit3: ; preds = %.non-spmd2, %.exit - ret i32 %6, !dbg !31 -} - -declare i8 @__kmpc_is_spmd_exec_mode() local_unnamed_addr - -declare i8* @__kmpc_data_sharing_coalesced_push_stack(i64, i16) local_unnamed_addr - -declare i8* @__kmpc_data_sharing_push_stack(i64, i16) local_unnamed_addr - -; Function Attrs: nounwind readnone -declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1 - -declare void 
@__kmpc_data_sharing_pop_stack(i8*) local_unnamed_addr - -; Function Attrs: nounwind -declare i32 @__kmpc_global_thread_num(%struct.ident_t*) local_unnamed_addr #4 - -declare void @__kmpc_spmd_kernel_deinit_v2(i16) local_unnamed_addr - -attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { nounwind readnone } -attributes #2 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #3 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #4 = { nounwind } !llvm.dbg.cu = !{!0} -!omp_offload.info = !{!3} -!nvvm.annotations = !{!4} -!llvm.module.flags = !{!5, !6, !7, !8} -!llvm.ident = !{!9} +!llvm.module.flags = !{!3, !4} -!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 12.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: DebugDirectivesOnly, enums: !2, splitDebugInlining: false, nameTableKind: None) -!1 = !DIFile(filename: "declare_target_codegen_globalization.cpp", directory: "/home/jhuber/Documents/llvm-project/clang/test/OpenMP") +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 12.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "globalization_remarks.c", directory: "/tmp/globalization_remarks.c") !2 = !{} -!3 = !{i32 0, i32 2049, i32 50472291, !"_Z6maini1v", i32 17, i32 0} -!4 = !{void (i32*)* @__omp_offloading_801_3022563__Z6maini1v_l17, !"kernel", i32 1} -!5 = !{i32 7, !"Dwarf Version", i32 2} -!6 = !{i32 2, !"Debug Info Version", i32 3} -!7 = !{i32 1, !"wchar_size", i32 4} -!8 = !{i32 7, !"PIC Level", i32 2} -!9 = !{!"clang version 12.0.0"} -!10 = distinct !DISubprogram(name: "__omp_offloading_801_3022563__Z6maini1v_l17", scope: !1, file: !1, line: 17, type: !11, scopeLine: 17, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) -!11 = !DISubroutineType(types: !2) -!12 = !DILocation(line: 17, column: 1, scope: !10) -!13 = !{i32 1, i32 1025} -!14 = !DILocation(line: 10, column: 1, scope: !15, inlinedAt: !16) -!15 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 7, type: !11, scopeLine: 7, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, 
unit: !0, retainedNodes: !2) -!16 = distinct !DILocation(line: 20, column: 18, scope: !17, inlinedAt: !18) -!17 = distinct !DISubprogram(name: "__omp_outlined__", scope: !1, file: !1, line: 17, type: !11, scopeLine: 17, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) -!18 = distinct !DILocation(line: 17, column: 1, scope: !10) -!19 = !DILocation(line: 17, column: 40, scope: !10) -!20 = !DILocation(line: 21, column: 3, scope: !10) -!21 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 5, type: !11, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) -!22 = !DILocation(line: 5, column: 26, scope: !21) -!23 = !{!24, !24, i64 0} -!24 = !{!"int", !25, i64 0} -!25 = !{!"omnipotent char", !26, i64 0} -!26 = !{!"Simple C++ TBAA"} -!27 = !DILocation(line: 5, column: 19, scope: !21) -!28 = !{i32 0, i32 1024} -!29 = !DILocation(line: 5, column: 26, scope: !21, inlinedAt: !30) -!30 = distinct !DILocation(line: 9, column: 10, scope: !15) -!31 = !DILocation(line: 10, column: 1, scope: !15) - +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{i32 1, !"wchar_size", i32 4} +!6 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !7, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2) +!7 = !DISubroutineType(types: !2) +!8 = !DILocation(line: 5, column: 7, scope: !6) diff --git a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu --- a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu @@ -24,187 +24,16 @@ // Runtime functions for trunk data sharing scheme. //////////////////////////////////////////////////////////////////////////////// -INLINE static void data_sharing_init_stack_common() { - ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized."); - omptarget_nvptx_TeamDescr *teamDescr = - &omptarget_nvptx_threadPrivateContext->TeamContext(); - - for (int WID = 0; WID < DS_Max_Warp_Number; WID++) { - __kmpc_data_sharing_slot *RootS = teamDescr->GetPreallocatedSlotAddr(WID); - DataSharingState.SlotPtr[WID] = RootS; - DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0]; - } +// Allocate memory that can be shared between the threads. +// TODO: Add a small buffer of shared memory to allocate memory from +// TODO: Add an INFO message to communicate with the user +EXTERN void *__kmpc_alloc_shared(size_t DataSize) { + return (void *)SafeMalloc(DataSize, "Alloc Shared"); } -// Initialize data sharing data structure. This function needs to be called -// once at the beginning of a data sharing context (coincides with the kernel -// initialization). This function is called only by the MASTER thread of each -// team in non-SPMD mode. -EXTERN void __kmpc_data_sharing_init_stack() { - ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized."); - // This function initializes the stack pointer with the pointer to the - // statically allocated shared memory slots. The size of a shared memory - // slot is pre-determined to be 256 bytes. - data_sharing_init_stack_common(); - omptarget_nvptx_globalArgs.Init(); -} - -// Initialize data sharing data structure. This function needs to be called -// once at the beginning of a data sharing context (coincides with the kernel -// initialization). 
This function is called in SPMD mode only. -EXTERN void __kmpc_data_sharing_init_stack_spmd() { - ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized."); - // This function initializes the stack pointer with the pointer to the - // statically allocated shared memory slots. The size of a shared memory - // slot is pre-determined to be 256 bytes. - if (GetThreadIdInBlock() == 0) - data_sharing_init_stack_common(); - - __kmpc_impl_threadfence_block(); -} - -INLINE static void *data_sharing_push_stack_common(size_t PushSize) { - ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime."); - - // Only warp active master threads manage the stack. - bool IsWarpMaster = (GetThreadIdInBlock() % WARPSIZE) == 0; - - // Add worst-case padding to DataSize so that future stack allocations are - // correctly aligned. - const size_t Alignment = 8; - PushSize = (PushSize + (Alignment - 1)) / Alignment * Alignment; - - // Frame pointer must be visible to all workers in the same warp. - const unsigned WID = GetWarpId(); - void *FrameP = 0; - __kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask(); - - if (IsWarpMaster) { - // SlotP will point to either the shared memory slot or an existing - // global memory slot. - __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; - void *&StackP = DataSharingState.StackPtr[WID]; - - // Check if we have room for the data in the current slot. - const uintptr_t StartAddress = (uintptr_t)StackP; - const uintptr_t EndAddress = (uintptr_t)SlotP->DataEnd; - const uintptr_t RequestedEndAddress = StartAddress + (uintptr_t)PushSize; - - // If we requested more data than there is room for in the rest - // of the slot then we need to either re-use the next slot, if one exists, - // or create a new slot. - if (EndAddress < RequestedEndAddress) { - __kmpc_data_sharing_slot *NewSlot = 0; - size_t NewSize = PushSize; - - // Allocate at least the default size for each type of slot. - // Master is a special case and even though there is only one thread, - // it can share more things with the workers. For uniformity, it uses - // the full size of a worker warp slot. - size_t DefaultSlotSize = DS_Worker_Warp_Slot_Size; - if (DefaultSlotSize > NewSize) - NewSize = DefaultSlotSize; - NewSlot = (__kmpc_data_sharing_slot *)SafeMalloc( - sizeof(__kmpc_data_sharing_slot) + NewSize, - "Global memory slot allocation."); - - NewSlot->Next = 0; - NewSlot->Prev = SlotP; - NewSlot->PrevSlotStackPtr = StackP; - NewSlot->DataEnd = &NewSlot->Data[0] + NewSize; - - // Make previous slot point to the newly allocated slot. - SlotP->Next = NewSlot; - // The current slot becomes the new slot. - SlotP = NewSlot; - // The stack pointer always points to the next free stack frame. - StackP = &NewSlot->Data[0] + PushSize; - // The frame pointer always points to the beginning of the frame. - FrameP = DataSharingState.FramePtr[WID] = &NewSlot->Data[0]; - } else { - // Add the data chunk to the current slot. The frame pointer is set to - // point to the start of the new frame held in StackP. - FrameP = DataSharingState.FramePtr[WID] = StackP; - // Reset stack pointer to the requested address. - StackP = (void *)RequestedEndAddress; - } - } - // Get address from lane 0. 
- int *FP = (int *)&FrameP; - FP[0] = __kmpc_impl_shfl_sync(CurActive, FP[0], 0); - if (sizeof(FrameP) == 8) - FP[1] = __kmpc_impl_shfl_sync(CurActive, FP[1], 0); - - return FrameP; -} - -EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t DataSize, - int16_t UseSharedMemory) { - return data_sharing_push_stack_common(DataSize); -} - -// Called at the time of the kernel initialization. This is used to initilize -// the list of references to shared variables and to pre-allocate global storage -// for holding the globalized variables. -// -// By default the globalized variables are stored in global memory. If the -// UseSharedMemory is set to true, the runtime will attempt to use shared memory -// as long as the size requested fits the pre-allocated size. -EXTERN void *__kmpc_data_sharing_push_stack(size_t DataSize, - int16_t UseSharedMemory) { - // Compute the total memory footprint of the requested data. - // The master thread requires a stack only for itself. A worker - // thread (which at this point is a warp master) will require - // space for the variables of each thread in the warp, - // i.e. one DataSize chunk per warp lane. - // TODO: change WARPSIZE to the number of active threads in the warp. - size_t PushSize = (isRuntimeUninitialized() || IsMasterThread(isSPMDMode())) - ? DataSize - : WARPSIZE * DataSize; - - // Compute the start address of the frame of each thread in the warp. - uintptr_t FrameStartAddress = - (uintptr_t)data_sharing_push_stack_common(PushSize); - FrameStartAddress += (uintptr_t)(GetLaneId() * DataSize); - return (void *)FrameStartAddress; -} - -// Pop the stack and free any memory which can be reclaimed. -// -// When the pop operation removes the last global memory slot, -// reclaim all outstanding global memory slots since it is -// likely we have reached the end of the kernel. -EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) { - ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime."); - - __kmpc_impl_threadfence_block(); - - if (GetThreadIdInBlock() % WARPSIZE == 0) { - unsigned WID = GetWarpId(); - - // Current slot - __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; - - // Pointer to next available stack. - void *&StackP = DataSharingState.StackPtr[WID]; - - // Pop the frame. - StackP = FrameStart; - - // If the current slot is empty, we need to free the slot after the - // pop. - bool SlotEmpty = (StackP == &SlotP->Data[0]); - - if (SlotEmpty && SlotP->Prev) { - // Before removing the slot we need to reset StackP. - StackP = SlotP->PrevSlotStackPtr; - - // Remove the slot. - SlotP = SlotP->Prev; - SafeFree(SlotP->Next, "Free slot."); - SlotP->Next = 0; - } - } +// Free the allocated memory. +EXTERN void __kmpc_free_shared(void *FrameStart) { + SafeFree(FrameStart, "Free Shared"); } // Begin a data sharing context. 
Maintain a list of references to shared @@ -278,4 +107,21 @@ omptarget_nvptx_simpleMemoryManager.Release(); } +// Deprecated globalization code +EXTERN void __kmpc_data_sharing_init_stack() {} +EXTERN void __kmpc_data_sharing_init_stack_spmd() {} + +EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t DataSize, + int16_t) { + return (void *)SafeMalloc(DataSize, "Alloc Deprecated"); +} + +EXTERN void *__kmpc_data_sharing_push_stack(size_t DataSize, int16_t) { + return (void *)SafeMalloc(DataSize, "Alloc Deprecated"); +} + +EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) { + SafeFree(FrameStart, "Free Shared"); +} + #pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/interface.h b/openmp/libomptarget/deviceRTLs/interface.h --- a/openmp/libomptarget/deviceRTLs/interface.h +++ b/openmp/libomptarget/deviceRTLs/interface.h @@ -424,13 +424,8 @@ EXTERN bool __kmpc_kernel_parallel(void **WorkFn); EXTERN void __kmpc_kernel_end_parallel(); -EXTERN void __kmpc_data_sharing_init_stack(); -EXTERN void __kmpc_data_sharing_init_stack_spmd(); -EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t size, - int16_t UseSharedMemory); -EXTERN void *__kmpc_data_sharing_push_stack(size_t size, - int16_t UseSharedMemory); -EXTERN void __kmpc_data_sharing_pop_stack(void *a); +EXTERN void *__kmpc_alloc_shared(size_t Size); +EXTERN void __kmpc_free_shared(void *Data); EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs); EXTERN void __kmpc_end_sharing_variables(); EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs); @@ -445,4 +440,11 @@ EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode, int16_t is_shared); +// Deprecated globalization interface +EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t size, int16_t s); +EXTERN void *__kmpc_data_sharing_push_stack(size_t size, int16_t s); +EXTERN void __kmpc_data_sharing_pop_stack(void *a); +EXTERN void __kmpc_data_sharing_init_stack(); +EXTERN void __kmpc_data_sharing_init_stack_spmd(); + #endif
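Net effect for runtime users, as a usage sketch: the data sharing stack collapses to a plain allocate/free pair, and the deprecated entry points kept above degrade to the same allocator so already-compiled device code still links. The definitions below are host-side stand-ins for illustration; the real ones are the SafeMalloc/SafeFree shims in data_sharing.cu.

#include <cstddef>
#include <cstdlib>
#include <cstring>

// Host-side stand-ins for the device entry points (illustrative only).
extern "C" void *__kmpc_alloc_shared(size_t Size) { return std::malloc(Size); }
extern "C" void __kmpc_free_shared(void *Ptr) { std::free(Ptr); }

int main() {
  // A region globalizing a 16-byte buffer: allocate, use, free, always paired.
  void *Buf = __kmpc_alloc_shared(16);
  std::memset(Buf, 0, 16);
  __kmpc_free_shared(Buf);
  return 0;
}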