diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -10617,18 +10617,18 @@ } Linkage = CGM.getLLVMLinkageVarDefinition(VD, /*IsConstant=*/false); // Temp solution to prevent optimizations of the internal variables. - if (CGM.getLangOpts().OpenMPIsDevice && !VD->isExternallyVisible()) { - std::string RefName = getName({VarName, "ref"}); - if (!CGM.GetGlobalValue(RefName)) { - llvm::Constant *AddrRef = - getOrCreateInternalVariable(Addr->getType(), RefName); - auto *GVAddrRef = cast(AddrRef); - GVAddrRef->setConstant(/*Val=*/true); - GVAddrRef->setLinkage(llvm::GlobalValue::InternalLinkage); - GVAddrRef->setInitializer(Addr); - CGM.addCompilerUsedGlobal(GVAddrRef); - } - } + // if (CGM.getLangOpts().OpenMPIsDevice && !VD->isExternallyVisible()) { + // std::string RefName = getName({VarName, "ref"}); + // if (!CGM.GetGlobalValue(RefName)) { + // llvm::Constant *AddrRef = + // getOrCreateInternalVariable(Addr->getType(), RefName); + // auto *GVAddrRef = cast(AddrRef); + // GVAddrRef->setConstant([>Val=<]true); + // GVAddrRef->setLinkage(llvm::GlobalValue::InternalLinkage); + // GVAddrRef->setInitializer(Addr); + // CGM.addCompilerUsedGlobal(GVAddrRef); + //} + //} } else { assert(((*Res == OMPDeclareTargetDeclAttr::MT_Link) || (*Res == OMPDeclareTargetDeclAttr::MT_To && diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h @@ -440,15 +440,14 @@ /// The data for the single globalized variable. struct MappedVarData { /// Corresponding field in the global record. - const FieldDecl *FD = nullptr; + llvm::Value *GlobalizedVal = nullptr; /// Corresponding address. Address PrivateAddr = Address::invalid(); /// true, if only one element is required (for latprivates in SPMD mode), /// false, if need to create based on the warp-size. bool IsOnePerTeam = false; MappedVarData() = delete; - MappedVarData(const FieldDecl *FD, bool IsOnePerTeam = false) - : FD(FD), IsOnePerTeam(IsOnePerTeam) {} + MappedVarData(bool IsOnePerTeam = false) : IsOnePerTeam(IsOnePerTeam) {} }; /// The map of local variables to their addresses in the global memory. using DeclToAddrMapTy = llvm::MapVector; @@ -460,29 +459,12 @@ EscapedParamsTy EscapedParameters; llvm::SmallVector EscapedVariableLengthDecls; llvm::SmallVector EscapedVariableLengthDeclsAddrs; - const RecordDecl *GlobalRecord = nullptr; - llvm::Optional SecondaryGlobalRecord = llvm::None; - llvm::Value *GlobalRecordAddr = nullptr; llvm::Value *IsInSPMDModeFlag = nullptr; std::unique_ptr MappedParams; }; /// Maps the function to the list of the globalized variables with their /// addresses. llvm::SmallDenseMap FunctionGlobalizedDecls; - /// List of records for the globalized variables in target/teams/distribute - /// contexts. Inner records are going to be joined into the single record, - /// while those resulting records are going to be joined into the single - /// union. This resulting union (one per CU) is the entry point for the static - /// memory management runtime functions. 
- struct GlobalPtrSizeRecsTy { - llvm::GlobalVariable *UseSharedMemory = nullptr; - llvm::GlobalVariable *RecSize = nullptr; - llvm::GlobalVariable *Buffer = nullptr; - SourceLocation Loc; - llvm::SmallVector Records; - unsigned RegionCounter = 0; - }; - llvm::SmallVector GlobalizedRecords; llvm::GlobalVariable *KernelTeamsReductionPtr = nullptr; /// List of the records with the list of fields for the reductions across the /// teams. Used to build the intermediate buffer for the fast teams diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -1096,17 +1096,6 @@ } Action(EST, WST); CodeGen.setAction(Action); IsInTTDRegion = true; - // Reserve place for the globalized memory. - GlobalizedRecords.emplace_back(); - if (!KernelStaticGlobalized) { - KernelStaticGlobalized = new llvm::GlobalVariable( - CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false, - llvm::GlobalValue::InternalLinkage, - llvm::UndefValue::get(CGM.VoidPtrTy), - "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr, - llvm::GlobalValue::NotThreadLocal, - CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared)); - } emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry, CodeGen); IsInTTDRegion = false; @@ -1156,10 +1145,6 @@ CGM.getModule(), OMPRTL___kmpc_kernel_init), Args); - // For data sharing, we need to initialize the stack. - CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_data_sharing_init_stack)); - emitGenericVarsProlog(CGF, WST.Loc); } @@ -1228,17 +1213,6 @@ } Action(*this, EST, D); CodeGen.setAction(Action); IsInTTDRegion = true; - // Reserve place for the globalized memory. - GlobalizedRecords.emplace_back(); - if (!KernelStaticGlobalized) { - KernelStaticGlobalized = new llvm::GlobalVariable( - CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false, - llvm::GlobalValue::InternalLinkage, - llvm::UndefValue::get(CGM.VoidPtrTy), - "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr, - llvm::GlobalValue::NotThreadLocal, - CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared)); - } emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry, CodeGen); IsInTTDRegion = false; @@ -1260,12 +1234,6 @@ CGM.getModule(), OMPRTL___kmpc_spmd_kernel_init), Args); - if (RequiresFullRuntime) { - // For data sharing, we need to initialize the stack. 
- CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_data_sharing_init_stack_spmd)); - } - CGF.EmitBranch(ExecuteBB); CGF.EmitBlock(ExecuteBB); @@ -1671,7 +1639,6 @@ static_cast(CGF.CGM.getOpenMPRuntime()); if (GlobalizedRD) { auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first; - I->getSecond().GlobalRecord = GlobalizedRD; I->getSecond().MappedParams = std::make_unique(); DeclToAddrMapTy &Data = I->getSecond().LocalVarData; @@ -1679,8 +1646,7 @@ assert(Pair.getFirst()->isCanonicalDecl() && "Expected canonical declaration"); Data.insert(std::make_pair(Pair.getFirst(), - MappedVarData(Pair.getSecond(), - /*IsOnePerTeam=*/true))); + MappedVarData(/*IsOnePerTeam=*/true))); } } Rt.emitGenericVarsProlog(CGF, Loc); @@ -1709,282 +1675,67 @@ const auto I = FunctionGlobalizedDecls.find(CGF.CurFn); if (I == FunctionGlobalizedDecls.end()) return; - if (const RecordDecl *GlobalizedVarsRecord = I->getSecond().GlobalRecord) { - QualType GlobalRecTy = CGM.getContext().getRecordType(GlobalizedVarsRecord); - QualType SecGlobalRecTy; - // Recover pointer to this function's global record. The runtime will - // handle the specifics of the allocation of the memory. - // Use actual memory size of the record including the padding - // for alignment purposes. - unsigned Alignment = - CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity(); - unsigned GlobalRecordSize = - CGM.getContext().getTypeSizeInChars(GlobalRecTy).getQuantity(); - GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment); + for (auto &Rec : I->getSecond().LocalVarData) { + const auto *VD = cast(Rec.first); + bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first); + QualType VarTy = VD->getType(); - llvm::PointerType *GlobalRecPtrTy = - CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo(); - llvm::Value *GlobalRecCastAddr; - llvm::Value *IsTTD = nullptr; - if (!IsInTTDRegion && - (WithSPMDCheck || - getExecutionMode() == CGOpenMPRuntimeGPU::EM_Unknown)) { - llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit"); - llvm::BasicBlock *SPMDBB = CGF.createBasicBlock(".spmd"); - llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd"); - if (I->getSecond().SecondaryGlobalRecord.hasValue()) { - llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc); - llvm::Value *ThreadID = getThreadID(CGF, Loc); - llvm::Value *PL = CGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), - OMPRTL___kmpc_parallel_level), - {RTLoc, ThreadID}); - IsTTD = Bld.CreateIsNull(PL); - } - llvm::Value *IsSPMD = Bld.CreateIsNotNull( - CGF.EmitNounwindRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_is_spmd_exec_mode))); - Bld.CreateCondBr(IsSPMD, SPMDBB, NonSPMDBB); - // There is no need to emit line number for unconditional branch. - (void)ApplyDebugLocation::CreateEmpty(CGF); - CGF.EmitBlock(SPMDBB); - Address RecPtr = Address(llvm::ConstantPointerNull::get(GlobalRecPtrTy), - CharUnits::fromQuantity(Alignment)); - CGF.EmitBranch(ExitBB); - // There is no need to emit line number for unconditional branch. - (void)ApplyDebugLocation::CreateEmpty(CGF); - CGF.EmitBlock(NonSPMDBB); - llvm::Value *Size = llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize); - if (const RecordDecl *SecGlobalizedVarsRecord = - I->getSecond().SecondaryGlobalRecord.getValueOr(nullptr)) { - SecGlobalRecTy = - CGM.getContext().getRecordType(SecGlobalizedVarsRecord); - - // Recover pointer to this function's global record. 
The runtime will - // handle the specifics of the allocation of the memory. - // Use actual memory size of the record including the padding - // for alignment purposes. - unsigned Alignment = - CGM.getContext().getTypeAlignInChars(SecGlobalRecTy).getQuantity(); - unsigned GlobalRecordSize = - CGM.getContext().getTypeSizeInChars(SecGlobalRecTy).getQuantity(); - GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment); - Size = Bld.CreateSelect( - IsTTD, llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize), Size); - } - // TODO: allow the usage of shared memory to be controlled by - // the user, for now, default to global. - llvm::Value *GlobalRecordSizeArg[] = { - Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)}; - llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_data_sharing_coalesced_push_stack), - GlobalRecordSizeArg); - GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast( - GlobalRecValue, GlobalRecPtrTy); - CGF.EmitBlock(ExitBB); - auto *Phi = Bld.CreatePHI(GlobalRecPtrTy, - /*NumReservedValues=*/2, "_select_stack"); - Phi->addIncoming(RecPtr.getPointer(), SPMDBB); - Phi->addIncoming(GlobalRecCastAddr, NonSPMDBB); - GlobalRecCastAddr = Phi; - I->getSecond().GlobalRecordAddr = Phi; - I->getSecond().IsInSPMDModeFlag = IsSPMD; - } else if (!CGM.getLangOpts().OpenMPCUDATargetParallel && IsInTTDRegion) { - assert(GlobalizedRecords.back().Records.size() < 2 && - "Expected less than 2 globalized records: one for target and one " - "for teams."); - unsigned Offset = 0; - for (const RecordDecl *RD : GlobalizedRecords.back().Records) { - QualType RDTy = CGM.getContext().getRecordType(RD); - unsigned Alignment = - CGM.getContext().getTypeAlignInChars(RDTy).getQuantity(); - unsigned Size = CGM.getContext().getTypeSizeInChars(RDTy).getQuantity(); - Offset = - llvm::alignTo(llvm::alignTo(Offset, Alignment) + Size, Alignment); - } - unsigned Alignment = - CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity(); - Offset = llvm::alignTo(Offset, Alignment); - GlobalizedRecords.back().Records.push_back(GlobalizedVarsRecord); - ++GlobalizedRecords.back().RegionCounter; - if (GlobalizedRecords.back().Records.size() == 1) { - assert(KernelStaticGlobalized && - "Kernel static pointer must be initialized already."); - auto *UseSharedMemory = new llvm::GlobalVariable( - CGM.getModule(), CGM.Int16Ty, /*isConstant=*/true, - llvm::GlobalValue::InternalLinkage, nullptr, - "_openmp_static_kernel$is_shared"); - UseSharedMemory->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global); - QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth( - /*DestWidth=*/16, /*Signed=*/0); - llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar( - Address(UseSharedMemory, - CGM.getContext().getTypeAlignInChars(Int16Ty)), - /*Volatile=*/false, Int16Ty, Loc); - auto *StaticGlobalized = new llvm::GlobalVariable( - CGM.getModule(), CGM.Int8Ty, /*isConstant=*/false, - llvm::GlobalValue::CommonLinkage, nullptr); - auto *RecSize = new llvm::GlobalVariable( - CGM.getModule(), CGM.SizeTy, /*isConstant=*/true, - llvm::GlobalValue::InternalLinkage, nullptr, - "_openmp_static_kernel$size"); - RecSize->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global); - llvm::Value *Ld = CGF.EmitLoadOfScalar( - Address(RecSize, CGM.getSizeAlign()), /*Volatile=*/false, - CGM.getContext().getSizeType(), Loc); - llvm::Value *ResAddr = Bld.CreatePointerBitCastOrAddrSpaceCast( - KernelStaticGlobalized, CGM.VoidPtrPtrTy); - llvm::Value *GlobalRecordSizeArg[] = 
{ - llvm::ConstantInt::get( - CGM.Int16Ty, - getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD ? 1 : 0), - StaticGlobalized, Ld, IsInSharedMemory, ResAddr}; - CGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_get_team_static_memory), - GlobalRecordSizeArg); - GlobalizedRecords.back().Buffer = StaticGlobalized; - GlobalizedRecords.back().RecSize = RecSize; - GlobalizedRecords.back().UseSharedMemory = UseSharedMemory; - GlobalizedRecords.back().Loc = Loc; - } - assert(KernelStaticGlobalized && "Global address must be set already."); - Address FrameAddr = CGF.EmitLoadOfPointer( - Address(KernelStaticGlobalized, CGM.getPointerAlign()), - CGM.getContext() - .getPointerType(CGM.getContext().VoidPtrTy) - .castAs()); - llvm::Value *GlobalRecValue = - Bld.CreateConstInBoundsGEP(FrameAddr, Offset).getPointer(); - I->getSecond().GlobalRecordAddr = GlobalRecValue; - I->getSecond().IsInSPMDModeFlag = nullptr; - GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast( - GlobalRecValue, CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo()); - } else { - // TODO: allow the usage of shared memory to be controlled by - // the user, for now, default to global. - bool UseSharedMemory = - IsInTTDRegion && GlobalRecordSize <= SharedMemorySize; - llvm::Value *GlobalRecordSizeArg[] = { - llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize), - CGF.Builder.getInt16(UseSharedMemory ? 1 : 0)}; - llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), - IsInTTDRegion ? OMPRTL___kmpc_data_sharing_push_stack - : OMPRTL___kmpc_data_sharing_coalesced_push_stack), - GlobalRecordSizeArg); - GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast( - GlobalRecValue, GlobalRecPtrTy); - I->getSecond().GlobalRecordAddr = GlobalRecValue; - I->getSecond().IsInSPMDModeFlag = nullptr; + // Get the local allocation of a firstprivate variable before sharing + llvm::Value *ParValue; + if (EscapedParam) { + LValue ParLVal = + CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType()); + ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc); } - LValue Base = - CGF.MakeNaturalAlignPointeeAddrLValue(GlobalRecCastAddr, GlobalRecTy); + // Get the size needed in the stack. Logic of how much to allocate + // and which part to give to wich thread is inside the runtime function. + llvm::Value *Size = CGF.getTypeSize(VD->getType()); + llvm::Value *VoidPtr = + CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_alloc_shared), + {Size}); - // Emit the "global alloca" which is a GEP from the global declaration - // record using the pointer returned by the runtime. 
- LValue SecBase; - decltype(I->getSecond().LocalVarData)::const_iterator SecIt; - if (IsTTD) { - SecIt = I->getSecond().SecondaryLocalVarData->begin(); - llvm::PointerType *SecGlobalRecPtrTy = - CGF.ConvertTypeForMem(SecGlobalRecTy)->getPointerTo(); - SecBase = CGF.MakeNaturalAlignPointeeAddrLValue( - Bld.CreatePointerBitCastOrAddrSpaceCast( - I->getSecond().GlobalRecordAddr, SecGlobalRecPtrTy), - SecGlobalRecTy); - } - for (auto &Rec : I->getSecond().LocalVarData) { - bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first); - llvm::Value *ParValue; - if (EscapedParam) { - const auto *VD = cast(Rec.first); - LValue ParLVal = - CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType()); - ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc); - } - LValue VarAddr = CGF.EmitLValueForField(Base, Rec.second.FD); - // Emit VarAddr basing on lane-id if required. - QualType VarTy; - if (Rec.second.IsOnePerTeam) { - VarTy = Rec.second.FD->getType(); - } else { - Address Addr = VarAddr.getAddress(CGF); - llvm::Value *Ptr = CGF.Builder.CreateInBoundsGEP( - Addr.getElementType(), Addr.getPointer(), - {Bld.getInt32(0), getNVPTXLaneID(CGF)}); - VarTy = - Rec.second.FD->getType()->castAsArrayTypeUnsafe()->getElementType(); - VarAddr = CGF.MakeAddrLValue( - Address(Ptr, CGM.getContext().getDeclAlign(Rec.first)), VarTy, - AlignmentSource::Decl); - } - Rec.second.PrivateAddr = VarAddr.getAddress(CGF); - if (!IsInTTDRegion && - (WithSPMDCheck || - getExecutionMode() == CGOpenMPRuntimeGPU::EM_Unknown)) { - assert(I->getSecond().IsInSPMDModeFlag && - "Expected unknown execution mode or required SPMD check."); - if (IsTTD) { - assert(SecIt->second.IsOnePerTeam && - "Secondary glob data must be one per team."); - LValue SecVarAddr = CGF.EmitLValueForField(SecBase, SecIt->second.FD); - VarAddr.setAddress( - Address(Bld.CreateSelect(IsTTD, SecVarAddr.getPointer(CGF), - VarAddr.getPointer(CGF)), - VarAddr.getAlignment())); - Rec.second.PrivateAddr = VarAddr.getAddress(CGF); - } - Address GlobalPtr = Rec.second.PrivateAddr; - Address LocalAddr = CGF.CreateMemTemp(VarTy, Rec.second.FD->getName()); - Rec.second.PrivateAddr = Address( - Bld.CreateSelect(I->getSecond().IsInSPMDModeFlag, - LocalAddr.getPointer(), GlobalPtr.getPointer()), - LocalAddr.getAlignment()); - } - if (EscapedParam) { - const auto *VD = cast(Rec.first); - CGF.EmitStoreOfScalar(ParValue, VarAddr); - I->getSecond().MappedParams->setVarAddr(CGF, VD, - VarAddr.getAddress(CGF)); - } - if (IsTTD) - ++SecIt; + Rec.second.GlobalizedVal = VoidPtr; + + // Cast the void pointer and get the address of the globalized variable. + llvm::PointerType *VarPtrTy = CGF.ConvertTypeForMem(VarTy)->getPointerTo(); + llvm::Value *CastedVoidPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( + VoidPtr, VarPtrTy, VD->getName() + "_on_stack"); + LValue VarAddr = CGF.MakeNaturalAlignAddrLValue(CastedVoidPtr, VarTy); + Rec.second.PrivateAddr = VarAddr.getAddress(CGF); + + // Assign the local allocation to the newly globalized location. + if (EscapedParam) { + CGF.EmitStoreOfScalar(ParValue, VarAddr); + I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress(CGF)); } } - for (const ValueDecl *VD : I->getSecond().EscapedVariableLengthDecls) { - // Recover pointer to this function's global record. The runtime will - // handle the specifics of the allocation of the memory. 
- // Use actual memory size of the record including the padding + for (const auto *VD : I->getSecond().EscapedVariableLengthDecls) { + // Use actual memory size of the VLA object including the padding // for alignment purposes. - CGBuilderTy &Bld = CGF.Builder; llvm::Value *Size = CGF.getTypeSize(VD->getType()); CharUnits Align = CGM.getContext().getDeclAlign(VD); Size = Bld.CreateNUWAdd( Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1)); llvm::Value *AlignVal = llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity()); + Size = Bld.CreateUDiv(Size, AlignVal); Size = Bld.CreateNUWMul(Size, AlignVal); - // TODO: allow the usage of shared memory to be controlled by - // the user, for now, default to global. - llvm::Value *GlobalRecordSizeArg[] = { - Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)}; - llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_data_sharing_coalesced_push_stack), - GlobalRecordSizeArg); - llvm::Value *GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast( - GlobalRecValue, CGF.ConvertTypeForMem(VD->getType())->getPointerTo()); - LValue Base = CGF.MakeAddrLValue(GlobalRecCastAddr, VD->getType(), + + // Allocate space for this VLA object to be globalized + llvm::Value *VoidPtr = + CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_alloc_shared), + {Size}); + + I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(VoidPtr); + LValue Base = CGF.MakeAddrLValue(VoidPtr, VD->getType(), CGM.getContext().getDeclAlign(VD), AlignmentSource::Decl); I->getSecond().MappedParams->setVarAddr(CGF, cast(VD), Base.getAddress(CGF)); - I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(GlobalRecValue); } I->getSecond().MappedParams->apply(CGF); } @@ -1997,60 +1748,20 @@ const auto I = FunctionGlobalizedDecls.find(CGF.CurFn); if (I != FunctionGlobalizedDecls.end()) { - I->getSecond().MappedParams->restore(CGF); - if (!CGF.HaveInsertPoint()) - return; + // Deallocate the memory for each globalized VLA object for (llvm::Value *Addr : llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) { - CGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_data_sharing_pop_stack), - Addr); + CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_free_shared), + Addr); } - if (I->getSecond().GlobalRecordAddr) { - if (!IsInTTDRegion && - (WithSPMDCheck || - getExecutionMode() == CGOpenMPRuntimeGPU::EM_Unknown)) { - CGBuilderTy &Bld = CGF.Builder; - llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit"); - llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd"); - Bld.CreateCondBr(I->getSecond().IsInSPMDModeFlag, ExitBB, NonSPMDBB); - // There is no need to emit line number for unconditional branch. - (void)ApplyDebugLocation::CreateEmpty(CGF); - CGF.EmitBlock(NonSPMDBB); - CGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_data_sharing_pop_stack), - CGF.EmitCastToVoidPtr(I->getSecond().GlobalRecordAddr)); - CGF.EmitBlock(ExitBB); - } else if (!CGM.getLangOpts().OpenMPCUDATargetParallel && IsInTTDRegion) { - assert(GlobalizedRecords.back().RegionCounter > 0 && - "region counter must be > 0."); - --GlobalizedRecords.back().RegionCounter; - // Emit the restore function only in the target region. 
- if (GlobalizedRecords.back().RegionCounter == 0) { - QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth( - /*DestWidth=*/16, /*Signed=*/0); - llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar( - Address(GlobalizedRecords.back().UseSharedMemory, - CGM.getContext().getTypeAlignInChars(Int16Ty)), - /*Volatile=*/false, Int16Ty, GlobalizedRecords.back().Loc); - llvm::Value *Args[] = { - llvm::ConstantInt::get( - CGM.Int16Ty, - getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD ? 1 : 0), - IsInSharedMemory}; - CGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_restore_team_static_memory), - Args); - } - } else { - CGF.EmitRuntimeCall( - OMPBuilder.getOrCreateRuntimeFunction( - CGM.getModule(), OMPRTL___kmpc_data_sharing_pop_stack), - I->getSecond().GlobalRecordAddr); - } + // Deallocate the memory for each globalized value + for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) { + I->getSecond().MappedParams->restore(CGF); + + CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_free_shared), + {Rec.second.GlobalizedVal}); } } } @@ -2093,6 +1804,14 @@ // Force inline this outlined function at its call site. Fn->setLinkage(llvm::GlobalValue::InternalLinkage); + // Ensure we do not inline the function. This is trivially true for the ones + // passed to __kmpc_fork_call but the ones calles in serialized regions + // could be inlined. This is not a perfect but it is closer to the invariant + // we want, namely, every data environment starts with a new function. + // TODO: We should pass the if condition to the runtime function and do the + // handling there. Much cleaner code. + cast(OutlinedFn)->addFnAttr(llvm::Attribute::NoInline); + Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty, /*Name=*/".zero.addr"); CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0)); @@ -4208,15 +3927,6 @@ auto *Fn = llvm::Function::Create( CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule()); - - // Ensure we do not inline the function. This is trivially true for the ones - // passed to __kmpc_fork_call but the ones calles in serialized regions - // could be inlined. This is not a perfect but it is closer to the invariant - // we want, namely, every data environment starts with a new function. - // TODO: We should pass the if condition to the runtime function and do the - // handling there. Much cleaner code. 
- Fn->addFnAttr(llvm::Attribute::NoInline); - CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI); Fn->setLinkage(llvm::GlobalValue::InternalLinkage); Fn->setDoesNotRecurse(); @@ -4334,6 +4044,7 @@ } if (!Body) return; + CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second); VarChecker.Visit(Body); const RecordDecl *GlobalizedVarsRecord = @@ -4347,7 +4058,6 @@ auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first; I->getSecond().MappedParams = std::make_unique(); - I->getSecond().GlobalRecord = GlobalizedVarsRecord; I->getSecond().EscapedParameters.insert( VarChecker.getEscapedParameters().begin(), VarChecker.getEscapedParameters().end()); @@ -4356,21 +4066,16 @@ DeclToAddrMapTy &Data = I->getSecond().LocalVarData; for (const ValueDecl *VD : VarChecker.getEscapedDecls()) { assert(VD->isCanonicalDecl() && "Expected canonical declaration"); - const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD); - Data.insert(std::make_pair(VD, MappedVarData(FD, IsInTTDRegion))); + Data.insert(std::make_pair(VD, MappedVarData(IsInTTDRegion))); } if (!IsInTTDRegion && !NeedToDelayGlobalization && !IsInParallelRegion) { CheckVarsEscapingDeclContext VarChecker(CGF, llvm::None); VarChecker.Visit(Body); - I->getSecond().SecondaryGlobalRecord = - VarChecker.getGlobalizedRecord(/*IsInTTDRegion=*/true); I->getSecond().SecondaryLocalVarData.emplace(); DeclToAddrMapTy &Data = I->getSecond().SecondaryLocalVarData.getValue(); for (const ValueDecl *VD : VarChecker.getEscapedDecls()) { assert(VD->isCanonicalDecl() && "Expected canonical declaration"); - const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD); - Data.insert( - std::make_pair(VD, MappedVarData(FD, /*IsInTTDRegion=*/true))); + Data.insert(std::make_pair(VD, MappedVarData(/*IsInTTDRegion=*/true))); } } if (!NeedToDelayGlobalization) { @@ -4661,185 +4366,8 @@ CGOpenMPRuntime::processRequiresDirective(D); } -/// Get number of SMs and number of blocks per SM. 
-static std::pair getSMsBlocksPerSM(CodeGenModule &CGM) { - std::pair Data; - if (CGM.getLangOpts().OpenMPCUDANumSMs) - Data.first = CGM.getLangOpts().OpenMPCUDANumSMs; - if (CGM.getLangOpts().OpenMPCUDABlocksPerSM) - Data.second = CGM.getLangOpts().OpenMPCUDABlocksPerSM; - if (Data.first && Data.second) - return Data; - switch (getCudaArch(CGM)) { - case CudaArch::SM_20: - case CudaArch::SM_21: - case CudaArch::SM_30: - case CudaArch::SM_32: - case CudaArch::SM_35: - case CudaArch::SM_37: - case CudaArch::SM_50: - case CudaArch::SM_52: - case CudaArch::SM_53: - return {16, 16}; - case CudaArch::SM_60: - case CudaArch::SM_61: - case CudaArch::SM_62: - return {56, 32}; - case CudaArch::SM_70: - case CudaArch::SM_72: - case CudaArch::SM_75: - case CudaArch::SM_80: - case CudaArch::SM_86: - return {84, 32}; - case CudaArch::GFX600: - case CudaArch::GFX601: - case CudaArch::GFX602: - case CudaArch::GFX700: - case CudaArch::GFX701: - case CudaArch::GFX702: - case CudaArch::GFX703: - case CudaArch::GFX704: - case CudaArch::GFX705: - case CudaArch::GFX801: - case CudaArch::GFX802: - case CudaArch::GFX803: - case CudaArch::GFX805: - case CudaArch::GFX810: - case CudaArch::GFX900: - case CudaArch::GFX902: - case CudaArch::GFX904: - case CudaArch::GFX906: - case CudaArch::GFX908: - case CudaArch::GFX909: - case CudaArch::GFX90a: - case CudaArch::GFX90c: - case CudaArch::GFX1010: - case CudaArch::GFX1011: - case CudaArch::GFX1012: - case CudaArch::GFX1030: - case CudaArch::GFX1031: - case CudaArch::GFX1032: - case CudaArch::GFX1033: - case CudaArch::UNUSED: - case CudaArch::UNKNOWN: - break; - case CudaArch::LAST: - llvm_unreachable("Unexpected Cuda arch."); - } - llvm_unreachable("Unexpected NVPTX target without ptx feature."); -} - void CGOpenMPRuntimeGPU::clear() { - if (!GlobalizedRecords.empty() && - !CGM.getLangOpts().OpenMPCUDATargetParallel) { - ASTContext &C = CGM.getContext(); - llvm::SmallVector GlobalRecs; - llvm::SmallVector SharedRecs; - RecordDecl *StaticRD = C.buildImplicitRecord( - "_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union); - StaticRD->startDefinition(); - RecordDecl *SharedStaticRD = C.buildImplicitRecord( - "_shared_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union); - SharedStaticRD->startDefinition(); - for (const GlobalPtrSizeRecsTy &Records : GlobalizedRecords) { - if (Records.Records.empty()) - continue; - unsigned Size = 0; - unsigned RecAlignment = 0; - for (const RecordDecl *RD : Records.Records) { - QualType RDTy = C.getRecordType(RD); - unsigned Alignment = C.getTypeAlignInChars(RDTy).getQuantity(); - RecAlignment = std::max(RecAlignment, Alignment); - unsigned RecSize = C.getTypeSizeInChars(RDTy).getQuantity(); - Size = - llvm::alignTo(llvm::alignTo(Size, Alignment) + RecSize, Alignment); - } - Size = llvm::alignTo(Size, RecAlignment); - llvm::APInt ArySize(/*numBits=*/64, Size); - QualType SubTy = C.getConstantArrayType( - C.CharTy, ArySize, nullptr, ArrayType::Normal, /*IndexTypeQuals=*/0); - const bool UseSharedMemory = Size <= SharedMemorySize; - auto *Field = - FieldDecl::Create(C, UseSharedMemory ? 
SharedStaticRD : StaticRD, - SourceLocation(), SourceLocation(), nullptr, SubTy, - C.getTrivialTypeSourceInfo(SubTy, SourceLocation()), - /*BW=*/nullptr, /*Mutable=*/false, - /*InitStyle=*/ICIS_NoInit); - Field->setAccess(AS_public); - if (UseSharedMemory) { - SharedStaticRD->addDecl(Field); - SharedRecs.push_back(&Records); - } else { - StaticRD->addDecl(Field); - GlobalRecs.push_back(&Records); - } - Records.RecSize->setInitializer(llvm::ConstantInt::get(CGM.SizeTy, Size)); - Records.UseSharedMemory->setInitializer( - llvm::ConstantInt::get(CGM.Int16Ty, UseSharedMemory ? 1 : 0)); - } - // Allocate SharedMemorySize buffer for the shared memory. - // FIXME: nvlink does not handle weak linkage correctly (object with the - // different size are reported as erroneous). - // Restore this code as sson as nvlink is fixed. - if (!SharedStaticRD->field_empty()) { - llvm::APInt ArySize(/*numBits=*/64, SharedMemorySize); - QualType SubTy = C.getConstantArrayType( - C.CharTy, ArySize, nullptr, ArrayType::Normal, /*IndexTypeQuals=*/0); - auto *Field = FieldDecl::Create( - C, SharedStaticRD, SourceLocation(), SourceLocation(), nullptr, SubTy, - C.getTrivialTypeSourceInfo(SubTy, SourceLocation()), - /*BW=*/nullptr, /*Mutable=*/false, - /*InitStyle=*/ICIS_NoInit); - Field->setAccess(AS_public); - SharedStaticRD->addDecl(Field); - } - SharedStaticRD->completeDefinition(); - if (!SharedStaticRD->field_empty()) { - QualType StaticTy = C.getRecordType(SharedStaticRD); - llvm::Type *LLVMStaticTy = CGM.getTypes().ConvertTypeForMem(StaticTy); - auto *GV = new llvm::GlobalVariable( - CGM.getModule(), LLVMStaticTy, - /*isConstant=*/false, llvm::GlobalValue::WeakAnyLinkage, - llvm::UndefValue::get(LLVMStaticTy), - "_openmp_shared_static_glob_rd_$_", /*InsertBefore=*/nullptr, - llvm::GlobalValue::NotThreadLocal, - C.getTargetAddressSpace(LangAS::cuda_shared)); - auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast( - GV, CGM.VoidPtrTy); - for (const GlobalPtrSizeRecsTy *Rec : SharedRecs) { - Rec->Buffer->replaceAllUsesWith(Replacement); - Rec->Buffer->eraseFromParent(); - } - } - StaticRD->completeDefinition(); - if (!StaticRD->field_empty()) { - QualType StaticTy = C.getRecordType(StaticRD); - std::pair SMsBlockPerSM = getSMsBlocksPerSM(CGM); - llvm::APInt Size1(32, SMsBlockPerSM.second); - QualType Arr1Ty = - C.getConstantArrayType(StaticTy, Size1, nullptr, ArrayType::Normal, - /*IndexTypeQuals=*/0); - llvm::APInt Size2(32, SMsBlockPerSM.first); - QualType Arr2Ty = - C.getConstantArrayType(Arr1Ty, Size2, nullptr, ArrayType::Normal, - /*IndexTypeQuals=*/0); - llvm::Type *LLVMArr2Ty = CGM.getTypes().ConvertTypeForMem(Arr2Ty); - // FIXME: nvlink does not handle weak linkage correctly (object with the - // different size are reported as erroneous). - // Restore CommonLinkage as soon as nvlink is fixed. 
- auto *GV = new llvm::GlobalVariable( - CGM.getModule(), LLVMArr2Ty, - /*isConstant=*/false, llvm::GlobalValue::InternalLinkage, - llvm::Constant::getNullValue(LLVMArr2Ty), - "_openmp_static_glob_rd_$_"); - auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast( - GV, CGM.VoidPtrTy); - for (const GlobalPtrSizeRecsTy *Rec : GlobalRecs) { - Rec->Buffer->replaceAllUsesWith(Replacement); - Rec->Buffer->eraseFromParent(); - } - } - } + if (!TeamsReductions.empty()) { ASTContext &C = CGM.getContext(); RecordDecl *StaticRD = C.buildImplicitRecord( diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -12604,6 +12604,10 @@ return; } + // If we there is no initializer requested we don't check for one. + if (Var->hasAttr()) + return; + if (!Var->isInvalidDecl() && RealDecl->hasAttr()) { if (Var->getStorageClass() == SC_Extern) { Diag(Var->getLocation(), diag::err_loader_uninitialized_extern_decl) diff --git a/llvm/include/llvm/Analysis/TargetLibraryInfo.def b/llvm/include/llvm/Analysis/TargetLibraryInfo.def --- a/llvm/include/llvm/Analysis/TargetLibraryInfo.def +++ b/llvm/include/llvm/Analysis/TargetLibraryInfo.def @@ -329,6 +329,12 @@ /// int __isoc99_sscanf(const char *s, const char *format, ...) TLI_DEFINE_ENUM_INTERNAL(dunder_isoc99_sscanf) TLI_DEFINE_STRING_INTERNAL("__isoc99_sscanf") +/// void __kmpc_alloc_shared(size_t nbyte); +TLI_DEFINE_ENUM_INTERNAL(__kmpc_alloc_shared) +TLI_DEFINE_STRING_INTERNAL("__kmpc_alloc_shared") +/// void __kmpc_free_shared(void *ptr); +TLI_DEFINE_ENUM_INTERNAL(__kmpc_free_shared) +TLI_DEFINE_STRING_INTERNAL("__kmpc_free_shared") /// double __log10_finite(double x); TLI_DEFINE_ENUM_INTERNAL(log10_finite) TLI_DEFINE_STRING_INTERNAL("__log10_finite") diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def --- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def +++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def @@ -425,12 +425,9 @@ GlobalListPtr, GlobalListPtr, GlobalListPtr, GlobalListPtr) __OMP_RTL(__kmpc_shuffle_int64, false, Int64, Int64, Int16, Int16) -__OMP_RTL(__kmpc_data_sharing_init_stack, false, Void, ) -__OMP_RTL(__kmpc_data_sharing_init_stack_spmd, false, Void, ) -__OMP_RTL(__kmpc_data_sharing_coalesced_push_stack, false, VoidPtr, SizeTy, Int16) -__OMP_RTL(__kmpc_data_sharing_push_stack, false, VoidPtr, SizeTy, Int16) -__OMP_RTL(__kmpc_data_sharing_pop_stack, false, Void, VoidPtr) +__OMP_RTL(__kmpc_alloc_shared, false, VoidPtr, SizeTy) +__OMP_RTL(__kmpc_free_shared, false, Void, VoidPtr) __OMP_RTL(__kmpc_begin_sharing_variables, false, Void, VoidPtrPtrPtr, SizeTy) __OMP_RTL(__kmpc_end_sharing_variables, false, Void, ) __OMP_RTL(__kmpc_get_shared_variables, false, Void, VoidPtrPtrPtr) @@ -526,6 +523,11 @@ EnumAttr(NoCapture)) : AttributeSet()) +__OMP_ATTRS_SET(NoCaptureAttrs, + OptimisticAttributes + ? 
AttributeSet(EnumAttr(NoCapture)) + : AttributeSet(EnumAttr(NoCapture))) + #if 0 __OMP_ATTRS_SET(WriteOnlyPtrAttrs, OptimisticAttributes @@ -841,38 +843,26 @@ __OMP_RTL_ATTRS(__kmpc_alloc, DefaultAttrs, ReturnPtrAttrs, ParamAttrs()) __OMP_RTL_ATTRS(__kmpc_free, AllocAttrs, AttributeSet(), ParamAttrs()) -__OMP_RTL_ATTRS(__kmpc_init_allocator, DefaultAttrs, ReturnPtrAttrs, - ParamAttrs()) -__OMP_RTL_ATTRS(__kmpc_destroy_allocator, AllocAttrs, AttributeSet(), - ParamAttrs()) +__OMP_RTL_ATTRS(__kmpc_init_allocator, DefaultAttrs, ReturnPtrAttrs, ParamAttrs()) +__OMP_RTL_ATTRS(__kmpc_destroy_allocator, AllocAttrs, AttributeSet(), ParamAttrs()) -__OMP_RTL_ATTRS(__kmpc_push_target_tripcount_mapper, SetterAttrs, - AttributeSet(), ParamAttrs()) +__OMP_RTL_ATTRS(__kmpc_push_target_tripcount_mapper, SetterAttrs, AttributeSet(), ParamAttrs()) __OMP_RTL_ATTRS(__tgt_target_mapper, ForkAttrs, AttributeSet(), ParamAttrs()) -__OMP_RTL_ATTRS(__tgt_target_nowait_mapper, ForkAttrs, AttributeSet(), - ParamAttrs()) -__OMP_RTL_ATTRS(__tgt_target_teams_mapper, ForkAttrs, AttributeSet(), - ParamAttrs()) -__OMP_RTL_ATTRS(__tgt_target_teams_nowait_mapper, ForkAttrs, AttributeSet(), - ParamAttrs()) -__OMP_RTL_ATTRS(__tgt_register_requires, ForkAttrs, AttributeSet(), - ParamAttrs()) -__OMP_RTL_ATTRS(__tgt_target_data_begin_mapper, ForkAttrs, AttributeSet(), - ParamAttrs()) +__OMP_RTL_ATTRS(__tgt_target_nowait_mapper, ForkAttrs, AttributeSet(), ParamAttrs()) +__OMP_RTL_ATTRS(__tgt_target_teams_mapper, ForkAttrs, AttributeSet(), ParamAttrs()) +__OMP_RTL_ATTRS(__tgt_target_teams_nowait_mapper, ForkAttrs, AttributeSet(), ParamAttrs()) +__OMP_RTL_ATTRS(__tgt_register_requires, ForkAttrs, AttributeSet(), ParamAttrs()) +__OMP_RTL_ATTRS(__tgt_target_data_begin_mapper, ForkAttrs, AttributeSet(), ParamAttrs()) __OMP_RTL_ATTRS(__tgt_target_data_begin_nowait_mapper, ForkAttrs, - AttributeSet(), ParamAttrs()) -__OMP_RTL_ATTRS(__tgt_target_data_end_mapper, ForkAttrs, AttributeSet(), - ParamAttrs()) + AttributeSet(), ParamAttrs()) +__OMP_RTL_ATTRS(__tgt_target_data_end_mapper, ForkAttrs, AttributeSet(), ParamAttrs()) __OMP_RTL_ATTRS(__tgt_target_data_end_nowait_mapper, ForkAttrs, - AttributeSet(), ParamAttrs()) -__OMP_RTL_ATTRS(__tgt_target_data_update_mapper, ForkAttrs, AttributeSet(), - ParamAttrs()) + AttributeSet(), ParamAttrs()) +__OMP_RTL_ATTRS(__tgt_target_data_update_mapper, ForkAttrs, AttributeSet(), ParamAttrs()) __OMP_RTL_ATTRS(__tgt_target_data_update_nowait_mapper, ForkAttrs, - AttributeSet(), ParamAttrs()) -__OMP_RTL_ATTRS(__tgt_mapper_num_components, ForkAttrs, AttributeSet(), - ParamAttrs()) -__OMP_RTL_ATTRS(__tgt_push_mapper_component, ForkAttrs, AttributeSet(), - ParamAttrs()) + AttributeSet(), ParamAttrs()) +__OMP_RTL_ATTRS(__tgt_mapper_num_components, ForkAttrs, AttributeSet(), ParamAttrs()) +__OMP_RTL_ATTRS(__tgt_push_mapper_component, ForkAttrs, AttributeSet(), ParamAttrs()) __OMP_RTL_ATTRS(__kmpc_task_allow_completion_event, DefaultAttrs, ReturnPtrAttrs, ParamAttrs(ReadOnlyPtrAttrs)) @@ -967,7 +957,6 @@ __OMP_PROC_BIND_KIND(master, 2) __OMP_PROC_BIND_KIND(close, 3) __OMP_PROC_BIND_KIND(spread, 4) -__OMP_PROC_BIND_KIND(primary, 5) __OMP_PROC_BIND_KIND(default, 6) __OMP_PROC_BIND_KIND(unknown, 7) @@ -1079,7 +1068,6 @@ __OMP_TRAIT_PROPERTY(user, condition, false) __OMP_TRAIT_PROPERTY(user, condition, unknown) -__OMP_TRAIT_SELECTOR_AND_PROPERTY(construct, dispatch) // Note that we put isa last so that the other conditions are checked first. // This allows us to issue warnings wrt. isa only if we match otherwise. 
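[Reviewer note, not part of the patch] The two hunks that follow (MemoryBuiltins.cpp and TargetLibraryInfo.cpp) teach the middle end to treat __kmpc_alloc_shared as a malloc-like allocation function and __kmpc_free_shared as its matching single-argument deallocation function. A minimal host-side sketch of the contract being assumed — illustrative stand-ins only, not the device runtime implementation, which is not shown in this diff:

  #include <cstddef>
  #include <cstdlib>

  // Malloc-like: returns a fresh allocation of NBytes bytes; matches the
  // OMPKinds.def prototype __kmpc_alloc_shared(SizeTy) returning VoidPtr.
  extern "C" void *__kmpc_alloc_shared(std::size_t NBytes) {
    return std::malloc(NBytes); // the real RTL hands out team/shared storage
  }

  // Free-like: releases exactly one prior __kmpc_alloc_shared allocation.
  extern "C" void __kmpc_free_shared(void *Ptr) { std::free(Ptr); }

With that registration in place, allocation-aware analyses (isAllocationFn/isLibFreeFunction and the Attributor-based optimizations later in this patch) can reason about these calls the same way they reason about malloc/free pairs.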
diff --git a/llvm/lib/Analysis/MemoryBuiltins.cpp b/llvm/lib/Analysis/MemoryBuiltins.cpp --- a/llvm/lib/Analysis/MemoryBuiltins.cpp +++ b/llvm/lib/Analysis/MemoryBuiltins.cpp @@ -71,46 +71,65 @@ // FIXME: certain users need more information. E.g., SimplifyLibCalls needs to // know which functions are nounwind, noalias, nocapture parameters, etc. static const std::pair AllocationFnData[] = { - {LibFunc_malloc, {MallocLike, 1, 0, -1}}, - {LibFunc_vec_malloc, {MallocLike, 1, 0, -1}}, - {LibFunc_valloc, {MallocLike, 1, 0, -1}}, - {LibFunc_Znwj, {OpNewLike, 1, 0, -1}}, // new(unsigned int) - {LibFunc_ZnwjRKSt9nothrow_t, {MallocLike, 2, 0, -1}}, // new(unsigned int, nothrow) - {LibFunc_ZnwjSt11align_val_t, {OpNewLike, 2, 0, -1}}, // new(unsigned int, align_val_t) - {LibFunc_ZnwjSt11align_val_tRKSt9nothrow_t, // new(unsigned int, align_val_t, nothrow) - {MallocLike, 3, 0, -1}}, - {LibFunc_Znwm, {OpNewLike, 1, 0, -1}}, // new(unsigned long) - {LibFunc_ZnwmRKSt9nothrow_t, {MallocLike, 2, 0, -1}}, // new(unsigned long, nothrow) - {LibFunc_ZnwmSt11align_val_t, {OpNewLike, 2, 0, -1}}, // new(unsigned long, align_val_t) - {LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t, // new(unsigned long, align_val_t, nothrow) - {MallocLike, 3, 0, -1}}, - {LibFunc_Znaj, {OpNewLike, 1, 0, -1}}, // new[](unsigned int) - {LibFunc_ZnajRKSt9nothrow_t, {MallocLike, 2, 0, -1}}, // new[](unsigned int, nothrow) - {LibFunc_ZnajSt11align_val_t, {OpNewLike, 2, 0, -1}}, // new[](unsigned int, align_val_t) - {LibFunc_ZnajSt11align_val_tRKSt9nothrow_t, // new[](unsigned int, align_val_t, nothrow) - {MallocLike, 3, 0, -1}}, - {LibFunc_Znam, {OpNewLike, 1, 0, -1}}, // new[](unsigned long) - {LibFunc_ZnamRKSt9nothrow_t, {MallocLike, 2, 0, -1}}, // new[](unsigned long, nothrow) - {LibFunc_ZnamSt11align_val_t, {OpNewLike, 2, 0, -1}}, // new[](unsigned long, align_val_t) - {LibFunc_ZnamSt11align_val_tRKSt9nothrow_t, // new[](unsigned long, align_val_t, nothrow) - {MallocLike, 3, 0, -1}}, - {LibFunc_msvc_new_int, {OpNewLike, 1, 0, -1}}, // new(unsigned int) - {LibFunc_msvc_new_int_nothrow, {MallocLike, 2, 0, -1}}, // new(unsigned int, nothrow) - {LibFunc_msvc_new_longlong, {OpNewLike, 1, 0, -1}}, // new(unsigned long long) - {LibFunc_msvc_new_longlong_nothrow, {MallocLike, 2, 0, -1}}, // new(unsigned long long, nothrow) - {LibFunc_msvc_new_array_int, {OpNewLike, 1, 0, -1}}, // new[](unsigned int) - {LibFunc_msvc_new_array_int_nothrow, {MallocLike, 2, 0, -1}}, // new[](unsigned int, nothrow) - {LibFunc_msvc_new_array_longlong, {OpNewLike, 1, 0, -1}}, // new[](unsigned long long) - {LibFunc_msvc_new_array_longlong_nothrow, {MallocLike, 2, 0, -1}}, // new[](unsigned long long, nothrow) - {LibFunc_aligned_alloc, {AlignedAllocLike, 2, 1, -1}}, - {LibFunc_calloc, {CallocLike, 2, 0, 1}}, - {LibFunc_vec_calloc, {CallocLike, 2, 0, 1}}, - {LibFunc_realloc, {ReallocLike, 2, 1, -1}}, - {LibFunc_vec_realloc, {ReallocLike, 2, 1, -1}}, - {LibFunc_reallocf, {ReallocLike, 2, 1, -1}}, - {LibFunc_strdup, {StrDupLike, 1, -1, -1}}, - {LibFunc_strndup, {StrDupLike, 2, 1, -1}} - // TODO: Handle "int posix_memalign(void **, size_t, size_t)" + {LibFunc_malloc, {MallocLike, 1, 0, -1}}, + {LibFunc_vec_malloc, {MallocLike, 1, 0, -1}}, + {LibFunc_valloc, {MallocLike, 1, 0, -1}}, + {LibFunc_Znwj, {OpNewLike, 1, 0, -1}}, // new(unsigned int) + {LibFunc_ZnwjRKSt9nothrow_t, + {MallocLike, 2, 0, -1}}, // new(unsigned int, nothrow) + {LibFunc_ZnwjSt11align_val_t, + {OpNewLike, 2, 0, -1}}, // new(unsigned int, align_val_t) + {LibFunc_ZnwjSt11align_val_tRKSt9nothrow_t, // 
new(unsigned int, + // align_val_t, nothrow) + {MallocLike, 3, 0, -1}}, + {LibFunc_Znwm, {OpNewLike, 1, 0, -1}}, // new(unsigned long) + {LibFunc_ZnwmRKSt9nothrow_t, + {MallocLike, 2, 0, -1}}, // new(unsigned long, nothrow) + {LibFunc_ZnwmSt11align_val_t, + {OpNewLike, 2, 0, -1}}, // new(unsigned long, align_val_t) + {LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t, // new(unsigned long, + // align_val_t, nothrow) + {MallocLike, 3, 0, -1}}, + {LibFunc_Znaj, {OpNewLike, 1, 0, -1}}, // new[](unsigned int) + {LibFunc_ZnajRKSt9nothrow_t, + {MallocLike, 2, 0, -1}}, // new[](unsigned int, nothrow) + {LibFunc_ZnajSt11align_val_t, + {OpNewLike, 2, 0, -1}}, // new[](unsigned int, align_val_t) + {LibFunc_ZnajSt11align_val_tRKSt9nothrow_t, // new[](unsigned int, + // align_val_t, nothrow) + {MallocLike, 3, 0, -1}}, + {LibFunc_Znam, {OpNewLike, 1, 0, -1}}, // new[](unsigned long) + {LibFunc_ZnamRKSt9nothrow_t, + {MallocLike, 2, 0, -1}}, // new[](unsigned long, nothrow) + {LibFunc_ZnamSt11align_val_t, + {OpNewLike, 2, 0, -1}}, // new[](unsigned long, align_val_t) + {LibFunc_ZnamSt11align_val_tRKSt9nothrow_t, // new[](unsigned long, + // align_val_t, nothrow) + {MallocLike, 3, 0, -1}}, + {LibFunc_msvc_new_int, {OpNewLike, 1, 0, -1}}, // new(unsigned int) + {LibFunc_msvc_new_int_nothrow, + {MallocLike, 2, 0, -1}}, // new(unsigned int, nothrow) + {LibFunc_msvc_new_longlong, + {OpNewLike, 1, 0, -1}}, // new(unsigned long long) + {LibFunc_msvc_new_longlong_nothrow, + {MallocLike, 2, 0, -1}}, // new(unsigned long long, nothrow) + {LibFunc_msvc_new_array_int, {OpNewLike, 1, 0, -1}}, // new[](unsigned int) + {LibFunc_msvc_new_array_int_nothrow, + {MallocLike, 2, 0, -1}}, // new[](unsigned int, nothrow) + {LibFunc_msvc_new_array_longlong, + {OpNewLike, 1, 0, -1}}, // new[](unsigned long long) + {LibFunc_msvc_new_array_longlong_nothrow, + {MallocLike, 2, 0, -1}}, // new[](unsigned long long, nothrow) + {LibFunc_aligned_alloc, {AlignedAllocLike, 2, 1, -1}}, + {LibFunc_calloc, {CallocLike, 2, 0, 1}}, + {LibFunc_vec_calloc, {CallocLike, 2, 0, 1}}, + {LibFunc_realloc, {ReallocLike, 2, 1, -1}}, + {LibFunc_vec_realloc, {ReallocLike, 2, 1, -1}}, + {LibFunc_reallocf, {ReallocLike, 2, 1, -1}}, + {LibFunc_strdup, {StrDupLike, 1, -1, -1}}, + {LibFunc_strndup, {StrDupLike, 2, 1, -1}}, + // TODO: Handle "int posix_memalign(void **, size_t, size_t)" + {LibFunc___kmpc_alloc_shared, {MallocLike, 1, 0, -1}}, }; static const Function *getCalledFunction(const Value *V, bool LookThroughBitCast, @@ -433,12 +452,13 @@ bool llvm::isLibFreeFunction(const Function *F, const LibFunc TLIFn) { unsigned ExpectedNumParams; if (TLIFn == LibFunc_free || - TLIFn == LibFunc_ZdlPv || // operator delete(void*) - TLIFn == LibFunc_ZdaPv || // operator delete[](void*) - TLIFn == LibFunc_msvc_delete_ptr32 || // operator delete(void*) - TLIFn == LibFunc_msvc_delete_ptr64 || // operator delete(void*) + TLIFn == LibFunc_ZdlPv || // operator delete(void*) + TLIFn == LibFunc_ZdaPv || // operator delete[](void*) + TLIFn == LibFunc_msvc_delete_ptr32 || // operator delete(void*) + TLIFn == LibFunc_msvc_delete_ptr64 || // operator delete(void*) TLIFn == LibFunc_msvc_delete_array_ptr32 || // operator delete[](void*) - TLIFn == LibFunc_msvc_delete_array_ptr64) // operator delete[](void*) + TLIFn == LibFunc_msvc_delete_array_ptr64 || // operator delete[](void*) + TLIFn == LibFunc___kmpc_free_shared) ExpectedNumParams = 1; else if (TLIFn == LibFunc_ZdlPvj || // delete(void*, uint) TLIFn == LibFunc_ZdlPvm || // delete(void*, ulong) diff --git 
a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp --- a/llvm/lib/Analysis/TargetLibraryInfo.cpp +++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp @@ -158,6 +158,8 @@ TLI.setUnavailable(LibFunc_memcpy); TLI.setUnavailable(LibFunc_memset); TLI.setUnavailable(LibFunc_memset_pattern16); + TLI.setAvailable(llvm::LibFunc___kmpc_alloc_shared); + TLI.setAvailable(llvm::LibFunc___kmpc_free_shared); return; } @@ -601,6 +603,8 @@ // TLI.setAvailable(llvm::LibFunc_memcpy); // TLI.setAvailable(llvm::LibFunc_memset); + TLI.setAvailable(llvm::LibFunc___kmpc_alloc_shared); + TLI.setAvailable(llvm::LibFunc___kmpc_free_shared); } else { TLI.setUnavailable(LibFunc_nvvm_reflect); } @@ -891,6 +895,8 @@ FTy.getParamType(2)->isPointerTy()); case LibFunc_system: return (NumParams == 1 && FTy.getParamType(0)->isPointerTy()); + case LibFunc___kmpc_alloc_shared: + case LibFunc___kmpc_free_shared: case LibFunc_malloc: case LibFunc_vec_malloc: return (NumParams == 1 && FTy.getReturnType()->isPointerTy()); diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -155,22 +155,27 @@ /// is set to false and the instruction is volatile, return nullptr. static const Value *getPointerOperand(const Instruction *I, bool AllowVolatile) { - if (!AllowVolatile && I->isVolatile()) - return nullptr; - if (auto *LI = dyn_cast(I)) { + if (!AllowVolatile && LI->isVolatile()) + return nullptr; return LI->getPointerOperand(); } if (auto *SI = dyn_cast(I)) { + if (!AllowVolatile && SI->isVolatile()) + return nullptr; return SI->getPointerOperand(); } if (auto *CXI = dyn_cast(I)) { + if (!AllowVolatile && CXI->isVolatile()) + return nullptr; return CXI->getPointerOperand(); } if (auto *RMWI = dyn_cast(I)) { + if (!AllowVolatile && RMWI->isVolatile()) + return nullptr; return RMWI->getPointerOperand(); } @@ -1282,7 +1287,11 @@ /// or monotonic ordering static bool isNonRelaxedAtomic(Instruction *I); - /// Helper function specific for intrinsics which are potentially volatile + /// Helper function used to determine whether an instruction is volatile. + static bool isVolatile(Instruction *I); + + /// Helper function uset to check if intrinsic is volatile (memcpy, memmove, + /// memset). static bool isNoSyncIntrinsic(Instruction *I); }; @@ -1290,15 +1299,6 @@ if (!I->isAtomic()) return false; - if (auto *FI = dyn_cast(I)) - // All legal orderings for fence are stronger than monotonic. - return FI->getSyncScopeID() != SyncScope::SingleThread; - else if (auto *AI = dyn_cast(I)) { - // Unordered is not a legal ordering for cmpxchg. - return (AI->getSuccessOrdering() != AtomicOrdering::Monotonic || - AI->getFailureOrdering() != AtomicOrdering::Monotonic); - } - AtomicOrdering Ordering; switch (I->getOpcode()) { case Instruction::AtomicRMW: @@ -1310,42 +1310,98 @@ case Instruction::Load: Ordering = cast(I)->getOrdering(); break; + case Instruction::Fence: { + auto *FI = cast(I); + if (FI->getSyncScopeID() == SyncScope::SingleThread) + return false; + Ordering = FI->getOrdering(); + break; + } + case Instruction::AtomicCmpXchg: { + AtomicOrdering Success = cast(I)->getSuccessOrdering(); + AtomicOrdering Failure = cast(I)->getFailureOrdering(); + // Only if both are relaxed, than it can be treated as relaxed. + // Otherwise it is non-relaxed. 
+ if (Success != AtomicOrdering::Unordered && + Success != AtomicOrdering::Monotonic) + return true; + if (Failure != AtomicOrdering::Unordered && + Failure != AtomicOrdering::Monotonic) + return true; + return false; + } default: llvm_unreachable( "New atomic operations need to be known in the attributor."); } - return (Ordering != AtomicOrdering::Unordered && - Ordering != AtomicOrdering::Monotonic); + // Relaxed. + if (Ordering == AtomicOrdering::Unordered || + Ordering == AtomicOrdering::Monotonic) + return false; + return true; } -/// Return true if this intrinsic is nosync. This is only used for intrinsics -/// which would be nosync except that they have a volatile flag. All other -/// intrinsics are simply annotated with the nosync attribute in Intrinsics.td. +/// Checks if an intrinsic is nosync. Currently only checks mem* intrinsics. +/// FIXME: We should ipmrove the handling of intrinsics. bool AANoSyncImpl::isNoSyncIntrinsic(Instruction *I) { - if (auto *MI = dyn_cast(I)) - return !MI->isVolatile(); + if (auto *II = dyn_cast(I)) { + switch (II->getIntrinsicID()) { + /// Element wise atomic memory intrinsics are can only be unordered, + /// therefore nosync. + case Intrinsic::memset_element_unordered_atomic: + case Intrinsic::memmove_element_unordered_atomic: + case Intrinsic::memcpy_element_unordered_atomic: + return true; + case Intrinsic::memset: + case Intrinsic::memmove: + case Intrinsic::memcpy: + if (!cast(II)->isVolatile()) + return true; + return false; + default: + return false; + } + } return false; } +bool AANoSyncImpl::isVolatile(Instruction *I) { + assert(!isa(I) && "Calls should not be checked here"); + + switch (I->getOpcode()) { + case Instruction::AtomicRMW: + return cast(I)->isVolatile(); + case Instruction::Store: + return cast(I)->isVolatile(); + case Instruction::Load: + return cast(I)->isVolatile(); + case Instruction::AtomicCmpXchg: + return cast(I)->isVolatile(); + default: + return false; + } +} + ChangeStatus AANoSyncImpl::updateImpl(Attributor &A) { auto CheckRWInstForNoSync = [&](Instruction &I) { /// We are looking for volatile instructions or Non-Relaxed atomics. + /// FIXME: We should improve the handling of intrinsics. + + if (isa(&I) && isNoSyncIntrinsic(&I)) + return true; if (const auto *CB = dyn_cast(&I)) { if (CB->hasFnAttr(Attribute::NoSync)) return true; - if (isNoSyncIntrinsic(&I)) - return true; - const auto &NoSyncAA = A.getAAFor( *this, IRPosition::callsite_function(*CB), DepClassTy::REQUIRED); return NoSyncAA.isAssumedNoSync(); } - if (!I.isVolatile() && !isNonRelaxedAtomic(&I)) + if (!isVolatile(&I) && !isNonRelaxedAtomic(&I)) return true; return false; @@ -1694,7 +1750,7 @@ AANonNull::initialize(A); - bool CanBeNull, CanBeFreed; + bool CanBeNull = true, CanBeFreed; if (V.getPointerDereferenceableBytes(A.getDataLayout(), CanBeNull, CanBeFreed)) { if (!CanBeNull) { @@ -3608,7 +3664,7 @@ // TODO: track globally. 
bool CanBeNull, CanBeFreed; DerefBytes = - Base->getPointerDereferenceableBytes(DL, CanBeNull, CanBeFreed); + Base->getPointerDereferenceableBytes(DL, CanBeNull, CanBeFreed); T.GlobalState.indicatePessimisticFixpoint(); } else { const DerefState &DS = AA.getState(); @@ -5109,6 +5165,7 @@ A.getInfoCache().getMustBeExecutedContextExplorer(); auto FreeCheck = [&](Instruction &I) { + return false; const auto &Frees = FreesForMalloc.lookup(&I); if (Frees.size() != 1) return false; diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -50,6 +50,17 @@ static cl::opt PrintOpenMPKernels("openmp-print-gpu-kernels", cl::init(false), cl::Hidden); +static cl::opt SharedMemoryLimit( + "openmp-shared-limit", cl::Optional, + cl::desc("Limits the total amount of shared memory used for optimizations"), + cl::init(-1)); + +static cl::opt + SharedMemoryThreshold("openmp-shared-threshold", cl::Optional, + cl::desc("Only place variables sizes larger than the " + "threshold in shared memory"), + cl::init(0)); + static cl::opt HideMemoryTransferLatency( "openmp-hide-memory-transfer-latency", cl::desc("[WIP] Tries to hide the latency of host to device memory" @@ -72,6 +83,9 @@ STATISTIC(NumOpenMPParallelRegionsMerged, "Number of OpenMP parallel regions merged"); +STATISTIC(NumBytesMovedToSharedMemory, + "Amount of memory pushed to shared memory"); + #if !defined(NDEBUG) static constexpr auto TAG = "[" DEBUG_TYPE "]"; #endif @@ -517,6 +531,8 @@ // Recollect uses, in case Attributor deleted any. OMPInfoCache.recollectUses(); + Changed |= removeGlobalization(); + Changed |= replaceGlobalization(); Changed |= deleteParallelRegions(); if (HideMemoryTransferLatency) Changed |= hideMemTransfersLatency(); @@ -982,6 +998,131 @@ return Changed; } + bool removeGlobalization() { + return false; + auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared]; + + auto RemoveAllocCalls = [&](Use &U, Function &F) { + auto &FreeCall = OMPInfoCache.RFIs[OMPRTL___kmpc_free_shared]; + CallBase *CB = OpenMPOpt::getCallIfRegularCall(U, &RFI); + if (!CB) + return false; + + IRPosition AllocPos = IRPosition::callsite_returned(*CB); + if (!A.lookupAAFor(AllocPos)->isKnownNoCapture()) + return false; + + LLVM_DEBUG(dbgs() << TAG << "Remove globalization call in " + << CB->getCaller()->getName() << "\n"); + + Value *AllocSize = CB->getArgOperand(0); + + for (auto *U : CB->users()) { + CallBase *FC = dyn_cast(U); + if (FC && FC->getCalledFunction() == FreeCall.Declaration) { + FC->eraseFromParent(); + break; + } + } + + const DataLayout &DL = M.getDataLayout(); + Type *Int8Ty = Type::getInt8Ty(M.getContext()); + AllocaInst *NewAlloca = + new AllocaInst(Int8Ty, DL.getAllocaAddrSpace(), AllocSize, + CB->getName(), &F.front().front()); + NewAlloca->setDebugLoc(CB->getDebugLoc()); + CB->replaceAllUsesWith(NewAlloca); + CB->eraseFromParent(); + + return false; + }; + RFI.foreachUse(SCC, RemoveAllocCalls); + + return false; + } + + /// Replace globalization calls in the device with shared memory. Variables + /// will not be placed in shared memory if their size is below the threshold, + /// or if it would exceed the limit. 
+ bool replaceGlobalization() { + auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared]; + bool Changed = false; + + auto ReplaceAllocCalls = [&](Use &U, Function &F) { + if (!isKernel(F)) + return false; + auto &FreeCall = OMPInfoCache.RFIs[OMPRTL___kmpc_free_shared]; + CallBase *CB = OpenMPOpt::getCallIfRegularCall(U, &RFI); + if (!CB) + return false; + + ConstantInt *AllocSize = dyn_cast(CB->getArgOperand(0)); + if (!AllocSize) + return false; + + if (AllocSize->getZExtValue() <= SharedMemoryThreshold) + return false; + + if (NumBytesMovedToSharedMemory + AllocSize->getZExtValue() > + SharedMemoryLimit) + return false; + + LLVM_DEBUG(dbgs() << TAG << "Replace globalization call in " + << CB->getCaller()->getName() << " with " + << AllocSize->getZExtValue() + << " bytes of shared memory\n"); + + // Remove the free call + for (auto *U : CB->users()) { + CallBase *FC = dyn_cast(U); + if (FC && FC->getCalledFunction() == FreeCall.Declaration) { + FC->eraseFromParent(); + break; + } + } + + // Create a new shared memory buffer of the same size as the allocation + // and replace all the uses of the original allocation with it. + Type *Int8Ty = Type::getInt8Ty(M.getContext()); + Type *Int8ArrTy = ArrayType::get(Int8Ty, AllocSize->getZExtValue()); + auto *SharedMem = new GlobalVariable( + M, Int8ArrTy, /* IsConstant */ false, GlobalValue::InternalLinkage, + UndefValue::get(Int8ArrTy), CB->getName(), nullptr, + GlobalValue::NotThreadLocal, 3); + SharedMem->setAlignment(Align(32)); + + auto *NullInt = Constant::getNullValue(Type::getInt64Ty(M.getContext())); + auto *GEPExpr = ConstantExpr::getGetElementPtr( + Int8ArrTy, SharedMem, SmallVector({NullInt, NullInt})); + + auto *NewBuffer = new AddrSpaceCastInst(GEPExpr, Int8Ty->getPointerTo(), + CB->getName() + "_shared", CB); + + NewBuffer->setDebugLoc(CB->getDebugLoc()); + CB->replaceAllUsesWith(NewBuffer); + CB->eraseFromParent(); + + auto Remark = [&](OptimizationRemark OR) { + return OR << "Replaced globalized variable with " + << ore::NV("SharedMemory", AllocSize->getZExtValue()) + << ((AllocSize->getZExtValue() != 1) ? " bytes " : " byte ") + << " of shared memory"; + }; + emitRemark(NewBuffer, "OpenMPReplaceGlobalization", + Remark); + + NumBytesMovedToSharedMemory += AllocSize->getZExtValue(); + Changed = true; + + return true; + }; + RFI.foreachUse(SCC, ReplaceAllocCalls); + + OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_free_shared); + + return Changed; + } + /// Try to delete parallel regions if possible. 
bool deleteParallelRegions() { const unsigned CallbackCalleeOperand = 2; @@ -1112,9 +1253,8 @@ } void analysisGlobalization() { - RuntimeFunction GlobalizationRuntimeIDs[] = { - OMPRTL___kmpc_data_sharing_coalesced_push_stack, - OMPRTL___kmpc_data_sharing_push_stack}; + RuntimeFunction GlobalizationRuntimeIDs[] = {OMPRTL___kmpc_alloc_shared, + OMPRTL___kmpc_free_shared}; for (const auto GlobalizationCallID : GlobalizationRuntimeIDs) { auto &RFI = OMPInfoCache.RFIs[GlobalizationCallID]; @@ -1602,6 +1742,18 @@ GetterRFI.foreachUse(SCC, CreateAA); } + + auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared]; + auto CreateAA = [&](Use &U, Function &Decl) { + auto *CB = OpenMPOpt::getCallIfRegularCall(U, &RFI); + if (!CB) + return false; + + IRPosition CBPos = IRPosition::function(*CB->getFunction()); + A.getOrCreateAAFor(CBPos); + return false; + }; + RFI.foreachUse(SCC, CreateAA); } }; diff --git a/openmp/libomptarget/CMakeLists.txt b/openmp/libomptarget/CMakeLists.txt --- a/openmp/libomptarget/CMakeLists.txt +++ b/openmp/libomptarget/CMakeLists.txt @@ -76,7 +76,8 @@ # Build offloading plugins and device RTLs if they are available. add_subdirectory(plugins) -add_subdirectory(deviceRTLs) +# add_subdirectory(deviceRTLs) +add_subdirectory(DeviceRTL) # Add tests. add_subdirectory(test) diff --git a/openmp/libomptarget/DeviceRTL/CMakeLists.txt b/openmp/libomptarget/DeviceRTL/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/CMakeLists.txt @@ -0,0 +1,195 @@ +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Build the Device RTL for all toolchains that are available +# +##===----------------------------------------------------------------------===## + +# TODO: copied from NVPTX, need to be generalized. + +# By default we will not build NVPTX deviceRTL on a CUDA free system +set(LIBOMPTARGET_BUILD_NVPTX_BCLIB FALSE CACHE BOOL + "Whether build NVPTX deviceRTL on CUDA free system.") + +if (NOT (LIBOMPTARGET_DEP_CUDA_FOUND OR LIBOMPTARGET_BUILD_NVPTX_BCLIB)) + libomptarget_say("Not building NVPTX deviceRTL by default on CUDA free system.") + return() +endif() + +# Check if we can create an LLVM bitcode implementation of the runtime library +# that could be inlined in the user application. For that we need to find +# a Clang compiler capable of compiling our CUDA files to LLVM bitcode and +# an LLVM linker. +set(LIBOMPTARGET_NVPTX_CUDA_COMPILER "" CACHE STRING + "Location of a CUDA compiler capable of emitting LLVM bitcode.") +set(LIBOMPTARGET_NVPTX_BC_LINKER "" CACHE STRING + "Location of a linker capable of linking LLVM bitcode objects.") + +if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER STREQUAL "") + set(cuda_compiler ${LIBOMPTARGET_NVPTX_CUDA_COMPILER}) +elseif(${CMAKE_C_COMPILER_ID} STREQUAL "Clang") + set(cuda_compiler ${CMAKE_C_COMPILER}) +else() + libomptarget_say("Not building NVPTX deviceRTL: clang not found") + return() +endif() + +# Get compiler directory to try to locate a suitable linker. 
+get_filename_component(compiler_dir ${cuda_compiler} DIRECTORY) +set(llvm_link "${compiler_dir}/llvm-link") + +if (NOT LIBOMPTARGET_NVPTX_BC_LINKER STREQUAL "") + set(bc_linker ${LIBOMPTARGET_NVPTX_BC_LINKER}) +elseif (EXISTS ${llvm_link}) + set(bc_linker ${llvm_link}) +else() + libomptarget_say("Not building NVPTX deviceRTL: llvm-link not found") + return() +endif() + +# TODO: This part needs to be refined when libomptarget is going to support +# Windows! +# TODO: This part can also be removed if we can change the clang driver to make +# it support device only compilation. +if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "x86_64") + set(aux_triple x86_64-unknown-linux-gnu) +elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "ppc64le") + set(aux_triple powerpc64le-unknown-linux-gnu) +elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64") + set(aux_triple aarch64-unknown-linux-gnu) +else() + libomptarget_say("Not building CUDA offloading device RTL: unknown host arch: ${CMAKE_HOST_SYSTEM_PROCESSOR}") + return() +endif() + +set(devicertl_base_directory ${CMAKE_CURRENT_SOURCE_DIR}) +set(include_directory ${devicertl_base_directory}/include) +set(source_directory ${devicertl_base_directory}/src) + +set(all_capabilities 35 37 50 52 53 60 61 62 70 72 75 80) + +set(LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES ${all_capabilities} CACHE STRING + "List of CUDA Compute Capabilities to be used to compile the NVPTX device RTL.") +string(TOLOWER ${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES} LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES) + +if (LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES STREQUAL "all") + set(nvptx_sm_list ${all_capabilities}) +elseif(LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES STREQUAL "auto") + if (NOT LIBOMPTARGET_DEP_CUDA_FOUND) + libomptarget_error_say("[NVPTX] Cannot auto detect compute capability as CUDA not found.") + endif() + set(nvptx_sm_list ${LIBOMPTARGET_DEP_CUDA_ARCH}) +else() + string(REPLACE "," ";" nvptx_sm_list "${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES}") +endif() + +# If user set LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES to empty, we disable the +# build. +if (NOT nvptx_sm_list) + libomptarget_say("Not building CUDA offloading device RTL: empty compute capability list") + return() +endif() + +# Check all SM values +foreach(sm ${nvptx_sm_list}) + if (NOT ${sm} IN_LIST all_capabilities) + libomptarget_warning_say("[NVPTX] Compute capability ${sm} is not supported. Make sure clang can work with it.") + endif() +endforeach() + +# Override default MAX_SM in src/target_impl.h if requested +if (DEFINED LIBOMPTARGET_NVPTX_MAX_SM) + set(MAX_SM_DEFINITION "-DMAX_SM=${LIBOMPTARGET_NVPTX_MAX_SM}") +endif() + +# Activate RTL message dumps if requested by the user. +set(LIBOMPTARGET_DEVICE_DEBUG FALSE CACHE BOOL + "Activate NVPTX device RTL debug messages.") + +libomptarget_say("Building CUDA LLVM bitcode offloading device RTL.") + +set(src_files + ${source_directory}/Configuration.cpp + ${source_directory}/Debug.cpp + ${source_directory}/Kernel.cpp + ${source_directory}/Mapping.cpp + ${source_directory}/Misc.cpp + ${source_directory}/Parallelism.cpp + ${source_directory}/Reduction.cpp + ${source_directory}/State.cpp + ${source_directory}/Synchronization.cpp + ${source_directory}/Tasking.cpp + ${source_directory}/Utils.cpp + ${source_directory}/Workshare.cpp +) + +# Set flags for LLVM Bitcode compilation. 
+set(bc_flags -S -x c++ -O1 -std=c++17 + -target nvptx64 + -Xclang -emit-llvm-bc + -Xclang -aux-triple -Xclang ${aux_triple} + -fopenmp -fopenmp-cuda-mode -Xclang -fopenmp-is-device + -Xclang -target-feature -Xclang +ptx61 + -I${include_directory} +) + +if(${LIBOMPTARGET_DEVICE_DEBUG}) + list(APPEND bc_flags -DOMPTARGET_DEBUG=-1) +else() + list(APPEND bc_flags -DOMPTARGET_DEBUG=0) +endif() + +# Create target to build all Bitcode libraries. +add_custom_target(omptarget-nvptx-bc) + +# Generate a Bitcode library for all the compute capabilities the user requested +foreach(sm ${nvptx_sm_list}) + set(cuda_flags -Xclang -target-cpu -Xclang sm_${sm} "-D__CUDA_ARCH__=${sm}0") + set(bc_files "") + foreach(src ${src_files}) + get_filename_component(infile ${src} ABSOLUTE) + get_filename_component(outfile ${src} NAME) + set(outfile "${outfile}-sm_${sm}.bc") + + add_custom_command(OUTPUT ${outfile} + COMMAND ${cuda_compiler} ${bc_flags} + ${cuda_flags} ${MAX_SM_DEFINITION} ${infile} -o ${outfile} + DEPENDS ${infile} + IMPLICIT_DEPENDS CXX ${infile} + COMMENT "Building LLVM bitcode ${outfile}" + VERBATIM + ) + set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile}) + + list(APPEND bc_files ${outfile}) + endforeach() + + set(bclib_name "libomptarget-nvptx-sm_${sm}.bc") + + # Link to a bitcode library. + add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} + COMMAND ${bc_linker} + -o ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} ${bc_files} + DEPENDS ${bc_files} + COMMENT "Linking LLVM bitcode ${bclib_name}" + ) + set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${bclib_name}) + + set(bclib_target_name "omptarget-nvptx-sm_${sm}-bc") + + add_custom_target(${bclib_target_name} ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}) + add_dependencies(omptarget-nvptx-bc ${bclib_target_name}) + + # Copy library to destination. + add_custom_command(TARGET ${bclib_target_name} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} + ${LIBOMPTARGET_LIBRARY_DIR}) + + # Install bitcode library under the lib destination folder. + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} DESTINATION "${OPENMP_INSTALL_LIBDIR}") +endforeach() diff --git a/openmp/libomptarget/DeviceRTL/include/Configuration.h b/openmp/libomptarget/DeviceRTL/include/Configuration.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/include/Configuration.h @@ -0,0 +1,39 @@ +//===--- Configuration.h - OpenMP device configuration interface -- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// API to query the global (constant) device environment. +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_CONFIGURATION_H +#define OMPTARGET_CONFIGURATION_H + +#include "Types.h" + +namespace _OMP { +namespace config { + +enum DebugLevel : int32_t { Assertion }; + +/// Return the number of devices in the system, same number as returned on the +/// host by omp_get_num_devices. +int32_t getNumDevices(); + +/// Return the user choosen debug level. +int32_t getDebugLevel(); + +/// Return the amonut of shared memory scratchpad allocated to the main thread +/// in generic mode. Unused in SPMD-mode. 
+uint32_t getGenericModeMainThreadSharedMemoryStorage(); + +bool isDebugMode(DebugLevel Level); + +} // namespace config +} // namespace _OMP + +#endif diff --git a/openmp/libomptarget/DeviceRTL/include/Debug.h b/openmp/libomptarget/DeviceRTL/include/Debug.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/include/Debug.h @@ -0,0 +1,27 @@ +//===-------- Debug.h ---- Debug utilities ------------------------ C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_DEVICERTL_DEBUG_H +#define OMPTARGET_DEVICERTL_DEBUG_H + +/// Assertion +/// +/// { +extern "C" { +void __assert_assume(bool cond, const char *exp, const char *file, int line); +} + +#define ASSERT(e) __assert_assume(e, #e, __FILE__, __LINE__) +///} + +// TODO: Print + +#endif diff --git a/openmp/libomptarget/DeviceRTL/include/Interface.h b/openmp/libomptarget/DeviceRTL/include/Interface.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/include/Interface.h @@ -0,0 +1,350 @@ +//===-------- Interface.h - OpenMP interface ---------------------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_DEVICERTL_INTERFACE_H +#define OMPTARGET_DEVICERTL_INTERFACE_H + +#include "Types.h" + +/// External API +/// +///{ + +extern "C" { + +/// ICV: dyn-var, constant 0 +/// +/// setter: ignored. +/// getter: returns 0. +/// +///{ +void omp_set_dynamic(int); +int omp_get_dynamic(void); +///} + +/// ICV: nthreads-var, integer +/// +/// scope: data environment +/// +/// setter: ignored. +/// getter: returns false. +/// +/// implementation notes: +/// +/// +///{ +void omp_set_num_threads(int); +int omp_get_max_threads(void); +///} + +/// ICV: thread-limit-var, computed +/// +/// getter: returns thread limited defined during launch. +/// +///{ +int omp_get_thread_limit(void); +///} + +/// ICV: max-active-level-var, constant 1 +/// +/// setter: ignored. +/// getter: returns 1. +/// +///{ +void omp_set_max_active_levels(int); +int omp_get_max_active_levels(void); +///} + +/// ICV: places-partition-var +/// +/// +///{ +///} + +/// ICV: active-level-var, 0 or 1 +/// +/// getter: returns 0 or 1. +/// +///{ +int omp_get_active_level(void); +///} + +/// ICV: level-var +/// +/// getter: returns parallel region nesting +/// +///{ +int omp_get_level(void); +///} + +/// ICV: run-sched-var +/// +/// +///{ +void omp_set_schedule(omp_sched_t, int); +void omp_get_schedule(omp_sched_t *, int *); +///} + +/// TODO this is incomplete. 
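+
+/// For illustration, the constant ICVs documented above behave as follows on
+/// the device (a sketch of the documented semantics, not additional API):
+///
+///   omp_set_dynamic(1);                    // ignored
+///   int Dyn = omp_get_dynamic();           // always 0
+///   omp_set_max_active_levels(4);          // ignored
+///   int Max = omp_get_max_active_levels(); // always 1
+///
+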
+int omp_get_num_threads(void); +int omp_get_thread_num(void); +void omp_set_nested(int); + +int omp_get_nested(void); + +void omp_set_max_active_levels(int Level); + +int omp_get_max_active_levels(void); + +omp_proc_bind_t omp_get_proc_bind(void); + +int omp_get_num_places(void); + +int omp_get_place_num_procs(int place_num); + +void omp_get_place_proc_ids(int place_num, int *ids); + +int omp_get_place_num(void); + +int omp_get_partition_num_places(void); + +void omp_get_partition_place_nums(int *place_nums); + +int omp_get_cancellation(void); + +void omp_set_default_device(int deviceId); + +int omp_get_default_device(void); + +int omp_get_num_devices(void); + +int omp_get_num_teams(void); + +int omp_get_team_num(); + +int omp_get_initial_device(void); + +/// Synchronization +/// +///{ +void omp_init_lock(omp_lock_t *Lock); + +void omp_destroy_lock(omp_lock_t *Lock); + +void omp_set_lock(omp_lock_t *Lock); + +void omp_unset_lock(omp_lock_t *Lock); + +int omp_test_lock(omp_lock_t *Lock); +///} + +/// Tasking +/// +///{ +int omp_in_final(void); + +int omp_get_max_task_priority(void); +///} + +/// Misc +/// +///{ +double omp_get_wtick(void); + +double omp_get_wtime(void); +///} +} + +extern "C" { +/// Allocate \p Bytes in "shareable" memory and return the address. Needs to be +/// called balanced with __kmpc_free_shared like a stack (push/pop). Can be +/// called by any thread, allocation happens *per thread*. +void *__kmpc_alloc_shared(uint64_t Bytes); + +/// Deallocate \p Ptr. Needs to be called balanced with __kmpc_alloc_shared like +/// a stack (push/pop). Can be called by any thread. \p Ptr has to be the +/// allocated by __kmpc_alloc_shared by the same thread. +void __kmpc_free_shared(void *Ptr); + +/// Allocate sufficient space for \p NumArgs sequential `void*` and store the +/// allocation address in \p GlobalArgs. +/// +/// Called by the main thread prior to a parallel region. +/// +/// We also remember it in GlobalArgsPtr to ensure the worker threads and +/// deallocation function know the allocation address too. +void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t NumArgs); + +/// Deallocate the memory allocated by __kmpc_begin_sharing_variables. +/// +/// Called by the main thread after a parallel region. +/// +/// TODO: This should really take the address, or number of bytes at least. +void __kmpc_end_sharing_variables(); + +/// Store the allocation address obtained via __kmpc_begin_sharing_variables in +/// \p GlobalArgs. +/// +/// Called by the worker threads in the parallel region (function). 
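+///
+/// A minimal sketch of the protocol described above (illustrative only; X and
+/// Y are placeholder shared variables):
+///
+///   // Main thread, before entering the parallel region:
+///   void **Args;
+///   __kmpc_begin_sharing_variables(&Args, /* NumArgs */ 2);
+///   Args[0] = &X;
+///   Args[1] = &Y;
+///
+///   // Worker threads, inside the outlined parallel function:
+///   void **GlobalArgs;
+///   __kmpc_get_shared_variables(&GlobalArgs);
+///   int *XPtr = reinterpret_cast<int *>(GlobalArgs[0]);
+///
+///   // Main thread, after the parallel region:
+///   __kmpc_end_sharing_variables();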
+void __kmpc_get_shared_variables(void ***GlobalArgs); + +/// Kernel +/// +///{ + +void __kmpc_kernel_init(int, int16_t); + +void __kmpc_kernel_deinit(int16_t); + +void __kmpc_spmd_kernel_init(int, int16_t); + +void __kmpc_spmd_kernel_deinit_v2(int16_t); + +int8_t __kmpc_is_spmd_exec_mode(); +///} + +/// Reduction +/// +///{ +void __kmpc_nvptx_end_reduce(int32_t TId); + +void __kmpc_nvptx_end_reduce_nowait(int32_t TId); + +extern "C" int32_t __kmpc_nvptx_parallel_reduce_nowait_v2( + IdentTy *Loc, int32_t TId, int32_t num_vars, uint64_t reduce_size, + void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct); + +int32_t __kmpc_nvptx_teams_reduce_nowait_v2( + IdentTy *Loc, int32_t TId, void *GlobalBuffer, uint32_t num_of_records, + void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct, + ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct, ListGlobalFnTy glcpyFct, + ListGlobalFnTy glredFct); +///} + +/// Synchronization +/// +///{ +void __kmpc_ordered(IdentTy *Loc, int32_t TId); + +void __kmpc_end_ordered(IdentTy *Loc, int32_t TId); + +int32_t __kmpc_cancel_barrier(IdentTy *Loc_ref, int32_t TId); + +void __kmpc_barrier(IdentTy *Loc_ref, int32_t TId); + +void __kmpc_barrier_simple_spmd(IdentTy *Loc_ref, int32_t TId); + +int32_t __kmpc_master(IdentTy *Loc, int32_t TId); + +void __kmpc_end_master(IdentTy *Loc, int32_t TId); + +int32_t __kmpc_single(IdentTy *Loc, int32_t TId); + +void __kmpc_end_single(IdentTy *Loc, int32_t TId); + +void __kmpc_flush(IdentTy *Loc); + +__kmpc_impl_lanemask_t __kmpc_warp_active_thread_mask(); + +void __kmpc_syncwarp(__kmpc_impl_lanemask_t Mask); + +void __kmpc_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name); + +void __kmpc_end_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name); +///} + +/// Parallelism +/// +///{ +/// TODO +void __kmpc_kernel_prepare_parallel(ParallelRegionFnTy WorkFn); + +/// TODO +bool __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn); + +/// TODO +void __kmpc_kernel_end_parallel(); + +/// TODO +void __kmpc_serialized_parallel(IdentTy *Loc, uint32_t); + +/// TODO +void __kmpc_end_serialized_parallel(IdentTy *Loc, uint32_t); + +/// TODO +void __kmpc_push_proc_bind(IdentTy *Loc, uint32_t TId, int ProcBind); + +/// TODO +void __kmpc_push_num_teams(IdentTy *Loc, int32_t TId, int32_t NumTeams, + int32_t ThreadLimit); + +/// TODO +uint16_t __kmpc_parallel_level(IdentTy *Loc, uint32_t); + +/// TODO +void __kmpc_push_num_threads(IdentTy *Loc, int32_t, int32_t NumThreads); +///} + +/// Tasking +/// +///{ +TaskDescriptorTy *__kmpc_omp_task_alloc(IdentTy *, uint32_t, int32_t, + uint32_t TaskSizeInclPrivateValues, + uint32_t SharedValuesSize, + TaskFnTy TaskFn); + +int32_t __kmpc_omp_task(IdentTy *Loc, uint32_t TId, + TaskDescriptorTy *TaskDescriptor); + +int32_t __kmpc_omp_task_with_deps(IdentTy *Loc, uint32_t TId, + TaskDescriptorTy *TaskDescriptor, int32_t, + void *, int32_t, void *); + +void __kmpc_omp_task_begin_if0(IdentTy *Loc, uint32_t TId, + TaskDescriptorTy *TaskDescriptor); + +void __kmpc_omp_task_complete_if0(IdentTy *Loc, uint32_t TId, + TaskDescriptorTy *TaskDescriptor); + +void __kmpc_omp_wait_deps(IdentTy *Loc, uint32_t TId, int32_t, void *, int32_t, + void *); + +void __kmpc_taskgroup(IdentTy *Loc, uint32_t TId); + +void __kmpc_end_taskgroup(IdentTy *Loc, uint32_t TId); + +int32_t __kmpc_omp_taskyield(IdentTy *Loc, uint32_t TId, int); + +int32_t __kmpc_omp_taskwait(IdentTy *Loc, uint32_t TId); + +void __kmpc_taskloop(IdentTy *Loc, uint32_t TId, + TaskDescriptorTy *TaskDescriptor, int, + uint64_t 
*LowerBound, uint64_t *UpperBound, int64_t, int, + int32_t, uint64_t, void *); +///} + +/// Misc +/// +///{ +int32_t __kmpc_cancellationpoint(IdentTy *Loc, int32_t TId, int32_t CancelVal); + +int32_t __kmpc_cancel(IdentTy *Loc, int32_t TId, int32_t CancelVal); +///} + +/// Shuffle +/// +///{ +int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size); +int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size); +///} +} + +#endif diff --git a/openmp/libomptarget/DeviceRTL/include/Mapping.h b/openmp/libomptarget/DeviceRTL/include/Mapping.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/include/Mapping.h @@ -0,0 +1,86 @@ +//===--------- Mapping.h - OpenMP device runtime mapping helpers -- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_MAPPING_H +#define OMPTARGET_MAPPING_H + +#include "Types.h" + +namespace _OMP { + +namespace mapping { + +#pragma omp declare target + +inline constexpr uint32_t MaxThreadsPerTeam = 1024; + +#pragma omp end declare target + +/// Initialize the mapping machinery. +void init(bool IsSPMD); + +/// Return true if the kernel is executed in SPMD mode. +bool isSPMDMode(); + +/// Return true if the kernel is executed in generic mode. +bool isGenericMode(); + +/// Return true if the executing thread is the main thread in generic mode. +bool isMainThreadInGenericMode(); + +/// Return true if the executing thread has the lowest Id of the active threads +/// in the warp. +bool isLeaderInWarp(); + +/// Return a mask describing all active threads in the warp. +LaneMaskTy activemask(); + +/// Return a mask describing all threads with a smaller Id in the warp. +LaneMaskTy lanemaskLT(); + +/// Return a mask describing all threads with a larget Id in the warp. +LaneMaskTy lanemaskGT(); + +/// Return the thread Id in the warp, in [0, getWarpSize()). +uint32_t getThreadIdInWarp(); + +/// Return the thread Id in the block, in [0, getBlockSize()). +uint32_t getThreadIdInBlock(); + +/// Return the warp id in the block. +uint32_t getWarpId(); + +/// Return the warp size, thus number of threads in the warp. +uint32_t getWarpSize(); + +/// Return the number of warps in the block. +uint32_t getNumberOfWarpsInBlock(); + +/// Return the block Id in the kernel, in [0, getKernelSize()). +uint32_t getBlockId(); + +/// Return the block size, thus number of threads in the block. +uint32_t getBlockSize(); + +/// Return the number of blocks in the kernel. +uint32_t getNumberOfBlocks(); + +/// Return the kernel size, thus number of threads in the kernel. +uint32_t getKernelSize(); + +/// Return the number of processing elements on the device. +uint32_t getNumberOfProcessorElements(); + +} // namespace mapping + +} // namespace _OMP + +#endif diff --git a/openmp/libomptarget/DeviceRTL/include/State.h b/openmp/libomptarget/DeviceRTL/include/State.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/include/State.h @@ -0,0 +1,173 @@ +//===-------- State.h - OpenMP State & ICV interface ------------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_STATE_H +#define OMPTARGET_STATE_H + +#include "Types.h" + +namespace _OMP { + +namespace state { + +#pragma omp declare target + +// TODO: Expose this via CMAKE. +inline constexpr uint32_t SharedScratchpadSize = 2048; + +/// Initialize the state machinery. Must be called by all threads. +void init(bool IsSPMD); + +/// TODO +enum ValueKind { + VK_NThreads, + VK_Level, + VK_ActiveLevel, + VK_MaxActiveLevels, + VK_RunSched, + // --- + VK_RunSchedChunk, + VK_ParallelRegionFn, + VK_ParallelTeamSize, +}; + +/// TODO +void enterDataEnvironment(); + +/// TODO +void exitDataEnvironment(); + +/// TODO +struct DateEnvironmentRAII { + DateEnvironmentRAII() { enterDataEnvironment(); } + ~DateEnvironmentRAII() { exitDataEnvironment(); } +}; + +/// TODO +void resetStateForThread(); + +uint32_t &lookup32(ValueKind VK, bool IsReadonly); +void *&lookupPtr(ValueKind VK, bool IsReadonly); + +/// A mookup class without actual state used to provide +/// a nice interface to lookup and update ICV values +/// we can declare in global scope. +template struct Value { + operator Ty() { return lookup(/* IsReadonly */ true); } + + Value &operator=(const Ty &Other) { + set(Other); + return *this; + } + + Value &operator++() { + inc(1); + return *this; + } + + Value &operator--() { + inc(-1); + return *this; + } + +private: + Ty &lookup(bool IsReadonly) { return lookup32(Kind, IsReadonly); } + + Ty &inc(int UpdateVal) { + return (lookup(/* IsReadonly */ false) += UpdateVal); + } + + Ty &set(Ty UpdateVal) { return (lookup(/* IsReadonly */ false) = UpdateVal); } +}; + +/// A mookup class without actual state used to provide +/// a nice interface to lookup and update ICV values +/// we can declare in global scope. +template struct PtrValue { + operator Ty() { return lookup(/* IsReadonly */ true); } + + PtrValue &operator=(const Ty Other) { + set(Other); + return *this; + } + +private: + Ty &lookup(bool IsReadonly) { return lookupPtr(Kind, IsReadonly); } + + Ty &set(Ty UpdateVal) { return (lookup(/* IsReadonly */ false) = UpdateVal); } +}; + +/// TODO +inline state::Value RunSchedChunk; + +/// TODO +inline state::Value ParallelTeamSize; + +/// TODO +inline state::PtrValue + ParallelRegionFn; +#pragma omp end declare target + +void runAndCheckState(void(Func(void))); + +void assumeInitialState(bool IsSPMD); + +} // namespace state + +namespace icv { + +#pragma omp declare target + +/// TODO +inline state::Value NThreads; + +/// TODO +inline state::Value Level; + +/// The `active-level` describes which of the parallel level counted with the +/// `level-var` is active. There can only be one. +/// +/// active-level-var is 1, if ActiveLevelVar is not 0, otherweise it is 0. +inline state::Value ActiveLevel; + +/// TODO +inline state::Value MaxActiveLevels; + +/// TODO +inline state::Value RunSched; + +#pragma omp end declare target + +} // namespace icv + +namespace memory { + +/// Alloca \p Size bytes in shared memory, if possible, for \p Reason. +/// +/// Note: See the restrictions on __kmpc_alloc_shared for proper usage. +void *allocShared(uint64_t Size, const char *Reason); + +/// Free \p Ptr, alloated via allocShared, for \p Reason. +/// +/// Note: See the restrictions on __kmpc_free_shared for proper usage. 
+void freeShared(void *Ptr, const char *Reason); + +/// Alloca \p Size bytes in global memory, if possible, for \p Reason. +void *allocGlobal(uint64_t Size, const char *Reason); + +/// Free \p Ptr, alloated via allocGlobal, for \p Reason. +void freeGlobal(void *Ptr, const char *Reason); + +} // namespace memory + +} // namespace _OMP + +#endif diff --git a/openmp/libomptarget/DeviceRTL/include/Synchronization.h b/openmp/libomptarget/DeviceRTL/include/Synchronization.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/include/Synchronization.h @@ -0,0 +1,69 @@ +//===- Synchronization.h - OpenMP synchronization utilities ------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_DEVICERTL_SYNCHRONIZATION_H +#define OMPTARGET_DEVICERTL_SYNCHRONIZATION_H + +#include "Types.h" + +namespace _OMP { + +namespace synchronize { + +/// Initialize the synchronization machinery. Must be called by all threads. +void init(bool IsSPMD); + +/// Synchronize all threads in a warp identified by \p Mask. +void warp(LaneMaskTy Mask); + +/// Synchronize all threads in a block. +void threads(); + +} // namespace synchronize + +namespace fence { + +/// Memory fence with \p Ordering semantics for the team. +void team(int Ordering); + +/// Memory fence with \p Ordering semantics for the contention group. +void kernel(int Ordering); + +/// Memory fence with \p Ordering semantics for the system. +void system(int Ordering); + +} // namespace fence + +namespace atomic { + +/// Atomically read \p Addr with \p Ordering semantics. +uint32_t read(uint32_t *Addr, int Ordering); + +/// Atomically store \p V to \p Addr with \p Ordering semantics. +uint32_t store(uint32_t *Addr, uint32_t V, int Ordering); + +/// Atomically store \p V to \p Addr with \p Ordering semantics. +uint64_t store(uint64_t *Addr, uint64_t V, int Ordering); + +/// Atomically increment \p *Addr and wrap at \p V with \p Ordering semantics. +uint32_t inc(uint32_t *Addr, uint32_t V, int Ordering); + +/// Atomically add \p V to \p *Addr with \p Ordering semantics. +uint32_t add(uint32_t *Addr, uint32_t V, int Ordering); + +/// Atomically add \p V to \p *Addr with \p Ordering semantics. +uint64_t add(uint64_t *Addr, uint64_t V, int Ordering); + +} // namespace atomic + +} // namespace _OMP + +#endif diff --git a/openmp/libomptarget/DeviceRTL/include/Types.h b/openmp/libomptarget/DeviceRTL/include/Types.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/include/Types.h @@ -0,0 +1,200 @@ +//===---------- Types.h - OpenMP types ---------------------------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_TYPES_H +#define OMPTARGET_TYPES_H + +/// Base type declarations for freestanding mode +/// +///{ +using int8_t = char; +using uint8_t = unsigned char; +using int16_t = short; +using uint16_t = unsigned short; +using int32_t = int; +using uint32_t = unsigned int; +using int64_t = long; +using uint64_t = unsigned long; + +static_assert(sizeof(int8_t) == 1, "type size mismatch"); +static_assert(sizeof(uint8_t) == 1, "type size mismatch"); +static_assert(sizeof(int16_t) == 2, "type size mismatch"); +static_assert(sizeof(uint16_t) == 2, "type size mismatch"); +static_assert(sizeof(int32_t) == 4, "type size mismatch"); +static_assert(sizeof(uint32_t) == 4, "type size mismatch"); +static_assert(sizeof(int64_t) == 8, "type size mismatch"); +static_assert(sizeof(uint64_t) == 8, "type size mismatch"); +///} + +enum omp_proc_bind_t { + omp_proc_bind_false = 0, + omp_proc_bind_true = 1, + omp_proc_bind_master = 2, + omp_proc_bind_close = 3, + omp_proc_bind_spread = 4 +}; + +enum omp_sched_t { + omp_sched_static = 1, /* chunkSize >0 */ + omp_sched_dynamic = 2, /* chunkSize >0 */ + omp_sched_guided = 3, /* chunkSize >0 */ + omp_sched_auto = 4, /* no chunkSize */ +}; + +enum kmp_sched_t { + kmp_sched_static_chunk = 33, + kmp_sched_static_nochunk = 34, + kmp_sched_dynamic = 35, + kmp_sched_guided = 36, + kmp_sched_runtime = 37, + kmp_sched_auto = 38, + + kmp_sched_static_balanced_chunk = 45, + + kmp_sched_static_ordered = 65, + kmp_sched_static_nochunk_ordered = 66, + kmp_sched_dynamic_ordered = 67, + kmp_sched_guided_ordered = 68, + kmp_sched_runtime_ordered = 69, + kmp_sched_auto_ordered = 70, + + kmp_sched_distr_static_chunk = 91, + kmp_sched_distr_static_nochunk = 92, + kmp_sched_distr_static_chunk_sched_static_chunkone = 93, + + kmp_sched_default = kmp_sched_static_nochunk, + kmp_sched_unordered_first = kmp_sched_static_chunk, + kmp_sched_unordered_last = kmp_sched_auto, + kmp_sched_ordered_first = kmp_sched_static_ordered, + kmp_sched_ordered_last = kmp_sched_auto_ordered, + kmp_sched_distribute_first = kmp_sched_distr_static_chunk, + kmp_sched_distribute_last = + kmp_sched_distr_static_chunk_sched_static_chunkone, + + /* Support for OpenMP 4.5 monotonic and nonmonotonic schedule modifiers. + * Since we need to distinguish the three possible cases (no modifier, + * monotonic modifier, nonmonotonic modifier), we need separate bits for + * each modifier. The absence of monotonic does not imply nonmonotonic, + * especially since 4.5 says that the behaviour of the "no modifier" case + * is implementation defined in 4.5, but will become "nonmonotonic" in 5.0. + * + * Since we're passing a full 32 bit value, we can use a couple of high + * bits for these flags; out of paranoia we avoid the sign bit. + * + * These modifiers can be or-ed into non-static schedules by the compiler + * to pass the additional information. They will be stripped early in the + * processing in __kmp_dispatch_init when setting up schedules, so + * most of the code won't ever see schedules with these bits set. 
+ */ + kmp_sched_modifier_monotonic = (1 << 29), + /**< Set if the monotonic schedule modifier was present */ + kmp_sched_modifier_nonmonotonic = (1 << 30), +/**< Set if the nonmonotonic schedule modifier was present */ + +#define SCHEDULE_WITHOUT_MODIFIERS(s) \ + (enum kmp_sched_t)( \ + (s) & ~(kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic)) +#define SCHEDULE_HAS_MONOTONIC(s) (((s)&kmp_sched_modifier_monotonic) != 0) +#define SCHEDULE_HAS_NONMONOTONIC(s) \ + (((s)&kmp_sched_modifier_nonmonotonic) != 0) +#define SCHEDULE_HAS_NO_MODIFIERS(s) \ + (((s) & (kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic)) == \ + 0) + +}; + +struct TaskDescriptorTy; +using TaskFnTy = int32_t (*)(int32_t global_tid, TaskDescriptorTy *taskDescr); +struct TaskDescriptorTy { + void *Payload; + TaskFnTy TaskFn; +}; + +#pragma omp begin declare variant match(device = {arch(amdgcn)}) +using LaneMaskTy = uint64_t; +#pragma omp end declare variant + +#pragma omp begin declare variant match( \ + device = {arch(amdgcn)}, implementation = {extension(match_none)}) +using LaneMaskTy = uint64_t; +#pragma omp end declare variant + +namespace lanes { +enum : LaneMaskTy { All = ~(LaneMaskTy)0 }; +} // namespace lanes + +/// The ident structure that describes a source location. The struct is +/// identical to the one in the kmp.h file. We maintain the same data structure +/// for compatibility. +struct IdentTy { + int32_t reserved_1; /**< might be used in Fortran; see above */ + int32_t flags; /**< also f.flags; KMP_IDENT_xxx flags; KMP_IDENT_KMPC + identifies this union member */ + int32_t reserved_2; /**< not really used in Fortran any more; see above */ + int32_t reserved_3; /**< source[4] in Fortran, do not use for C++ */ + char const *psource; /**< String describing the source location. + The string is composed of semi-colon separated fields + which describe the source file, the function and a pair + of line numbers that delimit the construct. */ +}; + +using __kmpc_impl_lanemask_t = LaneMaskTy; + +using ParallelRegionFnTy = void *; + +using CriticalNameTy = int32_t[8]; + +struct omp_lock_t { + void *Lock; +}; + +using InterWarpCopyFnTy = void (*)(void *src, int32_t warp_num); +using ShuffleReductFnTy = void (*)(void *rhsData, int16_t lane_id, + int16_t lane_offset, int16_t shortCircuit); +using ListGlobalFnTy = void (*)(void *buffer, int idx, void *reduce_data); + +/// Macros for allocating variables in different address spaces. +///{ + +// Follows the pattern in interface.h +typedef enum omp_allocator_handle_t { + omp_null_allocator = 0, + omp_default_mem_alloc = 1, + omp_large_cap_mem_alloc = 2, + omp_const_mem_alloc = 3, + omp_high_bw_mem_alloc = 4, + omp_low_lat_mem_alloc = 5, + omp_cgroup_mem_alloc = 6, + omp_pteam_mem_alloc = 7, + omp_thread_mem_alloc = 8, + KMP_ALLOCATOR_MAX_HANDLE = ~(0U) +} omp_allocator_handle_t; + +#define __PRAGMA(STR) _Pragma(#STR) +#define OMP_PRAGMA(STR) __PRAGMA(omp STR) + +#define SHARED(NAME) \ + NAME [[clang::loader_uninitialized]]; \ + OMP_PRAGMA(allocate(NAME) allocator(omp_pteam_mem_alloc)) + +// TODO: clang should use address space 5 for omp_thread_mem_alloc, but right +// now that's not the case. +#define THREAD_LOCAL(NAME) \ + NAME [[clang::loader_uninitialized, clang::address_space(5)]] + +// TODO: clang should use address space 4 for omp_const_mem_alloc, maybe it +// does? 
+#define CONSTANT(NAME) \ + NAME [[clang::loader_uninitialized, clang::address_space(4)]] + +///} + +#endif diff --git a/openmp/libomptarget/DeviceRTL/include/Utils.h b/openmp/libomptarget/DeviceRTL/include/Utils.h new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/include/Utils.h @@ -0,0 +1,57 @@ +//===--------- Utils.h - OpenMP device runtime utility functions -- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_DEVICERTL_UTILS_H +#define OMPTARGET_DEVICERTL_UTILS_H + +#include "Types.h" + +namespace _OMP { +namespace utils { + +/// Return the value \p Var from thread Id \p SrcLane in the warp if the thread +/// is identified by \p Mask. +int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane); + +int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width); + +/// Return \p LowBits and \p HighBits packed into a single 64 bit value. +uint64_t pack(uint32_t LowBits, uint32_t HighBits); + +/// Unpack \p Val into \p LowBits and \p HighBits. +void unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits); + +/// Round up \p V to a \p Boundary. +template inline Ty roundUp(Ty V, Ty Boundary) { + return (V + Boundary - 1) / Boundary * Boundary; +} + +/// Advance \p Ptr by \p Bytes bytes. +template inline Ty1 *advance(Ty1 Ptr, Ty2 Bytes) { + return reinterpret_cast(reinterpret_cast(Ptr) + Bytes); +} + +/// Return the first bit set in \p V. +inline uint32_t ffs(uint32_t V) { return __builtin_ffs(V); } + +/// Return the first bit set in \p V. +inline uint32_t ffs(uint64_t V) { return __builtin_ffsl(V); } + +/// Return the number of bits set in \p V. +inline uint32_t popc(uint32_t V) { return __builtin_popcount(V); } + +/// Return the number of bits set in \p V. +inline uint32_t popc(uint64_t V) { return __builtin_popcountl(V); } + +} // namespace utils +} // namespace _OMP + +#endif diff --git a/openmp/libomptarget/DeviceRTL/src/Configuration.cpp b/openmp/libomptarget/DeviceRTL/src/Configuration.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/src/Configuration.cpp @@ -0,0 +1,50 @@ +//===- Configuration.cpp - OpenMP device configuration interface -- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the data object of the constant device environment and the +// query API. +// +//===----------------------------------------------------------------------===// + +#include "Configuration.h" +#include "State.h" +#include "Types.h" + +using namespace _OMP; + +struct DeviceEnvironmentTy { + int32_t DebugLevel; + // uint32_t GenericModeMainThreadSharedMemoryStorage; + // uint32_t NumDevices; +}; + +#pragma omp declare target + +// TOOD: We want to change the name as soon as the old runtime is gone. 
+DeviceEnvironmentTy CONSTANT(omptarget_device_environment) + __attribute__((used)); + +int32_t config::getDebugLevel() { + return omptarget_device_environment.DebugLevel; +} + +int32_t config::getNumDevices() { + // return omptarget_device_environment.NumDevices; + return 0; +} + +uint32_t config::getGenericModeMainThreadSharedMemoryStorage() { + // return DeviceEnvironment.GenericModeMainThreadSharedMemoryStorage; + return state::SharedScratchpadSize / 4; +} + +bool config::isDebugMode(config::DebugLevel Level) { + return config::getDebugLevel() >= Level; +} + +#pragma omp end declare target diff --git a/openmp/libomptarget/DeviceRTL/src/Debug.cpp b/openmp/libomptarget/DeviceRTL/src/Debug.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/src/Debug.cpp @@ -0,0 +1,31 @@ +//===--- Debug.cpp -------- Debug utilities ----------------------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains debug utilities +// +//===----------------------------------------------------------------------===// + +#include "Configuration.h" + +#include + +#pragma omp declare target + +extern "C" { +void __assert_assume(bool cond, const char *exp, const char *file, int line) { + // TODO: Replace 0 with right mode + if (_OMP::config::isDebugMode(_OMP::config::DebugLevel::Assertion) && !cond) { + printf("Assertion failed: %s at %s, line %d\n", exp, file, line); + __builtin_trap(); + } + + __builtin_assume(cond); +} +} + +#pragma omp end declare target diff --git a/openmp/libomptarget/DeviceRTL/src/Kernel.cpp b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/src/Kernel.cpp @@ -0,0 +1,60 @@ +//===--- Kernel.cpp - OpenMP device kernel interface -------------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the kernel entry points for the device. +// +//===----------------------------------------------------------------------===// + +#include "Debug.h" +#include "Mapping.h" +#include "State.h" +#include "Synchronization.h" +#include "Types.h" + +using namespace _OMP; + +namespace { +void inititializeRuntime(bool IsSPMD) { + synchronize::init(IsSPMD); + mapping::init(IsSPMD); + state::init(IsSPMD); +} +} // namespace + +#pragma omp declare target + +extern "C" { +void __kmpc_kernel_init(int, int16_t) { + inititializeRuntime(/* IsSPMD */ false); + // No need to wait since only the main threads will execute user + // code and workers will run into a barrier right away. + + ASSERT(!mapping::isSPMDMode()); +} + +void __kmpc_kernel_deinit(int16_t) { + // Signal the workers to exit the state machine and exit the kernel. 
+  state::ParallelRegionFn = nullptr;
+}
+
+void __kmpc_spmd_kernel_init(int, int16_t) {
+  inititializeRuntime(/* IsSPMD */ true);
+
+  state::runAndCheckState(synchronize::threads);
+
+  ASSERT(mapping::isSPMDMode());
+}
+
+void __kmpc_spmd_kernel_deinit_v2(int16_t) {
+  state::assumeInitialState(/* IsSPMD */ true);
+}
+
+int8_t __kmpc_is_spmd_exec_mode() { return mapping::isSPMDMode(); }
+}
+
+#pragma omp end declare target
diff --git a/openmp/libomptarget/DeviceRTL/src/Mapping.cpp b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/DeviceRTL/src/Mapping.cpp
@@ -0,0 +1,216 @@
+//===------- Mapping.cpp - OpenMP device runtime mapping helpers -- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#include "Mapping.h"
+
+#include "Utils.h"
+
+using namespace _OMP;
+
+namespace _OMP {
+namespace impl {
+
+#pragma omp declare target
+
+/// AMDGCN Implementation
+///
+///{
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+uint32_t getGridDim(uint32_t n, uint16_t d) {
+  uint32_t q = n / d;
+  return q + (n > q * d);
+}
+
+uint32_t getWorkgroupDim(uint32_t group_id, uint32_t grid_size,
+                         uint16_t group_size) {
+  uint32_t r = grid_size - group_id * group_size;
+  return (r < group_size) ? r : group_size;
+}
+
+LaneMaskTy activemask() { return __builtin_amdgcn_read_exec(); }
+
+LaneMaskTy lanemaskLT() {
+  uint32_t Lane = mapping::getThreadIdInWarp();
+  int64_t Ballot = mapping::activemask();
+  uint64_t Mask = ((uint64_t)1 << Lane) - (uint64_t)1;
+  return Mask & Ballot;
+}
+
+LaneMaskTy lanemaskGT() {
+  uint32_t Lane = mapping::getThreadIdInWarp();
+  if (Lane == (mapping::getWarpSize() - 1))
+    return 0;
+  int64_t Ballot = mapping::activemask();
+  uint64_t Mask = (~((uint64_t)0)) << (Lane + 1);
+  return Mask & Ballot;
+}
+
+uint32_t getThreadIdInWarp() {
+  return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
+}
+
+uint32_t getThreadIdInBlock() { return __builtin_amdgcn_workitem_id_x(); }
+
+uint32_t getBlockSize() {
+  return getWorkgroupDim(__builtin_amdgcn_workgroup_id_x(),
+                         __builtin_amdgcn_grid_size_x(),
+                         __builtin_amdgcn_workgroup_size_x());
+}
+
+uint32_t getKernelSize() { return __builtin_amdgcn_grid_size_x(); }
+
+uint32_t getBlockId() { return __builtin_amdgcn_workgroup_id_x(); }
+
+uint32_t getNumberOfBlocks() {
+  return getGridDim(__builtin_amdgcn_grid_size_x(),
+                    __builtin_amdgcn_workgroup_size_x());
+}
+
+uint32_t getNumberOfProcessorElements() {
+  // TODO
+  return mapping::getBlockSize();
+}
+
+uint32_t getWarpId() {
+  return mapping::getThreadIdInBlock() / mapping::getWarpSize();
+}
+
+uint32_t getWarpSize() { return 64; }
+
+uint32_t getNumberOfWarpsInBlock() {
+  return mapping::getBlockSize() / mapping::getWarpSize();
+}
+
+#pragma omp end declare variant
+///}
+
+/// NVPTX Implementation
+///
+///{
+#pragma omp begin declare variant match(                                      \
+    device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
+
+LaneMaskTy activemask() {
+  unsigned int Mask;
+  asm("activemask.b32 %0;" : "=r"(Mask));
+  return Mask;
+}
+
+LaneMaskTy lanemaskLT() {
+  __kmpc_impl_lanemask_t Res;
+  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(Res));
+  return Res;
+}
+
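+// Illustrative note (sketch): these masks are what mapping::isLeaderInWarp()
+// further down in this file builds on; a lane is the warp leader iff no
+// active lane with a smaller id exists:
+//
+//   bool IsLeader = utils::popc(activemask() & lanemaskLT()) == 0;
+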
+LaneMaskTy lanemaskGT() { + __kmpc_impl_lanemask_t Res; + asm("mov.u32 %0, %%lanemask_gt;" : "=r"(Res)); + return Res; +} + +uint32_t getThreadIdInWarp() { + return mapping::getThreadIdInBlock() & (mapping::getWarpSize() - 1); +} + +uint32_t getThreadIdInBlock() { return __nvvm_read_ptx_sreg_tid_x(); } + +uint32_t getBlockSize() { return __nvvm_read_ptx_sreg_ntid_x(); } + +uint32_t getKernelSize() { return __nvvm_read_ptx_sreg_nctaid_x(); } + +uint32_t getBlockId() { return __nvvm_read_ptx_sreg_ctaid_x(); } + +uint32_t getNumberOfBlocks() { return __nvvm_read_ptx_sreg_nctaid_x(); } + +uint32_t getNumberOfProcessorElements() { + // TODO + return mapping::getBlockSize(); +} + +uint32_t getWarpId() { + return mapping::getThreadIdInBlock() / mapping::getWarpSize(); +} + +uint32_t getWarpSize() { return 32; } + +uint32_t getNumberOfWarpsInBlock() { + return (mapping::getBlockSize() + mapping::getWarpSize() - 1) / + mapping::getWarpSize(); +} + +#pragma omp end declare variant +///} + +#pragma omp end declare target + +} // namespace impl +} // namespace _OMP + +#pragma omp declare target + +bool mapping::isMainThreadInGenericMode() { + if (mapping::isSPMDMode()) + return false; + + // Check if this is the last warp in the block. + return mapping::getWarpId() + 1 == mapping::getNumberOfWarpsInBlock(); +} + +bool mapping::isLeaderInWarp() { + __kmpc_impl_lanemask_t Active = mapping::activemask(); + __kmpc_impl_lanemask_t LaneMaskLT = mapping::lanemaskLT(); + return utils::popc(Active & LaneMaskLT) == 0; +} + +LaneMaskTy mapping::activemask() { return impl::activemask(); } + +LaneMaskTy mapping::lanemaskLT() { return impl::lanemaskLT(); } + +LaneMaskTy mapping::lanemaskGT() { return impl::lanemaskGT(); } + +uint32_t mapping::getThreadIdInWarp() { return impl::getThreadIdInWarp(); } + +uint32_t mapping::getThreadIdInBlock() { return impl::getThreadIdInBlock(); } + +uint32_t mapping::getBlockSize() { return impl::getBlockSize(); } + +uint32_t mapping::getKernelSize() { return impl::getKernelSize(); } + +uint32_t mapping::getBlockId() { return impl::getBlockId(); } + +uint32_t mapping::getNumberOfBlocks() { return impl::getNumberOfBlocks(); } + +uint32_t mapping::getNumberOfProcessorElements() { + return impl::getNumberOfProcessorElements(); +} + +uint32_t mapping::getWarpId() { return impl::getWarpId(); } + +uint32_t mapping::getWarpSize() { return impl::getWarpSize(); } + +uint32_t mapping::getNumberOfWarpsInBlock() { + return impl::getNumberOfWarpsInBlock(); +} + +/// Execution mode +/// +///{ +static int SHARED(IsSPMDMode); + +void mapping::init(bool IsSPMD) { IsSPMDMode = IsSPMD; } + +bool mapping::isSPMDMode() { return IsSPMDMode; } + +bool mapping::isGenericMode() { return !isSPMDMode(); } +///} + +#pragma omp end declare target diff --git a/openmp/libomptarget/DeviceRTL/src/Misc.cpp b/openmp/libomptarget/DeviceRTL/src/Misc.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/src/Misc.cpp @@ -0,0 +1,74 @@ +//===--------- Misc.cpp - OpenMP device misc interfaces ----------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#include "Types.h" + +namespace _OMP { +namespace impl { +#pragma omp declare target + +/// AMDGCN Implementation +/// +///{ +#pragma omp begin declare variant match(device = {arch(amdgcn)}) + +double getWTick() { return ((double)1E-9); } + +double getWTime() { + // The intrinsics for measuring time have undocumented frequency + // This will probably need to be found by measurement on a number of + // architectures. Until then, return 0, which is very inaccurate as a + // timer but resolves the undefined symbol at link time. + return 0; +} + +#pragma omp end declare variant + +/// NVPTX Implementation +/// +///{ +#pragma omp begin declare variant match( \ + device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)}) + +double getWTick() { + // Timer precision is 1ns + return ((double)1E-9); +} + +double getWTime() { + unsigned long long nsecs; + asm("mov.u64 %0, %%globaltimer;" : "=l"(nsecs)); + return (double)nsecs * getWTick(); +} + +#pragma omp end declare variant + +#pragma omp end declare target +} // namespace impl +} // namespace _OMP + +/// Interfaces +/// +///{ +#pragma omp declare target + +extern "C" { +int32_t __kmpc_cancellationpoint(IdentTy *, int32_t, int32_t) { return 0; } + +int32_t __kmpc_cancel(IdentTy *, int32_t, int32_t) { return 0; } + +double omp_get_wtick(void) { return _OMP::impl::getWTick(); } + +double omp_get_wtime(void) { return _OMP::impl::getWTime(); } +} + +#pragma omp end declare target +///} diff --git a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp @@ -0,0 +1,142 @@ +//===---- Parallelism.cpp - OpenMP GPU parallel implementation ---- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Parallel implementation in the GPU. Here is the pattern: +// +// while (not finished) { +// +// if (master) { +// sequential code, decide which par loop to do, or if finished +// __kmpc_kernel_prepare_parallel() // exec by master only +// } +// syncthreads // A +// __kmpc_kernel_parallel() // exec by all +// if (this thread is included in the parallel) { +// switch () for all parallel loops +// __kmpc_kernel_end_parallel() // exec only by threads in parallel +// } +// +// +// The reason we don't exec end_parallel for the threads not included +// in the parallel loop is that for each barrier in the parallel +// region, these non-included threads will cycle through the +// syncthread A. Thus they must preserve their current threadId that +// is larger than thread in team. +// +// To make a long story short... 
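+//
+// For illustration, the worker side of this pattern is roughly (a sketch of
+// what the compiler emits, not verbatim):
+//
+//   ParallelRegionFnTy WorkFn = nullptr;
+//   for (;;) {
+//     __kmpc_barrier_simple_spmd(nullptr, 0);   // syncthreads A
+//     bool IsActive = __kmpc_kernel_parallel(&WorkFn);
+//     if (!WorkFn)                               // termination signal
+//       return;
+//     if (IsActive) {                            // included in the parallel
+//       ((void (*)())WorkFn)();                  // run the outlined region
+//       __kmpc_kernel_end_parallel();
+//     }
+//     __kmpc_barrier_simple_spmd(nullptr, 0);
+//   }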
+//
+//===----------------------------------------------------------------------===//
+
+#include "Interface.h"
+#include "Mapping.h"
+#include "State.h"
+#include "Types.h"
+
+using namespace _OMP;
+
+#pragma omp declare target
+
+namespace {
+
+uint32_t determineNumberOfThreads() {
+  uint32_t NThreadsICV = icv::NThreads;
+  uint32_t NumThreads = mapping::getBlockSize();
+
+  // In non-SPMD mode, we need to subtract the warp for the master thread.
+  if (!mapping::isSPMDMode())
+    NumThreads -= mapping::getWarpSize();
+
+  if (NThreadsICV != 0 && NThreadsICV < NumThreads)
+    NumThreads = NThreadsICV;
+
+  // Round down to a multiple of WARPSIZE since it is legal to do so in OpenMP.
+  if (NumThreads < mapping::getWarpSize())
+    NumThreads = 1;
+  else
+    NumThreads = (NumThreads & ~((uint32_t)mapping::getWarpSize() - 1));
+
+  return NumThreads;
+}
+
+} // namespace
+
+extern "C" {
+
+void __kmpc_kernel_prepare_parallel(ParallelRegionFnTy WorkFn) {
+
+  uint32_t NumThreads = determineNumberOfThreads();
+  state::ParallelTeamSize = NumThreads;
+  state::ParallelRegionFn = WorkFn;
+
+  // We do *not* create a new data environment because all threads in the team
+  // that are active are now running this parallel region. They share the
+  // TeamState, which has an increased level-var and potentially active-level
+  // set, but they do not have individual ThreadStates yet. If they ever
+  // modify the ICVs beyond this point a ThreadState will be allocated.
+  int NewLevel = ++icv::Level;
+  bool IsActiveParallelRegion = NumThreads > 1;
+  if (IsActiveParallelRegion)
+    icv::ActiveLevel = NewLevel;
+}
+
+bool __kmpc_kernel_parallel(ParallelRegionFnTy *WorkFn) {
+  // Work function and arguments for L1 parallel region.
+  *WorkFn = state::ParallelRegionFn;
+
+  // If this is the termination signal from the master, quit early.
+  if (!*WorkFn) {
+    return false;
+  }
+
+  // Set to true for workers participating in the parallel region.
+  uint32_t TId = mapping::getThreadIdInBlock();
+  bool ThreadIsActive = TId < state::ParallelTeamSize;
+  return ThreadIsActive;
+}
+
+void __kmpc_kernel_end_parallel() {
+  // We did *not* create a new data environment because all threads in the team
+  // that were active were running the parallel region. We used the TeamState,
+  // which needs adjustment now.
+  // --icv::Level;
+  // bool IsActiveParallelRegion = state::ParallelTeamSize;
+  // if (IsActiveParallelRegion)
+  //   icv::ActiveLevel = 0;
+
+  // state::ParallelTeamSize = 1;
+
+  // In case we modified an ICV for this thread before a ThreadState was
+  // created, drop it now so it does not contaminate the next parallel region.
+ state::resetStateForThread(); +} + +void __kmpc_serialized_parallel(IdentTy *, uint32_t TId) { + state::enterDataEnvironment(); + ++icv::Level; +} + +void __kmpc_end_serialized_parallel(IdentTy *, uint32_t TId) { + state::exitDataEnvironment(); + --icv::Level; +} + +uint16_t __kmpc_parallel_level(IdentTy *, uint32_t) { return omp_get_level(); } + +int32_t __kmpc_global_thread_num(IdentTy *) { return omp_get_thread_num(); } + +void __kmpc_push_num_threads(IdentTy *, int32_t, int32_t NumThreads) { + icv::NThreads = NumThreads; +} + +void __kmpc_push_num_teams(IdentTy *loc, int32_t tid, int32_t num_teams, + int32_t thread_limit) {} + +void __kmpc_push_proc_bind(IdentTy *loc, uint32_t tid, int proc_bind) {} +} + +#pragma omp end declare target diff --git a/openmp/libomptarget/DeviceRTL/src/Reduction.cpp b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/src/Reduction.cpp @@ -0,0 +1,319 @@ +//===---- Reduction.cpp - OpenMP device reduction implementation - C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the implementation of reduction with KMPC interface. +// +//===----------------------------------------------------------------------===// + +#include "Debug.h" +#include "Interface.h" +#include "Mapping.h" +#include "State.h" +#include "Synchronization.h" +#include "Types.h" +#include "Utils.h" + +using namespace _OMP; + +namespace { + +#pragma omp declare target + +void gpu_regular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct) { + for (uint32_t mask = mapping::getWarpSize() / 2; mask > 0; mask /= 2) { + shflFct(reduce_data, /*LaneId - not used= */ 0, + /*Offset = */ mask, /*AlgoVersion=*/0); + } +} + +void gpu_irregular_warp_reduce(void *reduce_data, ShuffleReductFnTy shflFct, + uint32_t size, uint32_t tid) { + uint32_t curr_size; + uint32_t mask; + curr_size = size; + mask = curr_size / 2; + while (mask > 0) { + shflFct(reduce_data, /*LaneId = */ tid, /*Offset=*/mask, /*AlgoVersion=*/1); + curr_size = (curr_size + 1) / 2; + mask = curr_size / 2; + } +} + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 +static uint32_t gpu_irregular_simd_reduce(void *reduce_data, + ShuffleReductFnTy shflFct) { + uint32_t size, remote_id, physical_lane_id; + physical_lane_id = mapping::getThreadIdInBlock() % mapping::getWarpSize(); + __kmpc_impl_lanemask_t lanemask_lt = mapping::lanemaskLT(); + __kmpc_impl_lanemask_t Liveness = mapping::activemask(); + uint32_t logical_lane_id = utils::popc(Liveness & lanemask_lt) * 2; + __kmpc_impl_lanemask_t lanemask_gt = mapping::lanemaskGT(); + do { + Liveness = mapping::activemask(); + remote_id = utils::ffs(Liveness & lanemask_gt); + size = utils::popc(Liveness); + logical_lane_id /= 2; + shflFct(reduce_data, /*LaneId =*/logical_lane_id, + /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2); + } while (logical_lane_id % 2 == 0 && size > 1); + return (logical_lane_id == 0); +} +#endif + +static int32_t nvptx_parallel_reduce_nowait(int32_t TId, int32_t num_vars, + uint64_t reduce_size, + void *reduce_data, + ShuffleReductFnTy shflFct, + InterWarpCopyFnTy cpyFct, + bool isSPMDExecutionMode, bool) { + uint32_t BlockThreadId = mapping::getThreadIdInBlock(); + uint32_t NumThreads = 
omp_get_num_threads(); + if (NumThreads == 1) + return 1; + /* + * This reduce function handles reduction within a team. It handles + * parallel regions in both L1 and L2 parallelism levels. It also + * supports Generic, SPMD, and NoOMP modes. + * + * 1. Reduce within a warp. + * 2. Warp master copies value to warp 0 via shared memory. + * 3. Warp 0 reduces to a single value. + * 4. The reduced value is available in the thread that returns 1. + */ + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + uint32_t WarpsNeeded = + (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize(); + uint32_t WarpId = mapping::getWarpId(); + + // Volta execution model: + // For the Generic execution mode a parallel region either has 1 thread and + // beyond that, always a multiple of 32. For the SPMD execution mode we may + // have any number of threads. + if ((NumThreads % mapping::getWarpSize() == 0) || (WarpId < WarpsNeeded - 1)) + gpu_regular_warp_reduce(reduce_data, shflFct); + else if (NumThreads > 1) // Only SPMD execution mode comes thru this case. + gpu_irregular_warp_reduce(reduce_data, shflFct, + /*LaneCount=*/NumThreads % mapping::getWarpSize(), + /*LaneId=*/mapping::getThreadIdInBlock() % + mapping::getWarpSize()); + + // When we have more than [mapping::getWarpSize()] number of threads + // a block reduction is performed here. + // + // Only L1 parallel region can enter this if condition. + if (NumThreads > mapping::getWarpSize()) { + // Gather all the reduced values from each warp + // to the first warp. + cpyFct(reduce_data, WarpsNeeded); + + if (WarpId == 0) + gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, + BlockThreadId); + } + return BlockThreadId == 0; +#else + __kmpc_impl_lanemask_t Liveness = mapping::activemask(); + if (Liveness == lanes::All) // Full warp + gpu_regular_warp_reduce(reduce_data, shflFct); + else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes + gpu_irregular_warp_reduce(reduce_data, shflFct, + /*LaneCount=*/utils::popc(Liveness), + /*LaneId=*/mapping::getThreadIdInBlock() % + mapping::getWarpSize()); + else { // Dispersed lanes. Only threads in L2 + // parallel region may enter here; return + // early. + return gpu_irregular_simd_reduce(reduce_data, shflFct); + } + + // When we have more than [mapping::getWarpSize()] number of threads + // a block reduction is performed here. + // + // Only L1 parallel region can enter this if condition. + if (NumThreads > mapping::getWarpSize()) { + uint32_t WarpsNeeded = + (NumThreads + mapping::getWarpSize() - 1) / mapping::getWarpSize(); + // Gather all the reduced values from each warp + // to the first warp. + cpyFct(reduce_data, WarpsNeeded); + + uint32_t WarpId = BlockThreadId / mapping::getWarpSize(); + if (WarpId == 0) + gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, + BlockThreadId); + + return BlockThreadId == 0; + } + + // Get the OMP thread Id. This is different from BlockThreadId in the case of + // an L2 parallel region. + return TId == 0; +#endif // __CUDA_ARCH__ >= 700 +} + +uint32_t roundToWarpsize(uint32_t s) { + if (s < mapping::getWarpSize()) + return 1; + return (s & ~(unsigned)(mapping::getWarpSize() - 1)); +} + +uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? 
x : y; } + +static volatile uint32_t IterCnt = 0; +static volatile uint32_t Cnt = 0; + +#pragma omp end declare target + +} // namespace + +#pragma omp declare target + +extern "C" { +int32_t __kmpc_nvptx_parallel_reduce_nowait_v2( + IdentTy *Loc, int32_t TId, int32_t num_vars, uint64_t reduce_size, + void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct) { + return nvptx_parallel_reduce_nowait(TId, num_vars, reduce_size, reduce_data, + shflFct, cpyFct, mapping::isSPMDMode(), + false); +} + +int32_t __kmpc_nvptx_teams_reduce_nowait_v2( + IdentTy *Loc, int32_t TId, void *GlobalBuffer, uint32_t num_of_records, + void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct, + ListGlobalFnTy lgcpyFct, ListGlobalFnTy lgredFct, ListGlobalFnTy glcpyFct, + ListGlobalFnTy glredFct) { + + // Terminate all threads in non-SPMD mode except for the master thread. + if (mapping::isGenericMode() && !mapping::isMainThreadInGenericMode()) + return 0; + + uint32_t ThreadId = omp_get_thread_num(); + + // In non-generic mode all workers participate in the teams reduction. + // In generic mode only the team master participates in the teams + // reduction because the workers are waiting for parallel work. + uint32_t NumThreads = omp_get_num_threads(); + uint32_t TeamId = omp_get_team_num(); + uint32_t NumTeams = omp_get_num_teams(); + static unsigned SHARED(Bound); + static unsigned SHARED(ChunkTeamCount); + + // Block progress for teams greater than the current upper + // limit. We always only allow a number of teams less or equal + // to the number of slots in the buffer. + // bool IsMaster = __kmpc_master(nullptr, TId); + bool IsMaster = (ThreadId == 0); + while (IsMaster) { + Bound = atomic::read((uint32_t *)&IterCnt, __ATOMIC_SEQ_CST); + if (TeamId < Bound + num_of_records) + break; + } + + if (IsMaster) { + int ModBockId = TeamId % num_of_records; + if (TeamId < num_of_records) { + lgcpyFct(GlobalBuffer, ModBockId, reduce_data); + } else + lgredFct(GlobalBuffer, ModBockId, reduce_data); + + fence::system(__ATOMIC_SEQ_CST); + + // Increment team counter. + // This counter is incremented by all teams in the current + // BUFFER_SIZE chunk. + ChunkTeamCount = + atomic::inc((uint32_t *)&Cnt, num_of_records - 1u, __ATOMIC_SEQ_CST); + } + // Synchronize + if (mapping::isSPMDMode()) + __kmpc_barrier(Loc, TId); + + // reduce_data is global or shared so before being reduced within the + // warp we need to bring it in local memory: + // local_reduce_data = reduce_data[i] + // + // Example for 3 reduction variables a, b, c (of potentially different + // types): + // + // buffer layout (struct of arrays): + // a, a, ..., a, b, b, ... b, c, c, ... c + // |__________| + // num_of_records + // + // local_data_reduce layout (struct): + // a, b, c + // + // Each thread will have a local struct containing the values to be + // reduced: + // 1. do reduction within each warp. + // 2. do reduction across warps. + // 3. write the final result to the main reduction variable + // by returning 1 in the thread holding the reduction result. + + // Check if this is the very last team. + unsigned NumRecs = kmpcMin(NumTeams, uint32_t(num_of_records)); + if (ChunkTeamCount == NumTeams - Bound - 1) { + // + // Last team processing. + // + if (ThreadId >= NumRecs) + return 0; + NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumRecs)); + if (ThreadId >= NumThreads) + return 0; + + // Load from buffer and reduce. 
+ glcpyFct(GlobalBuffer, ThreadId, reduce_data); + for (uint32_t i = NumThreads + ThreadId; i < NumRecs; i += NumThreads) + glredFct(GlobalBuffer, i, reduce_data); + + // Reduce across warps to the warp master. + if (NumThreads > 1) { + gpu_regular_warp_reduce(reduce_data, shflFct); + + // When we have more than [mapping::getWarpSize()] number of threads + // a block reduction is performed here. + uint32_t ActiveThreads = kmpcMin(NumRecs, NumThreads); + if (ActiveThreads > mapping::getWarpSize()) { + uint32_t WarpsNeeded = (ActiveThreads + mapping::getWarpSize() - 1) / + mapping::getWarpSize(); + // Gather all the reduced values from each warp + // to the first warp. + cpyFct(reduce_data, WarpsNeeded); + + uint32_t WarpId = ThreadId / mapping::getWarpSize(); + if (WarpId == 0) + gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, + ThreadId); + } + } + + if (IsMaster) { + Cnt = 0; + IterCnt = 0; + return 1; + } + return 0; + } + if (IsMaster && ChunkTeamCount == num_of_records - 1) { + // Allow SIZE number of teams to proceed writing their + // intermediate results to the global buffer. + atomic::add((uint32_t *)&IterCnt, uint32_t(num_of_records), + __ATOMIC_SEQ_CST); + } + + return 0; +} + +void __kmpc_nvptx_end_reduce(int32_t TId) {} + +void __kmpc_nvptx_end_reduce_nowait(int32_t TId) {} +} + +#pragma omp end declare target diff --git a/openmp/libomptarget/DeviceRTL/src/State.cpp b/openmp/libomptarget/DeviceRTL/src/State.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/src/State.cpp @@ -0,0 +1,714 @@ +//===------ State.cpp - OpenMP State & ICV interface ------------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// + +#include "State.h" +#include "Configuration.h" +#include "Debug.h" +#include "Interface.h" +#include "Mapping.h" +#include "Synchronization.h" +#include "Types.h" +#include "Utils.h" + +using namespace _OMP; + +/// Memory implementation +/// +///{ + +namespace { + +#pragma omp declare target + +/// Fallback implementations are missing to trigger a link time error. +/// Implementations for new devices, including the host, should go into a +/// dedicated begin/end declare variant. +/// +///{ + +extern "C" { +void *malloc(uint64_t Size); +void free(void *Ptr); +} + +///} + +/// AMDGCN implementations of the shuffle sync idiom. +/// +///{ +#pragma omp begin declare variant match(device = {arch(amdgcn)}) + +extern "C" { +void *malloc(uint64_t Size) { + // TODO: Use some preallocated space for dynamic malloc. + return 0; +} + +void free(void *Ptr) {} +} + +#pragma omp end declare variant +///} + +/// Add worst-case padding so that future allocations are properly aligned. +constexpr const uint32_t Alignment = 8; + +/// We do expose a malloc/free interface but instead should take the number of +/// bytes also in the pop call. Until clang is updated we hide the allocation +/// size "under" the allocated pointer. +constexpr const uint32_t StorageTrackingBytes = 8; + +static_assert((StorageTrackingBytes % Alignment) == 0, + "Storage tracker should preserve alignment."); + +/// A "smart" stack in shared memory. +/// +/// The stack exposes a malloc/free interface but works like a stack internally. 
+/// In fact, it is a separate stack *per warp*. That means, each warp must push +/// and pop symmetrically or this breaks, badly. The implementation will (aim +/// to) detect non-lock-step warps and fallback to malloc/free. The same will +/// happen if a warp runs out of memory. The master warp in generic memory is +/// special and is given more memory than the rest. +/// +struct SharedMemorySmartStackTy { + /// The amount of memory reserved for the main thread in generic mode. This + /// is done because the main thread in generic mode has to share values with + /// the workers which requires usually more space than used by workers for + /// sharing among each other. + uint32_t GenericModeMainThreadStorage; + + /// The amount of memory for each warp. + uint32_t WarpStorageTotal; + + /// Initialize the stack. Must be called by all threads. + void init(bool IsSPMD); + + /// Allocate \p Bytes on the stack for the encountering thread. Each thread + /// can call this function. + void *push(uint64_t Bytes); + + /// Deallocate the last allocation made by the encountering thread and pointed + /// to by \p Ptr from the stack. Each thread can call this function. + void pop(void *Ptr); + +private: + /// Compute the size of the storage space reserved for the warp of the + /// encountering thread. + uint32_t computeWarpStorageTotal(); + + /// Return the bottom address of the warp data stack, that is the first + /// address this warp allocated memory at, or will. + char *getWarpDataBottom(); + + /// Same as \p getWarpDataBottom() but it can access other warp's bottom. No + /// lock is used so it's user's responsibility to make sure the correctness. + char *getWarpDataBottom(uint32_t WarpId); + + /// Return the top address of the warp data stack, that is the first address + /// this warp will allocate memory at next. + char *getWarpDataTop(); + + /// Return the location of the usage tracker which keeps track of the amount + /// of memory used by this warp. + /// TODO: We could use the next warp bottom to avoid this tracker completely. + uint32_t *getWarpStorageTracker(); + + /// Same as \p getWarpStorageTracker() but it can access other warp's bottom. + /// No lock is used so it's user's responsibility to make sure the + /// correctness. + uint32_t *getWarpStorageTracker(uint32_t WarpId); + + /// The actual storage, shared among all warps. + char Data[state::SharedScratchpadSize] __attribute__((aligned(Alignment))); +}; + +/// The allocation of a single shared memory scratchpad. +static SharedMemorySmartStackTy SHARED(SharedMemorySmartStack); + +void SharedMemorySmartStackTy::init(bool IsSPMD) { + // Only leaders are needed to initialize the per-warp areas. + if (!mapping::isLeaderInWarp()) + return; + + GenericModeMainThreadStorage = + IsSPMD ? 0 : config::getGenericModeMainThreadSharedMemoryStorage(); + + WarpStorageTotal = computeWarpStorageTotal(); + + // Initialize the tracker to the bytes used by the storage tracker. + uint32_t *WarpStorageTracker = getWarpStorageTracker(); + *WarpStorageTracker = StorageTrackingBytes; + + static_assert( + StorageTrackingBytes >= (sizeof(*WarpStorageTracker)), + "Storage tracker bytes should cover the size of the storage tracker."); + + // Initialize the tracker for all warps if in non-spmd mode because this + // function will be only called by master thread. 
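+  // For illustration: with StorageTrackingBytes == 8, every warp region
+  // starts with an 8-byte counter that initially holds 8, i.e., only the
+  // tracker itself is accounted for; the first push() places its size header
+  // and data right behind it.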
+  if (!IsSPMD) {
+    for (unsigned I = 0; I < mapping::getNumberOfWarpsInBlock(); ++I) {
+      uint32_t *WarpStorageTracker = getWarpStorageTracker(I);
+      *WarpStorageTracker = StorageTrackingBytes;
+    }
+  }
+}
+
+void *SharedMemorySmartStackTy::push(uint64_t BytesPerLane) {
+  // First align the number of requested bytes.
+  BytesPerLane = (BytesPerLane + (Alignment - 1)) / Alignment * Alignment;
+
+  // The pointer we eventually return.
+  void *Ptr = nullptr;
+
+  LaneMaskTy Active = mapping::activemask();
+  uint32_t NumActive = utils::popc(Active);
+
+  // Only the leader allocates, the rest, if any, waits for the result at the
+  // shfl_sync below.
+  if (mapping::isLeaderInWarp()) {
+    uint32_t BytesTotal = BytesPerLane * NumActive;
+
+    // Ensure the warp is complete or it is the sole main thread in generic
+    // mode. In any other situation we might not be able to preserve balanced
+    // push/pop accesses even if the code looked that way because the hardware
+    // could independently schedule the warp parts. If an incomplete warp
+    // arrives here we fallback to the slow path, namely malloc.
+    if (NumActive < mapping::getWarpSize() &&
+        !mapping::isMainThreadInGenericMode()) {
+      Ptr = memory::allocGlobal(
+          BytesTotal, "Slow path shared memory allocation, incomplete warp!");
+    } else {
+      // If warp is "complete" determine if we have sufficient space.
+      uint32_t *WarpStorageTracker = getWarpStorageTracker();
+      uint32_t BytesTotalAdjusted = BytesTotal + StorageTrackingBytes;
+      uint32_t BytesTotalAdjustedAligned =
+          (BytesTotalAdjusted + (Alignment - 1)) / Alignment * Alignment;
+
+      uint32_t BytesInUse = *WarpStorageTracker;
+      if (BytesInUse + BytesTotalAdjustedAligned > WarpStorageTotal) {
+        Ptr = memory::allocGlobal(
+            BytesTotal,
+            "Slow path shared memory allocation, insufficient memory!");
+      } else {
+        // We have enough memory, put the new allocation on the top of the
+        // stack preceded by the size of the allocation.
+        Ptr = getWarpDataTop();
+        *WarpStorageTracker += BytesTotalAdjustedAligned;
+        *((uint64_t *)Ptr) = BytesTotalAdjustedAligned;
+        Ptr = ((char *)Ptr) + StorageTrackingBytes;
+      }
+    }
+  }
+
+  // Skip the shfl_sync if the thread is alone.
+  if (NumActive == 1)
+    return Ptr;
+
+  // Get the address of the allocation from the leader.
+  uint32_t Leader = utils::ffs(Active) - 1;
+  int *FP = reinterpret_cast<int *>(&Ptr);
+  FP[0] = utils::shuffle(Active, FP[0], Leader);
+  if (sizeof(Ptr) == 8)
+    FP[1] = utils::shuffle(Active, FP[1], Leader);
+
+  // Compute the thread position into the allocation, which we did for the
+  // entire warp.
+  LaneMaskTy LaneMaskLT = mapping::lanemaskLT();
+  uint32_t WarpPosition = utils::popc(Active & LaneMaskLT);
+  return reinterpret_cast<char *>(Ptr) + (BytesPerLane * WarpPosition);
+}
+
+void SharedMemorySmartStackTy::pop(void *Ptr) {
+  // Only the leader deallocates, the rest, if any, waits at the synchwarp
+  // below.
+  if (mapping::isLeaderInWarp()) {
+    // memory::freeGlobal(Ptr, "Slow path shared memory deallocation");
+    // Check if the pointer is from a malloc or from within the stack.
+    if (Ptr < &Data[0] || Ptr > &Data[state::SharedScratchpadSize]) {
+      memory::freeGlobal(Ptr, "Slow path shared memory deallocation");
+    } else {
+      // Lookup the allocation size "below" the allocation (=Ptr).
+      Ptr = reinterpret_cast<char *>(Ptr) - StorageTrackingBytes;
+      uint64_t BytesTotalAdjustedAligned = *reinterpret_cast<uint64_t *>(Ptr);
+
+      // Free the memory by adjusting the storage tracker accordingly.
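+      // For illustration (assuming a 32-lane warp and Alignment == 8): a
+      // push() of 16 bytes per lane consumed 16 * 32 + 8 (size header) = 520
+      // bytes of the warp region, so 520 is read back from the header here
+      // and subtracted from the tracker.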
+ uint32_t *WarpStorageTracker = getWarpStorageTracker(); + *WarpStorageTracker -= BytesTotalAdjustedAligned; + } + } + // Ensure the entire warp waits until the pop is done. + synchronize::warp(mapping::activemask()); +} + +uint32_t SharedMemorySmartStackTy::computeWarpStorageTotal() { + if (mapping::isMainThreadInGenericMode()) + return GenericModeMainThreadStorage; + + // In generic mode we reserve parts of the storage for the main thread. + uint32_t StorageTotal = state::SharedScratchpadSize; + if (mapping::isGenericMode()) + StorageTotal -= GenericModeMainThreadStorage; + + uint32_t NumWarps = mapping::getNumberOfWarpsInBlock(); + uint32_t WarpStorageTotal = StorageTotal / NumWarps; + + // Align the size + WarpStorageTotal = WarpStorageTotal / Alignment * Alignment; + + return WarpStorageTotal; +} + +char *SharedMemorySmartStackTy::getWarpDataBottom(uint32_t WarpId) { + if (mapping::isMainThreadInGenericMode()) + return &Data[0]; + + uint32_t PriorWarpStorageTotal = 0; + if (mapping::isGenericMode()) + PriorWarpStorageTotal += GenericModeMainThreadStorage; + + PriorWarpStorageTotal += WarpStorageTotal * WarpId; + + return &Data[PriorWarpStorageTotal]; +} + +char *SharedMemorySmartStackTy::getWarpDataBottom() { + return getWarpDataBottom(mapping::getWarpId()); +} + +char *SharedMemorySmartStackTy::getWarpDataTop() { + uint32_t *WarpStorageTracker = getWarpStorageTracker(); + return getWarpDataBottom() + (*WarpStorageTracker); +} + +uint32_t *SharedMemorySmartStackTy::getWarpStorageTracker(uint32_t WarpId) { + return ((uint32_t *)getWarpDataBottom(WarpId)); +} + +uint32_t *SharedMemorySmartStackTy::getWarpStorageTracker() { + return getWarpStorageTracker(mapping::getWarpId()); +} + +#pragma omp end declare target +} // namespace + +// TODO: Clang should accept namespaces inside the declare target range. +#pragma omp declare target + +void *memory::allocShared(uint64_t Bytes, const char *Reason) { + return SharedMemorySmartStack.push(Bytes); +} + +void memory::freeShared(void *Ptr, const char *Reason) { + SharedMemorySmartStack.pop(Ptr); +} + +void *memory::allocGlobal(uint64_t Bytes, const char *Reason) { + return malloc(Bytes); +} + +void memory::freeGlobal(void *Ptr, const char *Reason) { free(Ptr); } + +#pragma omp end declare target + +///} + +namespace { + +#pragma omp declare target + +struct ICVStateTy { + uint32_t NThreadsVar; + uint32_t LevelVar; + uint32_t ActiveLevelVar; + uint32_t MaxActiveLevelsVar; + uint32_t RunSchedVar; + uint32_t RunSchedChunkVar; + + bool operator==(const ICVStateTy &Other) const; + + void assertEqual(const ICVStateTy &Other) const; +}; + +bool ICVStateTy::operator==(const ICVStateTy &Other) const { + return (NThreadsVar == Other.NThreadsVar) && (LevelVar == Other.LevelVar) && + (ActiveLevelVar == Other.ActiveLevelVar) && + (MaxActiveLevelsVar == Other.MaxActiveLevelsVar) && + (RunSchedVar == Other.RunSchedVar) && + (RunSchedChunkVar == Other.RunSchedChunkVar); +} + +void ICVStateTy::assertEqual(const ICVStateTy &Other) const { + ASSERT(*this == Other); +} + +struct TeamStateTy { + /// TODO: provide a proper init function. + void init(bool IsSPMD); + + bool operator==(const TeamStateTy &) const; + + void assertEqual(TeamStateTy &Other) const; + + /// ICVs + /// + /// Preallocated storage for ICV values that are used if the threads have not + /// set a custom default. The latter is supported but unlikely and slow(er). 
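+  /// For illustration: a worker that calls omp_set_num_threads(4) gets a
+  /// lazily allocated ThreadStateTy holding NThreadsVar = 4, while threads
+  /// that never modify an ICV keep reading the defaults stored here.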
+ /// + ///{ + ICVStateTy ICVState; + ///} + + uint32_t ParallelTeamSize; + ParallelRegionFnTy ParallelRegionFnVar; +}; + +TeamStateTy SHARED(TeamState); + +void TeamStateTy::init(bool IsSPMD) { + if (IsSPMD) { + ICVState.NThreadsVar = 1; + ICVState.LevelVar = 1; + ICVState.ActiveLevelVar = 1; + ICVState.MaxActiveLevelsVar = 1; + ICVState.RunSchedVar = omp_sched_static; + ICVState.RunSchedChunkVar = 1; + ParallelTeamSize = mapping::getBlockSize(); + } else { + // In non-SPMD mode, the last warp only contains one thread, the master + // thread. + ICVState.NThreadsVar = + (mapping::getNumberOfWarpsInBlock() - 1) * mapping::getWarpSize(); + ICVState.LevelVar = 0; + ICVState.ActiveLevelVar = 0; + ICVState.MaxActiveLevelsVar = 1; + ICVState.RunSchedVar = omp_sched_static; + ICVState.RunSchedChunkVar = 1; + ParallelTeamSize = 1; + } +} + +bool TeamStateTy::operator==(const TeamStateTy &Other) const { + return ICVState == Other.ICVState && + ParallelTeamSize == Other.ParallelTeamSize; +} + +void TeamStateTy::assertEqual(TeamStateTy &Other) const { + ASSERT(*this == Other); +} + +struct ThreadStateTy { + + /// ICVs have preallocated storage in the TeamStateTy which is used if a + /// thread has not set a custom value. The latter is supported but unlikely. + /// When it happens we will allocate dynamic memory to hold the values of all + /// ICVs. Thus, the first time an ICV is set by a thread we will allocate an + /// ICV struct to hold them all. This is slower than alternatives but allows + /// users to pay only for what they use. + /// + ICVStateTy ICVState; + + ThreadStateTy *PreviousThreadState; + + void init() { + ICVState = TeamState.ICVState; + PreviousThreadState = nullptr; + } + + void init(ThreadStateTy &PreviousTS) { + ICVState = PreviousTS.ICVState; + PreviousThreadState = &PreviousTS; + } +}; + +__attribute__((loader_uninitialized)) +ThreadStateTy *ThreadStates[mapping::MaxThreadsPerTeam]; +#pragma omp allocate(ThreadStates) allocator(omp_pteam_mem_alloc) + +uint32_t &lookupForModify32Impl(uint32_t ICVStateTy::*Var) { + if (mapping::isMainThreadInGenericMode()) + return TeamState.ICVState.*Var; + uint32_t TId = mapping::getThreadIdInBlock(); + if (!ThreadStates[TId]) { + ThreadStates[TId] = reinterpret_cast(memory::allocGlobal( + sizeof(ThreadStateTy), "ICV modification outside data environment")); + ThreadStates[TId]->init(); + } + return ThreadStates[TId]->ICVState.*Var; +} + +uint32_t &lookup32Impl(uint32_t ICVStateTy::*Var) { + if (mapping::isMainThreadInGenericMode()) + return TeamState.ICVState.*Var; + uint32_t TId = mapping::getThreadIdInBlock(); + if (ThreadStates[TId]) + return ThreadStates[TId]->ICVState.*Var; + return TeamState.ICVState.*Var; +} +uint64_t &lookup64Impl(uint64_t ICVStateTy::*Var) { + if (mapping::isMainThreadInGenericMode()) + return TeamState.ICVState.*Var; + uint64_t TId = mapping::getThreadIdInBlock(); + if (ThreadStates[TId]) + return ThreadStates[TId]->ICVState.*Var; + return TeamState.ICVState.*Var; +} + +int returnValIfLevelIsActive(int Level, int Val, int DefaultVal, + int OutOfBoundsVal = -1) { + if (Level == 0) + return DefaultVal; + int LevelVar = omp_get_level(); + if (Level < 0 || Level > LevelVar) + return OutOfBoundsVal; + int ActiveLevel = icv::ActiveLevel; + if (Level != ActiveLevel) + return DefaultVal; + return Val; +} + +#pragma omp end declare target + +} // namespace + +#pragma omp declare target + +uint32_t &state::lookup32(ValueKind Kind, bool IsReadonly) { + switch (Kind) { + case state::VK_NThreads: + if (IsReadonly) + return 
lookup32Impl(&ICVStateTy::NThreadsVar); + return lookupForModify32Impl(&ICVStateTy::NThreadsVar); + case state::VK_Level: + if (IsReadonly) + return lookup32Impl(&ICVStateTy::LevelVar); + return lookupForModify32Impl(&ICVStateTy::LevelVar); + case state::VK_ActiveLevel: + if (IsReadonly) + return lookup32Impl(&ICVStateTy::ActiveLevelVar); + return lookupForModify32Impl(&ICVStateTy::ActiveLevelVar); + case state::VK_MaxActiveLevels: + if (IsReadonly) + return lookup32Impl(&ICVStateTy::MaxActiveLevelsVar); + return lookupForModify32Impl(&ICVStateTy::MaxActiveLevelsVar); + case state::VK_RunSched: + if (IsReadonly) + return lookup32Impl(&ICVStateTy::RunSchedVar); + return lookupForModify32Impl(&ICVStateTy::RunSchedVar); + case state::VK_RunSchedChunk: + if (IsReadonly) + return lookup32Impl(&ICVStateTy::RunSchedChunkVar); + return lookupForModify32Impl(&ICVStateTy::RunSchedChunkVar); + case state::VK_ParallelTeamSize: + return TeamState.ParallelTeamSize; + default: + break; + } + __builtin_unreachable(); +} + +void *&state::lookupPtr(ValueKind Kind, bool IsReadonly) { + switch (Kind) { + case state::VK_ParallelRegionFn: + return TeamState.ParallelRegionFnVar; + default: + break; + } + __builtin_unreachable(); +} + +void state::init(bool IsSPMD) { + SharedMemorySmartStack.init(IsSPMD); + TeamState.init(IsSPMD); + + if (IsSPMD) { + ThreadStates[mapping::getThreadIdInBlock()] = nullptr; + } else { + for (uint32_t i = 0; i < mapping::getBlockSize(); ++i) + ThreadStates[i] = nullptr; + } +} + +void state::enterDataEnvironment() { + unsigned TId = mapping::getThreadIdInBlock(); + + ThreadStateTy *NewThreadState = + static_cast(__kmpc_alloc_shared(sizeof(ThreadStateTy))); + NewThreadState->init(*ThreadStates[TId]); + ThreadStates[TId] = NewThreadState; +} + +void state::exitDataEnvironment() { + // assert(ThreadStates[TId] && "exptected thread state"); + resetStateForThread(); +} + +void state::resetStateForThread() { + unsigned TId = mapping::getThreadIdInBlock(); + if (!ThreadStates[TId]) + return; + + ThreadStateTy *PreviousThreadState = ThreadStates[TId]->PreviousThreadState; + __kmpc_free_shared(ThreadStates[TId]); + ThreadStates[TId] = PreviousThreadState; +} + +void state::runAndCheckState(void(Func(void))) { + TeamStateTy OldTeamState = TeamState; + + Func(); + + ASSERT(OldTeamState == TeamState); +} + +void state::assumeInitialState(bool IsSPMD) { + TeamStateTy InitialTeamState; + InitialTeamState.init(IsSPMD); + ASSERT(InitialTeamState == TeamState); + + if (IsSPMD) + ASSERT(ThreadStates[mapping::getThreadIdInBlock()] == nullptr); + else + for (unsigned I = 0; I < mapping::getBlockSize(); ++I) + ASSERT(ThreadStates[I] == nullptr); +} + +extern "C" { +void omp_set_dynamic(int V) {} + +int omp_get_dynamic(void) { return 0; } + +void omp_set_num_threads(int V) { icv::NThreads = V; } + +int omp_get_max_threads(void) { return icv::NThreads; } + +int omp_get_level(void) { + int LevelVar = icv::Level; + __builtin_assume(LevelVar >= 0); + return LevelVar; +} + +int omp_get_active_level(void) { return !!icv::ActiveLevel; } + +int omp_in_parallel(void) { return !!icv::ActiveLevel; } + +void omp_get_schedule(omp_sched_t *ScheduleKind, int *ChunkSize) { + *ScheduleKind = static_cast((int)icv::RunSched); + *ChunkSize = state::RunSchedChunk; +} + +void omp_set_schedule(omp_sched_t ScheduleKind, int ChunkSize) { + icv::RunSched = (int)ScheduleKind; + state::RunSchedChunk = ChunkSize; +} + +int omp_get_ancestor_thread_num(int Level) { + return returnValIfLevelIsActive(Level, mapping::getThreadIdInBlock(), 
0); +} + +int omp_get_thread_num(void) { + return omp_get_ancestor_thread_num(omp_get_level()); +} + +int omp_get_team_size(int Level) { + return returnValIfLevelIsActive(Level, state::ParallelTeamSize, 1); +} + +int omp_get_num_threads(void) { return state::ParallelTeamSize; } + +int omp_get_thread_limit(void) { return mapping::getKernelSize(); } + +int omp_get_num_procs(void) { return mapping::getNumberOfProcessorElements(); } + +void omp_set_nested(int) {} + +int omp_get_nested(void) { return false; } + +void omp_set_max_active_levels(int Levels) { + icv::MaxActiveLevels = Levels > 0 ? 1 : 0; +} + +int omp_get_max_active_levels(void) { return icv::MaxActiveLevels; } + +omp_proc_bind_t omp_get_proc_bind(void) { return omp_proc_bind_false; } + +int omp_get_num_places(void) { return 0; } + +int omp_get_place_num_procs(int) { return omp_get_num_procs(); } + +void omp_get_place_proc_ids(int, int *) { + // TODO +} + +int omp_get_place_num(void) { return 0; } + +int omp_get_partition_num_places(void) { return 0; } + +void omp_get_partition_place_nums(int *) { + // TODO +} + +int omp_get_cancellation(void) { return 0; } + +void omp_set_default_device(int) {} + +int omp_get_default_device(void) { return -1; } + +int omp_get_num_devices(void) { return config::getNumDevices(); } + +int omp_get_num_teams(void) { return mapping::getNumberOfBlocks(); } + +int omp_get_team_num() { return mapping::getBlockId(); } + +int omp_get_initial_device(void) { return -1; } +} + +extern "C" { +// TODO: The noinline is a workaround until we run OpenMP opt before the +// inliner. +__attribute__((noinline)) void *__kmpc_alloc_shared(uint64_t Bytes) { + return memory::allocShared(Bytes, "Frontend alloc shared"); +} + +// TODO: The noinline is a workaround until we run OpenMP opt before the +// inliner. +__attribute__((noinline)) void __kmpc_free_shared(void *Ptr) { + memory::freeShared(Ptr, "Frontend free shared"); +} + +/// The shared variable used by the main thread to communicate with the workers. +/// It will contain the location of the memory allocated for the actually shared +/// values. +/// +/// Workaround until the interface is changed. +static void **SHARED(GlobalArgsPtr); + +void __kmpc_begin_sharing_variables(void ***GlobalArgs, uint64_t NumArgs) { + // TODO: To mimic the old behavior we allocate in `sizeof(void*)` chunks. We + // should pass the required bytes instead. + *GlobalArgs = GlobalArgsPtr = static_cast( + __kmpc_alloc_shared(NumArgs * sizeof(GlobalArgsPtr[0]))); +} + +void __kmpc_end_sharing_variables() { + --icv::Level; + bool IsActiveParallelRegion = state::ParallelTeamSize > 1; + if (IsActiveParallelRegion) + icv::ActiveLevel = 0; + + state::ParallelTeamSize = 1; + + __kmpc_free_shared(GlobalArgsPtr); +} + +void __kmpc_get_shared_variables(void ***GlobalArgs) { + *GlobalArgs = GlobalArgsPtr; +} +} +#pragma omp end declare target diff --git a/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/src/Synchronization.cpp @@ -0,0 +1,315 @@ +//===- Synchronization.cpp - OpenMP Device synchronization API ---- c++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Include all synchronization. 
+//
+//===----------------------------------------------------------------------===//
+
+#include "Synchronization.h"
+
+#include "Debug.h"
+#include "Interface.h"
+#include "Mapping.h"
+#include "State.h"
+#include "Types.h"
+#include "Utils.h"
+
+using namespace _OMP;
+
+namespace impl {
+#pragma omp declare target
+
+/// Atomics
+///
+///{
+/// NOTE: This function needs to be implemented by every target.
+uint32_t atomicInc(uint32_t *Address, uint32_t Val, int Ordering);
+
+uint32_t atomicRead(uint32_t *Address, int Ordering) {
+  return __atomic_fetch_add(Address, 0U, __ATOMIC_SEQ_CST);
+}
+
+uint32_t atomicAdd(uint32_t *Address, uint32_t Val, int Ordering) {
+  return __atomic_fetch_add(Address, Val, Ordering);
+}
+uint32_t atomicMax(uint32_t *Address, uint32_t Val, int Ordering) {
+  return __atomic_fetch_max(Address, Val, Ordering);
+}
+
+uint32_t atomicExchange(uint32_t *Address, uint32_t Val, int Ordering) {
+  uint32_t R;
+  __atomic_exchange(Address, &Val, &R, Ordering);
+  return R;
+}
+uint32_t atomicCAS(uint32_t *Address, uint32_t Compare, uint32_t Val,
+                   int Ordering) {
+  (void)__atomic_compare_exchange(Address, &Compare, &Val, false, Ordering,
+                                  Ordering);
+  return Compare;
+}
+
+uint64_t atomicAdd(uint64_t *Address, uint64_t Val, int Ordering) {
+  return __atomic_fetch_add(Address, Val, Ordering);
+}
+///}
+
+/// AMDGCN Implementation
+///
+///{
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+uint32_t atomicInc(uint32_t *Address, uint32_t Val, int Ordering) {
+  return __builtin_amdgcn_atomic_inc32(Address, Val, Ordering, "");
+}
+
+uint32_t SHARED(namedBarrierTracker);
+
+void namedBarrierInit() {
+  // Don't have global ctors, and shared memory is not zero init
+  atomic::store(&namedBarrierTracker, 0u, __ATOMIC_RELEASE);
+}
+
+void namedBarrier() {
+  uint32_t NumThreads = omp_get_num_threads();
+  // assert(NumThreads % 32 == 0);
+
+  uint32_t WarpSize = mapping::getWarpSize();
+  uint32_t NumWaves = NumThreads / WarpSize;
+
+  fence::team(__ATOMIC_ACQUIRE);
+
+  // named barrier implementation for amdgcn.
+  // Uses two 16 bit unsigned counters. One for the number of waves to have
+  // reached the barrier, and one to count how many times the barrier has been
+  // passed. These are packed in a single atomically accessed 32 bit integer.
+  // Low bits for the number of waves, assumed zero before this call.
+  // High bits to count the number of times the barrier has been passed.
+
+  // precondition: NumWaves != 0;
+  // invariant: NumWaves * WarpSize == NumThreads;
+  // precondition: NumWaves < 0xffffu;
+
+  // Increment the low 16 bits once, using the lowest active thread.
+  if (mapping::isLeaderInWarp()) {
+    uint32_t load = atomic::add(&namedBarrierTracker, 1,
+                                __ATOMIC_RELAXED); // commutative
+
+    // Record the number of times the barrier has been passed
+    uint32_t generation = load & 0xffff0000u;
+
+    if ((load & 0x0000ffffu) == (NumWaves - 1)) {
+      // Reached NumWaves in low bits so this is the last wave.
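+      // For illustration, with NumWaves == 4: the first three leader lanes
+      // read back 0, 1 and 2 and spin below; the fourth reads 3 == NumWaves-1
+      // and takes this branch to bump the generation (high bits) and clear
+      // the wave count (low bits).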
+      // Set low bits to zero and increment high bits
+      load += 0x00010000u; // wrap is safe
+      load &= 0xffff0000u; // because bits zeroed second
+
+      // Reset the wave counter and release the waiting waves
+      atomic::store(&namedBarrierTracker, load, __ATOMIC_RELAXED);
+    } else {
+      // more waves still to go, spin until generation counter changes
+      do {
+        __builtin_amdgcn_s_sleep(0);
+        load = atomic::load(&namedBarrierTracker, __ATOMIC_RELAXED);
+      } while ((load & 0xffff0000u) == generation);
+    }
+  }
+  fence::team(__ATOMIC_RELEASE);
+}
+
+void syncWarp(__kmpc_impl_lanemask_t) {
+  // AMDGCN doesn't need to sync threads in a warp
+}
+
+void syncThreads() { __builtin_amdgcn_s_barrier(); }
+
+void fenceTeam(int Ordering) { __builtin_amdgcn_fence(Ordering, "workgroup"); }
+
+void fenceKernel(int Ordering) { __builtin_amdgcn_fence(Ordering, "agent"); }
+
+void fenceSystem(int Ordering) { __builtin_amdgcn_fence(Ordering, ""); }
+
+#pragma omp end declare variant
+///}
+
+/// NVPTX Implementation
+///
+///{
+#pragma omp begin declare variant match(                                      \
+    device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
+
+uint32_t atomicInc(uint32_t *Address, uint32_t Val, int Ordering) {
+  return __nvvm_atom_inc_gen_ui(Address, Val);
+}
+
+void namedBarrierInit() {}
+
+void namedBarrier() {
+  uint32_t NumThreads = omp_get_num_threads();
+  ASSERT(NumThreads % 32 == 0);
+
+  // The named barrier for active parallel threads of a team in an L1 parallel
+  // region to synchronize with each other.
+  int BarrierNo = 7;
+  asm volatile("bar.sync %0, %1;"
+               :
+               : "r"(BarrierNo), "r"(NumThreads)
+               : "memory");
+}
+
+void fenceTeam(int) { __nvvm_membar_cta(); }
+
+void fenceKernel(int) { __nvvm_membar_gl(); }
+
+void fenceSystem(int) { __nvvm_membar_sys(); }
+
+void syncWarp(__kmpc_impl_lanemask_t Mask) { __nvvm_bar_warp_sync(Mask); }
+
+void syncThreads() { __syncthreads(); }
+
+constexpr uint32_t OMP_SPIN = 1000;
+constexpr uint32_t UNSET = 0;
+constexpr uint32_t SET = 1;
+
+// TODO: This seems to hide a bug in the declare variant handling. If it is
+// called before it is defined here the overload won't happen. Investigate
+// later!
+void unsetLock(omp_lock_t *Lock) {
+  (void)atomicExchange((uint32_t *)Lock, UNSET, __ATOMIC_SEQ_CST);
+}
+
+int testLock(omp_lock_t *Lock) {
+  return atomicAdd((uint32_t *)Lock, 0u, __ATOMIC_SEQ_CST);
+}
+
+void initLock(omp_lock_t *Lock) { unsetLock(Lock); }
+
+void destroyLock(omp_lock_t *Lock) { unsetLock(Lock); }
+
+void setLock(omp_lock_t *Lock) {
+  // TODO: not sure spinning is a good idea here..
+  while (atomicCAS((uint32_t *)Lock, UNSET, SET, __ATOMIC_SEQ_CST) != UNSET) {
+    int32_t start = __nvvm_read_ptx_sreg_clock();
+    int32_t now;
+    for (;;) {
+      now = __nvvm_read_ptx_sreg_clock();
+      int32_t cycles = now > start ? now - start : now + (0xffffffff - start);
+      if (cycles >= OMP_SPIN * mapping::getBlockId()) {
+        break;
+      }
+    }
+  } // wait for 0 to be the read value
+}
+
+#pragma omp end declare variant
+///}
+
+#pragma omp end declare target
+} // namespace impl
+
+#pragma omp declare target
+
+void synchronize::init(bool IsSPMD) {
+  if (!IsSPMD)
+    impl::namedBarrierInit();
+}
+
+void synchronize::warp(LaneMaskTy Mask) { impl::syncWarp(Mask); }
+
+void synchronize::threads() { impl::syncThreads(); }
+
+void fence::team(int Ordering) { impl::fenceTeam(Ordering); }
+
+void fence::kernel(int Ordering) { impl::fenceKernel(Ordering); }
+
+void fence::system(int Ordering) { impl::fenceSystem(Ordering); }
+
+uint32_t atomic::read(uint32_t *Addr, int Ordering) {
+  return impl::atomicRead(Addr, Ordering);
+}
+
+uint32_t atomic::inc(uint32_t *Addr, uint32_t V, int Ordering) {
+  return impl::atomicInc(Addr, V, Ordering);
+}
+
+uint32_t atomic::add(uint32_t *Addr, uint32_t V, int Ordering) {
+  return impl::atomicAdd(Addr, V, Ordering);
+}
+
+uint64_t atomic::add(uint64_t *Addr, uint64_t V, int Ordering) {
+  return impl::atomicAdd(Addr, V, Ordering);
+}
+
+extern "C" {
+void __kmpc_ordered(IdentTy *Loc, int32_t TId) {}
+
+void __kmpc_end_ordered(IdentTy *Loc, int32_t TId) {}
+
+int32_t __kmpc_cancel_barrier(IdentTy *Loc, int32_t TId) {
+  __kmpc_barrier(Loc, TId);
+  return 0;
+}
+
+void __kmpc_barrier(IdentTy *Loc, int32_t TId) {
+  if (mapping::isMainThreadInGenericMode())
+    return __kmpc_flush(Loc);
+
+  if (mapping::isSPMDMode())
+    return __kmpc_barrier_simple_spmd(Loc, TId);
+
+  impl::namedBarrier();
+}
+
+void __kmpc_barrier_simple_spmd(IdentTy *Loc, int32_t TId) {
+  synchronize::threads();
+}
+
+int32_t __kmpc_master(IdentTy *Loc, int32_t TId) {
+  int WarpSize = mapping::getWarpSize();
+  int BlockSize = mapping::getBlockSize();
+  int MasterId = ((WarpSize - 1) ^ (~0U)) & (BlockSize - 1);
+  return mapping::getThreadIdInBlock() == MasterId;
+}
+
+void __kmpc_end_master(IdentTy *Loc, int32_t TId) {}
+
+int32_t __kmpc_single(IdentTy *Loc, int32_t TId) {
+  return __kmpc_master(Loc, TId);
+}
+
+void __kmpc_end_single(IdentTy *Loc, int32_t TId) {
+  // The barrier is explicitly called.
+}
+
+void __kmpc_flush(IdentTy *Loc) { fence::kernel(__ATOMIC_SEQ_CST); }
+
+__kmpc_impl_lanemask_t __kmpc_warp_active_thread_mask() {
+  return mapping::activemask();
+}
+
+void __kmpc_syncwarp(__kmpc_impl_lanemask_t Mask) { synchronize::warp(Mask); }
+
+void __kmpc_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name) {
+  omp_set_lock(reinterpret_cast<omp_lock_t *>(Name));
+}
+
+void __kmpc_end_critical(IdentTy *Loc, int32_t TId, CriticalNameTy *Name) {
+  omp_unset_lock(reinterpret_cast<omp_lock_t *>(Name));
+}
+
+void omp_init_lock(omp_lock_t *Lock) { impl::initLock(Lock); }
+
+void omp_destroy_lock(omp_lock_t *Lock) { impl::destroyLock(Lock); }
+
+void omp_set_lock(omp_lock_t *Lock) { impl::setLock(Lock); }
+
+void omp_unset_lock(omp_lock_t *Lock) { impl::unsetLock(Lock); }
+
+int omp_test_lock(omp_lock_t *Lock) { return impl::testLock(Lock); }
+} // extern "C"
+
+#pragma omp end declare target
diff --git a/openmp/libomptarget/DeviceRTL/src/Tasking.cpp b/openmp/libomptarget/DeviceRTL/src/Tasking.cpp
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/DeviceRTL/src/Tasking.cpp
@@ -0,0 +1,104 @@
+//===-------- Tasking.cpp - NVPTX OpenMP tasks support ------------ C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Task implementation support. +// +// TODO: We should not allocate and execute the task in two steps. A new API is +// needed for that though. +// +//===----------------------------------------------------------------------===// + +#include "Interface.h" +#include "State.h" +#include "Types.h" +#include "Utils.h" + +using namespace _OMP; + +#pragma omp declare target + +TaskDescriptorTy *__kmpc_omp_task_alloc(IdentTy *, uint32_t, int32_t, + uint64_t TaskSizeInclPrivateValues, + uint64_t SharedValuesSize, + TaskFnTy TaskFn) { + auto TaskSizeInclPrivateValuesPadded = + utils::roundUp(TaskSizeInclPrivateValues, uint64_t(sizeof(void *))); + auto TaskSizeTotal = TaskSizeInclPrivateValuesPadded + SharedValuesSize; + TaskDescriptorTy *TaskDescriptor = (TaskDescriptorTy *)memory::allocGlobal( + TaskSizeTotal, "explicit task descriptor"); + TaskDescriptor->Payload = + utils::advance(TaskDescriptor, TaskSizeInclPrivateValuesPadded); + TaskDescriptor->TaskFn = TaskFn; + + return TaskDescriptor; +} + +int32_t __kmpc_omp_task(IdentTy *Loc, uint32_t TId, + TaskDescriptorTy *TaskDescriptor) { + return __kmpc_omp_task_with_deps(Loc, TId, TaskDescriptor, 0, 0, 0, 0); +} + +int32_t __kmpc_omp_task_with_deps(IdentTy *Loc, uint32_t TId, + TaskDescriptorTy *TaskDescriptor, int32_t, + void *, int32_t, void *) { + state::DateEnvironmentRAII DERAII; + + TaskDescriptor->TaskFn(0, TaskDescriptor); + + memory::freeGlobal(TaskDescriptor, "explicit task descriptor"); + return 0; +} + +void __kmpc_omp_task_begin_if0(IdentTy *Loc, uint32_t TId, + TaskDescriptorTy *TaskDescriptor) { + state::enterDataEnvironment(); +} + +void __kmpc_omp_task_complete_if0(IdentTy *Loc, uint32_t TId, + TaskDescriptorTy *TaskDescriptor) { + state::exitDataEnvironment(); + + memory::freeGlobal(TaskDescriptor, "explicit task descriptor"); +} + +void __kmpc_omp_wait_deps(IdentTy *Loc, uint32_t TId, int32_t, void *, int32_t, + void *) {} + +void __kmpc_taskgroup(IdentTy *Loc, uint32_t TId) {} + +void __kmpc_end_taskgroup(IdentTy *Loc, uint32_t TId) {} + +int32_t __kmpc_omp_taskyield(IdentTy *Loc, uint32_t TId, int) { return 0; } + +int32_t __kmpc_omp_taskwait(IdentTy *Loc, uint32_t TId) { return 0; } + +void __kmpc_taskloop(IdentTy *Loc, uint32_t TId, + TaskDescriptorTy *TaskDescriptor, int, + uint64_t *LowerBound, uint64_t *UpperBound, int64_t, int, + int32_t, uint64_t, void *) { + // Skip task entirely if empty iteration space. + if (*LowerBound > *UpperBound) + return; + + // The compiler has already stored lb and ub in the TaskDescriptorTy structure + // as we are using a single task to execute the entire loop, we can leave + // the initial task_t untouched + __kmpc_omp_task_with_deps(Loc, TId, TaskDescriptor, 0, 0, 0, 0); +} + +int omp_in_final(void) { + // treat all tasks as final... Specs may expect runtime to keep + // track more precisely if a task was actively set by users... This + // is not explicitly specified; will treat as if runtime can + // actively decide to put a non-final task into a final one. 
+ return 1; +} + +int omp_get_max_task_priority(void) { return 0; } + +#pragma omp end declare target diff --git a/openmp/libomptarget/DeviceRTL/src/Utils.cpp b/openmp/libomptarget/DeviceRTL/src/Utils.cpp new file mode 100644 --- /dev/null +++ b/openmp/libomptarget/DeviceRTL/src/Utils.cpp @@ -0,0 +1,133 @@ +//===------- Utils.cpp - OpenMP device runtime utility functions -- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#include "Utils.h" + +#include "Mapping.h" + +using namespace _OMP; + +namespace impl { + +/// AMDGCN Implementation +/// +///{ +#pragma omp begin declare variant match(device = {arch(amdgcn)}) + +void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) { + *LowBits = (uint32_t)(Val & UINT64_C(0x00000000FFFFFFFF)); + *HighBits = (uint32_t)((Val & UINT64_C(0xFFFFFFFF00000000)) >> 32); +} + +uint64_t Pack(uint32_t LowBits, uint32_t HighBits) { + return (((uint64_t)HighBits) << 32) | (uint64_t)LowBits; +} + +#pragma omp end declare variant + +/// NVPTX Implementation +/// +///{ +#pragma omp begin declare variant match( \ + device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)}) + +void Unpack(uint64_t Val, uint32_t *LowBits, uint32_t *HighBits) { + uint32_t LowBitsLocal, HighBitsLocal; + asm("mov.b64 {%0,%1}, %2;" + : "=r"(LowBitsLocal), "=r"(HighBitsLocal) + : "l"(Val)); + *LowBits = LowBitsLocal; + *HighBits = HighBitsLocal; +} + +uint64_t Pack(uint32_t LowBits, uint32_t HighBits) { + uint64_t Val; + asm("mov.b64 %0, {%1,%2};" : "=l"(Val) : "r"(LowBits), "r"(HighBits)); + return Val; +} + +#pragma omp end declare variant + +/// AMDGCN Implementation +/// +///{ +#pragma omp begin declare variant match(device = {arch(amdgcn)}) + +int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) { + int Width = GetWarpSize(); + int Self = GetLaneId(); + int Index = SrcLane + (Self & ~(Width - 1)); + return __builtin_amdgcn_ds_bpermute(Index << 2, Var); +} + +int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t LaneDelta, + int32_t Width) { + int Self = GetLaneId(); + int Index = Self + LaneDelta; + Index = (int)(LaneDelta + (Self & (Width - 1))) >= Width ? 
+          Self : Index;
+  return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
+}
+
+#pragma omp end declare variant
+///}
+
+/// NVPTX Implementation
+///
+///{
+#pragma omp begin declare variant match(                                      \
+    device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
+
+int32_t shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) {
+  return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, 0x1f);
+}
+
+int32_t shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta, int32_t Width) {
+  int32_t T = ((mapping::getWarpSize() - Width) << 8) | 0x1f;
+  return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, T);
+}
+
+#pragma omp end declare variant
+} // namespace impl
+
+#pragma omp declare target
+
+uint64_t utils::pack(uint32_t LowBits, uint32_t HighBits) {
+  return impl::Pack(LowBits, HighBits);
+}
+
+void utils::unpack(uint64_t Val, uint32_t &LowBits, uint32_t &HighBits) {
+  impl::Unpack(Val, &LowBits, &HighBits);
+}
+
+int32_t utils::shuffle(uint64_t Mask, int32_t Var, int32_t SrcLane) {
+  return impl::shuffle(Mask, Var, SrcLane);
+}
+
+int32_t utils::shuffleDown(uint64_t Mask, int32_t Var, uint32_t Delta,
+                           int32_t Width) {
+  return impl::shuffleDown(Mask, Var, Delta, Width);
+}
+
+extern "C" {
+int32_t __kmpc_shuffle_int32(int32_t Val, int16_t Delta, int16_t SrcLane) {
+  return impl::shuffleDown(lanes::All, Val, Delta, SrcLane);
+}
+
+int64_t __kmpc_shuffle_int64(int64_t Val, int16_t Delta, int16_t Width) {
+  uint32_t lo, hi;
+  utils::unpack(Val, lo, hi);
+  hi = impl::shuffleDown(lanes::All, hi, Delta, Width);
+  lo = impl::shuffleDown(lanes::All, lo, Delta, Width);
+  return utils::pack(lo, hi);
+}
+}
+
+#pragma omp end declare target
diff --git a/openmp/libomptarget/DeviceRTL/src/Workshare.cpp b/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/DeviceRTL/src/Workshare.cpp
@@ -0,0 +1,769 @@
+//===----- Workshare.cpp - OpenMP workshare implementation ------ C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of the KMPC interface
+// for the loop construct plus other worksharing constructs that use the same
+// interface as loops.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Debug.h"
+#include "Interface.h"
+#include "Mapping.h"
+#include "State.h"
+#include "Synchronization.h"
+#include "Types.h"
+#include "Utils.h"
+
+using namespace _OMP;
+
+// TODO:
+struct DynamicScheduleTracker {
+  int64_t Chunk;
+  int64_t LoopUpperBound;
+  int64_t NextLowerBound;
+  int64_t Stride;
+  kmp_sched_t ScheduleType;
+  DynamicScheduleTracker *NextDST;
+};
+
+#define PRINT(...)
+#define PRINT0(...)
+#define ASSERT0(...)
+
+// used by the library for the interface with the app
+#define DISPATCH_FINISHED 0
+#define DISPATCH_NOTFINISHED 1
+
+// used by dynamic scheduling
+#define FINISHED 0
+#define NOT_FINISHED 1
+#define LAST_CHUNK 2
+
+#pragma omp declare target
+
+// TODO: This variable is a hack inherited from the old runtime.
+uint64_t SHARED(Cnt);
+
+template <typename T, typename ST> struct omptarget_nvptx_LoopSupport {
+  ////////////////////////////////////////////////////////////////////////////
+  // Loop with static scheduling with chunk
+
+  // Generic implementation of OMP loop scheduling with static policy
+  /*!
\brief Calculate initial bounds for static loop and stride + * @param[in] loc location in code of the call (not used here) + * @param[in] global_tid global thread id + * @param[in] schetype type of scheduling (see omptarget-nvptx.h) + * @param[in] plastiter pointer to last iteration + * @param[in,out] pointer to loop lower bound. it will contain value of + * lower bound of first chunk + * @param[in,out] pointer to loop upper bound. It will contain value of + * upper bound of first chunk + * @param[in,out] pointer to loop stride. It will contain value of stride + * between two successive chunks executed by the same thread + * @param[in] loop increment bump + * @param[in] chunk size + */ + + // helper function for static chunk + static void ForStaticChunk(int &last, T &lb, T &ub, ST &stride, ST chunk, + T entityId, T numberOfEntities) { + // each thread executes multiple chunks all of the same size, except + // the last one + // distance between two successive chunks + stride = numberOfEntities * chunk; + lb = lb + entityId * chunk; + T inputUb = ub; + ub = lb + chunk - 1; // Clang uses i <= ub + // Say ub' is the begining of the last chunk. Then who ever has a + // lower bound plus a multiple of the increment equal to ub' is + // the last one. + T beginingLastChunk = inputUb - (inputUb % chunk); + last = ((beginingLastChunk - lb) % stride) == 0; + } + + //////////////////////////////////////////////////////////////////////////////// + // Loop with static scheduling without chunk + + // helper function for static no chunk + static void ForStaticNoChunk(int &last, T &lb, T &ub, ST &stride, ST &chunk, + T entityId, T numberOfEntities) { + // No chunk size specified. Each thread or warp gets at most one + // chunk; chunks are all almost of equal size + T loopSize = ub - lb + 1; + + chunk = loopSize / numberOfEntities; + T leftOver = loopSize - chunk * numberOfEntities; + + if (entityId < leftOver) { + chunk++; + lb = lb + entityId * chunk; + } else { + lb = lb + entityId * chunk + leftOver; + } + + T inputUb = ub; + ub = lb + chunk - 1; // Clang uses i <= ub + last = lb <= inputUb && inputUb <= ub; + stride = loopSize; // make sure we only do 1 chunk per warp + } + + //////////////////////////////////////////////////////////////////////////////// + // Support for Static Init + + static void for_static_init(int32_t gtid, int32_t schedtype, + int32_t *plastiter, T *plower, T *pupper, + ST *pstride, ST chunk, bool IsSPMDExecutionMode) { + // When IsRuntimeUninitialized is true, we assume that the caller is + // in an L0 parallel region and that all worker threads participate. 
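+    // For illustration of the helper above (ForStaticChunk): a loop with
+    // lb = 0, ub = 99, chunk = 10 and 4 threads gives thread 1 the bounds
+    // lb = 10, ub = 19 with stride = 40 (it later also runs the chunks
+    // starting at 50 and 90), and last = 1 because it owns the final chunk
+    // starting at 90.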
+ + // Assume we are in teams region or that we use a single block + // per target region + int numberOfActiveOMPThreads = omp_get_num_threads(); + + // All warps that are in excess of the maximum requested, do + // not execute the loop + PRINT(LD_LOOP, + "OMP Thread %d: schedule type %d, chunk size = %lld, mytid " + "%d, num tids %d\n", + (int)gtid, (int)schedtype, (long long)chunk, (int)gtid, + (int)numberOfActiveOMPThreads); + ASSERT0(LT_FUSSY, gtid < numberOfActiveOMPThreads, + "current thread is not needed here; error"); + + // copy + int lastiter = 0; + T lb = *plower; + T ub = *pupper; + ST stride = *pstride; + + // init + switch (SCHEDULE_WITHOUT_MODIFIERS(schedtype)) { + case kmp_sched_static_chunk: { + if (chunk > 0) { + ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid, + numberOfActiveOMPThreads); + break; + } + } // note: if chunk <=0, use nochunk + case kmp_sched_static_balanced_chunk: { + if (chunk > 0) { + // round up to make sure the chunk is enough to cover all iterations + T tripCount = ub - lb + 1; // +1 because ub is inclusive + T span = (tripCount + numberOfActiveOMPThreads - 1) / + numberOfActiveOMPThreads; + // perform chunk adjustment + chunk = (span + chunk - 1) & ~(chunk - 1); + + ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb."); + T oldUb = ub; + ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid, + numberOfActiveOMPThreads); + if (ub > oldUb) + ub = oldUb; + break; + } + } // note: if chunk <=0, use nochunk + case kmp_sched_static_nochunk: { + ForStaticNoChunk(lastiter, lb, ub, stride, chunk, gtid, + numberOfActiveOMPThreads); + break; + } + case kmp_sched_distr_static_chunk: { + if (chunk > 0) { + ForStaticChunk(lastiter, lb, ub, stride, chunk, omp_get_team_num(), + omp_get_num_teams()); + break; + } // note: if chunk <=0, use nochunk + } + case kmp_sched_distr_static_nochunk: { + ForStaticNoChunk(lastiter, lb, ub, stride, chunk, omp_get_team_num(), + omp_get_num_teams()); + break; + } + case kmp_sched_distr_static_chunk_sched_static_chunkone: { + ForStaticChunk(lastiter, lb, ub, stride, chunk, + numberOfActiveOMPThreads * omp_get_team_num() + gtid, + omp_get_num_teams() * numberOfActiveOMPThreads); + break; + } + default: { + // ASSERT(LT_FUSSY, 0, "unknown schedtype %d", (int)schedtype); + PRINT(LD_LOOP, "unknown schedtype %d, revert back to static chunk\n", + (int)schedtype); + ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid, + numberOfActiveOMPThreads); + break; + } + } + // copy back + *plastiter = lastiter; + *plower = lb; + *pupper = ub; + *pstride = stride; + PRINT(LD_LOOP, + "Got sched: Active %d, total %d: lb %lld, ub %lld, stride %lld, last " + "%d\n", + (int)numberOfActiveOMPThreads, (int)GetNumberOfWorkersInTeam(), + (long long)(*plower), (long long)(*pupper), (long long)(*pstride), + (int)lastiter); + } + + //////////////////////////////////////////////////////////////////////////////// + // Support for dispatch Init + + static int OrderedSchedule(kmp_sched_t schedule) { + return schedule >= kmp_sched_ordered_first && + schedule <= kmp_sched_ordered_last; + } + + static void dispatch_init(IdentTy *loc, int32_t threadId, + kmp_sched_t schedule, T lb, T ub, ST st, ST chunk, + DynamicScheduleTracker *DST) { + int tid = mapping::getThreadIdInBlock(); + T tnum = omp_get_num_threads(); + T tripCount = ub - lb + 1; // +1 because ub is inclusive + ASSERT0(LT_FUSSY, threadId < tnum, + "current thread is not needed here; error"); + + /* Currently just ignore the monotonic and non-monotonic modifiers + * (the compiler isn't producing them * yet 
anyway). + * When it is we'll want to look at them somewhere here and use that + * information to add to our schedule choice. We shouldn't need to pass + * them on, they merely affect which schedule we can legally choose for + * various dynamic cases. (In particular, whether or not a stealing scheme + * is legal). + */ + schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); + + // Process schedule. + if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) { + if (OrderedSchedule(schedule)) + __kmpc_barrier(loc, threadId); + PRINT(LD_LOOP, + "go sequential as tnum=%ld, trip count %lld, ordered sched=%d\n", + (long)tnum, (long long)tripCount, (int)schedule); + schedule = kmp_sched_static_chunk; + chunk = tripCount; // one thread gets the whole loop + } else if (schedule == kmp_sched_runtime) { + // process runtime + omp_sched_t rtSched; + int ChunkInt; + omp_get_schedule(&rtSched, &ChunkInt); + chunk = ChunkInt; + switch (rtSched) { + case omp_sched_static: { + if (chunk > 0) + schedule = kmp_sched_static_chunk; + else + schedule = kmp_sched_static_nochunk; + break; + } + case omp_sched_auto: { + schedule = kmp_sched_static_chunk; + chunk = 1; + break; + } + case omp_sched_dynamic: + case omp_sched_guided: { + schedule = kmp_sched_dynamic; + break; + } + } + PRINT(LD_LOOP, "Runtime sched is %d with chunk %lld\n", (int)schedule, + (long long)chunk); + } else if (schedule == kmp_sched_auto) { + schedule = kmp_sched_static_chunk; + chunk = 1; + PRINT(LD_LOOP, "Auto sched is %d with chunk %lld\n", (int)schedule, + (long long)chunk); + } else { + PRINT(LD_LOOP, "Dyn sched is %d with chunk %lld\n", (int)schedule, + (long long)chunk); + // ASSERT(LT_FUSSY, + // schedule == kmp_sched_dynamic || schedule == kmp_sched_guided, + // "unknown schedule %d & chunk %lld\n", (int)schedule, + // (long long)chunk); + } + + // init schedules + if (schedule == kmp_sched_static_chunk) { + ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value"); + // save sched state + DST->ScheduleType = schedule; + // save ub + DST->LoopUpperBound = ub; + // compute static chunk + ST stride; + int lastiter = 0; + ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum); + // save computed params + DST->Chunk = chunk; + DST->NextLowerBound = lb; + DST->Stride = stride; + PRINT(LD_LOOP, + "dispatch init (static chunk) : num threads = %d, ub = %" PRId64 + ", next lower bound = %llu, stride = %llu\n", + (int)tnum, DST->LoopUpperBound, + (unsigned long long)DST->NextLowerBound, + (unsigned long long)DST->Stride); + } else if (schedule == kmp_sched_static_balanced_chunk) { + ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value"); + // save sched state + DST->ScheduleType = schedule; + // save ub + DST->LoopUpperBound = ub; + // compute static chunk + ST stride; + int lastiter = 0; + // round up to make sure the chunk is enough to cover all iterations + T span = (tripCount + tnum - 1) / tnum; + // perform chunk adjustment + chunk = (span + chunk - 1) & ~(chunk - 1); + + T oldUb = ub; + ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum); + ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb."); + if (ub > oldUb) + ub = oldUb; + // save computed params + DST->Chunk = chunk; + DST->NextLowerBound = lb; + DST->Stride = stride; + PRINT(LD_LOOP, + "dispatch init (static chunk) : num threads = %d, ub = %" PRId64 + ", next lower bound = %llu, stride = %llu\n", + (int)tnum, DST->LoopUpperBound, + (unsigned long long)DST->NextLowerBound, + (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride( + tid)); + } else if (schedule == 
+      ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value");
+      // save sched state
+      DST->ScheduleType = schedule;
+      // save ub
+      DST->LoopUpperBound = ub;
+      // compute static chunk
+      ST stride;
+      int lastiter = 0;
+      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
+      // save computed params
+      DST->Chunk = chunk;
+      DST->NextLowerBound = lb;
+      DST->Stride = stride;
+      PRINT(LD_LOOP,
+            "dispatch init (static nochunk) : num threads = %d, ub = %" PRId64
+            ", next lower bound = %llu, stride = %llu\n",
+            (int)tnum, DST->LoopUpperBound,
+            (unsigned long long)DST->NextLowerBound,
+            (unsigned long long)DST->Stride);
+    } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
+      // save data
+      DST->ScheduleType = schedule;
+      if (chunk < 1)
+        chunk = 1;
+      DST->Chunk = chunk;
+      DST->LoopUpperBound = ub;
+      DST->NextLowerBound = lb;
+      __kmpc_barrier(loc, threadId);
+      if (tid == 0) {
+        Cnt = 0;
+        fence::team(__ATOMIC_SEQ_CST);
+      }
+      __kmpc_barrier(loc, threadId);
+      PRINT(LD_LOOP,
+            "dispatch init (dyn) : num threads = %d, lb = %llu, ub = %" PRId64
+            ", chunk %" PRIu64 "\n",
+            (int)tnum, (unsigned long long)DST->NextLowerBound,
+            DST->LoopUpperBound, DST->Chunk);
+    }
+  }
+
+  //////////////////////////////////////////////////////////////////////////////
+  // Support for dispatch next
+
+  // Warp-aggregated increment of the shared iteration counter Cnt: the first
+  // active lane performs a single atomic add of the number of active lanes,
+  // and every lane derives its own iteration index from the leader's result
+  // plus its rank among the active lanes.
+  static uint64_t NextIter() {
+    __kmpc_impl_lanemask_t active = mapping::activemask();
+    uint32_t leader = utils::ffs(active) - 1;
+    uint32_t change = utils::popc(active);
+    __kmpc_impl_lanemask_t lane_mask_lt = mapping::lanemaskLT();
+    unsigned int rank = utils::popc(active & lane_mask_lt);
+    uint64_t warp_res;
+    if (rank == 0) {
+      warp_res = atomic::add(&Cnt, change, __ATOMIC_SEQ_CST);
+    }
+    warp_res = utils::shuffle(active, warp_res, leader);
+    return warp_res + rank;
+  }
+
+  static int DynamicNextChunk(T &lb, T &ub, T chunkSize, T loopLowerBound,
+                              T loopUpperBound) {
+    T N = NextIter();
+    lb = loopLowerBound + N * chunkSize;
+    ub = lb + chunkSize - 1; // Clang uses i <= ub
+
+    // 3 result cases:
+    //  a. lb and ub < loopUpperBound --> NOT_FINISHED
+    //  b. lb < loopUpperBound and ub >= loopUpperBound: last chunk -->
+    //     LAST_CHUNK
+    //  c. lb and ub >= loopUpperBound: empty chunk --> FINISHED
+    // a.
+    if (lb <= loopUpperBound && ub < loopUpperBound) {
+      PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; not finished\n",
+            (long long)lb, (long long)ub, (long long)loopUpperBound);
+      return NOT_FINISHED;
+    }
+    // b.
+    if (lb <= loopUpperBound) {
+      PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; clip to loop ub\n",
+            (long long)lb, (long long)ub, (long long)loopUpperBound);
+      ub = loopUpperBound;
+      return LAST_CHUNK;
+    }
+    // c. if we are here, we are in case 'c': return an empty chunk
+    lb = loopUpperBound + 2;
+    ub = loopUpperBound + 1;
+    PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; finished\n", (long long)lb,
+          (long long)ub, (long long)loopUpperBound);
+    return FINISHED;
+  }
+
+  static int dispatch_next(IdentTy *loc, int32_t gtid, int32_t *plast,
+                           T *plower, T *pupper, ST *pstride,
+                           DynamicScheduleTracker *DST) {
+    ASSERT0(LT_FUSSY, gtid < omp_get_num_threads(),
+            "current thread is not needed here; error");
+    // retrieve schedule
+    kmp_sched_t schedule = DST->ScheduleType;
+
+    // xxx reduce to one
+    if (schedule == kmp_sched_static_chunk ||
+        schedule == kmp_sched_static_nochunk) {
+      T myLb = DST->NextLowerBound;
+      T ub = DST->LoopUpperBound;
+      // finished?
+      if (myLb > ub) {
+        PRINT(LD_LOOP, "static loop finished with myLb %lld, ub %lld\n",
+              (long long)myLb, (long long)ub);
+        return DISPATCH_FINISHED;
+      }
+      // not finished, save current bounds
+      ST chunk = DST->Chunk;
+      *plower = myLb;
+      T myUb = myLb + chunk - 1; // Clang uses i <= ub
+      if (myUb > ub)
+        myUb = ub;
+      *pupper = myUb;
+      *plast = (int32_t)(myUb == ub);
+
+      // increment next lower bound by the stride
+      ST stride = DST->Stride;
+      DST->NextLowerBound = myLb + stride;
+      PRINT(LD_LOOP, "static loop continues with myLb %lld, myUb %lld\n",
+            (long long)*plower, (long long)*pupper);
+      return DISPATCH_NOTFINISHED;
+    }
+    ASSERT0(LT_FUSSY,
+            schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
+            "bad sched");
+    T myLb, myUb;
+    int finished = DynamicNextChunk(myLb, myUb, DST->Chunk, DST->NextLowerBound,
+                                    DST->LoopUpperBound);
+
+    if (finished == FINISHED)
+      return DISPATCH_FINISHED;
+
+    // not finished (either not finished or last chunk)
+    *plast = (int32_t)(finished == LAST_CHUNK);
+    *plower = myLb;
+    *pupper = myUb;
+    *pstride = 1;
+
+    PRINT(LD_LOOP,
+          "Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld, "
+          "last %d\n",
+          (int)omp_get_num_threads(), (int)GetNumberOfWorkersInTeam(),
+          (long long)*plower, (long long)*pupper, (long long)*pstride,
+          (int)*plast);
+    return DISPATCH_NOTFINISHED;
+  }
+
+  static void dispatch_fini() {
+    // nothing
+  }
+
+  //////////////////////////////////////////////////////////////////////////////
+  // end of template class that encapsulates all the helper functions
+  //////////////////////////////////////////////////////////////////////////////
+};
+
+////////////////////////////////////////////////////////////////////////////////
+// KMP interface implementation (dyn loops)
+////////////////////////////////////////////////////////////////////////////////
+
+// TODO: This is a stopgap. We probably want to expand the dispatch API to take
+// a DST pointer which can then be allocated properly without malloc.
+DynamicScheduleTracker *THREAD_LOCAL(ThreadDSTPtr);
+
+// Create a new DST, link the current one, and define the new as current.
+static DynamicScheduleTracker *pushDST() {
+  DynamicScheduleTracker *NewDST = static_cast<DynamicScheduleTracker *>(
+      memory::allocGlobal(sizeof(DynamicScheduleTracker), "new DST"));
+  *NewDST = DynamicScheduleTracker({0});
+  NewDST->NextDST = ThreadDSTPtr;
+  ThreadDSTPtr = NewDST;
+  return ThreadDSTPtr;
+}
+
+// Return the current DST.
+static DynamicScheduleTracker *peekDST() { return ThreadDSTPtr; }
+
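+// Note (descriptive only): ThreadDSTPtr heads a per-thread LIFO of
+// DynamicScheduleTracker objects linked through NextDST. A nested
+// worksharing loop that reaches __kmpc_dispatch_init_* pushes a fresh
+// tracker with pushDST(); the matching __kmpc_dispatch_fini_* (when the
+// compiler emits it) pops it with popDST(), making the enclosing loop's
+// tracker current again.
+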
+// Pop the current DST and restore the last one.
+static void popDST() {
+  DynamicScheduleTracker *OldDST = ThreadDSTPtr->NextDST;
+  memory::freeGlobal(ThreadDSTPtr, "remove DST");
+  ThreadDSTPtr = OldDST;
+}
+
+extern "C" {
+
+// init
+void __kmpc_dispatch_init_4(IdentTy *loc, int32_t tid, int32_t schedule,
+                            int32_t lb, int32_t ub, int32_t st, int32_t chunk) {
+  PRINT0(LD_IO, "call kmpc_dispatch_init_4\n");
+  DynamicScheduleTracker *DST = pushDST();
+  omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_init(
+      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
+}
+
+void __kmpc_dispatch_init_4u(IdentTy *loc, int32_t tid, int32_t schedule,
+                             uint32_t lb, uint32_t ub, int32_t st,
+                             int32_t chunk) {
+  PRINT0(LD_IO, "call kmpc_dispatch_init_4u\n");
+  DynamicScheduleTracker *DST = pushDST();
+  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_init(
+      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
+}
+
+void __kmpc_dispatch_init_8(IdentTy *loc, int32_t tid, int32_t schedule,
+                            int64_t lb, int64_t ub, int64_t st, int64_t chunk) {
+  PRINT0(LD_IO, "call kmpc_dispatch_init_8\n");
+  DynamicScheduleTracker *DST = pushDST();
+  omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_init(
+      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
+}
+
+void __kmpc_dispatch_init_8u(IdentTy *loc, int32_t tid, int32_t schedule,
+                             uint64_t lb, uint64_t ub, int64_t st,
+                             int64_t chunk) {
+  PRINT0(LD_IO, "call kmpc_dispatch_init_8u\n");
+  DynamicScheduleTracker *DST = pushDST();
+  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_init(
+      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk, DST);
+}
+
+// next
+int __kmpc_dispatch_next_4(IdentTy *loc, int32_t tid, int32_t *p_last,
+                           int32_t *p_lb, int32_t *p_ub, int32_t *p_st) {
+  PRINT0(LD_IO, "call kmpc_dispatch_next_4\n");
+  DynamicScheduleTracker *DST = peekDST();
+  return omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_next(
+      loc, tid, p_last, p_lb, p_ub, p_st, DST);
+}
+
+int __kmpc_dispatch_next_4u(IdentTy *loc, int32_t tid, int32_t *p_last,
+                            uint32_t *p_lb, uint32_t *p_ub, int32_t *p_st) {
+  PRINT0(LD_IO, "call kmpc_dispatch_next_4u\n");
+  DynamicScheduleTracker *DST = peekDST();
+  return omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_next(
+      loc, tid, p_last, p_lb, p_ub, p_st, DST);
+}
+
+int __kmpc_dispatch_next_8(IdentTy *loc, int32_t tid, int32_t *p_last,
+                           int64_t *p_lb, int64_t *p_ub, int64_t *p_st) {
+  PRINT0(LD_IO, "call kmpc_dispatch_next_8\n");
+  DynamicScheduleTracker *DST = peekDST();
+  return omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_next(
+      loc, tid, p_last, p_lb, p_ub, p_st, DST);
+}
+
+int __kmpc_dispatch_next_8u(IdentTy *loc, int32_t tid, int32_t *p_last,
+                            uint64_t *p_lb, uint64_t *p_ub, int64_t *p_st) {
+  PRINT0(LD_IO, "call kmpc_dispatch_next_8u\n");
+  DynamicScheduleTracker *DST = peekDST();
+  return omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_next(
+      loc, tid, p_last, p_lb, p_ub, p_st, DST);
+}
+
+// fini
+void __kmpc_dispatch_fini_4(IdentTy *loc, int32_t tid) {
+  PRINT0(LD_IO, "call kmpc_dispatch_fini_4\n");
+  omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_fini();
+  popDST();
+}
+
+void __kmpc_dispatch_fini_4u(IdentTy *loc, int32_t tid) {
+  PRINT0(LD_IO, "call kmpc_dispatch_fini_4u\n");
+  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_fini();
+  popDST();
+}
+
+void __kmpc_dispatch_fini_8(IdentTy *loc, int32_t tid) {
+  PRINT0(LD_IO, "call kmpc_dispatch_fini_8\n");
+  omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_fini();
+  popDST();
+}
+
+void __kmpc_dispatch_fini_8u(IdentTy *loc, int32_t tid) {
+  PRINT0(LD_IO, "call kmpc_dispatch_fini_8u\n");
+  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_fini();
+  popDST();
+}
+
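+// Illustrative only, not part of this patch: compiler-generated code is
+// expected to drive the dynamic dispatch entry points above roughly as
+// follows (local variable names are made up for the example, and whether a
+// fini call is emitted depends on the construct):
+//
+//   int32_t last, lb = 0, ub = N - 1, st = 1;
+//   __kmpc_dispatch_init_4(loc, tid, /*schedule=*/kmp_sched_dynamic, lb, ub,
+//                          st, /*chunk=*/4);
+//   while (__kmpc_dispatch_next_4(loc, tid, &last, &lb, &ub, &st))
+//     for (int32_t i = lb; i <= ub; ++i) // bounds are inclusive
+//       body(i);
+//   __kmpc_dispatch_fini_4(loc, tid);
+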
+////////////////////////////////////////////////////////////////////////////////
+// KMP interface implementation (static loops)
+////////////////////////////////////////////////////////////////////////////////
+
+void __kmpc_for_static_init_4(IdentTy *loc, int32_t global_tid,
+                              int32_t schedtype, int32_t *plastiter,
+                              int32_t *plower, int32_t *pupper,
+                              int32_t *pstride, int32_t incr, int32_t chunk) {
+  PRINT0(LD_IO, "call kmpc_for_static_init_4\n");
+  omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
+      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+      mapping::isSPMDMode());
+}
+
+void __kmpc_for_static_init_4u(IdentTy *loc, int32_t global_tid,
+                               int32_t schedtype, int32_t *plastiter,
+                               uint32_t *plower, uint32_t *pupper,
+                               int32_t *pstride, int32_t incr, int32_t chunk) {
+  PRINT0(LD_IO, "call kmpc_for_static_init_4u\n");
+  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
+      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+      mapping::isSPMDMode());
+}
+
+void __kmpc_for_static_init_8(IdentTy *loc, int32_t global_tid,
+                              int32_t schedtype, int32_t *plastiter,
+                              int64_t *plower, int64_t *pupper,
+                              int64_t *pstride, int64_t incr, int64_t chunk) {
+  PRINT0(LD_IO, "call kmpc_for_static_init_8\n");
+  omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
+      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+      mapping::isSPMDMode());
+}
+
+void __kmpc_for_static_init_8u(IdentTy *loc, int32_t global_tid,
+                               int32_t schedtype, int32_t *plastiter,
+                               uint64_t *plower, uint64_t *pupper,
+                               int64_t *pstride, int64_t incr, int64_t chunk) {
+  PRINT0(LD_IO, "call kmpc_for_static_init_8u\n");
+  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
+      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+      mapping::isSPMDMode());
+}
+
+void __kmpc_for_static_init_4_simple_spmd(IdentTy *loc, int32_t global_tid,
+                                          int32_t schedtype, int32_t *plastiter,
+                                          int32_t *plower, int32_t *pupper,
+                                          int32_t *pstride, int32_t incr,
+                                          int32_t chunk) {
+  PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_spmd\n");
+  omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
+      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+      /*IsSPMDExecutionMode=*/true);
+}
+
+void __kmpc_for_static_init_4u_simple_spmd(IdentTy *loc, int32_t global_tid,
+                                           int32_t schedtype,
+                                           int32_t *plastiter, uint32_t *plower,
+                                           uint32_t *pupper, int32_t *pstride,
+                                           int32_t incr, int32_t chunk) {
+  PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_spmd\n");
+  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
+      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+      /*IsSPMDExecutionMode=*/true);
+}
+
+void __kmpc_for_static_init_8_simple_spmd(IdentTy *loc, int32_t global_tid,
+                                          int32_t schedtype, int32_t *plastiter,
+                                          int64_t *plower, int64_t *pupper,
+                                          int64_t *pstride, int64_t incr,
+                                          int64_t chunk) {
+  PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_spmd\n");
+  omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
+      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+      /*IsSPMDExecutionMode=*/true);
+}
+
+void __kmpc_for_static_init_8u_simple_spmd(IdentTy *loc, int32_t global_tid,
+                                           int32_t schedtype,
+                                           int32_t *plastiter, uint64_t *plower,
+                                           uint64_t *pupper, int64_t *pstride,
+                                           int64_t incr, int64_t chunk) {
+  PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_spmd\n");
+  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
+      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+      /*IsSPMDExecutionMode=*/true);
+}
+
+void __kmpc_for_static_init_4_simple_generic(IdentTy *loc, int32_t global_tid,
+                                             int32_t schedtype,
+                                             int32_t *plastiter,
+                                             int32_t *plower, int32_t *pupper,
+                                             int32_t *pstride, int32_t incr,
+                                             int32_t chunk) {
+  PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_generic\n");
+  omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
+      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+      /*IsSPMDExecutionMode=*/false);
+}
+
+void __kmpc_for_static_init_4u_simple_generic(
+    IdentTy *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter,
+    uint32_t *plower, uint32_t *pupper, int32_t *pstride, int32_t incr,
+    int32_t chunk) {
+  PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_generic\n");
+  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
+      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+      /*IsSPMDExecutionMode=*/false);
+}
+
+void __kmpc_for_static_init_8_simple_generic(IdentTy *loc, int32_t global_tid,
+                                             int32_t schedtype,
+                                             int32_t *plastiter,
+                                             int64_t *plower, int64_t *pupper,
+                                             int64_t *pstride, int64_t incr,
+                                             int64_t chunk) {
+  PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_generic\n");
+  omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
+      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+      /*IsSPMDExecutionMode=*/false);
+}
+
+void __kmpc_for_static_init_8u_simple_generic(
+    IdentTy *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter,
+    uint64_t *plower, uint64_t *pupper, int64_t *pstride, int64_t incr,
+    int64_t chunk) {
+  PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_generic\n");
+  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
+      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
+      /*IsSPMDExecutionMode=*/false);
+}
+
+void __kmpc_for_static_fini(IdentTy *loc, int32_t global_tid) {
+  PRINT0(LD_IO, "call kmpc_for_static_fini\n");
+}
+}
+
+#pragma omp end declare target
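+
+// Illustrative only, not part of this patch: a statically scheduled
+// worksharing loop such as
+//
+//   #pragma omp for schedule(static, 64)
+//   for (int i = 0; i < N; ++i)
+//     body(i);
+//
+// is expected to be lowered so that each thread calls
+// __kmpc_for_static_init_4 once, which rewrites *plower/*pupper to the
+// thread's first chunk and sets *pstride to the distance between its
+// consecutive chunks (chunk * number of threads), then iterates over its
+// chunks, and finally calls __kmpc_for_static_fini.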