diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.h b/clang/lib/CodeGen/CGOpenMPRuntime.h
--- a/clang/lib/CodeGen/CGOpenMPRuntime.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.h
@@ -258,9 +258,6 @@
   llvm::Value *emitUpdateLocation(CodeGenFunction &CGF, SourceLocation Loc,
                                   unsigned Flags = 0);
 
-  /// Returns pointer to ident_t type.
-  llvm::Type *getIdentTyPointerTy();
-
   /// Gets thread id value for the current thread.
   ///
   llvm::Value *getThreadID(CodeGenFunction &CGF, SourceLocation Loc);
@@ -762,6 +759,9 @@
   virtual ~CGOpenMPRuntime() {}
   virtual void clear();
 
+  /// Returns pointer to ident_t type.
+  llvm::Type *getIdentTyPointerTy();
+
   /// Get the platform-specific name separator.
   std::string getName(ArrayRef<StringRef> Parts) const;
 
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
--- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -1934,6 +1934,17 @@
 llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction(
     const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
     OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
+
+/*
+  llvm::errs() << "Shared clauses:\n";
+  for (const auto *C : D.getClausesOfKind<OMPSharedClause>()) {
+    for (const Expr *E : C->getVarRefs()) {
+      E->dump();
+    }
+  }
+  llvm::errs() << "Shared done\n";
+*/
+
   // Emit target region as a standalone region.
   class NVPTXPrePostActionTy : public PrePostActionTy {
     bool &IsInParallelRegion;
@@ -4071,6 +4082,7 @@
     llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
   ASTContext &Ctx = CGM.getContext();
   const auto &CS = *D.getCapturedStmt(OMPD_parallel);
+  //CS.dump();
 
   // Create a function that takes as argument the source thread.
   FunctionArgList WrapperArgs;
@@ -4102,6 +4114,7 @@
                     D.getBeginLoc(), D.getBeginLoc());
 
   const auto *RD = CS.getCapturedRecordDecl();
+  //RD->dump();
   auto CurField = RD->field_begin();
 
   Address ZeroAddr = CGF.CreateMemTemp(
@@ -4166,6 +4179,8 @@
   if (CS.capture_size() > 0) {
     ASTContext &CGFContext = CGF.getContext();
     for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
+      //CI->getCapturedVar()->dump();
+      //CI->capturesVariableByCopy();
       QualType ElemTy = CurField->getType();
       Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx,
                                                CGF.getPointerSize());
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeTRegion.h b/clang/lib/CodeGen/CGOpenMPRuntimeTRegion.h
new file mode 100644
--- /dev/null
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeTRegion.h
@@ -0,0 +1,260 @@
+//===-- CGOpenMPRuntimeTRegion.h --- OpenMP RT TRegion interface codegen --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Code generation interface for OpenMP target offloading though the generic
+// Target Region (TRegion) interface.
+//
+// See openmp/libomptarget/deviceRTLs/common/target_Region.h for further
+// information on the interface functions and their intended use.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMETREGION_H
+#define LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMETREGION_H
+
+#include "CGOpenMPRuntime.h"
+#include "llvm/ADT/SmallBitVector.h"
+
+namespace clang {
+namespace CodeGen {
+
+class CGOpenMPRuntimeTRegion : public CGOpenMPRuntime {
+  // TODO: The target region interface only covers kernel codes for now. This
+  //       therefore codegen implicitly assumes the target region kernel
+  //       interface is targeted. Once a second target region interface is put
+  //       in place, e.g., specialized to many-core offloading, we might need
+  //       to make the target interface explicit.
+
+  /// Create an outlined function for a target kernel.
+  ///
+  /// \param D Directive to emit.
+  /// \param ParentName Name of the function that encloses the target region.
+  /// \param OutlinedFn Outlined function value to be defined by this call.
+  /// \param OutlinedFnID Outlined function ID value to be defined by this call.
+  /// \param CodeGen Object containing the target statements.
+  /// An outlined function may not be an entry if, e.g. the if clause always
+  /// evaluates to false.
+  void emitKernel(const OMPExecutableDirective &D, StringRef ParentName,
+                  llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
+                  const RegionCodeGenTy &CodeGen);
+
+  /// Helper for generic kernel mode, target directive's entry function.
+  void emitKernelHeader(CodeGenFunction &CGF, llvm::BasicBlock *&ExitBB);
+
+  /// Signal termination of generic mode execution.
+  void emitKernelFooter(CodeGenFunction &CGF, llvm::BasicBlock *ExitBB);
+
+  //
+  // Base class overrides.
+  //
+
+  /// Creates offloading entry for the provided entry ID \a ID,
+  /// address \a Addr, size \a Size, and flags \a Flags.
+  void createOffloadEntry(llvm::Constant *ID, llvm::Constant *Addr,
+                          uint64_t Size, int32_t Flags,
+                          llvm::GlobalValue::LinkageTypes Linkage) override;
+
+  /// Emit outlined function for 'target' directive.
+  ///
+  /// \param D Directive to emit.
+  /// \param ParentName Name of the function that encloses the target region.
+  /// \param OutlinedFn Outlined function value to be defined by this call.
+  /// \param OutlinedFnID Outlined function ID value to be defined by this call.
+  /// \param IsOffloadEntry True if the outlined function is an offload entry.
+  /// An outlined function may not be an entry if, e.g. the if clause always
+  /// evaluates to false.
+  void emitTargetOutlinedFunction(const OMPExecutableDirective &D,
+                                  StringRef ParentName,
+                                  llvm::Function *&OutlinedFn,
+                                  llvm::Constant *&OutlinedFnID,
+                                  bool IsOffloadEntry,
+                                  const RegionCodeGenTy &CodeGen) override;
+
+protected:
+  /// Get the function name of an outlined region, customized to the target.
+  StringRef getOutlinedHelperName() const override { return ".omp_TRegion."; }
+
+public:
+  explicit CGOpenMPRuntimeTRegion(CodeGenModule &CGM);
+
+  /// Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32
+  /// global_tid, int proc_bind) to generate code for 'proc_bind' clause.
+  virtual void emitProcBindClause(CodeGenFunction &CGF,
+                                  OpenMPProcBindClauseKind ProcBind,
+                                  SourceLocation Loc) override;
+
+  /// Emits call to void __kmpc_push_num_threads(ident_t *loc, kmp_int32
+  /// global_tid, kmp_int32 num_threads) to generate code for 'num_threads'
+  /// clause.
+  /// \param NumThreads An integer value of threads.
+  virtual void emitNumThreadsClause(CodeGenFunction &CGF,
+                                    llvm::Value *NumThreads,
+                                    SourceLocation Loc) override;
+
+  /// Set the number of teams to \p NumTeams and the thread limit to
+  /// \p ThreadLimit.
+  ///
+  /// \param NumTeams An integer expression of teams.
+  /// \param ThreadLimit An integer expression of threads.
+  void emitNumTeamsClause(CodeGenFunction &CGF, const Expr *NumTeams,
+                          const Expr *ThreadLimit, SourceLocation Loc) override;
+
+  /// Emits inlined function for the specified OpenMP parallel
+  //  directive.
+  /// \a D. This outlined function has type void(*)(kmp_int32 *ThreadID,
+  /// kmp_int32 BoundID, struct context_vars*).
+  /// \param D OpenMP directive.
+  /// \param ThreadIDVar Variable for thread id in the current OpenMP region.
+  /// \param InnermostKind Kind of innermost directive (for simple directives it
+  /// is a directive itself, for combined - its innermost directive).
+  /// \param CodeGen Code generation sequence for the \a D directive.
+  llvm::Value *
+  emitParallelOutlinedFunction(const OMPExecutableDirective &D,
+                               const VarDecl *ThreadIDVar,
+                               OpenMPDirectiveKind InnermostKind,
+                               const RegionCodeGenTy &CodeGen) override;
+
+  /// Emits code for parallel or serial call of the \a OutlinedFn with
+  /// variables captured in a record which address is stored in \a
+  /// CapturedStruct.
+  /// \param OutlinedFn Outlined function to be run in parallel threads. Type of
+  /// this function is void(*)(kmp_int32 *, kmp_int32, struct context_vars*).
+  /// \param CapturedVars A pointer to the record with the references to
+  /// variables used in \a OutlinedFn function.
+  /// \param IfCond Condition in the associated 'if' clause, if it was
+  /// specified, nullptr otherwise.
+  void emitParallelCall(CodeGenFunction &CGF, SourceLocation Loc,
+                        llvm::Value *OutlinedFn,
+                        ArrayRef<llvm::Value *> CapturedVars,
+                        const Expr *IfCond) override;
+
+  /// Emits a critical region.
+  /// \param CriticalName Name of the critical region.
+  /// \param CriticalOpGen Generator for the statement associated with the given
+  /// critical region.
+  /// \param Hint Value of the 'hint' clause (optional).
+  void emitCriticalRegion(CodeGenFunction &CGF, StringRef CriticalName,
+                          const RegionCodeGenTy &CriticalOpGen,
+                          SourceLocation Loc,
+                          const Expr *Hint = nullptr) override;
+
+  /// Emit a code for reduction clause.
+  ///
+  /// \param Privates List of private copies for original reduction arguments.
+  /// \param LHSExprs List of LHS in \a ReductionOps reduction operations.
+  /// \param RHSExprs List of RHS in \a ReductionOps reduction operations.
+  /// \param ReductionOps List of reduction operations in form 'LHS binop RHS'
+  /// or 'operator binop(LHS, RHS)'.
+  /// \param Options List of options for reduction codegen:
+  ///     WithNowait true if parent directive has also nowait clause, false
+  ///     otherwise.
+  ///     SimpleReduction Emit reduction operation only. Used for omp simd
+  ///     directive on the host.
+  ///     ReductionKind The kind of reduction to perform.
+  virtual void emitReduction(CodeGenFunction &CGF, SourceLocation Loc,
+                             ArrayRef<const Expr *> Privates,
+                             ArrayRef<const Expr *> LHSExprs,
+                             ArrayRef<const Expr *> RHSExprs,
+                             ArrayRef<const Expr *> ReductionOps,
+                             ReductionOptionsTy Options) override;
+
+  /// Emits OpenMP-specific function prolog.
+  /// Required for device constructs.
+  void emitFunctionProlog(CodeGenFunction &CGF, const Decl *D) override;
+
+  /// Cleans up references to the objects in finished function.
+  ///
+  void functionFinished(CodeGenFunction &CGF) override;
+
+  /// Choose a default value for the dist_schedule clause.
+  void
+  getDefaultDistScheduleAndChunk(CodeGenFunction &CGF,
+                                 const OMPLoopDirective &S,
+                                 OpenMPDistScheduleClauseKind &ScheduleKind,
+                                 llvm::Value *&Chunk) const override;
+
+  /// Choose a default value for the schedule clause.
+  void getDefaultScheduleAndChunk(CodeGenFunction &CGF,
+                                  const OMPLoopDirective &S,
+                                  OpenMPScheduleClauseKind &ScheduleKind,
+                                  const Expr *&ChunkExpr) const override;
+
+  /// Perform check on requires decl to ensure that target architecture
+  /// supports unified addressing
+  void checkArchForUnifiedAddressing(CodeGenModule &CGM,
+                                     const OMPRequiresDecl *D) const override;
+
+  /// Emits inlined function for the specified OpenMP teams
+  //  directive.
+  /// \a D. This outlined function has type void(*)(kmp_int32 *ThreadID,
+  /// kmp_int32 BoundID, struct context_vars*).
+  /// \param D OpenMP directive.
+  /// \param ThreadIDVar Variable for thread id in the current OpenMP region.
+  /// \param InnermostKind Kind of innermost directive (for simple directives it
+  /// is a directive itself, for combined - its innermost directive).
+  /// \param CodeGen Code generation sequence for the \a D directive.
+  llvm::Value *
+  emitTeamsOutlinedFunction(const OMPExecutableDirective &D,
+                            const VarDecl *ThreadIDVar,
+                            OpenMPDirectiveKind InnermostKind,
+                            const RegionCodeGenTy &CodeGen) override;
+
+  /// Emits code for teams call of the \a OutlinedFn with
+  /// variables captured in a record which address is stored in \a
+  /// CapturedStruct.
+  /// \param OutlinedFn Outlined function to be run by team masters. Type of
+  /// this function is void(*)(kmp_int32 *, kmp_int32, struct context_vars*).
+  /// \param CapturedVars A pointer to the record with the references to
+  /// variables used in \a OutlinedFn function.
+  ///
+  void emitTeamsCall(CodeGenFunction &CGF, const OMPExecutableDirective &D,
+                     SourceLocation Loc, llvm::Value *OutlinedFn,
+                     ArrayRef<llvm::Value *> CapturedVars) override;
+
+protected:
+
+  /// Hook to allow derived classes to perform checks on the AST that justify
+  /// SPMD mode.
+  virtual bool isKnownSPMDMode() const { return false; }
+
+  /// Hook to allow derived classes to perform checks on the AST that justify
+  /// execution without runtime support.
+  virtual bool mayNeedRuntimeSupport() const { return true; }
+
+  /// Hook to allow derived classes to perform checks on the AST that justify
+  /// execution without data sharing support.
+  virtual bool mayPerformDataSharing() const { return true; }
+
+private:
+  /// Simple container for a wrapper of an outlined parallel function and the
+  /// layout of the passed variables (= captured variables, both shared and
+  /// firstprivate).
+  struct WrapperInfo {
+    llvm::Function *WrapperFn = nullptr;
+    llvm::StructType *SharedVarsStructTy = nullptr;
+    llvm::StructType *PrivateVarsStructTy = nullptr;
+    llvm::SmallBitVector CaptureIsPrivate;
+  };
+
+  /// Map an outlined function to its wrapper and shared struct type. The latter
+  /// defines the layout of the payload and the wrapper will unpack that payload
+  /// and pass the values to the outlined function.
+  llvm::DenseMap<llvm::Function *, WrapperInfo> WrapperInfoMap;
+
+  /// Emit function which wraps the outline parallel region
+  /// and controls the parameters which are passed to this function.
+  /// The wrapper ensures that the outlined function is called
+  /// with the correct arguments when data is shared.
+  void createParallelDataSharingWrapper(llvm::Function *OutlinedParallelFn,
+                                        const OMPExecutableDirective &D);
+};
+
+} // namespace CodeGen
+} // namespace clang
+
+#endif // LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMEKERNEL_H
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeTRegion.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeTRegion.cpp
new file mode 100644
--- /dev/null
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeTRegion.cpp
@@ -0,0 +1,708 @@
+//===-- CGOpenMPRuntimeTRegion.cpp - OpenMP RT TRegion interface codegen --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of the code generation interface for OpenMP target offloading
+// though the Target Region (TRegion) interface.
+//
+// See the file comment in CGOpenMPRuntimeTRegion.h for more information.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CGOpenMPRuntimeTRegion.h"
+#include "CodeGenFunction.h"
+#include "clang/AST/StmtVisitor.h"
+
+using namespace clang;
+using namespace CodeGen;
+
+namespace {
+
+/// Enums for all functions in the target
+enum OpenMPTargetRuntimeLibraryCalls {
+  OMPRTL__kmpc_target_region_kernel_init,
+  OMPRTL__kmpc_target_region_kernel_deinit,
+  OMPRTL__kmpc_target_region_kernel_parallel,
+};
+
+/// Return the runtime function declaration specified by \p Function.
+static llvm::Function *getOrCreateRuntimeFunctionDeclaration(
+    CGOpenMPRuntimeTRegion &CG, CodeGenModule &CGM,
+    OpenMPTargetRuntimeLibraryCalls Function) {
+
+  llvm::Function *RTFn;
+  auto *I1Ty = llvm::IntegerType::getInt1Ty(CGM.getLLVMContext());
+  switch (Function) {
+  case OMPRTL__kmpc_target_region_kernel_init: {
+    // char __kmpc_target_region_kernel_init(bool UseSPMDMode,
+    //                                       bool UseStateMachine,
+    //                                       bool RequiresOMPRuntime,
+    //                                       bool RequiresDataSharing);
+    llvm::Type *TypeParams[] = {I1Ty, I1Ty, I1Ty, I1Ty};
+    auto *FnTy =
+        llvm::FunctionType::get(CGM.Int8Ty, TypeParams, /* isVarArg */ false);
+    RTFn = cast<llvm::Function>(
+        CGM.CreateRuntimeFunction(FnTy, "__kmpc_target_region_kernel_init"));
+    break;
+  }
+  case OMPRTL__kmpc_target_region_kernel_deinit: {
+    // void __kmpc_target_region_kernel_deinit(bool UseSPMDMode,
+    //                                         bool RequiredOMPRuntime);
+    llvm::Type *TypeParams[] = {I1Ty, I1Ty};
+    auto *FnTy =
+        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /* isVarArg */ false);
+    RTFn = cast<llvm::Function>(
+        CGM.CreateRuntimeFunction(FnTy, "__kmpc_target_region_kernel_deinit"));
+    break;
+  }
+  case OMPRTL__kmpc_target_region_kernel_parallel: {
+    // typedef void (*ParallelWorkFnTy)(void *, void *);
+    auto *ParWorkFnTy =
+        llvm::FunctionType::get(CGM.VoidTy, {CGM.VoidPtrTy, CGM.VoidPtrTy},
+                                /* isVarArg */ false);
+    //
+    // void __kmpc_target_region_kernel_parallel(bool UseSPMDMode,
+    //                                           bool RequiredOMPRuntime,
+    //                                           ParallelWorkFnTy WorkFn,
+    //                                           void *SharedVars,
+    //                                           uint16_t SharedVarsBytes,
+    //                                           void *PrivateVars,
+    //                                           uint16_t PrivateVarsBytes,
+    //                                           bool SharedPointers);
+    llvm::Type *TypeParams[] = {
+        I1Ty,          I1Ty,        ParWorkFnTy->getPointerTo(),
+        CGM.VoidPtrTy, CGM.Int16Ty, CGM.VoidPtrTy,
+        CGM.Int16Ty, I1Ty};
+    auto *FnTy =
+        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /* isVarArg */ false);
+
+    RTFn = cast<llvm::Function>(CGM.CreateRuntimeFunction(
+        FnTy, "__kmpc_target_region_kernel_parallel"));
+
+    RTFn->addParamAttr(2, llvm::Attribute::NoCapture);
+    RTFn->addParamAttr(3, llvm::Attribute::NoCapture);
+    RTFn->addParamAttr(5, llvm::Attribute::NoCapture);
+    RTFn->addParamAttr(5, llvm::Attribute::ReadOnly);
+
+    // Add the callback metadata if it is not present already.
+    if (!RTFn->hasMetadata(llvm::LLVMContext::MD_callback)) {
+      llvm::LLVMContext &Ctx = RTFn->getContext();
+      llvm::MDBuilder MDB(Ctx);
+      // Annotate the callback behavior of __kmpc_target_region_kernel_parallel:
+      //  - The callback callee is WorkFn, argument 2 starting with 0.
+      //  - The first callback payload is SharedVars.
+      //  - The second callback payload is PrivateVars.
+      RTFn->addMetadata(
+          llvm::LLVMContext::MD_callback,
+          *llvm::MDNode::get(
+              Ctx, {MDB.createCallbackEncoding(2, {3, 5},
+                                               /* VarArgsArePassed */ false)}));
+    }
+    break;
+  }
+  }
+
+  // TODO: Remove all globals and set this attribute.
+  //
+  // This is overwritten when the definition is linked in.
+  //RTFn->addFnAttr(llvm::Attribute::InaccessibleMemOrArgMemOnly);
+
+  return RTFn;
+}
+
+} // anonymous namespace
+
+void CGOpenMPRuntimeTRegion::emitKernel(const OMPExecutableDirective &D,
+                                        StringRef ParentName,
+                                        llvm::Function *&OutlinedFn,
+                                        llvm::Constant *&OutlinedFnID,
+                                        const RegionCodeGenTy &CodeGen) {
+  WrapperInfoMap.clear();
+
+  // Emit target region as a standalone region.
+  class KernelPrePostActionTy : public PrePostActionTy {
+    CGOpenMPRuntimeTRegion &RT;
+    llvm::BasicBlock *ExitBB;
+
+  public:
+    KernelPrePostActionTy(CGOpenMPRuntimeTRegion &RT)
+        : RT(RT), ExitBB(nullptr) {}
+
+    void Enter(CodeGenFunction &CGF) override {
+      RT.emitKernelHeader(CGF, ExitBB);
+      // Skip target region initialization.
+      RT.setLocThreadIdInsertPt(CGF, /* AtCurrentPoint */ true);
+    }
+
+    void Exit(CodeGenFunction &CGF) override {
+      RT.clearLocThreadIdInsertPt(CGF);
+      RT.emitKernelFooter(CGF, ExitBB);
+    }
+
+  } Action(*this);
+  CodeGen.setAction(Action);
+
+  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
+                                   /* IsOffloadEntry */ true, CodeGen);
+}
+
+void CGOpenMPRuntimeTRegion::emitKernelHeader(CodeGenFunction &CGF,
+                                              llvm::BasicBlock *&ExitBB) {
+  CGBuilderTy &Bld = CGF.Builder;
+
+  // Setup BBs in entry function.
+  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute");
+  ExitBB = CGF.createBasicBlock(".exit");
+
+  llvm::Value *Args[] = {
+      /* UseSPMDMode */ Bld.getInt1(isKnownSPMDMode()),
+      /* UseStateMachine */ Bld.getInt1(1),
+      /* RequiresOMPRuntime */
+      Bld.getInt1(mayNeedRuntimeSupport()),
+      /* RequiresDataSharing */ Bld.getInt1(mayPerformDataSharing())};
+  llvm::CallInst *InitCI = CGF.EmitRuntimeCall(
+      getOrCreateRuntimeFunctionDeclaration(
+          *this, CGM, OMPRTL__kmpc_target_region_kernel_init),
+      Args);
+
+  llvm::Value *ExecuteCnd = Bld.CreateICmpEQ(InitCI, Bld.getInt8(1));
+
+  Bld.CreateCondBr(ExecuteCnd, ExecuteBB, ExitBB);
+  CGF.EmitBlock(ExecuteBB);
+}
+
+void CGOpenMPRuntimeTRegion::emitKernelFooter(CodeGenFunction &CGF,
+                                              llvm::BasicBlock *ExitBB) {
+  if (!CGF.HaveInsertPoint())
+    return;
+
+  llvm::BasicBlock *OMPDeInitBB = CGF.createBasicBlock(".omp.deinit");
+  CGF.EmitBranch(OMPDeInitBB);
+
+  CGF.EmitBlock(OMPDeInitBB);
+
+  CGBuilderTy &Bld = CGF.Builder;
+  // DeInitialize the OMP state in the runtime; called by all active threads.
+  llvm::Value *Args[] = {/* UseSPMDMode */ Bld.getInt1(isKnownSPMDMode()),
+                         /* RequiredOMPRuntime */
+                         Bld.getInt1(mayNeedRuntimeSupport())};
+
+  CGF.EmitRuntimeCall(getOrCreateRuntimeFunctionDeclaration(
+                          *this, CGM, OMPRTL__kmpc_target_region_kernel_deinit),
+                      Args);
+
+  CGF.EmitBranch(ExitBB);
+  CGF.EmitBlock(ExitBB);
+}
+
+void CGOpenMPRuntimeTRegion::emitTargetOutlinedFunction(
+    const OMPExecutableDirective &D, StringRef ParentName,
+    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
+    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
+  if (!IsOffloadEntry) // Nothing to do.
+    return;
+
+  assert(!ParentName.empty() && "Invalid target region parent name!");
+
+  emitKernel(D, ParentName, OutlinedFn, OutlinedFnID, CodeGen);
+
+  // Create a unique global variable to indicate the execution mode of this
+  // target region. The execution mode is either 'non-SPMD' or 'SPMD'. Initially
+  // all regions are executed in non-SPMD mode. This variable is picked up by
+  // the offload library to setup the device appropriately before kernel launch.
+  auto *GVMode = new llvm::GlobalVariable(
+      CGM.getModule(), CGM.Int8Ty, /* isConstant */ true,
+      llvm::GlobalValue::WeakAnyLinkage, llvm::ConstantInt::get(CGM.Int8Ty, 1),
+      Twine(OutlinedFn->getName(), "_exec_mode"));
+  CGM.addCompilerUsedGlobal(GVMode);
+}
+
+CGOpenMPRuntimeTRegion::CGOpenMPRuntimeTRegion(CodeGenModule &CGM)
+    : CGOpenMPRuntime(CGM, "_", "$") {
+  if (!CGM.getLangOpts().OpenMPIsDevice)
+    llvm_unreachable("TRegion code generation does only handle device code!");
+}
+
+void CGOpenMPRuntimeTRegion::emitProcBindClause(
+    CodeGenFunction &CGF, OpenMPProcBindClauseKind ProcBind,
+    SourceLocation Loc) {
+  CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
+}
+
+void CGOpenMPRuntimeTRegion::emitNumThreadsClause(CodeGenFunction &CGF,
+                                                  llvm::Value *NumThreads,
+                                                  SourceLocation Loc) {
+  CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc);
+}
+
+void CGOpenMPRuntimeTRegion::emitNumTeamsClause(CodeGenFunction &CGF,
+                                                const Expr *NumTeams,
+                                                const Expr *ThreadLimit,
+                                                SourceLocation Loc) {
+  // Nothing to do for kernel mode, no other modes supported yet.
+}
+
+llvm::Value *CGOpenMPRuntimeTRegion::emitParallelOutlinedFunction(
+    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
+    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
+
+  // Emit target region as a standalone region.
+  llvm::Function *OutlinedFun =
+      cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
+          D, ThreadIDVar, InnermostKind, CodeGen));
+
+  createParallelDataSharingWrapper(OutlinedFun, D);
+
+  return OutlinedFun;
+}
+
+// TODO: Check if this is actually needed.
+static const ValueDecl *getUnderlyingVar(const Expr *E) {
+  E = E->IgnoreParens();
+  if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(E)) {
+    const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
+    while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
+      Base = TempASE->getBase()->IgnoreParenImpCasts();
+    E = Base;
+  } else if (auto *OASE = dyn_cast<OMPArraySectionExpr>(E)) {
+    const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
+    while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base))
+      Base = TempOASE->getBase()->IgnoreParenImpCasts();
+    while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
+      Base = TempASE->getBase()->IgnoreParenImpCasts();
+    E = Base;
+  }
+  E = E->IgnoreParenImpCasts();
+  if (const auto *DE = dyn_cast<DeclRefExpr>(E))
+    return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
+  const auto *ME = cast<MemberExpr>(E);
+  return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
+}
+
+void CGOpenMPRuntimeTRegion::createParallelDataSharingWrapper(
+    llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
+  ASTContext &Ctx = CGM.getContext();
+  const auto &CS = *D.getCapturedStmt(OMPD_parallel);
+
+  // Create a function that takes as argument the source thread.
+  FunctionArgList WrapperArgs;
+  ImplicitParamDecl SharedVarsArgDecl(Ctx, /* DC */ nullptr, D.getBeginLoc(),
+                                      /* Id */ nullptr, Ctx.VoidPtrTy,
+                                      ImplicitParamDecl::Other);
+  ImplicitParamDecl PrivateVarsArgDecl(Ctx, /* DC */ nullptr, D.getBeginLoc(),
+                                       /* Id */ nullptr, Ctx.VoidPtrTy,
+                                       ImplicitParamDecl::Other);
+  WrapperArgs.emplace_back(&SharedVarsArgDecl);
+  WrapperArgs.emplace_back(&PrivateVarsArgDecl);
+
+  const CGFunctionInfo &CGFI =
+      CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);
+
+  auto *WrapperFn = llvm::Function::Create(
+      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
+      Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());
+  CGM.SetInternalFunctionAttributes(GlobalDecl(), WrapperFn, CGFI);
+
+  OutlinedParallelFn->setLinkage(llvm::GlobalValue::InternalLinkage);
+  OutlinedParallelFn->setDoesNotRecurse();
+  WrapperFn->setLinkage(llvm::GlobalValue::InternalLinkage);
+  WrapperFn->setDoesNotRecurse();
+
+  CodeGenFunction CGF(CGM, /* suppressNewContext */ true);
+  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WrapperFn, CGFI, WrapperArgs,
+                    D.getBeginLoc(), D.getBeginLoc());
+
+  auto AI = WrapperFn->arg_begin();
+  llvm::Argument &SharedVarsArg = *(AI++);
+  llvm::Argument &PrivateVarsArg = *(AI);
+  SharedVarsArg.setName("shared_vars");
+  PrivateVarsArg.setName("private_vars");
+
+  Address ZeroAddr = CGF.CreateMemTemp(CGF.getContext().getIntTypeForBitwidth(
+                                           /* DestWidth */ 32, /* Signed */ 1),
+                                       /* Name */ ".zero.addr");
+  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/* C */ 0));
+
+  setLocThreadIdInsertPt(CGF, /* AtCurrentPoint */ true);
+
+  // Create the array of arguments and fill it with boilerplate values.
+  SmallVector<llvm::Value *, 8> Args;
+  Args.emplace_back(emitThreadIDAddress(CGF, D.getBeginLoc()).getPointer());
+  Args.emplace_back(ZeroAddr.getPointer());
+
+  CGBuilderTy &Bld = CGF.Builder;
+
+  // Collect all variables marked as shared.
+  llvm::SmallPtrSet<const ValueDecl *, 16> SharedVars;
+  for (const auto *C : D.getClausesOfKind<OMPSharedClause>())
+    for (const Expr *E : C->getVarRefs())
+      SharedVars.insert(getUnderlyingVar(E));
+
+  // Retrieve the shared and private variables from argument pointers and pass
+  // them to the outlined function.
+  llvm::SmallVector<llvm::Type *, 8> SharedStructMemberTypes;
+  llvm::SmallVector<llvm::Type *, 8> PrivateStructMemberTypes;
+
+  WrapperInfo &WI = WrapperInfoMap[OutlinedParallelFn];
+  WI.WrapperFn = WrapperFn;
+
+  auto ArgIt = OutlinedParallelFn->arg_begin() + /* global_tid & bound_tid */ 2;
+
+  // If we require loop bounds they are already part of the outlined function
+  // encoding, just after global_tid and bound_tid.
+  bool RequiresLoopBounds =
+      isOpenMPLoopBoundSharingDirective(D.getDirectiveKind());
+  if (RequiresLoopBounds) {
+    // Register the lower bound in the wrapper info.
+    WI.CaptureIsPrivate.push_back(true);
+    PrivateStructMemberTypes.push_back((ArgIt++)->getType());
+    // Register the upper bound in the wrapper info.
+    WI.CaptureIsPrivate.push_back(true);
+    PrivateStructMemberTypes.push_back((ArgIt++)->getType());
+  }
+
+  auto CIt = CS.capture_begin();
+  for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CIt) {
+    bool IsPrivate = CIt->capturesVariableArrayType() ||
+                     CIt->capturesVariableByCopy() ||
+                     !SharedVars.count(CIt->getCapturedVar());
+    WI.CaptureIsPrivate.push_back(IsPrivate);
+
+    auto &StructMemberTypes =
+        IsPrivate ? PrivateStructMemberTypes : SharedStructMemberTypes;
+    llvm::Type *ArgTy = (ArgIt++)->getType();
+    if (!IsPrivate) {
+      assert(ArgTy->isPointerTy());
+      ArgTy = ArgTy->getPointerElementType();
+    }
+    StructMemberTypes.push_back(ArgTy);
+  }
+
+  // Verify the position of the outlined function argument iterator as a sanity
+  // check.
+  assert(ArgIt == OutlinedParallelFn->arg_end() &&
+         "Not all arguments have been processed!");
+
+  llvm::Value *SharedVarsStructPtr = nullptr;
+  llvm::Value *PrivateVarsStructPtr = nullptr;
+  llvm::LLVMContext &LLVMCtx = OutlinedParallelFn->getContext();
+  if (!PrivateStructMemberTypes.empty()) {
+    WI.PrivateVarsStructTy = llvm::StructType::create(
+        LLVMCtx, PrivateStructMemberTypes, "omp.private.struct");
+    PrivateVarsStructPtr =
+        Bld.CreateBitCast(&PrivateVarsArg, WI.PrivateVarsStructTy->getPointerTo());
+  }
+  if (!SharedStructMemberTypes.empty()) {
+    WI.SharedVarsStructTy = llvm::StructType::create(
+        LLVMCtx, SharedStructMemberTypes, "omp.shared.struct");
+    SharedVarsStructPtr =
+        Bld.CreateBitCast(&SharedVarsArg, WI.SharedVarsStructTy->getPointerTo());
+  }
+
+  assert(WI.CaptureIsPrivate.size() + /* global_tid & bound_tid */ 2 ==
+             OutlinedParallelFn->arg_size() &&
+         "Not all arguments have been processed!");
+
+  unsigned PrivateIdx = 0, SharedIdx = 0;
+  for (int i = 0, e = WI.CaptureIsPrivate.size(); i < e; i++) {
+    bool IsPrivate = WI.CaptureIsPrivate[i];
+
+    llvm::Value *StructPtr =
+        IsPrivate ? PrivateVarsStructPtr : SharedVarsStructPtr;
+    unsigned &Idx = IsPrivate ? PrivateIdx : SharedIdx;
+
+    // TODO: Figure out the real alignment
+    if (IsPrivate) {
+      Args.emplace_back(
+          Bld.CreateAlignedLoad(Bld.CreateStructGEP(StructPtr, Idx++), 1));
+    } else {
+      llvm::Value *GEP = Bld.CreateStructGEP(StructPtr, Idx++);
+      Args.emplace_back(GEP);
+    }
+  }
+
+  assert(Args.size() == OutlinedParallelFn->arg_size());
+  emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
+
+  CGF.FinishFunction();
+
+  clearLocThreadIdInsertPt(CGF);
+}
+
+void CGOpenMPRuntimeTRegion::emitParallelCall(
+    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
+    ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
+  if (!CGF.HaveInsertPoint())
+    return;
+
+  llvm::Function *Fn = cast<llvm::Function>(OutlinedFn);
+  const WrapperInfo &WI = WrapperInfoMap[Fn];
+
+  auto &&ParGen = [this, CapturedVars, WI](CodeGenFunction &CGF,
+                                           PrePostActionTy &) {
+    CGBuilderTy &Bld = CGF.Builder;
+    assert(WI.WrapperFn && "Wrapper function does not exist!");
+
+    llvm::Value *SharedVarsSize = llvm::Constant::getNullValue(CGM.Int16Ty);
+    llvm::Value *PrivateVarsSize = SharedVarsSize;
+    llvm::Value *SharedStructAlloca = llvm::UndefValue::get(CGM.VoidPtrTy);
+    llvm::Value *PrivateStructAlloca = SharedStructAlloca;
+
+    if (WI.SharedVarsStructTy) {
+      SharedStructAlloca = CGF.CreateDefaultAlignTempAlloca(
+                                  WI.SharedVarsStructTy, ".shared.vars")
+                               .getPointer();
+      const llvm::DataLayout &DL = WI.WrapperFn->getParent()->getDataLayout();
+      SharedVarsSize = Bld.getInt16(DL.getTypeAllocSize(WI.SharedVarsStructTy));
+    }
+    if (WI.PrivateVarsStructTy) {
+      PrivateStructAlloca = CGF.CreateDefaultAlignTempAlloca(
+                                   WI.PrivateVarsStructTy, ".private.vars")
+                                .getPointer();
+      const llvm::DataLayout &DL = WI.WrapperFn->getParent()->getDataLayout();
+      PrivateVarsSize =
+          Bld.getInt16(DL.getTypeAllocSize(WI.PrivateVarsStructTy));
+    }
+
+    llvm::SmallVector<llvm::Value *, 4> Args;
+    Args.push_back(
+        /* UseSPMDMode */ Bld.getInt1(isKnownSPMDMode()));
+    Args.push_back(
+        /* RequiredOMPRuntime */ Bld.getInt1(mayNeedRuntimeSupport()));
+    Args.push_back(WI.WrapperFn);
+    Args.push_back(CGF.EmitCastToVoidPtr(SharedStructAlloca));
+    Args.push_back(SharedVarsSize);
+    Args.push_back(CGF.EmitCastToVoidPtr(PrivateStructAlloca));
+    Args.push_back(PrivateVarsSize);
+    Args.push_back(
+        /* SharedPointers */ Bld.getInt1(0));
+
+    assert((CapturedVars.empty() || (WI.SharedVarsStructTy || WI.PrivateVarsStructTy)) &&
+           "Expected the shared or private struct type to be set if variables are captured!");
+    assert((CapturedVars.empty() ||
+            CapturedVars.size() ==
+                (WI.SharedVarsStructTy ? WI.SharedVarsStructTy->getNumElements()
+                                       : 0) +
+                    (WI.PrivateVarsStructTy
+                         ? WI.PrivateVarsStructTy->getNumElements()
+                         : 0)) &&
+           "# elements in shared struct types should be number of captured "
+           "variables!");
+
+    // Store all captured variables into a single local structure that is then
+    // passed to the runtime library.
+    unsigned PrivateIdx = 0, SharedIdx = 0;
+    for (int i = 0, e = WI.CaptureIsPrivate.size(); i < e; i++) {
+      bool IsPrivate = WI.CaptureIsPrivate[i];
+
+      llvm::Value *StructPtr =
+          IsPrivate ? PrivateStructAlloca : SharedStructAlloca;
+      unsigned &Idx = IsPrivate ? PrivateIdx : SharedIdx;
+      llvm::Value *GEP = Bld.CreateStructGEP(StructPtr, Idx++);
+      llvm::Value *Var = IsPrivate ? CapturedVars[i]
+                                   : Bld.CreateAlignedLoad(CapturedVars[i], 1);
+      Bld.CreateDefaultAlignedStore(Var, GEP);
+    }
+
+    CGF.EmitRuntimeCall(
+        getOrCreateRuntimeFunctionDeclaration(
+            *this, CGM, OMPRTL__kmpc_target_region_kernel_parallel),
+        Args);
+
+    SharedIdx = 0;
+    for (int i = 0, e = WI.CaptureIsPrivate.size(); i < e; i++) {
+      bool IsPrivate = WI.CaptureIsPrivate[i];
+      if (IsPrivate)
+        continue;
+
+      llvm::Value *GEP = Bld.CreateStructGEP(SharedStructAlloca, SharedIdx++);
+      llvm::Value *Var = Bld.CreateAlignedLoad(GEP, 1);
+      Bld.CreateDefaultAlignedStore(Var, CapturedVars[i]);
+    }
+  };
+
+  auto &&SeqGen = [this, &ParGen, Loc](CodeGenFunction &CGF,
+                                       PrePostActionTy &Action) {
+    // Use an artifical "num_threads(1)" clause to force sequential execution if
+    // the expression in the 'if clause' evaluated to false. We expect the
+    // middle-end to clean this up.
+    emitNumThreadsClause(CGF, CGF.Builder.getInt32(/* C */ 1), Loc);
+    ParGen(CGF, Action);
+  };
+
+  if (IfCond) {
+    emitOMPIfClause(CGF, IfCond, ParGen, SeqGen);
+  } else {
+    CodeGenFunction::RunCleanupsScope Scope(CGF);
+    RegionCodeGenTy ThenRCG(ParGen);
+    ThenRCG(CGF);
+  }
+}
+
+llvm::Value *CGOpenMPRuntimeTRegion::emitTeamsOutlinedFunction(
+    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
+    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
+
+  // Emit target region as a standalone region.
+  llvm::Value *OutlinedFunVal = CGOpenMPRuntime::emitTeamsOutlinedFunction(
+      D, ThreadIDVar, InnermostKind, CodeGen);
+
+  return OutlinedFunVal;
+}
+
+void CGOpenMPRuntimeTRegion::emitTeamsCall(
+    CodeGenFunction &CGF, const OMPExecutableDirective &D, SourceLocation Loc,
+    llvm::Value *OutlinedFn, ArrayRef<llvm::Value *> CapturedVars) {
+  if (!CGF.HaveInsertPoint())
+    return;
+
+  Address ZeroAddr = CGF.CreateMemTemp(
+      CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1),
+      /*Name*/ ".zero.addr");
+  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
+  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
+  OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
+  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
+  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
+  emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
+}
+
+void CGOpenMPRuntimeTRegion::emitCriticalRegion(
+    CodeGenFunction &CGF, StringRef CriticalName,
+    const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
+    const Expr *Hint) {
+  llvm_unreachable(
+      "TODO: TRegion code generation does not support critical regions yet!");
+}
+
+void CGOpenMPRuntimeTRegion::emitReduction(
+    CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
+    ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
+    ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
+  llvm_unreachable(
+      "TODO: TRegion code generation does not support reductions yet!");
+}
+
+void CGOpenMPRuntimeTRegion::emitFunctionProlog(CodeGenFunction &CGF,
+                                                const Decl *D) {}
+
+void CGOpenMPRuntimeTRegion::functionFinished(CodeGenFunction &CGF) {
+  CGOpenMPRuntime::functionFinished(CGF);
+}
+
+void CGOpenMPRuntimeTRegion::getDefaultDistScheduleAndChunk(
+    CodeGenFunction &CGF, const OMPLoopDirective &S,
+    OpenMPDistScheduleClauseKind &ScheduleKind, llvm::Value *&Chunk) const {
+  CGOpenMPRuntime::getDefaultDistScheduleAndChunk(CGF, S, ScheduleKind, Chunk);
+}
+
+void CGOpenMPRuntimeTRegion::getDefaultScheduleAndChunk(
+    CodeGenFunction &CGF, const OMPLoopDirective &S,
+    OpenMPScheduleClauseKind &ScheduleKind, const Expr *&ChunkExpr) const {
+  ScheduleKind = OMPC_SCHEDULE_static;
+  // Chunk size is 1 in this case.
+  llvm::APInt ChunkSize(32, 1);
+  ChunkExpr = IntegerLiteral::Create(
+      CGF.getContext(), ChunkSize,
+      CGF.getContext().getIntTypeForBitwidth(32, /* Signed */ 0),
+      SourceLocation());
+}
+
+// ------------------------------------------------------------------------ //
+// TODO: The following cuda specific part should live somewhere else,
+//       potentially in a derived class.
+
+void CGOpenMPRuntimeTRegion::createOffloadEntry(
+    llvm::Constant *ID, llvm::Constant *Addr, uint64_t Size, int32_t,
+    llvm::GlobalValue::LinkageTypes) {
+  // TODO: Add support for global variables on the device after declare target
+  // support.
+  if (!isa<llvm::Function>(Addr))
+    return;
+  llvm::Module &M = CGM.getModule();
+  llvm::LLVMContext &Ctx = CGM.getLLVMContext();
+
+  // Get "nvvm.annotations" metadata node
+  llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
+
+  llvm::Metadata *MDVals[] = {
+      llvm::ConstantAsMetadata::get(Addr), llvm::MDString::get(Ctx, "kernel"),
+      llvm::ConstantAsMetadata::get(
+          llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
+  // Append metadata to nvvm.annotations
+  MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
+}
+
+#include "clang/Basic/Cuda.h"
+
+// Get current CudaArch and ignore any unknown values
+static CudaArch getCudaArch(CodeGenModule &CGM) {
+  if (!CGM.getTarget().hasFeature("ptx"))
+    return CudaArch::UNKNOWN;
+  llvm::StringMap<bool> Features;
+  CGM.getTarget().initFeatureMap(Features, CGM.getDiags(),
+                                 CGM.getTarget().getTargetOpts().CPU,
+                                 CGM.getTarget().getTargetOpts().Features);
+  for (const auto &Feature : Features) {
+    if (Feature.getValue()) {
+      CudaArch Arch = StringToCudaArch(Feature.getKey());
+      if (Arch != CudaArch::UNKNOWN)
+        return Arch;
+    }
+  }
+  return CudaArch::UNKNOWN;
+}
+
+/// Check to see if target architecture supports unified addressing which is
+/// a restriction for OpenMP requires clause "unified_shared_memory".
+void CGOpenMPRuntimeTRegion::checkArchForUnifiedAddressing(
+    CodeGenModule &CGM, const OMPRequiresDecl *D) const {
+  for (const OMPClause *Clause : D->clauselists()) {
+    if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
+      switch (getCudaArch(CGM)) {
+      case CudaArch::SM_20:
+      case CudaArch::SM_21:
+      case CudaArch::SM_30:
+      case CudaArch::SM_32:
+      case CudaArch::SM_35:
+      case CudaArch::SM_37:
+      case CudaArch::SM_50:
+      case CudaArch::SM_52:
+      case CudaArch::SM_53:
+      case CudaArch::SM_60:
+      case CudaArch::SM_61:
+      case CudaArch::SM_62:
+        CGM.Error(Clause->getBeginLoc(),
+                  "Target architecture does not support unified addressing");
+        return;
+      case CudaArch::SM_70:
+      case CudaArch::SM_72:
+      case CudaArch::SM_75:
+      case CudaArch::GFX600:
+      case CudaArch::GFX601:
+      case CudaArch::GFX700:
+      case CudaArch::GFX701:
+      case CudaArch::GFX702:
+      case CudaArch::GFX703:
+      case CudaArch::GFX704:
+      case CudaArch::GFX801:
+      case CudaArch::GFX802:
+      case CudaArch::GFX803:
+      case CudaArch::GFX810:
+      case CudaArch::GFX900:
+      case CudaArch::GFX902:
+      case CudaArch::GFX904:
+      case CudaArch::GFX906:
+      case CudaArch::GFX909:
+      case CudaArch::UNKNOWN:
+        break;
+      case CudaArch::LAST:
+        llvm_unreachable("Unexpected Cuda arch.");
+      }
+    }
+  }
+}
diff --git a/clang/lib/CodeGen/CMakeLists.txt b/clang/lib/CodeGen/CMakeLists.txt
--- a/clang/lib/CodeGen/CMakeLists.txt
+++ b/clang/lib/CodeGen/CMakeLists.txt
@@ -69,6 +69,7 @@
   CGOpenCLRuntime.cpp
   CGOpenMPRuntime.cpp
   CGOpenMPRuntimeNVPTX.cpp
+  CGOpenMPRuntimeTRegion.cpp
   CGRecordLayoutBuilder.cpp
   CGStmt.cpp
   CGStmtOpenMP.cpp
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -20,6 +20,7 @@
 #include "CGOpenCLRuntime.h"
 #include "CGOpenMPRuntime.h"
 #include "CGOpenMPRuntimeNVPTX.h"
+#include "CGOpenMPRuntimeTRegion.h"
 #include "CodeGenFunction.h"
 #include "CodeGenPGO.h"
 #include "ConstantEmitter.h"
@@ -66,6 +67,11 @@
     llvm::cl::desc("Emit limited coverage mapping information (experimental)"),
     llvm::cl::init(false));
 
+static llvm::cl::opt<bool> UseGenericTRegionInterface(
+    "openmp-tregion-runtime", llvm::cl::ZeroOrMore, llvm::cl::Hidden,
+    llvm::cl::desc("Use the generic target region OpenMP runtime interface"),
+    llvm::cl::init(false));
+
 static const char AnnotationSection[] = "llvm.metadata";
 
 static CGCXXABI *createCXXABI(CodeGenModule &CGM) {
@@ -205,7 +211,10 @@
   case llvm::Triple::nvptx64:
     assert(getLangOpts().OpenMPIsDevice &&
            "OpenMP NVPTX is only prepared to deal with device code.");
-    OpenMPRuntime.reset(new CGOpenMPRuntimeNVPTX(*this));
+    if (UseGenericTRegionInterface)
+      OpenMPRuntime.reset(new CGOpenMPRuntimeTRegion(*this));
+    else
+      OpenMPRuntime.reset(new CGOpenMPRuntimeNVPTX(*this));
     break;
   default:
     if (LangOpts.OpenMPSimd)
diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -286,6 +286,7 @@
 void initializeNameAnonGlobalLegacyPassPass(PassRegistry&);
 void initializeNaryReassociateLegacyPassPass(PassRegistry&);
 void initializeNewGVNLegacyPassPass(PassRegistry&);
+void initializeOpenMPOptLegacyPass(PassRegistry&);
 void initializeObjCARCAAWrapperPassPass(PassRegistry&);
 void initializeObjCARCAPElimPass(PassRegistry&);
 void initializeObjCARCContractPass(PassRegistry&);
diff --git a/llvm/include/llvm/LinkAllPasses.h b/llvm/include/llvm/LinkAllPasses.h
--- a/llvm/include/llvm/LinkAllPasses.h
+++ b/llvm/include/llvm/LinkAllPasses.h
@@ -142,6 +142,7 @@
       (void) llvm::createLowerInvokePass();
       (void) llvm::createLowerSwitchPass();
       (void) llvm::createNaryReassociatePass();
+      (void) llvm::createOpenMPOptLegacyPass();
       (void) llvm::createObjCARCAAWrapperPass();
       (void) llvm::createObjCARCAPElimPass();
       (void) llvm::createObjCARCExpandPass();
diff --git a/llvm/include/llvm/Support/SaveAndRestore.h b/llvm/include/llvm/Support/SaveAndRestore.h
--- a/llvm/include/llvm/Support/SaveAndRestore.h
+++ b/llvm/include/llvm/Support/SaveAndRestore.h
@@ -19,15 +19,15 @@
 
 /// A utility class that uses RAII to save and restore the value of a variable.
 template <typename T> struct SaveAndRestore {
-  SaveAndRestore(T &X) : X(X), OldValue(X) {}
-  SaveAndRestore(T &X, const T &NewValue) : X(X), OldValue(X) {
+  SaveAndRestore(T &X) : Loc(&X), OldValue(X) {}
+  SaveAndRestore(T &X, const T &NewValue) : Loc(&X), OldValue(X) {
     X = NewValue;
   }
-  ~SaveAndRestore() { X = OldValue; }
+  ~SaveAndRestore() { *Loc = OldValue; }
   T get() { return OldValue; }
 
 private:
-  T &X;
+  T *Loc;
   T OldValue;
 };
 
diff --git a/llvm/include/llvm/Transforms/IPO.h b/llvm/include/llvm/Transforms/IPO.h
--- a/llvm/include/llvm/Transforms/IPO.h
+++ b/llvm/include/llvm/Transforms/IPO.h
@@ -156,6 +156,11 @@
 ///
 ModulePass *createIPConstantPropagationPass();
 
+//===----------------------------------------------------------------------===//
+/// createOpenMPOpt - This pass performs OpenMP specific optimizations.
+///
+Pass *createOpenMPOptLegacyPass();
+
 //===----------------------------------------------------------------------===//
 /// createIPSCCPPass - This pass propagates constants from call sites into the
 /// bodies of functions, and keeps track of whether basic blocks are executable
diff --git a/llvm/lib/Transforms/IPO/CMakeLists.txt b/llvm/lib/Transforms/IPO/CMakeLists.txt
--- a/llvm/lib/Transforms/IPO/CMakeLists.txt
+++ b/llvm/lib/Transforms/IPO/CMakeLists.txt
@@ -25,6 +25,7 @@
   LoopExtractor.cpp
   LowerTypeTests.cpp
   MergeFunctions.cpp
+  OpenMPOpt.cpp
   PartialInlining.cpp
   PassManagerBuilder.cpp
   PruneEH.cpp
diff --git a/llvm/lib/Transforms/IPO/IPO.cpp b/llvm/lib/Transforms/IPO/IPO.cpp
--- a/llvm/lib/Transforms/IPO/IPO.cpp
+++ b/llvm/lib/Transforms/IPO/IPO.cpp
@@ -35,6 +35,7 @@
   initializeGlobalSplitPass(Registry);
   initializeHotColdSplittingLegacyPassPass(Registry);
   initializeIPCPPass(Registry);
+  initializeOpenMPOptLegacyPass(Registry);
   initializeAlwaysInlinerLegacyPassPass(Registry);
   initializeSimpleInlinerPass(Registry);
   initializeInferFunctionAttrsLegacyPassPass(Registry);
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -0,0 +1,788 @@
+//===-- IPO/OpenMPOpt.cpp - Collection of OpenMP specific optimizations ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "openmp-opt"
+
+static cl::opt<bool> BuildCustomStateMachines(
+    "openmp-opt-build-custom-state-machines", cl::ZeroOrMore,
+    cl::desc("Build custom state machines for non-SPMD kernels."), cl::Hidden,
+    cl::init(true));
+
+static cl::opt<bool> PerformOpenMPSIMDIZATION("openmp-opt-simdization",
+                                              cl::ZeroOrMore,
+                                              cl::desc("SPMD kernels."),
+                                              cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ForceOpenMPSIMDIZATION("openmp-opt-simdization-force",
+                                            cl::ZeroOrMore,
+                                            cl::desc("SPMD kernels force."),
+                                            cl::Hidden, cl::init(false));
+
+STATISTIC(NumKernelsConvertedToSPMD,
+          "Number of GPU kernels converted to SPMD mode");
+STATISTIC(NumParallelCallsConvertedToSPMD,
+          "Number of parallel GPU kernel regions converted to SPMD mode");
+STATISTIC(NumCustomStateMachinesCreated,
+          "Number of custom GPU kernel non-SPMD mode state machines created");
+STATISTIC(NumCustomStateMachinesNoFallback,
+          "Number of custom GPU kernel non-SPMD mode state machines without "
+          "fallback");
+STATISTIC(NumRuntimeFunctionCallSitesInlinedLate,
+          "Number of runtime function call sites inlined late");
+
+enum {
+  ARG_INIT_USE_SPMD_MODE = 0,
+  ARG_INIT_REQUIRES_OMP_RUNTIME = 1,
+  ARG_INIT_USE_STATE_MACHINE = 2,
+  ARG_INIT_REQUIRES_DATA_SHARING = 3
+};
+
+enum {
+  ARG_DEINIT_USE_SPMD_MODE = 0,
+  ARG_DEINIT_REQUIRES_OMP_RUNTIME = 1
+};
+
+enum {
+  ARG_PARALLEL_USE_SPMD_MODE = 0,
+  ARG_PARALLEL_REQUIRES_OMP_RUNTIME = 1,
+  ARG_PARALLEL_WORK_FUNCTION = 2,
+  ARG_PARALLEL_SHARED_VARS = 3,
+  ARG_PARALLEL_SHARED_VARS_BYTES = 4
+};
+
+#define KNOWN_FUNCTIONS()                                                      \
+  KF(FID_OMP_GET_TEAM_NUM, "omp_get_team_num", 0)                              \
+  KF(FID_OMP_GET_NUM_TEAMS, "omp_get_num_teams", 0)                            \
+  KF(FID_OMP_GET_THREAD_NUM, "omp_get_thread_num", 0)                          \
+  KF(FID_OMP_GET_NUM_THREADS, "omp_get_num_threads", 0)                        \
+  KF(FID_OMP_SET_NUM_THREADS, "omp_set_num_threads", 1)                        \
+  KF(FID_KMPC_TREGION_KERNEL_INIT, "__kmpc_target_region_kernel_init", 4)      \
+  KF(FID_KMPC_TREGION_KERNEL_DEINIT, "__kmpc_target_region_kernel_deinit", 2)  \
+  KF(FID_KMPC_TREGION_KERNEL_PARALLEL, "__kmpc_target_region_kernel_parallel", \
+     8)                                                                        \
+  KF(FID_KMPC_FOR_STATIC_INIT_4, "__kmpc_for_static_init_4", 9)                \
+  KF(FID_KMPC_FOR_STATIC_FINI, "__kmpc_for_static_fini", 2)                    \
+  KF(FID_KMPC_GLOBAL_THREAD_NUM, "__kmpc_global_thread_num", 1)                \
+  KF(FID_KMPC_DISPATCH_INIT_4, "__kmpc_dispatch_init_4", 7)                    \
+  KF(FID_KMPC_DISPATCH_NEXT_4, "__kmpc_dispatch_next_4", 6)
+
+enum FunctionID {
+  #define KF(NAME, STR, NARGS) NAME,
+  KNOWN_FUNCTIONS()
+  #undef KF
+  FID_KMPC_UNKNOWN,
+  FID_OMP_UNKOWN,
+  FID_NVVM_UNKNOWN,
+  FID_LLVM_UNKNOWN,
+  FID_UNKNOWN
+};
+
+static FunctionID getFunctionID(Function *F) {
+  if (!F) return FID_UNKNOWN;
+#define KF(NAME, STR, NARGS) .Case(STR, NAME)
+  return StringSwitch<FunctionID>(F->getName()) KNOWN_FUNCTIONS()
+      .StartsWith("__kmpc_", FID_KMPC_UNKNOWN)
+      .StartsWith("omp_", FID_OMP_UNKOWN)
+      .StartsWith("llvm.nvvm.", FID_NVVM_UNKNOWN)
+      .StartsWith("llvm.", FID_LLVM_UNKNOWN)
+      .Default(FID_UNKNOWN);
+#undef KF
+}
+
+struct GuardGenerator {
+
+  GuardGenerator(Module &M) : M(M) {}
+
+  bool guardAllSideEffects(SmallVectorImpl<Instruction *> &SideEffectInst) {
+    bool Guarded = true;
+    const DataLayout &DL = M.getDataLayout();
+
+    SmallVector<Instruction *, 16> UnguardedSideEffectInst;
+    for (Instruction *I : SideEffectInst) {
+      if (CallInst *CI = dyn_cast<CallInst>(I)) {
+        if (getFunctionID(CI->getCalledFunction()) != FID_UNKNOWN)
+          continue;
+      } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+        if (isa<AllocaInst>(
+                SI->getPointerOperand()->stripInBoundsConstantOffsets()))
+          continue;
+      } else if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+        if (isSafeToLoadUnconditionally(LI->getPointerOperand(),
+                                        LI->getAlignment(), DL))
+          continue;
+      }
+      LLVM_DEBUG(dbgs() << "Non-SPMD side effect found: " << *I << "\n");
+      UnguardedSideEffectInst.push_back(I);
+    }
+
+    return UnguardedSideEffectInst.empty();
+  }
+
+private:
+  Module &M;
+};
+static Type *getOrCreateStructIdentTypePtr(Module &M) {
+  // TODO create if not present!
+  return M.getTypeByName("struct.ident_t")->getPointerTo();
+}
+
+// TODO: Simplify function declaration
+static Function *getOrCreateFn(Type *RT, const char *Name, Module &M) {
+  Function *Fn = M.getFunction(Name);
+  if (!Fn) {
+    FunctionType *FType = FunctionType::get(RT, {}, false);
+    Fn =
+        Function::Create(FType, llvm::GlobalVariable::ExternalLinkage, Name, M);
+  }
+  return Fn;
+}
+static Function *getOrCreateFn(Type *RT, Type *T0, Type *T1, const char *Name,
+                               Module &M) {
+  Function *Fn = M.getFunction(Name);
+  if (!Fn) {
+    FunctionType *FType = FunctionType::get(RT, {T0, T1}, false);
+    Fn =
+        Function::Create(FType, llvm::GlobalVariable::ExternalLinkage, Name, M);
+  }
+  return Fn;
+}
+static Function *getOrCreateSimpleSPMDBarrierFn(Module &M) {
+  static const char *Name = "__kmpc_barrier_simple_spmd";
+  Function *Fn = M.getFunction(Name);
+  if (!Fn) {
+    LLVMContext &Ctx = M.getContext();
+    FunctionType *FType = FunctionType::get(
+        Type::getVoidTy(Ctx),
+        {getOrCreateStructIdentTypePtr(M), Type::getInt32Ty(Ctx)}, false);
+    Fn =
+        Function::Create(FType, llvm::GlobalVariable::ExternalLinkage, Name, M);
+  }
+  return Fn;
+}
+
+static bool isSPMDRelatedRTCall(Instruction *I) {
+  CallInst *CI = dyn_cast<CallInst>(I);
+  if (!CI || !CI->getCalledFunction())
+    return false;
+
+  return StringSwitch<bool>(CI->getCalledFunction()->getName())
+      .Case("__kmpc_target_region_kernel_init", true)
+      .Case("__kmpc_target_region_kernel_parallel", true)
+      .Case("__kmpc_target_region_kernel_deinit", true)
+      .Default(false);
+}
+
+static bool inlineRTCalls(Module &M, SmallVectorImpl<CallInst *> &RTCalls) {
+  bool Changed = false;
+
+  InlineFunctionInfo IFI;
+  for (CallInst *CI : RTCalls) {
+    CI->getCalledFunction()->removeFnAttr(Attribute::NoInline);
+
+    InlineResult IR = InlineFunction(CI, IFI,
+                                     /* AAResults* */ nullptr,
+                                     /* InsertLifetime */ true);
+    Changed |= IR;
+
+    NumRuntimeFunctionCallSitesInlinedLate += IR;
+  }
+
+  return Changed;
+}
+
+namespace {
+struct ParallelRegion {
+
+  ParallelRegion(CallInst &CI) : CI(CI) {
+    assert(CI.getCalledFunction()->getName() ==
+           "__kmpc_target_region_kernel_parallel");
+    assert(CI.getCalledFunction()->arg_size() == 5);
+  }
+
+private:
+
+  CallInst &CI;
+};
+
+
+struct KernelTy {
+
+  KernelTy(Function *KernelFn, bool IsEntry = true)
+      : KernelFn(*KernelFn), IsEntry(IsEntry) {}
+
+  bool optimize();
+
+private:
+
+  bool mayInvokeUnknownParallelRegion() const { return MayInvokeUnknownParallelRegion; }
+  bool analyze();
+
+  SmallVector<Instruction *, 16> SideEffectInst;
+  SmallVector<Instruction *, 16> ReadOnlyInst;
+
+  SmallVector<CallInst *, 2> ContainedCalls[FID_UNKNOWN + 1];
+
+  bool MayInvokeUnknownParallelRegion = false;
+
+  const bool IsEntry;
+
+  Function &KernelFn;
+};
+
+
+bool KernelTy::analyze() {
+  LLVM_DEBUG(KernelFn.getParent()->dump());
+  LLVM_DEBUG(dbgs() << "Analyze kernel function: " << KernelFn.getName() << "\n");
+
+  for (Instruction &I : instructions(&KernelFn)) {
+
+    // Handle non-side-effect instructions first. These will not write or throw
+    // which makes reading the only interesting property left.
+    if (!I.mayHaveSideEffects()) {
+      if (I.mayReadFromMemory()) {
+        LLVM_DEBUG(dbgs() << "- ro: " << I << "\n");
+        ReadOnlyInst.push_back(&I);
+      }
+      continue;
+    }
+
+    // Now we handle all non-call instructions.
+    if (!isa<CallInst>(I)) {
+      LLVM_DEBUG(dbgs() << "- se: " << I << "\n");
+      SideEffectInst.push_back(&I);
+      continue;
+    }
+
+    CallInst &CI = cast<CallInst>(I);
+    Function *Callee = CI.getCalledFunction();
+
+    FunctionID ID = getFunctionID(Callee);
+
+    // Check that know functions have the right number of arguments early on.
+    // Additionally provide debug output based on the function ID.
+    switch (ID) {
+#define KF(NAME, STR, NARGS)                                                   \
+  case NAME:                                                                   \
+    LLVM_DEBUG(dbgs() << "- known call [" << CI.getNumArgOperands() << "/"     \
+                      << NARGS << "]: " << I << "\n");                         \
+    if (CI.getNumArgOperands() != NARGS)                                       \
+      ID = FID_UNKNOWN;                                                        \
+    break;
+      KNOWN_FUNCTIONS()
+#undef KF
+      LLVM_DEBUG(dbgs() << "- known call call: " << I << "\n");
+      break;
+    case FID_KMPC_UNKNOWN:
+      LLVM_DEBUG(dbgs() << "- unknown __kmpc_* call: " << I << "\n");
+      break;
+    case FID_OMP_UNKOWN:
+      LLVM_DEBUG(dbgs() << "- unknown omp_* call: " << I << "\n");
+      break;
+    case FID_NVVM_UNKNOWN:
+      LLVM_DEBUG(dbgs() << "- unknown llvm.nvvm.* call: " << I << "\n");
+      break;
+    case FID_LLVM_UNKNOWN:
+      LLVM_DEBUG(dbgs() << "- unknown llvm.* call: " << I << "\n");
+      break;
+    case FID_UNKNOWN:
+      LLVM_DEBUG(dbgs() << "- unknown call: " << I << "\n");
+      break;
+    }
+
+    ContainedCalls[ID].push_back(&CI);
+  }
+
+  // Verify we have at least the calls we expect to see in the right places.
+  if (ContainedCalls[FID_KMPC_TREGION_KERNEL_INIT].size() != 1 ||
+      ContainedCalls[FID_KMPC_TREGION_KERNEL_DEINIT].size() != 1 ||
+      ContainedCalls[FID_KMPC_TREGION_KERNEL_INIT].front()->getParent() !=
+          &KernelFn.getEntryBlock()) {
+    LLVM_DEBUG(dbgs() << "- malformed kernel: [#Init: "
+                      << ContainedCalls[FID_KMPC_TREGION_KERNEL_INIT].size()
+                      << "][#DeInit: "
+                      << ContainedCalls[FID_KMPC_TREGION_KERNEL_DEINIT].size()
+                      << "]\n");
+    return false;
+  }
+
+  return true;
+}
+
+bool KernelTy::optimize() {
+  bool Changed = false;
+
+  // First analyze the code. If that fails for some reason we bail out early.
+  if (!analyze())
+    return Changed;
+
+  if (!ForceOpenMPSIMDIZATION) {
+    if (!ContainedCalls[FID_UNKNOWN].empty())
+      return Changed;
+
+    GuardGenerator GG(*KernelFn.getParent());
+    if (!GG.guardAllSideEffects(SideEffectInst))
+      return Changed;
+
+    LLVM_DEBUG(dbgs() << "Transformation to SPMD OK\n");
+
+    if (!PerformOpenMPSIMDIZATION)
+      return Changed;
+  }
+
+  Function *SimpleBarrierFn =
+      getOrCreateSimpleSPMDBarrierFn(*KernelFn.getParent());
+  auto AI = SimpleBarrierFn->arg_begin();
+
+  Type *FlagTy = ContainedCalls[FID_KMPC_TREGION_KERNEL_INIT][0]
+                     ->getArgOperand(0)
+                     ->getType();
+  Constant *SPMDFlag = ConstantInt::getTrue(FlagTy);
+
+  ContainedCalls[FID_KMPC_TREGION_KERNEL_INIT][0]->setArgOperand(0, SPMDFlag);
+  ContainedCalls[FID_KMPC_TREGION_KERNEL_DEINIT][0]->setArgOperand(0, SPMDFlag);
+  for (CallInst *ParCI : ContainedCalls[FID_KMPC_TREGION_KERNEL_PARALLEL]) {
+    ParCI->setArgOperand(0, SPMDFlag);
+    CallInst::Create(SimpleBarrierFn,
+                     {Constant::getNullValue((AI++)->getType()),
+                      Constant::getNullValue((AI)->getType())},
+                     "", ParCI->getNextNode());
+  }
+
+  GlobalVariable *ExecMode = KernelFn.getParent()->getGlobalVariable(
+      (KernelFn.getName() + "_exec_mode").str());
+  assert(ExecMode &&
+          "Assumed to find an execution mode hint among the globals");
+  assert(ExecMode->getInitializer()->isOneValue() &&
+          "Assumed target_region execution mode prior to 'SPMD'-zation");
+  ExecMode->setInitializer(
+      Constant::getNullValue(ExecMode->getInitializer()->getType()));
+
+  NumKernelsConvertedToSPMD++;
+  NumParallelCallsConvertedToSPMD +=
+      ContainedCalls[FID_KMPC_TREGION_KERNEL_PARALLEL].size();
+
+  //simplifySequentialParallelism();
+
+  return Changed;
+}
+
+}
+
+#if 0
+static bool
+createCustomStateMachine(Module &M,
+                         SmallVectorImpl<Instruction *> &SideEffectInst,
+                         SmallVectorImpl<CallInst *> &RTCalls) {
+
+  // TODO use reachability to eliminate the loop and if-cascade
+
+  SmallVector<CallInst *, 8> ParallelRTCalls;
+  CallInst *InitCI = nullptr;
+  for (CallInst *CI : RTCalls) {
+    const auto &CalleeName = CI->getCalledFunction()->getName();
+    if (CalleeName.equals("__kmpc_target_region_kernel_init")) {
+      assert(!InitCI && "Found multiple kernel init calls!");
+      InitCI = CI;
+      continue;
+    }
+    if (CalleeName.equals("__kmpc_target_region_kernel_parallel")) {
+      ParallelRTCalls.push_back(CI);
+    }
+  }
+
+  assert(InitCI && "No kernel init call found");
+
+  // TODO: Warn or eliminate the offloading if no parallel regions are present.
+
+  ConstantInt *UseSM = dyn_cast<ConstantInt>(InitCI->getArgOperand(1));
+  if (!UseSM || !UseSM->isOne()) {
+    LLVM_DEBUG(dbgs() << "No custom state machine because of " << *InitCI
+                      << "\n");
+    return false;
+  }
+
+  InitCI->setName("thread_kind");
+  LLVMContext &Ctx = InitCI->getContext();
+  Function *KernelFn = InitCI->getFunction();
+  Type *VoidTy = Type::getVoidTy(Ctx);
+  Type *BoolTy = Type::getInt1Ty(Ctx);
+  Type *Int16Ty = Type::getInt16Ty(Ctx);
+  Type *ChoiceTy = InitCI->getType();
+  Type *VoidPtrTy = Type::getInt8PtrTy(Ctx);
+  AllocaInst *WorkFnAI = new AllocaInst(VoidPtrTy, 0, "work_fn.addr",
+                                        &KernelFn->getEntryBlock().front());
+
+  Instruction *IP = InitCI->getNextNode();
+  Constant *ConstZero = ConstantInt::getSigned(BoolTy, 0);
+  InitCI->setArgOperand(1, ConstZero);
+  Constant *ConstMOne = ConstantInt::getSigned(ChoiceTy, -1);
+  Instruction *WorkerCnd =
+      new ICmpInst(IP, ICmpInst::ICMP_EQ, InitCI, ConstMOne, "is_worker");
+
+  Instruction *WaitTI = SplitBlockAndInsertIfThen(WorkerCnd, IP, false);
+  BasicBlock *WaitBB = WaitTI->getParent();
+  WaitBB->setName("worker.wait");
+  IP->getParent()->setName("master_check");
+
+  Instruction *MasterCheckTI = IP->getParent()->getTerminator();
+  assert(MasterCheckTI->getNumSuccessors() == 2);
+  assert(WaitTI->getNumSuccessors() == 1);
+
+  BasicBlock *FinalBB = WaitTI->getSuccessor(0);
+  if (MasterCheckTI->getSuccessor(0)->size() == 1 &&
+      isa<ReturnInst>(MasterCheckTI->getSuccessor(0)->getTerminator()))
+    FinalBB = MasterCheckTI->getSuccessor(0);
+  else if (MasterCheckTI->getSuccessor(1)->size() == 1 &&
+      isa<ReturnInst>(MasterCheckTI->getSuccessor(1)->getTerminator()))
+    FinalBB = MasterCheckTI->getSuccessor(1);
+
+  Function *SimpleBarrierFn = getOrCreateSimpleSPMDBarrierFn(M);
+
+  auto AI = SimpleBarrierFn->arg_begin();
+  Instruction *BarrierCall =
+      CallInst::Create(SimpleBarrierFn,
+                       {Constant::getNullValue((AI++)->getType()),
+                        Constant::getNullValue((AI)->getType())},
+                       "", WaitTI);
+
+  Function *KernelParallelFn = getOrCreateFn(
+      BoolTy, VoidPtrTy->getPointerTo(), Int16Ty, "__kmpc_kernel_parallel", M);
+
+  Value *RequiresOMPRuntime = CastInst::CreateZExtOrBitCast(
+      InitCI->getArgOperand(2), Int16Ty, "", WaitTI);
+  Instruction *ActiveCnd = CallInst::Create(
+      KernelParallelFn, {WorkFnAI, RequiresOMPRuntime}, "is_active", WaitTI);
+
+  Type *WorkFnPrototype =
+      FunctionType::get(VoidTy, {VoidPtrTy}, false)->getPointerTo();
+  Value *WorkFnAICast = BitCastInst::CreatePointerBitCastOrAddrSpaceCast(
+      WorkFnAI, WorkFnPrototype->getPointerTo(), "Work_fn.addr_cast", WaitTI);
+  Value *WorkFn = new LoadInst(WorkFnAICast, "work_fn", WaitTI);
+
+  Instruction *WorkFnCnd =
+      new ICmpInst(WaitTI, ICmpInst::ICMP_EQ, WorkFn,
+                   Constant::getNullValue(WorkFn->getType()), "no_work");
+
+  Instruction *FinishedTI = SplitBlockAndInsertIfThen(WorkFnCnd, WaitTI, false);
+  FinishedTI->getParent()->setName("worker.finished");
+  WaitTI->getParent()->setName("worker.active_check");
+
+  Instruction *ActiveTI = SplitBlockAndInsertIfThen(ActiveCnd, WaitTI, false);
+  ActiveTI->getParent()->setName("worker.active");
+  WaitTI->getParent()->setName("worker.inactive");
+
+  Function *KernelGetSharedVars = getOrCreateFn(
+      VoidPtrTy, "__kmpc_target_region_kernel_get_shared_memory", M);
+  Value *SharedVars = CallInst::Create(KernelGetSharedVars, "", ActiveTI);
+
+  BasicBlock *ExecuteBB = ActiveTI->getParent();
+  BasicBlock *ParallelEndBB = SplitBlock(ExecuteBB, ActiveTI);
+  ParallelEndBB->setName("worker.parallel_end");
+
+  Function *KernelEndParallelFn =
+      getOrCreateFn(VoidTy, "__kmpc_kernel_end_parallel", M);
+  CallInst::Create(KernelEndParallelFn, "", ActiveTI);
+
+  // A fallback is required if we might not see all parallel regions
+  // (__kmpc_target_region_kernel_parallel calls). This could be the case if
+  // there is an unknown function call with side effects in the target region.
+  bool RequiresFallback = std::any_of(
+      SideEffectInst.begin(), SideEffectInst.end(), [](Instruction *I) {
+        if (isa<CallInst>(I) && I->mayHaveSideEffects() &&
+                !isIgnoredCall(I)) {
+          LLVM_DEBUG(dbgs() << "Require SM Fallback due to side effect: " << *I << "\n");
+          return true;
+        }
+        return false;
+      });
+
+  auto MayContainParallelKernelCall = [](Function &F) {
+    for (Instruction &I : instructions(F)) {
+      if (isa<IntrinsicInst>(I))
+        continue;
+      if (!isa<CallInst>(I) || !I.mayHaveSideEffects())
+        continue;
+      if (CallInst *CI = dyn_cast<CallInst>(&I))
+        if (CI->isInlineAsm())
+          continue;
+      if (isIgnoredCall(&I))
+        continue;
+      if (isSPMDRelatedRTCall(&I) &&
+          !cast<CallInst>(I).getCalledFunction()->getName().equals(
+              "__kmpc_target_region_kernel_parallel"))
+        continue;
+      LLVM_DEBUG(dbgs() << "Require SM Fallback due to parallel function inst: "
+                        << I << "\n");
+      return true;
+    }
+    return false;
+  };
+
+  IP = ExecuteBB->getTerminator();
+  for (CallInst *ParCI : ParallelRTCalls) {
+    Function *ParFn =
+        dyn_cast<Function>(ParCI->getArgOperand(1)->stripPointerCasts());
+    // We also need to check the parallel regions (behind the
+    // __kmpc_target_region_kernel_parallel calls).
+    if (!ParFn) {
+      LLVM_DEBUG(dbgs() << "Require SM Fallback due to unknown parallel function\n");
+      RequiresFallback = true;
+      continue;
+    }
+    RequiresFallback |= MayContainParallelKernelCall(*ParFn);
+
+    Value *ParFnCnd =
+        new ICmpInst(IP, ICmpInst::ICMP_EQ, WorkFn, ParFn, "par_fn_check");
+    Instruction *ParFnTI = SplitBlockAndInsertIfThen(ParFnCnd, IP, false);
+    IP->getParent()->setName("worker.check.next");
+    ParFnTI->getParent()->setName("worker.execute." + ParFn->getName());
+    CallInst::Create(ParFn, {SharedVars}, "", ParFnTI);
+    ParFnTI->setSuccessor(0, ParallelEndBB);
+  }
+
+  if (RequiresFallback)
+    CallInst::Create(WorkFn, {SharedVars}, "", IP);
+
+  BarrierCall->clone()->insertBefore(WaitTI);
+
+  FinishedTI->setSuccessor(0, FinalBB);
+  WaitTI->setSuccessor(0, WaitBB);
+  // TODO: Add the new loop to LI!
+
+  NumCustomStateMachinesCreated++;
+  NumCustomStateMachinesNoFallback += !RequiresFallback;
+  return true;
+}
+
+static void collectNonParallelGlobalSideEffectsInKernel(
+    CallInst *CInst, SmallVectorImpl<Instruction *> &SideEffectInst,
+    SmallVectorImpl<CallInst *> &RTCalls) {
+
+  SmallVector<Instruction *, 32> Worklist;
+  SmallPtrSet<BasicBlock *, 32> Visited;
+
+  Worklist.push_back(CInst);
+  while (!Worklist.empty()) {
+    Instruction *I = Worklist.pop_back_val();
+
+    if (isSPMDRelatedRTCall(I))
+      RTCalls.push_back(cast<CallInst>(I));
+    else if (I->mayHaveSideEffects() || I->mayReadFromMemory())
+      SideEffectInst.push_back(I);
+
+    if (!I->isTerminator()) {
+      Worklist.push_back(I->getNextNode());
+      continue;
+    }
+
+    for (BasicBlock *SuccBB : successors(I))
+      if (Visited.insert(SuccBB).second)
+        Worklist.push_back(&SuccBB->front());
+  }
+}
+
+
+static bool convertGPUKernelsToSPMD(Module &M) {
+  bool Changed = false;
+
+  GuardGenerator GuardGen(M);
+  Function *target_regionKernelInitFn =
+      M.getFunction("__kmpc_target_region_kernel_init");
+
+  // If the kernel init function is not present or unused, we are done.
+  if (!target_regionKernelInitFn ||
+      target_regionKernelInitFn->getNumUses() == 0)
+    return Changed;
+  if (target_regionKernelInitFn->arg_size() != 4)
+    return Changed;
+
+  SmallVector<CallInst *, 16> AllRTCalls;
+
+  LLVMContext &Ctx = M.getContext();
+  for (const Use &U : target_regionKernelInitFn->uses()) {
+    CallSite CS(U.getUser());
+
+    // Filter out non-callee uses.
+    if (!CS || !CS.isCallee(&U))
+      continue;
+
+    // Filter out non call-inst uses.
+    if (!isa<CallInst>(CS.getInstruction()))
+      continue;
+
+    auto *CInst = cast<CallInst>(CS.getInstruction());
+
+    // Filter out all but explicit non-SPMD cases.
+    Value *IsSPMDConstVal = CInst->getArgOperand(0);
+    if (!isa<ConstantInt>(IsSPMDConstVal) ||
+        !cast<ConstantInt>(IsSPMDConstVal)->isZero())
+      continue;
+
+    Function *KernelFn = CInst->getFunction();
+
+    // For now we require the init call to be in the entry block, not strictly
+    // necessary but it makes things easier.
+    if (CInst->getParent() != &KernelFn->getEntryBlock())
+      continue;
+
+    // Traverse the kernel from the init to the deinit call and determine if
+    // there are any global side effects outside of parallel sections. If so,
+    // we cannot compute the kernel in SPMD mode (right now).
+    SmallVector<Instruction *, 16> SideEffectInst;
+    SmallVector<CallInst *, 16> RTCalls;
+    collectNonParallelGlobalSideEffectsInKernel(CInst, SideEffectInst, RTCalls);
+    if (!GuardGen.guardAllSideEffects(SideEffectInst)) {
+      CInst->getFunction()->dump();
+      if (BuildCustomStateMachines)
+        Changed |= createCustomStateMachine(M, SideEffectInst, RTCalls);
+      CInst->getFunction()->dump();
+      AllRTCalls.append(RTCalls.begin(), RTCalls.end());
+      continue;
+    }
+
+    ConstantInt *COne = ConstantInt::get(IntegerType::getInt1Ty(Ctx), 1);
+    for (CallInst *RTCall : RTCalls) {
+      if (RTCall->getCalledFunction()->getName().equals(
+              "__kmpc_target_region_kernel_parallel")) {
+        Value *Callee = RTCall->getArgOperand(1)->stripPointerCasts();
+        Value *Payload = RTCall->getArgOperand(2);
+        CallInst::Create(Callee, {Payload}, "", RTCall);
+        RTCall->eraseFromParent();
+        continue;
+      }
+
+      assert(RTCall->getArgOperand(0)->getType()->isIntegerTy(1) &&
+             "IsSPMD flag with bool type expected!");
+      assert(isa<ConstantInt>(IsSPMDConstVal) &&
+             "Constant IsSPMD flag expected!");
+      assert(cast<ConstantInt>(IsSPMDConstVal)->isZero() &&
+             "Consistent IsSPMD flags expected!");
+
+      RTCall->setArgOperand(0, COne);
+      AllRTCalls.push_back(RTCall);
+      continue;
+    }
+
+    GlobalVariable *ExecMode =
+        M.getGlobalVariable((KernelFn->getName() + "_exec_mode").str());
+    assert(ExecMode &&
+           "Assumed to find an execution mode hint among the globals");
+    assert(ExecMode->getInitializer()->isOneValue() &&
+           "Assumed target_region execution mode prior to 'SPMD'-zation");
+    ExecMode->setInitializer(
+        Constant::getNullValue(ExecMode->getInitializer()->getType()));
+
+    NumKernelsConvertedToSPMD++;
+
+    Changed = true;
+  }
+
+  Changed |= inlineRTCalls(M, AllRTCalls);
+  return Changed;
+}
+#endif
+
+namespace {
+
+template<class T>
+static void collectCallersOf(Module &M, StringRef Name,
+                             SmallVectorImpl<T> &Callers) {
+  Function *Callee = M.getFunction(Name);
+
+  // If the callee function is not present, we are done.
+  if (!Callee)
+    return;
+
+  // If it exists we check all users.
+  for (const Use &U : Callee->uses()) {
+    CallSite CS(U.getUser());
+
+    // Filter out non-callee uses and non-call uses.
+    if (!CS || !CS.isCallee(&U) || !isa<CallInst>(CS.getInstruction()))
+      continue;
+
+    Callers.emplace_back(T(CS.getCaller()));
+  }
+}
+
+/// OpenMPOpt - The interprocedural OpenMP optimization pass
+struct OpenMPOpt {
+
+  bool runOnModule(Module &M) {
+    bool Changed = false;
+
+    // Collect GPU kernels
+    collectCallersOf(M, "__kmpc_target_region_kernel_init", Kernel);
+
+    for (KernelTy &K : Kernel)
+      Changed |= K.optimize();
+
+    return Changed;
+  }
+
+private:
+
+  SmallVector<KernelTy, 4> Kernel;
+};
+
+struct OpenMPOptLegacy : public ModulePass {
+  static char ID; // Pass identification, replacement for typeid
+  OpenMPOpt OMPOpt;
+
+  OpenMPOptLegacy() : ModulePass(ID) {
+    initializeOpenMPOptLegacyPass(*PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<PostDominatorTreeWrapperPass>();
+  }
+
+  bool runOnModule(Module &M) override {
+    return OMPOpt.runOnModule(M);
+  }
+};
+} // namespace
+
+char OpenMPOptLegacy::ID = 0;
+
+INITIALIZE_PASS_BEGIN(OpenMPOptLegacy, "openmp-opt",
+                      "OpenMP specific optimizations",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
+INITIALIZE_PASS_END(OpenMPOptLegacy, "openmp-opt",
+                      "OpenMP specific optimizations",
+                    false, false)
+
+Pass *llvm::createOpenMPOptLegacyPass() { return new OpenMPOptLegacy(); }
diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
--- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -503,6 +503,14 @@
 
   // Infer attributes about declarations if possible.
   MPM.add(createInferFunctionAttrsLegacyPass());
+  MPM.add(createPostOrderFunctionAttrsLegacyPass());
+
+  // Promote any localized global vars.
+  MPM.add(createPromoteMemoryToRegisterPass());
+
+  // Try to perform OpenMP specific optimizations. This is a no-op if OpenMP
+  // runtime calls are not present in the module.
+  MPM.add(createOpenMPOptLegacyPass());
 
   addExtensionsToPM(EP_ModuleOptimizerEarly, MPM);
 
diff --git a/llvm/lib/Transforms/Scalar/SCCP.cpp b/llvm/lib/Transforms/Scalar/SCCP.cpp
--- a/llvm/lib/Transforms/Scalar/SCCP.cpp
+++ b/llvm/lib/Transforms/Scalar/SCCP.cpp
@@ -1161,6 +1161,11 @@
   Function *F = CS.getCalledFunction();
   Instruction *I = CS.getInstruction();
 
+  // If the call site is explicitly marked as "optnone" we should not try to
+  // optimize it. Mark it as overdefined to prevent optimization.
+  if (CS.hasFnAttr(Attribute::OptimizeNone))
+    (void)markOverdefined(I);
+
   if (auto *II = dyn_cast<IntrinsicInst>(I)) {
     if (II->getIntrinsicID() == Intrinsic::ssa_copy) {
       if (ValueState[I].isOverdefined())
@@ -1270,10 +1275,20 @@
   if (!TrackingIncomingArguments.empty() && TrackingIncomingArguments.count(F)){
     MarkBlockExecutable(&F->front());
 
+    bool IsOptNoneCS = CS.hasFnAttr(Attribute::OptimizeNone);
+
     // Propagate information from this call site into the callee.
     CallSite::arg_iterator CAI = CS.arg_begin();
     for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end();
          AI != E; ++AI, ++CAI) {
+
+      // If the call site is "optnone" we do not use information derived from the
+      // passed values.
+      if (IsOptNoneCS) {
+        markOverdefined(&*AI);
+        continue;
+      }
+
       // If this argument is byval, and if the function is not readonly, there
       // will be an implicit copy formed of the input aggregate.
       if (AI->hasByValAttr() && !F->onlyReadsMemory()) {
diff --git a/llvm/test/Other/opt-O2-pipeline.ll b/llvm/test/Other/opt-O2-pipeline.ll
--- a/llvm/test/Other/opt-O2-pipeline.ll
+++ b/llvm/test/Other/opt-O2-pipeline.ll
@@ -173,6 +173,7 @@
 ; CHECK-NEXT:         Lazy Block Frequency Analysis
 ; CHECK-NEXT:         Optimization Remark Emitter
 ; CHECK-NEXT:         Combine redundant instructions
+; CHECK-NEXT:     OpenMP specific optimizations
 ; CHECK-NEXT:     A No-Op Barrier Pass
 ; CHECK-NEXT:     Eliminate Available Externally Globals
 ; CHECK-NEXT:     CallGraph Construction
diff --git a/llvm/test/Other/opt-O3-pipeline.ll b/llvm/test/Other/opt-O3-pipeline.ll
--- a/llvm/test/Other/opt-O3-pipeline.ll
+++ b/llvm/test/Other/opt-O3-pipeline.ll
@@ -178,6 +178,7 @@
 ; CHECK-NEXT:         Lazy Block Frequency Analysis
 ; CHECK-NEXT:         Optimization Remark Emitter
 ; CHECK-NEXT:         Combine redundant instructions
+; CHECK-NEXT:     OpenMP specific optimizations
 ; CHECK-NEXT:     A No-Op Barrier Pass
 ; CHECK-NEXT:     Eliminate Available Externally Globals
 ; CHECK-NEXT:     CallGraph Construction
diff --git a/llvm/test/Other/opt-Os-pipeline.ll b/llvm/test/Other/opt-Os-pipeline.ll
--- a/llvm/test/Other/opt-Os-pipeline.ll
+++ b/llvm/test/Other/opt-Os-pipeline.ll
@@ -160,6 +160,7 @@
 ; CHECK-NEXT:         Lazy Block Frequency Analysis
 ; CHECK-NEXT:         Optimization Remark Emitter
 ; CHECK-NEXT:         Combine redundant instructions
+; CHECK-NEXT:     OpenMP specific optimizations
 ; CHECK-NEXT:     A No-Op Barrier Pass
 ; CHECK-NEXT:     Eliminate Available Externally Globals
 ; CHECK-NEXT:     CallGraph Construction
diff --git a/openmp/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake b/openmp/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake
--- a/openmp/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake
+++ b/openmp/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake
@@ -78,13 +78,14 @@
 
 # These flags are required to emit LLVM Bitcode. We check them together because
 # if any of them are not supported, there is no point in finding out which are.
-set(compiler_flags_required -emit-llvm -O1 --cuda-device-only --cuda-path=${CUDA_TOOLKIT_ROOT_DIR})
+set(compiler_flags_required -emit-llvm -std=c++11 -O1 --cuda-device-only --cuda-path=${CUDA_TOOLKIT_ROOT_DIR})
 set(compiler_flags_required_src "extern \"C\" __device__ int thread() { return threadIdx.x; }")
 check_bitcode_compilation(LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FLAGS_REQUIRED "${compiler_flags_required_src}" ${compiler_flags_required})
 
 # It makes no sense to continue given that the compiler doesn't support
 # emitting basic LLVM Bitcode
 if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FLAGS_REQUIRED)
+  message(ERROR "NO FLAG SUPPORT")
   return()
 endif()
 
@@ -101,6 +102,7 @@
   check_bitcode_compilation(LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FCUDA_RDC "${extern_device_shared_src}" ${compiler_flag_fcuda_rdc_full})
 
   if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FCUDA_RDC)
+    message(ERROR "NO FCUDA RDC")
     return()
   endif()
 
diff --git a/openmp/libomptarget/deviceRTLs/common/target_region.h b/openmp/libomptarget/deviceRTLs/common/target_region.h
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/deviceRTLs/common/target_region.h
@@ -0,0 +1,167 @@
+//===-- target_region.h --- Target region OpenMP devie runtime interface --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Target region interfaces are simple interfaces designed to allow middle-end
+// (=LLVM) passes to analyze and transform the code. To achieve good performance
+// it may be required to run the associated passes. However, implementations of
+// this interface shall always provide a correct implementation as close to the
+// user expected code as possible.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _DEVICERTL_COMMON_INTERFACES_H_
+#define _DEVICERTL_COMMON_INTERFACES_H_
+
+#ifndef EXTERN
+#define EXTERN
+#endif
+#ifndef CALLBACK
+#define CALLBACK(Callee, Payload0, Payload1)
+#endif
+
+/// The target region _kernel_ interface for GPUs
+///
+/// This deliberatly simple interface provides the middle-end (=LLVM) with
+/// easier means to reason about the semantic of the code and transform it as
+/// well. The runtime calls are therefore also desiged to carry sufficient
+/// information necessary for optimization.
+///
+///
+/// Intended usage:
+///
+/// \code
+/// void kernel(...) {
+///
+///   char ThreadKind = __kmpc_target_region_kernel_init(...);
+///
+///   if (ThreadKind == -1) {               //  actual worker thread
+///     if (!UsedLibraryStateMachine)
+///       user_state_machine();
+///     goto exit;
+///   } else if (ThreadKind == 0) {         // surplus worker thread
+///     goto exit;
+///   } else {                              //    team master thread
+///     goto user_code;
+///   }
+///
+/// user_code:
+///
+///   // User defined kernel code, parallel regions are replaced by
+///   // by __kmpc_target_region_kernel_parallel(...) calls.
+///
+///   // Fallthrough to de-initialization
+///
+/// deinit:
+///   __kmpc_target_region_kernel_deinit(...);
+///
+/// exit:
+///   /* exit the kernel */
+/// }
+/// \endcode
+///
+///
+///{
+
+/// Initialization
+///
+///
+/// In SPMD mode, all threads will execute their respective initialization
+/// routines.
+///
+/// In non-SPMD mode, team masters will invoke the initialization routines while
+/// the rest is considered a worker thread. Worker threads required for this
+/// target region will be trapped inside the function if \p UseStateMachine is
+/// true. Otherwise they will escape with a return value of -1
+///
+/// \param UseSPMDMode         Flag to indicate if execution is performed in
+///                            SPMD mode.
+/// \param RequiresOMPRuntime  Flag to indicate if the runtime is required and
+///                            needs to be initialized.
+/// \param UseStateMachine     Flag to indicate if the runtime state machine
+///                            should be used in non-SPMD mode.
+/// \param RequiresDataSharing Flag to indicate if there might be inter-thread
+///                            sharing which needs runtime support.
+///
+/// \return 1, always in SPMD mode, and in non-SPMD mode if the thread is the
+///            team master.
+///         0, in non-SPMD mode and the thread is a surplus worker that should
+///            not execute anything in the target region.
+///        -1, in non-SPMD mode and the thread is a required worker which:
+///             - finished work and should be terminated if \p UseStateMachine
+///               is true.
+///             - has not performed work and should be put in a user provied
+///               state machine (as defined above).
+///
+EXTERN int8_t __kmpc_target_region_kernel_init(bool UseSPMDMode,
+                                               bool RequiresOMPRuntime,
+                                               bool UseStateMachine,
+                                               bool RequiresDataSharing);
+
+/// De-Initialization
+///
+///
+/// In non-SPMD, this function releases the workers trapped in a state machine
+/// and also any memory dynamically allocated by the runtime.
+///
+/// \param UseSPMDMode        Flag to indicate if execution is performed in
+///                           SPMD mode.
+/// \param RequiredOMPRuntime Flag to indicate if the runtime was required and
+///                           is therefore initialized.
+///
+EXTERN void __kmpc_target_region_kernel_deinit(bool UseSPMDMode,
+                                               bool RequiredOMPRuntime);
+
+/// Generic type of a work function in the target region kernel interface. The
+/// two arguments are pointers to structures that contains the shared and
+/// firstprivate variables respectively. Since the layout and size was known at
+/// compile time, the front-end is expected to generate appropriate packing and
+/// unpacking code.
+typedef void (*ParallelWorkFnTy)(char * /* SharedValues */,
+                                 char * /* PrivateValues */);
+
+/// Enter a parallel region
+///
+///
+/// The parallel region is defined by \p ParallelWorkFn. The shared variables,
+/// \p SharedMemorySize bytes in total, start at \p SharedValues. The
+/// firstprivate variables, \p PrivateValuesBytes bytes in total, start at
+/// \p PrivateValues.
+///
+/// In SPMD mode, this function calls \p ParallelWorkFn with \p SharedValues and
+/// \p PrivateValues as arguments before it returns.
+///
+/// In non-SPMD mode, \p ParallelWorkFn, \p SharedValues, and \p PrivateValues
+/// are communicated to the workers before they are released from the state
+/// machine to run the code defined by \p ParallelWorkFn in parallel. This
+/// function will only return after all workers are finished.
+///
+/// \param UseSPMDMode        Flag to indicate if execution is performed in
+///                           SPMD mode.
+/// \param RequiredOMPRuntime Flag to indicate if the runtime was required and
+///                           is therefore initialized.
+/// \param ParallelWorkFn     The outlined code that is executed in parallel by
+///                           the threads in the team.
+/// \param SharedValues       A pointer to the location of all shared values.
+/// \param SharedValuesBytes  The total size of the shared values in bytes.
+/// \param PrivateValues      A pointer to the location of all private values.
+/// \param PrivateValuesBytes The total size of the private values in bytes.
+/// \param SharedMemPointers  Flag to indicate that the pointer \p SharedValues
+///                           and \p PrivateValues point into shared memory.
+///                           If this flag is true, it also requires that all
+///                           private values, if any, are stored directly after
+///                           the shared values.
+///
+CALLBACK(ParallelWorkFnTy, SharedValues, PrivateValues)
+EXTERN void __kmpc_target_region_kernel_parallel(
+    bool UseSPMDMode, bool RequiredOMPRuntime, ParallelWorkFnTy ParallelWorkFn,
+    char *SharedValues, uint16_t SharedValuesBytes, char *PrivateValues,
+    uint16_t PrivateValuesBytes, bool SharedMemPointers);
+
+///}
+
+#endif
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
--- a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
+++ b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
@@ -53,6 +53,7 @@
       src/reduction.cu
       src/sync.cu
       src/task.cu
+      src/target_region.cu
   )
 
   set(omp_data_objects src/omp_data.cu)
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
@@ -63,3 +63,9 @@
 // Data sharing related variables.
 ////////////////////////////////////////////////////////////////////////////////
 __device__ __shared__ omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs;
+
+////////////////////////////////////////////////////////////////////////////////
+/// Pointer to share memory between team threads in the target region interface.
+////////////////////////////////////////////////////////////////////////////////
+__device__ __shared__ target_region_shared_buffer _target_region_shared_memory;
+
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -101,6 +101,66 @@
   uint32_t nArgs;
 };
 
+/// Helper structure to manage the memory shared by the threads in a team.
+///
+/// Note: Only the team master is allowed to call non-const functions!
+struct target_region_shared_buffer {
+#define PRE_SHARED_BYTES 128
+
+  INLINE void init() {
+    _ptr = &_data[0];
+    _size = PRE_SHARED_BYTES;
+    _offset = 0;
+  }
+
+  /// Release any dynamic allocated memory.
+  INLINE void release() {
+    if (_size == PRE_SHARED_BYTES)
+      return;
+    SafeFree(_ptr, (char *)"free shared dynamic buffer");
+    init();
+  }
+
+  INLINE void set(char *ptr, size_t offset) {
+    release();
+    _ptr = ptr;
+    _offset = offset;
+  }
+
+  INLINE void resize(size_t size, size_t offset) {
+    _offset = offset;
+
+    if (size <= _size)
+      return;
+
+    if (_size != PRE_SHARED_BYTES)
+      SafeFree(_ptr, (char *)"free shared dynamic buffer");
+
+    _size = size;
+    _ptr = (char *)SafeMalloc(_size, (char *)"new shared buffer");
+  }
+
+  // Called by all threads.
+  INLINE char *begin() const { return _ptr; };
+  INLINE size_t size() const { return _size; };
+  INLINE size_t get_offset() const { return _offset; };
+
+private:
+  // Pre-allocated space that holds PRE_SHARED_BYTES many bytes.
+  char _data[PRE_SHARED_BYTES];
+
+  // Pointer to the currently used buffer.
+  char *_ptr;
+
+  // Size of the currently used buffer.
+  uint32_t _size;
+
+  // Offset into the currently used buffer.
+  uint32_t _offset;
+
+#undef PRE_SHARED_BYTES
+};
+
 extern __device__ __shared__ omptarget_nvptx_SharedArgs
     omptarget_nvptx_globalArgs;
 
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_region.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/target_region.cu
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_region.cu
@@ -0,0 +1,197 @@
+//===-- target_region.cu ---- CUDA impl. of the target region interface -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of the common target region interface.
+//
+//===----------------------------------------------------------------------===//
+
+// Include the native definitions first as certain defines might be needed in
+// the common interface definition below.
+#include "omptarget-nvptx.h"
+#include "interface.h"
+
+#include "../../common/target_region.h"
+
+/// The pointer used to share memory between team threads.
+extern __device__ __shared__ target_region_shared_buffer
+    _target_region_shared_memory;
+
+EXTERN char *__kmpc_target_region_kernel_get_shared_memory() {
+  return _target_region_shared_memory.begin();
+}
+EXTERN char *__kmpc_target_region_kernel_get_private_memory() {
+  return _target_region_shared_memory.begin() +
+         _target_region_shared_memory.get_offset();
+}
+
+/// Simple generic state machine for worker threads.
+INLINE static void
+__kmpc_target_region_state_machine(bool IsOMPRuntimeInitialized) {
+
+  do {
+    void *WorkFn = 0;
+
+    // Wait for the signal that we have a new work function.
+    __kmpc_barrier_simple_spmd(NULL, 0);
+
+    // Retrieve the work function from the runtime.
+    bool IsActive = __kmpc_kernel_parallel(&WorkFn, IsOMPRuntimeInitialized);
+
+    // If there is nothing more to do, break out of the state machine by
+    // returning to the caller.
+    if (!WorkFn)
+      return;
+
+    if (IsActive) {
+      char *SharedVars = __kmpc_target_region_kernel_get_shared_memory();
+      char *PrivateVars = __kmpc_target_region_kernel_get_private_memory();
+
+      ((ParallelWorkFnTy)WorkFn)(SharedVars, PrivateVars);
+
+      __kmpc_kernel_end_parallel();
+    }
+
+    __kmpc_barrier_simple_spmd(NULL, 0);
+
+  } while (true);
+}
+
+/// Filter threads into masters and workers. If \p UseStateMachine is true,
+/// required workers will enter a state machine through and be trapped there.
+/// Master and surplus worker threads will return from this function immediately
+/// while required workers will only return once there is no more work. The
+/// return value indicates if the thread is a master (1), a surplus worker (0),
+/// or a finished required worker released from the state machine (-1).
+INLINE static int8_t
+__kmpc_target_region_thread_filter(unsigned ThreadLimit, bool UseStateMachine,
+                                   bool IsOMPRuntimeInitialized) {
+
+  unsigned TId = GetThreadIdInBlock();
+  bool IsWorker = TId < ThreadLimit;
+
+  if (IsWorker) {
+    if (UseStateMachine)
+      __kmpc_target_region_state_machine(IsOMPRuntimeInitialized);
+    return -1;
+  }
+
+  return TId == GetMasterThreadID();
+}
+
+EXTERN int8_t __kmpc_target_region_kernel_init(bool UseSPMDMode,
+                                               bool UseStateMachine,
+                                               bool RequiresOMPRuntime,
+                                               bool RequiresDataSharing) {
+  unsigned NumThreads = GetNumberOfThreadsInBlock();
+
+  // Handle the SPMD case first.
+  if (UseSPMDMode) {
+
+    __kmpc_spmd_kernel_init(NumThreads, RequiresOMPRuntime,
+                            RequiresDataSharing);
+
+    if (RequiresDataSharing)
+      __kmpc_data_sharing_init_stack_spmd();
+
+    return 1;
+  }
+
+  // Reserve one WARP in non-SPMD mode for the masters.
+  unsigned ThreadLimit = NumThreads - WARPSIZE;
+  int8_t FilterVal = __kmpc_target_region_thread_filter(
+      ThreadLimit, UseStateMachine, RequiresOMPRuntime);
+
+  // If the filter returns 1 the executing thread is a team master which will
+  // initialize the kernel in the following.
+  if (FilterVal == 1) {
+    __kmpc_kernel_init(ThreadLimit, RequiresOMPRuntime);
+    __kmpc_data_sharing_init_stack();
+    _target_region_shared_memory.init();
+  }
+
+  return FilterVal;
+}
+
+EXTERN void __kmpc_target_region_kernel_deinit(bool UseSPMDMode,
+                                               bool RequiredOMPRuntime) {
+  // Handle the SPMD case first.
+  if (UseSPMDMode) {
+    __kmpc_spmd_kernel_deinit_v2(RequiredOMPRuntime);
+    return;
+  }
+
+  __kmpc_kernel_deinit(RequiredOMPRuntime);
+
+  // Barrier to terminate worker threads.
+  __kmpc_barrier_simple_spmd(NULL, 0);
+
+  // Release any dynamically allocated memory used for sharing.
+  _target_region_shared_memory.release();
+}
+
+EXTERN void __kmpc_target_region_kernel_parallel(
+    bool UseSPMDMode, bool RequiredOMPRuntime, ParallelWorkFnTy ParallelWorkFn,
+    char *SharedVars, uint16_t SharedVarsBytes, char *PrivateVars,
+    uint16_t PrivateVarsBytes, bool SharedMemPointers) {
+
+  if (UseSPMDMode) {
+    ParallelWorkFn(SharedVars, PrivateVars);
+    return;
+  }
+
+  if (SharedMemPointers) {
+    // If shared memory pointers are used the user guarantees that any private
+    // variables, if any, are stored directly after the shared ones in memory
+    // and that this memory can be accessed by all the threads. In that case,
+    // we do not need to copy memory around but simply use the provided
+    // locations.
+
+    _target_region_shared_memory.set(SharedVars, SharedVarsBytes);
+
+  } else {
+
+    size_t BytesToCopy = SharedVarsBytes + PrivateVarsBytes;
+    if (BytesToCopy) {
+      // Resize the shared memory to be able to hold the data which is required
+      // to be in shared memory. Also set the offset to the beginning to the
+      // private variables.
+      _target_region_shared_memory.resize(BytesToCopy, SharedVarsBytes);
+
+      // Copy the shared and private variables into shared memory.
+      char *SVMemory = __kmpc_target_region_kernel_get_shared_memory();
+      char *PVMemory = __kmpc_target_region_kernel_get_private_memory();
+      memcpy(SVMemory, SharedVars, SharedVarsBytes);
+      memcpy(PVMemory, PrivateVars, PrivateVarsBytes);
+    }
+  }
+
+  // TODO: It seems we could store the work function in the same shared space
+  // as the rest of the variables above.
+  //
+  // Initialize the parallel work, e.g., make sure the work function is known.
+  __kmpc_kernel_prepare_parallel((void *)ParallelWorkFn, RequiredOMPRuntime);
+
+  // TODO: It is odd that we call the *_spmd version in non-SPMD mode here.
+  //
+  // Activate workers. This barrier is used by the master to signal
+  // work for the workers.
+  __kmpc_barrier_simple_spmd(NULL, 0);
+
+  // OpenMP [2.5, Parallel Construct, p.49]
+  // There is an implied barrier at the end of a parallel region. After the
+  // end of a parallel region, only the master thread of the team resumes
+  // execution of the enclosing task region.
+  //
+  // The master waits at this barrier until all workers are done.
+  __kmpc_barrier_simple_spmd(NULL, 0);
+
+  // Update the shared variables if necessary.
+  if (!SharedPointers && SharedVarsBytes)
+    memcpy(SharedVars, __kmpc_target_region_kernel_get_shared_memory(),
+           SharedVarsBytes);
+}