diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeTRegion.h b/clang/lib/CodeGen/CGOpenMPRuntimeTRegion.h
new file mode 100644
--- /dev/null
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeTRegion.h
@@ -0,0 +1,260 @@
+//===-- CGOpenMPRuntimeTRegion.h --- OpenMP RT TRegion interface codegen --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Code generation interface for OpenMP target offloading through the generic
+// Target Region (TRegion) interface.
+//
+// See openmp/libomptarget/deviceRTLs/common/target_region.h for further
+// information on the interface functions and their intended use.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMETREGION_H
+#define LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMETREGION_H
+
+#include "CGOpenMPRuntime.h"
+#include "llvm/ADT/SmallBitVector.h"
+
+namespace clang {
+namespace CodeGen {
+
+class CGOpenMPRuntimeTRegion : public CGOpenMPRuntime {
+  // TODO: The target region interface only covers kernel code for now. This
+  //       codegen therefore implicitly assumes the target region kernel
+  //       interface is targeted. Once a second target region interface is
+  //       put in place, e.g., one specialized for many-core offloading, we
+  //       might need to make the targeted interface explicit.
+
+  /// Create an outlined function for a target kernel.
+  ///
+  /// \param D Directive to emit.
+  /// \param ParentName Name of the function that encloses the target region.
+  /// \param OutlinedFn Outlined function value to be defined by this call.
+  /// \param OutlinedFnID Outlined function ID value to be defined by this call.
+  /// \param CodeGen Object containing the target statements.
+  void emitKernel(const OMPExecutableDirective &D, StringRef ParentName,
+                  llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
+                  const RegionCodeGenTy &CodeGen);
+
+  /// Emit the header of a target directive's entry function, i.e., the
+  /// generic kernel mode initialization and execution guard.
+  void emitKernelHeader(CodeGenFunction &CGF, llvm::BasicBlock *&ExitBB);
+
+  /// Signal termination of generic mode execution.
+  void emitKernelFooter(CodeGenFunction &CGF, llvm::BasicBlock *ExitBB);
+
+  //
+  // Base class overrides.
+  //
+
+  /// Creates offloading entry for the provided entry ID \a ID,
+  /// address \a Addr, size \a Size, and flags \a Flags.
+  void createOffloadEntry(llvm::Constant *ID, llvm::Constant *Addr,
+                          uint64_t Size, int32_t Flags,
+                          llvm::GlobalValue::LinkageTypes Linkage) override;
+
+  /// Emit outlined function for 'target' directive.
+  ///
+  /// \param D Directive to emit.
+  /// \param ParentName Name of the function that encloses the target region.
+  /// \param OutlinedFn Outlined function value to be defined by this call.
+  /// \param OutlinedFnID Outlined function ID value to be defined by this call.
+  /// \param IsOffloadEntry True if the outlined function is an offload entry.
+  /// An outlined function may not be an entry if, e.g., the 'if' clause
+  /// always evaluates to false.
+  void emitTargetOutlinedFunction(const OMPExecutableDirective &D,
+                                  StringRef ParentName,
+                                  llvm::Function *&OutlinedFn,
+                                  llvm::Constant *&OutlinedFnID,
+                                  bool IsOffloadEntry,
+                                  const RegionCodeGenTy &CodeGen) override;
+
+protected:
+  /// Get the function name of an outlined region, customized to the target.
+  StringRef getOutlinedHelperName() const override { return ".omp_TRegion."; }
+
+public:
+  explicit CGOpenMPRuntimeTRegion(CodeGenModule &CGM);
+
+  /// Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32
+  /// global_tid, int proc_bind) to generate code for 'proc_bind' clause.
+  virtual void emitProcBindClause(CodeGenFunction &CGF,
+                                  OpenMPProcBindClauseKind ProcBind,
+                                  SourceLocation Loc) override;
+
+  /// Emits call to void __kmpc_push_num_threads(ident_t *loc, kmp_int32
+  /// global_tid, kmp_int32 num_threads) to generate code for 'num_threads'
+  /// clause.
+  /// \param NumThreads An integer value of threads.
+  virtual void emitNumThreadsClause(CodeGenFunction &CGF,
+                                    llvm::Value *NumThreads,
+                                    SourceLocation Loc) override;
+
+  /// Set the number of teams to \p NumTeams and the thread limit to
+  /// \p ThreadLimit.
+  ///
+  /// \param NumTeams An integer expression of teams.
+  /// \param ThreadLimit An integer expression of threads.
+  void emitNumTeamsClause(CodeGenFunction &CGF, const Expr *NumTeams,
+                          const Expr *ThreadLimit, SourceLocation Loc) override;
+
+  /// Emits outlined function for the specified OpenMP parallel directive
+  /// \a D. This outlined function has type void(*)(kmp_int32 *ThreadID,
+  /// kmp_int32 BoundID, struct context_vars*).
+  /// \param D OpenMP directive.
+  /// \param ThreadIDVar Variable for thread id in the current OpenMP region.
+  /// \param InnermostKind Kind of innermost directive (for simple directives it
+  /// is a directive itself, for combined - its innermost directive).
+  /// \param CodeGen Code generation sequence for the \a D directive.
+  llvm::Function *
+  emitParallelOutlinedFunction(const OMPExecutableDirective &D,
+                               const VarDecl *ThreadIDVar,
+                               OpenMPDirectiveKind InnermostKind,
+                               const RegionCodeGenTy &CodeGen) override;
+
+  /// Emits code for parallel or serial call of the \a OutlinedFn with
+  /// variables captured in a record whose address is stored in \a
+  /// CapturedStruct.
+  /// \param OutlinedFn Outlined function to be run in parallel threads. Type of
+  /// this function is void(*)(kmp_int32 *, kmp_int32, struct context_vars*).
+  /// \param CapturedVars A pointer to the record with the references to
+  /// variables used in \a OutlinedFn function.
+  /// \param IfCond Condition in the associated 'if' clause, if it was
+  /// specified, nullptr otherwise.
+  void emitParallelCall(CodeGenFunction &CGF, SourceLocation Loc,
+                        llvm::Function *OutlinedFn,
+                        ArrayRef<llvm::Value *> CapturedVars,
+                        const Expr *IfCond) override;
+
+  /// Emits a critical region.
+  /// \param CriticalName Name of the critical region.
+  /// \param CriticalOpGen Generator for the statement associated with the given
+  /// critical region.
+  /// \param Hint Value of the 'hint' clause (optional).
+  void emitCriticalRegion(CodeGenFunction &CGF, StringRef CriticalName,
+                          const RegionCodeGenTy &CriticalOpGen,
+                          SourceLocation Loc,
+                          const Expr *Hint = nullptr) override;
+
+  /// Emit code for a reduction clause.
+  ///
+  /// \param Privates List of private copies for original reduction arguments.
+  /// \param LHSExprs List of LHS in \a ReductionOps reduction operations.
+  /// \param RHSExprs List of RHS in \a ReductionOps reduction operations.
+  /// \param ReductionOps List of reduction operations in form 'LHS binop RHS'
+  /// or 'operator binop(LHS, RHS)'.
+  /// \param Options List of options for reduction codegen:
+  ///     WithNowait true if parent directive has also nowait clause, false
+  ///     otherwise.
+  ///     SimpleReduction Emit reduction operation only. Used for omp simd
+  ///     directive on the host.
+  ///     ReductionKind The kind of reduction to perform.
+  virtual void emitReduction(CodeGenFunction &CGF, SourceLocation Loc,
+                             ArrayRef<const Expr *> Privates,
+                             ArrayRef<const Expr *> LHSExprs,
+                             ArrayRef<const Expr *> RHSExprs,
+                             ArrayRef<const Expr *> ReductionOps,
+                             ReductionOptionsTy Options) override;
+
+  /// Emits OpenMP-specific function prolog.
+  /// Required for device constructs.
+  void emitFunctionProlog(CodeGenFunction &CGF, const Decl *D) override;
+
+  /// Cleans up references to the objects in the finished function.
+  ///
+  void functionFinished(CodeGenFunction &CGF) override;
+
+  /// Choose a default value for the dist_schedule clause.
+  void
+  getDefaultDistScheduleAndChunk(CodeGenFunction &CGF,
+                                 const OMPLoopDirective &S,
+                                 OpenMPDistScheduleClauseKind &ScheduleKind,
+                                 llvm::Value *&Chunk) const override;
+
+  /// Choose a default value for the schedule clause.
+  void getDefaultScheduleAndChunk(CodeGenFunction &CGF,
+                                  const OMPLoopDirective &S,
+                                  OpenMPScheduleClauseKind &ScheduleKind,
+                                  const Expr *&ChunkExpr) const override;
+
+  /// Perform check on requires decl to ensure that target architecture
+  /// supports unified addressing.
+  void checkArchForUnifiedAddressing(CodeGenModule &CGM,
+                                     const OMPRequiresDecl *D) const override;
+
+  /// Emits outlined function for the specified OpenMP teams directive
+  /// \a D. This outlined function has type void(*)(kmp_int32 *ThreadID,
+  /// kmp_int32 BoundID, struct context_vars*).
+  /// \param D OpenMP directive.
+  /// \param ThreadIDVar Variable for thread id in the current OpenMP region.
+  /// \param InnermostKind Kind of innermost directive (for simple directives it
+  /// is a directive itself, for combined - its innermost directive).
+  /// \param CodeGen Code generation sequence for the \a D directive.
+  llvm::Function *
+  emitTeamsOutlinedFunction(const OMPExecutableDirective &D,
+                            const VarDecl *ThreadIDVar,
+                            OpenMPDirectiveKind InnermostKind,
+                            const RegionCodeGenTy &CodeGen) override;
+
+  /// Emits code for teams call of the \a OutlinedFn with
+  /// variables captured in a record whose address is stored in \a
+  /// CapturedStruct.
+  /// \param OutlinedFn Outlined function to be run by team masters. Type of
+  /// this function is void(*)(kmp_int32 *, kmp_int32, struct context_vars*).
+  /// \param CapturedVars A pointer to the record with the references to
+  /// variables used in \a OutlinedFn function.
+  ///
+  void emitTeamsCall(CodeGenFunction &CGF, const OMPExecutableDirective &D,
+                     SourceLocation Loc, llvm::Function *OutlinedFn,
+                     ArrayRef<llvm::Value *> CapturedVars) override;
+
+protected:
+
+  /// Hook to allow derived classes to perform checks on the AST that justify
+  /// SPMD mode.
+  virtual bool isKnownSPMDMode() const { return false; }
+
+  /// Hook to allow derived classes to perform checks on the AST that justify
+  /// execution without runtime support.
+  virtual bool mayNeedRuntimeSupport() const { return true; }
+
+  /// Hook to allow derived classes to perform checks on the AST that justify
+  /// execution without data sharing support.
+  virtual bool mayPerformDataSharing() const { return true; }
+
+private:
+  /// Simple container for a wrapper of an outlined parallel function and the
+  /// layout of the passed variables (= captured variables, both shared and
+  /// firstprivate).
+  struct WrapperInfo {
+    llvm::Function *WrapperFn = nullptr;
+    llvm::StructType *SharedVarsStructTy = nullptr;
+    llvm::StructType *PrivateVarsStructTy = nullptr;
+    llvm::SmallBitVector CaptureIsPrivate;
+  };
+
+  /// Map an outlined function to its wrapper and shared struct type. The latter
+  /// defines the layout of the payload and the wrapper will unpack that payload
+  /// and pass the values to the outlined function.
+  llvm::DenseMap<llvm::Function *, WrapperInfo> WrapperInfoMap;
+
+  /// Emit the function which wraps the outlined parallel region and controls
+  /// the parameters that are passed to it. The wrapper ensures that the
+  /// outlined function is called with the correct arguments when data is
+  /// shared.
+  void createParallelDataSharingWrapper(llvm::Function *OutlinedParallelFn,
+                                        const OMPExecutableDirective &D);
+};
+
+} // namespace CodeGen
+} // namespace clang
+
+#endif // LLVM_CLANG_LIB_CODEGEN_CGOPENMPRUNTIMETREGION_H
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeTRegion.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeTRegion.cpp
new file mode 100644
--- /dev/null
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeTRegion.cpp
@@ -0,0 +1,712 @@
+//===-- CGOpenMPRuntimeTRegion.cpp - OpenMP RT TRegion interface codegen --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of the code generation interface for OpenMP target offloading
+// through the Target Region (TRegion) interface.
+//
+// See the file comment in CGOpenMPRuntimeTRegion.h for more information.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CGOpenMPRuntimeTRegion.h"
+#include "CodeGenFunction.h"
+#include "clang/AST/StmtVisitor.h"
+
+using namespace clang;
+using namespace CodeGen;
+
+namespace {
+
+/// IDs for the runtime library calls of the target region interface.
+enum OpenMPTargetRuntimeLibraryCalls {
+  OMPRTL__kmpc_target_region_kernel_init,
+  OMPRTL__kmpc_target_region_kernel_deinit,
+  OMPRTL__kmpc_target_region_kernel_parallel,
+};
+
+/// Return the runtime function declaration specified by \p Function.
+static llvm::Function *getOrCreateRuntimeFunctionDeclaration(
+    CGOpenMPRuntimeTRegion &CG, CodeGenModule &CGM,
+    OpenMPTargetRuntimeLibraryCalls Function) {
+
+  llvm::Function *RTFn;
+  auto *I1Ty = llvm::IntegerType::getInt1Ty(CGM.getLLVMContext());
+  switch (Function) {
+  case OMPRTL__kmpc_target_region_kernel_init: {
+    // char __kmpc_target_region_kernel_init(bool UseSPMDMode,
+    //                                       bool UseStateMachine,
+    //                                       bool RequiresOMPRuntime,
+    //                                       bool RequiresDataSharing);
+    llvm::Type *TypeParams[] = {I1Ty, I1Ty, I1Ty, I1Ty};
+    auto *FnTy =
+        llvm::FunctionType::get(CGM.Int8Ty, TypeParams, /* isVarArg */ false);
+    RTFn = cast<llvm::Function>(
+        CGM.CreateRuntimeFunction(FnTy, "__kmpc_target_region_kernel_init")
+            .getCallee());
+    break;
+  }
+  case OMPRTL__kmpc_target_region_kernel_deinit: {
+    // void __kmpc_target_region_kernel_deinit(bool UseSPMDMode,
+    //                                         bool RequiredOMPRuntime);
+    llvm::Type *TypeParams[] = {I1Ty, I1Ty};
+    auto *FnTy =
+        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /* isVarArg */ false);
+    RTFn = cast<llvm::Function>(
+        CGM.CreateRuntimeFunction(FnTy, "__kmpc_target_region_kernel_deinit")
+            .getCallee());
+    break;
+  }
+  case OMPRTL__kmpc_target_region_kernel_parallel: {
+    // typedef void (*ParallelWorkFnTy)(void *, void *);
+    auto *ParWorkFnTy =
+        llvm::FunctionType::get(CGM.VoidTy, {CGM.VoidPtrTy, CGM.VoidPtrTy},
+                                /* isVarArg */ false);
+    //
+    // void __kmpc_target_region_kernel_parallel(bool UseSPMDMode,
+    //                                           bool RequiredOMPRuntime,
+    //                                           ParallelWorkFnTy WorkFn,
+    //                                           void *SharedVars,
+    //                                           uint16_t SharedVarsBytes,
+    //                                           void *PrivateVars,
+    //                                           uint16_t PrivateVarsBytes,
+    //                                           bool SharedPointers);
+    llvm::Type *TypeParams[] = {
+        I1Ty,          I1Ty,        ParWorkFnTy->getPointerTo(),
+        CGM.VoidPtrTy, CGM.Int16Ty, CGM.VoidPtrTy,
+        CGM.Int16Ty,   I1Ty};
+    auto *FnTy =
+        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /* isVarArg */ false);
+
+    RTFn = cast<llvm::Function>(
+        CGM.CreateRuntimeFunction(FnTy, "__kmpc_target_region_kernel_parallel")
+            .getCallee());
+
+    RTFn->addParamAttr(2, llvm::Attribute::NoCapture);
+    RTFn->addParamAttr(3, llvm::Attribute::NoCapture);
+    RTFn->addParamAttr(5, llvm::Attribute::NoCapture);
+    RTFn->addParamAttr(5, llvm::Attribute::ReadOnly);
+
+    // Add the callback metadata if it is not present already.
+    if (!RTFn->hasMetadata(llvm::LLVMContext::MD_callback)) {
+      llvm::LLVMContext &Ctx = RTFn->getContext();
+      llvm::MDBuilder MDB(Ctx);
+      // Annotate the callback behavior of __kmpc_target_region_kernel_parallel:
+      //  - The callback callee is WorkFn, i.e., argument 2 (0-based).
+      //  - The first callback payload is SharedVars.
+      //  - The second callback payload is PrivateVars.
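+      //
+      // The attached metadata then roughly looks as follows (an illustrative
+      // sketch, not verified compiler output):
+      //   declare void @__kmpc_target_region_kernel_parallel(...) !callback !0
+      //   !0 = !{!1}
+      //   !1 = !{i64 2, i64 3, i64 5, i1 false}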
+      RTFn->addMetadata(
+          llvm::LLVMContext::MD_callback,
+          *llvm::MDNode::get(
+              Ctx, {MDB.createCallbackEncoding(2, {3, 5},
+                                               /* VarArgsArePassed */ false)}));
+    }
+    break;
+  }
+  }
+
+  // TODO: Remove all globals and set this attribute.
+  //
+  // This is overwritten when the definition is linked in.
+  // RTFn->addFnAttr(llvm::Attribute::InaccessibleMemOrArgMemOnly);
+
+  return RTFn;
+}
+
+} // anonymous namespace
+
+void CGOpenMPRuntimeTRegion::emitKernel(const OMPExecutableDirective &D,
+                                        StringRef ParentName,
+                                        llvm::Function *&OutlinedFn,
+                                        llvm::Constant *&OutlinedFnID,
+                                        const RegionCodeGenTy &CodeGen) {
+  WrapperInfoMap.clear();
+
+  // Emit target region as a standalone region.
+  class KernelPrePostActionTy : public PrePostActionTy {
+    CGOpenMPRuntimeTRegion &RT;
+    llvm::BasicBlock *ExitBB;
+
+  public:
+    KernelPrePostActionTy(CGOpenMPRuntimeTRegion &RT)
+        : RT(RT), ExitBB(nullptr) {}
+
+    void Enter(CodeGenFunction &CGF) override {
+      RT.emitKernelHeader(CGF, ExitBB);
+      // Skip target region initialization.
+      RT.setLocThreadIdInsertPt(CGF, /* AtCurrentPoint */ true);
+    }
+
+    void Exit(CodeGenFunction &CGF) override {
+      RT.clearLocThreadIdInsertPt(CGF);
+      RT.emitKernelFooter(CGF, ExitBB);
+    }
+
+  } Action(*this);
+  CodeGen.setAction(Action);
+
+  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
+                                   /* IsOffloadEntry */ true, CodeGen);
+}
+
+void CGOpenMPRuntimeTRegion::emitKernelHeader(CodeGenFunction &CGF,
+                                              llvm::BasicBlock *&ExitBB) {
+  CGBuilderTy &Bld = CGF.Builder;
+
+  // Setup BBs in entry function.
+  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute");
+  ExitBB = CGF.createBasicBlock(".exit");
+
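+  // The emitted kernel entry then roughly looks as follows (an illustrative
+  // sketch; value and block names may differ):
+  //   %exec = call i8 @__kmpc_target_region_kernel_init(i1 %spmd, i1 true,
+  //                                                     i1 %rt, i1 %ds)
+  //   %is.master = icmp eq i8 %exec, 1
+  //   br i1 %is.master, label %.execute, label %.exit
+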
+  llvm::Value *Args[] = {
+      /* UseSPMDMode */ Bld.getInt1(isKnownSPMDMode()),
+      /* UseStateMachine */ Bld.getInt1(1),
+      /* RequiresOMPRuntime */
+      Bld.getInt1(mayNeedRuntimeSupport()),
+      /* RequiresDataSharing */ Bld.getInt1(mayPerformDataSharing())};
+  llvm::CallInst *InitCI = CGF.EmitRuntimeCall(
+      getOrCreateRuntimeFunctionDeclaration(
+          *this, CGM, OMPRTL__kmpc_target_region_kernel_init),
+      Args);
+
+  llvm::Value *ExecuteCnd = Bld.CreateICmpEQ(InitCI, Bld.getInt8(1));
+
+  Bld.CreateCondBr(ExecuteCnd, ExecuteBB, ExitBB);
+  CGF.EmitBlock(ExecuteBB);
+}
+
+void CGOpenMPRuntimeTRegion::emitKernelFooter(CodeGenFunction &CGF,
+                                              llvm::BasicBlock *ExitBB) {
+  if (!CGF.HaveInsertPoint())
+    return;
+
+  llvm::BasicBlock *OMPDeInitBB = CGF.createBasicBlock(".omp.deinit");
+  CGF.EmitBranch(OMPDeInitBB);
+
+  CGF.EmitBlock(OMPDeInitBB);
+
+  CGBuilderTy &Bld = CGF.Builder;
+  // Deinitialize the OMP state in the runtime; called by all active threads.
+  llvm::Value *Args[] = {/* UseSPMDMode */ Bld.getInt1(isKnownSPMDMode()),
+                         /* RequiredOMPRuntime */
+                         Bld.getInt1(mayNeedRuntimeSupport())};
+
+  CGF.EmitRuntimeCall(getOrCreateRuntimeFunctionDeclaration(
+                          *this, CGM, OMPRTL__kmpc_target_region_kernel_deinit),
+                      Args);
+
+  CGF.EmitBranch(ExitBB);
+  CGF.EmitBlock(ExitBB);
+}
+
+void CGOpenMPRuntimeTRegion::emitTargetOutlinedFunction(
+    const OMPExecutableDirective &D, StringRef ParentName,
+    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
+    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
+  if (!IsOffloadEntry) // Nothing to do.
+    return;
+
+  assert(!ParentName.empty() && "Invalid target region parent name!");
+
+  emitKernel(D, ParentName, OutlinedFn, OutlinedFnID, CodeGen);
+
+  // Create a unique global variable to indicate the execution mode of this
+  // target region. The execution mode is either 'non-SPMD' or 'SPMD'.
+  // Initially, all regions are executed in non-SPMD mode. This variable is
+  // picked up by the offload library to set up the device appropriately
+  // before the kernel is launched.
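+  //
+  // The emitted global roughly looks like (illustrative):
+  //   @<kernel_name>_exec_mode = weak constant i8 1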
+  auto *GVMode = new llvm::GlobalVariable(
+      CGM.getModule(), CGM.Int8Ty, /* isConstant */ true,
+      llvm::GlobalValue::WeakAnyLinkage, llvm::ConstantInt::get(CGM.Int8Ty, 1),
+      Twine(OutlinedFn->getName(), "_exec_mode"));
+  CGM.addCompilerUsedGlobal(GVMode);
+}
+
+CGOpenMPRuntimeTRegion::CGOpenMPRuntimeTRegion(CodeGenModule &CGM)
+    : CGOpenMPRuntime(CGM, "_", "$") {
+  if (!CGM.getLangOpts().OpenMPIsDevice)
+    llvm_unreachable("TRegion code generation only handles device code!");
+}
+
+void CGOpenMPRuntimeTRegion::emitProcBindClause(
+    CodeGenFunction &CGF, OpenMPProcBindClauseKind ProcBind,
+    SourceLocation Loc) {
+  CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
+}
+
+void CGOpenMPRuntimeTRegion::emitNumThreadsClause(CodeGenFunction &CGF,
+                                                  llvm::Value *NumThreads,
+                                                  SourceLocation Loc) {
+  CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc);
+}
+
+void CGOpenMPRuntimeTRegion::emitNumTeamsClause(CodeGenFunction &CGF,
+                                                const Expr *NumTeams,
+                                                const Expr *ThreadLimit,
+                                                SourceLocation Loc) {
+  // Nothing to do for kernel mode, no other modes supported yet.
+}
+
+llvm::Function *CGOpenMPRuntimeTRegion::emitParallelOutlinedFunction(
+    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
+    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
+
+  // Emit target region as a standalone region.
+  llvm::Function *OutlinedFun =
+      cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
+          D, ThreadIDVar, InnermostKind, CodeGen));
+
+  createParallelDataSharingWrapper(OutlinedFun, D);
+
+  return OutlinedFun;
+}
+
+// TODO: Check if this is actually needed.
+static const ValueDecl *getUnderlyingVar(const Expr *E) {
+  E = E->IgnoreParens();
+  if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(E)) {
+    const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
+    while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
+      Base = TempASE->getBase()->IgnoreParenImpCasts();
+    E = Base;
+  } else if (auto *OASE = dyn_cast<OMPArraySectionExpr>(E)) {
+    const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
+    while (const auto *TempOASE = dyn_cast<OMPArraySectionExpr>(Base))
+      Base = TempOASE->getBase()->IgnoreParenImpCasts();
+    while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
+      Base = TempASE->getBase()->IgnoreParenImpCasts();
+    E = Base;
+  }
+  E = E->IgnoreParenImpCasts();
+  if (const auto *DE = dyn_cast<DeclRefExpr>(E))
+    return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
+  const auto *ME = cast<MemberExpr>(E);
+  return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
+}
+
+void CGOpenMPRuntimeTRegion::createParallelDataSharingWrapper(
+    llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
+  ASTContext &Ctx = CGM.getContext();
+  const auto &CS = *D.getCapturedStmt(OMPD_parallel);
+
+  // Create the wrapper function which takes the shared and private variable
+  // payloads as arguments.
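+  //
+  // The generated wrapper roughly has the following shape (an illustrative
+  // sketch; value names and the unpacking depend on the captured variables):
+  //   void @<outlined>_wrapper(i8* %shared_vars, i8* %private_vars) {
+  //     %shared  = bitcast i8* %shared_vars  to %omp.shared.struct*
+  //     %private = bitcast i8* %private_vars to %omp.private.struct*
+  //     ; private members are loaded, shared members are passed as pointers
+  //     call void @<outlined>(i32* %gtid.addr, i32* %.zero.addr, ...)
+  //   }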
+  FunctionArgList WrapperArgs;
+  ImplicitParamDecl SharedVarsArgDecl(Ctx, /* DC */ nullptr, D.getBeginLoc(),
+                                      /* Id */ nullptr, Ctx.VoidPtrTy,
+                                      ImplicitParamDecl::Other);
+  ImplicitParamDecl PrivateVarsArgDecl(Ctx, /* DC */ nullptr, D.getBeginLoc(),
+                                       /* Id */ nullptr, Ctx.VoidPtrTy,
+                                       ImplicitParamDecl::Other);
+  WrapperArgs.emplace_back(&SharedVarsArgDecl);
+  WrapperArgs.emplace_back(&PrivateVarsArgDecl);
+
+  const CGFunctionInfo &CGFI =
+      CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);
+
+  auto *WrapperFn = llvm::Function::Create(
+      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
+      Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());
+  CGM.SetInternalFunctionAttributes(GlobalDecl(), WrapperFn, CGFI);
+
+  OutlinedParallelFn->setLinkage(llvm::GlobalValue::InternalLinkage);
+  OutlinedParallelFn->setDoesNotRecurse();
+  WrapperFn->setLinkage(llvm::GlobalValue::InternalLinkage);
+  WrapperFn->setDoesNotRecurse();
+
+  CodeGenFunction CGF(CGM, /* suppressNewContext */ true);
+  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WrapperFn, CGFI, WrapperArgs,
+                    D.getBeginLoc(), D.getBeginLoc());
+
+  auto AI = WrapperFn->arg_begin();
+  llvm::Argument &SharedVarsArg = *(AI++);
+  llvm::Argument &PrivateVarsArg = *(AI);
+  SharedVarsArg.setName("shared_vars");
+  PrivateVarsArg.setName("private_vars");
+
+  Address ZeroAddr = CGF.CreateMemTemp(CGF.getContext().getIntTypeForBitwidth(
+                                           /* DestWidth */ 32, /* Signed */ 1),
+                                       /* Name */ ".zero.addr");
+  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/* C */ 0));
+
+  setLocThreadIdInsertPt(CGF, /* AtCurrentPoint */ true);
+
+  // Create the array of arguments and fill it with boilerplate values.
+  SmallVector<llvm::Value *, 8> Args;
+  Args.emplace_back(emitThreadIDAddress(CGF, D.getBeginLoc()).getPointer());
+  Args.emplace_back(ZeroAddr.getPointer());
+
+  CGBuilderTy &Bld = CGF.Builder;
+
+  // Collect all variables marked as shared.
+  llvm::SmallPtrSet<const ValueDecl *, 16> SharedVars;
+  for (const auto *C : D.getClausesOfKind<OMPSharedClause>())
+    for (const Expr *E : C->getVarRefs())
+      SharedVars.insert(getUnderlyingVar(E));
+
+  // Retrieve the shared and private variables from argument pointers and pass
+  // them to the outlined function.
+  llvm::SmallVector<llvm::Type *, 8> SharedStructMemberTypes;
+  llvm::SmallVector<llvm::Type *, 8> PrivateStructMemberTypes;
+
+  WrapperInfo &WI = WrapperInfoMap[OutlinedParallelFn];
+  WI.WrapperFn = WrapperFn;
+
+  auto ArgIt = OutlinedParallelFn->arg_begin() + /* global_tid & bound_tid */ 2;
+
+  // If we require loop bounds they are already part of the outlined function
+  // encoding, just after global_tid and bound_tid.
+  bool RequiresLoopBounds =
+      isOpenMPLoopBoundSharingDirective(D.getDirectiveKind());
+  if (RequiresLoopBounds) {
+    // Register the lower bound in the wrapper info.
+    WI.CaptureIsPrivate.push_back(true);
+    PrivateStructMemberTypes.push_back((ArgIt++)->getType());
+    // Register the upper bound in the wrapper info.
+    WI.CaptureIsPrivate.push_back(true);
+    PrivateStructMemberTypes.push_back((ArgIt++)->getType());
+  }
+
+  auto CIt = CS.capture_begin();
+  for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CIt) {
+    bool IsPrivate = CIt->capturesVariableArrayType() ||
+                     CIt->capturesVariableByCopy() ||
+                     !SharedVars.count(CIt->getCapturedVar());
+    WI.CaptureIsPrivate.push_back(IsPrivate);
+
+    auto &StructMemberTypes =
+        IsPrivate ? PrivateStructMemberTypes : SharedStructMemberTypes;
+    llvm::Type *ArgTy = (ArgIt++)->getType();
+    if (!IsPrivate) {
+      assert(ArgTy->isPointerTy());
+      ArgTy = ArgTy->getPointerElementType();
+    }
+    StructMemberTypes.push_back(ArgTy);
+  }
+
+  // Verify the position of the outlined function argument iterator as a sanity
+  // check.
+  assert(ArgIt == OutlinedParallelFn->arg_end() &&
+         "Not all arguments have been processed!");
+
+  llvm::Value *SharedVarsStructPtr = nullptr;
+  llvm::Value *PrivateVarsStructPtr = nullptr;
+  llvm::LLVMContext &LLVMCtx = OutlinedParallelFn->getContext();
+  if (!PrivateStructMemberTypes.empty()) {
+    WI.PrivateVarsStructTy = llvm::StructType::create(
+        LLVMCtx, PrivateStructMemberTypes, "omp.private.struct");
+    PrivateVarsStructPtr = Bld.CreateBitCast(
+        &PrivateVarsArg, WI.PrivateVarsStructTy->getPointerTo());
+  }
+  if (!SharedStructMemberTypes.empty()) {
+    WI.SharedVarsStructTy = llvm::StructType::create(
+        LLVMCtx, SharedStructMemberTypes, "omp.shared.struct");
+    SharedVarsStructPtr = Bld.CreateBitCast(
+        &SharedVarsArg, WI.SharedVarsStructTy->getPointerTo());
+  }
+
+  assert(WI.CaptureIsPrivate.size() + /* global_tid & bound_tid */ 2 ==
+             OutlinedParallelFn->arg_size() &&
+         "Not all arguments have been processed!");
+
+  unsigned PrivateIdx = 0, SharedIdx = 0;
+  for (int i = 0, e = WI.CaptureIsPrivate.size(); i < e; i++) {
+    bool IsPrivate = WI.CaptureIsPrivate[i];
+
+    llvm::Value *StructPtr =
+        IsPrivate ? PrivateVarsStructPtr : SharedVarsStructPtr;
+    unsigned &Idx = IsPrivate ? PrivateIdx : SharedIdx;
+
+    // TODO: Figure out the real alignment
+    if (IsPrivate) {
+      Args.emplace_back(
+          Bld.CreateAlignedLoad(Bld.CreateStructGEP(StructPtr, Idx++), 1));
+    } else {
+      llvm::Value *GEP = Bld.CreateStructGEP(StructPtr, Idx++);
+      Args.emplace_back(GEP);
+    }
+  }
+
+  assert(Args.size() == OutlinedParallelFn->arg_size());
+  emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
+
+  CGF.FinishFunction();
+
+  clearLocThreadIdInsertPt(CGF);
+}
+
+void CGOpenMPRuntimeTRegion::emitParallelCall(
+    CodeGenFunction &CGF, SourceLocation Loc, llvm::Function *Fn,
+    ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
+  if (!CGF.HaveInsertPoint())
+    return;
+
+  const WrapperInfo &WI = WrapperInfoMap[Fn];
+
+  auto &&ParGen = [this, CapturedVars, WI](CodeGenFunction &CGF,
+                                           PrePostActionTy &) {
+    CGBuilderTy &Bld = CGF.Builder;
+    assert(WI.WrapperFn && "Wrapper function does not exist!");
+
+    llvm::Value *SharedVarsSize = llvm::Constant::getNullValue(CGM.Int16Ty);
+    llvm::Value *PrivateVarsSize = SharedVarsSize;
+    llvm::Value *SharedStructAlloca = llvm::UndefValue::get(CGM.VoidPtrTy);
+    llvm::Value *PrivateStructAlloca = SharedStructAlloca;
+
+    if (WI.SharedVarsStructTy) {
+      SharedStructAlloca = CGF.CreateDefaultAlignTempAlloca(
+                                  WI.SharedVarsStructTy, ".shared.vars")
+                               .getPointer();
+      const llvm::DataLayout &DL = WI.WrapperFn->getParent()->getDataLayout();
+      SharedVarsSize = Bld.getInt16(DL.getTypeAllocSize(WI.SharedVarsStructTy));
+    }
+    if (WI.PrivateVarsStructTy) {
+      PrivateStructAlloca = CGF.CreateDefaultAlignTempAlloca(
+                                   WI.PrivateVarsStructTy, ".private.vars")
+                                .getPointer();
+      const llvm::DataLayout &DL = WI.WrapperFn->getParent()->getDataLayout();
+      PrivateVarsSize =
+          Bld.getInt16(DL.getTypeAllocSize(WI.PrivateVarsStructTy));
+    }
+
+    llvm::SmallVector<llvm::Value *, 4> Args;
+    Args.push_back(
+        /* UseSPMDMode */ Bld.getInt1(isKnownSPMDMode()));
+    Args.push_back(
+        /* RequiredOMPRuntime */ Bld.getInt1(mayNeedRuntimeSupport()));
+    Args.push_back(WI.WrapperFn);
+    Args.push_back(CGF.EmitCastToVoidPtr(SharedStructAlloca));
+    Args.push_back(SharedVarsSize);
+    Args.push_back(CGF.EmitCastToVoidPtr(PrivateStructAlloca));
+    Args.push_back(PrivateVarsSize);
+    Args.push_back(
+        /* SharedPointers */ Bld.getInt1(0));
+
+    assert((CapturedVars.empty() ||
+            (WI.SharedVarsStructTy || WI.PrivateVarsStructTy)) &&
+           "Expected the shared or private struct type to be set if variables "
+           "are captured!");
+    assert((CapturedVars.empty() ||
+            CapturedVars.size() ==
+                (WI.SharedVarsStructTy ? WI.SharedVarsStructTy->getNumElements()
+                                       : 0) +
+                    (WI.PrivateVarsStructTy
+                         ? WI.PrivateVarsStructTy->getNumElements()
+                         : 0)) &&
+           "# elements in shared struct types should be number of captured "
+           "variables!");
+
+    // Store all captured variables into a single local structure that is then
+    // passed to the runtime library.
+    unsigned PrivateIdx = 0, SharedIdx = 0;
+    for (int i = 0, e = WI.CaptureIsPrivate.size(); i < e; i++) {
+      bool IsPrivate = WI.CaptureIsPrivate[i];
+
+      llvm::Value *StructPtr =
+          IsPrivate ? PrivateStructAlloca : SharedStructAlloca;
+      unsigned &Idx = IsPrivate ? PrivateIdx : SharedIdx;
+      llvm::Value *GEP = Bld.CreateStructGEP(StructPtr, Idx++);
+      llvm::Value *Var = IsPrivate ? CapturedVars[i]
+                                   : Bld.CreateAlignedLoad(CapturedVars[i], 1);
+      Bld.CreateDefaultAlignedStore(Var, GEP);
+    }
+
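+    // The emitted runtime call then roughly looks like (illustrative sketch):
+    //   call void @__kmpc_target_region_kernel_parallel(
+    //       i1 %spmd, i1 %rt, void (i8*, i8*)* @<outlined>_wrapper,
+    //       i8* %.shared.vars, i16 <shared size>, i8* %.private.vars,
+    //       i16 <private size>, i1 false)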
+    CGF.EmitRuntimeCall(
+        getOrCreateRuntimeFunctionDeclaration(
+            *this, CGM, OMPRTL__kmpc_target_region_kernel_parallel),
+        Args);
+
+    SharedIdx = 0;
+    for (int i = 0, e = WI.CaptureIsPrivate.size(); i < e; i++) {
+      bool IsPrivate = WI.CaptureIsPrivate[i];
+      if (IsPrivate)
+        continue;
+
+      llvm::Value *GEP = Bld.CreateStructGEP(SharedStructAlloca, SharedIdx++);
+      llvm::Value *Var = Bld.CreateAlignedLoad(GEP, 1);
+      Bld.CreateDefaultAlignedStore(Var, CapturedVars[i]);
+    }
+  };
+
+  auto &&SeqGen = [this, &ParGen, Loc](CodeGenFunction &CGF,
+                                       PrePostActionTy &Action) {
+    // Use an artificial "num_threads(1)" clause to force sequential execution
+    // if the expression in the 'if' clause evaluates to false. We expect the
+    // middle-end to clean this up.
+    emitNumThreadsClause(CGF, CGF.Builder.getInt32(/* C */ 1), Loc);
+    ParGen(CGF, Action);
+  };
+
+  if (IfCond) {
+    emitOMPIfClause(CGF, IfCond, ParGen, SeqGen);
+  } else {
+    CodeGenFunction::RunCleanupsScope Scope(CGF);
+    RegionCodeGenTy ThenRCG(ParGen);
+    ThenRCG(CGF);
+  }
+}
+
+llvm::Function *CGOpenMPRuntimeTRegion::emitTeamsOutlinedFunction(
+    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
+    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
+
+  // Emit target region as a standalone region.
+  llvm::Function *OutlinedFunVal = CGOpenMPRuntime::emitTeamsOutlinedFunction(
+      D, ThreadIDVar, InnermostKind, CodeGen);
+
+  return OutlinedFunVal;
+}
+
+void CGOpenMPRuntimeTRegion::emitTeamsCall(
+    CodeGenFunction &CGF, const OMPExecutableDirective &D, SourceLocation Loc,
+    llvm::Function *OutlinedFn, ArrayRef<llvm::Value *> CapturedVars) {
+  if (!CGF.HaveInsertPoint())
+    return;
+
+  Address ZeroAddr = CGF.CreateMemTemp(
+      CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1),
+      /*Name*/ ".zero.addr");
+  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
+  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
+  OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
+  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
+  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
+  emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
+}
+
+void CGOpenMPRuntimeTRegion::emitCriticalRegion(
+    CodeGenFunction &CGF, StringRef CriticalName,
+    const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
+    const Expr *Hint) {
+  llvm_unreachable(
+      "TODO: TRegion code generation does not support critical regions yet!");
+}
+
+void CGOpenMPRuntimeTRegion::emitReduction(
+    CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
+    ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
+    ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
+  llvm_unreachable(
+      "TODO: TRegion code generation does not support reductions yet!");
+}
+
+void CGOpenMPRuntimeTRegion::emitFunctionProlog(CodeGenFunction &CGF,
+                                                const Decl *D) {}
+
+void CGOpenMPRuntimeTRegion::functionFinished(CodeGenFunction &CGF) {
+  CGOpenMPRuntime::functionFinished(CGF);
+}
+
+void CGOpenMPRuntimeTRegion::getDefaultDistScheduleAndChunk(
+    CodeGenFunction &CGF, const OMPLoopDirective &S,
+    OpenMPDistScheduleClauseKind &ScheduleKind, llvm::Value *&Chunk) const {
+  CGOpenMPRuntime::getDefaultDistScheduleAndChunk(CGF, S, ScheduleKind, Chunk);
+}
+
+void CGOpenMPRuntimeTRegion::getDefaultScheduleAndChunk(
+    CodeGenFunction &CGF, const OMPLoopDirective &S,
+    OpenMPScheduleClauseKind &ScheduleKind, const Expr *&ChunkExpr) const {
+  ScheduleKind = OMPC_SCHEDULE_static;
+  // Chunk size is 1 in this case.
+  llvm::APInt ChunkSize(32, 1);
+  ChunkExpr = IntegerLiteral::Create(
+      CGF.getContext(), ChunkSize,
+      CGF.getContext().getIntTypeForBitwidth(32, /* Signed */ 0),
+      SourceLocation());
+}
+
+// ------------------------------------------------------------------------ //
+// TODO: The following cuda specific part should live somewhere else,
+//       potentially in a derived class.
+
+void CGOpenMPRuntimeTRegion::createOffloadEntry(
+    llvm::Constant *ID, llvm::Constant *Addr, uint64_t Size, int32_t,
+    llvm::GlobalValue::LinkageTypes) {
+  // TODO: Add support for global variables on the device after declare target
+  // support.
+  if (!isa<llvm::Function>(Addr))
+    return;
+  llvm::Module &M = CGM.getModule();
+  llvm::LLVMContext &Ctx = CGM.getLLVMContext();
+
+  // Get "nvvm.annotations" metadata node
+  llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
+
+  llvm::Metadata *MDVals[] = {
+      llvm::ConstantAsMetadata::get(Addr), llvm::MDString::get(Ctx, "kernel"),
+      llvm::ConstantAsMetadata::get(
+          llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
+  // Append metadata to nvvm.annotations
+  MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
+}
+
+#include "clang/Basic/Cuda.h"
+
+// Get current CudaArch and ignore any unknown values
+static CudaArch getCudaArch(CodeGenModule &CGM) {
+  if (!CGM.getTarget().hasFeature("ptx"))
+    return CudaArch::UNKNOWN;
+  llvm::StringMap<bool> Features;
+  CGM.getTarget().initFeatureMap(Features, CGM.getDiags(),
+                                 CGM.getTarget().getTargetOpts().CPU,
+                                 CGM.getTarget().getTargetOpts().Features);
+  for (const auto &Feature : Features) {
+    if (Feature.getValue()) {
+      CudaArch Arch = StringToCudaArch(Feature.getKey());
+      if (Arch != CudaArch::UNKNOWN)
+        return Arch;
+    }
+  }
+  return CudaArch::UNKNOWN;
+}
+
+/// Check to see if target architecture supports unified addressing which is
+/// a restriction for OpenMP requires clause "unified_shared_memory".
+void CGOpenMPRuntimeTRegion::checkArchForUnifiedAddressing(
+    CodeGenModule &CGM, const OMPRequiresDecl *D) const {
+  for (const OMPClause *Clause : D->clauselists()) {
+    if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
+      switch (getCudaArch(CGM)) {
+      case CudaArch::SM_20:
+      case CudaArch::SM_21:
+      case CudaArch::SM_30:
+      case CudaArch::SM_32:
+      case CudaArch::SM_35:
+      case CudaArch::SM_37:
+      case CudaArch::SM_50:
+      case CudaArch::SM_52:
+      case CudaArch::SM_53:
+      case CudaArch::SM_60:
+      case CudaArch::SM_61:
+      case CudaArch::SM_62:
+        CGM.Error(Clause->getBeginLoc(),
+                  "Target architecture does not support unified addressing");
+        return;
+      case CudaArch::SM_70:
+      case CudaArch::SM_72:
+      case CudaArch::SM_75:
+      case CudaArch::GFX600:
+      case CudaArch::GFX601:
+      case CudaArch::GFX700:
+      case CudaArch::GFX701:
+      case CudaArch::GFX702:
+      case CudaArch::GFX703:
+      case CudaArch::GFX704:
+      case CudaArch::GFX801:
+      case CudaArch::GFX802:
+      case CudaArch::GFX803:
+      case CudaArch::GFX810:
+      case CudaArch::GFX900:
+      case CudaArch::GFX902:
+      case CudaArch::GFX904:
+      case CudaArch::GFX906:
+      case CudaArch::GFX909:
+      case CudaArch::UNKNOWN:
+        break;
+      case CudaArch::LAST:
+        llvm_unreachable("Unexpected Cuda arch.");
+      }
+    }
+  }
+}
diff --git a/clang/lib/CodeGen/CMakeLists.txt b/clang/lib/CodeGen/CMakeLists.txt
--- a/clang/lib/CodeGen/CMakeLists.txt
+++ b/clang/lib/CodeGen/CMakeLists.txt
@@ -69,6 +69,7 @@
   CGOpenCLRuntime.cpp
   CGOpenMPRuntime.cpp
   CGOpenMPRuntimeNVPTX.cpp
+  CGOpenMPRuntimeTRegion.cpp
   CGRecordLayoutBuilder.cpp
   CGStmt.cpp
   CGStmtOpenMP.cpp
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -20,6 +20,7 @@
 #include "CGOpenCLRuntime.h"
 #include "CGOpenMPRuntime.h"
 #include "CGOpenMPRuntimeNVPTX.h"
+#include "CGOpenMPRuntimeTRegion.h"
 #include "CodeGenFunction.h"
 #include "CodeGenPGO.h"
 #include "ConstantEmitter.h"
@@ -67,6 +68,11 @@
     llvm::cl::desc("Emit limited coverage mapping information (experimental)"),
     llvm::cl::init(false));
 
+static llvm::cl::opt<bool> UseGenericTRegionInterface(
+    "openmp-tregion-runtime", llvm::cl::ZeroOrMore, llvm::cl::Hidden,
+    llvm::cl::desc("Use the generic target region OpenMP runtime interface"),
+    llvm::cl::init(false));
+
 static const char AnnotationSection[] = "llvm.metadata";
 
 static CGCXXABI *createCXXABI(CodeGenModule &CGM) {
@@ -206,7 +212,10 @@
   case llvm::Triple::nvptx64:
     assert(getLangOpts().OpenMPIsDevice &&
            "OpenMP NVPTX is only prepared to deal with device code.");
-    OpenMPRuntime.reset(new CGOpenMPRuntimeNVPTX(*this));
+    if (UseGenericTRegionInterface)
+      OpenMPRuntime.reset(new CGOpenMPRuntimeTRegion(*this));
+    else
+      OpenMPRuntime.reset(new CGOpenMPRuntimeNVPTX(*this));
     break;
   default:
     if (LangOpts.OpenMPSimd)
diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -287,6 +287,7 @@
 void initializeNameAnonGlobalLegacyPassPass(PassRegistry&);
 void initializeNaryReassociateLegacyPassPass(PassRegistry&);
 void initializeNewGVNLegacyPassPass(PassRegistry&);
+void initializeOpenMPOptLegacyPass(PassRegistry&);
 void initializeObjCARCAAWrapperPassPass(PassRegistry&);
 void initializeObjCARCAPElimPass(PassRegistry&);
 void initializeObjCARCContractPass(PassRegistry&);
diff --git a/llvm/include/llvm/LinkAllPasses.h b/llvm/include/llvm/LinkAllPasses.h
--- a/llvm/include/llvm/LinkAllPasses.h
+++ b/llvm/include/llvm/LinkAllPasses.h
@@ -143,6 +143,7 @@
       (void) llvm::createLowerInvokePass();
       (void) llvm::createLowerSwitchPass();
       (void) llvm::createNaryReassociatePass();
+      (void) llvm::createOpenMPOptLegacyPass();
       (void) llvm::createObjCARCAAWrapperPass();
       (void) llvm::createObjCARCAPElimPass();
       (void) llvm::createObjCARCExpandPass();
diff --git a/llvm/include/llvm/Transforms/IPO.h b/llvm/include/llvm/Transforms/IPO.h
--- a/llvm/include/llvm/Transforms/IPO.h
+++ b/llvm/include/llvm/Transforms/IPO.h
@@ -156,6 +156,11 @@
 ///
 ModulePass *createIPConstantPropagationPass();
 
+//===----------------------------------------------------------------------===//
+/// createOpenMPOpt - This pass performs OpenMP specific optimizations.
+///
+Pass *createOpenMPOptLegacyPass();
+
 //===----------------------------------------------------------------------===//
 /// createIPSCCPPass - This pass propagates constants from call sites into the
 /// bodies of functions, and keeps track of whether basic blocks are executable
diff --git a/llvm/lib/Transforms/IPO/CMakeLists.txt b/llvm/lib/Transforms/IPO/CMakeLists.txt
--- a/llvm/lib/Transforms/IPO/CMakeLists.txt
+++ b/llvm/lib/Transforms/IPO/CMakeLists.txt
@@ -25,6 +25,7 @@
   LoopExtractor.cpp
   LowerTypeTests.cpp
   MergeFunctions.cpp
+  OpenMPOpt.cpp
   PartialInlining.cpp
   PassManagerBuilder.cpp
   PruneEH.cpp
diff --git a/llvm/lib/Transforms/IPO/IPO.cpp b/llvm/lib/Transforms/IPO/IPO.cpp
--- a/llvm/lib/Transforms/IPO/IPO.cpp
+++ b/llvm/lib/Transforms/IPO/IPO.cpp
@@ -35,6 +35,7 @@
   initializeGlobalSplitPass(Registry);
   initializeHotColdSplittingLegacyPassPass(Registry);
   initializeIPCPPass(Registry);
+  initializeOpenMPOptLegacyPass(Registry);
   initializeAlwaysInlinerLegacyPassPass(Registry);
   initializeSimpleInlinerPass(Registry);
   initializeInferFunctionAttrsLegacyPassPass(Registry);
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -0,0 +1,784 @@
+//===-- IPO/OpenMPOpt.cpp - Collection of OpenMP specific optimizations ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// OpenMP specific optimizations
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "openmp-opt"
+
+static cl::opt<bool> BuildCustomStateMachines(
+    "openmp-opt-kernel-state-machines", cl::ZeroOrMore,
+    cl::desc("Build custom state machines for non-SPMD kernels."), cl::Hidden,
+    cl::init(true));
+
+static cl::opt<bool> PerformOpenMPSIMDIZATION(
+    "openmp-opt-kernel-simdization", cl::ZeroOrMore,
+    cl::desc("Convert non-SPMD kernels to SPMD mode if possible."), cl::Hidden,
+    cl::init(true));
+
+static cl::opt<bool> ForceOpenMPSIMDIZATION(
+    "openmp-opt-kernel-force-simdization", cl::ZeroOrMore,
+    cl::desc("Force execution of non-SPMD kernels in SPMD mode."), cl::Hidden,
+    cl::init(false));
+
+STATISTIC(NumKernelsConvertedToSPMD,
+          "Number of GPU kernels converted to SPMD mode");
+STATISTIC(NumParallelCallsConvertedToSPMD,
+          "Number of parallel GPU kernel regions converted to SPMD mode");
+STATISTIC(NumKernelsNonSPMDNoParallelism,
+          "Number of GPU kernel in non-SPMD mode without parallelism");
+STATISTIC(NumCustomStateMachinesCreated,
+          "Number of custom GPU kernel non-SPMD mode state machines created");
+STATISTIC(NumCustomStateMachinesNoFallback,
+          "Number of custom GPU kernel non-SPMD mode state machines without "
+          "fallback");
+
+namespace {
+
+/// Set of constants that describe the positions of arguments (ARG_FN_NAME) and
+/// the meaning of return values (RET_FN_MEANING) for the target region kernel
+/// interface. Has to be kept in sync with
+///   openmp/libomptarget/deviceRTLs/common/target_region.h
+/// and the respective implementations.
+enum {
+  ARG_INIT_USE_SPMD_MODE = 0,
+  ARG_INIT_USE_STATE_MACHINE = 1,
+  ARG_INIT_REQUIRES_OMP_RUNTIME = 2,
+  ARG_INIT_REQUIRES_DATA_SHARING = 3,
+
+  ARG_DEINIT_USE_SPMD_MODE = 0,
+  ARG_DEINIT_REQUIRES_OMP_RUNTIME = 1,
+
+  ARG_PARALLEL_USE_SPMD_MODE = 0,
+  ARG_PARALLEL_REQUIRES_OMP_RUNTIME = 1,
+  ARG_PARALLEL_WORK_FUNCTION = 2,
+  ARG_PARALLEL_SHARED_VARS = 3,
+  ARG_PARALLEL_SHARED_VARS_BYTES = 4,
+
+  RET_INIT_IS_WORKER = -1,
+  RET_INIT_IS_SURPLUS = 0,
+  RET_INIT_IS_MASTER = 1,
+};
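+
+/// For reference, the argument positions above correspond to the interface
+/// emitted by the clang TRegion codegen in this patch, e.g., for the init
+/// call:
+///   char __kmpc_target_region_kernel_init(bool UseSPMDMode,
+///                                         bool UseStateMachine,
+///                                         bool RequiresOMPRuntime,
+///                                         bool RequiresDataSharing);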
+
+/// A macro list to represent known functions from the omp, __kmpc, and target
+/// region interfaces. The first value is an enum identifier, see FunctionID,
+/// the second value is the function name, and the third the expected number of
+/// arguments.
+#define KNOWN_FUNCTIONS()                                                      \
+  KF(FID_OMP_GET_TEAM_NUM, "omp_get_team_num", 0)                              \
+  KF(FID_OMP_GET_NUM_TEAMS, "omp_get_num_teams", 0)                            \
+  KF(FID_OMP_GET_THREAD_NUM, "omp_get_thread_num", 0)                          \
+  KF(FID_OMP_GET_NUM_THREADS, "omp_get_num_threads", 0)                        \
+  KF(FID_OMP_SET_NUM_THREADS, "omp_set_num_threads", 1)                        \
+  KF(FID_KMPC_TREGION_KERNEL_INIT, "__kmpc_target_region_kernel_init", 4)      \
+  KF(FID_KMPC_TREGION_KERNEL_DEINIT, "__kmpc_target_region_kernel_deinit", 2)  \
+  KF(FID_KMPC_TREGION_KERNEL_PARALLEL, "__kmpc_target_region_kernel_parallel", \
+     8)                                                                        \
+  KF(FID_KMPC_FOR_STATIC_INIT_4, "__kmpc_for_static_init_4", 9)                \
+  KF(FID_KMPC_FOR_STATIC_FINI, "__kmpc_for_static_fini", 2)                    \
+  KF(FID_KMPC_GLOBAL_THREAD_NUM, "__kmpc_global_thread_num", 1)                \
+  KF(FID_KMPC_DISPATCH_INIT_4, "__kmpc_dispatch_init_4", 7)                    \
+  KF(FID_KMPC_DISPATCH_NEXT_4, "__kmpc_dispatch_next_4", 6)
+
+/// An identifier enum for each known function as well as the different kinds
+/// of unknown functions we distinguish.
+enum FunctionID {
+#define KF(NAME, STR, NARGS) NAME,
+  KNOWN_FUNCTIONS()
+#undef KF
+  // Unknown functions
+  //{
+  FID_KMPC_UNKNOWN, ///< unknown __kmpc_XXXX function
+  FID_OMP_UNKOWN,   ///< unknown omp_XXX function
+  FID_NVVM_UNKNOWN, ///< unknown llvm.nvvm.XXX function
+  FID_LLVM_UNKNOWN, ///< unknown llvm.XXX function
+  FID_UNKNOWN       ///< unknown function without known prefix.
+  //}
+};
+
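+// For example, a call to "__kmpc_for_static_init_4" maps to
+// FID_KMPC_FOR_STATIC_INIT_4, an unlisted name like "__kmpc_foo" maps to
+// FID_KMPC_UNKNOWN, and a name without a known prefix maps to FID_UNKNOWN.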
+static FunctionID getFunctionID(Function *F) {
+  if (!F)
+    return FID_UNKNOWN;
+#define KF(NAME, STR, NARGS) .Case(STR, NAME)
+  return StringSwitch<FunctionID>(F->getName()) KNOWN_FUNCTIONS()
+      .StartsWith("__kmpc_", FID_KMPC_UNKNOWN)
+      .StartsWith("omp_", FID_OMP_UNKOWN)
+      .StartsWith("llvm.nvvm.", FID_NVVM_UNKNOWN)
+      .StartsWith("llvm.", FID_LLVM_UNKNOWN)
+      .Default(FID_UNKNOWN);
+#undef KF
+}
+
+static Type *getOrCreateStructIdentTypePtr(Module &M) {
+  // TODO create if not present!
+  return M.getTypeByName("struct.ident_t")->getPointerTo();
+}
+
+// TODO: Simplify function declaration
+static Function *getOrCreateFn(Type *RT, const char *Name, Module &M) {
+  Function *Fn = M.getFunction(Name);
+  if (!Fn) {
+    FunctionType *FType = FunctionType::get(RT, {}, false);
+    Fn =
+        Function::Create(FType, llvm::GlobalVariable::ExternalLinkage, Name, M);
+  }
+  return Fn;
+}
+static Function *getOrCreateFn(Type *RT, Type *T0, Type *T1, const char *Name,
+                               Module &M) {
+  Function *Fn = M.getFunction(Name);
+  if (!Fn) {
+    FunctionType *FType = FunctionType::get(RT, {T0, T1}, false);
+    Fn =
+        Function::Create(FType, llvm::GlobalVariable::ExternalLinkage, Name, M);
+  }
+  return Fn;
+}
+
+static Function *getOrCreateSimpleSPMDBarrierFn(Module &M) {
+  static const char *Name = "__kmpc_barrier_simple_spmd";
+  Function *Fn = M.getFunction(Name);
+  if (!Fn) {
+    LLVMContext &Ctx = M.getContext();
+    FunctionType *FType = FunctionType::get(
+        Type::getVoidTy(Ctx),
+        {getOrCreateStructIdentTypePtr(M), Type::getInt32Ty(Ctx)}, false);
+    Fn =
+        Function::Create(FType, llvm::GlobalVariable::ExternalLinkage, Name, M);
+  }
+  return Fn;
+}
+
+/// A helper class to introduce smart guarding code.
+struct GuardGenerator {
+
+  /// Inform the guard generator about the side-effect instructions collected in
+  /// @p SideEffectInst.
+  ///
+  /// \returns True if all registered side-effects can be (efficiently)
+  /// guarded.
+  bool registerSideEffects(SmallVectorImpl<Instruction *> &SideEffectInst) {
+    bool Guarded = true;
+    if (SideEffectInst.empty())
+      return Guarded;
+
+    const Module &M = *SideEffectInst.front()->getModule();
+    const DataLayout &DL = M.getDataLayout();
+
+    SmallVector<Instruction *, 16> UnguardedSideEffectInst;
+    for (Instruction *I : SideEffectInst) {
+      if (CallInst *CI = dyn_cast<CallInst>(I)) {
+        if (getFunctionID(CI->getCalledFunction()) != FID_UNKNOWN)
+          continue;
+      } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+        if (isa<AllocaInst>(
+                SI->getPointerOperand()->stripInBoundsConstantOffsets()))
+          continue;
+      } else if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+        if (isSafeToLoadUnconditionally(LI->getPointerOperand(),
+                                        LI->getAlignment(), DL))
+          continue;
+      }
+      LLVM_DEBUG(dbgs() << "Non-SPMD side effect found: " << *I << "\n");
+      UnguardedSideEffectInst.push_back(I);
+    }
+
+    return UnguardedSideEffectInst.empty();
+  }
+
+  bool registerReadEffects(SmallVectorImpl<Instruction *> &ReadEffectInst) {
+    return registerSideEffects(ReadEffectInst);
+  }
+
+  void introduceGuards() {
+    // TODO: The guard generator cannot introduce guards yet but the registerXXX
+    //       functions above are aware of that!
+  }
+};
+
+/// Helper structure to represent and work with a target region kernel.
+struct KernelTy {
+
+  KernelTy(Function *KernelFn) : KernelFn(*KernelFn) {}
+
+  /// Optimize this kernel, return true if something was done.
+  bool optimize();
+
+private:
+  /// Analyze this kernel, return true if successful.
+  bool analyze(Function &F, SmallPtrSetImpl<Function *> &Visited,
+               bool InParallelRegion);
+
+  /// Return true if the kernel is executed in SPMD mode.
+  bool isExecutedInSPMDMode();
+
+  /// Convert a non-SPMD mode kernel to SPMD mode, return true if successful.
+  bool convertToSPMD();
+
+  /// Create a custom state machine in the module, return true if successful.
+  bool createCustomStateMachine();
+
+  /// All side-effect instructions potentially executed in this kernel.
+  SmallVector<Instruction *, 16> SideEffectInst;
+
+  /// All read-only instructions potentially executed in this kernel.
+  SmallVector<Instruction *, 16> ReadOnlyInst;
+
+  /// All non-analyzed calls contained in this kernel. They are separated by
+  /// their function ID which identifies known calls.
+  SmallVector<CallInst *, 2> KernelCalls[FID_UNKNOWN + 1];
+
+  /// All non-analyzed calls contained in parallel regions which are part of
+  /// this kernel. They are separated by their function ID which identifies
+  /// known calls.
+  SmallVector<CallInst *, 2> ParallelRegionCalls[FID_UNKNOWN + 1];
+
+  /// The entry function of this kernel.
+  Function &KernelFn;
+};
+
+bool KernelTy::analyze(Function &F, SmallPtrSetImpl<Function *> &Visited,
+                       bool InParallelRegion) {
+  if (!Visited.insert(&F).second)
+    return true;
+
+  LLVM_DEBUG(dbgs() << "Analyze "
+                    << (InParallelRegion ? "parallel-region" : "kernel")
+                    << " function: " << F.getName() << "\n");
+
+  // Determine where we remember the call.
+  auto &CallsArray = InParallelRegion ? ParallelRegionCalls : KernelCalls;
+
+  for (Instruction &I : instructions(&F)) {
+
+    // In parallel regions we only look for calls; outside of them we look for
+    // all side-effect and read-only instructions.
+    if (!InParallelRegion) {
+      // Handle non-side-effect instructions first. These will not write or
+      // throw which makes reading the only interesting potential property.
+      if (!I.mayHaveSideEffects()) {
+        if (I.mayReadFromMemory()) {
+          LLVM_DEBUG(dbgs() << "- read-only: " << I << "\n");
+          ReadOnlyInst.push_back(&I);
+        }
+        continue;
+      }
+
+      // Now we handle all non-call instructions.
+      if (!isa<CallInst>(I)) {
+        LLVM_DEBUG(dbgs() << "- side-effect: " << I << "\n");
+        SideEffectInst.push_back(&I);
+        continue;
+      }
+    }
+
+    if (!isa<CallInst>(I))
+      continue;
+
+    CallInst &CI = cast<CallInst>(I);
+    Function *Callee = CI.getCalledFunction();
+
+    // For exact definitions we recurse.
+    if (Callee && !Callee->isDeclaration() && Callee->isDefinitionExact()) {
+      // If recursive analysis failed we bail, otherwise the
+      // information was collected in the internal state.
+      if (!analyze(*Callee, Visited, InParallelRegion))
+        return false;
+      continue;
+    }
+
+    // Check that known functions have the right number of arguments early on.
+    // Additionally provide debug output based on the function ID.
+    FunctionID ID = getFunctionID(Callee);
+
+    switch (ID) {
+#define KF(NAME, STR, NARGS)                                                   \
+  case NAME:                                                                   \
+    LLVM_DEBUG(                                                                \
+        dbgs() << "- known call "                                              \
+               << (CI.getNumArgOperands() != NARGS ? "[#arg mismatch!]" : "")  \
+               << ": " << I << "\n");                                          \
+    if (CI.getNumArgOperands() != NARGS)                                       \
+      ID = FID_UNKNOWN;                                                        \
+    break;
+      KNOWN_FUNCTIONS()
+#undef KF
+    case FID_KMPC_UNKNOWN:
+      LLVM_DEBUG(dbgs() << "- unknown __kmpc_* call: " << I << "\n");
+      break;
+    case FID_OMP_UNKOWN:
+      LLVM_DEBUG(dbgs() << "- unknown omp_* call: " << I << "\n");
+      break;
+    case FID_NVVM_UNKNOWN:
+      LLVM_DEBUG(dbgs() << "- unknown llvm.nvvm.* call: " << I << "\n");
+      break;
+    case FID_LLVM_UNKNOWN:
+      LLVM_DEBUG(dbgs() << "- unknown llvm.* call: " << I << "\n");
+      break;
+    case FID_UNKNOWN:
+      LLVM_DEBUG(dbgs() << "- unknown call: " << I << "\n");
+      break;
+    }
+
+    CallsArray[ID].push_back(&CI);
+  }
+
+  // If we did not analyze the kernel function but some other one down the call
+  // chain we are done now.
+  // TODO: Add more verification code here.
+  if (&F != &KernelFn)
+    return true;
+
+  assert(&KernelCalls == &CallsArray);
+
+  // If we are analyzing the kernel function we need to verify we have at least
+  // the calls we expect to see in the right places.
+  if (KernelCalls[FID_KMPC_TREGION_KERNEL_INIT].size() != 1 ||
+      KernelCalls[FID_KMPC_TREGION_KERNEL_DEINIT].size() != 1 ||
+      KernelCalls[FID_KMPC_TREGION_KERNEL_INIT].front()->getParent() !=
+          &F.getEntryBlock()) {
+    LLVM_DEBUG(dbgs() << "- malformed kernel: [#Init: "
+                      << KernelCalls[FID_KMPC_TREGION_KERNEL_INIT].size()
+                      << "][#DeInit: "
+                      << KernelCalls[FID_KMPC_TREGION_KERNEL_DEINIT].size()
+                      << "]\n");
+    return false;
+  }
+
+  return true;
+}
+
+bool KernelTy::isExecutedInSPMDMode() {
+  assert(KernelCalls[FID_KMPC_TREGION_KERNEL_INIT].size() == 1 &&
+         "Non-canonical kernel form!");
+  auto *SPMDFlag = cast<Constant>(
+      KernelCalls[FID_KMPC_TREGION_KERNEL_INIT].front()->getArgOperand(
+          ARG_INIT_USE_SPMD_MODE));
+  assert(SPMDFlag->isZeroValue() || SPMDFlag->isOneValue());
+  return SPMDFlag->isOneValue();
+}
+
+bool KernelTy::optimize() {
+  bool Changed = false;
+
+  // First analyze the code. If that fails for some reason we bail out early.
+  SmallPtrSet<Function *, 8> Visited;
+  if (!analyze(KernelFn, Visited, /* InParallelRegion */ false))
+    return Changed;
+
+  Visited.clear();
+  for (CallInst *ParCI : KernelCalls[FID_KMPC_TREGION_KERNEL_PARALLEL]) {
+    Value *ParCIParallelFnArg =
+        ParCI->getArgOperand(ARG_PARALLEL_WORK_FUNCTION);
+    Function *ParallelFn =
+        dyn_cast<Function>(ParCIParallelFnArg->stripPointerCasts());
+    if (!ParallelFn ||
+        !analyze(*ParallelFn, Visited, /* InParallelRegion */ true))
+      return Changed;
+  }
+
+  Changed |= convertToSPMD();
+  Changed |= createCustomStateMachine();
+
+  return Changed;
+}
+
+bool KernelTy::convertToSPMD() {
+  if (isExecutedInSPMDMode())
+    return false;
+
+  bool Changed = false;
+
+  // Use a generic guard generator to determine if suitable guards for all
+  // side effect instructions can be placed.
+  GuardGenerator GG;
+
+  // Check if SIMDIZATION is possible, in case it is not forced.
+  if (!ForceOpenMPSIMDIZATION) {
+    // Unknown calls are not handled yet and will cause us to bail.
+    if (!KernelCalls[FID_UNKNOWN].empty())
+      return Changed;
+
+    // If we cannot guard all side effect instructions bail out.
+    if (!GG.registerSideEffects(SideEffectInst))
+      return Changed;
+
+    if (!GG.registerReadEffects(ReadOnlyInst))
+      return Changed;
+
+    // TODO: Emit a remark.
+    LLVM_DEBUG(dbgs() << "Transformation to SPMD OK\n");
+
+    // If we disabled SIMDIZATION we only emit the debug message and bail.
+    if (!PerformOpenMPSIMDIZATION)
+      return Changed;
+  }
+
+  // Actually emit the guard code after we decided to perform SIMDIZATION.
+  GG.introduceGuards();
+
+  // Create an "is-SPMD" flag.
+  Type *FlagTy = KernelCalls[FID_KMPC_TREGION_KERNEL_INIT][0]
+                     ->getArgOperand(ARG_INIT_USE_SPMD_MODE)
+                     ->getType();
+  Constant *SPMDFlag = ConstantInt::getTrue(FlagTy);
+
+  // Update the init and deinit calls with the "is-SPMD" flag to indicate
+  // SPMD mode.
+  assert(KernelCalls[FID_KMPC_TREGION_KERNEL_INIT].size() == 1 &&
+         "Non-canonical kernel form!");
+  assert(KernelCalls[FID_KMPC_TREGION_KERNEL_DEINIT].size() == 1 &&
+         "Non-canonical kernel form!");
+  KernelCalls[FID_KMPC_TREGION_KERNEL_INIT][0]->setArgOperand(
+      ARG_INIT_USE_SPMD_MODE, SPMDFlag);
+  KernelCalls[FID_KMPC_TREGION_KERNEL_DEINIT][0]->setArgOperand(
+      ARG_DEINIT_USE_SPMD_MODE, SPMDFlag);
+
+  // Use the simple barrier to synchronize all threads in SPMD mode after each
+  // parallel region.
+  Function *SimpleBarrierFn =
+      getOrCreateSimpleSPMDBarrierFn(*KernelFn.getParent());
+
+  // For each parallel region, identified by the
+  // __kmpc_target_region_kernel_parallel call, we set the "is-SPMD" flag and
+  // introduce a succeeding barrier call.
+  for (CallInst *ParCI : KernelCalls[FID_KMPC_TREGION_KERNEL_PARALLEL]) {
+    ParCI->setArgOperand(ARG_PARALLEL_USE_SPMD_MODE, SPMDFlag);
+    auto AI = SimpleBarrierFn->arg_begin();
+    CallInst::Create(SimpleBarrierFn,
+                     {Constant::getNullValue((AI++)->getType()),
+                      Constant::getNullValue((AI)->getType())},
+                     "", ParCI->getNextNode());
+  }
+
+  // TODO: serialize nested parallel regions
+
+  // Finally, we change the global exec_mode variable to indicate SPMD mode.
+  GlobalVariable *ExecMode = KernelFn.getParent()->getGlobalVariable(
+      (KernelFn.getName() + "_exec_mode").str());
+  assert(ExecMode &&
+         "Assumed to find an execution mode hint among the globals");
+  assert(ExecMode->getInitializer()->isOneValue() &&
+         "Assumed target_region execution mode prior to 'SPMD'-zation");
+  ExecMode->setInitializer(
+      Constant::getNullValue(ExecMode->getInitializer()->getType()));
+
+  // Bookkeeping
+  NumKernelsConvertedToSPMD++;
+  NumParallelCallsConvertedToSPMD +=
+      KernelCalls[FID_KMPC_TREGION_KERNEL_PARALLEL].size();
+
+  return true;
+}
+
+bool KernelTy::createCustomStateMachine() {
+  if (isExecutedInSPMDMode())
+    return false;
+
+  // TODO: Warn or eliminate the offloading if no parallel regions are present.
+  // TODO: Use reachability to eliminate the loop and if-cascade
+  //
+  // The user module code looks as follows if this function returns true.
+  //
+  //   ThreadKind = __kmpc_target_region_kernel_init(...)
+  //   if (ThreadKind == -1) {               //  actual worker thread
+  //     do {
+  //       __kmpc_barrier_simple_spmd(...)
+  //       void *WorkFn;
+  //       bool IsActive = __kmpc_kernel_parallel(&WorkFn, ...);
+  //       if (!WorkFn)
+  //         goto exit;
+  //       if (IsActive) {
+  //         char *SharedVars = __kmpc_target_region_kernel_get_shared_memory();
+  //         char *PrivateVars =
+  //             __kmpc_target_region_kernel_get_private_memory();
+  //
+  //         ((ParallelWorkFnTy)WorkFn)(SharedVars, PrivateVars);
+  //
+  //         __kmpc_kernel_end_parallel();
+  //       }
+  //       __kmpc_barrier_simple_spmd(...)
+  //     } while (true);
+  //   } else if (ThreadKind == 0) {         // surplus worker thread
+  //     goto exit;
+  //   } else {                              //    team master thread
+  //     goto user_code;
+  //   }
+
+  if (KernelCalls[FID_KMPC_TREGION_KERNEL_PARALLEL].empty()) {
+    LLVM_DEBUG(dbgs() << "Will not build a custom state machine because there "
+                         "are no known parallel regions in the kernel.\n");
+    // TODO: If we also know there are no hidden parallel calls we can
+    //       terminate all but the master thread right away.
+    NumKernelsNonSPMDNoParallelism++;
+    return false;
+  }
+
+  assert(KernelCalls[FID_KMPC_TREGION_KERNEL_INIT].size() == 1 &&
+         "Non-canonical kernel form!");
+  CallInst *InitCI = KernelCalls[FID_KMPC_TREGION_KERNEL_INIT][0];
+
+  // Check if a custom state machine was already put in place, indicated by
+  // the "use state machine" flag not being a constant 'true'.
+  auto *UseSM =
+      dyn_cast<ConstantInt>(InitCI->getArgOperand(ARG_INIT_USE_STATE_MACHINE));
+  if (!UseSM || !UseSM->isOne()) {
+    LLVM_DEBUG(dbgs() << "Will not build a custom state machine because of "
+                      << *KernelCalls[FID_KMPC_TREGION_KERNEL_INIT][0] << "\n");
+    return false;
+  }
+
+  InitCI->setName("thread_kind");
+  LLVMContext &Ctx = InitCI->getContext();
+
+  // Create local storage for the work function pointer.
+  Type *VoidPtrTy = Type::getInt8PtrTy(Ctx);
+  AllocaInst *WorkFnAI = new AllocaInst(VoidPtrTy, 0, "work_fn.addr",
+                                        &KernelFn.getEntryBlock().front());
+
+  Instruction *IP = InitCI->getNextNode();
+
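+  // We emit a custom state machine below, so tell the runtime not to set up
+  // its generic one by clearing the "use state machine" flag.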
+  Type *FlagTy = InitCI->getArgOperand(ARG_INIT_USE_STATE_MACHINE)->getType();
+  Constant *SMFlag = ConstantInt::getFalse(FlagTy);
+  InitCI->setArgOperand(ARG_INIT_USE_STATE_MACHINE, SMFlag);
+
+  // Check the return value of __kmpc_target_region_kernel_init. First compare
+  // it to RET_INIT_IS_WORKER.
+  Instruction *WorkerCnd = new ICmpInst(
+      IP, ICmpInst::ICMP_EQ, InitCI,
+      ConstantInt::getSigned(InitCI->getType(), RET_INIT_IS_WORKER),
+      "is_worker");
+
+  // Create the conditional which is entered by worker threads.
+  Instruction *WaitTI = SplitBlockAndInsertIfThen(WorkerCnd, IP, false);
+  BasicBlock *WaitBB = WaitTI->getParent();
+  WaitBB->setName("worker.wait");
+  IP->getParent()->setName("master_check");
+
+  Instruction *MasterCheckTI = IP->getParent()->getTerminator();
+  assert(MasterCheckTI->getNumSuccessors() == 2);
+  assert(WaitTI->getNumSuccessors() == 1);
+
+  // Determine the final block, i.e., the trivial block in which the kernel
+  // ends.
+  BasicBlock *FinalBB = nullptr;
+  if (MasterCheckTI->getSuccessor(0)->size() == 1 &&
+      isa<ReturnInst>(MasterCheckTI->getSuccessor(0)->getTerminator()))
+    FinalBB = MasterCheckTI->getSuccessor(0);
+  else if (MasterCheckTI->getSuccessor(1)->size() == 1 &&
+           isa<ReturnInst>(MasterCheckTI->getSuccessor(1)->getTerminator()))
+    FinalBB = MasterCheckTI->getSuccessor(1);
+  assert(FinalBB && "Could not determine the final kernel block.");
+
+  // Use the simple SPMD barrier to synchronize the worker threads with the
+  // team master in the state machine loop.
+  Module &M = *KernelFn.getParent();
+  Function *SimpleBarrierFn = getOrCreateSimpleSPMDBarrierFn(M);
+
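+  // Workers wait at this barrier at the top of the state machine loop until
+  // the team master releases them with work (see the pseudo-code above).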
+  auto AI = SimpleBarrierFn->arg_begin();
+  Instruction *BarrierCall =
+      CallInst::Create(SimpleBarrierFn,
+                       {Constant::getNullValue((AI++)->getType()),
+                        Constant::getNullValue((AI)->getType())},
+                       "", WaitTI);
+
+  Function *KernelParallelFn =
+      getOrCreateFn(Type::getInt1Ty(Ctx), VoidPtrTy->getPointerTo(),
+                    Type::getInt16Ty(Ctx), "__kmpc_kernel_parallel", M);
+
+  Value *RequiresOMPRuntime = CastInst::CreateZExtOrBitCast(
+      InitCI->getArgOperand(ARG_INIT_REQUIRES_OMP_RUNTIME),
+      Type::getInt16Ty(Ctx), "", WaitTI);
+  Instruction *ActiveCnd = CallInst::Create(
+      KernelParallelFn, {WorkFnAI, RequiresOMPRuntime}, "is_active", WaitTI);
+
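+  // Load the work function pointer the master communicated through
+  // __kmpc_kernel_parallel.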
+  Type *WorkFnPrototype =
+      FunctionType::get(Type::getVoidTy(Ctx), {VoidPtrTy, VoidPtrTy}, false)
+          ->getPointerTo();
+  Value *WorkFnAICast = BitCastInst::CreatePointerBitCastOrAddrSpaceCast(
+      WorkFnAI, WorkFnPrototype->getPointerTo(), "Work_fn.addr_cast", WaitTI);
+  Value *WorkFn = new LoadInst(WorkFnAICast, "work_fn", WaitTI);
+
+  Instruction *WorkFnCnd =
+      new ICmpInst(WaitTI, ICmpInst::ICMP_EQ, WorkFn,
+                   Constant::getNullValue(WorkFn->getType()), "no_work");
+
+  Instruction *FinishedTI = SplitBlockAndInsertIfThen(WorkFnCnd, WaitTI, false);
+  FinishedTI->getParent()->setName("worker.finished");
+  WaitTI->getParent()->setName("worker.active_check");
+
+  Instruction *ActiveTI = SplitBlockAndInsertIfThen(ActiveCnd, WaitTI, false);
+  ActiveTI->getParent()->setName("worker.active");
+  WaitTI->getParent()->setName("worker.inactive");
+
+  Function *KernelGetSharedVars = getOrCreateFn(
+      VoidPtrTy, "__kmpc_target_region_kernel_get_shared_memory", M);
+  Value *SharedVars = CallInst::Create(KernelGetSharedVars, "", ActiveTI);
+  Function *KernelGetPrivateVars = getOrCreateFn(
+      VoidPtrTy, "__kmpc_target_region_kernel_get_private_memory", M);
+  Value *PrivateVars = CallInst::Create(KernelGetPrivateVars, "", ActiveTI);
+
+  BasicBlock *ExecuteBB = ActiveTI->getParent();
+  BasicBlock *ParallelEndBB = SplitBlock(ExecuteBB, ActiveTI);
+  ParallelEndBB->setName("worker.parallel_end");
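+  // Both the known parallel region calls and the fallback path branch to this
+  // block once the work function has been executed.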
+
+  Function *KernelEndParallelFn =
+      getOrCreateFn(Type::getVoidTy(Ctx), "__kmpc_kernel_end_parallel", M);
+  CallInst::Create(KernelEndParallelFn, "", ActiveTI);
+
+  // A fallback is required if we might not see all parallel regions
+  // (__kmpc_target_region_kernel_parallel calls). This could be the case if
+  // there is an unknown function call with side effects in the target region
+  // or inside one of the parallel regions.
+  bool RequiresFallback = !KernelCalls[FID_UNKNOWN].empty() ||
+                          !ParallelRegionCalls[FID_UNKNOWN].empty();
+
+  // Collect all target region parallel calls
+  // (__kmpc_target_region_kernel_parallel).
+  SmallVector<CallInst *, 16> KernelParallelCalls;
+  KernelParallelCalls.append(
+      KernelCalls[FID_KMPC_TREGION_KERNEL_PARALLEL].begin(),
+      KernelCalls[FID_KMPC_TREGION_KERNEL_PARALLEL].end());
+  KernelParallelCalls.append(
+      ParallelRegionCalls[FID_KMPC_TREGION_KERNEL_PARALLEL].begin(),
+      ParallelRegionCalls[FID_KMPC_TREGION_KERNEL_PARALLEL].end());
+
+  IP = ExecuteBB->getTerminator();
+
+  // For each parallel call, create a conditional that compares the loaded work
+  // function against the known parallel work function of that call, if
+  // available. If the function pointers are equal, we call that work function
+  // directly and continue to the end of the if-cascade.
+  for (CallInst *ParCI : KernelParallelCalls) {
+    Function *ParFn = dyn_cast<Function>(
+        ParCI->getArgOperand(ARG_PARALLEL_WORK_FUNCTION)->stripPointerCasts());
+    if (!ParFn) {
+      LLVM_DEBUG(
+          dbgs() << "Require fallback due to unknown parallel function\n");
+      RequiresFallback = true;
+      continue;
+    }
+
+    Value *ParFnCnd =
+        new ICmpInst(IP, ICmpInst::ICMP_EQ, WorkFn, ParFn, "par_fn_check");
+    Instruction *ParFnTI = SplitBlockAndInsertIfThen(ParFnCnd, IP, false);
+    IP->getParent()->setName("worker.check.next");
+    ParFnTI->getParent()->setName("worker.execute." + ParFn->getName());
+    CallInst::Create(ParFn, {SharedVars, PrivateVars}, "", ParFnTI);
+    ParFnTI->setSuccessor(0, ParallelEndBB);
+  }
+
+  // If a fallback is required we emit an indirect call before we jump to the
+  // point where all cases converge.
+  if (RequiresFallback)
+    CallInst::Create(WorkFn, {SharedVars, PrivateVars}, "", IP);
+
+  // Insert a barrier call at the convergence point, right before the back edge.
+  BarrierCall->clone()->insertBefore(WaitTI);
+
+  // Rewire the CFG edges to introduce the back and exit edge of the new loop.
+  // TODO: Add the new loop to LI!
+  FinishedTI->setSuccessor(0, FinalBB);
+  WaitTI->setSuccessor(0, WaitBB);
+
+  // Bookkeeping.
+  NumCustomStateMachinesCreated++;
+  NumCustomStateMachinesNoFallback += !RequiresFallback;
+
+  return true;
+}
+
+template <class T>
+static void collectCallersOf(Module &M, StringRef Name,
+                             SmallVectorImpl<T> &Callers) {
+  Function *Callee = M.getFunction(Name);
+
+  // If the callee function is not present, we are done.
+  if (!Callee)
+    return;
+
+  // If it exists we check all users.
+  for (const Use &U : Callee->uses()) {
+    CallSite CS(U.getUser());
+
+    // Filter out non-callee uses and non-call uses.
+    if (!CS || !CS.isCallee(&U) || !isa<CallInst>(CS.getInstruction()))
+      continue;
+
+    // Found a caller; use it to construct a T object and append it to the
+    // Callers vector.
+    Callers.emplace_back(T(CS.getCaller()));
+  }
+}
+
+/// OpenMPOpt - The interprocedural OpenMP optimization pass implementation.
+struct OpenMPOpt {
+
+  bool runOnModule(Module &M) {
+    bool Changed = false;
+
+    // Collect target region kernels, identified by a call to
+    // __kmpc_target_region_kernel_init.
+    collectCallersOf(M, "__kmpc_target_region_kernel_init", TRKernels);
+
+    for (KernelTy &K : TRKernels)
+      Changed |= K.optimize();
+
+    return Changed;
+  }
+
+private:
+  /// A collection of all target region kernels we found.
+  SmallVector<KernelTy, 4> TRKernels;
+};
+
+// TODO: This could be a CGSCC pass as well.
+struct OpenMPOptLegacy : public ModulePass {
+  static char ID; // Pass identification, replacement for typeid
+  OpenMPOpt OMPOpt;
+
+  OpenMPOptLegacy() : ModulePass(ID) {
+    initializeOpenMPOptLegacyPass(*PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {}
+
+  bool runOnModule(Module &M) override { return OMPOpt.runOnModule(M); }
+};
+
+// TODO: Add a new PM entry point.
+
+} // namespace
+
+char OpenMPOptLegacy::ID = 0;
+
+INITIALIZE_PASS_BEGIN(OpenMPOptLegacy, "openmp-opt",
+                      "OpenMP specific optimizations", false, false)
+INITIALIZE_PASS_END(OpenMPOptLegacy, "openmp-opt",
+                    "OpenMP specific optimizations", false, false)
+
+Pass *llvm::createOpenMPOptLegacyPass() { return new OpenMPOptLegacy(); }
diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
--- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -517,6 +517,10 @@
   // Infer attributes about declarations if possible.
   MPM.add(createInferFunctionAttrsLegacyPass());
 
+  // Try to perform OpenMP specific optimizations. This is a no-op if OpenMP
+  // runtime calls are not present in the module.
+  MPM.add(createOpenMPOptLegacyPass());
+
   addExtensionsToPM(EP_ModuleOptimizerEarly, MPM);
 
   if (OptLevel > 2)
diff --git a/llvm/test/Other/opt-O2-pipeline.ll b/llvm/test/Other/opt-O2-pipeline.ll
--- a/llvm/test/Other/opt-O2-pipeline.ll
+++ b/llvm/test/Other/opt-O2-pipeline.ll
@@ -27,6 +27,7 @@
 ; CHECK-NEXT:   ModulePass Manager
 ; CHECK-NEXT:     Force set function attributes
 ; CHECK-NEXT:     Infer set function attributes
+; CHECK-NEXT:     OpenMP specific optimizations
 ; CHECK-NEXT:     Interprocedural Sparse Conditional Constant Propagation
 ; CHECK-NEXT:       Unnamed pass: implement Pass::getPassName()
 ; CHECK-NEXT:     Called Value Propagation
diff --git a/llvm/test/Other/opt-O3-pipeline.ll b/llvm/test/Other/opt-O3-pipeline.ll
--- a/llvm/test/Other/opt-O3-pipeline.ll
+++ b/llvm/test/Other/opt-O3-pipeline.ll
@@ -27,6 +27,7 @@
 ; CHECK-NEXT:   ModulePass Manager
 ; CHECK-NEXT:     Force set function attributes
 ; CHECK-NEXT:     Infer set function attributes
+; CHECK-NEXT:     OpenMP specific optimizations
 ; CHECK-NEXT:     FunctionPass Manager
 ; CHECK-NEXT:       Dominator Tree Construction
 ; CHECK-NEXT:       Call-site splitting
diff --git a/llvm/test/Other/opt-Os-pipeline.ll b/llvm/test/Other/opt-Os-pipeline.ll
--- a/llvm/test/Other/opt-Os-pipeline.ll
+++ b/llvm/test/Other/opt-Os-pipeline.ll
@@ -27,6 +27,7 @@
 ; CHECK-NEXT:   ModulePass Manager
 ; CHECK-NEXT:     Force set function attributes
 ; CHECK-NEXT:     Infer set function attributes
+; CHECK-NEXT:     OpenMP specific optimizations
 ; CHECK-NEXT:     Interprocedural Sparse Conditional Constant Propagation
 ; CHECK-NEXT:       Unnamed pass: implement Pass::getPassName()
 ; CHECK-NEXT:     Called Value Propagation
diff --git a/llvm/test/Transforms/OpenMP/no_SPMD_mode.ll b/llvm/test/Transforms/OpenMP/no_SPMD_mode.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/OpenMP/no_SPMD_mode.ll
@@ -0,0 +1,1064 @@
+; RUN: opt < %s -openmp-opt -stats -disable-output 2>&1 | FileCheck %s --check-prefix=STATS
+; RUN: opt < %s -openmp-opt -S 2>&1 | FileCheck %s
+;
+; REQUIRES: asserts
+;
+; Check that we will not execute any of the below target regions in SPMD-mode.
+; TODO: SPMD-mode is valid for target regions 2 and 3 if proper guarding code is inserted.
+;
+; See the to_SPMD_mode.ll file for almost the same functions that can be translated to SPMD mode.
+;
+; STATS-DAG: 1 openmp-opt - Number of GPU kernel in non-SPMD mode without parallelism
+; STATS-DAG: 3 openmp-opt - Number of custom GPU kernel non-SPMD mode state machines created
+; STATS-DAG: 2 openmp-opt - Number of custom GPU kernel non-SPMD mode state machines without fallback
+;
+; No state machine needed because there is no parallel region.
+; CHECK: void @{{.*}}loop_in_loop_in_tregion
+; CHECK: call i8 @__kmpc_target_region_kernel_init(i1 false, i1 {{[a-z]*}}, i1 true
+; CHECK: call void @__kmpc_target_region_kernel_deinit(i1 false,
+;
+; void loop_in_loop_in_tregion(int *A, int *B) {
+; #pragma omp target
+;   for (int i = 0; i < 512; i++) {
+;     for (int j = 0; j < 1024; j++)
+;       A[j] += B[i + j];
+;   }
+; }
+;
+;
+; Custom state machine needed but no fallback because all parallel regions are known
+; CHECK: void @{{.*}}parallel_loops_and_accesses_in_tregion
+; CHECK: call i8 @__kmpc_target_region_kernel_init(i1 false, i1 {{[a-z]*}}, i1 false
+; The "check.next" block should not contain a fallback call
+; CHECK:       worker.check.next4:
+; CHECK-NEXT:    br label %worker.parallel_end
+; CHECK: call void @__kmpc_target_region_kernel_deinit(i1 false,
+;
+; void parallel_loops_and_accesses_in_tregion(int *A, int *B) {
+; #pragma omp target
+;   {
+; #pragma omp parallel for
+;     for (int j = 0; j < 1024; j++)
+;       A[j] += B[0 + j];
+; #pragma omp parallel for
+;     for (int j = 0; j < 1024; j++)
+;       A[j] += B[1 + j];
+; #pragma omp parallel for
+;     for (int j = 0; j < 1024; j++)
+;       A[j] += B[2 + j];
+;
+;     // This needs a guard in SPMD mode
+;     A[0] = B[0];
+;   }
+; }
+;
+; void extern_func();
+; static void parallel_loop(int *A, int *B, int i) {
+; #pragma omp parallel for
+;   for (int j = 0; j < 1024; j++)
+;     A[j] += B[i + j];
+; }
+;
+; int Global[512];
+;
+;
+; Custom state machine needed but no fallback because all parallel regions are known
+; CHECK: void @{{.*}}parallel_loop_in_function_in_loop_with_global_acc_in_tregion
+; CHECK: call i8 @__kmpc_target_region_kernel_init(i1 false, i1 {{[a-z]*}}, i1 false
+; The "check.next" block should not contain a fallback call
+; CHECK:       worker.check.next:
+; CHECK-NEXT:    br label %worker.parallel_end
+; CHECK: call void @__kmpc_target_region_kernel_deinit(i1 false,
+;
+; void parallel_loop_in_function_in_loop_with_global_acc_in_tregion(int *A, int *B) {
+; #pragma omp target
+;   for (int i = 0; i < 512; i++) {
+;     parallel_loop(A, B, i);
+;     Global[i]++;
+;   }
+; }
+;
+; Custom state machine needed with fallback because "extern_func" might contain parallel regions.
+; CHECK: void @{{.*}}parallel_loops_in_functions_and_extern_func_in_tregion
+; CHECK: call i8 @__kmpc_target_region_kernel_init(i1 false, i1 {{[a-z]*}}, i1 false
+; The "check.next" block should contain a fallback call
+; CHECK:       worker.check.next:
+; CHECK-NEXT:    call void %work_fn(
+; CHECK-NEXT:    br label %worker.parallel_end
+; CHECK: call void @__kmpc_target_region_kernel_deinit(i1 false,
+;
+; void parallel_loops_in_functions_and_extern_func_in_tregion(int *A, int *B) {
+; #pragma omp target
+;   {
+;     parallel_loop(A, B, 1);
+;     parallel_loop(A, B, 2);
+;     extern_func();
+;     parallel_loop(A, B, 3);
+;   }
+; }
+
+source_filename = "../llvm/test/Transforms/OpenMP/no_SPMD_mode.c"
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+%struct.ident_t = type { i32, i32, i32, i32, i8* }
+%omp.private.struct = type { i32**, i32** }
+%omp.private.struct.0 = type { i32**, i32** }
+%omp.private.struct.1 = type { i32**, i32** }
+%omp.private.struct.2 = type { i32**, i32**, i32* }
+
+@__omp_offloading_18_29b03e4_loop_in_loop_in_tregion_l2_exec_mode = weak constant i8 1
+@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
+@0 = private unnamed_addr global %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8
+@1 = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8
+@__omp_offloading_18_29b03e4_parallel_loops_and_accesses_in_tregion_l9_exec_mode = weak constant i8 1
+@__omp_offloading_18_29b03e4_parallel_loop_in_function_in_loop_with_global_acc_in_tregion_l35_exec_mode = weak constant i8 1
+@__omp_offloading_18_29b03e4_parallel_loops_in_functions_and_extern_func_in_tregion_l43_exec_mode = weak constant i8 1
+@llvm.compiler.used = appending global [4 x i8*] [i8* @__omp_offloading_18_29b03e4_loop_in_loop_in_tregion_l2_exec_mode, i8* @__omp_offloading_18_29b03e4_parallel_loops_and_accesses_in_tregion_l9_exec_mode, i8* @__omp_offloading_18_29b03e4_parallel_loop_in_function_in_loop_with_global_acc_in_tregion_l35_exec_mode, i8* @__omp_offloading_18_29b03e4_parallel_loops_in_functions_and_extern_func_in_tregion_l43_exec_mode], section "llvm.metadata"
+
+; Function Attrs: norecurse nounwind
+define weak void @__omp_offloading_18_29b03e4_loop_in_loop_in_tregion_l2(i32* %A, i32* %B) #0 {
+entry:
+  %A.addr = alloca i32*, align 8
+  %B.addr = alloca i32*, align 8
+  %i = alloca i32, align 4
+  %cleanup.dest.slot = alloca i32, align 4
+  %j = alloca i32, align 4
+  store i32* %A, i32** %A.addr, align 8, !tbaa !11
+  store i32* %B, i32** %B.addr, align 8, !tbaa !11
+  %0 = call i8 @__kmpc_target_region_kernel_init(i1 false, i1 true, i1 true, i1 true)
+  %1 = icmp eq i8 %0, 1
+  br i1 %1, label %.execute, label %.exit
+
+.execute:                                         ; preds = %entry
+  %2 = bitcast i32* %i to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %2) #4
+  store i32 0, i32* %i, align 4, !tbaa !15
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc8, %.execute
+  %3 = load i32, i32* %i, align 4, !tbaa !15
+  %cmp = icmp slt i32 %3, 512
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond
+  store i32 2, i32* %cleanup.dest.slot, align 4
+  %4 = bitcast i32* %i to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %4) #4
+  br label %for.end10
+
+for.body:                                         ; preds = %for.cond
+  %5 = bitcast i32* %j to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %5) #4
+  store i32 0, i32* %j, align 4, !tbaa !15
+  br label %for.cond1
+
+for.cond1:                                        ; preds = %for.inc, %for.body
+  %6 = load i32, i32* %j, align 4, !tbaa !15
+  %cmp2 = icmp slt i32 %6, 1024
+  br i1 %cmp2, label %for.body4, label %for.cond.cleanup3
+
+for.cond.cleanup3:                                ; preds = %for.cond1
+  store i32 5, i32* %cleanup.dest.slot, align 4
+  %7 = bitcast i32* %j to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %7) #4
+  br label %for.end
+
+for.body4:                                        ; preds = %for.cond1
+  %8 = load i32*, i32** %B.addr, align 8, !tbaa !11
+  %9 = load i32, i32* %i, align 4, !tbaa !15
+  %10 = load i32, i32* %j, align 4, !tbaa !15
+  %add = add nsw i32 %9, %10
+  %idxprom = sext i32 %add to i64
+  %arrayidx = getelementptr inbounds i32, i32* %8, i64 %idxprom
+  %11 = load i32, i32* %arrayidx, align 4, !tbaa !15
+  %12 = load i32*, i32** %A.addr, align 8, !tbaa !11
+  %13 = load i32, i32* %j, align 4, !tbaa !15
+  %idxprom5 = sext i32 %13 to i64
+  %arrayidx6 = getelementptr inbounds i32, i32* %12, i64 %idxprom5
+  %14 = load i32, i32* %arrayidx6, align 4, !tbaa !15
+  %add7 = add nsw i32 %14, %11
+  store i32 %add7, i32* %arrayidx6, align 4, !tbaa !15
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body4
+  %15 = load i32, i32* %j, align 4, !tbaa !15
+  %inc = add nsw i32 %15, 1
+  store i32 %inc, i32* %j, align 4, !tbaa !15
+  br label %for.cond1
+
+for.end:                                          ; preds = %for.cond.cleanup3
+  br label %for.inc8
+
+for.inc8:                                         ; preds = %for.end
+  %16 = load i32, i32* %i, align 4, !tbaa !15
+  %inc9 = add nsw i32 %16, 1
+  store i32 %inc9, i32* %i, align 4, !tbaa !15
+  br label %for.cond
+
+for.end10:                                        ; preds = %for.cond.cleanup
+  br label %.omp.deinit
+
+.omp.deinit:                                      ; preds = %for.end10
+  call void @__kmpc_target_region_kernel_deinit(i1 false, i1 true)
+  br label %.exit
+
+.exit:                                            ; preds = %.omp.deinit, %entry
+  ret void
+}
+
+declare i8 @__kmpc_target_region_kernel_init(i1, i1, i1, i1)
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
+
+declare void @__kmpc_target_region_kernel_deinit(i1, i1)
+
+; Function Attrs: norecurse nounwind
+define weak void @__omp_offloading_18_29b03e4_parallel_loops_and_accesses_in_tregion_l9(i32* %A, i32* %B) #0 {
+entry:
+  %A.addr = alloca i32*, align 8
+  %B.addr = alloca i32*, align 8
+  %.private.vars = alloca %omp.private.struct, align 8
+  %.private.vars1 = alloca %omp.private.struct.0, align 8
+  %.private.vars2 = alloca %omp.private.struct.1, align 8
+  store i32* %A, i32** %A.addr, align 8, !tbaa !11
+  store i32* %B, i32** %B.addr, align 8, !tbaa !11
+  %0 = call i8 @__kmpc_target_region_kernel_init(i1 false, i1 true, i1 true, i1 true)
+  %1 = icmp eq i8 %0, 1
+  br i1 %1, label %.execute, label %.exit
+
+.execute:                                         ; preds = %entry
+  %2 = bitcast %omp.private.struct* %.private.vars to i8*
+  %3 = getelementptr inbounds %omp.private.struct, %omp.private.struct* %.private.vars, i32 0, i32 0
+  store i32** %A.addr, i32*** %3
+  %4 = getelementptr inbounds %omp.private.struct, %omp.private.struct* %.private.vars, i32 0, i32 1
+  store i32** %B.addr, i32*** %4
+  call void @__kmpc_target_region_kernel_parallel(i1 false, i1 true, void (i8*, i8*)* @.omp_TRegion._wrapper, i8* undef, i16 0, i8* %2, i16 16, i1 false)
+  %5 = bitcast %omp.private.struct.0* %.private.vars1 to i8*
+  %6 = getelementptr inbounds %omp.private.struct.0, %omp.private.struct.0* %.private.vars1, i32 0, i32 0
+  store i32** %A.addr, i32*** %6
+  %7 = getelementptr inbounds %omp.private.struct.0, %omp.private.struct.0* %.private.vars1, i32 0, i32 1
+  store i32** %B.addr, i32*** %7
+  call void @__kmpc_target_region_kernel_parallel(i1 false, i1 true, void (i8*, i8*)* @.omp_TRegion.1_wrapper, i8* undef, i16 0, i8* %5, i16 16, i1 false)
+  %8 = bitcast %omp.private.struct.1* %.private.vars2 to i8*
+  %9 = getelementptr inbounds %omp.private.struct.1, %omp.private.struct.1* %.private.vars2, i32 0, i32 0
+  store i32** %A.addr, i32*** %9
+  %10 = getelementptr inbounds %omp.private.struct.1, %omp.private.struct.1* %.private.vars2, i32 0, i32 1
+  store i32** %B.addr, i32*** %10
+  call void @__kmpc_target_region_kernel_parallel(i1 false, i1 true, void (i8*, i8*)* @.omp_TRegion.2_wrapper, i8* undef, i16 0, i8* %8, i16 16, i1 false)
+  %11 = load i32*, i32** %B.addr, align 8, !tbaa !11
+  %arrayidx = getelementptr inbounds i32, i32* %11, i64 0
+  %12 = load i32, i32* %arrayidx, align 4, !tbaa !15
+  %13 = load i32*, i32** %A.addr, align 8, !tbaa !11
+  %arrayidx3 = getelementptr inbounds i32, i32* %13, i64 0
+  store i32 %12, i32* %arrayidx3, align 4, !tbaa !15
+  br label %.omp.deinit
+
+.omp.deinit:                                      ; preds = %.execute
+  call void @__kmpc_target_region_kernel_deinit(i1 false, i1 true)
+  br label %.exit
+
+.exit:                                            ; preds = %.omp.deinit, %entry
+  ret void
+}
+
+; Function Attrs: norecurse nounwind
+define internal void @.omp_TRegion.(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32** dereferenceable(8) %A, i32** dereferenceable(8) %B) #0 {
+entry:
+  %.global_tid..addr = alloca i32*, align 8
+  %.bound_tid..addr = alloca i32*, align 8
+  %A.addr = alloca i32**, align 8
+  %B.addr = alloca i32**, align 8
+  %.omp.iv = alloca i32, align 4
+  %tmp = alloca i32, align 4
+  %.omp.lb = alloca i32, align 4
+  %.omp.ub = alloca i32, align 4
+  %.omp.stride = alloca i32, align 4
+  %.omp.is_last = alloca i32, align 4
+  %j = alloca i32, align 4
+  store i32* %.global_tid., i32** %.global_tid..addr, align 8, !tbaa !11
+  store i32* %.bound_tid., i32** %.bound_tid..addr, align 8, !tbaa !11
+  store i32** %A, i32*** %A.addr, align 8, !tbaa !11
+  store i32** %B, i32*** %B.addr, align 8, !tbaa !11
+  %0 = load i32**, i32*** %A.addr, align 8, !tbaa !11
+  %1 = load i32**, i32*** %B.addr, align 8, !tbaa !11
+  %2 = bitcast i32* %.omp.iv to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %2) #4
+  %3 = bitcast i32* %.omp.lb to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %3) #4
+  store i32 0, i32* %.omp.lb, align 4, !tbaa !15
+  %4 = bitcast i32* %.omp.ub to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %4) #4
+  store i32 1023, i32* %.omp.ub, align 4, !tbaa !15
+  %5 = bitcast i32* %.omp.stride to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %5) #4
+  store i32 1, i32* %.omp.stride, align 4, !tbaa !15
+  %6 = bitcast i32* %.omp.is_last to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %6) #4
+  store i32 0, i32* %.omp.is_last, align 4, !tbaa !15
+  %7 = bitcast i32* %j to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %7) #4
+  %8 = load i32*, i32** %.global_tid..addr, align 8
+  %9 = load i32, i32* %8, align 4, !tbaa !15
+  call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %9, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1)
+  br label %omp.dispatch.cond
+
+omp.dispatch.cond:                                ; preds = %omp.dispatch.inc, %entry
+  %10 = load i32, i32* %.omp.ub, align 4, !tbaa !15
+  %cmp = icmp sgt i32 %10, 1023
+  br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:                                        ; preds = %omp.dispatch.cond
+  br label %cond.end
+
+cond.false:                                       ; preds = %omp.dispatch.cond
+  %11 = load i32, i32* %.omp.ub, align 4, !tbaa !15
+  br label %cond.end
+
+cond.end:                                         ; preds = %cond.false, %cond.true
+  %cond = phi i32 [ 1023, %cond.true ], [ %11, %cond.false ]
+  store i32 %cond, i32* %.omp.ub, align 4, !tbaa !15
+  %12 = load i32, i32* %.omp.lb, align 4, !tbaa !15
+  store i32 %12, i32* %.omp.iv, align 4, !tbaa !15
+  %13 = load i32, i32* %.omp.iv, align 4, !tbaa !15
+  %14 = load i32, i32* %.omp.ub, align 4, !tbaa !15
+  %cmp1 = icmp sle i32 %13, %14
+  br i1 %cmp1, label %omp.dispatch.body, label %omp.dispatch.cleanup
+
+omp.dispatch.cleanup:                             ; preds = %cond.end
+  br label %omp.dispatch.end
+
+omp.dispatch.body:                                ; preds = %cond.end
+  br label %omp.inner.for.cond
+
+omp.inner.for.cond:                               ; preds = %omp.inner.for.inc, %omp.dispatch.body
+  %15 = load i32, i32* %.omp.iv, align 4, !tbaa !15
+  %16 = load i32, i32* %.omp.ub, align 4, !tbaa !15
+  %cmp2 = icmp sle i32 %15, %16
+  br i1 %cmp2, label %omp.inner.for.body, label %omp.inner.for.cond.cleanup
+
+omp.inner.for.cond.cleanup:                       ; preds = %omp.inner.for.cond
+  br label %omp.inner.for.end
+
+omp.inner.for.body:                               ; preds = %omp.inner.for.cond
+  %17 = load i32, i32* %.omp.iv, align 4, !tbaa !15
+  %mul = mul nsw i32 %17, 1
+  %add = add nsw i32 0, %mul
+  store i32 %add, i32* %j, align 4, !tbaa !15
+  %18 = load i32*, i32** %1, align 8, !tbaa !11
+  %19 = load i32, i32* %j, align 4, !tbaa !15
+  %add3 = add nsw i32 0, %19
+  %idxprom = sext i32 %add3 to i64
+  %arrayidx = getelementptr inbounds i32, i32* %18, i64 %idxprom
+  %20 = load i32, i32* %arrayidx, align 4, !tbaa !15
+  %21 = load i32*, i32** %0, align 8, !tbaa !11
+  %22 = load i32, i32* %j, align 4, !tbaa !15
+  %idxprom4 = sext i32 %22 to i64
+  %arrayidx5 = getelementptr inbounds i32, i32* %21, i64 %idxprom4
+  %23 = load i32, i32* %arrayidx5, align 4, !tbaa !15
+  %add6 = add nsw i32 %23, %20
+  store i32 %add6, i32* %arrayidx5, align 4, !tbaa !15
+  br label %omp.body.continue
+
+omp.body.continue:                                ; preds = %omp.inner.for.body
+  br label %omp.inner.for.inc
+
+omp.inner.for.inc:                                ; preds = %omp.body.continue
+  %24 = load i32, i32* %.omp.iv, align 4, !tbaa !15
+  %add7 = add nsw i32 %24, 1
+  store i32 %add7, i32* %.omp.iv, align 4, !tbaa !15
+  br label %omp.inner.for.cond
+
+omp.inner.for.end:                                ; preds = %omp.inner.for.cond.cleanup
+  br label %omp.dispatch.inc
+
+omp.dispatch.inc:                                 ; preds = %omp.inner.for.end
+  %25 = load i32, i32* %.omp.lb, align 4, !tbaa !15
+  %26 = load i32, i32* %.omp.stride, align 4, !tbaa !15
+  %add8 = add nsw i32 %25, %26
+  store i32 %add8, i32* %.omp.lb, align 4, !tbaa !15
+  %27 = load i32, i32* %.omp.ub, align 4, !tbaa !15
+  %28 = load i32, i32* %.omp.stride, align 4, !tbaa !15
+  %add9 = add nsw i32 %27, %28
+  store i32 %add9, i32* %.omp.ub, align 4, !tbaa !15
+  br label %omp.dispatch.cond
+
+omp.dispatch.end:                                 ; preds = %omp.dispatch.cleanup
+  call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %9)
+  %29 = bitcast i32* %j to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %29) #4
+  %30 = bitcast i32* %.omp.is_last to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %30) #4
+  %31 = bitcast i32* %.omp.stride to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %31) #4
+  %32 = bitcast i32* %.omp.ub to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %32) #4
+  %33 = bitcast i32* %.omp.lb to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %33) #4
+  %34 = bitcast i32* %.omp.iv to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %34) #4
+  ret void
+}
+
+declare void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32)
+
+declare void @__kmpc_for_static_fini(%struct.ident_t*, i32)
+
+; Function Attrs: norecurse nounwind
+define internal void @.omp_TRegion._wrapper(i8* %shared_vars, i8* %private_vars) #0 {
+entry:
+  %.addr = alloca i8*, align 8
+  %.addr1 = alloca i8*, align 8
+  %.zero.addr = alloca i32, align 4
+  %.threadid_temp. = alloca i32, align 4
+  store i32 0, i32* %.zero.addr, align 4
+  store i8* %shared_vars, i8** %.addr, align 8, !tbaa !11
+  store i8* %private_vars, i8** %.addr1, align 8, !tbaa !11
+  %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
+  store i32 %0, i32* %.threadid_temp., align 4, !tbaa !15
+  %1 = bitcast i8* %private_vars to %omp.private.struct*
+  %2 = getelementptr inbounds %omp.private.struct, %omp.private.struct* %1, i32 0, i32 0
+  %3 = load i32**, i32*** %2, align 1
+  %4 = getelementptr inbounds %omp.private.struct, %omp.private.struct* %1, i32 0, i32 1
+  %5 = load i32**, i32*** %4, align 1
+  call void @.omp_TRegion.(i32* %.threadid_temp., i32* %.zero.addr, i32** %3, i32** %5) #4
+  ret void
+}
+
+declare i32 @__kmpc_global_thread_num(%struct.ident_t*)
+
+declare !callback !17 void @__kmpc_target_region_kernel_parallel(i1, i1, void (i8*, i8*)* nocapture, i8* nocapture, i16, i8* nocapture readonly, i16, i1)
+
+; Function Attrs: norecurse nounwind
+define internal void @.omp_TRegion.1(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32** dereferenceable(8) %A, i32** dereferenceable(8) %B) #0 {
+entry:
+  %.global_tid..addr = alloca i32*, align 8
+  %.bound_tid..addr = alloca i32*, align 8
+  %A.addr = alloca i32**, align 8
+  %B.addr = alloca i32**, align 8
+  %.omp.iv = alloca i32, align 4
+  %tmp = alloca i32, align 4
+  %.omp.lb = alloca i32, align 4
+  %.omp.ub = alloca i32, align 4
+  %.omp.stride = alloca i32, align 4
+  %.omp.is_last = alloca i32, align 4
+  %j = alloca i32, align 4
+  store i32* %.global_tid., i32** %.global_tid..addr, align 8, !tbaa !11
+  store i32* %.bound_tid., i32** %.bound_tid..addr, align 8, !tbaa !11
+  store i32** %A, i32*** %A.addr, align 8, !tbaa !11
+  store i32** %B, i32*** %B.addr, align 8, !tbaa !11
+  %0 = load i32**, i32*** %A.addr, align 8, !tbaa !11
+  %1 = load i32**, i32*** %B.addr, align 8, !tbaa !11
+  %2 = bitcast i32* %.omp.iv to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %2) #4
+  %3 = bitcast i32* %.omp.lb to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %3) #4
+  store i32 0, i32* %.omp.lb, align 4, !tbaa !15
+  %4 = bitcast i32* %.omp.ub to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %4) #4
+  store i32 1023, i32* %.omp.ub, align 4, !tbaa !15
+  %5 = bitcast i32* %.omp.stride to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %5) #4
+  store i32 1, i32* %.omp.stride, align 4, !tbaa !15
+  %6 = bitcast i32* %.omp.is_last to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %6) #4
+  store i32 0, i32* %.omp.is_last, align 4, !tbaa !15
+  %7 = bitcast i32* %j to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %7) #4
+  %8 = load i32*, i32** %.global_tid..addr, align 8
+  %9 = load i32, i32* %8, align 4, !tbaa !15
+  call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %9, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1)
+  br label %omp.dispatch.cond
+
+omp.dispatch.cond:                                ; preds = %omp.dispatch.inc, %entry
+  %10 = load i32, i32* %.omp.ub, align 4, !tbaa !15
+  %cmp = icmp sgt i32 %10, 1023
+  br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:                                        ; preds = %omp.dispatch.cond
+  br label %cond.end
+
+cond.false:                                       ; preds = %omp.dispatch.cond
+  %11 = load i32, i32* %.omp.ub, align 4, !tbaa !15
+  br label %cond.end
+
+cond.end:                                         ; preds = %cond.false, %cond.true
+  %cond = phi i32 [ 1023, %cond.true ], [ %11, %cond.false ]
+  store i32 %cond, i32* %.omp.ub, align 4, !tbaa !15
+  %12 = load i32, i32* %.omp.lb, align 4, !tbaa !15
+  store i32 %12, i32* %.omp.iv, align 4, !tbaa !15
+  %13 = load i32, i32* %.omp.iv, align 4, !tbaa !15
+  %14 = load i32, i32* %.omp.ub, align 4, !tbaa !15
+  %cmp1 = icmp sle i32 %13, %14
+  br i1 %cmp1, label %omp.dispatch.body, label %omp.dispatch.cleanup
+
+omp.dispatch.cleanup:                             ; preds = %cond.end
+  br label %omp.dispatch.end
+
+omp.dispatch.body:                                ; preds = %cond.end
+  br label %omp.inner.for.cond
+
+omp.inner.for.cond:                               ; preds = %omp.inner.for.inc, %omp.dispatch.body
+  %15 = load i32, i32* %.omp.iv, align 4, !tbaa !15
+  %16 = load i32, i32* %.omp.ub, align 4, !tbaa !15
+  %cmp2 = icmp sle i32 %15, %16
+  br i1 %cmp2, label %omp.inner.for.body, label %omp.inner.for.cond.cleanup
+
+omp.inner.for.cond.cleanup:                       ; preds = %omp.inner.for.cond
+  br label %omp.inner.for.end
+
+omp.inner.for.body:                               ; preds = %omp.inner.for.cond
+  %17 = load i32, i32* %.omp.iv, align 4, !tbaa !15
+  %mul = mul nsw i32 %17, 1
+  %add = add nsw i32 0, %mul
+  store i32 %add, i32* %j, align 4, !tbaa !15
+  %18 = load i32*, i32** %1, align 8, !tbaa !11
+  %19 = load i32, i32* %j, align 4, !tbaa !15
+  %add3 = add nsw i32 1, %19
+  %idxprom = sext i32 %add3 to i64
+  %arrayidx = getelementptr inbounds i32, i32* %18, i64 %idxprom
+  %20 = load i32, i32* %arrayidx, align 4, !tbaa !15
+  %21 = load i32*, i32** %0, align 8, !tbaa !11
+  %22 = load i32, i32* %j, align 4, !tbaa !15
+  %idxprom4 = sext i32 %22 to i64
+  %arrayidx5 = getelementptr inbounds i32, i32* %21, i64 %idxprom4
+  %23 = load i32, i32* %arrayidx5, align 4, !tbaa !15
+  %add6 = add nsw i32 %23, %20
+  store i32 %add6, i32* %arrayidx5, align 4, !tbaa !15
+  br label %omp.body.continue
+
+omp.body.continue:                                ; preds = %omp.inner.for.body
+  br label %omp.inner.for.inc
+
+omp.inner.for.inc:                                ; preds = %omp.body.continue
+  %24 = load i32, i32* %.omp.iv, align 4, !tbaa !15
+  %add7 = add nsw i32 %24, 1
+  store i32 %add7, i32* %.omp.iv, align 4, !tbaa !15
+  br label %omp.inner.for.cond
+
+omp.inner.for.end:                                ; preds = %omp.inner.for.cond.cleanup
+  br label %omp.dispatch.inc
+
+omp.dispatch.inc:                                 ; preds = %omp.inner.for.end
+  %25 = load i32, i32* %.omp.lb, align 4, !tbaa !15
+  %26 = load i32, i32* %.omp.stride, align 4, !tbaa !15
+  %add8 = add nsw i32 %25, %26
+  store i32 %add8, i32* %.omp.lb, align 4, !tbaa !15
+  %27 = load i32, i32* %.omp.ub, align 4, !tbaa !15
+  %28 = load i32, i32* %.omp.stride, align 4, !tbaa !15
+  %add9 = add nsw i32 %27, %28
+  store i32 %add9, i32* %.omp.ub, align 4, !tbaa !15
+  br label %omp.dispatch.cond
+
+omp.dispatch.end:                                 ; preds = %omp.dispatch.cleanup
+  call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %9)
+  %29 = bitcast i32* %j to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %29) #4
+  %30 = bitcast i32* %.omp.is_last to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %30) #4
+  %31 = bitcast i32* %.omp.stride to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %31) #4
+  %32 = bitcast i32* %.omp.ub to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %32) #4
+  %33 = bitcast i32* %.omp.lb to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %33) #4
+  %34 = bitcast i32* %.omp.iv to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %34) #4
+  ret void
+}
+
+; Function Attrs: norecurse nounwind
+define internal void @.omp_TRegion.1_wrapper(i8* %shared_vars, i8* %private_vars) #0 {
+entry:
+  %.addr = alloca i8*, align 8
+  %.addr1 = alloca i8*, align 8
+  %.zero.addr = alloca i32, align 4
+  %.threadid_temp. = alloca i32, align 4
+  store i32 0, i32* %.zero.addr, align 4
+  store i8* %shared_vars, i8** %.addr, align 8, !tbaa !11
+  store i8* %private_vars, i8** %.addr1, align 8, !tbaa !11
+  %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
+  store i32 %0, i32* %.threadid_temp., align 4, !tbaa !15
+  %1 = bitcast i8* %private_vars to %omp.private.struct.0*
+  %2 = getelementptr inbounds %omp.private.struct.0, %omp.private.struct.0* %1, i32 0, i32 0
+  %3 = load i32**, i32*** %2, align 1
+  %4 = getelementptr inbounds %omp.private.struct.0, %omp.private.struct.0* %1, i32 0, i32 1
+  %5 = load i32**, i32*** %4, align 1
+  call void @.omp_TRegion.1(i32* %.threadid_temp., i32* %.zero.addr, i32** %3, i32** %5) #4
+  ret void
+}
+
+; Function Attrs: norecurse nounwind
+define internal void @.omp_TRegion.2(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32** dereferenceable(8) %A, i32** dereferenceable(8) %B) #0 {
+entry:
+  %.global_tid..addr = alloca i32*, align 8
+  %.bound_tid..addr = alloca i32*, align 8
+  %A.addr = alloca i32**, align 8
+  %B.addr = alloca i32**, align 8
+  %.omp.iv = alloca i32, align 4
+  %tmp = alloca i32, align 4
+  %.omp.lb = alloca i32, align 4
+  %.omp.ub = alloca i32, align 4
+  %.omp.stride = alloca i32, align 4
+  %.omp.is_last = alloca i32, align 4
+  %j = alloca i32, align 4
+  store i32* %.global_tid., i32** %.global_tid..addr, align 8, !tbaa !11
+  store i32* %.bound_tid., i32** %.bound_tid..addr, align 8, !tbaa !11
+  store i32** %A, i32*** %A.addr, align 8, !tbaa !11
+  store i32** %B, i32*** %B.addr, align 8, !tbaa !11
+  %0 = load i32**, i32*** %A.addr, align 8, !tbaa !11
+  %1 = load i32**, i32*** %B.addr, align 8, !tbaa !11
+  %2 = bitcast i32* %.omp.iv to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %2) #4
+  %3 = bitcast i32* %.omp.lb to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %3) #4
+  store i32 0, i32* %.omp.lb, align 4, !tbaa !15
+  %4 = bitcast i32* %.omp.ub to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %4) #4
+  store i32 1023, i32* %.omp.ub, align 4, !tbaa !15
+  %5 = bitcast i32* %.omp.stride to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %5) #4
+  store i32 1, i32* %.omp.stride, align 4, !tbaa !15
+  %6 = bitcast i32* %.omp.is_last to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %6) #4
+  store i32 0, i32* %.omp.is_last, align 4, !tbaa !15
+  %7 = bitcast i32* %j to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %7) #4
+  %8 = load i32*, i32** %.global_tid..addr, align 8
+  %9 = load i32, i32* %8, align 4, !tbaa !15
+  call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %9, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1)
+  br label %omp.dispatch.cond
+
+omp.dispatch.cond:                                ; preds = %omp.dispatch.inc, %entry
+  %10 = load i32, i32* %.omp.ub, align 4, !tbaa !15
+  %cmp = icmp sgt i32 %10, 1023
+  br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:                                        ; preds = %omp.dispatch.cond
+  br label %cond.end
+
+cond.false:                                       ; preds = %omp.dispatch.cond
+  %11 = load i32, i32* %.omp.ub, align 4, !tbaa !15
+  br label %cond.end
+
+cond.end:                                         ; preds = %cond.false, %cond.true
+  %cond = phi i32 [ 1023, %cond.true ], [ %11, %cond.false ]
+  store i32 %cond, i32* %.omp.ub, align 4, !tbaa !15
+  %12 = load i32, i32* %.omp.lb, align 4, !tbaa !15
+  store i32 %12, i32* %.omp.iv, align 4, !tbaa !15
+  %13 = load i32, i32* %.omp.iv, align 4, !tbaa !15
+  %14 = load i32, i32* %.omp.ub, align 4, !tbaa !15
+  %cmp1 = icmp sle i32 %13, %14
+  br i1 %cmp1, label %omp.dispatch.body, label %omp.dispatch.cleanup
+
+omp.dispatch.cleanup:                             ; preds = %cond.end
+  br label %omp.dispatch.end
+
+omp.dispatch.body:                                ; preds = %cond.end
+  br label %omp.inner.for.cond
+
+omp.inner.for.cond:                               ; preds = %omp.inner.for.inc, %omp.dispatch.body
+  %15 = load i32, i32* %.omp.iv, align 4, !tbaa !15
+  %16 = load i32, i32* %.omp.ub, align 4, !tbaa !15
+  %cmp2 = icmp sle i32 %15, %16
+  br i1 %cmp2, label %omp.inner.for.body, label %omp.inner.for.cond.cleanup
+
+omp.inner.for.cond.cleanup:                       ; preds = %omp.inner.for.cond
+  br label %omp.inner.for.end
+
+omp.inner.for.body:                               ; preds = %omp.inner.for.cond
+  %17 = load i32, i32* %.omp.iv, align 4, !tbaa !15
+  %mul = mul nsw i32 %17, 1
+  %add = add nsw i32 0, %mul
+  store i32 %add, i32* %j, align 4, !tbaa !15
+  %18 = load i32*, i32** %1, align 8, !tbaa !11
+  %19 = load i32, i32* %j, align 4, !tbaa !15
+  %add3 = add nsw i32 2, %19
+  %idxprom = sext i32 %add3 to i64
+  %arrayidx = getelementptr inbounds i32, i32* %18, i64 %idxprom
+  %20 = load i32, i32* %arrayidx, align 4, !tbaa !15
+  %21 = load i32*, i32** %0, align 8, !tbaa !11
+  %22 = load i32, i32* %j, align 4, !tbaa !15
+  %idxprom4 = sext i32 %22 to i64
+  %arrayidx5 = getelementptr inbounds i32, i32* %21, i64 %idxprom4
+  %23 = load i32, i32* %arrayidx5, align 4, !tbaa !15
+  %add6 = add nsw i32 %23, %20
+  store i32 %add6, i32* %arrayidx5, align 4, !tbaa !15
+  br label %omp.body.continue
+
+omp.body.continue:                                ; preds = %omp.inner.for.body
+  br label %omp.inner.for.inc
+
+omp.inner.for.inc:                                ; preds = %omp.body.continue
+  %24 = load i32, i32* %.omp.iv, align 4, !tbaa !15
+  %add7 = add nsw i32 %24, 1
+  store i32 %add7, i32* %.omp.iv, align 4, !tbaa !15
+  br label %omp.inner.for.cond
+
+omp.inner.for.end:                                ; preds = %omp.inner.for.cond.cleanup
+  br label %omp.dispatch.inc
+
+omp.dispatch.inc:                                 ; preds = %omp.inner.for.end
+  %25 = load i32, i32* %.omp.lb, align 4, !tbaa !15
+  %26 = load i32, i32* %.omp.stride, align 4, !tbaa !15
+  %add8 = add nsw i32 %25, %26
+  store i32 %add8, i32* %.omp.lb, align 4, !tbaa !15
+  %27 = load i32, i32* %.omp.ub, align 4, !tbaa !15
+  %28 = load i32, i32* %.omp.stride, align 4, !tbaa !15
+  %add9 = add nsw i32 %27, %28
+  store i32 %add9, i32* %.omp.ub, align 4, !tbaa !15
+  br label %omp.dispatch.cond
+
+omp.dispatch.end:                                 ; preds = %omp.dispatch.cleanup
+  call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %9)
+  %29 = bitcast i32* %j to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %29) #4
+  %30 = bitcast i32* %.omp.is_last to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %30) #4
+  %31 = bitcast i32* %.omp.stride to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %31) #4
+  %32 = bitcast i32* %.omp.ub to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %32) #4
+  %33 = bitcast i32* %.omp.lb to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %33) #4
+  %34 = bitcast i32* %.omp.iv to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %34) #4
+  ret void
+}
+
+; Function Attrs: norecurse nounwind
+define internal void @.omp_TRegion.2_wrapper(i8* %shared_vars, i8* %private_vars) #0 {
+entry:
+  %.addr = alloca i8*, align 8
+  %.addr1 = alloca i8*, align 8
+  %.zero.addr = alloca i32, align 4
+  %.threadid_temp. = alloca i32, align 4
+  store i32 0, i32* %.zero.addr, align 4
+  store i8* %shared_vars, i8** %.addr, align 8, !tbaa !11
+  store i8* %private_vars, i8** %.addr1, align 8, !tbaa !11
+  %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
+  store i32 %0, i32* %.threadid_temp., align 4, !tbaa !15
+  %1 = bitcast i8* %private_vars to %omp.private.struct.1*
+  %2 = getelementptr inbounds %omp.private.struct.1, %omp.private.struct.1* %1, i32 0, i32 0
+  %3 = load i32**, i32*** %2, align 1
+  %4 = getelementptr inbounds %omp.private.struct.1, %omp.private.struct.1* %1, i32 0, i32 1
+  %5 = load i32**, i32*** %4, align 1
+  call void @.omp_TRegion.2(i32* %.threadid_temp., i32* %.zero.addr, i32** %3, i32** %5) #4
+  ret void
+}
+
+; Function Attrs: norecurse nounwind
+define weak void @__omp_offloading_18_29b03e4_parallel_loop_in_function_in_loop_with_global_acc_in_tregion_l35(i32* %A, i32* %B, [512 x i32]* dereferenceable(2048) %Global) #0 {
+entry:
+  %A.addr = alloca i32*, align 8
+  %B.addr = alloca i32*, align 8
+  %Global.addr = alloca [512 x i32]*, align 8
+  %i = alloca i32, align 4
+  store i32* %A, i32** %A.addr, align 8, !tbaa !11
+  store i32* %B, i32** %B.addr, align 8, !tbaa !11
+  store [512 x i32]* %Global, [512 x i32]** %Global.addr, align 8, !tbaa !11
+  %0 = load [512 x i32]*, [512 x i32]** %Global.addr, align 8, !tbaa !11
+  %1 = call i8 @__kmpc_target_region_kernel_init(i1 false, i1 true, i1 true, i1 true)
+  %2 = icmp eq i8 %1, 1
+  br i1 %2, label %.execute, label %.exit
+
+.execute:                                         ; preds = %entry
+  %3 = bitcast i32* %i to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %3) #4
+  store i32 0, i32* %i, align 4, !tbaa !15
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %.execute
+  %4 = load i32, i32* %i, align 4, !tbaa !15
+  %cmp = icmp slt i32 %4, 512
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond
+  %5 = bitcast i32* %i to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %5) #4
+  br label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %6 = load i32*, i32** %A.addr, align 8, !tbaa !11
+  %7 = load i32*, i32** %B.addr, align 8, !tbaa !11
+  %8 = load i32, i32* %i, align 4, !tbaa !15
+  call void @parallel_loop(i32* %6, i32* %7, i32 %8)
+  %9 = load i32, i32* %i, align 4, !tbaa !15
+  %idxprom = sext i32 %9 to i64
+  %arrayidx = getelementptr inbounds [512 x i32], [512 x i32]* %0, i64 0, i64 %idxprom
+  %10 = load i32, i32* %arrayidx, align 4, !tbaa !15
+  %inc = add nsw i32 %10, 1
+  store i32 %inc, i32* %arrayidx, align 4, !tbaa !15
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %11 = load i32, i32* %i, align 4, !tbaa !15
+  %inc1 = add nsw i32 %11, 1
+  store i32 %inc1, i32* %i, align 4, !tbaa !15
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond.cleanup
+  br label %.omp.deinit
+
+.omp.deinit:                                      ; preds = %for.end
+  call void @__kmpc_target_region_kernel_deinit(i1 false, i1 true)
+  br label %.exit
+
+.exit:                                            ; preds = %.omp.deinit, %entry
+  ret void
+}
+
+; Function Attrs: nounwind
+define internal void @parallel_loop(i32* %A, i32* %B, i32 %i) #2 {
+entry:
+  %A.addr = alloca i32*, align 8
+  %B.addr = alloca i32*, align 8
+  %i.addr = alloca i32, align 4
+  %.private.vars = alloca %omp.private.struct.2, align 8
+  store i32* %A, i32** %A.addr, align 8, !tbaa !11
+  store i32* %B, i32** %B.addr, align 8, !tbaa !11
+  store i32 %i, i32* %i.addr, align 4, !tbaa !15
+  %0 = bitcast %omp.private.struct.2* %.private.vars to i8*
+  %1 = getelementptr inbounds %omp.private.struct.2, %omp.private.struct.2* %.private.vars, i32 0, i32 0
+  store i32** %A.addr, i32*** %1
+  %2 = getelementptr inbounds %omp.private.struct.2, %omp.private.struct.2* %.private.vars, i32 0, i32 1
+  store i32** %B.addr, i32*** %2
+  %3 = getelementptr inbounds %omp.private.struct.2, %omp.private.struct.2* %.private.vars, i32 0, i32 2
+  store i32* %i.addr, i32** %3
+  call void @__kmpc_target_region_kernel_parallel(i1 false, i1 true, void (i8*, i8*)* @.omp_TRegion.3_wrapper, i8* undef, i16 0, i8* %0, i16 24, i1 false)
+  ret void
+}
+
+; Function Attrs: norecurse nounwind
+define weak void @__omp_offloading_18_29b03e4_parallel_loops_in_functions_and_extern_func_in_tregion_l43(i32* %A, i32* %B) #0 {
+entry:
+  %A.addr = alloca i32*, align 8
+  %B.addr = alloca i32*, align 8
+  store i32* %A, i32** %A.addr, align 8, !tbaa !11
+  store i32* %B, i32** %B.addr, align 8, !tbaa !11
+  %0 = call i8 @__kmpc_target_region_kernel_init(i1 false, i1 true, i1 true, i1 true)
+  %1 = icmp eq i8 %0, 1
+  br i1 %1, label %.execute, label %.exit
+
+.execute:                                         ; preds = %entry
+  %2 = load i32*, i32** %A.addr, align 8, !tbaa !11
+  %3 = load i32*, i32** %B.addr, align 8, !tbaa !11
+  call void @parallel_loop(i32* %2, i32* %3, i32 1)
+  %4 = load i32*, i32** %A.addr, align 8, !tbaa !11
+  %5 = load i32*, i32** %B.addr, align 8, !tbaa !11
+  call void @parallel_loop(i32* %4, i32* %5, i32 2)
+  call void bitcast (void (...)* @extern_func to void ()*)()
+  %6 = load i32*, i32** %A.addr, align 8, !tbaa !11
+  %7 = load i32*, i32** %B.addr, align 8, !tbaa !11
+  call void @parallel_loop(i32* %6, i32* %7, i32 3)
+  br label %.omp.deinit
+
+.omp.deinit:                                      ; preds = %.execute
+  call void @__kmpc_target_region_kernel_deinit(i1 false, i1 true)
+  br label %.exit
+
+.exit:                                            ; preds = %.omp.deinit, %entry
+  ret void
+}
+
+declare void @extern_func(...) #3
+
+; Function Attrs: norecurse nounwind
+define internal void @.omp_TRegion.3(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32** dereferenceable(8) %A, i32** dereferenceable(8) %B, i32* dereferenceable(4) %i) #0 {
+entry:
+  %.global_tid..addr = alloca i32*, align 8
+  %.bound_tid..addr = alloca i32*, align 8
+  %A.addr = alloca i32**, align 8
+  %B.addr = alloca i32**, align 8
+  %i.addr = alloca i32*, align 8
+  %.omp.iv = alloca i32, align 4
+  %tmp = alloca i32, align 4
+  %.omp.lb = alloca i32, align 4
+  %.omp.ub = alloca i32, align 4
+  %.omp.stride = alloca i32, align 4
+  %.omp.is_last = alloca i32, align 4
+  %j = alloca i32, align 4
+  store i32* %.global_tid., i32** %.global_tid..addr, align 8, !tbaa !11
+  store i32* %.bound_tid., i32** %.bound_tid..addr, align 8, !tbaa !11
+  store i32** %A, i32*** %A.addr, align 8, !tbaa !11
+  store i32** %B, i32*** %B.addr, align 8, !tbaa !11
+  store i32* %i, i32** %i.addr, align 8, !tbaa !11
+  %0 = load i32**, i32*** %A.addr, align 8, !tbaa !11
+  %1 = load i32**, i32*** %B.addr, align 8, !tbaa !11
+  %2 = load i32*, i32** %i.addr, align 8, !tbaa !11
+  %3 = bitcast i32* %.omp.iv to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %3) #4
+  %4 = bitcast i32* %.omp.lb to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %4) #4
+  store i32 0, i32* %.omp.lb, align 4, !tbaa !15
+  %5 = bitcast i32* %.omp.ub to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %5) #4
+  store i32 1023, i32* %.omp.ub, align 4, !tbaa !15
+  %6 = bitcast i32* %.omp.stride to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %6) #4
+  store i32 1, i32* %.omp.stride, align 4, !tbaa !15
+  %7 = bitcast i32* %.omp.is_last to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %7) #4
+  store i32 0, i32* %.omp.is_last, align 4, !tbaa !15
+  %8 = bitcast i32* %j to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %8) #4
+  %9 = load i32*, i32** %.global_tid..addr, align 8
+  %10 = load i32, i32* %9, align 4, !tbaa !15
+  call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %10, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1)
+  br label %omp.dispatch.cond
+
+omp.dispatch.cond:                                ; preds = %omp.dispatch.inc, %entry
+  %11 = load i32, i32* %.omp.ub, align 4, !tbaa !15
+  %cmp = icmp sgt i32 %11, 1023
+  br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:                                        ; preds = %omp.dispatch.cond
+  br label %cond.end
+
+cond.false:                                       ; preds = %omp.dispatch.cond
+  %12 = load i32, i32* %.omp.ub, align 4, !tbaa !15
+  br label %cond.end
+
+cond.end:                                         ; preds = %cond.false, %cond.true
+  %cond = phi i32 [ 1023, %cond.true ], [ %12, %cond.false ]
+  store i32 %cond, i32* %.omp.ub, align 4, !tbaa !15
+  %13 = load i32, i32* %.omp.lb, align 4, !tbaa !15
+  store i32 %13, i32* %.omp.iv, align 4, !tbaa !15
+  %14 = load i32, i32* %.omp.iv, align 4, !tbaa !15
+  %15 = load i32, i32* %.omp.ub, align 4, !tbaa !15
+  %cmp1 = icmp sle i32 %14, %15
+  br i1 %cmp1, label %omp.dispatch.body, label %omp.dispatch.cleanup
+
+omp.dispatch.cleanup:                             ; preds = %cond.end
+  br label %omp.dispatch.end
+
+omp.dispatch.body:                                ; preds = %cond.end
+  br label %omp.inner.for.cond
+
+omp.inner.for.cond:                               ; preds = %omp.inner.for.inc, %omp.dispatch.body
+  %16 = load i32, i32* %.omp.iv, align 4, !tbaa !15
+  %17 = load i32, i32* %.omp.ub, align 4, !tbaa !15
+  %cmp2 = icmp sle i32 %16, %17
+  br i1 %cmp2, label %omp.inner.for.body, label %omp.inner.for.cond.cleanup
+
+omp.inner.for.cond.cleanup:                       ; preds = %omp.inner.for.cond
+  br label %omp.inner.for.end
+
+omp.inner.for.body:                               ; preds = %omp.inner.for.cond
+  %18 = load i32, i32* %.omp.iv, align 4, !tbaa !15
+  %mul = mul nsw i32 %18, 1
+  %add = add nsw i32 0, %mul
+  store i32 %add, i32* %j, align 4, !tbaa !15
+  %19 = load i32*, i32** %1, align 8, !tbaa !11
+  %20 = load i32, i32* %2, align 4, !tbaa !15
+  %21 = load i32, i32* %j, align 4, !tbaa !15
+  %add3 = add nsw i32 %20, %21
+  %idxprom = sext i32 %add3 to i64
+  %arrayidx = getelementptr inbounds i32, i32* %19, i64 %idxprom
+  %22 = load i32, i32* %arrayidx, align 4, !tbaa !15
+  %23 = load i32*, i32** %0, align 8, !tbaa !11
+  %24 = load i32, i32* %j, align 4, !tbaa !15
+  %idxprom4 = sext i32 %24 to i64
+  %arrayidx5 = getelementptr inbounds i32, i32* %23, i64 %idxprom4
+  %25 = load i32, i32* %arrayidx5, align 4, !tbaa !15
+  %add6 = add nsw i32 %25, %22
+  store i32 %add6, i32* %arrayidx5, align 4, !tbaa !15
+  br label %omp.body.continue
+
+omp.body.continue:                                ; preds = %omp.inner.for.body
+  br label %omp.inner.for.inc
+
+omp.inner.for.inc:                                ; preds = %omp.body.continue
+  %26 = load i32, i32* %.omp.iv, align 4, !tbaa !15
+  %add7 = add nsw i32 %26, 1
+  store i32 %add7, i32* %.omp.iv, align 4, !tbaa !15
+  br label %omp.inner.for.cond
+
+omp.inner.for.end:                                ; preds = %omp.inner.for.cond.cleanup
+  br label %omp.dispatch.inc
+
+omp.dispatch.inc:                                 ; preds = %omp.inner.for.end
+  %27 = load i32, i32* %.omp.lb, align 4, !tbaa !15
+  %28 = load i32, i32* %.omp.stride, align 4, !tbaa !15
+  %add8 = add nsw i32 %27, %28
+  store i32 %add8, i32* %.omp.lb, align 4, !tbaa !15
+  %29 = load i32, i32* %.omp.ub, align 4, !tbaa !15
+  %30 = load i32, i32* %.omp.stride, align 4, !tbaa !15
+  %add9 = add nsw i32 %29, %30
+  store i32 %add9, i32* %.omp.ub, align 4, !tbaa !15
+  br label %omp.dispatch.cond
+
+omp.dispatch.end:                                 ; preds = %omp.dispatch.cleanup
+  call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %10)
+  %31 = bitcast i32* %j to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %31) #4
+  %32 = bitcast i32* %.omp.is_last to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %32) #4
+  %33 = bitcast i32* %.omp.stride to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %33) #4
+  %34 = bitcast i32* %.omp.ub to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %34) #4
+  %35 = bitcast i32* %.omp.lb to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %35) #4
+  %36 = bitcast i32* %.omp.iv to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %36) #4
+  ret void
+}
+
+; Function Attrs: norecurse nounwind
+define internal void @.omp_TRegion.3_wrapper(i8* %shared_vars, i8* %private_vars) #0 {
+entry:
+  %.addr = alloca i8*, align 8
+  %.addr1 = alloca i8*, align 8
+  %.zero.addr = alloca i32, align 4
+  %.threadid_temp. = alloca i32, align 4
+  store i32 0, i32* %.zero.addr, align 4
+  store i8* %shared_vars, i8** %.addr, align 8, !tbaa !11
+  store i8* %private_vars, i8** %.addr1, align 8, !tbaa !11
+  %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
+  store i32 %0, i32* %.threadid_temp., align 4, !tbaa !15
+  %1 = bitcast i8* %private_vars to %omp.private.struct.2*
+  %2 = getelementptr inbounds %omp.private.struct.2, %omp.private.struct.2* %1, i32 0, i32 0
+  %3 = load i32**, i32*** %2, align 1
+  %4 = getelementptr inbounds %omp.private.struct.2, %omp.private.struct.2* %1, i32 0, i32 1
+  %5 = load i32**, i32*** %4, align 1
+  %6 = getelementptr inbounds %omp.private.struct.2, %omp.private.struct.2* %1, i32 0, i32 2
+  %7 = load i32*, i32** %6, align 1
+  call void @.omp_TRegion.3(i32* %.threadid_temp., i32* %.zero.addr, i32** %3, i32** %5, i32* %7) #4
+  ret void
+}
+
+attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { argmemonly nounwind }
+attributes #2 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #4 = { nounwind }
+
+!omp_offload.info = !{!0, !1, !2, !3}
+!nvvm.annotations = !{!4, !5, !6, !7}
+!llvm.module.flags = !{!8, !9}
+!llvm.ident = !{!10}
+
+!0 = !{i32 0, i32 24, i32 43713508, !"parallel_loops_and_accesses_in_tregion", i32 9, i32 1}
+!1 = !{i32 0, i32 24, i32 43713508, !"loop_in_loop_in_tregion", i32 2, i32 0}
+!2 = !{i32 0, i32 24, i32 43713508, !"parallel_loops_in_functions_and_extern_func_in_tregion", i32 43, i32 3}
+!3 = !{i32 0, i32 24, i32 43713508, !"parallel_loop_in_function_in_loop_with_global_acc_in_tregion", i32 35, i32 2}
+!4 = !{void (i32*, i32*)* @__omp_offloading_18_29b03e4_loop_in_loop_in_tregion_l2, !"kernel", i32 1}
+!5 = !{void (i32*, i32*)* @__omp_offloading_18_29b03e4_parallel_loops_and_accesses_in_tregion_l9, !"kernel", i32 1}
+!6 = !{void (i32*, i32*, [512 x i32]*)* @__omp_offloading_18_29b03e4_parallel_loop_in_function_in_loop_with_global_acc_in_tregion_l35, !"kernel", i32 1}
+!7 = !{void (i32*, i32*)* @__omp_offloading_18_29b03e4_parallel_loops_in_functions_and_extern_func_in_tregion_l43, !"kernel", i32 1}
+!8 = !{i32 1, !"wchar_size", i32 4}
+!9 = !{i32 7, !"PIC Level", i32 2}
+!10 = !{!"clang version 9.0.0 "}
+!11 = !{!12, !12, i64 0}
+!12 = !{!"any pointer", !13, i64 0}
+!13 = !{!"omnipotent char", !14, i64 0}
+!14 = !{!"Simple C/C++ TBAA"}
+!15 = !{!16, !16, i64 0}
+!16 = !{!"int", !13, i64 0}
+!17 = !{!18}
+!18 = !{i64 2, i64 3, i64 5, i1 false}
diff --git a/llvm/test/Transforms/OpenMP/to_SPMD_mode.ll b/llvm/test/Transforms/OpenMP/to_SPMD_mode.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/OpenMP/to_SPMD_mode.ll
@@ -0,0 +1,1208 @@
+; RUN: opt < %s -openmp-opt -stats -disable-output 2>&1 | FileCheck %s --check-prefix=STATS
+; RUN: opt < %s -openmp-opt -S 2>&1 | FileCheck %s
+;
+; REQUIRES: asserts
+;
+; Check that we will execute all of the below target regions in SPMD-mode,
+; as verified by the statistics and the CHECK lines below.
+;
+; See the no_SPMD_mode.ll file for almost the same functions that cannot be translated to SPMD mode.
+;
+;
+; STATS-DAG: 4 openmp-opt - Number of GPU kernels converted to SPMD mode
+; STATS-DAG: 6 openmp-opt - Number of parallel GPU kernel regions converted to SPMD mode
+;
+; Check for SPMD mode.
+; CHECK: void @{{.*}}parallel_loop_in_loop_in_tregion
+; CHECK:      call i8 @__kmpc_target_region_kernel_init(i1 true,
+; CHECK:      call void @__kmpc_target_region_kernel_parallel(i1 true,
+; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+; CHECK:      call void @__kmpc_target_region_kernel_deinit(i1 true,
+;
+; void parallel_loop_in_loop_in_tregion(int *A, int *B) {
+; #pragma omp target
+;   for (int i = 0; i < 512; i++) {
+; #pragma omp parallel for
+;     for (int j = 0; j < 1024; j++)
+;       A[j] += B[i + j];
+;   }
+; }
+;
+;
+; Check for SPMD mode.
+; CHECK: void @{{.*}}parallel_loops_in_tregion
+; CHECK:      call i8 @__kmpc_target_region_kernel_init(i1 true,
+; CHECK:      call void @__kmpc_target_region_kernel_parallel(i1 true,
+; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+; CHECK:      call void @__kmpc_target_region_kernel_parallel(i1 true,
+; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+; CHECK:      call void @__kmpc_target_region_kernel_parallel(i1 true,
+; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+; CHECK:      call void @__kmpc_target_region_kernel_deinit(i1 true,
+;
+; void parallel_loops_in_tregion(int *A, int *B) {
+; #pragma omp target
+;   {
+; #pragma omp parallel for
+;     for (int j = 0; j < 1024; j++)
+;       A[j] += B[0 + j];
+; #pragma omp parallel for
+;     for (int j = 0; j < 1024; j++)
+;       A[j] += B[1 + j];
+; #pragma omp parallel for
+;     for (int j = 0; j < 1024; j++)
+;       A[j] += B[2 + j];
+;   }
+; }
+;
+; FIXME: For now we duplicate the parallel_loop function below (as
+;        parallel_loop1 and parallel_loop2) so that each copy is used by only
+;        one kernel. Since all kernels here can be translated to SPMD mode,
+;        this duplication is not strictly necessary. In the future the pass
+;        should clone the function automatically (or add a new argument) to
+;        facilitate partial SPMD-mode execution.
+;
+; static void parallel_loop1(int *A, int *B, int i) {
+; #pragma omp parallel for
+;   for (int j = 0; j < 1024; j++)
+;     A[j] += B[i + j];
+; }
+; static void parallel_loop2(int *A, int *B, int i) {
+; #pragma omp parallel for
+;   for (int j = 0; j < 1024; j++)
+;     A[j] += B[i + j];
+; }
+;
+;
+; Check for SPMD mode.
+; CHECK: void @{{.*}}parallel_loop_in_function_in_loop_in_tregion
+; CHECK:      call i8 @__kmpc_target_region_kernel_init(i1 true,
+; CHECK:      call void @parallel_loop1(
+; CHECK:      call void @__kmpc_target_region_kernel_deinit(i1 true,
+; CHECK: define internal void @parallel_loop1
+; CHECK:      call void @__kmpc_target_region_kernel_parallel(i1 true,
+; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+;
+; void parallel_loop_in_function_in_loop_in_tregion(int *A, int *B) {
+; #pragma omp target
+;   for (int i = 0; i < 512; i++)
+;     parallel_loop1(A, B, i);
+; }
+;
+;
+; Check for SPMD mode.
+; CHECK: define internal void @parallel_loop2
+; CHECK:      call void @__kmpc_target_region_kernel_parallel(i1 true,
+; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+; CHECK: void @{{.*}}parallel_loops_in_functions_in_tregion
+; CHECK:      call i8 @__kmpc_target_region_kernel_init(i1 true,
+; CHECK:      call void @parallel_loop2(
+; CHECK:      call void @__kmpc_target_region_kernel_deinit(i1 true,
+;
+; void parallel_loops_in_functions_in_tregion(int *A, int *B) {
+; #pragma omp target
+;   {
+;     parallel_loop2(A, B, 1);
+;     parallel_loop2(A, B, 2);
+;     parallel_loop2(A, B, 3);
+;   }
+; }
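+;
+; In short, the pass is expected to flip the first i1 argument of the
+; __kmpc_target_region_kernel_{init,parallel,deinit} calls from false to true
+; and to insert a simple SPMD barrier directly after each parallel call. A
+; minimal sketch of the expected rewrite, based only on the CHECK lines above
+; (the remaining arguments are left unchecked here):
+;
+;   call i8 @__kmpc_target_region_kernel_init(i1 true, ...)
+;   ...
+;   call void @__kmpc_target_region_kernel_parallel(i1 true, ...)
+;   call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+;   ...
+;   call void @__kmpc_target_region_kernel_deinit(i1 true, ...)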
+
+source_filename = "../llvm/test/Transforms/OpenMP/to_SPMD_mode.c"
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+%struct.ident_t = type { i32, i32, i32, i32, i8* }
+%omp.private.struct = type { i32**, i32**, i32* }
+%omp.private.struct.0 = type { i32**, i32** }
+%omp.private.struct.1 = type { i32**, i32** }
+%omp.private.struct.2 = type { i32**, i32** }
+%omp.private.struct.3 = type { i32**, i32**, i32* }
+
+@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
+@0 = private unnamed_addr global %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8
+@1 = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8
+@__omp_offloading_18_29afeb5_parallel_loop_in_loop_in_tregion_l2_exec_mode = weak constant i8 1
+@__omp_offloading_18_29afeb5_parallel_loops_in_tregion_l10_exec_mode = weak constant i8 1
+@__omp_offloading_18_29afeb5_parallel_loop_in_function_in_loop_in_tregion_l30_exec_mode = weak constant i8 1
+@__omp_offloading_18_29afeb5_parallel_loops_in_functions_in_tregion_l36_exec_mode = weak constant i8 1
+@llvm.compiler.used = appending global [4 x i8*] [i8* @__omp_offloading_18_29afeb5_parallel_loop_in_loop_in_tregion_l2_exec_mode, i8* @__omp_offloading_18_29afeb5_parallel_loops_in_tregion_l10_exec_mode, i8* @__omp_offloading_18_29afeb5_parallel_loop_in_function_in_loop_in_tregion_l30_exec_mode, i8* @__omp_offloading_18_29afeb5_parallel_loops_in_functions_in_tregion_l36_exec_mode], section "llvm.metadata"
+
+; Function Attrs: norecurse nounwind
+define weak void @__omp_offloading_18_29afeb5_parallel_loop_in_loop_in_tregion_l2(i32* %A, i32* %B) #0 {
+entry:
+  %A.addr = alloca i32*, align 8
+  %B.addr = alloca i32*, align 8
+  %i = alloca i32, align 4
+  %.private.vars = alloca %omp.private.struct, align 8
+  store i32* %A, i32** %A.addr, align 8, !tbaa !11
+  store i32* %B, i32** %B.addr, align 8, !tbaa !11
+  %0 = call i8 @__kmpc_target_region_kernel_init(i1 false, i1 true, i1 true, i1 true)
+  %1 = icmp eq i8 %0, 1
+  br i1 %1, label %.execute, label %.exit
+
+.execute:                                         ; preds = %entry
+  %2 = bitcast i32* %i to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %2) #3
+  store i32 0, i32* %i, align 4, !tbaa !15
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %.execute
+  %3 = load i32, i32* %i, align 4, !tbaa !15
+  %cmp = icmp slt i32 %3, 512
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond
+  %4 = bitcast i32* %i to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %4) #3
+  br label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %5 = bitcast %omp.private.struct* %.private.vars to i8*
+  %6 = getelementptr inbounds %omp.private.struct, %omp.private.struct* %.private.vars, i32 0, i32 0
+  store i32** %A.addr, i32*** %6
+  %7 = getelementptr inbounds %omp.private.struct, %omp.private.struct* %.private.vars, i32 0, i32 1
+  store i32** %B.addr, i32*** %7
+  %8 = getelementptr inbounds %omp.private.struct, %omp.private.struct* %.private.vars, i32 0, i32 2
+  store i32* %i, i32** %8
+  call void @__kmpc_target_region_kernel_parallel(i1 false, i1 true, void (i8*, i8*)* @.omp_TRegion._wrapper, i8* undef, i16 0, i8* %5, i16 24, i1 false)
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %9 = load i32, i32* %i, align 4, !tbaa !15
+  %inc = add nsw i32 %9, 1
+  store i32 %inc, i32* %i, align 4, !tbaa !15
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond.cleanup
+  br label %.omp.deinit
+
+.omp.deinit:                                      ; preds = %for.end
+  call void @__kmpc_target_region_kernel_deinit(i1 false, i1 true)
+  br label %.exit
+
+.exit:                                            ; preds = %.omp.deinit, %entry
+  ret void
+}
+
+declare i8 @__kmpc_target_region_kernel_init(i1, i1, i1, i1)
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
+
+; Function Attrs: norecurse nounwind
+define internal void @.omp_TRegion.(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32** dereferenceable(8) %A, i32** dereferenceable(8) %B, i32* dereferenceable(4) %i) #0 {
+entry:
+  %.global_tid..addr = alloca i32*, align 8
+  %.bound_tid..addr = alloca i32*, align 8
+  %A.addr = alloca i32**, align 8
+  %B.addr = alloca i32**, align 8
+  %i.addr = alloca i32*, align 8
+  %.omp.iv = alloca i32, align 4
+  %tmp = alloca i32, align 4
+  %.omp.lb = alloca i32, align 4
+  %.omp.ub = alloca i32, align 4
+  %.omp.stride = alloca i32, align 4
+  %.omp.is_last = alloca i32, align 4
+  %j = alloca i32, align 4
+  store i32* %.global_tid., i32** %.global_tid..addr, align 8, !tbaa !11
+  store i32* %.bound_tid., i32** %.bound_tid..addr, align 8, !tbaa !11
+  store i32** %A, i32*** %A.addr, align 8, !tbaa !11
+  store i32** %B, i32*** %B.addr, align 8, !tbaa !11
+  store i32* %i, i32** %i.addr, align 8, !tbaa !11
+  %0 = load i32**, i32*** %A.addr, align 8, !tbaa !11
+  %1 = load i32**, i32*** %B.addr, align 8, !tbaa !11
+  %2 = load i32*, i32** %i.addr, align 8, !tbaa !11
+  %3 = bitcast i32* %.omp.iv to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %3) #3
+  %4 = bitcast i32* %.omp.lb to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %4) #3
+  store i32 0, i32* %.omp.lb, align 4, !tbaa !15
+  %5 = bitcast i32* %.omp.ub to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %5) #3
+  store i32 1023, i32* %.omp.ub, align 4, !tbaa !15
+  %6 = bitcast i32* %.omp.stride to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %6) #3
+  store i32 1, i32* %.omp.stride, align 4, !tbaa !15
+  %7 = bitcast i32* %.omp.is_last to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %7) #3
+  store i32 0, i32* %.omp.is_last, align 4, !tbaa !15
+  %8 = bitcast i32* %j to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %8) #3
+  %9 = load i32*, i32** %.global_tid..addr, align 8
+  %10 = load i32, i32* %9, align 4, !tbaa !15
+  call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %10, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1)
+  br label %omp.dispatch.cond
+
+omp.dispatch.cond:                                ; preds = %omp.dispatch.inc, %entry
+  %11 = load i32, i32* %.omp.ub, align 4, !tbaa !15
+  %cmp = icmp sgt i32 %11, 1023
+  br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:                                        ; preds = %omp.dispatch.cond
+  br label %cond.end
+
+cond.false:                                       ; preds = %omp.dispatch.cond
+  %12 = load i32, i32* %.omp.ub, align 4, !tbaa !15
+  br label %cond.end
+
+cond.end:                                         ; preds = %cond.false, %cond.true
+  %cond = phi i32 [ 1023, %cond.true ], [ %12, %cond.false ]
+  store i32 %cond, i32* %.omp.ub, align 4, !tbaa !15
+  %13 = load i32, i32* %.omp.lb, align 4, !tbaa !15
+  store i32 %13, i32* %.omp.iv, align 4, !tbaa !15
+  %14 = load i32, i32* %.omp.iv, align 4, !tbaa !15
+  %15 = load i32, i32* %.omp.ub, align 4, !tbaa !15
+  %cmp1 = icmp sle i32 %14, %15
+  br i1 %cmp1, label %omp.dispatch.body, label %omp.dispatch.cleanup
+
+omp.dispatch.cleanup:                             ; preds = %cond.end
+  br label %omp.dispatch.end
+
+omp.dispatch.body:                                ; preds = %cond.end
+  br label %omp.inner.for.cond
+
+omp.inner.for.cond:                               ; preds = %omp.inner.for.inc, %omp.dispatch.body
+  %16 = load i32, i32* %.omp.iv, align 4, !tbaa !15
+  %17 = load i32, i32* %.omp.ub, align 4, !tbaa !15
+  %cmp2 = icmp sle i32 %16, %17
+  br i1 %cmp2, label %omp.inner.for.body, label %omp.inner.for.cond.cleanup
+
+omp.inner.for.cond.cleanup:                       ; preds = %omp.inner.for.cond
+  br label %omp.inner.for.end
+
+omp.inner.for.body:                               ; preds = %omp.inner.for.cond
+  %18 = load i32, i32* %.omp.iv, align 4, !tbaa !15
+  %mul = mul nsw i32 %18, 1
+  %add = add nsw i32 0, %mul
+  store i32 %add, i32* %j, align 4, !tbaa !15
+  %19 = load i32*, i32** %1, align 8, !tbaa !11
+  %20 = load i32, i32* %2, align 4, !tbaa !15
+  %21 = load i32, i32* %j, align 4, !tbaa !15
+  %add3 = add nsw i32 %20, %21
+  %idxprom = sext i32 %add3 to i64
+  %arrayidx = getelementptr inbounds i32, i32* %19, i64 %idxprom
+  %22 = load i32, i32* %arrayidx, align 4, !tbaa !15
+  %23 = load i32*, i32** %0, align 8, !tbaa !11
+  %24 = load i32, i32* %j, align 4, !tbaa !15
+  %idxprom4 = sext i32 %24 to i64
+  %arrayidx5 = getelementptr inbounds i32, i32* %23, i64 %idxprom4
+  %25 = load i32, i32* %arrayidx5, align 4, !tbaa !15
+  %add6 = add nsw i32 %25, %22
+  store i32 %add6, i32* %arrayidx5, align 4, !tbaa !15
+  br label %omp.body.continue
+
+omp.body.continue:                                ; preds = %omp.inner.for.body
+  br label %omp.inner.for.inc
+
+omp.inner.for.inc:                                ; preds = %omp.body.continue
+  %26 = load i32, i32* %.omp.iv, align 4, !tbaa !15
+  %add7 = add nsw i32 %26, 1
+  store i32 %add7, i32* %.omp.iv, align 4, !tbaa !15
+  br label %omp.inner.for.cond
+
+omp.inner.for.end:                                ; preds = %omp.inner.for.cond.cleanup
+  br label %omp.dispatch.inc
+
+omp.dispatch.inc:                                 ; preds = %omp.inner.for.end
+  %27 = load i32, i32* %.omp.lb, align 4, !tbaa !15
+  %28 = load i32, i32* %.omp.stride, align 4, !tbaa !15
+  %add8 = add nsw i32 %27, %28
+  store i32 %add8, i32* %.omp.lb, align 4, !tbaa !15
+  %29 = load i32, i32* %.omp.ub, align 4, !tbaa !15
+  %30 = load i32, i32* %.omp.stride, align 4, !tbaa !15
+  %add9 = add nsw i32 %29, %30
+  store i32 %add9, i32* %.omp.ub, align 4, !tbaa !15
+  br label %omp.dispatch.cond
+
+omp.dispatch.end:                                 ; preds = %omp.dispatch.cleanup
+  call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %10)
+  %31 = bitcast i32* %j to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %31) #3
+  %32 = bitcast i32* %.omp.is_last to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %32) #3
+  %33 = bitcast i32* %.omp.stride to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %33) #3
+  %34 = bitcast i32* %.omp.ub to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %34) #3
+  %35 = bitcast i32* %.omp.lb to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %35) #3
+  %36 = bitcast i32* %.omp.iv to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %36) #3
+  ret void
+}
+
+declare void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32)
+
+declare void @__kmpc_for_static_fini(%struct.ident_t*, i32)
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
+
+; Function Attrs: norecurse nounwind
+define internal void @.omp_TRegion._wrapper(i8* %shared_vars, i8* %private_vars) #0 {
+entry:
+  %.addr = alloca i8*, align 8
+  %.addr1 = alloca i8*, align 8
+  %.zero.addr = alloca i32, align 4
+  %.threadid_temp. = alloca i32, align 4
+  store i32 0, i32* %.zero.addr, align 4
+  store i8* %shared_vars, i8** %.addr, align 8, !tbaa !11
+  store i8* %private_vars, i8** %.addr1, align 8, !tbaa !11
+  %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
+  store i32 %0, i32* %.threadid_temp., align 4, !tbaa !15
+  %1 = bitcast i8* %private_vars to %omp.private.struct*
+  %2 = getelementptr inbounds %omp.private.struct, %omp.private.struct* %1, i32 0, i32 0
+  %3 = load i32**, i32*** %2, align 1
+  %4 = getelementptr inbounds %omp.private.struct, %omp.private.struct* %1, i32 0, i32 1
+  %5 = load i32**, i32*** %4, align 1
+  %6 = getelementptr inbounds %omp.private.struct, %omp.private.struct* %1, i32 0, i32 2
+  %7 = load i32*, i32** %6, align 1
+  call void @.omp_TRegion.(i32* %.threadid_temp., i32* %.zero.addr, i32** %3, i32** %5, i32* %7) #3
+  ret void
+}
+
+declare i32 @__kmpc_global_thread_num(%struct.ident_t*)
+
+declare !callback !17 void @__kmpc_target_region_kernel_parallel(i1, i1, void (i8*, i8*)* nocapture, i8* nocapture, i16, i8* nocapture readonly, i16, i1)
+
+declare void @__kmpc_target_region_kernel_deinit(i1, i1)
+
+; Function Attrs: norecurse nounwind
+define weak void @__omp_offloading_18_29afeb5_parallel_loops_in_tregion_l10(i32* %A, i32* %B) #0 {
+entry:
+  %A.addr = alloca i32*, align 8
+  %B.addr = alloca i32*, align 8
+  %.private.vars = alloca %omp.private.struct.0, align 8
+  %.private.vars1 = alloca %omp.private.struct.1, align 8
+  %.private.vars2 = alloca %omp.private.struct.2, align 8
+  store i32* %A, i32** %A.addr, align 8, !tbaa !11
+  store i32* %B, i32** %B.addr, align 8, !tbaa !11
+  %0 = call i8 @__kmpc_target_region_kernel_init(i1 false, i1 true, i1 true, i1 true)
+  %1 = icmp eq i8 %0, 1
+  br i1 %1, label %.execute, label %.exit
+
+.execute:                                         ; preds = %entry
+  %2 = bitcast %omp.private.struct.0* %.private.vars to i8*
+  %3 = getelementptr inbounds %omp.private.struct.0, %omp.private.struct.0* %.private.vars, i32 0, i32 0
+  store i32** %A.addr, i32*** %3
+  %4 = getelementptr inbounds %omp.private.struct.0, %omp.private.struct.0* %.private.vars, i32 0, i32 1
+  store i32** %B.addr, i32*** %4
+  call void @__kmpc_target_region_kernel_parallel(i1 false, i1 true, void (i8*, i8*)* @.omp_TRegion.1_wrapper, i8* undef, i16 0, i8* %2, i16 16, i1 false)
+  %5 = bitcast %omp.private.struct.1* %.private.vars1 to i8*
+  %6 = getelementptr inbounds %omp.private.struct.1, %omp.private.struct.1* %.private.vars1, i32 0, i32 0
+  store i32** %A.addr, i32*** %6
+  %7 = getelementptr inbounds %omp.private.struct.1, %omp.private.struct.1* %.private.vars1, i32 0, i32 1
+  store i32** %B.addr, i32*** %7
+  call void @__kmpc_target_region_kernel_parallel(i1 false, i1 true, void (i8*, i8*)* @.omp_TRegion.2_wrapper, i8* undef, i16 0, i8* %5, i16 16, i1 false)
+  %8 = bitcast %omp.private.struct.2* %.private.vars2 to i8*
+  %9 = getelementptr inbounds %omp.private.struct.2, %omp.private.struct.2* %.private.vars2, i32 0, i32 0
+  store i32** %A.addr, i32*** %9
+  %10 = getelementptr inbounds %omp.private.struct.2, %omp.private.struct.2* %.private.vars2, i32 0, i32 1
+  store i32** %B.addr, i32*** %10
+  call void @__kmpc_target_region_kernel_parallel(i1 false, i1 true, void (i8*, i8*)* @.omp_TRegion.3_wrapper, i8* undef, i16 0, i8* %8, i16 16, i1 false)
+  br label %.omp.deinit
+
+.omp.deinit:                                      ; preds = %.execute
+  call void @__kmpc_target_region_kernel_deinit(i1 false, i1 true)
+  br label %.exit
+
+.exit:                                            ; preds = %.omp.deinit, %entry
+  ret void
+}
+
+; Function Attrs: norecurse nounwind
+define internal void @.omp_TRegion.1(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32** dereferenceable(8) %A, i32** dereferenceable(8) %B) #0 {
+entry:
+  %.global_tid..addr = alloca i32*, align 8
+  %.bound_tid..addr = alloca i32*, align 8
+  %A.addr = alloca i32**, align 8
+  %B.addr = alloca i32**, align 8
+  %.omp.iv = alloca i32, align 4
+  %tmp = alloca i32, align 4
+  %.omp.lb = alloca i32, align 4
+  %.omp.ub = alloca i32, align 4
+  %.omp.stride = alloca i32, align 4
+  %.omp.is_last = alloca i32, align 4
+  %j = alloca i32, align 4
+  store i32* %.global_tid., i32** %.global_tid..addr, align 8, !tbaa !11
+  store i32* %.bound_tid., i32** %.bound_tid..addr, align 8, !tbaa !11
+  store i32** %A, i32*** %A.addr, align 8, !tbaa !11
+  store i32** %B, i32*** %B.addr, align 8, !tbaa !11
+  %0 = load i32**, i32*** %A.addr, align 8, !tbaa !11
+  %1 = load i32**, i32*** %B.addr, align 8, !tbaa !11
+  %2 = bitcast i32* %.omp.iv to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %2) #3
+  %3 = bitcast i32* %.omp.lb to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %3) #3
+  store i32 0, i32* %.omp.lb, align 4, !tbaa !15
+  %4 = bitcast i32* %.omp.ub to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %4) #3
+  store i32 1023, i32* %.omp.ub, align 4, !tbaa !15
+  %5 = bitcast i32* %.omp.stride to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %5) #3
+  store i32 1, i32* %.omp.stride, align 4, !tbaa !15
+  %6 = bitcast i32* %.omp.is_last to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %6) #3
+  store i32 0, i32* %.omp.is_last, align 4, !tbaa !15
+  %7 = bitcast i32* %j to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %7) #3
+  %8 = load i32*, i32** %.global_tid..addr, align 8
+  %9 = load i32, i32* %8, align 4, !tbaa !15
+  call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %9, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1)
+  br label %omp.dispatch.cond
+
+omp.dispatch.cond:                                ; preds = %omp.dispatch.inc, %entry
+  %10 = load i32, i32* %.omp.ub, align 4, !tbaa !15
+  %cmp = icmp sgt i32 %10, 1023
+  br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:                                        ; preds = %omp.dispatch.cond
+  br label %cond.end
+
+cond.false:                                       ; preds = %omp.dispatch.cond
+  %11 = load i32, i32* %.omp.ub, align 4, !tbaa !15
+  br label %cond.end
+
+cond.end:                                         ; preds = %cond.false, %cond.true
+  %cond = phi i32 [ 1023, %cond.true ], [ %11, %cond.false ]
+  store i32 %cond, i32* %.omp.ub, align 4, !tbaa !15
+  %12 = load i32, i32* %.omp.lb, align 4, !tbaa !15
+  store i32 %12, i32* %.omp.iv, align 4, !tbaa !15
+  %13 = load i32, i32* %.omp.iv, align 4, !tbaa !15
+  %14 = load i32, i32* %.omp.ub, align 4, !tbaa !15
+  %cmp1 = icmp sle i32 %13, %14
+  br i1 %cmp1, label %omp.dispatch.body, label %omp.dispatch.cleanup
+
+omp.dispatch.cleanup:                             ; preds = %cond.end
+  br label %omp.dispatch.end
+
+omp.dispatch.body:                                ; preds = %cond.end
+  br label %omp.inner.for.cond
+
+omp.inner.for.cond:                               ; preds = %omp.inner.for.inc, %omp.dispatch.body
+  %15 = load i32, i32* %.omp.iv, align 4, !tbaa !15
+  %16 = load i32, i32* %.omp.ub, align 4, !tbaa !15
+  %cmp2 = icmp sle i32 %15, %16
+  br i1 %cmp2, label %omp.inner.for.body, label %omp.inner.for.cond.cleanup
+
+omp.inner.for.cond.cleanup:                       ; preds = %omp.inner.for.cond
+  br label %omp.inner.for.end
+
+omp.inner.for.body:                               ; preds = %omp.inner.for.cond
+  %17 = load i32, i32* %.omp.iv, align 4, !tbaa !15
+  %mul = mul nsw i32 %17, 1
+  %add = add nsw i32 0, %mul
+  store i32 %add, i32* %j, align 4, !tbaa !15
+  %18 = load i32*, i32** %1, align 8, !tbaa !11
+  %19 = load i32, i32* %j, align 4, !tbaa !15
+  %add3 = add nsw i32 0, %19
+  %idxprom = sext i32 %add3 to i64
+  %arrayidx = getelementptr inbounds i32, i32* %18, i64 %idxprom
+  %20 = load i32, i32* %arrayidx, align 4, !tbaa !15
+  %21 = load i32*, i32** %0, align 8, !tbaa !11
+  %22 = load i32, i32* %j, align 4, !tbaa !15
+  %idxprom4 = sext i32 %22 to i64
+  %arrayidx5 = getelementptr inbounds i32, i32* %21, i64 %idxprom4
+  %23 = load i32, i32* %arrayidx5, align 4, !tbaa !15
+  %add6 = add nsw i32 %23, %20
+  store i32 %add6, i32* %arrayidx5, align 4, !tbaa !15
+  br label %omp.body.continue
+
+omp.body.continue:                                ; preds = %omp.inner.for.body
+  br label %omp.inner.for.inc
+
+omp.inner.for.inc:                                ; preds = %omp.body.continue
+  %24 = load i32, i32* %.omp.iv, align 4, !tbaa !15
+  %add7 = add nsw i32 %24, 1
+  store i32 %add7, i32* %.omp.iv, align 4, !tbaa !15
+  br label %omp.inner.for.cond
+
+omp.inner.for.end:                                ; preds = %omp.inner.for.cond.cleanup
+  br label %omp.dispatch.inc
+
+omp.dispatch.inc:                                 ; preds = %omp.inner.for.end
+  %25 = load i32, i32* %.omp.lb, align 4, !tbaa !15
+  %26 = load i32, i32* %.omp.stride, align 4, !tbaa !15
+  %add8 = add nsw i32 %25, %26
+  store i32 %add8, i32* %.omp.lb, align 4, !tbaa !15
+  %27 = load i32, i32* %.omp.ub, align 4, !tbaa !15
+  %28 = load i32, i32* %.omp.stride, align 4, !tbaa !15
+  %add9 = add nsw i32 %27, %28
+  store i32 %add9, i32* %.omp.ub, align 4, !tbaa !15
+  br label %omp.dispatch.cond
+
+omp.dispatch.end:                                 ; preds = %omp.dispatch.cleanup
+  call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %9)
+  %29 = bitcast i32* %j to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %29) #3
+  %30 = bitcast i32* %.omp.is_last to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %30) #3
+  %31 = bitcast i32* %.omp.stride to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %31) #3
+  %32 = bitcast i32* %.omp.ub to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %32) #3
+  %33 = bitcast i32* %.omp.lb to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %33) #3
+  %34 = bitcast i32* %.omp.iv to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %34) #3
+  ret void
+}
+
+; Function Attrs: norecurse nounwind
+define internal void @.omp_TRegion.1_wrapper(i8* %shared_vars, i8* %private_vars) #0 {
+entry:
+  %.addr = alloca i8*, align 8
+  %.addr1 = alloca i8*, align 8
+  %.zero.addr = alloca i32, align 4
+  %.threadid_temp. = alloca i32, align 4
+  store i32 0, i32* %.zero.addr, align 4
+  store i8* %shared_vars, i8** %.addr, align 8, !tbaa !11
+  store i8* %private_vars, i8** %.addr1, align 8, !tbaa !11
+  %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
+  store i32 %0, i32* %.threadid_temp., align 4, !tbaa !15
+  %1 = bitcast i8* %private_vars to %omp.private.struct.0*
+  %2 = getelementptr inbounds %omp.private.struct.0, %omp.private.struct.0* %1, i32 0, i32 0
+  %3 = load i32**, i32*** %2, align 1
+  %4 = getelementptr inbounds %omp.private.struct.0, %omp.private.struct.0* %1, i32 0, i32 1
+  %5 = load i32**, i32*** %4, align 1
+  call void @.omp_TRegion.1(i32* %.threadid_temp., i32* %.zero.addr, i32** %3, i32** %5) #3
+  ret void
+}
+
+; Function Attrs: norecurse nounwind
+define internal void @.omp_TRegion.2(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32** dereferenceable(8) %A, i32** dereferenceable(8) %B) #0 {
+entry:
+  %.global_tid..addr = alloca i32*, align 8
+  %.bound_tid..addr = alloca i32*, align 8
+  %A.addr = alloca i32**, align 8
+  %B.addr = alloca i32**, align 8
+  %.omp.iv = alloca i32, align 4
+  %tmp = alloca i32, align 4
+  %.omp.lb = alloca i32, align 4
+  %.omp.ub = alloca i32, align 4
+  %.omp.stride = alloca i32, align 4
+  %.omp.is_last = alloca i32, align 4
+  %j = alloca i32, align 4
+  store i32* %.global_tid., i32** %.global_tid..addr, align 8, !tbaa !11
+  store i32* %.bound_tid., i32** %.bound_tid..addr, align 8, !tbaa !11
+  store i32** %A, i32*** %A.addr, align 8, !tbaa !11
+  store i32** %B, i32*** %B.addr, align 8, !tbaa !11
+  %0 = load i32**, i32*** %A.addr, align 8, !tbaa !11
+  %1 = load i32**, i32*** %B.addr, align 8, !tbaa !11
+  %2 = bitcast i32* %.omp.iv to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %2) #3
+  %3 = bitcast i32* %.omp.lb to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %3) #3
+  store i32 0, i32* %.omp.lb, align 4, !tbaa !15
+  %4 = bitcast i32* %.omp.ub to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %4) #3
+  store i32 1023, i32* %.omp.ub, align 4, !tbaa !15
+  %5 = bitcast i32* %.omp.stride to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %5) #3
+  store i32 1, i32* %.omp.stride, align 4, !tbaa !15
+  %6 = bitcast i32* %.omp.is_last to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %6) #3
+  store i32 0, i32* %.omp.is_last, align 4, !tbaa !15
+  %7 = bitcast i32* %j to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %7) #3
+  %8 = load i32*, i32** %.global_tid..addr, align 8
+  %9 = load i32, i32* %8, align 4, !tbaa !15
+  call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %9, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1)
+  br label %omp.dispatch.cond
+
+omp.dispatch.cond:                                ; preds = %omp.dispatch.inc, %entry
+  %10 = load i32, i32* %.omp.ub, align 4, !tbaa !15
+  %cmp = icmp sgt i32 %10, 1023
+  br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:                                        ; preds = %omp.dispatch.cond
+  br label %cond.end
+
+cond.false:                                       ; preds = %omp.dispatch.cond
+  %11 = load i32, i32* %.omp.ub, align 4, !tbaa !15
+  br label %cond.end
+
+cond.end:                                         ; preds = %cond.false, %cond.true
+  %cond = phi i32 [ 1023, %cond.true ], [ %11, %cond.false ]
+  store i32 %cond, i32* %.omp.ub, align 4, !tbaa !15
+  %12 = load i32, i32* %.omp.lb, align 4, !tbaa !15
+  store i32 %12, i32* %.omp.iv, align 4, !tbaa !15
+  %13 = load i32, i32* %.omp.iv, align 4, !tbaa !15
+  %14 = load i32, i32* %.omp.ub, align 4, !tbaa !15
+  %cmp1 = icmp sle i32 %13, %14
+  br i1 %cmp1, label %omp.dispatch.body, label %omp.dispatch.cleanup
+
+omp.dispatch.cleanup:                             ; preds = %cond.end
+  br label %omp.dispatch.end
+
+omp.dispatch.body:                                ; preds = %cond.end
+  br label %omp.inner.for.cond
+
+omp.inner.for.cond:                               ; preds = %omp.inner.for.inc, %omp.dispatch.body
+  %15 = load i32, i32* %.omp.iv, align 4, !tbaa !15
+  %16 = load i32, i32* %.omp.ub, align 4, !tbaa !15
+  %cmp2 = icmp sle i32 %15, %16
+  br i1 %cmp2, label %omp.inner.for.body, label %omp.inner.for.cond.cleanup
+
+omp.inner.for.cond.cleanup:                       ; preds = %omp.inner.for.cond
+  br label %omp.inner.for.end
+
+omp.inner.for.body:                               ; preds = %omp.inner.for.cond
+  %17 = load i32, i32* %.omp.iv, align 4, !tbaa !15
+  %mul = mul nsw i32 %17, 1
+  %add = add nsw i32 0, %mul
+  store i32 %add, i32* %j, align 4, !tbaa !15
+  %18 = load i32*, i32** %1, align 8, !tbaa !11
+  %19 = load i32, i32* %j, align 4, !tbaa !15
+  %add3 = add nsw i32 1, %19
+  %idxprom = sext i32 %add3 to i64
+  %arrayidx = getelementptr inbounds i32, i32* %18, i64 %idxprom
+  %20 = load i32, i32* %arrayidx, align 4, !tbaa !15
+  %21 = load i32*, i32** %0, align 8, !tbaa !11
+  %22 = load i32, i32* %j, align 4, !tbaa !15
+  %idxprom4 = sext i32 %22 to i64
+  %arrayidx5 = getelementptr inbounds i32, i32* %21, i64 %idxprom4
+  %23 = load i32, i32* %arrayidx5, align 4, !tbaa !15
+  %add6 = add nsw i32 %23, %20
+  store i32 %add6, i32* %arrayidx5, align 4, !tbaa !15
+  br label %omp.body.continue
+
+omp.body.continue:                                ; preds = %omp.inner.for.body
+  br label %omp.inner.for.inc
+
+omp.inner.for.inc:                                ; preds = %omp.body.continue
+  %24 = load i32, i32* %.omp.iv, align 4, !tbaa !15
+  %add7 = add nsw i32 %24, 1
+  store i32 %add7, i32* %.omp.iv, align 4, !tbaa !15
+  br label %omp.inner.for.cond
+
+omp.inner.for.end:                                ; preds = %omp.inner.for.cond.cleanup
+  br label %omp.dispatch.inc
+
+omp.dispatch.inc:                                 ; preds = %omp.inner.for.end
+  %25 = load i32, i32* %.omp.lb, align 4, !tbaa !15
+  %26 = load i32, i32* %.omp.stride, align 4, !tbaa !15
+  %add8 = add nsw i32 %25, %26
+  store i32 %add8, i32* %.omp.lb, align 4, !tbaa !15
+  %27 = load i32, i32* %.omp.ub, align 4, !tbaa !15
+  %28 = load i32, i32* %.omp.stride, align 4, !tbaa !15
+  %add9 = add nsw i32 %27, %28
+  store i32 %add9, i32* %.omp.ub, align 4, !tbaa !15
+  br label %omp.dispatch.cond
+
+omp.dispatch.end:                                 ; preds = %omp.dispatch.cleanup
+  call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %9)
+  %29 = bitcast i32* %j to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %29) #3
+  %30 = bitcast i32* %.omp.is_last to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %30) #3
+  %31 = bitcast i32* %.omp.stride to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %31) #3
+  %32 = bitcast i32* %.omp.ub to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %32) #3
+  %33 = bitcast i32* %.omp.lb to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %33) #3
+  %34 = bitcast i32* %.omp.iv to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %34) #3
+  ret void
+}
+
+; Function Attrs: norecurse nounwind
+define internal void @.omp_TRegion.2_wrapper(i8* %shared_vars, i8* %private_vars) #0 {
+entry:
+  %.addr = alloca i8*, align 8
+  %.addr1 = alloca i8*, align 8
+  %.zero.addr = alloca i32, align 4
+  %.threadid_temp. = alloca i32, align 4
+  store i32 0, i32* %.zero.addr, align 4
+  store i8* %shared_vars, i8** %.addr, align 8, !tbaa !11
+  store i8* %private_vars, i8** %.addr1, align 8, !tbaa !11
+  %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
+  store i32 %0, i32* %.threadid_temp., align 4, !tbaa !15
+  %1 = bitcast i8* %private_vars to %omp.private.struct.1*
+  %2 = getelementptr inbounds %omp.private.struct.1, %omp.private.struct.1* %1, i32 0, i32 0
+  %3 = load i32**, i32*** %2, align 1
+  %4 = getelementptr inbounds %omp.private.struct.1, %omp.private.struct.1* %1, i32 0, i32 1
+  %5 = load i32**, i32*** %4, align 1
+  call void @.omp_TRegion.2(i32* %.threadid_temp., i32* %.zero.addr, i32** %3, i32** %5) #3
+  ret void
+}
+
+; Function Attrs: norecurse nounwind
+define internal void @.omp_TRegion.3(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32** dereferenceable(8) %A, i32** dereferenceable(8) %B) #0 {
+entry:
+  %.global_tid..addr = alloca i32*, align 8
+  %.bound_tid..addr = alloca i32*, align 8
+  %A.addr = alloca i32**, align 8
+  %B.addr = alloca i32**, align 8
+  %.omp.iv = alloca i32, align 4
+  %tmp = alloca i32, align 4
+  %.omp.lb = alloca i32, align 4
+  %.omp.ub = alloca i32, align 4
+  %.omp.stride = alloca i32, align 4
+  %.omp.is_last = alloca i32, align 4
+  %j = alloca i32, align 4
+  store i32* %.global_tid., i32** %.global_tid..addr, align 8, !tbaa !11
+  store i32* %.bound_tid., i32** %.bound_tid..addr, align 8, !tbaa !11
+  store i32** %A, i32*** %A.addr, align 8, !tbaa !11
+  store i32** %B, i32*** %B.addr, align 8, !tbaa !11
+  %0 = load i32**, i32*** %A.addr, align 8, !tbaa !11
+  %1 = load i32**, i32*** %B.addr, align 8, !tbaa !11
+  %2 = bitcast i32* %.omp.iv to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %2) #3
+  %3 = bitcast i32* %.omp.lb to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %3) #3
+  store i32 0, i32* %.omp.lb, align 4, !tbaa !15
+  %4 = bitcast i32* %.omp.ub to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %4) #3
+  store i32 1023, i32* %.omp.ub, align 4, !tbaa !15
+  %5 = bitcast i32* %.omp.stride to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %5) #3
+  store i32 1, i32* %.omp.stride, align 4, !tbaa !15
+  %6 = bitcast i32* %.omp.is_last to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %6) #3
+  store i32 0, i32* %.omp.is_last, align 4, !tbaa !15
+  %7 = bitcast i32* %j to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %7) #3
+  %8 = load i32*, i32** %.global_tid..addr, align 8
+  %9 = load i32, i32* %8, align 4, !tbaa !15
+  call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %9, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1)
+  br label %omp.dispatch.cond
+
+omp.dispatch.cond:                                ; preds = %omp.dispatch.inc, %entry
+  %10 = load i32, i32* %.omp.ub, align 4, !tbaa !15
+  %cmp = icmp sgt i32 %10, 1023
+  br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:                                        ; preds = %omp.dispatch.cond
+  br label %cond.end
+
+cond.false:                                       ; preds = %omp.dispatch.cond
+  %11 = load i32, i32* %.omp.ub, align 4, !tbaa !15
+  br label %cond.end
+
+cond.end:                                         ; preds = %cond.false, %cond.true
+  %cond = phi i32 [ 1023, %cond.true ], [ %11, %cond.false ]
+  store i32 %cond, i32* %.omp.ub, align 4, !tbaa !15
+  %12 = load i32, i32* %.omp.lb, align 4, !tbaa !15
+  store i32 %12, i32* %.omp.iv, align 4, !tbaa !15
+  %13 = load i32, i32* %.omp.iv, align 4, !tbaa !15
+  %14 = load i32, i32* %.omp.ub, align 4, !tbaa !15
+  %cmp1 = icmp sle i32 %13, %14
+  br i1 %cmp1, label %omp.dispatch.body, label %omp.dispatch.cleanup
+
+omp.dispatch.cleanup:                             ; preds = %cond.end
+  br label %omp.dispatch.end
+
+omp.dispatch.body:                                ; preds = %cond.end
+  br label %omp.inner.for.cond
+
+omp.inner.for.cond:                               ; preds = %omp.inner.for.inc, %omp.dispatch.body
+  %15 = load i32, i32* %.omp.iv, align 4, !tbaa !15
+  %16 = load i32, i32* %.omp.ub, align 4, !tbaa !15
+  %cmp2 = icmp sle i32 %15, %16
+  br i1 %cmp2, label %omp.inner.for.body, label %omp.inner.for.cond.cleanup
+
+omp.inner.for.cond.cleanup:                       ; preds = %omp.inner.for.cond
+  br label %omp.inner.for.end
+
+omp.inner.for.body:                               ; preds = %omp.inner.for.cond
+  %17 = load i32, i32* %.omp.iv, align 4, !tbaa !15
+  %mul = mul nsw i32 %17, 1
+  %add = add nsw i32 0, %mul
+  store i32 %add, i32* %j, align 4, !tbaa !15
+  %18 = load i32*, i32** %1, align 8, !tbaa !11
+  %19 = load i32, i32* %j, align 4, !tbaa !15
+  %add3 = add nsw i32 2, %19
+  %idxprom = sext i32 %add3 to i64
+  %arrayidx = getelementptr inbounds i32, i32* %18, i64 %idxprom
+  %20 = load i32, i32* %arrayidx, align 4, !tbaa !15
+  %21 = load i32*, i32** %0, align 8, !tbaa !11
+  %22 = load i32, i32* %j, align 4, !tbaa !15
+  %idxprom4 = sext i32 %22 to i64
+  %arrayidx5 = getelementptr inbounds i32, i32* %21, i64 %idxprom4
+  %23 = load i32, i32* %arrayidx5, align 4, !tbaa !15
+  %add6 = add nsw i32 %23, %20
+  store i32 %add6, i32* %arrayidx5, align 4, !tbaa !15
+  br label %omp.body.continue
+
+omp.body.continue:                                ; preds = %omp.inner.for.body
+  br label %omp.inner.for.inc
+
+omp.inner.for.inc:                                ; preds = %omp.body.continue
+  %24 = load i32, i32* %.omp.iv, align 4, !tbaa !15
+  %add7 = add nsw i32 %24, 1
+  store i32 %add7, i32* %.omp.iv, align 4, !tbaa !15
+  br label %omp.inner.for.cond
+
+omp.inner.for.end:                                ; preds = %omp.inner.for.cond.cleanup
+  br label %omp.dispatch.inc
+
+omp.dispatch.inc:                                 ; preds = %omp.inner.for.end
+  %25 = load i32, i32* %.omp.lb, align 4, !tbaa !15
+  %26 = load i32, i32* %.omp.stride, align 4, !tbaa !15
+  %add8 = add nsw i32 %25, %26
+  store i32 %add8, i32* %.omp.lb, align 4, !tbaa !15
+  %27 = load i32, i32* %.omp.ub, align 4, !tbaa !15
+  %28 = load i32, i32* %.omp.stride, align 4, !tbaa !15
+  %add9 = add nsw i32 %27, %28
+  store i32 %add9, i32* %.omp.ub, align 4, !tbaa !15
+  br label %omp.dispatch.cond
+
+omp.dispatch.end:                                 ; preds = %omp.dispatch.cleanup
+  call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %9)
+  %29 = bitcast i32* %j to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %29) #3
+  %30 = bitcast i32* %.omp.is_last to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %30) #3
+  %31 = bitcast i32* %.omp.stride to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %31) #3
+  %32 = bitcast i32* %.omp.ub to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %32) #3
+  %33 = bitcast i32* %.omp.lb to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %33) #3
+  %34 = bitcast i32* %.omp.iv to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %34) #3
+  ret void
+}
+
+; Function Attrs: norecurse nounwind
+define internal void @.omp_TRegion.3_wrapper(i8* %shared_vars, i8* %private_vars) #0 {
+entry:
+  %.addr = alloca i8*, align 8
+  %.addr1 = alloca i8*, align 8
+  %.zero.addr = alloca i32, align 4
+  %.threadid_temp. = alloca i32, align 4
+  store i32 0, i32* %.zero.addr, align 4
+  store i8* %shared_vars, i8** %.addr, align 8, !tbaa !11
+  store i8* %private_vars, i8** %.addr1, align 8, !tbaa !11
+  %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
+  store i32 %0, i32* %.threadid_temp., align 4, !tbaa !15
+  %1 = bitcast i8* %private_vars to %omp.private.struct.2*
+  %2 = getelementptr inbounds %omp.private.struct.2, %omp.private.struct.2* %1, i32 0, i32 0
+  %3 = load i32**, i32*** %2, align 1
+  %4 = getelementptr inbounds %omp.private.struct.2, %omp.private.struct.2* %1, i32 0, i32 1
+  %5 = load i32**, i32*** %4, align 1
+  call void @.omp_TRegion.3(i32* %.threadid_temp., i32* %.zero.addr, i32** %3, i32** %5) #3
+  ret void
+}
+
+; Function Attrs: norecurse nounwind
+define weak void @__omp_offloading_18_29afeb5_parallel_loop_in_function_in_loop_in_tregion_l30(i32* %A, i32* %B) #0 {
+entry:
+  %A.addr = alloca i32*, align 8
+  %B.addr = alloca i32*, align 8
+  %i = alloca i32, align 4
+  store i32* %A, i32** %A.addr, align 8, !tbaa !11
+  store i32* %B, i32** %B.addr, align 8, !tbaa !11
+  %0 = call i8 @__kmpc_target_region_kernel_init(i1 false, i1 true, i1 true, i1 true)
+  %1 = icmp eq i8 %0, 1
+  br i1 %1, label %.execute, label %.exit
+
+.execute:                                         ; preds = %entry
+  %2 = bitcast i32* %i to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %2) #3
+  store i32 0, i32* %i, align 4, !tbaa !15
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %.execute
+  %3 = load i32, i32* %i, align 4, !tbaa !15
+  %cmp = icmp slt i32 %3, 512
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond
+  %4 = bitcast i32* %i to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %4) #3
+  br label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %5 = load i32*, i32** %A.addr, align 8, !tbaa !11
+  %6 = load i32*, i32** %B.addr, align 8, !tbaa !11
+  %7 = load i32, i32* %i, align 4, !tbaa !15
+  call void @parallel_loop1(i32* %5, i32* %6, i32 %7)
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body
+  %8 = load i32, i32* %i, align 4, !tbaa !15
+  %inc = add nsw i32 %8, 1
+  store i32 %inc, i32* %i, align 4, !tbaa !15
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond.cleanup
+  br label %.omp.deinit
+
+.omp.deinit:                                      ; preds = %for.end
+  call void @__kmpc_target_region_kernel_deinit(i1 false, i1 true)
+  br label %.exit
+
+.exit:                                            ; preds = %.omp.deinit, %entry
+  ret void
+}
+
+; Function Attrs: nounwind
+define internal void @parallel_loop1(i32* %A, i32* %B, i32 %i) #2 {
+entry:
+  %A.addr = alloca i32*, align 8
+  %B.addr = alloca i32*, align 8
+  %i.addr = alloca i32, align 4
+  %.private.vars = alloca %omp.private.struct.3, align 8
+  store i32* %A, i32** %A.addr, align 8, !tbaa !11
+  store i32* %B, i32** %B.addr, align 8, !tbaa !11
+  store i32 %i, i32* %i.addr, align 4, !tbaa !15
+  %0 = bitcast %omp.private.struct.3* %.private.vars to i8*
+  %1 = getelementptr inbounds %omp.private.struct.3, %omp.private.struct.3* %.private.vars, i32 0, i32 0
+  store i32** %A.addr, i32*** %1
+  %2 = getelementptr inbounds %omp.private.struct.3, %omp.private.struct.3* %.private.vars, i32 0, i32 1
+  store i32** %B.addr, i32*** %2
+  %3 = getelementptr inbounds %omp.private.struct.3, %omp.private.struct.3* %.private.vars, i32 0, i32 2
+  store i32* %i.addr, i32** %3
+  call void @__kmpc_target_region_kernel_parallel(i1 false, i1 true, void (i8*, i8*)* @.omp_TRegion.4_wrapper, i8* undef, i16 0, i8* %0, i16 24, i1 false)
+  ret void
+}
+; Function Attrs: nounwind
+define internal void @parallel_loop2(i32* %A, i32* %B, i32 %i) #2 {
+entry:
+  %A.addr = alloca i32*, align 8
+  %B.addr = alloca i32*, align 8
+  %i.addr = alloca i32, align 4
+  %.private.vars = alloca %omp.private.struct.3, align 8
+  store i32* %A, i32** %A.addr, align 8, !tbaa !11
+  store i32* %B, i32** %B.addr, align 8, !tbaa !11
+  store i32 %i, i32* %i.addr, align 4, !tbaa !15
+  %0 = bitcast %omp.private.struct.3* %.private.vars to i8*
+  %1 = getelementptr inbounds %omp.private.struct.3, %omp.private.struct.3* %.private.vars, i32 0, i32 0
+  store i32** %A.addr, i32*** %1
+  %2 = getelementptr inbounds %omp.private.struct.3, %omp.private.struct.3* %.private.vars, i32 0, i32 1
+  store i32** %B.addr, i32*** %2
+  %3 = getelementptr inbounds %omp.private.struct.3, %omp.private.struct.3* %.private.vars, i32 0, i32 2
+  store i32* %i.addr, i32** %3
+  call void @__kmpc_target_region_kernel_parallel(i1 false, i1 true, void (i8*, i8*)* @.omp_TRegion.4_wrapper, i8* undef, i16 0, i8* %0, i16 24, i1 false)
+  ret void
+}
+
+; Function Attrs: norecurse nounwind
+define weak void @__omp_offloading_18_29afeb5_parallel_loops_in_functions_in_tregion_l36(i32* %A, i32* %B) #0 {
+entry:
+  %A.addr = alloca i32*, align 8
+  %B.addr = alloca i32*, align 8
+  store i32* %A, i32** %A.addr, align 8, !tbaa !11
+  store i32* %B, i32** %B.addr, align 8, !tbaa !11
+  %0 = call i8 @__kmpc_target_region_kernel_init(i1 false, i1 true, i1 true, i1 true)
+  %1 = icmp eq i8 %0, 1
+  br i1 %1, label %.execute, label %.exit
+
+.execute:                                         ; preds = %entry
+  %2 = load i32*, i32** %A.addr, align 8, !tbaa !11
+  %3 = load i32*, i32** %B.addr, align 8, !tbaa !11
+  call void @parallel_loop2(i32* %2, i32* %3, i32 1)
+  %4 = load i32*, i32** %A.addr, align 8, !tbaa !11
+  %5 = load i32*, i32** %B.addr, align 8, !tbaa !11
+  call void @parallel_loop2(i32* %4, i32* %5, i32 2)
+  %6 = load i32*, i32** %A.addr, align 8, !tbaa !11
+  %7 = load i32*, i32** %B.addr, align 8, !tbaa !11
+  call void @parallel_loop2(i32* %6, i32* %7, i32 3)
+  br label %.omp.deinit
+
+.omp.deinit:                                      ; preds = %.execute
+  call void @__kmpc_target_region_kernel_deinit(i1 false, i1 true)
+  br label %.exit
+
+.exit:                                            ; preds = %.omp.deinit, %entry
+  ret void
+}
+
+; Function Attrs: norecurse nounwind
+define internal void @.omp_TRegion.4(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32** dereferenceable(8) %A, i32** dereferenceable(8) %B, i32* dereferenceable(4) %i) #0 {
+entry:
+  %.global_tid..addr = alloca i32*, align 8
+  %.bound_tid..addr = alloca i32*, align 8
+  %A.addr = alloca i32**, align 8
+  %B.addr = alloca i32**, align 8
+  %i.addr = alloca i32*, align 8
+  %.omp.iv = alloca i32, align 4
+  %tmp = alloca i32, align 4
+  %.omp.lb = alloca i32, align 4
+  %.omp.ub = alloca i32, align 4
+  %.omp.stride = alloca i32, align 4
+  %.omp.is_last = alloca i32, align 4
+  %j = alloca i32, align 4
+  store i32* %.global_tid., i32** %.global_tid..addr, align 8, !tbaa !11
+  store i32* %.bound_tid., i32** %.bound_tid..addr, align 8, !tbaa !11
+  store i32** %A, i32*** %A.addr, align 8, !tbaa !11
+  store i32** %B, i32*** %B.addr, align 8, !tbaa !11
+  store i32* %i, i32** %i.addr, align 8, !tbaa !11
+  %0 = load i32**, i32*** %A.addr, align 8, !tbaa !11
+  %1 = load i32**, i32*** %B.addr, align 8, !tbaa !11
+  %2 = load i32*, i32** %i.addr, align 8, !tbaa !11
+  %3 = bitcast i32* %.omp.iv to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %3) #3
+  %4 = bitcast i32* %.omp.lb to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %4) #3
+  store i32 0, i32* %.omp.lb, align 4, !tbaa !15
+  %5 = bitcast i32* %.omp.ub to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %5) #3
+  store i32 1023, i32* %.omp.ub, align 4, !tbaa !15
+  %6 = bitcast i32* %.omp.stride to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %6) #3
+  store i32 1, i32* %.omp.stride, align 4, !tbaa !15
+  %7 = bitcast i32* %.omp.is_last to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %7) #3
+  store i32 0, i32* %.omp.is_last, align 4, !tbaa !15
+  %8 = bitcast i32* %j to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %8) #3
+  %9 = load i32*, i32** %.global_tid..addr, align 8
+  %10 = load i32, i32* %9, align 4, !tbaa !15
+  call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %10, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1)
+  br label %omp.dispatch.cond
+
+omp.dispatch.cond:                                ; preds = %omp.dispatch.inc, %entry
+  %11 = load i32, i32* %.omp.ub, align 4, !tbaa !15
+  %cmp = icmp sgt i32 %11, 1023
+  br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:                                        ; preds = %omp.dispatch.cond
+  br label %cond.end
+
+cond.false:                                       ; preds = %omp.dispatch.cond
+  %12 = load i32, i32* %.omp.ub, align 4, !tbaa !15
+  br label %cond.end
+
+cond.end:                                         ; preds = %cond.false, %cond.true
+  %cond = phi i32 [ 1023, %cond.true ], [ %12, %cond.false ]
+  store i32 %cond, i32* %.omp.ub, align 4, !tbaa !15
+  %13 = load i32, i32* %.omp.lb, align 4, !tbaa !15
+  store i32 %13, i32* %.omp.iv, align 4, !tbaa !15
+  %14 = load i32, i32* %.omp.iv, align 4, !tbaa !15
+  %15 = load i32, i32* %.omp.ub, align 4, !tbaa !15
+  %cmp1 = icmp sle i32 %14, %15
+  br i1 %cmp1, label %omp.dispatch.body, label %omp.dispatch.cleanup
+
+omp.dispatch.cleanup:                             ; preds = %cond.end
+  br label %omp.dispatch.end
+
+omp.dispatch.body:                                ; preds = %cond.end
+  br label %omp.inner.for.cond
+
+omp.inner.for.cond:                               ; preds = %omp.inner.for.inc, %omp.dispatch.body
+  %16 = load i32, i32* %.omp.iv, align 4, !tbaa !15
+  %17 = load i32, i32* %.omp.ub, align 4, !tbaa !15
+  %cmp2 = icmp sle i32 %16, %17
+  br i1 %cmp2, label %omp.inner.for.body, label %omp.inner.for.cond.cleanup
+
+omp.inner.for.cond.cleanup:                       ; preds = %omp.inner.for.cond
+  br label %omp.inner.for.end
+
+omp.inner.for.body:                               ; preds = %omp.inner.for.cond
+  %18 = load i32, i32* %.omp.iv, align 4, !tbaa !15
+  %mul = mul nsw i32 %18, 1
+  %add = add nsw i32 0, %mul
+  store i32 %add, i32* %j, align 4, !tbaa !15
+  %19 = load i32*, i32** %1, align 8, !tbaa !11
+  %20 = load i32, i32* %2, align 4, !tbaa !15
+  %21 = load i32, i32* %j, align 4, !tbaa !15
+  %add3 = add nsw i32 %20, %21
+  %idxprom = sext i32 %add3 to i64
+  %arrayidx = getelementptr inbounds i32, i32* %19, i64 %idxprom
+  %22 = load i32, i32* %arrayidx, align 4, !tbaa !15
+  %23 = load i32*, i32** %0, align 8, !tbaa !11
+  %24 = load i32, i32* %j, align 4, !tbaa !15
+  %idxprom4 = sext i32 %24 to i64
+  %arrayidx5 = getelementptr inbounds i32, i32* %23, i64 %idxprom4
+  %25 = load i32, i32* %arrayidx5, align 4, !tbaa !15
+  %add6 = add nsw i32 %25, %22
+  store i32 %add6, i32* %arrayidx5, align 4, !tbaa !15
+  br label %omp.body.continue
+
+omp.body.continue:                                ; preds = %omp.inner.for.body
+  br label %omp.inner.for.inc
+
+omp.inner.for.inc:                                ; preds = %omp.body.continue
+  %26 = load i32, i32* %.omp.iv, align 4, !tbaa !15
+  %add7 = add nsw i32 %26, 1
+  store i32 %add7, i32* %.omp.iv, align 4, !tbaa !15
+  br label %omp.inner.for.cond
+
+omp.inner.for.end:                                ; preds = %omp.inner.for.cond.cleanup
+  br label %omp.dispatch.inc
+
+omp.dispatch.inc:                                 ; preds = %omp.inner.for.end
+  %27 = load i32, i32* %.omp.lb, align 4, !tbaa !15
+  %28 = load i32, i32* %.omp.stride, align 4, !tbaa !15
+  %add8 = add nsw i32 %27, %28
+  store i32 %add8, i32* %.omp.lb, align 4, !tbaa !15
+  %29 = load i32, i32* %.omp.ub, align 4, !tbaa !15
+  %30 = load i32, i32* %.omp.stride, align 4, !tbaa !15
+  %add9 = add nsw i32 %29, %30
+  store i32 %add9, i32* %.omp.ub, align 4, !tbaa !15
+  br label %omp.dispatch.cond
+
+omp.dispatch.end:                                 ; preds = %omp.dispatch.cleanup
+  call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %10)
+  %31 = bitcast i32* %j to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %31) #3
+  %32 = bitcast i32* %.omp.is_last to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %32) #3
+  %33 = bitcast i32* %.omp.stride to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %33) #3
+  %34 = bitcast i32* %.omp.ub to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %34) #3
+  %35 = bitcast i32* %.omp.lb to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %35) #3
+  %36 = bitcast i32* %.omp.iv to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %36) #3
+  ret void
+}
+
+; Function Attrs: norecurse nounwind
+define internal void @.omp_TRegion.4_wrapper(i8* %shared_vars, i8* %private_vars) #0 {
+entry:
+  %.addr = alloca i8*, align 8
+  %.addr1 = alloca i8*, align 8
+  %.zero.addr = alloca i32, align 4
+  %.threadid_temp. = alloca i32, align 4
+  store i32 0, i32* %.zero.addr, align 4
+  store i8* %shared_vars, i8** %.addr, align 8, !tbaa !11
+  store i8* %private_vars, i8** %.addr1, align 8, !tbaa !11
+  %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
+  store i32 %0, i32* %.threadid_temp., align 4, !tbaa !15
+  %1 = bitcast i8* %private_vars to %omp.private.struct.3*
+  %2 = getelementptr inbounds %omp.private.struct.3, %omp.private.struct.3* %1, i32 0, i32 0
+  %3 = load i32**, i32*** %2, align 1
+  %4 = getelementptr inbounds %omp.private.struct.3, %omp.private.struct.3* %1, i32 0, i32 1
+  %5 = load i32**, i32*** %4, align 1
+  %6 = getelementptr inbounds %omp.private.struct.3, %omp.private.struct.3* %1, i32 0, i32 2
+  %7 = load i32*, i32** %6, align 1
+  call void @.omp_TRegion.4(i32* %.threadid_temp., i32* %.zero.addr, i32** %3, i32** %5, i32* %7) #3
+  ret void
+}
+
+attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { argmemonly nounwind }
+attributes #2 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nounwind }
+
+!omp_offload.info = !{!0, !1, !2, !3}
+!nvvm.annotations = !{!4, !5, !6, !7}
+!llvm.module.flags = !{!8, !9}
+!llvm.ident = !{!10}
+
+!0 = !{i32 0, i32 24, i32 43712181, !"parallel_loop_in_loop_in_tregion", i32 2, i32 0}
+!1 = !{i32 0, i32 24, i32 43712181, !"parallel_loops_in_functions_in_tregion", i32 36, i32 3}
+!2 = !{i32 0, i32 24, i32 43712181, !"parallel_loops_in_tregion", i32 10, i32 1}
+!3 = !{i32 0, i32 24, i32 43712181, !"parallel_loop_in_function_in_loop_in_tregion", i32 30, i32 2}
+!4 = !{void (i32*, i32*)* @__omp_offloading_18_29afeb5_parallel_loop_in_loop_in_tregion_l2, !"kernel", i32 1}
+!5 = !{void (i32*, i32*)* @__omp_offloading_18_29afeb5_parallel_loops_in_tregion_l10, !"kernel", i32 1}
+!6 = !{void (i32*, i32*)* @__omp_offloading_18_29afeb5_parallel_loop_in_function_in_loop_in_tregion_l30, !"kernel", i32 1}
+!7 = !{void (i32*, i32*)* @__omp_offloading_18_29afeb5_parallel_loops_in_functions_in_tregion_l36, !"kernel", i32 1}
+!8 = !{i32 1, !"wchar_size", i32 4}
+!9 = !{i32 7, !"PIC Level", i32 2}
+!10 = !{!"clang version 9.0.0 "}
+!11 = !{!12, !12, i64 0}
+!12 = !{!"any pointer", !13, i64 0}
+!13 = !{!"omnipotent char", !14, i64 0}
+!14 = !{!"Simple C/C++ TBAA"}
+!15 = !{!16, !16, i64 0}
+!16 = !{!"int", !13, i64 0}
+!17 = !{!18}
+!18 = !{i64 2, i64 3, i64 5, i1 false}
diff --git a/openmp/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake b/openmp/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake
--- a/openmp/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake
+++ b/openmp/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake
@@ -78,13 +78,14 @@
 
 # These flags are required to emit LLVM Bitcode. We check them together because
 # if any of them are not supported, there is no point in finding out which are.
-set(compiler_flags_required -emit-llvm -O1 --cuda-device-only --cuda-path=${CUDA_TOOLKIT_ROOT_DIR})
+set(compiler_flags_required -emit-llvm -std=c++11 -O1 --cuda-device-only --cuda-path=${CUDA_TOOLKIT_ROOT_DIR})
 set(compiler_flags_required_src "extern \"C\" __device__ int thread() { return threadIdx.x; }")
 check_bitcode_compilation(LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FLAGS_REQUIRED "${compiler_flags_required_src}" ${compiler_flags_required})
 
 # It makes no sense to continue given that the compiler doesn't support
 # emitting basic LLVM Bitcode
 if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FLAGS_REQUIRED)
+  message(WARNING "Compiler does not support the flags required to emit LLVM bitcode")
   return()
 endif()
 
@@ -101,6 +102,7 @@
   check_bitcode_compilation(LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FCUDA_RDC "${extern_device_shared_src}" ${compiler_flag_fcuda_rdc_full})
 
   if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FCUDA_RDC)
+    message(WARNING "Compiler does not support the -fcuda-rdc flag")
     return()
   endif()
 
diff --git a/openmp/libomptarget/deviceRTLs/common/target_region.h b/openmp/libomptarget/deviceRTLs/common/target_region.h
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/deviceRTLs/common/target_region.h
@@ -0,0 +1,167 @@
+//===-- target_region.h --- Target region OpenMP device runtime interface -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Target region interfaces are simple interfaces designed to allow middle-end
+// (=LLVM) passes to analyze and transform the code. Good performance may
+// require the associated passes to be run. However, implementations of this
+// interface shall always be correct and as close to the code the user expects
+// as possible.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _DEVICERTL_COMMON_INTERFACES_H_
+#define _DEVICERTL_COMMON_INTERFACES_H_
+
+#ifndef EXTERN
+#define EXTERN
+#endif
+#ifndef CALLBACK
+#define CALLBACK(Callee, Payload0, Payload1)
+#endif
+
+/// The target region _kernel_ interface for GPUs
+///
+/// This deliberately simple interface provides the middle-end (=LLVM) with
+/// easier means to reason about the semantics of the code and to transform it.
+/// The runtime calls are therefore designed to carry sufficient information
+/// for optimization.
+///
+///
+/// Intended usage:
+///
+/// \code
+/// void kernel(...) {
+///
+///   char ThreadKind = __kmpc_target_region_kernel_init(...);
+///
+///   if (ThreadKind == -1) {               //  actual worker thread
+///     if (!UsedLibraryStateMachine)
+///       user_state_machine();
+///     goto exit;
+///   } else if (ThreadKind == 0) {         // surplus worker thread
+///     goto exit;
+///   } else {                              //    team master thread
+///     goto user_code;
+///   }
+///
+/// user_code:
+///
+///   // User defined kernel code; parallel regions are replaced
+///   // by __kmpc_target_region_kernel_parallel(...) calls.
+///
+///   // Fallthrough to de-initialization
+///
+/// deinit:
+///   __kmpc_target_region_kernel_deinit(...);
+///
+/// exit:
+///   /* exit the kernel */
+/// }
+/// \endcode
+///
+///
+///{
+
+/// Initialization
+///
+///
+/// In SPMD mode, all threads will execute their respective initialization
+/// routines.
+///
+/// In non-SPMD mode, team masters will invoke the initialization routines while
+/// the rest are considered worker threads. Worker threads required for this
+/// target region will be trapped inside the function if \p UseStateMachine is
+/// true. Otherwise they will escape with a return value of -1.
+///
+/// \param UseSPMDMode         Flag to indicate if execution is performed in
+///                            SPMD mode.
+/// \param RequiresOMPRuntime  Flag to indicate if the runtime is required and
+///                            needs to be initialized.
+/// \param UseStateMachine     Flag to indicate if the runtime state machine
+///                            should be used in non-SPMD mode.
+/// \param RequiresDataSharing Flag to indicate if there might be inter-thread
+///                            sharing which needs runtime support.
+///
+/// \return 1, always in SPMD mode, and in non-SPMD mode if the thread is the
+///            team master.
+///         0, in non-SPMD mode if the thread is a surplus worker that should
+///            not execute anything in the target region.
+///        -1, in non-SPMD mode if the thread is a required worker which:
+///             - finished work and should be terminated if \p UseStateMachine
+///               is true.
+///             - has not performed work and should be put in a user provided
+///               state machine (as defined above).
+///
+EXTERN int8_t __kmpc_target_region_kernel_init(bool UseSPMDMode,
+                                               bool RequiresOMPRuntime,
+                                               bool UseStateMachine,
+                                               bool RequiresDataSharing);
+
+/// De-Initialization
+///
+///
+/// In non-SPMD mode, this function releases the workers trapped in a state
+/// machine and also any memory dynamically allocated by the runtime.
+///
+/// \param UseSPMDMode        Flag to indicate if execution is performed in
+///                           SPMD mode.
+/// \param RequiredOMPRuntime Flag to indicate if the runtime was required and
+///                           is therefore initialized.
+///
+EXTERN void __kmpc_target_region_kernel_deinit(bool UseSPMDMode,
+                                               bool RequiredOMPRuntime);
+
+/// Generic type of a work function in the target region kernel interface. The
+/// two arguments are pointers to structures that contain the shared and
+/// firstprivate variables, respectively. Since the layout and size are known at
+/// compile time, the front-end is expected to generate appropriate packing and
+/// unpacking code.
+typedef void (*ParallelWorkFnTy)(char * /* SharedValues */,
+                                 char * /* PrivateValues */);
+
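+/// A minimal sketch of a work function and its packed argument struct, roughly
+/// as a front-end might emit them; the struct layout and the function name are
+/// illustrative assumptions, not part of this interface:
+///
+/// \code
+///   struct PrivateValuesTy { int **A; int **B; int *i; };
+///
+///   void omp_TRegion_work_fn(char *SharedValues, char *PrivateValues) {
+///     PrivateValuesTy *P = (PrivateValuesTy *)PrivateValues;
+///     // ... execute the outlined parallel body using *P->A, *P->B, *P->i ...
+///   }
+/// \endcode
+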
+/// Enter a parallel region
+///
+///
+/// The parallel region is defined by \p ParallelWorkFn. The shared variables,
+/// \p SharedValuesBytes bytes in total, start at \p SharedValues. The
+/// firstprivate variables, \p PrivateValuesBytes bytes in total, start at
+/// \p PrivateValues.
+///
+/// In SPMD mode, this function calls \p ParallelWorkFn with \p SharedValues and
+/// \p PrivateValues as arguments before it returns.
+///
+/// In non-SPMD mode, \p ParallelWorkFn, \p SharedValues, and \p PrivateValues
+/// are communicated to the workers before they are released from the state
+/// machine to run the code defined by \p ParallelWorkFn in parallel. This
+/// function will only return after all workers are finished.
+///
+/// \param UseSPMDMode        Flag to indicate if execution is performed in
+///                           SPMD mode.
+/// \param RequiredOMPRuntime Flag to indicate if the runtime was required and
+///                           is therefore initialized.
+/// \param ParallelWorkFn     The outlined code that is executed in parallel by
+///                           the threads in the team.
+/// \param SharedValues       A pointer to the location of all shared values.
+/// \param SharedValuesBytes  The total size of the shared values in bytes.
+/// \param PrivateValues      A pointer to the location of all private values.
+/// \param PrivateValuesBytes The total size of the private values in bytes.
+/// \param SharedMemPointers  Flag to indicate that the pointers \p SharedValues
+///                           and \p PrivateValues point into shared memory.
+///                           If this flag is true, it also requires that all
+///                           private values, if any, are stored directly after
+///                           the shared values.
+///
+CALLBACK(ParallelWorkFnTy, SharedValues, PrivateValues)
+EXTERN void __kmpc_target_region_kernel_parallel(
+    bool UseSPMDMode, bool RequiredOMPRuntime, ParallelWorkFnTy ParallelWorkFn,
+    char *SharedValues, uint16_t SharedValuesBytes, char *PrivateValues,
+    uint16_t PrivateValuesBytes, bool SharedMemPointers);
+
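+/// An illustrative call-site sketch; the work function name, the struct
+/// layout, and the local variables are assumptions mirroring typical front-end
+/// output, not mandated by this interface:
+///
+/// \code
+///   int *A = /* ... */, *B = /* ... */;
+///   struct PrivateValuesTy { int **APtr; int **BPtr; };
+///   PrivateValuesTy Privates = {&A, &B};
+///
+///   __kmpc_target_region_kernel_parallel(
+///       /* UseSPMDMode */ false, /* RequiredOMPRuntime */ true,
+///       omp_TRegion_work_fn,
+///       /* SharedValues */ nullptr, /* SharedValuesBytes */ 0,
+///       (char *)&Privates, sizeof(Privates),
+///       /* SharedMemPointers */ false);
+/// \endcode
+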
+///}
+
+#endif
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
--- a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
+++ b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt
@@ -53,6 +53,7 @@
       src/reduction.cu
       src/sync.cu
       src/task.cu
+      src/target_region.cu
   )
 
   set(omp_data_objects src/omp_data.cu)
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
@@ -63,3 +63,9 @@
 // Data sharing related variables.
 ////////////////////////////////////////////////////////////////////////////////
 __device__ __shared__ omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs;
+
+////////////////////////////////////////////////////////////////////////////////
+/// Buffer used to share memory between team threads in the target region interface.
+////////////////////////////////////////////////////////////////////////////////
+__device__ __shared__ target_region_shared_buffer _target_region_shared_memory;
+
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
--- a/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -101,6 +101,66 @@
   uint32_t nArgs;
 };
 
+/// Helper structure to manage the memory shared by the threads in a team.
+///
+/// Note: Only the team master is allowed to call non-const functions!
+struct target_region_shared_buffer {
+#define PRE_SHARED_BYTES 128
+
+  INLINE void init() {
+    _ptr = &_data[0];
+    _size = PRE_SHARED_BYTES;
+    _offset = 0;
+  }
+
+  /// Release any dynamically allocated memory.
+  INLINE void release() {
+    if (_size == PRE_SHARED_BYTES)
+      return;
+    SafeFree(_ptr, (char *)"free shared dynamic buffer");
+    init();
+  }
+
+  INLINE void set(char *ptr, size_t offset) {
+    release();
+    _ptr = ptr;
+    _offset = offset;
+  }
+
+  INLINE void resize(size_t size, size_t offset) {
+    _offset = offset;
+
+    if (size <= _size)
+      return;
+
+    if (_size != PRE_SHARED_BYTES)
+      SafeFree(_ptr, (char *)"free shared dynamic buffer");
+
+    _size = size;
+    _ptr = (char *)SafeMalloc(_size, (char *)"new shared buffer");
+  }
+
+  // Called by all threads.
+  INLINE char *begin() const { return _ptr; };
+  INLINE size_t size() const { return _size; };
+  INLINE size_t get_offset() const { return _offset; };
+
+private:
+  // Pre-allocated space that holds PRE_SHARED_BYTES many bytes.
+  char _data[PRE_SHARED_BYTES];
+
+  // Pointer to the currently used buffer.
+  char *_ptr;
+
+  // Size of the currently used buffer.
+  uint32_t _size;
+
+  // Offset into the currently used buffer.
+  uint32_t _offset;
+
+#undef PRE_SHARED_BYTES
+};
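+
+// A minimal usage sketch, illustrative only; it assumes the caller is the team
+// master, the byte counts are made up, and _target_region_shared_memory is the
+// instance defined in omp_data.cu:
+//
+//   // At kernel initialization.
+//   _target_region_shared_memory.init();
+//
+//   // Before a parallel region with 32 bytes of shared values followed by
+//   // 16 bytes of firstprivate values.
+//   _target_region_shared_memory.resize(/* size */ 48, /* offset */ 32);
+//   char *SharedVars = _target_region_shared_memory.begin();
+//   char *PrivateVars = SharedVars + _target_region_shared_memory.get_offset();
+//
+//   // At kernel de-initialization.
+//   _target_region_shared_memory.release();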
+
 extern __device__ __shared__ omptarget_nvptx_SharedArgs
     omptarget_nvptx_globalArgs;
 
diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_region.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/target_region.cu
new file mode 100644
--- /dev/null
+++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_region.cu
@@ -0,0 +1,197 @@
+//===-- target_region.cu ---- CUDA impl. of the target region interface -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of the common target region interface.
+//
+//===----------------------------------------------------------------------===//
+
+// Include the native definitions first as certain defines might be needed in
+// the common interface definition below.
+#include "omptarget-nvptx.h"
+#include "interface.h"
+
+#include "../../common/target_region.h"
+
+/// The buffer used to share memory between the threads in a team.
+extern __device__ __shared__ target_region_shared_buffer
+    _target_region_shared_memory;
+
+EXTERN char *__kmpc_target_region_kernel_get_shared_memory() {
+  return _target_region_shared_memory.begin();
+}
+EXTERN char *__kmpc_target_region_kernel_get_private_memory() {
+  return _target_region_shared_memory.begin() +
+         _target_region_shared_memory.get_offset();
+}
+
+/// Simple generic state machine for worker threads.
+INLINE static void
+__kmpc_target_region_state_machine(bool IsOMPRuntimeInitialized) {
+
+  do {
+    void *WorkFn = 0;
+
+    // Wait for the signal that we have a new work function.
+    __kmpc_barrier_simple_spmd(NULL, 0);
+
+    // Retrieve the work function from the runtime.
+    bool IsActive = __kmpc_kernel_parallel(&WorkFn, IsOMPRuntimeInitialized);
+
+    // If there is nothing more to do, break out of the state machine by
+    // returning to the caller.
+    if (!WorkFn)
+      return;
+
+    if (IsActive) {
+      char *SharedVars = __kmpc_target_region_kernel_get_shared_memory();
+      char *PrivateVars = __kmpc_target_region_kernel_get_private_memory();
+
+      ((ParallelWorkFnTy)WorkFn)(SharedVars, PrivateVars);
+
+      __kmpc_kernel_end_parallel();
+    }
+
+    __kmpc_barrier_simple_spmd(NULL, 0);
+
+  } while (true);
+}
+
+/// Filter threads into masters and workers. If \p UseStateMachine is true,
+/// required workers will enter a state machine and be trapped there.
+/// Master and surplus worker threads will return from this function immediately
+/// while required workers will only return once there is no more work. The
+/// return value indicates if the thread is a master (1), a surplus worker (0),
+/// or a finished required worker released from the state machine (-1).
+INLINE static int8_t
+__kmpc_target_region_thread_filter(unsigned ThreadLimit, bool UseStateMachine,
+                                   bool IsOMPRuntimeInitialized) {
+
+  unsigned TId = GetThreadIdInBlock();
+  bool IsWorker = TId < ThreadLimit;
+
+  if (IsWorker) {
+    if (UseStateMachine)
+      __kmpc_target_region_state_machine(IsOMPRuntimeInitialized);
+    return -1;
+  }
+
+  return TId == GetMasterThreadID();
+}
+
+EXTERN int8_t __kmpc_target_region_kernel_init(bool UseSPMDMode,
+                                               bool RequiresOMPRuntime,
+                                               bool UseStateMachine,
+                                               bool RequiresDataSharing) {
+  unsigned NumThreads = GetNumberOfThreadsInBlock();
+
+  // Handle the SPMD case first.
+  if (UseSPMDMode) {
+
+    __kmpc_spmd_kernel_init(NumThreads, RequiresOMPRuntime,
+                            RequiresDataSharing);
+
+    if (RequiresDataSharing)
+      __kmpc_data_sharing_init_stack_spmd();
+
+    return 1;
+  }
+
+  // Reserve one WARP in non-SPMD mode for the masters.
+  unsigned ThreadLimit = NumThreads - WARPSIZE;
+  int8_t FilterVal = __kmpc_target_region_thread_filter(
+      ThreadLimit, UseStateMachine, RequiresOMPRuntime);
+
+  // If the filter returns 1 the executing thread is a team master which will
+  // initialize the kernel in the following.
+  if (FilterVal == 1) {
+    __kmpc_kernel_init(ThreadLimit, RequiresOMPRuntime);
+    __kmpc_data_sharing_init_stack();
+    _target_region_shared_memory.init();
+  }
+
+  return FilterVal;
+}
+
+EXTERN void __kmpc_target_region_kernel_deinit(bool UseSPMDMode,
+                                               bool RequiredOMPRuntime) {
+  // Handle the SPMD case first.
+  if (UseSPMDMode) {
+    __kmpc_spmd_kernel_deinit_v2(RequiredOMPRuntime);
+    return;
+  }
+
+  __kmpc_kernel_deinit(RequiredOMPRuntime);
+
+  // Barrier to terminate worker threads.
+  __kmpc_barrier_simple_spmd(NULL, 0);
+
+  // Release any dynamically allocated memory used for sharing.
+  _target_region_shared_memory.release();
+}
+
+EXTERN void __kmpc_target_region_kernel_parallel(
+    bool UseSPMDMode, bool RequiredOMPRuntime, ParallelWorkFnTy ParallelWorkFn,
+    char *SharedVars, uint16_t SharedVarsBytes, char *PrivateVars,
+    uint16_t PrivateVarsBytes, bool SharedMemPointers) {
+
+  if (UseSPMDMode) {
+    ParallelWorkFn(SharedVars, PrivateVars);
+    return;
+  }
+
+  if (SharedMemPointers) {
+    // If shared memory pointers are used, the user guarantees that private
+    // variables, if any, are stored directly after the shared ones in memory
+    // and that this memory can be accessed by all the threads. In that case,
+    // we do not need to copy memory around but simply use the provided
+    // locations.
+
+    _target_region_shared_memory.set(SharedVars, SharedVarsBytes);
+
+  } else {
+
+    size_t BytesToCopy = SharedVarsBytes + PrivateVarsBytes;
+    if (BytesToCopy) {
+      // Resize the shared memory to be able to hold the data which is required
+      // to be in shared memory. Also set the offset to the beginning of the
+      // private variables.
+      _target_region_shared_memory.resize(BytesToCopy, SharedVarsBytes);
+
+      // Copy the shared and private variables into shared memory.
+      char *SVMemory = __kmpc_target_region_kernel_get_shared_memory();
+      char *PVMemory = __kmpc_target_region_kernel_get_private_memory();
+      memcpy(SVMemory, SharedVars, SharedVarsBytes);
+      memcpy(PVMemory, PrivateVars, PrivateVarsBytes);
+    }
+  }
+
+  // TODO: It seems we could store the work function in the same shared space
+  // as the rest of the variables above.
+  //
+  // Initialize the parallel work, e.g., make sure the work function is known.
+  __kmpc_kernel_prepare_parallel((void *)ParallelWorkFn, RequiredOMPRuntime);
+
+  // TODO: It is odd that we call the *_spmd version in non-SPMD mode here.
+  //
+  // Activate workers. This barrier is used by the master to signal
+  // work for the workers.
+  __kmpc_barrier_simple_spmd(NULL, 0);
+
+  // OpenMP [2.5, Parallel Construct, p.49]
+  // There is an implied barrier at the end of a parallel region. After the
+  // end of a parallel region, only the master thread of the team resumes
+  // execution of the enclosing task region.
+  //
+  // The master waits at this barrier until all workers are done.
+  __kmpc_barrier_simple_spmd(NULL, 0);
+
+  // Update the shared variables if necessary.
+  if (!SharedMemPointers && SharedVarsBytes)
+    memcpy(SharedVars, __kmpc_target_region_kernel_get_shared_memory(),
+           SharedVarsBytes);
+}