diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -287,6 +287,7 @@ void initializeNameAnonGlobalLegacyPassPass(PassRegistry&); void initializeNaryReassociateLegacyPassPass(PassRegistry&); void initializeNewGVNLegacyPassPass(PassRegistry&); +void initializeOpenMPOptLegacyPass(PassRegistry&); void initializeObjCARCAAWrapperPassPass(PassRegistry&); void initializeObjCARCAPElimPass(PassRegistry&); void initializeObjCARCContractPass(PassRegistry&); diff --git a/llvm/include/llvm/LinkAllPasses.h b/llvm/include/llvm/LinkAllPasses.h --- a/llvm/include/llvm/LinkAllPasses.h +++ b/llvm/include/llvm/LinkAllPasses.h @@ -143,6 +143,7 @@ (void) llvm::createLowerInvokePass(); (void) llvm::createLowerSwitchPass(); (void) llvm::createNaryReassociatePass(); + (void) llvm::createOpenMPOptLegacyPass(); (void) llvm::createObjCARCAAWrapperPass(); (void) llvm::createObjCARCAPElimPass(); (void) llvm::createObjCARCExpandPass(); diff --git a/llvm/include/llvm/Transforms/IPO.h b/llvm/include/llvm/Transforms/IPO.h --- a/llvm/include/llvm/Transforms/IPO.h +++ b/llvm/include/llvm/Transforms/IPO.h @@ -156,6 +156,11 @@ /// ModulePass *createIPConstantPropagationPass(); +//===----------------------------------------------------------------------===// +/// createOpenMPOpt - This pass performs OpenMP specific optimizations. +/// +Pass *createOpenMPOptLegacyPass(); + //===----------------------------------------------------------------------===// /// createIPSCCPPass - This pass propagates constants from call sites into the /// bodies of functions, and keeps track of whether basic blocks are executable diff --git a/llvm/lib/Transforms/IPO/CMakeLists.txt b/llvm/lib/Transforms/IPO/CMakeLists.txt --- a/llvm/lib/Transforms/IPO/CMakeLists.txt +++ b/llvm/lib/Transforms/IPO/CMakeLists.txt @@ -25,6 +25,7 @@ LoopExtractor.cpp LowerTypeTests.cpp MergeFunctions.cpp + OpenMPOpt.cpp PartialInlining.cpp PassManagerBuilder.cpp PruneEH.cpp diff --git a/llvm/lib/Transforms/IPO/IPO.cpp b/llvm/lib/Transforms/IPO/IPO.cpp --- a/llvm/lib/Transforms/IPO/IPO.cpp +++ b/llvm/lib/Transforms/IPO/IPO.cpp @@ -35,6 +35,7 @@ initializeGlobalSplitPass(Registry); initializeHotColdSplittingLegacyPassPass(Registry); initializeIPCPPass(Registry); + initializeOpenMPOptLegacyPass(Registry); initializeAlwaysInlinerLegacyPassPass(Registry); initializeSimpleInlinerPass(Registry); initializeInferFunctionAttrsLegacyPassPass(Registry); diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -0,0 +1,784 @@ +//===-- IPO/OpenMPOpt.cpp - Collection of OpenMP specific optimizations ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// OpenMP specific optimizations
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "openmp-opt"
+
+static cl::opt<bool> BuildCustomStateMachines(
+    "openmp-opt-kernel-state-machines", cl::ZeroOrMore,
+    cl::desc("Build custom state machines for non-SPMD kernels."), cl::Hidden,
+    cl::init(true));
+
+static cl::opt<bool> PerformOpenMPSIMDIZATION(
+    "openmp-opt-kernel-simdization", cl::ZeroOrMore,
+    cl::desc("Convert non-SPMD kernels to SPMD mode if possible."), cl::Hidden,
+    cl::init(true));
+
+static cl::opt<bool> ForceOpenMPSIMDIZATION(
+    "openmp-opt-kernel-force-simdization", cl::ZeroOrMore,
+    cl::desc("Force execution of non-SPMD kernels in SPMD mode."), cl::Hidden,
+    cl::init(false));
+
+STATISTIC(NumKernelsConvertedToSPMD,
+          "Number of GPU kernels converted to SPMD mode");
+STATISTIC(NumParallelCallsConvertedToSPMD,
+          "Number of parallel GPU kernel regions converted to SPMD mode");
+STATISTIC(NumKernelsNonSPMDNoParallelism,
+          "Number of GPU kernels in non-SPMD mode without parallelism");
+STATISTIC(NumCustomStateMachinesCreated,
+          "Number of custom GPU kernel non-SPMD mode state machines created");
+STATISTIC(NumCustomStateMachinesNoFallback,
+          "Number of custom GPU kernel non-SPMD mode state machines without "
+          "fallback");
+
+namespace {
+
+/// Set of constants that describe the positions of arguments (ARG_FN_NAME) and
+/// the meaning of return values (RET_FN_MEANING) for the target region kernel
+/// interface. Has to be kept in sync with
+/// openmp/libomptarget/deviceRTLs/common/target_region.h
+/// and the respective implementations.
+enum {
+  ARG_INIT_USE_SPMD_MODE = 0,
+  ARG_INIT_REQUIRES_OMP_RUNTIME = 1,
+  ARG_INIT_USE_STATE_MACHINE = 2,
+  ARG_INIT_REQUIRES_DATA_SHARING = 3,
+
+  ARG_DEINIT_USE_SPMD_MODE = 0,
+  ARG_DEINIT_REQUIRES_OMP_RUNTIME = 1,
+
+  ARG_PARALLEL_USE_SPMD_MODE = 0,
+  ARG_PARALLEL_REQUIRES_OMP_RUNTIME = 1,
+  ARG_PARALLEL_WORK_FUNCTION = 2,
+  ARG_PARALLEL_SHARED_VARS = 3,
+  ARG_PARALLEL_SHARED_VARS_BYTES = 4,
+
+  RET_INIT_IS_WORKER = -1,
+  RET_INIT_IS_SURPLUS = 0,
+  RET_INIT_IS_MASTER = 1,
+};
+
+/// A macro list to represent known functions from the omp, __kmpc, and target
+/// region interfaces. The first value is an enum identifier, see FunctionID,
+/// the second value is the function name, and the third is the expected number
+/// of arguments.
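+///
+/// To illustrate the X-macro technique used here (a sketch, not an interface
+/// change): expanding KNOWN_FUNCTIONS() with
+///   #define KF(NAME, STR, NARGS) NAME,
+/// produces the FunctionID enumerators below, while expanding it with
+///   #define KF(NAME, STR, NARGS) .Case(STR, NAME)
+/// produces one StringSwitch case per known function in getFunctionID().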
+#define KNOWN_FUNCTIONS()                                                      \
+  KF(FID_OMP_GET_TEAM_NUM, "omp_get_team_num", 0)                              \
+  KF(FID_OMP_GET_NUM_TEAMS, "omp_get_num_teams", 0)                            \
+  KF(FID_OMP_GET_THREAD_NUM, "omp_get_thread_num", 0)                          \
+  KF(FID_OMP_GET_NUM_THREADS, "omp_get_num_threads", 0)                        \
+  KF(FID_OMP_SET_NUM_THREADS, "omp_set_num_threads", 1)                        \
+  KF(FID_KMPC_TREGION_KERNEL_INIT, "__kmpc_target_region_kernel_init", 4)      \
+  KF(FID_KMPC_TREGION_KERNEL_DEINIT, "__kmpc_target_region_kernel_deinit", 2)  \
+  KF(FID_KMPC_TREGION_KERNEL_PARALLEL, "__kmpc_target_region_kernel_parallel", \
+     8)                                                                        \
+  KF(FID_KMPC_FOR_STATIC_INIT_4, "__kmpc_for_static_init_4", 9)                \
+  KF(FID_KMPC_FOR_STATIC_FINI, "__kmpc_for_static_fini", 2)                    \
+  KF(FID_KMPC_GLOBAL_THREAD_NUM, "__kmpc_global_thread_num", 1)                \
+  KF(FID_KMPC_DISPATCH_INIT_4, "__kmpc_dispatch_init_4", 7)                    \
+  KF(FID_KMPC_DISPATCH_NEXT_4, "__kmpc_dispatch_next_4", 6)
+
+/// An identifier enum for each known function as well as the different kinds
+/// of unknown functions we distinguish.
+enum FunctionID {
+#define KF(NAME, STR, NARGS) NAME,
+  KNOWN_FUNCTIONS()
+#undef KF
+  // Unknown functions
+  //{
+  FID_KMPC_UNKNOWN, ///< unknown __kmpc_XXXX function
+  FID_OMP_UNKNOWN,  ///< unknown omp_XXX function
+  FID_NVVM_UNKNOWN, ///< unknown llvm.nvvm.XXX function
+  FID_LLVM_UNKNOWN, ///< unknown llvm.XXX function
+  FID_UNKNOWN       ///< unknown function without known prefix.
+  //}
+};
+
+static FunctionID getFunctionID(Function *F) {
+  if (!F)
+    return FID_UNKNOWN;
+#define KF(NAME, STR, NARGS) .Case(STR, NAME)
+  return StringSwitch<FunctionID>(F->getName()) KNOWN_FUNCTIONS()
+      .StartsWith("__kmpc_", FID_KMPC_UNKNOWN)
+      .StartsWith("omp_", FID_OMP_UNKNOWN)
+      .StartsWith("llvm.nvvm.", FID_NVVM_UNKNOWN)
+      .StartsWith("llvm.", FID_LLVM_UNKNOWN)
+      .Default(FID_UNKNOWN);
+#undef KF
+}
+
+static Type *getOrCreateStructIdentTypePtr(Module &M) {
+  // TODO: Create the type if it is not present!
+  return M.getTypeByName("struct.ident_t")->getPointerTo();
+}
+
+// TODO: Simplify the function declaration helpers below.
+static Function *getOrCreateFn(Type *RT, const char *Name, Module &M) {
+  Function *Fn = M.getFunction(Name);
+  if (!Fn) {
+    FunctionType *FType = FunctionType::get(RT, {}, false);
+    Fn =
+        Function::Create(FType, llvm::GlobalVariable::ExternalLinkage, Name, M);
+  }
+  return Fn;
+}
+static Function *getOrCreateFn(Type *RT, Type *T0, Type *T1, const char *Name,
+                               Module &M) {
+  Function *Fn = M.getFunction(Name);
+  if (!Fn) {
+    FunctionType *FType = FunctionType::get(RT, {T0, T1}, false);
+    Fn =
+        Function::Create(FType, llvm::GlobalVariable::ExternalLinkage, Name, M);
+  }
+  return Fn;
+}
+
+static Function *getOrCreateSimpleSPMDBarrierFn(Module &M) {
+  static const char *Name = "__kmpc_barrier_simple_spmd";
+  Function *Fn = M.getFunction(Name);
+  if (!Fn) {
+    LLVMContext &Ctx = M.getContext();
+    FunctionType *FType = FunctionType::get(
+        Type::getVoidTy(Ctx),
+        {getOrCreateStructIdentTypePtr(M), Type::getInt32Ty(Ctx)}, false);
+    Fn =
+        Function::Create(FType, llvm::GlobalVariable::ExternalLinkage, Name, M);
+  }
+  return Fn;
+}
+
+/// A helper class to introduce smart guarding code.
+struct GuardGenerator {
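+
+  // In SPMD mode all threads execute the code between parallel regions, while
+  // in non-SPMD mode only the team master does. A registered side effect that
+  // must not be repeated therefore needs a guard once the kernel is moved to
+  // SPMD mode, conceptually (a sketch; introduceGuards() does not emit this
+  // yet):
+  //
+  //   if (omp_get_thread_num() == 0)
+  //     <side-effect instruction>;
+  //   __kmpc_barrier_simple_spmd(...);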
+
+  /// Inform the guard generator about the side-effect instructions collected
+  /// in @p SideEffectInst.
+  ///
+  /// \returns true if all registered side effects can be (efficiently)
+  /// guarded.
+  bool registerSideEffects(SmallVectorImpl<Instruction *> &SideEffectInst) {
+    bool Guarded = true;
+    if (SideEffectInst.empty())
+      return Guarded;
+
+    const Module &M = *SideEffectInst.front()->getModule();
+    const DataLayout &DL = M.getDataLayout();
+
+    SmallVector<Instruction *, 8> UnguardedSideEffectInst;
+    for (Instruction *I : SideEffectInst) {
+      if (CallInst *CI = dyn_cast<CallInst>(I)) {
+        if (getFunctionID(CI->getCalledFunction()) != FID_UNKNOWN)
+          continue;
+      } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+        // Stores into (thread-private) stack slots do not need a guard.
+        if (isa<AllocaInst>(
+                SI->getPointerOperand()->stripInBoundsConstantOffsets()))
+          continue;
+      } else if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+        if (isSafeToLoadUnconditionally(LI->getPointerOperand(),
+                                        LI->getAlignment(), DL))
+          continue;
+      }
+      LLVM_DEBUG(dbgs() << "Non-SPMD side effect found: " << *I << "\n");
+      UnguardedSideEffectInst.push_back(I);
+    }
+
+    return UnguardedSideEffectInst.empty();
+  }
+
+  bool registerReadEffects(SmallVectorImpl<Instruction *> &ReadEffectInst) {
+    return registerSideEffects(ReadEffectInst);
+  }
+
+  void introduceGuards() {
+    // TODO: The guard generator cannot introduce guards yet but the
+    //       registerXXX functions above are aware of that!
+  }
+};
+
+/// Helper structure to represent and work with a target region kernel.
+struct KernelTy {
+
+  KernelTy(Function *KernelFn) : KernelFn(*KernelFn) {}
+
+  /// Optimize this kernel, return true if something was done.
+  bool optimize();
+
+private:
+  /// Analyze this kernel, return true if successful.
+  bool analyze(Function &F, SmallPtrSetImpl<Function *> &Visited,
+               bool InParallelRegion);
+
+  /// Return true if the kernel is executed in SPMD mode.
+  bool isExecutedInSPMDMode();
+
+  /// Convert a non-SPMD mode kernel to SPMD mode, return true if successful.
+  bool convertToSPMD();
+
+  /// Create a custom state machine in the module, return true if successful.
+  bool createCustomStateMachine();
+
+  /// All side-effect instructions potentially executed in this kernel.
+  SmallVector<Instruction *, 8> SideEffectInst;
+
+  /// All read-only instructions potentially executed in this kernel.
+  SmallVector<Instruction *, 8> ReadOnlyInst;
+
+  /// All non-analyzed calls contained in this kernel, separated by their
+  /// function ID which identifies known calls.
+  SmallVector<CallInst *, 4> KernelCalls[FID_UNKNOWN + 1];
+
+  /// All non-analyzed calls contained in parallel regions which are part of
+  /// this kernel, separated by their function ID which identifies known
+  /// calls.
+  SmallVector<CallInst *, 4> ParallelRegionCalls[FID_UNKNOWN + 1];
+
+  /// The entry function of this kernel.
+  Function &KernelFn;
+};
+
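+// In canonical form a target region kernel contains exactly one init and one
+// deinit call, with the init call in the entry block, e.g. (taken from the
+// accompanying tests):
+//
+//   %0 = call i8 @__kmpc_target_region_kernel_init(i1 false, i1 true, i1 true, i1 true)
+//   ...
+//   call void @__kmpc_target_region_kernel_deinit(i1 false, i1 true)
+//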
+bool KernelTy::analyze(Function &F, SmallPtrSetImpl<Function *> &Visited,
+                       bool InParallelRegion) {
+  if (!Visited.insert(&F).second)
+    return true;
+
+  LLVM_DEBUG(dbgs() << "Analyze "
+                    << (InParallelRegion ? "parallel-region" : "kernel")
+                    << " function: " << F.getName() << "\n");
+
+  // Determine where we remember the call.
+  auto &CallsArray = InParallelRegion ? ParallelRegionCalls : KernelCalls;
+
+  for (Instruction &I : instructions(&F)) {
+
+    // In parallel regions we only look for calls; outside, we look for all
+    // side-effect and read-only instructions.
+    if (!InParallelRegion) {
+      // Handle non-side-effect instructions first. These will not write or
+      // throw, which makes reading the only interesting potential property.
+      if (!I.mayHaveSideEffects()) {
+        if (I.mayReadFromMemory()) {
+          LLVM_DEBUG(dbgs() << "- read-only: " << I << "\n");
+          ReadOnlyInst.push_back(&I);
+        }
+        continue;
+      }
+
+      // Now we handle all non-call instructions.
+      if (!isa<CallInst>(I)) {
+        LLVM_DEBUG(dbgs() << "- side-effect: " << I << "\n");
+        SideEffectInst.push_back(&I);
+        continue;
+      }
+    }
+
+    if (!isa<CallInst>(I))
+      continue;
+
+    CallInst &CI = cast<CallInst>(I);
+    Function *Callee = CI.getCalledFunction();
+
+    // For exact definitions we recurse.
+    if (Callee && !Callee->isDeclaration() && Callee->isDefinitionExact()) {
+      // If the recursive analysis fails we bail, otherwise the
+      // information was collected in the internal state.
+      if (!analyze(*Callee, Visited, InParallelRegion))
+        return false;
+      continue;
+    }
+
+    // Check that known functions have the right number of arguments early on.
+    // Additionally provide debug output based on the function ID.
+    FunctionID ID = getFunctionID(Callee);
+
+    switch (ID) {
+#define KF(NAME, STR, NARGS)                                                   \
+  case NAME:                                                                   \
+    LLVM_DEBUG(                                                                \
+        dbgs() << "- known call "                                              \
+               << (CI.getNumArgOperands() != NARGS ? "[#arg mismatch!]" : "")  \
+               << ": " << I << "\n");                                          \
+    if (CI.getNumArgOperands() != NARGS)                                       \
+      ID = FID_UNKNOWN;                                                        \
+    break;
+      KNOWN_FUNCTIONS()
+#undef KF
+    case FID_KMPC_UNKNOWN:
+      LLVM_DEBUG(dbgs() << "- unknown __kmpc_* call: " << I << "\n");
+      break;
+    case FID_OMP_UNKNOWN:
+      LLVM_DEBUG(dbgs() << "- unknown omp_* call: " << I << "\n");
+      break;
+    case FID_NVVM_UNKNOWN:
+      LLVM_DEBUG(dbgs() << "- unknown llvm.nvvm.* call: " << I << "\n");
+      break;
+    case FID_LLVM_UNKNOWN:
+      LLVM_DEBUG(dbgs() << "- unknown llvm.* call: " << I << "\n");
+      break;
+    case FID_UNKNOWN:
+      LLVM_DEBUG(dbgs() << "- unknown call: " << I << "\n");
+      break;
+    }
+
+    CallsArray[ID].push_back(&CI);
+  }
+
+  // If we did not analyze the kernel function but some other one down the call
+  // chain, we are done now.
+  // TODO: Add more verification code here.
+  if (&F != &KernelFn)
+    return true;
+
+  assert(&KernelCalls == &CallsArray);
+
+  // If we are analyzing the kernel function we need to verify we have at least
+  // the calls we expect to see in the right places.
+  if (KernelCalls[FID_KMPC_TREGION_KERNEL_INIT].size() != 1 ||
+      KernelCalls[FID_KMPC_TREGION_KERNEL_DEINIT].size() != 1 ||
+      KernelCalls[FID_KMPC_TREGION_KERNEL_INIT].front()->getParent() !=
+          &F.getEntryBlock()) {
+    LLVM_DEBUG(dbgs() << "- malformed kernel: [#Init: "
+                      << KernelCalls[FID_KMPC_TREGION_KERNEL_INIT].size()
+                      << "][#DeInit: "
+                      << KernelCalls[FID_KMPC_TREGION_KERNEL_DEINIT].size()
+                      << "]\n");
+    return false;
+  }
+
+  return true;
+}
+
+bool KernelTy::isExecutedInSPMDMode() {
+  assert(KernelCalls[FID_KMPC_TREGION_KERNEL_INIT].size() == 1 &&
+         "Non-canonical kernel form!");
+  auto *SPMDFlag = cast<ConstantInt>(
+      KernelCalls[FID_KMPC_TREGION_KERNEL_INIT].front()->getArgOperand(
+          ARG_INIT_USE_SPMD_MODE));
+  assert(SPMDFlag->isZeroValue() || SPMDFlag->isOneValue());
+  return SPMDFlag->isOneValue();
+}
+
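+// The arguments of the init call encode the kernel configuration that is
+// inspected and rewritten below; the parameter names are illustrative and
+// mirror the ARG_INIT_* positions above:
+//
+//   __kmpc_target_region_kernel_init(/* UseSPMDMode */         false,
+//                                    /* RequiresOMPRuntime */  true,
+//                                    /* UseStateMachine */     true,
+//                                    /* RequiresDataSharing */ true)
+//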
+bool KernelTy::optimize() {
+  bool Changed = false;
+
+  // First analyze the code. If that fails for some reason we bail out early.
+  SmallPtrSet<Function *, 8> Visited;
+  if (!analyze(KernelFn, Visited, /* InParallelRegion */ false))
+    return Changed;
+
+  Visited.clear();
+  for (CallInst *ParCI : KernelCalls[FID_KMPC_TREGION_KERNEL_PARALLEL]) {
+    Value *ParCIParallelFnArg =
+        ParCI->getArgOperand(ARG_PARALLEL_WORK_FUNCTION);
+    Function *ParallelFn =
+        dyn_cast<Function>(ParCIParallelFnArg->stripPointerCasts());
+    if (!ParallelFn ||
+        !analyze(*ParallelFn, Visited, /* InParallelRegion */ true))
+      return Changed;
+  }
+
+  Changed |= convertToSPMD();
+  Changed |= createCustomStateMachine();
+
+  return Changed;
+}
+
+bool KernelTy::convertToSPMD() {
+  if (isExecutedInSPMDMode())
+    return false;
+
+  // Use a generic guard generator to determine if suitable guards for all
+  // side effect instructions can be placed.
+  GuardGenerator GG;
+
+  // Check if SIMDIZATION is possible, in case it is not forced.
+  if (!ForceOpenMPSIMDIZATION) {
+    // Unknown calls are not handled yet and will cause us to bail.
+    if (!KernelCalls[FID_UNKNOWN].empty())
+      return false;
+
+    // If we cannot guard all side effect instructions, bail out.
+    if (!GG.registerSideEffects(SideEffectInst))
+      return false;
+
+    if (!GG.registerReadEffects(ReadOnlyInst))
+      return false;
+
+    // TODO: Emit a remark.
+    LLVM_DEBUG(dbgs() << "Transformation to SPMD OK\n");
+
+    // If SIMDIZATION is disabled we only emit the debug message and bail.
+    if (!PerformOpenMPSIMDIZATION)
+      return false;
+  }
+
+  // Actually emit the guard code after we decided to perform SIMDIZATION.
+  GG.introduceGuards();
+
+  // Create an "is-SPMD" flag.
+  Type *FlagTy = KernelCalls[FID_KMPC_TREGION_KERNEL_INIT][0]
+                     ->getArgOperand(ARG_INIT_USE_SPMD_MODE)
+                     ->getType();
+  Constant *SPMDFlag = ConstantInt::getTrue(FlagTy);
+
+  // Update the init and deinit calls with the "is-SPMD" flag to indicate
+  // SPMD mode.
+  assert(KernelCalls[FID_KMPC_TREGION_KERNEL_INIT].size() == 1 &&
+         "Non-canonical kernel form!");
+  assert(KernelCalls[FID_KMPC_TREGION_KERNEL_DEINIT].size() == 1 &&
+         "Non-canonical kernel form!");
+  KernelCalls[FID_KMPC_TREGION_KERNEL_INIT][0]->setArgOperand(
+      ARG_INIT_USE_SPMD_MODE, SPMDFlag);
+  KernelCalls[FID_KMPC_TREGION_KERNEL_DEINIT][0]->setArgOperand(
+      ARG_DEINIT_USE_SPMD_MODE, SPMDFlag);
+
+  // Use the simple barrier to synchronize all threads in SPMD mode after each
+  // parallel region.
+  Function *SimpleBarrierFn =
+      getOrCreateSimpleSPMDBarrierFn(*KernelFn.getParent());
+
+  // For each parallel region, identified by the
+  // __kmpc_target_region_kernel_parallel call, we set the "is-SPMD" flag and
+  // introduce a succeeding barrier call.
+  for (CallInst *ParCI : KernelCalls[FID_KMPC_TREGION_KERNEL_PARALLEL]) {
+    ParCI->setArgOperand(ARG_PARALLEL_USE_SPMD_MODE, SPMDFlag);
+    auto AI = SimpleBarrierFn->arg_begin();
+    CallInst::Create(SimpleBarrierFn,
+                     {Constant::getNullValue((AI++)->getType()),
+                      Constant::getNullValue((AI)->getType())},
+                     "", ParCI->getNextNode());
+  }
+
+  // TODO: Serialize nested parallel regions.
+
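+  // After this rewrite a parallel region inside the kernel looks like, e.g.:
+  //
+  //   call void @__kmpc_target_region_kernel_parallel(i1 true, ...)
+  //   call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+  //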
+  // Finally, we change the global exec_mode variable to indicate SPMD mode.
+  GlobalVariable *ExecMode = KernelFn.getParent()->getGlobalVariable(
+      (KernelFn.getName() + "_exec_mode").str());
+  assert(ExecMode &&
+         "Assumed to find an execution mode hint among the globals");
+  assert(ExecMode->getInitializer()->isOneValue() &&
+         "Assumed target_region execution mode prior to 'SPMD'-zation");
+  ExecMode->setInitializer(
+      Constant::getNullValue(ExecMode->getInitializer()->getType()));
+
+  // Bookkeeping
+  NumKernelsConvertedToSPMD++;
+  NumParallelCallsConvertedToSPMD +=
+      KernelCalls[FID_KMPC_TREGION_KERNEL_PARALLEL].size();
+
+  // The kernel was modified, report that back to the caller.
+  return true;
+}
+
+bool KernelTy::createCustomStateMachine() {
+  if (isExecutedInSPMDMode())
+    return false;
+
+  // Respect the flag that allows disabling custom state machines.
+  if (!BuildCustomStateMachines)
+    return false;
+
+  // TODO: Warn or eliminate the offloading if no parallel regions are present.
+  // TODO: Use reachability to eliminate the loop and if-cascade.
+  //
+  // The user module code looks as follows if this function returns true.
+  //
+  //   ThreadKind = __kmpc_target_region_kernel_init(...)
+  //   if (ThreadKind == -1) {         // actual worker thread
+  //     do {
+  //       __kmpc_barrier_simple_spmd(...);
+  //       void *WorkFn;
+  //       bool IsActive = __kmpc_kernel_parallel(&WorkFn, ...);
+  //       if (!WorkFn)
+  //         goto exit;
+  //       if (IsActive) {
+  //         char *SharedVars = __kmpc_target_region_kernel_get_shared_memory();
+  //         char *PrivateVars =
+  //             __kmpc_target_region_kernel_get_private_memory();
+  //
+  //         ((ParallelWorkFnTy)WorkFn)(SharedVars, PrivateVars);
+  //
+  //         __kmpc_kernel_end_parallel();
+  //       }
+  //       __kmpc_barrier_simple_spmd(...);
+  //     } while (true);
+  //   } else if (ThreadKind == 0) {   // surplus worker thread
+  //     goto exit;
+  //   } else {                        // team master thread
+  //     goto user_code;
+  //   }
+
+  if (KernelCalls[FID_KMPC_TREGION_KERNEL_PARALLEL].empty()) {
+    LLVM_DEBUG(dbgs() << "Will not build a custom state machine because there "
+                         "are no known parallel regions in the kernel.\n");
+    // TODO: If we also know there are no hidden parallel calls we can
+    //       terminate all but the master thread right away.
+    NumKernelsNonSPMDNoParallelism++;
+    return false;
+  }
+
+  assert(KernelCalls[FID_KMPC_TREGION_KERNEL_INIT].size() == 1 &&
+         "Non-canonical kernel form!");
+  CallInst *InitCI = KernelCalls[FID_KMPC_TREGION_KERNEL_INIT][0];
+
+  // Check if the generic state machine is still requested. If the flag is not
+  // a constant true, a custom state machine was already put in place (or the
+  // request is unknown) and we cannot do anything.
+  auto *UseSM =
+      dyn_cast<ConstantInt>(InitCI->getArgOperand(ARG_INIT_USE_STATE_MACHINE));
+  if (!UseSM || !UseSM->isOne()) {
+    LLVM_DEBUG(dbgs() << "Will not build a custom state machine because of "
+                      << *InitCI << "\n");
+    return false;
+  }
+
+  InitCI->setName("thread_kind");
+  LLVMContext &Ctx = InitCI->getContext();
+
+  // Create local storage for the work function pointer.
+  Type *VoidPtrTy = Type::getInt8PtrTy(Ctx);
+  AllocaInst *WorkFnAI = new AllocaInst(VoidPtrTy, 0, "work_fn.addr",
+                                        &KernelFn.getEntryBlock().front());
+
+  Instruction *IP = InitCI->getNextNode();
+
+  Type *FlagTy = InitCI->getArgOperand(ARG_INIT_USE_STATE_MACHINE)->getType();
+  Constant *SMFlag = ConstantInt::getFalse(FlagTy);
+  InitCI->setArgOperand(ARG_INIT_USE_STATE_MACHINE, SMFlag);
+
+  // Check the return value of __kmpc_target_region_kernel_init. First compare
+  // it to RET_INIT_IS_WORKER.
+  Instruction *WorkerCnd = new ICmpInst(
+      IP, ICmpInst::ICMP_EQ, InitCI,
+      ConstantInt::getSigned(InitCI->getType(), RET_INIT_IS_WORKER),
+      "is_worker");
+
+  // Create the conditional which is entered by worker threads.
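+  // After the split the CFG looks roughly as follows (block names are
+  // assigned below):
+  //
+  //   entry:          ; contains the init call and %is_worker
+  //     br i1 %is_worker, label %worker.wait, label %master_check
+  //   worker.wait:    ; becomes the worker state machine loop
+  //     br label %master_check  ; rewired into a back edge at the end
+  //   master_check:   ; distinguishes surplus and master threads
+  //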
+  Instruction *WaitTI = SplitBlockAndInsertIfThen(WorkerCnd, IP, false);
+  BasicBlock *WaitBB = WaitTI->getParent();
+  WaitBB->setName("worker.wait");
+  IP->getParent()->setName("master_check");
+
+  Instruction *MasterCheckTI = IP->getParent()->getTerminator();
+  assert(MasterCheckTI->getNumSuccessors() == 2);
+  assert(WaitTI->getNumSuccessors() == 1);
+
+  // Determine the final block, that is the trivial one in which the kernel
+  // ends.
+  BasicBlock *FinalBB = nullptr;
+  if (MasterCheckTI->getSuccessor(0)->size() == 1 &&
+      isa<ReturnInst>(MasterCheckTI->getSuccessor(0)->getTerminator()))
+    FinalBB = MasterCheckTI->getSuccessor(0);
+  else if (MasterCheckTI->getSuccessor(1)->size() == 1 &&
+           isa<ReturnInst>(MasterCheckTI->getSuccessor(1)->getTerminator()))
+    FinalBB = MasterCheckTI->getSuccessor(1);
+  assert(FinalBB && "Could not determine the final kernel block.");
+
+  // Use the simple barrier to synchronize all threads in the state machine
+  // loop with the team master.
+  Module &M = *KernelFn.getParent();
+  Function *SimpleBarrierFn = getOrCreateSimpleSPMDBarrierFn(M);
+
+  auto AI = SimpleBarrierFn->arg_begin();
+  Instruction *BarrierCall =
+      CallInst::Create(SimpleBarrierFn,
+                       {Constant::getNullValue((AI++)->getType()),
+                        Constant::getNullValue((AI)->getType())},
+                       "", WaitTI);
+
+  Function *KernelParallelFn =
+      getOrCreateFn(Type::getInt1Ty(Ctx), VoidPtrTy->getPointerTo(),
+                    Type::getInt16Ty(Ctx), "__kmpc_kernel_parallel", M);
+
+  Value *RequiresOMPRuntime = CastInst::CreateZExtOrBitCast(
+      InitCI->getArgOperand(ARG_INIT_REQUIRES_OMP_RUNTIME),
+      Type::getInt16Ty(Ctx), "", WaitTI);
+  Instruction *ActiveCnd = CallInst::Create(
+      KernelParallelFn, {WorkFnAI, RequiresOMPRuntime}, "is_active", WaitTI);
+
+  Type *WorkFnPrototype =
+      FunctionType::get(Type::getVoidTy(Ctx), {VoidPtrTy, VoidPtrTy}, false)
+          ->getPointerTo();
+  Value *WorkFnAICast = BitCastInst::CreatePointerBitCastOrAddrSpaceCast(
+      WorkFnAI, WorkFnPrototype->getPointerTo(), "work_fn.addr_cast", WaitTI);
+  Value *WorkFn = new LoadInst(WorkFnAICast, "work_fn", WaitTI);
+
+  Instruction *WorkFnCnd =
+      new ICmpInst(WaitTI, ICmpInst::ICMP_EQ, WorkFn,
+                   Constant::getNullValue(WorkFn->getType()), "no_work");
+
+  Instruction *FinishedTI = SplitBlockAndInsertIfThen(WorkFnCnd, WaitTI, false);
+  FinishedTI->getParent()->setName("worker.finished");
+  WaitTI->getParent()->setName("worker.active_check");
+
+  Instruction *ActiveTI = SplitBlockAndInsertIfThen(ActiveCnd, WaitTI, false);
+  ActiveTI->getParent()->setName("worker.active");
+  WaitTI->getParent()->setName("worker.inactive");
+
+  Function *KernelGetSharedVars = getOrCreateFn(
+      VoidPtrTy, "__kmpc_target_region_kernel_get_shared_memory", M);
+  Value *SharedVars = CallInst::Create(KernelGetSharedVars, "", ActiveTI);
+  Function *KernelGetPrivateVars = getOrCreateFn(
+      VoidPtrTy, "__kmpc_target_region_kernel_get_private_memory", M);
+  Value *PrivateVars = CallInst::Create(KernelGetPrivateVars, "", ActiveTI);
+
+  BasicBlock *ExecuteBB = ActiveTI->getParent();
+  BasicBlock *ParallelEndBB = SplitBlock(ExecuteBB, ActiveTI);
+  ParallelEndBB->setName("worker.parallel_end");
+
+  Function *KernelEndParallelFn =
+      getOrCreateFn(Type::getVoidTy(Ctx), "__kmpc_kernel_end_parallel", M);
+  CallInst::Create(KernelEndParallelFn, "", ActiveTI);
+
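+  // The code below builds an if-cascade over all known parallel work
+  // functions, e.g., for two known regions (wrapper names taken from the
+  // accompanying tests):
+  //
+  //   if (WorkFn == @.omp_TRegion._wrapper)
+  //     .omp_TRegion._wrapper(SharedVars, PrivateVars);
+  //   else if (WorkFn == @.omp_TRegion.1_wrapper)
+  //     .omp_TRegion.1_wrapper(SharedVars, PrivateVars);
+  //   else
+  //     WorkFn(SharedVars, PrivateVars); // emitted only if a fallback is
+  //                                      // required
+  //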
+  // A fallback is required if we might not see all parallel regions
+  // (__kmpc_target_region_kernel_parallel calls). This could be the case if
+  // there is an unknown function call with side effects in the target region
+  // or inside one of the parallel regions.
+  bool RequiresFallback = !KernelCalls[FID_UNKNOWN].empty() ||
+                          !ParallelRegionCalls[FID_UNKNOWN].empty();
+
+  // Collect all target region parallel calls
+  // (__kmpc_target_region_kernel_parallel).
+  SmallVector<CallInst *, 8> KernelParallelCalls;
+  KernelParallelCalls.append(
+      KernelCalls[FID_KMPC_TREGION_KERNEL_PARALLEL].begin(),
+      KernelCalls[FID_KMPC_TREGION_KERNEL_PARALLEL].end());
+  KernelParallelCalls.append(
+      ParallelRegionCalls[FID_KMPC_TREGION_KERNEL_PARALLEL].begin(),
+      ParallelRegionCalls[FID_KMPC_TREGION_KERNEL_PARALLEL].end());
+
+  IP = ExecuteBB->getTerminator();
+
+  // For each parallel call create a conditional that compares the work
+  // function against the known parallel work function of that call, if
+  // available. If the function pointers are equal we call the known work
+  // function directly and continue to the end of the if-cascade.
+  for (CallInst *ParCI : KernelParallelCalls) {
+    Function *ParFn = dyn_cast<Function>(
+        ParCI->getArgOperand(ARG_PARALLEL_WORK_FUNCTION)->stripPointerCasts());
+    if (!ParFn) {
+      LLVM_DEBUG(
+          dbgs() << "Require fallback due to unknown parallel function\n");
+      RequiresFallback = true;
+      continue;
+    }
+
+    Value *ParFnCnd =
+        new ICmpInst(IP, ICmpInst::ICMP_EQ, WorkFn, ParFn, "par_fn_check");
+    Instruction *ParFnTI = SplitBlockAndInsertIfThen(ParFnCnd, IP, false);
+    IP->getParent()->setName("worker.check.next");
+    ParFnTI->getParent()->setName("worker.execute." + ParFn->getName());
+    CallInst::Create(ParFn, {SharedVars, PrivateVars}, "", ParFnTI);
+    ParFnTI->setSuccessor(0, ParallelEndBB);
+  }
+
+  // If a fallback is required we emit an indirect call before we jump to the
+  // point where all cases converge.
+  if (RequiresFallback)
+    CallInst::Create(WorkFn, {SharedVars, PrivateVars}, "", IP);
+
+  // Insert a barrier call at the convergence point, right before the back
+  // edge.
+  BarrierCall->clone()->insertBefore(WaitTI);
+
+  // Rewire the CFG edges to introduce the back and exit edges of the new loop.
+  // TODO: Add the new loop to LI!
+  FinishedTI->setSuccessor(0, FinalBB);
+  WaitTI->setSuccessor(0, WaitBB);
+
+  // Bookkeeping.
+  NumCustomStateMachinesCreated++;
+  NumCustomStateMachinesNoFallback += !RequiresFallback;
+
+  return true;
+}
+
+template <typename T>
+static void collectCallersOf(Module &M, StringRef Name,
+                             SmallVectorImpl<T> &Callers) {
+  Function *Callee = M.getFunction(Name);
+
+  // If the callee function is not present, we are done.
+  if (!Callee)
+    return;
+
+  // If it exists we check all users.
+  for (const Use &U : Callee->uses()) {
+    CallSite CS(U.getUser());
+
+    // Filter out non-callee uses and non-call uses.
+    if (!CS || !CS.isCallee(&U) || !isa<CallInst>(CS.getInstruction()))
+      continue;
+
+    // Found a caller; use it to create an object of type T and put the result
+    // in the Callers vector.
+    Callers.emplace_back(T(CS.getCaller()));
+  }
+}
+
+/// OpenMPOpt - The interprocedural OpenMP optimization pass implementation.
+struct OpenMPOpt {
+
+  bool runOnModule(Module &M) {
+    bool Changed = false;
+
+    // Collect target region kernels identified by a call to
+    // __kmpc_target_region_kernel_init.
+    collectCallersOf(M, "__kmpc_target_region_kernel_init", TRKernels);
+
+    for (KernelTy &K : TRKernels)
+      Changed |= K.optimize();
+
+    return Changed;
+  }
+
+private:
+  /// A collection of all target region kernels we found.
+  SmallVector<KernelTy, 4> TRKernels;
+};
+
+// TODO: This could be a CGSCC pass as well.
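+//
+// The pass is registered as "openmp-opt" and can be exercised in isolation,
+// e.g.:
+//
+//   opt -openmp-opt -stats -disable-output input.ll
+//
+// (see llvm/test/Transforms/OpenMP/no_SPMD_mode.ll).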
+struct OpenMPOptLegacy : public ModulePass {
+  static char ID; // Pass identification, replacement for typeid
+  OpenMPOpt OMPOpt;
+
+  OpenMPOptLegacy() : ModulePass(ID) {
+    initializeOpenMPOptLegacyPass(*PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {}
+
+  bool runOnModule(Module &M) override { return OMPOpt.runOnModule(M); }
+};
+
+// TODO: Add a new PM entry point.
+
+} // namespace
+
+char OpenMPOptLegacy::ID = 0;
+
+INITIALIZE_PASS_BEGIN(OpenMPOptLegacy, "openmp-opt",
+                      "OpenMP specific optimizations", false, false)
+INITIALIZE_PASS_END(OpenMPOptLegacy, "openmp-opt",
+                    "OpenMP specific optimizations", false, false)
+
+Pass *llvm::createOpenMPOptLegacyPass() { return new OpenMPOptLegacy(); }
diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
--- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -517,6 +517,10 @@
   // Infer attributes about declarations if possible.
   MPM.add(createInferFunctionAttrsLegacyPass());
 
+  // Try to perform OpenMP specific optimizations. This is a no-op if OpenMP
+  // runtime calls are not present in the module.
+  MPM.add(createOpenMPOptLegacyPass());
+
   addExtensionsToPM(EP_ModuleOptimizerEarly, MPM);
 
   if (OptLevel > 2)
diff --git a/llvm/test/Other/opt-O2-pipeline.ll b/llvm/test/Other/opt-O2-pipeline.ll
--- a/llvm/test/Other/opt-O2-pipeline.ll
+++ b/llvm/test/Other/opt-O2-pipeline.ll
@@ -27,6 +27,7 @@
 ; CHECK-NEXT: ModulePass Manager
 ; CHECK-NEXT: Force set function attributes
 ; CHECK-NEXT: Infer set function attributes
+; CHECK-NEXT: OpenMP specific optimizations
 ; CHECK-NEXT: Interprocedural Sparse Conditional Constant Propagation
 ; CHECK-NEXT: Unnamed pass: implement Pass::getPassName()
 ; CHECK-NEXT: Called Value Propagation
diff --git a/llvm/test/Other/opt-O3-pipeline.ll b/llvm/test/Other/opt-O3-pipeline.ll
--- a/llvm/test/Other/opt-O3-pipeline.ll
+++ b/llvm/test/Other/opt-O3-pipeline.ll
@@ -27,6 +27,7 @@
 ; CHECK-NEXT: ModulePass Manager
 ; CHECK-NEXT: Force set function attributes
 ; CHECK-NEXT: Infer set function attributes
+; CHECK-NEXT: OpenMP specific optimizations
 ; CHECK-NEXT: FunctionPass Manager
 ; CHECK-NEXT: Dominator Tree Construction
 ; CHECK-NEXT: Call-site splitting
diff --git a/llvm/test/Other/opt-Os-pipeline.ll b/llvm/test/Other/opt-Os-pipeline.ll
--- a/llvm/test/Other/opt-Os-pipeline.ll
+++ b/llvm/test/Other/opt-Os-pipeline.ll
@@ -27,6 +27,7 @@
 ; CHECK-NEXT: ModulePass Manager
 ; CHECK-NEXT: Force set function attributes
 ; CHECK-NEXT: Infer set function attributes
+; CHECK-NEXT: OpenMP specific optimizations
 ; CHECK-NEXT: Interprocedural Sparse Conditional Constant Propagation
 ; CHECK-NEXT: Unnamed pass: implement Pass::getPassName()
 ; CHECK-NEXT: Called Value Propagation
diff --git a/llvm/test/Transforms/OpenMP/no_SPMD_mode.ll b/llvm/test/Transforms/OpenMP/no_SPMD_mode.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/OpenMP/no_SPMD_mode.ll
@@ -0,0 +1,1064 @@
+; RUN: opt < %s -openmp-opt -stats -disable-output 2>&1 | FileCheck %s --check-prefix=STATS
+; RUN: opt < %s -openmp-opt -S 2>&1 | FileCheck %s
+;
+; REQUIRES: asserts
+;
+; Check that we will not execute any of the below target regions in SPMD mode.
+; TODO: SPMD mode is valid for target regions 2 and 3 if proper guarding code is inserted.
+;
+; See the to_SPMD_mode.ll file for almost the same functions that can be translated to SPMD mode.
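+;
+; Kernel overview (derived from the statistics and checks below):
+;  - loop_in_loop_in_tregion: no parallel region; accounts for the
+;    "without parallelism" statistic.
+;  - parallel_loops_and_accesses_in_tregion and
+;    parallel_loop_in_function_in_loop_with_global_acc_in_tregion: custom
+;    state machines without fallback.
+;  - parallel_loops_in_functions_and_extern_func_in_tregion: custom state
+;    machine with a fallback because of the call to extern_func.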
+;
+; STATS-DAG: 1 openmp-opt - Number of GPU kernels in non-SPMD mode without parallelism
+; STATS-DAG: 3 openmp-opt - Number of custom GPU kernel non-SPMD mode state machines created
+; STATS-DAG: 2 openmp-opt - Number of custom GPU kernel non-SPMD mode state machines without fallback
+;
+; No state machine needed because there is no parallel region.
+; CHECK: void @{{.*}}loop_in_loop_in_tregion
+; CHECK: call i8 @__kmpc_target_region_kernel_init(i1 false, i1 {{[a-z]*}}, i1 true
+; CHECK: call void @__kmpc_target_region_kernel_deinit(i1 false,
+;
+; void loop_in_loop_in_tregion(int *A, int *B) {
+; #pragma omp target
+;   for (int i = 0; i < 512; i++) {
+;     for (int j = 0; j < 1024; j++)
+;       A[j] += B[i + j];
+;   }
+; }
+;
+;
+; Custom state machine needed but no fallback because all parallel regions are known.
+; CHECK: void @{{.*}}parallel_loops_and_accesses_in_tregion
+; CHECK: call i8 @__kmpc_target_region_kernel_init(i1 false, i1 {{[a-z]*}}, i1 false
+; The "check.next" block should not contain a fallback call.
+; CHECK: worker.check.next4:
+; CHECK-NEXT: br label %worker.parallel_end
+; CHECK: call void @__kmpc_target_region_kernel_deinit(i1 false,
+;
+; void parallel_loops_and_accesses_in_tregion(int *A, int *B) {
+; #pragma omp target
+;   {
+; #pragma omp parallel for
+;     for (int j = 0; j < 1024; j++)
+;       A[j] += B[0 + j];
+; #pragma omp parallel for
+;     for (int j = 0; j < 1024; j++)
+;       A[j] += B[1 + j];
+; #pragma omp parallel for
+;     for (int j = 0; j < 1024; j++)
+;       A[j] += B[2 + j];
+;
+;     // This needs a guard in SPMD mode.
+;     A[0] = B[0];
+;   }
+; }
+;
+; void extern_func();
+; static void parallel_loop(int *A, int *B, int i) {
+; #pragma omp parallel for
+;   for (int j = 0; j < 1024; j++)
+;     A[j] += B[i + j];
+; }
+;
+; int Global[512];
+;
+;
+; Custom state machine needed but no fallback because all parallel regions are known.
+; CHECK: void @{{.*}}parallel_loop_in_function_in_loop_with_global_acc_in_tregion
+; CHECK: call i8 @__kmpc_target_region_kernel_init(i1 false, i1 {{[a-z]*}}, i1 false
+; The "check.next" block should not contain a fallback call.
+; CHECK: worker.check.next:
+; CHECK-NEXT: br label %worker.parallel_end
+; CHECK: call void @__kmpc_target_region_kernel_deinit(i1 false,
+;
+; void parallel_loop_in_function_in_loop_with_global_acc_in_tregion(int *A, int *B) {
+; #pragma omp target
+;   for (int i = 0; i < 512; i++) {
+;     parallel_loop(A, B, i);
+;     Global[i]++;
+;   }
+; }
+;
+; Custom state machine needed with fallback because "extern_func" might contain parallel regions.
+; CHECK: void @{{.*}}parallel_loops_in_functions_and_extern_func_in_tregion +; CHECK: call i8 @__kmpc_target_region_kernel_init(i1 false, i1 {{[a-z]*}}, i1 false +; The "check.next" block should contain a fallback call +; CHECK: worker.check.next: +; CHECK-NEXT: call void %work_fn( +; CHECK-NEXT: br label %worker.parallel_end +; CHECK: call void @__kmpc_target_region_kernel_deinit(i1 false, +; +; void parallel_loops_in_functions_and_extern_func_in_tregion(int *A, int *B) { +; #pragma omp target +; { +; parallel_loop(A, B, 1); +; parallel_loop(A, B, 2); +; extern_func(); +; parallel_loop(A, B, 3); +; } +; } + +source_filename = "../llvm/test/Transforms/OpenMP/no_SPMD_mode.c" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvida-cuda" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } +%omp.private.struct = type { i32**, i32** } +%omp.private.struct.0 = type { i32**, i32** } +%omp.private.struct.1 = type { i32**, i32** } +%omp.private.struct.2 = type { i32**, i32**, i32* } + +@__omp_offloading_18_29b03e4_loop_in_loop_in_tregion_l2_exec_mode = weak constant i8 1 +@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr global %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@1 = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@__omp_offloading_18_29b03e4_parallel_loops_and_accesses_in_tregion_l9_exec_mode = weak constant i8 1 +@__omp_offloading_18_29b03e4_parallel_loop_in_function_in_loop_with_global_acc_in_tregion_l35_exec_mode = weak constant i8 1 +@__omp_offloading_18_29b03e4_parallel_loops_in_functions_and_extern_func_in_tregion_l43_exec_mode = weak constant i8 1 +@llvm.compiler.used = appending global [4 x i8*] [i8* @__omp_offloading_18_29b03e4_loop_in_loop_in_tregion_l2_exec_mode, i8* @__omp_offloading_18_29b03e4_parallel_loops_and_accesses_in_tregion_l9_exec_mode, i8* @__omp_offloading_18_29b03e4_parallel_loop_in_function_in_loop_with_global_acc_in_tregion_l35_exec_mode, i8* @__omp_offloading_18_29b03e4_parallel_loops_in_functions_and_extern_func_in_tregion_l43_exec_mode], section "llvm.metadata" + +; Function Attrs: norecurse nounwind +define weak void @__omp_offloading_18_29b03e4_loop_in_loop_in_tregion_l2(i32* %A, i32* %B) #0 { +entry: + %A.addr = alloca i32*, align 8 + %B.addr = alloca i32*, align 8 + %i = alloca i32, align 4 + %cleanup.dest.slot = alloca i32, align 4 + %j = alloca i32, align 4 + store i32* %A, i32** %A.addr, align 8, !tbaa !11 + store i32* %B, i32** %B.addr, align 8, !tbaa !11 + %0 = call i8 @__kmpc_target_region_kernel_init(i1 false, i1 true, i1 true, i1 true) + %1 = icmp eq i8 %0, 1 + br i1 %1, label %.execute, label %.exit + +.execute: ; preds = %entry + %2 = bitcast i32* %i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %2) #4 + store i32 0, i32* %i, align 4, !tbaa !15 + br label %for.cond + +for.cond: ; preds = %for.inc8, %.execute + %3 = load i32, i32* %i, align 4, !tbaa !15 + %cmp = icmp slt i32 %3, 512 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond + store i32 2, i32* %cleanup.dest.slot, align 4 + %4 = bitcast i32* %i to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %4) #4 + br label %for.end10 + +for.body: ; preds = %for.cond + %5 = bitcast i32* %j to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* 
%5) #4 + store i32 0, i32* %j, align 4, !tbaa !15 + br label %for.cond1 + +for.cond1: ; preds = %for.inc, %for.body + %6 = load i32, i32* %j, align 4, !tbaa !15 + %cmp2 = icmp slt i32 %6, 1024 + br i1 %cmp2, label %for.body4, label %for.cond.cleanup3 + +for.cond.cleanup3: ; preds = %for.cond1 + store i32 5, i32* %cleanup.dest.slot, align 4 + %7 = bitcast i32* %j to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %7) #4 + br label %for.end + +for.body4: ; preds = %for.cond1 + %8 = load i32*, i32** %B.addr, align 8, !tbaa !11 + %9 = load i32, i32* %i, align 4, !tbaa !15 + %10 = load i32, i32* %j, align 4, !tbaa !15 + %add = add nsw i32 %9, %10 + %idxprom = sext i32 %add to i64 + %arrayidx = getelementptr inbounds i32, i32* %8, i64 %idxprom + %11 = load i32, i32* %arrayidx, align 4, !tbaa !15 + %12 = load i32*, i32** %A.addr, align 8, !tbaa !11 + %13 = load i32, i32* %j, align 4, !tbaa !15 + %idxprom5 = sext i32 %13 to i64 + %arrayidx6 = getelementptr inbounds i32, i32* %12, i64 %idxprom5 + %14 = load i32, i32* %arrayidx6, align 4, !tbaa !15 + %add7 = add nsw i32 %14, %11 + store i32 %add7, i32* %arrayidx6, align 4, !tbaa !15 + br label %for.inc + +for.inc: ; preds = %for.body4 + %15 = load i32, i32* %j, align 4, !tbaa !15 + %inc = add nsw i32 %15, 1 + store i32 %inc, i32* %j, align 4, !tbaa !15 + br label %for.cond1 + +for.end: ; preds = %for.cond.cleanup3 + br label %for.inc8 + +for.inc8: ; preds = %for.end + %16 = load i32, i32* %i, align 4, !tbaa !15 + %inc9 = add nsw i32 %16, 1 + store i32 %inc9, i32* %i, align 4, !tbaa !15 + br label %for.cond + +for.end10: ; preds = %for.cond.cleanup + br label %.omp.deinit + +.omp.deinit: ; preds = %for.end10 + call void @__kmpc_target_region_kernel_deinit(i1 false, i1 true) + br label %.exit + +.exit: ; preds = %.omp.deinit, %entry + ret void +} + +declare i8 @__kmpc_target_region_kernel_init(i1, i1, i1, i1) + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1 + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1 + +declare void @__kmpc_target_region_kernel_deinit(i1, i1) + +; Function Attrs: norecurse nounwind +define weak void @__omp_offloading_18_29b03e4_parallel_loops_and_accesses_in_tregion_l9(i32* %A, i32* %B) #0 { +entry: + %A.addr = alloca i32*, align 8 + %B.addr = alloca i32*, align 8 + %.private.vars = alloca %omp.private.struct, align 8 + %.private.vars1 = alloca %omp.private.struct.0, align 8 + %.private.vars2 = alloca %omp.private.struct.1, align 8 + store i32* %A, i32** %A.addr, align 8, !tbaa !11 + store i32* %B, i32** %B.addr, align 8, !tbaa !11 + %0 = call i8 @__kmpc_target_region_kernel_init(i1 false, i1 true, i1 true, i1 true) + %1 = icmp eq i8 %0, 1 + br i1 %1, label %.execute, label %.exit + +.execute: ; preds = %entry + %2 = bitcast %omp.private.struct* %.private.vars to i8* + %3 = getelementptr inbounds %omp.private.struct, %omp.private.struct* %.private.vars, i32 0, i32 0 + store i32** %A.addr, i32*** %3 + %4 = getelementptr inbounds %omp.private.struct, %omp.private.struct* %.private.vars, i32 0, i32 1 + store i32** %B.addr, i32*** %4 + call void @__kmpc_target_region_kernel_parallel(i1 false, i1 true, void (i8*, i8*)* @.omp_TRegion._wrapper, i8* undef, i16 0, i8* %2, i16 16, i1 false) + %5 = bitcast %omp.private.struct.0* %.private.vars1 to i8* + %6 = getelementptr inbounds %omp.private.struct.0, %omp.private.struct.0* %.private.vars1, i32 0, i32 0 + store i32** %A.addr, i32*** %6 + %7 = getelementptr inbounds 
%omp.private.struct.0, %omp.private.struct.0* %.private.vars1, i32 0, i32 1 + store i32** %B.addr, i32*** %7 + call void @__kmpc_target_region_kernel_parallel(i1 false, i1 true, void (i8*, i8*)* @.omp_TRegion.1_wrapper, i8* undef, i16 0, i8* %5, i16 16, i1 false) + %8 = bitcast %omp.private.struct.1* %.private.vars2 to i8* + %9 = getelementptr inbounds %omp.private.struct.1, %omp.private.struct.1* %.private.vars2, i32 0, i32 0 + store i32** %A.addr, i32*** %9 + %10 = getelementptr inbounds %omp.private.struct.1, %omp.private.struct.1* %.private.vars2, i32 0, i32 1 + store i32** %B.addr, i32*** %10 + call void @__kmpc_target_region_kernel_parallel(i1 false, i1 true, void (i8*, i8*)* @.omp_TRegion.2_wrapper, i8* undef, i16 0, i8* %8, i16 16, i1 false) + %11 = load i32*, i32** %B.addr, align 8, !tbaa !11 + %arrayidx = getelementptr inbounds i32, i32* %11, i64 0 + %12 = load i32, i32* %arrayidx, align 4, !tbaa !15 + %13 = load i32*, i32** %A.addr, align 8, !tbaa !11 + %arrayidx3 = getelementptr inbounds i32, i32* %13, i64 0 + store i32 %12, i32* %arrayidx3, align 4, !tbaa !15 + br label %.omp.deinit + +.omp.deinit: ; preds = %.execute + call void @__kmpc_target_region_kernel_deinit(i1 false, i1 true) + br label %.exit + +.exit: ; preds = %.omp.deinit, %entry + ret void +} + +; Function Attrs: norecurse nounwind +define internal void @.omp_TRegion.(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32** dereferenceable(8) %A, i32** dereferenceable(8) %B) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %A.addr = alloca i32**, align 8 + %B.addr = alloca i32**, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %j = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8, !tbaa !11 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8, !tbaa !11 + store i32** %A, i32*** %A.addr, align 8, !tbaa !11 + store i32** %B, i32*** %B.addr, align 8, !tbaa !11 + %0 = load i32**, i32*** %A.addr, align 8, !tbaa !11 + %1 = load i32**, i32*** %B.addr, align 8, !tbaa !11 + %2 = bitcast i32* %.omp.iv to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %2) #4 + %3 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %3) #4 + store i32 0, i32* %.omp.lb, align 4, !tbaa !15 + %4 = bitcast i32* %.omp.ub to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %4) #4 + store i32 1023, i32* %.omp.ub, align 4, !tbaa !15 + %5 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %5) #4 + store i32 1, i32* %.omp.stride, align 4, !tbaa !15 + %6 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %6) #4 + store i32 0, i32* %.omp.is_last, align 4, !tbaa !15 + %7 = bitcast i32* %j to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %7) #4 + %8 = load i32*, i32** %.global_tid..addr, align 8 + %9 = load i32, i32* %8, align 4, !tbaa !15 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %9, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + br label %omp.dispatch.cond + +omp.dispatch.cond: ; preds = %omp.dispatch.inc, %entry + %10 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp = icmp sgt i32 %10, 1023 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %omp.dispatch.cond + br label %cond.end + +cond.false: 
; preds = %omp.dispatch.cond + %11 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 1023, %cond.true ], [ %11, %cond.false ] + store i32 %cond, i32* %.omp.ub, align 4, !tbaa !15 + %12 = load i32, i32* %.omp.lb, align 4, !tbaa !15 + store i32 %12, i32* %.omp.iv, align 4, !tbaa !15 + %13 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %14 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp1 = icmp sle i32 %13, %14 + br i1 %cmp1, label %omp.dispatch.body, label %omp.dispatch.cleanup + +omp.dispatch.cleanup: ; preds = %cond.end + br label %omp.dispatch.end + +omp.dispatch.body: ; preds = %cond.end + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %omp.dispatch.body + %15 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %16 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp2 = icmp sle i32 %15, %16 + br i1 %cmp2, label %omp.inner.for.body, label %omp.inner.for.cond.cleanup + +omp.inner.for.cond.cleanup: ; preds = %omp.inner.for.cond + br label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %17 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %mul = mul nsw i32 %17, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %j, align 4, !tbaa !15 + %18 = load i32*, i32** %1, align 8, !tbaa !11 + %19 = load i32, i32* %j, align 4, !tbaa !15 + %add3 = add nsw i32 0, %19 + %idxprom = sext i32 %add3 to i64 + %arrayidx = getelementptr inbounds i32, i32* %18, i64 %idxprom + %20 = load i32, i32* %arrayidx, align 4, !tbaa !15 + %21 = load i32*, i32** %0, align 8, !tbaa !11 + %22 = load i32, i32* %j, align 4, !tbaa !15 + %idxprom4 = sext i32 %22 to i64 + %arrayidx5 = getelementptr inbounds i32, i32* %21, i64 %idxprom4 + %23 = load i32, i32* %arrayidx5, align 4, !tbaa !15 + %add6 = add nsw i32 %23, %20 + store i32 %add6, i32* %arrayidx5, align 4, !tbaa !15 + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %24 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %add7 = add nsw i32 %24, 1 + store i32 %add7, i32* %.omp.iv, align 4, !tbaa !15 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond.cleanup + br label %omp.dispatch.inc + +omp.dispatch.inc: ; preds = %omp.inner.for.end + %25 = load i32, i32* %.omp.lb, align 4, !tbaa !15 + %26 = load i32, i32* %.omp.stride, align 4, !tbaa !15 + %add8 = add nsw i32 %25, %26 + store i32 %add8, i32* %.omp.lb, align 4, !tbaa !15 + %27 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %28 = load i32, i32* %.omp.stride, align 4, !tbaa !15 + %add9 = add nsw i32 %27, %28 + store i32 %add9, i32* %.omp.ub, align 4, !tbaa !15 + br label %omp.dispatch.cond + +omp.dispatch.end: ; preds = %omp.dispatch.cleanup + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %9) + %29 = bitcast i32* %j to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %29) #4 + %30 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %30) #4 + %31 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %31) #4 + %32 = bitcast i32* %.omp.ub to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %32) #4 + %33 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %33) #4 + %34 = bitcast i32* %.omp.iv to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %34) #4 + ret void +} + +declare void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, 
i32*, i32*, i32*, i32*, i32, i32) + +declare void @__kmpc_for_static_fini(%struct.ident_t*, i32) + +; Function Attrs: norecurse nounwind +define internal void @.omp_TRegion._wrapper(i8* %shared_vars, i8* %private_vars) #0 { +entry: + %.addr = alloca i8*, align 8 + %.addr1 = alloca i8*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. = alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i8* %shared_vars, i8** %.addr, align 8, !tbaa !11 + store i8* %private_vars, i8** %.addr1, align 8, !tbaa !11 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) + store i32 %0, i32* %.threadid_temp., align 4, !tbaa !15 + %1 = bitcast i8* %private_vars to %omp.private.struct* + %2 = getelementptr inbounds %omp.private.struct, %omp.private.struct* %1, i32 0, i32 0 + %3 = load i32**, i32*** %2, align 1 + %4 = getelementptr inbounds %omp.private.struct, %omp.private.struct* %1, i32 0, i32 1 + %5 = load i32**, i32*** %4, align 1 + call void @.omp_TRegion.(i32* %.threadid_temp., i32* %.zero.addr, i32** %3, i32** %5) #4 + ret void +} + +declare i32 @__kmpc_global_thread_num(%struct.ident_t*) + +declare !callback !17 void @__kmpc_target_region_kernel_parallel(i1, i1, void (i8*, i8*)* nocapture, i8* nocapture, i16, i8* nocapture readonly, i16, i1) + +; Function Attrs: norecurse nounwind +define internal void @.omp_TRegion.1(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32** dereferenceable(8) %A, i32** dereferenceable(8) %B) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %A.addr = alloca i32**, align 8 + %B.addr = alloca i32**, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %j = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8, !tbaa !11 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8, !tbaa !11 + store i32** %A, i32*** %A.addr, align 8, !tbaa !11 + store i32** %B, i32*** %B.addr, align 8, !tbaa !11 + %0 = load i32**, i32*** %A.addr, align 8, !tbaa !11 + %1 = load i32**, i32*** %B.addr, align 8, !tbaa !11 + %2 = bitcast i32* %.omp.iv to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %2) #4 + %3 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %3) #4 + store i32 0, i32* %.omp.lb, align 4, !tbaa !15 + %4 = bitcast i32* %.omp.ub to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %4) #4 + store i32 1023, i32* %.omp.ub, align 4, !tbaa !15 + %5 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %5) #4 + store i32 1, i32* %.omp.stride, align 4, !tbaa !15 + %6 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %6) #4 + store i32 0, i32* %.omp.is_last, align 4, !tbaa !15 + %7 = bitcast i32* %j to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %7) #4 + %8 = load i32*, i32** %.global_tid..addr, align 8 + %9 = load i32, i32* %8, align 4, !tbaa !15 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %9, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + br label %omp.dispatch.cond + +omp.dispatch.cond: ; preds = %omp.dispatch.inc, %entry + %10 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp = icmp sgt i32 %10, 1023 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %omp.dispatch.cond + br label %cond.end + +cond.false: ; 
preds = %omp.dispatch.cond + %11 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 1023, %cond.true ], [ %11, %cond.false ] + store i32 %cond, i32* %.omp.ub, align 4, !tbaa !15 + %12 = load i32, i32* %.omp.lb, align 4, !tbaa !15 + store i32 %12, i32* %.omp.iv, align 4, !tbaa !15 + %13 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %14 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp1 = icmp sle i32 %13, %14 + br i1 %cmp1, label %omp.dispatch.body, label %omp.dispatch.cleanup + +omp.dispatch.cleanup: ; preds = %cond.end + br label %omp.dispatch.end + +omp.dispatch.body: ; preds = %cond.end + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %omp.dispatch.body + %15 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %16 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp2 = icmp sle i32 %15, %16 + br i1 %cmp2, label %omp.inner.for.body, label %omp.inner.for.cond.cleanup + +omp.inner.for.cond.cleanup: ; preds = %omp.inner.for.cond + br label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %17 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %mul = mul nsw i32 %17, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %j, align 4, !tbaa !15 + %18 = load i32*, i32** %1, align 8, !tbaa !11 + %19 = load i32, i32* %j, align 4, !tbaa !15 + %add3 = add nsw i32 1, %19 + %idxprom = sext i32 %add3 to i64 + %arrayidx = getelementptr inbounds i32, i32* %18, i64 %idxprom + %20 = load i32, i32* %arrayidx, align 4, !tbaa !15 + %21 = load i32*, i32** %0, align 8, !tbaa !11 + %22 = load i32, i32* %j, align 4, !tbaa !15 + %idxprom4 = sext i32 %22 to i64 + %arrayidx5 = getelementptr inbounds i32, i32* %21, i64 %idxprom4 + %23 = load i32, i32* %arrayidx5, align 4, !tbaa !15 + %add6 = add nsw i32 %23, %20 + store i32 %add6, i32* %arrayidx5, align 4, !tbaa !15 + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %24 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %add7 = add nsw i32 %24, 1 + store i32 %add7, i32* %.omp.iv, align 4, !tbaa !15 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond.cleanup + br label %omp.dispatch.inc + +omp.dispatch.inc: ; preds = %omp.inner.for.end + %25 = load i32, i32* %.omp.lb, align 4, !tbaa !15 + %26 = load i32, i32* %.omp.stride, align 4, !tbaa !15 + %add8 = add nsw i32 %25, %26 + store i32 %add8, i32* %.omp.lb, align 4, !tbaa !15 + %27 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %28 = load i32, i32* %.omp.stride, align 4, !tbaa !15 + %add9 = add nsw i32 %27, %28 + store i32 %add9, i32* %.omp.ub, align 4, !tbaa !15 + br label %omp.dispatch.cond + +omp.dispatch.end: ; preds = %omp.dispatch.cleanup + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %9) + %29 = bitcast i32* %j to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %29) #4 + %30 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %30) #4 + %31 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %31) #4 + %32 = bitcast i32* %.omp.ub to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %32) #4 + %33 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %33) #4 + %34 = bitcast i32* %.omp.iv to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %34) #4 + ret void +} + +; Function Attrs: norecurse nounwind +define internal void 
@.omp_TRegion.1_wrapper(i8* %shared_vars, i8* %private_vars) #0 { +entry: + %.addr = alloca i8*, align 8 + %.addr1 = alloca i8*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. = alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i8* %shared_vars, i8** %.addr, align 8, !tbaa !11 + store i8* %private_vars, i8** %.addr1, align 8, !tbaa !11 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) + store i32 %0, i32* %.threadid_temp., align 4, !tbaa !15 + %1 = bitcast i8* %private_vars to %omp.private.struct.0* + %2 = getelementptr inbounds %omp.private.struct.0, %omp.private.struct.0* %1, i32 0, i32 0 + %3 = load i32**, i32*** %2, align 1 + %4 = getelementptr inbounds %omp.private.struct.0, %omp.private.struct.0* %1, i32 0, i32 1 + %5 = load i32**, i32*** %4, align 1 + call void @.omp_TRegion.1(i32* %.threadid_temp., i32* %.zero.addr, i32** %3, i32** %5) #4 + ret void +} + +; Function Attrs: norecurse nounwind +define internal void @.omp_TRegion.2(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32** dereferenceable(8) %A, i32** dereferenceable(8) %B) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %A.addr = alloca i32**, align 8 + %B.addr = alloca i32**, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %j = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8, !tbaa !11 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8, !tbaa !11 + store i32** %A, i32*** %A.addr, align 8, !tbaa !11 + store i32** %B, i32*** %B.addr, align 8, !tbaa !11 + %0 = load i32**, i32*** %A.addr, align 8, !tbaa !11 + %1 = load i32**, i32*** %B.addr, align 8, !tbaa !11 + %2 = bitcast i32* %.omp.iv to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %2) #4 + %3 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %3) #4 + store i32 0, i32* %.omp.lb, align 4, !tbaa !15 + %4 = bitcast i32* %.omp.ub to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %4) #4 + store i32 1023, i32* %.omp.ub, align 4, !tbaa !15 + %5 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %5) #4 + store i32 1, i32* %.omp.stride, align 4, !tbaa !15 + %6 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %6) #4 + store i32 0, i32* %.omp.is_last, align 4, !tbaa !15 + %7 = bitcast i32* %j to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %7) #4 + %8 = load i32*, i32** %.global_tid..addr, align 8 + %9 = load i32, i32* %8, align 4, !tbaa !15 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %9, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + br label %omp.dispatch.cond + +omp.dispatch.cond: ; preds = %omp.dispatch.inc, %entry + %10 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp = icmp sgt i32 %10, 1023 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %omp.dispatch.cond + br label %cond.end + +cond.false: ; preds = %omp.dispatch.cond + %11 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 1023, %cond.true ], [ %11, %cond.false ] + store i32 %cond, i32* %.omp.ub, align 4, !tbaa !15 + %12 = load i32, i32* %.omp.lb, align 4, !tbaa !15 + store i32 %12, i32* %.omp.iv, align 4, !tbaa !15 + 
%13 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %14 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp1 = icmp sle i32 %13, %14 + br i1 %cmp1, label %omp.dispatch.body, label %omp.dispatch.cleanup + +omp.dispatch.cleanup: ; preds = %cond.end + br label %omp.dispatch.end + +omp.dispatch.body: ; preds = %cond.end + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %omp.dispatch.body + %15 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %16 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp2 = icmp sle i32 %15, %16 + br i1 %cmp2, label %omp.inner.for.body, label %omp.inner.for.cond.cleanup + +omp.inner.for.cond.cleanup: ; preds = %omp.inner.for.cond + br label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %17 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %mul = mul nsw i32 %17, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %j, align 4, !tbaa !15 + %18 = load i32*, i32** %1, align 8, !tbaa !11 + %19 = load i32, i32* %j, align 4, !tbaa !15 + %add3 = add nsw i32 2, %19 + %idxprom = sext i32 %add3 to i64 + %arrayidx = getelementptr inbounds i32, i32* %18, i64 %idxprom + %20 = load i32, i32* %arrayidx, align 4, !tbaa !15 + %21 = load i32*, i32** %0, align 8, !tbaa !11 + %22 = load i32, i32* %j, align 4, !tbaa !15 + %idxprom4 = sext i32 %22 to i64 + %arrayidx5 = getelementptr inbounds i32, i32* %21, i64 %idxprom4 + %23 = load i32, i32* %arrayidx5, align 4, !tbaa !15 + %add6 = add nsw i32 %23, %20 + store i32 %add6, i32* %arrayidx5, align 4, !tbaa !15 + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %24 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %add7 = add nsw i32 %24, 1 + store i32 %add7, i32* %.omp.iv, align 4, !tbaa !15 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond.cleanup + br label %omp.dispatch.inc + +omp.dispatch.inc: ; preds = %omp.inner.for.end + %25 = load i32, i32* %.omp.lb, align 4, !tbaa !15 + %26 = load i32, i32* %.omp.stride, align 4, !tbaa !15 + %add8 = add nsw i32 %25, %26 + store i32 %add8, i32* %.omp.lb, align 4, !tbaa !15 + %27 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %28 = load i32, i32* %.omp.stride, align 4, !tbaa !15 + %add9 = add nsw i32 %27, %28 + store i32 %add9, i32* %.omp.ub, align 4, !tbaa !15 + br label %omp.dispatch.cond + +omp.dispatch.end: ; preds = %omp.dispatch.cleanup + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %9) + %29 = bitcast i32* %j to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %29) #4 + %30 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %30) #4 + %31 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %31) #4 + %32 = bitcast i32* %.omp.ub to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %32) #4 + %33 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %33) #4 + %34 = bitcast i32* %.omp.iv to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %34) #4 + ret void +} + +; Function Attrs: norecurse nounwind +define internal void @.omp_TRegion.2_wrapper(i8* %shared_vars, i8* %private_vars) #0 { +entry: + %.addr = alloca i8*, align 8 + %.addr1 = alloca i8*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. 
= alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i8* %shared_vars, i8** %.addr, align 8, !tbaa !11 + store i8* %private_vars, i8** %.addr1, align 8, !tbaa !11 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) + store i32 %0, i32* %.threadid_temp., align 4, !tbaa !15 + %1 = bitcast i8* %private_vars to %omp.private.struct.1* + %2 = getelementptr inbounds %omp.private.struct.1, %omp.private.struct.1* %1, i32 0, i32 0 + %3 = load i32**, i32*** %2, align 1 + %4 = getelementptr inbounds %omp.private.struct.1, %omp.private.struct.1* %1, i32 0, i32 1 + %5 = load i32**, i32*** %4, align 1 + call void @.omp_TRegion.2(i32* %.threadid_temp., i32* %.zero.addr, i32** %3, i32** %5) #4 + ret void +} + +; Function Attrs: norecurse nounwind +define weak void @__omp_offloading_18_29b03e4_parallel_loop_in_function_in_loop_with_global_acc_in_tregion_l35(i32* %A, i32* %B, [512 x i32]* dereferenceable(2048) %Global) #0 { +entry: + %A.addr = alloca i32*, align 8 + %B.addr = alloca i32*, align 8 + %Global.addr = alloca [512 x i32]*, align 8 + %i = alloca i32, align 4 + store i32* %A, i32** %A.addr, align 8, !tbaa !11 + store i32* %B, i32** %B.addr, align 8, !tbaa !11 + store [512 x i32]* %Global, [512 x i32]** %Global.addr, align 8, !tbaa !11 + %0 = load [512 x i32]*, [512 x i32]** %Global.addr, align 8, !tbaa !11 + %1 = call i8 @__kmpc_target_region_kernel_init(i1 false, i1 true, i1 true, i1 true) + %2 = icmp eq i8 %1, 1 + br i1 %2, label %.execute, label %.exit + +.execute: ; preds = %entry + %3 = bitcast i32* %i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %3) #4 + store i32 0, i32* %i, align 4, !tbaa !15 + br label %for.cond + +for.cond: ; preds = %for.inc, %.execute + %4 = load i32, i32* %i, align 4, !tbaa !15 + %cmp = icmp slt i32 %4, 512 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond + %5 = bitcast i32* %i to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %5) #4 + br label %for.end + +for.body: ; preds = %for.cond + %6 = load i32*, i32** %A.addr, align 8, !tbaa !11 + %7 = load i32*, i32** %B.addr, align 8, !tbaa !11 + %8 = load i32, i32* %i, align 4, !tbaa !15 + call void @parallel_loop(i32* %6, i32* %7, i32 %8) + %9 = load i32, i32* %i, align 4, !tbaa !15 + %idxprom = sext i32 %9 to i64 + %arrayidx = getelementptr inbounds [512 x i32], [512 x i32]* %0, i64 0, i64 %idxprom + %10 = load i32, i32* %arrayidx, align 4, !tbaa !15 + %inc = add nsw i32 %10, 1 + store i32 %inc, i32* %arrayidx, align 4, !tbaa !15 + br label %for.inc + +for.inc: ; preds = %for.body + %11 = load i32, i32* %i, align 4, !tbaa !15 + %inc1 = add nsw i32 %11, 1 + store i32 %inc1, i32* %i, align 4, !tbaa !15 + br label %for.cond + +for.end: ; preds = %for.cond.cleanup + br label %.omp.deinit + +.omp.deinit: ; preds = %for.end + call void @__kmpc_target_region_kernel_deinit(i1 false, i1 true) + br label %.exit + +.exit: ; preds = %.omp.deinit, %entry + ret void +} + +; Function Attrs: nounwind +define internal void @parallel_loop(i32* %A, i32* %B, i32 %i) #2 { +entry: + %A.addr = alloca i32*, align 8 + %B.addr = alloca i32*, align 8 + %i.addr = alloca i32, align 4 + %.private.vars = alloca %omp.private.struct.2, align 8 + store i32* %A, i32** %A.addr, align 8, !tbaa !11 + store i32* %B, i32** %B.addr, align 8, !tbaa !11 + store i32 %i, i32* %i.addr, align 4, !tbaa !15 + %0 = bitcast %omp.private.struct.2* %.private.vars to i8* + %1 = getelementptr inbounds %omp.private.struct.2, %omp.private.struct.2* %.private.vars, i32 0, i32 0 + store i32** 
%A.addr, i32*** %1 + %2 = getelementptr inbounds %omp.private.struct.2, %omp.private.struct.2* %.private.vars, i32 0, i32 1 + store i32** %B.addr, i32*** %2 + %3 = getelementptr inbounds %omp.private.struct.2, %omp.private.struct.2* %.private.vars, i32 0, i32 2 + store i32* %i.addr, i32** %3 + call void @__kmpc_target_region_kernel_parallel(i1 false, i1 true, void (i8*, i8*)* @.omp_TRegion.3_wrapper, i8* undef, i16 0, i8* %0, i16 24, i1 false) + ret void +} + +; Function Attrs: norecurse nounwind +define weak void @__omp_offloading_18_29b03e4_parallel_loops_in_functions_and_extern_func_in_tregion_l43(i32* %A, i32* %B) #0 { +entry: + %A.addr = alloca i32*, align 8 + %B.addr = alloca i32*, align 8 + store i32* %A, i32** %A.addr, align 8, !tbaa !11 + store i32* %B, i32** %B.addr, align 8, !tbaa !11 + %0 = call i8 @__kmpc_target_region_kernel_init(i1 false, i1 true, i1 true, i1 true) + %1 = icmp eq i8 %0, 1 + br i1 %1, label %.execute, label %.exit + +.execute: ; preds = %entry + %2 = load i32*, i32** %A.addr, align 8, !tbaa !11 + %3 = load i32*, i32** %B.addr, align 8, !tbaa !11 + call void @parallel_loop(i32* %2, i32* %3, i32 1) + %4 = load i32*, i32** %A.addr, align 8, !tbaa !11 + %5 = load i32*, i32** %B.addr, align 8, !tbaa !11 + call void @parallel_loop(i32* %4, i32* %5, i32 2) + call void bitcast (void (...)* @extern_func to void ()*)() + %6 = load i32*, i32** %A.addr, align 8, !tbaa !11 + %7 = load i32*, i32** %B.addr, align 8, !tbaa !11 + call void @parallel_loop(i32* %6, i32* %7, i32 3) + br label %.omp.deinit + +.omp.deinit: ; preds = %.execute + call void @__kmpc_target_region_kernel_deinit(i1 false, i1 true) + br label %.exit + +.exit: ; preds = %.omp.deinit, %entry + ret void +} + +declare void @extern_func(...) #3 + +; Function Attrs: norecurse nounwind +define internal void @.omp_TRegion.3(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32** dereferenceable(8) %A, i32** dereferenceable(8) %B, i32* dereferenceable(4) %i) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %A.addr = alloca i32**, align 8 + %B.addr = alloca i32**, align 8 + %i.addr = alloca i32*, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %j = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8, !tbaa !11 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8, !tbaa !11 + store i32** %A, i32*** %A.addr, align 8, !tbaa !11 + store i32** %B, i32*** %B.addr, align 8, !tbaa !11 + store i32* %i, i32** %i.addr, align 8, !tbaa !11 + %0 = load i32**, i32*** %A.addr, align 8, !tbaa !11 + %1 = load i32**, i32*** %B.addr, align 8, !tbaa !11 + %2 = load i32*, i32** %i.addr, align 8, !tbaa !11 + %3 = bitcast i32* %.omp.iv to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %3) #4 + %4 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %4) #4 + store i32 0, i32* %.omp.lb, align 4, !tbaa !15 + %5 = bitcast i32* %.omp.ub to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %5) #4 + store i32 1023, i32* %.omp.ub, align 4, !tbaa !15 + %6 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %6) #4 + store i32 1, i32* %.omp.stride, align 4, !tbaa !15 + %7 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %7) #4 + store i32 0, i32* %.omp.is_last, align 4, !tbaa !15 + %8 = 
bitcast i32* %j to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %8) #4 + %9 = load i32*, i32** %.global_tid..addr, align 8 + %10 = load i32, i32* %9, align 4, !tbaa !15 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %10, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + br label %omp.dispatch.cond + +omp.dispatch.cond: ; preds = %omp.dispatch.inc, %entry + %11 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp = icmp sgt i32 %11, 1023 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %omp.dispatch.cond + br label %cond.end + +cond.false: ; preds = %omp.dispatch.cond + %12 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 1023, %cond.true ], [ %12, %cond.false ] + store i32 %cond, i32* %.omp.ub, align 4, !tbaa !15 + %13 = load i32, i32* %.omp.lb, align 4, !tbaa !15 + store i32 %13, i32* %.omp.iv, align 4, !tbaa !15 + %14 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %15 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp1 = icmp sle i32 %14, %15 + br i1 %cmp1, label %omp.dispatch.body, label %omp.dispatch.cleanup + +omp.dispatch.cleanup: ; preds = %cond.end + br label %omp.dispatch.end + +omp.dispatch.body: ; preds = %cond.end + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %omp.dispatch.body + %16 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %17 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp2 = icmp sle i32 %16, %17 + br i1 %cmp2, label %omp.inner.for.body, label %omp.inner.for.cond.cleanup + +omp.inner.for.cond.cleanup: ; preds = %omp.inner.for.cond + br label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %18 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %mul = mul nsw i32 %18, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %j, align 4, !tbaa !15 + %19 = load i32*, i32** %1, align 8, !tbaa !11 + %20 = load i32, i32* %2, align 4, !tbaa !15 + %21 = load i32, i32* %j, align 4, !tbaa !15 + %add3 = add nsw i32 %20, %21 + %idxprom = sext i32 %add3 to i64 + %arrayidx = getelementptr inbounds i32, i32* %19, i64 %idxprom + %22 = load i32, i32* %arrayidx, align 4, !tbaa !15 + %23 = load i32*, i32** %0, align 8, !tbaa !11 + %24 = load i32, i32* %j, align 4, !tbaa !15 + %idxprom4 = sext i32 %24 to i64 + %arrayidx5 = getelementptr inbounds i32, i32* %23, i64 %idxprom4 + %25 = load i32, i32* %arrayidx5, align 4, !tbaa !15 + %add6 = add nsw i32 %25, %22 + store i32 %add6, i32* %arrayidx5, align 4, !tbaa !15 + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %26 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %add7 = add nsw i32 %26, 1 + store i32 %add7, i32* %.omp.iv, align 4, !tbaa !15 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond.cleanup + br label %omp.dispatch.inc + +omp.dispatch.inc: ; preds = %omp.inner.for.end + %27 = load i32, i32* %.omp.lb, align 4, !tbaa !15 + %28 = load i32, i32* %.omp.stride, align 4, !tbaa !15 + %add8 = add nsw i32 %27, %28 + store i32 %add8, i32* %.omp.lb, align 4, !tbaa !15 + %29 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %30 = load i32, i32* %.omp.stride, align 4, !tbaa !15 + %add9 = add nsw i32 %29, %30 + store i32 %add9, i32* %.omp.ub, align 4, !tbaa !15 + br label %omp.dispatch.cond + +omp.dispatch.end: ; preds = %omp.dispatch.cleanup + call void 
@__kmpc_for_static_fini(%struct.ident_t* @0, i32 %10) + %31 = bitcast i32* %j to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %31) #4 + %32 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %32) #4 + %33 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %33) #4 + %34 = bitcast i32* %.omp.ub to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %34) #4 + %35 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %35) #4 + %36 = bitcast i32* %.omp.iv to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %36) #4 + ret void +} + +; Function Attrs: norecurse nounwind +define internal void @.omp_TRegion.3_wrapper(i8* %shared_vars, i8* %private_vars) #0 { +entry: + %.addr = alloca i8*, align 8 + %.addr1 = alloca i8*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. = alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i8* %shared_vars, i8** %.addr, align 8, !tbaa !11 + store i8* %private_vars, i8** %.addr1, align 8, !tbaa !11 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) + store i32 %0, i32* %.threadid_temp., align 4, !tbaa !15 + %1 = bitcast i8* %private_vars to %omp.private.struct.2* + %2 = getelementptr inbounds %omp.private.struct.2, %omp.private.struct.2* %1, i32 0, i32 0 + %3 = load i32**, i32*** %2, align 1 + %4 = getelementptr inbounds %omp.private.struct.2, %omp.private.struct.2* %1, i32 0, i32 1 + %5 = load i32**, i32*** %4, align 1 + %6 = getelementptr inbounds %omp.private.struct.2, %omp.private.struct.2* %1, i32 0, i32 2 + %7 = load i32*, i32** %6, align 1 + call void @.omp_TRegion.3(i32* %.threadid_temp., i32* %.zero.addr, i32** %3, i32** %5, i32* %7) #4 + ret void +} + +attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind } +attributes #2 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { nounwind } + +!omp_offload.info = !{!0, !1, !2, !3} +!nvvm.annotations = !{!4, !5, !6, !7} +!llvm.module.flags = !{!8, !9} +!llvm.ident = !{!10} + +!0 = !{i32 0, i32 24, i32 43713508, !"parallel_loops_and_accesses_in_tregion", i32 9, i32 1} +!1 = !{i32 0, i32 24, i32 43713508, 
!"loop_in_loop_in_tregion", i32 2, i32 0} +!2 = !{i32 0, i32 24, i32 43713508, !"parallel_loops_in_functions_and_extern_func_in_tregion", i32 43, i32 3} +!3 = !{i32 0, i32 24, i32 43713508, !"parallel_loop_in_function_in_loop_with_global_acc_in_tregion", i32 35, i32 2} +!4 = !{void (i32*, i32*)* @__omp_offloading_18_29b03e4_loop_in_loop_in_tregion_l2, !"kernel", i32 1} +!5 = !{void (i32*, i32*)* @__omp_offloading_18_29b03e4_parallel_loops_and_accesses_in_tregion_l9, !"kernel", i32 1} +!6 = !{void (i32*, i32*, [512 x i32]*)* @__omp_offloading_18_29b03e4_parallel_loop_in_function_in_loop_with_global_acc_in_tregion_l35, !"kernel", i32 1} +!7 = !{void (i32*, i32*)* @__omp_offloading_18_29b03e4_parallel_loops_in_functions_and_extern_func_in_tregion_l43, !"kernel", i32 1} +!8 = !{i32 1, !"wchar_size", i32 4} +!9 = !{i32 7, !"PIC Level", i32 2} +!10 = !{!"clang version 9.0.0 "} +!11 = !{!12, !12, i64 0} +!12 = !{!"any pointer", !13, i64 0} +!13 = !{!"omnipotent char", !14, i64 0} +!14 = !{!"Simple C/C++ TBAA"} +!15 = !{!16, !16, i64 0} +!16 = !{!"int", !13, i64 0} +!17 = !{!18} +!18 = !{i64 2, i64 3, i64 5, i1 false} diff --git a/llvm/test/Transforms/OpenMP/to_SPMD_mode.ll b/llvm/test/Transforms/OpenMP/to_SPMD_mode.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/OpenMP/to_SPMD_mode.ll @@ -0,0 +1,1208 @@ +; RUN: opt < %s -openmp-opt -stats -disable-output 2>&1 | FileCheck %s --check-prefix=STATS +; RUN: opt < %s -openmp-opt -S 2>&1 | FileCheck %s +; +; REQUIRES: asserts +; +; Check that we will not execute any of the below target regions in SPMD-mode. +; TODO: SPMD-mode is valid for target region 2 and 3 if proper guarding code is inserted. +; +; See the no_SPMD_mode.ll file for almost the same functions that cannot be translated to SPMD mode. +; +; +; STATS-DAG: 4 openmp-opt - Number of GPU kernels converted to SPMD mode +; STATS-DAG: 6 openmp-opt - Number of parallel GPU kernel regions converted to SPMD mode +; +; Check for SPMD mode. +; CHECK: void @{{.*}}parallel_loop_in_loop_in_tregion +; CHECK: call i8 @__kmpc_target_region_kernel_init(i1 true, +; CHECK: call void @__kmpc_target_region_kernel_parallel(i1 true, +; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) +; CHECK: call void @__kmpc_target_region_kernel_deinit(i1 true, +; +; void parallel_loop_in_loop_in_tregion(int *A, int *B) { +; #pragma omp target +; for (int i = 0; i < 512; i++) { +; #pragma omp parallel for +; for (int j = 0; j < 1024; j++) +; A[j] += B[i + j]; +; } +; } +; +; +; Check for SPMD mode. 
+; CHECK: void @{{.*}}parallel_loops_in_tregion
+; CHECK: call i8 @__kmpc_target_region_kernel_init(i1 true,
+; CHECK: call void @__kmpc_target_region_kernel_parallel(i1 true,
+; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+; CHECK: call void @__kmpc_target_region_kernel_parallel(i1 true,
+; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+; CHECK: call void @__kmpc_target_region_kernel_parallel(i1 true,
+; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+; CHECK: call void @__kmpc_target_region_kernel_deinit(i1 true,
+;
+; void parallel_loops_in_tregion(int *A, int *B) {
+; #pragma omp target
+; {
+; #pragma omp parallel for
+; for (int j = 0; j < 1024; j++)
+; A[j] += B[0 + j];
+; #pragma omp parallel for
+; for (int j = 0; j < 1024; j++)
+; A[j] += B[1 + j];
+; #pragma omp parallel for
+; for (int j = 0; j < 1024; j++)
+; A[j] += B[2 + j];
+; }
+; }
+;
+; FIXME: For now we duplicate the parallel_loop function below so that each
+; copy has uses in only one kernel. Since all kernels in this file can be
+; translated to SPMD mode, this duplication is not strictly necessary. In the
+; future we should handle shared functions and clone them automatically (or
+; add a new argument) to facilitate partial SPMD-mode execution.
+;
+; static void parallel_loop1(int *A, int *B, int i) {
+; #pragma omp parallel for
+; for (int j = 0; j < 1024; j++)
+; A[j] += B[i + j];
+; }
+; static void parallel_loop2(int *A, int *B, int i) {
+; #pragma omp parallel for
+; for (int j = 0; j < 1024; j++)
+; A[j] += B[i + j];
+; }
+;
+;
+; Check for SPMD mode.
+; CHECK: void @{{.*}}parallel_loop_in_function_in_loop_in_tregion
+; CHECK: call i8 @__kmpc_target_region_kernel_init(i1 true,
+; CHECK: call void @parallel_loop1(
+; CHECK: call void @__kmpc_target_region_kernel_deinit(i1 true,
+; CHECK: define internal void @parallel_loop1
+; CHECK: call void @__kmpc_target_region_kernel_parallel(i1 true,
+; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+;
+; void parallel_loop_in_function_in_loop_in_tregion(int *A, int *B) {
+; #pragma omp target
+; for (int i = 0; i < 512; i++)
+; parallel_loop1(A, B, i);
+; }
+;
+;
+; Check for SPMD mode.
+; CHECK: define internal void @parallel_loop2
+; CHECK: call void @__kmpc_target_region_kernel_parallel(i1 true,
+; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
+; CHECK: void @{{.*}}parallel_loops_in_functions_in_tregion
+; CHECK: call i8 @__kmpc_target_region_kernel_init(i1 true,
+; CHECK: call void @parallel_loop2(
+; CHECK: call void @__kmpc_target_region_kernel_deinit(i1 true,
+;
+; void parallel_loops_in_functions_in_tregion(int *A, int *B) {
+; #pragma omp target
+; {
+; parallel_loop2(A, B, 1);
+; parallel_loop2(A, B, 2);
+; parallel_loop2(A, B, 3);
+; }
+; }
+
+source_filename = "../llvm/test/Transforms/OpenMP/to_SPMD_mode.c"
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+%struct.ident_t = type { i32, i32, i32, i32, i8* }
+%omp.private.struct = type { i32**, i32**, i32* }
+%omp.private.struct.0 = type { i32**, i32** }
+%omp.private.struct.1 = type { i32**, i32** }
+%omp.private.struct.2 = type { i32**, i32** }
+%omp.private.struct.3 = type { i32**, i32**, i32* }
+
+@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
+@0 = private unnamed_addr global %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8
+@1 = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8
+@__omp_offloading_18_29afeb5_parallel_loop_in_loop_in_tregion_l2_exec_mode = weak constant i8 1
+@__omp_offloading_18_29afeb5_parallel_loops_in_tregion_l10_exec_mode = weak constant i8 1
+@__omp_offloading_18_29afeb5_parallel_loop_in_function_in_loop_in_tregion_l30_exec_mode = weak constant i8 1
+@__omp_offloading_18_29afeb5_parallel_loops_in_functions_in_tregion_l36_exec_mode = weak constant i8 1
+@llvm.compiler.used = appending global [4 x i8*] [i8* @__omp_offloading_18_29afeb5_parallel_loop_in_loop_in_tregion_l2_exec_mode, i8* @__omp_offloading_18_29afeb5_parallel_loops_in_tregion_l10_exec_mode, i8* @__omp_offloading_18_29afeb5_parallel_loop_in_function_in_loop_in_tregion_l30_exec_mode, i8* @__omp_offloading_18_29afeb5_parallel_loops_in_functions_in_tregion_l36_exec_mode], section "llvm.metadata"
+
+; Function Attrs: norecurse nounwind
+define weak void @__omp_offloading_18_29afeb5_parallel_loop_in_loop_in_tregion_l2(i32* %A, i32* %B) #0 {
+entry:
+  %A.addr = alloca i32*, align 8
+  %B.addr = alloca i32*, align 8
+  %i = alloca i32, align 4
+  %.private.vars = alloca %omp.private.struct, align 8
+  store i32* %A, i32** %A.addr, align 8, !tbaa !11
+  store i32* %B, i32** %B.addr, align 8, !tbaa !11
+  %0 = call i8 @__kmpc_target_region_kernel_init(i1 false, i1 true, i1 true, i1 true)
+  %1 = icmp eq i8 %0, 1
+  br i1 %1, label %.execute, label %.exit
+
+.execute:                                         ; preds = %entry
+  %2 = bitcast i32* %i to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %2) #3
+  store i32 0, i32* %i, align 4, !tbaa !15
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %.execute
+  %3 = load i32, i32* %i, align 4, !tbaa !15
+  %cmp = icmp slt i32 %3, 512
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond
+  %4 = bitcast i32* %i to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %4) #3
+  br label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %5 = bitcast %omp.private.struct* %.private.vars to i8*
+  %6 = getelementptr inbounds %omp.private.struct, %omp.private.struct* %.private.vars, i32 0, i32 0
+ store i32** %A.addr, i32*** %6 + %7 = getelementptr inbounds %omp.private.struct, %omp.private.struct* %.private.vars, i32 0, i32 1 + store i32** %B.addr, i32*** %7 + %8 = getelementptr inbounds %omp.private.struct, %omp.private.struct* %.private.vars, i32 0, i32 2 + store i32* %i, i32** %8 + call void @__kmpc_target_region_kernel_parallel(i1 false, i1 true, void (i8*, i8*)* @.omp_TRegion._wrapper, i8* undef, i16 0, i8* %5, i16 24, i1 false) + br label %for.inc + +for.inc: ; preds = %for.body + %9 = load i32, i32* %i, align 4, !tbaa !15 + %inc = add nsw i32 %9, 1 + store i32 %inc, i32* %i, align 4, !tbaa !15 + br label %for.cond + +for.end: ; preds = %for.cond.cleanup + br label %.omp.deinit + +.omp.deinit: ; preds = %for.end + call void @__kmpc_target_region_kernel_deinit(i1 false, i1 true) + br label %.exit + +.exit: ; preds = %.omp.deinit, %entry + ret void +} + +declare i8 @__kmpc_target_region_kernel_init(i1, i1, i1, i1) + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1 + +; Function Attrs: norecurse nounwind +define internal void @.omp_TRegion.(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32** dereferenceable(8) %A, i32** dereferenceable(8) %B, i32* dereferenceable(4) %i) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %A.addr = alloca i32**, align 8 + %B.addr = alloca i32**, align 8 + %i.addr = alloca i32*, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %j = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8, !tbaa !11 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8, !tbaa !11 + store i32** %A, i32*** %A.addr, align 8, !tbaa !11 + store i32** %B, i32*** %B.addr, align 8, !tbaa !11 + store i32* %i, i32** %i.addr, align 8, !tbaa !11 + %0 = load i32**, i32*** %A.addr, align 8, !tbaa !11 + %1 = load i32**, i32*** %B.addr, align 8, !tbaa !11 + %2 = load i32*, i32** %i.addr, align 8, !tbaa !11 + %3 = bitcast i32* %.omp.iv to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %3) #3 + %4 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %4) #3 + store i32 0, i32* %.omp.lb, align 4, !tbaa !15 + %5 = bitcast i32* %.omp.ub to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %5) #3 + store i32 1023, i32* %.omp.ub, align 4, !tbaa !15 + %6 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %6) #3 + store i32 1, i32* %.omp.stride, align 4, !tbaa !15 + %7 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %7) #3 + store i32 0, i32* %.omp.is_last, align 4, !tbaa !15 + %8 = bitcast i32* %j to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %8) #3 + %9 = load i32*, i32** %.global_tid..addr, align 8 + %10 = load i32, i32* %9, align 4, !tbaa !15 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %10, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + br label %omp.dispatch.cond + +omp.dispatch.cond: ; preds = %omp.dispatch.inc, %entry + %11 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp = icmp sgt i32 %11, 1023 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %omp.dispatch.cond + br label %cond.end + +cond.false: ; preds = %omp.dispatch.cond + %12 = load i32, i32* %.omp.ub, align 4, !tbaa 
!15 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 1023, %cond.true ], [ %12, %cond.false ] + store i32 %cond, i32* %.omp.ub, align 4, !tbaa !15 + %13 = load i32, i32* %.omp.lb, align 4, !tbaa !15 + store i32 %13, i32* %.omp.iv, align 4, !tbaa !15 + %14 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %15 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp1 = icmp sle i32 %14, %15 + br i1 %cmp1, label %omp.dispatch.body, label %omp.dispatch.cleanup + +omp.dispatch.cleanup: ; preds = %cond.end + br label %omp.dispatch.end + +omp.dispatch.body: ; preds = %cond.end + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %omp.dispatch.body + %16 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %17 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp2 = icmp sle i32 %16, %17 + br i1 %cmp2, label %omp.inner.for.body, label %omp.inner.for.cond.cleanup + +omp.inner.for.cond.cleanup: ; preds = %omp.inner.for.cond + br label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %18 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %mul = mul nsw i32 %18, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %j, align 4, !tbaa !15 + %19 = load i32*, i32** %1, align 8, !tbaa !11 + %20 = load i32, i32* %2, align 4, !tbaa !15 + %21 = load i32, i32* %j, align 4, !tbaa !15 + %add3 = add nsw i32 %20, %21 + %idxprom = sext i32 %add3 to i64 + %arrayidx = getelementptr inbounds i32, i32* %19, i64 %idxprom + %22 = load i32, i32* %arrayidx, align 4, !tbaa !15 + %23 = load i32*, i32** %0, align 8, !tbaa !11 + %24 = load i32, i32* %j, align 4, !tbaa !15 + %idxprom4 = sext i32 %24 to i64 + %arrayidx5 = getelementptr inbounds i32, i32* %23, i64 %idxprom4 + %25 = load i32, i32* %arrayidx5, align 4, !tbaa !15 + %add6 = add nsw i32 %25, %22 + store i32 %add6, i32* %arrayidx5, align 4, !tbaa !15 + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %26 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %add7 = add nsw i32 %26, 1 + store i32 %add7, i32* %.omp.iv, align 4, !tbaa !15 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond.cleanup + br label %omp.dispatch.inc + +omp.dispatch.inc: ; preds = %omp.inner.for.end + %27 = load i32, i32* %.omp.lb, align 4, !tbaa !15 + %28 = load i32, i32* %.omp.stride, align 4, !tbaa !15 + %add8 = add nsw i32 %27, %28 + store i32 %add8, i32* %.omp.lb, align 4, !tbaa !15 + %29 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %30 = load i32, i32* %.omp.stride, align 4, !tbaa !15 + %add9 = add nsw i32 %29, %30 + store i32 %add9, i32* %.omp.ub, align 4, !tbaa !15 + br label %omp.dispatch.cond + +omp.dispatch.end: ; preds = %omp.dispatch.cleanup + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %10) + %31 = bitcast i32* %j to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %31) #3 + %32 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %32) #3 + %33 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %33) #3 + %34 = bitcast i32* %.omp.ub to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %34) #3 + %35 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %35) #3 + %36 = bitcast i32* %.omp.iv to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %36) #3 + ret void +} + +declare void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, 
i32) + +declare void @__kmpc_for_static_fini(%struct.ident_t*, i32) + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1 + +; Function Attrs: norecurse nounwind +define internal void @.omp_TRegion._wrapper(i8* %shared_vars, i8* %private_vars) #0 { +entry: + %.addr = alloca i8*, align 8 + %.addr1 = alloca i8*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. = alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i8* %shared_vars, i8** %.addr, align 8, !tbaa !11 + store i8* %private_vars, i8** %.addr1, align 8, !tbaa !11 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) + store i32 %0, i32* %.threadid_temp., align 4, !tbaa !15 + %1 = bitcast i8* %private_vars to %omp.private.struct* + %2 = getelementptr inbounds %omp.private.struct, %omp.private.struct* %1, i32 0, i32 0 + %3 = load i32**, i32*** %2, align 1 + %4 = getelementptr inbounds %omp.private.struct, %omp.private.struct* %1, i32 0, i32 1 + %5 = load i32**, i32*** %4, align 1 + %6 = getelementptr inbounds %omp.private.struct, %omp.private.struct* %1, i32 0, i32 2 + %7 = load i32*, i32** %6, align 1 + call void @.omp_TRegion.(i32* %.threadid_temp., i32* %.zero.addr, i32** %3, i32** %5, i32* %7) #3 + ret void +} + +declare i32 @__kmpc_global_thread_num(%struct.ident_t*) + +declare !callback !17 void @__kmpc_target_region_kernel_parallel(i1, i1, void (i8*, i8*)* nocapture, i8* nocapture, i16, i8* nocapture readonly, i16, i1) + +declare void @__kmpc_target_region_kernel_deinit(i1, i1) + +; Function Attrs: norecurse nounwind +define weak void @__omp_offloading_18_29afeb5_parallel_loops_in_tregion_l10(i32* %A, i32* %B) #0 { +entry: + %A.addr = alloca i32*, align 8 + %B.addr = alloca i32*, align 8 + %.private.vars = alloca %omp.private.struct.0, align 8 + %.private.vars1 = alloca %omp.private.struct.1, align 8 + %.private.vars2 = alloca %omp.private.struct.2, align 8 + store i32* %A, i32** %A.addr, align 8, !tbaa !11 + store i32* %B, i32** %B.addr, align 8, !tbaa !11 + %0 = call i8 @__kmpc_target_region_kernel_init(i1 false, i1 true, i1 true, i1 true) + %1 = icmp eq i8 %0, 1 + br i1 %1, label %.execute, label %.exit + +.execute: ; preds = %entry + %2 = bitcast %omp.private.struct.0* %.private.vars to i8* + %3 = getelementptr inbounds %omp.private.struct.0, %omp.private.struct.0* %.private.vars, i32 0, i32 0 + store i32** %A.addr, i32*** %3 + %4 = getelementptr inbounds %omp.private.struct.0, %omp.private.struct.0* %.private.vars, i32 0, i32 1 + store i32** %B.addr, i32*** %4 + call void @__kmpc_target_region_kernel_parallel(i1 false, i1 true, void (i8*, i8*)* @.omp_TRegion.1_wrapper, i8* undef, i16 0, i8* %2, i16 16, i1 false) + %5 = bitcast %omp.private.struct.1* %.private.vars1 to i8* + %6 = getelementptr inbounds %omp.private.struct.1, %omp.private.struct.1* %.private.vars1, i32 0, i32 0 + store i32** %A.addr, i32*** %6 + %7 = getelementptr inbounds %omp.private.struct.1, %omp.private.struct.1* %.private.vars1, i32 0, i32 1 + store i32** %B.addr, i32*** %7 + call void @__kmpc_target_region_kernel_parallel(i1 false, i1 true, void (i8*, i8*)* @.omp_TRegion.2_wrapper, i8* undef, i16 0, i8* %5, i16 16, i1 false) + %8 = bitcast %omp.private.struct.2* %.private.vars2 to i8* + %9 = getelementptr inbounds %omp.private.struct.2, %omp.private.struct.2* %.private.vars2, i32 0, i32 0 + store i32** %A.addr, i32*** %9 + %10 = getelementptr inbounds %omp.private.struct.2, %omp.private.struct.2* %.private.vars2, i32 0, i32 1 + store i32** %B.addr, i32*** %10 + call 
void @__kmpc_target_region_kernel_parallel(i1 false, i1 true, void (i8*, i8*)* @.omp_TRegion.3_wrapper, i8* undef, i16 0, i8* %8, i16 16, i1 false) + br label %.omp.deinit + +.omp.deinit: ; preds = %.execute + call void @__kmpc_target_region_kernel_deinit(i1 false, i1 true) + br label %.exit + +.exit: ; preds = %.omp.deinit, %entry + ret void +} + +; Function Attrs: norecurse nounwind +define internal void @.omp_TRegion.1(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32** dereferenceable(8) %A, i32** dereferenceable(8) %B) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %A.addr = alloca i32**, align 8 + %B.addr = alloca i32**, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %j = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8, !tbaa !11 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8, !tbaa !11 + store i32** %A, i32*** %A.addr, align 8, !tbaa !11 + store i32** %B, i32*** %B.addr, align 8, !tbaa !11 + %0 = load i32**, i32*** %A.addr, align 8, !tbaa !11 + %1 = load i32**, i32*** %B.addr, align 8, !tbaa !11 + %2 = bitcast i32* %.omp.iv to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %2) #3 + %3 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %3) #3 + store i32 0, i32* %.omp.lb, align 4, !tbaa !15 + %4 = bitcast i32* %.omp.ub to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %4) #3 + store i32 1023, i32* %.omp.ub, align 4, !tbaa !15 + %5 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %5) #3 + store i32 1, i32* %.omp.stride, align 4, !tbaa !15 + %6 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %6) #3 + store i32 0, i32* %.omp.is_last, align 4, !tbaa !15 + %7 = bitcast i32* %j to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %7) #3 + %8 = load i32*, i32** %.global_tid..addr, align 8 + %9 = load i32, i32* %8, align 4, !tbaa !15 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %9, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + br label %omp.dispatch.cond + +omp.dispatch.cond: ; preds = %omp.dispatch.inc, %entry + %10 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp = icmp sgt i32 %10, 1023 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %omp.dispatch.cond + br label %cond.end + +cond.false: ; preds = %omp.dispatch.cond + %11 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 1023, %cond.true ], [ %11, %cond.false ] + store i32 %cond, i32* %.omp.ub, align 4, !tbaa !15 + %12 = load i32, i32* %.omp.lb, align 4, !tbaa !15 + store i32 %12, i32* %.omp.iv, align 4, !tbaa !15 + %13 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %14 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp1 = icmp sle i32 %13, %14 + br i1 %cmp1, label %omp.dispatch.body, label %omp.dispatch.cleanup + +omp.dispatch.cleanup: ; preds = %cond.end + br label %omp.dispatch.end + +omp.dispatch.body: ; preds = %cond.end + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %omp.dispatch.body + %15 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %16 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp2 = icmp sle i32 %15, %16 + br i1 %cmp2, 
label %omp.inner.for.body, label %omp.inner.for.cond.cleanup + +omp.inner.for.cond.cleanup: ; preds = %omp.inner.for.cond + br label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %17 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %mul = mul nsw i32 %17, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %j, align 4, !tbaa !15 + %18 = load i32*, i32** %1, align 8, !tbaa !11 + %19 = load i32, i32* %j, align 4, !tbaa !15 + %add3 = add nsw i32 0, %19 + %idxprom = sext i32 %add3 to i64 + %arrayidx = getelementptr inbounds i32, i32* %18, i64 %idxprom + %20 = load i32, i32* %arrayidx, align 4, !tbaa !15 + %21 = load i32*, i32** %0, align 8, !tbaa !11 + %22 = load i32, i32* %j, align 4, !tbaa !15 + %idxprom4 = sext i32 %22 to i64 + %arrayidx5 = getelementptr inbounds i32, i32* %21, i64 %idxprom4 + %23 = load i32, i32* %arrayidx5, align 4, !tbaa !15 + %add6 = add nsw i32 %23, %20 + store i32 %add6, i32* %arrayidx5, align 4, !tbaa !15 + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %24 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %add7 = add nsw i32 %24, 1 + store i32 %add7, i32* %.omp.iv, align 4, !tbaa !15 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond.cleanup + br label %omp.dispatch.inc + +omp.dispatch.inc: ; preds = %omp.inner.for.end + %25 = load i32, i32* %.omp.lb, align 4, !tbaa !15 + %26 = load i32, i32* %.omp.stride, align 4, !tbaa !15 + %add8 = add nsw i32 %25, %26 + store i32 %add8, i32* %.omp.lb, align 4, !tbaa !15 + %27 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %28 = load i32, i32* %.omp.stride, align 4, !tbaa !15 + %add9 = add nsw i32 %27, %28 + store i32 %add9, i32* %.omp.ub, align 4, !tbaa !15 + br label %omp.dispatch.cond + +omp.dispatch.end: ; preds = %omp.dispatch.cleanup + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %9) + %29 = bitcast i32* %j to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %29) #3 + %30 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %30) #3 + %31 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %31) #3 + %32 = bitcast i32* %.omp.ub to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %32) #3 + %33 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %33) #3 + %34 = bitcast i32* %.omp.iv to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %34) #3 + ret void +} + +; Function Attrs: norecurse nounwind +define internal void @.omp_TRegion.1_wrapper(i8* %shared_vars, i8* %private_vars) #0 { +entry: + %.addr = alloca i8*, align 8 + %.addr1 = alloca i8*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. 
= alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i8* %shared_vars, i8** %.addr, align 8, !tbaa !11 + store i8* %private_vars, i8** %.addr1, align 8, !tbaa !11 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) + store i32 %0, i32* %.threadid_temp., align 4, !tbaa !15 + %1 = bitcast i8* %private_vars to %omp.private.struct.0* + %2 = getelementptr inbounds %omp.private.struct.0, %omp.private.struct.0* %1, i32 0, i32 0 + %3 = load i32**, i32*** %2, align 1 + %4 = getelementptr inbounds %omp.private.struct.0, %omp.private.struct.0* %1, i32 0, i32 1 + %5 = load i32**, i32*** %4, align 1 + call void @.omp_TRegion.1(i32* %.threadid_temp., i32* %.zero.addr, i32** %3, i32** %5) #3 + ret void +} + +; Function Attrs: norecurse nounwind +define internal void @.omp_TRegion.2(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32** dereferenceable(8) %A, i32** dereferenceable(8) %B) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %A.addr = alloca i32**, align 8 + %B.addr = alloca i32**, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %j = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8, !tbaa !11 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8, !tbaa !11 + store i32** %A, i32*** %A.addr, align 8, !tbaa !11 + store i32** %B, i32*** %B.addr, align 8, !tbaa !11 + %0 = load i32**, i32*** %A.addr, align 8, !tbaa !11 + %1 = load i32**, i32*** %B.addr, align 8, !tbaa !11 + %2 = bitcast i32* %.omp.iv to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %2) #3 + %3 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %3) #3 + store i32 0, i32* %.omp.lb, align 4, !tbaa !15 + %4 = bitcast i32* %.omp.ub to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %4) #3 + store i32 1023, i32* %.omp.ub, align 4, !tbaa !15 + %5 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %5) #3 + store i32 1, i32* %.omp.stride, align 4, !tbaa !15 + %6 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %6) #3 + store i32 0, i32* %.omp.is_last, align 4, !tbaa !15 + %7 = bitcast i32* %j to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %7) #3 + %8 = load i32*, i32** %.global_tid..addr, align 8 + %9 = load i32, i32* %8, align 4, !tbaa !15 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %9, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + br label %omp.dispatch.cond + +omp.dispatch.cond: ; preds = %omp.dispatch.inc, %entry + %10 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp = icmp sgt i32 %10, 1023 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %omp.dispatch.cond + br label %cond.end + +cond.false: ; preds = %omp.dispatch.cond + %11 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 1023, %cond.true ], [ %11, %cond.false ] + store i32 %cond, i32* %.omp.ub, align 4, !tbaa !15 + %12 = load i32, i32* %.omp.lb, align 4, !tbaa !15 + store i32 %12, i32* %.omp.iv, align 4, !tbaa !15 + %13 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %14 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp1 = icmp sle i32 %13, %14 + br i1 %cmp1, label %omp.dispatch.body, label 
%omp.dispatch.cleanup + +omp.dispatch.cleanup: ; preds = %cond.end + br label %omp.dispatch.end + +omp.dispatch.body: ; preds = %cond.end + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %omp.dispatch.body + %15 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %16 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp2 = icmp sle i32 %15, %16 + br i1 %cmp2, label %omp.inner.for.body, label %omp.inner.for.cond.cleanup + +omp.inner.for.cond.cleanup: ; preds = %omp.inner.for.cond + br label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %17 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %mul = mul nsw i32 %17, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %j, align 4, !tbaa !15 + %18 = load i32*, i32** %1, align 8, !tbaa !11 + %19 = load i32, i32* %j, align 4, !tbaa !15 + %add3 = add nsw i32 1, %19 + %idxprom = sext i32 %add3 to i64 + %arrayidx = getelementptr inbounds i32, i32* %18, i64 %idxprom + %20 = load i32, i32* %arrayidx, align 4, !tbaa !15 + %21 = load i32*, i32** %0, align 8, !tbaa !11 + %22 = load i32, i32* %j, align 4, !tbaa !15 + %idxprom4 = sext i32 %22 to i64 + %arrayidx5 = getelementptr inbounds i32, i32* %21, i64 %idxprom4 + %23 = load i32, i32* %arrayidx5, align 4, !tbaa !15 + %add6 = add nsw i32 %23, %20 + store i32 %add6, i32* %arrayidx5, align 4, !tbaa !15 + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %24 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %add7 = add nsw i32 %24, 1 + store i32 %add7, i32* %.omp.iv, align 4, !tbaa !15 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond.cleanup + br label %omp.dispatch.inc + +omp.dispatch.inc: ; preds = %omp.inner.for.end + %25 = load i32, i32* %.omp.lb, align 4, !tbaa !15 + %26 = load i32, i32* %.omp.stride, align 4, !tbaa !15 + %add8 = add nsw i32 %25, %26 + store i32 %add8, i32* %.omp.lb, align 4, !tbaa !15 + %27 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %28 = load i32, i32* %.omp.stride, align 4, !tbaa !15 + %add9 = add nsw i32 %27, %28 + store i32 %add9, i32* %.omp.ub, align 4, !tbaa !15 + br label %omp.dispatch.cond + +omp.dispatch.end: ; preds = %omp.dispatch.cleanup + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %9) + %29 = bitcast i32* %j to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %29) #3 + %30 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %30) #3 + %31 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %31) #3 + %32 = bitcast i32* %.omp.ub to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %32) #3 + %33 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %33) #3 + %34 = bitcast i32* %.omp.iv to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %34) #3 + ret void +} + +; Function Attrs: norecurse nounwind +define internal void @.omp_TRegion.2_wrapper(i8* %shared_vars, i8* %private_vars) #0 { +entry: + %.addr = alloca i8*, align 8 + %.addr1 = alloca i8*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. 
= alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i8* %shared_vars, i8** %.addr, align 8, !tbaa !11 + store i8* %private_vars, i8** %.addr1, align 8, !tbaa !11 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) + store i32 %0, i32* %.threadid_temp., align 4, !tbaa !15 + %1 = bitcast i8* %private_vars to %omp.private.struct.1* + %2 = getelementptr inbounds %omp.private.struct.1, %omp.private.struct.1* %1, i32 0, i32 0 + %3 = load i32**, i32*** %2, align 1 + %4 = getelementptr inbounds %omp.private.struct.1, %omp.private.struct.1* %1, i32 0, i32 1 + %5 = load i32**, i32*** %4, align 1 + call void @.omp_TRegion.2(i32* %.threadid_temp., i32* %.zero.addr, i32** %3, i32** %5) #3 + ret void +} + +; Function Attrs: norecurse nounwind +define internal void @.omp_TRegion.3(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32** dereferenceable(8) %A, i32** dereferenceable(8) %B) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %A.addr = alloca i32**, align 8 + %B.addr = alloca i32**, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %j = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8, !tbaa !11 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8, !tbaa !11 + store i32** %A, i32*** %A.addr, align 8, !tbaa !11 + store i32** %B, i32*** %B.addr, align 8, !tbaa !11 + %0 = load i32**, i32*** %A.addr, align 8, !tbaa !11 + %1 = load i32**, i32*** %B.addr, align 8, !tbaa !11 + %2 = bitcast i32* %.omp.iv to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %2) #3 + %3 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %3) #3 + store i32 0, i32* %.omp.lb, align 4, !tbaa !15 + %4 = bitcast i32* %.omp.ub to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %4) #3 + store i32 1023, i32* %.omp.ub, align 4, !tbaa !15 + %5 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %5) #3 + store i32 1, i32* %.omp.stride, align 4, !tbaa !15 + %6 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %6) #3 + store i32 0, i32* %.omp.is_last, align 4, !tbaa !15 + %7 = bitcast i32* %j to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %7) #3 + %8 = load i32*, i32** %.global_tid..addr, align 8 + %9 = load i32, i32* %8, align 4, !tbaa !15 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %9, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + br label %omp.dispatch.cond + +omp.dispatch.cond: ; preds = %omp.dispatch.inc, %entry + %10 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp = icmp sgt i32 %10, 1023 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %omp.dispatch.cond + br label %cond.end + +cond.false: ; preds = %omp.dispatch.cond + %11 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 1023, %cond.true ], [ %11, %cond.false ] + store i32 %cond, i32* %.omp.ub, align 4, !tbaa !15 + %12 = load i32, i32* %.omp.lb, align 4, !tbaa !15 + store i32 %12, i32* %.omp.iv, align 4, !tbaa !15 + %13 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %14 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp1 = icmp sle i32 %13, %14 + br i1 %cmp1, label %omp.dispatch.body, label 
%omp.dispatch.cleanup + +omp.dispatch.cleanup: ; preds = %cond.end + br label %omp.dispatch.end + +omp.dispatch.body: ; preds = %cond.end + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %omp.dispatch.body + %15 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %16 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %cmp2 = icmp sle i32 %15, %16 + br i1 %cmp2, label %omp.inner.for.body, label %omp.inner.for.cond.cleanup + +omp.inner.for.cond.cleanup: ; preds = %omp.inner.for.cond + br label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %17 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %mul = mul nsw i32 %17, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %j, align 4, !tbaa !15 + %18 = load i32*, i32** %1, align 8, !tbaa !11 + %19 = load i32, i32* %j, align 4, !tbaa !15 + %add3 = add nsw i32 2, %19 + %idxprom = sext i32 %add3 to i64 + %arrayidx = getelementptr inbounds i32, i32* %18, i64 %idxprom + %20 = load i32, i32* %arrayidx, align 4, !tbaa !15 + %21 = load i32*, i32** %0, align 8, !tbaa !11 + %22 = load i32, i32* %j, align 4, !tbaa !15 + %idxprom4 = sext i32 %22 to i64 + %arrayidx5 = getelementptr inbounds i32, i32* %21, i64 %idxprom4 + %23 = load i32, i32* %arrayidx5, align 4, !tbaa !15 + %add6 = add nsw i32 %23, %20 + store i32 %add6, i32* %arrayidx5, align 4, !tbaa !15 + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %24 = load i32, i32* %.omp.iv, align 4, !tbaa !15 + %add7 = add nsw i32 %24, 1 + store i32 %add7, i32* %.omp.iv, align 4, !tbaa !15 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond.cleanup + br label %omp.dispatch.inc + +omp.dispatch.inc: ; preds = %omp.inner.for.end + %25 = load i32, i32* %.omp.lb, align 4, !tbaa !15 + %26 = load i32, i32* %.omp.stride, align 4, !tbaa !15 + %add8 = add nsw i32 %25, %26 + store i32 %add8, i32* %.omp.lb, align 4, !tbaa !15 + %27 = load i32, i32* %.omp.ub, align 4, !tbaa !15 + %28 = load i32, i32* %.omp.stride, align 4, !tbaa !15 + %add9 = add nsw i32 %27, %28 + store i32 %add9, i32* %.omp.ub, align 4, !tbaa !15 + br label %omp.dispatch.cond + +omp.dispatch.end: ; preds = %omp.dispatch.cleanup + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %9) + %29 = bitcast i32* %j to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %29) #3 + %30 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %30) #3 + %31 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %31) #3 + %32 = bitcast i32* %.omp.ub to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %32) #3 + %33 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %33) #3 + %34 = bitcast i32* %.omp.iv to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %34) #3 + ret void +} + +; Function Attrs: norecurse nounwind +define internal void @.omp_TRegion.3_wrapper(i8* %shared_vars, i8* %private_vars) #0 { +entry: + %.addr = alloca i8*, align 8 + %.addr1 = alloca i8*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. 
= alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i8* %shared_vars, i8** %.addr, align 8, !tbaa !11 + store i8* %private_vars, i8** %.addr1, align 8, !tbaa !11 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) + store i32 %0, i32* %.threadid_temp., align 4, !tbaa !15 + %1 = bitcast i8* %private_vars to %omp.private.struct.2* + %2 = getelementptr inbounds %omp.private.struct.2, %omp.private.struct.2* %1, i32 0, i32 0 + %3 = load i32**, i32*** %2, align 1 + %4 = getelementptr inbounds %omp.private.struct.2, %omp.private.struct.2* %1, i32 0, i32 1 + %5 = load i32**, i32*** %4, align 1 + call void @.omp_TRegion.3(i32* %.threadid_temp., i32* %.zero.addr, i32** %3, i32** %5) #3 + ret void +} + +; Function Attrs: norecurse nounwind +define weak void @__omp_offloading_18_29afeb5_parallel_loop_in_function_in_loop_in_tregion_l30(i32* %A, i32* %B) #0 { +entry: + %A.addr = alloca i32*, align 8 + %B.addr = alloca i32*, align 8 + %i = alloca i32, align 4 + store i32* %A, i32** %A.addr, align 8, !tbaa !11 + store i32* %B, i32** %B.addr, align 8, !tbaa !11 + %0 = call i8 @__kmpc_target_region_kernel_init(i1 false, i1 true, i1 true, i1 true) + %1 = icmp eq i8 %0, 1 + br i1 %1, label %.execute, label %.exit + +.execute: ; preds = %entry + %2 = bitcast i32* %i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %2) #3 + store i32 0, i32* %i, align 4, !tbaa !15 + br label %for.cond + +for.cond: ; preds = %for.inc, %.execute + %3 = load i32, i32* %i, align 4, !tbaa !15 + %cmp = icmp slt i32 %3, 512 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond + %4 = bitcast i32* %i to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %4) #3 + br label %for.end + +for.body: ; preds = %for.cond + %5 = load i32*, i32** %A.addr, align 8, !tbaa !11 + %6 = load i32*, i32** %B.addr, align 8, !tbaa !11 + %7 = load i32, i32* %i, align 4, !tbaa !15 + call void @parallel_loop1(i32* %5, i32* %6, i32 %7) + br label %for.inc + +for.inc: ; preds = %for.body + %8 = load i32, i32* %i, align 4, !tbaa !15 + %inc = add nsw i32 %8, 1 + store i32 %inc, i32* %i, align 4, !tbaa !15 + br label %for.cond + +for.end: ; preds = %for.cond.cleanup + br label %.omp.deinit + +.omp.deinit: ; preds = %for.end + call void @__kmpc_target_region_kernel_deinit(i1 false, i1 true) + br label %.exit + +.exit: ; preds = %.omp.deinit, %entry + ret void +} + +; Function Attrs: nounwind +define internal void @parallel_loop1(i32* %A, i32* %B, i32 %i) #2 { +entry: + %A.addr = alloca i32*, align 8 + %B.addr = alloca i32*, align 8 + %i.addr = alloca i32, align 4 + %.private.vars = alloca %omp.private.struct.3, align 8 + store i32* %A, i32** %A.addr, align 8, !tbaa !11 + store i32* %B, i32** %B.addr, align 8, !tbaa !11 + store i32 %i, i32* %i.addr, align 4, !tbaa !15 + %0 = bitcast %omp.private.struct.3* %.private.vars to i8* + %1 = getelementptr inbounds %omp.private.struct.3, %omp.private.struct.3* %.private.vars, i32 0, i32 0 + store i32** %A.addr, i32*** %1 + %2 = getelementptr inbounds %omp.private.struct.3, %omp.private.struct.3* %.private.vars, i32 0, i32 1 + store i32** %B.addr, i32*** %2 + %3 = getelementptr inbounds %omp.private.struct.3, %omp.private.struct.3* %.private.vars, i32 0, i32 2 + store i32* %i.addr, i32** %3 + call void @__kmpc_target_region_kernel_parallel(i1 false, i1 true, void (i8*, i8*)* @.omp_TRegion.4_wrapper, i8* undef, i16 0, i8* %0, i16 24, i1 false) + ret void +} +; Function Attrs: nounwind +define internal void @parallel_loop2(i32* %A, i32* %B, i32 %i) 
+; Function Attrs: nounwind
+define internal void @parallel_loop2(i32* %A, i32* %B, i32 %i) #2 {
+entry:
+  %A.addr = alloca i32*, align 8
+  %B.addr = alloca i32*, align 8
+  %i.addr = alloca i32, align 4
+  %.private.vars = alloca %omp.private.struct.3, align 8
+  store i32* %A, i32** %A.addr, align 8, !tbaa !11
+  store i32* %B, i32** %B.addr, align 8, !tbaa !11
+  store i32 %i, i32* %i.addr, align 4, !tbaa !15
+  %0 = bitcast %omp.private.struct.3* %.private.vars to i8*
+  %1 = getelementptr inbounds %omp.private.struct.3, %omp.private.struct.3* %.private.vars, i32 0, i32 0
+  store i32** %A.addr, i32*** %1
+  %2 = getelementptr inbounds %omp.private.struct.3, %omp.private.struct.3* %.private.vars, i32 0, i32 1
+  store i32** %B.addr, i32*** %2
+  %3 = getelementptr inbounds %omp.private.struct.3, %omp.private.struct.3* %.private.vars, i32 0, i32 2
+  store i32* %i.addr, i32** %3
+  call void @__kmpc_target_region_kernel_parallel(i1 false, i1 true, void (i8*, i8*)* @.omp_TRegion.4_wrapper, i8* undef, i16 0, i8* %0, i16 24, i1 false)
+  ret void
+}
+
+; Function Attrs: norecurse nounwind
+define weak void @__omp_offloading_18_29afeb5_parallel_loops_in_functions_in_tregion_l36(i32* %A, i32* %B) #0 {
+entry:
+  %A.addr = alloca i32*, align 8
+  %B.addr = alloca i32*, align 8
+  store i32* %A, i32** %A.addr, align 8, !tbaa !11
+  store i32* %B, i32** %B.addr, align 8, !tbaa !11
+  %0 = call i8 @__kmpc_target_region_kernel_init(i1 false, i1 true, i1 true, i1 true)
+  %1 = icmp eq i8 %0, 1
+  br i1 %1, label %.execute, label %.exit
+
+.execute:                                         ; preds = %entry
+  %2 = load i32*, i32** %A.addr, align 8, !tbaa !11
+  %3 = load i32*, i32** %B.addr, align 8, !tbaa !11
+  call void @parallel_loop2(i32* %2, i32* %3, i32 1)
+  %4 = load i32*, i32** %A.addr, align 8, !tbaa !11
+  %5 = load i32*, i32** %B.addr, align 8, !tbaa !11
+  call void @parallel_loop2(i32* %4, i32* %5, i32 2)
+  %6 = load i32*, i32** %A.addr, align 8, !tbaa !11
+  %7 = load i32*, i32** %B.addr, align 8, !tbaa !11
+  call void @parallel_loop2(i32* %6, i32* %7, i32 3)
+  br label %.omp.deinit
+
+.omp.deinit:                                      ; preds = %.execute
+  call void @__kmpc_target_region_kernel_deinit(i1 false, i1 true)
+  br label %.exit
+
+.exit:                                            ; preds = %.omp.deinit, %entry
+  ret void
+}
+
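+; Outlined body of the parallel regions above: a statically scheduled
+; worksharing loop (__kmpc_for_static_init_4 / __kmpc_for_static_fini) over
+; 1024 iterations that computes A[j] += B[i + j].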
+; Function Attrs: norecurse nounwind
+define internal void @.omp_TRegion.4(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32** dereferenceable(8) %A, i32** dereferenceable(8) %B, i32* dereferenceable(4) %i) #0 {
+entry:
+  %.global_tid..addr = alloca i32*, align 8
+  %.bound_tid..addr = alloca i32*, align 8
+  %A.addr = alloca i32**, align 8
+  %B.addr = alloca i32**, align 8
+  %i.addr = alloca i32*, align 8
+  %.omp.iv = alloca i32, align 4
+  %tmp = alloca i32, align 4
+  %.omp.lb = alloca i32, align 4
+  %.omp.ub = alloca i32, align 4
+  %.omp.stride = alloca i32, align 4
+  %.omp.is_last = alloca i32, align 4
+  %j = alloca i32, align 4
+  store i32* %.global_tid., i32** %.global_tid..addr, align 8, !tbaa !11
+  store i32* %.bound_tid., i32** %.bound_tid..addr, align 8, !tbaa !11
+  store i32** %A, i32*** %A.addr, align 8, !tbaa !11
+  store i32** %B, i32*** %B.addr, align 8, !tbaa !11
+  store i32* %i, i32** %i.addr, align 8, !tbaa !11
+  %0 = load i32**, i32*** %A.addr, align 8, !tbaa !11
+  %1 = load i32**, i32*** %B.addr, align 8, !tbaa !11
+  %2 = load i32*, i32** %i.addr, align 8, !tbaa !11
+  %3 = bitcast i32* %.omp.iv to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %3) #3
+  %4 = bitcast i32* %.omp.lb to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %4) #3
+  store i32 0, i32* %.omp.lb, align 4, !tbaa !15
+  %5 = bitcast i32* %.omp.ub to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %5) #3
+  store i32 1023, i32* %.omp.ub, align 4, !tbaa !15
+  %6 = bitcast i32* %.omp.stride to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %6) #3
+  store i32 1, i32* %.omp.stride, align 4, !tbaa !15
+  %7 = bitcast i32* %.omp.is_last to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %7) #3
+  store i32 0, i32* %.omp.is_last, align 4, !tbaa !15
+  %8 = bitcast i32* %j to i8*
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %8) #3
+  %9 = load i32*, i32** %.global_tid..addr, align 8
+  %10 = load i32, i32* %9, align 4, !tbaa !15
+  call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %10, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1)
+  br label %omp.dispatch.cond
+
+omp.dispatch.cond:                                ; preds = %omp.dispatch.inc, %entry
+  %11 = load i32, i32* %.omp.ub, align 4, !tbaa !15
+  %cmp = icmp sgt i32 %11, 1023
+  br i1 %cmp, label %cond.true, label %cond.false
+
+cond.true:                                        ; preds = %omp.dispatch.cond
+  br label %cond.end
+
+cond.false:                                       ; preds = %omp.dispatch.cond
+  %12 = load i32, i32* %.omp.ub, align 4, !tbaa !15
+  br label %cond.end
+
+cond.end:                                         ; preds = %cond.false, %cond.true
+  %cond = phi i32 [ 1023, %cond.true ], [ %12, %cond.false ]
+  store i32 %cond, i32* %.omp.ub, align 4, !tbaa !15
+  %13 = load i32, i32* %.omp.lb, align 4, !tbaa !15
+  store i32 %13, i32* %.omp.iv, align 4, !tbaa !15
+  %14 = load i32, i32* %.omp.iv, align 4, !tbaa !15
+  %15 = load i32, i32* %.omp.ub, align 4, !tbaa !15
+  %cmp1 = icmp sle i32 %14, %15
+  br i1 %cmp1, label %omp.dispatch.body, label %omp.dispatch.cleanup
+
+omp.dispatch.cleanup:                             ; preds = %cond.end
+  br label %omp.dispatch.end
+
+omp.dispatch.body:                                ; preds = %cond.end
+  br label %omp.inner.for.cond
+
+omp.inner.for.cond:                               ; preds = %omp.inner.for.inc, %omp.dispatch.body
+  %16 = load i32, i32* %.omp.iv, align 4, !tbaa !15
+  %17 = load i32, i32* %.omp.ub, align 4, !tbaa !15
+  %cmp2 = icmp sle i32 %16, %17
+  br i1 %cmp2, label %omp.inner.for.body, label %omp.inner.for.cond.cleanup
+
+omp.inner.for.cond.cleanup:                       ; preds = %omp.inner.for.cond
+  br label %omp.inner.for.end
+
+omp.inner.for.body:                               ; preds = %omp.inner.for.cond
+  %18 = load i32, i32* %.omp.iv, align 4, !tbaa !15
+  %mul = mul nsw i32 %18, 1
+  %add = add nsw i32 0, %mul
+  store i32 %add, i32* %j, align 4, !tbaa !15
+  %19 = load i32*, i32** %1, align 8, !tbaa !11
+  %20 = load i32, i32* %2, align 4, !tbaa !15
+  %21 = load i32, i32* %j, align 4, !tbaa !15
+  %add3 = add nsw i32 %20, %21
+  %idxprom = sext i32 %add3 to i64
+  %arrayidx = getelementptr inbounds i32, i32* %19, i64 %idxprom
+  %22 = load i32, i32* %arrayidx, align 4, !tbaa !15
+  %23 = load i32*, i32** %0, align 8, !tbaa !11
+  %24 = load i32, i32* %j, align 4, !tbaa !15
+  %idxprom4 = sext i32 %24 to i64
+  %arrayidx5 = getelementptr inbounds i32, i32* %23, i64 %idxprom4
+  %25 = load i32, i32* %arrayidx5, align 4, !tbaa !15
+  %add6 = add nsw i32 %25, %22
+  store i32 %add6, i32* %arrayidx5, align 4, !tbaa !15
+  br label %omp.body.continue
+
+omp.body.continue:                                ; preds = %omp.inner.for.body
+  br label %omp.inner.for.inc
+
+omp.inner.for.inc:                                ; preds = %omp.body.continue
+  %26 = load i32, i32* %.omp.iv, align 4, !tbaa !15
+  %add7 = add nsw i32 %26, 1
+  store i32 %add7, i32* %.omp.iv, align 4, !tbaa !15
+  br label %omp.inner.for.cond
+
+omp.inner.for.end:                                ; preds = %omp.inner.for.cond.cleanup
+  br label %omp.dispatch.inc
+
+omp.dispatch.inc:                                 ; preds = %omp.inner.for.end
+  %27 = load i32, i32* %.omp.lb, align 4, !tbaa !15
+  %28 = load i32, i32* %.omp.stride, align 4, !tbaa !15
+  %add8 = add nsw i32 %27, %28
+  store i32 %add8, i32* %.omp.lb, align 4, !tbaa !15
+  %29 = load i32, i32* %.omp.ub, align 4, !tbaa !15
+  %30 = load i32, i32* %.omp.stride, align 4, !tbaa !15
+  %add9 = add nsw i32 %29, %30
+  store i32 %add9, i32* %.omp.ub, align 4, !tbaa !15
+  br label %omp.dispatch.cond
+
+omp.dispatch.end:                                 ; preds = %omp.dispatch.cleanup
+  call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %10)
+  %31 = bitcast i32* %j to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %31) #3
+  %32 = bitcast i32* %.omp.is_last to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %32) #3
+  %33 = bitcast i32* %.omp.stride to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %33) #3
+  %34 = bitcast i32* %.omp.ub to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %34) #3
+  %35 = bitcast i32* %.omp.lb to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %35) #3
+  %36 = bitcast i32* %.omp.iv to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %36) #3
+  ret void
+}
+
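+; Companion wrapper for @.omp_TRegion.4: recovers %A, %B, and %i from the
+; packed private-variable struct before calling the outlined function.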
+; Function Attrs: norecurse nounwind
+define internal void @.omp_TRegion.4_wrapper(i8* %shared_vars, i8* %private_vars) #0 {
+entry:
+  %.addr = alloca i8*, align 8
+  %.addr1 = alloca i8*, align 8
+  %.zero.addr = alloca i32, align 4
+  %.threadid_temp. = alloca i32, align 4
+  store i32 0, i32* %.zero.addr, align 4
+  store i8* %shared_vars, i8** %.addr, align 8, !tbaa !11
+  store i8* %private_vars, i8** %.addr1, align 8, !tbaa !11
+  %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
+  store i32 %0, i32* %.threadid_temp., align 4, !tbaa !15
+  %1 = bitcast i8* %private_vars to %omp.private.struct.3*
+  %2 = getelementptr inbounds %omp.private.struct.3, %omp.private.struct.3* %1, i32 0, i32 0
+  %3 = load i32**, i32*** %2, align 1
+  %4 = getelementptr inbounds %omp.private.struct.3, %omp.private.struct.3* %1, i32 0, i32 1
+  %5 = load i32**, i32*** %4, align 1
+  %6 = getelementptr inbounds %omp.private.struct.3, %omp.private.struct.3* %1, i32 0, i32 2
+  %7 = load i32*, i32** %6, align 1
+  call void @.omp_TRegion.4(i32* %.threadid_temp., i32* %.zero.addr, i32** %3, i32** %5, i32* %7) #3
+  ret void
+}
+
+attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { argmemonly nounwind }
+attributes #2 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nounwind }
+
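+; Module metadata: offload-entry descriptors (!0-!3), NVVM kernel annotations
+; for the four target regions (!4-!7), and the TBAA nodes (!11-!16) referenced
+; throughout the functions above.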
+!omp_offload.info = !{!0, !1, !2, !3}
+!nvvm.annotations = !{!4, !5, !6, !7}
+!llvm.module.flags = !{!8, !9}
+!llvm.ident = !{!10}
+
+!0 = !{i32 0, i32 24, i32 43712181, !"parallel_loop_in_loop_in_tregion", i32 2, i32 0}
+!1 = !{i32 0, i32 24, i32 43712181, !"parallel_loops_in_functions_in_tregion", i32 36, i32 3}
+!2 = !{i32 0, i32 24, i32 43712181, !"parallel_loops_in_tregion", i32 10, i32 1}
+!3 = !{i32 0, i32 24, i32 43712181, !"parallel_loop_in_function_in_loop_in_tregion", i32 30, i32 2}
+!4 = !{void (i32*, i32*)* @__omp_offloading_18_29afeb5_parallel_loop_in_loop_in_tregion_l2, !"kernel", i32 1}
+!5 = !{void (i32*, i32*)* @__omp_offloading_18_29afeb5_parallel_loops_in_tregion_l10, !"kernel", i32 1}
+!6 = !{void (i32*, i32*)* @__omp_offloading_18_29afeb5_parallel_loop_in_function_in_loop_in_tregion_l30, !"kernel", i32 1}
+!7 = !{void (i32*, i32*)* @__omp_offloading_18_29afeb5_parallel_loops_in_functions_in_tregion_l36, !"kernel", i32 1}
+!8 = !{i32 1, !"wchar_size", i32 4}
+!9 = !{i32 7, !"PIC Level", i32 2}
+!10 = !{!"clang version 9.0.0 "}
+!11 = !{!12, !12, i64 0}
+!12 = !{!"any pointer", !13, i64 0}
+!13 = !{!"omnipotent char", !14, i64 0}
+!14 = !{!"Simple C/C++ TBAA"}
+!15 = !{!16, !16, i64 0}
+!16 = !{!"int", !13, i64 0}
+!17 = !{!18}
+!18 = !{i64 2, i64 3, i64 5, i1 false}