Index: SPMD_examples/v0.1/target_offload_is_SPMD.c
===================================================================
--- /dev/null
+++ SPMD_examples/v0.1/target_offload_is_SPMD.c
@@ -0,0 +1,42 @@
+#include <omp.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define N 10
+#define TEAMS 3
+
+void foo(int* dis, int *team) {
+
+  #pragma omp target teams num_teams(TEAMS) map(tofrom:dis[:N],team[:N])
+  {
+    #pragma omp distribute parallel for
+    for (int i = 0; i < N; i++)
+      #pragma omp atomic
+      dis[i] += 1; // <- Increment dis[0:N] from i to i+1
+
+    #pragma omp distribute parallel for
+    for (int i = 0; i < N; i++)
+      #pragma omp atomic
+      dis[i] += 1; // <- Increment dis[0:N] from i+1 to i+2
+
+
+    team[omp_get_team_num()] += 1;
+  }
+}
+
+int main() {
+  int dis[N], team[N];
+
+  for (int i = 0; i < N; i++) {
+    dis[i] = i;
+    team[i] = 0;
+  }
+
+  foo(dis, team);
+
+  for (int i = 0; i < N; i++) {
+    printf("dis[%3i] = %4i\t\tteam[%3i] = %4i\n", i, dis[i], i, team[i]);
+  }
+
+  return 0;
+}
Index: SPMD_examples/v0.1/target_offload_not_SPMD.c
===================================================================
--- /dev/null
+++ SPMD_examples/v0.1/target_offload_not_SPMD.c
@@ -0,0 +1,42 @@
+#include <omp.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define N 10
+#define TEAMS 3
+
+void foo(int* dis, int *team) {
+
+  #pragma omp target teams num_teams(TEAMS) map(tofrom:dis[:N],team[:N])
+  {
+    #pragma omp distribute parallel for
+    for (int i = 0; i < N; i++)
+      #pragma omp atomic
+      dis[i] += 1; // <- Increment dis[0:N] from i to i+1
+
+    #pragma omp parallel // <- Not valid in SPMD mode without a guard
+    for (int i = 0; i < N; i++)
+      #pragma omp atomic
+      dis[i] += 1; // <- Increment dis[0:N] from i+1
+                   //    to i+1+128 * TEAMS(-1/*masters*/) + 32
+
+    team[omp_get_team_num()] += 1;
+  }
+}
+
+int main() {
+  int dis[N], team[N];
+
+  for (int i = 0; i < N; i++) {
+    dis[i] = i;
+    team[i] = 0;
+  }
+
+  foo(dis, team);
+
+  for (int i = 0; i < N; i++) {
+    printf("dis[%3i] = %4i\t\tteam[%3i] = %4i\n", i, dis[i], i, team[i]);
+  }
+
+  return 0;
+}
Index: SPMD_examples/v0.2/target_offload_is_SPMD.new.host.ll
===================================================================
--- /dev/null
+++ SPMD_examples/v0.2/target_offload_is_SPMD.new.host.ll
@@ -0,0 +1,530 @@
+
+; __CLANG_OFFLOAD_BUNDLE____START__ host-x86_64-unknown-linux-gnu
+; ModuleID = '/tmp/jdoerfert/target_offload_is_SPMD-7bb1c0.bc'
+source_filename = "/home/jdoerfert/projects/llvm-project/llvm/test/Transforms/OpenMP/target_offload_is_SPMD.c"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.ident_t = type { i32, i32, i32, i32, i8* }
+%struct.__tgt_offload_entry = type { i8*, i8*, i64, i32, i32 }
+%struct.__tgt_device_image = type { i8*, i8*, %struct.__tgt_offload_entry*, %struct.__tgt_offload_entry* }
+%struct.__tgt_bin_desc = type { i32, %struct.__tgt_device_image*, %struct.__tgt_offload_entry*, %struct.__tgt_offload_entry* }
+
+$.omp_offloading.descriptor_reg.nvptx64-nvida-cuda = comdat any
+
+@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
+@0 = private unnamed_addr global %struct.ident_t { i32 0, i32 2050, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8
+@1 = private unnamed_addr global %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8
+@2 = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8
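+; Note: the globals below are the offload registration records: a region ID, the
+; per-argument map sizes/types, and the entry/descriptor tables that
+; __tgt_register_lib hands to the host OpenMP runtime so it can locate and launch
+; the device image for this target region.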
+@.__omp_offloading_2b_142c531_foo_l10.region_id = weak constant i8 0 +@.offload_sizes = private unnamed_addr constant [2 x i64] [i64 40, i64 40] +@.offload_maptypes = private unnamed_addr constant [2 x i64] [i64 35, i64 35] +@.str.3 = private unnamed_addr constant [33 x i8] c"dis[%3i] = %4i\09\09team[%3i] = %4i\0A\00", align 1 +@.omp_offloading.entry_name = internal unnamed_addr constant [36 x i8] c"__omp_offloading_2b_142c531_foo_l10\00" +@.omp_offloading.entry.__omp_offloading_2b_142c531_foo_l10 = weak constant %struct.__tgt_offload_entry { i8* @.__omp_offloading_2b_142c531_foo_l10.region_id, i8* getelementptr inbounds ([36 x i8], [36 x i8]* @.omp_offloading.entry_name, i32 0, i32 0), i64 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +@.omp_offloading.entries_begin = external constant %struct.__tgt_offload_entry +@.omp_offloading.entries_end = external constant %struct.__tgt_offload_entry +@.omp_offloading.img_start.nvptx64-nvida-cuda = extern_weak constant i8 +@.omp_offloading.img_end.nvptx64-nvida-cuda = extern_weak constant i8 +@.omp_offloading.device_images = internal unnamed_addr constant [1 x %struct.__tgt_device_image] [%struct.__tgt_device_image { i8* @.omp_offloading.img_start.nvptx64-nvida-cuda, i8* @.omp_offloading.img_end.nvptx64-nvida-cuda, %struct.__tgt_offload_entry* @.omp_offloading.entries_begin, %struct.__tgt_offload_entry* @.omp_offloading.entries_end }], comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cuda), align 8 +@.omp_offloading.descriptor = internal constant %struct.__tgt_bin_desc { i32 1, %struct.__tgt_device_image* getelementptr inbounds ([1 x %struct.__tgt_device_image], [1 x %struct.__tgt_device_image]* @.omp_offloading.device_images, i32 0, i32 0), %struct.__tgt_offload_entry* @.omp_offloading.entries_begin, %struct.__tgt_offload_entry* @.omp_offloading.entries_end }, comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cuda), align 8 +@__dso_handle = external hidden global i8 +@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 0, void ()* @.omp_offloading.descriptor_reg.nvptx64-nvida-cuda, i8* bitcast (void ()* @.omp_offloading.descriptor_reg.nvptx64-nvida-cuda to i8*) }] + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local void @foo(i32* %dis, i32* %team) #0 { +entry: + %dis.addr = alloca i32*, align 8 + %team.addr = alloca i32*, align 8 + %.offload_baseptrs = alloca [2 x i8*], align 8 + %.offload_ptrs = alloca [2 x i8*], align 8 + store i32* %dis, i32** %dis.addr, align 8 + store i32* %team, i32** %team.addr, align 8 + %0 = load i32*, i32** %dis.addr, align 8 + %1 = load i32*, i32** %team.addr, align 8 + %2 = load i32*, i32** %dis.addr, align 8 + %3 = load i32*, i32** %dis.addr, align 8 + %arrayidx = getelementptr inbounds i32, i32* %3, i64 0 + %4 = load i32*, i32** %team.addr, align 8 + %5 = load i32*, i32** %team.addr, align 8 + %arrayidx1 = getelementptr inbounds i32, i32* %5, i64 0 + %6 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i32 0, i32 0 + %7 = bitcast i8** %6 to i32** + store i32* %2, i32** %7, align 8 + %8 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i32 0, i32 0 + %9 = bitcast i8** %8 to i32** + store i32* %arrayidx, i32** %9, align 8 + %10 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i32 0, i32 1 + %11 = bitcast i8** %10 to i32** + store i32* %4, i32** %11, align 8 + %12 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i32 0, i32 1 + %13 = bitcast i8** %12 to i32** + store i32* 
%arrayidx1, i32** %13, align 8 + %14 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i32 0, i32 0 + %15 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i32 0, i32 0 + %16 = call i32 @__tgt_target_teams(i64 -1, i8* @.__omp_offloading_2b_142c531_foo_l10.region_id, i32 2, i8** %14, i8** %15, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes, i32 0, i32 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes, i32 0, i32 0), i32 3, i32 0) + %17 = icmp ne i32 %16, 0 + br i1 %17, label %omp_offload.failed, label %omp_offload.cont + +omp_offload.failed: ; preds = %entry + call void @__omp_offloading_2b_142c531_foo_l10(i32* %0, i32* %1) #4 + br label %omp_offload.cont + +omp_offload.cont: ; preds = %omp_offload.failed, %entry + ret void +} + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define internal void @__omp_offloading_2b_142c531_foo_l10(i32* %dis, i32* %team) #1 { +entry: + %dis.addr = alloca i32*, align 8 + %team.addr = alloca i32*, align 8 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + store i32* %dis, i32** %dis.addr, align 8 + store i32* %team, i32** %team.addr, align 8 + %1 = call i32 @__kmpc_push_num_teams(%struct.ident_t* @2, i32 %0, i32 3, i32 0) + %2 = load i32*, i32** %dis.addr, align 8 + %3 = load i32*, i32** %team.addr, align 8 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%struct.ident_t* @2, i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*)* @.omp_outlined. to void (i32*, i32*, ...)*), i32* %2, i32* %3) + ret void +} + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define internal void @.omp_outlined.(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* %dis, i32* %team) #1 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %dis.addr = alloca i32*, align 8 + %team.addr = alloca i32*, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.comb.lb = alloca i32, align 4 + %.omp.comb.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + %.omp.iv2 = alloca i32, align 4 + %tmp3 = alloca i32, align 4 + %.omp.comb.lb4 = alloca i32, align 4 + %.omp.comb.ub5 = alloca i32, align 4 + %.omp.stride6 = alloca i32, align 4 + %.omp.is_last7 = alloca i32, align 4 + %i8 = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i32* %dis, i32** %dis.addr, align 8 + store i32* %team, i32** %team.addr, align 8 + store i32 0, i32* %.omp.comb.lb, align 4 + store i32 9, i32* %.omp.comb.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %0 = load i32*, i32** %.global_tid..addr, align 8 + %1 = load i32, i32* %0, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %1, i32 92, i32* %.omp.is_last, i32* %.omp.comb.lb, i32* %.omp.comb.ub, i32* %.omp.stride, i32 1, i32 1) + %2 = load i32, i32* %.omp.comb.ub, align 4 + %cmp = icmp sgt i32 %2, 9 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + br label %cond.end + +cond.false: ; preds = %entry + %3 = load i32, i32* %.omp.comb.ub, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 9, %cond.true ], [ %3, %cond.false ] + store i32 %cond, i32* %.omp.comb.ub, align 4 + %4 = load i32, i32* %.omp.comb.lb, 
align 4 + store i32 %4, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %cond.end + %5 = load i32, i32* %.omp.iv, align 4 + %6 = load i32, i32* %.omp.comb.ub, align 4 + %cmp1 = icmp sle i32 %5, %6 + br i1 %cmp1, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %7 = load i32, i32* %.omp.comb.lb, align 4 + %8 = zext i32 %7 to i64 + %9 = load i32, i32* %.omp.comb.ub, align 4 + %10 = zext i32 %9 to i64 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @2, i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64, i64, i32**)* @.omp_outlined..1 to void (i32*, i32*, ...)*), i64 %8, i64 %10, i32** %dis.addr) + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.inner.for.body + %11 = load i32, i32* %.omp.iv, align 4 + %12 = load i32, i32* %.omp.stride, align 4 + %add = add nsw i32 %11, %12 + store i32 %add, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %1) + store i32 0, i32* %.omp.comb.lb4, align 4 + store i32 9, i32* %.omp.comb.ub5, align 4 + store i32 1, i32* %.omp.stride6, align 4 + store i32 0, i32* %.omp.is_last7, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %1, i32 92, i32* %.omp.is_last7, i32* %.omp.comb.lb4, i32* %.omp.comb.ub5, i32* %.omp.stride6, i32 1, i32 1) + %13 = load i32, i32* %.omp.comb.ub5, align 4 + %cmp9 = icmp sgt i32 %13, 9 + br i1 %cmp9, label %cond.true10, label %cond.false11 + +cond.true10: ; preds = %omp.loop.exit + br label %cond.end12 + +cond.false11: ; preds = %omp.loop.exit + %14 = load i32, i32* %.omp.comb.ub5, align 4 + br label %cond.end12 + +cond.end12: ; preds = %cond.false11, %cond.true10 + %cond13 = phi i32 [ 9, %cond.true10 ], [ %14, %cond.false11 ] + store i32 %cond13, i32* %.omp.comb.ub5, align 4 + %15 = load i32, i32* %.omp.comb.lb4, align 4 + store i32 %15, i32* %.omp.iv2, align 4 + br label %omp.inner.for.cond14 + +omp.inner.for.cond14: ; preds = %omp.inner.for.inc17, %cond.end12 + %16 = load i32, i32* %.omp.iv2, align 4 + %17 = load i32, i32* %.omp.comb.ub5, align 4 + %cmp15 = icmp sle i32 %16, %17 + br i1 %cmp15, label %omp.inner.for.body16, label %omp.inner.for.end19 + +omp.inner.for.body16: ; preds = %omp.inner.for.cond14 + %18 = load i32, i32* %.omp.comb.lb4, align 4 + %19 = zext i32 %18 to i64 + %20 = load i32, i32* %.omp.comb.ub5, align 4 + %21 = zext i32 %20 to i64 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* @2, i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64, i64, i32**)* @.omp_outlined..2 to void (i32*, i32*, ...)*), i64 %19, i64 %21, i32** %dis.addr) + br label %omp.inner.for.inc17 + +omp.inner.for.inc17: ; preds = %omp.inner.for.body16 + %22 = load i32, i32* %.omp.iv2, align 4 + %23 = load i32, i32* %.omp.stride6, align 4 + %add18 = add nsw i32 %22, %23 + store i32 %add18, i32* %.omp.iv2, align 4 + br label %omp.inner.for.cond14 + +omp.inner.for.end19: ; preds = %omp.inner.for.cond14 + br label %omp.loop.exit20 + +omp.loop.exit20: ; preds = %omp.inner.for.end19 + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %1) + %24 = load i32*, i32** %team.addr, align 8 + %call = call i32 @omp_get_team_num() + %idxprom = sext i32 %call to i64 + %arrayidx = getelementptr inbounds i32, i32* %24, i64 %idxprom + %25 = load i32, i32* %arrayidx, align 4 + %add21 = add nsw i32 %25, 1 + store i32 %add21, i32* %arrayidx, align 4 + ret void +} + +declare dso_local void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define internal void @.omp_outlined..1(i32* noalias %.global_tid., i32* noalias %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., i32** dereferenceable(8) %dis) #1 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %.previous.lb..addr = alloca i64, align 8 + %.previous.ub..addr = alloca i64, align 8 + %dis.addr = alloca i32**, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i64 %.previous.lb., i64* %.previous.lb..addr, align 8 + store i64 %.previous.ub., i64* %.previous.ub..addr, align 8 + store i32** %dis, i32*** %dis.addr, align 8 + %0 = load i32**, i32*** %dis.addr, align 8 + store i32 0, i32* %.omp.lb, align 4 + store i32 9, i32* %.omp.ub, align 4 + %1 = load i64, i64* %.previous.lb..addr, align 8 + %conv = trunc i64 %1 to i32 + %2 = load i64, i64* %.previous.ub..addr, align 8 + %conv1 = trunc i64 %2 to i32 + store i32 %conv, i32* %.omp.lb, align 4 + store i32 %conv1, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %3 = load i32*, i32** %.global_tid..addr, align 8 + %4 = load i32, i32* %3, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %4, i32 34, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %5 = load i32, i32* %.omp.ub, align 4 + %cmp = icmp sgt i32 %5, 9 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + br label %cond.end + +cond.false: ; preds = %entry + %6 = load i32, i32* %.omp.ub, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 9, %cond.true ], [ %6, %cond.false ] + store i32 %cond, i32* %.omp.ub, align 4 + %7 = load i32, i32* %.omp.lb, align 4 + store i32 %7, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %cond.end + %8 = load i32, i32* %.omp.iv, align 4 + %9 = load i32, i32* %.omp.ub, align 4 + %cmp3 = icmp sle i32 %8, %9 + br i1 %cmp3, label %omp.inner.for.body, label %omp.inner.for.end + 
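+; In the outlined parallel-for bodies below, 'i' is recomputed from the induction
+; variable, and the '#pragma omp atomic' increment of dis[i] is lowered to a
+; monotonic atomicrmw add.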
+omp.inner.for.body: ; preds = %omp.inner.for.cond + %10 = load i32, i32* %.omp.iv, align 4 + %mul = mul nsw i32 %10, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %i, align 4 + %11 = load i32*, i32** %0, align 8 + %12 = load i32, i32* %i, align 4 + %idxprom = sext i32 %12 to i64 + %arrayidx = getelementptr inbounds i32, i32* %11, i64 %idxprom + %13 = atomicrmw add i32* %arrayidx, i32 1 monotonic + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %14 = load i32, i32* %.omp.iv, align 4 + %add5 = add nsw i32 %14, 1 + store i32 %add5, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %4) + ret void +} + +declare dso_local void @__kmpc_for_static_fini(%struct.ident_t*, i32) + +declare !callback !3 dso_local void @__kmpc_fork_call(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define internal void @.omp_outlined..2(i32* noalias %.global_tid., i32* noalias %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., i32** dereferenceable(8) %dis) #1 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %.previous.lb..addr = alloca i64, align 8 + %.previous.ub..addr = alloca i64, align 8 + %dis.addr = alloca i32**, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i64 %.previous.lb., i64* %.previous.lb..addr, align 8 + store i64 %.previous.ub., i64* %.previous.ub..addr, align 8 + store i32** %dis, i32*** %dis.addr, align 8 + %0 = load i32**, i32*** %dis.addr, align 8 + store i32 0, i32* %.omp.lb, align 4 + store i32 9, i32* %.omp.ub, align 4 + %1 = load i64, i64* %.previous.lb..addr, align 8 + %conv = trunc i64 %1 to i32 + %2 = load i64, i64* %.previous.ub..addr, align 8 + %conv1 = trunc i64 %2 to i32 + store i32 %conv, i32* %.omp.lb, align 4 + store i32 %conv1, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %3 = load i32*, i32** %.global_tid..addr, align 8 + %4 = load i32, i32* %3, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %4, i32 34, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %5 = load i32, i32* %.omp.ub, align 4 + %cmp = icmp sgt i32 %5, 9 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + br label %cond.end + +cond.false: ; preds = %entry + %6 = load i32, i32* %.omp.ub, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 9, %cond.true ], [ %6, %cond.false ] + store i32 %cond, i32* %.omp.ub, align 4 + %7 = load i32, i32* %.omp.lb, align 4 + store i32 %7, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %cond.end + %8 = load i32, i32* %.omp.iv, align 4 + %9 = load i32, i32* %.omp.ub, align 4 + %cmp3 = icmp sle i32 %8, %9 + br i1 %cmp3, label %omp.inner.for.body, label %omp.inner.for.end + 
+omp.inner.for.body: ; preds = %omp.inner.for.cond + %10 = load i32, i32* %.omp.iv, align 4 + %mul = mul nsw i32 %10, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %i, align 4 + %11 = load i32*, i32** %0, align 8 + %12 = load i32, i32* %i, align 4 + %idxprom = sext i32 %12 to i64 + %arrayidx = getelementptr inbounds i32, i32* %11, i64 %idxprom + %13 = atomicrmw add i32* %arrayidx, i32 1 monotonic + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %14 = load i32, i32* %.omp.iv, align 4 + %add5 = add nsw i32 %14, 1 + store i32 %add5, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %4) + ret void +} + +declare dso_local i32 @omp_get_team_num() #2 + +declare dso_local i32 @__kmpc_global_thread_num(%struct.ident_t*) + +declare dso_local i32 @__kmpc_push_num_teams(%struct.ident_t*, i32, i32, i32) + +declare !callback !3 dso_local void @__kmpc_fork_teams(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) + +declare dso_local i32 @__tgt_target_teams(i64, i8*, i32, i8**, i8**, i64*, i64*, i32, i32) + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local i32 @main() #0 { +entry: + %retval = alloca i32, align 4 + %dis = alloca [10 x i32], align 16 + %team = alloca [10 x i32], align 16 + %i = alloca i32, align 4 + %i4 = alloca i32, align 4 + store i32 0, i32* %retval, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %0, 10 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load i32, i32* %i, align 4 + %2 = load i32, i32* %i, align 4 + %idxprom = sext i32 %2 to i64 + %arrayidx = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 %idxprom + store i32 %1, i32* %arrayidx, align 4 + %3 = load i32, i32* %i, align 4 + %idxprom1 = sext i32 %3 to i64 + %arrayidx2 = getelementptr inbounds [10 x i32], [10 x i32]* %team, i64 0, i64 %idxprom1 + store i32 0, i32* %arrayidx2, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %4 = load i32, i32* %i, align 4 + %inc = add nsw i32 %4, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %arraydecay = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i32 0, i32 0 + %arraydecay3 = getelementptr inbounds [10 x i32], [10 x i32]* %team, i32 0, i32 0 + call void @foo(i32* %arraydecay, i32* %arraydecay3) + store i32 0, i32* %i4, align 4 + br label %for.cond5 + +for.cond5: ; preds = %for.inc12, %for.end + %5 = load i32, i32* %i4, align 4 + %cmp6 = icmp slt i32 %5, 10 + br i1 %cmp6, label %for.body7, label %for.end14 + +for.body7: ; preds = %for.cond5 + %6 = load i32, i32* %i4, align 4 + %7 = load i32, i32* %i4, align 4 + %idxprom8 = sext i32 %7 to i64 + %arrayidx9 = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 %idxprom8 + %8 = load i32, i32* %arrayidx9, align 4 + %9 = load i32, i32* %i4, align 4 + %10 = load i32, i32* %i4, align 4 + %idxprom10 = sext i32 %10 to i64 + %arrayidx11 = getelementptr inbounds [10 x i32], [10 x i32]* %team, i64 0, i64 %idxprom10 + %11 = load i32, i32* %arrayidx11, align 4 + %call = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([33 x i8], [33 x i8]* @.str.3, i32 0, i32 0), i32 %6, i32 %8, i32 %9, i32 %11) + br label %for.inc12 + +for.inc12: ; preds = %for.body7 + %12 = load i32, i32* %i4, align 4 + %inc13 = add nsw i32 %12, 1 + store i32 %inc13, i32* %i4, align 4 + br label %for.cond5 + +for.end14: ; preds = %for.cond5 + ret i32 0 +} + +declare dso_local i32 @printf(i8*, ...) #2 + +; Function Attrs: noinline nounwind uwtable +define internal void @.omp_offloading.descriptor_unreg(i8*) #3 section ".text.startup" comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cuda) { +entry: + %.addr = alloca i8*, align 8 + store i8* %0, i8** %.addr, align 8 + %1 = call i32 @__tgt_unregister_lib(%struct.__tgt_bin_desc* @.omp_offloading.descriptor) + ret void +} + +declare dso_local i32 @__tgt_unregister_lib(%struct.__tgt_bin_desc*) + +; Function Attrs: noinline nounwind uwtable +define linkonce hidden void @.omp_offloading.descriptor_reg.nvptx64-nvida-cuda() #3 section ".text.startup" comdat { +entry: + %0 = call i32 @__tgt_register_lib(%struct.__tgt_bin_desc* @.omp_offloading.descriptor) + %1 = call i32 @__cxa_atexit(void (i8*)* @.omp_offloading.descriptor_unreg, i8* bitcast (%struct.__tgt_bin_desc* @.omp_offloading.descriptor to i8*), i8* @__dso_handle) #4 + ret void +} + +declare dso_local i32 @__tgt_register_lib(%struct.__tgt_bin_desc*) + +; Function Attrs: nounwind +declare dso_local i32 @__cxa_atexit(void (i8*)*, i8*, i8*) #4 + +attributes #0 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { noinline norecurse nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } 
+attributes #4 = { nounwind } + +!omp_offload.info = !{!0} +!llvm.module.flags = !{!1} +!llvm.ident = !{!2} + +!0 = !{i32 0, i32 43, i32 21153073, !"foo", i32 10, i32 0} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{!"clang version 9.0.0 "} +!3 = !{!4} +!4 = !{i64 2, i64 -1, i64 -1, i1 true} + +; __CLANG_OFFLOAD_BUNDLE____END__ host-x86_64-unknown-linux-gnu Index: SPMD_examples/v0.2/target_offload_is_SPMD.new.ll =================================================================== --- /dev/null +++ SPMD_examples/v0.2/target_offload_is_SPMD.new.ll @@ -0,0 +1,437 @@ + +; __CLANG_OFFLOAD_BUNDLE____START__ openmp-nvptx64-nvida-cuda +; ModuleID = '/home/jdoerfert/projects/llvm-project/llvm/test/Transforms/OpenMP/target_offload_is_SPMD.c' +source_filename = "/home/jdoerfert/projects/llvm-project/llvm/test/Transforms/OpenMP/target_offload_is_SPMD.c" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvida-cuda" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } +%"union._shared_openmp_static_memory_type_$_" = type { [128 x i8] } +%omp.shared.struct = type { i64, i64, i32** } +%omp.shared.struct.0 = type { i64, i64, i32** } +%struct._globalized_locals_ty = type { i32* } + +@"_openmp_kernel_static_glob_rd$ptr" = internal addrspace(3) global i8* null +@"_openmp_static_kernel$is_shared" = internal unnamed_addr constant i16 1 +@"_openmp_static_kernel$size" = internal unnamed_addr constant i64 8 +@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2050, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@2 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@__omp_offloading_2b_142c531_foo_l10_exec_mode = weak constant i8 1 +@"_openmp_shared_static_glob_rd_$_" = common addrspace(3) global %"union._shared_openmp_static_memory_type_$_" zeroinitializer +@llvm.compiler.used = appending global [1 x i8*] [i8* @__omp_offloading_2b_142c531_foo_l10_exec_mode], section "llvm.metadata" + +; Function Attrs: norecurse nounwind +define weak void @__omp_offloading_2b_142c531_foo_l10(i32* %dis, i32* %team) #0 { +entry: + %.global_tid..addr.i = alloca i32*, align 8 + %.bound_tid..addr.i = alloca i32*, align 8 + %dis.addr.i = alloca i32*, align 8 + %team.addr.i = alloca i32*, align 8 + %.omp.iv.i = alloca i32, align 4 + %tmp.i = alloca i32, align 4 + %.omp.comb.lb.i = alloca i32, align 4 + %.omp.comb.ub.i = alloca i32, align 4 + %.omp.stride.i = alloca i32, align 4 + %.omp.is_last.i = alloca i32, align 4 + %i.i = alloca i32, align 4 + %.zero.addr.i = alloca i32, align 4 + %.captured.i = alloca %omp.shared.struct, align 8 + %.omp.iv3.i = alloca i32, align 4 + %tmp4.i = alloca i32, align 4 + %.omp.comb.lb5.i = alloca i32, align 4 + %.omp.comb.ub6.i = alloca i32, align 4 + %.omp.stride7.i = alloca i32, align 4 + %.omp.is_last8.i = alloca i32, align 4 + %i9.i = alloca i32, align 4 + %.zero.addr18.i = alloca i32, align 4 + %.captured19.i = alloca %omp.shared.struct.0, align 8 + %dis.addr = alloca i32*, align 8 + %team.addr = alloca i32*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. 
= alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i32* %dis, i32** %dis.addr, align 8 + store i32* %team, i32** %team.addr, align 8 + %0 = call i16 @__kmpc_generic_kernel_init(i16 0, i16 1, i16 1, i16 0) + %1 = icmp eq i16 %0, 1 + br i1 %1, label %.execute, label %.exit + +.execute: ; preds = %entry + %2 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + %3 = load i32*, i32** %dis.addr, align 8 + %4 = load i32*, i32** %team.addr, align 8 + store i32 %2, i32* %.threadid_temp., align 4 + store i32 0, i32* %.zero.addr18.i, align 4, !noalias !10 + store i32 0, i32* %.zero.addr.i, align 4, !noalias !10 + store i32* %.threadid_temp., i32** %.global_tid..addr.i, align 8, !noalias !10 + store i32* %.zero.addr, i32** %.bound_tid..addr.i, align 8, !noalias !10 + store i32* %3, i32** %dis.addr.i, align 8, !noalias !10 + store i32* %4, i32** %team.addr.i, align 8, !noalias !10 + call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 8, i16 1, i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) #3 + %5 = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8 + %6 = bitcast i8* %5 to %struct._globalized_locals_ty* + %7 = load i32*, i32** %dis.addr.i, align 8, !noalias !10 + %dis1.i = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* %6, i32 0, i32 0 + store i32* %7, i32** %dis1.i, align 8 + store i32 0, i32* %.omp.comb.lb.i, align 4, !noalias !10 + store i32 9, i32* %.omp.comb.ub.i, align 4, !noalias !10 + store i32 1, i32* %.omp.stride.i, align 4, !noalias !10 + store i32 0, i32* %.omp.is_last.i, align 4, !noalias !10 + %8 = load i32*, i32** %.global_tid..addr.i, align 8, !noalias !10 + %9 = load i32, i32* %8, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %9, i32 92, i32* %.omp.is_last.i, i32* %.omp.comb.lb.i, i32* %.omp.comb.ub.i, i32* %.omp.stride.i, i32 1, i32 1) #3 + %10 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !10 + %cmp.i = icmp sgt i32 %10, 9 + br i1 %cmp.i, label %cond.true.i, label %cond.false.i + +cond.true.i: ; preds = %.execute + br label %cond.end.i + +cond.false.i: ; preds = %.execute + %11 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !10 + br label %cond.end.i + +cond.end.i: ; preds = %cond.false.i, %cond.true.i + %cond.i = phi i32 [ 9, %cond.true.i ], [ %11, %cond.false.i ] + store i32 %cond.i, i32* %.omp.comb.ub.i, align 4, !noalias !10 + %12 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !10 + store i32 %12, i32* %.omp.iv.i, align 4, !noalias !10 + br label %omp.inner.for.cond.i + +omp.inner.for.cond.i: ; preds = %omp.inner.for.body.i, %cond.end.i + %13 = load i32, i32* %.omp.iv.i, align 4, !noalias !10 + %14 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !10 + %cmp2.i = icmp sle i32 %13, %14 + br i1 %cmp2.i, label %omp.inner.for.body.i, label %omp.inner.for.end.i + +omp.inner.for.body.i: ; preds = %omp.inner.for.cond.i + %15 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !10 + %16 = zext i32 %15 to i64 + %17 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !10 + %18 = zext i32 %17 to i64 + %19 = bitcast %omp.shared.struct* %.captured.i to i8* + %20 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i32 0, i32 0 + store i64 %16, i64* %20, !noalias !10 + 
%21 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i32 0, i32 1 + store i64 %18, i64* %21, !noalias !10 + %22 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i32 0, i32 2 + store i32** %dis1.i, i32*** %22, !noalias !10 + call void @__kmpc_generic_kernel_parallel(i8* bitcast (void (i8*)* @__omp_outlined__1_wrapper to i8*), i8* %19, i16 24, i16 1) #3 + %23 = load i32, i32* %.omp.iv.i, align 4, !noalias !10 + %24 = load i32, i32* %.omp.stride.i, align 4, !noalias !10 + %add.i = add nsw i32 %23, %24 + store i32 %add.i, i32* %.omp.iv.i, align 4, !noalias !10 + br label %omp.inner.for.cond.i + +omp.inner.for.end.i: ; preds = %omp.inner.for.cond.i + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %9) #3 + store i32 0, i32* %.omp.comb.lb5.i, align 4, !noalias !10 + store i32 9, i32* %.omp.comb.ub6.i, align 4, !noalias !10 + store i32 1, i32* %.omp.stride7.i, align 4, !noalias !10 + store i32 0, i32* %.omp.is_last8.i, align 4, !noalias !10 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %9, i32 92, i32* %.omp.is_last8.i, i32* %.omp.comb.lb5.i, i32* %.omp.comb.ub6.i, i32* %.omp.stride7.i, i32 1, i32 1) #3 + %25 = load i32, i32* %.omp.comb.ub6.i, align 4, !noalias !10 + %cmp10.i = icmp sgt i32 %25, 9 + br i1 %cmp10.i, label %cond.true11.i, label %cond.false12.i + +cond.true11.i: ; preds = %omp.inner.for.end.i + br label %cond.end13.i + +cond.false12.i: ; preds = %omp.inner.for.end.i + %26 = load i32, i32* %.omp.comb.ub6.i, align 4, !noalias !10 + br label %cond.end13.i + +cond.end13.i: ; preds = %cond.false12.i, %cond.true11.i + %cond14.i = phi i32 [ 9, %cond.true11.i ], [ %26, %cond.false12.i ] + store i32 %cond14.i, i32* %.omp.comb.ub6.i, align 4, !noalias !10 + %27 = load i32, i32* %.omp.comb.lb5.i, align 4, !noalias !10 + store i32 %27, i32* %.omp.iv3.i, align 4, !noalias !10 + br label %omp.inner.for.cond15.i + +omp.inner.for.cond15.i: ; preds = %omp.inner.for.body17.i, %cond.end13.i + %28 = load i32, i32* %.omp.iv3.i, align 4, !noalias !10 + %29 = load i32, i32* %.omp.comb.ub6.i, align 4, !noalias !10 + %cmp16.i = icmp sle i32 %28, %29 + br i1 %cmp16.i, label %omp.inner.for.body17.i, label %__omp_outlined__.exit + +omp.inner.for.body17.i: ; preds = %omp.inner.for.cond15.i + %30 = load i32, i32* %.omp.comb.lb5.i, align 4, !noalias !10 + %31 = zext i32 %30 to i64 + %32 = load i32, i32* %.omp.comb.ub6.i, align 4, !noalias !10 + %33 = zext i32 %32 to i64 + %34 = bitcast %omp.shared.struct.0* %.captured19.i to i8* + %35 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %.captured19.i, i32 0, i32 0 + store i64 %31, i64* %35, !noalias !10 + %36 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %.captured19.i, i32 0, i32 1 + store i64 %33, i64* %36, !noalias !10 + %37 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %.captured19.i, i32 0, i32 2 + store i32** %dis1.i, i32*** %37, !noalias !10 + call void @__kmpc_generic_kernel_parallel(i8* bitcast (void (i8*)* @__omp_outlined__2_wrapper to i8*), i8* %34, i16 24, i16 1) #3 + %38 = load i32, i32* %.omp.iv3.i, align 4, !noalias !10 + %39 = load i32, i32* %.omp.stride7.i, align 4, !noalias !10 + %add21.i = add nsw i32 %38, %39 + store i32 %add21.i, i32* %.omp.iv3.i, align 4, !noalias !10 + br label %omp.inner.for.cond15.i + +__omp_outlined__.exit: ; preds = %omp.inner.for.cond15.i + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %9) #3 + %40 = load i32*, i32** %team.addr.i, align 8, !noalias !10 + 
%call.i = call i32 @omp_get_team_num() #3 + %idxprom.i = sext i32 %call.i to i64 + %arrayidx.i = getelementptr inbounds i32, i32* %40, i64 %idxprom.i + %41 = load i32, i32* %arrayidx.i, align 4 + %add24.i = add nsw i32 %41, 1 + store i32 %add24.i, i32* %arrayidx.i, align 4 + call void @__kmpc_restore_team_static_memory(i16 0, i16 1) #3 + br label %.omp.deinit + +.omp.deinit: ; preds = %__omp_outlined__.exit + call void @__kmpc_generic_kernel_deinit(i16 0, i16 1) + br label %.exit + +.exit: ; preds = %.omp.deinit, %entry + ret void +} + +declare i16 @__kmpc_generic_kernel_init(i16, i16, i16, i16) + +declare void @__kmpc_get_team_static_memory(i16, i8*, i64, i16, i8**) + +declare void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) + +; Function Attrs: norecurse nounwind +define internal void @__omp_outlined__1(i32* noalias %.global_tid., i32* noalias %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., i32** dereferenceable(8) %dis) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %.previous.lb..addr = alloca i64, align 8 + %.previous.ub..addr = alloca i64, align 8 + %dis.addr = alloca i32**, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i64 %.previous.lb., i64* %.previous.lb..addr, align 8 + store i64 %.previous.ub., i64* %.previous.ub..addr, align 8 + store i32** %dis, i32*** %dis.addr, align 8 + %0 = load i32**, i32*** %dis.addr, align 8 + store i32 0, i32* %.omp.lb, align 4 + store i32 9, i32* %.omp.ub, align 4 + %1 = load i64, i64* %.previous.lb..addr, align 8 + %conv = trunc i64 %1 to i32 + %2 = load i64, i64* %.previous.ub..addr, align 8 + %conv1 = trunc i64 %2 to i32 + store i32 %conv, i32* %.omp.lb, align 4 + store i32 %conv1, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %3 = load i32*, i32** %.global_tid..addr, align 8 + %4 = load i32, i32* %3, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %4, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %5 = load i32, i32* %.omp.lb, align 4 + store i32 %5, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %entry + %6 = load i32, i32* %.omp.iv, align 4 + %conv2 = sext i32 %6 to i64 + %7 = load i64, i64* %.previous.ub..addr, align 8 + %cmp = icmp ule i64 %conv2, %7 + br i1 %cmp, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %8 = load i32, i32* %.omp.iv, align 4 + %mul = mul nsw i32 %8, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %i, align 4 + %9 = load i32*, i32** %0, align 8 + %10 = load i32, i32* %i, align 4 + %idxprom = sext i32 %10 to i64 + %arrayidx = getelementptr inbounds i32, i32* %9, i64 %idxprom + %11 = atomicrmw add i32* %arrayidx, i32 1 monotonic + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %12 = load i32, i32* %.omp.iv, align 4 + %13 = load i32, i32* %.omp.stride, align 4 + %add4 = add nsw i32 %12, %13 + store i32 %add4, i32* %.omp.iv, align 4 + br 
label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %4) + ret void +} + +declare void @__kmpc_for_static_fini(%struct.ident_t*, i32) + +; Function Attrs: norecurse nounwind +define internal void @__omp_outlined__1_wrapper(i8* %payload) #1 { +entry: + %.addr = alloca i8*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. = alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i8* %payload, i8** %.addr, align 8 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + store i32 %0, i32* %.threadid_temp., align 4 + %1 = load i8*, i8** %.addr, align 8 + %2 = bitcast i8* %1 to %omp.shared.struct* + %3 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %2, i32 0, i32 0 + %4 = load i64, i64* %3, align 1 + %5 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %2, i32 0, i32 1 + %6 = load i64, i64* %5, align 1 + %7 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %2, i32 0, i32 2 + %8 = load i32**, i32*** %7, align 1 + call void @__omp_outlined__1(i32* %.threadid_temp., i32* %.zero.addr, i64 %4, i64 %6, i32** %8) #3 + ret void +} + +declare i32 @__kmpc_global_thread_num(%struct.ident_t*) + +declare void @__kmpc_generic_kernel_parallel(i8*, i8*, i16, i16) + +; Function Attrs: norecurse nounwind +define internal void @__omp_outlined__2(i32* noalias %.global_tid., i32* noalias %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., i32** dereferenceable(8) %dis) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %.previous.lb..addr = alloca i64, align 8 + %.previous.ub..addr = alloca i64, align 8 + %dis.addr = alloca i32**, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i64 %.previous.lb., i64* %.previous.lb..addr, align 8 + store i64 %.previous.ub., i64* %.previous.ub..addr, align 8 + store i32** %dis, i32*** %dis.addr, align 8 + %0 = load i32**, i32*** %dis.addr, align 8 + store i32 0, i32* %.omp.lb, align 4 + store i32 9, i32* %.omp.ub, align 4 + %1 = load i64, i64* %.previous.lb..addr, align 8 + %conv = trunc i64 %1 to i32 + %2 = load i64, i64* %.previous.ub..addr, align 8 + %conv1 = trunc i64 %2 to i32 + store i32 %conv, i32* %.omp.lb, align 4 + store i32 %conv1, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %3 = load i32*, i32** %.global_tid..addr, align 8 + %4 = load i32, i32* %3, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %4, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %5 = load i32, i32* %.omp.lb, align 4 + store i32 %5, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %entry + %6 = load i32, i32* %.omp.iv, align 4 + %conv2 = sext i32 %6 to i64 + %7 = load i64, i64* %.previous.ub..addr, align 8 + %cmp = icmp ule i64 %conv2, %7 + br i1 %cmp, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %8 = load i32, i32* %.omp.iv, align 4 + %mul = mul nsw 
i32 %8, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %i, align 4 + %9 = load i32*, i32** %0, align 8 + %10 = load i32, i32* %i, align 4 + %idxprom = sext i32 %10 to i64 + %arrayidx = getelementptr inbounds i32, i32* %9, i64 %idxprom + %11 = atomicrmw add i32* %arrayidx, i32 1 monotonic + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %12 = load i32, i32* %.omp.iv, align 4 + %13 = load i32, i32* %.omp.stride, align 4 + %add4 = add nsw i32 %12, %13 + store i32 %add4, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %4) + ret void +} + +; Function Attrs: norecurse nounwind +define internal void @__omp_outlined__2_wrapper(i8* %payload) #1 { +entry: + %.addr = alloca i8*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. = alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i8* %payload, i8** %.addr, align 8 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + store i32 %0, i32* %.threadid_temp., align 4 + %1 = load i8*, i8** %.addr, align 8 + %2 = bitcast i8* %1 to %omp.shared.struct.0* + %3 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %2, i32 0, i32 0 + %4 = load i64, i64* %3, align 1 + %5 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %2, i32 0, i32 1 + %6 = load i64, i64* %5, align 1 + %7 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %2, i32 0, i32 2 + %8 = load i32**, i32*** %7, align 1 + call void @__omp_outlined__2(i32* %.threadid_temp., i32* %.zero.addr, i64 %4, i64 %6, i32** %8) #3 + ret void +} + +declare i32 @omp_get_team_num() #2 + +declare void @__kmpc_restore_team_static_memory(i16, i16) + +declare void @__kmpc_generic_kernel_deinit(i16, i16) + +attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx61,+sm_70" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx61,+sm_70" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx61,+sm_70" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { nounwind } + +!omp_offload.info = !{!0} 
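+; The !nvvm.annotations entry below marks __omp_offloading_2b_142c531_foo_l10 as a
+; PTX kernel; the kernel's exec_mode global (i8 1) selects generic, i.e. non-SPMD,
+; execution.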
+!nvvm.annotations = !{!1, !2, !3, !2, !4, !4, !4, !4, !5, !5, !4} +!llvm.module.flags = !{!6, !7} +!llvm.ident = !{!8} +!nvvm.internalize.after.link = !{} +!nvvmir.version = !{!9} + +!0 = !{i32 0, i32 43, i32 21153073, !"foo", i32 10, i32 0} +!1 = !{void (i32*, i32*)* @__omp_offloading_2b_142c531_foo_l10, !"kernel", i32 1} +!2 = !{null, !"align", i32 8} +!3 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +!4 = !{null, !"align", i32 16} +!5 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +!6 = !{i32 1, !"wchar_size", i32 4} +!7 = !{i32 7, !"PIC Level", i32 2} +!8 = !{!"clang version 9.0.0 "} +!9 = !{i32 1, i32 2} +!10 = !{!11, !13} +!11 = distinct !{!11, !12, !"__omp_outlined__: %.global_tid."} +!12 = distinct !{!12, !"__omp_outlined__"} +!13 = distinct !{!13, !12, !"__omp_outlined__: %.bound_tid."} + +; __CLANG_OFFLOAD_BUNDLE____END__ openmp-nvptx64-nvida-cuda Index: SPMD_examples/v0.2/target_offload_is_SPMD.old.forced.ll =================================================================== --- /dev/null +++ SPMD_examples/v0.2/target_offload_is_SPMD.old.forced.ll @@ -0,0 +1,944 @@ + +; __CLANG_OFFLOAD_BUNDLE____START__ openmp-nvptx64-nvida-cuda +; ModuleID = '/home/jdoerfert/projects/llvm-project/llvm/test/Transforms/OpenMP/target_offload_is_SPMD.c' +source_filename = "/home/jdoerfert/projects/llvm-project/llvm/test/Transforms/OpenMP/target_offload_is_SPMD.c" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvida-cuda" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } + +@"_openmp_kernel_static_glob_rd$ptr" = internal addrspace(3) global i8* null +@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2050, i32 1, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 1, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@2 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 1, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@__omp_offloading_2b_142c531_foo_l10_exec_mode = weak constant i8 0 +@llvm.compiler.used = appending global [1 x i8*] [i8* @__omp_offloading_2b_142c531_foo_l10_exec_mode], section "llvm.metadata" + +; Function Attrs: noinline norecurse nounwind optnone +define weak void @__omp_offloading_2b_142c531_foo_l10(i32* %dis, i32* %team) #0 { +entry: + %.global_tid..addr.i = alloca i32*, align 8 + %.bound_tid..addr.i = alloca i32*, align 8 + %dis.addr.i = alloca i32*, align 8 + %team.addr.i = alloca i32*, align 8 + %.omp.iv.i = alloca i32, align 4 + %tmp.i = alloca i32, align 4 + %.omp.comb.lb.i = alloca i32, align 4 + %.omp.comb.ub.i = alloca i32, align 4 + %.omp.stride.i = alloca i32, align 4 + %.omp.is_last.i = alloca i32, align 4 + %i.i = alloca i32, align 4 + %.zero.addr.i = alloca i32, align 4 + %.omp.iv9.i = alloca i32, align 4 + %tmp10.i = alloca i32, align 4 + %.omp.comb.lb11.i = alloca i32, align 4 + %.omp.comb.ub12.i = alloca i32, align 4 + %.omp.stride13.i = alloca i32, align 4 + %.omp.is_last14.i = alloca i32, align 4 + %i15.i = alloca i32, align 4 + %.zero.addr25.i = alloca i32, align 4 + %dis.addr = alloca i32*, align 8 + %team.addr = alloca i32*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. 
= alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i32* %dis, i32** %dis.addr, align 8 + store i32* %team, i32** %team.addr, align 8 + %nvptx_num_threads = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range !10 + call void @__kmpc_spmd_kernel_init(i32 %nvptx_num_threads, i16 1, i16 0) + call void @__kmpc_data_sharing_init_stack_spmd() + br label %.execute + +.execute: ; preds = %entry + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + %1 = load i32*, i32** %dis.addr, align 8 + %2 = load i32*, i32** %team.addr, align 8 + store i32 %0, i32* %.threadid_temp., align 4 + store i32 0, i32* %.zero.addr25.i, align 4, !noalias !11 + store i32 0, i32* %.zero.addr.i, align 4, !noalias !11 + store i32* %.threadid_temp., i32** %.global_tid..addr.i, align 8, !noalias !11 + store i32* %.zero.addr, i32** %.bound_tid..addr.i, align 8, !noalias !11 + store i32* %1, i32** %dis.addr.i, align 8, !noalias !11 + store i32* %2, i32** %team.addr.i, align 8, !noalias !11 + store i32 0, i32* %.omp.comb.lb.i, align 4, !noalias !11 + store i32 9, i32* %.omp.comb.ub.i, align 4, !noalias !11 + store i32 1, i32* %.omp.stride.i, align 4, !noalias !11 + store i32 0, i32* %.omp.is_last.i, align 4, !noalias !11 + %nvptx_num_threads.i = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #3, !range !10 + %3 = load i32*, i32** %.global_tid..addr.i, align 8, !noalias !11 + %4 = load i32, i32* %3, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %4, i32 91, i32* %.omp.is_last.i, i32* %.omp.comb.lb.i, i32* %.omp.comb.ub.i, i32* %.omp.stride.i, i32 1, i32 %nvptx_num_threads.i) #3 + %5 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !11 + %cmp.i = icmp sgt i32 %5, 9 + br i1 %cmp.i, label %cond.true.i, label %cond.false.i + +cond.true.i: ; preds = %.execute + br label %cond.end.i + +cond.false.i: ; preds = %.execute + %6 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !11 + br label %cond.end.i + +cond.end.i: ; preds = %cond.false.i, %cond.true.i + %cond.i = phi i32 [ 9, %cond.true.i ], [ %6, %cond.false.i ] + store i32 %cond.i, i32* %.omp.comb.ub.i, align 4, !noalias !11 + %7 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !11 + store i32 %7, i32* %.omp.iv.i, align 4, !noalias !11 + br label %omp.inner.for.cond.i + +omp.inner.for.cond.i: ; preds = %cond.end7.i, %cond.end.i + %8 = load i32, i32* %.omp.iv.i, align 4, !noalias !11 + %cmp1.i = icmp slt i32 %8, 10 + br i1 %cmp1.i, label %omp.inner.for.body.i, label %omp.inner.for.end.i + +omp.inner.for.body.i: ; preds = %omp.inner.for.cond.i + %9 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !11 + %10 = zext i32 %9 to i64 + %11 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !11 + %12 = zext i32 %11 to i64 + %13 = load i32*, i32** %.global_tid..addr.i, align 8, !noalias !11 + call void @__omp_outlined__1(i32* %13, i32* %.zero.addr.i, i64 %10, i64 %12, i32** %dis.addr.i) #3 + %14 = load i32, i32* %.omp.iv.i, align 4, !noalias !11 + %15 = load i32, i32* %.omp.stride.i, align 4, !noalias !11 + %add.i = add nsw i32 %14, %15 + store i32 %add.i, i32* %.omp.iv.i, align 4, !noalias !11 + %16 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !11 + %17 = load i32, i32* %.omp.stride.i, align 4, !noalias !11 + %add2.i = add nsw i32 %16, %17 + store i32 %add2.i, i32* %.omp.comb.lb.i, align 4, !noalias !11 + %18 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !11 + %19 = load i32, i32* %.omp.stride.i, align 4, !noalias !11 + %add3.i = add nsw i32 %18, %19 + store i32 %add3.i, i32* %.omp.comb.ub.i, align 4, !noalias !11 + %20 = 
load i32, i32* %.omp.comb.ub.i, align 4, !noalias !11 + %cmp4.i = icmp sgt i32 %20, 9 + br i1 %cmp4.i, label %cond.true5.i, label %cond.false6.i + +cond.true5.i: ; preds = %omp.inner.for.body.i + br label %cond.end7.i + +cond.false6.i: ; preds = %omp.inner.for.body.i + %21 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !11 + br label %cond.end7.i + +cond.end7.i: ; preds = %cond.false6.i, %cond.true5.i + %cond8.i = phi i32 [ 9, %cond.true5.i ], [ %21, %cond.false6.i ] + store i32 %cond8.i, i32* %.omp.comb.ub.i, align 4, !noalias !11 + %22 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !11 + store i32 %22, i32* %.omp.iv.i, align 4, !noalias !11 + br label %omp.inner.for.cond.i + +omp.inner.for.end.i: ; preds = %omp.inner.for.cond.i + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %4) #3 + store i32 0, i32* %.omp.comb.lb11.i, align 4, !noalias !11 + store i32 9, i32* %.omp.comb.ub12.i, align 4, !noalias !11 + store i32 1, i32* %.omp.stride13.i, align 4, !noalias !11 + store i32 0, i32* %.omp.is_last14.i, align 4, !noalias !11 + %nvptx_num_threads16.i = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #3, !range !10 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %4, i32 91, i32* %.omp.is_last14.i, i32* %.omp.comb.lb11.i, i32* %.omp.comb.ub12.i, i32* %.omp.stride13.i, i32 1, i32 %nvptx_num_threads16.i) #3 + %23 = load i32, i32* %.omp.comb.ub12.i, align 4, !noalias !11 + %cmp17.i = icmp sgt i32 %23, 9 + br i1 %cmp17.i, label %cond.true18.i, label %cond.false19.i + +cond.true18.i: ; preds = %omp.inner.for.end.i + br label %cond.end20.i + +cond.false19.i: ; preds = %omp.inner.for.end.i + %24 = load i32, i32* %.omp.comb.ub12.i, align 4, !noalias !11 + br label %cond.end20.i + +cond.end20.i: ; preds = %cond.false19.i, %cond.true18.i + %cond21.i = phi i32 [ 9, %cond.true18.i ], [ %24, %cond.false19.i ] + store i32 %cond21.i, i32* %.omp.comb.ub12.i, align 4, !noalias !11 + %25 = load i32, i32* %.omp.comb.lb11.i, align 4, !noalias !11 + store i32 %25, i32* %.omp.iv9.i, align 4, !noalias !11 + br label %omp.inner.for.cond22.i + +omp.inner.for.cond22.i: ; preds = %cond.end33.i, %cond.end20.i + %26 = load i32, i32* %.omp.iv9.i, align 4, !noalias !11 + %cmp23.i = icmp slt i32 %26, 10 + br i1 %cmp23.i, label %omp.inner.for.body24.i, label %__omp_outlined__.exit + +omp.inner.for.body24.i: ; preds = %omp.inner.for.cond22.i + %27 = load i32, i32* %.omp.comb.lb11.i, align 4, !noalias !11 + %28 = zext i32 %27 to i64 + %29 = load i32, i32* %.omp.comb.ub12.i, align 4, !noalias !11 + %30 = zext i32 %29 to i64 + %31 = load i32*, i32** %.global_tid..addr.i, align 8, !noalias !11 + call void @__omp_outlined__2(i32* %31, i32* %.zero.addr25.i, i64 %28, i64 %30, i32** %dis.addr.i) #3 + %32 = load i32, i32* %.omp.iv9.i, align 4, !noalias !11 + %33 = load i32, i32* %.omp.stride13.i, align 4, !noalias !11 + %add27.i = add nsw i32 %32, %33 + store i32 %add27.i, i32* %.omp.iv9.i, align 4, !noalias !11 + %34 = load i32, i32* %.omp.comb.lb11.i, align 4, !noalias !11 + %35 = load i32, i32* %.omp.stride13.i, align 4, !noalias !11 + %add28.i = add nsw i32 %34, %35 + store i32 %add28.i, i32* %.omp.comb.lb11.i, align 4, !noalias !11 + %36 = load i32, i32* %.omp.comb.ub12.i, align 4, !noalias !11 + %37 = load i32, i32* %.omp.stride13.i, align 4, !noalias !11 + %add29.i = add nsw i32 %36, %37 + store i32 %add29.i, i32* %.omp.comb.ub12.i, align 4, !noalias !11 + %38 = load i32, i32* %.omp.comb.ub12.i, align 4, !noalias !11 + %cmp30.i = icmp sgt i32 %38, 9 + br i1 %cmp30.i, label %cond.true31.i, label 
%cond.false32.i + +cond.true31.i: ; preds = %omp.inner.for.body24.i + br label %cond.end33.i + +cond.false32.i: ; preds = %omp.inner.for.body24.i + %39 = load i32, i32* %.omp.comb.ub12.i, align 4, !noalias !11 + br label %cond.end33.i + +cond.end33.i: ; preds = %cond.false32.i, %cond.true31.i + %cond34.i = phi i32 [ 9, %cond.true31.i ], [ %39, %cond.false32.i ] + store i32 %cond34.i, i32* %.omp.comb.ub12.i, align 4, !noalias !11 + %40 = load i32, i32* %.omp.comb.lb11.i, align 4, !noalias !11 + store i32 %40, i32* %.omp.iv9.i, align 4, !noalias !11 + br label %omp.inner.for.cond22.i + +__omp_outlined__.exit: ; preds = %omp.inner.for.cond22.i + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %4) #3 + %41 = load i32*, i32** %team.addr.i, align 8, !noalias !11 + %call.i = call i32 @omp_get_team_num() #3 + %idxprom.i = sext i32 %call.i to i64 + %arrayidx.i = getelementptr inbounds i32, i32* %41, i64 %idxprom.i + %42 = load i32, i32* %arrayidx.i, align 4 + %add37.i = add nsw i32 %42, 1 + store i32 %add37.i, i32* %arrayidx.i, align 4 + br label %.omp.deinit + +.omp.deinit: ; preds = %__omp_outlined__.exit + call void @__kmpc_spmd_kernel_deinit_v2(i16 1) + br label %.exit + +.exit: ; preds = %.omp.deinit + ret void +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #1 + +declare void @__kmpc_spmd_kernel_init(i32, i16, i16) + +declare void @__kmpc_data_sharing_init_stack_spmd() + +declare void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) + +; Function Attrs: noinline norecurse nounwind optnone +define internal void @__omp_outlined__1(i32* noalias %.global_tid., i32* noalias %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., i32** dereferenceable(8) %dis) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %.previous.lb..addr = alloca i64, align 8 + %.previous.ub..addr = alloca i64, align 8 + %dis.addr = alloca i32**, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i64 %.previous.lb., i64* %.previous.lb..addr, align 8 + store i64 %.previous.ub., i64* %.previous.ub..addr, align 8 + store i32** %dis, i32*** %dis.addr, align 8 + %0 = load i32**, i32*** %dis.addr, align 8 + store i32 0, i32* %.omp.lb, align 4 + store i32 9, i32* %.omp.ub, align 4 + %1 = load i64, i64* %.previous.lb..addr, align 8 + %conv = trunc i64 %1 to i32 + %2 = load i64, i64* %.previous.ub..addr, align 8 + %conv1 = trunc i64 %2 to i32 + store i32 %conv, i32* %.omp.lb, align 4 + store i32 %conv1, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %3 = load i32*, i32** %.global_tid..addr, align 8 + %4 = load i32, i32* %3, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %4, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %5 = load i32, i32* %.omp.lb, align 4 + store i32 %5, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %entry + %6 = load i32, i32* %.omp.iv, align 4 + %conv2 = sext i32 %6 to i64 + %7 = load i64, i64* %.previous.ub..addr, align 8 + %cmp = icmp ule i64 %conv2, %7 + br i1 %cmp, 
label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %8 = load i32, i32* %.omp.iv, align 4 + %mul = mul nsw i32 %8, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %i, align 4 + %9 = load i32*, i32** %0, align 8 + %10 = load i32, i32* %i, align 4 + %idxprom = sext i32 %10 to i64 + %arrayidx = getelementptr inbounds i32, i32* %9, i64 %idxprom + %11 = atomicrmw add i32* %arrayidx, i32 1 monotonic + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %12 = load i32, i32* %.omp.iv, align 4 + %13 = load i32, i32* %.omp.stride, align 4 + %add4 = add nsw i32 %12, %13 + store i32 %add4, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %4) + ret void +} + +declare void @__kmpc_for_static_fini(%struct.ident_t*, i32) + +; Function Attrs: noinline norecurse nounwind optnone +define internal void @__omp_outlined__2(i32* noalias %.global_tid., i32* noalias %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., i32** dereferenceable(8) %dis) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %.previous.lb..addr = alloca i64, align 8 + %.previous.ub..addr = alloca i64, align 8 + %dis.addr = alloca i32**, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i64 %.previous.lb., i64* %.previous.lb..addr, align 8 + store i64 %.previous.ub., i64* %.previous.ub..addr, align 8 + store i32** %dis, i32*** %dis.addr, align 8 + %0 = load i32**, i32*** %dis.addr, align 8 + store i32 0, i32* %.omp.lb, align 4 + store i32 9, i32* %.omp.ub, align 4 + %1 = load i64, i64* %.previous.lb..addr, align 8 + %conv = trunc i64 %1 to i32 + %2 = load i64, i64* %.previous.ub..addr, align 8 + %conv1 = trunc i64 %2 to i32 + store i32 %conv, i32* %.omp.lb, align 4 + store i32 %conv1, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %3 = load i32*, i32** %.global_tid..addr, align 8 + %4 = load i32, i32* %3, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %4, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %5 = load i32, i32* %.omp.lb, align 4 + store i32 %5, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %entry + %6 = load i32, i32* %.omp.iv, align 4 + %conv2 = sext i32 %6 to i64 + %7 = load i64, i64* %.previous.ub..addr, align 8 + %cmp = icmp ule i64 %conv2, %7 + br i1 %cmp, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %8 = load i32, i32* %.omp.iv, align 4 + %mul = mul nsw i32 %8, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %i, align 4 + %9 = load i32*, i32** %0, align 8 + %10 = load i32, i32* %i, align 4 + %idxprom = sext i32 %10 to i64 + %arrayidx = getelementptr inbounds i32, i32* %9, i64 %idxprom + %11 = atomicrmw add i32* %arrayidx, i32 1 monotonic + br label 
%omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %12 = load i32, i32* %.omp.iv, align 4 + %13 = load i32, i32* %.omp.stride, align 4 + %add4 = add nsw i32 %12, %13 + store i32 %add4, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %4) + ret void +} + +declare i32 @omp_get_team_num() #2 + +declare i32 @__kmpc_global_thread_num(%struct.ident_t*) + +declare void @__kmpc_spmd_kernel_deinit_v2(i16) + +attributes #0 = { noinline norecurse nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx61,+sm_70" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone } +attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx61,+sm_70" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { nounwind } + +!omp_offload.info = !{!0} +!nvvm.annotations = !{!1, !2, !3, !2, !4, !4, !4, !4, !5, !5, !4} +!llvm.module.flags = !{!6, !7} +!llvm.ident = !{!8} +!nvvm.internalize.after.link = !{} +!nvvmir.version = !{!9} + +!0 = !{i32 0, i32 43, i32 21153073, !"foo", i32 10, i32 0} +!1 = !{void (i32*, i32*)* @__omp_offloading_2b_142c531_foo_l10, !"kernel", i32 1} +!2 = !{null, !"align", i32 8} +!3 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +!4 = !{null, !"align", i32 16} +!5 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +!6 = !{i32 1, !"wchar_size", i32 4} +!7 = !{i32 7, !"PIC Level", i32 2} +!8 = !{!"clang version 9.0.0 "} +!9 = !{i32 1, i32 2} +!10 = !{i32 1, i32 1025} +!11 = !{!12, !14} +!12 = distinct !{!12, !13, !"__omp_outlined__: %.global_tid."} +!13 = distinct !{!13, !"__omp_outlined__"} +!14 = distinct !{!14, !13, !"__omp_outlined__: %.bound_tid."} + +; __CLANG_OFFLOAD_BUNDLE____END__ openmp-nvptx64-nvida-cuda + +; __CLANG_OFFLOAD_BUNDLE____START__ host-x86_64-unknown-linux-gnu +; ModuleID = '/tmp/jdoerfert/target_offload_is_SPMD-c76c92.bc' +source_filename = "/home/jdoerfert/projects/llvm-project/llvm/test/Transforms/OpenMP/target_offload_is_SPMD.c" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } +%struct.__tgt_offload_entry = type { i8*, i8*, i64, i32, i32 } +%struct.__tgt_device_image = type { i8*, i8*, %struct.__tgt_offload_entry*, %struct.__tgt_offload_entry* } +%struct.__tgt_bin_desc = type { i32, %struct.__tgt_device_image*, %struct.__tgt_offload_entry*, %struct.__tgt_offload_entry* } + +$.omp_offloading.descriptor_reg.nvptx64-nvida-cuda = comdat any + +@.str = private unnamed_addr constant [23 x i8] 
c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr global %struct.ident_t { i32 0, i32 2050, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@1 = private unnamed_addr global %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@2 = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@.__omp_offloading_2b_142c531_foo_l10.region_id = weak constant i8 0 +@.offload_sizes = private unnamed_addr constant [2 x i64] [i64 40, i64 40] +@.offload_maptypes = private unnamed_addr constant [2 x i64] [i64 35, i64 35] +@.str.3 = private unnamed_addr constant [33 x i8] c"dis[%3i] = %4i\09\09team[%3i] = %4i\0A\00", align 1 +@.omp_offloading.entry_name = internal unnamed_addr constant [36 x i8] c"__omp_offloading_2b_142c531_foo_l10\00" +@.omp_offloading.entry.__omp_offloading_2b_142c531_foo_l10 = weak constant %struct.__tgt_offload_entry { i8* @.__omp_offloading_2b_142c531_foo_l10.region_id, i8* getelementptr inbounds ([36 x i8], [36 x i8]* @.omp_offloading.entry_name, i32 0, i32 0), i64 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +@.omp_offloading.entries_begin = external constant %struct.__tgt_offload_entry +@.omp_offloading.entries_end = external constant %struct.__tgt_offload_entry +@.omp_offloading.img_start.nvptx64-nvida-cuda = extern_weak constant i8 +@.omp_offloading.img_end.nvptx64-nvida-cuda = extern_weak constant i8 +@.omp_offloading.device_images = internal unnamed_addr constant [1 x %struct.__tgt_device_image] [%struct.__tgt_device_image { i8* @.omp_offloading.img_start.nvptx64-nvida-cuda, i8* @.omp_offloading.img_end.nvptx64-nvida-cuda, %struct.__tgt_offload_entry* @.omp_offloading.entries_begin, %struct.__tgt_offload_entry* @.omp_offloading.entries_end }], comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cuda), align 8 +@.omp_offloading.descriptor = internal constant %struct.__tgt_bin_desc { i32 1, %struct.__tgt_device_image* getelementptr inbounds ([1 x %struct.__tgt_device_image], [1 x %struct.__tgt_device_image]* @.omp_offloading.device_images, i32 0, i32 0), %struct.__tgt_offload_entry* @.omp_offloading.entries_begin, %struct.__tgt_offload_entry* @.omp_offloading.entries_end }, comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cuda), align 8 +@__dso_handle = external hidden global i8 +@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 0, void ()* @.omp_offloading.descriptor_reg.nvptx64-nvida-cuda, i8* bitcast (void ()* @.omp_offloading.descriptor_reg.nvptx64-nvida-cuda to i8*) }] + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local void @foo(i32* %dis, i32* %team) #0 { +entry: + %dis.addr = alloca i32*, align 8 + %team.addr = alloca i32*, align 8 + %.offload_baseptrs = alloca [2 x i8*], align 8 + %.offload_ptrs = alloca [2 x i8*], align 8 + store i32* %dis, i32** %dis.addr, align 8 + store i32* %team, i32** %team.addr, align 8 + %0 = load i32*, i32** %dis.addr, align 8 + %1 = load i32*, i32** %team.addr, align 8 + %2 = load i32*, i32** %dis.addr, align 8 + %3 = load i32*, i32** %dis.addr, align 8 + %arrayidx = getelementptr inbounds i32, i32* %3, i64 0 + %4 = load i32*, i32** %team.addr, align 8 + %5 = load i32*, i32** %team.addr, align 8 + %arrayidx1 = getelementptr inbounds i32, i32* %5, i64 0 + %6 = getelementptr inbounds [2 x i8*], [2 x i8*]* 
%.offload_baseptrs, i32 0, i32 0 + %7 = bitcast i8** %6 to i32** + store i32* %2, i32** %7, align 8 + %8 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i32 0, i32 0 + %9 = bitcast i8** %8 to i32** + store i32* %arrayidx, i32** %9, align 8 + %10 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i32 0, i32 1 + %11 = bitcast i8** %10 to i32** + store i32* %4, i32** %11, align 8 + %12 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i32 0, i32 1 + %13 = bitcast i8** %12 to i32** + store i32* %arrayidx1, i32** %13, align 8 + %14 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i32 0, i32 0 + %15 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i32 0, i32 0 + %16 = call i32 @__tgt_target_teams(i64 -1, i8* @.__omp_offloading_2b_142c531_foo_l10.region_id, i32 2, i8** %14, i8** %15, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes, i32 0, i32 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes, i32 0, i32 0), i32 3, i32 0) + %17 = icmp ne i32 %16, 0 + br i1 %17, label %omp_offload.failed, label %omp_offload.cont + +omp_offload.failed: ; preds = %entry + call void @__omp_offloading_2b_142c531_foo_l10(i32* %0, i32* %1) #4 + br label %omp_offload.cont + +omp_offload.cont: ; preds = %omp_offload.failed, %entry + ret void +} + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define internal void @__omp_offloading_2b_142c531_foo_l10(i32* %dis, i32* %team) #1 { +entry: + %dis.addr = alloca i32*, align 8 + %team.addr = alloca i32*, align 8 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + store i32* %dis, i32** %dis.addr, align 8 + store i32* %team, i32** %team.addr, align 8 + %1 = call i32 @__kmpc_push_num_teams(%struct.ident_t* @2, i32 %0, i32 3, i32 0) + %2 = load i32*, i32** %dis.addr, align 8 + %3 = load i32*, i32** %team.addr, align 8 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%struct.ident_t* @2, i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*)* @.omp_outlined. 
to void (i32*, i32*, ...)*), i32* %2, i32* %3) + ret void +} + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define internal void @.omp_outlined.(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* %dis, i32* %team) #1 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %dis.addr = alloca i32*, align 8 + %team.addr = alloca i32*, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.comb.lb = alloca i32, align 4 + %.omp.comb.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + %.omp.iv2 = alloca i32, align 4 + %tmp3 = alloca i32, align 4 + %.omp.comb.lb4 = alloca i32, align 4 + %.omp.comb.ub5 = alloca i32, align 4 + %.omp.stride6 = alloca i32, align 4 + %.omp.is_last7 = alloca i32, align 4 + %i8 = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i32* %dis, i32** %dis.addr, align 8 + store i32* %team, i32** %team.addr, align 8 + store i32 0, i32* %.omp.comb.lb, align 4 + store i32 9, i32* %.omp.comb.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %0 = load i32*, i32** %.global_tid..addr, align 8 + %1 = load i32, i32* %0, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %1, i32 92, i32* %.omp.is_last, i32* %.omp.comb.lb, i32* %.omp.comb.ub, i32* %.omp.stride, i32 1, i32 1) + %2 = load i32, i32* %.omp.comb.ub, align 4 + %cmp = icmp sgt i32 %2, 9 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + br label %cond.end + +cond.false: ; preds = %entry + %3 = load i32, i32* %.omp.comb.ub, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 9, %cond.true ], [ %3, %cond.false ] + store i32 %cond, i32* %.omp.comb.ub, align 4 + %4 = load i32, i32* %.omp.comb.lb, align 4 + store i32 %4, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %cond.end + %5 = load i32, i32* %.omp.iv, align 4 + %6 = load i32, i32* %.omp.comb.ub, align 4 + %cmp1 = icmp sle i32 %5, %6 + br i1 %cmp1, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %7 = load i32, i32* %.omp.comb.lb, align 4 + %8 = zext i32 %7 to i64 + %9 = load i32, i32* %.omp.comb.ub, align 4 + %10 = zext i32 %9 to i64 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* @2, i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64, i64, i32**)* @.omp_outlined..1 to void (i32*, i32*, ...)*), i64 %8, i64 %10, i32** %dis.addr) + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.inner.for.body + %11 = load i32, i32* %.omp.iv, align 4 + %12 = load i32, i32* %.omp.stride, align 4 + %add = add nsw i32 %11, %12 + store i32 %add, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %1) + store i32 0, i32* %.omp.comb.lb4, align 4 + store i32 9, i32* %.omp.comb.ub5, align 4 + store i32 1, i32* %.omp.stride6, align 4 + store i32 0, i32* %.omp.is_last7, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %1, i32 92, i32* %.omp.is_last7, i32* %.omp.comb.lb4, i32* %.omp.comb.ub5, i32* %.omp.stride6, i32 1, i32 1) + %13 = load i32, i32* %.omp.comb.ub5, align 4 + %cmp9 = icmp sgt i32 %13, 9 + br i1 %cmp9, label %cond.true10, label %cond.false11 + +cond.true10: ; preds = %omp.loop.exit + br label %cond.end12 + +cond.false11: ; preds = %omp.loop.exit + %14 = load i32, i32* %.omp.comb.ub5, align 4 + br label %cond.end12 + +cond.end12: ; preds = %cond.false11, %cond.true10 + %cond13 = phi i32 [ 9, %cond.true10 ], [ %14, %cond.false11 ] + store i32 %cond13, i32* %.omp.comb.ub5, align 4 + %15 = load i32, i32* %.omp.comb.lb4, align 4 + store i32 %15, i32* %.omp.iv2, align 4 + br label %omp.inner.for.cond14 + +omp.inner.for.cond14: ; preds = %omp.inner.for.inc17, %cond.end12 + %16 = load i32, i32* %.omp.iv2, align 4 + %17 = load i32, i32* %.omp.comb.ub5, align 4 + %cmp15 = icmp sle i32 %16, %17 + br i1 %cmp15, label %omp.inner.for.body16, label %omp.inner.for.end19 + +omp.inner.for.body16: ; preds = %omp.inner.for.cond14 + %18 = load i32, i32* %.omp.comb.lb4, align 4 + %19 = zext i32 %18 to i64 + %20 = load i32, i32* %.omp.comb.ub5, align 4 + %21 = zext i32 %20 to i64 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* @2, i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64, i64, i32**)* @.omp_outlined..2 to void (i32*, i32*, ...)*), i64 %19, i64 %21, i32** %dis.addr) + br label %omp.inner.for.inc17 + +omp.inner.for.inc17: ; preds = %omp.inner.for.body16 + %22 = load i32, i32* %.omp.iv2, align 4 + %23 = load i32, i32* %.omp.stride6, align 4 + %add18 = add nsw i32 %22, %23 + store i32 %add18, i32* %.omp.iv2, align 4 + br label %omp.inner.for.cond14 + +omp.inner.for.end19: ; preds = %omp.inner.for.cond14 + br label %omp.loop.exit20 + +omp.loop.exit20: ; preds = %omp.inner.for.end19 + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %1) + %24 = load i32*, i32** %team.addr, align 8 + %call = call i32 @omp_get_team_num() + %idxprom = sext i32 %call to i64 + %arrayidx = getelementptr inbounds i32, i32* %24, i64 %idxprom + %25 = load i32, i32* %arrayidx, align 4 + %add21 = add nsw i32 %25, 1 + store i32 %add21, i32* %arrayidx, align 4 + ret void +} + +declare dso_local void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define internal void @.omp_outlined..1(i32* noalias %.global_tid., i32* noalias %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., i32** dereferenceable(8) %dis) #1 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %.previous.lb..addr = alloca i64, align 8 + %.previous.ub..addr = alloca i64, align 8 + %dis.addr = alloca i32**, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i64 %.previous.lb., i64* %.previous.lb..addr, align 8 + store i64 %.previous.ub., i64* %.previous.ub..addr, align 8 + store i32** %dis, i32*** %dis.addr, align 8 + %0 = load i32**, i32*** %dis.addr, align 8 + store i32 0, i32* %.omp.lb, align 4 + store i32 9, i32* %.omp.ub, align 4 + %1 = load i64, i64* %.previous.lb..addr, align 8 + %conv = trunc i64 %1 to i32 + %2 = load i64, i64* %.previous.ub..addr, align 8 + %conv1 = trunc i64 %2 to i32 + store i32 %conv, i32* %.omp.lb, align 4 + store i32 %conv1, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %3 = load i32*, i32** %.global_tid..addr, align 8 + %4 = load i32, i32* %3, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %4, i32 34, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %5 = load i32, i32* %.omp.ub, align 4 + %cmp = icmp sgt i32 %5, 9 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + br label %cond.end + +cond.false: ; preds = %entry + %6 = load i32, i32* %.omp.ub, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 9, %cond.true ], [ %6, %cond.false ] + store i32 %cond, i32* %.omp.ub, align 4 + %7 = load i32, i32* %.omp.lb, align 4 + store i32 %7, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %cond.end + %8 = load i32, i32* %.omp.iv, align 4 + %9 = load i32, i32* %.omp.ub, align 4 + %cmp3 = icmp sle i32 %8, %9 + br i1 %cmp3, label %omp.inner.for.body, label %omp.inner.for.end + 
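+; The body below recovers the loop index i from the induction variable and
+; performs the source-level '#pragma omp atomic' update as a monotonic
+; atomicrmw add on dis[i].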
+omp.inner.for.body: ; preds = %omp.inner.for.cond + %10 = load i32, i32* %.omp.iv, align 4 + %mul = mul nsw i32 %10, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %i, align 4 + %11 = load i32*, i32** %0, align 8 + %12 = load i32, i32* %i, align 4 + %idxprom = sext i32 %12 to i64 + %arrayidx = getelementptr inbounds i32, i32* %11, i64 %idxprom + %13 = atomicrmw add i32* %arrayidx, i32 1 monotonic + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %14 = load i32, i32* %.omp.iv, align 4 + %add5 = add nsw i32 %14, 1 + store i32 %add5, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %4) + ret void +} + +declare dso_local void @__kmpc_for_static_fini(%struct.ident_t*, i32) + +declare !callback !3 dso_local void @__kmpc_fork_call(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define internal void @.omp_outlined..2(i32* noalias %.global_tid., i32* noalias %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., i32** dereferenceable(8) %dis) #1 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %.previous.lb..addr = alloca i64, align 8 + %.previous.ub..addr = alloca i64, align 8 + %dis.addr = alloca i32**, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i64 %.previous.lb., i64* %.previous.lb..addr, align 8 + store i64 %.previous.ub., i64* %.previous.ub..addr, align 8 + store i32** %dis, i32*** %dis.addr, align 8 + %0 = load i32**, i32*** %dis.addr, align 8 + store i32 0, i32* %.omp.lb, align 4 + store i32 9, i32* %.omp.ub, align 4 + %1 = load i64, i64* %.previous.lb..addr, align 8 + %conv = trunc i64 %1 to i32 + %2 = load i64, i64* %.previous.ub..addr, align 8 + %conv1 = trunc i64 %2 to i32 + store i32 %conv, i32* %.omp.lb, align 4 + store i32 %conv1, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %3 = load i32*, i32** %.global_tid..addr, align 8 + %4 = load i32, i32* %3, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %4, i32 34, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %5 = load i32, i32* %.omp.ub, align 4 + %cmp = icmp sgt i32 %5, 9 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + br label %cond.end + +cond.false: ; preds = %entry + %6 = load i32, i32* %.omp.ub, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 9, %cond.true ], [ %6, %cond.false ] + store i32 %cond, i32* %.omp.ub, align 4 + %7 = load i32, i32* %.omp.lb, align 4 + store i32 %7, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %cond.end + %8 = load i32, i32* %.omp.iv, align 4 + %9 = load i32, i32* %.omp.ub, align 4 + %cmp3 = icmp sle i32 %8, %9 + br i1 %cmp3, label %omp.inner.for.body, label %omp.inner.for.end + 
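+; Second outlined parallel body, structurally identical to .omp_outlined..1;
+; it performs the same atomic increment for the second 'distribute parallel
+; for' in the source.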
+omp.inner.for.body: ; preds = %omp.inner.for.cond + %10 = load i32, i32* %.omp.iv, align 4 + %mul = mul nsw i32 %10, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %i, align 4 + %11 = load i32*, i32** %0, align 8 + %12 = load i32, i32* %i, align 4 + %idxprom = sext i32 %12 to i64 + %arrayidx = getelementptr inbounds i32, i32* %11, i64 %idxprom + %13 = atomicrmw add i32* %arrayidx, i32 1 monotonic + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %14 = load i32, i32* %.omp.iv, align 4 + %add5 = add nsw i32 %14, 1 + store i32 %add5, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %4) + ret void +} + +declare dso_local i32 @omp_get_team_num() #2 + +declare dso_local i32 @__kmpc_global_thread_num(%struct.ident_t*) + +declare dso_local i32 @__kmpc_push_num_teams(%struct.ident_t*, i32, i32, i32) + +declare !callback !3 dso_local void @__kmpc_fork_teams(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) + +declare dso_local i32 @__tgt_target_teams(i64, i8*, i32, i8**, i8**, i64*, i64*, i32, i32) + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local i32 @main() #0 { +entry: + %retval = alloca i32, align 4 + %dis = alloca [10 x i32], align 16 + %team = alloca [10 x i32], align 16 + %i = alloca i32, align 4 + %i4 = alloca i32, align 4 + store i32 0, i32* %retval, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %0, 10 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load i32, i32* %i, align 4 + %2 = load i32, i32* %i, align 4 + %idxprom = sext i32 %2 to i64 + %arrayidx = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 %idxprom + store i32 %1, i32* %arrayidx, align 4 + %3 = load i32, i32* %i, align 4 + %idxprom1 = sext i32 %3 to i64 + %arrayidx2 = getelementptr inbounds [10 x i32], [10 x i32]* %team, i64 0, i64 %idxprom1 + store i32 0, i32* %arrayidx2, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %4 = load i32, i32* %i, align 4 + %inc = add nsw i32 %4, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %arraydecay = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i32 0, i32 0 + %arraydecay3 = getelementptr inbounds [10 x i32], [10 x i32]* %team, i32 0, i32 0 + call void @foo(i32* %arraydecay, i32* %arraydecay3) + store i32 0, i32* %i4, align 4 + br label %for.cond5 + +for.cond5: ; preds = %for.inc12, %for.end + %5 = load i32, i32* %i4, align 4 + %cmp6 = icmp slt i32 %5, 10 + br i1 %cmp6, label %for.body7, label %for.end14 + +for.body7: ; preds = %for.cond5 + %6 = load i32, i32* %i4, align 4 + %7 = load i32, i32* %i4, align 4 + %idxprom8 = sext i32 %7 to i64 + %arrayidx9 = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 %idxprom8 + %8 = load i32, i32* %arrayidx9, align 4 + %9 = load i32, i32* %i4, align 4 + %10 = load i32, i32* %i4, align 4 + %idxprom10 = sext i32 %10 to i64 + %arrayidx11 = getelementptr inbounds [10 x i32], [10 x i32]* %team, i64 0, i64 %idxprom10 + %11 = load i32, i32* %arrayidx11, align 4 + %call = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([33 x i8], [33 x i8]* @.str.3, i32 0, i32 0), i32 %6, i32 %8, i32 %9, i32 %11) + br label %for.inc12 + +for.inc12: ; preds = %for.body7 + %12 = load i32, i32* %i4, align 4 + %inc13 = add nsw i32 %12, 1 + store i32 %inc13, i32* %i4, align 4 + br label %for.cond5 + +for.end14: ; preds = %for.cond5 + ret i32 0 +} + +declare dso_local i32 @printf(i8*, ...) #2 + +; Function Attrs: noinline nounwind uwtable +define internal void @.omp_offloading.descriptor_unreg(i8*) #3 section ".text.startup" comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cuda) { +entry: + %.addr = alloca i8*, align 8 + store i8* %0, i8** %.addr, align 8 + %1 = call i32 @__tgt_unregister_lib(%struct.__tgt_bin_desc* @.omp_offloading.descriptor) + ret void +} + +declare dso_local i32 @__tgt_unregister_lib(%struct.__tgt_bin_desc*) + +; Function Attrs: noinline nounwind uwtable +define linkonce hidden void @.omp_offloading.descriptor_reg.nvptx64-nvida-cuda() #3 section ".text.startup" comdat { +entry: + %0 = call i32 @__tgt_register_lib(%struct.__tgt_bin_desc* @.omp_offloading.descriptor) + %1 = call i32 @__cxa_atexit(void (i8*)* @.omp_offloading.descriptor_unreg, i8* bitcast (%struct.__tgt_bin_desc* @.omp_offloading.descriptor to i8*), i8* @__dso_handle) #4 + ret void +} + +declare dso_local i32 @__tgt_register_lib(%struct.__tgt_bin_desc*) + +; Function Attrs: nounwind +declare dso_local i32 @__cxa_atexit(void (i8*)*, i8*, i8*) #4 + +attributes #0 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { noinline norecurse nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } 
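+; Host-side attribute groups: unlike the device module above (sm_70, +ptx61),
+; these functions target the x86-64 host.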
+attributes #4 = { nounwind } + +!omp_offload.info = !{!0} +!llvm.module.flags = !{!1} +!llvm.ident = !{!2} + +!0 = !{i32 0, i32 43, i32 21153073, !"foo", i32 10, i32 0} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{!"clang version 9.0.0 "} +!3 = !{!4} +!4 = !{i64 2, i64 -1, i64 -1, i1 true} + +; __CLANG_OFFLOAD_BUNDLE____END__ host-x86_64-unknown-linux-gnu Index: SPMD_examples/v0.2/target_offload_is_SPMD.old.ll =================================================================== --- /dev/null +++ SPMD_examples/v0.2/target_offload_is_SPMD.old.ll @@ -0,0 +1,1095 @@ + +; __CLANG_OFFLOAD_BUNDLE____START__ openmp-nvptx64-nvida-cuda +; ModuleID = '/home/jdoerfert/projects/llvm-project/llvm/test/Transforms/OpenMP/target_offload_is_SPMD.c' +source_filename = "/home/jdoerfert/projects/llvm-project/llvm/test/Transforms/OpenMP/target_offload_is_SPMD.c" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvida-cuda" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } +%"union._shared_openmp_static_memory_type_$_" = type { [128 x i8] } +%struct._globalized_locals_ty = type { i32* } + +@"_openmp_kernel_static_glob_rd$ptr" = internal addrspace(3) global i8* null +@"_openmp_static_kernel$is_shared" = internal unnamed_addr constant i16 1 +@"_openmp_static_kernel$size" = internal unnamed_addr constant i64 8 +@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2050, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@2 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@__omp_offloading_2b_142c531_foo_l10_exec_mode = weak constant i8 1 +@"_openmp_shared_static_glob_rd_$_" = common addrspace(3) global %"union._shared_openmp_static_memory_type_$_" zeroinitializer +@llvm.compiler.used = appending global [1 x i8*] [i8* @__omp_offloading_2b_142c531_foo_l10_exec_mode], section "llvm.metadata" + +; Function Attrs: noinline norecurse nounwind +define internal void @__omp_offloading_2b_142c531_foo_l10_worker() #0 { +entry: + %work_fn = alloca i8*, align 8 + %exec_status = alloca i8, align 1 + store i8* null, i8** %work_fn, align 8 + store i8 0, i8* %exec_status, align 1 + br label %.await.work + +.await.work: ; preds = %.barrier.parallel, %entry + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) + %0 = call i1 @__kmpc_kernel_parallel(i8** %work_fn, i16 1) + %1 = zext i1 %0 to i8 + store i8 %1, i8* %exec_status, align 1 + %2 = load i8*, i8** %work_fn, align 8 + %should_terminate = icmp eq i8* %2, null + br i1 %should_terminate, label %.exit, label %.select.workers + +.select.workers: ; preds = %.await.work + %3 = load i8, i8* %exec_status, align 1 + %is_active = icmp ne i8 %3, 0 + br i1 %is_active, label %.execute.parallel, label %.barrier.parallel + +.execute.parallel: ; preds = %.select.workers + %4 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + %5 = load i8*, i8** %work_fn, align 8 + %work_match = icmp eq i8* %5, bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) + br i1 %work_match, label %.execute.fn, label %.check.next + +.execute.fn: ; preds = %.execute.parallel + call void @__omp_outlined__1_wrapper(i16 0, i32 %4) #5 
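+; Generic-mode work dispatch: the worker matched the function pointer
+; published by the master and ran the outlined parallel region; it now
+; proceeds to __kmpc_kernel_end_parallel and back to the barrier. This
+; master/worker indirection is exactly what SPMD mode avoids.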
+ br label %.terminate.parallel + +.check.next: ; preds = %.execute.parallel + %6 = load i8*, i8** %work_fn, align 8 + %work_match1 = icmp eq i8* %6, bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*) + br i1 %work_match1, label %.execute.fn2, label %.check.next3 + +.execute.fn2: ; preds = %.check.next + call void @__omp_outlined__2_wrapper(i16 0, i32 %4) #5 + br label %.terminate.parallel + +.check.next3: ; preds = %.check.next + %7 = bitcast i8* %2 to void (i16, i32)* + call void %7(i16 0, i32 %4) + br label %.terminate.parallel + +.terminate.parallel: ; preds = %.check.next3, %.execute.fn2, %.execute.fn + call void @__kmpc_kernel_end_parallel() + br label %.barrier.parallel + +.barrier.parallel: ; preds = %.terminate.parallel, %.select.workers + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) + br label %.await.work + +.exit: ; preds = %.await.work + ret void +} + +; Function Attrs: noinline norecurse nounwind optnone +define weak void @__omp_offloading_2b_142c531_foo_l10(i32* %dis, i32* %team) #1 { +entry: + %.global_tid..addr.i = alloca i32*, align 8 + %.bound_tid..addr.i = alloca i32*, align 8 + %dis.addr.i = alloca i32*, align 8 + %team.addr.i = alloca i32*, align 8 + %.omp.iv.i = alloca i32, align 4 + %tmp.i = alloca i32, align 4 + %.omp.comb.lb.i = alloca i32, align 4 + %.omp.comb.ub.i = alloca i32, align 4 + %.omp.stride.i = alloca i32, align 4 + %.omp.is_last.i = alloca i32, align 4 + %i.i = alloca i32, align 4 + %.zero.addr.i = alloca i32, align 4 + %shared_arg_refs.i = alloca i8**, align 8 + %.omp.iv3.i = alloca i32, align 4 + %tmp4.i = alloca i32, align 4 + %.omp.comb.lb5.i = alloca i32, align 4 + %.omp.comb.ub6.i = alloca i32, align 4 + %.omp.stride7.i = alloca i32, align 4 + %.omp.is_last8.i = alloca i32, align 4 + %i9.i = alloca i32, align 4 + %.zero.addr18.i = alloca i32, align 4 + %shared_arg_refs19.i = alloca i8**, align 8 + %dis.addr = alloca i32*, align 8 + %team.addr = alloca i32*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. 
= alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i32* %dis, i32** %dis.addr, align 8 + store i32* %team, i32** %team.addr, align 8 + %nvptx_warp_size = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range !10 + %nvptx_num_threads = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range !11 + %thread_limit = sub nuw i32 %nvptx_num_threads, %nvptx_warp_size + %nvptx_tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !12 + %0 = icmp ult i32 %nvptx_tid, %thread_limit + br i1 %0, label %.worker, label %.mastercheck + +.worker: ; preds = %entry + call void @__omp_offloading_2b_142c531_foo_l10_worker() #5 + br label %.exit + +.mastercheck: ; preds = %entry + %nvptx_num_threads1 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range !11 + %nvptx_warp_size2 = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range !10 + %1 = sub nuw i32 %nvptx_warp_size2, 1 + %2 = xor i32 %1, -1 + %3 = sub nuw i32 %nvptx_num_threads1, 1 + %master_tid = and i32 %3, %2 + %nvptx_tid3 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !12 + %4 = icmp eq i32 %nvptx_tid3, %master_tid + br i1 %4, label %.master, label %.exit + +.master: ; preds = %.mastercheck + %nvptx_warp_size4 = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range !10 + %nvptx_num_threads5 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range !11 + %thread_limit6 = sub nuw i32 %nvptx_num_threads5, %nvptx_warp_size4 + call void @__kmpc_kernel_init(i32 %thread_limit6, i16 1) + call void @__kmpc_data_sharing_init_stack() + %5 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + %6 = load i32*, i32** %dis.addr, align 8 + %7 = load i32*, i32** %team.addr, align 8 + store i32 %5, i32* %.threadid_temp., align 4 + store i32 0, i32* %.zero.addr18.i, align 4, !noalias !13 + store i32 0, i32* %.zero.addr.i, align 4, !noalias !13 + store i32* %.threadid_temp., i32** %.global_tid..addr.i, align 8, !noalias !13 + store i32* %.zero.addr, i32** %.bound_tid..addr.i, align 8, !noalias !13 + store i32* %6, i32** %dis.addr.i, align 8, !noalias !13 + store i32* %7, i32** %team.addr.i, align 8, !noalias !13 + call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 8, i16 1, i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) #5 + %8 = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8 + %9 = bitcast i8* %8 to %struct._globalized_locals_ty* + %10 = load i32*, i32** %dis.addr.i, align 8, !noalias !13 + %dis1.i = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* %9, i32 0, i32 0 + store i32* %10, i32** %dis1.i, align 8 + store i32 0, i32* %.omp.comb.lb.i, align 4, !noalias !13 + store i32 9, i32* %.omp.comb.ub.i, align 4, !noalias !13 + store i32 1, i32* %.omp.stride.i, align 4, !noalias !13 + store i32 0, i32* %.omp.is_last.i, align 4, !noalias !13 + %11 = load i32*, i32** %.global_tid..addr.i, align 8, !noalias !13 + %12 = load i32, i32* %11, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %12, i32 92, i32* %.omp.is_last.i, i32* %.omp.comb.lb.i, i32* %.omp.comb.ub.i, i32* %.omp.stride.i, i32 1, i32 1) #5 + %13 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !13 + %cmp.i = icmp sgt i32 %13, 9 + br i1 %cmp.i, label %cond.true.i, label %cond.false.i + +cond.true.i: ; preds = %.master + br label %cond.end.i + 
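+; The cond.true.i/cond.false.i diamond clamps the distribute upper bound to
+; the loop trip count, i.e. min(.omp.comb.ub, 9) for N = 10.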
+cond.false.i: ; preds = %.master + %14 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !13 + br label %cond.end.i + +cond.end.i: ; preds = %cond.false.i, %cond.true.i + %cond.i = phi i32 [ 9, %cond.true.i ], [ %14, %cond.false.i ] + store i32 %cond.i, i32* %.omp.comb.ub.i, align 4, !noalias !13 + %15 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !13 + store i32 %15, i32* %.omp.iv.i, align 4, !noalias !13 + br label %omp.inner.for.cond.i + +omp.inner.for.cond.i: ; preds = %omp.inner.for.body.i, %cond.end.i + %16 = load i32, i32* %.omp.iv.i, align 4, !noalias !13 + %17 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !13 + %cmp2.i = icmp sle i32 %16, %17 + br i1 %cmp2.i, label %omp.inner.for.body.i, label %omp.inner.for.end.i + +omp.inner.for.body.i: ; preds = %omp.inner.for.cond.i + %18 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !13 + %19 = zext i32 %18 to i64 + %20 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !13 + %21 = zext i32 %20 to i64 + call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i16 1) #5 + call void @__kmpc_begin_sharing_variables(i8*** %shared_arg_refs.i, i64 3) #5 + %22 = load i8**, i8*** %shared_arg_refs.i, align 8, !noalias !13 + %23 = inttoptr i64 %19 to i8* + store i8* %23, i8** %22, align 8 + %24 = getelementptr inbounds i8*, i8** %22, i64 1 + %25 = inttoptr i64 %21 to i8* + store i8* %25, i8** %24, align 8 + %26 = getelementptr inbounds i8*, i8** %22, i64 2 + %27 = bitcast i32** %dis1.i to i8* + store i8* %27, i8** %26, align 8 + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) #5 + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) #5 + call void @__kmpc_end_sharing_variables() #5 + %28 = load i32, i32* %.omp.iv.i, align 4, !noalias !13 + %29 = load i32, i32* %.omp.stride.i, align 4, !noalias !13 + %add.i = add nsw i32 %28, %29 + store i32 %add.i, i32* %.omp.iv.i, align 4, !noalias !13 + br label %omp.inner.for.cond.i + +omp.inner.for.end.i: ; preds = %omp.inner.for.cond.i + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %12) #5 + store i32 0, i32* %.omp.comb.lb5.i, align 4, !noalias !13 + store i32 9, i32* %.omp.comb.ub6.i, align 4, !noalias !13 + store i32 1, i32* %.omp.stride7.i, align 4, !noalias !13 + store i32 0, i32* %.omp.is_last8.i, align 4, !noalias !13 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %12, i32 92, i32* %.omp.is_last8.i, i32* %.omp.comb.lb5.i, i32* %.omp.comb.ub6.i, i32* %.omp.stride7.i, i32 1, i32 1) #5 + %30 = load i32, i32* %.omp.comb.ub6.i, align 4, !noalias !13 + %cmp10.i = icmp sgt i32 %30, 9 + br i1 %cmp10.i, label %cond.true11.i, label %cond.false12.i + +cond.true11.i: ; preds = %omp.inner.for.end.i + br label %cond.end13.i + +cond.false12.i: ; preds = %omp.inner.for.end.i + %31 = load i32, i32* %.omp.comb.ub6.i, align 4, !noalias !13 + br label %cond.end13.i + +cond.end13.i: ; preds = %cond.false12.i, %cond.true11.i + %cond14.i = phi i32 [ 9, %cond.true11.i ], [ %31, %cond.false12.i ] + store i32 %cond14.i, i32* %.omp.comb.ub6.i, align 4, !noalias !13 + %32 = load i32, i32* %.omp.comb.lb5.i, align 4, !noalias !13 + store i32 %32, i32* %.omp.iv3.i, align 4, !noalias !13 + br label %omp.inner.for.cond15.i + +omp.inner.for.cond15.i: ; preds = %omp.inner.for.body17.i, %cond.end13.i + %33 = load i32, i32* %.omp.iv3.i, align 4, !noalias !13 + %34 = load i32, i32* %.omp.comb.ub6.i, align 4, !noalias !13 + %cmp16.i = icmp sle i32 %33, %34 + br i1 %cmp16.i, label %omp.inner.for.body17.i, label 
%__omp_outlined__.exit + +omp.inner.for.body17.i: ; preds = %omp.inner.for.cond15.i + %35 = load i32, i32* %.omp.comb.lb5.i, align 4, !noalias !13 + %36 = zext i32 %35 to i64 + %37 = load i32, i32* %.omp.comb.ub6.i, align 4, !noalias !13 + %38 = zext i32 %37 to i64 + call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i16 1) #5 + call void @__kmpc_begin_sharing_variables(i8*** %shared_arg_refs19.i, i64 3) #5 + %39 = load i8**, i8*** %shared_arg_refs19.i, align 8, !noalias !13 + %40 = inttoptr i64 %36 to i8* + store i8* %40, i8** %39, align 8 + %41 = getelementptr inbounds i8*, i8** %39, i64 1 + %42 = inttoptr i64 %38 to i8* + store i8* %42, i8** %41, align 8 + %43 = getelementptr inbounds i8*, i8** %39, i64 2 + %44 = bitcast i32** %dis1.i to i8* + store i8* %44, i8** %43, align 8 + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) #5 + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) #5 + call void @__kmpc_end_sharing_variables() #5 + %45 = load i32, i32* %.omp.iv3.i, align 4, !noalias !13 + %46 = load i32, i32* %.omp.stride7.i, align 4, !noalias !13 + %add21.i = add nsw i32 %45, %46 + store i32 %add21.i, i32* %.omp.iv3.i, align 4, !noalias !13 + br label %omp.inner.for.cond15.i + +__omp_outlined__.exit: ; preds = %omp.inner.for.cond15.i + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %12) #5 + %47 = load i32*, i32** %team.addr.i, align 8, !noalias !13 + %call.i = call i32 @omp_get_team_num() #5 + %idxprom.i = sext i32 %call.i to i64 + %arrayidx.i = getelementptr inbounds i32, i32* %47, i64 %idxprom.i + %48 = load i32, i32* %arrayidx.i, align 4 + %add24.i = add nsw i32 %48, 1 + store i32 %add24.i, i32* %arrayidx.i, align 4 + call void @__kmpc_restore_team_static_memory(i16 0, i16 1) #5 + br label %.termination.notifier + +.termination.notifier: ; preds = %__omp_outlined__.exit + call void @__kmpc_kernel_deinit(i16 1) + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) + br label %.exit + +.exit: ; preds = %.termination.notifier, %.mastercheck, %.worker + ret void +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.warpsize() #2 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #2 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2 + +declare void @__kmpc_kernel_init(i32, i16) + +declare void @__kmpc_data_sharing_init_stack() + +declare void @__kmpc_get_team_static_memory(i16, i8*, i64, i16, i8**) + +declare void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) + +; Function Attrs: noinline norecurse nounwind optnone +define internal void @__omp_outlined__1(i32* noalias %.global_tid., i32* noalias %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., i32** dereferenceable(8) %dis) #1 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %.previous.lb..addr = alloca i64, align 8 + %.previous.ub..addr = alloca i64, align 8 + %dis.addr = alloca i32**, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i64 %.previous.lb., i64* %.previous.lb..addr, align 8 + store i64 
%.previous.ub., i64* %.previous.ub..addr, align 8 + store i32** %dis, i32*** %dis.addr, align 8 + %0 = load i32**, i32*** %dis.addr, align 8 + store i32 0, i32* %.omp.lb, align 4 + store i32 9, i32* %.omp.ub, align 4 + %1 = load i64, i64* %.previous.lb..addr, align 8 + %conv = trunc i64 %1 to i32 + %2 = load i64, i64* %.previous.ub..addr, align 8 + %conv1 = trunc i64 %2 to i32 + store i32 %conv, i32* %.omp.lb, align 4 + store i32 %conv1, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %3 = load i32*, i32** %.global_tid..addr, align 8 + %4 = load i32, i32* %3, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %4, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %5 = load i32, i32* %.omp.lb, align 4 + store i32 %5, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %entry + %6 = load i32, i32* %.omp.iv, align 4 + %conv2 = sext i32 %6 to i64 + %7 = load i64, i64* %.previous.ub..addr, align 8 + %cmp = icmp ule i64 %conv2, %7 + br i1 %cmp, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %8 = load i32, i32* %.omp.iv, align 4 + %mul = mul nsw i32 %8, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %i, align 4 + %9 = load i32*, i32** %0, align 8 + %10 = load i32, i32* %i, align 4 + %idxprom = sext i32 %10 to i64 + %arrayidx = getelementptr inbounds i32, i32* %9, i64 %idxprom + %11 = atomicrmw add i32* %arrayidx, i32 1 monotonic + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %12 = load i32, i32* %.omp.iv, align 4 + %13 = load i32, i32* %.omp.stride, align 4 + %add4 = add nsw i32 %12, %13 + store i32 %add4, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %4) + ret void +} + +declare void @__kmpc_for_static_fini(%struct.ident_t*, i32) + +; Function Attrs: noinline norecurse nounwind +define internal void @__omp_outlined__1_wrapper(i16 zeroext, i32) #0 { +entry: + %.addr = alloca i16, align 2 + %.addr1 = alloca i32, align 4 + %.zero.addr = alloca i32, align 4 + %global_args = alloca i8**, align 8 + store i32 0, i32* %.zero.addr, align 4 + store i16 %0, i16* %.addr, align 2 + store i32 %1, i32* %.addr1, align 4 + call void @__kmpc_get_shared_variables(i8*** %global_args) + %2 = load i8**, i8*** %global_args, align 8 + %3 = getelementptr inbounds i8*, i8** %2, i64 0 + %4 = bitcast i8** %3 to i64* + %5 = load i64, i64* %4, align 8 + %6 = getelementptr inbounds i8*, i8** %2, i64 1 + %7 = bitcast i8** %6 to i64* + %8 = load i64, i64* %7, align 8 + %9 = getelementptr inbounds i8*, i8** %2, i64 2 + %10 = bitcast i8** %9 to i32*** + %11 = load i32**, i32*** %10, align 8 + call void @__omp_outlined__1(i32* %.addr1, i32* %.zero.addr, i64 %5, i64 %8, i32** %11) #5 + ret void +} + +declare void @__kmpc_get_shared_variables(i8***) + +declare void @__kmpc_kernel_prepare_parallel(i8*, i16) + +declare void @__kmpc_begin_sharing_variables(i8***, i64) + +; Function Attrs: convergent +declare void @__kmpc_barrier_simple_spmd(%struct.ident_t*, i32) #3 + +declare void @__kmpc_end_sharing_variables() + +; Function Attrs: noinline norecurse nounwind optnone +define internal 
void @__omp_outlined__2(i32* noalias %.global_tid., i32* noalias %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., i32** dereferenceable(8) %dis) #1 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %.previous.lb..addr = alloca i64, align 8 + %.previous.ub..addr = alloca i64, align 8 + %dis.addr = alloca i32**, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i64 %.previous.lb., i64* %.previous.lb..addr, align 8 + store i64 %.previous.ub., i64* %.previous.ub..addr, align 8 + store i32** %dis, i32*** %dis.addr, align 8 + %0 = load i32**, i32*** %dis.addr, align 8 + store i32 0, i32* %.omp.lb, align 4 + store i32 9, i32* %.omp.ub, align 4 + %1 = load i64, i64* %.previous.lb..addr, align 8 + %conv = trunc i64 %1 to i32 + %2 = load i64, i64* %.previous.ub..addr, align 8 + %conv1 = trunc i64 %2 to i32 + store i32 %conv, i32* %.omp.lb, align 4 + store i32 %conv1, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %3 = load i32*, i32** %.global_tid..addr, align 8 + %4 = load i32, i32* %3, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %4, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %5 = load i32, i32* %.omp.lb, align 4 + store i32 %5, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %entry + %6 = load i32, i32* %.omp.iv, align 4 + %conv2 = sext i32 %6 to i64 + %7 = load i64, i64* %.previous.ub..addr, align 8 + %cmp = icmp ule i64 %conv2, %7 + br i1 %cmp, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %8 = load i32, i32* %.omp.iv, align 4 + %mul = mul nsw i32 %8, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %i, align 4 + %9 = load i32*, i32** %0, align 8 + %10 = load i32, i32* %i, align 4 + %idxprom = sext i32 %10 to i64 + %arrayidx = getelementptr inbounds i32, i32* %9, i64 %idxprom + %11 = atomicrmw add i32* %arrayidx, i32 1 monotonic + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %12 = load i32, i32* %.omp.iv, align 4 + %13 = load i32, i32* %.omp.stride, align 4 + %add4 = add nsw i32 %12, %13 + store i32 %add4, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %4) + ret void +} + +; Function Attrs: noinline norecurse nounwind +define internal void @__omp_outlined__2_wrapper(i16 zeroext, i32) #0 { +entry: + %.addr = alloca i16, align 2 + %.addr1 = alloca i32, align 4 + %.zero.addr = alloca i32, align 4 + %global_args = alloca i8**, align 8 + store i32 0, i32* %.zero.addr, align 4 + store i16 %0, i16* %.addr, align 2 + store i32 %1, i32* %.addr1, align 4 + call void @__kmpc_get_shared_variables(i8*** %global_args) + %2 = load i8**, i8*** %global_args, align 8 + %3 = getelementptr inbounds i8*, i8** %2, i64 0 + %4 = bitcast i8** %3 to i64* + %5 = load i64, i64* %4, align 8 + 
%6 = getelementptr inbounds i8*, i8** %2, i64 1 + %7 = bitcast i8** %6 to i64* + %8 = load i64, i64* %7, align 8 + %9 = getelementptr inbounds i8*, i8** %2, i64 2 + %10 = bitcast i8** %9 to i32*** + %11 = load i32**, i32*** %10, align 8 + call void @__omp_outlined__2(i32* %.addr1, i32* %.zero.addr, i64 %5, i64 %8, i32** %11) #5 + ret void +} + +declare i32 @omp_get_team_num() #4 + +declare void @__kmpc_restore_team_static_memory(i16, i16) + +declare i32 @__kmpc_global_thread_num(%struct.ident_t*) + +declare void @__kmpc_kernel_deinit(i16) + +declare i1 @__kmpc_kernel_parallel(i8**, i16) + +declare void @__kmpc_kernel_end_parallel() + +attributes #0 = { noinline norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx61,+sm_70" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { noinline norecurse nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx61,+sm_70" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind readnone } +attributes #3 = { convergent } +attributes #4 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx61,+sm_70" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #5 = { nounwind } + +!omp_offload.info = !{!0} +!nvvm.annotations = !{!1, !2, !3, !2, !4, !4, !4, !4, !5, !5, !4} +!llvm.module.flags = !{!6, !7} +!llvm.ident = !{!8} +!nvvm.internalize.after.link = !{} +!nvvmir.version = !{!9} + +!0 = !{i32 0, i32 43, i32 21153073, !"foo", i32 10, i32 0} +!1 = !{void (i32*, i32*)* @__omp_offloading_2b_142c531_foo_l10, !"kernel", i32 1} +!2 = !{null, !"align", i32 8} +!3 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +!4 = !{null, !"align", i32 16} +!5 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +!6 = !{i32 1, !"wchar_size", i32 4} +!7 = !{i32 7, !"PIC Level", i32 2} +!8 = !{!"clang version 9.0.0 "} +!9 = !{i32 1, i32 2} +!10 = !{i32 32, i32 33} +!11 = !{i32 1, i32 1025} +!12 = !{i32 0, i32 1024} +!13 = !{!14, !16} +!14 = distinct !{!14, !15, !"__omp_outlined__: %.global_tid."} +!15 = distinct !{!15, !"__omp_outlined__"} +!16 = distinct !{!16, !15, !"__omp_outlined__: %.bound_tid."} + +; __CLANG_OFFLOAD_BUNDLE____END__ openmp-nvptx64-nvida-cuda + +; __CLANG_OFFLOAD_BUNDLE____START__ host-x86_64-unknown-linux-gnu +; ModuleID = '/tmp/jdoerfert/target_offload_is_SPMD-7856f8.bc' +source_filename = "/home/jdoerfert/projects/llvm-project/llvm/test/Transforms/OpenMP/target_offload_is_SPMD.c" +target datalayout = 
"e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } +%struct.__tgt_offload_entry = type { i8*, i8*, i64, i32, i32 } +%struct.__tgt_device_image = type { i8*, i8*, %struct.__tgt_offload_entry*, %struct.__tgt_offload_entry* } +%struct.__tgt_bin_desc = type { i32, %struct.__tgt_device_image*, %struct.__tgt_offload_entry*, %struct.__tgt_offload_entry* } + +$.omp_offloading.descriptor_reg.nvptx64-nvida-cuda = comdat any + +@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr global %struct.ident_t { i32 0, i32 2050, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@1 = private unnamed_addr global %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@2 = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@.__omp_offloading_2b_142c531_foo_l10.region_id = weak constant i8 0 +@.offload_sizes = private unnamed_addr constant [2 x i64] [i64 40, i64 40] +@.offload_maptypes = private unnamed_addr constant [2 x i64] [i64 35, i64 35] +@.str.3 = private unnamed_addr constant [33 x i8] c"dis[%3i] = %4i\09\09team[%3i] = %4i\0A\00", align 1 +@.omp_offloading.entry_name = internal unnamed_addr constant [36 x i8] c"__omp_offloading_2b_142c531_foo_l10\00" +@.omp_offloading.entry.__omp_offloading_2b_142c531_foo_l10 = weak constant %struct.__tgt_offload_entry { i8* @.__omp_offloading_2b_142c531_foo_l10.region_id, i8* getelementptr inbounds ([36 x i8], [36 x i8]* @.omp_offloading.entry_name, i32 0, i32 0), i64 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +@.omp_offloading.entries_begin = external constant %struct.__tgt_offload_entry +@.omp_offloading.entries_end = external constant %struct.__tgt_offload_entry +@.omp_offloading.img_start.nvptx64-nvida-cuda = extern_weak constant i8 +@.omp_offloading.img_end.nvptx64-nvida-cuda = extern_weak constant i8 +@.omp_offloading.device_images = internal unnamed_addr constant [1 x %struct.__tgt_device_image] [%struct.__tgt_device_image { i8* @.omp_offloading.img_start.nvptx64-nvida-cuda, i8* @.omp_offloading.img_end.nvptx64-nvida-cuda, %struct.__tgt_offload_entry* @.omp_offloading.entries_begin, %struct.__tgt_offload_entry* @.omp_offloading.entries_end }], comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cuda), align 8 +@.omp_offloading.descriptor = internal constant %struct.__tgt_bin_desc { i32 1, %struct.__tgt_device_image* getelementptr inbounds ([1 x %struct.__tgt_device_image], [1 x %struct.__tgt_device_image]* @.omp_offloading.device_images, i32 0, i32 0), %struct.__tgt_offload_entry* @.omp_offloading.entries_begin, %struct.__tgt_offload_entry* @.omp_offloading.entries_end }, comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cuda), align 8 +@__dso_handle = external hidden global i8 +@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 0, void ()* @.omp_offloading.descriptor_reg.nvptx64-nvida-cuda, i8* bitcast (void ()* @.omp_offloading.descriptor_reg.nvptx64-nvida-cuda to i8*) }] + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local void @foo(i32* %dis, i32* %team) #0 { +entry: + %dis.addr = alloca i32*, align 8 + %team.addr = alloca i32*, align 8 + %.offload_baseptrs = alloca [2 x i8*], align 8 + %.offload_ptrs = 
alloca [2 x i8*], align 8 + store i32* %dis, i32** %dis.addr, align 8 + store i32* %team, i32** %team.addr, align 8 + %0 = load i32*, i32** %dis.addr, align 8 + %1 = load i32*, i32** %team.addr, align 8 + %2 = load i32*, i32** %dis.addr, align 8 + %3 = load i32*, i32** %dis.addr, align 8 + %arrayidx = getelementptr inbounds i32, i32* %3, i64 0 + %4 = load i32*, i32** %team.addr, align 8 + %5 = load i32*, i32** %team.addr, align 8 + %arrayidx1 = getelementptr inbounds i32, i32* %5, i64 0 + %6 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i32 0, i32 0 + %7 = bitcast i8** %6 to i32** + store i32* %2, i32** %7, align 8 + %8 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i32 0, i32 0 + %9 = bitcast i8** %8 to i32** + store i32* %arrayidx, i32** %9, align 8 + %10 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i32 0, i32 1 + %11 = bitcast i8** %10 to i32** + store i32* %4, i32** %11, align 8 + %12 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i32 0, i32 1 + %13 = bitcast i8** %12 to i32** + store i32* %arrayidx1, i32** %13, align 8 + %14 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i32 0, i32 0 + %15 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i32 0, i32 0 + %16 = call i32 @__tgt_target_teams(i64 -1, i8* @.__omp_offloading_2b_142c531_foo_l10.region_id, i32 2, i8** %14, i8** %15, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes, i32 0, i32 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes, i32 0, i32 0), i32 3, i32 0) + %17 = icmp ne i32 %16, 0 + br i1 %17, label %omp_offload.failed, label %omp_offload.cont + +omp_offload.failed: ; preds = %entry + call void @__omp_offloading_2b_142c531_foo_l10(i32* %0, i32* %1) #4 + br label %omp_offload.cont + +omp_offload.cont: ; preds = %omp_offload.failed, %entry + ret void +} + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define internal void @__omp_offloading_2b_142c531_foo_l10(i32* %dis, i32* %team) #1 { +entry: + %dis.addr = alloca i32*, align 8 + %team.addr = alloca i32*, align 8 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + store i32* %dis, i32** %dis.addr, align 8 + store i32* %team, i32** %team.addr, align 8 + %1 = call i32 @__kmpc_push_num_teams(%struct.ident_t* @2, i32 %0, i32 3, i32 0) + %2 = load i32*, i32** %dis.addr, align 8 + %3 = load i32*, i32** %team.addr, align 8 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%struct.ident_t* @2, i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*)* @.omp_outlined. 
to void (i32*, i32*, ...)*), i32* %2, i32* %3) + ret void +} + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define internal void @.omp_outlined.(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* %dis, i32* %team) #1 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %dis.addr = alloca i32*, align 8 + %team.addr = alloca i32*, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.comb.lb = alloca i32, align 4 + %.omp.comb.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + %.omp.iv2 = alloca i32, align 4 + %tmp3 = alloca i32, align 4 + %.omp.comb.lb4 = alloca i32, align 4 + %.omp.comb.ub5 = alloca i32, align 4 + %.omp.stride6 = alloca i32, align 4 + %.omp.is_last7 = alloca i32, align 4 + %i8 = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i32* %dis, i32** %dis.addr, align 8 + store i32* %team, i32** %team.addr, align 8 + store i32 0, i32* %.omp.comb.lb, align 4 + store i32 9, i32* %.omp.comb.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %0 = load i32*, i32** %.global_tid..addr, align 8 + %1 = load i32, i32* %0, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %1, i32 92, i32* %.omp.is_last, i32* %.omp.comb.lb, i32* %.omp.comb.ub, i32* %.omp.stride, i32 1, i32 1) + %2 = load i32, i32* %.omp.comb.ub, align 4 + %cmp = icmp sgt i32 %2, 9 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + br label %cond.end + +cond.false: ; preds = %entry + %3 = load i32, i32* %.omp.comb.ub, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 9, %cond.true ], [ %3, %cond.false ] + store i32 %cond, i32* %.omp.comb.ub, align 4 + %4 = load i32, i32* %.omp.comb.lb, align 4 + store i32 %4, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %cond.end + %5 = load i32, i32* %.omp.iv, align 4 + %6 = load i32, i32* %.omp.comb.ub, align 4 + %cmp1 = icmp sle i32 %5, %6 + br i1 %cmp1, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %7 = load i32, i32* %.omp.comb.lb, align 4 + %8 = zext i32 %7 to i64 + %9 = load i32, i32* %.omp.comb.ub, align 4 + %10 = zext i32 %9 to i64 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* @2, i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64, i64, i32**)* @.omp_outlined..1 to void (i32*, i32*, ...)*), i64 %8, i64 %10, i32** %dis.addr) + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.inner.for.body + %11 = load i32, i32* %.omp.iv, align 4 + %12 = load i32, i32* %.omp.stride, align 4 + %add = add nsw i32 %11, %12 + store i32 %add, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %1) + store i32 0, i32* %.omp.comb.lb4, align 4 + store i32 9, i32* %.omp.comb.ub5, align 4 + store i32 1, i32* %.omp.stride6, align 4 + store i32 0, i32* %.omp.is_last7, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %1, i32 92, i32* %.omp.is_last7, i32* %.omp.comb.lb4, i32* %.omp.comb.ub5, i32* %.omp.stride6, i32 1, i32 1) + %13 = load i32, i32* %.omp.comb.ub5, align 4 + %cmp9 = icmp sgt i32 %13, 9 + br i1 %cmp9, label %cond.true10, label %cond.false11 + +cond.true10: ; preds = %omp.loop.exit + br label %cond.end12 + +cond.false11: ; preds = %omp.loop.exit + %14 = load i32, i32* %.omp.comb.ub5, align 4 + br label %cond.end12 + +cond.end12: ; preds = %cond.false11, %cond.true10 + %cond13 = phi i32 [ 9, %cond.true10 ], [ %14, %cond.false11 ] + store i32 %cond13, i32* %.omp.comb.ub5, align 4 + %15 = load i32, i32* %.omp.comb.lb4, align 4 + store i32 %15, i32* %.omp.iv2, align 4 + br label %omp.inner.for.cond14 + +omp.inner.for.cond14: ; preds = %omp.inner.for.inc17, %cond.end12 + %16 = load i32, i32* %.omp.iv2, align 4 + %17 = load i32, i32* %.omp.comb.ub5, align 4 + %cmp15 = icmp sle i32 %16, %17 + br i1 %cmp15, label %omp.inner.for.body16, label %omp.inner.for.end19 + +omp.inner.for.body16: ; preds = %omp.inner.for.cond14 + %18 = load i32, i32* %.omp.comb.lb4, align 4 + %19 = zext i32 %18 to i64 + %20 = load i32, i32* %.omp.comb.ub5, align 4 + %21 = zext i32 %20 to i64 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* @2, i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64, i64, i32**)* @.omp_outlined..2 to void (i32*, i32*, ...)*), i64 %19, i64 %21, i32** %dis.addr) + br label %omp.inner.for.inc17 + +omp.inner.for.inc17: ; preds = %omp.inner.for.body16 + %22 = load i32, i32* %.omp.iv2, align 4 + %23 = load i32, i32* %.omp.stride6, align 4 + %add18 = add nsw i32 %22, %23 + store i32 %add18, i32* %.omp.iv2, align 4 + br label %omp.inner.for.cond14 + +omp.inner.for.end19: ; preds = %omp.inner.for.cond14 + br label %omp.loop.exit20 + +omp.loop.exit20: ; preds = %omp.inner.for.end19 + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %1) + %24 = load i32*, i32** %team.addr, align 8 + %call = call i32 @omp_get_team_num() + %idxprom = sext i32 %call to i64 + %arrayidx = getelementptr inbounds i32, i32* %24, i64 %idxprom + %25 = load i32, i32* %arrayidx, align 4 + %add21 = add nsw i32 %25, 1 + store i32 %add21, i32* %arrayidx, align 4 + ret void +} + +declare dso_local void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define internal void @.omp_outlined..1(i32* noalias %.global_tid., i32* noalias %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., i32** dereferenceable(8) %dis) #1 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %.previous.lb..addr = alloca i64, align 8 + %.previous.ub..addr = alloca i64, align 8 + %dis.addr = alloca i32**, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i64 %.previous.lb., i64* %.previous.lb..addr, align 8 + store i64 %.previous.ub., i64* %.previous.ub..addr, align 8 + store i32** %dis, i32*** %dis.addr, align 8 + %0 = load i32**, i32*** %dis.addr, align 8 + store i32 0, i32* %.omp.lb, align 4 + store i32 9, i32* %.omp.ub, align 4 + %1 = load i64, i64* %.previous.lb..addr, align 8 + %conv = trunc i64 %1 to i32 + %2 = load i64, i64* %.previous.ub..addr, align 8 + %conv1 = trunc i64 %2 to i32 + store i32 %conv, i32* %.omp.lb, align 4 + store i32 %conv1, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %3 = load i32*, i32** %.global_tid..addr, align 8 + %4 = load i32, i32* %3, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %4, i32 34, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %5 = load i32, i32* %.omp.ub, align 4 + %cmp = icmp sgt i32 %5, 9 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + br label %cond.end + +cond.false: ; preds = %entry + %6 = load i32, i32* %.omp.ub, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 9, %cond.true ], [ %6, %cond.false ] + store i32 %cond, i32* %.omp.ub, align 4 + %7 = load i32, i32* %.omp.lb, align 4 + store i32 %7, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %cond.end + %8 = load i32, i32* %.omp.iv, align 4 + %9 = load i32, i32* %.omp.ub, align 4 + %cmp3 = icmp sle i32 %8, %9 + br i1 %cmp3, label %omp.inner.for.body, label %omp.inner.for.end + 
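; NOTE (editorial): the block below is the x86-64 host-fallback body of the
; first 'distribute parallel for' loop. __kmpc_for_static_init_4 above was
; called with schedule kind 34 (kmp_sch_static) over the [previous.lb,
; previous.ub] chunk handed down by the distribute loop, and each iteration
; performs the atomicrmw add (monotonic, i.e. relaxed ordering) that lowers
; '#pragma omp atomic' on dis[i].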
+omp.inner.for.body: ; preds = %omp.inner.for.cond + %10 = load i32, i32* %.omp.iv, align 4 + %mul = mul nsw i32 %10, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %i, align 4 + %11 = load i32*, i32** %0, align 8 + %12 = load i32, i32* %i, align 4 + %idxprom = sext i32 %12 to i64 + %arrayidx = getelementptr inbounds i32, i32* %11, i64 %idxprom + %13 = atomicrmw add i32* %arrayidx, i32 1 monotonic + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %14 = load i32, i32* %.omp.iv, align 4 + %add5 = add nsw i32 %14, 1 + store i32 %add5, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %4) + ret void +} + +declare dso_local void @__kmpc_for_static_fini(%struct.ident_t*, i32) + +declare !callback !3 dso_local void @__kmpc_fork_call(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define internal void @.omp_outlined..2(i32* noalias %.global_tid., i32* noalias %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., i32** dereferenceable(8) %dis) #1 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %.previous.lb..addr = alloca i64, align 8 + %.previous.ub..addr = alloca i64, align 8 + %dis.addr = alloca i32**, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i64 %.previous.lb., i64* %.previous.lb..addr, align 8 + store i64 %.previous.ub., i64* %.previous.ub..addr, align 8 + store i32** %dis, i32*** %dis.addr, align 8 + %0 = load i32**, i32*** %dis.addr, align 8 + store i32 0, i32* %.omp.lb, align 4 + store i32 9, i32* %.omp.ub, align 4 + %1 = load i64, i64* %.previous.lb..addr, align 8 + %conv = trunc i64 %1 to i32 + %2 = load i64, i64* %.previous.ub..addr, align 8 + %conv1 = trunc i64 %2 to i32 + store i32 %conv, i32* %.omp.lb, align 4 + store i32 %conv1, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %3 = load i32*, i32** %.global_tid..addr, align 8 + %4 = load i32, i32* %3, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %4, i32 34, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %5 = load i32, i32* %.omp.ub, align 4 + %cmp = icmp sgt i32 %5, 9 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + br label %cond.end + +cond.false: ; preds = %entry + %6 = load i32, i32* %.omp.ub, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 9, %cond.true ], [ %6, %cond.false ] + store i32 %cond, i32* %.omp.ub, align 4 + %7 = load i32, i32* %.omp.lb, align 4 + store i32 %7, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %cond.end + %8 = load i32, i32* %.omp.iv, align 4 + %9 = load i32, i32* %.omp.ub, align 4 + %cmp3 = icmp sle i32 %8, %9 + br i1 %cmp3, label %omp.inner.for.body, label %omp.inner.for.end + 
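; NOTE (editorial): apart from renamed SSA values, this body of @.omp_outlined..2
; is identical to @.omp_outlined..1 above; it lowers the second, structurally
; identical 'distribute parallel for' loop of the target region.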
+omp.inner.for.body: ; preds = %omp.inner.for.cond + %10 = load i32, i32* %.omp.iv, align 4 + %mul = mul nsw i32 %10, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %i, align 4 + %11 = load i32*, i32** %0, align 8 + %12 = load i32, i32* %i, align 4 + %idxprom = sext i32 %12 to i64 + %arrayidx = getelementptr inbounds i32, i32* %11, i64 %idxprom + %13 = atomicrmw add i32* %arrayidx, i32 1 monotonic + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %14 = load i32, i32* %.omp.iv, align 4 + %add5 = add nsw i32 %14, 1 + store i32 %add5, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %4) + ret void +} + +declare dso_local i32 @omp_get_team_num() #2 + +declare dso_local i32 @__kmpc_global_thread_num(%struct.ident_t*) + +declare dso_local i32 @__kmpc_push_num_teams(%struct.ident_t*, i32, i32, i32) + +declare !callback !3 dso_local void @__kmpc_fork_teams(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) + +declare dso_local i32 @__tgt_target_teams(i64, i8*, i32, i8**, i8**, i64*, i64*, i32, i32) + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local i32 @main() #0 { +entry: + %retval = alloca i32, align 4 + %dis = alloca [10 x i32], align 16 + %team = alloca [10 x i32], align 16 + %i = alloca i32, align 4 + %i4 = alloca i32, align 4 + store i32 0, i32* %retval, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %0, 10 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load i32, i32* %i, align 4 + %2 = load i32, i32* %i, align 4 + %idxprom = sext i32 %2 to i64 + %arrayidx = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 %idxprom + store i32 %1, i32* %arrayidx, align 4 + %3 = load i32, i32* %i, align 4 + %idxprom1 = sext i32 %3 to i64 + %arrayidx2 = getelementptr inbounds [10 x i32], [10 x i32]* %team, i64 0, i64 %idxprom1 + store i32 0, i32* %arrayidx2, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %4 = load i32, i32* %i, align 4 + %inc = add nsw i32 %4, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %arraydecay = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i32 0, i32 0 + %arraydecay3 = getelementptr inbounds [10 x i32], [10 x i32]* %team, i32 0, i32 0 + call void @foo(i32* %arraydecay, i32* %arraydecay3) + store i32 0, i32* %i4, align 4 + br label %for.cond5 + +for.cond5: ; preds = %for.inc12, %for.end + %5 = load i32, i32* %i4, align 4 + %cmp6 = icmp slt i32 %5, 10 + br i1 %cmp6, label %for.body7, label %for.end14 + +for.body7: ; preds = %for.cond5 + %6 = load i32, i32* %i4, align 4 + %7 = load i32, i32* %i4, align 4 + %idxprom8 = sext i32 %7 to i64 + %arrayidx9 = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 %idxprom8 + %8 = load i32, i32* %arrayidx9, align 4 + %9 = load i32, i32* %i4, align 4 + %10 = load i32, i32* %i4, align 4 + %idxprom10 = sext i32 %10 to i64 + %arrayidx11 = getelementptr inbounds [10 x i32], [10 x i32]* %team, i64 0, i64 %idxprom10 + %11 = load i32, i32* %arrayidx11, align 4 + %call = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([33 x i8], [33 x i8]* @.str.3, i32 0, i32 0), i32 %6, i32 %8, i32 %9, i32 %11) + br label %for.inc12 + +for.inc12: ; preds = %for.body7 + %12 = load i32, i32* %i4, align 4 + %inc13 = add nsw i32 %12, 1 + store i32 %inc13, i32* %i4, align 4 + br label %for.cond5 + +for.end14: ; preds = %for.cond5 + ret i32 0 +} + +declare dso_local i32 @printf(i8*, ...) #2 + +; Function Attrs: noinline nounwind uwtable +define internal void @.omp_offloading.descriptor_unreg(i8*) #3 section ".text.startup" comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cuda) { +entry: + %.addr = alloca i8*, align 8 + store i8* %0, i8** %.addr, align 8 + %1 = call i32 @__tgt_unregister_lib(%struct.__tgt_bin_desc* @.omp_offloading.descriptor) + ret void +} + +declare dso_local i32 @__tgt_unregister_lib(%struct.__tgt_bin_desc*) + +; Function Attrs: noinline nounwind uwtable +define linkonce hidden void @.omp_offloading.descriptor_reg.nvptx64-nvida-cuda() #3 section ".text.startup" comdat { +entry: + %0 = call i32 @__tgt_register_lib(%struct.__tgt_bin_desc* @.omp_offloading.descriptor) + %1 = call i32 @__cxa_atexit(void (i8*)* @.omp_offloading.descriptor_unreg, i8* bitcast (%struct.__tgt_bin_desc* @.omp_offloading.descriptor to i8*), i8* @__dso_handle) #4 + ret void +} + +declare dso_local i32 @__tgt_register_lib(%struct.__tgt_bin_desc*) + +; Function Attrs: nounwind +declare dso_local i32 @__cxa_atexit(void (i8*)*, i8*, i8*) #4 + +attributes #0 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { noinline norecurse nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } 
+attributes #4 = { nounwind } + +!omp_offload.info = !{!0} +!llvm.module.flags = !{!1} +!llvm.ident = !{!2} + +!0 = !{i32 0, i32 43, i32 21153073, !"foo", i32 10, i32 0} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{!"clang version 9.0.0 "} +!3 = !{!4} +!4 = !{i64 2, i64 -1, i64 -1, i1 true} + +; __CLANG_OFFLOAD_BUNDLE____END__ host-x86_64-unknown-linux-gnu Index: SPMD_examples/v0.2/target_offload_not_SPMD.new.host.ll =================================================================== --- /dev/null +++ SPMD_examples/v0.2/target_offload_not_SPMD.new.host.ll @@ -0,0 +1,425 @@ + +; __CLANG_OFFLOAD_BUNDLE____START__ host-x86_64-unknown-linux-gnu +; ModuleID = '/tmp/jdoerfert/target_offload_not_SPMD-778fa0.bc' +source_filename = "/home/jdoerfert/projects/llvm-project/llvm/test/Transforms/OpenMP/target_offload_not_SPMD.c" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } +%struct.__tgt_offload_entry = type { i8*, i8*, i64, i32, i32 } +%struct.__tgt_device_image = type { i8*, i8*, %struct.__tgt_offload_entry*, %struct.__tgt_offload_entry* } +%struct.__tgt_bin_desc = type { i32, %struct.__tgt_device_image*, %struct.__tgt_offload_entry*, %struct.__tgt_offload_entry* } + +$.omp_offloading.descriptor_reg.nvptx64-nvida-cuda = comdat any + +@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr global %struct.ident_t { i32 0, i32 2050, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@1 = private unnamed_addr global %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@2 = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@.__omp_offloading_2b_142c58b_foo_l10.region_id = weak constant i8 0 +@.offload_sizes = private unnamed_addr constant [2 x i64] [i64 40, i64 40] +@.offload_maptypes = private unnamed_addr constant [2 x i64] [i64 35, i64 35] +@.str.3 = private unnamed_addr constant [33 x i8] c"dis[%3i] = %4i\09\09team[%3i] = %4i\0A\00", align 1 +@.omp_offloading.entry_name = internal unnamed_addr constant [36 x i8] c"__omp_offloading_2b_142c58b_foo_l10\00" +@.omp_offloading.entry.__omp_offloading_2b_142c58b_foo_l10 = weak constant %struct.__tgt_offload_entry { i8* @.__omp_offloading_2b_142c58b_foo_l10.region_id, i8* getelementptr inbounds ([36 x i8], [36 x i8]* @.omp_offloading.entry_name, i32 0, i32 0), i64 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +@.omp_offloading.entries_begin = external constant %struct.__tgt_offload_entry +@.omp_offloading.entries_end = external constant %struct.__tgt_offload_entry +@.omp_offloading.img_start.nvptx64-nvida-cuda = extern_weak constant i8 +@.omp_offloading.img_end.nvptx64-nvida-cuda = extern_weak constant i8 +@.omp_offloading.device_images = internal unnamed_addr constant [1 x %struct.__tgt_device_image] [%struct.__tgt_device_image { i8* @.omp_offloading.img_start.nvptx64-nvida-cuda, i8* @.omp_offloading.img_end.nvptx64-nvida-cuda, %struct.__tgt_offload_entry* @.omp_offloading.entries_begin, %struct.__tgt_offload_entry* @.omp_offloading.entries_end }], comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cuda), align 8 +@.omp_offloading.descriptor = internal constant %struct.__tgt_bin_desc { i32 1, %struct.__tgt_device_image* getelementptr inbounds ([1 x 
%struct.__tgt_device_image], [1 x %struct.__tgt_device_image]* @.omp_offloading.device_images, i32 0, i32 0), %struct.__tgt_offload_entry* @.omp_offloading.entries_begin, %struct.__tgt_offload_entry* @.omp_offloading.entries_end }, comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cuda), align 8 +@__dso_handle = external hidden global i8 +@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 0, void ()* @.omp_offloading.descriptor_reg.nvptx64-nvida-cuda, i8* bitcast (void ()* @.omp_offloading.descriptor_reg.nvptx64-nvida-cuda to i8*) }] + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local void @foo(i32* %dis, i32* %team) #0 { +entry: + %dis.addr = alloca i32*, align 8 + %team.addr = alloca i32*, align 8 + %.offload_baseptrs = alloca [2 x i8*], align 8 + %.offload_ptrs = alloca [2 x i8*], align 8 + store i32* %dis, i32** %dis.addr, align 8 + store i32* %team, i32** %team.addr, align 8 + %0 = load i32*, i32** %dis.addr, align 8 + %1 = load i32*, i32** %team.addr, align 8 + %2 = load i32*, i32** %dis.addr, align 8 + %3 = load i32*, i32** %dis.addr, align 8 + %arrayidx = getelementptr inbounds i32, i32* %3, i64 0 + %4 = load i32*, i32** %team.addr, align 8 + %5 = load i32*, i32** %team.addr, align 8 + %arrayidx1 = getelementptr inbounds i32, i32* %5, i64 0 + %6 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i32 0, i32 0 + %7 = bitcast i8** %6 to i32** + store i32* %2, i32** %7, align 8 + %8 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i32 0, i32 0 + %9 = bitcast i8** %8 to i32** + store i32* %arrayidx, i32** %9, align 8 + %10 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i32 0, i32 1 + %11 = bitcast i8** %10 to i32** + store i32* %4, i32** %11, align 8 + %12 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i32 0, i32 1 + %13 = bitcast i8** %12 to i32** + store i32* %arrayidx1, i32** %13, align 8 + %14 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i32 0, i32 0 + %15 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i32 0, i32 0 + %16 = call i32 @__tgt_target_teams(i64 -1, i8* @.__omp_offloading_2b_142c58b_foo_l10.region_id, i32 2, i8** %14, i8** %15, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes, i32 0, i32 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes, i32 0, i32 0), i32 3, i32 0) + %17 = icmp ne i32 %16, 0 + br i1 %17, label %omp_offload.failed, label %omp_offload.cont + +omp_offload.failed: ; preds = %entry + call void @__omp_offloading_2b_142c58b_foo_l10(i32* %0, i32* %1) #4 + br label %omp_offload.cont + +omp_offload.cont: ; preds = %omp_offload.failed, %entry + ret void +} + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define internal void @__omp_offloading_2b_142c58b_foo_l10(i32* %dis, i32* %team) #1 { +entry: + %dis.addr = alloca i32*, align 8 + %team.addr = alloca i32*, align 8 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + store i32* %dis, i32** %dis.addr, align 8 + store i32* %team, i32** %team.addr, align 8 + %1 = call i32 @__kmpc_push_num_teams(%struct.ident_t* @2, i32 %0, i32 3, i32 0) + %2 = load i32*, i32** %dis.addr, align 8 + %3 = load i32*, i32** %team.addr, align 8 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%struct.ident_t* @2, i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*)* @.omp_outlined. 
to void (i32*, i32*, ...)*), i32* %2, i32* %3) + ret void +} + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define internal void @.omp_outlined.(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* %dis, i32* %team) #1 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %dis.addr = alloca i32*, align 8 + %team.addr = alloca i32*, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.comb.lb = alloca i32, align 4 + %.omp.comb.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i32* %dis, i32** %dis.addr, align 8 + store i32* %team, i32** %team.addr, align 8 + store i32 0, i32* %.omp.comb.lb, align 4 + store i32 9, i32* %.omp.comb.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %0 = load i32*, i32** %.global_tid..addr, align 8 + %1 = load i32, i32* %0, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %1, i32 92, i32* %.omp.is_last, i32* %.omp.comb.lb, i32* %.omp.comb.ub, i32* %.omp.stride, i32 1, i32 1) + %2 = load i32, i32* %.omp.comb.ub, align 4 + %cmp = icmp sgt i32 %2, 9 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + br label %cond.end + +cond.false: ; preds = %entry + %3 = load i32, i32* %.omp.comb.ub, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 9, %cond.true ], [ %3, %cond.false ] + store i32 %cond, i32* %.omp.comb.ub, align 4 + %4 = load i32, i32* %.omp.comb.lb, align 4 + store i32 %4, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %cond.end + %5 = load i32, i32* %.omp.iv, align 4 + %6 = load i32, i32* %.omp.comb.ub, align 4 + %cmp1 = icmp sle i32 %5, %6 + br i1 %cmp1, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %7 = load i32, i32* %.omp.comb.lb, align 4 + %8 = zext i32 %7 to i64 + %9 = load i32, i32* %.omp.comb.ub, align 4 + %10 = zext i32 %9 to i64 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @2, i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64, i64, i32**)* @.omp_outlined..1 to void (i32*, i32*, ...)*), i64 %8, i64 %10, i32** %dis.addr) + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.inner.for.body + %11 = load i32, i32* %.omp.iv, align 4 + %12 = load i32, i32* %.omp.stride, align 4 + %add = add nsw i32 %11, %12 + store i32 %add, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %1) + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* @2, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32**)* @.omp_outlined..2 to void (i32*, i32*, ...)*), i32** %dis.addr) + %13 = load i32*, i32** %team.addr, align 8 + %call = call i32 @omp_get_team_num() + %idxprom = sext i32 %call to i64 + %arrayidx = getelementptr inbounds i32, i32* %13, i64 %idxprom + %14 = load i32, i32* %arrayidx, align 4 + %add2 = add nsw i32 %14, 1 + store i32 %add2, i32* %arrayidx, align 4 + ret void +} + +declare dso_local void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define internal void @.omp_outlined..1(i32* noalias %.global_tid., i32* noalias %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., i32** dereferenceable(8) %dis) #1 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %.previous.lb..addr = alloca i64, align 8 + %.previous.ub..addr = alloca i64, align 8 + %dis.addr = alloca i32**, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i64 %.previous.lb., i64* %.previous.lb..addr, align 8 + store i64 %.previous.ub., i64* %.previous.ub..addr, align 8 + store i32** %dis, i32*** %dis.addr, align 8 + %0 = load i32**, i32*** %dis.addr, align 8 + store i32 0, i32* %.omp.lb, align 4 + store i32 9, i32* %.omp.ub, align 4 + %1 = load i64, i64* %.previous.lb..addr, align 8 + %conv = trunc i64 %1 to i32 + %2 = load i64, i64* %.previous.ub..addr, align 8 + %conv1 = trunc i64 %2 to i32 + store i32 %conv, i32* %.omp.lb, align 4 + store i32 %conv1, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %3 = load i32*, i32** %.global_tid..addr, align 8 + %4 = load i32, i32* %3, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %4, i32 34, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %5 = load i32, i32* %.omp.ub, align 4 + %cmp = icmp sgt i32 %5, 9 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + br label %cond.end + +cond.false: ; preds = %entry + %6 = load i32, i32* %.omp.ub, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 9, %cond.true ], [ %6, %cond.false ] + store i32 %cond, i32* %.omp.ub, align 4 + %7 = load i32, i32* %.omp.lb, align 4 + store i32 %7, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %cond.end + %8 = load i32, i32* %.omp.iv, align 4 + %9 = load i32, i32* %.omp.ub, align 4 + %cmp3 = icmp sle i32 %8, %9 + br i1 %cmp3, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %10 = load i32, i32* %.omp.iv, align 4 + %mul = mul nsw i32 %10, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %i, align 4 + %11 = load i32*, i32** %0, align 8 + %12 = load i32, i32* %i, align 4 + %idxprom = sext i32 %12 to i64 + %arrayidx = getelementptr inbounds i32, i32* %11, i64 %idxprom + %13 = atomicrmw add i32* %arrayidx, i32 1 monotonic + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + 
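; NOTE (editorial): this @.omp_outlined..1 matches its counterpart in
; target_offload_is_SPMD.new.host.ll; the two examples diverge only in the
; second region, which is lowered above as a plain __kmpc_fork_call of
; @.omp_outlined..2 (a bare '#pragma omp parallel') rather than as a second
; distribute loop.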
+omp.inner.for.inc: ; preds = %omp.body.continue + %14 = load i32, i32* %.omp.iv, align 4 + %add5 = add nsw i32 %14, 1 + store i32 %add5, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %4) + ret void +} + +declare dso_local void @__kmpc_for_static_fini(%struct.ident_t*, i32) + +declare !callback !3 dso_local void @__kmpc_fork_call(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define internal void @.omp_outlined..2(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32** dereferenceable(8) %dis) #1 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %dis.addr = alloca i32**, align 8 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i32** %dis, i32*** %dis.addr, align 8 + %0 = load i32**, i32*** %dis.addr, align 8 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %1 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %1, 10 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %2 = load i32*, i32** %0, align 8 + %3 = load i32, i32* %i, align 4 + %idxprom = sext i32 %3 to i64 + %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom + %4 = atomicrmw add i32* %arrayidx, i32 1 monotonic + br label %for.inc + +for.inc: ; preds = %for.body + %5 = load i32, i32* %i, align 4 + %inc = add nsw i32 %5, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +declare dso_local i32 @omp_get_team_num() #2 + +declare dso_local i32 @__kmpc_global_thread_num(%struct.ident_t*) + +declare dso_local i32 @__kmpc_push_num_teams(%struct.ident_t*, i32, i32, i32) + +declare !callback !3 dso_local void @__kmpc_fork_teams(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
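; NOTE (editorial): @.omp_outlined..2 above takes no distribute bounds: it is
; the outlined body of the bare '#pragma omp parallel' region, so every thread
; the runtime hands it runs the full i = 0..9 loop. This unguarded region is
; what makes the not_SPMD variant ineligible for SPMD execution.
;
; NOTE (editorial): the '!callback !3' metadata on __kmpc_fork_call and
; __kmpc_fork_teams, with !4 = !{i64 2, i64 -1, i64 -1, i1 true}, tells
; interprocedural passes that operand 2 is invoked as a callback receiving two
; runtime-provided arguments (the -1 placeholders, the gtid/btid pointers)
; followed by the call's forwarded varargs. A rough C sketch of that calling
; convention (names hypothetical, not the actual runtime source):
;
;   typedef void (*kmpc_micro)(int *gtid, int *btid, ...);
;   void __kmpc_fork_teams(ident_t *loc, int nargs, kmpc_micro fn, ...) {
;     /* for each team the runtime effectively does: fn(&gtid, &btid, args...) */
;   }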
+ +declare dso_local i32 @__tgt_target_teams(i64, i8*, i32, i8**, i8**, i64*, i64*, i32, i32) + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local i32 @main() #0 { +entry: + %retval = alloca i32, align 4 + %dis = alloca [10 x i32], align 16 + %team = alloca [10 x i32], align 16 + %i = alloca i32, align 4 + %i4 = alloca i32, align 4 + store i32 0, i32* %retval, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %0, 10 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load i32, i32* %i, align 4 + %2 = load i32, i32* %i, align 4 + %idxprom = sext i32 %2 to i64 + %arrayidx = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 %idxprom + store i32 %1, i32* %arrayidx, align 4 + %3 = load i32, i32* %i, align 4 + %idxprom1 = sext i32 %3 to i64 + %arrayidx2 = getelementptr inbounds [10 x i32], [10 x i32]* %team, i64 0, i64 %idxprom1 + store i32 0, i32* %arrayidx2, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %4 = load i32, i32* %i, align 4 + %inc = add nsw i32 %4, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %arraydecay = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i32 0, i32 0 + %arraydecay3 = getelementptr inbounds [10 x i32], [10 x i32]* %team, i32 0, i32 0 + call void @foo(i32* %arraydecay, i32* %arraydecay3) + store i32 0, i32* %i4, align 4 + br label %for.cond5 + +for.cond5: ; preds = %for.inc12, %for.end + %5 = load i32, i32* %i4, align 4 + %cmp6 = icmp slt i32 %5, 10 + br i1 %cmp6, label %for.body7, label %for.end14 + +for.body7: ; preds = %for.cond5 + %6 = load i32, i32* %i4, align 4 + %7 = load i32, i32* %i4, align 4 + %idxprom8 = sext i32 %7 to i64 + %arrayidx9 = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 %idxprom8 + %8 = load i32, i32* %arrayidx9, align 4 + %9 = load i32, i32* %i4, align 4 + %10 = load i32, i32* %i4, align 4 + %idxprom10 = sext i32 %10 to i64 + %arrayidx11 = getelementptr inbounds [10 x i32], [10 x i32]* %team, i64 0, i64 %idxprom10 + %11 = load i32, i32* %arrayidx11, align 4 + %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([33 x i8], [33 x i8]* @.str.3, i32 0, i32 0), i32 %6, i32 %8, i32 %9, i32 %11) + br label %for.inc12 + +for.inc12: ; preds = %for.body7 + %12 = load i32, i32* %i4, align 4 + %inc13 = add nsw i32 %12, 1 + store i32 %inc13, i32* %i4, align 4 + br label %for.cond5 + +for.end14: ; preds = %for.cond5 + ret i32 0 +} + +declare dso_local i32 @printf(i8*, ...) 
#2 + +; Function Attrs: noinline nounwind uwtable +define internal void @.omp_offloading.descriptor_unreg(i8*) #3 section ".text.startup" comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cuda) { +entry: + %.addr = alloca i8*, align 8 + store i8* %0, i8** %.addr, align 8 + %1 = call i32 @__tgt_unregister_lib(%struct.__tgt_bin_desc* @.omp_offloading.descriptor) + ret void +} + +declare dso_local i32 @__tgt_unregister_lib(%struct.__tgt_bin_desc*) + +; Function Attrs: noinline nounwind uwtable +define linkonce hidden void @.omp_offloading.descriptor_reg.nvptx64-nvida-cuda() #3 section ".text.startup" comdat { +entry: + %0 = call i32 @__tgt_register_lib(%struct.__tgt_bin_desc* @.omp_offloading.descriptor) + %1 = call i32 @__cxa_atexit(void (i8*)* @.omp_offloading.descriptor_unreg, i8* bitcast (%struct.__tgt_bin_desc* @.omp_offloading.descriptor to i8*), i8* @__dso_handle) #4 + ret void +} + +declare dso_local i32 @__tgt_register_lib(%struct.__tgt_bin_desc*) + +; Function Attrs: nounwind +declare dso_local i32 @__cxa_atexit(void (i8*)*, i8*, i8*) #4 + +attributes #0 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { noinline norecurse nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { nounwind } + +!omp_offload.info = !{!0} +!llvm.module.flags = !{!1} +!llvm.ident = !{!2} + +!0 = !{i32 0, i32 43, i32 21153163, !"foo", i32 10, i32 0} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{!"clang version 9.0.0 "} +!3 = !{!4} +!4 = !{i64 2, i64 -1, i64 -1, i1 true} + +; __CLANG_OFFLOAD_BUNDLE____END__ host-x86_64-unknown-linux-gnu Index: 
SPMD_examples/v0.2/target_offload_not_SPMD.new.ll =================================================================== --- /dev/null +++ SPMD_examples/v0.2/target_offload_not_SPMD.new.ll @@ -0,0 +1,342 @@ + +; __CLANG_OFFLOAD_BUNDLE____START__ openmp-nvptx64-nvida-cuda +; ModuleID = '/home/jdoerfert/projects/llvm-project/llvm/test/Transforms/OpenMP/target_offload_not_SPMD.c' +source_filename = "/home/jdoerfert/projects/llvm-project/llvm/test/Transforms/OpenMP/target_offload_not_SPMD.c" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvida-cuda" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } +%"union._shared_openmp_static_memory_type_$_" = type { [128 x i8] } +%omp.shared.struct = type { i64, i64, i32** } +%omp.shared.struct.0 = type { i32** } +%struct._globalized_locals_ty = type { i32* } + +@"_openmp_kernel_static_glob_rd$ptr" = internal addrspace(3) global i8* null +@"_openmp_static_kernel$is_shared" = internal unnamed_addr constant i16 1 +@"_openmp_static_kernel$size" = internal unnamed_addr constant i64 8 +@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2050, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@2 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@__omp_offloading_2b_142c58b_foo_l10_exec_mode = weak constant i8 1 +@"_openmp_shared_static_glob_rd_$_" = common addrspace(3) global %"union._shared_openmp_static_memory_type_$_" zeroinitializer +@llvm.compiler.used = appending global [1 x i8*] [i8* @__omp_offloading_2b_142c58b_foo_l10_exec_mode], section "llvm.metadata" + +; Function Attrs: norecurse nounwind +define weak void @__omp_offloading_2b_142c58b_foo_l10(i32* %dis, i32* %team) #0 { +entry: + %.global_tid..addr.i = alloca i32*, align 8 + %.bound_tid..addr.i = alloca i32*, align 8 + %dis.addr.i = alloca i32*, align 8 + %team.addr.i = alloca i32*, align 8 + %.omp.iv.i = alloca i32, align 4 + %tmp.i = alloca i32, align 4 + %.omp.comb.lb.i = alloca i32, align 4 + %.omp.comb.ub.i = alloca i32, align 4 + %.omp.stride.i = alloca i32, align 4 + %.omp.is_last.i = alloca i32, align 4 + %i.i = alloca i32, align 4 + %.zero.addr.i = alloca i32, align 4 + %.captured.i = alloca %omp.shared.struct, align 8 + %.zero.addr3.i = alloca i32, align 4 + %.captured4.i = alloca %omp.shared.struct.0, align 8 + %dis.addr = alloca i32*, align 8 + %team.addr = alloca i32*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. 
= alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i32* %dis, i32** %dis.addr, align 8 + store i32* %team, i32** %team.addr, align 8 + %0 = call i16 @__kmpc_generic_kernel_init(i16 0, i16 1, i16 1, i16 0) + %1 = icmp eq i16 %0, 1 + br i1 %1, label %.execute, label %.exit + +.execute: ; preds = %entry + %2 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + %3 = load i32*, i32** %dis.addr, align 8 + %4 = load i32*, i32** %team.addr, align 8 + store i32 %2, i32* %.threadid_temp., align 4 + store i32 0, i32* %.zero.addr3.i, align 4, !noalias !10 + store i32 0, i32* %.zero.addr.i, align 4, !noalias !10 + store i32* %.threadid_temp., i32** %.global_tid..addr.i, align 8, !noalias !10 + store i32* %.zero.addr, i32** %.bound_tid..addr.i, align 8, !noalias !10 + store i32* %3, i32** %dis.addr.i, align 8, !noalias !10 + store i32* %4, i32** %team.addr.i, align 8, !noalias !10 + call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 8, i16 1, i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) #3 + %5 = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8 + %6 = bitcast i8* %5 to %struct._globalized_locals_ty* + %7 = load i32*, i32** %dis.addr.i, align 8, !noalias !10 + %dis1.i = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* %6, i32 0, i32 0 + store i32* %7, i32** %dis1.i, align 8 + store i32 0, i32* %.omp.comb.lb.i, align 4, !noalias !10 + store i32 9, i32* %.omp.comb.ub.i, align 4, !noalias !10 + store i32 1, i32* %.omp.stride.i, align 4, !noalias !10 + store i32 0, i32* %.omp.is_last.i, align 4, !noalias !10 + %8 = load i32*, i32** %.global_tid..addr.i, align 8, !noalias !10 + %9 = load i32, i32* %8, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %9, i32 92, i32* %.omp.is_last.i, i32* %.omp.comb.lb.i, i32* %.omp.comb.ub.i, i32* %.omp.stride.i, i32 1, i32 1) #3 + %10 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !10 + %cmp.i = icmp sgt i32 %10, 9 + br i1 %cmp.i, label %cond.true.i, label %cond.false.i + +cond.true.i: ; preds = %.execute + br label %cond.end.i + +cond.false.i: ; preds = %.execute + %11 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !10 + br label %cond.end.i + +cond.end.i: ; preds = %cond.false.i, %cond.true.i + %cond.i = phi i32 [ 9, %cond.true.i ], [ %11, %cond.false.i ] + store i32 %cond.i, i32* %.omp.comb.ub.i, align 4, !noalias !10 + %12 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !10 + store i32 %12, i32* %.omp.iv.i, align 4, !noalias !10 + br label %omp.inner.for.cond.i + +omp.inner.for.cond.i: ; preds = %omp.inner.for.body.i, %cond.end.i + %13 = load i32, i32* %.omp.iv.i, align 4, !noalias !10 + %14 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !10 + %cmp2.i = icmp sle i32 %13, %14 + br i1 %cmp2.i, label %omp.inner.for.body.i, label %__omp_outlined__.exit + +omp.inner.for.body.i: ; preds = %omp.inner.for.cond.i + %15 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !10 + %16 = zext i32 %15 to i64 + %17 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !10 + %18 = zext i32 %17 to i64 + %19 = bitcast %omp.shared.struct* %.captured.i to i8* + %20 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i32 0, i32 0 + store i64 %16, i64* %20, !noalias !10 + 
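; NOTE (editorial): generic-mode codegen (exec_mode == 1 above): the master
; thread packs the zero-extended distribute bounds (slot 0 stored above, slot 1
; next) and the globalized 'dis' pointer (slot 2) into %omp.shared.struct, then
; passes it together with @__omp_outlined__1_wrapper to
; __kmpc_generic_kernel_parallel so the worker threads can run the parallel
; region.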
%21 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i32 0, i32 1 + store i64 %18, i64* %21, !noalias !10 + %22 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i32 0, i32 2 + store i32** %dis1.i, i32*** %22, !noalias !10 + call void @__kmpc_generic_kernel_parallel(i8* bitcast (void (i8*)* @__omp_outlined__1_wrapper to i8*), i8* %19, i16 24, i16 1) #3 + %23 = load i32, i32* %.omp.iv.i, align 4, !noalias !10 + %24 = load i32, i32* %.omp.stride.i, align 4, !noalias !10 + %add.i = add nsw i32 %23, %24 + store i32 %add.i, i32* %.omp.iv.i, align 4, !noalias !10 + br label %omp.inner.for.cond.i + +__omp_outlined__.exit: ; preds = %omp.inner.for.cond.i + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %9) #3 + %25 = bitcast %omp.shared.struct.0* %.captured4.i to i8* + %26 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %.captured4.i, i32 0, i32 0 + store i32** %dis1.i, i32*** %26, !noalias !10 + call void @__kmpc_generic_kernel_parallel(i8* bitcast (void (i8*)* @__omp_outlined__2_wrapper to i8*), i8* %25, i16 8, i16 1) #3 + %27 = load i32*, i32** %team.addr.i, align 8, !noalias !10 + %call.i = call i32 @omp_get_team_num() #3 + %idxprom.i = sext i32 %call.i to i64 + %arrayidx.i = getelementptr inbounds i32, i32* %27, i64 %idxprom.i + %28 = load i32, i32* %arrayidx.i, align 4 + %add5.i = add nsw i32 %28, 1 + store i32 %add5.i, i32* %arrayidx.i, align 4 + call void @__kmpc_restore_team_static_memory(i16 0, i16 1) #3 + br label %.omp.deinit + +.omp.deinit: ; preds = %__omp_outlined__.exit + call void @__kmpc_generic_kernel_deinit(i16 0, i16 1) + br label %.exit + +.exit: ; preds = %.omp.deinit, %entry + ret void +} + +declare i16 @__kmpc_generic_kernel_init(i16, i16, i16, i16) + +declare void @__kmpc_get_team_static_memory(i16, i8*, i64, i16, i8**) + +declare void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) + +; Function Attrs: norecurse nounwind +define internal void @__omp_outlined__1(i32* noalias %.global_tid., i32* noalias %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., i32** dereferenceable(8) %dis) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %.previous.lb..addr = alloca i64, align 8 + %.previous.ub..addr = alloca i64, align 8 + %dis.addr = alloca i32**, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i64 %.previous.lb., i64* %.previous.lb..addr, align 8 + store i64 %.previous.ub., i64* %.previous.ub..addr, align 8 + store i32** %dis, i32*** %dis.addr, align 8 + %0 = load i32**, i32*** %dis.addr, align 8 + store i32 0, i32* %.omp.lb, align 4 + store i32 9, i32* %.omp.ub, align 4 + %1 = load i64, i64* %.previous.lb..addr, align 8 + %conv = trunc i64 %1 to i32 + %2 = load i64, i64* %.previous.ub..addr, align 8 + %conv1 = trunc i64 %2 to i32 + store i32 %conv, i32* %.omp.lb, align 4 + store i32 %conv1, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %3 = load i32*, i32** %.global_tid..addr, align 8 + %4 = load i32, i32* %3, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %4, i32 33, i32* %.omp.is_last, 
i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %5 = load i32, i32* %.omp.lb, align 4 + store i32 %5, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %entry + %6 = load i32, i32* %.omp.iv, align 4 + %conv2 = sext i32 %6 to i64 + %7 = load i64, i64* %.previous.ub..addr, align 8 + %cmp = icmp ule i64 %conv2, %7 + br i1 %cmp, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %8 = load i32, i32* %.omp.iv, align 4 + %mul = mul nsw i32 %8, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %i, align 4 + %9 = load i32*, i32** %0, align 8 + %10 = load i32, i32* %i, align 4 + %idxprom = sext i32 %10 to i64 + %arrayidx = getelementptr inbounds i32, i32* %9, i64 %idxprom + %11 = atomicrmw add i32* %arrayidx, i32 1 monotonic + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %12 = load i32, i32* %.omp.iv, align 4 + %13 = load i32, i32* %.omp.stride, align 4 + %add4 = add nsw i32 %12, %13 + store i32 %add4, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %4) + ret void +} + +declare void @__kmpc_for_static_fini(%struct.ident_t*, i32) + +; Function Attrs: norecurse nounwind +define internal void @__omp_outlined__1_wrapper(i8* %payload) #1 { +entry: + %.addr = alloca i8*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. = alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i8* %payload, i8** %.addr, align 8 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + store i32 %0, i32* %.threadid_temp., align 4 + %1 = load i8*, i8** %.addr, align 8 + %2 = bitcast i8* %1 to %omp.shared.struct* + %3 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %2, i32 0, i32 0 + %4 = load i64, i64* %3, align 1 + %5 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %2, i32 0, i32 1 + %6 = load i64, i64* %5, align 1 + %7 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %2, i32 0, i32 2 + %8 = load i32**, i32*** %7, align 1 + call void @__omp_outlined__1(i32* %.threadid_temp., i32* %.zero.addr, i64 %4, i64 %6, i32** %8) #3 + ret void +} + +declare i32 @__kmpc_global_thread_num(%struct.ident_t*) + +declare void @__kmpc_generic_kernel_parallel(i8*, i8*, i16, i16) + +; Function Attrs: norecurse nounwind +define internal void @__omp_outlined__2(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32** dereferenceable(8) %dis) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %dis.addr = alloca i32**, align 8 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i32** %dis, i32*** %dis.addr, align 8 + %0 = load i32**, i32*** %dis.addr, align 8 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %1 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %1, 10 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %2 = load i32*, i32** %0, align 8 + %3 = load i32, i32* %i, align 4 + %idxprom = sext i32 %3 to i64 + %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom + %4 = atomicrmw add 
i32* %arrayidx, i32 1 monotonic + br label %for.inc + +for.inc: ; preds = %for.body + %5 = load i32, i32* %i, align 4 + %inc = add nsw i32 %5, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: norecurse nounwind +define internal void @__omp_outlined__2_wrapper(i8* %payload) #1 { +entry: + %.addr = alloca i8*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. = alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i8* %payload, i8** %.addr, align 8 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + store i32 %0, i32* %.threadid_temp., align 4 + %1 = load i8*, i8** %.addr, align 8 + %2 = bitcast i8* %1 to %omp.shared.struct.0* + %3 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %2, i32 0, i32 0 + %4 = load i32**, i32*** %3, align 1 + call void @__omp_outlined__2(i32* %.threadid_temp., i32* %.zero.addr, i32** %4) #3 + ret void +} + +declare i32 @omp_get_team_num() #2 + +declare void @__kmpc_restore_team_static_memory(i16, i16) + +declare void @__kmpc_generic_kernel_deinit(i16, i16) + +attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx61,+sm_70" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx61,+sm_70" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx61,+sm_70" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { nounwind } + +!omp_offload.info = !{!0} +!nvvm.annotations = !{!1, !2, !3, !2, !4, !4, !4, !4, !5, !5, !4} +!llvm.module.flags = !{!6, !7} +!llvm.ident = !{!8} +!nvvm.internalize.after.link = !{} +!nvvmir.version = !{!9} + +!0 = !{i32 0, i32 43, i32 21153163, !"foo", i32 10, i32 0} +!1 = !{void (i32*, i32*)* @__omp_offloading_2b_142c58b_foo_l10, !"kernel", i32 1} +!2 = !{null, !"align", i32 8} +!3 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +!4 = !{null, !"align", i32 16} +!5 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +!6 = !{i32 1, !"wchar_size", i32 4} +!7 = !{i32 7, !"PIC Level", i32 2} +!8 = !{!"clang version 9.0.0 "} +!9 = !{i32 1, i32 2} +!10 = !{!11, !13} +!11 = distinct !{!11, !12, !"__omp_outlined__: %.global_tid."} +!12 = distinct !{!12, !"__omp_outlined__"} +!13 = distinct !{!13, !12, !"__omp_outlined__: %.bound_tid."} + +; 
__CLANG_OFFLOAD_BUNDLE____END__ openmp-nvptx64-nvida-cuda Index: SPMD_examples/v0.2/target_offload_not_SPMD.new.opt.ll =================================================================== --- /dev/null +++ SPMD_examples/v0.2/target_offload_not_SPMD.new.opt.ll @@ -0,0 +1,319 @@ +; ModuleID = '/home/jdoerfert/SPMDtests/target_offload_not_SPMD.new.ll' +source_filename = "/home/jdoerfert/projects/llvm-project/llvm/test/Transforms/OpenMP/target_offload_not_SPMD.c" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvida-cuda" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } +%"union._shared_openmp_static_memory_type_$_" = type { [128 x i8] } +%omp.shared.struct = type { i64, i64, i32** } +%omp.shared.struct.0 = type { i32** } +%struct._globalized_locals_ty = type { i32* } + +@"_openmp_kernel_static_glob_rd$ptr" = internal addrspace(3) global i8* null +@"_openmp_static_kernel$is_shared" = internal unnamed_addr constant i16 1 +@"_openmp_static_kernel$size" = internal unnamed_addr constant i64 8 +@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2050, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@2 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@__omp_offloading_2b_142c58b_foo_l10_exec_mode = weak constant i8 1 +@"_openmp_shared_static_glob_rd_$_" = common addrspace(3) global %"union._shared_openmp_static_memory_type_$_" zeroinitializer +@llvm.compiler.used = appending global [1 x i8*] [i8* @__omp_offloading_2b_142c58b_foo_l10_exec_mode], section "llvm.metadata" + +; Function Attrs: norecurse nounwind +define weak void @__omp_offloading_2b_142c58b_foo_l10(i32* %dis, i32* %team) #0 { +entry: + %work_fn.addr = alloca i8* + %.omp.comb.lb.i = alloca i32, align 4 + %.omp.comb.ub.i = alloca i32, align 4 + %.omp.stride.i = alloca i32, align 4 + %.omp.is_last.i = alloca i32, align 4 + %.captured.i = alloca %omp.shared.struct, align 8 + %.captured4.i = alloca %omp.shared.struct.0, align 8 + %thread_kind = call i16 @__kmpc_generic_kernel_init(i16 0, i16 0, i16 1, i16 0) + %is_worker = icmp eq i16 %thread_kind, -1 + br i1 %is_worker, label %worker.wait, label %master_check + +worker.wait: ; preds = %worker.inactive, %entry + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) + %is_active = call i1 @__kmpc_kernel_parallel(i8** %work_fn.addr, i16 1) + %Work_fn.addr_cast = bitcast i8** %work_fn.addr to void (i8*)** + %work_fn = load void (i8*)*, void (i8*)** %Work_fn.addr_cast + %no_work = icmp eq void (i8*)* %work_fn, null + br i1 %no_work, label %worker.finished, label %worker.active_check + +worker.finished: ; preds = %worker.wait + br label %master_check + +worker.active_check: ; preds = %worker.wait + br i1 %is_active, label %worker.active, label %worker.inactive + +worker.active: ; preds = %worker.active_check + %0 = call i8* @__kmpc_get_shared_variables() + %par_fn_check = icmp eq void (i8*)* %work_fn, @__omp_outlined__2_wrapper + br i1 %par_fn_check, label %worker.execute.__omp_outlined__2_wrapper, label %worker.check.next + +worker.execute.__omp_outlined__2_wrapper: ; preds = %worker.active + call void 
@__omp_outlined__2_wrapper(i8* %0) + br label %worker.parallel_end + +worker.check.next: ; preds = %worker.active + %par_fn_check1 = icmp eq void (i8*)* %work_fn, @__omp_outlined__1_wrapper + br i1 %par_fn_check1, label %worker.execute.__omp_outlined__1_wrapper, label %worker.check.next2 + +worker.execute.__omp_outlined__1_wrapper: ; preds = %worker.check.next + call void @__omp_outlined__1_wrapper(i8* %0) + br label %worker.parallel_end + +worker.check.next2: ; preds = %worker.check.next + call void %work_fn(i8* %0) + br label %worker.parallel_end + +worker.parallel_end: ; preds = %worker.execute.__omp_outlined__1_wrapper, %worker.execute.__omp_outlined__2_wrapper, %worker.check.next2 + call void @__kmpc_kernel_end_parallel() + br label %worker.inactive + +worker.inactive: ; preds = %worker.active_check, %worker.parallel_end + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) + br label %worker.wait + +master_check: ; preds = %worker.finished, %entry + %1 = icmp eq i16 %thread_kind, 1 + br i1 %1, label %.execute, label %.exit + +.execute: ; preds = %master_check + %2 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 8, i16 1, i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) #2 + %3 = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8 + %4 = bitcast i8* %3 to %struct._globalized_locals_ty* + %dis1.i = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* %4, i32 0, i32 0 + store i32* %dis, i32** %dis1.i, align 8 + store i32 0, i32* %.omp.comb.lb.i, align 4, !noalias !10 + store i32 9, i32* %.omp.comb.ub.i, align 4, !noalias !10 + store i32 1, i32* %.omp.stride.i, align 4, !noalias !10 + store i32 0, i32* %.omp.is_last.i, align 4, !noalias !10 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %2, i32 92, i32* %.omp.is_last.i, i32* %.omp.comb.lb.i, i32* %.omp.comb.ub.i, i32* %.omp.stride.i, i32 1, i32 1) #2 + %5 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !10 + %cmp.i = icmp sgt i32 %5, 9 + br i1 %cmp.i, label %cond.true.i, label %cond.false.i + +cond.true.i: ; preds = %.execute + br label %cond.end.i + +cond.false.i: ; preds = %.execute + %6 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !10 + br label %cond.end.i + +cond.end.i: ; preds = %cond.false.i, %cond.true.i + %cond.i = phi i32 [ 9, %cond.true.i ], [ %6, %cond.false.i ] + store i32 %cond.i, i32* %.omp.comb.ub.i, align 4, !noalias !10 + %7 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !10 + br label %omp.inner.for.cond.i + +omp.inner.for.cond.i: ; preds = %omp.inner.for.body.i, %cond.end.i + %.omp.iv.i.0 = phi i32 [ %7, %cond.end.i ], [ %add.i, %omp.inner.for.body.i ] + %8 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !10 + %cmp2.i = icmp sle i32 %.omp.iv.i.0, %8 + br i1 %cmp2.i, label %omp.inner.for.body.i, label %__omp_outlined__.exit + +omp.inner.for.body.i: ; preds = %omp.inner.for.cond.i + %9 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !10 + %10 = zext i32 %9 to i64 + %11 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !10 + %12 = zext i32 %11 to i64 + %13 = bitcast %omp.shared.struct* %.captured.i to i8* + %14 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* 
%.captured.i, i32 0, i32 0 + store i64 %10, i64* %14, !noalias !10 + %15 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i32 0, i32 1 + store i64 %12, i64* %15, !noalias !10 + %16 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i32 0, i32 2 + store i32** %dis1.i, i32*** %16, !noalias !10 + call void @__kmpc_generic_kernel_parallel(i8* bitcast (void (i8*)* @__omp_outlined__1_wrapper to i8*), i8* %13, i16 24, i16 1) #2 + %17 = load i32, i32* %.omp.stride.i, align 4, !noalias !10 + %add.i = add nsw i32 %.omp.iv.i.0, %17 + br label %omp.inner.for.cond.i + +__omp_outlined__.exit: ; preds = %omp.inner.for.cond.i + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %2) #2 + %18 = bitcast %omp.shared.struct.0* %.captured4.i to i8* + %19 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %.captured4.i, i32 0, i32 0 + store i32** %dis1.i, i32*** %19, !noalias !10 + call void @__kmpc_generic_kernel_parallel(i8* bitcast (void (i8*)* @__omp_outlined__2_wrapper to i8*), i8* %18, i16 8, i16 1) #2 + %call.i = call i32 @omp_get_team_num() #2 + %idxprom.i = sext i32 %call.i to i64 + %arrayidx.i = getelementptr inbounds i32, i32* %team, i64 %idxprom.i + %20 = load i32, i32* %arrayidx.i, align 4 + %add5.i = add nsw i32 %20, 1 + store i32 %add5.i, i32* %arrayidx.i, align 4 + call void @__kmpc_restore_team_static_memory(i16 0, i16 1) #2 + br label %.omp.deinit + +.omp.deinit: ; preds = %__omp_outlined__.exit + call void @__kmpc_generic_kernel_deinit(i16 0, i16 1) + br label %.exit + +.exit: ; preds = %.omp.deinit, %master_check + ret void +} + +declare i16 @__kmpc_generic_kernel_init(i16, i16, i16, i16) + +declare void @__kmpc_get_team_static_memory(i16, i8*, i64, i16, i8**) + +declare void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) + +; Function Attrs: norecurse nounwind +define internal void @__omp_outlined__1(i32* noalias %.global_tid., i32* noalias %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., i32** dereferenceable(8) %dis) #0 { +entry: + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + store i32 0, i32* %.omp.lb, align 4 + store i32 9, i32* %.omp.ub, align 4 + %conv = trunc i64 %.previous.lb. to i32 + %conv1 = trunc i64 %.previous.ub. to i32 + store i32 %conv, i32* %.omp.lb, align 4 + store i32 %conv1, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %0 = load i32, i32* %.global_tid., align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %0, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %1 = load i32, i32* %.omp.lb, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %entry + %.omp.iv.0 = phi i32 [ %1, %entry ], [ %add4, %omp.inner.for.inc ] + %conv2 = sext i32 %.omp.iv.0 to i64 + %cmp = icmp ule i64 %conv2, %.previous.ub. 
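+; <- iterate while the iv is still within the distribute chunk handed down via %.previous.ub.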
+ br i1 %cmp, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %mul = mul nsw i32 %.omp.iv.0, 1 + %add = add nsw i32 0, %mul + %2 = load i32*, i32** %dis, align 8 + %idxprom = sext i32 %add to i64 + %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom + %3 = atomicrmw add i32* %arrayidx, i32 1 monotonic + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %4 = load i32, i32* %.omp.stride, align 4 + %add4 = add nsw i32 %.omp.iv.0, %4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %0) + ret void +} + +declare void @__kmpc_for_static_fini(%struct.ident_t*, i32) + +; Function Attrs: norecurse nounwind +define internal void @__omp_outlined__1_wrapper(i8* %payload) #0 { +entry: + %.zero.addr = alloca i32, align 4 + %.threadid_temp. = alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + store i32 %0, i32* %.threadid_temp., align 4 + %1 = bitcast i8* %payload to %omp.shared.struct* + %2 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %1, i32 0, i32 0 + %3 = load i64, i64* %2, align 1 + %4 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %1, i32 0, i32 1 + %5 = load i64, i64* %4, align 1 + %6 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %1, i32 0, i32 2 + %7 = load i32**, i32*** %6, align 1 + call void @__omp_outlined__1(i32* %.threadid_temp., i32* %.zero.addr, i64 %3, i64 %5, i32** %7) #2 + ret void +} + +declare i32 @__kmpc_global_thread_num(%struct.ident_t*) + +declare void @__kmpc_generic_kernel_parallel(i8*, i8*, i16, i16) + +; Function Attrs: norecurse nounwind +define internal void @__omp_outlined__2(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32** dereferenceable(8) %dis) #0 { +entry: + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.0, 10 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %0 = load i32*, i32** %dis, align 8 + %idxprom = sext i32 %i.0 to i64 + %arrayidx = getelementptr inbounds i32, i32* %0, i64 %idxprom + %1 = atomicrmw add i32* %arrayidx, i32 1 monotonic + br label %for.inc + +for.inc: ; preds = %for.body + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: norecurse nounwind +define internal void @__omp_outlined__2_wrapper(i8* %payload) #0 { +entry: + %.zero.addr = alloca i32, align 4 + %.threadid_temp. 
= alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + store i32 %0, i32* %.threadid_temp., align 4 + %1 = bitcast i8* %payload to %omp.shared.struct.0* + %2 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %1, i32 0, i32 0 + %3 = load i32**, i32*** %2, align 1 + call void @__omp_outlined__2(i32* %.threadid_temp., i32* %.zero.addr, i32** %3) #2 + ret void +} + +declare i32 @omp_get_team_num() #1 + +declare void @__kmpc_restore_team_static_memory(i16, i16) + +declare void @__kmpc_generic_kernel_deinit(i16, i16) + +declare void @__kmpc_barrier_simple_spmd(%struct.ident_t*, i32) + +declare i1 @__kmpc_kernel_parallel(i8**, i16) + +declare i8* @__kmpc_get_shared_variables() + +declare void @__kmpc_kernel_end_parallel() + +attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx61,+sm_70" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx61,+sm_70" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind } + +!omp_offload.info = !{!0} +!nvvm.annotations = !{!1, !2, !3, !2, !4, !4, !4, !4, !5, !5, !4} +!llvm.module.flags = !{!6, !7} +!llvm.ident = !{!8} +!nvvm.internalize.after.link = !{} +!nvvmir.version = !{!9} + +!0 = !{i32 0, i32 43, i32 21153163, !"foo", i32 10, i32 0} +!1 = !{void (i32*, i32*)* @__omp_offloading_2b_142c58b_foo_l10, !"kernel", i32 1} +!2 = !{null, !"align", i32 8} +!3 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +!4 = !{null, !"align", i32 16} +!5 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +!6 = !{i32 1, !"wchar_size", i32 4} +!7 = !{i32 7, !"PIC Level", i32 2} +!8 = !{!"clang version 9.0.0 "} +!9 = !{i32 1, i32 2} +!10 = !{!11, !13} +!11 = distinct !{!11, !12, !"__omp_outlined__: %.global_tid."} +!12 = distinct !{!12, !"__omp_outlined__"} +!13 = distinct !{!13, !12, !"__omp_outlined__: %.bound_tid."} Index: SPMD_examples/v0.2/target_offload_not_SPMD.old.forced.ll =================================================================== --- /dev/null +++ SPMD_examples/v0.2/target_offload_not_SPMD.old.forced.ll @@ -0,0 +1,728 @@ + +; __CLANG_OFFLOAD_BUNDLE____START__ openmp-nvptx64-nvida-cuda +; ModuleID = '/home/jdoerfert/projects/llvm-project/llvm/test/Transforms/OpenMP/target_offload_not_SPMD.c' +source_filename = "/home/jdoerfert/projects/llvm-project/llvm/test/Transforms/OpenMP/target_offload_not_SPMD.c" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvida-cuda" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } + +@"_openmp_kernel_static_glob_rd$ptr" = internal addrspace(3) global i8* null +@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr 
constant %struct.ident_t { i32 0, i32 2050, i32 1, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 1, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@2 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 1, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@__omp_offloading_2b_142c58b_foo_l10_exec_mode = weak constant i8 0 +@llvm.compiler.used = appending global [1 x i8*] [i8* @__omp_offloading_2b_142c58b_foo_l10_exec_mode], section "llvm.metadata" + +; Function Attrs: noinline norecurse nounwind optnone +define weak void @__omp_offloading_2b_142c58b_foo_l10(i32* %dis, i32* %team) #0 { +entry: + %.global_tid..addr.i = alloca i32*, align 8 + %.bound_tid..addr.i = alloca i32*, align 8 + %dis.addr.i = alloca i32*, align 8 + %team.addr.i = alloca i32*, align 8 + %.omp.iv.i = alloca i32, align 4 + %tmp.i = alloca i32, align 4 + %.omp.comb.lb.i = alloca i32, align 4 + %.omp.comb.ub.i = alloca i32, align 4 + %.omp.stride.i = alloca i32, align 4 + %.omp.is_last.i = alloca i32, align 4 + %i.i = alloca i32, align 4 + %.zero.addr.i = alloca i32, align 4 + %.zero.addr9.i = alloca i32, align 4 + %dis.addr = alloca i32*, align 8 + %team.addr = alloca i32*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. = alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i32* %dis, i32** %dis.addr, align 8 + store i32* %team, i32** %team.addr, align 8 + %nvptx_num_threads = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range !10 + call void @__kmpc_spmd_kernel_init(i32 %nvptx_num_threads, i16 1, i16 0) + call void @__kmpc_data_sharing_init_stack_spmd() + br label %.execute + +.execute: ; preds = %entry + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + %1 = load i32*, i32** %dis.addr, align 8 + %2 = load i32*, i32** %team.addr, align 8 + store i32 %0, i32* %.threadid_temp., align 4 + store i32 0, i32* %.zero.addr9.i, align 4, !noalias !11 + store i32 0, i32* %.zero.addr.i, align 4, !noalias !11 + store i32* %.threadid_temp., i32** %.global_tid..addr.i, align 8, !noalias !11 + store i32* %.zero.addr, i32** %.bound_tid..addr.i, align 8, !noalias !11 + store i32* %1, i32** %dis.addr.i, align 8, !noalias !11 + store i32* %2, i32** %team.addr.i, align 8, !noalias !11 + store i32 0, i32* %.omp.comb.lb.i, align 4, !noalias !11 + store i32 9, i32* %.omp.comb.ub.i, align 4, !noalias !11 + store i32 1, i32* %.omp.stride.i, align 4, !noalias !11 + store i32 0, i32* %.omp.is_last.i, align 4, !noalias !11 + %nvptx_num_threads.i = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #3, !range !10 + %3 = load i32*, i32** %.global_tid..addr.i, align 8, !noalias !11 + %4 = load i32, i32* %3, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %4, i32 91, i32* %.omp.is_last.i, i32* %.omp.comb.lb.i, i32* %.omp.comb.ub.i, i32* %.omp.stride.i, i32 1, i32 %nvptx_num_threads.i) #3 + %5 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !11 + %cmp.i = icmp sgt i32 %5, 9 + br i1 %cmp.i, label %cond.true.i, label %cond.false.i + +cond.true.i: ; preds = %.execute + br label %cond.end.i + +cond.false.i: ; preds = %.execute + %6 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !11 + br label %cond.end.i + +cond.end.i: ; preds = %cond.false.i, %cond.true.i + %cond.i = phi i32 [ 9, %cond.true.i ], [ %6, %cond.false.i ] + store i32 %cond.i, i32* %.omp.comb.ub.i, align 4, !noalias 
!11 + %7 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !11 + store i32 %7, i32* %.omp.iv.i, align 4, !noalias !11 + br label %omp.inner.for.cond.i + +omp.inner.for.cond.i: ; preds = %cond.end7.i, %cond.end.i + %8 = load i32, i32* %.omp.iv.i, align 4, !noalias !11 + %cmp1.i = icmp slt i32 %8, 10 + br i1 %cmp1.i, label %omp.inner.for.body.i, label %__omp_outlined__.exit + +omp.inner.for.body.i: ; preds = %omp.inner.for.cond.i + %9 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !11 + %10 = zext i32 %9 to i64 + %11 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !11 + %12 = zext i32 %11 to i64 + %13 = load i32*, i32** %.global_tid..addr.i, align 8, !noalias !11 + call void @__omp_outlined__1(i32* %13, i32* %.zero.addr.i, i64 %10, i64 %12, i32** %dis.addr.i) #3 + %14 = load i32, i32* %.omp.iv.i, align 4, !noalias !11 + %15 = load i32, i32* %.omp.stride.i, align 4, !noalias !11 + %add.i = add nsw i32 %14, %15 + store i32 %add.i, i32* %.omp.iv.i, align 4, !noalias !11 + %16 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !11 + %17 = load i32, i32* %.omp.stride.i, align 4, !noalias !11 + %add2.i = add nsw i32 %16, %17 + store i32 %add2.i, i32* %.omp.comb.lb.i, align 4, !noalias !11 + %18 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !11 + %19 = load i32, i32* %.omp.stride.i, align 4, !noalias !11 + %add3.i = add nsw i32 %18, %19 + store i32 %add3.i, i32* %.omp.comb.ub.i, align 4, !noalias !11 + %20 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !11 + %cmp4.i = icmp sgt i32 %20, 9 + br i1 %cmp4.i, label %cond.true5.i, label %cond.false6.i + +cond.true5.i: ; preds = %omp.inner.for.body.i + br label %cond.end7.i + +cond.false6.i: ; preds = %omp.inner.for.body.i + %21 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !11 + br label %cond.end7.i + +cond.end7.i: ; preds = %cond.false6.i, %cond.true5.i + %cond8.i = phi i32 [ 9, %cond.true5.i ], [ %21, %cond.false6.i ] + store i32 %cond8.i, i32* %.omp.comb.ub.i, align 4, !noalias !11 + %22 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !11 + store i32 %22, i32* %.omp.iv.i, align 4, !noalias !11 + br label %omp.inner.for.cond.i + +__omp_outlined__.exit: ; preds = %omp.inner.for.cond.i + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %4) #3 + %23 = load i32*, i32** %.global_tid..addr.i, align 8, !noalias !11 + call void @__omp_outlined__2(i32* %23, i32* %.zero.addr9.i, i32** %dis.addr.i) #3 + %24 = load i32*, i32** %team.addr.i, align 8, !noalias !11 + %call.i = call i32 @omp_get_team_num() #3 + %idxprom.i = sext i32 %call.i to i64 + %arrayidx.i = getelementptr inbounds i32, i32* %24, i64 %idxprom.i + %25 = load i32, i32* %arrayidx.i, align 4 + %add10.i = add nsw i32 %25, 1 + store i32 %add10.i, i32* %arrayidx.i, align 4 + br label %.omp.deinit + +.omp.deinit: ; preds = %__omp_outlined__.exit + call void @__kmpc_spmd_kernel_deinit_v2(i16 1) + br label %.exit + +.exit: ; preds = %.omp.deinit + ret void +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #1 + +declare void @__kmpc_spmd_kernel_init(i32, i16, i16) + +declare void @__kmpc_data_sharing_init_stack_spmd() + +declare void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) + +; Function Attrs: noinline norecurse nounwind optnone +define internal void @__omp_outlined__1(i32* noalias %.global_tid., i32* noalias %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., i32** dereferenceable(8) %dis) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca 
i32*, align 8 + %.previous.lb..addr = alloca i64, align 8 + %.previous.ub..addr = alloca i64, align 8 + %dis.addr = alloca i32**, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i64 %.previous.lb., i64* %.previous.lb..addr, align 8 + store i64 %.previous.ub., i64* %.previous.ub..addr, align 8 + store i32** %dis, i32*** %dis.addr, align 8 + %0 = load i32**, i32*** %dis.addr, align 8 + store i32 0, i32* %.omp.lb, align 4 + store i32 9, i32* %.omp.ub, align 4 + %1 = load i64, i64* %.previous.lb..addr, align 8 + %conv = trunc i64 %1 to i32 + %2 = load i64, i64* %.previous.ub..addr, align 8 + %conv1 = trunc i64 %2 to i32 + store i32 %conv, i32* %.omp.lb, align 4 + store i32 %conv1, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %3 = load i32*, i32** %.global_tid..addr, align 8 + %4 = load i32, i32* %3, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %4, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %5 = load i32, i32* %.omp.lb, align 4 + store i32 %5, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %entry + %6 = load i32, i32* %.omp.iv, align 4 + %conv2 = sext i32 %6 to i64 + %7 = load i64, i64* %.previous.ub..addr, align 8 + %cmp = icmp ule i64 %conv2, %7 + br i1 %cmp, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %8 = load i32, i32* %.omp.iv, align 4 + %mul = mul nsw i32 %8, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %i, align 4 + %9 = load i32*, i32** %0, align 8 + %10 = load i32, i32* %i, align 4 + %idxprom = sext i32 %10 to i64 + %arrayidx = getelementptr inbounds i32, i32* %9, i64 %idxprom + %11 = atomicrmw add i32* %arrayidx, i32 1 monotonic + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %12 = load i32, i32* %.omp.iv, align 4 + %13 = load i32, i32* %.omp.stride, align 4 + %add4 = add nsw i32 %12, %13 + store i32 %add4, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %4) + ret void +} + +declare void @__kmpc_for_static_fini(%struct.ident_t*, i32) + +; Function Attrs: noinline norecurse nounwind optnone +define internal void @__omp_outlined__2(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32** dereferenceable(8) %dis) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %dis.addr = alloca i32**, align 8 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i32** %dis, i32*** %dis.addr, align 8 + %0 = load i32**, i32*** %dis.addr, align 8 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %1 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %1, 10 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; 
preds = %for.cond + %2 = load i32*, i32** %0, align 8 + %3 = load i32, i32* %i, align 4 + %idxprom = sext i32 %3 to i64 + %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom + %4 = atomicrmw add i32* %arrayidx, i32 1 monotonic + br label %for.inc + +for.inc: ; preds = %for.body + %5 = load i32, i32* %i, align 4 + %inc = add nsw i32 %5, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +declare i32 @omp_get_team_num() #2 + +declare i32 @__kmpc_global_thread_num(%struct.ident_t*) + +declare void @__kmpc_spmd_kernel_deinit_v2(i16) + +attributes #0 = { noinline norecurse nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx61,+sm_70" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone } +attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx61,+sm_70" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { nounwind } + +!omp_offload.info = !{!0} +!nvvm.annotations = !{!1, !2, !3, !2, !4, !4, !4, !4, !5, !5, !4} +!llvm.module.flags = !{!6, !7} +!llvm.ident = !{!8} +!nvvm.internalize.after.link = !{} +!nvvmir.version = !{!9} + +!0 = !{i32 0, i32 43, i32 21153163, !"foo", i32 10, i32 0} +!1 = !{void (i32*, i32*)* @__omp_offloading_2b_142c58b_foo_l10, !"kernel", i32 1} +!2 = !{null, !"align", i32 8} +!3 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +!4 = !{null, !"align", i32 16} +!5 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +!6 = !{i32 1, !"wchar_size", i32 4} +!7 = !{i32 7, !"PIC Level", i32 2} +!8 = !{!"clang version 9.0.0 "} +!9 = !{i32 1, i32 2} +!10 = !{i32 1, i32 1025} +!11 = !{!12, !14} +!12 = distinct !{!12, !13, !"__omp_outlined__: %.global_tid."} +!13 = distinct !{!13, !"__omp_outlined__"} +!14 = distinct !{!14, !13, !"__omp_outlined__: %.bound_tid."} + +; __CLANG_OFFLOAD_BUNDLE____END__ openmp-nvptx64-nvida-cuda + +; __CLANG_OFFLOAD_BUNDLE____START__ host-x86_64-unknown-linux-gnu +; ModuleID = '/tmp/jdoerfert/target_offload_not_SPMD-5f7337.bc' +source_filename = "/home/jdoerfert/projects/llvm-project/llvm/test/Transforms/OpenMP/target_offload_not_SPMD.c" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } +%struct.__tgt_offload_entry = type { i8*, i8*, i64, i32, i32 } +%struct.__tgt_device_image = type { i8*, i8*, %struct.__tgt_offload_entry*, %struct.__tgt_offload_entry* } +%struct.__tgt_bin_desc = type { i32, %struct.__tgt_device_image*, %struct.__tgt_offload_entry*, %struct.__tgt_offload_entry* } + +$.omp_offloading.descriptor_reg.nvptx64-nvida-cuda = comdat any + +@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr global %struct.ident_t { i32 0, i32 2050, 
i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@1 = private unnamed_addr global %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@2 = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@.__omp_offloading_2b_142c58b_foo_l10.region_id = weak constant i8 0 +@.offload_sizes = private unnamed_addr constant [2 x i64] [i64 40, i64 40] +@.offload_maptypes = private unnamed_addr constant [2 x i64] [i64 35, i64 35] +@.str.3 = private unnamed_addr constant [33 x i8] c"dis[%3i] = %4i\09\09team[%3i] = %4i\0A\00", align 1 +@.omp_offloading.entry_name = internal unnamed_addr constant [36 x i8] c"__omp_offloading_2b_142c58b_foo_l10\00" +@.omp_offloading.entry.__omp_offloading_2b_142c58b_foo_l10 = weak constant %struct.__tgt_offload_entry { i8* @.__omp_offloading_2b_142c58b_foo_l10.region_id, i8* getelementptr inbounds ([36 x i8], [36 x i8]* @.omp_offloading.entry_name, i32 0, i32 0), i64 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +@.omp_offloading.entries_begin = external constant %struct.__tgt_offload_entry +@.omp_offloading.entries_end = external constant %struct.__tgt_offload_entry +@.omp_offloading.img_start.nvptx64-nvida-cuda = extern_weak constant i8 +@.omp_offloading.img_end.nvptx64-nvida-cuda = extern_weak constant i8 +@.omp_offloading.device_images = internal unnamed_addr constant [1 x %struct.__tgt_device_image] [%struct.__tgt_device_image { i8* @.omp_offloading.img_start.nvptx64-nvida-cuda, i8* @.omp_offloading.img_end.nvptx64-nvida-cuda, %struct.__tgt_offload_entry* @.omp_offloading.entries_begin, %struct.__tgt_offload_entry* @.omp_offloading.entries_end }], comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cuda), align 8 +@.omp_offloading.descriptor = internal constant %struct.__tgt_bin_desc { i32 1, %struct.__tgt_device_image* getelementptr inbounds ([1 x %struct.__tgt_device_image], [1 x %struct.__tgt_device_image]* @.omp_offloading.device_images, i32 0, i32 0), %struct.__tgt_offload_entry* @.omp_offloading.entries_begin, %struct.__tgt_offload_entry* @.omp_offloading.entries_end }, comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cuda), align 8 +@__dso_handle = external hidden global i8 +@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 0, void ()* @.omp_offloading.descriptor_reg.nvptx64-nvida-cuda, i8* bitcast (void ()* @.omp_offloading.descriptor_reg.nvptx64-nvida-cuda to i8*) }] + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local void @foo(i32* %dis, i32* %team) #0 { +entry: + %dis.addr = alloca i32*, align 8 + %team.addr = alloca i32*, align 8 + %.offload_baseptrs = alloca [2 x i8*], align 8 + %.offload_ptrs = alloca [2 x i8*], align 8 + store i32* %dis, i32** %dis.addr, align 8 + store i32* %team, i32** %team.addr, align 8 + %0 = load i32*, i32** %dis.addr, align 8 + %1 = load i32*, i32** %team.addr, align 8 + %2 = load i32*, i32** %dis.addr, align 8 + %3 = load i32*, i32** %dis.addr, align 8 + %arrayidx = getelementptr inbounds i32, i32* %3, i64 0 + %4 = load i32*, i32** %team.addr, align 8 + %5 = load i32*, i32** %team.addr, align 8 + %arrayidx1 = getelementptr inbounds i32, i32* %5, i64 0 + %6 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i32 0, i32 0 + %7 = bitcast i8** %6 to i32** + store i32* %2, i32** %7, align 8 + %8 = 
getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i32 0, i32 0 + %9 = bitcast i8** %8 to i32** + store i32* %arrayidx, i32** %9, align 8 + %10 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i32 0, i32 1 + %11 = bitcast i8** %10 to i32** + store i32* %4, i32** %11, align 8 + %12 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i32 0, i32 1 + %13 = bitcast i8** %12 to i32** + store i32* %arrayidx1, i32** %13, align 8 + %14 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i32 0, i32 0 + %15 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i32 0, i32 0 + %16 = call i32 @__tgt_target_teams(i64 -1, i8* @.__omp_offloading_2b_142c58b_foo_l10.region_id, i32 2, i8** %14, i8** %15, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes, i32 0, i32 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes, i32 0, i32 0), i32 3, i32 0) + %17 = icmp ne i32 %16, 0 + br i1 %17, label %omp_offload.failed, label %omp_offload.cont + +omp_offload.failed: ; preds = %entry + call void @__omp_offloading_2b_142c58b_foo_l10(i32* %0, i32* %1) #4 + br label %omp_offload.cont + +omp_offload.cont: ; preds = %omp_offload.failed, %entry + ret void +} + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define internal void @__omp_offloading_2b_142c58b_foo_l10(i32* %dis, i32* %team) #1 { +entry: + %dis.addr = alloca i32*, align 8 + %team.addr = alloca i32*, align 8 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + store i32* %dis, i32** %dis.addr, align 8 + store i32* %team, i32** %team.addr, align 8 + %1 = call i32 @__kmpc_push_num_teams(%struct.ident_t* @2, i32 %0, i32 3, i32 0) + %2 = load i32*, i32** %dis.addr, align 8 + %3 = load i32*, i32** %team.addr, align 8 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%struct.ident_t* @2, i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*)* @.omp_outlined. 
to void (i32*, i32*, ...)*), i32* %2, i32* %3) + ret void +} + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define internal void @.omp_outlined.(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* %dis, i32* %team) #1 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %dis.addr = alloca i32*, align 8 + %team.addr = alloca i32*, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.comb.lb = alloca i32, align 4 + %.omp.comb.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i32* %dis, i32** %dis.addr, align 8 + store i32* %team, i32** %team.addr, align 8 + store i32 0, i32* %.omp.comb.lb, align 4 + store i32 9, i32* %.omp.comb.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %0 = load i32*, i32** %.global_tid..addr, align 8 + %1 = load i32, i32* %0, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %1, i32 92, i32* %.omp.is_last, i32* %.omp.comb.lb, i32* %.omp.comb.ub, i32* %.omp.stride, i32 1, i32 1) + %2 = load i32, i32* %.omp.comb.ub, align 4 + %cmp = icmp sgt i32 %2, 9 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + br label %cond.end + +cond.false: ; preds = %entry + %3 = load i32, i32* %.omp.comb.ub, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 9, %cond.true ], [ %3, %cond.false ] + store i32 %cond, i32* %.omp.comb.ub, align 4 + %4 = load i32, i32* %.omp.comb.lb, align 4 + store i32 %4, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %cond.end + %5 = load i32, i32* %.omp.iv, align 4 + %6 = load i32, i32* %.omp.comb.ub, align 4 + %cmp1 = icmp sle i32 %5, %6 + br i1 %cmp1, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %7 = load i32, i32* %.omp.comb.lb, align 4 + %8 = zext i32 %7 to i64 + %9 = load i32, i32* %.omp.comb.ub, align 4 + %10 = zext i32 %9 to i64 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @2, i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64, i64, i32**)* @.omp_outlined..1 to void (i32*, i32*, ...)*), i64 %8, i64 %10, i32** %dis.addr) + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.inner.for.body + %11 = load i32, i32* %.omp.iv, align 4 + %12 = load i32, i32* %.omp.stride, align 4 + %add = add nsw i32 %11, %12 + store i32 %add, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %1) + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* @2, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32**)* @.omp_outlined..2 to void (i32*, i32*, ...)*), i32** %dis.addr) + %13 = load i32*, i32** %team.addr, align 8 + %call = call i32 @omp_get_team_num() + %idxprom = sext i32 %call to i64 + %arrayidx = getelementptr inbounds i32, i32* %13, i64 %idxprom + %14 = load i32, i32* %arrayidx, align 4 + %add2 = add nsw i32 %14, 1 + store i32 %add2, i32* %arrayidx, align 4 + ret void +} + +declare dso_local void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define internal void @.omp_outlined..1(i32* noalias %.global_tid., i32* noalias %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., i32** dereferenceable(8) %dis) #1 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %.previous.lb..addr = alloca i64, align 8 + %.previous.ub..addr = alloca i64, align 8 + %dis.addr = alloca i32**, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i64 %.previous.lb., i64* %.previous.lb..addr, align 8 + store i64 %.previous.ub., i64* %.previous.ub..addr, align 8 + store i32** %dis, i32*** %dis.addr, align 8 + %0 = load i32**, i32*** %dis.addr, align 8 + store i32 0, i32* %.omp.lb, align 4 + store i32 9, i32* %.omp.ub, align 4 + %1 = load i64, i64* %.previous.lb..addr, align 8 + %conv = trunc i64 %1 to i32 + %2 = load i64, i64* %.previous.ub..addr, align 8 + %conv1 = trunc i64 %2 to i32 + store i32 %conv, i32* %.omp.lb, align 4 + store i32 %conv1, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %3 = load i32*, i32** %.global_tid..addr, align 8 + %4 = load i32, i32* %3, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %4, i32 34, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %5 = load i32, i32* %.omp.ub, align 4 + %cmp = icmp sgt i32 %5, 9 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + br label %cond.end + +cond.false: ; preds = %entry + %6 = load i32, i32* %.omp.ub, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 9, %cond.true ], [ %6, %cond.false ] + store i32 %cond, i32* %.omp.ub, align 4 + %7 = load i32, i32* %.omp.lb, align 4 + store i32 %7, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %cond.end + %8 = load i32, i32* %.omp.iv, align 4 + %9 = load i32, i32* %.omp.ub, align 4 + %cmp3 = icmp sle i32 %8, %9 + br i1 %cmp3, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %10 = load i32, i32* %.omp.iv, align 4 + %mul = mul nsw i32 %10, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %i, align 4 + %11 = load i32*, i32** %0, align 8 + %12 = load i32, i32* %i, align 4 + %idxprom = sext i32 %12 to i64 + %arrayidx = getelementptr inbounds i32, i32* %11, i64 %idxprom + %13 = atomicrmw add i32* %arrayidx, i32 1 monotonic + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + 
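+; <- the iv advances by 1 per iteration; this thread's chunk bounds were fixed by __kmpc_for_static_init_4 above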
+omp.inner.for.inc: ; preds = %omp.body.continue + %14 = load i32, i32* %.omp.iv, align 4 + %add5 = add nsw i32 %14, 1 + store i32 %add5, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %4) + ret void +} + +declare dso_local void @__kmpc_for_static_fini(%struct.ident_t*, i32) + +declare !callback !3 dso_local void @__kmpc_fork_call(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define internal void @.omp_outlined..2(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32** dereferenceable(8) %dis) #1 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %dis.addr = alloca i32**, align 8 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i32** %dis, i32*** %dis.addr, align 8 + %0 = load i32**, i32*** %dis.addr, align 8 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %1 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %1, 10 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %2 = load i32*, i32** %0, align 8 + %3 = load i32, i32* %i, align 4 + %idxprom = sext i32 %3 to i64 + %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom + %4 = atomicrmw add i32* %arrayidx, i32 1 monotonic + br label %for.inc + +for.inc: ; preds = %for.body + %5 = load i32, i32* %i, align 4 + %inc = add nsw i32 %5, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +declare dso_local i32 @omp_get_team_num() #2 + +declare dso_local i32 @__kmpc_global_thread_num(%struct.ident_t*) + +declare dso_local i32 @__kmpc_push_num_teams(%struct.ident_t*, i32, i32, i32) + +declare !callback !3 dso_local void @__kmpc_fork_teams(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
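+; <- host side: @foo launches the offloaded region through __tgt_target_teams (declared below);
+; if the launch fails, the internal host fallback runs the teams via __kmpc_fork_teams above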
+ +declare dso_local i32 @__tgt_target_teams(i64, i8*, i32, i8**, i8**, i64*, i64*, i32, i32) + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local i32 @main() #0 { +entry: + %retval = alloca i32, align 4 + %dis = alloca [10 x i32], align 16 + %team = alloca [10 x i32], align 16 + %i = alloca i32, align 4 + %i4 = alloca i32, align 4 + store i32 0, i32* %retval, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %0, 10 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load i32, i32* %i, align 4 + %2 = load i32, i32* %i, align 4 + %idxprom = sext i32 %2 to i64 + %arrayidx = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 %idxprom + store i32 %1, i32* %arrayidx, align 4 + %3 = load i32, i32* %i, align 4 + %idxprom1 = sext i32 %3 to i64 + %arrayidx2 = getelementptr inbounds [10 x i32], [10 x i32]* %team, i64 0, i64 %idxprom1 + store i32 0, i32* %arrayidx2, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %4 = load i32, i32* %i, align 4 + %inc = add nsw i32 %4, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %arraydecay = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i32 0, i32 0 + %arraydecay3 = getelementptr inbounds [10 x i32], [10 x i32]* %team, i32 0, i32 0 + call void @foo(i32* %arraydecay, i32* %arraydecay3) + store i32 0, i32* %i4, align 4 + br label %for.cond5 + +for.cond5: ; preds = %for.inc12, %for.end + %5 = load i32, i32* %i4, align 4 + %cmp6 = icmp slt i32 %5, 10 + br i1 %cmp6, label %for.body7, label %for.end14 + +for.body7: ; preds = %for.cond5 + %6 = load i32, i32* %i4, align 4 + %7 = load i32, i32* %i4, align 4 + %idxprom8 = sext i32 %7 to i64 + %arrayidx9 = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 %idxprom8 + %8 = load i32, i32* %arrayidx9, align 4 + %9 = load i32, i32* %i4, align 4 + %10 = load i32, i32* %i4, align 4 + %idxprom10 = sext i32 %10 to i64 + %arrayidx11 = getelementptr inbounds [10 x i32], [10 x i32]* %team, i64 0, i64 %idxprom10 + %11 = load i32, i32* %arrayidx11, align 4 + %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([33 x i8], [33 x i8]* @.str.3, i32 0, i32 0), i32 %6, i32 %8, i32 %9, i32 %11) + br label %for.inc12 + +for.inc12: ; preds = %for.body7 + %12 = load i32, i32* %i4, align 4 + %inc13 = add nsw i32 %12, 1 + store i32 %inc13, i32* %i4, align 4 + br label %for.cond5 + +for.end14: ; preds = %for.cond5 + ret i32 0 +} + +declare dso_local i32 @printf(i8*, ...) 
#2 + +; Function Attrs: noinline nounwind uwtable +define internal void @.omp_offloading.descriptor_unreg(i8*) #3 section ".text.startup" comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cuda) { +entry: + %.addr = alloca i8*, align 8 + store i8* %0, i8** %.addr, align 8 + %1 = call i32 @__tgt_unregister_lib(%struct.__tgt_bin_desc* @.omp_offloading.descriptor) + ret void +} + +declare dso_local i32 @__tgt_unregister_lib(%struct.__tgt_bin_desc*) + +; Function Attrs: noinline nounwind uwtable +define linkonce hidden void @.omp_offloading.descriptor_reg.nvptx64-nvida-cuda() #3 section ".text.startup" comdat { +entry: + %0 = call i32 @__tgt_register_lib(%struct.__tgt_bin_desc* @.omp_offloading.descriptor) + %1 = call i32 @__cxa_atexit(void (i8*)* @.omp_offloading.descriptor_unreg, i8* bitcast (%struct.__tgt_bin_desc* @.omp_offloading.descriptor to i8*), i8* @__dso_handle) #4 + ret void +} + +declare dso_local i32 @__tgt_register_lib(%struct.__tgt_bin_desc*) + +; Function Attrs: nounwind +declare dso_local i32 @__cxa_atexit(void (i8*)*, i8*, i8*) #4 + +attributes #0 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { noinline norecurse nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { nounwind } + +!omp_offload.info = !{!0} +!llvm.module.flags = !{!1} +!llvm.ident = !{!2} + +!0 = !{i32 0, i32 43, i32 21153163, !"foo", i32 10, i32 0} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{!"clang version 9.0.0 "} +!3 = !{!4} +!4 = !{i64 2, i64 -1, i64 -1, i1 true} + +; __CLANG_OFFLOAD_BUNDLE____END__ host-x86_64-unknown-linux-gnu Index: 
SPMD_examples/v0.2/target_offload_not_SPMD.old.ll =================================================================== --- /dev/null +++ SPMD_examples/v0.2/target_offload_not_SPMD.old.ll @@ -0,0 +1,891 @@ + +; __CLANG_OFFLOAD_BUNDLE____START__ openmp-nvptx64-nvida-cuda +; ModuleID = '/home/jdoerfert/projects/llvm-project/llvm/test/Transforms/OpenMP/target_offload_not_SPMD.c' +source_filename = "/home/jdoerfert/projects/llvm-project/llvm/test/Transforms/OpenMP/target_offload_not_SPMD.c" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvida-cuda" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } +%"union._shared_openmp_static_memory_type_$_" = type { [128 x i8] } +%struct._globalized_locals_ty = type { i32* } + +@"_openmp_kernel_static_glob_rd$ptr" = internal addrspace(3) global i8* null +@"_openmp_static_kernel$is_shared" = internal unnamed_addr constant i16 1 +@"_openmp_static_kernel$size" = internal unnamed_addr constant i64 8 +@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2050, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@2 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@__omp_offloading_2b_142c58b_foo_l10_exec_mode = weak constant i8 1 +@"_openmp_shared_static_glob_rd_$_" = common addrspace(3) global %"union._shared_openmp_static_memory_type_$_" zeroinitializer +@llvm.compiler.used = appending global [1 x i8*] [i8* @__omp_offloading_2b_142c58b_foo_l10_exec_mode], section "llvm.metadata" + +; Function Attrs: noinline norecurse nounwind +define internal void @__omp_offloading_2b_142c58b_foo_l10_worker() #0 { +entry: + %work_fn = alloca i8*, align 8 + %exec_status = alloca i8, align 1 + store i8* null, i8** %work_fn, align 8 + store i8 0, i8* %exec_status, align 1 + br label %.await.work + +.await.work: ; preds = %.barrier.parallel, %entry + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) + %0 = call i1 @__kmpc_kernel_parallel(i8** %work_fn, i16 1) + %1 = zext i1 %0 to i8 + store i8 %1, i8* %exec_status, align 1 + %2 = load i8*, i8** %work_fn, align 8 + %should_terminate = icmp eq i8* %2, null + br i1 %should_terminate, label %.exit, label %.select.workers + +.select.workers: ; preds = %.await.work + %3 = load i8, i8* %exec_status, align 1 + %is_active = icmp ne i8 %3, 0 + br i1 %is_active, label %.execute.parallel, label %.barrier.parallel + +.execute.parallel: ; preds = %.select.workers + %4 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + %5 = load i8*, i8** %work_fn, align 8 + %work_match = icmp eq i8* %5, bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) + br i1 %work_match, label %.execute.fn, label %.check.next + +.execute.fn: ; preds = %.execute.parallel + call void @__omp_outlined__1_wrapper(i16 0, i32 %4) #5 + br label %.terminate.parallel + +.check.next: ; preds = %.execute.parallel + %6 = load i8*, i8** %work_fn, align 8 + %work_match1 = icmp eq i8* %6, bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*) + br i1 %work_match1, label %.execute.fn2, label %.check.next3 + +.execute.fn2: ; preds = %.check.next + call void @__omp_outlined__2_wrapper(i16 0, 
i32 %4) #5 + br label %.terminate.parallel + +.check.next3: ; preds = %.check.next + %7 = bitcast i8* %2 to void (i16, i32)* + call void %7(i16 0, i32 %4) + br label %.terminate.parallel + +.terminate.parallel: ; preds = %.check.next3, %.execute.fn2, %.execute.fn + call void @__kmpc_kernel_end_parallel() + br label %.barrier.parallel + +.barrier.parallel: ; preds = %.terminate.parallel, %.select.workers + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) + br label %.await.work + +.exit: ; preds = %.await.work + ret void +} + +; Function Attrs: noinline norecurse nounwind optnone +define weak void @__omp_offloading_2b_142c58b_foo_l10(i32* %dis, i32* %team) #1 { +entry: + %.global_tid..addr.i = alloca i32*, align 8 + %.bound_tid..addr.i = alloca i32*, align 8 + %dis.addr.i = alloca i32*, align 8 + %team.addr.i = alloca i32*, align 8 + %.omp.iv.i = alloca i32, align 4 + %tmp.i = alloca i32, align 4 + %.omp.comb.lb.i = alloca i32, align 4 + %.omp.comb.ub.i = alloca i32, align 4 + %.omp.stride.i = alloca i32, align 4 + %.omp.is_last.i = alloca i32, align 4 + %i.i = alloca i32, align 4 + %.zero.addr.i = alloca i32, align 4 + %shared_arg_refs.i = alloca i8**, align 8 + %.zero.addr3.i = alloca i32, align 4 + %shared_arg_refs4.i = alloca i8**, align 8 + %dis.addr = alloca i32*, align 8 + %team.addr = alloca i32*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. = alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i32* %dis, i32** %dis.addr, align 8 + store i32* %team, i32** %team.addr, align 8 + %nvptx_warp_size = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range !10 + %nvptx_num_threads = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range !11 + %thread_limit = sub nuw i32 %nvptx_num_threads, %nvptx_warp_size + %nvptx_tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !12 + %0 = icmp ult i32 %nvptx_tid, %thread_limit + br i1 %0, label %.worker, label %.mastercheck + +.worker: ; preds = %entry + call void @__omp_offloading_2b_142c58b_foo_l10_worker() #5 + br label %.exit + +.mastercheck: ; preds = %entry + %nvptx_num_threads1 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range !11 + %nvptx_warp_size2 = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range !10 + %1 = sub nuw i32 %nvptx_warp_size2, 1 + %2 = xor i32 %1, -1 + %3 = sub nuw i32 %nvptx_num_threads1, 1 + %master_tid = and i32 %3, %2 + %nvptx_tid3 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !12 + %4 = icmp eq i32 %nvptx_tid3, %master_tid + br i1 %4, label %.master, label %.exit + +.master: ; preds = %.mastercheck + %nvptx_warp_size4 = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range !10 + %nvptx_num_threads5 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range !11 + %thread_limit6 = sub nuw i32 %nvptx_num_threads5, %nvptx_warp_size4 + call void @__kmpc_kernel_init(i32 %thread_limit6, i16 1) + call void @__kmpc_data_sharing_init_stack() + %5 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + %6 = load i32*, i32** %dis.addr, align 8 + %7 = load i32*, i32** %team.addr, align 8 + store i32 %5, i32* %.threadid_temp., align 4 + store i32 0, i32* %.zero.addr3.i, align 4, !noalias !13 + store i32 0, i32* %.zero.addr.i, align 4, !noalias !13 + store i32* %.threadid_temp., i32** %.global_tid..addr.i, align 8, !noalias !13 + store i32* %.zero.addr, i32** %.bound_tid..addr.i, align 8, !noalias !13 + store i32* %6, i32** %dis.addr.i, align 8, !noalias !13 + store i32* %7, i32** %team.addr.i, align 8, !noalias !13 + call void @__kmpc_get_team_static_memory(i16 0, i8* 
addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 8, i16 1, i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) #5 + %8 = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8 + %9 = bitcast i8* %8 to %struct._globalized_locals_ty* + %10 = load i32*, i32** %dis.addr.i, align 8, !noalias !13 + %dis1.i = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* %9, i32 0, i32 0 + store i32* %10, i32** %dis1.i, align 8 + store i32 0, i32* %.omp.comb.lb.i, align 4, !noalias !13 + store i32 9, i32* %.omp.comb.ub.i, align 4, !noalias !13 + store i32 1, i32* %.omp.stride.i, align 4, !noalias !13 + store i32 0, i32* %.omp.is_last.i, align 4, !noalias !13 + %11 = load i32*, i32** %.global_tid..addr.i, align 8, !noalias !13 + %12 = load i32, i32* %11, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %12, i32 92, i32* %.omp.is_last.i, i32* %.omp.comb.lb.i, i32* %.omp.comb.ub.i, i32* %.omp.stride.i, i32 1, i32 1) #5 + %13 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !13 + %cmp.i = icmp sgt i32 %13, 9 + br i1 %cmp.i, label %cond.true.i, label %cond.false.i + +cond.true.i: ; preds = %.master + br label %cond.end.i + +cond.false.i: ; preds = %.master + %14 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !13 + br label %cond.end.i + +cond.end.i: ; preds = %cond.false.i, %cond.true.i + %cond.i = phi i32 [ 9, %cond.true.i ], [ %14, %cond.false.i ] + store i32 %cond.i, i32* %.omp.comb.ub.i, align 4, !noalias !13 + %15 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !13 + store i32 %15, i32* %.omp.iv.i, align 4, !noalias !13 + br label %omp.inner.for.cond.i + +omp.inner.for.cond.i: ; preds = %omp.inner.for.body.i, %cond.end.i + %16 = load i32, i32* %.omp.iv.i, align 4, !noalias !13 + %17 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !13 + %cmp2.i = icmp sle i32 %16, %17 + br i1 %cmp2.i, label %omp.inner.for.body.i, label %__omp_outlined__.exit + +omp.inner.for.body.i: ; preds = %omp.inner.for.cond.i + %18 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !13 + %19 = zext i32 %18 to i64 + %20 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !13 + %21 = zext i32 %20 to i64 + call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i16 1) #5 + call void @__kmpc_begin_sharing_variables(i8*** %shared_arg_refs.i, i64 3) #5 + %22 = load i8**, i8*** %shared_arg_refs.i, align 8, !noalias !13 + %23 = inttoptr i64 %19 to i8* + store i8* %23, i8** %22, align 8 + %24 = getelementptr inbounds i8*, i8** %22, i64 1 + %25 = inttoptr i64 %21 to i8* + store i8* %25, i8** %24, align 8 + %26 = getelementptr inbounds i8*, i8** %22, i64 2 + %27 = bitcast i32** %dis1.i to i8* + store i8* %27, i8** %26, align 8 + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) #5 + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) #5 + call void @__kmpc_end_sharing_variables() #5 + %28 = load i32, i32* %.omp.iv.i, align 4, !noalias !13 + %29 = load i32, i32* %.omp.stride.i, align 4, !noalias !13 + %add.i = add nsw i32 %28, %29 + store i32 %add.i, i32* %.omp.iv.i, align 4, !noalias !13 + br label %omp.inner.for.cond.i + +__omp_outlined__.exit: ; preds = %omp.inner.for.cond.i + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %12) #5 + call void 
@__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i16 1) #5 + call void @__kmpc_begin_sharing_variables(i8*** %shared_arg_refs4.i, i64 1) #5 + %30 = load i8**, i8*** %shared_arg_refs4.i, align 8, !noalias !13 + %31 = bitcast i32** %dis1.i to i8* + store i8* %31, i8** %30, align 8 + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) #5 + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) #5 + call void @__kmpc_end_sharing_variables() #5 + %32 = load i32*, i32** %team.addr.i, align 8, !noalias !13 + %call.i = call i32 @omp_get_team_num() #5 + %idxprom.i = sext i32 %call.i to i64 + %arrayidx.i = getelementptr inbounds i32, i32* %32, i64 %idxprom.i + %33 = load i32, i32* %arrayidx.i, align 4 + %add5.i = add nsw i32 %33, 1 + store i32 %add5.i, i32* %arrayidx.i, align 4 + call void @__kmpc_restore_team_static_memory(i16 0, i16 1) #5 + br label %.termination.notifier + +.termination.notifier: ; preds = %__omp_outlined__.exit + call void @__kmpc_kernel_deinit(i16 1) + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) + br label %.exit + +.exit: ; preds = %.termination.notifier, %.mastercheck, %.worker + ret void +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.warpsize() #2 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #2 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2 + +declare void @__kmpc_kernel_init(i32, i16) + +declare void @__kmpc_data_sharing_init_stack() + +declare void @__kmpc_get_team_static_memory(i16, i8*, i64, i16, i8**) + +declare void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) + +; Function Attrs: noinline norecurse nounwind optnone +define internal void @__omp_outlined__1(i32* noalias %.global_tid., i32* noalias %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., i32** dereferenceable(8) %dis) #1 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %.previous.lb..addr = alloca i64, align 8 + %.previous.ub..addr = alloca i64, align 8 + %dis.addr = alloca i32**, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i64 %.previous.lb., i64* %.previous.lb..addr, align 8 + store i64 %.previous.ub., i64* %.previous.ub..addr, align 8 + store i32** %dis, i32*** %dis.addr, align 8 + %0 = load i32**, i32*** %dis.addr, align 8 + store i32 0, i32* %.omp.lb, align 4 + store i32 9, i32* %.omp.ub, align 4 + %1 = load i64, i64* %.previous.lb..addr, align 8 + %conv = trunc i64 %1 to i32 + %2 = load i64, i64* %.previous.ub..addr, align 8 + %conv1 = trunc i64 %2 to i32 + store i32 %conv, i32* %.omp.lb, align 4 + store i32 %conv1, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %3 = load i32*, i32** %.global_tid..addr, align 8 + %4 = load i32, i32* %3, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %4, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %5 = load i32, i32* %.omp.lb, align 4 + store i32 %5, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + 
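+; Note: the blocks below form the per-thread portion of the first
+; `distribute parallel for`: __kmpc_for_static_init_4 hands each thread its
+; static chunk, and the `#pragma omp atomic` increment is lowered to the
+; monotonic `atomicrmw add` in %omp.inner.for.body.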
+omp.inner.for.cond: ; preds = %omp.inner.for.inc, %entry + %6 = load i32, i32* %.omp.iv, align 4 + %conv2 = sext i32 %6 to i64 + %7 = load i64, i64* %.previous.ub..addr, align 8 + %cmp = icmp ule i64 %conv2, %7 + br i1 %cmp, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %8 = load i32, i32* %.omp.iv, align 4 + %mul = mul nsw i32 %8, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %i, align 4 + %9 = load i32*, i32** %0, align 8 + %10 = load i32, i32* %i, align 4 + %idxprom = sext i32 %10 to i64 + %arrayidx = getelementptr inbounds i32, i32* %9, i64 %idxprom + %11 = atomicrmw add i32* %arrayidx, i32 1 monotonic + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %12 = load i32, i32* %.omp.iv, align 4 + %13 = load i32, i32* %.omp.stride, align 4 + %add4 = add nsw i32 %12, %13 + store i32 %add4, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %4) + ret void +} + +declare void @__kmpc_for_static_fini(%struct.ident_t*, i32) + +; Function Attrs: noinline norecurse nounwind +define internal void @__omp_outlined__1_wrapper(i16 zeroext, i32) #0 { +entry: + %.addr = alloca i16, align 2 + %.addr1 = alloca i32, align 4 + %.zero.addr = alloca i32, align 4 + %global_args = alloca i8**, align 8 + store i32 0, i32* %.zero.addr, align 4 + store i16 %0, i16* %.addr, align 2 + store i32 %1, i32* %.addr1, align 4 + call void @__kmpc_get_shared_variables(i8*** %global_args) + %2 = load i8**, i8*** %global_args, align 8 + %3 = getelementptr inbounds i8*, i8** %2, i64 0 + %4 = bitcast i8** %3 to i64* + %5 = load i64, i64* %4, align 8 + %6 = getelementptr inbounds i8*, i8** %2, i64 1 + %7 = bitcast i8** %6 to i64* + %8 = load i64, i64* %7, align 8 + %9 = getelementptr inbounds i8*, i8** %2, i64 2 + %10 = bitcast i8** %9 to i32*** + %11 = load i32**, i32*** %10, align 8 + call void @__omp_outlined__1(i32* %.addr1, i32* %.zero.addr, i64 %5, i64 %8, i32** %11) #5 + ret void +} + +declare void @__kmpc_get_shared_variables(i8***) + +declare void @__kmpc_kernel_prepare_parallel(i8*, i16) + +declare void @__kmpc_begin_sharing_variables(i8***, i64) + +; Function Attrs: convergent +declare void @__kmpc_barrier_simple_spmd(%struct.ident_t*, i32) #3 + +declare void @__kmpc_end_sharing_variables() + +; Function Attrs: noinline norecurse nounwind optnone +define internal void @__omp_outlined__2(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32** dereferenceable(8) %dis) #1 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %dis.addr = alloca i32**, align 8 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i32** %dis, i32*** %dis.addr, align 8 + %0 = load i32**, i32*** %dis.addr, align 8 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %1 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %1, 10 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %2 = load i32*, i32** %0, align 8 + %3 = load i32, i32* %i, align 4 + %idxprom = sext i32 %3 to i64 + %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom + %4 = atomicrmw add 
i32* %arrayidx, i32 1 monotonic + br label %for.inc + +for.inc: ; preds = %for.body + %5 = load i32, i32* %i, align 4 + %inc = add nsw i32 %5, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: noinline norecurse nounwind +define internal void @__omp_outlined__2_wrapper(i16 zeroext, i32) #0 { +entry: + %.addr = alloca i16, align 2 + %.addr1 = alloca i32, align 4 + %.zero.addr = alloca i32, align 4 + %global_args = alloca i8**, align 8 + store i32 0, i32* %.zero.addr, align 4 + store i16 %0, i16* %.addr, align 2 + store i32 %1, i32* %.addr1, align 4 + call void @__kmpc_get_shared_variables(i8*** %global_args) + %2 = load i8**, i8*** %global_args, align 8 + %3 = getelementptr inbounds i8*, i8** %2, i64 0 + %4 = bitcast i8** %3 to i32*** + %5 = load i32**, i32*** %4, align 8 + call void @__omp_outlined__2(i32* %.addr1, i32* %.zero.addr, i32** %5) #5 + ret void +} + +declare i32 @omp_get_team_num() #4 + +declare void @__kmpc_restore_team_static_memory(i16, i16) + +declare i32 @__kmpc_global_thread_num(%struct.ident_t*) + +declare void @__kmpc_kernel_deinit(i16) + +declare i1 @__kmpc_kernel_parallel(i8**, i16) + +declare void @__kmpc_kernel_end_parallel() + +attributes #0 = { noinline norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx61,+sm_70" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { noinline norecurse nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx61,+sm_70" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind readnone } +attributes #3 = { convergent } +attributes #4 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx61,+sm_70" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #5 = { nounwind } + +!omp_offload.info = !{!0} +!nvvm.annotations = !{!1, !2, !3, !2, !4, !4, !4, !4, !5, !5, !4} +!llvm.module.flags = !{!6, !7} +!llvm.ident = !{!8} +!nvvm.internalize.after.link = !{} +!nvvmir.version = !{!9} + +!0 = !{i32 0, i32 43, i32 21153163, !"foo", i32 10, i32 0} +!1 = !{void (i32*, i32*)* @__omp_offloading_2b_142c58b_foo_l10, !"kernel", i32 1} +!2 = !{null, !"align", i32 8} +!3 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +!4 = !{null, !"align", i32 16} +!5 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +!6 = !{i32 1, !"wchar_size", i32 4} +!7 = !{i32 7, !"PIC Level", i32 2} +!8 = !{!"clang version 9.0.0 "} +!9 = !{i32 1, i32 2} +!10 = !{i32 
32, i32 33} +!11 = !{i32 1, i32 1025} +!12 = !{i32 0, i32 1024} +!13 = !{!14, !16} +!14 = distinct !{!14, !15, !"__omp_outlined__: %.global_tid."} +!15 = distinct !{!15, !"__omp_outlined__"} +!16 = distinct !{!16, !15, !"__omp_outlined__: %.bound_tid."} + +; __CLANG_OFFLOAD_BUNDLE____END__ openmp-nvptx64-nvida-cuda + +; __CLANG_OFFLOAD_BUNDLE____START__ host-x86_64-unknown-linux-gnu +; ModuleID = '/tmp/jdoerfert/target_offload_not_SPMD-16350b.bc' +source_filename = "/home/jdoerfert/projects/llvm-project/llvm/test/Transforms/OpenMP/target_offload_not_SPMD.c" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } +%struct.__tgt_offload_entry = type { i8*, i8*, i64, i32, i32 } +%struct.__tgt_device_image = type { i8*, i8*, %struct.__tgt_offload_entry*, %struct.__tgt_offload_entry* } +%struct.__tgt_bin_desc = type { i32, %struct.__tgt_device_image*, %struct.__tgt_offload_entry*, %struct.__tgt_offload_entry* } + +$.omp_offloading.descriptor_reg.nvptx64-nvida-cuda = comdat any + +@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr global %struct.ident_t { i32 0, i32 2050, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@1 = private unnamed_addr global %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@2 = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@.__omp_offloading_2b_142c58b_foo_l10.region_id = weak constant i8 0 +@.offload_sizes = private unnamed_addr constant [2 x i64] [i64 40, i64 40] +@.offload_maptypes = private unnamed_addr constant [2 x i64] [i64 35, i64 35] +@.str.3 = private unnamed_addr constant [33 x i8] c"dis[%3i] = %4i\09\09team[%3i] = %4i\0A\00", align 1 +@.omp_offloading.entry_name = internal unnamed_addr constant [36 x i8] c"__omp_offloading_2b_142c58b_foo_l10\00" +@.omp_offloading.entry.__omp_offloading_2b_142c58b_foo_l10 = weak constant %struct.__tgt_offload_entry { i8* @.__omp_offloading_2b_142c58b_foo_l10.region_id, i8* getelementptr inbounds ([36 x i8], [36 x i8]* @.omp_offloading.entry_name, i32 0, i32 0), i64 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +@.omp_offloading.entries_begin = external constant %struct.__tgt_offload_entry +@.omp_offloading.entries_end = external constant %struct.__tgt_offload_entry +@.omp_offloading.img_start.nvptx64-nvida-cuda = extern_weak constant i8 +@.omp_offloading.img_end.nvptx64-nvida-cuda = extern_weak constant i8 +@.omp_offloading.device_images = internal unnamed_addr constant [1 x %struct.__tgt_device_image] [%struct.__tgt_device_image { i8* @.omp_offloading.img_start.nvptx64-nvida-cuda, i8* @.omp_offloading.img_end.nvptx64-nvida-cuda, %struct.__tgt_offload_entry* @.omp_offloading.entries_begin, %struct.__tgt_offload_entry* @.omp_offloading.entries_end }], comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cuda), align 8 +@.omp_offloading.descriptor = internal constant %struct.__tgt_bin_desc { i32 1, %struct.__tgt_device_image* getelementptr inbounds ([1 x %struct.__tgt_device_image], [1 x %struct.__tgt_device_image]* @.omp_offloading.device_images, i32 0, i32 0), %struct.__tgt_offload_entry* @.omp_offloading.entries_begin, %struct.__tgt_offload_entry* @.omp_offloading.entries_end }, 
comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cuda), align 8 +@__dso_handle = external hidden global i8 +@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 0, void ()* @.omp_offloading.descriptor_reg.nvptx64-nvida-cuda, i8* bitcast (void ()* @.omp_offloading.descriptor_reg.nvptx64-nvida-cuda to i8*) }] + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local void @foo(i32* %dis, i32* %team) #0 { +entry: + %dis.addr = alloca i32*, align 8 + %team.addr = alloca i32*, align 8 + %.offload_baseptrs = alloca [2 x i8*], align 8 + %.offload_ptrs = alloca [2 x i8*], align 8 + store i32* %dis, i32** %dis.addr, align 8 + store i32* %team, i32** %team.addr, align 8 + %0 = load i32*, i32** %dis.addr, align 8 + %1 = load i32*, i32** %team.addr, align 8 + %2 = load i32*, i32** %dis.addr, align 8 + %3 = load i32*, i32** %dis.addr, align 8 + %arrayidx = getelementptr inbounds i32, i32* %3, i64 0 + %4 = load i32*, i32** %team.addr, align 8 + %5 = load i32*, i32** %team.addr, align 8 + %arrayidx1 = getelementptr inbounds i32, i32* %5, i64 0 + %6 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i32 0, i32 0 + %7 = bitcast i8** %6 to i32** + store i32* %2, i32** %7, align 8 + %8 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i32 0, i32 0 + %9 = bitcast i8** %8 to i32** + store i32* %arrayidx, i32** %9, align 8 + %10 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i32 0, i32 1 + %11 = bitcast i8** %10 to i32** + store i32* %4, i32** %11, align 8 + %12 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i32 0, i32 1 + %13 = bitcast i8** %12 to i32** + store i32* %arrayidx1, i32** %13, align 8 + %14 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i32 0, i32 0 + %15 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i32 0, i32 0 + %16 = call i32 @__tgt_target_teams(i64 -1, i8* @.__omp_offloading_2b_142c58b_foo_l10.region_id, i32 2, i8** %14, i8** %15, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes, i32 0, i32 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes, i32 0, i32 0), i32 3, i32 0) + %17 = icmp ne i32 %16, 0 + br i1 %17, label %omp_offload.failed, label %omp_offload.cont + +omp_offload.failed: ; preds = %entry + call void @__omp_offloading_2b_142c58b_foo_l10(i32* %0, i32* %1) #4 + br label %omp_offload.cont + +omp_offload.cont: ; preds = %omp_offload.failed, %entry + ret void +} + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define internal void @__omp_offloading_2b_142c58b_foo_l10(i32* %dis, i32* %team) #1 { +entry: + %dis.addr = alloca i32*, align 8 + %team.addr = alloca i32*, align 8 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + store i32* %dis, i32** %dis.addr, align 8 + store i32* %team, i32** %team.addr, align 8 + %1 = call i32 @__kmpc_push_num_teams(%struct.ident_t* @2, i32 %0, i32 3, i32 0) + %2 = load i32*, i32** %dis.addr, align 8 + %3 = load i32*, i32** %team.addr, align 8 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%struct.ident_t* @2, i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*)* @.omp_outlined. 
to void (i32*, i32*, ...)*), i32* %2, i32* %3) + ret void +} + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define internal void @.omp_outlined.(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* %dis, i32* %team) #1 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %dis.addr = alloca i32*, align 8 + %team.addr = alloca i32*, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.comb.lb = alloca i32, align 4 + %.omp.comb.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i32* %dis, i32** %dis.addr, align 8 + store i32* %team, i32** %team.addr, align 8 + store i32 0, i32* %.omp.comb.lb, align 4 + store i32 9, i32* %.omp.comb.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %0 = load i32*, i32** %.global_tid..addr, align 8 + %1 = load i32, i32* %0, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %1, i32 92, i32* %.omp.is_last, i32* %.omp.comb.lb, i32* %.omp.comb.ub, i32* %.omp.stride, i32 1, i32 1) + %2 = load i32, i32* %.omp.comb.ub, align 4 + %cmp = icmp sgt i32 %2, 9 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + br label %cond.end + +cond.false: ; preds = %entry + %3 = load i32, i32* %.omp.comb.ub, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 9, %cond.true ], [ %3, %cond.false ] + store i32 %cond, i32* %.omp.comb.ub, align 4 + %4 = load i32, i32* %.omp.comb.lb, align 4 + store i32 %4, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %cond.end + %5 = load i32, i32* %.omp.iv, align 4 + %6 = load i32, i32* %.omp.comb.ub, align 4 + %cmp1 = icmp sle i32 %5, %6 + br i1 %cmp1, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %7 = load i32, i32* %.omp.comb.lb, align 4 + %8 = zext i32 %7 to i64 + %9 = load i32, i32* %.omp.comb.ub, align 4 + %10 = zext i32 %9 to i64 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @2, i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64, i64, i32**)* @.omp_outlined..1 to void (i32*, i32*, ...)*), i64 %8, i64 %10, i32** %dis.addr) + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.inner.for.body + %11 = load i32, i32* %.omp.iv, align 4 + %12 = load i32, i32* %.omp.stride, align 4 + %add = add nsw i32 %11, %12 + store i32 %add, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %1) + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* @2, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32**)* @.omp_outlined..2 to void (i32*, i32*, ...)*), i32** %dis.addr) + %13 = load i32*, i32** %team.addr, align 8 + %call = call i32 @omp_get_team_num() + %idxprom = sext i32 %call to i64 + %arrayidx = getelementptr inbounds i32, i32* %13, i64 %idxprom + %14 = load i32, i32* %arrayidx, align 4 + %add2 = add nsw i32 %14, 1 + store i32 %add2, i32* %arrayidx, align 4 + ret void +} + +declare dso_local void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define internal void @.omp_outlined..1(i32* noalias %.global_tid., i32* noalias %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., i32** dereferenceable(8) %dis) #1 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %.previous.lb..addr = alloca i64, align 8 + %.previous.ub..addr = alloca i64, align 8 + %dis.addr = alloca i32**, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i64 %.previous.lb., i64* %.previous.lb..addr, align 8 + store i64 %.previous.ub., i64* %.previous.ub..addr, align 8 + store i32** %dis, i32*** %dis.addr, align 8 + %0 = load i32**, i32*** %dis.addr, align 8 + store i32 0, i32* %.omp.lb, align 4 + store i32 9, i32* %.omp.ub, align 4 + %1 = load i64, i64* %.previous.lb..addr, align 8 + %conv = trunc i64 %1 to i32 + %2 = load i64, i64* %.previous.ub..addr, align 8 + %conv1 = trunc i64 %2 to i32 + store i32 %conv, i32* %.omp.lb, align 4 + store i32 %conv1, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %3 = load i32*, i32** %.global_tid..addr, align 8 + %4 = load i32, i32* %3, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %4, i32 34, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %5 = load i32, i32* %.omp.ub, align 4 + %cmp = icmp sgt i32 %5, 9 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + br label %cond.end + +cond.false: ; preds = %entry + %6 = load i32, i32* %.omp.ub, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 9, %cond.true ], [ %6, %cond.false ] + store i32 %cond, i32* %.omp.ub, align 4 + %7 = load i32, i32* %.omp.lb, align 4 + store i32 %7, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %cond.end + %8 = load i32, i32* %.omp.iv, align 4 + %9 = load i32, i32* %.omp.ub, align 4 + %cmp3 = icmp sle i32 %8, %9 + br i1 %cmp3, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %10 = load i32, i32* %.omp.iv, align 4 + %mul = mul nsw i32 %10, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %i, align 4 + %11 = load i32*, i32** %0, align 8 + %12 = load i32, i32* %i, align 4 + %idxprom = sext i32 %12 to i64 + %arrayidx = getelementptr inbounds i32, i32* %11, i64 %idxprom + %13 = atomicrmw add i32* %arrayidx, i32 1 monotonic + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + 
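+; Note: this is the host fallback lowering of the same loop body (used when
+; offloading fails); the `omp atomic` update is again the monotonic
+; `atomicrmw add` in %omp.inner.for.body above.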
+omp.inner.for.inc: ; preds = %omp.body.continue + %14 = load i32, i32* %.omp.iv, align 4 + %add5 = add nsw i32 %14, 1 + store i32 %add5, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %4) + ret void +} + +declare dso_local void @__kmpc_for_static_fini(%struct.ident_t*, i32) + +declare !callback !3 dso_local void @__kmpc_fork_call(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define internal void @.omp_outlined..2(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32** dereferenceable(8) %dis) #1 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %dis.addr = alloca i32**, align 8 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i32** %dis, i32*** %dis.addr, align 8 + %0 = load i32**, i32*** %dis.addr, align 8 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %1 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %1, 10 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %2 = load i32*, i32** %0, align 8 + %3 = load i32, i32* %i, align 4 + %idxprom = sext i32 %3 to i64 + %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom + %4 = atomicrmw add i32* %arrayidx, i32 1 monotonic + br label %for.inc + +for.inc: ; preds = %for.body + %5 = load i32, i32* %i, align 4 + %inc = add nsw i32 %5, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +declare dso_local i32 @omp_get_team_num() #2 + +declare dso_local i32 @__kmpc_global_thread_num(%struct.ident_t*) + +declare dso_local i32 @__kmpc_push_num_teams(%struct.ident_t*, i32, i32, i32) + +declare !callback !3 dso_local void @__kmpc_fork_teams(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
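+; Note: the `!callback !3` annotations on __kmpc_fork_call and
+; __kmpc_fork_teams (with !4 = {i64 2, i64 -1, i64 -1, i1 true}) describe the
+; outlined-function operand as a callback the runtime invokes with the
+; trailing variadic arguments, letting interprocedural passes see through
+; these runtime calls.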
+ +declare dso_local i32 @__tgt_target_teams(i64, i8*, i32, i8**, i8**, i64*, i64*, i32, i32) + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local i32 @main() #0 { +entry: + %retval = alloca i32, align 4 + %dis = alloca [10 x i32], align 16 + %team = alloca [10 x i32], align 16 + %i = alloca i32, align 4 + %i4 = alloca i32, align 4 + store i32 0, i32* %retval, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %0, 10 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load i32, i32* %i, align 4 + %2 = load i32, i32* %i, align 4 + %idxprom = sext i32 %2 to i64 + %arrayidx = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 %idxprom + store i32 %1, i32* %arrayidx, align 4 + %3 = load i32, i32* %i, align 4 + %idxprom1 = sext i32 %3 to i64 + %arrayidx2 = getelementptr inbounds [10 x i32], [10 x i32]* %team, i64 0, i64 %idxprom1 + store i32 0, i32* %arrayidx2, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %4 = load i32, i32* %i, align 4 + %inc = add nsw i32 %4, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %arraydecay = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i32 0, i32 0 + %arraydecay3 = getelementptr inbounds [10 x i32], [10 x i32]* %team, i32 0, i32 0 + call void @foo(i32* %arraydecay, i32* %arraydecay3) + store i32 0, i32* %i4, align 4 + br label %for.cond5 + +for.cond5: ; preds = %for.inc12, %for.end + %5 = load i32, i32* %i4, align 4 + %cmp6 = icmp slt i32 %5, 10 + br i1 %cmp6, label %for.body7, label %for.end14 + +for.body7: ; preds = %for.cond5 + %6 = load i32, i32* %i4, align 4 + %7 = load i32, i32* %i4, align 4 + %idxprom8 = sext i32 %7 to i64 + %arrayidx9 = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 %idxprom8 + %8 = load i32, i32* %arrayidx9, align 4 + %9 = load i32, i32* %i4, align 4 + %10 = load i32, i32* %i4, align 4 + %idxprom10 = sext i32 %10 to i64 + %arrayidx11 = getelementptr inbounds [10 x i32], [10 x i32]* %team, i64 0, i64 %idxprom10 + %11 = load i32, i32* %arrayidx11, align 4 + %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([33 x i8], [33 x i8]* @.str.3, i32 0, i32 0), i32 %6, i32 %8, i32 %9, i32 %11) + br label %for.inc12 + +for.inc12: ; preds = %for.body7 + %12 = load i32, i32* %i4, align 4 + %inc13 = add nsw i32 %12, 1 + store i32 %inc13, i32* %i4, align 4 + br label %for.cond5 + +for.end14: ; preds = %for.cond5 + ret i32 0 +} + +declare dso_local i32 @printf(i8*, ...) 
#2 + +; Function Attrs: noinline nounwind uwtable +define internal void @.omp_offloading.descriptor_unreg(i8*) #3 section ".text.startup" comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cuda) { +entry: + %.addr = alloca i8*, align 8 + store i8* %0, i8** %.addr, align 8 + %1 = call i32 @__tgt_unregister_lib(%struct.__tgt_bin_desc* @.omp_offloading.descriptor) + ret void +} + +declare dso_local i32 @__tgt_unregister_lib(%struct.__tgt_bin_desc*) + +; Function Attrs: noinline nounwind uwtable +define linkonce hidden void @.omp_offloading.descriptor_reg.nvptx64-nvida-cuda() #3 section ".text.startup" comdat { +entry: + %0 = call i32 @__tgt_register_lib(%struct.__tgt_bin_desc* @.omp_offloading.descriptor) + %1 = call i32 @__cxa_atexit(void (i8*)* @.omp_offloading.descriptor_unreg, i8* bitcast (%struct.__tgt_bin_desc* @.omp_offloading.descriptor to i8*), i8* @__dso_handle) #4 + ret void +} + +declare dso_local i32 @__tgt_register_lib(%struct.__tgt_bin_desc*) + +; Function Attrs: nounwind +declare dso_local i32 @__cxa_atexit(void (i8*)*, i8*, i8*) #4 + +attributes #0 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { noinline norecurse nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { nounwind } + +!omp_offload.info = !{!0} +!llvm.module.flags = !{!1} +!llvm.ident = !{!2} + +!0 = !{i32 0, i32 43, i32 21153163, !"foo", i32 10, i32 0} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{!"clang version 9.0.0 "} +!3 = !{!4} +!4 = !{i64 2, i64 -1, i64 -1, i1 true} + +; __CLANG_OFFLOAD_BUNDLE____END__ host-x86_64-unknown-linux-gnu Index: 
SPMD_examples/v0.3/target_offload_is_SPMD.O3.new.ll =================================================================== --- /dev/null +++ SPMD_examples/v0.3/target_offload_is_SPMD.O3.new.ll @@ -0,0 +1,736 @@ + +; __CLANG_OFFLOAD_BUNDLE____START__ openmp-nvptx64-nvida-cud +; ModuleID = '../SPMD_examples/v0.3/target_offload_is_SPMD.c' +source_filename = "../SPMD_examples/v0.3/target_offload_is_SPMD.c" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvida-cud" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } +%omp.shared.struct = type { i64, i64, i32* } +%omp.shared.struct.0 = type { i64, i64, i32* } + +@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2050, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@2 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@__omp_offloading_18_2852ec2_foo_l10_exec_mode = weak constant i8 0 +@llvm.compiler.used = appending global [1 x i8*] [i8* @__omp_offloading_18_2852ec2_foo_l10_exec_mode], section "llvm.metadata" + +; Function Attrs: norecurse nounwind +define weak void @__omp_offloading_18_2852ec2_foo_l10(i32* %dis) local_unnamed_addr #0 { +entry: + %.omp.comb.lb.i = alloca i32, align 4 + %.omp.comb.ub.i = alloca i32, align 4 + %.omp.stride.i = alloca i32, align 4 + %.omp.is_last.i = alloca i32, align 4 + %.captured.i = alloca %omp.shared.struct, align 8 + %.omp.comb.lb4.i = alloca i32, align 4 + %.omp.comb.ub5.i = alloca i32, align 4 + %.omp.stride6.i = alloca i32, align 4 + %.omp.is_last7.i = alloca i32, align 4 + %.captured19.i = alloca %omp.shared.struct.0, align 8 + %0 = tail call i16 @__kmpc_generic_kernel_init(i16 1, i16 1, i16 1, i16 0) #2 + %1 = icmp eq i16 %0, 1 + br i1 %1, label %.execute, label %.exit + +.execute: ; preds = %entry + %2 = tail call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @2) #2 + %3 = bitcast %omp.shared.struct* %.captured.i to i8* + call void @llvm.lifetime.start.p0i8(i64 24, i8* nonnull %3) + %4 = bitcast %omp.shared.struct.0* %.captured19.i to i8* + call void @llvm.lifetime.start.p0i8(i64 24, i8* nonnull %4) + %5 = bitcast i32* %.omp.comb.lb.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %5) #2 + store i32 0, i32* %.omp.comb.lb.i, align 4, !tbaa !5 + %6 = bitcast i32* %.omp.comb.ub.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %6) #2 + store i32 9, i32* %.omp.comb.ub.i, align 4, !tbaa !5 + %7 = bitcast i32* %.omp.stride.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %7) #2 + store i32 1, i32* %.omp.stride.i, align 4, !tbaa !5 + %8 = bitcast i32* %.omp.is_last.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %8) #2 + store i32 0, i32* %.omp.is_last.i, align 4, !tbaa !5 + call void @__kmpc_for_static_init_4(%struct.ident_t* nonnull @0, i32 %2, i32 92, i32* nonnull %.omp.is_last.i, i32* nonnull %.omp.comb.lb.i, i32* nonnull %.omp.comb.ub.i, i32* nonnull %.omp.stride.i, i32 1, i32 1) #2 + %9 = load i32, i32* %.omp.comb.ub.i, align 4, !tbaa !5 + %10 = icmp slt i32 %9, 9 + %cond.i = select i1 %10, i32 %9, i32 9 + store i32 %cond.i, i32* %.omp.comb.ub.i, align 4, !tbaa !5 + %11 
= load i32, i32* %.omp.comb.lb.i, align 4, !tbaa !5 + %cmp13.i = icmp sgt i32 %11, %cond.i + br i1 %cmp13.i, label %omp.loop.exit.i, label %omp.inner.for.body.lr.ph.i + +omp.inner.for.body.lr.ph.i: ; preds = %.execute + %12 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i64 0, i32 0 + %13 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i64 0, i32 1 + %14 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i64 0, i32 2 + %15 = zext i32 %11 to i64 + %16 = zext i32 %cond.i to i64 + store i64 %15, i64* %12, align 8 + store i64 %16, i64* %13, align 8 + store i32* %dis, i32** %14, align 8 + call fastcc void @__omp_outlined__1_wrapper(i8* %3) + %17 = load i32, i32* %.omp.stride.i, align 4, !tbaa !5 + %add.i4 = add nsw i32 %17, %11 + %18 = load i32, i32* %.omp.comb.ub.i, align 4, !tbaa !5 + %cmp1.i5 = icmp sgt i32 %add.i4, %18 + br i1 %cmp1.i5, label %omp.loop.exit.i, label %omp.inner.for.body.omp.inner.for.body_crit_edge.i + +omp.inner.for.body.omp.inner.for.body_crit_edge.i: ; preds = %omp.inner.for.body.lr.ph.i, %omp.inner.for.body.omp.inner.for.body_crit_edge.i + %19 = phi i32 [ %23, %omp.inner.for.body.omp.inner.for.body_crit_edge.i ], [ %18, %omp.inner.for.body.lr.ph.i ] + %add.i6 = phi i32 [ %add.i, %omp.inner.for.body.omp.inner.for.body_crit_edge.i ], [ %add.i4, %omp.inner.for.body.lr.ph.i ] + %.pre.i = load i32, i32* %.omp.comb.lb.i, align 4 + %20 = zext i32 %.pre.i to i64 + %21 = zext i32 %19 to i64 + store i64 %20, i64* %12, align 8 + store i64 %21, i64* %13, align 8 + store i32* %dis, i32** %14, align 8 + call fastcc void @__omp_outlined__1_wrapper(i8* %3) + %22 = load i32, i32* %.omp.stride.i, align 4, !tbaa !5 + %add.i = add nsw i32 %22, %add.i6 + %23 = load i32, i32* %.omp.comb.ub.i, align 4, !tbaa !5 + %cmp1.i = icmp sgt i32 %add.i, %23 + br i1 %cmp1.i, label %omp.loop.exit.i, label %omp.inner.for.body.omp.inner.for.body_crit_edge.i + +omp.loop.exit.i: ; preds = %omp.inner.for.body.omp.inner.for.body_crit_edge.i, %omp.inner.for.body.lr.ph.i, %.execute + call void @__kmpc_for_static_fini(%struct.ident_t* nonnull @0, i32 %2) #2 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %8) #2 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %7) #2 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %6) #2 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %5) #2 + %24 = bitcast i32* %.omp.comb.lb4.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %24) #2 + store i32 0, i32* %.omp.comb.lb4.i, align 4, !tbaa !5 + %25 = bitcast i32* %.omp.comb.ub5.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %25) #2 + store i32 9, i32* %.omp.comb.ub5.i, align 4, !tbaa !5 + %26 = bitcast i32* %.omp.stride6.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %26) #2 + store i32 1, i32* %.omp.stride6.i, align 4, !tbaa !5 + %27 = bitcast i32* %.omp.is_last7.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %27) #2 + store i32 0, i32* %.omp.is_last7.i, align 4, !tbaa !5 + call void @__kmpc_for_static_init_4(%struct.ident_t* nonnull @0, i32 %2, i32 92, i32* nonnull %.omp.is_last7.i, i32* nonnull %.omp.comb.lb4.i, i32* nonnull %.omp.comb.ub5.i, i32* nonnull %.omp.stride6.i, i32 1, i32 1) #2 + %28 = load i32, i32* %.omp.comb.ub5.i, align 4, !tbaa !5 + %29 = icmp slt i32 %28, 9 + %cond13.i = select i1 %29, i32 %28, i32 9 + store i32 %cond13.i, i32* %.omp.comb.ub5.i, align 4, !tbaa !5 + %30 = load i32, i32* %.omp.comb.lb4.i, align 4, !tbaa !5 + 
%cmp151.i = icmp sgt i32 %30, %cond13.i + br i1 %cmp151.i, label %__omp_outlined__.exit, label %omp.inner.for.body17.lr.ph.i + +omp.inner.for.body17.lr.ph.i: ; preds = %omp.loop.exit.i + %31 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %.captured19.i, i64 0, i32 0 + %32 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %.captured19.i, i64 0, i32 1 + %33 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %.captured19.i, i64 0, i32 2 + %34 = zext i32 %30 to i64 + %35 = zext i32 %cond13.i to i64 + store i64 %34, i64* %31, align 8 + store i64 %35, i64* %32, align 8 + store i32* %dis, i32** %33, align 8 + call fastcc void @__omp_outlined__2_wrapper(i8* %4) + %36 = load i32, i32* %.omp.stride6.i, align 4, !tbaa !5 + %add21.i1 = add nsw i32 %36, %30 + %37 = load i32, i32* %.omp.comb.ub5.i, align 4, !tbaa !5 + %cmp15.i2 = icmp sgt i32 %add21.i1, %37 + br i1 %cmp15.i2, label %__omp_outlined__.exit, label %omp.inner.for.body17.omp.inner.for.body17_crit_edge.i + +omp.inner.for.body17.omp.inner.for.body17_crit_edge.i: ; preds = %omp.inner.for.body17.lr.ph.i, %omp.inner.for.body17.omp.inner.for.body17_crit_edge.i + %38 = phi i32 [ %42, %omp.inner.for.body17.omp.inner.for.body17_crit_edge.i ], [ %37, %omp.inner.for.body17.lr.ph.i ] + %add21.i3 = phi i32 [ %add21.i, %omp.inner.for.body17.omp.inner.for.body17_crit_edge.i ], [ %add21.i1, %omp.inner.for.body17.lr.ph.i ] + %.pre5.i = load i32, i32* %.omp.comb.lb4.i, align 4 + %39 = zext i32 %.pre5.i to i64 + %40 = zext i32 %38 to i64 + store i64 %39, i64* %31, align 8 + store i64 %40, i64* %32, align 8 + store i32* %dis, i32** %33, align 8 + call fastcc void @__omp_outlined__2_wrapper(i8* %4) + %41 = load i32, i32* %.omp.stride6.i, align 4, !tbaa !5 + %add21.i = add nsw i32 %41, %add21.i3 + %42 = load i32, i32* %.omp.comb.ub5.i, align 4, !tbaa !5 + %cmp15.i = icmp sgt i32 %add21.i, %42 + br i1 %cmp15.i, label %__omp_outlined__.exit, label %omp.inner.for.body17.omp.inner.for.body17_crit_edge.i + +__omp_outlined__.exit: ; preds = %omp.inner.for.body17.omp.inner.for.body17_crit_edge.i, %omp.inner.for.body17.lr.ph.i, %omp.loop.exit.i + call void @__kmpc_for_static_fini(%struct.ident_t* nonnull @0, i32 %2) #2 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %27) #2 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %26) #2 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %25) #2 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %24) #2 + call void @llvm.lifetime.end.p0i8(i64 24, i8* nonnull %3) + call void @llvm.lifetime.end.p0i8(i64 24, i8* nonnull %4) + call void @__kmpc_generic_kernel_deinit(i16 1, i16 1) #2 + br label %.exit + +.exit: ; preds = %__omp_outlined__.exit, %entry + ret void +} + +declare i16 @__kmpc_generic_kernel_init(i16, i16, i16, i16) local_unnamed_addr + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1 + +declare void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) local_unnamed_addr + +declare void @__kmpc_for_static_fini(%struct.ident_t*, i32) local_unnamed_addr + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1 + +; Function Attrs: norecurse nounwind +define internal fastcc void @__omp_outlined__1_wrapper(i8* nocapture readonly %payload) unnamed_addr #0 { +entry: + %.omp.lb.i = alloca i32, align 4 + %.omp.ub.i = alloca i32, align 4 + %.omp.stride.i = alloca i32, align 4 + %.omp.is_last.i = alloca i32, align 4 + %0 = 
tail call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @2) #2 + %1 = bitcast i8* %payload to i64* + %2 = load i64, i64* %1, align 1 + %3 = getelementptr inbounds i8, i8* %payload, i64 8 + %4 = bitcast i8* %3 to i64* + %5 = load i64, i64* %4, align 1 + %6 = getelementptr inbounds i8, i8* %payload, i64 16 + %7 = bitcast i8* %6 to i32** + %8 = load i32*, i32** %7, align 1 + %9 = bitcast i32* %.omp.lb.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %9) #2 + %10 = bitcast i32* %.omp.ub.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %10) #2 + %conv.i = trunc i64 %2 to i32 + %conv1.i = trunc i64 %5 to i32 + store i32 %conv.i, i32* %.omp.lb.i, align 4, !tbaa !5 + store i32 %conv1.i, i32* %.omp.ub.i, align 4, !tbaa !5 + %11 = bitcast i32* %.omp.stride.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %11) #2 + store i32 1, i32* %.omp.stride.i, align 4, !tbaa !5 + %12 = bitcast i32* %.omp.is_last.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %12) #2 + store i32 0, i32* %.omp.is_last.i, align 4, !tbaa !5 + call void @__kmpc_for_static_init_4(%struct.ident_t* nonnull @1, i32 %0, i32 33, i32* nonnull %.omp.is_last.i, i32* nonnull %.omp.lb.i, i32* nonnull %.omp.ub.i, i32* nonnull %.omp.stride.i, i32 1, i32 1) #2 + %13 = load i32, i32* %.omp.lb.i, align 4, !tbaa !5 + %conv21.i = sext i32 %13 to i64 + %cmp2.i = icmp ult i64 %5, %conv21.i + br i1 %cmp2.i, label %__omp_outlined__1.exit, label %omp.inner.for.body.lr.ph.i + +omp.inner.for.body.lr.ph.i: ; preds = %entry + %14 = load i32, i32* %.omp.stride.i, align 4, !tbaa !5 + %15 = sext i32 %14 to i64 + br label %omp.inner.for.body.i + +omp.inner.for.body.i: ; preds = %omp.inner.for.body.i, %omp.inner.for.body.lr.ph.i + %indvars.iv.i = phi i64 [ %conv21.i, %omp.inner.for.body.lr.ph.i ], [ %indvars.iv.next.i, %omp.inner.for.body.i ] + %arrayidx.i = getelementptr inbounds i32, i32* %8, i64 %indvars.iv.i + %16 = atomicrmw add i32* %arrayidx.i, i32 1 monotonic + %indvars.iv.next.i = add i64 %indvars.iv.i, %15 + %cmp.i = icmp ugt i64 %indvars.iv.next.i, %5 + br i1 %cmp.i, label %__omp_outlined__1.exit, label %omp.inner.for.body.i + +__omp_outlined__1.exit: ; preds = %omp.inner.for.body.i, %entry + call void @__kmpc_for_static_fini(%struct.ident_t* nonnull @0, i32 %0) #2 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %12) #2 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %11) #2 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %10) #2 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %9) #2 + ret void +} + +declare i32 @__kmpc_global_thread_num(%struct.ident_t*) local_unnamed_addr + +; Function Attrs: norecurse nounwind +define internal fastcc void @__omp_outlined__2_wrapper(i8* nocapture readonly %payload) unnamed_addr #0 { +entry: + %.omp.lb.i = alloca i32, align 4 + %.omp.ub.i = alloca i32, align 4 + %.omp.stride.i = alloca i32, align 4 + %.omp.is_last.i = alloca i32, align 4 + %0 = tail call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @2) #2 + %1 = bitcast i8* %payload to i64* + %2 = load i64, i64* %1, align 1 + %3 = getelementptr inbounds i8, i8* %payload, i64 8 + %4 = bitcast i8* %3 to i64* + %5 = load i64, i64* %4, align 1 + %6 = getelementptr inbounds i8, i8* %payload, i64 16 + %7 = bitcast i8* %6 to i32** + %8 = load i32*, i32** %7, align 1 + %9 = bitcast i32* %.omp.lb.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %9) #2 + %10 = bitcast i32* %.omp.ub.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull 
%10) #2 + %conv.i = trunc i64 %2 to i32 + %conv1.i = trunc i64 %5 to i32 + store i32 %conv.i, i32* %.omp.lb.i, align 4, !tbaa !5 + store i32 %conv1.i, i32* %.omp.ub.i, align 4, !tbaa !5 + %11 = bitcast i32* %.omp.stride.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %11) #2 + store i32 1, i32* %.omp.stride.i, align 4, !tbaa !5 + %12 = bitcast i32* %.omp.is_last.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %12) #2 + store i32 0, i32* %.omp.is_last.i, align 4, !tbaa !5 + call void @__kmpc_for_static_init_4(%struct.ident_t* nonnull @1, i32 %0, i32 33, i32* nonnull %.omp.is_last.i, i32* nonnull %.omp.lb.i, i32* nonnull %.omp.ub.i, i32* nonnull %.omp.stride.i, i32 1, i32 1) #2 + %13 = load i32, i32* %.omp.lb.i, align 4, !tbaa !5 + %conv21.i = sext i32 %13 to i64 + %cmp2.i = icmp ult i64 %5, %conv21.i + br i1 %cmp2.i, label %__omp_outlined__2.exit, label %omp.inner.for.body.lr.ph.i + +omp.inner.for.body.lr.ph.i: ; preds = %entry + %14 = load i32, i32* %.omp.stride.i, align 4, !tbaa !5 + %15 = sext i32 %14 to i64 + br label %omp.inner.for.body.i + +omp.inner.for.body.i: ; preds = %omp.inner.for.body.i, %omp.inner.for.body.lr.ph.i + %indvars.iv.i = phi i64 [ %conv21.i, %omp.inner.for.body.lr.ph.i ], [ %indvars.iv.next.i, %omp.inner.for.body.i ] + %arrayidx.i = getelementptr inbounds i32, i32* %8, i64 %indvars.iv.i + %16 = atomicrmw add i32* %arrayidx.i, i32 1 monotonic + %indvars.iv.next.i = add i64 %indvars.iv.i, %15 + %cmp.i = icmp ugt i64 %indvars.iv.next.i, %5 + br i1 %cmp.i, label %__omp_outlined__2.exit, label %omp.inner.for.body.i + +__omp_outlined__2.exit: ; preds = %omp.inner.for.body.i, %entry + call void @__kmpc_for_static_fini(%struct.ident_t* nonnull @0, i32 %0) #2 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %12) #2 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %11) #2 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %10) #2 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %9) #2 + ret void +} + +declare void @__kmpc_generic_kernel_deinit(i16, i16) local_unnamed_addr + +attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind } +attributes #2 = { nounwind } + +!omp_offload.info = !{!0} +!nvvm.annotations = !{!1} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = !{i32 0, i32 24, i32 42282690, !"foo", i32 10, i32 0} +!1 = !{void (i32*)* @__omp_offloading_18_2852ec2_foo_l10, !"kernel", i32 1} +!2 = !{i32 1, !"wchar_size", i32 4} +!3 = !{i32 7, !"PIC Level", i32 2} +!4 = !{!"clang version 9.0.0 "} +!5 = !{!6, !6, i64 0} +!6 = !{!"int", !7, i64 0} +!7 = !{!"omnipotent char", !8, i64 0} +!8 = !{!"Simple C/C++ TBAA"} + +; __CLANG_OFFLOAD_BUNDLE____END__ openmp-nvptx64-nvida-cud + +; __CLANG_OFFLOAD_BUNDLE____START__ host-x86_64-unknown-linux-gnu +; ModuleID = '/tmp/johannes/target_offload_is_SPMD-241dc4.bc' +source_filename = "../SPMD_examples/v0.3/target_offload_is_SPMD.c" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.ident_t = type { i32, i32, i32, 
i32, i8* } +%struct.__tgt_offload_entry = type { i8*, i8*, i64, i32, i32 } +%struct.__tgt_device_image = type { i8*, i8*, %struct.__tgt_offload_entry*, %struct.__tgt_offload_entry* } +%struct.__tgt_bin_desc = type { i32, %struct.__tgt_device_image*, %struct.__tgt_offload_entry*, %struct.__tgt_offload_entry* } + +$.omp_offloading.descriptor_reg.nvptx64-nvida-cud = comdat any + +@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr global %struct.ident_t { i32 0, i32 2050, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@1 = private unnamed_addr global %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@2 = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@.__omp_offloading_18_2852ec2_foo_l10.region_id = weak constant i8 0 +@.offload_sizes = private unnamed_addr constant [1 x i64] [i64 40] +@.offload_maptypes = private unnamed_addr constant [1 x i64] [i64 35] +@.str.3 = private unnamed_addr constant [16 x i8] c"dis[%3i] = %4i\0A\00", align 1 +@.omp_offloading.entry_name = internal unnamed_addr constant [36 x i8] c"__omp_offloading_18_2852ec2_foo_l10\00" +@.omp_offloading.entry.__omp_offloading_18_2852ec2_foo_l10 = weak local_unnamed_addr constant %struct.__tgt_offload_entry { i8* @.__omp_offloading_18_2852ec2_foo_l10.region_id, i8* getelementptr inbounds ([36 x i8], [36 x i8]* @.omp_offloading.entry_name, i32 0, i32 0), i64 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +@.omp_offloading.entries_begin = external constant %struct.__tgt_offload_entry +@.omp_offloading.entries_end = external constant %struct.__tgt_offload_entry +@.omp_offloading.img_start.nvptx64-nvida-cud = extern_weak constant i8 +@.omp_offloading.img_end.nvptx64-nvida-cud = extern_weak constant i8 +@.omp_offloading.device_images = internal unnamed_addr constant [1 x %struct.__tgt_device_image] [%struct.__tgt_device_image { i8* @.omp_offloading.img_start.nvptx64-nvida-cud, i8* @.omp_offloading.img_end.nvptx64-nvida-cud, %struct.__tgt_offload_entry* @.omp_offloading.entries_begin, %struct.__tgt_offload_entry* @.omp_offloading.entries_end }], comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cud), align 8 +@.omp_offloading.descriptor = internal constant %struct.__tgt_bin_desc { i32 1, %struct.__tgt_device_image* getelementptr inbounds ([1 x %struct.__tgt_device_image], [1 x %struct.__tgt_device_image]* @.omp_offloading.device_images, i32 0, i32 0), %struct.__tgt_offload_entry* @.omp_offloading.entries_begin, %struct.__tgt_offload_entry* @.omp_offloading.entries_end }, comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cud), align 8 +@__dso_handle = external hidden global i8 +@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 0, void ()* @.omp_offloading.descriptor_reg.nvptx64-nvida-cud, i8* bitcast (void ()* @.omp_offloading.descriptor_reg.nvptx64-nvida-cud to i8*) }] + +; Function Attrs: nounwind uwtable +define dso_local void @foo(i32* %dis) local_unnamed_addr #0 { +entry: + %.offload_baseptrs = alloca [1 x i8*], align 8 + %.offload_ptrs = alloca [1 x i8*], align 8 + %0 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs, i64 0, i64 0 + %1 = bitcast [1 x i8*]* %.offload_baseptrs to i32** + store i32* %dis, i32** %1, align 8 + %2 = getelementptr inbounds [1 
x i8*], [1 x i8*]* %.offload_ptrs, i64 0, i64 0 + %3 = bitcast [1 x i8*]* %.offload_ptrs to i32** + store i32* %dis, i32** %3, align 8 + %4 = call i32 @__tgt_target_teams(i64 -1, i8* nonnull @.__omp_offloading_18_2852ec2_foo_l10.region_id, i32 1, i8** nonnull %0, i8** nonnull %2, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0), i32 3, i32 0) #4 + %5 = icmp eq i32 %4, 0 + br i1 %5, label %omp_offload.cont, label %omp_offload.failed + +omp_offload.failed: ; preds = %entry + %6 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @2) #4 + %7 = call i32 @__kmpc_push_num_teams(%struct.ident_t* nonnull @2, i32 %6, i32 3, i32 0) #4 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%struct.ident_t* nonnull @2, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined. to void (i32*, i32*, ...)*), i32* %dis) #4 + br label %omp_offload.cont + +omp_offload.cont: ; preds = %entry, %omp_offload.failed + ret void +} + +; Function Attrs: norecurse nounwind uwtable +define internal void @.omp_outlined.(i32* noalias nocapture readonly %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* %dis) #1 { +entry: + %.omp.comb.lb = alloca i32, align 4 + %.omp.comb.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %.omp.comb.lb4 = alloca i32, align 4 + %.omp.comb.ub5 = alloca i32, align 4 + %.omp.stride6 = alloca i32, align 4 + %.omp.is_last7 = alloca i32, align 4 + %0 = bitcast i32* %.omp.comb.lb to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %0) #4 + store i32 0, i32* %.omp.comb.lb, align 4, !tbaa !3 + %1 = bitcast i32* %.omp.comb.ub to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %1) #4 + store i32 9, i32* %.omp.comb.ub, align 4, !tbaa !3 + %2 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %2) #4 + store i32 1, i32* %.omp.stride, align 4, !tbaa !3 + %3 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %3) #4 + store i32 0, i32* %.omp.is_last, align 4, !tbaa !3 + %4 = load i32, i32* %.global_tid., align 4, !tbaa !3 + call void @__kmpc_for_static_init_4(%struct.ident_t* nonnull @0, i32 %4, i32 92, i32* nonnull %.omp.is_last, i32* nonnull %.omp.comb.lb, i32* nonnull %.omp.comb.ub, i32* nonnull %.omp.stride, i32 1, i32 1) #4 + %5 = load i32, i32* %.omp.comb.ub, align 4, !tbaa !3 + %6 = icmp slt i32 %5, 9 + %cond = select i1 %6, i32 %5, i32 9 + store i32 %cond, i32* %.omp.comb.ub, align 4, !tbaa !3 + %7 = load i32, i32* %.omp.comb.lb, align 4, !tbaa !3 + %cmp132 = icmp sgt i32 %7, %cond + br i1 %cmp132, label %omp.loop.exit, label %omp.inner.for.body.preheader + +omp.inner.for.body.preheader: ; preds = %entry + %8 = zext i32 %7 to i64 + %9 = zext i32 %cond to i64 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* nonnull @2, i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64, i64, i32*)* @.omp_outlined..1 to void (i32*, i32*, ...)*), i64 %8, i64 %9, i32* %dis) #4 + %10 = load i32, i32* %.omp.stride, align 4, !tbaa !3 + %add38 = add nsw i32 %10, %7 + %11 = load i32, i32* %.omp.comb.ub, align 4, !tbaa !3 + %cmp139 = icmp sgt i32 %add38, %11 + br i1 %cmp139, label %omp.loop.exit, label %omp.inner.for.body.omp.inner.for.body_crit_edge + +omp.inner.for.body.omp.inner.for.body_crit_edge: ; preds = %omp.inner.for.body.preheader, %omp.inner.for.body.omp.inner.for.body_crit_edge + %12 = phi i32 [ %16, %omp.inner.for.body.omp.inner.for.body_crit_edge ], [ %11, %omp.inner.for.body.preheader ] + %add40 = phi i32 [ %add, %omp.inner.for.body.omp.inner.for.body_crit_edge ], [ %add38, %omp.inner.for.body.preheader ] + %.pre = load i32, i32* %.omp.comb.lb, align 4 + %13 = zext i32 %.pre to i64 + %14 = zext i32 %12 to i64 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @2, i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64, i64, i32*)* @.omp_outlined..1 to void (i32*, i32*, ...)*), i64 %13, i64 %14, i32* %dis) #4 + %15 = load i32, i32* %.omp.stride, align 4, !tbaa !3 + %add = add nsw i32 %15, %add40 + %16 = load i32, i32* %.omp.comb.ub, align 4, !tbaa !3 + %cmp1 = icmp sgt i32 %add, %16 + br i1 %cmp1, label %omp.loop.exit, label %omp.inner.for.body.omp.inner.for.body_crit_edge + +omp.loop.exit: ; preds = %omp.inner.for.body.omp.inner.for.body_crit_edge, %omp.inner.for.body.preheader, %entry + call void @__kmpc_for_static_fini(%struct.ident_t* nonnull @0, i32 %4) #4 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %3) #4 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %2) #4 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %1) #4 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %0) #4 + %17 = bitcast i32* %.omp.comb.lb4 to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %17) #4 + store i32 0, i32* %.omp.comb.lb4, align 4, !tbaa !3 + %18 = bitcast i32* %.omp.comb.ub5 to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %18) #4 + store i32 9, i32* %.omp.comb.ub5, align 4, !tbaa !3 + %19 = bitcast i32* %.omp.stride6 to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %19) #4 + store i32 1, i32* %.omp.stride6, align 4, !tbaa !3 + %20 = bitcast i32* %.omp.is_last7 to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %20) #4 + store i32 0, i32* %.omp.is_last7, align 4, !tbaa !3 + call void @__kmpc_for_static_init_4(%struct.ident_t* nonnull @0, i32 %4, i32 92, i32* nonnull %.omp.is_last7, i32* nonnull %.omp.comb.lb4, i32* nonnull %.omp.comb.ub5, i32* nonnull %.omp.stride6, i32 1, i32 1) #4 + %21 = load i32, i32* %.omp.comb.ub5, align 4, !tbaa !3 + %22 = icmp slt i32 %21, 9 + %cond13 = select i1 %22, i32 %21, i32 9 + store i32 %cond13, i32* %.omp.comb.ub5, align 4, !tbaa !3 + %23 = load i32, i32* %.omp.comb.lb4, align 4, !tbaa !3 + %cmp1530 = icmp sgt i32 %23, %cond13 + br i1 %cmp1530, label %omp.loop.exit21, label %omp.inner.for.body17.preheader + +omp.inner.for.body17.preheader: ; preds = %omp.loop.exit + %24 = zext i32 %23 to i64 + %25 = zext i32 %cond13 to i64 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* nonnull @2, i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64, i64, i32*)* @.omp_outlined..2 to void (i32*, i32*, ...)*), i64 %24, i64 %25, i32* %dis) #4 + %26 = load i32, i32* %.omp.stride6, align 4, !tbaa !3 + %add1935 = add nsw i32 %26, %23 + %27 = load i32, i32* %.omp.comb.ub5, align 4, !tbaa !3 + %cmp1536 = icmp sgt i32 %add1935, %27 + br i1 %cmp1536, label %omp.loop.exit21, label %omp.inner.for.body17.omp.inner.for.body17_crit_edge + +omp.inner.for.body17.omp.inner.for.body17_crit_edge: ; preds = %omp.inner.for.body17.preheader, %omp.inner.for.body17.omp.inner.for.body17_crit_edge + %28 = phi i32 [ %32, %omp.inner.for.body17.omp.inner.for.body17_crit_edge ], [ %27, %omp.inner.for.body17.preheader ] + %add1937 = phi i32 [ %add19, %omp.inner.for.body17.omp.inner.for.body17_crit_edge ], [ %add1935, %omp.inner.for.body17.preheader ] + %.pre34 = load i32, i32* %.omp.comb.lb4, align 4 + %29 = zext i32 %.pre34 to i64 + %30 = zext i32 %28 to i64 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @2, i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64, i64, i32*)* @.omp_outlined..2 to void (i32*, i32*, ...)*), i64 %29, i64 %30, i32* %dis) #4 + %31 = load i32, i32* %.omp.stride6, align 4, !tbaa !3 + %add19 = add nsw i32 %31, %add1937 + %32 = load i32, i32* %.omp.comb.ub5, align 4, !tbaa !3 + %cmp15 = icmp sgt i32 %add19, %32 + br i1 %cmp15, label %omp.loop.exit21, label %omp.inner.for.body17.omp.inner.for.body17_crit_edge + +omp.loop.exit21: ; preds = %omp.inner.for.body17.omp.inner.for.body17_crit_edge, %omp.inner.for.body17.preheader, %omp.loop.exit + call void @__kmpc_for_static_fini(%struct.ident_t* nonnull @0, i32 %4) #4 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %20) #4 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %19) #4 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %18) #4 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %17) #4 + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #2 + +declare dso_local void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) local_unnamed_addr + +; Function Attrs: norecurse nounwind uwtable +define internal void @.omp_outlined..1(i32* noalias nocapture readonly %.global_tid., i32* noalias nocapture readnone %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., i32* nocapture %dis) #1 { +entry: + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %0 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %0) #4 + %1 = bitcast i32* %.omp.ub to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %1) #4 + %conv = trunc i64 %.previous.lb. to i32 + %conv1 = trunc i64 %.previous.ub. 
to i32 + store i32 %conv, i32* %.omp.lb, align 4, !tbaa !3 + store i32 %conv1, i32* %.omp.ub, align 4, !tbaa !3 + %2 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %2) #4 + store i32 1, i32* %.omp.stride, align 4, !tbaa !3 + %3 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %3) #4 + store i32 0, i32* %.omp.is_last, align 4, !tbaa !3 + %4 = load i32, i32* %.global_tid., align 4, !tbaa !3 + call void @__kmpc_for_static_init_4(%struct.ident_t* nonnull @1, i32 %4, i32 34, i32* nonnull %.omp.is_last, i32* nonnull %.omp.lb, i32* nonnull %.omp.ub, i32* nonnull %.omp.stride, i32 1, i32 1) #4 + %5 = load i32, i32* %.omp.ub, align 4, !tbaa !3 + %6 = icmp slt i32 %5, 9 + %cond = select i1 %6, i32 %5, i32 9 + store i32 %cond, i32* %.omp.ub, align 4, !tbaa !3 + %7 = load i32, i32* %.omp.lb, align 4, !tbaa !3 + %cmp310 = icmp sgt i32 %7, %cond + br i1 %cmp310, label %omp.loop.exit, label %omp.inner.for.body.preheader + +omp.inner.for.body.preheader: ; preds = %entry + %8 = sext i32 %7 to i64 + %9 = sext i32 %cond to i64 + br label %omp.inner.for.body + +omp.inner.for.body: ; preds = %omp.inner.for.body.preheader, %omp.inner.for.body + %indvars.iv = phi i64 [ %8, %omp.inner.for.body.preheader ], [ %indvars.iv.next, %omp.inner.for.body ] + %arrayidx = getelementptr inbounds i32, i32* %dis, i64 %indvars.iv + %10 = atomicrmw add i32* %arrayidx, i32 1 monotonic + %indvars.iv.next = add nsw i64 %indvars.iv, 1 + %cmp3 = icmp slt i64 %indvars.iv, %9 + br i1 %cmp3, label %omp.inner.for.body, label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.body, %entry + call void @__kmpc_for_static_fini(%struct.ident_t* nonnull @0, i32 %4) #4 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %3) #4 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %2) #4 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %1) #4 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %0) #4 + ret void +} + +declare dso_local void @__kmpc_for_static_fini(%struct.ident_t*, i32) local_unnamed_addr + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #2 + +declare !callback !7 dso_local void @__kmpc_fork_call(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) local_unnamed_addr + +; Function Attrs: norecurse nounwind uwtable +define internal void @.omp_outlined..2(i32* noalias nocapture readonly %.global_tid., i32* noalias nocapture readnone %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., i32* nocapture %dis) #1 { +entry: + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %0 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %0) #4 + %1 = bitcast i32* %.omp.ub to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %1) #4 + %conv = trunc i64 %.previous.lb. to i32 + %conv1 = trunc i64 %.previous.ub. 
to i32 + store i32 %conv, i32* %.omp.lb, align 4, !tbaa !3 + store i32 %conv1, i32* %.omp.ub, align 4, !tbaa !3 + %2 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %2) #4 + store i32 1, i32* %.omp.stride, align 4, !tbaa !3 + %3 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %3) #4 + store i32 0, i32* %.omp.is_last, align 4, !tbaa !3 + %4 = load i32, i32* %.global_tid., align 4, !tbaa !3 + call void @__kmpc_for_static_init_4(%struct.ident_t* nonnull @1, i32 %4, i32 34, i32* nonnull %.omp.is_last, i32* nonnull %.omp.lb, i32* nonnull %.omp.ub, i32* nonnull %.omp.stride, i32 1, i32 1) #4 + %5 = load i32, i32* %.omp.ub, align 4, !tbaa !3 + %6 = icmp slt i32 %5, 9 + %cond = select i1 %6, i32 %5, i32 9 + store i32 %cond, i32* %.omp.ub, align 4, !tbaa !3 + %7 = load i32, i32* %.omp.lb, align 4, !tbaa !3 + %cmp310 = icmp sgt i32 %7, %cond + br i1 %cmp310, label %omp.loop.exit, label %omp.inner.for.body.preheader + +omp.inner.for.body.preheader: ; preds = %entry + %8 = sext i32 %7 to i64 + %9 = sext i32 %cond to i64 + br label %omp.inner.for.body + +omp.inner.for.body: ; preds = %omp.inner.for.body.preheader, %omp.inner.for.body + %indvars.iv = phi i64 [ %8, %omp.inner.for.body.preheader ], [ %indvars.iv.next, %omp.inner.for.body ] + %arrayidx = getelementptr inbounds i32, i32* %dis, i64 %indvars.iv + %10 = atomicrmw add i32* %arrayidx, i32 1 monotonic + %indvars.iv.next = add nsw i64 %indvars.iv, 1 + %cmp3 = icmp slt i64 %indvars.iv, %9 + br i1 %cmp3, label %omp.inner.for.body, label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.body, %entry + call void @__kmpc_for_static_fini(%struct.ident_t* nonnull @0, i32 %4) #4 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %3) #4 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %2) #4 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %1) #4 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %0) #4 + ret void +} + +declare dso_local i32 @__kmpc_global_thread_num(%struct.ident_t*) local_unnamed_addr + +declare dso_local i32 @__kmpc_push_num_teams(%struct.ident_t*, i32, i32, i32) local_unnamed_addr + +declare !callback !7 dso_local void @__kmpc_fork_teams(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
local_unnamed_addr + +declare dso_local i32 @__tgt_target_teams(i64, i8*, i32, i8**, i8**, i64*, i64*, i32, i32) local_unnamed_addr + +; Function Attrs: nounwind uwtable +define dso_local i32 @main() local_unnamed_addr #0 { +entry: + %.offload_baseptrs.i = alloca [1 x i8*], align 8 + %.offload_ptrs.i = alloca [1 x i8*], align 8 + %dis = alloca [10 x i32], align 16 + %0 = bitcast [10 x i32]* %dis to i8* + call void @llvm.lifetime.start.p0i8(i64 40, i8* nonnull %0) #4 + %arrayidx = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 0 + %arrayidx.1 = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 1 + %arrayidx.2 = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 2 + %arrayidx.3 = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 3 + %1 = bitcast [10 x i32]* %dis to <4 x i32>* + store <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32>* %1, align 16, !tbaa !3 + %arrayidx.4 = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 4 + %arrayidx.5 = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 5 + %arrayidx.6 = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 6 + %arrayidx.7 = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 7 + %2 = bitcast i32* %arrayidx.4 to <4 x i32>* + store <4 x i32> <i32 4, i32 5, i32 6, i32 7>, <4 x i32>* %2, align 16, !tbaa !3 + %arrayidx.8 = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 8 + store i32 8, i32* %arrayidx.8, align 16, !tbaa !3 + %arrayidx.9 = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 9 + store i32 9, i32* %arrayidx.9, align 4, !tbaa !3 + %3 = bitcast [1 x i8*]* %.offload_baseptrs.i to i8* + call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %3) + %4 = bitcast [1 x i8*]* %.offload_ptrs.i to i8* + call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %4) + %5 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs.i, i64 0, i64 0 + %6 = bitcast [1 x i8*]* %.offload_baseptrs.i to i32** + store i32* %arrayidx, i32** %6, align 8 + %7 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_ptrs.i, i64 0, i64 0 + %8 = bitcast [1 x i8*]* %.offload_ptrs.i to i32** + store i32* %arrayidx, i32** %8, align 8 + %9 = call i32 @__tgt_target_teams(i64 -1, i8* nonnull @.__omp_offloading_18_2852ec2_foo_l10.region_id, i32 1, i8** nonnull %5, i8** nonnull %7, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0), i32 3, i32 0) #4 + %10 = icmp eq i32 %9, 0 + br i1 %10, label %foo.exit, label %omp_offload.failed.i + +omp_offload.failed.i: ; preds = %entry + %11 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @2) #4 + %12 = call i32 @__kmpc_push_num_teams(%struct.ident_t* nonnull @2, i32 %11, i32 3, i32 0) #4 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%struct.ident_t* nonnull @2, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined. to void (i32*, i32*, ...)*), i32* nonnull %arrayidx) #4 + br label %foo.exit + +foo.exit: ; preds = %entry, %omp_offload.failed.i + call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %3) + call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %4) + %13 = load i32, i32* %arrayidx, align 16, !tbaa !3 + %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 0, i32 %13) + %14 = load i32, i32* %arrayidx.1, align 4, !tbaa !3 + %call.1 = call i32 (i8*, ...)
@printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 1, i32 %14) + %15 = load i32, i32* %arrayidx.2, align 8, !tbaa !3 + %call.2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 2, i32 %15) + %16 = load i32, i32* %arrayidx.3, align 4, !tbaa !3 + %call.3 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 3, i32 %16) + %17 = load i32, i32* %arrayidx.4, align 16, !tbaa !3 + %call.4 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 4, i32 %17) + %18 = load i32, i32* %arrayidx.5, align 4, !tbaa !3 + %call.5 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 5, i32 %18) + %19 = load i32, i32* %arrayidx.6, align 8, !tbaa !3 + %call.6 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 6, i32 %19) + %20 = load i32, i32* %arrayidx.7, align 4, !tbaa !3 + %call.7 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 7, i32 %20) + %21 = load i32, i32* %arrayidx.8, align 16, !tbaa !3 + %call.8 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 8, i32 %21) + %22 = load i32, i32* %arrayidx.9, align 4, !tbaa !3 + %call.9 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 9, i32 %22) + call void @llvm.lifetime.end.p0i8(i64 40, i8* nonnull %0) #4 + ret i32 0 +} + +; Function Attrs: nounwind +declare dso_local i32 @printf(i8* nocapture readonly, ...) local_unnamed_addr #3 + +; Function Attrs: nounwind uwtable +define internal void @.omp_offloading.descriptor_unreg(i8* nocapture readnone) #0 section ".text.startup" comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cud) { +entry: + %1 = tail call i32 @__tgt_unregister_lib(%struct.__tgt_bin_desc* nonnull @.omp_offloading.descriptor) #4 + ret void +} + +declare dso_local i32 @__tgt_unregister_lib(%struct.__tgt_bin_desc*) local_unnamed_addr + +; Function Attrs: nounwind uwtable +define linkonce hidden void @.omp_offloading.descriptor_reg.nvptx64-nvida-cud() #0 section ".text.startup" comdat { +entry: + %0 = tail call i32 @__tgt_register_lib(%struct.__tgt_bin_desc* nonnull @.omp_offloading.descriptor) #4 + %1 = tail call i32 @__cxa_atexit(void (i8*)* nonnull @.omp_offloading.descriptor_unreg, i8* bitcast (%struct.__tgt_bin_desc* @.omp_offloading.descriptor to i8*), i8* nonnull @__dso_handle) #4 + ret void +} + +declare dso_local i32 @__tgt_register_lib(%struct.__tgt_bin_desc*) local_unnamed_addr + +; Function Attrs: nounwind +declare dso_local i32 @__cxa_atexit(void (i8*)*, i8*, i8*) local_unnamed_addr #4 + +attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" 
"no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { argmemonly nounwind } +attributes #3 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { nounwind } + +!omp_offload.info = !{!0} +!llvm.module.flags = !{!1} +!llvm.ident = !{!2} + +!0 = !{i32 0, i32 24, i32 42282690, !"foo", i32 10, i32 0} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{!"clang version 9.0.0 "} +!3 = !{!4, !4, i64 0} +!4 = !{!"int", !5, i64 0} +!5 = !{!"omnipotent char", !6, i64 0} +!6 = !{!"Simple C/C++ TBAA"} +!7 = !{!8} +!8 = !{i64 2, i64 -1, i64 -1, i1 true} + +; __CLANG_OFFLOAD_BUNDLE____END__ host-x86_64-unknown-linux-gnu Index: SPMD_examples/v0.3/target_offload_is_SPMD.c =================================================================== --- /dev/null +++ SPMD_examples/v0.3/target_offload_is_SPMD.c @@ -0,0 +1,36 @@ +#include +#include +#include + +#define N 10 +#define TEAMS 3 + +void foo(int* dis) { + + #pragma omp target teams num_teams(TEAMS) map(tofrom:dis[:N]) + { + #pragma omp distribute parallel for firstprivate(dis) + for (int i = 0; i < N; i++) + #pragma omp atomic + dis[i] += 1; + + #pragma omp distribute parallel for firstprivate(dis) + for (int i = 0; i < N; i++) + #pragma omp atomic + dis[i] += 1; + } +} + +int main() { + int dis[N]; + + for (int i = 0; i < N; i++) + dis[i] = i; + + foo(dis); + + for (int i = 0; i < N; i++) + printf("dis[%3i] = %4i\n", i, dis[i]); + + return 0; +} Index: SPMD_examples/v0.3/target_offload_not_SPMD.O0.new.ll =================================================================== --- /dev/null +++ SPMD_examples/v0.3/target_offload_not_SPMD.O0.new.ll @@ -0,0 +1,610 @@ + +; __CLANG_OFFLOAD_BUNDLE____START__ openmp-nvptx64-nvida-cud +; ModuleID = '../SPMD_examples/v0.3/target_offload_not_SPMD.c' +source_filename = "../SPMD_examples/v0.3/target_offload_not_SPMD.c" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvida-cud" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } +%omp.shared.struct = type { i64, i64, i32* } + +@"_openmp_kernel_static_glob_rd$ptr" = internal addrspace(3) global i8* null +@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2050, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@2 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@__omp_offloading_18_2852fc0_foo_l10_exec_mode = weak constant i8 1 +@llvm.compiler.used = appending global [1 x i8*] [i8* @__omp_offloading_18_2852fc0_foo_l10_exec_mode], section "llvm.metadata" + +; 
Function Attrs: noinline norecurse nounwind optnone +define weak void @__omp_offloading_18_2852fc0_foo_l10(i32* %dis) #0 { +entry: + %.global_tid..addr.i = alloca i32*, align 8 + %.bound_tid..addr.i = alloca i32*, align 8 + %dis.addr.i = alloca i32*, align 8 + %.omp.iv.i = alloca i32, align 4 + %tmp.i = alloca i32, align 4 + %.omp.comb.lb.i = alloca i32, align 4 + %.omp.comb.ub.i = alloca i32, align 4 + %.omp.stride.i = alloca i32, align 4 + %.omp.is_last.i = alloca i32, align 4 + %i.i = alloca i32, align 4 + %.zero.addr.i = alloca i32, align 4 + %.captured.i = alloca %omp.shared.struct, align 8 + %dis.addr = alloca i32*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. = alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i32* %dis, i32** %dis.addr, align 8 + %0 = call i16 @__kmpc_generic_kernel_init(i16 0, i16 1, i16 1, i16 0) + %1 = icmp eq i16 %0, 1 + br i1 %1, label %.execute, label %.exit + +.execute: ; preds = %entry + %2 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + %3 = load i32*, i32** %dis.addr, align 8 + store i32 %2, i32* %.threadid_temp., align 4 + store i32 0, i32* %.zero.addr.i, align 4, !noalias !5 + store i32* %.threadid_temp., i32** %.global_tid..addr.i, align 8, !noalias !5 + store i32* %.zero.addr, i32** %.bound_tid..addr.i, align 8, !noalias !5 + store i32* %3, i32** %dis.addr.i, align 8, !noalias !5 + store i32 0, i32* %.omp.comb.lb.i, align 4, !noalias !5 + store i32 9, i32* %.omp.comb.ub.i, align 4, !noalias !5 + store i32 1, i32* %.omp.stride.i, align 4, !noalias !5 + store i32 0, i32* %.omp.is_last.i, align 4, !noalias !5 + %4 = load i32*, i32** %.global_tid..addr.i, align 8, !noalias !5 + %5 = load i32, i32* %4, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %5, i32 92, i32* %.omp.is_last.i, i32* %.omp.comb.lb.i, i32* %.omp.comb.ub.i, i32* %.omp.stride.i, i32 1, i32 1) #3 + %6 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !5 + %cmp.i = icmp sgt i32 %6, 9 + br i1 %cmp.i, label %cond.true.i, label %cond.false.i + +cond.true.i: ; preds = %.execute + br label %cond.end.i + +cond.false.i: ; preds = %.execute + %7 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !5 + br label %cond.end.i + +cond.end.i: ; preds = %cond.false.i, %cond.true.i + %cond.i = phi i32 [ 9, %cond.true.i ], [ %7, %cond.false.i ] + store i32 %cond.i, i32* %.omp.comb.ub.i, align 4, !noalias !5 + %8 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !5 + store i32 %8, i32* %.omp.iv.i, align 4, !noalias !5 + br label %omp.inner.for.cond.i + +omp.inner.for.cond.i: ; preds = %omp.inner.for.body.i, %cond.end.i + %9 = load i32, i32* %.omp.iv.i, align 4, !noalias !5 + %10 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !5 + %cmp1.i = icmp sle i32 %9, %10 + br i1 %cmp1.i, label %omp.inner.for.body.i, label %__omp_outlined__.exit + +omp.inner.for.body.i: ; preds = %omp.inner.for.cond.i + %11 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !5 + %12 = zext i32 %11 to i64 + %13 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !5 + %14 = zext i32 %13 to i64 + %15 = load i32*, i32** %dis.addr.i, align 8, !noalias !5 + %16 = bitcast %omp.shared.struct* %.captured.i to i8* + %17 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i32 0, i32 0 + store i64 %12, i64* %17, !noalias !5 + %18 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i32 0, i32 1 + store i64 %14, i64* %18, !noalias !5 + %19 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, 
i32 0, i32 2 + store i32* %15, i32** %19, !noalias !5 + call void @__kmpc_generic_kernel_parallel(i8* bitcast (void (i8*)* @__omp_outlined__1_wrapper to i8*), i8* %16, i16 24, i16 1) #3 + %20 = load i32, i32* %.omp.iv.i, align 4, !noalias !5 + %21 = load i32, i32* %.omp.stride.i, align 4, !noalias !5 + %add.i = add nsw i32 %20, %21 + store i32 %add.i, i32* %.omp.iv.i, align 4, !noalias !5 + br label %omp.inner.for.cond.i + +__omp_outlined__.exit: ; preds = %omp.inner.for.cond.i + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %5) #3 + %22 = load i32*, i32** %dis.addr.i, align 8, !noalias !5 + %call.i = call i32 @omp_get_team_num() #3 + %idxprom.i = sext i32 %call.i to i64 + %arrayidx.i = getelementptr inbounds i32, i32* %22, i64 %idxprom.i + %23 = load i32, i32* %arrayidx.i, align 4 + %add2.i = add nsw i32 %23, 1 + store i32 %add2.i, i32* %arrayidx.i, align 4 + br label %.omp.deinit + +.omp.deinit: ; preds = %__omp_outlined__.exit + call void @__kmpc_generic_kernel_deinit(i16 0, i16 1) + br label %.exit + +.exit: ; preds = %.omp.deinit, %entry + ret void +} + +declare i16 @__kmpc_generic_kernel_init(i16, i16, i16, i16) + +declare void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) + +; Function Attrs: noinline norecurse nounwind optnone +define internal void @__omp_outlined__1(i32* noalias %.global_tid., i32* noalias %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., i32* %dis) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %.previous.lb..addr = alloca i64, align 8 + %.previous.ub..addr = alloca i64, align 8 + %dis.addr = alloca i32*, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i64 %.previous.lb., i64* %.previous.lb..addr, align 8 + store i64 %.previous.ub., i64* %.previous.ub..addr, align 8 + store i32* %dis, i32** %dis.addr, align 8 + store i32 0, i32* %.omp.lb, align 4 + store i32 9, i32* %.omp.ub, align 4 + %0 = load i64, i64* %.previous.lb..addr, align 8 + %conv = trunc i64 %0 to i32 + %1 = load i64, i64* %.previous.ub..addr, align 8 + %conv1 = trunc i64 %1 to i32 + store i32 %conv, i32* %.omp.lb, align 4 + store i32 %conv1, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %2 = load i32*, i32** %.global_tid..addr, align 8 + %3 = load i32, i32* %2, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %3, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %4 = load i32, i32* %.omp.lb, align 4 + store i32 %4, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %entry + %5 = load i32, i32* %.omp.iv, align 4 + %conv2 = sext i32 %5 to i64 + %6 = load i64, i64* %.previous.ub..addr, align 8 + %cmp = icmp ule i64 %conv2, %6 + br i1 %cmp, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %7 = load i32, i32* %.omp.iv, align 4 + %mul = mul nsw i32 %7, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %i, align 4 + %8 = load i32*, i32** %dis.addr, align 8 + %9 = load i32, i32* %i, align 4 + %idxprom = sext i32 %9 to i64 + %arrayidx = 
getelementptr inbounds i32, i32* %8, i64 %idxprom + %10 = atomicrmw add i32* %arrayidx, i32 1 monotonic + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %11 = load i32, i32* %.omp.iv, align 4 + %12 = load i32, i32* %.omp.stride, align 4 + %add4 = add nsw i32 %11, %12 + store i32 %add4, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %3) + ret void +} + +declare void @__kmpc_for_static_fini(%struct.ident_t*, i32) + +; Function Attrs: noinline norecurse nounwind +define internal void @__omp_outlined__1_wrapper(i8* %payload) #1 { +entry: + %.addr = alloca i8*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. = alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i8* %payload, i8** %.addr, align 8 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + store i32 %0, i32* %.threadid_temp., align 4 + %1 = load i8*, i8** %.addr, align 8 + %2 = bitcast i8* %1 to %omp.shared.struct* + %3 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %2, i32 0, i32 0 + %4 = load i64, i64* %3, align 1 + %5 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %2, i32 0, i32 1 + %6 = load i64, i64* %5, align 1 + %7 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %2, i32 0, i32 2 + %8 = load i32*, i32** %7, align 1 + call void @__omp_outlined__1(i32* %.threadid_temp., i32* %.zero.addr, i64 %4, i64 %6, i32* %8) #3 + ret void +} + +declare i32 @__kmpc_global_thread_num(%struct.ident_t*) + +declare void @__kmpc_generic_kernel_parallel(i8*, i8*, i16, i16) + +declare i32 @omp_get_team_num() #2 + +declare void @__kmpc_generic_kernel_deinit(i16, i16) + +attributes #0 = { noinline norecurse nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { noinline norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { nounwind } + +!omp_offload.info = !{!0} +!nvvm.annotations = !{!1} +!llvm.module.flags = !{!2, !3} 
+!llvm.ident = !{!4} + +!0 = !{i32 0, i32 24, i32 42282944, !"foo", i32 10, i32 0} +!1 = !{void (i32*)* @__omp_offloading_18_2852fc0_foo_l10, !"kernel", i32 1} +!2 = !{i32 1, !"wchar_size", i32 4} +!3 = !{i32 7, !"PIC Level", i32 2} +!4 = !{!"clang version 9.0.0 "} +!5 = !{!6, !8} +!6 = distinct !{!6, !7, !"__omp_outlined__: %.global_tid."} +!7 = distinct !{!7, !"__omp_outlined__"} +!8 = distinct !{!8, !7, !"__omp_outlined__: %.bound_tid."} + +; __CLANG_OFFLOAD_BUNDLE____END__ openmp-nvptx64-nvida-cud + +; __CLANG_OFFLOAD_BUNDLE____START__ host-x86_64-unknown-linux-gnu +; ModuleID = '/tmp/johannes/target_offload_not_SPMD-9b84f7.bc' +source_filename = "../SPMD_examples/v0.3/target_offload_not_SPMD.c" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } +%struct.__tgt_offload_entry = type { i8*, i8*, i64, i32, i32 } +%struct.__tgt_device_image = type { i8*, i8*, %struct.__tgt_offload_entry*, %struct.__tgt_offload_entry* } +%struct.__tgt_bin_desc = type { i32, %struct.__tgt_device_image*, %struct.__tgt_offload_entry*, %struct.__tgt_offload_entry* } + +$.omp_offloading.descriptor_reg.nvptx64-nvida-cud = comdat any + +@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr global %struct.ident_t { i32 0, i32 2050, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@1 = private unnamed_addr global %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@2 = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@.__omp_offloading_18_2852fc0_foo_l10.region_id = weak constant i8 0 +@.offload_sizes = private unnamed_addr constant [1 x i64] [i64 40] +@.offload_maptypes = private unnamed_addr constant [1 x i64] [i64 35] +@.str.2 = private unnamed_addr constant [16 x i8] c"dis[%3i] = %4i\0A\00", align 1 +@.omp_offloading.entry_name = internal unnamed_addr constant [36 x i8] c"__omp_offloading_18_2852fc0_foo_l10\00" +@.omp_offloading.entry.__omp_offloading_18_2852fc0_foo_l10 = weak constant %struct.__tgt_offload_entry { i8* @.__omp_offloading_18_2852fc0_foo_l10.region_id, i8* getelementptr inbounds ([36 x i8], [36 x i8]* @.omp_offloading.entry_name, i32 0, i32 0), i64 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +@.omp_offloading.entries_begin = external constant %struct.__tgt_offload_entry +@.omp_offloading.entries_end = external constant %struct.__tgt_offload_entry +@.omp_offloading.img_start.nvptx64-nvida-cud = extern_weak constant i8 +@.omp_offloading.img_end.nvptx64-nvida-cud = extern_weak constant i8 +@.omp_offloading.device_images = internal unnamed_addr constant [1 x %struct.__tgt_device_image] [%struct.__tgt_device_image { i8* @.omp_offloading.img_start.nvptx64-nvida-cud, i8* @.omp_offloading.img_end.nvptx64-nvida-cud, %struct.__tgt_offload_entry* @.omp_offloading.entries_begin, %struct.__tgt_offload_entry* @.omp_offloading.entries_end }], comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cud), align 8 +@.omp_offloading.descriptor = internal constant %struct.__tgt_bin_desc { i32 1, %struct.__tgt_device_image* getelementptr inbounds ([1 x %struct.__tgt_device_image], [1 x %struct.__tgt_device_image]* @.omp_offloading.device_images, i32 0, i32 0), %struct.__tgt_offload_entry* 
@.omp_offloading.entries_begin, %struct.__tgt_offload_entry* @.omp_offloading.entries_end }, comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cud), align 8 +@__dso_handle = external hidden global i8 +@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 0, void ()* @.omp_offloading.descriptor_reg.nvptx64-nvida-cud, i8* bitcast (void ()* @.omp_offloading.descriptor_reg.nvptx64-nvida-cud to i8*) }] + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local void @foo(i32* %dis) #0 { +entry: + %dis.addr = alloca i32*, align 8 + %.offload_baseptrs = alloca [1 x i8*], align 8 + %.offload_ptrs = alloca [1 x i8*], align 8 + store i32* %dis, i32** %dis.addr, align 8 + %0 = load i32*, i32** %dis.addr, align 8 + %1 = load i32*, i32** %dis.addr, align 8 + %2 = load i32*, i32** %dis.addr, align 8 + %arrayidx = getelementptr inbounds i32, i32* %2, i64 0 + %3 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs, i32 0, i32 0 + %4 = bitcast i8** %3 to i32** + store i32* %1, i32** %4, align 8 + %5 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_ptrs, i32 0, i32 0 + %6 = bitcast i8** %5 to i32** + store i32* %arrayidx, i32** %6, align 8 + %7 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs, i32 0, i32 0 + %8 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_ptrs, i32 0, i32 0 + %9 = call i32 @__tgt_target_teams(i64 -1, i8* @.__omp_offloading_18_2852fc0_foo_l10.region_id, i32 1, i8** %7, i8** %8, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes, i32 0, i32 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i32 0, i32 0), i32 3, i32 0) + %10 = icmp ne i32 %9, 0 + br i1 %10, label %omp_offload.failed, label %omp_offload.cont + +omp_offload.failed: ; preds = %entry + call void @__omp_offloading_18_2852fc0_foo_l10(i32* %0) #4 + br label %omp_offload.cont + +omp_offload.cont: ; preds = %omp_offload.failed, %entry + ret void +} + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define internal void @__omp_offloading_18_2852fc0_foo_l10(i32* %dis) #1 { +entry: + %dis.addr = alloca i32*, align 8 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + store i32* %dis, i32** %dis.addr, align 8 + %1 = call i32 @__kmpc_push_num_teams(%struct.ident_t* @2, i32 %0, i32 3, i32 0) + %2 = load i32*, i32** %dis.addr, align 8 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%struct.ident_t* @2, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined. 
to void (i32*, i32*, ...)*), i32* %2) + ret void +} + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define internal void @.omp_outlined.(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* %dis) #1 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %dis.addr = alloca i32*, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.comb.lb = alloca i32, align 4 + %.omp.comb.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i32* %dis, i32** %dis.addr, align 8 + store i32 0, i32* %.omp.comb.lb, align 4 + store i32 9, i32* %.omp.comb.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %0 = load i32*, i32** %.global_tid..addr, align 8 + %1 = load i32, i32* %0, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %1, i32 92, i32* %.omp.is_last, i32* %.omp.comb.lb, i32* %.omp.comb.ub, i32* %.omp.stride, i32 1, i32 1) + %2 = load i32, i32* %.omp.comb.ub, align 4 + %cmp = icmp sgt i32 %2, 9 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + br label %cond.end + +cond.false: ; preds = %entry + %3 = load i32, i32* %.omp.comb.ub, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 9, %cond.true ], [ %3, %cond.false ] + store i32 %cond, i32* %.omp.comb.ub, align 4 + %4 = load i32, i32* %.omp.comb.lb, align 4 + store i32 %4, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %cond.end + %5 = load i32, i32* %.omp.iv, align 4 + %6 = load i32, i32* %.omp.comb.ub, align 4 + %cmp1 = icmp sle i32 %5, %6 + br i1 %cmp1, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %7 = load i32, i32* %.omp.comb.lb, align 4 + %8 = zext i32 %7 to i64 + %9 = load i32, i32* %.omp.comb.ub, align 4 + %10 = zext i32 %9 to i64 + %11 = load i32*, i32** %dis.addr, align 8 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* @2, i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64, i64, i32*)* @.omp_outlined..1 to void (i32*, i32*, ...)*), i64 %8, i64 %10, i32* %11) + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.inner.for.body + %12 = load i32, i32* %.omp.iv, align 4 + %13 = load i32, i32* %.omp.stride, align 4 + %add = add nsw i32 %12, %13 + store i32 %add, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %1) + %14 = load i32*, i32** %dis.addr, align 8 + %call = call i32 @omp_get_team_num() + %idxprom = sext i32 %call to i64 + %arrayidx = getelementptr inbounds i32, i32* %14, i64 %idxprom + %15 = load i32, i32* %arrayidx, align 4 + %add2 = add nsw i32 %15, 1 + store i32 %add2, i32* %arrayidx, align 4 + ret void +} + +declare dso_local void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define internal void @.omp_outlined..1(i32* noalias %.global_tid., i32* noalias %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., i32* %dis) #1 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %.previous.lb..addr = alloca i64, align 8 + %.previous.ub..addr = alloca i64, align 8 + %dis.addr = alloca i32*, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i64 %.previous.lb., i64* %.previous.lb..addr, align 8 + store i64 %.previous.ub., i64* %.previous.ub..addr, align 8 + store i32* %dis, i32** %dis.addr, align 8 + store i32 0, i32* %.omp.lb, align 4 + store i32 9, i32* %.omp.ub, align 4 + %0 = load i64, i64* %.previous.lb..addr, align 8 + %conv = trunc i64 %0 to i32 + %1 = load i64, i64* %.previous.ub..addr, align 8 + %conv1 = trunc i64 %1 to i32 + store i32 %conv, i32* %.omp.lb, align 4 + store i32 %conv1, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %2 = load i32*, i32** %.global_tid..addr, align 8 + %3 = load i32, i32* %2, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %3, i32 34, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %4 = load i32, i32* %.omp.ub, align 4 + %cmp = icmp sgt i32 %4, 9 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + br label %cond.end + +cond.false: ; preds = %entry + %5 = load i32, i32* %.omp.ub, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 9, %cond.true ], [ %5, %cond.false ] + store i32 %cond, i32* %.omp.ub, align 4 + %6 = load i32, i32* %.omp.lb, align 4 + store i32 %6, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %cond.end + %7 = load i32, i32* %.omp.iv, align 4 + %8 = load i32, i32* %.omp.ub, align 4 + %cmp3 = icmp sle i32 %7, %8 + br i1 %cmp3, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %9 = load i32, i32* %.omp.iv, align 4 + %mul = mul nsw i32 
%9, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %i, align 4 + %10 = load i32*, i32** %dis.addr, align 8 + %11 = load i32, i32* %i, align 4 + %idxprom = sext i32 %11 to i64 + %arrayidx = getelementptr inbounds i32, i32* %10, i64 %idxprom + %12 = atomicrmw add i32* %arrayidx, i32 1 monotonic + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %13 = load i32, i32* %.omp.iv, align 4 + %add5 = add nsw i32 %13, 1 + store i32 %add5, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %3) + ret void +} + +declare dso_local void @__kmpc_for_static_fini(%struct.ident_t*, i32) + +declare !callback !3 dso_local void @__kmpc_fork_call(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) + +declare dso_local i32 @omp_get_team_num() #2 + +declare dso_local i32 @__kmpc_global_thread_num(%struct.ident_t*) + +declare dso_local i32 @__kmpc_push_num_teams(%struct.ident_t*, i32, i32, i32) + +declare !callback !3 dso_local void @__kmpc_fork_teams(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) + +declare dso_local i32 @__tgt_target_teams(i64, i8*, i32, i8**, i8**, i64*, i64*, i32, i32) + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local i32 @main() #0 { +entry: + %retval = alloca i32, align 4 + %dis = alloca [10 x i32], align 16 + %i = alloca i32, align 4 + %i1 = alloca i32, align 4 + store i32 0, i32* %retval, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %0, 10 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load i32, i32* %i, align 4 + %2 = load i32, i32* %i, align 4 + %idxprom = sext i32 %2 to i64 + %arrayidx = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 %idxprom + store i32 %1, i32* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %3 = load i32, i32* %i, align 4 + %inc = add nsw i32 %3, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %arraydecay = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i32 0, i32 0 + call void @foo(i32* %arraydecay) + store i32 0, i32* %i1, align 4 + br label %for.cond2 + +for.cond2: ; preds = %for.inc7, %for.end + %4 = load i32, i32* %i1, align 4 + %cmp3 = icmp slt i32 %4, 10 + br i1 %cmp3, label %for.body4, label %for.end9 + +for.body4: ; preds = %for.cond2 + %5 = load i32, i32* %i1, align 4 + %6 = load i32, i32* %i1, align 4 + %idxprom5 = sext i32 %6 to i64 + %arrayidx6 = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 %idxprom5 + %7 = load i32, i32* %arrayidx6, align 4 + %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i32 0, i32 0), i32 %5, i32 %7) + br label %for.inc7 + +for.inc7: ; preds = %for.body4 + %8 = load i32, i32* %i1, align 4 + %inc8 = add nsw i32 %8, 1 + store i32 %inc8, i32* %i1, align 4 + br label %for.cond2 + +for.end9: ; preds = %for.cond2 + ret i32 0 +} + +declare dso_local i32 @printf(i8*, ...) 
#2
+
+; Function Attrs: noinline nounwind uwtable
+define internal void @.omp_offloading.descriptor_unreg(i8*) #3 section ".text.startup" comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cuda) {
+entry:
+  %.addr = alloca i8*, align 8
+  store i8* %0, i8** %.addr, align 8
+  %1 = call i32 @__tgt_unregister_lib(%struct.__tgt_bin_desc* @.omp_offloading.descriptor)
+  ret void
+}
+
+declare dso_local i32 @__tgt_unregister_lib(%struct.__tgt_bin_desc*)
+
+; Function Attrs: noinline nounwind uwtable
+define linkonce hidden void @.omp_offloading.descriptor_reg.nvptx64-nvida-cuda() #3 section ".text.startup" comdat {
+entry:
+  %0 = call i32 @__tgt_register_lib(%struct.__tgt_bin_desc* @.omp_offloading.descriptor)
+  %1 = call i32 @__cxa_atexit(void (i8*)* @.omp_offloading.descriptor_unreg, i8* bitcast (%struct.__tgt_bin_desc* @.omp_offloading.descriptor to i8*), i8* @__dso_handle) #4
+  ret void
+}
+
+declare dso_local i32 @__tgt_register_lib(%struct.__tgt_bin_desc*)
+
+; Function Attrs: nounwind
+declare dso_local i32 @__cxa_atexit(void (i8*)*, i8*, i8*) #4
+
+attributes #0 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { noinline norecurse nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #4 = { nounwind }
+
+!omp_offload.info = !{!0}
+!llvm.module.flags = !{!1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 0, i32 24, i32 42282944, !"foo", i32 10, i32 0}
+!1 = !{i32 1, !"wchar_size", i32 4}
+!2 = !{!"clang version 9.0.0 "}
+!3 = !{!4}
+!4 = !{i64 2, i64 -1, i64 -1, i1 true}
+
+; __CLANG_OFFLOAD_BUNDLE____END__ host-x86_64-unknown-linux-gnu
Index:
SPMD_examples/v0.3/target_offload_not_SPMD.O3.new.ll =================================================================== --- /dev/null +++ SPMD_examples/v0.3/target_offload_not_SPMD.O3.new.ll @@ -0,0 +1,576 @@ + +; __CLANG_OFFLOAD_BUNDLE____START__ openmp-nvptx64-nvida-cud +; ModuleID = '../SPMD_examples/v0.3/target_offload_not_SPMD.c' +source_filename = "../SPMD_examples/v0.3/target_offload_not_SPMD.c" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvida-cud" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } +%omp.shared.struct = type { i64, i64, i32* } + +@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2050, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@2 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@__omp_offloading_18_2852fc0_foo_l10_exec_mode = weak constant i8 1 +@llvm.compiler.used = appending global [1 x i8*] [i8* @__omp_offloading_18_2852fc0_foo_l10_exec_mode], section "llvm.metadata" + +; Function Attrs: norecurse nounwind +define weak void @__omp_offloading_18_2852fc0_foo_l10(i32* %dis) local_unnamed_addr #0 { +entry: + %work_fn.addr = alloca i8*, align 8 + %.omp.comb.lb.i = alloca i32, align 4 + %.omp.comb.ub.i = alloca i32, align 4 + %.omp.stride.i = alloca i32, align 4 + %.omp.is_last.i = alloca i32, align 4 + %.captured.i = alloca %omp.shared.struct, align 8 + %thread_kind = tail call i16 @__kmpc_generic_kernel_init(i16 0, i16 0, i16 1, i16 0) #3 + switch i16 %thread_kind, label %.exit [ + i16 -1, label %worker.wait.preheader + i16 1, label %.execute + ] + +worker.wait.preheader: ; preds = %entry + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) #3 + %is_active7 = call i1 @__kmpc_kernel_parallel(i8** nonnull %work_fn.addr, i16 1) #3 + %Work_fn.addr_cast = bitcast i8** %work_fn.addr to void (i8*)** + %work_fn8 = load void (i8*)*, void (i8*)** %Work_fn.addr_cast, align 8 + %no_work9 = icmp eq void (i8*)* %work_fn8, null + br i1 %no_work9, label %master_check, label %worker.active_check + +worker.active_check: ; preds = %worker.wait.preheader, %worker.inactive + %work_fn11 = phi void (i8*)* [ %work_fn, %worker.inactive ], [ %work_fn8, %worker.wait.preheader ] + %is_active10 = phi i1 [ %is_active, %worker.inactive ], [ %is_active7, %worker.wait.preheader ] + br i1 %is_active10, label %worker.active, label %worker.inactive + +worker.active: ; preds = %worker.active_check + %0 = call i8* @__kmpc_get_shared_variables() #3 + %par_fn_check = icmp eq void (i8*)* %work_fn11, @__omp_outlined__1_wrapper + br i1 %par_fn_check, label %worker.execute.__omp_outlined__1_wrapper, label %worker.check.next5 + +worker.execute.__omp_outlined__1_wrapper: ; preds = %worker.active + call void @__omp_outlined__1_wrapper(i8* %0) + br label %worker.parallel_end + +worker.check.next5: ; preds = %worker.active + call void %work_fn11(i8* %0) #3 + br label %worker.parallel_end + +worker.parallel_end: ; preds = %worker.execute.__omp_outlined__1_wrapper, %worker.check.next5 + call void @__kmpc_kernel_end_parallel() #3 + br label %worker.inactive + +worker.inactive: ; preds = %worker.active_check, 
%worker.parallel_end + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) #3 + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) #3 + %is_active = call i1 @__kmpc_kernel_parallel(i8** nonnull %work_fn.addr, i16 1) #3 + %work_fn = load void (i8*)*, void (i8*)** %Work_fn.addr_cast, align 8 + %no_work = icmp eq void (i8*)* %work_fn, null + br i1 %no_work, label %master_check, label %worker.active_check + +master_check: ; preds = %worker.inactive, %worker.wait.preheader + %1 = icmp eq i16 %thread_kind, 1 + br i1 %1, label %.execute, label %.exit + +.execute: ; preds = %entry, %master_check + %2 = tail call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @2) #3 + %3 = bitcast %omp.shared.struct* %.captured.i to i8* + call void @llvm.lifetime.start.p0i8(i64 24, i8* nonnull %3) + %4 = bitcast i32* %.omp.comb.lb.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %4) #3 + store i32 0, i32* %.omp.comb.lb.i, align 4, !tbaa !5 + %5 = bitcast i32* %.omp.comb.ub.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %5) #3 + store i32 9, i32* %.omp.comb.ub.i, align 4, !tbaa !5 + %6 = bitcast i32* %.omp.stride.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %6) #3 + store i32 1, i32* %.omp.stride.i, align 4, !tbaa !5 + %7 = bitcast i32* %.omp.is_last.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %7) #3 + store i32 0, i32* %.omp.is_last.i, align 4, !tbaa !5 + call void @__kmpc_for_static_init_4(%struct.ident_t* nonnull @0, i32 %2, i32 92, i32* nonnull %.omp.is_last.i, i32* nonnull %.omp.comb.lb.i, i32* nonnull %.omp.comb.ub.i, i32* nonnull %.omp.stride.i, i32 1, i32 1) #3 + %8 = load i32, i32* %.omp.comb.ub.i, align 4, !tbaa !5 + %9 = icmp slt i32 %8, 9 + %cond.i = select i1 %9, i32 %8, i32 9 + store i32 %cond.i, i32* %.omp.comb.ub.i, align 4, !tbaa !5 + %10 = load i32, i32* %.omp.comb.lb.i, align 4, !tbaa !5 + %cmp11.i = icmp sgt i32 %10, %cond.i + br i1 %cmp11.i, label %__omp_outlined__.exit, label %omp.inner.for.body.lr.ph.i + +omp.inner.for.body.lr.ph.i: ; preds = %.execute + %11 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i64 0, i32 0 + %12 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i64 0, i32 1 + %13 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i64 0, i32 2 + %14 = zext i32 %10 to i64 + %15 = zext i32 %cond.i to i64 + store i64 %14, i64* %11, align 8 + store i64 %15, i64* %12, align 8 + store i32* %dis, i32** %13, align 8 + call void @__kmpc_generic_kernel_parallel(i8* bitcast (void (i8*)* @__omp_outlined__1_wrapper to i8*), i8* nonnull %3, i16 24, i16 1) #3 + %16 = load i32, i32* %.omp.stride.i, align 4, !tbaa !5 + %add.i1 = add nsw i32 %16, %10 + %17 = load i32, i32* %.omp.comb.ub.i, align 4, !tbaa !5 + %cmp1.i2 = icmp sgt i32 %add.i1, %17 + br i1 %cmp1.i2, label %__omp_outlined__.exit, label %omp.inner.for.body.omp.inner.for.body_crit_edge.i + +omp.inner.for.body.omp.inner.for.body_crit_edge.i: ; preds = %omp.inner.for.body.lr.ph.i, %omp.inner.for.body.omp.inner.for.body_crit_edge.i + %18 = phi i32 [ %22, %omp.inner.for.body.omp.inner.for.body_crit_edge.i ], [ %17, %omp.inner.for.body.lr.ph.i ] + %add.i3 = phi i32 [ %add.i, %omp.inner.for.body.omp.inner.for.body_crit_edge.i ], [ %add.i1, %omp.inner.for.body.lr.ph.i ] + %.pre.i = load i32, i32* %.omp.comb.lb.i, align 4 + %19 = zext i32 %.pre.i to i64 + %20 = zext i32 %18 to i64 + store i64 %19, i64* %11, align 8 + store i64 %20, i64* %12, 
align 8 + store i32* %dis, i32** %13, align 8 + call void @__kmpc_generic_kernel_parallel(i8* bitcast (void (i8*)* @__omp_outlined__1_wrapper to i8*), i8* nonnull %3, i16 24, i16 1) #3 + %21 = load i32, i32* %.omp.stride.i, align 4, !tbaa !5 + %add.i = add nsw i32 %21, %add.i3 + %22 = load i32, i32* %.omp.comb.ub.i, align 4, !tbaa !5 + %cmp1.i = icmp sgt i32 %add.i, %22 + br i1 %cmp1.i, label %__omp_outlined__.exit, label %omp.inner.for.body.omp.inner.for.body_crit_edge.i + +__omp_outlined__.exit: ; preds = %omp.inner.for.body.omp.inner.for.body_crit_edge.i, %omp.inner.for.body.lr.ph.i, %.execute + call void @__kmpc_for_static_fini(%struct.ident_t* nonnull @0, i32 %2) #3 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %7) #3 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %6) #3 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %5) #3 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %4) #3 + %call.i = call i32 @omp_get_team_num() #3 + %idxprom.i = sext i32 %call.i to i64 + %arrayidx.i = getelementptr inbounds i32, i32* %dis, i64 %idxprom.i + %23 = load i32, i32* %arrayidx.i, align 4, !tbaa !5 + %add2.i = add nsw i32 %23, 1 + store i32 %add2.i, i32* %arrayidx.i, align 4, !tbaa !5 + call void @llvm.lifetime.end.p0i8(i64 24, i8* nonnull %3) + call void @__kmpc_generic_kernel_deinit(i16 0, i16 1) #3 + br label %.exit + +.exit: ; preds = %entry, %__omp_outlined__.exit, %master_check + ret void +} + +declare i16 @__kmpc_generic_kernel_init(i16, i16, i16, i16) local_unnamed_addr + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1 + +declare void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) local_unnamed_addr + +declare void @__kmpc_for_static_fini(%struct.ident_t*, i32) local_unnamed_addr + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1 + +; Function Attrs: norecurse nounwind +define internal void @__omp_outlined__1_wrapper(i8* nocapture readonly %payload) #0 { +entry: + %.omp.lb.i = alloca i32, align 4 + %.omp.ub.i = alloca i32, align 4 + %.omp.stride.i = alloca i32, align 4 + %.omp.is_last.i = alloca i32, align 4 + %0 = tail call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @2) #3 + %1 = bitcast i8* %payload to i64* + %2 = load i64, i64* %1, align 1 + %3 = getelementptr inbounds i8, i8* %payload, i64 8 + %4 = bitcast i8* %3 to i64* + %5 = load i64, i64* %4, align 1 + %6 = getelementptr inbounds i8, i8* %payload, i64 16 + %7 = bitcast i8* %6 to i32** + %8 = load i32*, i32** %7, align 1 + %9 = bitcast i32* %.omp.lb.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %9) #3 + %10 = bitcast i32* %.omp.ub.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %10) #3 + %conv.i = trunc i64 %2 to i32 + %conv1.i = trunc i64 %5 to i32 + store i32 %conv.i, i32* %.omp.lb.i, align 4, !tbaa !5 + store i32 %conv1.i, i32* %.omp.ub.i, align 4, !tbaa !5 + %11 = bitcast i32* %.omp.stride.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %11) #3 + store i32 1, i32* %.omp.stride.i, align 4, !tbaa !5 + %12 = bitcast i32* %.omp.is_last.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %12) #3 + store i32 0, i32* %.omp.is_last.i, align 4, !tbaa !5 + call void @__kmpc_for_static_init_4(%struct.ident_t* nonnull @1, i32 %0, i32 33, i32* nonnull %.omp.is_last.i, i32* nonnull %.omp.lb.i, i32* nonnull %.omp.ub.i, i32* nonnull %.omp.stride.i, i32 1, i32 1) #3 + %13 = load i32, i32* 
%.omp.lb.i, align 4, !tbaa !5 + %conv21.i = sext i32 %13 to i64 + %cmp2.i = icmp ult i64 %5, %conv21.i + br i1 %cmp2.i, label %__omp_outlined__1.exit, label %omp.inner.for.body.lr.ph.i + +omp.inner.for.body.lr.ph.i: ; preds = %entry + %14 = load i32, i32* %.omp.stride.i, align 4, !tbaa !5 + %15 = sext i32 %14 to i64 + br label %omp.inner.for.body.i + +omp.inner.for.body.i: ; preds = %omp.inner.for.body.i, %omp.inner.for.body.lr.ph.i + %indvars.iv.i = phi i64 [ %conv21.i, %omp.inner.for.body.lr.ph.i ], [ %indvars.iv.next.i, %omp.inner.for.body.i ] + %arrayidx.i = getelementptr inbounds i32, i32* %8, i64 %indvars.iv.i + %16 = atomicrmw add i32* %arrayidx.i, i32 1 monotonic + %indvars.iv.next.i = add i64 %indvars.iv.i, %15 + %cmp.i = icmp ugt i64 %indvars.iv.next.i, %5 + br i1 %cmp.i, label %__omp_outlined__1.exit, label %omp.inner.for.body.i + +__omp_outlined__1.exit: ; preds = %omp.inner.for.body.i, %entry + call void @__kmpc_for_static_fini(%struct.ident_t* nonnull @0, i32 %0) #3 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %12) #3 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %11) #3 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %10) #3 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %9) #3 + ret void +} + +declare i32 @__kmpc_global_thread_num(%struct.ident_t*) local_unnamed_addr + +declare void @__kmpc_generic_kernel_parallel(i8*, i8*, i16, i16) local_unnamed_addr + +declare i32 @omp_get_team_num() local_unnamed_addr #2 + +declare void @__kmpc_generic_kernel_deinit(i16, i16) local_unnamed_addr + +declare void @__kmpc_barrier_simple_spmd(%struct.ident_t*, i32) local_unnamed_addr + +declare i1 @__kmpc_kernel_parallel(i8**, i16) local_unnamed_addr + +declare i8* @__kmpc_get_shared_variables() local_unnamed_addr + +declare void @__kmpc_kernel_end_parallel() local_unnamed_addr + +attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind } +attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { nounwind } + +!omp_offload.info = !{!0} +!nvvm.annotations = !{!1} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = !{i32 0, i32 24, i32 42282944, !"foo", i32 10, i32 0} +!1 = !{void (i32*)* @__omp_offloading_18_2852fc0_foo_l10, !"kernel", i32 1} +!2 = !{i32 1, !"wchar_size", i32 4} +!3 = !{i32 7, !"PIC Level", i32 2} +!4 = !{!"clang version 9.0.0 "} +!5 = !{!6, !6, i64 0} +!6 = !{!"int", !7, i64 0} +!7 = !{!"omnipotent char", !8, i64 0} +!8 = !{!"Simple C/C++ TBAA"} + +; __CLANG_OFFLOAD_BUNDLE____END__ openmp-nvptx64-nvida-cud + +; __CLANG_OFFLOAD_BUNDLE____START__ host-x86_64-unknown-linux-gnu +; ModuleID = '/tmp/johannes/target_offload_not_SPMD-ce0dd6.bc' 
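; Note on the two bundles in this file: the device module above implements the
; generic kernel state machine (every thread calls __kmpc_generic_kernel_init
; and is told whether it is the master or a worker), while the host module
; below drives the offload. On the host, __tgt_target_teams launches the
; registered device image and returns 0 on success; any other value triggers
; the compiler-emitted fallback that runs the same region on the host through
; __kmpc_push_num_teams and __kmpc_fork_teams (see @foo and @main below). A
; minimal C sketch of that host pattern, using hypothetical helper names in
; place of the libomptarget entry points:
;
;   void launch_target_region(int *dis) {
;     /* try the device first; 0 means the kernel ran there */
;     if (tgt_target_teams(region_id, /*num_teams=*/3, dis) != 0) {
;       push_num_teams(3);            /* hypothetical wrapper         */
;       fork_teams(outlined_fn, dis); /* host fallback of the region  */
;     }
;   }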
+source_filename = "../SPMD_examples/v0.3/target_offload_not_SPMD.c" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } +%struct.__tgt_offload_entry = type { i8*, i8*, i64, i32, i32 } +%struct.__tgt_device_image = type { i8*, i8*, %struct.__tgt_offload_entry*, %struct.__tgt_offload_entry* } +%struct.__tgt_bin_desc = type { i32, %struct.__tgt_device_image*, %struct.__tgt_offload_entry*, %struct.__tgt_offload_entry* } + +$.omp_offloading.descriptor_reg.nvptx64-nvida-cud = comdat any + +@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr global %struct.ident_t { i32 0, i32 2050, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@1 = private unnamed_addr global %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@2 = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@.__omp_offloading_18_2852fc0_foo_l10.region_id = weak constant i8 0 +@.offload_sizes = private unnamed_addr constant [1 x i64] [i64 40] +@.offload_maptypes = private unnamed_addr constant [1 x i64] [i64 35] +@.str.2 = private unnamed_addr constant [16 x i8] c"dis[%3i] = %4i\0A\00", align 1 +@.omp_offloading.entry_name = internal unnamed_addr constant [36 x i8] c"__omp_offloading_18_2852fc0_foo_l10\00" +@.omp_offloading.entry.__omp_offloading_18_2852fc0_foo_l10 = weak local_unnamed_addr constant %struct.__tgt_offload_entry { i8* @.__omp_offloading_18_2852fc0_foo_l10.region_id, i8* getelementptr inbounds ([36 x i8], [36 x i8]* @.omp_offloading.entry_name, i32 0, i32 0), i64 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +@.omp_offloading.entries_begin = external constant %struct.__tgt_offload_entry +@.omp_offloading.entries_end = external constant %struct.__tgt_offload_entry +@.omp_offloading.img_start.nvptx64-nvida-cud = extern_weak constant i8 +@.omp_offloading.img_end.nvptx64-nvida-cud = extern_weak constant i8 +@.omp_offloading.device_images = internal unnamed_addr constant [1 x %struct.__tgt_device_image] [%struct.__tgt_device_image { i8* @.omp_offloading.img_start.nvptx64-nvida-cud, i8* @.omp_offloading.img_end.nvptx64-nvida-cud, %struct.__tgt_offload_entry* @.omp_offloading.entries_begin, %struct.__tgt_offload_entry* @.omp_offloading.entries_end }], comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cud), align 8 +@.omp_offloading.descriptor = internal constant %struct.__tgt_bin_desc { i32 1, %struct.__tgt_device_image* getelementptr inbounds ([1 x %struct.__tgt_device_image], [1 x %struct.__tgt_device_image]* @.omp_offloading.device_images, i32 0, i32 0), %struct.__tgt_offload_entry* @.omp_offloading.entries_begin, %struct.__tgt_offload_entry* @.omp_offloading.entries_end }, comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cud), align 8 +@__dso_handle = external hidden global i8 +@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 0, void ()* @.omp_offloading.descriptor_reg.nvptx64-nvida-cud, i8* bitcast (void ()* @.omp_offloading.descriptor_reg.nvptx64-nvida-cud to i8*) }] + +; Function Attrs: nounwind uwtable +define dso_local void @foo(i32* %dis) local_unnamed_addr #0 { +entry: + %.offload_baseptrs = alloca [1 x i8*], align 8 + %.offload_ptrs = alloca [1 x i8*], 
align 8 + %0 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs, i64 0, i64 0 + %1 = bitcast [1 x i8*]* %.offload_baseptrs to i32** + store i32* %dis, i32** %1, align 8 + %2 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_ptrs, i64 0, i64 0 + %3 = bitcast [1 x i8*]* %.offload_ptrs to i32** + store i32* %dis, i32** %3, align 8 + %4 = call i32 @__tgt_target_teams(i64 -1, i8* nonnull @.__omp_offloading_18_2852fc0_foo_l10.region_id, i32 1, i8** nonnull %0, i8** nonnull %2, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0), i32 3, i32 0) #5 + %5 = icmp eq i32 %4, 0 + br i1 %5, label %omp_offload.cont, label %omp_offload.failed + +omp_offload.failed: ; preds = %entry + %6 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @2) #5 + %7 = call i32 @__kmpc_push_num_teams(%struct.ident_t* nonnull @2, i32 %6, i32 3, i32 0) #5 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%struct.ident_t* nonnull @2, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined. to void (i32*, i32*, ...)*), i32* %dis) #5 + br label %omp_offload.cont + +omp_offload.cont: ; preds = %entry, %omp_offload.failed + ret void +} + +; Function Attrs: norecurse nounwind uwtable +define internal void @.omp_outlined.(i32* noalias nocapture readonly %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* %dis) #1 { +entry: + %.omp.comb.lb = alloca i32, align 4 + %.omp.comb.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %0 = bitcast i32* %.omp.comb.lb to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %0) #5 + store i32 0, i32* %.omp.comb.lb, align 4, !tbaa !3 + %1 = bitcast i32* %.omp.comb.ub to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %1) #5 + store i32 9, i32* %.omp.comb.ub, align 4, !tbaa !3 + %2 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %2) #5 + store i32 1, i32* %.omp.stride, align 4, !tbaa !3 + %3 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %3) #5 + store i32 0, i32* %.omp.is_last, align 4, !tbaa !3 + %4 = load i32, i32* %.global_tid., align 4, !tbaa !3 + call void @__kmpc_for_static_init_4(%struct.ident_t* nonnull @0, i32 %4, i32 92, i32* nonnull %.omp.is_last, i32* nonnull %.omp.comb.lb, i32* nonnull %.omp.comb.ub, i32* nonnull %.omp.stride, i32 1, i32 1) #5 + %5 = load i32, i32* %.omp.comb.ub, align 4, !tbaa !3 + %6 = icmp slt i32 %5, 9 + %cond = select i1 %6, i32 %5, i32 9 + store i32 %cond, i32* %.omp.comb.ub, align 4, !tbaa !3 + %7 = load i32, i32* %.omp.comb.lb, align 4, !tbaa !3 + %cmp17 = icmp sgt i32 %7, %cond + br i1 %cmp17, label %omp.loop.exit, label %omp.inner.for.body.preheader + +omp.inner.for.body.preheader: ; preds = %entry + %8 = zext i32 %7 to i64 + %9 = zext i32 %cond to i64 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* nonnull @2, i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64, i64, i32*)* @.omp_outlined..1 to void (i32*, i32*, ...)*), i64 %8, i64 %9, i32* %dis) #5 + %10 = load i32, i32* %.omp.stride, align 4, !tbaa !3 + %add9 = add nsw i32 %10, %7 + %11 = load i32, i32* %.omp.comb.ub, align 4, !tbaa !3 + %cmp110 = icmp sgt i32 %add9, %11 + br i1 %cmp110, label %omp.loop.exit, label %omp.inner.for.body.omp.inner.for.body_crit_edge + +omp.inner.for.body.omp.inner.for.body_crit_edge: ; preds = %omp.inner.for.body.preheader, %omp.inner.for.body.omp.inner.for.body_crit_edge + %12 = phi i32 [ %16, %omp.inner.for.body.omp.inner.for.body_crit_edge ], [ %11, %omp.inner.for.body.preheader ] + %add11 = phi i32 [ %add, %omp.inner.for.body.omp.inner.for.body_crit_edge ], [ %add9, %omp.inner.for.body.preheader ] + %.pre = load i32, i32* %.omp.comb.lb, align 4 + %13 = zext i32 %.pre to i64 + %14 = zext i32 %12 to i64 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @2, i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64, i64, i32*)* @.omp_outlined..1 to void (i32*, i32*, ...)*), i64 %13, i64 %14, i32* %dis) #5 + %15 = load i32, i32* %.omp.stride, align 4, !tbaa !3 + %add = add nsw i32 %15, %add11 + %16 = load i32, i32* %.omp.comb.ub, align 4, !tbaa !3 + %cmp1 = icmp sgt i32 %add, %16 + br i1 %cmp1, label %omp.loop.exit, label %omp.inner.for.body.omp.inner.for.body_crit_edge + +omp.loop.exit: ; preds = %omp.inner.for.body.omp.inner.for.body_crit_edge, %omp.inner.for.body.preheader, %entry + call void @__kmpc_for_static_fini(%struct.ident_t* nonnull @0, i32 %4) #5 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %3) #5 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %2) #5 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %1) #5 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %0) #5 + %call = call i32 @omp_get_team_num() #5 + %idxprom = sext i32 %call to i64 + %arrayidx = getelementptr inbounds i32, i32* %dis, i64 %idxprom + %17 = load i32, i32* %arrayidx, align 4, !tbaa !3 + %add2 = add nsw i32 %17, 1 + store i32 %add2, i32* %arrayidx, align 4, !tbaa !3 + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #2 + +declare dso_local void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) local_unnamed_addr + +; Function Attrs: norecurse nounwind uwtable +define internal void @.omp_outlined..1(i32* noalias nocapture readonly %.global_tid., i32* noalias nocapture readnone %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., i32* nocapture %dis) #1 { +entry: + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %0 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %0) #5 + %1 = bitcast i32* %.omp.ub to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %1) #5 + %conv = trunc i64 %.previous.lb. to i32 + %conv1 = trunc i64 %.previous.ub. 
to i32 + store i32 %conv, i32* %.omp.lb, align 4, !tbaa !3 + store i32 %conv1, i32* %.omp.ub, align 4, !tbaa !3 + %2 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %2) #5 + store i32 1, i32* %.omp.stride, align 4, !tbaa !3 + %3 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %3) #5 + store i32 0, i32* %.omp.is_last, align 4, !tbaa !3 + %4 = load i32, i32* %.global_tid., align 4, !tbaa !3 + call void @__kmpc_for_static_init_4(%struct.ident_t* nonnull @1, i32 %4, i32 34, i32* nonnull %.omp.is_last, i32* nonnull %.omp.lb, i32* nonnull %.omp.ub, i32* nonnull %.omp.stride, i32 1, i32 1) #5 + %5 = load i32, i32* %.omp.ub, align 4, !tbaa !3 + %6 = icmp slt i32 %5, 9 + %cond = select i1 %6, i32 %5, i32 9 + store i32 %cond, i32* %.omp.ub, align 4, !tbaa !3 + %7 = load i32, i32* %.omp.lb, align 4, !tbaa !3 + %cmp310 = icmp sgt i32 %7, %cond + br i1 %cmp310, label %omp.loop.exit, label %omp.inner.for.body.preheader + +omp.inner.for.body.preheader: ; preds = %entry + %8 = sext i32 %7 to i64 + %9 = sext i32 %cond to i64 + br label %omp.inner.for.body + +omp.inner.for.body: ; preds = %omp.inner.for.body.preheader, %omp.inner.for.body + %indvars.iv = phi i64 [ %8, %omp.inner.for.body.preheader ], [ %indvars.iv.next, %omp.inner.for.body ] + %arrayidx = getelementptr inbounds i32, i32* %dis, i64 %indvars.iv + %10 = atomicrmw add i32* %arrayidx, i32 1 monotonic + %indvars.iv.next = add nsw i64 %indvars.iv, 1 + %cmp3 = icmp slt i64 %indvars.iv, %9 + br i1 %cmp3, label %omp.inner.for.body, label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.body, %entry + call void @__kmpc_for_static_fini(%struct.ident_t* nonnull @0, i32 %4) #5 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %3) #5 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %2) #5 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %1) #5 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %0) #5 + ret void +} + +declare dso_local void @__kmpc_for_static_fini(%struct.ident_t*, i32) local_unnamed_addr + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #2 + +declare !callback !7 dso_local void @__kmpc_fork_call(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) local_unnamed_addr + +declare dso_local i32 @omp_get_team_num() local_unnamed_addr #3 + +declare dso_local i32 @__kmpc_global_thread_num(%struct.ident_t*) local_unnamed_addr + +declare dso_local i32 @__kmpc_push_num_teams(%struct.ident_t*, i32, i32, i32) local_unnamed_addr + +declare !callback !7 dso_local void @__kmpc_fork_teams(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
local_unnamed_addr
+
+declare dso_local i32 @__tgt_target_teams(i64, i8*, i32, i8**, i8**, i64*, i64*, i32, i32) local_unnamed_addr
+
+; Function Attrs: nounwind uwtable
+define dso_local i32 @main() local_unnamed_addr #0 {
+entry:
+  %.offload_baseptrs.i = alloca [1 x i8*], align 8
+  %.offload_ptrs.i = alloca [1 x i8*], align 8
+  %dis = alloca [10 x i32], align 16
+  %0 = bitcast [10 x i32]* %dis to i8*
+  call void @llvm.lifetime.start.p0i8(i64 40, i8* nonnull %0) #5
+  %arrayidx = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 0
+  %arrayidx.1 = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 1
+  %arrayidx.2 = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 2
+  %arrayidx.3 = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 3
+  %1 = bitcast [10 x i32]* %dis to <4 x i32>*
+  store <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32>* %1, align 16, !tbaa !3
+  %arrayidx.4 = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 4
+  %arrayidx.5 = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 5
+  %arrayidx.6 = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 6
+  %arrayidx.7 = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 7
+  %2 = bitcast i32* %arrayidx.4 to <4 x i32>*
+  store <4 x i32> <i32 4, i32 5, i32 6, i32 7>, <4 x i32>* %2, align 16, !tbaa !3
+  %arrayidx.8 = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 8
+  store i32 8, i32* %arrayidx.8, align 16, !tbaa !3
+  %arrayidx.9 = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 9
+  store i32 9, i32* %arrayidx.9, align 4, !tbaa !3
+  %3 = bitcast [1 x i8*]* %.offload_baseptrs.i to i8*
+  call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %3)
+  %4 = bitcast [1 x i8*]* %.offload_ptrs.i to i8*
+  call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %4)
+  %5 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs.i, i64 0, i64 0
+  %6 = bitcast [1 x i8*]* %.offload_baseptrs.i to i32**
+  store i32* %arrayidx, i32** %6, align 8
+  %7 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_ptrs.i, i64 0, i64 0
+  %8 = bitcast [1 x i8*]* %.offload_ptrs.i to i32**
+  store i32* %arrayidx, i32** %8, align 8
+  %9 = call i32 @__tgt_target_teams(i64 -1, i8* nonnull @.__omp_offloading_18_2852fc0_foo_l10.region_id, i32 1, i8** nonnull %5, i8** nonnull %7, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0), i32 3, i32 0) #5
+  %10 = icmp eq i32 %9, 0
+  br i1 %10, label %foo.exit, label %omp_offload.failed.i
+
+omp_offload.failed.i: ; preds = %entry
+  %11 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @2) #5
+  %12 = call i32 @__kmpc_push_num_teams(%struct.ident_t* nonnull @2, i32 %11, i32 3, i32 0) #5
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%struct.ident_t* nonnull @2, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined. to void (i32*, i32*, ...)*), i32* nonnull %arrayidx) #5
+  br label %foo.exit
+
+foo.exit: ; preds = %entry, %omp_offload.failed.i
+  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %3)
+  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %4)
+  %13 = load i32, i32* %arrayidx, align 16, !tbaa !3
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 0, i32 %13)
+  %14 = load i32, i32* %arrayidx.1, align 4, !tbaa !3
+  %call.1 = call i32 (i8*, ...)
@printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 1, i32 %14) + %15 = load i32, i32* %arrayidx.2, align 8, !tbaa !3 + %call.2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 2, i32 %15) + %16 = load i32, i32* %arrayidx.3, align 4, !tbaa !3 + %call.3 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 3, i32 %16) + %17 = load i32, i32* %arrayidx.4, align 16, !tbaa !3 + %call.4 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 4, i32 %17) + %18 = load i32, i32* %arrayidx.5, align 4, !tbaa !3 + %call.5 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 5, i32 %18) + %19 = load i32, i32* %arrayidx.6, align 8, !tbaa !3 + %call.6 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 6, i32 %19) + %20 = load i32, i32* %arrayidx.7, align 4, !tbaa !3 + %call.7 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 7, i32 %20) + %21 = load i32, i32* %arrayidx.8, align 16, !tbaa !3 + %call.8 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 8, i32 %21) + %22 = load i32, i32* %arrayidx.9, align 4, !tbaa !3 + %call.9 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 9, i32 %22) + call void @llvm.lifetime.end.p0i8(i64 40, i8* nonnull %0) #5 + ret i32 0 +} + +; Function Attrs: nounwind +declare dso_local i32 @printf(i8* nocapture readonly, ...) local_unnamed_addr #4 + +; Function Attrs: nounwind uwtable +define internal void @.omp_offloading.descriptor_unreg(i8* nocapture readnone) #0 section ".text.startup" comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cud) { +entry: + %1 = tail call i32 @__tgt_unregister_lib(%struct.__tgt_bin_desc* nonnull @.omp_offloading.descriptor) #5 + ret void +} + +declare dso_local i32 @__tgt_unregister_lib(%struct.__tgt_bin_desc*) local_unnamed_addr + +; Function Attrs: nounwind uwtable +define linkonce hidden void @.omp_offloading.descriptor_reg.nvptx64-nvida-cud() #0 section ".text.startup" comdat { +entry: + %0 = tail call i32 @__tgt_register_lib(%struct.__tgt_bin_desc* nonnull @.omp_offloading.descriptor) #5 + %1 = tail call i32 @__cxa_atexit(void (i8*)* nonnull @.omp_offloading.descriptor_unreg, i8* bitcast (%struct.__tgt_bin_desc* @.omp_offloading.descriptor to i8*), i8* nonnull @__dso_handle) #5 + ret void +} + +declare dso_local i32 @__tgt_register_lib(%struct.__tgt_bin_desc*) local_unnamed_addr + +; Function Attrs: nounwind +declare dso_local i32 @__cxa_atexit(void (i8*)*, i8*, i8*) local_unnamed_addr #5 + +attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" 
"no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { argmemonly nounwind } +attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #5 = { nounwind } + +!omp_offload.info = !{!0} +!llvm.module.flags = !{!1} +!llvm.ident = !{!2} + +!0 = !{i32 0, i32 24, i32 42282944, !"foo", i32 10, i32 0} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{!"clang version 9.0.0 "} +!3 = !{!4, !4, i64 0} +!4 = !{!"int", !5, i64 0} +!5 = !{!"omnipotent char", !6, i64 0} +!6 = !{!"Simple C/C++ TBAA"} +!7 = !{!8} +!8 = !{i64 2, i64 -1, i64 -1, i1 true} + +; __CLANG_OFFLOAD_BUNDLE____END__ host-x86_64-unknown-linux-gnu Index: SPMD_examples/v0.3/target_offload_not_SPMD.c =================================================================== --- /dev/null +++ SPMD_examples/v0.3/target_offload_not_SPMD.c @@ -0,0 +1,33 @@ +#include +#include +#include + +#define N 10 +#define TEAMS 3 + +void foo(int* dis) { + + #pragma omp target teams num_teams(TEAMS) map(tofrom:dis[:N]) + { + #pragma omp distribute parallel for firstprivate(dis) + for (int i = 0; i < N; i++) + #pragma omp atomic + dis[i] += 1; + + dis[omp_get_team_num()] += 1; + } +} + +int main() { + int dis[N]; + + for (int i = 0; i < N; i++) + dis[i] = i; + + foo(dis); + + for (int i = 0; i < N; i++) + printf("dis[%3i] = %4i\n", i, dis[i]); + + return 0; +} Index: clang/lib/CodeGen/CGOpenMPRuntime.h =================================================================== --- clang/lib/CodeGen/CGOpenMPRuntime.h +++ clang/lib/CodeGen/CGOpenMPRuntime.h @@ -211,6 +211,10 @@ ~DisableAutoDeclareTargetRAII(); }; + /// Emits \p Callee function call with arguments \p Args with location \p Loc. + void emitCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *Callee, + ArrayRef Args = llvm::None) const; + protected: CodeGenModule &CGM; StringRef FirstSeparator, Separator; @@ -270,10 +274,6 @@ // virtual StringRef getOutlinedHelperName() const { return ".omp_outlined."; } - /// Emits \p Callee function call with arguments \p Args with location \p Loc. - void emitCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *Callee, - ArrayRef Args = llvm::None) const; - /// Emits address of the word in a memory where current thread id is /// stored. 
   virtual Address emitThreadIDAddress(CodeGenFunction &CGF, SourceLocation Loc);
Index: clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
===================================================================
--- clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
+++ clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
@@ -34,47 +34,15 @@
     EM_Unknown,
   };
 
 private:
-  /// Parallel outlined function work for workers to execute.
-  llvm::SmallVector<llvm::Function *, 16> Work;
 
   struct EntryFunctionState {
     llvm::BasicBlock *ExitBB = nullptr;
   };
 
-  class WorkerFunctionState {
-  public:
-    llvm::Function *WorkerFn;
-    const CGFunctionInfo &CGFI;
-    SourceLocation Loc;
-
-    WorkerFunctionState(CodeGenModule &CGM, SourceLocation Loc);
-
-  private:
-    void createWorkerFunction(CodeGenModule &CGM);
-  };
-
   ExecutionMode getExecutionMode() const;
 
   bool requiresFullRuntime() const { return RequiresFullRuntime; }
 
-  /// Get barrier to synchronize all threads in a block.
-  void syncCTAThreads(CodeGenFunction &CGF);
-
-  /// Emit the worker function for the current target region.
-  void emitWorkerFunction(WorkerFunctionState &WST);
-
-  /// Helper for worker function. Emit body of worker loop.
-  void emitWorkerLoop(CodeGenFunction &CGF, WorkerFunctionState &WST);
-
-  /// Helper for non-SPMD target entry function. Guide the master and
-  /// worker threads to their respective locations.
-  void emitNonSPMDEntryHeader(CodeGenFunction &CGF, EntryFunctionState &EST,
-                              WorkerFunctionState &WST);
-
-  /// Signal termination of OMP execution for non-SPMD target entry
-  /// function.
-  void emitNonSPMDEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST);
-
   /// Helper for generic variables globalization prolog.
   void emitGenericVarsProlog(CodeGenFunction &CGF, SourceLocation Loc,
                              bool WithSPMDCheck = false);
@@ -82,12 +50,13 @@
   /// Helper for generic variables globalization epilog.
   void emitGenericVarsEpilog(CodeGenFunction &CGF, bool WithSPMDCheck = false);
 
-  /// Helper for SPMD mode target directive's entry function.
-  void emitSPMDEntryHeader(CodeGenFunction &CGF, EntryFunctionState &EST,
-                           const OMPExecutableDirective &D);
+  /// Helper for generic kernel mode target directive's entry function.
+  void emitGenericEntryHeader(CodeGenFunction &CGF, EntryFunctionState &EST,
+                              const OMPExecutableDirective &D, bool IsSPMD);
 
-  /// Signal termination of SPMD mode execution.
-  void emitSPMDEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST);
+  /// Signal termination of generic mode execution.
+  void emitGenericEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST,
+                              bool IsSPMD);
 
   //
   // Base class overrides.
@@ -99,20 +68,6 @@
                           uint64_t Size, int32_t Flags,
                           llvm::GlobalValue::LinkageTypes Linkage) override;
 
-  /// Emit outlined function specialized for the Fork-Join
-  /// programming model for applicable target directives on the NVPTX device.
-  /// \param D Directive to emit.
-  /// \param ParentName Name of the function that encloses the target region.
-  /// \param OutlinedFn Outlined function value to be defined by this call.
-  /// \param OutlinedFnID Outlined function ID value to be defined by this call.
-  /// \param IsOffloadEntry True if the outlined function is an offload entry.
-  /// An outlined function may not be an entry if, e.g. the if clause always
-  /// evaluates to false.
-  void emitNonSPMDKernel(const OMPExecutableDirective &D, StringRef ParentName,
-                         llvm::Function *&OutlinedFn,
-                         llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
-                         const RegionCodeGenTy &CodeGen);
-
   /// Emit outlined function specialized for the Single Program
   /// Multiple Data programming model for applicable target directives on the
   /// NVPTX device.
@@ -121,13 +76,14 @@
   /// \param OutlinedFn Outlined function value to be defined by this call.
   /// \param OutlinedFnID Outlined function ID value to be defined by this call.
   /// \param IsOffloadEntry True if the outlined function is an offload entry.
+  /// \param IsSPMD True if the kernel is known to be executed in SPMD mode.
   /// \param CodeGen Object containing the target statements.
   /// An outlined function may not be an entry if, e.g. the if clause always
   /// evaluates to false.
-  void emitSPMDKernel(const OMPExecutableDirective &D, StringRef ParentName,
-                      llvm::Function *&OutlinedFn,
-                      llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
-                      const RegionCodeGenTy &CodeGen);
+  void emitGenericKernel(const OMPExecutableDirective &D, StringRef ParentName,
+                         llvm::Function *&OutlinedFn,
+                         llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
+                         bool IsSPMD, const RegionCodeGenTy &CodeGen);
 
   /// Emit outlined function for 'target' directive on the NVPTX
   /// device.
@@ -145,21 +101,6 @@
                                   bool IsOffloadEntry,
                                   const RegionCodeGenTy &CodeGen) override;
 
-  /// Emits code for parallel or serial call of the \a OutlinedFn with
-  /// variables captured in a record which address is stored in \a
-  /// CapturedStruct.
-  /// This call is for the Non-SPMD Execution Mode.
-  /// \param OutlinedFn Outlined function to be run in parallel threads. Type of
-  /// this function is void(*)(kmp_int32 *, kmp_int32, struct context_vars*).
-  /// \param CapturedVars A pointer to the record with the references to
-  /// variables used in \a OutlinedFn function.
-  /// \param IfCond Condition in the associated 'if' clause, if it was
-  /// specified, nullptr otherwise.
-  void emitNonSPMDParallelCall(CodeGenFunction &CGF, SourceLocation Loc,
-                               llvm::Value *OutlinedFn,
-                               ArrayRef<llvm::Value *> CapturedVars,
-                               const Expr *IfCond);
-
   /// Emits code for parallel or serial call of the \a OutlinedFn with
   /// variables captured in a record which address is stored in \a
   /// CapturedStruct.
@@ -170,11 +111,12 @@
   /// variables used in \a OutlinedFn function.
   /// \param IfCond Condition in the associated 'if' clause, if it was
   /// specified, nullptr otherwise.
+  /// \param IsSPMD True if the kernel is known to be executed in SPMD mode.
   ///
-  void emitSPMDParallelCall(CodeGenFunction &CGF, SourceLocation Loc,
-                            llvm::Value *OutlinedFn,
-                            ArrayRef<llvm::Value *> CapturedVars,
-                            const Expr *IfCond);
+  void emitGenericParallelCall(CodeGenFunction &CGF, SourceLocation Loc,
+                               llvm::Value *OutlinedFn,
+                               ArrayRef<llvm::Value *> CapturedVars,
+                               const Expr *IfCond, bool IsSPMD);
 
 protected:
   /// Get the function name of an outlined region.
@@ -406,15 +348,20 @@
   /// true if we're definitely in the parallel region.
   bool IsInParallelRegion = false;
 
-  /// Map between an outlined function and its wrapper.
-  llvm::DenseMap<llvm::Function *, llvm::Function *> WrapperFunctionsMap;
+  /// Map between an outlined function and its wrapper + shared struct type.
+  struct WrapperInfo {
+    llvm::Function *WrapperFn;
+    llvm::StructType *SharedStructTy;
+  };
+
+  llvm::DenseMap<llvm::Function *, WrapperInfo> WrapperInfoMap;
 
   /// Emit function which wraps the outline parallel region
   /// and controls the parameters which are passed to this function.
/// The wrapper ensures that the outlined function is called /// with the correct arguments when data is shared. - llvm::Function *createParallelDataSharingWrapper( - llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D); + void createParallelDataSharingWrapper(llvm::Function *OutlinedParallelFn, + const OMPExecutableDirective &D); /// The data for the single globalized variable. struct MappedVarData { Index: clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp =================================================================== --- clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp +++ clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp @@ -24,25 +24,6 @@ namespace { enum OpenMPRTLFunctionNVPTX { - /// Call to void __kmpc_kernel_init(kmp_int32 thread_limit, - /// int16_t RequiresOMPRuntime); - OMPRTL_NVPTX__kmpc_kernel_init, - /// Call to void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized); - OMPRTL_NVPTX__kmpc_kernel_deinit, - /// Call to void __kmpc_spmd_kernel_init(kmp_int32 thread_limit, - /// int16_t RequiresOMPRuntime, int16_t RequiresDataSharing); - OMPRTL_NVPTX__kmpc_spmd_kernel_init, - /// Call to void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime); - OMPRTL_NVPTX__kmpc_spmd_kernel_deinit_v2, - /// Call to void __kmpc_kernel_prepare_parallel(void - /// *outlined_function, int16_t - /// IsOMPRuntimeInitialized); - OMPRTL_NVPTX__kmpc_kernel_prepare_parallel, - /// Call to bool __kmpc_kernel_parallel(void **outlined_function, - /// int16_t IsOMPRuntimeInitialized); - OMPRTL_NVPTX__kmpc_kernel_parallel, - /// Call to void __kmpc_kernel_end_parallel(); - OMPRTL_NVPTX__kmpc_kernel_end_parallel, /// Call to void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 /// global_tid); OMPRTL_NVPTX__kmpc_serialized_parallel, @@ -69,22 +50,11 @@ OMPRTL_NVPTX__kmpc_nvptx_teams_end_reduce_nowait_simple, /// Call to __kmpc_nvptx_end_reduce_nowait(int32_t global_tid); OMPRTL_NVPTX__kmpc_end_reduce_nowait, - /// Call to void __kmpc_data_sharing_init_stack(); - OMPRTL_NVPTX__kmpc_data_sharing_init_stack, - /// Call to void __kmpc_data_sharing_init_stack_spmd(); - OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd, /// Call to void* __kmpc_data_sharing_coalesced_push_stack(size_t size, /// int16_t UseSharedMemory); OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack, /// Call to void __kmpc_data_sharing_pop_stack(void *a); OMPRTL_NVPTX__kmpc_data_sharing_pop_stack, - /// Call to void __kmpc_begin_sharing_variables(void ***args, - /// size_t n_args); - OMPRTL_NVPTX__kmpc_begin_sharing_variables, - /// Call to void __kmpc_end_sharing_variables(); - OMPRTL_NVPTX__kmpc_end_sharing_variables, - /// Call to void __kmpc_get_shared_variables(void ***GlobalArgs) - OMPRTL_NVPTX__kmpc_get_shared_variables, /// Call to uint16_t __kmpc_parallel_level(ident_t *loc, kmp_int32 /// global_tid); OMPRTL_NVPTX__kmpc_parallel_level, @@ -101,6 +71,15 @@ /// Call to void __kmpc_barrier_simple_spmd(ident_t *loc, kmp_int32 /// global_tid); OMPRTL__kmpc_barrier_simple_spmd, + /// Call to int16_t __kmpc_generic_kernel_init(int16_t IsSPMD, int16_t + /// RequiresOMPRuntime, int16_t RequiresDataSharing) + OMPRTL_NVPTX__kmpc_generic_kernel_init, + /// Call to void __kmpc_generic_kernel_deinit(int16_t IsSPMD, int16_t + /// RequiredOMPRuntime) + OMPRTL_NVPTX__kmpc_generic_kernel_deinit, + /// Call to void __kmpc_generic_kernel_parallel(void *OutlinedFn, + /// void *Payload, int16_t PayloadBytes, int16_t RequiredOMPRuntime) + OMPRTL_NVPTX__kmpc_generic_kernel_parallel, }; /// Pre(post)-action for different OpenMP constructs 
specialized for NVPTX. @@ -160,11 +139,11 @@ } /// Constructor for SPMD mode. ExecutionRuntimeModesRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &ExecMode, - bool &RuntimeMode, bool FullRuntimeMode) + bool &RuntimeMode, bool FullRuntimeMode, bool IsSPMD) : ExecMode(ExecMode), RuntimeMode(&RuntimeMode) { SavedExecMode = ExecMode; SavedRuntimeMode = RuntimeMode; - ExecMode = CGOpenMPRuntimeNVPTX::EM_SPMD; + ExecMode = IsSPMD ? CGOpenMPRuntimeNVPTX::EM_SPMD : CGOpenMPRuntimeNVPTX::EM_NonSPMD; RuntimeMode = FullRuntimeMode; } ~ExecutionRuntimeModesRAII() { @@ -643,56 +622,6 @@ "nvptx_num_threads"); } -/// Get the value of the thread_limit clause in the teams directive. -/// For the 'generic' execution mode, the runtime encodes thread_limit in -/// the launch parameters, always starting thread_limit+warpSize threads per -/// CTA. The threads in the last warp are reserved for master execution. -/// For the 'spmd' execution mode, all threads in a CTA are part of the team. -static llvm::Value *getThreadLimit(CodeGenFunction &CGF, - bool IsInSPMDExecutionMode = false) { - CGBuilderTy &Bld = CGF.Builder; - return IsInSPMDExecutionMode - ? getNVPTXNumThreads(CGF) - : Bld.CreateNUWSub(getNVPTXNumThreads(CGF), getNVPTXWarpSize(CGF), - "thread_limit"); -} - -/// Get the thread id of the OMP master thread. -/// The master thread id is the first thread (lane) of the last warp in the -/// GPU block. Warp size is assumed to be some power of 2. -/// Thread id is 0 indexed. -/// E.g: If NumThreads is 33, master id is 32. -/// If NumThreads is 64, master id is 32. -/// If NumThreads is 1024, master id is 992. -static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) { - CGBuilderTy &Bld = CGF.Builder; - llvm::Value *NumThreads = getNVPTXNumThreads(CGF); - - // We assume that the warp size is a power of 2. - llvm::Value *Mask = Bld.CreateNUWSub(getNVPTXWarpSize(CGF), Bld.getInt32(1)); - - return Bld.CreateAnd(Bld.CreateNUWSub(NumThreads, Bld.getInt32(1)), - Bld.CreateNot(Mask), "master_tid"); -} - -CGOpenMPRuntimeNVPTX::WorkerFunctionState::WorkerFunctionState( - CodeGenModule &CGM, SourceLocation Loc) - : WorkerFn(nullptr), CGFI(CGM.getTypes().arrangeNullaryFunction()), - Loc(Loc) { - createWorkerFunction(CGM); -} - -void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction( - CodeGenModule &CGM) { - // Create an worker function with no arguments. - - WorkerFn = llvm::Function::Create( - CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, - /*placeholder=*/"_worker", &CGM.getModule()); - CGM.SetInternalFunctionAttributes(GlobalDecl(), WorkerFn, CGFI); - WorkerFn->setDoesNotRecurse(); -} - CGOpenMPRuntimeNVPTX::ExecutionMode CGOpenMPRuntimeNVPTX::getExecutionMode() const { return CurrentExecutionMode; @@ -1159,149 +1088,18 @@ "Unknown programming model for OpenMP directive on NVPTX target."); } -void CGOpenMPRuntimeNVPTX::emitNonSPMDKernel(const OMPExecutableDirective &D, +void CGOpenMPRuntimeNVPTX::emitGenericKernel(const OMPExecutableDirective &D, StringRef ParentName, llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID, - bool IsOffloadEntry, + bool IsOffloadEntry, bool IsSPMD, const RegionCodeGenTy &CodeGen) { - ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode); - EntryFunctionState EST; - WorkerFunctionState WST(CGM, D.getBeginLoc()); - Work.clear(); - WrapperFunctionsMap.clear(); - - // Emit target region as a standalone region. 
- class NVPTXPrePostActionTy : public PrePostActionTy { - CGOpenMPRuntimeNVPTX::EntryFunctionState &EST; - CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST; - - public: - NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX::EntryFunctionState &EST, - CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST) - : EST(EST), WST(WST) {} - void Enter(CodeGenFunction &CGF) override { - auto &RT = - static_cast(CGF.CGM.getOpenMPRuntime()); - RT.emitNonSPMDEntryHeader(CGF, EST, WST); - // Skip target region initialization. - RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true); - } - void Exit(CodeGenFunction &CGF) override { - auto &RT = - static_cast(CGF.CGM.getOpenMPRuntime()); - RT.clearLocThreadIdInsertPt(CGF); - RT.emitNonSPMDEntryFooter(CGF, EST); - } - } Action(EST, WST); - CodeGen.setAction(Action); - IsInTTDRegion = true; - // Reserve place for the globalized memory. - GlobalizedRecords.emplace_back(); - if (!KernelStaticGlobalized) { - KernelStaticGlobalized = new llvm::GlobalVariable( - CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false, - llvm::GlobalValue::InternalLinkage, - llvm::ConstantPointerNull::get(CGM.VoidPtrTy), - "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr, - llvm::GlobalValue::NotThreadLocal, - CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared)); - } - emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID, - IsOffloadEntry, CodeGen); - IsInTTDRegion = false; - - // Now change the name of the worker function to correspond to this target - // region's entry function. - WST.WorkerFn->setName(Twine(OutlinedFn->getName(), "_worker")); - - // Create the worker function - emitWorkerFunction(WST); -} - -// Setup NVPTX threads for master-worker OpenMP scheme. -void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryHeader(CodeGenFunction &CGF, - EntryFunctionState &EST, - WorkerFunctionState &WST) { - CGBuilderTy &Bld = CGF.Builder; - - llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker"); - llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck"); - llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master"); - EST.ExitBB = CGF.createBasicBlock(".exit"); - - llvm::Value *IsWorker = - Bld.CreateICmpULT(getNVPTXThreadID(CGF), getThreadLimit(CGF)); - Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB); - - CGF.EmitBlock(WorkerBB); - emitCall(CGF, WST.Loc, WST.WorkerFn); - CGF.EmitBranch(EST.ExitBB); - - CGF.EmitBlock(MasterCheckBB); - llvm::Value *IsMaster = - Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF)); - Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB); - - CGF.EmitBlock(MasterBB); - IsInTargetMasterThreadRegion = true; - // SEQUENTIAL (MASTER) REGION START - // First action in sequential region: - // Initialize the state of the OpenMP runtime library on the GPU. - // TODO: Optimize runtime initialization and pass in correct value. - llvm::Value *Args[] = {getThreadLimit(CGF), - Bld.getInt16(/*RequiresOMPRuntime=*/1)}; - CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), Args); - - // For data sharing, we need to initialize the stack. 
- CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction( - OMPRTL_NVPTX__kmpc_data_sharing_init_stack)); - - emitGenericVarsProlog(CGF, WST.Loc); -} - -void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryFooter(CodeGenFunction &CGF, - EntryFunctionState &EST) { - IsInTargetMasterThreadRegion = false; - if (!CGF.HaveInsertPoint()) - return; - - emitGenericVarsEpilog(CGF); - - if (!EST.ExitBB) - EST.ExitBB = CGF.createBasicBlock(".exit"); - - llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier"); - CGF.EmitBranch(TerminateBB); - - CGF.EmitBlock(TerminateBB); - // Signal termination condition. - // TODO: Optimize runtime initialization and pass in correct value. - llvm::Value *Args[] = {CGF.Builder.getInt16(/*IsOMPRuntimeInitialized=*/1)}; - CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_deinit), Args); - // Barrier to terminate worker threads. - syncCTAThreads(CGF); - // Master thread jumps to exit point. - CGF.EmitBranch(EST.ExitBB); - - CGF.EmitBlock(EST.ExitBB); - EST.ExitBB = nullptr; -} - -void CGOpenMPRuntimeNVPTX::emitSPMDKernel(const OMPExecutableDirective &D, - StringRef ParentName, - llvm::Function *&OutlinedFn, - llvm::Constant *&OutlinedFnID, - bool IsOffloadEntry, - const RegionCodeGenTy &CodeGen) { ExecutionRuntimeModesRAII ModeRAII( CurrentExecutionMode, RequiresFullRuntime, CGM.getLangOpts().OpenMPCUDAForceFullRuntime || - !supportsLightweightRuntime(CGM.getContext(), D)); + !supportsLightweightRuntime(CGM.getContext(), D), IsSPMD); EntryFunctionState EST; + WrapperInfoMap.clear(); // Emit target region as a standalone region. class NVPTXPrePostActionTy : public PrePostActionTy { @@ -1309,21 +1107,24 @@ CGOpenMPRuntimeNVPTX::EntryFunctionState &EST; const OMPExecutableDirective &D; + /// Flag that is set if this is already known to be executed in SPMD mode. + bool IsSPMD; + public: NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT, CGOpenMPRuntimeNVPTX::EntryFunctionState &EST, - const OMPExecutableDirective &D) - : RT(RT), EST(EST), D(D) {} + const OMPExecutableDirective &D, bool IsSPMD) + : RT(RT), EST(EST), D(D), IsSPMD(IsSPMD) {} void Enter(CodeGenFunction &CGF) override { - RT.emitSPMDEntryHeader(CGF, EST, D); + RT.emitGenericEntryHeader(CGF, EST, D, IsSPMD); // Skip target region initialization. RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true); } void Exit(CodeGenFunction &CGF) override { RT.clearLocThreadIdInsertPt(CGF); - RT.emitSPMDEntryFooter(CGF, EST); + RT.emitGenericEntryFooter(CGF, EST, IsSPMD); } - } Action(*this, EST, D); + } Action(*this, EST, D, IsSPMD); CodeGen.setAction(Action); IsInTTDRegion = true; // Reserve place for the globalized memory. @@ -1342,37 +1143,37 @@ IsInTTDRegion = false; } -void CGOpenMPRuntimeNVPTX::emitSPMDEntryHeader( +void CGOpenMPRuntimeNVPTX::emitGenericEntryHeader( CodeGenFunction &CGF, EntryFunctionState &EST, - const OMPExecutableDirective &D) { + const OMPExecutableDirective &D, bool IsSPMD) { CGBuilderTy &Bld = CGF.Builder; // Setup BBs in entry function. llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute"); EST.ExitBB = CGF.createBasicBlock(".exit"); - llvm::Value *Args[] = {getThreadLimit(CGF, /*IsInSPMDExecutionMode=*/true), + llvm::Value *Args[] = {/* IsSPMD = */ Bld.getInt16(IsSPMD ? 1 : 0), + /* UseSP */ Bld.getInt16(1), /*RequiresOMPRuntime=*/ Bld.getInt16(RequiresFullRuntime ? 
1 : 0), /*RequiresDataSharing=*/Bld.getInt16(0)}; - CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_init), Args); - - if (RequiresFullRuntime) { - // For data sharing, we need to initialize the stack. - CGF.EmitRuntimeCall(createNVPTXRuntimeFunction( - OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd)); - } + llvm::Value *ExecuteFlag = CGF.EmitRuntimeCall( + createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_generic_kernel_init), Args); + llvm::Value *ExecuteCnd = Bld.CreateICmpEQ(ExecuteFlag, Bld.getInt16(1)); - CGF.EmitBranch(ExecuteBB); + Bld.CreateCondBr(ExecuteCnd, ExecuteBB, EST.ExitBB); CGF.EmitBlock(ExecuteBB); IsInTargetMasterThreadRegion = true; + + if (!IsSPMD) + emitGenericVarsProlog(CGF, D.getBeginLoc()); } -void CGOpenMPRuntimeNVPTX::emitSPMDEntryFooter(CodeGenFunction &CGF, - EntryFunctionState &EST) { +void CGOpenMPRuntimeNVPTX::emitGenericEntryFooter(CodeGenFunction &CGF, + EntryFunctionState &EST, + bool IsSPMD) { IsInTargetMasterThreadRegion = false; if (!CGF.HaveInsertPoint()) return; @@ -1385,11 +1186,16 @@ CGF.EmitBlock(OMPDeInitBB); // DeInitialize the OMP state in the runtime; called by all active threads. - llvm::Value *Args[] = {/*RequiresOMPRuntime=*/ + llvm::Value *Args[] = {/* IsSPMD = */ CGF.Builder.getInt16( + IsSPMD ? 1 : 0), /* RequiresOMPRuntime = */ CGF.Builder.getInt16(RequiresFullRuntime ? 1 : 0)}; + + if (!IsSPMD) + emitGenericVarsEpilog(CGF); + CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction( - OMPRTL_NVPTX__kmpc_spmd_kernel_deinit_v2), Args); + createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_generic_kernel_deinit), + Args); CGF.EmitBranch(EST.ExitBB); CGF.EmitBlock(EST.ExitBB); @@ -1412,136 +1218,6 @@ CGM.addCompilerUsedGlobal(GVMode); } -void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) { - ASTContext &Ctx = CGM.getContext(); - - CodeGenFunction CGF(CGM, /*suppressNewContext=*/true); - CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, WST.CGFI, {}, - WST.Loc, WST.Loc); - emitWorkerLoop(CGF, WST); - CGF.FinishFunction(); -} - -void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF, - WorkerFunctionState &WST) { - // - // The workers enter this loop and wait for parallel work from the master. - // When the master encounters a parallel region it sets up the work + variable - // arguments, and wakes up the workers. The workers first check to see if - // they are required for the parallel region, i.e., within the # of requested - // parallel threads. The activated workers load the variable arguments and - // execute the parallel work. - // - - CGBuilderTy &Bld = CGF.Builder; - - llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work"); - llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers"); - llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel"); - llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel"); - llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel"); - llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit"); - - CGF.EmitBranch(AwaitBB); - - // Workers wait for work from master. 
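The removal of the old master/worker machinery continues below; before it does, it is worth spelling out what the merged entry header and footer emit instead: one kernel skeleton shared by both execution modes, with the mode decision pushed into the runtime. A C-style sketch with assumed extern declarations (illustrative only; the real artifact is the IR emitted by emitGenericEntryHeader/emitGenericEntryFooter above):

    #include <cstdint>

    extern "C" int16_t __kmpc_generic_kernel_init(int16_t IsSPMD, int16_t UseSM,
                                                  int16_t RequiresOMPRuntime,
                                                  int16_t RequiresDataSharing);
    extern "C" void __kmpc_generic_kernel_deinit(int16_t IsSPMD,
                                                 int16_t RequiresOMPRuntime);

    void kernel_skeleton(int16_t IsSPMD, int16_t FullRuntime) {
      // Every thread calls init; the returned flag says whether this thread
      // executes the target region body.
      int16_t Exec = __kmpc_generic_kernel_init(IsSPMD, /*UseSM=*/1, FullRuntime,
                                                /*RequiresDataSharing=*/0);
      if (Exec == 1) { // ".execute"; in non-SPMD mode only the master gets here
        // ... target region body; in non-SPMD mode the prolog/epilog for
        //     globalized variables wraps it (emitGenericVarsProlog/Epilog) ...
        __kmpc_generic_kernel_deinit(IsSPMD, FullRuntime); // ".omp.deinit"
      }
      // ".exit": non-executing threads branch directly to here.
    }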
- CGF.EmitBlock(AwaitBB); - // Wait for parallel work - syncCTAThreads(CGF); - - Address WorkFn = - CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrTy, /*Name=*/"work_fn"); - Address ExecStatus = - CGF.CreateDefaultAlignTempAlloca(CGF.Int8Ty, /*Name=*/"exec_status"); - CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0)); - CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy)); - - // TODO: Optimize runtime initialization and pass in correct value. - llvm::Value *Args[] = {WorkFn.getPointer(), - /*RequiresOMPRuntime=*/Bld.getInt16(1)}; - llvm::Value *Ret = CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_parallel), Args); - Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus); - - // On termination condition (workid == 0), exit loop. - llvm::Value *WorkID = Bld.CreateLoad(WorkFn); - llvm::Value *ShouldTerminate = Bld.CreateIsNull(WorkID, "should_terminate"); - Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB); - - // Activate requested workers. - CGF.EmitBlock(SelectWorkersBB); - llvm::Value *IsActive = - Bld.CreateIsNotNull(Bld.CreateLoad(ExecStatus), "is_active"); - Bld.CreateCondBr(IsActive, ExecuteBB, BarrierBB); - - // Signal start of parallel region. - CGF.EmitBlock(ExecuteBB); - // Skip initialization. - setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true); - - // Process work items: outlined parallel functions. - for (llvm::Function *W : Work) { - // Try to match this outlined function. - llvm::Value *ID = Bld.CreatePointerBitCastOrAddrSpaceCast(W, CGM.Int8PtrTy); - - llvm::Value *WorkFnMatch = - Bld.CreateICmpEQ(Bld.CreateLoad(WorkFn), ID, "work_match"); - - llvm::BasicBlock *ExecuteFNBB = CGF.createBasicBlock(".execute.fn"); - llvm::BasicBlock *CheckNextBB = CGF.createBasicBlock(".check.next"); - Bld.CreateCondBr(WorkFnMatch, ExecuteFNBB, CheckNextBB); - - // Execute this outlined function. - CGF.EmitBlock(ExecuteFNBB); - - // Insert call to work function via shared wrapper. The shared - // wrapper takes two arguments: - // - the parallelism level; - // - the thread ID; - emitCall(CGF, WST.Loc, W, - {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)}); - - // Go to end of parallel region. - CGF.EmitBranch(TerminateBB); - - CGF.EmitBlock(CheckNextBB); - } - // Default case: call to outlined function through pointer if the target - // region makes a declare target call that may contain an orphaned parallel - // directive. - auto *ParallelFnTy = - llvm::FunctionType::get(CGM.VoidTy, {CGM.Int16Ty, CGM.Int32Ty}, - /*isVarArg=*/false) - ->getPointerTo(); - llvm::Value *WorkFnCast = Bld.CreateBitCast(WorkID, ParallelFnTy); - // Insert call to work function via shared wrapper. The shared - // wrapper takes two arguments: - // - the parallelism level; - // - the thread ID; - emitCall(CGF, WST.Loc, WorkFnCast, - {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)}); - // Go to end of parallel region. - CGF.EmitBranch(TerminateBB); - - // Signal end of parallel region. - CGF.EmitBlock(TerminateBB); - CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_end_parallel), - llvm::None); - CGF.EmitBranch(BarrierBB); - - // All active and inactive workers wait at a barrier after parallel region. - CGF.EmitBlock(BarrierBB); - // Barrier after parallel region. - syncCTAThreads(CGF); - CGF.EmitBranch(AwaitBB); - - // Exit target region. - CGF.EmitBlock(ExitBB); - // Skip initialization. 
- clearLocThreadIdInsertPt(CGF); -} - /// Returns specified OpenMP runtime function for the current OpenMP /// implementation. Specialized for the NVPTX device. /// \param Function OpenMP runtime function. @@ -1550,66 +1226,6 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) { llvm::Constant *RTLFn = nullptr; switch (static_cast(Function)) { - case OMPRTL_NVPTX__kmpc_kernel_init: { - // Build void __kmpc_kernel_init(kmp_int32 thread_limit, int16_t - // RequiresOMPRuntime); - llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty}; - auto *FnTy = - llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); - RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_init"); - break; - } - case OMPRTL_NVPTX__kmpc_kernel_deinit: { - // Build void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized); - llvm::Type *TypeParams[] = {CGM.Int16Ty}; - auto *FnTy = - llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); - RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_deinit"); - break; - } - case OMPRTL_NVPTX__kmpc_spmd_kernel_init: { - // Build void __kmpc_spmd_kernel_init(kmp_int32 thread_limit, - // int16_t RequiresOMPRuntime, int16_t RequiresDataSharing); - llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty}; - auto *FnTy = - llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); - RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_init"); - break; - } - case OMPRTL_NVPTX__kmpc_spmd_kernel_deinit_v2: { - // Build void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime); - llvm::Type *TypeParams[] = {CGM.Int16Ty}; - auto *FnTy = - llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); - RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_deinit_v2"); - break; - } - case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: { - /// Build void __kmpc_kernel_prepare_parallel( - /// void *outlined_function, int16_t IsOMPRuntimeInitialized); - llvm::Type *TypeParams[] = {CGM.Int8PtrTy, CGM.Int16Ty}; - auto *FnTy = - llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); - RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_prepare_parallel"); - break; - } - case OMPRTL_NVPTX__kmpc_kernel_parallel: { - /// Build bool __kmpc_kernel_parallel(void **outlined_function, - /// int16_t IsOMPRuntimeInitialized); - llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy, CGM.Int16Ty}; - llvm::Type *RetTy = CGM.getTypes().ConvertType(CGM.getContext().BoolTy); - auto *FnTy = - llvm::FunctionType::get(RetTy, TypeParams, /*isVarArg*/ false); - RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_parallel"); - break; - } - case OMPRTL_NVPTX__kmpc_kernel_end_parallel: { - /// Build void __kmpc_kernel_end_parallel(); - auto *FnTy = - llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false); - RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_end_parallel"); - break; - } case OMPRTL_NVPTX__kmpc_serialized_parallel: { // Build void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 // global_tid); @@ -1707,21 +1323,6 @@ FnTy, /*Name=*/"__kmpc_nvptx_teams_end_reduce_nowait_simple"); break; } - case OMPRTL_NVPTX__kmpc_data_sharing_init_stack: { - /// Build void __kmpc_data_sharing_init_stack(); - auto *FnTy = - llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false); - RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack"); - break; - } - case OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd: { - /// Build void __kmpc_data_sharing_init_stack_spmd(); - auto *FnTy = - 
llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false); - RTLFn = - CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack_spmd"); - break; - } case OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack: { // Build void *__kmpc_data_sharing_coalesced_push_stack(size_t size, // int16_t UseSharedMemory); @@ -1741,30 +1342,6 @@ /*Name=*/"__kmpc_data_sharing_pop_stack"); break; } - case OMPRTL_NVPTX__kmpc_begin_sharing_variables: { - /// Build void __kmpc_begin_sharing_variables(void ***args, - /// size_t n_args); - llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo(), CGM.SizeTy}; - auto *FnTy = - llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); - RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_begin_sharing_variables"); - break; - } - case OMPRTL_NVPTX__kmpc_end_sharing_variables: { - /// Build void __kmpc_end_sharing_variables(); - auto *FnTy = - llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false); - RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_sharing_variables"); - break; - } - case OMPRTL_NVPTX__kmpc_get_shared_variables: { - /// Build void __kmpc_get_shared_variables(void ***GlobalArgs); - llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo()}; - auto *FnTy = - llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); - RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_shared_variables"); - break; - } case OMPRTL_NVPTX__kmpc_parallel_level: { // Build uint16_t __kmpc_parallel_level(ident_t *loc, kmp_int32 global_tid); llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty}; @@ -1819,6 +1396,35 @@ cast(RTLFn)->addFnAttr(llvm::Attribute::Convergent); break; } + case OMPRTL_NVPTX__kmpc_generic_kernel_init: { + // Build int16_t __kmpc_generic_kernel_init(int16_t IsSPMD, int16_t UseSM, + // int16_t RequiresOMPRuntime, int16_t RequiresDataSharing); + llvm::Type *TypeParams[] = {CGM.Int16Ty, CGM.Int16Ty, CGM.Int16Ty, + CGM.Int16Ty}; + auto *FnTy = + llvm::FunctionType::get(CGM.Int16Ty, TypeParams, /*isVarArg*/ false); + RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_generic_kernel_init"); + break; + } + case OMPRTL_NVPTX__kmpc_generic_kernel_deinit: { + // Build void __kmpc_generic_kernel_deinit(int16_t IsSPMD, int16_t + // RequiredOMPRuntime); + llvm::Type *TypeParams[] = {CGM.Int16Ty, CGM.Int16Ty}; + auto *FnTy = + llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); + RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_generic_kernel_deinit"); + break; + } + case OMPRTL_NVPTX__kmpc_generic_kernel_parallel: { + // Build void __kmpc_generic_kernel_parallel(void *OutlinedFnWrapper, void + // *Payload, int16_t PayloadBytes, int16_t RequiredOMPRuntime) + llvm::Type *TypeParams[] = {CGM.VoidPtrTy, CGM.VoidPtrTy, CGM.Int16Ty, + CGM.Int16Ty}; + auto *FnTy = + llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); + RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_generic_kernel_parallel"); + break; + } } return RTLFn; } @@ -1854,15 +1460,12 @@ assert(!ParentName.empty() && "Invalid target region parent name!"); - bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D); - if (Mode) - emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry, - CodeGen); - else - emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry, - CodeGen); + bool IsSPMD = supportsSPMDExecutionMode(CGM.getContext(), D); + + emitGenericKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry, + IsSPMD, CodeGen); - setPropertyExecutionMode(CGM, 
OutlinedFn->getName(), Mode); + setPropertyExecutionMode(CGM, OutlinedFn->getName(), IsSPMD); } namespace { @@ -1958,9 +1561,7 @@ IsInTTDRegion = PrevIsInTTDRegion; if (getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD && !IsInParallelRegion) { - llvm::Function *WrapperFun = - createParallelDataSharingWrapper(OutlinedFun, D); - WrapperFunctionsMap[OutlinedFun] = WrapperFun; + createParallelDataSharingWrapper(OutlinedFun, D); } return OutlinedFun; @@ -2450,15 +2051,13 @@ if (!CGF.HaveInsertPoint()) return; - if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) - emitSPMDParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond); - else - emitNonSPMDParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond); + bool IsSPMD = getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD; + emitGenericParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond, IsSPMD); } -void CGOpenMPRuntimeNVPTX::emitNonSPMDParallelCall( +void CGOpenMPRuntimeNVPTX::emitGenericParallelCall( CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn, - ArrayRef CapturedVars, const Expr *IfCond) { + ArrayRef CapturedVars, const Expr *IfCond, bool IsSPMD) { llvm::Function *Fn = cast(OutlinedFn); // Force inline this outlined function at its call site. @@ -2500,76 +2099,64 @@ auto &&L0ParallelGen = [this, CapturedVars, Fn](CodeGenFunction &CGF, PrePostActionTy &Action) { CGBuilderTy &Bld = CGF.Builder; - llvm::Function *WFn = WrapperFunctionsMap[Fn]; - assert(WFn && "Wrapper function does not exist!"); - llvm::Value *ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy); + const WrapperInfo &WFI = WrapperInfoMap[Fn]; + assert(WFI.WrapperFn && "Wrapper function does not exist!"); + + llvm::Value *PayloadBytes = llvm::Constant::getNullValue(CGM.Int16Ty); + llvm::Value *StructAlloca = llvm::Constant::getNullValue(CGM.VoidPtrTy); + if (WFI.SharedStructTy) { + StructAlloca = + CGF.CreateDefaultAlignTempAlloca(WFI.SharedStructTy, ".captured") + .getPointer(); + const llvm::DataLayout &DL = WFI.WrapperFn->getParent()->getDataLayout(); + PayloadBytes = Bld.getInt16(DL.getTypeAllocSize(WFI.SharedStructTy)); + } - // Prepare for parallel region. Indicate the outlined function. - llvm::Value *Args[] = {ID, /*RequiresOMPRuntime=*/Bld.getInt16(1)}; - CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel), - Args); + llvm::SmallVector Args; + Args.push_back(CGF.EmitCastToVoidPtr(WFI.WrapperFn)); + Args.push_back(CGF.EmitCastToVoidPtr(StructAlloca)); + Args.push_back(PayloadBytes); + Args.push_back(/* RequiresOMPRuntime */Bld.getInt16(1)); // Create a private scope that will globalize the arguments // passed from the outside of the target region. CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF); - // There's something to share. - if (!CapturedVars.empty()) { - // Prepare for parallel region. Indicate the outlined function. - Address SharedArgs = - CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "shared_arg_refs"); - llvm::Value *SharedArgsPtr = SharedArgs.getPointer(); - - llvm::Value *DataSharingArgs[] = { - SharedArgsPtr, - llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())}; - CGF.EmitRuntimeCall(createNVPTXRuntimeFunction( - OMPRTL_NVPTX__kmpc_begin_sharing_variables), - DataSharingArgs); - - // Store variable address in a list of references to pass to workers. 
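The removed sharing code continues below. For reference while reading the replacement emission logic, the three generic entry points added to createNVPTXRuntimeFunction above take over from the mode-specific init/deinit and work-sharing functions that are deleted in this patch. Their C signatures, reconstructed from the builder comments (a sketch of assumed declarations, not the shipped runtime header):

    #include <cstdint>

    extern "C" {
    // Returns a per-thread flag the generated kernel compares against 1 to
    // decide whether the thread executes the target region body.
    int16_t __kmpc_generic_kernel_init(int16_t IsSPMD, int16_t UseSM,
                                       int16_t RequiresOMPRuntime,
                                       int16_t RequiresDataSharing);
    void __kmpc_generic_kernel_deinit(int16_t IsSPMD, int16_t RequiresOMPRuntime);
    // Hands the runtime an outlined wrapper plus a by-value payload of
    // PayloadBytes bytes that holds the captured variables.
    void __kmpc_generic_kernel_parallel(void *OutlinedFnWrapper, void *Payload,
                                        int16_t PayloadBytes,
                                        int16_t RequiresOMPRuntime);
    }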
-      unsigned Idx = 0;
-      ASTContext &Ctx = CGF.getContext();
-      Address SharedArgListAddress = CGF.EmitLoadOfPointer(
-          SharedArgs, Ctx.getPointerType(Ctx.getPointerType(Ctx.VoidPtrTy))
-                          .castAs<PointerType>());
-      for (llvm::Value *V : CapturedVars) {
-        Address Dst = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx,
-                                                 CGF.getPointerSize());
-        llvm::Value *PtrV;
-        if (V->getType()->isIntegerTy())
-          PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
-        else
-          PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
-        CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
-                              Ctx.getPointerType(Ctx.VoidPtrTy));
-        ++Idx;
-      }
-    }
-
-    // Activate workers. This barrier is used by the master to signal
-    // work for the workers.
-    syncCTAThreads(CGF);
+    assert((CapturedVars.empty() || WFI.SharedStructTy) &&
+           "Expected the shared struct type to be set!");
+    assert((CapturedVars.empty() ||
+            CapturedVars.size() == WFI.SharedStructTy->getNumElements()) &&
+           "#elements in shared struct type is not the number of captured "
+           "variables!");
 
-    // OpenMP [2.5, Parallel Construct, p.49]
-    // There is an implied barrier at the end of a parallel region. After the
-    // end of a parallel region, only the master thread of the team resumes
-    // execution of the enclosing task region.
-    //
-    // The master waits at this barrier until all workers are done.
-    syncCTAThreads(CGF);
-
-    if (!CapturedVars.empty())
-      CGF.EmitRuntimeCall(
-          createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_sharing_variables));
+    unsigned Idx = 0;
+    for (llvm::Value *V : CapturedVars) {
+      llvm::Value *GEP = Bld.CreateStructGEP(StructAlloca, Idx++);
+      Bld.CreateDefaultAlignedStore(V, GEP);
+    }
 
-    // Remember for post-processing in worker loop.
-    Work.emplace_back(WFn);
+    CGF.EmitRuntimeCall(
+        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_generic_kernel_parallel),
+        Args);
   };
 
-  auto &&LNParallelGen = [this, Loc, &SeqGen, &L0ParallelGen](
-                             CodeGenFunction &CGF, PrePostActionTy &Action) {
+  auto &&LNParallelGen = [this, Loc, &CodeGen, &SeqGen, &L0ParallelGen, IsSPMD,
+                          &ThreadIDAddr](CodeGenFunction &CGF,
+                                         PrePostActionTy &Action) {
+    if (IsSPMD) {
+      if (IsInTargetMasterThreadRegion) {
+        // In the worker we need to use the real thread id.
+        ThreadIDAddr = emitThreadIDAddress(CGF, Loc);
+        CodeGen(CGF, Action);
+      } else {
+        // If we are not in the target region, it is definitely L2 parallelism
+        // or more, because in SPMD mode we always have an L1 parallel level,
+        // so we don't need to check for orphaned directives.
+        SeqGen(CGF, Action);
+      }
+      return;
+    }
+
     if (IsInParallelRegion) {
       SeqGen(CGF, Action);
     } else if (IsInTargetMasterThreadRegion) {
@@ -2623,76 +2210,6 @@
     }
   }
 
-void CGOpenMPRuntimeNVPTX::emitSPMDParallelCall(
-    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
-    ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
-  // Just call the outlined function to execute the parallel region.
-  // OutlinedFn(&GTid, &zero, CapturedStruct);
-  //
-  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
-
-  Address ZeroAddr = CGF.CreateMemTemp(CGF.getContext().getIntTypeForBitwidth(
-                                           /*DestWidth=*/32, /*Signed=*/1),
-                                       ".zero.addr");
-  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
-  // ThreadId for serialized parallels is 0.
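The hunk above replaces the pointer-list hand-off (__kmpc_begin_sharing_variables and friends) with a by-value payload: the caller copies every captured value into a stack struct and passes the runtime a (wrapper, payload, size) triple; the wrapper reverses the packing. A self-contained sketch with a hypothetical struct layout and names (the real struct type is derived per region from the outlined function's argument types in createParallelDataSharingWrapper below):

    #include <cstdint>

    extern "C" void __kmpc_generic_kernel_parallel(void *OutlinedFnWrapper,
                                                   void *Payload,
                                                   int16_t PayloadBytes,
                                                   int16_t RequiresOMPRuntime);

    // Example layout for a region capturing loop bounds and one pointer.
    struct SharedStruct {
      int64_t LB, UB;
      double *C;
    };

    static void outlined(int32_t *Tid, int32_t *BTid, int64_t LB, int64_t UB,
                         double *C) { /* parallel region body */ }

    // Wrapper: unpack the payload and forward it to the outlined function.
    extern "C" void outlined_wrapper(void *Payload) {
      auto *S = static_cast<SharedStruct *>(Payload);
      int32_t Zero = 0;
      outlined(&Zero, &Zero, S->LB, S->UB, S->C);
    }

    void caller(double *C) {
      SharedStruct S{0, 1023, C}; // the ".captured" alloca in the emitted code
      __kmpc_generic_kernel_parallel((void *)&outlined_wrapper, &S,
                                     (int16_t)sizeof(S),
                                     /*RequiresOMPRuntime=*/1);
    }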
- Address ThreadIDAddr = ZeroAddr; - auto &&CodeGen = [this, OutlinedFn, CapturedVars, Loc, ZeroAddr, - &ThreadIDAddr](CodeGenFunction &CGF, - PrePostActionTy &Action) { - Action.Enter(CGF); - - llvm::SmallVector OutlinedFnArgs; - OutlinedFnArgs.push_back(ThreadIDAddr.getPointer()); - OutlinedFnArgs.push_back(ZeroAddr.getPointer()); - OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end()); - emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs); - }; - auto &&SeqGen = [this, &CodeGen, Loc](CodeGenFunction &CGF, - PrePostActionTy &) { - - RegionCodeGenTy RCG(CodeGen); - llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc); - llvm::Value *ThreadID = getThreadID(CGF, Loc); - llvm::Value *Args[] = {RTLoc, ThreadID}; - - NVPTXActionTy Action( - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel), - Args, - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel), - Args); - RCG.setAction(Action); - RCG(CGF); - }; - - if (IsInTargetMasterThreadRegion) { - // In the worker need to use the real thread id. - ThreadIDAddr = emitThreadIDAddress(CGF, Loc); - RegionCodeGenTy RCG(CodeGen); - RCG(CGF); - } else { - // If we are not in the target region, it is definitely L2 parallelism or - // more, because for SPMD mode we always has L1 parallel level, sowe don't - // need to check for orphaned directives. - RegionCodeGenTy RCG(SeqGen); - RCG(CGF); - } -} - -void CGOpenMPRuntimeNVPTX::syncCTAThreads(CodeGenFunction &CGF) { - // Always emit simple barriers! - if (!CGF.HaveInsertPoint()) - return; - // Build call __kmpc_barrier_simple_spmd(nullptr, 0); - // This function does not use parameters, so we can emit just default values. - llvm::Value *Args[] = { - llvm::ConstantPointerNull::get( - cast(getIdentTyPointerTy())), - llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)}; - CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction(OMPRTL__kmpc_barrier_simple_spmd), Args); -} - void CGOpenMPRuntimeNVPTX::emitBarrierCall(CodeGenFunction &CGF, SourceLocation Loc, OpenMPDirectiveKind Kind, bool, @@ -4059,29 +3576,17 @@ CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs); } -/// Emit function which wraps the outline parallel region -/// and controls the arguments which are passed to this function. -/// The wrapper ensures that the outlined function is called -/// with the correct arguments when data is shared. -llvm::Function *CGOpenMPRuntimeNVPTX::createParallelDataSharingWrapper( +void CGOpenMPRuntimeNVPTX::createParallelDataSharingWrapper( llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) { ASTContext &Ctx = CGM.getContext(); const auto &CS = *D.getCapturedStmt(OMPD_parallel); // Create a function that takes as argument the source thread. 
FunctionArgList WrapperArgs; - QualType Int16QTy = - Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false); - QualType Int32QTy = - Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false); - ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(), - /*Id=*/nullptr, Int16QTy, - ImplicitParamDecl::Other); - ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(), - /*Id=*/nullptr, Int32QTy, + ImplicitParamDecl PayloadArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(), + /*Id=*/nullptr, Ctx.VoidPtrTy, ImplicitParamDecl::Other); - WrapperArgs.emplace_back(&ParallelLevelArg); - WrapperArgs.emplace_back(&WrapperArg); + WrapperArgs.emplace_back(&PayloadArg); const CGFunctionInfo &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs); @@ -4096,35 +3601,29 @@ CodeGenFunction CGF(CGM, /*suppressNewContext=*/true); CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs, D.getBeginLoc(), D.getBeginLoc()); - - const auto *RD = CS.getCapturedRecordDecl(); - auto CurField = RD->field_begin(); + Fn->arg_begin()->setName("payload"); Address ZeroAddr = CGF.CreateMemTemp( CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1), /*Name*/ ".zero.addr"); CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0)); + + setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true); + // Get the array of arguments. SmallVector Args; - - Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).getPointer()); + Args.emplace_back(emitThreadIDAddress(CGF, D.getBeginLoc()).getPointer()); Args.emplace_back(ZeroAddr.getPointer()); CGBuilderTy &Bld = CGF.Builder; - auto CI = CS.capture_begin(); - // Use global memory for data sharing. // Handle passing of global args to workers. - Address GlobalArgs = - CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args"); - llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer(); - llvm::Value *DataSharingArgs[] = {GlobalArgsPtr}; - CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_get_shared_variables), - DataSharingArgs); + Address GlobalArgs = CGF.GetAddrOfLocalVar(&PayloadArg); // Retrieve the shared variables from the list of references returned // by the runtime. Pass the variables to the outlined function. 
+ llvm::StructType *StructTy = nullptr; + llvm::Value *StructPtr = nullptr; Address SharedArgListAddress = Address::invalid(); if (CS.capture_size() > 0 || isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) { @@ -4133,56 +3632,38 @@ .getPointerType(CGF.getContext().getPointerType( CGF.getContext().VoidPtrTy)) .castAs()); + + llvm::SmallVector StructMemberTypes; + auto ArgIt = OutlinedParallelFn->arg_begin() + 1; + auto ArgEnd = OutlinedParallelFn->arg_end(); + while (++ArgIt != ArgEnd) + StructMemberTypes.push_back(ArgIt->getType()); + + StructTy = llvm::StructType::create(OutlinedParallelFn->getContext(), + StructMemberTypes, "omp.shared.struct"); + SharedArgListAddress = Bld.CreatePointerBitCastOrAddrSpaceCast( + SharedArgListAddress, StructTy->getPointerTo()); + StructPtr = SharedArgListAddress.getPointer(); } + unsigned Idx = 0; if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) { - Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx, - CGF.getPointerSize()); - Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast( - Src, CGF.SizeTy->getPointerTo()); - llvm::Value *LB = CGF.EmitLoadOfScalar( - TypedAddress, - /*Volatile=*/false, - CGF.getContext().getPointerType(CGF.getContext().getSizeType()), - cast(D).getLowerBoundVariable()->getExprLoc()); + llvm::Value *LB = + Bld.CreateAlignedLoad(Bld.CreateStructGEP(StructPtr, Idx++), 1); + llvm::Value *UB = + Bld.CreateAlignedLoad(Bld.CreateStructGEP(StructPtr, Idx++), 1); Args.emplace_back(LB); - ++Idx; - Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx, - CGF.getPointerSize()); - TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast( - Src, CGF.SizeTy->getPointerTo()); - llvm::Value *UB = CGF.EmitLoadOfScalar( - TypedAddress, - /*Volatile=*/false, - CGF.getContext().getPointerType(CGF.getContext().getSizeType()), - cast(D).getUpperBoundVariable()->getExprLoc()); Args.emplace_back(UB); - ++Idx; - } - if (CS.capture_size() > 0) { - ASTContext &CGFContext = CGF.getContext(); - for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) { - QualType ElemTy = CurField->getType(); - Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx, - CGF.getPointerSize()); - Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast( - Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy))); - llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress, - /*Volatile=*/false, - CGFContext.getPointerType(ElemTy), - CI->getLocation()); - if (CI->capturesVariableByCopy() && - !CI->getCapturedVar()->getType()->isAnyPointerType()) { - Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(), - CI->getLocation()); - } - Args.emplace_back(Arg); - } } + for (unsigned I = 0, E = CS.capture_size(); I < E; ++I) + Args.emplace_back( + Bld.CreateAlignedLoad(Bld.CreateStructGEP(StructPtr, Idx++), 1)); emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args); CGF.FinishFunction(); - return Fn; + + WrapperInfoMap[OutlinedParallelFn] = WrapperInfo({Fn, StructTy}); + clearLocThreadIdInsertPt(CGF); } void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF, Index: llvm/include/llvm/InitializePasses.h =================================================================== --- llvm/include/llvm/InitializePasses.h +++ llvm/include/llvm/InitializePasses.h @@ -290,6 +290,7 @@ void initializeObjCARCContractPass(PassRegistry&); void initializeObjCARCExpandPass(PassRegistry&); void initializeObjCARCOptPass(PassRegistry&); +void initializeOpenMPOptPass(PassRegistry&); 
void initializeOptimizationRemarkEmitterWrapperPassPass(PassRegistry&); void initializeOptimizePHIsPass(PassRegistry&); void initializePAEvalPass(PassRegistry&); Index: llvm/include/llvm/LinkAllPasses.h =================================================================== --- llvm/include/llvm/LinkAllPasses.h +++ llvm/include/llvm/LinkAllPasses.h @@ -147,6 +147,7 @@ (void) llvm::createObjCARCExpandPass(); (void) llvm::createObjCARCContractPass(); (void) llvm::createObjCARCOptPass(); + (void) llvm::createOpenMPOptPass(); (void) llvm::createPAEvalPass(); (void) llvm::createPromoteMemoryToRegisterPass(); (void) llvm::createDemoteRegisterToMemoryPass(); Index: llvm/include/llvm/Transforms/IPO.h =================================================================== --- llvm/include/llvm/Transforms/IPO.h +++ llvm/include/llvm/Transforms/IPO.h @@ -156,6 +156,11 @@ /// ModulePass *createIPConstantPropagationPass(); +//===----------------------------------------------------------------------===// +/// createOpenMPOpt - This pass performs OpenMP specific optimizations. +/// +ModulePass *createOpenMPOptPass(); + //===----------------------------------------------------------------------===// /// createIPSCCPPass - This pass propagates constants from call sites into the /// bodies of functions, and keeps track of whether basic blocks are executable Index: llvm/lib/Transforms/IPO/CMakeLists.txt =================================================================== --- llvm/lib/Transforms/IPO/CMakeLists.txt +++ llvm/lib/Transforms/IPO/CMakeLists.txt @@ -25,6 +25,7 @@ LoopExtractor.cpp LowerTypeTests.cpp MergeFunctions.cpp + OpenMPOpt.cpp PartialInlining.cpp PassManagerBuilder.cpp PruneEH.cpp Index: llvm/lib/Transforms/IPO/IPO.cpp =================================================================== --- llvm/lib/Transforms/IPO/IPO.cpp +++ llvm/lib/Transforms/IPO/IPO.cpp @@ -35,6 +35,7 @@ initializeGlobalSplitPass(Registry); initializeHotColdSplittingLegacyPassPass(Registry); initializeIPCPPass(Registry); + initializeOpenMPOptPass(Registry); initializeAlwaysInlinerLegacyPassPass(Registry); initializeSimpleInlinerPass(Registry); initializeInferFunctionAttrsLegacyPassPass(Registry); Index: llvm/lib/Transforms/IPO/OpenMPOpt.cpp =================================================================== --- /dev/null +++ llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -0,0 +1,428 @@ +//===-- IPO/OpenMPOpt.cpp - Collection of OpenMP specific optimizations ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements interprocedural OpenMP-specific optimizations. For
+// now, this is the "SPMD-zation" of offloaded GPU kernels: generic-mode
+// kernels are converted to SPMD mode where legal, and kernels that have to
+// stay in generic mode get a custom, specialized state machine.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "openmp-opt"
+
+static cl::opt<bool> BuildCustomStateMachines(
+    "openmp-opt-build-custom-state-machines", cl::ZeroOrMore,
+    cl::desc("Build custom state machines for non-SPMD kernels."), cl::Hidden,
+    cl::init(true));
+
+STATISTIC(NumKernelsConvertedToSPMD,
+          "Number of GPU kernels converted to SPMD mode");
+STATISTIC(NumCustomStateMachinesCreated,
+          "Number of custom GPU kernel non-SPMD mode state machines created");
+STATISTIC(NumCustomStateMachinesNoFallback,
+          "Number of custom GPU kernel non-SPMD mode state machines without "
+          "fallback");
+
+static Type *getOrCreateStructIdentTypePtr(Module &M) {
+  // TODO create if not present!
+  return M.getTypeByName("struct.ident_t")->getPointerTo();
+}
+
+// TODO: Simplify function declaration
+static Function *getOrCreateFn(Type *RT, const char *Name, Module &M) {
+  Function *Fn = M.getFunction(Name);
+  if (!Fn) {
+    FunctionType *FType = FunctionType::get(RT, {}, false);
+    Fn = Function::Create(FType, llvm::GlobalVariable::ExternalLinkage, Name,
+                          M);
+  }
+  return Fn;
+}
+static Function *getOrCreateFn(Type *RT, Type *T0, Type *T1, const char *Name,
+                               Module &M) {
+  Function *Fn = M.getFunction(Name);
+  if (!Fn) {
+    FunctionType *FType = FunctionType::get(RT, {T0, T1}, false);
+    Fn = Function::Create(FType, llvm::GlobalVariable::ExternalLinkage, Name,
+                          M);
+  }
+  return Fn;
+}
+static Function *getOrCreateSimpleSPMDBarrierFn(Module &M) {
+  static const char *Name = "__kmpc_barrier_simple_spmd";
+  Function *Fn = M.getFunction(Name);
+  if (!Fn) {
+    LLVMContext &Ctx = M.getContext();
+    FunctionType *FType = FunctionType::get(
+        Type::getVoidTy(Ctx),
+        {getOrCreateStructIdentTypePtr(M), Type::getInt32Ty(Ctx)}, false);
+    Fn = Function::Create(FType, llvm::GlobalVariable::ExternalLinkage, Name,
+                          M);
+  }
+  return Fn;
+}
+
+// TODO: This should be done via attributes.
+static bool isIgnoredCall(Instruction *I) {
+  CallInst *CI = dyn_cast<CallInst>(I);
+  if (!CI || !CI->getCalledFunction())
+    return false;
+
+  return StringSwitch<bool>(CI->getCalledFunction()->getName())
+      .Case("omp_get_team_number", true)
+      .Case("__kmpc_global_thread_num", true)
+      .Case("__kmpc_for_static_init_4", true)
+      .Case("__kmpc_for_static_fini", true)
+      .Case("__kmpc_get_team_static_memory", true)
+      .Case("__kmpc_restore_team_static_memory", true)
+      .Case("llvm.nvvm.read.ptx.sreg.ntid.x", true)
+      .Case("llvm.lifetime.start.p0i8", true)
+      .Case("llvm.lifetime.end.p0i8", true)
+      .Default(false);
+}
+
+static bool isSPMDRelatedRTCall(Instruction *I) {
+  CallInst *CI = dyn_cast<CallInst>(I);
+  if (!CI || !CI->getCalledFunction())
+    return false;
+
+  return StringSwitch<bool>(CI->getCalledFunction()->getName())
+      .Case("__kmpc_generic_kernel_init", true)
+      .Case("__kmpc_generic_kernel_parallel", true)
+      .Case("__kmpc_generic_kernel_deinit", true)
+      .Default(false);
+}
+
+static void
+createCustomStateMachine(Module &M,
+                         SmallVectorImpl<Instruction *> &SideEffectInst,
+                         SmallVectorImpl<CallInst *> &RTCalls) {
+
+  // TODO use reachability to eliminate the loop and if-cascade
+
+  SmallVector<CallInst *, 8> ParallelRTCalls;
+  CallInst *InitCI = nullptr;
+  for (CallInst *CI : RTCalls) {
+    const auto &CalleeName = CI->getCalledFunction()->getName();
+    if (CalleeName.equals("__kmpc_generic_kernel_init")) {
+      assert(!InitCI && "Found multiple kernel init calls!");
+      InitCI = CI;
+      continue;
+    }
+    if (CalleeName.equals("__kmpc_generic_kernel_parallel")) {
+      ParallelRTCalls.push_back(CI);
+    }
+  }
+
+  assert(InitCI && "No kernel init call found");
+
+  // TODO: Warn or eliminate the offloading if no parallel regions are present.
+
+  ConstantInt *UseSM = dyn_cast<ConstantInt>(InitCI->getArgOperand(1));
+  if (!UseSM || !UseSM->isOne()) {
+    LLVM_DEBUG(dbgs() << "No custom state machine because of " << *InitCI
+                      << "\n");
+    return;
+  }
+
+  InitCI->setName("thread_kind");
+  LLVMContext &Ctx = InitCI->getContext();
+  Function *KernelFn = InitCI->getFunction();
+  Type *VoidTy = Type::getVoidTy(Ctx);
+  Type *BoolTy = Type::getInt1Ty(Ctx);
+  Type *I16Ty = Type::getInt16Ty(Ctx);
+  Type *VoidPtrTy = Type::getInt8PtrTy(Ctx);
+  AllocaInst *WorkFnAI =
+      new AllocaInst(VoidPtrTy, 0,
+                     "work_fn.addr", &KernelFn->getEntryBlock().front());
+
+  Instruction *IP = InitCI->getNextNode();
+  Constant *ConstZero = ConstantInt::getSigned(UseSM->getType(), 0);
+  Constant *ConstMOne = ConstantInt::getSigned(UseSM->getType(), -1);
+  InitCI->setArgOperand(1, ConstZero);
+  Instruction *WorkerCnd =
+      new ICmpInst(IP, ICmpInst::ICMP_EQ, InitCI, ConstMOne, "is_worker");
+
+  Instruction *WaitTI = SplitBlockAndInsertIfThen(WorkerCnd, IP, false);
+  BasicBlock *WaitBB = WaitTI->getParent();
+  WaitBB->setName("worker.wait");
+  IP->getParent()->setName("master_check");
+
+  Function *SimpleBarrierFn = getOrCreateSimpleSPMDBarrierFn(M);
+
+  auto AI = SimpleBarrierFn->arg_begin();
+  Instruction *BarrierCall =
+      CallInst::Create(SimpleBarrierFn,
+                       {Constant::getNullValue((AI++)->getType()),
+                        Constant::getNullValue((AI)->getType())},
+                       "", WaitTI);
+
+  Function *KernelParallelFn = getOrCreateFn(
+      BoolTy, VoidPtrTy->getPointerTo(), I16Ty, "__kmpc_kernel_parallel", M);
+
+  Value *RequiresOMPRuntime = InitCI->getArgOperand(2);
+  Instruction *ActiveCnd = CallInst::Create(
+      KernelParallelFn, {WorkFnAI, RequiresOMPRuntime}, "is_active", WaitTI);
+
+  Type *WorkFnPrototype =
+      FunctionType::get(VoidTy, {VoidPtrTy}, false)->getPointerTo();
+  Value *WorkFnAICast = BitCastInst::CreatePointerBitCastOrAddrSpaceCast(
+      WorkFnAI,
+      WorkFnPrototype->getPointerTo(), "Work_fn.addr_cast", WaitTI);
+  Value *WorkFn = new LoadInst(WorkFnAICast, "work_fn", WaitTI);
+
+  Instruction *WorkFnCnd =
+      new ICmpInst(WaitTI, ICmpInst::ICMP_EQ, WorkFn,
+                   Constant::getNullValue(WorkFn->getType()), "no_work");
+
+  Instruction *FinishedTI = SplitBlockAndInsertIfThen(WorkFnCnd, WaitTI, false);
+  FinishedTI->getParent()->setName("worker.finished");
+  WaitTI->getParent()->setName("worker.active_check");
+
+  Instruction *ActiveTI = SplitBlockAndInsertIfThen(ActiveCnd, WaitTI, false);
+  ActiveTI->getParent()->setName("worker.active");
+  WaitTI->getParent()->setName("worker.inactive");
+
+  Function *KernelGetSharedVars =
+      getOrCreateFn(VoidPtrTy, "__kmpc_get_shared_variables", M);
+  Value *SharedVars = CallInst::Create(KernelGetSharedVars, "", ActiveTI);
+
+  BasicBlock *ExecuteBB = ActiveTI->getParent();
+  BasicBlock *ParallelEndBB = SplitBlock(ExecuteBB, ActiveTI);
+  ParallelEndBB->setName("worker.parallel_end");
+
+  Function *KernelEndParallelFn =
+      getOrCreateFn(VoidTy, "__kmpc_kernel_end_parallel", M);
+  CallInst::Create(KernelEndParallelFn, "", ActiveTI);
+
+  // A fallback is required if we might not see all parallel regions
+  // (__kmpc_generic_kernel_parallel calls). This could be the case if there is
+  // an unknown function call with side effects in the target region.
+  bool RequiresFallback = std::any_of(
+      SideEffectInst.begin(), SideEffectInst.end(), [](Instruction *I) {
+        return (isa<CallInst>(I) && I->mayHaveSideEffects() &&
+                !isIgnoredCall(I));
+      });
+
+  auto MayContainParallelKernelCall = [](Function &F) {
+    for (Instruction &I : instructions(F)) {
+      if (!isa<CallInst>(I) || !I.mayHaveSideEffects())
+        continue;
+      if (isIgnoredCall(&I))
+        continue;
+      if (isSPMDRelatedRTCall(&I) &&
+          !cast<CallInst>(I).getCalledFunction()->getName().equals(
+              "__kmpc_generic_kernel_parallel"))
+        continue;
+      return true;
+    }
+    return false;
+  };
+
+  IP = ExecuteBB->getTerminator();
+  for (CallInst *ParCI : ParallelRTCalls) {
+    Function *ParFn =
+        dyn_cast<Function>(ParCI->getArgOperand(0)->stripPointerCasts());
+    // We also need to check the parallel regions (behind the
+    // __kmpc_generic_kernel_parallel calls).
+    if (!ParFn) {
+      RequiresFallback = true;
+      continue;
+    }
+    RequiresFallback |= MayContainParallelKernelCall(*ParFn);
+
+    Value *ParFnCnd =
+        new ICmpInst(IP, ICmpInst::ICMP_EQ, WorkFn, ParFn, "par_fn_check");
+    Instruction *ParFnTI = SplitBlockAndInsertIfThen(ParFnCnd, IP, false);
+    IP->getParent()->setName("worker.check.next");
+    ParFnTI->getParent()->setName("worker.execute." + ParFn->getName());
+    CallInst::Create(ParFn, {SharedVars}, "", ParFnTI);
+    ParFnTI->setSuccessor(0, ParallelEndBB);
+  }
+
+  if (RequiresFallback) {
+    CallInst::Create(WorkFn, {SharedVars}, "", IP);
+  }
+
+  BarrierCall->clone()->insertBefore(WaitTI);
+
+  FinishedTI->setSuccessor(0, WaitTI->getSuccessor(0));
+  WaitTI->setSuccessor(0, WaitBB);
+  // TODO: Add the new loop to LI!
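The code above splices a worker state machine directly into the kernel entry block; its shape is easiest to see in C-like form. A sketch under assumed declarations: the block names in the comments match the setName calls above, __kmpc_kernel_parallel, __kmpc_get_shared_variables, and __kmpc_kernel_end_parallel are the pre-existing runtime hooks the machine reuses, and par_fn_0/par_fn_1 stand in for the known parallel region wrappers:

    #include <cstdint>

    extern "C" int16_t __kmpc_generic_kernel_init(int16_t, int16_t, int16_t,
                                                  int16_t);
    extern "C" void __kmpc_barrier_simple_spmd(void *Ident, int32_t Tid);
    extern "C" bool __kmpc_kernel_parallel(void **WorkFn, int16_t RT);
    extern "C" void *__kmpc_get_shared_variables();
    extern "C" void __kmpc_kernel_end_parallel();
    extern "C" void par_fn_0(void *SharedVars); // hypothetical region wrappers
    extern "C" void par_fn_1(void *SharedVars);

    void kernel_entry() {
      // UseSM is rewritten to 0: the runtime must not run its own machine.
      int16_t thread_kind = __kmpc_generic_kernel_init(0, /*UseSM=*/0, 1, 0);
      if (thread_kind == -1) {                      // "is_worker"
        for (;;) {                                  // "worker.wait"
          __kmpc_barrier_simple_spmd(nullptr, 0);
          void *work_fn = nullptr;
          bool is_active = __kmpc_kernel_parallel(&work_fn, /*RT=*/1);
          if (!work_fn)                             // "worker.finished"
            break;
          if (is_active) {                          // "worker.active"
            void *shared = __kmpc_get_shared_variables();
            if (work_fn == (void *)&par_fn_0)       // "worker.execute.*"
              par_fn_0(shared);
            else if (work_fn == (void *)&par_fn_1)  // "worker.check.next"
              par_fn_1(shared);
            else                                    // fallback, only if needed
              ((void (*)(void *))work_fn)(shared);
            __kmpc_kernel_end_parallel();           // "worker.parallel_end"
          }
          __kmpc_barrier_simple_spmd(nullptr, 0);   // "worker.inactive"
        }
        return; // workers never fall through into the master code
      }
      // "master_check": the master continues into the sequential kernel body.
    }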
+
+  NumCustomStateMachinesCreated++;
+  NumCustomStateMachinesNoFallback += !RequiresFallback;
+}
+
+static void collectNonParallelGlobalSideEffectsInKernel(
+    CallInst *CInst, SmallVectorImpl<Instruction *> &SideEffectInst,
+    SmallVectorImpl<CallInst *> &RTCalls) {
+
+  SmallVector<Instruction *, 16> Worklist;
+  SmallPtrSet<BasicBlock *, 16> Visited;
+
+  Worklist.push_back(CInst);
+  while (!Worklist.empty()) {
+    Instruction *I = Worklist.pop_back_val();
+
+    if (isSPMDRelatedRTCall(I))
+      RTCalls.push_back(cast<CallInst>(I));
+    else if (I->mayHaveSideEffects() || I->mayReadFromMemory())
+      SideEffectInst.push_back(I);
+
+    if (!I->isTerminator()) {
+      Worklist.push_back(I->getNextNode());
+      continue;
+    }
+
+    for (BasicBlock *SuccBB : successors(I))
+      if (Visited.insert(SuccBB).second)
+        Worklist.push_back(&SuccBB->front());
+  }
+}
+
+static bool
+guardAllSideEffects(Module &M,
+                    SmallVectorImpl<Instruction *> &SideEffectInst) {
+  bool Guarded = true;
+  const DataLayout &DL = M.getDataLayout();
+  for (Instruction *I : SideEffectInst) {
+    if (CallInst *CI = dyn_cast<CallInst>(I)) {
+      if (isIgnoredCall(CI))
+        continue;
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+      if (isa<AllocaInst>(
+              SI->getPointerOperand()->stripInBoundsConstantOffsets()))
+        continue;
+    } else if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+      if (isSafeToLoadUnconditionally(LI->getPointerOperand(),
+                                      LI->getAlignment(), DL))
+        continue;
+    }
+    LLVM_DEBUG(dbgs() << "Non-SPMD side effect found: " << *I << "\n");
+    Guarded = false;
+  }
+  return Guarded;
+}
+
+static bool convertGPUKernelsToSPMD(Module &M) {
+  bool Changed = false;
+
+  Function *GenericKernelInitFn = M.getFunction("__kmpc_generic_kernel_init");
+
+  // If the kernel init function is not present or unused, we are done.
+  if (!GenericKernelInitFn || GenericKernelInitFn->getNumUses() == 0)
+    return Changed;
+
+  LLVMContext &Ctx = M.getContext();
+  for (const Use &U : GenericKernelInitFn->uses()) {
+    CallSite CS(U.getUser());
+
+    // Filter out non-callee uses.
+    if (!CS || !CS.isCallee(&U))
+      continue;
+
+    // Filter out non call-inst uses.
+    if (!isa<CallInst>(CS.getInstruction()))
+      continue;
+
+    auto *CInst = cast<CallInst>(CS.getInstruction());
+
+    // Filter out all but explicit non-SPMD cases.
+    Value *IsSPMDConstVal = CInst->getArgOperand(0);
+    if (!isa<ConstantInt>(IsSPMDConstVal) ||
+        !cast<ConstantInt>(IsSPMDConstVal)->isZero())
+      continue;
+
+    Function *KernelFn = CInst->getFunction();
+
+    // For now we require the init call to be in the entry block, not strictly
+    // necessary but it makes things easier.
+    if (CInst->getParent() != &KernelFn->getEntryBlock())
+      continue;
+
+    // Traverse the kernel from the init to the deinit call and determine if
+    // there are any global side effects outside of parallel sections. If so,
+    // we cannot compute the kernel in SPMD mode (right now).
+    SmallVector<Instruction *, 16> SideEffectInst;
+    SmallVector<CallInst *, 8> RTCalls;
+    collectNonParallelGlobalSideEffectsInKernel(CInst, SideEffectInst, RTCalls);
+    if (!guardAllSideEffects(M, SideEffectInst)) {
+      if (BuildCustomStateMachines)
+        createCustomStateMachine(M, SideEffectInst, RTCalls);
+      continue;
+    }
+
+    ConstantInt *COne = ConstantInt::get(IntegerType::getInt16Ty(Ctx), 1);
+    for (CallInst *RTCall : RTCalls) {
+      if (RTCall->getCalledFunction()->getName().equals(
+              "__kmpc_generic_kernel_parallel")) {
+        Value *Callee = RTCall->getArgOperand(0)->stripPointerCasts();
+        Value *Payload = RTCall->getArgOperand(1);
+        CallInst::Create(Callee, {Payload}, "", RTCall);
+        RTCall->eraseFromParent();
+        continue;
+      }
+
+      assert(RTCall->getArgOperand(0)->getType()->isIntegerTy(16) &&
+             "IsSPMD flag with int16_t expected!");
+      assert(isa<ConstantInt>(IsSPMDConstVal) &&
+             "Constant IsSPMD flag expected!");
+      assert(cast<ConstantInt>(IsSPMDConstVal)->isZero() &&
+             "Consistent IsSPMD flags expected!");
+
+      RTCall->setArgOperand(0, COne);
+      continue;
+    }
+
+    GlobalVariable *ExecMode =
+        M.getGlobalVariable((KernelFn->getName() + "_exec_mode").str());
+    assert(ExecMode &&
+           "Assumed to find an execution mode hint among the globals");
+    assert(ExecMode->getInitializer()->isOneValue() &&
+           "Assumed generic execution mode prior to 'SPMD'-zation");
+    ExecMode->setInitializer(
+        Constant::getNullValue(ExecMode->getInitializer()->getType()));
+
+    NumKernelsConvertedToSPMD++;
+
+    Changed = true;
+  }
+
+  return Changed;
+}
+
+namespace {
+/// OpenMPOpt - The interprocedural OpenMP optimization pass
+struct OpenMPOpt : public ModulePass {
+  static char ID; // Pass identification, replacement for typeid
+  OpenMPOpt() : ModulePass(ID) {
+    initializeOpenMPOptPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnModule(Module &M) override {
+    bool Changed = false;
+    Changed |= convertGPUKernelsToSPMD(M);
+    return Changed;
+  }
+};
+} // namespace
+
+char OpenMPOpt::ID = 0;
+INITIALIZE_PASS(OpenMPOpt, "openmp-opt", "OpenMP specific optimizations", false,
+                false)
+
+ModulePass *llvm::createOpenMPOptPass() { return new OpenMPOpt(); }
Index: llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
===================================================================
--- llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -546,6 +546,8 @@
   addExtensionsToPM(EP_CGSCCOptimizerLate, MPM);
   addFunctionSimplificationPasses(MPM);
 
+  MPM.add(createOpenMPOptPass());
+
   // FIXME: This is a HACK! The inliner pass above implicitly creates a CGSCC
   // pass manager that we are specifically trying to avoid. To prevent this
   // we must insert a no-op module pass to reset the pass manager.
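With the registration hooks above in place, the default pipeline picks the pass up via the PassManagerBuilder hunk, and INITIALIZE_PASS registers it under the name "openmp-opt", so it should also be runnable standalone through opt with the legacy pass manager. A minimal programmatic driver sketch:

    #include "llvm/IR/LegacyPassManager.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Transforms/IPO.h"

    // Run only the new OpenMP optimization pass over a module.
    void runOpenMPOpt(llvm::Module &M) {
      llvm::legacy::PassManager PM;
      PM.add(llvm::createOpenMPOptPass());
      PM.run(M);
    }

The reference IR below shows the pass's end result on the example kernel: the init/deinit calls carry IsSPMD = 1, the __kmpc_generic_kernel_parallel calls have been replaced by direct calls to the wrappers, and the *_exec_mode global is flipped to 0.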
Index: llvm/test/Transforms/OpenMP/target_offload_late_SPMD.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/OpenMP/target_offload_late_SPMD.ll
@@ -0,0 +1,344 @@
+; ModuleID = '/tmp/target_offload_new.ll'
+source_filename = "../llvm/test/Transforms/OpenMP/target_offload_to_SPMD.c"
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvida-cuda"
+
+%struct.ident_t = type { i32, i32, i32, i32, i8* }
+%omp.shared.struct = type { i64, i64, double*, i32*, float* }
+%omp.shared.struct.0 = type { i64, i64, double*, i32*, float* }
+
+@"_openmp_kernel_static_glob_rd$ptr" = internal addrspace(3) global i8* null
+@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
+@0 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2050, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8
+@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8
+@2 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8
+@__omp_offloading_18_280394b_foo_l3_exec_mode = weak constant i8 0
+@llvm.compiler.used = appending global [1 x i8*] [i8* @__omp_offloading_18_280394b_foo_l3_exec_mode], section "llvm.metadata"
+
+; Function Attrs: norecurse nounwind
+define weak void @__omp_offloading_18_280394b_foo_l3(i32* %a, float* %b, double* %c) #0 {
+entry:
+  %.omp.comb.lb.i = alloca i32, align 4
+  %.omp.comb.ub.i = alloca i32, align 4
+  %.omp.stride.i = alloca i32, align 4
+  %.omp.is_last.i = alloca i32, align 4
+  %.captured.i = alloca %omp.shared.struct, align 8
+  %.omp.comb.lb4.i = alloca i32, align 4
+  %.omp.comb.ub5.i = alloca i32, align 4
+  %.omp.stride6.i = alloca i32, align 4
+  %.omp.is_last7.i = alloca i32, align 4
+  %.captured18.i = alloca %omp.shared.struct.0, align 8
+  %0 = call i16 @__kmpc_generic_kernel_init(i16 1, i16 1, i16 1, i16 0)
+  %1 = icmp eq i16 %0, 0
+  br i1 %1, label %.execute, label %.exit
+
+.execute:                                         ; preds = %entry
+  %2 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2)
+  store i32 0, i32* %.omp.comb.lb.i, align 4, !noalias !5
+  store i32 1023, i32* %.omp.comb.ub.i, align 4, !noalias !5
+  store i32 1, i32* %.omp.stride.i, align 4, !noalias !5
+  store i32 0, i32* %.omp.is_last.i, align 4, !noalias !5
+  call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %2, i32 92, i32* %.omp.is_last.i, i32* %.omp.comb.lb.i, i32* %.omp.comb.ub.i, i32* %.omp.stride.i, i32 1, i32 1) #2
+  %3 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !5
+  %cmp.i = icmp sgt i32 %3, 1023
+  br i1 %cmp.i, label %cond.true.i, label %cond.false.i
+
+cond.true.i:                                      ; preds = %.execute
+  br label %cond.end.i
+
+cond.false.i:                                     ; preds = %.execute
+  %4 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !5
+  br label %cond.end.i
+
+cond.end.i:                                       ; preds = %cond.false.i, %cond.true.i
+  %cond.i = phi i32 [ 1023, %cond.true.i ], [ %4, %cond.false.i ]
+  store i32 %cond.i, i32* %.omp.comb.ub.i, align 4, !noalias !5
+  %5 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !5
+  br label %omp.inner.for.cond.i
+
+omp.inner.for.cond.i:                             ; preds = %omp.inner.for.body.i, %cond.end.i
+  %.omp.iv.i.0 = phi i32 [ %5, %cond.end.i ], [ %add.i, %omp.inner.for.body.i ]
+  %6 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !5
+  %cmp1.i = icmp sle i32 %.omp.iv.i.0,
%6 + br i1 %cmp1.i, label %omp.inner.for.body.i, label %omp.inner.for.end.i + +omp.inner.for.body.i: ; preds = %omp.inner.for.cond.i + %7 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !5 + %8 = zext i32 %7 to i64 + %9 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !5 + %10 = zext i32 %9 to i64 + %11 = bitcast %omp.shared.struct* %.captured.i to i8* + %12 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i32 0, i32 0 + store i64 %8, i64* %12, !noalias !5 + %13 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i32 0, i32 1 + store i64 %10, i64* %13, !noalias !5 + %14 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i32 0, i32 2 + store double* %c, double** %14, !noalias !5 + %15 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i32 0, i32 3 + store i32* %a, i32** %15, !noalias !5 + %16 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i32 0, i32 4 + store float* %b, float** %16, !noalias !5 + call void @__omp_outlined__1_wrapper(i8* %11) + %17 = load i32, i32* %.omp.stride.i, align 4, !noalias !5 + %add.i = add nsw i32 %.omp.iv.i.0, %17 + br label %omp.inner.for.cond.i + +omp.inner.for.end.i: ; preds = %omp.inner.for.cond.i + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %2) #2 + store i32 0, i32* %.omp.comb.lb4.i, align 4, !noalias !5 + store i32 1023, i32* %.omp.comb.ub5.i, align 4, !noalias !5 + store i32 1, i32* %.omp.stride6.i, align 4, !noalias !5 + store i32 0, i32* %.omp.is_last7.i, align 4, !noalias !5 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %2, i32 92, i32* %.omp.is_last7.i, i32* %.omp.comb.lb4.i, i32* %.omp.comb.ub5.i, i32* %.omp.stride6.i, i32 1, i32 1) #2 + %18 = load i32, i32* %.omp.comb.ub5.i, align 4, !noalias !5 + %cmp9.i = icmp sgt i32 %18, 1023 + br i1 %cmp9.i, label %cond.true10.i, label %cond.false11.i + +cond.true10.i: ; preds = %omp.inner.for.end.i + br label %cond.end12.i + +cond.false11.i: ; preds = %omp.inner.for.end.i + %19 = load i32, i32* %.omp.comb.ub5.i, align 4, !noalias !5 + br label %cond.end12.i + +cond.end12.i: ; preds = %cond.false11.i, %cond.true10.i + %cond13.i = phi i32 [ 1023, %cond.true10.i ], [ %19, %cond.false11.i ] + store i32 %cond13.i, i32* %.omp.comb.ub5.i, align 4, !noalias !5 + %20 = load i32, i32* %.omp.comb.lb4.i, align 4, !noalias !5 + br label %omp.inner.for.cond14.i + +omp.inner.for.cond14.i: ; preds = %omp.inner.for.body16.i, %cond.end12.i + %.omp.iv2.i.0 = phi i32 [ %20, %cond.end12.i ], [ %add20.i, %omp.inner.for.body16.i ] + %21 = load i32, i32* %.omp.comb.ub5.i, align 4, !noalias !5 + %cmp15.i = icmp sle i32 %.omp.iv2.i.0, %21 + br i1 %cmp15.i, label %omp.inner.for.body16.i, label %__omp_outlined__.exit + +omp.inner.for.body16.i: ; preds = %omp.inner.for.cond14.i + %22 = load i32, i32* %.omp.comb.lb4.i, align 4, !noalias !5 + %23 = zext i32 %22 to i64 + %24 = load i32, i32* %.omp.comb.ub5.i, align 4, !noalias !5 + %25 = zext i32 %24 to i64 + %26 = bitcast %omp.shared.struct.0* %.captured18.i to i8* + %27 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %.captured18.i, i32 0, i32 0 + store i64 %23, i64* %27, !noalias !5 + %28 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %.captured18.i, i32 0, i32 1 + store i64 %25, i64* %28, !noalias !5 + %29 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %.captured18.i, i32 0, i32 2 + store double* %c, double** %29, !noalias !5 + %30 = 
getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %.captured18.i, i32 0, i32 3 + store i32* %a, i32** %30, !noalias !5 + %31 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %.captured18.i, i32 0, i32 4 + store float* %b, float** %31, !noalias !5 + call void @__omp_outlined__2_wrapper(i8* %26) + %32 = load i32, i32* %.omp.stride6.i, align 4, !noalias !5 + %add20.i = add nsw i32 %.omp.iv2.i.0, %32 + br label %omp.inner.for.cond14.i + +__omp_outlined__.exit: ; preds = %omp.inner.for.cond14.i + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %2) #2 + br label %.omp.deinit + +.omp.deinit: ; preds = %__omp_outlined__.exit + call void @__kmpc_generic_kernel_deinit(i16 1, i16 1) + br label %.exit + +.exit: ; preds = %.omp.deinit, %entry + ret void +} + +declare i16 @__kmpc_generic_kernel_init(i16, i16, i16, i16) + +declare void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) + +declare void @__kmpc_for_static_fini(%struct.ident_t*, i32) + +; Function Attrs: norecurse nounwind +define internal void @__omp_outlined__1_wrapper(i8* %payload) #0 { +entry: + %.omp.lb.i = alloca i32, align 4 + %.omp.ub.i = alloca i32, align 4 + %.omp.stride.i = alloca i32, align 4 + %.omp.is_last.i = alloca i32, align 4 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + %1 = bitcast i8* %payload to %omp.shared.struct* + %2 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %1, i32 0, i32 0 + %3 = load i64, i64* %2, align 1 + %4 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %1, i32 0, i32 1 + %5 = load i64, i64* %4, align 1 + %6 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %1, i32 0, i32 2 + %7 = load double*, double** %6, align 1 + %8 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %1, i32 0, i32 3 + %9 = load i32*, i32** %8, align 1 + %10 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %1, i32 0, i32 4 + %11 = load float*, float** %10, align 1 + %12 = bitcast i32* %.omp.lb.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %12) + %13 = bitcast i32* %.omp.ub.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %13) + %14 = bitcast i32* %.omp.stride.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %14) + %15 = bitcast i32* %.omp.is_last.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %15) + store i32 0, i32* %.omp.lb.i, align 4, !noalias !9 + store i32 1023, i32* %.omp.ub.i, align 4, !noalias !9 + %conv.i = trunc i64 %3 to i32 + %conv1.i = trunc i64 %5 to i32 + store i32 %conv.i, i32* %.omp.lb.i, align 4, !noalias !9 + store i32 %conv1.i, i32* %.omp.ub.i, align 4, !noalias !9 + store i32 1, i32* %.omp.stride.i, align 4, !noalias !9 + store i32 0, i32* %.omp.is_last.i, align 4, !noalias !9 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %0, i32 33, i32* %.omp.is_last.i, i32* %.omp.lb.i, i32* %.omp.ub.i, i32* %.omp.stride.i, i32 1, i32 1) #2, !noalias !9 + %16 = load i32, i32* %.omp.lb.i, align 4, !noalias !9 + br label %omp.inner.for.cond.i + +omp.inner.for.cond.i: ; preds = %omp.inner.for.body.i, %entry + %.omp.iv.0.i = phi i32 [ %16, %entry ], [ %add12.i, %omp.inner.for.body.i ] + %conv2.i = sext i32 %.omp.iv.0.i to i64 + %cmp.i = icmp ule i64 %conv2.i, %5 + br i1 %cmp.i, label %omp.inner.for.body.i, label %__omp_outlined__1.exit + +omp.inner.for.body.i: ; preds = %omp.inner.for.cond.i + %idxprom.i = sext i32 %.omp.iv.0.i to i64 + %arrayidx.i = getelementptr inbounds i32, i32* %9, i64 
%idxprom.i + %17 = load i32, i32* %arrayidx.i, align 4, !noalias !9 + %conv4.i = sitofp i32 %17 to float + %idxprom5.i = sext i32 %.omp.iv.0.i to i64 + %arrayidx6.i = getelementptr inbounds float, float* %11, i64 %idxprom5.i + %18 = load float, float* %arrayidx6.i, align 4, !noalias !9 + %mul7.i = fmul float %conv4.i, %18 + %conv8.i = fpext float %mul7.i to double + %idxprom9.i = sext i32 %.omp.iv.0.i to i64 + %arrayidx10.i = getelementptr inbounds double, double* %7, i64 %idxprom9.i + %19 = load double, double* %arrayidx10.i, align 8, !noalias !9 + %add11.i = fadd double %19, %conv8.i + store double %add11.i, double* %arrayidx10.i, align 8, !noalias !9 + %20 = load i32, i32* %.omp.stride.i, align 4, !noalias !9 + %add12.i = add nsw i32 %.omp.iv.0.i, %20 + br label %omp.inner.for.cond.i + +__omp_outlined__1.exit: ; preds = %omp.inner.for.cond.i + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %0) #2, !noalias !9 + %21 = bitcast i32* %.omp.lb.i to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %21) + %22 = bitcast i32* %.omp.ub.i to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %22) + %23 = bitcast i32* %.omp.stride.i to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %23) + %24 = bitcast i32* %.omp.is_last.i to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %24) + ret void +} + +declare i32 @__kmpc_global_thread_num(%struct.ident_t*) + +declare void @__kmpc_generic_kernel_parallel(i8*, i8*, i16, i16) + +; Function Attrs: norecurse nounwind +define internal void @__omp_outlined__2_wrapper(i8* %payload) #0 { +entry: + %.omp.lb.i = alloca i32, align 4 + %.omp.ub.i = alloca i32, align 4 + %.omp.stride.i = alloca i32, align 4 + %.omp.is_last.i = alloca i32, align 4 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + %1 = bitcast i8* %payload to %omp.shared.struct.0* + %2 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %1, i32 0, i32 0 + %3 = load i64, i64* %2, align 1 + %4 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %1, i32 0, i32 1 + %5 = load i64, i64* %4, align 1 + %6 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %1, i32 0, i32 2 + %7 = load double*, double** %6, align 1 + %8 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %1, i32 0, i32 3 + %9 = load i32*, i32** %8, align 1 + %10 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %1, i32 0, i32 4 + %11 = load float*, float** %10, align 1 + %12 = bitcast i32* %.omp.lb.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %12) + %13 = bitcast i32* %.omp.ub.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %13) + %14 = bitcast i32* %.omp.stride.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %14) + %15 = bitcast i32* %.omp.is_last.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %15) + store i32 0, i32* %.omp.lb.i, align 4, !noalias !12 + store i32 1023, i32* %.omp.ub.i, align 4, !noalias !12 + %conv.i = trunc i64 %3 to i32 + %conv1.i = trunc i64 %5 to i32 + store i32 %conv.i, i32* %.omp.lb.i, align 4, !noalias !12 + store i32 %conv1.i, i32* %.omp.ub.i, align 4, !noalias !12 + store i32 1, i32* %.omp.stride.i, align 4, !noalias !12 + store i32 0, i32* %.omp.is_last.i, align 4, !noalias !12 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %0, i32 33, i32* %.omp.is_last.i, i32* %.omp.lb.i, i32* %.omp.ub.i, i32* %.omp.stride.i, i32 1, i32 1) #2, !noalias !12 + %16 = load i32, i32* %.omp.lb.i, align 4, !noalias !12 + br label %omp.inner.for.cond.i + 
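+; Worksharing loop of the second outlined parallel region: each thread walks
+; its static chunk and updates c[i] += (double)((float)a[i] * b[i]).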
+omp.inner.for.cond.i:                             ; preds = %omp.inner.for.body.i, %entry
+  %.omp.iv.0.i = phi i32 [ %16, %entry ], [ %add12.i, %omp.inner.for.body.i ]
+  %conv2.i = sext i32 %.omp.iv.0.i to i64
+  %cmp.i = icmp ule i64 %conv2.i, %5
+  br i1 %cmp.i, label %omp.inner.for.body.i, label %__omp_outlined__2.exit
+
+omp.inner.for.body.i:                             ; preds = %omp.inner.for.cond.i
+  %idxprom.i = sext i32 %.omp.iv.0.i to i64
+  %arrayidx.i = getelementptr inbounds i32, i32* %9, i64 %idxprom.i
+  %17 = load i32, i32* %arrayidx.i, align 4, !noalias !12
+  %conv4.i = sitofp i32 %17 to float
+  %idxprom5.i = sext i32 %.omp.iv.0.i to i64
+  %arrayidx6.i = getelementptr inbounds float, float* %11, i64 %idxprom5.i
+  %18 = load float, float* %arrayidx6.i, align 4, !noalias !12
+  %mul7.i = fmul float %conv4.i, %18
+  %conv8.i = fpext float %mul7.i to double
+  %idxprom9.i = sext i32 %.omp.iv.0.i to i64
+  %arrayidx10.i = getelementptr inbounds double, double* %7, i64 %idxprom9.i
+  %19 = load double, double* %arrayidx10.i, align 8, !noalias !12
+  %add11.i = fadd double %19, %conv8.i
+  store double %add11.i, double* %arrayidx10.i, align 8, !noalias !12
+  %20 = load i32, i32* %.omp.stride.i, align 4, !noalias !12
+  %add12.i = add nsw i32 %.omp.iv.0.i, %20
+  br label %omp.inner.for.cond.i
+
+__omp_outlined__2.exit:                           ; preds = %omp.inner.for.cond.i
+  call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %0) #2, !noalias !12
+  %21 = bitcast i32* %.omp.lb.i to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %21)
+  %22 = bitcast i32* %.omp.ub.i to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %22)
+  %23 = bitcast i32* %.omp.stride.i to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %23)
+  %24 = bitcast i32* %.omp.is_last.i to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %24)
+  ret void
+}
+
+declare void @__kmpc_generic_kernel_deinit(i16, i16)
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
+
+attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { argmemonly nounwind }
+attributes #2 = { nounwind }
+
+!omp_offload.info = !{!0}
+!nvvm.annotations = !{!1}
+!llvm.module.flags = !{!2, !3}
+!llvm.ident = !{!4}
+
+!0 = !{i32 0, i32 24, i32 41957707, !"foo", i32 3, i32 0}
+!1 = !{void (i32*, float*, double*)* @__omp_offloading_18_280394b_foo_l3, !"kernel", i32 1}
+!2 = !{i32 1, !"wchar_size", i32 4}
+!3 = !{i32 7, !"PIC Level", i32 2}
+!4 = !{!"clang version 9.0.0 "}
+!5 = !{!6, !8}
+!6 = distinct !{!6, !7, !"__omp_outlined__: %.global_tid."}
+!7 = distinct !{!7, !"__omp_outlined__"}
+!8 = distinct !{!8, !7, !"__omp_outlined__: %.bound_tid."}
+!9 = !{!10}
+!10 = distinct !{!10, !11, !"__omp_outlined__1: %.global_tid."}
+!11 = distinct !{!11, !"__omp_outlined__1"}
+!12 = !{!13}
+!13 = distinct !{!13, !14, !"__omp_outlined__2: %.global_tid."}
+!14 = distinct !{!14, !"__omp_outlined__2"}
Index: llvm/test/Transforms/OpenMP/target_offload_new.ll
===================================================================
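Note: the device-side modules in the files below were compiled from
../llvm/test/Transforms/OpenMP/target_offload_to_SPMD.c, which is not included
in this patch. A minimal sketch consistent with the IR (two combined
"distribute parallel for" loops over 1024 elements inside a single
"target teams" region, each computing c[i] += a[i] * b[i]) would look as
follows; the map clauses are an assumption, only the loop structure and the
element computation are confirmed by the IR:

  #define N 1024

  /* Hypothetical reconstruction, not part of the patch. */
  void foo(int *a, float *b, double *c) {
    #pragma omp target teams map(to: a[:N], b[:N]) map(tofrom: c[:N])
    {
      #pragma omp distribute parallel for
      for (int i = 0; i < N; i++)
        c[i] += a[i] * b[i];   /* int * float -> float, widened to double */

      #pragma omp distribute parallel for
      for (int i = 0; i < N; i++)
        c[i] += a[i] * b[i];
    }
  }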
--- /dev/null +++ llvm/test/Transforms/OpenMP/target_offload_new.ll @@ -0,0 +1,468 @@ + +; __CLANG_OFFLOAD_BUNDLE____START__ openmp-nvptx64-nvida-cud +; ModuleID = '../llvm/test/Transforms/OpenMP/target_offload_to_SPMD.c' +source_filename = "../llvm/test/Transforms/OpenMP/target_offload_to_SPMD.c" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvida-cud" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } +%omp.shared.struct = type { i64, i64, double*, i32*, float* } +%omp.shared.struct.0 = type { i64, i64, double*, i32*, float* } + +@"_openmp_kernel_static_glob_rd$ptr" = internal addrspace(3) global i8* null +@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2050, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@2 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@__omp_offloading_18_280394b_foo_l3_exec_mode = weak constant i8 1 +@llvm.compiler.used = appending global [1 x i8*] [i8* @__omp_offloading_18_280394b_foo_l3_exec_mode], section "llvm.metadata" + +; Function Attrs: norecurse nounwind +define weak void @__omp_offloading_18_280394b_foo_l3(i32* %a, float* %b, double* %c) #0 { +entry: + %.global_tid..addr.i = alloca i32*, align 8 + %.bound_tid..addr.i = alloca i32*, align 8 + %a.addr.i = alloca i32*, align 8 + %b.addr.i = alloca float*, align 8 + %c.addr.i = alloca double*, align 8 + %.omp.iv.i = alloca i32, align 4 + %tmp.i = alloca i32, align 4 + %.omp.comb.lb.i = alloca i32, align 4 + %.omp.comb.ub.i = alloca i32, align 4 + %.omp.stride.i = alloca i32, align 4 + %.omp.is_last.i = alloca i32, align 4 + %i.i = alloca i32, align 4 + %.zero.addr.i = alloca i32, align 4 + %.captured.i = alloca %omp.shared.struct, align 8 + %.omp.iv2.i = alloca i32, align 4 + %tmp3.i = alloca i32, align 4 + %.omp.comb.lb4.i = alloca i32, align 4 + %.omp.comb.ub5.i = alloca i32, align 4 + %.omp.stride6.i = alloca i32, align 4 + %.omp.is_last7.i = alloca i32, align 4 + %i8.i = alloca i32, align 4 + %.zero.addr17.i = alloca i32, align 4 + %.captured18.i = alloca %omp.shared.struct.0, align 8 + %a.addr = alloca i32*, align 8 + %b.addr = alloca float*, align 8 + %c.addr = alloca double*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. 
= alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i32* %a, i32** %a.addr, align 8 + store float* %b, float** %b.addr, align 8 + store double* %c, double** %c.addr, align 8 + %0 = call i16 @__kmpc_generic_kernel_init(i16 0, i16 1, i16 1, i16 0) + %1 = icmp eq i16 %0, 0 + br i1 %1, label %.execute, label %.exit + +.execute: ; preds = %entry + %2 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + %3 = load i32*, i32** %a.addr, align 8 + %4 = load float*, float** %b.addr, align 8 + %5 = load double*, double** %c.addr, align 8 + store i32 %2, i32* %.threadid_temp., align 4 + store i32 0, i32* %.zero.addr17.i, align 4, !noalias !5 + store i32 0, i32* %.zero.addr.i, align 4, !noalias !5 + store i32* %.threadid_temp., i32** %.global_tid..addr.i, align 8, !noalias !5 + store i32* %.zero.addr, i32** %.bound_tid..addr.i, align 8, !noalias !5 + store i32* %3, i32** %a.addr.i, align 8, !noalias !5 + store float* %4, float** %b.addr.i, align 8, !noalias !5 + store double* %5, double** %c.addr.i, align 8, !noalias !5 + store i32 0, i32* %.omp.comb.lb.i, align 4, !noalias !5 + store i32 1023, i32* %.omp.comb.ub.i, align 4, !noalias !5 + store i32 1, i32* %.omp.stride.i, align 4, !noalias !5 + store i32 0, i32* %.omp.is_last.i, align 4, !noalias !5 + %6 = load i32*, i32** %.global_tid..addr.i, align 8, !noalias !5 + %7 = load i32, i32* %6, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %7, i32 92, i32* %.omp.is_last.i, i32* %.omp.comb.lb.i, i32* %.omp.comb.ub.i, i32* %.omp.stride.i, i32 1, i32 1) #2 + %8 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !5 + %cmp.i = icmp sgt i32 %8, 1023 + br i1 %cmp.i, label %cond.true.i, label %cond.false.i + +cond.true.i: ; preds = %.execute + br label %cond.end.i + +cond.false.i: ; preds = %.execute + %9 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !5 + br label %cond.end.i + +cond.end.i: ; preds = %cond.false.i, %cond.true.i + %cond.i = phi i32 [ 1023, %cond.true.i ], [ %9, %cond.false.i ] + store i32 %cond.i, i32* %.omp.comb.ub.i, align 4, !noalias !5 + %10 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !5 + store i32 %10, i32* %.omp.iv.i, align 4, !noalias !5 + br label %omp.inner.for.cond.i + +omp.inner.for.cond.i: ; preds = %omp.inner.for.body.i, %cond.end.i + %11 = load i32, i32* %.omp.iv.i, align 4, !noalias !5 + %12 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !5 + %cmp1.i = icmp sle i32 %11, %12 + br i1 %cmp1.i, label %omp.inner.for.body.i, label %omp.inner.for.end.i + +omp.inner.for.body.i: ; preds = %omp.inner.for.cond.i + %13 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !5 + %14 = zext i32 %13 to i64 + %15 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !5 + %16 = zext i32 %15 to i64 + %17 = load double*, double** %c.addr.i, align 8, !noalias !5 + %18 = load i32*, i32** %a.addr.i, align 8, !noalias !5 + %19 = load float*, float** %b.addr.i, align 8, !noalias !5 + %20 = bitcast %omp.shared.struct* %.captured.i to i8* + %21 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i32 0, i32 0 + store i64 %14, i64* %21, !noalias !5 + %22 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i32 0, i32 1 + store i64 %16, i64* %22, !noalias !5 + %23 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i32 0, i32 2 + store double* %17, double** %23, !noalias !5 + %24 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i32 0, i32 3 + store i32* %18, i32** %24, !noalias !5 + %25 = 
getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i32 0, i32 4 + store float* %19, float** %25, !noalias !5 + call void @__kmpc_generic_kernel_parallel(i8* bitcast (void (i8*)* @__omp_outlined__1_wrapper to i8*), i8* %20, i16 40, i16 1) #2 + %26 = load i32, i32* %.omp.iv.i, align 4, !noalias !5 + %27 = load i32, i32* %.omp.stride.i, align 4, !noalias !5 + %add.i = add nsw i32 %26, %27 + store i32 %add.i, i32* %.omp.iv.i, align 4, !noalias !5 + br label %omp.inner.for.cond.i + +omp.inner.for.end.i: ; preds = %omp.inner.for.cond.i + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %7) #2 + store i32 0, i32* %.omp.comb.lb4.i, align 4, !noalias !5 + store i32 1023, i32* %.omp.comb.ub5.i, align 4, !noalias !5 + store i32 1, i32* %.omp.stride6.i, align 4, !noalias !5 + store i32 0, i32* %.omp.is_last7.i, align 4, !noalias !5 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %7, i32 92, i32* %.omp.is_last7.i, i32* %.omp.comb.lb4.i, i32* %.omp.comb.ub5.i, i32* %.omp.stride6.i, i32 1, i32 1) #2 + %28 = load i32, i32* %.omp.comb.ub5.i, align 4, !noalias !5 + %cmp9.i = icmp sgt i32 %28, 1023 + br i1 %cmp9.i, label %cond.true10.i, label %cond.false11.i + +cond.true10.i: ; preds = %omp.inner.for.end.i + br label %cond.end12.i + +cond.false11.i: ; preds = %omp.inner.for.end.i + %29 = load i32, i32* %.omp.comb.ub5.i, align 4, !noalias !5 + br label %cond.end12.i + +cond.end12.i: ; preds = %cond.false11.i, %cond.true10.i + %cond13.i = phi i32 [ 1023, %cond.true10.i ], [ %29, %cond.false11.i ] + store i32 %cond13.i, i32* %.omp.comb.ub5.i, align 4, !noalias !5 + %30 = load i32, i32* %.omp.comb.lb4.i, align 4, !noalias !5 + store i32 %30, i32* %.omp.iv2.i, align 4, !noalias !5 + br label %omp.inner.for.cond14.i + +omp.inner.for.cond14.i: ; preds = %omp.inner.for.body16.i, %cond.end12.i + %31 = load i32, i32* %.omp.iv2.i, align 4, !noalias !5 + %32 = load i32, i32* %.omp.comb.ub5.i, align 4, !noalias !5 + %cmp15.i = icmp sle i32 %31, %32 + br i1 %cmp15.i, label %omp.inner.for.body16.i, label %__omp_outlined__.exit + +omp.inner.for.body16.i: ; preds = %omp.inner.for.cond14.i + %33 = load i32, i32* %.omp.comb.lb4.i, align 4, !noalias !5 + %34 = zext i32 %33 to i64 + %35 = load i32, i32* %.omp.comb.ub5.i, align 4, !noalias !5 + %36 = zext i32 %35 to i64 + %37 = load double*, double** %c.addr.i, align 8, !noalias !5 + %38 = load i32*, i32** %a.addr.i, align 8, !noalias !5 + %39 = load float*, float** %b.addr.i, align 8, !noalias !5 + %40 = bitcast %omp.shared.struct.0* %.captured18.i to i8* + %41 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %.captured18.i, i32 0, i32 0 + store i64 %34, i64* %41, !noalias !5 + %42 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %.captured18.i, i32 0, i32 1 + store i64 %36, i64* %42, !noalias !5 + %43 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %.captured18.i, i32 0, i32 2 + store double* %37, double** %43, !noalias !5 + %44 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %.captured18.i, i32 0, i32 3 + store i32* %38, i32** %44, !noalias !5 + %45 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %.captured18.i, i32 0, i32 4 + store float* %39, float** %45, !noalias !5 + call void @__kmpc_generic_kernel_parallel(i8* bitcast (void (i8*)* @__omp_outlined__2_wrapper to i8*), i8* %40, i16 40, i16 1) #2 + %46 = load i32, i32* %.omp.iv2.i, align 4, !noalias !5 + %47 = load i32, i32* %.omp.stride6.i, align 4, !noalias !5 + %add20.i 
= add nsw i32 %46, %47 + store i32 %add20.i, i32* %.omp.iv2.i, align 4, !noalias !5 + br label %omp.inner.for.cond14.i + +__omp_outlined__.exit: ; preds = %omp.inner.for.cond14.i + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %7) #2 + br label %.omp.deinit + +.omp.deinit: ; preds = %__omp_outlined__.exit + call void @__kmpc_generic_kernel_deinit(i16 0, i16 1) + br label %.exit + +.exit: ; preds = %.omp.deinit, %entry + ret void +} + +declare i16 @__kmpc_generic_kernel_init(i16, i16, i16, i16) + +declare void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) + +; Function Attrs: norecurse nounwind +define internal void @__omp_outlined__1(i32* noalias %.global_tid., i32* noalias %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., double* %c, i32* %a, float* %b) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %.previous.lb..addr = alloca i64, align 8 + %.previous.ub..addr = alloca i64, align 8 + %c.addr = alloca double*, align 8 + %a.addr = alloca i32*, align 8 + %b.addr = alloca float*, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i64 %.previous.lb., i64* %.previous.lb..addr, align 8 + store i64 %.previous.ub., i64* %.previous.ub..addr, align 8 + store double* %c, double** %c.addr, align 8 + store i32* %a, i32** %a.addr, align 8 + store float* %b, float** %b.addr, align 8 + store i32 0, i32* %.omp.lb, align 4 + store i32 1023, i32* %.omp.ub, align 4 + %0 = load i64, i64* %.previous.lb..addr, align 8 + %conv = trunc i64 %0 to i32 + %1 = load i64, i64* %.previous.ub..addr, align 8 + %conv1 = trunc i64 %1 to i32 + store i32 %conv, i32* %.omp.lb, align 4 + store i32 %conv1, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %2 = load i32*, i32** %.global_tid..addr, align 8 + %3 = load i32, i32* %2, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %3, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %4 = load i32, i32* %.omp.lb, align 4 + store i32 %4, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %entry + %5 = load i32, i32* %.omp.iv, align 4 + %conv2 = sext i32 %5 to i64 + %6 = load i64, i64* %.previous.ub..addr, align 8 + %cmp = icmp ule i64 %conv2, %6 + br i1 %cmp, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %7 = load i32, i32* %.omp.iv, align 4 + %mul = mul nsw i32 %7, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %i, align 4 + %8 = load i32*, i32** %a.addr, align 8 + %9 = load i32, i32* %i, align 4 + %idxprom = sext i32 %9 to i64 + %arrayidx = getelementptr inbounds i32, i32* %8, i64 %idxprom + %10 = load i32, i32* %arrayidx, align 4 + %conv4 = sitofp i32 %10 to float + %11 = load float*, float** %b.addr, align 8 + %12 = load i32, i32* %i, align 4 + %idxprom5 = sext i32 %12 to i64 + %arrayidx6 = getelementptr inbounds float, float* %11, i64 %idxprom5 + %13 = load float, float* %arrayidx6, align 4 + %mul7 = fmul float %conv4, %13 + %conv8 = fpext float %mul7 to double + %14 = load double*, double** %c.addr, align 8 + 
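+  ; a[i] and b[i] were loaded above; the remaining loads and the store below
+  ; complete c[i] += (double)((float)a[i] * b[i]) for this iteration.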
%15 = load i32, i32* %i, align 4 + %idxprom9 = sext i32 %15 to i64 + %arrayidx10 = getelementptr inbounds double, double* %14, i64 %idxprom9 + %16 = load double, double* %arrayidx10, align 8 + %add11 = fadd double %16, %conv8 + store double %add11, double* %arrayidx10, align 8 + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %17 = load i32, i32* %.omp.iv, align 4 + %18 = load i32, i32* %.omp.stride, align 4 + %add12 = add nsw i32 %17, %18 + store i32 %add12, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %3) + ret void +} + +declare void @__kmpc_for_static_fini(%struct.ident_t*, i32) + +; Function Attrs: norecurse nounwind +define internal void @__omp_outlined__1_wrapper(i8* %payload) #1 { +entry: + %.addr = alloca i8*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. = alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i8* %payload, i8** %.addr, align 8 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + store i32 %0, i32* %.threadid_temp., align 4 + %1 = load i8*, i8** %.addr, align 8 + %2 = bitcast i8* %1 to %omp.shared.struct* + %3 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %2, i32 0, i32 0 + %4 = load i64, i64* %3, align 1 + %5 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %2, i32 0, i32 1 + %6 = load i64, i64* %5, align 1 + %7 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %2, i32 0, i32 2 + %8 = load double*, double** %7, align 1 + %9 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %2, i32 0, i32 3 + %10 = load i32*, i32** %9, align 1 + %11 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %2, i32 0, i32 4 + %12 = load float*, float** %11, align 1 + call void @__omp_outlined__1(i32* %.threadid_temp., i32* %.zero.addr, i64 %4, i64 %6, double* %8, i32* %10, float* %12) #2 + ret void +} + +declare i32 @__kmpc_global_thread_num(%struct.ident_t*) + +declare void @__kmpc_generic_kernel_parallel(i8*, i8*, i16, i16) + +; Function Attrs: norecurse nounwind +define internal void @__omp_outlined__2(i32* noalias %.global_tid., i32* noalias %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., double* %c, i32* %a, float* %b) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %.previous.lb..addr = alloca i64, align 8 + %.previous.ub..addr = alloca i64, align 8 + %c.addr = alloca double*, align 8 + %a.addr = alloca i32*, align 8 + %b.addr = alloca float*, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i64 %.previous.lb., i64* %.previous.lb..addr, align 8 + store i64 %.previous.ub., i64* %.previous.ub..addr, align 8 + store double* %c, double** %c.addr, align 8 + store i32* %a, i32** %a.addr, align 8 + store float* %b, float** %b.addr, align 8 + store i32 0, i32* %.omp.lb, align 4 + store i32 1023, i32* %.omp.ub, align 4 + %0 = load i64, i64* %.previous.lb..addr, align 8 + %conv = trunc i64 %0 to 
i32 + %1 = load i64, i64* %.previous.ub..addr, align 8 + %conv1 = trunc i64 %1 to i32 + store i32 %conv, i32* %.omp.lb, align 4 + store i32 %conv1, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %2 = load i32*, i32** %.global_tid..addr, align 8 + %3 = load i32, i32* %2, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %3, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %4 = load i32, i32* %.omp.lb, align 4 + store i32 %4, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %entry + %5 = load i32, i32* %.omp.iv, align 4 + %conv2 = sext i32 %5 to i64 + %6 = load i64, i64* %.previous.ub..addr, align 8 + %cmp = icmp ule i64 %conv2, %6 + br i1 %cmp, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %7 = load i32, i32* %.omp.iv, align 4 + %mul = mul nsw i32 %7, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %i, align 4 + %8 = load i32*, i32** %a.addr, align 8 + %9 = load i32, i32* %i, align 4 + %idxprom = sext i32 %9 to i64 + %arrayidx = getelementptr inbounds i32, i32* %8, i64 %idxprom + %10 = load i32, i32* %arrayidx, align 4 + %conv4 = sitofp i32 %10 to float + %11 = load float*, float** %b.addr, align 8 + %12 = load i32, i32* %i, align 4 + %idxprom5 = sext i32 %12 to i64 + %arrayidx6 = getelementptr inbounds float, float* %11, i64 %idxprom5 + %13 = load float, float* %arrayidx6, align 4 + %mul7 = fmul float %conv4, %13 + %conv8 = fpext float %mul7 to double + %14 = load double*, double** %c.addr, align 8 + %15 = load i32, i32* %i, align 4 + %idxprom9 = sext i32 %15 to i64 + %arrayidx10 = getelementptr inbounds double, double* %14, i64 %idxprom9 + %16 = load double, double* %arrayidx10, align 8 + %add11 = fadd double %16, %conv8 + store double %add11, double* %arrayidx10, align 8 + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %17 = load i32, i32* %.omp.iv, align 4 + %18 = load i32, i32* %.omp.stride, align 4 + %add12 = add nsw i32 %17, %18 + store i32 %add12, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %3) + ret void +} + +; Function Attrs: norecurse nounwind +define internal void @__omp_outlined__2_wrapper(i8* %payload) #1 { +entry: + %.addr = alloca i8*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. 
= alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i8* %payload, i8** %.addr, align 8 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + store i32 %0, i32* %.threadid_temp., align 4 + %1 = load i8*, i8** %.addr, align 8 + %2 = bitcast i8* %1 to %omp.shared.struct.0* + %3 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %2, i32 0, i32 0 + %4 = load i64, i64* %3, align 1 + %5 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %2, i32 0, i32 1 + %6 = load i64, i64* %5, align 1 + %7 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %2, i32 0, i32 2 + %8 = load double*, double** %7, align 1 + %9 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %2, i32 0, i32 3 + %10 = load i32*, i32** %9, align 1 + %11 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %2, i32 0, i32 4 + %12 = load float*, float** %11, align 1 + call void @__omp_outlined__2(i32* %.threadid_temp., i32* %.zero.addr, i64 %4, i64 %6, double* %8, i32* %10, float* %12) #2 + ret void +} + +declare void @__kmpc_generic_kernel_deinit(i16, i16) + +attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind } + +!omp_offload.info = !{!0} +!nvvm.annotations = !{!1} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = !{i32 0, i32 24, i32 41957707, !"foo", i32 3, i32 0} +!1 = !{void (i32*, float*, double*)* @__omp_offloading_18_280394b_foo_l3, !"kernel", i32 1} +!2 = !{i32 1, !"wchar_size", i32 4} +!3 = !{i32 7, !"PIC Level", i32 2} +!4 = !{!"clang version 9.0.0 "} +!5 = !{!6, !8} +!6 = distinct !{!6, !7, !"__omp_outlined__: %.global_tid."} +!7 = distinct !{!7, !"__omp_outlined__"} +!8 = distinct !{!8, !7, !"__omp_outlined__: %.bound_tid."} + +; __CLANG_OFFLOAD_BUNDLE____END__ openmp-nvptx64-nvida-cud + Index: llvm/test/Transforms/OpenMP/target_offload_no_SPMD_custom_sm.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/OpenMP/target_offload_no_SPMD_custom_sm.ll @@ -0,0 +1,399 @@ +; ModuleID = '/tmp/target_offload_new.ll' +source_filename = "../llvm/test/Transforms/OpenMP/target_offload_to_SPMD.c" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvida-cud" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } +%omp.shared.struct = type { i64, i64, double*, i32*, float* } +%omp.shared.struct.0 = type { i64, i64, double*, i32*, float* } + +@"_openmp_kernel_static_glob_rd$ptr" = internal addrspace(3) global i8* null +@.str = 
private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2050, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@2 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@__omp_offloading_18_280394b_foo_l3_exec_mode = weak constant i8 1 +@llvm.compiler.used = appending global [1 x i8*] [i8* @__omp_offloading_18_280394b_foo_l3_exec_mode], section "llvm.metadata" + +; Function Attrs: norecurse nounwind +define weak void @__omp_offloading_18_280394b_foo_l3(i32* %a, float* %b, double* %c) #0 { +entry: + %work_fn.addr = alloca i8* + %.omp.comb.lb.i = alloca i32, align 4 + %.omp.comb.ub.i = alloca i32, align 4 + %.omp.stride.i = alloca i32, align 4 + %.omp.is_last.i = alloca i32, align 4 + %.captured.i = alloca %omp.shared.struct, align 8 + %.omp.comb.lb4.i = alloca i32, align 4 + %.omp.comb.ub5.i = alloca i32, align 4 + %.omp.stride6.i = alloca i32, align 4 + %.omp.is_last7.i = alloca i32, align 4 + %.captured18.i = alloca %omp.shared.struct.0, align 8 + %thread_kind = call i16 @__kmpc_generic_kernel_init(i16 0, i16 0, i16 1, i16 0) + %is_worker = icmp eq i16 %thread_kind, -1 + br i1 %is_worker, label %worker.wait, label %master_check + +worker.wait: ; preds = %worker.inactive, %entry + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) + %is_active = call i1 @__kmpc_kernel_parallel(i8** %work_fn.addr, i16 1) + %Work_fn.addr_cast = bitcast i8** %work_fn.addr to void (i8*)** + %work_fn = load void (i8*)*, void (i8*)** %Work_fn.addr_cast + %no_work = icmp eq void (i8*)* %work_fn, null + br i1 %no_work, label %worker.finished, label %worker.active_check + +worker.finished: ; preds = %worker.wait + br label %master_check + +worker.active_check: ; preds = %worker.wait + br i1 %is_active, label %worker.active, label %worker.inactive + +worker.active: ; preds = %worker.active_check + %0 = call i8* @__kmpc_get_shared_variables() + %par_fn_check = icmp eq void (i8*)* %work_fn, @__omp_outlined__2_wrapper + br i1 %par_fn_check, label %worker.execute.__omp_outlined__2_wrapper, label %worker.check.next + +worker.execute.__omp_outlined__2_wrapper: ; preds = %worker.active + call void @__omp_outlined__2_wrapper(i8* %0) + br label %worker.parallel_end + +worker.check.next: ; preds = %worker.active + %par_fn_check1 = icmp eq void (i8*)* %work_fn, @__omp_outlined__1_wrapper + br i1 %par_fn_check1, label %worker.execute.__omp_outlined__1_wrapper, label %worker.check.next2 + +worker.execute.__omp_outlined__1_wrapper: ; preds = %worker.check.next + call void @__omp_outlined__1_wrapper(i8* %0) + br label %worker.parallel_end + +worker.check.next2: ; preds = %worker.check.next + br label %worker.parallel_end + +worker.parallel_end: ; preds = %worker.execute.__omp_outlined__1_wrapper, %worker.execute.__omp_outlined__2_wrapper, %worker.check.next2 + call void @__kmpc_kernel_end_parallel() + br label %worker.inactive + +worker.inactive: ; preds = %worker.active_check, %worker.parallel_end + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) + br label %worker.wait + +master_check: ; preds = %worker.finished, %entry + %1 = icmp eq i16 %thread_kind, 0 + br i1 %1, label %.execute, label 
%.exit + +.execute: ; preds = %master_check + %2 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + store i32 0, i32* %.omp.comb.lb.i, align 4, !noalias !5 + store i32 1023, i32* %.omp.comb.ub.i, align 4, !noalias !5 + store i32 1, i32* %.omp.stride.i, align 4, !noalias !5 + store i32 0, i32* %.omp.is_last.i, align 4, !noalias !5 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %2, i32 92, i32* %.omp.is_last.i, i32* %.omp.comb.lb.i, i32* %.omp.comb.ub.i, i32* %.omp.stride.i, i32 1, i32 1) #2 + %3 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !5 + %cmp.i = icmp sgt i32 %3, 1023 + br i1 %cmp.i, label %cond.true.i, label %cond.false.i + +cond.true.i: ; preds = %.execute + br label %cond.end.i + +cond.false.i: ; preds = %.execute + %4 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !5 + br label %cond.end.i + +cond.end.i: ; preds = %cond.false.i, %cond.true.i + %cond.i = phi i32 [ 1023, %cond.true.i ], [ %4, %cond.false.i ] + store i32 %cond.i, i32* %.omp.comb.ub.i, align 4, !noalias !5 + %5 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !5 + br label %omp.inner.for.cond.i + +omp.inner.for.cond.i: ; preds = %omp.inner.for.body.i, %cond.end.i + %.omp.iv.i.0 = phi i32 [ %5, %cond.end.i ], [ %add.i, %omp.inner.for.body.i ] + %6 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !5 + %cmp1.i = icmp sle i32 %.omp.iv.i.0, %6 + br i1 %cmp1.i, label %omp.inner.for.body.i, label %omp.inner.for.end.i + +omp.inner.for.body.i: ; preds = %omp.inner.for.cond.i + %7 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !5 + %8 = zext i32 %7 to i64 + %9 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !5 + %10 = zext i32 %9 to i64 + %11 = bitcast %omp.shared.struct* %.captured.i to i8* + %12 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i32 0, i32 0 + store i64 %8, i64* %12, !noalias !5 + %13 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i32 0, i32 1 + store i64 %10, i64* %13, !noalias !5 + %14 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i32 0, i32 2 + store double* %c, double** %14, !noalias !5 + %15 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i32 0, i32 3 + store i32* %a, i32** %15, !noalias !5 + %16 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i32 0, i32 4 + store float* %b, float** %16, !noalias !5 + call void @__kmpc_generic_kernel_parallel(i8* bitcast (void (i8*)* @__omp_outlined__1_wrapper to i8*), i8* %11, i16 40, i16 1) #2 + %17 = load i32, i32* %.omp.stride.i, align 4, !noalias !5 + %add.i = add nsw i32 %.omp.iv.i.0, %17 + br label %omp.inner.for.cond.i + +omp.inner.for.end.i: ; preds = %omp.inner.for.cond.i + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %2) #2 + store i32 0, i32* %.omp.comb.lb4.i, align 4, !noalias !5 + store i32 1023, i32* %.omp.comb.ub5.i, align 4, !noalias !5 + store i32 1, i32* %.omp.stride6.i, align 4, !noalias !5 + store i32 0, i32* %.omp.is_last7.i, align 4, !noalias !5 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %2, i32 92, i32* %.omp.is_last7.i, i32* %.omp.comb.lb4.i, i32* %.omp.comb.ub5.i, i32* %.omp.stride6.i, i32 1, i32 1) #2 + %18 = load i32, i32* %.omp.comb.ub5.i, align 4, !noalias !5 + %cmp9.i = icmp sgt i32 %18, 1023 + br i1 %cmp9.i, label %cond.true10.i, label %cond.false11.i + +cond.true10.i: ; preds = %omp.inner.for.end.i + br label %cond.end12.i + +cond.false11.i: ; preds = %omp.inner.for.end.i + %19 = load i32, i32* 
%.omp.comb.ub5.i, align 4, !noalias !5 + br label %cond.end12.i + +cond.end12.i: ; preds = %cond.false11.i, %cond.true10.i + %cond13.i = phi i32 [ 1023, %cond.true10.i ], [ %19, %cond.false11.i ] + store i32 %cond13.i, i32* %.omp.comb.ub5.i, align 4, !noalias !5 + %20 = load i32, i32* %.omp.comb.lb4.i, align 4, !noalias !5 + br label %omp.inner.for.cond14.i + +omp.inner.for.cond14.i: ; preds = %omp.inner.for.body16.i, %cond.end12.i + %.omp.iv2.i.0 = phi i32 [ %20, %cond.end12.i ], [ %add20.i, %omp.inner.for.body16.i ] + %21 = load i32, i32* %.omp.comb.ub5.i, align 4, !noalias !5 + %cmp15.i = icmp sle i32 %.omp.iv2.i.0, %21 + br i1 %cmp15.i, label %omp.inner.for.body16.i, label %__omp_outlined__.exit + +omp.inner.for.body16.i: ; preds = %omp.inner.for.cond14.i + %22 = load i32, i32* %.omp.comb.lb4.i, align 4, !noalias !5 + %23 = zext i32 %22 to i64 + %24 = load i32, i32* %.omp.comb.ub5.i, align 4, !noalias !5 + %25 = zext i32 %24 to i64 + %26 = bitcast %omp.shared.struct.0* %.captured18.i to i8* + %27 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %.captured18.i, i32 0, i32 0 + store i64 %23, i64* %27, !noalias !5 + %28 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %.captured18.i, i32 0, i32 1 + store i64 %25, i64* %28, !noalias !5 + %29 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %.captured18.i, i32 0, i32 2 + store double* %c, double** %29, !noalias !5 + %30 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %.captured18.i, i32 0, i32 3 + store i32* %a, i32** %30, !noalias !5 + %31 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %.captured18.i, i32 0, i32 4 + store float* %b, float** %31, !noalias !5 + call void @__kmpc_generic_kernel_parallel(i8* bitcast (void (i8*)* @__omp_outlined__2_wrapper to i8*), i8* %26, i16 40, i16 1) #2 + %32 = load i32, i32* %.omp.stride6.i, align 4, !noalias !5 + %add20.i = add nsw i32 %.omp.iv2.i.0, %32 + br label %omp.inner.for.cond14.i + +__omp_outlined__.exit: ; preds = %omp.inner.for.cond14.i + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %2) #2 + br label %.omp.deinit + +.omp.deinit: ; preds = %__omp_outlined__.exit + call void @__kmpc_generic_kernel_deinit(i16 0, i16 1) + br label %.exit + +.exit: ; preds = %.omp.deinit, %master_check + ret void +} + +declare i16 @__kmpc_generic_kernel_init(i16, i16, i16, i16) + +declare void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) + +declare void @__kmpc_for_static_fini(%struct.ident_t*, i32) + +; Function Attrs: norecurse nounwind +define internal void @__omp_outlined__1_wrapper(i8* %payload) #0 { +entry: + %.omp.lb.i = alloca i32, align 4 + %.omp.ub.i = alloca i32, align 4 + %.omp.stride.i = alloca i32, align 4 + %.omp.is_last.i = alloca i32, align 4 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + %1 = bitcast i8* %payload to %omp.shared.struct* + %2 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %1, i32 0, i32 0 + %3 = load i64, i64* %2, align 1 + %4 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %1, i32 0, i32 1 + %5 = load i64, i64* %4, align 1 + %6 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %1, i32 0, i32 2 + %7 = load double*, double** %6, align 1 + %8 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %1, i32 0, i32 3 + %9 = load i32*, i32** %8, align 1 + %10 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %1, i32 0, i32 4 + %11 = load 
float*, float** %10, align 1 + %12 = bitcast i32* %.omp.lb.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %12) + %13 = bitcast i32* %.omp.ub.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %13) + %14 = bitcast i32* %.omp.stride.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %14) + %15 = bitcast i32* %.omp.is_last.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %15) + store i32 0, i32* %.omp.lb.i, align 4, !noalias !9 + store i32 1023, i32* %.omp.ub.i, align 4, !noalias !9 + %conv.i = trunc i64 %3 to i32 + %conv1.i = trunc i64 %5 to i32 + store i32 %conv.i, i32* %.omp.lb.i, align 4, !noalias !9 + store i32 %conv1.i, i32* %.omp.ub.i, align 4, !noalias !9 + store i32 1, i32* %.omp.stride.i, align 4, !noalias !9 + store i32 0, i32* %.omp.is_last.i, align 4, !noalias !9 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %0, i32 33, i32* %.omp.is_last.i, i32* %.omp.lb.i, i32* %.omp.ub.i, i32* %.omp.stride.i, i32 1, i32 1) #2, !noalias !9 + %16 = load i32, i32* %.omp.lb.i, align 4, !noalias !9 + br label %omp.inner.for.cond.i + +omp.inner.for.cond.i: ; preds = %omp.inner.for.body.i, %entry + %.omp.iv.0.i = phi i32 [ %16, %entry ], [ %add12.i, %omp.inner.for.body.i ] + %conv2.i = sext i32 %.omp.iv.0.i to i64 + %cmp.i = icmp ule i64 %conv2.i, %5 + br i1 %cmp.i, label %omp.inner.for.body.i, label %__omp_outlined__1.exit + +omp.inner.for.body.i: ; preds = %omp.inner.for.cond.i + %idxprom.i = sext i32 %.omp.iv.0.i to i64 + %arrayidx.i = getelementptr inbounds i32, i32* %9, i64 %idxprom.i + %17 = load i32, i32* %arrayidx.i, align 4, !noalias !9 + %conv4.i = sitofp i32 %17 to float + %idxprom5.i = sext i32 %.omp.iv.0.i to i64 + %arrayidx6.i = getelementptr inbounds float, float* %11, i64 %idxprom5.i + %18 = load float, float* %arrayidx6.i, align 4, !noalias !9 + %mul7.i = fmul float %conv4.i, %18 + %conv8.i = fpext float %mul7.i to double + %idxprom9.i = sext i32 %.omp.iv.0.i to i64 + %arrayidx10.i = getelementptr inbounds double, double* %7, i64 %idxprom9.i + %19 = load double, double* %arrayidx10.i, align 8, !noalias !9 + %add11.i = fadd double %19, %conv8.i + store double %add11.i, double* %arrayidx10.i, align 8, !noalias !9 + %20 = load i32, i32* %.omp.stride.i, align 4, !noalias !9 + %add12.i = add nsw i32 %.omp.iv.0.i, %20 + br label %omp.inner.for.cond.i + +__omp_outlined__1.exit: ; preds = %omp.inner.for.cond.i + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %0) #2, !noalias !9 + %21 = bitcast i32* %.omp.lb.i to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %21) + %22 = bitcast i32* %.omp.ub.i to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %22) + %23 = bitcast i32* %.omp.stride.i to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %23) + %24 = bitcast i32* %.omp.is_last.i to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %24) + ret void +} + +declare i32 @__kmpc_global_thread_num(%struct.ident_t*) + +declare void @__kmpc_generic_kernel_parallel(i8*, i8*, i16, i16) + +; Function Attrs: norecurse nounwind +define internal void @__omp_outlined__2_wrapper(i8* %payload) #0 { +entry: + %.omp.lb.i = alloca i32, align 4 + %.omp.ub.i = alloca i32, align 4 + %.omp.stride.i = alloca i32, align 4 + %.omp.is_last.i = alloca i32, align 4 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + %1 = bitcast i8* %payload to %omp.shared.struct.0* + %2 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %1, i32 0, i32 0 + %3 = load i64, i64* %2, align 1 + %4 = getelementptr inbounds %omp.shared.struct.0, 
%omp.shared.struct.0* %1, i32 0, i32 1 + %5 = load i64, i64* %4, align 1 + %6 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %1, i32 0, i32 2 + %7 = load double*, double** %6, align 1 + %8 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %1, i32 0, i32 3 + %9 = load i32*, i32** %8, align 1 + %10 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %1, i32 0, i32 4 + %11 = load float*, float** %10, align 1 + %12 = bitcast i32* %.omp.lb.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %12) + %13 = bitcast i32* %.omp.ub.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %13) + %14 = bitcast i32* %.omp.stride.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %14) + %15 = bitcast i32* %.omp.is_last.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %15) + store i32 0, i32* %.omp.lb.i, align 4, !noalias !12 + store i32 1023, i32* %.omp.ub.i, align 4, !noalias !12 + %conv.i = trunc i64 %3 to i32 + %conv1.i = trunc i64 %5 to i32 + store i32 %conv.i, i32* %.omp.lb.i, align 4, !noalias !12 + store i32 %conv1.i, i32* %.omp.ub.i, align 4, !noalias !12 + store i32 1, i32* %.omp.stride.i, align 4, !noalias !12 + store i32 0, i32* %.omp.is_last.i, align 4, !noalias !12 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %0, i32 33, i32* %.omp.is_last.i, i32* %.omp.lb.i, i32* %.omp.ub.i, i32* %.omp.stride.i, i32 1, i32 1) #2, !noalias !12 + %16 = load i32, i32* %.omp.lb.i, align 4, !noalias !12 + br label %omp.inner.for.cond.i + +omp.inner.for.cond.i: ; preds = %omp.inner.for.body.i, %entry + %.omp.iv.0.i = phi i32 [ %16, %entry ], [ %add12.i, %omp.inner.for.body.i ] + %conv2.i = sext i32 %.omp.iv.0.i to i64 + %cmp.i = icmp ule i64 %conv2.i, %5 + br i1 %cmp.i, label %omp.inner.for.body.i, label %__omp_outlined__2.exit + +omp.inner.for.body.i: ; preds = %omp.inner.for.cond.i + %idxprom.i = sext i32 %.omp.iv.0.i to i64 + %arrayidx.i = getelementptr inbounds i32, i32* %9, i64 %idxprom.i + %17 = load i32, i32* %arrayidx.i, align 4, !noalias !12 + %conv4.i = sitofp i32 %17 to float + %idxprom5.i = sext i32 %.omp.iv.0.i to i64 + %arrayidx6.i = getelementptr inbounds float, float* %11, i64 %idxprom5.i + %18 = load float, float* %arrayidx6.i, align 4, !noalias !12 + %mul7.i = fmul float %conv4.i, %18 + %conv8.i = fpext float %mul7.i to double + %idxprom9.i = sext i32 %.omp.iv.0.i to i64 + %arrayidx10.i = getelementptr inbounds double, double* %7, i64 %idxprom9.i + %19 = load double, double* %arrayidx10.i, align 8, !noalias !12 + %add11.i = fadd double %19, %conv8.i + store double %add11.i, double* %arrayidx10.i, align 8, !noalias !12 + %20 = load i32, i32* %.omp.stride.i, align 4, !noalias !12 + %add12.i = add nsw i32 %.omp.iv.0.i, %20 + br label %omp.inner.for.cond.i + +__omp_outlined__2.exit: ; preds = %omp.inner.for.cond.i + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %0) #2, !noalias !12 + %21 = bitcast i32* %.omp.lb.i to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %21) + %22 = bitcast i32* %.omp.ub.i to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %22) + %23 = bitcast i32* %.omp.stride.i to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %23) + %24 = bitcast i32* %.omp.is_last.i to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %24) + ret void +} + +declare void @__kmpc_generic_kernel_deinit(i16, i16) + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1 + +; Function Attrs: argmemonly nounwind +declare void 
@llvm.lifetime.end.p0i8(i64, i8* nocapture) #1 + +declare void @__kmpc_barrier_simple_spmd(%struct.ident_t*, i32) + +declare i1 @__kmpc_kernel_parallel(i8**, i16) + +declare i8* @__kmpc_get_shared_variables() + +declare void @__kmpc_kernel_end_parallel() + +attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind } +attributes #2 = { nounwind } + +!omp_offload.info = !{!0} +!nvvm.annotations = !{!1} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = !{i32 0, i32 24, i32 41957707, !"foo", i32 3, i32 0} +!1 = !{void (i32*, float*, double*)* @__omp_offloading_18_280394b_foo_l3, !"kernel", i32 1} +!2 = !{i32 1, !"wchar_size", i32 4} +!3 = !{i32 7, !"PIC Level", i32 2} +!4 = !{!"clang version 9.0.0 "} +!5 = !{!6, !8} +!6 = distinct !{!6, !7, !"__omp_outlined__: %.global_tid."} +!7 = distinct !{!7, !"__omp_outlined__"} +!8 = distinct !{!8, !7, !"__omp_outlined__: %.bound_tid."} +!9 = !{!10} +!10 = distinct !{!10, !11, !"__omp_outlined__1: %.global_tid."} +!11 = distinct !{!11, !"__omp_outlined__1"} +!12 = !{!13} +!13 = distinct !{!13, !14, !"__omp_outlined__2: %.global_tid."} +!14 = distinct !{!14, !"__omp_outlined__2"} Index: llvm/test/Transforms/OpenMP/target_offload_old.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/OpenMP/target_offload_old.ll @@ -0,0 +1,600 @@ + +; __CLANG_OFFLOAD_BUNDLE____START__ openmp-nvptx64-nvida-cud +; ModuleID = '../llvm/test/Transforms/OpenMP/target_offload_to_SPMD.c' +source_filename = "../llvm/test/Transforms/OpenMP/target_offload_to_SPMD.c" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvida-cud" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } + +@"_openmp_kernel_static_glob_rd$ptr" = internal addrspace(3) global i8* null +@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2050, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@2 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@__omp_offloading_18_280394b_foo_l3_exec_mode = weak constant i8 1 +@llvm.compiler.used = appending global [1 x i8*] [i8* @__omp_offloading_18_280394b_foo_l3_exec_mode], section "llvm.metadata" + +; Function Attrs: norecurse nounwind +define internal void @__omp_offloading_18_280394b_foo_l3_worker() #0 { +entry: + %work_fn = alloca i8*, align 8 + %exec_status = alloca i8, align 1 + store i8* null, i8** %work_fn, align 8 + store i8 0, i8* %exec_status, align 1 + br label %.await.work + +.await.work: ; preds = %.barrier.parallel, %entry + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) + %0 = call i1 @__kmpc_kernel_parallel(i8** %work_fn, i16 
1) + %1 = zext i1 %0 to i8 + store i8 %1, i8* %exec_status, align 1 + %2 = load i8*, i8** %work_fn, align 8 + %should_terminate = icmp eq i8* %2, null + br i1 %should_terminate, label %.exit, label %.select.workers + +.select.workers: ; preds = %.await.work + %3 = load i8, i8* %exec_status, align 1 + %is_active = icmp ne i8 %3, 0 + br i1 %is_active, label %.execute.parallel, label %.barrier.parallel + +.execute.parallel: ; preds = %.select.workers + %4 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + %5 = load i8*, i8** %work_fn, align 8 + %work_match = icmp eq i8* %5, bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) + br i1 %work_match, label %.execute.fn, label %.check.next + +.execute.fn: ; preds = %.execute.parallel + call void @__omp_outlined__1_wrapper(i16 0, i32 %4) #4 + br label %.terminate.parallel + +.check.next: ; preds = %.execute.parallel + %6 = load i8*, i8** %work_fn, align 8 + %work_match1 = icmp eq i8* %6, bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*) + br i1 %work_match1, label %.execute.fn2, label %.check.next3 + +.execute.fn2: ; preds = %.check.next + call void @__omp_outlined__2_wrapper(i16 0, i32 %4) #4 + br label %.terminate.parallel + +.check.next3: ; preds = %.check.next + %7 = bitcast i8* %2 to void (i16, i32)* + call void %7(i16 0, i32 %4) + br label %.terminate.parallel + +.terminate.parallel: ; preds = %.check.next3, %.execute.fn2, %.execute.fn + call void @__kmpc_kernel_end_parallel() + br label %.barrier.parallel + +.barrier.parallel: ; preds = %.terminate.parallel, %.select.workers + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) + br label %.await.work + +.exit: ; preds = %.await.work + ret void +} + +; Function Attrs: norecurse nounwind +define weak void @__omp_offloading_18_280394b_foo_l3(i32* %a, float* %b, double* %c) #1 { +entry: + %.global_tid..addr.i = alloca i32*, align 8 + %.bound_tid..addr.i = alloca i32*, align 8 + %a.addr.i = alloca i32*, align 8 + %b.addr.i = alloca float*, align 8 + %c.addr.i = alloca double*, align 8 + %.omp.iv.i = alloca i32, align 4 + %tmp.i = alloca i32, align 4 + %.omp.comb.lb.i = alloca i32, align 4 + %.omp.comb.ub.i = alloca i32, align 4 + %.omp.stride.i = alloca i32, align 4 + %.omp.is_last.i = alloca i32, align 4 + %i.i = alloca i32, align 4 + %.zero.addr.i = alloca i32, align 4 + %shared_arg_refs.i = alloca i8**, align 8 + %.omp.iv2.i = alloca i32, align 4 + %tmp3.i = alloca i32, align 4 + %.omp.comb.lb4.i = alloca i32, align 4 + %.omp.comb.ub5.i = alloca i32, align 4 + %.omp.stride6.i = alloca i32, align 4 + %.omp.is_last7.i = alloca i32, align 4 + %i8.i = alloca i32, align 4 + %.zero.addr17.i = alloca i32, align 4 + %shared_arg_refs18.i = alloca i8**, align 8 + %a.addr = alloca i32*, align 8 + %b.addr = alloca float*, align 8 + %c.addr = alloca double*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. 
= alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i32* %a, i32** %a.addr, align 8 + store float* %b, float** %b.addr, align 8 + store double* %c, double** %c.addr, align 8 + %nvptx_warp_size = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + %nvptx_num_threads = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + %thread_limit = sub nuw i32 %nvptx_num_threads, %nvptx_warp_size + %nvptx_tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %0 = icmp ult i32 %nvptx_tid, %thread_limit + br i1 %0, label %.worker, label %.mastercheck + +.worker: ; preds = %entry + call void @__omp_offloading_18_280394b_foo_l3_worker() #4 + br label %.exit + +.mastercheck: ; preds = %entry + %nvptx_num_threads1 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + %nvptx_warp_size2 = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + %1 = sub nuw i32 %nvptx_warp_size2, 1 + %2 = xor i32 %1, -1 + %3 = sub nuw i32 %nvptx_num_threads1, 1 + %master_tid = and i32 %3, %2 + %nvptx_tid3 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %4 = icmp eq i32 %nvptx_tid3, %master_tid + br i1 %4, label %.master, label %.exit + +.master: ; preds = %.mastercheck + %nvptx_warp_size4 = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + %nvptx_num_threads5 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + %thread_limit6 = sub nuw i32 %nvptx_num_threads5, %nvptx_warp_size4 + call void @__kmpc_kernel_init(i32 %thread_limit6, i16 1) + call void @__kmpc_data_sharing_init_stack() + %5 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + %6 = load i32*, i32** %a.addr, align 8 + %7 = load float*, float** %b.addr, align 8 + %8 = load double*, double** %c.addr, align 8 + store i32 %5, i32* %.threadid_temp., align 4 + store i32 0, i32* %.zero.addr17.i, align 4, !noalias !5 + store i32 0, i32* %.zero.addr.i, align 4, !noalias !5 + store i32* %.threadid_temp., i32** %.global_tid..addr.i, align 8, !noalias !5 + store i32* %.zero.addr, i32** %.bound_tid..addr.i, align 8, !noalias !5 + store i32* %6, i32** %a.addr.i, align 8, !noalias !5 + store float* %7, float** %b.addr.i, align 8, !noalias !5 + store double* %8, double** %c.addr.i, align 8, !noalias !5 + store i32 0, i32* %.omp.comb.lb.i, align 4, !noalias !5 + store i32 1023, i32* %.omp.comb.ub.i, align 4, !noalias !5 + store i32 1, i32* %.omp.stride.i, align 4, !noalias !5 + store i32 0, i32* %.omp.is_last.i, align 4, !noalias !5 + %9 = load i32*, i32** %.global_tid..addr.i, align 8, !noalias !5 + %10 = load i32, i32* %9, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %10, i32 92, i32* %.omp.is_last.i, i32* %.omp.comb.lb.i, i32* %.omp.comb.ub.i, i32* %.omp.stride.i, i32 1, i32 1) #4 + %11 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !5 + %cmp.i = icmp sgt i32 %11, 1023 + br i1 %cmp.i, label %cond.true.i, label %cond.false.i + +cond.true.i: ; preds = %.master + br label %cond.end.i + +cond.false.i: ; preds = %.master + %12 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !5 + br label %cond.end.i + +cond.end.i: ; preds = %cond.false.i, %cond.true.i + %cond.i = phi i32 [ 1023, %cond.true.i ], [ %12, %cond.false.i ] + store i32 %cond.i, i32* %.omp.comb.ub.i, align 4, !noalias !5 + %13 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !5 + store i32 %13, i32* %.omp.iv.i, align 4, !noalias !5 + br label %omp.inner.for.cond.i + +omp.inner.for.cond.i: ; preds = %omp.inner.for.body.i, %cond.end.i + %14 = load i32, i32* %.omp.iv.i, align 4, !noalias !5 + %15 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !5 + %cmp1.i = icmp sle i32 %14, %15 + br i1 %cmp1.i, label 
%omp.inner.for.body.i, label %omp.inner.for.end.i + +omp.inner.for.body.i: ; preds = %omp.inner.for.cond.i + %16 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !5 + %17 = zext i32 %16 to i64 + %18 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !5 + %19 = zext i32 %18 to i64 + %20 = load double*, double** %c.addr.i, align 8, !noalias !5 + %21 = load i32*, i32** %a.addr.i, align 8, !noalias !5 + %22 = load float*, float** %b.addr.i, align 8, !noalias !5 + call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i16 1) #4 + call void @__kmpc_begin_sharing_variables(i8*** %shared_arg_refs.i, i64 5) #4 + %23 = load i8**, i8*** %shared_arg_refs.i, align 8, !noalias !5 + %24 = inttoptr i64 %17 to i8* + store i8* %24, i8** %23, align 8 + %25 = getelementptr inbounds i8*, i8** %23, i64 1 + %26 = inttoptr i64 %19 to i8* + store i8* %26, i8** %25, align 8 + %27 = getelementptr inbounds i8*, i8** %23, i64 2 + %28 = bitcast double* %20 to i8* + store i8* %28, i8** %27, align 8 + %29 = getelementptr inbounds i8*, i8** %23, i64 3 + %30 = bitcast i32* %21 to i8* + store i8* %30, i8** %29, align 8 + %31 = getelementptr inbounds i8*, i8** %23, i64 4 + %32 = bitcast float* %22 to i8* + store i8* %32, i8** %31, align 8 + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) #4 + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) #4 + call void @__kmpc_end_sharing_variables() #4 + %33 = load i32, i32* %.omp.iv.i, align 4, !noalias !5 + %34 = load i32, i32* %.omp.stride.i, align 4, !noalias !5 + %add.i = add nsw i32 %33, %34 + store i32 %add.i, i32* %.omp.iv.i, align 4, !noalias !5 + br label %omp.inner.for.cond.i + +omp.inner.for.end.i: ; preds = %omp.inner.for.cond.i + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %10) #4 + store i32 0, i32* %.omp.comb.lb4.i, align 4, !noalias !5 + store i32 1023, i32* %.omp.comb.ub5.i, align 4, !noalias !5 + store i32 1, i32* %.omp.stride6.i, align 4, !noalias !5 + store i32 0, i32* %.omp.is_last7.i, align 4, !noalias !5 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %10, i32 92, i32* %.omp.is_last7.i, i32* %.omp.comb.lb4.i, i32* %.omp.comb.ub5.i, i32* %.omp.stride6.i, i32 1, i32 1) #4 + %35 = load i32, i32* %.omp.comb.ub5.i, align 4, !noalias !5 + %cmp9.i = icmp sgt i32 %35, 1023 + br i1 %cmp9.i, label %cond.true10.i, label %cond.false11.i + +cond.true10.i: ; preds = %omp.inner.for.end.i + br label %cond.end12.i + +cond.false11.i: ; preds = %omp.inner.for.end.i + %36 = load i32, i32* %.omp.comb.ub5.i, align 4, !noalias !5 + br label %cond.end12.i + +cond.end12.i: ; preds = %cond.false11.i, %cond.true10.i + %cond13.i = phi i32 [ 1023, %cond.true10.i ], [ %36, %cond.false11.i ] + store i32 %cond13.i, i32* %.omp.comb.ub5.i, align 4, !noalias !5 + %37 = load i32, i32* %.omp.comb.lb4.i, align 4, !noalias !5 + store i32 %37, i32* %.omp.iv2.i, align 4, !noalias !5 + br label %omp.inner.for.cond14.i + +omp.inner.for.cond14.i: ; preds = %omp.inner.for.body16.i, %cond.end12.i + %38 = load i32, i32* %.omp.iv2.i, align 4, !noalias !5 + %39 = load i32, i32* %.omp.comb.ub5.i, align 4, !noalias !5 + %cmp15.i = icmp sle i32 %38, %39 + br i1 %cmp15.i, label %omp.inner.for.body16.i, label %__omp_outlined__.exit + +omp.inner.for.body16.i: ; preds = %omp.inner.for.cond14.i + %40 = load i32, i32* %.omp.comb.lb4.i, align 4, !noalias !5 + %41 = zext i32 %40 to i64 + %42 = load i32, i32* %.omp.comb.ub5.i, align 4, !noalias !5 + %43 = zext i32 %42 to i64 + %44 = load double*, 
double** %c.addr.i, align 8, !noalias !5 + %45 = load i32*, i32** %a.addr.i, align 8, !noalias !5 + %46 = load float*, float** %b.addr.i, align 8, !noalias !5 + call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i16 1) #4 + call void @__kmpc_begin_sharing_variables(i8*** %shared_arg_refs18.i, i64 5) #4 + %47 = load i8**, i8*** %shared_arg_refs18.i, align 8, !noalias !5 + %48 = inttoptr i64 %41 to i8* + store i8* %48, i8** %47, align 8 + %49 = getelementptr inbounds i8*, i8** %47, i64 1 + %50 = inttoptr i64 %43 to i8* + store i8* %50, i8** %49, align 8 + %51 = getelementptr inbounds i8*, i8** %47, i64 2 + %52 = bitcast double* %44 to i8* + store i8* %52, i8** %51, align 8 + %53 = getelementptr inbounds i8*, i8** %47, i64 3 + %54 = bitcast i32* %45 to i8* + store i8* %54, i8** %53, align 8 + %55 = getelementptr inbounds i8*, i8** %47, i64 4 + %56 = bitcast float* %46 to i8* + store i8* %56, i8** %55, align 8 + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) #4 + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) #4 + call void @__kmpc_end_sharing_variables() #4 + %57 = load i32, i32* %.omp.iv2.i, align 4, !noalias !5 + %58 = load i32, i32* %.omp.stride6.i, align 4, !noalias !5 + %add20.i = add nsw i32 %57, %58 + store i32 %add20.i, i32* %.omp.iv2.i, align 4, !noalias !5 + br label %omp.inner.for.cond14.i + +__omp_outlined__.exit: ; preds = %omp.inner.for.cond14.i + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %10) #4 + br label %.termination.notifier + +.termination.notifier: ; preds = %__omp_outlined__.exit + call void @__kmpc_kernel_deinit(i16 1) + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) + br label %.exit + +.exit: ; preds = %.termination.notifier, %.mastercheck, %.worker + ret void +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.warpsize() #2 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #2 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2 + +declare void @__kmpc_kernel_init(i32, i16) + +declare void @__kmpc_data_sharing_init_stack() + +declare void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) + +; Function Attrs: norecurse nounwind +define internal void @__omp_outlined__1(i32* noalias %.global_tid., i32* noalias %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., double* %c, i32* %a, float* %b) #1 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %.previous.lb..addr = alloca i64, align 8 + %.previous.ub..addr = alloca i64, align 8 + %c.addr = alloca double*, align 8 + %a.addr = alloca i32*, align 8 + %b.addr = alloca float*, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i64 %.previous.lb., i64* %.previous.lb..addr, align 8 + store i64 %.previous.ub., i64* %.previous.ub..addr, align 8 + store double* %c, double** %c.addr, align 8 + store i32* %a, i32** %a.addr, align 8 + store float* %b, float** %b.addr, align 8 + store i32 0, i32* %.omp.lb, align 4 + store i32 1023, i32* %.omp.ub, align 4 + %0 = load i64, i64* %.previous.lb..addr, align 8 
+ %conv = trunc i64 %0 to i32 + %1 = load i64, i64* %.previous.ub..addr, align 8 + %conv1 = trunc i64 %1 to i32 + store i32 %conv, i32* %.omp.lb, align 4 + store i32 %conv1, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %2 = load i32*, i32** %.global_tid..addr, align 8 + %3 = load i32, i32* %2, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %3, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %4 = load i32, i32* %.omp.lb, align 4 + store i32 %4, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %entry + %5 = load i32, i32* %.omp.iv, align 4 + %conv2 = sext i32 %5 to i64 + %6 = load i64, i64* %.previous.ub..addr, align 8 + %cmp = icmp ule i64 %conv2, %6 + br i1 %cmp, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %7 = load i32, i32* %.omp.iv, align 4 + %mul = mul nsw i32 %7, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %i, align 4 + %8 = load i32*, i32** %a.addr, align 8 + %9 = load i32, i32* %i, align 4 + %idxprom = sext i32 %9 to i64 + %arrayidx = getelementptr inbounds i32, i32* %8, i64 %idxprom + %10 = load i32, i32* %arrayidx, align 4 + %conv4 = sitofp i32 %10 to float + %11 = load float*, float** %b.addr, align 8 + %12 = load i32, i32* %i, align 4 + %idxprom5 = sext i32 %12 to i64 + %arrayidx6 = getelementptr inbounds float, float* %11, i64 %idxprom5 + %13 = load float, float* %arrayidx6, align 4 + %mul7 = fmul float %conv4, %13 + %conv8 = fpext float %mul7 to double + %14 = load double*, double** %c.addr, align 8 + %15 = load i32, i32* %i, align 4 + %idxprom9 = sext i32 %15 to i64 + %arrayidx10 = getelementptr inbounds double, double* %14, i64 %idxprom9 + %16 = load double, double* %arrayidx10, align 8 + %add11 = fadd double %16, %conv8 + store double %add11, double* %arrayidx10, align 8 + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %17 = load i32, i32* %.omp.iv, align 4 + %18 = load i32, i32* %.omp.stride, align 4 + %add12 = add nsw i32 %17, %18 + store i32 %add12, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %3) + ret void +} + +declare void @__kmpc_for_static_fini(%struct.ident_t*, i32) + +; Function Attrs: norecurse nounwind +define internal void @__omp_outlined__1_wrapper(i16 zeroext, i32) #0 { +entry: + %.addr = alloca i16, align 2 + %.addr1 = alloca i32, align 4 + %.zero.addr = alloca i32, align 4 + %global_args = alloca i8**, align 8 + store i32 0, i32* %.zero.addr, align 4 + store i16 %0, i16* %.addr, align 2 + store i32 %1, i32* %.addr1, align 4 + call void @__kmpc_get_shared_variables(i8*** %global_args) + %2 = load i8**, i8*** %global_args, align 8 + %3 = getelementptr inbounds i8*, i8** %2, i64 0 + %4 = bitcast i8** %3 to i64* + %5 = load i64, i64* %4, align 8 + %6 = getelementptr inbounds i8*, i8** %2, i64 1 + %7 = bitcast i8** %6 to i64* + %8 = load i64, i64* %7, align 8 + %9 = getelementptr inbounds i8*, i8** %2, i64 2 + %10 = bitcast i8** %9 to double** + %11 = load double*, double** %10, align 8 + %12 = getelementptr inbounds i8*, i8** %2, i64 3 + %13 = bitcast i8** %12 to i32** + %14 = load i32*, i32** 
%13, align 8 + %15 = getelementptr inbounds i8*, i8** %2, i64 4 + %16 = bitcast i8** %15 to float** + %17 = load float*, float** %16, align 8 + call void @__omp_outlined__1(i32* %.addr1, i32* %.zero.addr, i64 %5, i64 %8, double* %11, i32* %14, float* %17) #4 + ret void +} + +declare void @__kmpc_get_shared_variables(i8***) + +declare void @__kmpc_kernel_prepare_parallel(i8*, i16) + +declare void @__kmpc_begin_sharing_variables(i8***, i64) + +; Function Attrs: convergent +declare void @__kmpc_barrier_simple_spmd(%struct.ident_t*, i32) #3 + +declare void @__kmpc_end_sharing_variables() + +; Function Attrs: norecurse nounwind +define internal void @__omp_outlined__2(i32* noalias %.global_tid., i32* noalias %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., double* %c, i32* %a, float* %b) #1 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %.previous.lb..addr = alloca i64, align 8 + %.previous.ub..addr = alloca i64, align 8 + %c.addr = alloca double*, align 8 + %a.addr = alloca i32*, align 8 + %b.addr = alloca float*, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i64 %.previous.lb., i64* %.previous.lb..addr, align 8 + store i64 %.previous.ub., i64* %.previous.ub..addr, align 8 + store double* %c, double** %c.addr, align 8 + store i32* %a, i32** %a.addr, align 8 + store float* %b, float** %b.addr, align 8 + store i32 0, i32* %.omp.lb, align 4 + store i32 1023, i32* %.omp.ub, align 4 + %0 = load i64, i64* %.previous.lb..addr, align 8 + %conv = trunc i64 %0 to i32 + %1 = load i64, i64* %.previous.ub..addr, align 8 + %conv1 = trunc i64 %1 to i32 + store i32 %conv, i32* %.omp.lb, align 4 + store i32 %conv1, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %2 = load i32*, i32** %.global_tid..addr, align 8 + %3 = load i32, i32* %2, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %3, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %4 = load i32, i32* %.omp.lb, align 4 + store i32 %4, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %entry + %5 = load i32, i32* %.omp.iv, align 4 + %conv2 = sext i32 %5 to i64 + %6 = load i64, i64* %.previous.ub..addr, align 8 + %cmp = icmp ule i64 %conv2, %6 + br i1 %cmp, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %7 = load i32, i32* %.omp.iv, align 4 + %mul = mul nsw i32 %7, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %i, align 4 + %8 = load i32*, i32** %a.addr, align 8 + %9 = load i32, i32* %i, align 4 + %idxprom = sext i32 %9 to i64 + %arrayidx = getelementptr inbounds i32, i32* %8, i64 %idxprom + %10 = load i32, i32* %arrayidx, align 4 + %conv4 = sitofp i32 %10 to float + %11 = load float*, float** %b.addr, align 8 + %12 = load i32, i32* %i, align 4 + %idxprom5 = sext i32 %12 to i64 + %arrayidx6 = getelementptr inbounds float, float* %11, i64 %idxprom5 + %13 = load float, float* %arrayidx6, align 4 + %mul7 = fmul float %conv4, %13 + %conv8 = fpext float %mul7 to double + %14 = load double*, double** %c.addr, align 8 + %15 = load i32, i32* %i, 
align 4 + %idxprom9 = sext i32 %15 to i64 + %arrayidx10 = getelementptr inbounds double, double* %14, i64 %idxprom9 + %16 = load double, double* %arrayidx10, align 8 + %add11 = fadd double %16, %conv8 + store double %add11, double* %arrayidx10, align 8 + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %17 = load i32, i32* %.omp.iv, align 4 + %18 = load i32, i32* %.omp.stride, align 4 + %add12 = add nsw i32 %17, %18 + store i32 %add12, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %3) + ret void +} + +; Function Attrs: norecurse nounwind +define internal void @__omp_outlined__2_wrapper(i16 zeroext, i32) #0 { +entry: + %.addr = alloca i16, align 2 + %.addr1 = alloca i32, align 4 + %.zero.addr = alloca i32, align 4 + %global_args = alloca i8**, align 8 + store i32 0, i32* %.zero.addr, align 4 + store i16 %0, i16* %.addr, align 2 + store i32 %1, i32* %.addr1, align 4 + call void @__kmpc_get_shared_variables(i8*** %global_args) + %2 = load i8**, i8*** %global_args, align 8 + %3 = getelementptr inbounds i8*, i8** %2, i64 0 + %4 = bitcast i8** %3 to i64* + %5 = load i64, i64* %4, align 8 + %6 = getelementptr inbounds i8*, i8** %2, i64 1 + %7 = bitcast i8** %6 to i64* + %8 = load i64, i64* %7, align 8 + %9 = getelementptr inbounds i8*, i8** %2, i64 2 + %10 = bitcast i8** %9 to double** + %11 = load double*, double** %10, align 8 + %12 = getelementptr inbounds i8*, i8** %2, i64 3 + %13 = bitcast i8** %12 to i32** + %14 = load i32*, i32** %13, align 8 + %15 = getelementptr inbounds i8*, i8** %2, i64 4 + %16 = bitcast i8** %15 to float** + %17 = load float*, float** %16, align 8 + call void @__omp_outlined__2(i32* %.addr1, i32* %.zero.addr, i64 %5, i64 %8, double* %11, i32* %14, float* %17) #4 + ret void +} + +declare i32 @__kmpc_global_thread_num(%struct.ident_t*) + +declare void @__kmpc_kernel_deinit(i16) + +declare i1 @__kmpc_kernel_parallel(i8**, i16) + +declare void @__kmpc_kernel_end_parallel() + +attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind readnone } +attributes #3 = { convergent } +attributes #4 = { nounwind } + +!omp_offload.info = !{!0} +!nvvm.annotations = !{!1} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = !{i32 0, i32 24, i32 41957707, !"foo", i32 3, i32 0} +!1 = !{void 
(i32*, float*, double*)* @__omp_offloading_18_280394b_foo_l3, !"kernel", i32 1} +!2 = !{i32 1, !"wchar_size", i32 4} +!3 = !{i32 7, !"PIC Level", i32 2} +!4 = !{!"clang version 9.0.0 (http://llvm.org/git/clang.git c6f1d4e0e14fbd11f4cc61068c429a067faf86ef) (http://llvm.org/git/llvm.git 0f783294e2ea6fe630e7655f303b4bc33bfd6167)"} +!5 = !{!6, !8} +!6 = distinct !{!6, !7, !"__omp_outlined__: %.global_tid."} +!7 = distinct !{!7, !"__omp_outlined__"} +!8 = distinct !{!8, !7, !"__omp_outlined__: %.bound_tid."} + +; __CLANG_OFFLOAD_BUNDLE____END__ openmp-nvptx64-nvida-cuda
Index: openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
===================================================================
--- openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
+++ openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
@@ -346,7 +346,7 @@
   // statically allocated shared memory slots. The size of a shared memory
   // slot is pre-determined to be 256 bytes.
   data_sharing_init_stack_common();
-  omptarget_nvptx_globalArgs.Init();
+  omptarget_nvptx_globalArgBuffer.Init();
 }
 
 // Initialize data sharing data structure. This function needs to be called
@@ -506,14 +506,11 @@
   }
 }
 
-// Begin a data sharing context. Maintain a list of references to shared
-// variables. This list of references to shared variables will be passed
-// to one or more threads.
-// In L0 data sharing this is called by master thread.
-// In L1 data sharing this is called by active warp master thread.
-EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs) {
-  omptarget_nvptx_globalArgs.EnsureSize(nArgs);
-  *GlobalArgs = omptarget_nvptx_globalArgs.GetArgs();
+/// Ensure the data sharing context has at least \p NumBytes available and
+/// return a pointer to the beginning of the shared memory.
+EXTERN char *__kmpc_begin_sharing_variables(size_t NumBytes) {
+  omptarget_nvptx_globalArgBuffer.EnsureSize(NumBytes);
+  return omptarget_nvptx_globalArgBuffer.begin();
 }
 
 // End a data sharing context. There is no need to have a list of refs
@@ -523,7 +520,7 @@
 // In L0 data sharing this is called by master thread.
 // In L1 data sharing this is called by active warp master thread.
 EXTERN void __kmpc_end_sharing_variables() {
-  omptarget_nvptx_globalArgs.DeInit();
+  omptarget_nvptx_globalArgBuffer.DeInit();
 }
 
 // This function will return a list of references to global variables. This
@@ -531,8 +528,8 @@
 // members of this list will be passed to the outlined parallel function
 // preserving the order.
 // Called by all workers.
-EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs) {
-  *GlobalArgs = omptarget_nvptx_globalArgs.GetArgs();
+EXTERN char *__kmpc_get_shared_variables() {
+  return omptarget_nvptx_globalArgBuffer.begin();
 }
 
 // This function is used to init static memory manager. This manager is used to
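The interface change above replaces the old void** argument list with a raw byte buffer. A minimal sketch of how the two sides pair up under the new scheme (not part of the patch; the Payload struct and the master_share/worker_use helpers are illustrative assumptions):

  #include <stddef.h>
  #include <stdint.h>
  #include <string.h>

  extern "C" char *__kmpc_begin_sharing_variables(size_t NumBytes);
  extern "C" char *__kmpc_get_shared_variables();

  // Hypothetical payload of one parallel region: loop bounds plus a captured
  // pointer, packed as plain bytes instead of a list of void* references.
  struct Payload {
    int64_t LB, UB;
    double *C;
  };

  // Master: reserve enough bytes in the shared buffer and copy the payload in.
  static void master_share(int64_t LB, int64_t UB, double *C) {
    char *Buf = __kmpc_begin_sharing_variables(sizeof(Payload));
    Payload P = {LB, UB, C};
    memcpy(Buf, &P, sizeof(P));
  }

  // Worker: the same bytes come back through a single pointer; no per-argument
  // stores and loads as with the old GetArgs() list.
  static void worker_use() {
    Payload *P = (Payload *)__kmpc_get_shared_variables();
    (void)P; // ... execute the outlined body with P->LB, P->UB, P->C ...
  }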
Index: openmp/libomptarget/deviceRTLs/nvptx/src/interface.h
===================================================================
--- openmp/libomptarget/deviceRTLs/nvptx/src/interface.h
+++ openmp/libomptarget/deviceRTLs/nvptx/src/interface.h
@@ -513,16 +513,15 @@
                                          int32_t *LaneId, int32_t *NumLanes);
 EXTERN void __kmpc_kernel_end_convergent_simd(void *buffer);
-
 EXTERN void __kmpc_data_sharing_init_stack();
 EXTERN void __kmpc_data_sharing_init_stack_spmd();
 EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t size,
                                                       int16_t UseSharedMemory);
 EXTERN void *__kmpc_data_sharing_push_stack(size_t size,
                                             int16_t UseSharedMemory);
 EXTERN void __kmpc_data_sharing_pop_stack(void *a);
-EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs);
+EXTERN char *__kmpc_begin_sharing_variables(size_t NumBytes);
 EXTERN void __kmpc_end_sharing_variables();
-EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs);
+EXTERN char *__kmpc_get_shared_variables();
 
 // The slot used for data sharing by the master and worker threads. We use a
 // complete (default size) version and an incomplete one so that we allow sizes
@@ -560,4 +559,20 @@
 EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode,
                                               int16_t is_shared);
 
+/// Generic kernel initialization which defers to __kmpc_spmd_kernel_init if \p
+/// IsSPMD is true, and to __kmpc_kernel_init otherwise.
+EXTERN int16_t __kmpc_generic_kernel_init(int16_t IsSPMD, int16_t UseSM,
+                                          int16_t RequiresOMPRuntime,
+                                          int16_t RequiresDataSharing);
+/// Generic kernel de-initialization counterpart to __kmpc_generic_kernel_init.
+EXTERN void __kmpc_generic_kernel_deinit(int16_t IsSPMD,
+                                         int16_t RequiredOMPRuntime);
+/// Run \p OutlinedFn in parallel, sharing \p PayloadBytes bytes of \p Payload.
+///
+/// NOTE: Changing this type will require changes in the Clang NVPTX code
+/// generation as well as the LLVM OpenMPOpt pass!
+EXTERN void __kmpc_generic_kernel_parallel(void *OutlinedFn, void *Payload,
+                                           int16_t PayloadBytes,
+                                           int16_t RequiredOMPRuntime);
+
 #endif
Index: openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
===================================================================
--- openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
+++ openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
@@ -62,4 +62,5 @@
 ////////////////////////////////////////////////////////////////////////////////
 // Data sharing related variables.
 ////////////////////////////////////////////////////////////////////////////////
-__device__ __shared__ omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs;
+__device__ __shared__ omptarget_nvptx_SharedBuffer
+    omptarget_nvptx_globalArgBuffer;
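The new generic kernel entry points declared above compose as follows; this is a sketch of the kernel skeleton a compiler could emit on top of them (the shape and the kernel_entry name are assumptions, not code emitted by this patch):

  #include <stdint.h>

  extern "C" int16_t __kmpc_generic_kernel_init(int16_t IsSPMD, int16_t UseSM,
                                                int16_t RequiresOMPRuntime,
                                                int16_t RequiresDataSharing);
  extern "C" void __kmpc_generic_kernel_deinit(int16_t IsSPMD,
                                               int16_t RequiredOMPRuntime);

  __global__ void kernel_entry(/* captured arguments */) {
    // Workers do not get past the init call while work remains: with UseSM
    // they are trapped in the runtime state machine and return -1 only at
    // termination. The master returns 1; surplus threads return 0.
    int16_t Filter = __kmpc_generic_kernel_init(/*IsSPMD=*/0, /*UseSM=*/1,
                                                /*RequiresOMPRuntime=*/1,
                                                /*RequiresDataSharing=*/0);
    if (Filter != 1)
      return;

    // Master-only team code runs here; parallel regions are funneled through
    // __kmpc_generic_kernel_parallel (see omptarget-nvptx.cu below).

    __kmpc_generic_kernel_deinit(/*IsSPMD=*/0, /*RequiredOMPRuntime=*/1);
  }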
Index: openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
===================================================================
--- openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
+++ openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -64,45 +64,45 @@
 #define __SYNCTHREADS() __SYNCTHREADS_N(0)
 
-// arguments needed for L0 parallelism only.
-class omptarget_nvptx_SharedArgs {
+// Shared byte buffer needed for L0 parallelism only.
+class omptarget_nvptx_SharedBuffer {
 public:
   // All these methods must be called by the master thread only.
   INLINE void Init() {
-    args = buffer;
-    nArgs = MAX_SHARED_ARGS;
+    UsedBuffer = &FixedBuffer[0];
+    NumBytes = MAX_SHARED_BYTES;
   }
   INLINE void DeInit() {
-    // Free any memory allocated for outlined parallel function with a large
-    // number of arguments.
-    if (nArgs > MAX_SHARED_ARGS) {
-      SafeFree(args, (char *)"new extended args");
+    // Free any memory allocated for an outlined parallel function with a
+    // large payload.
+    if (NumBytes > MAX_SHARED_BYTES) {
+      SafeFree(UsedBuffer, (char *)"deinit extended shared buffer");
       Init();
     }
   }
-  INLINE void EnsureSize(size_t size) {
-    if (size > nArgs) {
-      if (nArgs > MAX_SHARED_ARGS) {
-        SafeFree(args, (char *)"new extended args");
+  INLINE void EnsureSize(size_t RequestedBytes) {
+    if (RequestedBytes > NumBytes) {
+      if (NumBytes > MAX_SHARED_BYTES) {
+        SafeFree(UsedBuffer, (char *)"new extended shared buffer");
       }
-      args = (void **) SafeMalloc(size * sizeof(void *),
-                                  (char *)"new extended args");
-      nArgs = size;
+      UsedBuffer = (char *)SafeMalloc(RequestedBytes,
+                                      (char *)"new extended shared buffer");
+      NumBytes = RequestedBytes;
     }
   }
   // Called by all threads.
-  INLINE void **GetArgs() const { return args; };
+  INLINE char *begin() const { return UsedBuffer; };
 private:
-  // buffer of pre-allocated arguments.
-  void *buffer[MAX_SHARED_ARGS];
-  // pointer to arguments buffer.
-  // starts off as a pointer to 'buffer' but can be dynamically allocated.
-  void **args;
-  // starts off as MAX_SHARED_ARGS but can increase in size.
-  uint32_t nArgs;
+  // Pre-allocated byte buffer.
+  char FixedBuffer[MAX_SHARED_BYTES];
+  // Pointer to the buffer in use; starts off pointing at 'FixedBuffer' but
+  // can be dynamically allocated for larger payloads.
+  char *UsedBuffer;
+  // Capacity in bytes; starts off as MAX_SHARED_BYTES but can increase.
+  uint32_t NumBytes;
 };
 
-extern __device__ __shared__ omptarget_nvptx_SharedArgs
-    omptarget_nvptx_globalArgs;
+extern __device__ __shared__ omptarget_nvptx_SharedBuffer
+    omptarget_nvptx_globalArgBuffer;
 
 // Data sharing related quantities, need to match what is used in the compiler.
 enum DATA_SHARING_SIZES {
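The buffer above serves payloads of up to MAX_SHARED_BYTES from the static FixedBuffer and falls back to SafeMalloc for anything larger; DeInit frees only if the buffer actually grew. A sketch of one full sharing cycle, assuming the declarations from omptarget-nvptx.h are in scope (the share_one_region helper is hypothetical):

  #include <string.h>

  static void share_one_region(const void *Payload, size_t PayloadBytes) {
    // Master: grow the buffer if needed (a no-op for small payloads) and
    // fill it with the payload bytes.
    omptarget_nvptx_globalArgBuffer.EnsureSize(PayloadBytes);
    memcpy(omptarget_nvptx_globalArgBuffer.begin(), Payload, PayloadBytes);

    // ... workers read the bytes through the same begin() pointer ...

    // Master: release the dynamic allocation, if any, and re-arm FixedBuffer.
    omptarget_nvptx_globalArgBuffer.DeInit();
  }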
Index: openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
===================================================================
--- openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
+++ openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
@@ -182,3 +182,128 @@
   PRINT0(LD_IO | LD_PAR, "call to __kmpc_is_spmd_exec_mode\n");
   return isSPMDMode();
 }
+
+typedef void (*WorkFnTy)(void *);
+
+INLINE static void
+__kmpc_generic_kernel_state_machine(int16_t IsOMPRuntimeInitialized) {
+
+  do {
+    void *WorkFn = 0;
+
+    __kmpc_barrier_simple_spmd(NULL, 0);
+
+    bool IsActive = __kmpc_kernel_parallel(&WorkFn, IsOMPRuntimeInitialized);
+
+    // If there is nothing more to do, break out of the state machine by
+    // returning to the caller.
+    if (!WorkFn)
+      return;
+
+    if (IsActive) {
+      char *Args = omptarget_nvptx_globalArgBuffer.begin();
+
+      ((WorkFnTy)WorkFn)((void *)Args);
+
+      __kmpc_kernel_end_parallel();
+    }
+
+    __kmpc_barrier_simple_spmd(NULL, 0);
+
+  } while (true);
+}
+
+/// Filter out worker threads: if \p UseSM is true they enter the state
+/// machine in __kmpc_generic_kernel_state_machine and stay trapped until the
+/// kernel terminates; the master and surplus threads return immediately. The
+/// return value is 1 for the master, -1 for workers, and 0 otherwise.
+INLINE static int16_t
+__kmpc_generic_kernel_thread_filter(unsigned ThreadLimit, int16_t UseSM,
+                                    int16_t IsOMPRuntimeInitialized) {
+
+  unsigned TId = GetThreadIdInBlock();
+  bool IsWorker = TId < ThreadLimit;
+
+  if (IsWorker) {
+    if (UseSM)
+      __kmpc_generic_kernel_state_machine(IsOMPRuntimeInitialized);
+    return -1;
+  }
+
+  return TId == GetMasterThreadID();
+}
+
+EXTERN int16_t __kmpc_generic_kernel_init(int16_t IsSPMD, int16_t UseSM,
+                                          int16_t RequiresOMPRuntime,
+                                          int16_t RequiresDataSharing) {
+  unsigned NumThreads = GetNumberOfThreadsInBlock();
+
+  // Handle the SPMD case first.
+  if (IsSPMD) {
+
+    __kmpc_spmd_kernel_init(NumThreads, RequiresOMPRuntime,
+                            RequiresDataSharing);
+
+    // TODO: This was copied from the clang code but it seems odd that we use
+    // RequiresOMPRuntime and not RequiresDataSharing. The latter seems to be
+    // always false anyway.
+    //
+    // For data sharing, we need to initialize the stack.
+    if (RequiresOMPRuntime)
+      __kmpc_data_sharing_init_stack_spmd();
+
+    return 1;
+  }
+
+  unsigned ThreadLimit = NumThreads - WARPSIZE;
+  int16_t FilterVal = __kmpc_generic_kernel_thread_filter(
+      ThreadLimit, UseSM, RequiresOMPRuntime);
+
+  if (FilterVal == 1) {
+    __kmpc_kernel_init(ThreadLimit, RequiresOMPRuntime);
+    __kmpc_data_sharing_init_stack();
+  }
+
+  return FilterVal;
+}
+
+EXTERN void __kmpc_generic_kernel_deinit(int16_t IsSPMD,
+                                         int16_t RequiredOMPRuntime) {
+  if (IsSPMD) {
+    __kmpc_spmd_kernel_deinit_v2(RequiredOMPRuntime);
+  } else {
+    // TODO port vars epilog to the runtime.
+
+    __kmpc_kernel_deinit(RequiredOMPRuntime);
+
+    // Barrier to terminate worker threads.
+    __kmpc_barrier_simple_spmd(NULL, 0);
+  }
+}
+
+EXTERN void __kmpc_generic_kernel_parallel(void *OutlinedFn, void *Payload,
+                                           int16_t PayloadBytes,
+                                           int16_t RequiredOMPRuntime) {
+  __kmpc_kernel_prepare_parallel(OutlinedFn, RequiredOMPRuntime);
+
+  if (PayloadBytes) {
+    omptarget_nvptx_globalArgBuffer.EnsureSize(PayloadBytes);
+    char *Args = omptarget_nvptx_globalArgBuffer.begin();
+    memcpy(Args, Payload, PayloadBytes);
+  }
+
+  // Activate workers. This barrier is used by the master to signal
+  // work for the workers.
+  __kmpc_barrier_simple_spmd(NULL, 0);
+
+  // OpenMP [2.5, Parallel Construct, p.49]
+  // There is an implied barrier at the end of a parallel region. After the
+  // end of a parallel region, only the master thread of the team resumes
+  // execution of the enclosing task region.
+  //
+  // The master waits at this barrier until all workers are done.
+  __kmpc_barrier_simple_spmd(NULL, 0);
+
+  if (PayloadBytes)
+    __kmpc_end_sharing_variables();
+}
Index: openmp/libomptarget/deviceRTLs/nvptx/src/option.h
===================================================================
--- openmp/libomptarget/deviceRTLs/nvptx/src/option.h
+++ openmp/libomptarget/deviceRTLs/nvptx/src/option.h
@@ -27,9 +27,9 @@
 // region to synchronize with each other.
 #define L1_BARRIER (1)
 
-// Maximum number of preallocated arguments to an outlined parallel/simd function.
-// Anything more requires dynamic memory allocation.
-#define MAX_SHARED_ARGS 20
+// Maximum number of preallocated bytes sharable with an outlined parallel/simd
+// function; anything more requires dynamic memory allocation.
+#define MAX_SHARED_BYTES (20 * sizeof(void *))
 
 // Maximum number of omp state objects per SM allocated statically in global
 // memory.
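Putting the runtime pieces together, a hypothetical master-side launch of one parallel region; the wrapper name and the payload layout are illustrative assumptions and not generated by this patch:

  #include <stdint.h>

  extern "C" void __kmpc_generic_kernel_parallel(void *OutlinedFn,
                                                 void *Payload,
                                                 int16_t PayloadBytes,
                                                 int16_t RequiredOMPRuntime);

  // Assumed worker-side wrapper matching WorkFnTy: it unpacks the payload.
  extern "C" void outlined_wrapper(void *Payload);

  struct ChunkPayload {
    int64_t LB, UB;
    double *C;
    int *A;
    float *B;
  };

  static void master_launch(int64_t LB, int64_t UB, double *C, int *A,
                            float *B) {
    ChunkPayload P = {LB, UB, C, A, B};
    // Announces outlined_wrapper to the workers, copies P into the shared
    // buffer, releases the workers at the first barrier, and waits for them
    // at the second one before ending the sharing context.
    __kmpc_generic_kernel_parallel((void *)&outlined_wrapper, &P,
                                   (int16_t)sizeof(P),
                                   /*RequiredOMPRuntime=*/1);
  }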