Index: SPMD_examples/v0.1/target_offload_is_SPMD.c
===================================================================
--- /dev/null
+++ SPMD_examples/v0.1/target_offload_is_SPMD.c
@@ -0,0 +1,42 @@
+#include <omp.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define N 10
+#define TEAMS 3
+
+void foo(int* dis, int *team) {
+
+  #pragma omp target teams num_teams(TEAMS) map(tofrom:dis[:N],team[:N])
+  {
+    #pragma omp distribute parallel for
+    for (int i = 0; i < N; i++)
+      #pragma omp atomic
+      dis[i] += 1; // <- Increment dis[0:N] from i to i+1
+
+    #pragma omp distribute parallel for
+    for (int i = 0; i < N; i++)
+      #pragma omp atomic
+      dis[i] += 1; // <- Increment dis[0:N] from i+1 to i+2
+
+
+    team[omp_get_team_num()] += 1;
+  }
+}
+
+int main() {
+  int dis[N], team[N];
+
+  for (int i = 0; i < N; i++) {
+    dis[i] = i;
+    team[i] = 0;
+  }
+
+  foo(dis, team);
+
+  for (int i = 0; i < N; i++) {
+    printf("dis[%3i] = %4i\t\tteam[%3i] = %4i\n", i, dis[i], i, team[i]);
+  }
+
+  return 0;
+}
Index: SPMD_examples/v0.1/target_offload_not_SPMD.c
===================================================================
--- /dev/null
+++ SPMD_examples/v0.1/target_offload_not_SPMD.c
@@ -0,0 +1,42 @@
+#include <omp.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define N 10
+#define TEAMS 3
+
+void foo(int* dis, int *team) {
+
+  #pragma omp target teams num_teams(TEAMS) map(tofrom:dis[:N],team[:N])
+  {
+    #pragma omp distribute parallel for
+    for (int i = 0; i < N; i++)
+      #pragma omp atomic
+      dis[i] += 1; // <- Increment dis[0:N] from i to i+1
+
+    #pragma omp parallel // <- Not valid in SPMD mode without a guard
+    for (int i = 0; i < N; i++)
+      #pragma omp atomic
+      dis[i] += 1; // <- Increment dis[0:N] from i+1
+                   //    to i+1+128 * TEAMS(-1/*masters*/) + 32
+
+    team[omp_get_team_num()] += 1;
+  }
+}
+
+int main() {
+  int dis[N], team[N];
+
+  for (int i = 0; i < N; i++) {
+    dis[i] = i;
+    team[i] = 0;
+  }
+
+  foo(dis, team);
+
+  for (int i = 0; i < N; i++) {
+    printf("dis[%3i] = %4i\t\tteam[%3i] = %4i\n", i, dis[i], i, team[i]);
+  }
+
+  return 0;
+}
Index: SPMD_examples/v0.2/target_offload_is_SPMD.new.host.ll
===================================================================
--- /dev/null
+++ SPMD_examples/v0.2/target_offload_is_SPMD.new.host.ll
@@ -0,0 +1,530 @@
+
+; __CLANG_OFFLOAD_BUNDLE____START__ host-x86_64-unknown-linux-gnu
+; ModuleID = '/tmp/jdoerfert/target_offload_is_SPMD-7bb1c0.bc'
+source_filename = "/home/jdoerfert/projects/llvm-project/llvm/test/Transforms/OpenMP/target_offload_is_SPMD.c"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.ident_t = type { i32, i32, i32, i32, i8* }
+%struct.__tgt_offload_entry = type { i8*, i8*, i64, i32, i32 }
+%struct.__tgt_device_image = type { i8*, i8*, %struct.__tgt_offload_entry*, %struct.__tgt_offload_entry* }
+%struct.__tgt_bin_desc = type { i32, %struct.__tgt_device_image*, %struct.__tgt_offload_entry*, %struct.__tgt_offload_entry* }
+
+$.omp_offloading.descriptor_reg.nvptx64-nvida-cuda = comdat any
+
+@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
+@0 = private unnamed_addr global %struct.ident_t { i32 0, i32 2050, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8
+@1 = private unnamed_addr global %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8
+@2 = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8
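+; Note: the globals below are the offload registration records: a region ID, the
+; per-argument map sizes/types, and the entry/descriptor tables that
+; __tgt_register_lib hands to the host OpenMP runtime so it can locate and launch
+; the device image for this target region.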
+@.__omp_offloading_2b_142c531_foo_l10.region_id = weak constant i8 0 +@.offload_sizes = private unnamed_addr constant [2 x i64] [i64 40, i64 40] +@.offload_maptypes = private unnamed_addr constant [2 x i64] [i64 35, i64 35] +@.str.3 = private unnamed_addr constant [33 x i8] c"dis[%3i] = %4i\09\09team[%3i] = %4i\0A\00", align 1 +@.omp_offloading.entry_name = internal unnamed_addr constant [36 x i8] c"__omp_offloading_2b_142c531_foo_l10\00" +@.omp_offloading.entry.__omp_offloading_2b_142c531_foo_l10 = weak constant %struct.__tgt_offload_entry { i8* @.__omp_offloading_2b_142c531_foo_l10.region_id, i8* getelementptr inbounds ([36 x i8], [36 x i8]* @.omp_offloading.entry_name, i32 0, i32 0), i64 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +@.omp_offloading.entries_begin = external constant %struct.__tgt_offload_entry +@.omp_offloading.entries_end = external constant %struct.__tgt_offload_entry +@.omp_offloading.img_start.nvptx64-nvida-cuda = extern_weak constant i8 +@.omp_offloading.img_end.nvptx64-nvida-cuda = extern_weak constant i8 +@.omp_offloading.device_images = internal unnamed_addr constant [1 x %struct.__tgt_device_image] [%struct.__tgt_device_image { i8* @.omp_offloading.img_start.nvptx64-nvida-cuda, i8* @.omp_offloading.img_end.nvptx64-nvida-cuda, %struct.__tgt_offload_entry* @.omp_offloading.entries_begin, %struct.__tgt_offload_entry* @.omp_offloading.entries_end }], comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cuda), align 8 +@.omp_offloading.descriptor = internal constant %struct.__tgt_bin_desc { i32 1, %struct.__tgt_device_image* getelementptr inbounds ([1 x %struct.__tgt_device_image], [1 x %struct.__tgt_device_image]* @.omp_offloading.device_images, i32 0, i32 0), %struct.__tgt_offload_entry* @.omp_offloading.entries_begin, %struct.__tgt_offload_entry* @.omp_offloading.entries_end }, comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cuda), align 8 +@__dso_handle = external hidden global i8 +@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 0, void ()* @.omp_offloading.descriptor_reg.nvptx64-nvida-cuda, i8* bitcast (void ()* @.omp_offloading.descriptor_reg.nvptx64-nvida-cuda to i8*) }] + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local void @foo(i32* %dis, i32* %team) #0 { +entry: + %dis.addr = alloca i32*, align 8 + %team.addr = alloca i32*, align 8 + %.offload_baseptrs = alloca [2 x i8*], align 8 + %.offload_ptrs = alloca [2 x i8*], align 8 + store i32* %dis, i32** %dis.addr, align 8 + store i32* %team, i32** %team.addr, align 8 + %0 = load i32*, i32** %dis.addr, align 8 + %1 = load i32*, i32** %team.addr, align 8 + %2 = load i32*, i32** %dis.addr, align 8 + %3 = load i32*, i32** %dis.addr, align 8 + %arrayidx = getelementptr inbounds i32, i32* %3, i64 0 + %4 = load i32*, i32** %team.addr, align 8 + %5 = load i32*, i32** %team.addr, align 8 + %arrayidx1 = getelementptr inbounds i32, i32* %5, i64 0 + %6 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i32 0, i32 0 + %7 = bitcast i8** %6 to i32** + store i32* %2, i32** %7, align 8 + %8 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i32 0, i32 0 + %9 = bitcast i8** %8 to i32** + store i32* %arrayidx, i32** %9, align 8 + %10 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i32 0, i32 1 + %11 = bitcast i8** %10 to i32** + store i32* %4, i32** %11, align 8 + %12 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i32 0, i32 1 + %13 = bitcast i8** %12 to i32** + store i32* 
%arrayidx1, i32** %13, align 8 + %14 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i32 0, i32 0 + %15 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i32 0, i32 0 + %16 = call i32 @__tgt_target_teams(i64 -1, i8* @.__omp_offloading_2b_142c531_foo_l10.region_id, i32 2, i8** %14, i8** %15, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes, i32 0, i32 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes, i32 0, i32 0), i32 3, i32 0) + %17 = icmp ne i32 %16, 0 + br i1 %17, label %omp_offload.failed, label %omp_offload.cont + +omp_offload.failed: ; preds = %entry + call void @__omp_offloading_2b_142c531_foo_l10(i32* %0, i32* %1) #4 + br label %omp_offload.cont + +omp_offload.cont: ; preds = %omp_offload.failed, %entry + ret void +} + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define internal void @__omp_offloading_2b_142c531_foo_l10(i32* %dis, i32* %team) #1 { +entry: + %dis.addr = alloca i32*, align 8 + %team.addr = alloca i32*, align 8 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + store i32* %dis, i32** %dis.addr, align 8 + store i32* %team, i32** %team.addr, align 8 + %1 = call i32 @__kmpc_push_num_teams(%struct.ident_t* @2, i32 %0, i32 3, i32 0) + %2 = load i32*, i32** %dis.addr, align 8 + %3 = load i32*, i32** %team.addr, align 8 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%struct.ident_t* @2, i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*)* @.omp_outlined. to void (i32*, i32*, ...)*), i32* %2, i32* %3) + ret void +} + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define internal void @.omp_outlined.(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* %dis, i32* %team) #1 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %dis.addr = alloca i32*, align 8 + %team.addr = alloca i32*, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.comb.lb = alloca i32, align 4 + %.omp.comb.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + %.omp.iv2 = alloca i32, align 4 + %tmp3 = alloca i32, align 4 + %.omp.comb.lb4 = alloca i32, align 4 + %.omp.comb.ub5 = alloca i32, align 4 + %.omp.stride6 = alloca i32, align 4 + %.omp.is_last7 = alloca i32, align 4 + %i8 = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i32* %dis, i32** %dis.addr, align 8 + store i32* %team, i32** %team.addr, align 8 + store i32 0, i32* %.omp.comb.lb, align 4 + store i32 9, i32* %.omp.comb.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %0 = load i32*, i32** %.global_tid..addr, align 8 + %1 = load i32, i32* %0, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %1, i32 92, i32* %.omp.is_last, i32* %.omp.comb.lb, i32* %.omp.comb.ub, i32* %.omp.stride, i32 1, i32 1) + %2 = load i32, i32* %.omp.comb.ub, align 4 + %cmp = icmp sgt i32 %2, 9 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + br label %cond.end + +cond.false: ; preds = %entry + %3 = load i32, i32* %.omp.comb.ub, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 9, %cond.true ], [ %3, %cond.false ] + store i32 %cond, i32* %.omp.comb.ub, align 4 + %4 = load i32, i32* %.omp.comb.lb, 
align 4 + store i32 %4, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %cond.end + %5 = load i32, i32* %.omp.iv, align 4 + %6 = load i32, i32* %.omp.comb.ub, align 4 + %cmp1 = icmp sle i32 %5, %6 + br i1 %cmp1, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %7 = load i32, i32* %.omp.comb.lb, align 4 + %8 = zext i32 %7 to i64 + %9 = load i32, i32* %.omp.comb.ub, align 4 + %10 = zext i32 %9 to i64 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @2, i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64, i64, i32**)* @.omp_outlined..1 to void (i32*, i32*, ...)*), i64 %8, i64 %10, i32** %dis.addr) + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.inner.for.body + %11 = load i32, i32* %.omp.iv, align 4 + %12 = load i32, i32* %.omp.stride, align 4 + %add = add nsw i32 %11, %12 + store i32 %add, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %1) + store i32 0, i32* %.omp.comb.lb4, align 4 + store i32 9, i32* %.omp.comb.ub5, align 4 + store i32 1, i32* %.omp.stride6, align 4 + store i32 0, i32* %.omp.is_last7, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %1, i32 92, i32* %.omp.is_last7, i32* %.omp.comb.lb4, i32* %.omp.comb.ub5, i32* %.omp.stride6, i32 1, i32 1) + %13 = load i32, i32* %.omp.comb.ub5, align 4 + %cmp9 = icmp sgt i32 %13, 9 + br i1 %cmp9, label %cond.true10, label %cond.false11 + +cond.true10: ; preds = %omp.loop.exit + br label %cond.end12 + +cond.false11: ; preds = %omp.loop.exit + %14 = load i32, i32* %.omp.comb.ub5, align 4 + br label %cond.end12 + +cond.end12: ; preds = %cond.false11, %cond.true10 + %cond13 = phi i32 [ 9, %cond.true10 ], [ %14, %cond.false11 ] + store i32 %cond13, i32* %.omp.comb.ub5, align 4 + %15 = load i32, i32* %.omp.comb.lb4, align 4 + store i32 %15, i32* %.omp.iv2, align 4 + br label %omp.inner.for.cond14 + +omp.inner.for.cond14: ; preds = %omp.inner.for.inc17, %cond.end12 + %16 = load i32, i32* %.omp.iv2, align 4 + %17 = load i32, i32* %.omp.comb.ub5, align 4 + %cmp15 = icmp sle i32 %16, %17 + br i1 %cmp15, label %omp.inner.for.body16, label %omp.inner.for.end19 + +omp.inner.for.body16: ; preds = %omp.inner.for.cond14 + %18 = load i32, i32* %.omp.comb.lb4, align 4 + %19 = zext i32 %18 to i64 + %20 = load i32, i32* %.omp.comb.ub5, align 4 + %21 = zext i32 %20 to i64 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* @2, i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64, i64, i32**)* @.omp_outlined..2 to void (i32*, i32*, ...)*), i64 %19, i64 %21, i32** %dis.addr) + br label %omp.inner.for.inc17 + +omp.inner.for.inc17: ; preds = %omp.inner.for.body16 + %22 = load i32, i32* %.omp.iv2, align 4 + %23 = load i32, i32* %.omp.stride6, align 4 + %add18 = add nsw i32 %22, %23 + store i32 %add18, i32* %.omp.iv2, align 4 + br label %omp.inner.for.cond14 + +omp.inner.for.end19: ; preds = %omp.inner.for.cond14 + br label %omp.loop.exit20 + +omp.loop.exit20: ; preds = %omp.inner.for.end19 + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %1) + %24 = load i32*, i32** %team.addr, align 8 + %call = call i32 @omp_get_team_num() + %idxprom = sext i32 %call to i64 + %arrayidx = getelementptr inbounds i32, i32* %24, i64 %idxprom + %25 = load i32, i32* %arrayidx, align 4 + %add21 = add nsw i32 %25, 1 + store i32 %add21, i32* %arrayidx, align 4 + ret void +} + +declare dso_local void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define internal void @.omp_outlined..1(i32* noalias %.global_tid., i32* noalias %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., i32** dereferenceable(8) %dis) #1 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %.previous.lb..addr = alloca i64, align 8 + %.previous.ub..addr = alloca i64, align 8 + %dis.addr = alloca i32**, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i64 %.previous.lb., i64* %.previous.lb..addr, align 8 + store i64 %.previous.ub., i64* %.previous.ub..addr, align 8 + store i32** %dis, i32*** %dis.addr, align 8 + %0 = load i32**, i32*** %dis.addr, align 8 + store i32 0, i32* %.omp.lb, align 4 + store i32 9, i32* %.omp.ub, align 4 + %1 = load i64, i64* %.previous.lb..addr, align 8 + %conv = trunc i64 %1 to i32 + %2 = load i64, i64* %.previous.ub..addr, align 8 + %conv1 = trunc i64 %2 to i32 + store i32 %conv, i32* %.omp.lb, align 4 + store i32 %conv1, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %3 = load i32*, i32** %.global_tid..addr, align 8 + %4 = load i32, i32* %3, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %4, i32 34, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %5 = load i32, i32* %.omp.ub, align 4 + %cmp = icmp sgt i32 %5, 9 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + br label %cond.end + +cond.false: ; preds = %entry + %6 = load i32, i32* %.omp.ub, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 9, %cond.true ], [ %6, %cond.false ] + store i32 %cond, i32* %.omp.ub, align 4 + %7 = load i32, i32* %.omp.lb, align 4 + store i32 %7, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %cond.end + %8 = load i32, i32* %.omp.iv, align 4 + %9 = load i32, i32* %.omp.ub, align 4 + %cmp3 = icmp sle i32 %8, %9 + br i1 %cmp3, label %omp.inner.for.body, label %omp.inner.for.end + 
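+; In the outlined parallel-for bodies below, 'i' is recomputed from the induction
+; variable, and the '#pragma omp atomic' increment of dis[i] is lowered to a
+; monotonic atomicrmw add.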
+omp.inner.for.body: ; preds = %omp.inner.for.cond + %10 = load i32, i32* %.omp.iv, align 4 + %mul = mul nsw i32 %10, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %i, align 4 + %11 = load i32*, i32** %0, align 8 + %12 = load i32, i32* %i, align 4 + %idxprom = sext i32 %12 to i64 + %arrayidx = getelementptr inbounds i32, i32* %11, i64 %idxprom + %13 = atomicrmw add i32* %arrayidx, i32 1 monotonic + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %14 = load i32, i32* %.omp.iv, align 4 + %add5 = add nsw i32 %14, 1 + store i32 %add5, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %4) + ret void +} + +declare dso_local void @__kmpc_for_static_fini(%struct.ident_t*, i32) + +declare !callback !3 dso_local void @__kmpc_fork_call(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define internal void @.omp_outlined..2(i32* noalias %.global_tid., i32* noalias %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., i32** dereferenceable(8) %dis) #1 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %.previous.lb..addr = alloca i64, align 8 + %.previous.ub..addr = alloca i64, align 8 + %dis.addr = alloca i32**, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i64 %.previous.lb., i64* %.previous.lb..addr, align 8 + store i64 %.previous.ub., i64* %.previous.ub..addr, align 8 + store i32** %dis, i32*** %dis.addr, align 8 + %0 = load i32**, i32*** %dis.addr, align 8 + store i32 0, i32* %.omp.lb, align 4 + store i32 9, i32* %.omp.ub, align 4 + %1 = load i64, i64* %.previous.lb..addr, align 8 + %conv = trunc i64 %1 to i32 + %2 = load i64, i64* %.previous.ub..addr, align 8 + %conv1 = trunc i64 %2 to i32 + store i32 %conv, i32* %.omp.lb, align 4 + store i32 %conv1, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %3 = load i32*, i32** %.global_tid..addr, align 8 + %4 = load i32, i32* %3, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %4, i32 34, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %5 = load i32, i32* %.omp.ub, align 4 + %cmp = icmp sgt i32 %5, 9 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + br label %cond.end + +cond.false: ; preds = %entry + %6 = load i32, i32* %.omp.ub, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 9, %cond.true ], [ %6, %cond.false ] + store i32 %cond, i32* %.omp.ub, align 4 + %7 = load i32, i32* %.omp.lb, align 4 + store i32 %7, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %cond.end + %8 = load i32, i32* %.omp.iv, align 4 + %9 = load i32, i32* %.omp.ub, align 4 + %cmp3 = icmp sle i32 %8, %9 + br i1 %cmp3, label %omp.inner.for.body, label %omp.inner.for.end + 
+omp.inner.for.body: ; preds = %omp.inner.for.cond + %10 = load i32, i32* %.omp.iv, align 4 + %mul = mul nsw i32 %10, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %i, align 4 + %11 = load i32*, i32** %0, align 8 + %12 = load i32, i32* %i, align 4 + %idxprom = sext i32 %12 to i64 + %arrayidx = getelementptr inbounds i32, i32* %11, i64 %idxprom + %13 = atomicrmw add i32* %arrayidx, i32 1 monotonic + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %14 = load i32, i32* %.omp.iv, align 4 + %add5 = add nsw i32 %14, 1 + store i32 %add5, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %4) + ret void +} + +declare dso_local i32 @omp_get_team_num() #2 + +declare dso_local i32 @__kmpc_global_thread_num(%struct.ident_t*) + +declare dso_local i32 @__kmpc_push_num_teams(%struct.ident_t*, i32, i32, i32) + +declare !callback !3 dso_local void @__kmpc_fork_teams(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) + +declare dso_local i32 @__tgt_target_teams(i64, i8*, i32, i8**, i8**, i64*, i64*, i32, i32) + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local i32 @main() #0 { +entry: + %retval = alloca i32, align 4 + %dis = alloca [10 x i32], align 16 + %team = alloca [10 x i32], align 16 + %i = alloca i32, align 4 + %i4 = alloca i32, align 4 + store i32 0, i32* %retval, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %0, 10 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load i32, i32* %i, align 4 + %2 = load i32, i32* %i, align 4 + %idxprom = sext i32 %2 to i64 + %arrayidx = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 %idxprom + store i32 %1, i32* %arrayidx, align 4 + %3 = load i32, i32* %i, align 4 + %idxprom1 = sext i32 %3 to i64 + %arrayidx2 = getelementptr inbounds [10 x i32], [10 x i32]* %team, i64 0, i64 %idxprom1 + store i32 0, i32* %arrayidx2, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %4 = load i32, i32* %i, align 4 + %inc = add nsw i32 %4, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %arraydecay = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i32 0, i32 0 + %arraydecay3 = getelementptr inbounds [10 x i32], [10 x i32]* %team, i32 0, i32 0 + call void @foo(i32* %arraydecay, i32* %arraydecay3) + store i32 0, i32* %i4, align 4 + br label %for.cond5 + +for.cond5: ; preds = %for.inc12, %for.end + %5 = load i32, i32* %i4, align 4 + %cmp6 = icmp slt i32 %5, 10 + br i1 %cmp6, label %for.body7, label %for.end14 + +for.body7: ; preds = %for.cond5 + %6 = load i32, i32* %i4, align 4 + %7 = load i32, i32* %i4, align 4 + %idxprom8 = sext i32 %7 to i64 + %arrayidx9 = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 %idxprom8 + %8 = load i32, i32* %arrayidx9, align 4 + %9 = load i32, i32* %i4, align 4 + %10 = load i32, i32* %i4, align 4 + %idxprom10 = sext i32 %10 to i64 + %arrayidx11 = getelementptr inbounds [10 x i32], [10 x i32]* %team, i64 0, i64 %idxprom10 + %11 = load i32, i32* %arrayidx11, align 4 + %call = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([33 x i8], [33 x i8]* @.str.3, i32 0, i32 0), i32 %6, i32 %8, i32 %9, i32 %11) + br label %for.inc12 + +for.inc12: ; preds = %for.body7 + %12 = load i32, i32* %i4, align 4 + %inc13 = add nsw i32 %12, 1 + store i32 %inc13, i32* %i4, align 4 + br label %for.cond5 + +for.end14: ; preds = %for.cond5 + ret i32 0 +} + +declare dso_local i32 @printf(i8*, ...) #2 + +; Function Attrs: noinline nounwind uwtable +define internal void @.omp_offloading.descriptor_unreg(i8*) #3 section ".text.startup" comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cuda) { +entry: + %.addr = alloca i8*, align 8 + store i8* %0, i8** %.addr, align 8 + %1 = call i32 @__tgt_unregister_lib(%struct.__tgt_bin_desc* @.omp_offloading.descriptor) + ret void +} + +declare dso_local i32 @__tgt_unregister_lib(%struct.__tgt_bin_desc*) + +; Function Attrs: noinline nounwind uwtable +define linkonce hidden void @.omp_offloading.descriptor_reg.nvptx64-nvida-cuda() #3 section ".text.startup" comdat { +entry: + %0 = call i32 @__tgt_register_lib(%struct.__tgt_bin_desc* @.omp_offloading.descriptor) + %1 = call i32 @__cxa_atexit(void (i8*)* @.omp_offloading.descriptor_unreg, i8* bitcast (%struct.__tgt_bin_desc* @.omp_offloading.descriptor to i8*), i8* @__dso_handle) #4 + ret void +} + +declare dso_local i32 @__tgt_register_lib(%struct.__tgt_bin_desc*) + +; Function Attrs: nounwind +declare dso_local i32 @__cxa_atexit(void (i8*)*, i8*, i8*) #4 + +attributes #0 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { noinline norecurse nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } 
+attributes #4 = { nounwind } + +!omp_offload.info = !{!0} +!llvm.module.flags = !{!1} +!llvm.ident = !{!2} + +!0 = !{i32 0, i32 43, i32 21153073, !"foo", i32 10, i32 0} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{!"clang version 9.0.0 "} +!3 = !{!4} +!4 = !{i64 2, i64 -1, i64 -1, i1 true} + +; __CLANG_OFFLOAD_BUNDLE____END__ host-x86_64-unknown-linux-gnu Index: SPMD_examples/v0.2/target_offload_is_SPMD.new.ll =================================================================== --- /dev/null +++ SPMD_examples/v0.2/target_offload_is_SPMD.new.ll @@ -0,0 +1,437 @@ + +; __CLANG_OFFLOAD_BUNDLE____START__ openmp-nvptx64-nvida-cuda +; ModuleID = '/home/jdoerfert/projects/llvm-project/llvm/test/Transforms/OpenMP/target_offload_is_SPMD.c' +source_filename = "/home/jdoerfert/projects/llvm-project/llvm/test/Transforms/OpenMP/target_offload_is_SPMD.c" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvida-cuda" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } +%"union._shared_openmp_static_memory_type_$_" = type { [128 x i8] } +%omp.shared.struct = type { i64, i64, i32** } +%omp.shared.struct.0 = type { i64, i64, i32** } +%struct._globalized_locals_ty = type { i32* } + +@"_openmp_kernel_static_glob_rd$ptr" = internal addrspace(3) global i8* null +@"_openmp_static_kernel$is_shared" = internal unnamed_addr constant i16 1 +@"_openmp_static_kernel$size" = internal unnamed_addr constant i64 8 +@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2050, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@2 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@__omp_offloading_2b_142c531_foo_l10_exec_mode = weak constant i8 1 +@"_openmp_shared_static_glob_rd_$_" = common addrspace(3) global %"union._shared_openmp_static_memory_type_$_" zeroinitializer +@llvm.compiler.used = appending global [1 x i8*] [i8* @__omp_offloading_2b_142c531_foo_l10_exec_mode], section "llvm.metadata" + +; Function Attrs: norecurse nounwind +define weak void @__omp_offloading_2b_142c531_foo_l10(i32* %dis, i32* %team) #0 { +entry: + %.global_tid..addr.i = alloca i32*, align 8 + %.bound_tid..addr.i = alloca i32*, align 8 + %dis.addr.i = alloca i32*, align 8 + %team.addr.i = alloca i32*, align 8 + %.omp.iv.i = alloca i32, align 4 + %tmp.i = alloca i32, align 4 + %.omp.comb.lb.i = alloca i32, align 4 + %.omp.comb.ub.i = alloca i32, align 4 + %.omp.stride.i = alloca i32, align 4 + %.omp.is_last.i = alloca i32, align 4 + %i.i = alloca i32, align 4 + %.zero.addr.i = alloca i32, align 4 + %.captured.i = alloca %omp.shared.struct, align 8 + %.omp.iv3.i = alloca i32, align 4 + %tmp4.i = alloca i32, align 4 + %.omp.comb.lb5.i = alloca i32, align 4 + %.omp.comb.ub6.i = alloca i32, align 4 + %.omp.stride7.i = alloca i32, align 4 + %.omp.is_last8.i = alloca i32, align 4 + %i9.i = alloca i32, align 4 + %.zero.addr18.i = alloca i32, align 4 + %.captured19.i = alloca %omp.shared.struct.0, align 8 + %dis.addr = alloca i32*, align 8 + %team.addr = alloca i32*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. 
= alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i32* %dis, i32** %dis.addr, align 8 + store i32* %team, i32** %team.addr, align 8 + %0 = call i16 @__kmpc_generic_kernel_init(i16 0, i16 1, i16 1, i16 0) + %1 = icmp eq i16 %0, 1 + br i1 %1, label %.execute, label %.exit + +.execute: ; preds = %entry + %2 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + %3 = load i32*, i32** %dis.addr, align 8 + %4 = load i32*, i32** %team.addr, align 8 + store i32 %2, i32* %.threadid_temp., align 4 + store i32 0, i32* %.zero.addr18.i, align 4, !noalias !10 + store i32 0, i32* %.zero.addr.i, align 4, !noalias !10 + store i32* %.threadid_temp., i32** %.global_tid..addr.i, align 8, !noalias !10 + store i32* %.zero.addr, i32** %.bound_tid..addr.i, align 8, !noalias !10 + store i32* %3, i32** %dis.addr.i, align 8, !noalias !10 + store i32* %4, i32** %team.addr.i, align 8, !noalias !10 + call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 8, i16 1, i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) #3 + %5 = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8 + %6 = bitcast i8* %5 to %struct._globalized_locals_ty* + %7 = load i32*, i32** %dis.addr.i, align 8, !noalias !10 + %dis1.i = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* %6, i32 0, i32 0 + store i32* %7, i32** %dis1.i, align 8 + store i32 0, i32* %.omp.comb.lb.i, align 4, !noalias !10 + store i32 9, i32* %.omp.comb.ub.i, align 4, !noalias !10 + store i32 1, i32* %.omp.stride.i, align 4, !noalias !10 + store i32 0, i32* %.omp.is_last.i, align 4, !noalias !10 + %8 = load i32*, i32** %.global_tid..addr.i, align 8, !noalias !10 + %9 = load i32, i32* %8, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %9, i32 92, i32* %.omp.is_last.i, i32* %.omp.comb.lb.i, i32* %.omp.comb.ub.i, i32* %.omp.stride.i, i32 1, i32 1) #3 + %10 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !10 + %cmp.i = icmp sgt i32 %10, 9 + br i1 %cmp.i, label %cond.true.i, label %cond.false.i + +cond.true.i: ; preds = %.execute + br label %cond.end.i + +cond.false.i: ; preds = %.execute + %11 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !10 + br label %cond.end.i + +cond.end.i: ; preds = %cond.false.i, %cond.true.i + %cond.i = phi i32 [ 9, %cond.true.i ], [ %11, %cond.false.i ] + store i32 %cond.i, i32* %.omp.comb.ub.i, align 4, !noalias !10 + %12 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !10 + store i32 %12, i32* %.omp.iv.i, align 4, !noalias !10 + br label %omp.inner.for.cond.i + +omp.inner.for.cond.i: ; preds = %omp.inner.for.body.i, %cond.end.i + %13 = load i32, i32* %.omp.iv.i, align 4, !noalias !10 + %14 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !10 + %cmp2.i = icmp sle i32 %13, %14 + br i1 %cmp2.i, label %omp.inner.for.body.i, label %omp.inner.for.end.i + +omp.inner.for.body.i: ; preds = %omp.inner.for.cond.i + %15 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !10 + %16 = zext i32 %15 to i64 + %17 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !10 + %18 = zext i32 %17 to i64 + %19 = bitcast %omp.shared.struct* %.captured.i to i8* + %20 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i32 0, i32 0 + store i64 %16, i64* %20, !noalias !10 + 
%21 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i32 0, i32 1 + store i64 %18, i64* %21, !noalias !10 + %22 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i32 0, i32 2 + store i32** %dis1.i, i32*** %22, !noalias !10 + call void @__kmpc_generic_kernel_parallel(i8* bitcast (void (i8*)* @__omp_outlined__1_wrapper to i8*), i8* %19, i16 24, i16 1) #3 + %23 = load i32, i32* %.omp.iv.i, align 4, !noalias !10 + %24 = load i32, i32* %.omp.stride.i, align 4, !noalias !10 + %add.i = add nsw i32 %23, %24 + store i32 %add.i, i32* %.omp.iv.i, align 4, !noalias !10 + br label %omp.inner.for.cond.i + +omp.inner.for.end.i: ; preds = %omp.inner.for.cond.i + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %9) #3 + store i32 0, i32* %.omp.comb.lb5.i, align 4, !noalias !10 + store i32 9, i32* %.omp.comb.ub6.i, align 4, !noalias !10 + store i32 1, i32* %.omp.stride7.i, align 4, !noalias !10 + store i32 0, i32* %.omp.is_last8.i, align 4, !noalias !10 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %9, i32 92, i32* %.omp.is_last8.i, i32* %.omp.comb.lb5.i, i32* %.omp.comb.ub6.i, i32* %.omp.stride7.i, i32 1, i32 1) #3 + %25 = load i32, i32* %.omp.comb.ub6.i, align 4, !noalias !10 + %cmp10.i = icmp sgt i32 %25, 9 + br i1 %cmp10.i, label %cond.true11.i, label %cond.false12.i + +cond.true11.i: ; preds = %omp.inner.for.end.i + br label %cond.end13.i + +cond.false12.i: ; preds = %omp.inner.for.end.i + %26 = load i32, i32* %.omp.comb.ub6.i, align 4, !noalias !10 + br label %cond.end13.i + +cond.end13.i: ; preds = %cond.false12.i, %cond.true11.i + %cond14.i = phi i32 [ 9, %cond.true11.i ], [ %26, %cond.false12.i ] + store i32 %cond14.i, i32* %.omp.comb.ub6.i, align 4, !noalias !10 + %27 = load i32, i32* %.omp.comb.lb5.i, align 4, !noalias !10 + store i32 %27, i32* %.omp.iv3.i, align 4, !noalias !10 + br label %omp.inner.for.cond15.i + +omp.inner.for.cond15.i: ; preds = %omp.inner.for.body17.i, %cond.end13.i + %28 = load i32, i32* %.omp.iv3.i, align 4, !noalias !10 + %29 = load i32, i32* %.omp.comb.ub6.i, align 4, !noalias !10 + %cmp16.i = icmp sle i32 %28, %29 + br i1 %cmp16.i, label %omp.inner.for.body17.i, label %__omp_outlined__.exit + +omp.inner.for.body17.i: ; preds = %omp.inner.for.cond15.i + %30 = load i32, i32* %.omp.comb.lb5.i, align 4, !noalias !10 + %31 = zext i32 %30 to i64 + %32 = load i32, i32* %.omp.comb.ub6.i, align 4, !noalias !10 + %33 = zext i32 %32 to i64 + %34 = bitcast %omp.shared.struct.0* %.captured19.i to i8* + %35 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %.captured19.i, i32 0, i32 0 + store i64 %31, i64* %35, !noalias !10 + %36 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %.captured19.i, i32 0, i32 1 + store i64 %33, i64* %36, !noalias !10 + %37 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %.captured19.i, i32 0, i32 2 + store i32** %dis1.i, i32*** %37, !noalias !10 + call void @__kmpc_generic_kernel_parallel(i8* bitcast (void (i8*)* @__omp_outlined__2_wrapper to i8*), i8* %34, i16 24, i16 1) #3 + %38 = load i32, i32* %.omp.iv3.i, align 4, !noalias !10 + %39 = load i32, i32* %.omp.stride7.i, align 4, !noalias !10 + %add21.i = add nsw i32 %38, %39 + store i32 %add21.i, i32* %.omp.iv3.i, align 4, !noalias !10 + br label %omp.inner.for.cond15.i + +__omp_outlined__.exit: ; preds = %omp.inner.for.cond15.i + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %9) #3 + %40 = load i32*, i32** %team.addr.i, align 8, !noalias !10 + 
%call.i = call i32 @omp_get_team_num() #3 + %idxprom.i = sext i32 %call.i to i64 + %arrayidx.i = getelementptr inbounds i32, i32* %40, i64 %idxprom.i + %41 = load i32, i32* %arrayidx.i, align 4 + %add24.i = add nsw i32 %41, 1 + store i32 %add24.i, i32* %arrayidx.i, align 4 + call void @__kmpc_restore_team_static_memory(i16 0, i16 1) #3 + br label %.omp.deinit + +.omp.deinit: ; preds = %__omp_outlined__.exit + call void @__kmpc_generic_kernel_deinit(i16 0, i16 1) + br label %.exit + +.exit: ; preds = %.omp.deinit, %entry + ret void +} + +declare i16 @__kmpc_generic_kernel_init(i16, i16, i16, i16) + +declare void @__kmpc_get_team_static_memory(i16, i8*, i64, i16, i8**) + +declare void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) + +; Function Attrs: norecurse nounwind +define internal void @__omp_outlined__1(i32* noalias %.global_tid., i32* noalias %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., i32** dereferenceable(8) %dis) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %.previous.lb..addr = alloca i64, align 8 + %.previous.ub..addr = alloca i64, align 8 + %dis.addr = alloca i32**, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i64 %.previous.lb., i64* %.previous.lb..addr, align 8 + store i64 %.previous.ub., i64* %.previous.ub..addr, align 8 + store i32** %dis, i32*** %dis.addr, align 8 + %0 = load i32**, i32*** %dis.addr, align 8 + store i32 0, i32* %.omp.lb, align 4 + store i32 9, i32* %.omp.ub, align 4 + %1 = load i64, i64* %.previous.lb..addr, align 8 + %conv = trunc i64 %1 to i32 + %2 = load i64, i64* %.previous.ub..addr, align 8 + %conv1 = trunc i64 %2 to i32 + store i32 %conv, i32* %.omp.lb, align 4 + store i32 %conv1, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %3 = load i32*, i32** %.global_tid..addr, align 8 + %4 = load i32, i32* %3, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %4, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %5 = load i32, i32* %.omp.lb, align 4 + store i32 %5, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %entry + %6 = load i32, i32* %.omp.iv, align 4 + %conv2 = sext i32 %6 to i64 + %7 = load i64, i64* %.previous.ub..addr, align 8 + %cmp = icmp ule i64 %conv2, %7 + br i1 %cmp, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %8 = load i32, i32* %.omp.iv, align 4 + %mul = mul nsw i32 %8, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %i, align 4 + %9 = load i32*, i32** %0, align 8 + %10 = load i32, i32* %i, align 4 + %idxprom = sext i32 %10 to i64 + %arrayidx = getelementptr inbounds i32, i32* %9, i64 %idxprom + %11 = atomicrmw add i32* %arrayidx, i32 1 monotonic + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %12 = load i32, i32* %.omp.iv, align 4 + %13 = load i32, i32* %.omp.stride, align 4 + %add4 = add nsw i32 %12, %13 + store i32 %add4, i32* %.omp.iv, align 4 + br 
label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %4) + ret void +} + +declare void @__kmpc_for_static_fini(%struct.ident_t*, i32) + +; Function Attrs: norecurse nounwind +define internal void @__omp_outlined__1_wrapper(i8* %payload) #1 { +entry: + %.addr = alloca i8*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. = alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i8* %payload, i8** %.addr, align 8 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + store i32 %0, i32* %.threadid_temp., align 4 + %1 = load i8*, i8** %.addr, align 8 + %2 = bitcast i8* %1 to %omp.shared.struct* + %3 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %2, i32 0, i32 0 + %4 = load i64, i64* %3, align 1 + %5 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %2, i32 0, i32 1 + %6 = load i64, i64* %5, align 1 + %7 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %2, i32 0, i32 2 + %8 = load i32**, i32*** %7, align 1 + call void @__omp_outlined__1(i32* %.threadid_temp., i32* %.zero.addr, i64 %4, i64 %6, i32** %8) #3 + ret void +} + +declare i32 @__kmpc_global_thread_num(%struct.ident_t*) + +declare void @__kmpc_generic_kernel_parallel(i8*, i8*, i16, i16) + +; Function Attrs: norecurse nounwind +define internal void @__omp_outlined__2(i32* noalias %.global_tid., i32* noalias %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., i32** dereferenceable(8) %dis) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %.previous.lb..addr = alloca i64, align 8 + %.previous.ub..addr = alloca i64, align 8 + %dis.addr = alloca i32**, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i64 %.previous.lb., i64* %.previous.lb..addr, align 8 + store i64 %.previous.ub., i64* %.previous.ub..addr, align 8 + store i32** %dis, i32*** %dis.addr, align 8 + %0 = load i32**, i32*** %dis.addr, align 8 + store i32 0, i32* %.omp.lb, align 4 + store i32 9, i32* %.omp.ub, align 4 + %1 = load i64, i64* %.previous.lb..addr, align 8 + %conv = trunc i64 %1 to i32 + %2 = load i64, i64* %.previous.ub..addr, align 8 + %conv1 = trunc i64 %2 to i32 + store i32 %conv, i32* %.omp.lb, align 4 + store i32 %conv1, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %3 = load i32*, i32** %.global_tid..addr, align 8 + %4 = load i32, i32* %3, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %4, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %5 = load i32, i32* %.omp.lb, align 4 + store i32 %5, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %entry + %6 = load i32, i32* %.omp.iv, align 4 + %conv2 = sext i32 %6 to i64 + %7 = load i64, i64* %.previous.ub..addr, align 8 + %cmp = icmp ule i64 %conv2, %7 + br i1 %cmp, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %8 = load i32, i32* %.omp.iv, align 4 + %mul = mul nsw 
i32 %8, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %i, align 4 + %9 = load i32*, i32** %0, align 8 + %10 = load i32, i32* %i, align 4 + %idxprom = sext i32 %10 to i64 + %arrayidx = getelementptr inbounds i32, i32* %9, i64 %idxprom + %11 = atomicrmw add i32* %arrayidx, i32 1 monotonic + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %12 = load i32, i32* %.omp.iv, align 4 + %13 = load i32, i32* %.omp.stride, align 4 + %add4 = add nsw i32 %12, %13 + store i32 %add4, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %4) + ret void +} + +; Function Attrs: norecurse nounwind +define internal void @__omp_outlined__2_wrapper(i8* %payload) #1 { +entry: + %.addr = alloca i8*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. = alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i8* %payload, i8** %.addr, align 8 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + store i32 %0, i32* %.threadid_temp., align 4 + %1 = load i8*, i8** %.addr, align 8 + %2 = bitcast i8* %1 to %omp.shared.struct.0* + %3 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %2, i32 0, i32 0 + %4 = load i64, i64* %3, align 1 + %5 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %2, i32 0, i32 1 + %6 = load i64, i64* %5, align 1 + %7 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %2, i32 0, i32 2 + %8 = load i32**, i32*** %7, align 1 + call void @__omp_outlined__2(i32* %.threadid_temp., i32* %.zero.addr, i64 %4, i64 %6, i32** %8) #3 + ret void +} + +declare i32 @omp_get_team_num() #2 + +declare void @__kmpc_restore_team_static_memory(i16, i16) + +declare void @__kmpc_generic_kernel_deinit(i16, i16) + +attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx61,+sm_70" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx61,+sm_70" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx61,+sm_70" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { nounwind } + +!omp_offload.info = !{!0} 
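+; The !nvvm.annotations entry below marks __omp_offloading_2b_142c531_foo_l10 as a
+; PTX kernel; the kernel's exec_mode global (i8 1) selects generic, i.e. non-SPMD,
+; execution.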
+!nvvm.annotations = !{!1, !2, !3, !2, !4, !4, !4, !4, !5, !5, !4} +!llvm.module.flags = !{!6, !7} +!llvm.ident = !{!8} +!nvvm.internalize.after.link = !{} +!nvvmir.version = !{!9} + +!0 = !{i32 0, i32 43, i32 21153073, !"foo", i32 10, i32 0} +!1 = !{void (i32*, i32*)* @__omp_offloading_2b_142c531_foo_l10, !"kernel", i32 1} +!2 = !{null, !"align", i32 8} +!3 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +!4 = !{null, !"align", i32 16} +!5 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +!6 = !{i32 1, !"wchar_size", i32 4} +!7 = !{i32 7, !"PIC Level", i32 2} +!8 = !{!"clang version 9.0.0 "} +!9 = !{i32 1, i32 2} +!10 = !{!11, !13} +!11 = distinct !{!11, !12, !"__omp_outlined__: %.global_tid."} +!12 = distinct !{!12, !"__omp_outlined__"} +!13 = distinct !{!13, !12, !"__omp_outlined__: %.bound_tid."} + +; __CLANG_OFFLOAD_BUNDLE____END__ openmp-nvptx64-nvida-cuda Index: SPMD_examples/v0.2/target_offload_is_SPMD.old.forced.ll =================================================================== --- /dev/null +++ SPMD_examples/v0.2/target_offload_is_SPMD.old.forced.ll @@ -0,0 +1,944 @@ + +; __CLANG_OFFLOAD_BUNDLE____START__ openmp-nvptx64-nvida-cuda +; ModuleID = '/home/jdoerfert/projects/llvm-project/llvm/test/Transforms/OpenMP/target_offload_is_SPMD.c' +source_filename = "/home/jdoerfert/projects/llvm-project/llvm/test/Transforms/OpenMP/target_offload_is_SPMD.c" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvida-cuda" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } + +@"_openmp_kernel_static_glob_rd$ptr" = internal addrspace(3) global i8* null +@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2050, i32 1, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 1, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@2 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 1, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@__omp_offloading_2b_142c531_foo_l10_exec_mode = weak constant i8 0 +@llvm.compiler.used = appending global [1 x i8*] [i8* @__omp_offloading_2b_142c531_foo_l10_exec_mode], section "llvm.metadata" + +; Function Attrs: noinline norecurse nounwind optnone +define weak void @__omp_offloading_2b_142c531_foo_l10(i32* %dis, i32* %team) #0 { +entry: + %.global_tid..addr.i = alloca i32*, align 8 + %.bound_tid..addr.i = alloca i32*, align 8 + %dis.addr.i = alloca i32*, align 8 + %team.addr.i = alloca i32*, align 8 + %.omp.iv.i = alloca i32, align 4 + %tmp.i = alloca i32, align 4 + %.omp.comb.lb.i = alloca i32, align 4 + %.omp.comb.ub.i = alloca i32, align 4 + %.omp.stride.i = alloca i32, align 4 + %.omp.is_last.i = alloca i32, align 4 + %i.i = alloca i32, align 4 + %.zero.addr.i = alloca i32, align 4 + %.omp.iv9.i = alloca i32, align 4 + %tmp10.i = alloca i32, align 4 + %.omp.comb.lb11.i = alloca i32, align 4 + %.omp.comb.ub12.i = alloca i32, align 4 + %.omp.stride13.i = alloca i32, align 4 + %.omp.is_last14.i = alloca i32, align 4 + %i15.i = alloca i32, align 4 + %.zero.addr25.i = alloca i32, align 4 + %dis.addr = alloca i32*, align 8 + %team.addr = alloca i32*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. 
= alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i32* %dis, i32** %dis.addr, align 8 + store i32* %team, i32** %team.addr, align 8 + %nvptx_num_threads = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range !10 + call void @__kmpc_spmd_kernel_init(i32 %nvptx_num_threads, i16 1, i16 0) + call void @__kmpc_data_sharing_init_stack_spmd() + br label %.execute + +.execute: ; preds = %entry + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + %1 = load i32*, i32** %dis.addr, align 8 + %2 = load i32*, i32** %team.addr, align 8 + store i32 %0, i32* %.threadid_temp., align 4 + store i32 0, i32* %.zero.addr25.i, align 4, !noalias !11 + store i32 0, i32* %.zero.addr.i, align 4, !noalias !11 + store i32* %.threadid_temp., i32** %.global_tid..addr.i, align 8, !noalias !11 + store i32* %.zero.addr, i32** %.bound_tid..addr.i, align 8, !noalias !11 + store i32* %1, i32** %dis.addr.i, align 8, !noalias !11 + store i32* %2, i32** %team.addr.i, align 8, !noalias !11 + store i32 0, i32* %.omp.comb.lb.i, align 4, !noalias !11 + store i32 9, i32* %.omp.comb.ub.i, align 4, !noalias !11 + store i32 1, i32* %.omp.stride.i, align 4, !noalias !11 + store i32 0, i32* %.omp.is_last.i, align 4, !noalias !11 + %nvptx_num_threads.i = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #3, !range !10 + %3 = load i32*, i32** %.global_tid..addr.i, align 8, !noalias !11 + %4 = load i32, i32* %3, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %4, i32 91, i32* %.omp.is_last.i, i32* %.omp.comb.lb.i, i32* %.omp.comb.ub.i, i32* %.omp.stride.i, i32 1, i32 %nvptx_num_threads.i) #3 + %5 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !11 + %cmp.i = icmp sgt i32 %5, 9 + br i1 %cmp.i, label %cond.true.i, label %cond.false.i + +cond.true.i: ; preds = %.execute + br label %cond.end.i + +cond.false.i: ; preds = %.execute + %6 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !11 + br label %cond.end.i + +cond.end.i: ; preds = %cond.false.i, %cond.true.i + %cond.i = phi i32 [ 9, %cond.true.i ], [ %6, %cond.false.i ] + store i32 %cond.i, i32* %.omp.comb.ub.i, align 4, !noalias !11 + %7 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !11 + store i32 %7, i32* %.omp.iv.i, align 4, !noalias !11 + br label %omp.inner.for.cond.i + +omp.inner.for.cond.i: ; preds = %cond.end7.i, %cond.end.i + %8 = load i32, i32* %.omp.iv.i, align 4, !noalias !11 + %cmp1.i = icmp slt i32 %8, 10 + br i1 %cmp1.i, label %omp.inner.for.body.i, label %omp.inner.for.end.i + +omp.inner.for.body.i: ; preds = %omp.inner.for.cond.i + %9 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !11 + %10 = zext i32 %9 to i64 + %11 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !11 + %12 = zext i32 %11 to i64 + %13 = load i32*, i32** %.global_tid..addr.i, align 8, !noalias !11 + call void @__omp_outlined__1(i32* %13, i32* %.zero.addr.i, i64 %10, i64 %12, i32** %dis.addr.i) #3 + %14 = load i32, i32* %.omp.iv.i, align 4, !noalias !11 + %15 = load i32, i32* %.omp.stride.i, align 4, !noalias !11 + %add.i = add nsw i32 %14, %15 + store i32 %add.i, i32* %.omp.iv.i, align 4, !noalias !11 + %16 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !11 + %17 = load i32, i32* %.omp.stride.i, align 4, !noalias !11 + %add2.i = add nsw i32 %16, %17 + store i32 %add2.i, i32* %.omp.comb.lb.i, align 4, !noalias !11 + %18 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !11 + %19 = load i32, i32* %.omp.stride.i, align 4, !noalias !11 + %add3.i = add nsw i32 %18, %19 + store i32 %add3.i, i32* %.omp.comb.ub.i, align 4, !noalias !11 + %20 = 
load i32, i32* %.omp.comb.ub.i, align 4, !noalias !11 + %cmp4.i = icmp sgt i32 %20, 9 + br i1 %cmp4.i, label %cond.true5.i, label %cond.false6.i + +cond.true5.i: ; preds = %omp.inner.for.body.i + br label %cond.end7.i + +cond.false6.i: ; preds = %omp.inner.for.body.i + %21 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !11 + br label %cond.end7.i + +cond.end7.i: ; preds = %cond.false6.i, %cond.true5.i + %cond8.i = phi i32 [ 9, %cond.true5.i ], [ %21, %cond.false6.i ] + store i32 %cond8.i, i32* %.omp.comb.ub.i, align 4, !noalias !11 + %22 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !11 + store i32 %22, i32* %.omp.iv.i, align 4, !noalias !11 + br label %omp.inner.for.cond.i + +omp.inner.for.end.i: ; preds = %omp.inner.for.cond.i + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %4) #3 + store i32 0, i32* %.omp.comb.lb11.i, align 4, !noalias !11 + store i32 9, i32* %.omp.comb.ub12.i, align 4, !noalias !11 + store i32 1, i32* %.omp.stride13.i, align 4, !noalias !11 + store i32 0, i32* %.omp.is_last14.i, align 4, !noalias !11 + %nvptx_num_threads16.i = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #3, !range !10 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %4, i32 91, i32* %.omp.is_last14.i, i32* %.omp.comb.lb11.i, i32* %.omp.comb.ub12.i, i32* %.omp.stride13.i, i32 1, i32 %nvptx_num_threads16.i) #3 + %23 = load i32, i32* %.omp.comb.ub12.i, align 4, !noalias !11 + %cmp17.i = icmp sgt i32 %23, 9 + br i1 %cmp17.i, label %cond.true18.i, label %cond.false19.i + +cond.true18.i: ; preds = %omp.inner.for.end.i + br label %cond.end20.i + +cond.false19.i: ; preds = %omp.inner.for.end.i + %24 = load i32, i32* %.omp.comb.ub12.i, align 4, !noalias !11 + br label %cond.end20.i + +cond.end20.i: ; preds = %cond.false19.i, %cond.true18.i + %cond21.i = phi i32 [ 9, %cond.true18.i ], [ %24, %cond.false19.i ] + store i32 %cond21.i, i32* %.omp.comb.ub12.i, align 4, !noalias !11 + %25 = load i32, i32* %.omp.comb.lb11.i, align 4, !noalias !11 + store i32 %25, i32* %.omp.iv9.i, align 4, !noalias !11 + br label %omp.inner.for.cond22.i + +omp.inner.for.cond22.i: ; preds = %cond.end33.i, %cond.end20.i + %26 = load i32, i32* %.omp.iv9.i, align 4, !noalias !11 + %cmp23.i = icmp slt i32 %26, 10 + br i1 %cmp23.i, label %omp.inner.for.body24.i, label %__omp_outlined__.exit + +omp.inner.for.body24.i: ; preds = %omp.inner.for.cond22.i + %27 = load i32, i32* %.omp.comb.lb11.i, align 4, !noalias !11 + %28 = zext i32 %27 to i64 + %29 = load i32, i32* %.omp.comb.ub12.i, align 4, !noalias !11 + %30 = zext i32 %29 to i64 + %31 = load i32*, i32** %.global_tid..addr.i, align 8, !noalias !11 + call void @__omp_outlined__2(i32* %31, i32* %.zero.addr25.i, i64 %28, i64 %30, i32** %dis.addr.i) #3 + %32 = load i32, i32* %.omp.iv9.i, align 4, !noalias !11 + %33 = load i32, i32* %.omp.stride13.i, align 4, !noalias !11 + %add27.i = add nsw i32 %32, %33 + store i32 %add27.i, i32* %.omp.iv9.i, align 4, !noalias !11 + %34 = load i32, i32* %.omp.comb.lb11.i, align 4, !noalias !11 + %35 = load i32, i32* %.omp.stride13.i, align 4, !noalias !11 + %add28.i = add nsw i32 %34, %35 + store i32 %add28.i, i32* %.omp.comb.lb11.i, align 4, !noalias !11 + %36 = load i32, i32* %.omp.comb.ub12.i, align 4, !noalias !11 + %37 = load i32, i32* %.omp.stride13.i, align 4, !noalias !11 + %add29.i = add nsw i32 %36, %37 + store i32 %add29.i, i32* %.omp.comb.ub12.i, align 4, !noalias !11 + %38 = load i32, i32* %.omp.comb.ub12.i, align 4, !noalias !11 + %cmp30.i = icmp sgt i32 %38, 9 + br i1 %cmp30.i, label %cond.true31.i, label 
%cond.false32.i + +cond.true31.i: ; preds = %omp.inner.for.body24.i + br label %cond.end33.i + +cond.false32.i: ; preds = %omp.inner.for.body24.i + %39 = load i32, i32* %.omp.comb.ub12.i, align 4, !noalias !11 + br label %cond.end33.i + +cond.end33.i: ; preds = %cond.false32.i, %cond.true31.i + %cond34.i = phi i32 [ 9, %cond.true31.i ], [ %39, %cond.false32.i ] + store i32 %cond34.i, i32* %.omp.comb.ub12.i, align 4, !noalias !11 + %40 = load i32, i32* %.omp.comb.lb11.i, align 4, !noalias !11 + store i32 %40, i32* %.omp.iv9.i, align 4, !noalias !11 + br label %omp.inner.for.cond22.i + +__omp_outlined__.exit: ; preds = %omp.inner.for.cond22.i + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %4) #3 + %41 = load i32*, i32** %team.addr.i, align 8, !noalias !11 + %call.i = call i32 @omp_get_team_num() #3 + %idxprom.i = sext i32 %call.i to i64 + %arrayidx.i = getelementptr inbounds i32, i32* %41, i64 %idxprom.i + %42 = load i32, i32* %arrayidx.i, align 4 + %add37.i = add nsw i32 %42, 1 + store i32 %add37.i, i32* %arrayidx.i, align 4 + br label %.omp.deinit + +.omp.deinit: ; preds = %__omp_outlined__.exit + call void @__kmpc_spmd_kernel_deinit_v2(i16 1) + br label %.exit + +.exit: ; preds = %.omp.deinit + ret void +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #1 + +declare void @__kmpc_spmd_kernel_init(i32, i16, i16) + +declare void @__kmpc_data_sharing_init_stack_spmd() + +declare void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) + +; Function Attrs: noinline norecurse nounwind optnone +define internal void @__omp_outlined__1(i32* noalias %.global_tid., i32* noalias %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., i32** dereferenceable(8) %dis) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %.previous.lb..addr = alloca i64, align 8 + %.previous.ub..addr = alloca i64, align 8 + %dis.addr = alloca i32**, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i64 %.previous.lb., i64* %.previous.lb..addr, align 8 + store i64 %.previous.ub., i64* %.previous.ub..addr, align 8 + store i32** %dis, i32*** %dis.addr, align 8 + %0 = load i32**, i32*** %dis.addr, align 8 + store i32 0, i32* %.omp.lb, align 4 + store i32 9, i32* %.omp.ub, align 4 + %1 = load i64, i64* %.previous.lb..addr, align 8 + %conv = trunc i64 %1 to i32 + %2 = load i64, i64* %.previous.ub..addr, align 8 + %conv1 = trunc i64 %2 to i32 + store i32 %conv, i32* %.omp.lb, align 4 + store i32 %conv1, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %3 = load i32*, i32** %.global_tid..addr, align 8 + %4 = load i32, i32* %3, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %4, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %5 = load i32, i32* %.omp.lb, align 4 + store i32 %5, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %entry + %6 = load i32, i32* %.omp.iv, align 4 + %conv2 = sext i32 %6 to i64 + %7 = load i64, i64* %.previous.ub..addr, align 8 + %cmp = icmp ule i64 %conv2, %7 + br i1 %cmp, 
label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %8 = load i32, i32* %.omp.iv, align 4 + %mul = mul nsw i32 %8, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %i, align 4 + %9 = load i32*, i32** %0, align 8 + %10 = load i32, i32* %i, align 4 + %idxprom = sext i32 %10 to i64 + %arrayidx = getelementptr inbounds i32, i32* %9, i64 %idxprom + %11 = atomicrmw add i32* %arrayidx, i32 1 monotonic + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %12 = load i32, i32* %.omp.iv, align 4 + %13 = load i32, i32* %.omp.stride, align 4 + %add4 = add nsw i32 %12, %13 + store i32 %add4, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %4) + ret void +} + +declare void @__kmpc_for_static_fini(%struct.ident_t*, i32) + +; Function Attrs: noinline norecurse nounwind optnone +define internal void @__omp_outlined__2(i32* noalias %.global_tid., i32* noalias %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., i32** dereferenceable(8) %dis) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %.previous.lb..addr = alloca i64, align 8 + %.previous.ub..addr = alloca i64, align 8 + %dis.addr = alloca i32**, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i64 %.previous.lb., i64* %.previous.lb..addr, align 8 + store i64 %.previous.ub., i64* %.previous.ub..addr, align 8 + store i32** %dis, i32*** %dis.addr, align 8 + %0 = load i32**, i32*** %dis.addr, align 8 + store i32 0, i32* %.omp.lb, align 4 + store i32 9, i32* %.omp.ub, align 4 + %1 = load i64, i64* %.previous.lb..addr, align 8 + %conv = trunc i64 %1 to i32 + %2 = load i64, i64* %.previous.ub..addr, align 8 + %conv1 = trunc i64 %2 to i32 + store i32 %conv, i32* %.omp.lb, align 4 + store i32 %conv1, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %3 = load i32*, i32** %.global_tid..addr, align 8 + %4 = load i32, i32* %3, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %4, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %5 = load i32, i32* %.omp.lb, align 4 + store i32 %5, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %entry + %6 = load i32, i32* %.omp.iv, align 4 + %conv2 = sext i32 %6 to i64 + %7 = load i64, i64* %.previous.ub..addr, align 8 + %cmp = icmp ule i64 %conv2, %7 + br i1 %cmp, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %8 = load i32, i32* %.omp.iv, align 4 + %mul = mul nsw i32 %8, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %i, align 4 + %9 = load i32*, i32** %0, align 8 + %10 = load i32, i32* %i, align 4 + %idxprom = sext i32 %10 to i64 + %arrayidx = getelementptr inbounds i32, i32* %9, i64 %idxprom + %11 = atomicrmw add i32* %arrayidx, i32 1 monotonic + br label 
%omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %12 = load i32, i32* %.omp.iv, align 4 + %13 = load i32, i32* %.omp.stride, align 4 + %add4 = add nsw i32 %12, %13 + store i32 %add4, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %4) + ret void +} + +declare i32 @omp_get_team_num() #2 + +declare i32 @__kmpc_global_thread_num(%struct.ident_t*) + +declare void @__kmpc_spmd_kernel_deinit_v2(i16) + +attributes #0 = { noinline norecurse nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx61,+sm_70" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone } +attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx61,+sm_70" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { nounwind } + +!omp_offload.info = !{!0} +!nvvm.annotations = !{!1, !2, !3, !2, !4, !4, !4, !4, !5, !5, !4} +!llvm.module.flags = !{!6, !7} +!llvm.ident = !{!8} +!nvvm.internalize.after.link = !{} +!nvvmir.version = !{!9} + +!0 = !{i32 0, i32 43, i32 21153073, !"foo", i32 10, i32 0} +!1 = !{void (i32*, i32*)* @__omp_offloading_2b_142c531_foo_l10, !"kernel", i32 1} +!2 = !{null, !"align", i32 8} +!3 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +!4 = !{null, !"align", i32 16} +!5 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +!6 = !{i32 1, !"wchar_size", i32 4} +!7 = !{i32 7, !"PIC Level", i32 2} +!8 = !{!"clang version 9.0.0 "} +!9 = !{i32 1, i32 2} +!10 = !{i32 1, i32 1025} +!11 = !{!12, !14} +!12 = distinct !{!12, !13, !"__omp_outlined__: %.global_tid."} +!13 = distinct !{!13, !"__omp_outlined__"} +!14 = distinct !{!14, !13, !"__omp_outlined__: %.bound_tid."} + +; __CLANG_OFFLOAD_BUNDLE____END__ openmp-nvptx64-nvida-cuda + +; __CLANG_OFFLOAD_BUNDLE____START__ host-x86_64-unknown-linux-gnu +; ModuleID = '/tmp/jdoerfert/target_offload_is_SPMD-c76c92.bc' +source_filename = "/home/jdoerfert/projects/llvm-project/llvm/test/Transforms/OpenMP/target_offload_is_SPMD.c" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } +%struct.__tgt_offload_entry = type { i8*, i8*, i64, i32, i32 } +%struct.__tgt_device_image = type { i8*, i8*, %struct.__tgt_offload_entry*, %struct.__tgt_offload_entry* } +%struct.__tgt_bin_desc = type { i32, %struct.__tgt_device_image*, %struct.__tgt_offload_entry*, %struct.__tgt_offload_entry* } + +$.omp_offloading.descriptor_reg.nvptx64-nvida-cuda = comdat any + +@.str = private unnamed_addr constant [23 x i8] 
c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr global %struct.ident_t { i32 0, i32 2050, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@1 = private unnamed_addr global %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@2 = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@.__omp_offloading_2b_142c531_foo_l10.region_id = weak constant i8 0 +@.offload_sizes = private unnamed_addr constant [2 x i64] [i64 40, i64 40] +@.offload_maptypes = private unnamed_addr constant [2 x i64] [i64 35, i64 35] +@.str.3 = private unnamed_addr constant [33 x i8] c"dis[%3i] = %4i\09\09team[%3i] = %4i\0A\00", align 1 +@.omp_offloading.entry_name = internal unnamed_addr constant [36 x i8] c"__omp_offloading_2b_142c531_foo_l10\00" +@.omp_offloading.entry.__omp_offloading_2b_142c531_foo_l10 = weak constant %struct.__tgt_offload_entry { i8* @.__omp_offloading_2b_142c531_foo_l10.region_id, i8* getelementptr inbounds ([36 x i8], [36 x i8]* @.omp_offloading.entry_name, i32 0, i32 0), i64 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +@.omp_offloading.entries_begin = external constant %struct.__tgt_offload_entry +@.omp_offloading.entries_end = external constant %struct.__tgt_offload_entry +@.omp_offloading.img_start.nvptx64-nvida-cuda = extern_weak constant i8 +@.omp_offloading.img_end.nvptx64-nvida-cuda = extern_weak constant i8 +@.omp_offloading.device_images = internal unnamed_addr constant [1 x %struct.__tgt_device_image] [%struct.__tgt_device_image { i8* @.omp_offloading.img_start.nvptx64-nvida-cuda, i8* @.omp_offloading.img_end.nvptx64-nvida-cuda, %struct.__tgt_offload_entry* @.omp_offloading.entries_begin, %struct.__tgt_offload_entry* @.omp_offloading.entries_end }], comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cuda), align 8 +@.omp_offloading.descriptor = internal constant %struct.__tgt_bin_desc { i32 1, %struct.__tgt_device_image* getelementptr inbounds ([1 x %struct.__tgt_device_image], [1 x %struct.__tgt_device_image]* @.omp_offloading.device_images, i32 0, i32 0), %struct.__tgt_offload_entry* @.omp_offloading.entries_begin, %struct.__tgt_offload_entry* @.omp_offloading.entries_end }, comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cuda), align 8 +@__dso_handle = external hidden global i8 +@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 0, void ()* @.omp_offloading.descriptor_reg.nvptx64-nvida-cuda, i8* bitcast (void ()* @.omp_offloading.descriptor_reg.nvptx64-nvida-cuda to i8*) }] + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local void @foo(i32* %dis, i32* %team) #0 { +entry: + %dis.addr = alloca i32*, align 8 + %team.addr = alloca i32*, align 8 + %.offload_baseptrs = alloca [2 x i8*], align 8 + %.offload_ptrs = alloca [2 x i8*], align 8 + store i32* %dis, i32** %dis.addr, align 8 + store i32* %team, i32** %team.addr, align 8 + %0 = load i32*, i32** %dis.addr, align 8 + %1 = load i32*, i32** %team.addr, align 8 + %2 = load i32*, i32** %dis.addr, align 8 + %3 = load i32*, i32** %dis.addr, align 8 + %arrayidx = getelementptr inbounds i32, i32* %3, i64 0 + %4 = load i32*, i32** %team.addr, align 8 + %5 = load i32*, i32** %team.addr, align 8 + %arrayidx1 = getelementptr inbounds i32, i32* %5, i64 0 + %6 = getelementptr inbounds [2 x i8*], [2 x i8*]* 
%.offload_baseptrs, i32 0, i32 0 + %7 = bitcast i8** %6 to i32** + store i32* %2, i32** %7, align 8 + %8 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i32 0, i32 0 + %9 = bitcast i8** %8 to i32** + store i32* %arrayidx, i32** %9, align 8 + %10 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i32 0, i32 1 + %11 = bitcast i8** %10 to i32** + store i32* %4, i32** %11, align 8 + %12 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i32 0, i32 1 + %13 = bitcast i8** %12 to i32** + store i32* %arrayidx1, i32** %13, align 8 + %14 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i32 0, i32 0 + %15 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i32 0, i32 0 + %16 = call i32 @__tgt_target_teams(i64 -1, i8* @.__omp_offloading_2b_142c531_foo_l10.region_id, i32 2, i8** %14, i8** %15, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes, i32 0, i32 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes, i32 0, i32 0), i32 3, i32 0) + %17 = icmp ne i32 %16, 0 + br i1 %17, label %omp_offload.failed, label %omp_offload.cont + +omp_offload.failed: ; preds = %entry + call void @__omp_offloading_2b_142c531_foo_l10(i32* %0, i32* %1) #4 + br label %omp_offload.cont + +omp_offload.cont: ; preds = %omp_offload.failed, %entry + ret void +} + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define internal void @__omp_offloading_2b_142c531_foo_l10(i32* %dis, i32* %team) #1 { +entry: + %dis.addr = alloca i32*, align 8 + %team.addr = alloca i32*, align 8 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + store i32* %dis, i32** %dis.addr, align 8 + store i32* %team, i32** %team.addr, align 8 + %1 = call i32 @__kmpc_push_num_teams(%struct.ident_t* @2, i32 %0, i32 3, i32 0) + %2 = load i32*, i32** %dis.addr, align 8 + %3 = load i32*, i32** %team.addr, align 8 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%struct.ident_t* @2, i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*)* @.omp_outlined. 
to void (i32*, i32*, ...)*), i32* %2, i32* %3) + ret void +} + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define internal void @.omp_outlined.(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* %dis, i32* %team) #1 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %dis.addr = alloca i32*, align 8 + %team.addr = alloca i32*, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.comb.lb = alloca i32, align 4 + %.omp.comb.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + %.omp.iv2 = alloca i32, align 4 + %tmp3 = alloca i32, align 4 + %.omp.comb.lb4 = alloca i32, align 4 + %.omp.comb.ub5 = alloca i32, align 4 + %.omp.stride6 = alloca i32, align 4 + %.omp.is_last7 = alloca i32, align 4 + %i8 = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i32* %dis, i32** %dis.addr, align 8 + store i32* %team, i32** %team.addr, align 8 + store i32 0, i32* %.omp.comb.lb, align 4 + store i32 9, i32* %.omp.comb.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %0 = load i32*, i32** %.global_tid..addr, align 8 + %1 = load i32, i32* %0, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %1, i32 92, i32* %.omp.is_last, i32* %.omp.comb.lb, i32* %.omp.comb.ub, i32* %.omp.stride, i32 1, i32 1) + %2 = load i32, i32* %.omp.comb.ub, align 4 + %cmp = icmp sgt i32 %2, 9 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + br label %cond.end + +cond.false: ; preds = %entry + %3 = load i32, i32* %.omp.comb.ub, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 9, %cond.true ], [ %3, %cond.false ] + store i32 %cond, i32* %.omp.comb.ub, align 4 + %4 = load i32, i32* %.omp.comb.lb, align 4 + store i32 %4, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %cond.end + %5 = load i32, i32* %.omp.iv, align 4 + %6 = load i32, i32* %.omp.comb.ub, align 4 + %cmp1 = icmp sle i32 %5, %6 + br i1 %cmp1, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %7 = load i32, i32* %.omp.comb.lb, align 4 + %8 = zext i32 %7 to i64 + %9 = load i32, i32* %.omp.comb.ub, align 4 + %10 = zext i32 %9 to i64 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* @2, i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64, i64, i32**)* @.omp_outlined..1 to void (i32*, i32*, ...)*), i64 %8, i64 %10, i32** %dis.addr) + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.inner.for.body + %11 = load i32, i32* %.omp.iv, align 4 + %12 = load i32, i32* %.omp.stride, align 4 + %add = add nsw i32 %11, %12 + store i32 %add, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %1) + store i32 0, i32* %.omp.comb.lb4, align 4 + store i32 9, i32* %.omp.comb.ub5, align 4 + store i32 1, i32* %.omp.stride6, align 4 + store i32 0, i32* %.omp.is_last7, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %1, i32 92, i32* %.omp.is_last7, i32* %.omp.comb.lb4, i32* %.omp.comb.ub5, i32* %.omp.stride6, i32 1, i32 1) + %13 = load i32, i32* %.omp.comb.ub5, align 4 + %cmp9 = icmp sgt i32 %13, 9 + br i1 %cmp9, label %cond.true10, label %cond.false11 + +cond.true10: ; preds = %omp.loop.exit + br label %cond.end12 + +cond.false11: ; preds = %omp.loop.exit + %14 = load i32, i32* %.omp.comb.ub5, align 4 + br label %cond.end12 + +cond.end12: ; preds = %cond.false11, %cond.true10 + %cond13 = phi i32 [ 9, %cond.true10 ], [ %14, %cond.false11 ] + store i32 %cond13, i32* %.omp.comb.ub5, align 4 + %15 = load i32, i32* %.omp.comb.lb4, align 4 + store i32 %15, i32* %.omp.iv2, align 4 + br label %omp.inner.for.cond14 + +omp.inner.for.cond14: ; preds = %omp.inner.for.inc17, %cond.end12 + %16 = load i32, i32* %.omp.iv2, align 4 + %17 = load i32, i32* %.omp.comb.ub5, align 4 + %cmp15 = icmp sle i32 %16, %17 + br i1 %cmp15, label %omp.inner.for.body16, label %omp.inner.for.end19 + +omp.inner.for.body16: ; preds = %omp.inner.for.cond14 + %18 = load i32, i32* %.omp.comb.lb4, align 4 + %19 = zext i32 %18 to i64 + %20 = load i32, i32* %.omp.comb.ub5, align 4 + %21 = zext i32 %20 to i64 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* @2, i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64, i64, i32**)* @.omp_outlined..2 to void (i32*, i32*, ...)*), i64 %19, i64 %21, i32** %dis.addr) + br label %omp.inner.for.inc17 + +omp.inner.for.inc17: ; preds = %omp.inner.for.body16 + %22 = load i32, i32* %.omp.iv2, align 4 + %23 = load i32, i32* %.omp.stride6, align 4 + %add18 = add nsw i32 %22, %23 + store i32 %add18, i32* %.omp.iv2, align 4 + br label %omp.inner.for.cond14 + +omp.inner.for.end19: ; preds = %omp.inner.for.cond14 + br label %omp.loop.exit20 + +omp.loop.exit20: ; preds = %omp.inner.for.end19 + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %1) + %24 = load i32*, i32** %team.addr, align 8 + %call = call i32 @omp_get_team_num() + %idxprom = sext i32 %call to i64 + %arrayidx = getelementptr inbounds i32, i32* %24, i64 %idxprom + %25 = load i32, i32* %arrayidx, align 4 + %add21 = add nsw i32 %25, 1 + store i32 %add21, i32* %arrayidx, align 4 + ret void +} + +declare dso_local void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define internal void @.omp_outlined..1(i32* noalias %.global_tid., i32* noalias %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., i32** dereferenceable(8) %dis) #1 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %.previous.lb..addr = alloca i64, align 8 + %.previous.ub..addr = alloca i64, align 8 + %dis.addr = alloca i32**, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i64 %.previous.lb., i64* %.previous.lb..addr, align 8 + store i64 %.previous.ub., i64* %.previous.ub..addr, align 8 + store i32** %dis, i32*** %dis.addr, align 8 + %0 = load i32**, i32*** %dis.addr, align 8 + store i32 0, i32* %.omp.lb, align 4 + store i32 9, i32* %.omp.ub, align 4 + %1 = load i64, i64* %.previous.lb..addr, align 8 + %conv = trunc i64 %1 to i32 + %2 = load i64, i64* %.previous.ub..addr, align 8 + %conv1 = trunc i64 %2 to i32 + store i32 %conv, i32* %.omp.lb, align 4 + store i32 %conv1, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %3 = load i32*, i32** %.global_tid..addr, align 8 + %4 = load i32, i32* %3, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %4, i32 34, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %5 = load i32, i32* %.omp.ub, align 4 + %cmp = icmp sgt i32 %5, 9 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + br label %cond.end + +cond.false: ; preds = %entry + %6 = load i32, i32* %.omp.ub, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 9, %cond.true ], [ %6, %cond.false ] + store i32 %cond, i32* %.omp.ub, align 4 + %7 = load i32, i32* %.omp.lb, align 4 + store i32 %7, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %cond.end + %8 = load i32, i32* %.omp.iv, align 4 + %9 = load i32, i32* %.omp.ub, align 4 + %cmp3 = icmp sle i32 %8, %9 + br i1 %cmp3, label %omp.inner.for.body, label %omp.inner.for.end + 
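+; The body below recovers the loop index i from the induction variable and
+; performs the source-level '#pragma omp atomic' update as a monotonic
+; atomicrmw add on dis[i].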
+omp.inner.for.body: ; preds = %omp.inner.for.cond + %10 = load i32, i32* %.omp.iv, align 4 + %mul = mul nsw i32 %10, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %i, align 4 + %11 = load i32*, i32** %0, align 8 + %12 = load i32, i32* %i, align 4 + %idxprom = sext i32 %12 to i64 + %arrayidx = getelementptr inbounds i32, i32* %11, i64 %idxprom + %13 = atomicrmw add i32* %arrayidx, i32 1 monotonic + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %14 = load i32, i32* %.omp.iv, align 4 + %add5 = add nsw i32 %14, 1 + store i32 %add5, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %4) + ret void +} + +declare dso_local void @__kmpc_for_static_fini(%struct.ident_t*, i32) + +declare !callback !3 dso_local void @__kmpc_fork_call(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define internal void @.omp_outlined..2(i32* noalias %.global_tid., i32* noalias %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., i32** dereferenceable(8) %dis) #1 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %.previous.lb..addr = alloca i64, align 8 + %.previous.ub..addr = alloca i64, align 8 + %dis.addr = alloca i32**, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i64 %.previous.lb., i64* %.previous.lb..addr, align 8 + store i64 %.previous.ub., i64* %.previous.ub..addr, align 8 + store i32** %dis, i32*** %dis.addr, align 8 + %0 = load i32**, i32*** %dis.addr, align 8 + store i32 0, i32* %.omp.lb, align 4 + store i32 9, i32* %.omp.ub, align 4 + %1 = load i64, i64* %.previous.lb..addr, align 8 + %conv = trunc i64 %1 to i32 + %2 = load i64, i64* %.previous.ub..addr, align 8 + %conv1 = trunc i64 %2 to i32 + store i32 %conv, i32* %.omp.lb, align 4 + store i32 %conv1, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %3 = load i32*, i32** %.global_tid..addr, align 8 + %4 = load i32, i32* %3, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %4, i32 34, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %5 = load i32, i32* %.omp.ub, align 4 + %cmp = icmp sgt i32 %5, 9 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + br label %cond.end + +cond.false: ; preds = %entry + %6 = load i32, i32* %.omp.ub, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 9, %cond.true ], [ %6, %cond.false ] + store i32 %cond, i32* %.omp.ub, align 4 + %7 = load i32, i32* %.omp.lb, align 4 + store i32 %7, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %cond.end + %8 = load i32, i32* %.omp.iv, align 4 + %9 = load i32, i32* %.omp.ub, align 4 + %cmp3 = icmp sle i32 %8, %9 + br i1 %cmp3, label %omp.inner.for.body, label %omp.inner.for.end + 
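+; Second outlined parallel body, structurally identical to .omp_outlined..1;
+; it performs the same atomic increment for the second 'distribute parallel
+; for' in the source.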
+omp.inner.for.body: ; preds = %omp.inner.for.cond + %10 = load i32, i32* %.omp.iv, align 4 + %mul = mul nsw i32 %10, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %i, align 4 + %11 = load i32*, i32** %0, align 8 + %12 = load i32, i32* %i, align 4 + %idxprom = sext i32 %12 to i64 + %arrayidx = getelementptr inbounds i32, i32* %11, i64 %idxprom + %13 = atomicrmw add i32* %arrayidx, i32 1 monotonic + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %14 = load i32, i32* %.omp.iv, align 4 + %add5 = add nsw i32 %14, 1 + store i32 %add5, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %4) + ret void +} + +declare dso_local i32 @omp_get_team_num() #2 + +declare dso_local i32 @__kmpc_global_thread_num(%struct.ident_t*) + +declare dso_local i32 @__kmpc_push_num_teams(%struct.ident_t*, i32, i32, i32) + +declare !callback !3 dso_local void @__kmpc_fork_teams(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) + +declare dso_local i32 @__tgt_target_teams(i64, i8*, i32, i8**, i8**, i64*, i64*, i32, i32) + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local i32 @main() #0 { +entry: + %retval = alloca i32, align 4 + %dis = alloca [10 x i32], align 16 + %team = alloca [10 x i32], align 16 + %i = alloca i32, align 4 + %i4 = alloca i32, align 4 + store i32 0, i32* %retval, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %0, 10 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load i32, i32* %i, align 4 + %2 = load i32, i32* %i, align 4 + %idxprom = sext i32 %2 to i64 + %arrayidx = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 %idxprom + store i32 %1, i32* %arrayidx, align 4 + %3 = load i32, i32* %i, align 4 + %idxprom1 = sext i32 %3 to i64 + %arrayidx2 = getelementptr inbounds [10 x i32], [10 x i32]* %team, i64 0, i64 %idxprom1 + store i32 0, i32* %arrayidx2, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %4 = load i32, i32* %i, align 4 + %inc = add nsw i32 %4, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %arraydecay = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i32 0, i32 0 + %arraydecay3 = getelementptr inbounds [10 x i32], [10 x i32]* %team, i32 0, i32 0 + call void @foo(i32* %arraydecay, i32* %arraydecay3) + store i32 0, i32* %i4, align 4 + br label %for.cond5 + +for.cond5: ; preds = %for.inc12, %for.end + %5 = load i32, i32* %i4, align 4 + %cmp6 = icmp slt i32 %5, 10 + br i1 %cmp6, label %for.body7, label %for.end14 + +for.body7: ; preds = %for.cond5 + %6 = load i32, i32* %i4, align 4 + %7 = load i32, i32* %i4, align 4 + %idxprom8 = sext i32 %7 to i64 + %arrayidx9 = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 %idxprom8 + %8 = load i32, i32* %arrayidx9, align 4 + %9 = load i32, i32* %i4, align 4 + %10 = load i32, i32* %i4, align 4 + %idxprom10 = sext i32 %10 to i64 + %arrayidx11 = getelementptr inbounds [10 x i32], [10 x i32]* %team, i64 0, i64 %idxprom10 + %11 = load i32, i32* %arrayidx11, align 4 + %call = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([33 x i8], [33 x i8]* @.str.3, i32 0, i32 0), i32 %6, i32 %8, i32 %9, i32 %11) + br label %for.inc12 + +for.inc12: ; preds = %for.body7 + %12 = load i32, i32* %i4, align 4 + %inc13 = add nsw i32 %12, 1 + store i32 %inc13, i32* %i4, align 4 + br label %for.cond5 + +for.end14: ; preds = %for.cond5 + ret i32 0 +} + +declare dso_local i32 @printf(i8*, ...) #2 + +; Function Attrs: noinline nounwind uwtable +define internal void @.omp_offloading.descriptor_unreg(i8*) #3 section ".text.startup" comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cuda) { +entry: + %.addr = alloca i8*, align 8 + store i8* %0, i8** %.addr, align 8 + %1 = call i32 @__tgt_unregister_lib(%struct.__tgt_bin_desc* @.omp_offloading.descriptor) + ret void +} + +declare dso_local i32 @__tgt_unregister_lib(%struct.__tgt_bin_desc*) + +; Function Attrs: noinline nounwind uwtable +define linkonce hidden void @.omp_offloading.descriptor_reg.nvptx64-nvida-cuda() #3 section ".text.startup" comdat { +entry: + %0 = call i32 @__tgt_register_lib(%struct.__tgt_bin_desc* @.omp_offloading.descriptor) + %1 = call i32 @__cxa_atexit(void (i8*)* @.omp_offloading.descriptor_unreg, i8* bitcast (%struct.__tgt_bin_desc* @.omp_offloading.descriptor to i8*), i8* @__dso_handle) #4 + ret void +} + +declare dso_local i32 @__tgt_register_lib(%struct.__tgt_bin_desc*) + +; Function Attrs: nounwind +declare dso_local i32 @__cxa_atexit(void (i8*)*, i8*, i8*) #4 + +attributes #0 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { noinline norecurse nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } 
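+; Host-side attribute groups: unlike the device module above (sm_70, +ptx61),
+; these functions target the x86-64 host.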
+attributes #4 = { nounwind } + +!omp_offload.info = !{!0} +!llvm.module.flags = !{!1} +!llvm.ident = !{!2} + +!0 = !{i32 0, i32 43, i32 21153073, !"foo", i32 10, i32 0} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{!"clang version 9.0.0 "} +!3 = !{!4} +!4 = !{i64 2, i64 -1, i64 -1, i1 true} + +; __CLANG_OFFLOAD_BUNDLE____END__ host-x86_64-unknown-linux-gnu Index: SPMD_examples/v0.2/target_offload_is_SPMD.old.ll =================================================================== --- /dev/null +++ SPMD_examples/v0.2/target_offload_is_SPMD.old.ll @@ -0,0 +1,1095 @@ + +; __CLANG_OFFLOAD_BUNDLE____START__ openmp-nvptx64-nvida-cuda +; ModuleID = '/home/jdoerfert/projects/llvm-project/llvm/test/Transforms/OpenMP/target_offload_is_SPMD.c' +source_filename = "/home/jdoerfert/projects/llvm-project/llvm/test/Transforms/OpenMP/target_offload_is_SPMD.c" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvida-cuda" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } +%"union._shared_openmp_static_memory_type_$_" = type { [128 x i8] } +%struct._globalized_locals_ty = type { i32* } + +@"_openmp_kernel_static_glob_rd$ptr" = internal addrspace(3) global i8* null +@"_openmp_static_kernel$is_shared" = internal unnamed_addr constant i16 1 +@"_openmp_static_kernel$size" = internal unnamed_addr constant i64 8 +@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2050, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@2 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@__omp_offloading_2b_142c531_foo_l10_exec_mode = weak constant i8 1 +@"_openmp_shared_static_glob_rd_$_" = common addrspace(3) global %"union._shared_openmp_static_memory_type_$_" zeroinitializer +@llvm.compiler.used = appending global [1 x i8*] [i8* @__omp_offloading_2b_142c531_foo_l10_exec_mode], section "llvm.metadata" + +; Function Attrs: noinline norecurse nounwind +define internal void @__omp_offloading_2b_142c531_foo_l10_worker() #0 { +entry: + %work_fn = alloca i8*, align 8 + %exec_status = alloca i8, align 1 + store i8* null, i8** %work_fn, align 8 + store i8 0, i8* %exec_status, align 1 + br label %.await.work + +.await.work: ; preds = %.barrier.parallel, %entry + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) + %0 = call i1 @__kmpc_kernel_parallel(i8** %work_fn, i16 1) + %1 = zext i1 %0 to i8 + store i8 %1, i8* %exec_status, align 1 + %2 = load i8*, i8** %work_fn, align 8 + %should_terminate = icmp eq i8* %2, null + br i1 %should_terminate, label %.exit, label %.select.workers + +.select.workers: ; preds = %.await.work + %3 = load i8, i8* %exec_status, align 1 + %is_active = icmp ne i8 %3, 0 + br i1 %is_active, label %.execute.parallel, label %.barrier.parallel + +.execute.parallel: ; preds = %.select.workers + %4 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + %5 = load i8*, i8** %work_fn, align 8 + %work_match = icmp eq i8* %5, bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) + br i1 %work_match, label %.execute.fn, label %.check.next + +.execute.fn: ; preds = %.execute.parallel + call void @__omp_outlined__1_wrapper(i16 0, i32 %4) #5 
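+; Generic-mode work dispatch: the worker matched the function pointer
+; published by the master and ran the outlined parallel region; it now
+; proceeds to __kmpc_kernel_end_parallel and back to the barrier. This
+; master/worker indirection is exactly what SPMD mode avoids.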
+ br label %.terminate.parallel + +.check.next: ; preds = %.execute.parallel + %6 = load i8*, i8** %work_fn, align 8 + %work_match1 = icmp eq i8* %6, bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*) + br i1 %work_match1, label %.execute.fn2, label %.check.next3 + +.execute.fn2: ; preds = %.check.next + call void @__omp_outlined__2_wrapper(i16 0, i32 %4) #5 + br label %.terminate.parallel + +.check.next3: ; preds = %.check.next + %7 = bitcast i8* %2 to void (i16, i32)* + call void %7(i16 0, i32 %4) + br label %.terminate.parallel + +.terminate.parallel: ; preds = %.check.next3, %.execute.fn2, %.execute.fn + call void @__kmpc_kernel_end_parallel() + br label %.barrier.parallel + +.barrier.parallel: ; preds = %.terminate.parallel, %.select.workers + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) + br label %.await.work + +.exit: ; preds = %.await.work + ret void +} + +; Function Attrs: noinline norecurse nounwind optnone +define weak void @__omp_offloading_2b_142c531_foo_l10(i32* %dis, i32* %team) #1 { +entry: + %.global_tid..addr.i = alloca i32*, align 8 + %.bound_tid..addr.i = alloca i32*, align 8 + %dis.addr.i = alloca i32*, align 8 + %team.addr.i = alloca i32*, align 8 + %.omp.iv.i = alloca i32, align 4 + %tmp.i = alloca i32, align 4 + %.omp.comb.lb.i = alloca i32, align 4 + %.omp.comb.ub.i = alloca i32, align 4 + %.omp.stride.i = alloca i32, align 4 + %.omp.is_last.i = alloca i32, align 4 + %i.i = alloca i32, align 4 + %.zero.addr.i = alloca i32, align 4 + %shared_arg_refs.i = alloca i8**, align 8 + %.omp.iv3.i = alloca i32, align 4 + %tmp4.i = alloca i32, align 4 + %.omp.comb.lb5.i = alloca i32, align 4 + %.omp.comb.ub6.i = alloca i32, align 4 + %.omp.stride7.i = alloca i32, align 4 + %.omp.is_last8.i = alloca i32, align 4 + %i9.i = alloca i32, align 4 + %.zero.addr18.i = alloca i32, align 4 + %shared_arg_refs19.i = alloca i8**, align 8 + %dis.addr = alloca i32*, align 8 + %team.addr = alloca i32*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. 
= alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i32* %dis, i32** %dis.addr, align 8 + store i32* %team, i32** %team.addr, align 8 + %nvptx_warp_size = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range !10 + %nvptx_num_threads = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range !11 + %thread_limit = sub nuw i32 %nvptx_num_threads, %nvptx_warp_size + %nvptx_tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !12 + %0 = icmp ult i32 %nvptx_tid, %thread_limit + br i1 %0, label %.worker, label %.mastercheck + +.worker: ; preds = %entry + call void @__omp_offloading_2b_142c531_foo_l10_worker() #5 + br label %.exit + +.mastercheck: ; preds = %entry + %nvptx_num_threads1 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range !11 + %nvptx_warp_size2 = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range !10 + %1 = sub nuw i32 %nvptx_warp_size2, 1 + %2 = xor i32 %1, -1 + %3 = sub nuw i32 %nvptx_num_threads1, 1 + %master_tid = and i32 %3, %2 + %nvptx_tid3 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !12 + %4 = icmp eq i32 %nvptx_tid3, %master_tid + br i1 %4, label %.master, label %.exit + +.master: ; preds = %.mastercheck + %nvptx_warp_size4 = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range !10 + %nvptx_num_threads5 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range !11 + %thread_limit6 = sub nuw i32 %nvptx_num_threads5, %nvptx_warp_size4 + call void @__kmpc_kernel_init(i32 %thread_limit6, i16 1) + call void @__kmpc_data_sharing_init_stack() + %5 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + %6 = load i32*, i32** %dis.addr, align 8 + %7 = load i32*, i32** %team.addr, align 8 + store i32 %5, i32* %.threadid_temp., align 4 + store i32 0, i32* %.zero.addr18.i, align 4, !noalias !13 + store i32 0, i32* %.zero.addr.i, align 4, !noalias !13 + store i32* %.threadid_temp., i32** %.global_tid..addr.i, align 8, !noalias !13 + store i32* %.zero.addr, i32** %.bound_tid..addr.i, align 8, !noalias !13 + store i32* %6, i32** %dis.addr.i, align 8, !noalias !13 + store i32* %7, i32** %team.addr.i, align 8, !noalias !13 + call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 8, i16 1, i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) #5 + %8 = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8 + %9 = bitcast i8* %8 to %struct._globalized_locals_ty* + %10 = load i32*, i32** %dis.addr.i, align 8, !noalias !13 + %dis1.i = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* %9, i32 0, i32 0 + store i32* %10, i32** %dis1.i, align 8 + store i32 0, i32* %.omp.comb.lb.i, align 4, !noalias !13 + store i32 9, i32* %.omp.comb.ub.i, align 4, !noalias !13 + store i32 1, i32* %.omp.stride.i, align 4, !noalias !13 + store i32 0, i32* %.omp.is_last.i, align 4, !noalias !13 + %11 = load i32*, i32** %.global_tid..addr.i, align 8, !noalias !13 + %12 = load i32, i32* %11, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %12, i32 92, i32* %.omp.is_last.i, i32* %.omp.comb.lb.i, i32* %.omp.comb.ub.i, i32* %.omp.stride.i, i32 1, i32 1) #5 + %13 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !13 + %cmp.i = icmp sgt i32 %13, 9 + br i1 %cmp.i, label %cond.true.i, label %cond.false.i + +cond.true.i: ; preds = %.master + br label %cond.end.i + 
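+; The cond.true.i/cond.false.i diamond clamps the distribute upper bound to
+; the loop trip count, i.e. min(.omp.comb.ub, 9) for N = 10.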
+cond.false.i: ; preds = %.master + %14 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !13 + br label %cond.end.i + +cond.end.i: ; preds = %cond.false.i, %cond.true.i + %cond.i = phi i32 [ 9, %cond.true.i ], [ %14, %cond.false.i ] + store i32 %cond.i, i32* %.omp.comb.ub.i, align 4, !noalias !13 + %15 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !13 + store i32 %15, i32* %.omp.iv.i, align 4, !noalias !13 + br label %omp.inner.for.cond.i + +omp.inner.for.cond.i: ; preds = %omp.inner.for.body.i, %cond.end.i + %16 = load i32, i32* %.omp.iv.i, align 4, !noalias !13 + %17 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !13 + %cmp2.i = icmp sle i32 %16, %17 + br i1 %cmp2.i, label %omp.inner.for.body.i, label %omp.inner.for.end.i + +omp.inner.for.body.i: ; preds = %omp.inner.for.cond.i + %18 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !13 + %19 = zext i32 %18 to i64 + %20 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !13 + %21 = zext i32 %20 to i64 + call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i16 1) #5 + call void @__kmpc_begin_sharing_variables(i8*** %shared_arg_refs.i, i64 3) #5 + %22 = load i8**, i8*** %shared_arg_refs.i, align 8, !noalias !13 + %23 = inttoptr i64 %19 to i8* + store i8* %23, i8** %22, align 8 + %24 = getelementptr inbounds i8*, i8** %22, i64 1 + %25 = inttoptr i64 %21 to i8* + store i8* %25, i8** %24, align 8 + %26 = getelementptr inbounds i8*, i8** %22, i64 2 + %27 = bitcast i32** %dis1.i to i8* + store i8* %27, i8** %26, align 8 + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) #5 + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) #5 + call void @__kmpc_end_sharing_variables() #5 + %28 = load i32, i32* %.omp.iv.i, align 4, !noalias !13 + %29 = load i32, i32* %.omp.stride.i, align 4, !noalias !13 + %add.i = add nsw i32 %28, %29 + store i32 %add.i, i32* %.omp.iv.i, align 4, !noalias !13 + br label %omp.inner.for.cond.i + +omp.inner.for.end.i: ; preds = %omp.inner.for.cond.i + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %12) #5 + store i32 0, i32* %.omp.comb.lb5.i, align 4, !noalias !13 + store i32 9, i32* %.omp.comb.ub6.i, align 4, !noalias !13 + store i32 1, i32* %.omp.stride7.i, align 4, !noalias !13 + store i32 0, i32* %.omp.is_last8.i, align 4, !noalias !13 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %12, i32 92, i32* %.omp.is_last8.i, i32* %.omp.comb.lb5.i, i32* %.omp.comb.ub6.i, i32* %.omp.stride7.i, i32 1, i32 1) #5 + %30 = load i32, i32* %.omp.comb.ub6.i, align 4, !noalias !13 + %cmp10.i = icmp sgt i32 %30, 9 + br i1 %cmp10.i, label %cond.true11.i, label %cond.false12.i + +cond.true11.i: ; preds = %omp.inner.for.end.i + br label %cond.end13.i + +cond.false12.i: ; preds = %omp.inner.for.end.i + %31 = load i32, i32* %.omp.comb.ub6.i, align 4, !noalias !13 + br label %cond.end13.i + +cond.end13.i: ; preds = %cond.false12.i, %cond.true11.i + %cond14.i = phi i32 [ 9, %cond.true11.i ], [ %31, %cond.false12.i ] + store i32 %cond14.i, i32* %.omp.comb.ub6.i, align 4, !noalias !13 + %32 = load i32, i32* %.omp.comb.lb5.i, align 4, !noalias !13 + store i32 %32, i32* %.omp.iv3.i, align 4, !noalias !13 + br label %omp.inner.for.cond15.i + +omp.inner.for.cond15.i: ; preds = %omp.inner.for.body17.i, %cond.end13.i + %33 = load i32, i32* %.omp.iv3.i, align 4, !noalias !13 + %34 = load i32, i32* %.omp.comb.ub6.i, align 4, !noalias !13 + %cmp16.i = icmp sle i32 %33, %34 + br i1 %cmp16.i, label %omp.inner.for.body17.i, label 
%__omp_outlined__.exit + +omp.inner.for.body17.i: ; preds = %omp.inner.for.cond15.i + %35 = load i32, i32* %.omp.comb.lb5.i, align 4, !noalias !13 + %36 = zext i32 %35 to i64 + %37 = load i32, i32* %.omp.comb.ub6.i, align 4, !noalias !13 + %38 = zext i32 %37 to i64 + call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i16 1) #5 + call void @__kmpc_begin_sharing_variables(i8*** %shared_arg_refs19.i, i64 3) #5 + %39 = load i8**, i8*** %shared_arg_refs19.i, align 8, !noalias !13 + %40 = inttoptr i64 %36 to i8* + store i8* %40, i8** %39, align 8 + %41 = getelementptr inbounds i8*, i8** %39, i64 1 + %42 = inttoptr i64 %38 to i8* + store i8* %42, i8** %41, align 8 + %43 = getelementptr inbounds i8*, i8** %39, i64 2 + %44 = bitcast i32** %dis1.i to i8* + store i8* %44, i8** %43, align 8 + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) #5 + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) #5 + call void @__kmpc_end_sharing_variables() #5 + %45 = load i32, i32* %.omp.iv3.i, align 4, !noalias !13 + %46 = load i32, i32* %.omp.stride7.i, align 4, !noalias !13 + %add21.i = add nsw i32 %45, %46 + store i32 %add21.i, i32* %.omp.iv3.i, align 4, !noalias !13 + br label %omp.inner.for.cond15.i + +__omp_outlined__.exit: ; preds = %omp.inner.for.cond15.i + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %12) #5 + %47 = load i32*, i32** %team.addr.i, align 8, !noalias !13 + %call.i = call i32 @omp_get_team_num() #5 + %idxprom.i = sext i32 %call.i to i64 + %arrayidx.i = getelementptr inbounds i32, i32* %47, i64 %idxprom.i + %48 = load i32, i32* %arrayidx.i, align 4 + %add24.i = add nsw i32 %48, 1 + store i32 %add24.i, i32* %arrayidx.i, align 4 + call void @__kmpc_restore_team_static_memory(i16 0, i16 1) #5 + br label %.termination.notifier + +.termination.notifier: ; preds = %__omp_outlined__.exit + call void @__kmpc_kernel_deinit(i16 1) + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) + br label %.exit + +.exit: ; preds = %.termination.notifier, %.mastercheck, %.worker + ret void +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.warpsize() #2 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #2 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2 + +declare void @__kmpc_kernel_init(i32, i16) + +declare void @__kmpc_data_sharing_init_stack() + +declare void @__kmpc_get_team_static_memory(i16, i8*, i64, i16, i8**) + +declare void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) + +; Function Attrs: noinline norecurse nounwind optnone +define internal void @__omp_outlined__1(i32* noalias %.global_tid., i32* noalias %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., i32** dereferenceable(8) %dis) #1 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %.previous.lb..addr = alloca i64, align 8 + %.previous.ub..addr = alloca i64, align 8 + %dis.addr = alloca i32**, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i64 %.previous.lb., i64* %.previous.lb..addr, align 8 + store i64 
%.previous.ub., i64* %.previous.ub..addr, align 8 + store i32** %dis, i32*** %dis.addr, align 8 + %0 = load i32**, i32*** %dis.addr, align 8 + store i32 0, i32* %.omp.lb, align 4 + store i32 9, i32* %.omp.ub, align 4 + %1 = load i64, i64* %.previous.lb..addr, align 8 + %conv = trunc i64 %1 to i32 + %2 = load i64, i64* %.previous.ub..addr, align 8 + %conv1 = trunc i64 %2 to i32 + store i32 %conv, i32* %.omp.lb, align 4 + store i32 %conv1, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %3 = load i32*, i32** %.global_tid..addr, align 8 + %4 = load i32, i32* %3, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %4, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %5 = load i32, i32* %.omp.lb, align 4 + store i32 %5, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %entry + %6 = load i32, i32* %.omp.iv, align 4 + %conv2 = sext i32 %6 to i64 + %7 = load i64, i64* %.previous.ub..addr, align 8 + %cmp = icmp ule i64 %conv2, %7 + br i1 %cmp, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %8 = load i32, i32* %.omp.iv, align 4 + %mul = mul nsw i32 %8, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %i, align 4 + %9 = load i32*, i32** %0, align 8 + %10 = load i32, i32* %i, align 4 + %idxprom = sext i32 %10 to i64 + %arrayidx = getelementptr inbounds i32, i32* %9, i64 %idxprom + %11 = atomicrmw add i32* %arrayidx, i32 1 monotonic + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %12 = load i32, i32* %.omp.iv, align 4 + %13 = load i32, i32* %.omp.stride, align 4 + %add4 = add nsw i32 %12, %13 + store i32 %add4, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %4) + ret void +} + +declare void @__kmpc_for_static_fini(%struct.ident_t*, i32) + +; Function Attrs: noinline norecurse nounwind +define internal void @__omp_outlined__1_wrapper(i16 zeroext, i32) #0 { +entry: + %.addr = alloca i16, align 2 + %.addr1 = alloca i32, align 4 + %.zero.addr = alloca i32, align 4 + %global_args = alloca i8**, align 8 + store i32 0, i32* %.zero.addr, align 4 + store i16 %0, i16* %.addr, align 2 + store i32 %1, i32* %.addr1, align 4 + call void @__kmpc_get_shared_variables(i8*** %global_args) + %2 = load i8**, i8*** %global_args, align 8 + %3 = getelementptr inbounds i8*, i8** %2, i64 0 + %4 = bitcast i8** %3 to i64* + %5 = load i64, i64* %4, align 8 + %6 = getelementptr inbounds i8*, i8** %2, i64 1 + %7 = bitcast i8** %6 to i64* + %8 = load i64, i64* %7, align 8 + %9 = getelementptr inbounds i8*, i8** %2, i64 2 + %10 = bitcast i8** %9 to i32*** + %11 = load i32**, i32*** %10, align 8 + call void @__omp_outlined__1(i32* %.addr1, i32* %.zero.addr, i64 %5, i64 %8, i32** %11) #5 + ret void +} + +declare void @__kmpc_get_shared_variables(i8***) + +declare void @__kmpc_kernel_prepare_parallel(i8*, i16) + +declare void @__kmpc_begin_sharing_variables(i8***, i64) + +; Function Attrs: convergent +declare void @__kmpc_barrier_simple_spmd(%struct.ident_t*, i32) #3 + +declare void @__kmpc_end_sharing_variables() + +; Function Attrs: noinline norecurse nounwind optnone +define internal 
void @__omp_outlined__2(i32* noalias %.global_tid., i32* noalias %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., i32** dereferenceable(8) %dis) #1 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %.previous.lb..addr = alloca i64, align 8 + %.previous.ub..addr = alloca i64, align 8 + %dis.addr = alloca i32**, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i64 %.previous.lb., i64* %.previous.lb..addr, align 8 + store i64 %.previous.ub., i64* %.previous.ub..addr, align 8 + store i32** %dis, i32*** %dis.addr, align 8 + %0 = load i32**, i32*** %dis.addr, align 8 + store i32 0, i32* %.omp.lb, align 4 + store i32 9, i32* %.omp.ub, align 4 + %1 = load i64, i64* %.previous.lb..addr, align 8 + %conv = trunc i64 %1 to i32 + %2 = load i64, i64* %.previous.ub..addr, align 8 + %conv1 = trunc i64 %2 to i32 + store i32 %conv, i32* %.omp.lb, align 4 + store i32 %conv1, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %3 = load i32*, i32** %.global_tid..addr, align 8 + %4 = load i32, i32* %3, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %4, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %5 = load i32, i32* %.omp.lb, align 4 + store i32 %5, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %entry + %6 = load i32, i32* %.omp.iv, align 4 + %conv2 = sext i32 %6 to i64 + %7 = load i64, i64* %.previous.ub..addr, align 8 + %cmp = icmp ule i64 %conv2, %7 + br i1 %cmp, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %8 = load i32, i32* %.omp.iv, align 4 + %mul = mul nsw i32 %8, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %i, align 4 + %9 = load i32*, i32** %0, align 8 + %10 = load i32, i32* %i, align 4 + %idxprom = sext i32 %10 to i64 + %arrayidx = getelementptr inbounds i32, i32* %9, i64 %idxprom + %11 = atomicrmw add i32* %arrayidx, i32 1 monotonic + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %12 = load i32, i32* %.omp.iv, align 4 + %13 = load i32, i32* %.omp.stride, align 4 + %add4 = add nsw i32 %12, %13 + store i32 %add4, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %4) + ret void +} + +; Function Attrs: noinline norecurse nounwind +define internal void @__omp_outlined__2_wrapper(i16 zeroext, i32) #0 { +entry: + %.addr = alloca i16, align 2 + %.addr1 = alloca i32, align 4 + %.zero.addr = alloca i32, align 4 + %global_args = alloca i8**, align 8 + store i32 0, i32* %.zero.addr, align 4 + store i16 %0, i16* %.addr, align 2 + store i32 %1, i32* %.addr1, align 4 + call void @__kmpc_get_shared_variables(i8*** %global_args) + %2 = load i8**, i8*** %global_args, align 8 + %3 = getelementptr inbounds i8*, i8** %2, i64 0 + %4 = bitcast i8** %3 to i64* + %5 = load i64, i64* %4, align 8 + 
%6 = getelementptr inbounds i8*, i8** %2, i64 1 + %7 = bitcast i8** %6 to i64* + %8 = load i64, i64* %7, align 8 + %9 = getelementptr inbounds i8*, i8** %2, i64 2 + %10 = bitcast i8** %9 to i32*** + %11 = load i32**, i32*** %10, align 8 + call void @__omp_outlined__2(i32* %.addr1, i32* %.zero.addr, i64 %5, i64 %8, i32** %11) #5 + ret void +} + +declare i32 @omp_get_team_num() #4 + +declare void @__kmpc_restore_team_static_memory(i16, i16) + +declare i32 @__kmpc_global_thread_num(%struct.ident_t*) + +declare void @__kmpc_kernel_deinit(i16) + +declare i1 @__kmpc_kernel_parallel(i8**, i16) + +declare void @__kmpc_kernel_end_parallel() + +attributes #0 = { noinline norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx61,+sm_70" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { noinline norecurse nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx61,+sm_70" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind readnone } +attributes #3 = { convergent } +attributes #4 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx61,+sm_70" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #5 = { nounwind } + +!omp_offload.info = !{!0} +!nvvm.annotations = !{!1, !2, !3, !2, !4, !4, !4, !4, !5, !5, !4} +!llvm.module.flags = !{!6, !7} +!llvm.ident = !{!8} +!nvvm.internalize.after.link = !{} +!nvvmir.version = !{!9} + +!0 = !{i32 0, i32 43, i32 21153073, !"foo", i32 10, i32 0} +!1 = !{void (i32*, i32*)* @__omp_offloading_2b_142c531_foo_l10, !"kernel", i32 1} +!2 = !{null, !"align", i32 8} +!3 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +!4 = !{null, !"align", i32 16} +!5 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +!6 = !{i32 1, !"wchar_size", i32 4} +!7 = !{i32 7, !"PIC Level", i32 2} +!8 = !{!"clang version 9.0.0 "} +!9 = !{i32 1, i32 2} +!10 = !{i32 32, i32 33} +!11 = !{i32 1, i32 1025} +!12 = !{i32 0, i32 1024} +!13 = !{!14, !16} +!14 = distinct !{!14, !15, !"__omp_outlined__: %.global_tid."} +!15 = distinct !{!15, !"__omp_outlined__"} +!16 = distinct !{!16, !15, !"__omp_outlined__: %.bound_tid."} + +; __CLANG_OFFLOAD_BUNDLE____END__ openmp-nvptx64-nvida-cuda + +; __CLANG_OFFLOAD_BUNDLE____START__ host-x86_64-unknown-linux-gnu +; ModuleID = '/tmp/jdoerfert/target_offload_is_SPMD-7856f8.bc' +source_filename = "/home/jdoerfert/projects/llvm-project/llvm/test/Transforms/OpenMP/target_offload_is_SPMD.c" +target datalayout = 
"e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } +%struct.__tgt_offload_entry = type { i8*, i8*, i64, i32, i32 } +%struct.__tgt_device_image = type { i8*, i8*, %struct.__tgt_offload_entry*, %struct.__tgt_offload_entry* } +%struct.__tgt_bin_desc = type { i32, %struct.__tgt_device_image*, %struct.__tgt_offload_entry*, %struct.__tgt_offload_entry* } + +$.omp_offloading.descriptor_reg.nvptx64-nvida-cuda = comdat any + +@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr global %struct.ident_t { i32 0, i32 2050, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@1 = private unnamed_addr global %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@2 = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@.__omp_offloading_2b_142c531_foo_l10.region_id = weak constant i8 0 +@.offload_sizes = private unnamed_addr constant [2 x i64] [i64 40, i64 40] +@.offload_maptypes = private unnamed_addr constant [2 x i64] [i64 35, i64 35] +@.str.3 = private unnamed_addr constant [33 x i8] c"dis[%3i] = %4i\09\09team[%3i] = %4i\0A\00", align 1 +@.omp_offloading.entry_name = internal unnamed_addr constant [36 x i8] c"__omp_offloading_2b_142c531_foo_l10\00" +@.omp_offloading.entry.__omp_offloading_2b_142c531_foo_l10 = weak constant %struct.__tgt_offload_entry { i8* @.__omp_offloading_2b_142c531_foo_l10.region_id, i8* getelementptr inbounds ([36 x i8], [36 x i8]* @.omp_offloading.entry_name, i32 0, i32 0), i64 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +@.omp_offloading.entries_begin = external constant %struct.__tgt_offload_entry +@.omp_offloading.entries_end = external constant %struct.__tgt_offload_entry +@.omp_offloading.img_start.nvptx64-nvida-cuda = extern_weak constant i8 +@.omp_offloading.img_end.nvptx64-nvida-cuda = extern_weak constant i8 +@.omp_offloading.device_images = internal unnamed_addr constant [1 x %struct.__tgt_device_image] [%struct.__tgt_device_image { i8* @.omp_offloading.img_start.nvptx64-nvida-cuda, i8* @.omp_offloading.img_end.nvptx64-nvida-cuda, %struct.__tgt_offload_entry* @.omp_offloading.entries_begin, %struct.__tgt_offload_entry* @.omp_offloading.entries_end }], comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cuda), align 8 +@.omp_offloading.descriptor = internal constant %struct.__tgt_bin_desc { i32 1, %struct.__tgt_device_image* getelementptr inbounds ([1 x %struct.__tgt_device_image], [1 x %struct.__tgt_device_image]* @.omp_offloading.device_images, i32 0, i32 0), %struct.__tgt_offload_entry* @.omp_offloading.entries_begin, %struct.__tgt_offload_entry* @.omp_offloading.entries_end }, comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cuda), align 8 +@__dso_handle = external hidden global i8 +@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 0, void ()* @.omp_offloading.descriptor_reg.nvptx64-nvida-cuda, i8* bitcast (void ()* @.omp_offloading.descriptor_reg.nvptx64-nvida-cuda to i8*) }] + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local void @foo(i32* %dis, i32* %team) #0 { +entry: + %dis.addr = alloca i32*, align 8 + %team.addr = alloca i32*, align 8 + %.offload_baseptrs = alloca [2 x i8*], align 8 + %.offload_ptrs = 
alloca [2 x i8*], align 8 + store i32* %dis, i32** %dis.addr, align 8 + store i32* %team, i32** %team.addr, align 8 + %0 = load i32*, i32** %dis.addr, align 8 + %1 = load i32*, i32** %team.addr, align 8 + %2 = load i32*, i32** %dis.addr, align 8 + %3 = load i32*, i32** %dis.addr, align 8 + %arrayidx = getelementptr inbounds i32, i32* %3, i64 0 + %4 = load i32*, i32** %team.addr, align 8 + %5 = load i32*, i32** %team.addr, align 8 + %arrayidx1 = getelementptr inbounds i32, i32* %5, i64 0 + %6 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i32 0, i32 0 + %7 = bitcast i8** %6 to i32** + store i32* %2, i32** %7, align 8 + %8 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i32 0, i32 0 + %9 = bitcast i8** %8 to i32** + store i32* %arrayidx, i32** %9, align 8 + %10 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i32 0, i32 1 + %11 = bitcast i8** %10 to i32** + store i32* %4, i32** %11, align 8 + %12 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i32 0, i32 1 + %13 = bitcast i8** %12 to i32** + store i32* %arrayidx1, i32** %13, align 8 + %14 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i32 0, i32 0 + %15 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i32 0, i32 0 + %16 = call i32 @__tgt_target_teams(i64 -1, i8* @.__omp_offloading_2b_142c531_foo_l10.region_id, i32 2, i8** %14, i8** %15, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes, i32 0, i32 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes, i32 0, i32 0), i32 3, i32 0) + %17 = icmp ne i32 %16, 0 + br i1 %17, label %omp_offload.failed, label %omp_offload.cont + +omp_offload.failed: ; preds = %entry + call void @__omp_offloading_2b_142c531_foo_l10(i32* %0, i32* %1) #4 + br label %omp_offload.cont + +omp_offload.cont: ; preds = %omp_offload.failed, %entry + ret void +} + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define internal void @__omp_offloading_2b_142c531_foo_l10(i32* %dis, i32* %team) #1 { +entry: + %dis.addr = alloca i32*, align 8 + %team.addr = alloca i32*, align 8 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + store i32* %dis, i32** %dis.addr, align 8 + store i32* %team, i32** %team.addr, align 8 + %1 = call i32 @__kmpc_push_num_teams(%struct.ident_t* @2, i32 %0, i32 3, i32 0) + %2 = load i32*, i32** %dis.addr, align 8 + %3 = load i32*, i32** %team.addr, align 8 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%struct.ident_t* @2, i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*)* @.omp_outlined. 
to void (i32*, i32*, ...)*), i32* %2, i32* %3) + ret void +} + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define internal void @.omp_outlined.(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* %dis, i32* %team) #1 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %dis.addr = alloca i32*, align 8 + %team.addr = alloca i32*, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.comb.lb = alloca i32, align 4 + %.omp.comb.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + %.omp.iv2 = alloca i32, align 4 + %tmp3 = alloca i32, align 4 + %.omp.comb.lb4 = alloca i32, align 4 + %.omp.comb.ub5 = alloca i32, align 4 + %.omp.stride6 = alloca i32, align 4 + %.omp.is_last7 = alloca i32, align 4 + %i8 = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i32* %dis, i32** %dis.addr, align 8 + store i32* %team, i32** %team.addr, align 8 + store i32 0, i32* %.omp.comb.lb, align 4 + store i32 9, i32* %.omp.comb.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %0 = load i32*, i32** %.global_tid..addr, align 8 + %1 = load i32, i32* %0, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %1, i32 92, i32* %.omp.is_last, i32* %.omp.comb.lb, i32* %.omp.comb.ub, i32* %.omp.stride, i32 1, i32 1) + %2 = load i32, i32* %.omp.comb.ub, align 4 + %cmp = icmp sgt i32 %2, 9 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + br label %cond.end + +cond.false: ; preds = %entry + %3 = load i32, i32* %.omp.comb.ub, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 9, %cond.true ], [ %3, %cond.false ] + store i32 %cond, i32* %.omp.comb.ub, align 4 + %4 = load i32, i32* %.omp.comb.lb, align 4 + store i32 %4, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %cond.end + %5 = load i32, i32* %.omp.iv, align 4 + %6 = load i32, i32* %.omp.comb.ub, align 4 + %cmp1 = icmp sle i32 %5, %6 + br i1 %cmp1, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %7 = load i32, i32* %.omp.comb.lb, align 4 + %8 = zext i32 %7 to i64 + %9 = load i32, i32* %.omp.comb.ub, align 4 + %10 = zext i32 %9 to i64 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* @2, i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64, i64, i32**)* @.omp_outlined..1 to void (i32*, i32*, ...)*), i64 %8, i64 %10, i32** %dis.addr) + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.inner.for.body + %11 = load i32, i32* %.omp.iv, align 4 + %12 = load i32, i32* %.omp.stride, align 4 + %add = add nsw i32 %11, %12 + store i32 %add, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %1) + store i32 0, i32* %.omp.comb.lb4, align 4 + store i32 9, i32* %.omp.comb.ub5, align 4 + store i32 1, i32* %.omp.stride6, align 4 + store i32 0, i32* %.omp.is_last7, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %1, i32 92, i32* %.omp.is_last7, i32* %.omp.comb.lb4, i32* %.omp.comb.ub5, i32* %.omp.stride6, i32 1, i32 1) + %13 = load i32, i32* %.omp.comb.ub5, align 4 + %cmp9 = icmp sgt i32 %13, 9 + br i1 %cmp9, label %cond.true10, label %cond.false11 + +cond.true10: ; preds = %omp.loop.exit + br label %cond.end12 + +cond.false11: ; preds = %omp.loop.exit + %14 = load i32, i32* %.omp.comb.ub5, align 4 + br label %cond.end12 + +cond.end12: ; preds = %cond.false11, %cond.true10 + %cond13 = phi i32 [ 9, %cond.true10 ], [ %14, %cond.false11 ] + store i32 %cond13, i32* %.omp.comb.ub5, align 4 + %15 = load i32, i32* %.omp.comb.lb4, align 4 + store i32 %15, i32* %.omp.iv2, align 4 + br label %omp.inner.for.cond14 + +omp.inner.for.cond14: ; preds = %omp.inner.for.inc17, %cond.end12 + %16 = load i32, i32* %.omp.iv2, align 4 + %17 = load i32, i32* %.omp.comb.ub5, align 4 + %cmp15 = icmp sle i32 %16, %17 + br i1 %cmp15, label %omp.inner.for.body16, label %omp.inner.for.end19 + +omp.inner.for.body16: ; preds = %omp.inner.for.cond14 + %18 = load i32, i32* %.omp.comb.lb4, align 4 + %19 = zext i32 %18 to i64 + %20 = load i32, i32* %.omp.comb.ub5, align 4 + %21 = zext i32 %20 to i64 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* @2, i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64, i64, i32**)* @.omp_outlined..2 to void (i32*, i32*, ...)*), i64 %19, i64 %21, i32** %dis.addr) + br label %omp.inner.for.inc17 + +omp.inner.for.inc17: ; preds = %omp.inner.for.body16 + %22 = load i32, i32* %.omp.iv2, align 4 + %23 = load i32, i32* %.omp.stride6, align 4 + %add18 = add nsw i32 %22, %23 + store i32 %add18, i32* %.omp.iv2, align 4 + br label %omp.inner.for.cond14 + +omp.inner.for.end19: ; preds = %omp.inner.for.cond14 + br label %omp.loop.exit20 + +omp.loop.exit20: ; preds = %omp.inner.for.end19 + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %1) + %24 = load i32*, i32** %team.addr, align 8 + %call = call i32 @omp_get_team_num() + %idxprom = sext i32 %call to i64 + %arrayidx = getelementptr inbounds i32, i32* %24, i64 %idxprom + %25 = load i32, i32* %arrayidx, align 4 + %add21 = add nsw i32 %25, 1 + store i32 %add21, i32* %arrayidx, align 4 + ret void +} + +declare dso_local void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define internal void @.omp_outlined..1(i32* noalias %.global_tid., i32* noalias %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., i32** dereferenceable(8) %dis) #1 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %.previous.lb..addr = alloca i64, align 8 + %.previous.ub..addr = alloca i64, align 8 + %dis.addr = alloca i32**, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i64 %.previous.lb., i64* %.previous.lb..addr, align 8 + store i64 %.previous.ub., i64* %.previous.ub..addr, align 8 + store i32** %dis, i32*** %dis.addr, align 8 + %0 = load i32**, i32*** %dis.addr, align 8 + store i32 0, i32* %.omp.lb, align 4 + store i32 9, i32* %.omp.ub, align 4 + %1 = load i64, i64* %.previous.lb..addr, align 8 + %conv = trunc i64 %1 to i32 + %2 = load i64, i64* %.previous.ub..addr, align 8 + %conv1 = trunc i64 %2 to i32 + store i32 %conv, i32* %.omp.lb, align 4 + store i32 %conv1, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %3 = load i32*, i32** %.global_tid..addr, align 8 + %4 = load i32, i32* %3, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %4, i32 34, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %5 = load i32, i32* %.omp.ub, align 4 + %cmp = icmp sgt i32 %5, 9 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + br label %cond.end + +cond.false: ; preds = %entry + %6 = load i32, i32* %.omp.ub, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 9, %cond.true ], [ %6, %cond.false ] + store i32 %cond, i32* %.omp.ub, align 4 + %7 = load i32, i32* %.omp.lb, align 4 + store i32 %7, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %cond.end + %8 = load i32, i32* %.omp.iv, align 4 + %9 = load i32, i32* %.omp.ub, align 4 + %cmp3 = icmp sle i32 %8, %9 + br i1 %cmp3, label %omp.inner.for.body, label %omp.inner.for.end + 
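; NOTE (editorial): the block below is the x86-64 host-fallback body of the
; first 'distribute parallel for' loop. __kmpc_for_static_init_4 above was
; called with schedule kind 34 (kmp_sch_static) over the [previous.lb,
; previous.ub] chunk handed down by the distribute loop, and each iteration
; performs the atomicrmw add (monotonic, i.e. relaxed ordering) that lowers
; '#pragma omp atomic' on dis[i].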
+omp.inner.for.body: ; preds = %omp.inner.for.cond + %10 = load i32, i32* %.omp.iv, align 4 + %mul = mul nsw i32 %10, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %i, align 4 + %11 = load i32*, i32** %0, align 8 + %12 = load i32, i32* %i, align 4 + %idxprom = sext i32 %12 to i64 + %arrayidx = getelementptr inbounds i32, i32* %11, i64 %idxprom + %13 = atomicrmw add i32* %arrayidx, i32 1 monotonic + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %14 = load i32, i32* %.omp.iv, align 4 + %add5 = add nsw i32 %14, 1 + store i32 %add5, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %4) + ret void +} + +declare dso_local void @__kmpc_for_static_fini(%struct.ident_t*, i32) + +declare !callback !3 dso_local void @__kmpc_fork_call(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define internal void @.omp_outlined..2(i32* noalias %.global_tid., i32* noalias %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., i32** dereferenceable(8) %dis) #1 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %.previous.lb..addr = alloca i64, align 8 + %.previous.ub..addr = alloca i64, align 8 + %dis.addr = alloca i32**, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i64 %.previous.lb., i64* %.previous.lb..addr, align 8 + store i64 %.previous.ub., i64* %.previous.ub..addr, align 8 + store i32** %dis, i32*** %dis.addr, align 8 + %0 = load i32**, i32*** %dis.addr, align 8 + store i32 0, i32* %.omp.lb, align 4 + store i32 9, i32* %.omp.ub, align 4 + %1 = load i64, i64* %.previous.lb..addr, align 8 + %conv = trunc i64 %1 to i32 + %2 = load i64, i64* %.previous.ub..addr, align 8 + %conv1 = trunc i64 %2 to i32 + store i32 %conv, i32* %.omp.lb, align 4 + store i32 %conv1, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %3 = load i32*, i32** %.global_tid..addr, align 8 + %4 = load i32, i32* %3, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %4, i32 34, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %5 = load i32, i32* %.omp.ub, align 4 + %cmp = icmp sgt i32 %5, 9 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + br label %cond.end + +cond.false: ; preds = %entry + %6 = load i32, i32* %.omp.ub, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 9, %cond.true ], [ %6, %cond.false ] + store i32 %cond, i32* %.omp.ub, align 4 + %7 = load i32, i32* %.omp.lb, align 4 + store i32 %7, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %cond.end + %8 = load i32, i32* %.omp.iv, align 4 + %9 = load i32, i32* %.omp.ub, align 4 + %cmp3 = icmp sle i32 %8, %9 + br i1 %cmp3, label %omp.inner.for.body, label %omp.inner.for.end + 
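; NOTE (editorial): apart from renamed SSA values, this body of @.omp_outlined..2
; is identical to @.omp_outlined..1 above; it lowers the second, structurally
; identical 'distribute parallel for' loop of the target region.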
+omp.inner.for.body: ; preds = %omp.inner.for.cond + %10 = load i32, i32* %.omp.iv, align 4 + %mul = mul nsw i32 %10, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %i, align 4 + %11 = load i32*, i32** %0, align 8 + %12 = load i32, i32* %i, align 4 + %idxprom = sext i32 %12 to i64 + %arrayidx = getelementptr inbounds i32, i32* %11, i64 %idxprom + %13 = atomicrmw add i32* %arrayidx, i32 1 monotonic + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %14 = load i32, i32* %.omp.iv, align 4 + %add5 = add nsw i32 %14, 1 + store i32 %add5, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %4) + ret void +} + +declare dso_local i32 @omp_get_team_num() #2 + +declare dso_local i32 @__kmpc_global_thread_num(%struct.ident_t*) + +declare dso_local i32 @__kmpc_push_num_teams(%struct.ident_t*, i32, i32, i32) + +declare !callback !3 dso_local void @__kmpc_fork_teams(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) + +declare dso_local i32 @__tgt_target_teams(i64, i8*, i32, i8**, i8**, i64*, i64*, i32, i32) + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local i32 @main() #0 { +entry: + %retval = alloca i32, align 4 + %dis = alloca [10 x i32], align 16 + %team = alloca [10 x i32], align 16 + %i = alloca i32, align 4 + %i4 = alloca i32, align 4 + store i32 0, i32* %retval, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %0, 10 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load i32, i32* %i, align 4 + %2 = load i32, i32* %i, align 4 + %idxprom = sext i32 %2 to i64 + %arrayidx = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 %idxprom + store i32 %1, i32* %arrayidx, align 4 + %3 = load i32, i32* %i, align 4 + %idxprom1 = sext i32 %3 to i64 + %arrayidx2 = getelementptr inbounds [10 x i32], [10 x i32]* %team, i64 0, i64 %idxprom1 + store i32 0, i32* %arrayidx2, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %4 = load i32, i32* %i, align 4 + %inc = add nsw i32 %4, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %arraydecay = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i32 0, i32 0 + %arraydecay3 = getelementptr inbounds [10 x i32], [10 x i32]* %team, i32 0, i32 0 + call void @foo(i32* %arraydecay, i32* %arraydecay3) + store i32 0, i32* %i4, align 4 + br label %for.cond5 + +for.cond5: ; preds = %for.inc12, %for.end + %5 = load i32, i32* %i4, align 4 + %cmp6 = icmp slt i32 %5, 10 + br i1 %cmp6, label %for.body7, label %for.end14 + +for.body7: ; preds = %for.cond5 + %6 = load i32, i32* %i4, align 4 + %7 = load i32, i32* %i4, align 4 + %idxprom8 = sext i32 %7 to i64 + %arrayidx9 = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 %idxprom8 + %8 = load i32, i32* %arrayidx9, align 4 + %9 = load i32, i32* %i4, align 4 + %10 = load i32, i32* %i4, align 4 + %idxprom10 = sext i32 %10 to i64 + %arrayidx11 = getelementptr inbounds [10 x i32], [10 x i32]* %team, i64 0, i64 %idxprom10 + %11 = load i32, i32* %arrayidx11, align 4 + %call = call i32 (i8*, ...) 
@printf(i8* getelementptr inbounds ([33 x i8], [33 x i8]* @.str.3, i32 0, i32 0), i32 %6, i32 %8, i32 %9, i32 %11) + br label %for.inc12 + +for.inc12: ; preds = %for.body7 + %12 = load i32, i32* %i4, align 4 + %inc13 = add nsw i32 %12, 1 + store i32 %inc13, i32* %i4, align 4 + br label %for.cond5 + +for.end14: ; preds = %for.cond5 + ret i32 0 +} + +declare dso_local i32 @printf(i8*, ...) #2 + +; Function Attrs: noinline nounwind uwtable +define internal void @.omp_offloading.descriptor_unreg(i8*) #3 section ".text.startup" comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cuda) { +entry: + %.addr = alloca i8*, align 8 + store i8* %0, i8** %.addr, align 8 + %1 = call i32 @__tgt_unregister_lib(%struct.__tgt_bin_desc* @.omp_offloading.descriptor) + ret void +} + +declare dso_local i32 @__tgt_unregister_lib(%struct.__tgt_bin_desc*) + +; Function Attrs: noinline nounwind uwtable +define linkonce hidden void @.omp_offloading.descriptor_reg.nvptx64-nvida-cuda() #3 section ".text.startup" comdat { +entry: + %0 = call i32 @__tgt_register_lib(%struct.__tgt_bin_desc* @.omp_offloading.descriptor) + %1 = call i32 @__cxa_atexit(void (i8*)* @.omp_offloading.descriptor_unreg, i8* bitcast (%struct.__tgt_bin_desc* @.omp_offloading.descriptor to i8*), i8* @__dso_handle) #4 + ret void +} + +declare dso_local i32 @__tgt_register_lib(%struct.__tgt_bin_desc*) + +; Function Attrs: nounwind +declare dso_local i32 @__cxa_atexit(void (i8*)*, i8*, i8*) #4 + +attributes #0 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { noinline norecurse nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } 
+attributes #4 = { nounwind } + +!omp_offload.info = !{!0} +!llvm.module.flags = !{!1} +!llvm.ident = !{!2} + +!0 = !{i32 0, i32 43, i32 21153073, !"foo", i32 10, i32 0} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{!"clang version 9.0.0 "} +!3 = !{!4} +!4 = !{i64 2, i64 -1, i64 -1, i1 true} + +; __CLANG_OFFLOAD_BUNDLE____END__ host-x86_64-unknown-linux-gnu Index: SPMD_examples/v0.2/target_offload_not_SPMD.new.host.ll =================================================================== --- /dev/null +++ SPMD_examples/v0.2/target_offload_not_SPMD.new.host.ll @@ -0,0 +1,425 @@ + +; __CLANG_OFFLOAD_BUNDLE____START__ host-x86_64-unknown-linux-gnu +; ModuleID = '/tmp/jdoerfert/target_offload_not_SPMD-778fa0.bc' +source_filename = "/home/jdoerfert/projects/llvm-project/llvm/test/Transforms/OpenMP/target_offload_not_SPMD.c" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } +%struct.__tgt_offload_entry = type { i8*, i8*, i64, i32, i32 } +%struct.__tgt_device_image = type { i8*, i8*, %struct.__tgt_offload_entry*, %struct.__tgt_offload_entry* } +%struct.__tgt_bin_desc = type { i32, %struct.__tgt_device_image*, %struct.__tgt_offload_entry*, %struct.__tgt_offload_entry* } + +$.omp_offloading.descriptor_reg.nvptx64-nvida-cuda = comdat any + +@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr global %struct.ident_t { i32 0, i32 2050, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@1 = private unnamed_addr global %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@2 = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@.__omp_offloading_2b_142c58b_foo_l10.region_id = weak constant i8 0 +@.offload_sizes = private unnamed_addr constant [2 x i64] [i64 40, i64 40] +@.offload_maptypes = private unnamed_addr constant [2 x i64] [i64 35, i64 35] +@.str.3 = private unnamed_addr constant [33 x i8] c"dis[%3i] = %4i\09\09team[%3i] = %4i\0A\00", align 1 +@.omp_offloading.entry_name = internal unnamed_addr constant [36 x i8] c"__omp_offloading_2b_142c58b_foo_l10\00" +@.omp_offloading.entry.__omp_offloading_2b_142c58b_foo_l10 = weak constant %struct.__tgt_offload_entry { i8* @.__omp_offloading_2b_142c58b_foo_l10.region_id, i8* getelementptr inbounds ([36 x i8], [36 x i8]* @.omp_offloading.entry_name, i32 0, i32 0), i64 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +@.omp_offloading.entries_begin = external constant %struct.__tgt_offload_entry +@.omp_offloading.entries_end = external constant %struct.__tgt_offload_entry +@.omp_offloading.img_start.nvptx64-nvida-cuda = extern_weak constant i8 +@.omp_offloading.img_end.nvptx64-nvida-cuda = extern_weak constant i8 +@.omp_offloading.device_images = internal unnamed_addr constant [1 x %struct.__tgt_device_image] [%struct.__tgt_device_image { i8* @.omp_offloading.img_start.nvptx64-nvida-cuda, i8* @.omp_offloading.img_end.nvptx64-nvida-cuda, %struct.__tgt_offload_entry* @.omp_offloading.entries_begin, %struct.__tgt_offload_entry* @.omp_offloading.entries_end }], comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cuda), align 8 +@.omp_offloading.descriptor = internal constant %struct.__tgt_bin_desc { i32 1, %struct.__tgt_device_image* getelementptr inbounds ([1 x 
%struct.__tgt_device_image], [1 x %struct.__tgt_device_image]* @.omp_offloading.device_images, i32 0, i32 0), %struct.__tgt_offload_entry* @.omp_offloading.entries_begin, %struct.__tgt_offload_entry* @.omp_offloading.entries_end }, comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cuda), align 8 +@__dso_handle = external hidden global i8 +@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 0, void ()* @.omp_offloading.descriptor_reg.nvptx64-nvida-cuda, i8* bitcast (void ()* @.omp_offloading.descriptor_reg.nvptx64-nvida-cuda to i8*) }] + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local void @foo(i32* %dis, i32* %team) #0 { +entry: + %dis.addr = alloca i32*, align 8 + %team.addr = alloca i32*, align 8 + %.offload_baseptrs = alloca [2 x i8*], align 8 + %.offload_ptrs = alloca [2 x i8*], align 8 + store i32* %dis, i32** %dis.addr, align 8 + store i32* %team, i32** %team.addr, align 8 + %0 = load i32*, i32** %dis.addr, align 8 + %1 = load i32*, i32** %team.addr, align 8 + %2 = load i32*, i32** %dis.addr, align 8 + %3 = load i32*, i32** %dis.addr, align 8 + %arrayidx = getelementptr inbounds i32, i32* %3, i64 0 + %4 = load i32*, i32** %team.addr, align 8 + %5 = load i32*, i32** %team.addr, align 8 + %arrayidx1 = getelementptr inbounds i32, i32* %5, i64 0 + %6 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i32 0, i32 0 + %7 = bitcast i8** %6 to i32** + store i32* %2, i32** %7, align 8 + %8 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i32 0, i32 0 + %9 = bitcast i8** %8 to i32** + store i32* %arrayidx, i32** %9, align 8 + %10 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i32 0, i32 1 + %11 = bitcast i8** %10 to i32** + store i32* %4, i32** %11, align 8 + %12 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i32 0, i32 1 + %13 = bitcast i8** %12 to i32** + store i32* %arrayidx1, i32** %13, align 8 + %14 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i32 0, i32 0 + %15 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i32 0, i32 0 + %16 = call i32 @__tgt_target_teams(i64 -1, i8* @.__omp_offloading_2b_142c58b_foo_l10.region_id, i32 2, i8** %14, i8** %15, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes, i32 0, i32 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes, i32 0, i32 0), i32 3, i32 0) + %17 = icmp ne i32 %16, 0 + br i1 %17, label %omp_offload.failed, label %omp_offload.cont + +omp_offload.failed: ; preds = %entry + call void @__omp_offloading_2b_142c58b_foo_l10(i32* %0, i32* %1) #4 + br label %omp_offload.cont + +omp_offload.cont: ; preds = %omp_offload.failed, %entry + ret void +} + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define internal void @__omp_offloading_2b_142c58b_foo_l10(i32* %dis, i32* %team) #1 { +entry: + %dis.addr = alloca i32*, align 8 + %team.addr = alloca i32*, align 8 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + store i32* %dis, i32** %dis.addr, align 8 + store i32* %team, i32** %team.addr, align 8 + %1 = call i32 @__kmpc_push_num_teams(%struct.ident_t* @2, i32 %0, i32 3, i32 0) + %2 = load i32*, i32** %dis.addr, align 8 + %3 = load i32*, i32** %team.addr, align 8 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%struct.ident_t* @2, i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*)* @.omp_outlined. 
to void (i32*, i32*, ...)*), i32* %2, i32* %3) + ret void +} + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define internal void @.omp_outlined.(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* %dis, i32* %team) #1 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %dis.addr = alloca i32*, align 8 + %team.addr = alloca i32*, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.comb.lb = alloca i32, align 4 + %.omp.comb.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i32* %dis, i32** %dis.addr, align 8 + store i32* %team, i32** %team.addr, align 8 + store i32 0, i32* %.omp.comb.lb, align 4 + store i32 9, i32* %.omp.comb.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %0 = load i32*, i32** %.global_tid..addr, align 8 + %1 = load i32, i32* %0, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %1, i32 92, i32* %.omp.is_last, i32* %.omp.comb.lb, i32* %.omp.comb.ub, i32* %.omp.stride, i32 1, i32 1) + %2 = load i32, i32* %.omp.comb.ub, align 4 + %cmp = icmp sgt i32 %2, 9 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + br label %cond.end + +cond.false: ; preds = %entry + %3 = load i32, i32* %.omp.comb.ub, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 9, %cond.true ], [ %3, %cond.false ] + store i32 %cond, i32* %.omp.comb.ub, align 4 + %4 = load i32, i32* %.omp.comb.lb, align 4 + store i32 %4, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %cond.end + %5 = load i32, i32* %.omp.iv, align 4 + %6 = load i32, i32* %.omp.comb.ub, align 4 + %cmp1 = icmp sle i32 %5, %6 + br i1 %cmp1, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %7 = load i32, i32* %.omp.comb.lb, align 4 + %8 = zext i32 %7 to i64 + %9 = load i32, i32* %.omp.comb.ub, align 4 + %10 = zext i32 %9 to i64 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @2, i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64, i64, i32**)* @.omp_outlined..1 to void (i32*, i32*, ...)*), i64 %8, i64 %10, i32** %dis.addr) + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.inner.for.body + %11 = load i32, i32* %.omp.iv, align 4 + %12 = load i32, i32* %.omp.stride, align 4 + %add = add nsw i32 %11, %12 + store i32 %add, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %1) + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* @2, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32**)* @.omp_outlined..2 to void (i32*, i32*, ...)*), i32** %dis.addr) + %13 = load i32*, i32** %team.addr, align 8 + %call = call i32 @omp_get_team_num() + %idxprom = sext i32 %call to i64 + %arrayidx = getelementptr inbounds i32, i32* %13, i64 %idxprom + %14 = load i32, i32* %arrayidx, align 4 + %add2 = add nsw i32 %14, 1 + store i32 %add2, i32* %arrayidx, align 4 + ret void +} + +declare dso_local void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define internal void @.omp_outlined..1(i32* noalias %.global_tid., i32* noalias %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., i32** dereferenceable(8) %dis) #1 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %.previous.lb..addr = alloca i64, align 8 + %.previous.ub..addr = alloca i64, align 8 + %dis.addr = alloca i32**, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i64 %.previous.lb., i64* %.previous.lb..addr, align 8 + store i64 %.previous.ub., i64* %.previous.ub..addr, align 8 + store i32** %dis, i32*** %dis.addr, align 8 + %0 = load i32**, i32*** %dis.addr, align 8 + store i32 0, i32* %.omp.lb, align 4 + store i32 9, i32* %.omp.ub, align 4 + %1 = load i64, i64* %.previous.lb..addr, align 8 + %conv = trunc i64 %1 to i32 + %2 = load i64, i64* %.previous.ub..addr, align 8 + %conv1 = trunc i64 %2 to i32 + store i32 %conv, i32* %.omp.lb, align 4 + store i32 %conv1, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %3 = load i32*, i32** %.global_tid..addr, align 8 + %4 = load i32, i32* %3, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %4, i32 34, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %5 = load i32, i32* %.omp.ub, align 4 + %cmp = icmp sgt i32 %5, 9 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + br label %cond.end + +cond.false: ; preds = %entry + %6 = load i32, i32* %.omp.ub, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 9, %cond.true ], [ %6, %cond.false ] + store i32 %cond, i32* %.omp.ub, align 4 + %7 = load i32, i32* %.omp.lb, align 4 + store i32 %7, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %cond.end + %8 = load i32, i32* %.omp.iv, align 4 + %9 = load i32, i32* %.omp.ub, align 4 + %cmp3 = icmp sle i32 %8, %9 + br i1 %cmp3, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %10 = load i32, i32* %.omp.iv, align 4 + %mul = mul nsw i32 %10, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %i, align 4 + %11 = load i32*, i32** %0, align 8 + %12 = load i32, i32* %i, align 4 + %idxprom = sext i32 %12 to i64 + %arrayidx = getelementptr inbounds i32, i32* %11, i64 %idxprom + %13 = atomicrmw add i32* %arrayidx, i32 1 monotonic + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + 
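; NOTE (editorial): this @.omp_outlined..1 matches its counterpart in
; target_offload_is_SPMD.new.host.ll; the two examples diverge only in the
; second region, which is lowered above as a plain __kmpc_fork_call of
; @.omp_outlined..2 (a bare '#pragma omp parallel') rather than as a second
; distribute loop.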
+omp.inner.for.inc: ; preds = %omp.body.continue + %14 = load i32, i32* %.omp.iv, align 4 + %add5 = add nsw i32 %14, 1 + store i32 %add5, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %4) + ret void +} + +declare dso_local void @__kmpc_for_static_fini(%struct.ident_t*, i32) + +declare !callback !3 dso_local void @__kmpc_fork_call(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define internal void @.omp_outlined..2(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32** dereferenceable(8) %dis) #1 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %dis.addr = alloca i32**, align 8 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i32** %dis, i32*** %dis.addr, align 8 + %0 = load i32**, i32*** %dis.addr, align 8 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %1 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %1, 10 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %2 = load i32*, i32** %0, align 8 + %3 = load i32, i32* %i, align 4 + %idxprom = sext i32 %3 to i64 + %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom + %4 = atomicrmw add i32* %arrayidx, i32 1 monotonic + br label %for.inc + +for.inc: ; preds = %for.body + %5 = load i32, i32* %i, align 4 + %inc = add nsw i32 %5, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +declare dso_local i32 @omp_get_team_num() #2 + +declare dso_local i32 @__kmpc_global_thread_num(%struct.ident_t*) + +declare dso_local i32 @__kmpc_push_num_teams(%struct.ident_t*, i32, i32, i32) + +declare !callback !3 dso_local void @__kmpc_fork_teams(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
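; NOTE (editorial): @.omp_outlined..2 above takes no distribute bounds: it is
; the outlined body of the bare '#pragma omp parallel' region, so every thread
; the runtime hands it runs the full i = 0..9 loop. This unguarded region is
; what makes the not_SPMD variant ineligible for SPMD execution.
;
; NOTE (editorial): the '!callback !3' metadata on __kmpc_fork_call and
; __kmpc_fork_teams, with !4 = !{i64 2, i64 -1, i64 -1, i1 true}, tells
; interprocedural passes that operand 2 is invoked as a callback receiving two
; runtime-provided arguments (the -1 placeholders, the gtid/btid pointers)
; followed by the call's forwarded varargs. A rough C sketch of that calling
; convention (names hypothetical, not the actual runtime source):
;
;   typedef void (*kmpc_micro)(int *gtid, int *btid, ...);
;   void __kmpc_fork_teams(ident_t *loc, int nargs, kmpc_micro fn, ...) {
;     /* for each team the runtime effectively does: fn(&gtid, &btid, args...) */
;   }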
+ +declare dso_local i32 @__tgt_target_teams(i64, i8*, i32, i8**, i8**, i64*, i64*, i32, i32) + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local i32 @main() #0 { +entry: + %retval = alloca i32, align 4 + %dis = alloca [10 x i32], align 16 + %team = alloca [10 x i32], align 16 + %i = alloca i32, align 4 + %i4 = alloca i32, align 4 + store i32 0, i32* %retval, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %0, 10 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load i32, i32* %i, align 4 + %2 = load i32, i32* %i, align 4 + %idxprom = sext i32 %2 to i64 + %arrayidx = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 %idxprom + store i32 %1, i32* %arrayidx, align 4 + %3 = load i32, i32* %i, align 4 + %idxprom1 = sext i32 %3 to i64 + %arrayidx2 = getelementptr inbounds [10 x i32], [10 x i32]* %team, i64 0, i64 %idxprom1 + store i32 0, i32* %arrayidx2, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %4 = load i32, i32* %i, align 4 + %inc = add nsw i32 %4, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %arraydecay = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i32 0, i32 0 + %arraydecay3 = getelementptr inbounds [10 x i32], [10 x i32]* %team, i32 0, i32 0 + call void @foo(i32* %arraydecay, i32* %arraydecay3) + store i32 0, i32* %i4, align 4 + br label %for.cond5 + +for.cond5: ; preds = %for.inc12, %for.end + %5 = load i32, i32* %i4, align 4 + %cmp6 = icmp slt i32 %5, 10 + br i1 %cmp6, label %for.body7, label %for.end14 + +for.body7: ; preds = %for.cond5 + %6 = load i32, i32* %i4, align 4 + %7 = load i32, i32* %i4, align 4 + %idxprom8 = sext i32 %7 to i64 + %arrayidx9 = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 %idxprom8 + %8 = load i32, i32* %arrayidx9, align 4 + %9 = load i32, i32* %i4, align 4 + %10 = load i32, i32* %i4, align 4 + %idxprom10 = sext i32 %10 to i64 + %arrayidx11 = getelementptr inbounds [10 x i32], [10 x i32]* %team, i64 0, i64 %idxprom10 + %11 = load i32, i32* %arrayidx11, align 4 + %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([33 x i8], [33 x i8]* @.str.3, i32 0, i32 0), i32 %6, i32 %8, i32 %9, i32 %11) + br label %for.inc12 + +for.inc12: ; preds = %for.body7 + %12 = load i32, i32* %i4, align 4 + %inc13 = add nsw i32 %12, 1 + store i32 %inc13, i32* %i4, align 4 + br label %for.cond5 + +for.end14: ; preds = %for.cond5 + ret i32 0 +} + +declare dso_local i32 @printf(i8*, ...) 
#2 + +; Function Attrs: noinline nounwind uwtable +define internal void @.omp_offloading.descriptor_unreg(i8*) #3 section ".text.startup" comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cuda) { +entry: + %.addr = alloca i8*, align 8 + store i8* %0, i8** %.addr, align 8 + %1 = call i32 @__tgt_unregister_lib(%struct.__tgt_bin_desc* @.omp_offloading.descriptor) + ret void +} + +declare dso_local i32 @__tgt_unregister_lib(%struct.__tgt_bin_desc*) + +; Function Attrs: noinline nounwind uwtable +define linkonce hidden void @.omp_offloading.descriptor_reg.nvptx64-nvida-cuda() #3 section ".text.startup" comdat { +entry: + %0 = call i32 @__tgt_register_lib(%struct.__tgt_bin_desc* @.omp_offloading.descriptor) + %1 = call i32 @__cxa_atexit(void (i8*)* @.omp_offloading.descriptor_unreg, i8* bitcast (%struct.__tgt_bin_desc* @.omp_offloading.descriptor to i8*), i8* @__dso_handle) #4 + ret void +} + +declare dso_local i32 @__tgt_register_lib(%struct.__tgt_bin_desc*) + +; Function Attrs: nounwind +declare dso_local i32 @__cxa_atexit(void (i8*)*, i8*, i8*) #4 + +attributes #0 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { noinline norecurse nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { nounwind } + +!omp_offload.info = !{!0} +!llvm.module.flags = !{!1} +!llvm.ident = !{!2} + +!0 = !{i32 0, i32 43, i32 21153163, !"foo", i32 10, i32 0} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{!"clang version 9.0.0 "} +!3 = !{!4} +!4 = !{i64 2, i64 -1, i64 -1, i1 true} + +; __CLANG_OFFLOAD_BUNDLE____END__ host-x86_64-unknown-linux-gnu Index: 
SPMD_examples/v0.2/target_offload_not_SPMD.new.ll =================================================================== --- /dev/null +++ SPMD_examples/v0.2/target_offload_not_SPMD.new.ll @@ -0,0 +1,342 @@ + +; __CLANG_OFFLOAD_BUNDLE____START__ openmp-nvptx64-nvida-cuda +; ModuleID = '/home/jdoerfert/projects/llvm-project/llvm/test/Transforms/OpenMP/target_offload_not_SPMD.c' +source_filename = "/home/jdoerfert/projects/llvm-project/llvm/test/Transforms/OpenMP/target_offload_not_SPMD.c" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvida-cuda" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } +%"union._shared_openmp_static_memory_type_$_" = type { [128 x i8] } +%omp.shared.struct = type { i64, i64, i32** } +%omp.shared.struct.0 = type { i32** } +%struct._globalized_locals_ty = type { i32* } + +@"_openmp_kernel_static_glob_rd$ptr" = internal addrspace(3) global i8* null +@"_openmp_static_kernel$is_shared" = internal unnamed_addr constant i16 1 +@"_openmp_static_kernel$size" = internal unnamed_addr constant i64 8 +@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2050, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@2 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@__omp_offloading_2b_142c58b_foo_l10_exec_mode = weak constant i8 1 +@"_openmp_shared_static_glob_rd_$_" = common addrspace(3) global %"union._shared_openmp_static_memory_type_$_" zeroinitializer +@llvm.compiler.used = appending global [1 x i8*] [i8* @__omp_offloading_2b_142c58b_foo_l10_exec_mode], section "llvm.metadata" + +; Function Attrs: norecurse nounwind +define weak void @__omp_offloading_2b_142c58b_foo_l10(i32* %dis, i32* %team) #0 { +entry: + %.global_tid..addr.i = alloca i32*, align 8 + %.bound_tid..addr.i = alloca i32*, align 8 + %dis.addr.i = alloca i32*, align 8 + %team.addr.i = alloca i32*, align 8 + %.omp.iv.i = alloca i32, align 4 + %tmp.i = alloca i32, align 4 + %.omp.comb.lb.i = alloca i32, align 4 + %.omp.comb.ub.i = alloca i32, align 4 + %.omp.stride.i = alloca i32, align 4 + %.omp.is_last.i = alloca i32, align 4 + %i.i = alloca i32, align 4 + %.zero.addr.i = alloca i32, align 4 + %.captured.i = alloca %omp.shared.struct, align 8 + %.zero.addr3.i = alloca i32, align 4 + %.captured4.i = alloca %omp.shared.struct.0, align 8 + %dis.addr = alloca i32*, align 8 + %team.addr = alloca i32*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. 
= alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i32* %dis, i32** %dis.addr, align 8 + store i32* %team, i32** %team.addr, align 8 + %0 = call i16 @__kmpc_generic_kernel_init(i16 0, i16 1, i16 1, i16 0) + %1 = icmp eq i16 %0, 1 + br i1 %1, label %.execute, label %.exit + +.execute: ; preds = %entry + %2 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + %3 = load i32*, i32** %dis.addr, align 8 + %4 = load i32*, i32** %team.addr, align 8 + store i32 %2, i32* %.threadid_temp., align 4 + store i32 0, i32* %.zero.addr3.i, align 4, !noalias !10 + store i32 0, i32* %.zero.addr.i, align 4, !noalias !10 + store i32* %.threadid_temp., i32** %.global_tid..addr.i, align 8, !noalias !10 + store i32* %.zero.addr, i32** %.bound_tid..addr.i, align 8, !noalias !10 + store i32* %3, i32** %dis.addr.i, align 8, !noalias !10 + store i32* %4, i32** %team.addr.i, align 8, !noalias !10 + call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 8, i16 1, i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) #3 + %5 = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8 + %6 = bitcast i8* %5 to %struct._globalized_locals_ty* + %7 = load i32*, i32** %dis.addr.i, align 8, !noalias !10 + %dis1.i = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* %6, i32 0, i32 0 + store i32* %7, i32** %dis1.i, align 8 + store i32 0, i32* %.omp.comb.lb.i, align 4, !noalias !10 + store i32 9, i32* %.omp.comb.ub.i, align 4, !noalias !10 + store i32 1, i32* %.omp.stride.i, align 4, !noalias !10 + store i32 0, i32* %.omp.is_last.i, align 4, !noalias !10 + %8 = load i32*, i32** %.global_tid..addr.i, align 8, !noalias !10 + %9 = load i32, i32* %8, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %9, i32 92, i32* %.omp.is_last.i, i32* %.omp.comb.lb.i, i32* %.omp.comb.ub.i, i32* %.omp.stride.i, i32 1, i32 1) #3 + %10 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !10 + %cmp.i = icmp sgt i32 %10, 9 + br i1 %cmp.i, label %cond.true.i, label %cond.false.i + +cond.true.i: ; preds = %.execute + br label %cond.end.i + +cond.false.i: ; preds = %.execute + %11 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !10 + br label %cond.end.i + +cond.end.i: ; preds = %cond.false.i, %cond.true.i + %cond.i = phi i32 [ 9, %cond.true.i ], [ %11, %cond.false.i ] + store i32 %cond.i, i32* %.omp.comb.ub.i, align 4, !noalias !10 + %12 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !10 + store i32 %12, i32* %.omp.iv.i, align 4, !noalias !10 + br label %omp.inner.for.cond.i + +omp.inner.for.cond.i: ; preds = %omp.inner.for.body.i, %cond.end.i + %13 = load i32, i32* %.omp.iv.i, align 4, !noalias !10 + %14 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !10 + %cmp2.i = icmp sle i32 %13, %14 + br i1 %cmp2.i, label %omp.inner.for.body.i, label %__omp_outlined__.exit + +omp.inner.for.body.i: ; preds = %omp.inner.for.cond.i + %15 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !10 + %16 = zext i32 %15 to i64 + %17 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !10 + %18 = zext i32 %17 to i64 + %19 = bitcast %omp.shared.struct* %.captured.i to i8* + %20 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i32 0, i32 0 + store i64 %16, i64* %20, !noalias !10 + 
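; NOTE (editorial): generic-mode codegen (exec_mode == 1 above): the master
; thread packs the zero-extended distribute bounds (slot 0 stored above, slot 1
; next) and the globalized 'dis' pointer (slot 2) into %omp.shared.struct, then
; passes it together with @__omp_outlined__1_wrapper to
; __kmpc_generic_kernel_parallel so the worker threads can run the parallel
; region.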
%21 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i32 0, i32 1 + store i64 %18, i64* %21, !noalias !10 + %22 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i32 0, i32 2 + store i32** %dis1.i, i32*** %22, !noalias !10 + call void @__kmpc_generic_kernel_parallel(i8* bitcast (void (i8*)* @__omp_outlined__1_wrapper to i8*), i8* %19, i16 24, i16 1) #3 + %23 = load i32, i32* %.omp.iv.i, align 4, !noalias !10 + %24 = load i32, i32* %.omp.stride.i, align 4, !noalias !10 + %add.i = add nsw i32 %23, %24 + store i32 %add.i, i32* %.omp.iv.i, align 4, !noalias !10 + br label %omp.inner.for.cond.i + +__omp_outlined__.exit: ; preds = %omp.inner.for.cond.i + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %9) #3 + %25 = bitcast %omp.shared.struct.0* %.captured4.i to i8* + %26 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %.captured4.i, i32 0, i32 0 + store i32** %dis1.i, i32*** %26, !noalias !10 + call void @__kmpc_generic_kernel_parallel(i8* bitcast (void (i8*)* @__omp_outlined__2_wrapper to i8*), i8* %25, i16 8, i16 1) #3 + %27 = load i32*, i32** %team.addr.i, align 8, !noalias !10 + %call.i = call i32 @omp_get_team_num() #3 + %idxprom.i = sext i32 %call.i to i64 + %arrayidx.i = getelementptr inbounds i32, i32* %27, i64 %idxprom.i + %28 = load i32, i32* %arrayidx.i, align 4 + %add5.i = add nsw i32 %28, 1 + store i32 %add5.i, i32* %arrayidx.i, align 4 + call void @__kmpc_restore_team_static_memory(i16 0, i16 1) #3 + br label %.omp.deinit + +.omp.deinit: ; preds = %__omp_outlined__.exit + call void @__kmpc_generic_kernel_deinit(i16 0, i16 1) + br label %.exit + +.exit: ; preds = %.omp.deinit, %entry + ret void +} + +declare i16 @__kmpc_generic_kernel_init(i16, i16, i16, i16) + +declare void @__kmpc_get_team_static_memory(i16, i8*, i64, i16, i8**) + +declare void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) + +; Function Attrs: norecurse nounwind +define internal void @__omp_outlined__1(i32* noalias %.global_tid., i32* noalias %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., i32** dereferenceable(8) %dis) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %.previous.lb..addr = alloca i64, align 8 + %.previous.ub..addr = alloca i64, align 8 + %dis.addr = alloca i32**, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i64 %.previous.lb., i64* %.previous.lb..addr, align 8 + store i64 %.previous.ub., i64* %.previous.ub..addr, align 8 + store i32** %dis, i32*** %dis.addr, align 8 + %0 = load i32**, i32*** %dis.addr, align 8 + store i32 0, i32* %.omp.lb, align 4 + store i32 9, i32* %.omp.ub, align 4 + %1 = load i64, i64* %.previous.lb..addr, align 8 + %conv = trunc i64 %1 to i32 + %2 = load i64, i64* %.previous.ub..addr, align 8 + %conv1 = trunc i64 %2 to i32 + store i32 %conv, i32* %.omp.lb, align 4 + store i32 %conv1, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %3 = load i32*, i32** %.global_tid..addr, align 8 + %4 = load i32, i32* %3, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %4, i32 33, i32* %.omp.is_last, 
i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %5 = load i32, i32* %.omp.lb, align 4 + store i32 %5, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %entry + %6 = load i32, i32* %.omp.iv, align 4 + %conv2 = sext i32 %6 to i64 + %7 = load i64, i64* %.previous.ub..addr, align 8 + %cmp = icmp ule i64 %conv2, %7 + br i1 %cmp, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %8 = load i32, i32* %.omp.iv, align 4 + %mul = mul nsw i32 %8, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %i, align 4 + %9 = load i32*, i32** %0, align 8 + %10 = load i32, i32* %i, align 4 + %idxprom = sext i32 %10 to i64 + %arrayidx = getelementptr inbounds i32, i32* %9, i64 %idxprom + %11 = atomicrmw add i32* %arrayidx, i32 1 monotonic + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %12 = load i32, i32* %.omp.iv, align 4 + %13 = load i32, i32* %.omp.stride, align 4 + %add4 = add nsw i32 %12, %13 + store i32 %add4, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %4) + ret void +} + +declare void @__kmpc_for_static_fini(%struct.ident_t*, i32) + +; Function Attrs: norecurse nounwind +define internal void @__omp_outlined__1_wrapper(i8* %payload) #1 { +entry: + %.addr = alloca i8*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. = alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i8* %payload, i8** %.addr, align 8 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + store i32 %0, i32* %.threadid_temp., align 4 + %1 = load i8*, i8** %.addr, align 8 + %2 = bitcast i8* %1 to %omp.shared.struct* + %3 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %2, i32 0, i32 0 + %4 = load i64, i64* %3, align 1 + %5 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %2, i32 0, i32 1 + %6 = load i64, i64* %5, align 1 + %7 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %2, i32 0, i32 2 + %8 = load i32**, i32*** %7, align 1 + call void @__omp_outlined__1(i32* %.threadid_temp., i32* %.zero.addr, i64 %4, i64 %6, i32** %8) #3 + ret void +} + +declare i32 @__kmpc_global_thread_num(%struct.ident_t*) + +declare void @__kmpc_generic_kernel_parallel(i8*, i8*, i16, i16) + +; Function Attrs: norecurse nounwind +define internal void @__omp_outlined__2(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32** dereferenceable(8) %dis) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %dis.addr = alloca i32**, align 8 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i32** %dis, i32*** %dis.addr, align 8 + %0 = load i32**, i32*** %dis.addr, align 8 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %1 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %1, 10 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %2 = load i32*, i32** %0, align 8 + %3 = load i32, i32* %i, align 4 + %idxprom = sext i32 %3 to i64 + %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom + %4 = atomicrmw add 
i32* %arrayidx, i32 1 monotonic + br label %for.inc + +for.inc: ; preds = %for.body + %5 = load i32, i32* %i, align 4 + %inc = add nsw i32 %5, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: norecurse nounwind +define internal void @__omp_outlined__2_wrapper(i8* %payload) #1 { +entry: + %.addr = alloca i8*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. = alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i8* %payload, i8** %.addr, align 8 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + store i32 %0, i32* %.threadid_temp., align 4 + %1 = load i8*, i8** %.addr, align 8 + %2 = bitcast i8* %1 to %omp.shared.struct.0* + %3 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %2, i32 0, i32 0 + %4 = load i32**, i32*** %3, align 1 + call void @__omp_outlined__2(i32* %.threadid_temp., i32* %.zero.addr, i32** %4) #3 + ret void +} + +declare i32 @omp_get_team_num() #2 + +declare void @__kmpc_restore_team_static_memory(i16, i16) + +declare void @__kmpc_generic_kernel_deinit(i16, i16) + +attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx61,+sm_70" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx61,+sm_70" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx61,+sm_70" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { nounwind } + +!omp_offload.info = !{!0} +!nvvm.annotations = !{!1, !2, !3, !2, !4, !4, !4, !4, !5, !5, !4} +!llvm.module.flags = !{!6, !7} +!llvm.ident = !{!8} +!nvvm.internalize.after.link = !{} +!nvvmir.version = !{!9} + +!0 = !{i32 0, i32 43, i32 21153163, !"foo", i32 10, i32 0} +!1 = !{void (i32*, i32*)* @__omp_offloading_2b_142c58b_foo_l10, !"kernel", i32 1} +!2 = !{null, !"align", i32 8} +!3 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +!4 = !{null, !"align", i32 16} +!5 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +!6 = !{i32 1, !"wchar_size", i32 4} +!7 = !{i32 7, !"PIC Level", i32 2} +!8 = !{!"clang version 9.0.0 "} +!9 = !{i32 1, i32 2} +!10 = !{!11, !13} +!11 = distinct !{!11, !12, !"__omp_outlined__: %.global_tid."} +!12 = distinct !{!12, !"__omp_outlined__"} +!13 = distinct !{!13, !12, !"__omp_outlined__: %.bound_tid."} + +; 
__CLANG_OFFLOAD_BUNDLE____END__ openmp-nvptx64-nvida-cuda Index: SPMD_examples/v0.2/target_offload_not_SPMD.new.opt.ll =================================================================== --- /dev/null +++ SPMD_examples/v0.2/target_offload_not_SPMD.new.opt.ll @@ -0,0 +1,319 @@ +; ModuleID = '/home/jdoerfert/SPMDtests/target_offload_not_SPMD.new.ll' +source_filename = "/home/jdoerfert/projects/llvm-project/llvm/test/Transforms/OpenMP/target_offload_not_SPMD.c" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvida-cuda" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } +%"union._shared_openmp_static_memory_type_$_" = type { [128 x i8] } +%omp.shared.struct = type { i64, i64, i32** } +%omp.shared.struct.0 = type { i32** } +%struct._globalized_locals_ty = type { i32* } + +@"_openmp_kernel_static_glob_rd$ptr" = internal addrspace(3) global i8* null +@"_openmp_static_kernel$is_shared" = internal unnamed_addr constant i16 1 +@"_openmp_static_kernel$size" = internal unnamed_addr constant i64 8 +@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2050, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@2 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@__omp_offloading_2b_142c58b_foo_l10_exec_mode = weak constant i8 1 +@"_openmp_shared_static_glob_rd_$_" = common addrspace(3) global %"union._shared_openmp_static_memory_type_$_" zeroinitializer +@llvm.compiler.used = appending global [1 x i8*] [i8* @__omp_offloading_2b_142c58b_foo_l10_exec_mode], section "llvm.metadata" + +; Function Attrs: norecurse nounwind +define weak void @__omp_offloading_2b_142c58b_foo_l10(i32* %dis, i32* %team) #0 { +entry: + %work_fn.addr = alloca i8* + %.omp.comb.lb.i = alloca i32, align 4 + %.omp.comb.ub.i = alloca i32, align 4 + %.omp.stride.i = alloca i32, align 4 + %.omp.is_last.i = alloca i32, align 4 + %.captured.i = alloca %omp.shared.struct, align 8 + %.captured4.i = alloca %omp.shared.struct.0, align 8 + %thread_kind = call i16 @__kmpc_generic_kernel_init(i16 0, i16 0, i16 1, i16 0) + %is_worker = icmp eq i16 %thread_kind, -1 + br i1 %is_worker, label %worker.wait, label %master_check + +worker.wait: ; preds = %worker.inactive, %entry + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) + %is_active = call i1 @__kmpc_kernel_parallel(i8** %work_fn.addr, i16 1) + %Work_fn.addr_cast = bitcast i8** %work_fn.addr to void (i8*)** + %work_fn = load void (i8*)*, void (i8*)** %Work_fn.addr_cast + %no_work = icmp eq void (i8*)* %work_fn, null + br i1 %no_work, label %worker.finished, label %worker.active_check + +worker.finished: ; preds = %worker.wait + br label %master_check + +worker.active_check: ; preds = %worker.wait + br i1 %is_active, label %worker.active, label %worker.inactive + +worker.active: ; preds = %worker.active_check + %0 = call i8* @__kmpc_get_shared_variables() + %par_fn_check = icmp eq void (i8*)* %work_fn, @__omp_outlined__2_wrapper + br i1 %par_fn_check, label %worker.execute.__omp_outlined__2_wrapper, label %worker.check.next + +worker.execute.__omp_outlined__2_wrapper: ; preds = %worker.active + call void 
@__omp_outlined__2_wrapper(i8* %0) + br label %worker.parallel_end + +worker.check.next: ; preds = %worker.active + %par_fn_check1 = icmp eq void (i8*)* %work_fn, @__omp_outlined__1_wrapper + br i1 %par_fn_check1, label %worker.execute.__omp_outlined__1_wrapper, label %worker.check.next2 + +worker.execute.__omp_outlined__1_wrapper: ; preds = %worker.check.next + call void @__omp_outlined__1_wrapper(i8* %0) + br label %worker.parallel_end + +worker.check.next2: ; preds = %worker.check.next + call void %work_fn(i8* %0) + br label %worker.parallel_end + +worker.parallel_end: ; preds = %worker.execute.__omp_outlined__1_wrapper, %worker.execute.__omp_outlined__2_wrapper, %worker.check.next2 + call void @__kmpc_kernel_end_parallel() + br label %worker.inactive + +worker.inactive: ; preds = %worker.active_check, %worker.parallel_end + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) + br label %worker.wait + +master_check: ; preds = %worker.finished, %entry + %1 = icmp eq i16 %thread_kind, 1 + br i1 %1, label %.execute, label %.exit + +.execute: ; preds = %master_check + %2 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 8, i16 1, i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) #2 + %3 = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8 + %4 = bitcast i8* %3 to %struct._globalized_locals_ty* + %dis1.i = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* %4, i32 0, i32 0 + store i32* %dis, i32** %dis1.i, align 8 + store i32 0, i32* %.omp.comb.lb.i, align 4, !noalias !10 + store i32 9, i32* %.omp.comb.ub.i, align 4, !noalias !10 + store i32 1, i32* %.omp.stride.i, align 4, !noalias !10 + store i32 0, i32* %.omp.is_last.i, align 4, !noalias !10 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %2, i32 92, i32* %.omp.is_last.i, i32* %.omp.comb.lb.i, i32* %.omp.comb.ub.i, i32* %.omp.stride.i, i32 1, i32 1) #2 + %5 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !10 + %cmp.i = icmp sgt i32 %5, 9 + br i1 %cmp.i, label %cond.true.i, label %cond.false.i + +cond.true.i: ; preds = %.execute + br label %cond.end.i + +cond.false.i: ; preds = %.execute + %6 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !10 + br label %cond.end.i + +cond.end.i: ; preds = %cond.false.i, %cond.true.i + %cond.i = phi i32 [ 9, %cond.true.i ], [ %6, %cond.false.i ] + store i32 %cond.i, i32* %.omp.comb.ub.i, align 4, !noalias !10 + %7 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !10 + br label %omp.inner.for.cond.i + +omp.inner.for.cond.i: ; preds = %omp.inner.for.body.i, %cond.end.i + %.omp.iv.i.0 = phi i32 [ %7, %cond.end.i ], [ %add.i, %omp.inner.for.body.i ] + %8 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !10 + %cmp2.i = icmp sle i32 %.omp.iv.i.0, %8 + br i1 %cmp2.i, label %omp.inner.for.body.i, label %__omp_outlined__.exit + +omp.inner.for.body.i: ; preds = %omp.inner.for.cond.i + %9 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !10 + %10 = zext i32 %9 to i64 + %11 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !10 + %12 = zext i32 %11 to i64 + %13 = bitcast %omp.shared.struct* %.captured.i to i8* + %14 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* 
%.captured.i, i32 0, i32 0 + store i64 %10, i64* %14, !noalias !10 + %15 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i32 0, i32 1 + store i64 %12, i64* %15, !noalias !10 + %16 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i32 0, i32 2 + store i32** %dis1.i, i32*** %16, !noalias !10 + call void @__kmpc_generic_kernel_parallel(i8* bitcast (void (i8*)* @__omp_outlined__1_wrapper to i8*), i8* %13, i16 24, i16 1) #2 + %17 = load i32, i32* %.omp.stride.i, align 4, !noalias !10 + %add.i = add nsw i32 %.omp.iv.i.0, %17 + br label %omp.inner.for.cond.i + +__omp_outlined__.exit: ; preds = %omp.inner.for.cond.i + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %2) #2 + %18 = bitcast %omp.shared.struct.0* %.captured4.i to i8* + %19 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %.captured4.i, i32 0, i32 0 + store i32** %dis1.i, i32*** %19, !noalias !10 + call void @__kmpc_generic_kernel_parallel(i8* bitcast (void (i8*)* @__omp_outlined__2_wrapper to i8*), i8* %18, i16 8, i16 1) #2 + %call.i = call i32 @omp_get_team_num() #2 + %idxprom.i = sext i32 %call.i to i64 + %arrayidx.i = getelementptr inbounds i32, i32* %team, i64 %idxprom.i + %20 = load i32, i32* %arrayidx.i, align 4 + %add5.i = add nsw i32 %20, 1 + store i32 %add5.i, i32* %arrayidx.i, align 4 + call void @__kmpc_restore_team_static_memory(i16 0, i16 1) #2 + br label %.omp.deinit + +.omp.deinit: ; preds = %__omp_outlined__.exit + call void @__kmpc_generic_kernel_deinit(i16 0, i16 1) + br label %.exit + +.exit: ; preds = %.omp.deinit, %master_check + ret void +} + +declare i16 @__kmpc_generic_kernel_init(i16, i16, i16, i16) + +declare void @__kmpc_get_team_static_memory(i16, i8*, i64, i16, i8**) + +declare void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) + +; Function Attrs: norecurse nounwind +define internal void @__omp_outlined__1(i32* noalias %.global_tid., i32* noalias %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., i32** dereferenceable(8) %dis) #0 { +entry: + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + store i32 0, i32* %.omp.lb, align 4 + store i32 9, i32* %.omp.ub, align 4 + %conv = trunc i64 %.previous.lb. to i32 + %conv1 = trunc i64 %.previous.ub. to i32 + store i32 %conv, i32* %.omp.lb, align 4 + store i32 %conv1, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %0 = load i32, i32* %.global_tid., align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %0, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %1 = load i32, i32* %.omp.lb, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %entry + %.omp.iv.0 = phi i32 [ %1, %entry ], [ %add4, %omp.inner.for.inc ] + %conv2 = sext i32 %.omp.iv.0 to i64 + %cmp = icmp ule i64 %conv2, %.previous.ub. 
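+; <- iterate while the iv is still within the distribute chunk handed down via %.previous.ub.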
+ br i1 %cmp, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %mul = mul nsw i32 %.omp.iv.0, 1 + %add = add nsw i32 0, %mul + %2 = load i32*, i32** %dis, align 8 + %idxprom = sext i32 %add to i64 + %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom + %3 = atomicrmw add i32* %arrayidx, i32 1 monotonic + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %4 = load i32, i32* %.omp.stride, align 4 + %add4 = add nsw i32 %.omp.iv.0, %4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %0) + ret void +} + +declare void @__kmpc_for_static_fini(%struct.ident_t*, i32) + +; Function Attrs: norecurse nounwind +define internal void @__omp_outlined__1_wrapper(i8* %payload) #0 { +entry: + %.zero.addr = alloca i32, align 4 + %.threadid_temp. = alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + store i32 %0, i32* %.threadid_temp., align 4 + %1 = bitcast i8* %payload to %omp.shared.struct* + %2 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %1, i32 0, i32 0 + %3 = load i64, i64* %2, align 1 + %4 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %1, i32 0, i32 1 + %5 = load i64, i64* %4, align 1 + %6 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %1, i32 0, i32 2 + %7 = load i32**, i32*** %6, align 1 + call void @__omp_outlined__1(i32* %.threadid_temp., i32* %.zero.addr, i64 %3, i64 %5, i32** %7) #2 + ret void +} + +declare i32 @__kmpc_global_thread_num(%struct.ident_t*) + +declare void @__kmpc_generic_kernel_parallel(i8*, i8*, i16, i16) + +; Function Attrs: norecurse nounwind +define internal void @__omp_outlined__2(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32** dereferenceable(8) %dis) #0 { +entry: + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp slt i32 %i.0, 10 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %0 = load i32*, i32** %dis, align 8 + %idxprom = sext i32 %i.0 to i64 + %arrayidx = getelementptr inbounds i32, i32* %0, i64 %idxprom + %1 = atomicrmw add i32* %arrayidx, i32 1 monotonic + br label %for.inc + +for.inc: ; preds = %for.body + %inc = add nsw i32 %i.0, 1 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: norecurse nounwind +define internal void @__omp_outlined__2_wrapper(i8* %payload) #0 { +entry: + %.zero.addr = alloca i32, align 4 + %.threadid_temp. 
= alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + store i32 %0, i32* %.threadid_temp., align 4 + %1 = bitcast i8* %payload to %omp.shared.struct.0* + %2 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %1, i32 0, i32 0 + %3 = load i32**, i32*** %2, align 1 + call void @__omp_outlined__2(i32* %.threadid_temp., i32* %.zero.addr, i32** %3) #2 + ret void +} + +declare i32 @omp_get_team_num() #1 + +declare void @__kmpc_restore_team_static_memory(i16, i16) + +declare void @__kmpc_generic_kernel_deinit(i16, i16) + +declare void @__kmpc_barrier_simple_spmd(%struct.ident_t*, i32) + +declare i1 @__kmpc_kernel_parallel(i8**, i16) + +declare i8* @__kmpc_get_shared_variables() + +declare void @__kmpc_kernel_end_parallel() + +attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx61,+sm_70" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx61,+sm_70" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind } + +!omp_offload.info = !{!0} +!nvvm.annotations = !{!1, !2, !3, !2, !4, !4, !4, !4, !5, !5, !4} +!llvm.module.flags = !{!6, !7} +!llvm.ident = !{!8} +!nvvm.internalize.after.link = !{} +!nvvmir.version = !{!9} + +!0 = !{i32 0, i32 43, i32 21153163, !"foo", i32 10, i32 0} +!1 = !{void (i32*, i32*)* @__omp_offloading_2b_142c58b_foo_l10, !"kernel", i32 1} +!2 = !{null, !"align", i32 8} +!3 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +!4 = !{null, !"align", i32 16} +!5 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +!6 = !{i32 1, !"wchar_size", i32 4} +!7 = !{i32 7, !"PIC Level", i32 2} +!8 = !{!"clang version 9.0.0 "} +!9 = !{i32 1, i32 2} +!10 = !{!11, !13} +!11 = distinct !{!11, !12, !"__omp_outlined__: %.global_tid."} +!12 = distinct !{!12, !"__omp_outlined__"} +!13 = distinct !{!13, !12, !"__omp_outlined__: %.bound_tid."} Index: SPMD_examples/v0.2/target_offload_not_SPMD.old.forced.ll =================================================================== --- /dev/null +++ SPMD_examples/v0.2/target_offload_not_SPMD.old.forced.ll @@ -0,0 +1,728 @@ + +; __CLANG_OFFLOAD_BUNDLE____START__ openmp-nvptx64-nvida-cuda +; ModuleID = '/home/jdoerfert/projects/llvm-project/llvm/test/Transforms/OpenMP/target_offload_not_SPMD.c' +source_filename = "/home/jdoerfert/projects/llvm-project/llvm/test/Transforms/OpenMP/target_offload_not_SPMD.c" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvida-cuda" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } + +@"_openmp_kernel_static_glob_rd$ptr" = internal addrspace(3) global i8* null +@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr 
constant %struct.ident_t { i32 0, i32 2050, i32 1, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 1, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@2 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 1, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@__omp_offloading_2b_142c58b_foo_l10_exec_mode = weak constant i8 0 +@llvm.compiler.used = appending global [1 x i8*] [i8* @__omp_offloading_2b_142c58b_foo_l10_exec_mode], section "llvm.metadata" + +; Function Attrs: noinline norecurse nounwind optnone +define weak void @__omp_offloading_2b_142c58b_foo_l10(i32* %dis, i32* %team) #0 { +entry: + %.global_tid..addr.i = alloca i32*, align 8 + %.bound_tid..addr.i = alloca i32*, align 8 + %dis.addr.i = alloca i32*, align 8 + %team.addr.i = alloca i32*, align 8 + %.omp.iv.i = alloca i32, align 4 + %tmp.i = alloca i32, align 4 + %.omp.comb.lb.i = alloca i32, align 4 + %.omp.comb.ub.i = alloca i32, align 4 + %.omp.stride.i = alloca i32, align 4 + %.omp.is_last.i = alloca i32, align 4 + %i.i = alloca i32, align 4 + %.zero.addr.i = alloca i32, align 4 + %.zero.addr9.i = alloca i32, align 4 + %dis.addr = alloca i32*, align 8 + %team.addr = alloca i32*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. = alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i32* %dis, i32** %dis.addr, align 8 + store i32* %team, i32** %team.addr, align 8 + %nvptx_num_threads = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range !10 + call void @__kmpc_spmd_kernel_init(i32 %nvptx_num_threads, i16 1, i16 0) + call void @__kmpc_data_sharing_init_stack_spmd() + br label %.execute + +.execute: ; preds = %entry + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + %1 = load i32*, i32** %dis.addr, align 8 + %2 = load i32*, i32** %team.addr, align 8 + store i32 %0, i32* %.threadid_temp., align 4 + store i32 0, i32* %.zero.addr9.i, align 4, !noalias !11 + store i32 0, i32* %.zero.addr.i, align 4, !noalias !11 + store i32* %.threadid_temp., i32** %.global_tid..addr.i, align 8, !noalias !11 + store i32* %.zero.addr, i32** %.bound_tid..addr.i, align 8, !noalias !11 + store i32* %1, i32** %dis.addr.i, align 8, !noalias !11 + store i32* %2, i32** %team.addr.i, align 8, !noalias !11 + store i32 0, i32* %.omp.comb.lb.i, align 4, !noalias !11 + store i32 9, i32* %.omp.comb.ub.i, align 4, !noalias !11 + store i32 1, i32* %.omp.stride.i, align 4, !noalias !11 + store i32 0, i32* %.omp.is_last.i, align 4, !noalias !11 + %nvptx_num_threads.i = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #3, !range !10 + %3 = load i32*, i32** %.global_tid..addr.i, align 8, !noalias !11 + %4 = load i32, i32* %3, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %4, i32 91, i32* %.omp.is_last.i, i32* %.omp.comb.lb.i, i32* %.omp.comb.ub.i, i32* %.omp.stride.i, i32 1, i32 %nvptx_num_threads.i) #3 + %5 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !11 + %cmp.i = icmp sgt i32 %5, 9 + br i1 %cmp.i, label %cond.true.i, label %cond.false.i + +cond.true.i: ; preds = %.execute + br label %cond.end.i + +cond.false.i: ; preds = %.execute + %6 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !11 + br label %cond.end.i + +cond.end.i: ; preds = %cond.false.i, %cond.true.i + %cond.i = phi i32 [ 9, %cond.true.i ], [ %6, %cond.false.i ] + store i32 %cond.i, i32* %.omp.comb.ub.i, align 4, !noalias 
!11 + %7 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !11 + store i32 %7, i32* %.omp.iv.i, align 4, !noalias !11 + br label %omp.inner.for.cond.i + +omp.inner.for.cond.i: ; preds = %cond.end7.i, %cond.end.i + %8 = load i32, i32* %.omp.iv.i, align 4, !noalias !11 + %cmp1.i = icmp slt i32 %8, 10 + br i1 %cmp1.i, label %omp.inner.for.body.i, label %__omp_outlined__.exit + +omp.inner.for.body.i: ; preds = %omp.inner.for.cond.i + %9 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !11 + %10 = zext i32 %9 to i64 + %11 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !11 + %12 = zext i32 %11 to i64 + %13 = load i32*, i32** %.global_tid..addr.i, align 8, !noalias !11 + call void @__omp_outlined__1(i32* %13, i32* %.zero.addr.i, i64 %10, i64 %12, i32** %dis.addr.i) #3 + %14 = load i32, i32* %.omp.iv.i, align 4, !noalias !11 + %15 = load i32, i32* %.omp.stride.i, align 4, !noalias !11 + %add.i = add nsw i32 %14, %15 + store i32 %add.i, i32* %.omp.iv.i, align 4, !noalias !11 + %16 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !11 + %17 = load i32, i32* %.omp.stride.i, align 4, !noalias !11 + %add2.i = add nsw i32 %16, %17 + store i32 %add2.i, i32* %.omp.comb.lb.i, align 4, !noalias !11 + %18 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !11 + %19 = load i32, i32* %.omp.stride.i, align 4, !noalias !11 + %add3.i = add nsw i32 %18, %19 + store i32 %add3.i, i32* %.omp.comb.ub.i, align 4, !noalias !11 + %20 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !11 + %cmp4.i = icmp sgt i32 %20, 9 + br i1 %cmp4.i, label %cond.true5.i, label %cond.false6.i + +cond.true5.i: ; preds = %omp.inner.for.body.i + br label %cond.end7.i + +cond.false6.i: ; preds = %omp.inner.for.body.i + %21 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !11 + br label %cond.end7.i + +cond.end7.i: ; preds = %cond.false6.i, %cond.true5.i + %cond8.i = phi i32 [ 9, %cond.true5.i ], [ %21, %cond.false6.i ] + store i32 %cond8.i, i32* %.omp.comb.ub.i, align 4, !noalias !11 + %22 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !11 + store i32 %22, i32* %.omp.iv.i, align 4, !noalias !11 + br label %omp.inner.for.cond.i + +__omp_outlined__.exit: ; preds = %omp.inner.for.cond.i + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %4) #3 + %23 = load i32*, i32** %.global_tid..addr.i, align 8, !noalias !11 + call void @__omp_outlined__2(i32* %23, i32* %.zero.addr9.i, i32** %dis.addr.i) #3 + %24 = load i32*, i32** %team.addr.i, align 8, !noalias !11 + %call.i = call i32 @omp_get_team_num() #3 + %idxprom.i = sext i32 %call.i to i64 + %arrayidx.i = getelementptr inbounds i32, i32* %24, i64 %idxprom.i + %25 = load i32, i32* %arrayidx.i, align 4 + %add10.i = add nsw i32 %25, 1 + store i32 %add10.i, i32* %arrayidx.i, align 4 + br label %.omp.deinit + +.omp.deinit: ; preds = %__omp_outlined__.exit + call void @__kmpc_spmd_kernel_deinit_v2(i16 1) + br label %.exit + +.exit: ; preds = %.omp.deinit + ret void +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #1 + +declare void @__kmpc_spmd_kernel_init(i32, i16, i16) + +declare void @__kmpc_data_sharing_init_stack_spmd() + +declare void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) + +; Function Attrs: noinline norecurse nounwind optnone +define internal void @__omp_outlined__1(i32* noalias %.global_tid., i32* noalias %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., i32** dereferenceable(8) %dis) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca 
i32*, align 8 + %.previous.lb..addr = alloca i64, align 8 + %.previous.ub..addr = alloca i64, align 8 + %dis.addr = alloca i32**, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i64 %.previous.lb., i64* %.previous.lb..addr, align 8 + store i64 %.previous.ub., i64* %.previous.ub..addr, align 8 + store i32** %dis, i32*** %dis.addr, align 8 + %0 = load i32**, i32*** %dis.addr, align 8 + store i32 0, i32* %.omp.lb, align 4 + store i32 9, i32* %.omp.ub, align 4 + %1 = load i64, i64* %.previous.lb..addr, align 8 + %conv = trunc i64 %1 to i32 + %2 = load i64, i64* %.previous.ub..addr, align 8 + %conv1 = trunc i64 %2 to i32 + store i32 %conv, i32* %.omp.lb, align 4 + store i32 %conv1, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %3 = load i32*, i32** %.global_tid..addr, align 8 + %4 = load i32, i32* %3, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %4, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %5 = load i32, i32* %.omp.lb, align 4 + store i32 %5, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %entry + %6 = load i32, i32* %.omp.iv, align 4 + %conv2 = sext i32 %6 to i64 + %7 = load i64, i64* %.previous.ub..addr, align 8 + %cmp = icmp ule i64 %conv2, %7 + br i1 %cmp, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %8 = load i32, i32* %.omp.iv, align 4 + %mul = mul nsw i32 %8, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %i, align 4 + %9 = load i32*, i32** %0, align 8 + %10 = load i32, i32* %i, align 4 + %idxprom = sext i32 %10 to i64 + %arrayidx = getelementptr inbounds i32, i32* %9, i64 %idxprom + %11 = atomicrmw add i32* %arrayidx, i32 1 monotonic + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %12 = load i32, i32* %.omp.iv, align 4 + %13 = load i32, i32* %.omp.stride, align 4 + %add4 = add nsw i32 %12, %13 + store i32 %add4, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %4) + ret void +} + +declare void @__kmpc_for_static_fini(%struct.ident_t*, i32) + +; Function Attrs: noinline norecurse nounwind optnone +define internal void @__omp_outlined__2(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32** dereferenceable(8) %dis) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %dis.addr = alloca i32**, align 8 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i32** %dis, i32*** %dis.addr, align 8 + %0 = load i32**, i32*** %dis.addr, align 8 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %1 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %1, 10 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; 
preds = %for.cond + %2 = load i32*, i32** %0, align 8 + %3 = load i32, i32* %i, align 4 + %idxprom = sext i32 %3 to i64 + %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom + %4 = atomicrmw add i32* %arrayidx, i32 1 monotonic + br label %for.inc + +for.inc: ; preds = %for.body + %5 = load i32, i32* %i, align 4 + %inc = add nsw i32 %5, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +declare i32 @omp_get_team_num() #2 + +declare i32 @__kmpc_global_thread_num(%struct.ident_t*) + +declare void @__kmpc_spmd_kernel_deinit_v2(i16) + +attributes #0 = { noinline norecurse nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx61,+sm_70" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone } +attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx61,+sm_70" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { nounwind } + +!omp_offload.info = !{!0} +!nvvm.annotations = !{!1, !2, !3, !2, !4, !4, !4, !4, !5, !5, !4} +!llvm.module.flags = !{!6, !7} +!llvm.ident = !{!8} +!nvvm.internalize.after.link = !{} +!nvvmir.version = !{!9} + +!0 = !{i32 0, i32 43, i32 21153163, !"foo", i32 10, i32 0} +!1 = !{void (i32*, i32*)* @__omp_offloading_2b_142c58b_foo_l10, !"kernel", i32 1} +!2 = !{null, !"align", i32 8} +!3 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +!4 = !{null, !"align", i32 16} +!5 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +!6 = !{i32 1, !"wchar_size", i32 4} +!7 = !{i32 7, !"PIC Level", i32 2} +!8 = !{!"clang version 9.0.0 "} +!9 = !{i32 1, i32 2} +!10 = !{i32 1, i32 1025} +!11 = !{!12, !14} +!12 = distinct !{!12, !13, !"__omp_outlined__: %.global_tid."} +!13 = distinct !{!13, !"__omp_outlined__"} +!14 = distinct !{!14, !13, !"__omp_outlined__: %.bound_tid."} + +; __CLANG_OFFLOAD_BUNDLE____END__ openmp-nvptx64-nvida-cuda + +; __CLANG_OFFLOAD_BUNDLE____START__ host-x86_64-unknown-linux-gnu +; ModuleID = '/tmp/jdoerfert/target_offload_not_SPMD-5f7337.bc' +source_filename = "/home/jdoerfert/projects/llvm-project/llvm/test/Transforms/OpenMP/target_offload_not_SPMD.c" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } +%struct.__tgt_offload_entry = type { i8*, i8*, i64, i32, i32 } +%struct.__tgt_device_image = type { i8*, i8*, %struct.__tgt_offload_entry*, %struct.__tgt_offload_entry* } +%struct.__tgt_bin_desc = type { i32, %struct.__tgt_device_image*, %struct.__tgt_offload_entry*, %struct.__tgt_offload_entry* } + +$.omp_offloading.descriptor_reg.nvptx64-nvida-cuda = comdat any + +@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr global %struct.ident_t { i32 0, i32 2050, 
i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@1 = private unnamed_addr global %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@2 = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@.__omp_offloading_2b_142c58b_foo_l10.region_id = weak constant i8 0 +@.offload_sizes = private unnamed_addr constant [2 x i64] [i64 40, i64 40] +@.offload_maptypes = private unnamed_addr constant [2 x i64] [i64 35, i64 35] +@.str.3 = private unnamed_addr constant [33 x i8] c"dis[%3i] = %4i\09\09team[%3i] = %4i\0A\00", align 1 +@.omp_offloading.entry_name = internal unnamed_addr constant [36 x i8] c"__omp_offloading_2b_142c58b_foo_l10\00" +@.omp_offloading.entry.__omp_offloading_2b_142c58b_foo_l10 = weak constant %struct.__tgt_offload_entry { i8* @.__omp_offloading_2b_142c58b_foo_l10.region_id, i8* getelementptr inbounds ([36 x i8], [36 x i8]* @.omp_offloading.entry_name, i32 0, i32 0), i64 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +@.omp_offloading.entries_begin = external constant %struct.__tgt_offload_entry +@.omp_offloading.entries_end = external constant %struct.__tgt_offload_entry +@.omp_offloading.img_start.nvptx64-nvida-cuda = extern_weak constant i8 +@.omp_offloading.img_end.nvptx64-nvida-cuda = extern_weak constant i8 +@.omp_offloading.device_images = internal unnamed_addr constant [1 x %struct.__tgt_device_image] [%struct.__tgt_device_image { i8* @.omp_offloading.img_start.nvptx64-nvida-cuda, i8* @.omp_offloading.img_end.nvptx64-nvida-cuda, %struct.__tgt_offload_entry* @.omp_offloading.entries_begin, %struct.__tgt_offload_entry* @.omp_offloading.entries_end }], comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cuda), align 8 +@.omp_offloading.descriptor = internal constant %struct.__tgt_bin_desc { i32 1, %struct.__tgt_device_image* getelementptr inbounds ([1 x %struct.__tgt_device_image], [1 x %struct.__tgt_device_image]* @.omp_offloading.device_images, i32 0, i32 0), %struct.__tgt_offload_entry* @.omp_offloading.entries_begin, %struct.__tgt_offload_entry* @.omp_offloading.entries_end }, comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cuda), align 8 +@__dso_handle = external hidden global i8 +@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 0, void ()* @.omp_offloading.descriptor_reg.nvptx64-nvida-cuda, i8* bitcast (void ()* @.omp_offloading.descriptor_reg.nvptx64-nvida-cuda to i8*) }] + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local void @foo(i32* %dis, i32* %team) #0 { +entry: + %dis.addr = alloca i32*, align 8 + %team.addr = alloca i32*, align 8 + %.offload_baseptrs = alloca [2 x i8*], align 8 + %.offload_ptrs = alloca [2 x i8*], align 8 + store i32* %dis, i32** %dis.addr, align 8 + store i32* %team, i32** %team.addr, align 8 + %0 = load i32*, i32** %dis.addr, align 8 + %1 = load i32*, i32** %team.addr, align 8 + %2 = load i32*, i32** %dis.addr, align 8 + %3 = load i32*, i32** %dis.addr, align 8 + %arrayidx = getelementptr inbounds i32, i32* %3, i64 0 + %4 = load i32*, i32** %team.addr, align 8 + %5 = load i32*, i32** %team.addr, align 8 + %arrayidx1 = getelementptr inbounds i32, i32* %5, i64 0 + %6 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i32 0, i32 0 + %7 = bitcast i8** %6 to i32** + store i32* %2, i32** %7, align 8 + %8 = 
getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i32 0, i32 0 + %9 = bitcast i8** %8 to i32** + store i32* %arrayidx, i32** %9, align 8 + %10 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i32 0, i32 1 + %11 = bitcast i8** %10 to i32** + store i32* %4, i32** %11, align 8 + %12 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i32 0, i32 1 + %13 = bitcast i8** %12 to i32** + store i32* %arrayidx1, i32** %13, align 8 + %14 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i32 0, i32 0 + %15 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i32 0, i32 0 + %16 = call i32 @__tgt_target_teams(i64 -1, i8* @.__omp_offloading_2b_142c58b_foo_l10.region_id, i32 2, i8** %14, i8** %15, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes, i32 0, i32 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes, i32 0, i32 0), i32 3, i32 0) + %17 = icmp ne i32 %16, 0 + br i1 %17, label %omp_offload.failed, label %omp_offload.cont + +omp_offload.failed: ; preds = %entry + call void @__omp_offloading_2b_142c58b_foo_l10(i32* %0, i32* %1) #4 + br label %omp_offload.cont + +omp_offload.cont: ; preds = %omp_offload.failed, %entry + ret void +} + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define internal void @__omp_offloading_2b_142c58b_foo_l10(i32* %dis, i32* %team) #1 { +entry: + %dis.addr = alloca i32*, align 8 + %team.addr = alloca i32*, align 8 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + store i32* %dis, i32** %dis.addr, align 8 + store i32* %team, i32** %team.addr, align 8 + %1 = call i32 @__kmpc_push_num_teams(%struct.ident_t* @2, i32 %0, i32 3, i32 0) + %2 = load i32*, i32** %dis.addr, align 8 + %3 = load i32*, i32** %team.addr, align 8 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%struct.ident_t* @2, i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*)* @.omp_outlined. 
to void (i32*, i32*, ...)*), i32* %2, i32* %3) + ret void +} + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define internal void @.omp_outlined.(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* %dis, i32* %team) #1 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %dis.addr = alloca i32*, align 8 + %team.addr = alloca i32*, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.comb.lb = alloca i32, align 4 + %.omp.comb.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i32* %dis, i32** %dis.addr, align 8 + store i32* %team, i32** %team.addr, align 8 + store i32 0, i32* %.omp.comb.lb, align 4 + store i32 9, i32* %.omp.comb.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %0 = load i32*, i32** %.global_tid..addr, align 8 + %1 = load i32, i32* %0, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %1, i32 92, i32* %.omp.is_last, i32* %.omp.comb.lb, i32* %.omp.comb.ub, i32* %.omp.stride, i32 1, i32 1) + %2 = load i32, i32* %.omp.comb.ub, align 4 + %cmp = icmp sgt i32 %2, 9 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + br label %cond.end + +cond.false: ; preds = %entry + %3 = load i32, i32* %.omp.comb.ub, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 9, %cond.true ], [ %3, %cond.false ] + store i32 %cond, i32* %.omp.comb.ub, align 4 + %4 = load i32, i32* %.omp.comb.lb, align 4 + store i32 %4, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %cond.end + %5 = load i32, i32* %.omp.iv, align 4 + %6 = load i32, i32* %.omp.comb.ub, align 4 + %cmp1 = icmp sle i32 %5, %6 + br i1 %cmp1, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %7 = load i32, i32* %.omp.comb.lb, align 4 + %8 = zext i32 %7 to i64 + %9 = load i32, i32* %.omp.comb.ub, align 4 + %10 = zext i32 %9 to i64 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @2, i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64, i64, i32**)* @.omp_outlined..1 to void (i32*, i32*, ...)*), i64 %8, i64 %10, i32** %dis.addr) + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.inner.for.body + %11 = load i32, i32* %.omp.iv, align 4 + %12 = load i32, i32* %.omp.stride, align 4 + %add = add nsw i32 %11, %12 + store i32 %add, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %1) + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* @2, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32**)* @.omp_outlined..2 to void (i32*, i32*, ...)*), i32** %dis.addr) + %13 = load i32*, i32** %team.addr, align 8 + %call = call i32 @omp_get_team_num() + %idxprom = sext i32 %call to i64 + %arrayidx = getelementptr inbounds i32, i32* %13, i64 %idxprom + %14 = load i32, i32* %arrayidx, align 4 + %add2 = add nsw i32 %14, 1 + store i32 %add2, i32* %arrayidx, align 4 + ret void +} + +declare dso_local void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define internal void @.omp_outlined..1(i32* noalias %.global_tid., i32* noalias %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., i32** dereferenceable(8) %dis) #1 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %.previous.lb..addr = alloca i64, align 8 + %.previous.ub..addr = alloca i64, align 8 + %dis.addr = alloca i32**, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i64 %.previous.lb., i64* %.previous.lb..addr, align 8 + store i64 %.previous.ub., i64* %.previous.ub..addr, align 8 + store i32** %dis, i32*** %dis.addr, align 8 + %0 = load i32**, i32*** %dis.addr, align 8 + store i32 0, i32* %.omp.lb, align 4 + store i32 9, i32* %.omp.ub, align 4 + %1 = load i64, i64* %.previous.lb..addr, align 8 + %conv = trunc i64 %1 to i32 + %2 = load i64, i64* %.previous.ub..addr, align 8 + %conv1 = trunc i64 %2 to i32 + store i32 %conv, i32* %.omp.lb, align 4 + store i32 %conv1, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %3 = load i32*, i32** %.global_tid..addr, align 8 + %4 = load i32, i32* %3, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %4, i32 34, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %5 = load i32, i32* %.omp.ub, align 4 + %cmp = icmp sgt i32 %5, 9 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + br label %cond.end + +cond.false: ; preds = %entry + %6 = load i32, i32* %.omp.ub, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 9, %cond.true ], [ %6, %cond.false ] + store i32 %cond, i32* %.omp.ub, align 4 + %7 = load i32, i32* %.omp.lb, align 4 + store i32 %7, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %cond.end + %8 = load i32, i32* %.omp.iv, align 4 + %9 = load i32, i32* %.omp.ub, align 4 + %cmp3 = icmp sle i32 %8, %9 + br i1 %cmp3, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %10 = load i32, i32* %.omp.iv, align 4 + %mul = mul nsw i32 %10, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %i, align 4 + %11 = load i32*, i32** %0, align 8 + %12 = load i32, i32* %i, align 4 + %idxprom = sext i32 %12 to i64 + %arrayidx = getelementptr inbounds i32, i32* %11, i64 %idxprom + %13 = atomicrmw add i32* %arrayidx, i32 1 monotonic + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + 
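+; <- the iv advances by 1 per iteration; this thread's chunk bounds were fixed by __kmpc_for_static_init_4 above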
+omp.inner.for.inc: ; preds = %omp.body.continue + %14 = load i32, i32* %.omp.iv, align 4 + %add5 = add nsw i32 %14, 1 + store i32 %add5, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %4) + ret void +} + +declare dso_local void @__kmpc_for_static_fini(%struct.ident_t*, i32) + +declare !callback !3 dso_local void @__kmpc_fork_call(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define internal void @.omp_outlined..2(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32** dereferenceable(8) %dis) #1 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %dis.addr = alloca i32**, align 8 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i32** %dis, i32*** %dis.addr, align 8 + %0 = load i32**, i32*** %dis.addr, align 8 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %1 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %1, 10 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %2 = load i32*, i32** %0, align 8 + %3 = load i32, i32* %i, align 4 + %idxprom = sext i32 %3 to i64 + %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom + %4 = atomicrmw add i32* %arrayidx, i32 1 monotonic + br label %for.inc + +for.inc: ; preds = %for.body + %5 = load i32, i32* %i, align 4 + %inc = add nsw i32 %5, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +declare dso_local i32 @omp_get_team_num() #2 + +declare dso_local i32 @__kmpc_global_thread_num(%struct.ident_t*) + +declare dso_local i32 @__kmpc_push_num_teams(%struct.ident_t*, i32, i32, i32) + +declare !callback !3 dso_local void @__kmpc_fork_teams(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
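+; <- host side: @foo launches the offloaded region through __tgt_target_teams (declared below);
+; if the launch fails, the internal host fallback runs the teams via __kmpc_fork_teams above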
+ +declare dso_local i32 @__tgt_target_teams(i64, i8*, i32, i8**, i8**, i64*, i64*, i32, i32) + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local i32 @main() #0 { +entry: + %retval = alloca i32, align 4 + %dis = alloca [10 x i32], align 16 + %team = alloca [10 x i32], align 16 + %i = alloca i32, align 4 + %i4 = alloca i32, align 4 + store i32 0, i32* %retval, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %0, 10 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load i32, i32* %i, align 4 + %2 = load i32, i32* %i, align 4 + %idxprom = sext i32 %2 to i64 + %arrayidx = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 %idxprom + store i32 %1, i32* %arrayidx, align 4 + %3 = load i32, i32* %i, align 4 + %idxprom1 = sext i32 %3 to i64 + %arrayidx2 = getelementptr inbounds [10 x i32], [10 x i32]* %team, i64 0, i64 %idxprom1 + store i32 0, i32* %arrayidx2, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %4 = load i32, i32* %i, align 4 + %inc = add nsw i32 %4, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %arraydecay = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i32 0, i32 0 + %arraydecay3 = getelementptr inbounds [10 x i32], [10 x i32]* %team, i32 0, i32 0 + call void @foo(i32* %arraydecay, i32* %arraydecay3) + store i32 0, i32* %i4, align 4 + br label %for.cond5 + +for.cond5: ; preds = %for.inc12, %for.end + %5 = load i32, i32* %i4, align 4 + %cmp6 = icmp slt i32 %5, 10 + br i1 %cmp6, label %for.body7, label %for.end14 + +for.body7: ; preds = %for.cond5 + %6 = load i32, i32* %i4, align 4 + %7 = load i32, i32* %i4, align 4 + %idxprom8 = sext i32 %7 to i64 + %arrayidx9 = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 %idxprom8 + %8 = load i32, i32* %arrayidx9, align 4 + %9 = load i32, i32* %i4, align 4 + %10 = load i32, i32* %i4, align 4 + %idxprom10 = sext i32 %10 to i64 + %arrayidx11 = getelementptr inbounds [10 x i32], [10 x i32]* %team, i64 0, i64 %idxprom10 + %11 = load i32, i32* %arrayidx11, align 4 + %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([33 x i8], [33 x i8]* @.str.3, i32 0, i32 0), i32 %6, i32 %8, i32 %9, i32 %11) + br label %for.inc12 + +for.inc12: ; preds = %for.body7 + %12 = load i32, i32* %i4, align 4 + %inc13 = add nsw i32 %12, 1 + store i32 %inc13, i32* %i4, align 4 + br label %for.cond5 + +for.end14: ; preds = %for.cond5 + ret i32 0 +} + +declare dso_local i32 @printf(i8*, ...) 
#2 + +; Function Attrs: noinline nounwind uwtable +define internal void @.omp_offloading.descriptor_unreg(i8*) #3 section ".text.startup" comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cuda) { +entry: + %.addr = alloca i8*, align 8 + store i8* %0, i8** %.addr, align 8 + %1 = call i32 @__tgt_unregister_lib(%struct.__tgt_bin_desc* @.omp_offloading.descriptor) + ret void +} + +declare dso_local i32 @__tgt_unregister_lib(%struct.__tgt_bin_desc*) + +; Function Attrs: noinline nounwind uwtable +define linkonce hidden void @.omp_offloading.descriptor_reg.nvptx64-nvida-cuda() #3 section ".text.startup" comdat { +entry: + %0 = call i32 @__tgt_register_lib(%struct.__tgt_bin_desc* @.omp_offloading.descriptor) + %1 = call i32 @__cxa_atexit(void (i8*)* @.omp_offloading.descriptor_unreg, i8* bitcast (%struct.__tgt_bin_desc* @.omp_offloading.descriptor to i8*), i8* @__dso_handle) #4 + ret void +} + +declare dso_local i32 @__tgt_register_lib(%struct.__tgt_bin_desc*) + +; Function Attrs: nounwind +declare dso_local i32 @__cxa_atexit(void (i8*)*, i8*, i8*) #4 + +attributes #0 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { noinline norecurse nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { nounwind } + +!omp_offload.info = !{!0} +!llvm.module.flags = !{!1} +!llvm.ident = !{!2} + +!0 = !{i32 0, i32 43, i32 21153163, !"foo", i32 10, i32 0} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{!"clang version 9.0.0 "} +!3 = !{!4} +!4 = !{i64 2, i64 -1, i64 -1, i1 true} + +; __CLANG_OFFLOAD_BUNDLE____END__ host-x86_64-unknown-linux-gnu Index: 
SPMD_examples/v0.2/target_offload_not_SPMD.old.ll =================================================================== --- /dev/null +++ SPMD_examples/v0.2/target_offload_not_SPMD.old.ll @@ -0,0 +1,891 @@ + +; __CLANG_OFFLOAD_BUNDLE____START__ openmp-nvptx64-nvida-cuda +; ModuleID = '/home/jdoerfert/projects/llvm-project/llvm/test/Transforms/OpenMP/target_offload_not_SPMD.c' +source_filename = "/home/jdoerfert/projects/llvm-project/llvm/test/Transforms/OpenMP/target_offload_not_SPMD.c" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvida-cuda" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } +%"union._shared_openmp_static_memory_type_$_" = type { [128 x i8] } +%struct._globalized_locals_ty = type { i32* } + +@"_openmp_kernel_static_glob_rd$ptr" = internal addrspace(3) global i8* null +@"_openmp_static_kernel$is_shared" = internal unnamed_addr constant i16 1 +@"_openmp_static_kernel$size" = internal unnamed_addr constant i64 8 +@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2050, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@2 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@__omp_offloading_2b_142c58b_foo_l10_exec_mode = weak constant i8 1 +@"_openmp_shared_static_glob_rd_$_" = common addrspace(3) global %"union._shared_openmp_static_memory_type_$_" zeroinitializer +@llvm.compiler.used = appending global [1 x i8*] [i8* @__omp_offloading_2b_142c58b_foo_l10_exec_mode], section "llvm.metadata" + +; Function Attrs: noinline norecurse nounwind +define internal void @__omp_offloading_2b_142c58b_foo_l10_worker() #0 { +entry: + %work_fn = alloca i8*, align 8 + %exec_status = alloca i8, align 1 + store i8* null, i8** %work_fn, align 8 + store i8 0, i8* %exec_status, align 1 + br label %.await.work + +.await.work: ; preds = %.barrier.parallel, %entry + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) + %0 = call i1 @__kmpc_kernel_parallel(i8** %work_fn, i16 1) + %1 = zext i1 %0 to i8 + store i8 %1, i8* %exec_status, align 1 + %2 = load i8*, i8** %work_fn, align 8 + %should_terminate = icmp eq i8* %2, null + br i1 %should_terminate, label %.exit, label %.select.workers + +.select.workers: ; preds = %.await.work + %3 = load i8, i8* %exec_status, align 1 + %is_active = icmp ne i8 %3, 0 + br i1 %is_active, label %.execute.parallel, label %.barrier.parallel + +.execute.parallel: ; preds = %.select.workers + %4 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + %5 = load i8*, i8** %work_fn, align 8 + %work_match = icmp eq i8* %5, bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) + br i1 %work_match, label %.execute.fn, label %.check.next + +.execute.fn: ; preds = %.execute.parallel + call void @__omp_outlined__1_wrapper(i16 0, i32 %4) #5 + br label %.terminate.parallel + +.check.next: ; preds = %.execute.parallel + %6 = load i8*, i8** %work_fn, align 8 + %work_match1 = icmp eq i8* %6, bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*) + br i1 %work_match1, label %.execute.fn2, label %.check.next3 + +.execute.fn2: ; preds = %.check.next + call void @__omp_outlined__2_wrapper(i16 0, 
i32 %4) #5 + br label %.terminate.parallel + +.check.next3: ; preds = %.check.next + %7 = bitcast i8* %2 to void (i16, i32)* + call void %7(i16 0, i32 %4) + br label %.terminate.parallel + +.terminate.parallel: ; preds = %.check.next3, %.execute.fn2, %.execute.fn + call void @__kmpc_kernel_end_parallel() + br label %.barrier.parallel + +.barrier.parallel: ; preds = %.terminate.parallel, %.select.workers + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) + br label %.await.work + +.exit: ; preds = %.await.work + ret void +} + +; Function Attrs: noinline norecurse nounwind optnone +define weak void @__omp_offloading_2b_142c58b_foo_l10(i32* %dis, i32* %team) #1 { +entry: + %.global_tid..addr.i = alloca i32*, align 8 + %.bound_tid..addr.i = alloca i32*, align 8 + %dis.addr.i = alloca i32*, align 8 + %team.addr.i = alloca i32*, align 8 + %.omp.iv.i = alloca i32, align 4 + %tmp.i = alloca i32, align 4 + %.omp.comb.lb.i = alloca i32, align 4 + %.omp.comb.ub.i = alloca i32, align 4 + %.omp.stride.i = alloca i32, align 4 + %.omp.is_last.i = alloca i32, align 4 + %i.i = alloca i32, align 4 + %.zero.addr.i = alloca i32, align 4 + %shared_arg_refs.i = alloca i8**, align 8 + %.zero.addr3.i = alloca i32, align 4 + %shared_arg_refs4.i = alloca i8**, align 8 + %dis.addr = alloca i32*, align 8 + %team.addr = alloca i32*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. = alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i32* %dis, i32** %dis.addr, align 8 + store i32* %team, i32** %team.addr, align 8 + %nvptx_warp_size = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range !10 + %nvptx_num_threads = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range !11 + %thread_limit = sub nuw i32 %nvptx_num_threads, %nvptx_warp_size + %nvptx_tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !12 + %0 = icmp ult i32 %nvptx_tid, %thread_limit + br i1 %0, label %.worker, label %.mastercheck + +.worker: ; preds = %entry + call void @__omp_offloading_2b_142c58b_foo_l10_worker() #5 + br label %.exit + +.mastercheck: ; preds = %entry + %nvptx_num_threads1 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range !11 + %nvptx_warp_size2 = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range !10 + %1 = sub nuw i32 %nvptx_warp_size2, 1 + %2 = xor i32 %1, -1 + %3 = sub nuw i32 %nvptx_num_threads1, 1 + %master_tid = and i32 %3, %2 + %nvptx_tid3 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !12 + %4 = icmp eq i32 %nvptx_tid3, %master_tid + br i1 %4, label %.master, label %.exit + +.master: ; preds = %.mastercheck + %nvptx_warp_size4 = call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range !10 + %nvptx_num_threads5 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range !11 + %thread_limit6 = sub nuw i32 %nvptx_num_threads5, %nvptx_warp_size4 + call void @__kmpc_kernel_init(i32 %thread_limit6, i16 1) + call void @__kmpc_data_sharing_init_stack() + %5 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + %6 = load i32*, i32** %dis.addr, align 8 + %7 = load i32*, i32** %team.addr, align 8 + store i32 %5, i32* %.threadid_temp., align 4 + store i32 0, i32* %.zero.addr3.i, align 4, !noalias !13 + store i32 0, i32* %.zero.addr.i, align 4, !noalias !13 + store i32* %.threadid_temp., i32** %.global_tid..addr.i, align 8, !noalias !13 + store i32* %.zero.addr, i32** %.bound_tid..addr.i, align 8, !noalias !13 + store i32* %6, i32** %dis.addr.i, align 8, !noalias !13 + store i32* %7, i32** %team.addr.i, align 8, !noalias !13 + call void @__kmpc_get_team_static_memory(i16 0, i8* 
addrspacecast (i8 addrspace(3)* getelementptr inbounds (%"union._shared_openmp_static_memory_type_$_", %"union._shared_openmp_static_memory_type_$_" addrspace(3)* @"_openmp_shared_static_glob_rd_$_", i32 0, i32 0, i32 0) to i8*), i64 8, i16 1, i8** addrspacecast (i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr" to i8**)) #5 + %8 = load i8*, i8* addrspace(3)* @"_openmp_kernel_static_glob_rd$ptr", align 8 + %9 = bitcast i8* %8 to %struct._globalized_locals_ty* + %10 = load i32*, i32** %dis.addr.i, align 8, !noalias !13 + %dis1.i = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* %9, i32 0, i32 0 + store i32* %10, i32** %dis1.i, align 8 + store i32 0, i32* %.omp.comb.lb.i, align 4, !noalias !13 + store i32 9, i32* %.omp.comb.ub.i, align 4, !noalias !13 + store i32 1, i32* %.omp.stride.i, align 4, !noalias !13 + store i32 0, i32* %.omp.is_last.i, align 4, !noalias !13 + %11 = load i32*, i32** %.global_tid..addr.i, align 8, !noalias !13 + %12 = load i32, i32* %11, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %12, i32 92, i32* %.omp.is_last.i, i32* %.omp.comb.lb.i, i32* %.omp.comb.ub.i, i32* %.omp.stride.i, i32 1, i32 1) #5 + %13 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !13 + %cmp.i = icmp sgt i32 %13, 9 + br i1 %cmp.i, label %cond.true.i, label %cond.false.i + +cond.true.i: ; preds = %.master + br label %cond.end.i + +cond.false.i: ; preds = %.master + %14 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !13 + br label %cond.end.i + +cond.end.i: ; preds = %cond.false.i, %cond.true.i + %cond.i = phi i32 [ 9, %cond.true.i ], [ %14, %cond.false.i ] + store i32 %cond.i, i32* %.omp.comb.ub.i, align 4, !noalias !13 + %15 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !13 + store i32 %15, i32* %.omp.iv.i, align 4, !noalias !13 + br label %omp.inner.for.cond.i + +omp.inner.for.cond.i: ; preds = %omp.inner.for.body.i, %cond.end.i + %16 = load i32, i32* %.omp.iv.i, align 4, !noalias !13 + %17 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !13 + %cmp2.i = icmp sle i32 %16, %17 + br i1 %cmp2.i, label %omp.inner.for.body.i, label %__omp_outlined__.exit + +omp.inner.for.body.i: ; preds = %omp.inner.for.cond.i + %18 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !13 + %19 = zext i32 %18 to i64 + %20 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !13 + %21 = zext i32 %20 to i64 + call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i16 1) #5 + call void @__kmpc_begin_sharing_variables(i8*** %shared_arg_refs.i, i64 3) #5 + %22 = load i8**, i8*** %shared_arg_refs.i, align 8, !noalias !13 + %23 = inttoptr i64 %19 to i8* + store i8* %23, i8** %22, align 8 + %24 = getelementptr inbounds i8*, i8** %22, i64 1 + %25 = inttoptr i64 %21 to i8* + store i8* %25, i8** %24, align 8 + %26 = getelementptr inbounds i8*, i8** %22, i64 2 + %27 = bitcast i32** %dis1.i to i8* + store i8* %27, i8** %26, align 8 + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) #5 + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) #5 + call void @__kmpc_end_sharing_variables() #5 + %28 = load i32, i32* %.omp.iv.i, align 4, !noalias !13 + %29 = load i32, i32* %.omp.stride.i, align 4, !noalias !13 + %add.i = add nsw i32 %28, %29 + store i32 %add.i, i32* %.omp.iv.i, align 4, !noalias !13 + br label %omp.inner.for.cond.i + +__omp_outlined__.exit: ; preds = %omp.inner.for.cond.i + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %12) #5 + call void 
@__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i16 1) #5 + call void @__kmpc_begin_sharing_variables(i8*** %shared_arg_refs4.i, i64 1) #5 + %30 = load i8**, i8*** %shared_arg_refs4.i, align 8, !noalias !13 + %31 = bitcast i32** %dis1.i to i8* + store i8* %31, i8** %30, align 8 + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) #5 + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) #5 + call void @__kmpc_end_sharing_variables() #5 + %32 = load i32*, i32** %team.addr.i, align 8, !noalias !13 + %call.i = call i32 @omp_get_team_num() #5 + %idxprom.i = sext i32 %call.i to i64 + %arrayidx.i = getelementptr inbounds i32, i32* %32, i64 %idxprom.i + %33 = load i32, i32* %arrayidx.i, align 4 + %add5.i = add nsw i32 %33, 1 + store i32 %add5.i, i32* %arrayidx.i, align 4 + call void @__kmpc_restore_team_static_memory(i16 0, i16 1) #5 + br label %.termination.notifier + +.termination.notifier: ; preds = %__omp_outlined__.exit + call void @__kmpc_kernel_deinit(i16 1) + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) + br label %.exit + +.exit: ; preds = %.termination.notifier, %.mastercheck, %.worker + ret void +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.warpsize() #2 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #2 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2 + +declare void @__kmpc_kernel_init(i32, i16) + +declare void @__kmpc_data_sharing_init_stack() + +declare void @__kmpc_get_team_static_memory(i16, i8*, i64, i16, i8**) + +declare void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) + +; Function Attrs: noinline norecurse nounwind optnone +define internal void @__omp_outlined__1(i32* noalias %.global_tid., i32* noalias %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., i32** dereferenceable(8) %dis) #1 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %.previous.lb..addr = alloca i64, align 8 + %.previous.ub..addr = alloca i64, align 8 + %dis.addr = alloca i32**, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i64 %.previous.lb., i64* %.previous.lb..addr, align 8 + store i64 %.previous.ub., i64* %.previous.ub..addr, align 8 + store i32** %dis, i32*** %dis.addr, align 8 + %0 = load i32**, i32*** %dis.addr, align 8 + store i32 0, i32* %.omp.lb, align 4 + store i32 9, i32* %.omp.ub, align 4 + %1 = load i64, i64* %.previous.lb..addr, align 8 + %conv = trunc i64 %1 to i32 + %2 = load i64, i64* %.previous.ub..addr, align 8 + %conv1 = trunc i64 %2 to i32 + store i32 %conv, i32* %.omp.lb, align 4 + store i32 %conv1, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %3 = load i32*, i32** %.global_tid..addr, align 8 + %4 = load i32, i32* %3, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %4, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %5 = load i32, i32* %.omp.lb, align 4 + store i32 %5, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + 
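+; Note: the blocks below form the per-thread portion of the first
+; `distribute parallel for`: __kmpc_for_static_init_4 hands each thread its
+; static chunk, and the `#pragma omp atomic` increment is lowered to the
+; monotonic `atomicrmw add` in %omp.inner.for.body.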
+omp.inner.for.cond: ; preds = %omp.inner.for.inc, %entry + %6 = load i32, i32* %.omp.iv, align 4 + %conv2 = sext i32 %6 to i64 + %7 = load i64, i64* %.previous.ub..addr, align 8 + %cmp = icmp ule i64 %conv2, %7 + br i1 %cmp, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %8 = load i32, i32* %.omp.iv, align 4 + %mul = mul nsw i32 %8, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %i, align 4 + %9 = load i32*, i32** %0, align 8 + %10 = load i32, i32* %i, align 4 + %idxprom = sext i32 %10 to i64 + %arrayidx = getelementptr inbounds i32, i32* %9, i64 %idxprom + %11 = atomicrmw add i32* %arrayidx, i32 1 monotonic + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %12 = load i32, i32* %.omp.iv, align 4 + %13 = load i32, i32* %.omp.stride, align 4 + %add4 = add nsw i32 %12, %13 + store i32 %add4, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %4) + ret void +} + +declare void @__kmpc_for_static_fini(%struct.ident_t*, i32) + +; Function Attrs: noinline norecurse nounwind +define internal void @__omp_outlined__1_wrapper(i16 zeroext, i32) #0 { +entry: + %.addr = alloca i16, align 2 + %.addr1 = alloca i32, align 4 + %.zero.addr = alloca i32, align 4 + %global_args = alloca i8**, align 8 + store i32 0, i32* %.zero.addr, align 4 + store i16 %0, i16* %.addr, align 2 + store i32 %1, i32* %.addr1, align 4 + call void @__kmpc_get_shared_variables(i8*** %global_args) + %2 = load i8**, i8*** %global_args, align 8 + %3 = getelementptr inbounds i8*, i8** %2, i64 0 + %4 = bitcast i8** %3 to i64* + %5 = load i64, i64* %4, align 8 + %6 = getelementptr inbounds i8*, i8** %2, i64 1 + %7 = bitcast i8** %6 to i64* + %8 = load i64, i64* %7, align 8 + %9 = getelementptr inbounds i8*, i8** %2, i64 2 + %10 = bitcast i8** %9 to i32*** + %11 = load i32**, i32*** %10, align 8 + call void @__omp_outlined__1(i32* %.addr1, i32* %.zero.addr, i64 %5, i64 %8, i32** %11) #5 + ret void +} + +declare void @__kmpc_get_shared_variables(i8***) + +declare void @__kmpc_kernel_prepare_parallel(i8*, i16) + +declare void @__kmpc_begin_sharing_variables(i8***, i64) + +; Function Attrs: convergent +declare void @__kmpc_barrier_simple_spmd(%struct.ident_t*, i32) #3 + +declare void @__kmpc_end_sharing_variables() + +; Function Attrs: noinline norecurse nounwind optnone +define internal void @__omp_outlined__2(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32** dereferenceable(8) %dis) #1 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %dis.addr = alloca i32**, align 8 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i32** %dis, i32*** %dis.addr, align 8 + %0 = load i32**, i32*** %dis.addr, align 8 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %1 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %1, 10 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %2 = load i32*, i32** %0, align 8 + %3 = load i32, i32* %i, align 4 + %idxprom = sext i32 %3 to i64 + %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom + %4 = atomicrmw add 
i32* %arrayidx, i32 1 monotonic + br label %for.inc + +for.inc: ; preds = %for.body + %5 = load i32, i32* %i, align 4 + %inc = add nsw i32 %5, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: noinline norecurse nounwind +define internal void @__omp_outlined__2_wrapper(i16 zeroext, i32) #0 { +entry: + %.addr = alloca i16, align 2 + %.addr1 = alloca i32, align 4 + %.zero.addr = alloca i32, align 4 + %global_args = alloca i8**, align 8 + store i32 0, i32* %.zero.addr, align 4 + store i16 %0, i16* %.addr, align 2 + store i32 %1, i32* %.addr1, align 4 + call void @__kmpc_get_shared_variables(i8*** %global_args) + %2 = load i8**, i8*** %global_args, align 8 + %3 = getelementptr inbounds i8*, i8** %2, i64 0 + %4 = bitcast i8** %3 to i32*** + %5 = load i32**, i32*** %4, align 8 + call void @__omp_outlined__2(i32* %.addr1, i32* %.zero.addr, i32** %5) #5 + ret void +} + +declare i32 @omp_get_team_num() #4 + +declare void @__kmpc_restore_team_static_memory(i16, i16) + +declare i32 @__kmpc_global_thread_num(%struct.ident_t*) + +declare void @__kmpc_kernel_deinit(i16) + +declare i1 @__kmpc_kernel_parallel(i8**, i16) + +declare void @__kmpc_kernel_end_parallel() + +attributes #0 = { noinline norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx61,+sm_70" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { noinline norecurse nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx61,+sm_70" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind readnone } +attributes #3 = { convergent } +attributes #4 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_70" "target-features"="+ptx61,+sm_70" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #5 = { nounwind } + +!omp_offload.info = !{!0} +!nvvm.annotations = !{!1, !2, !3, !2, !4, !4, !4, !4, !5, !5, !4} +!llvm.module.flags = !{!6, !7} +!llvm.ident = !{!8} +!nvvm.internalize.after.link = !{} +!nvvmir.version = !{!9} + +!0 = !{i32 0, i32 43, i32 21153163, !"foo", i32 10, i32 0} +!1 = !{void (i32*, i32*)* @__omp_offloading_2b_142c58b_foo_l10, !"kernel", i32 1} +!2 = !{null, !"align", i32 8} +!3 = !{null, !"align", i32 8, !"align", i32 65544, !"align", i32 131080} +!4 = !{null, !"align", i32 16} +!5 = !{null, !"align", i32 16, !"align", i32 65552, !"align", i32 131088} +!6 = !{i32 1, !"wchar_size", i32 4} +!7 = !{i32 7, !"PIC Level", i32 2} +!8 = !{!"clang version 9.0.0 "} +!9 = !{i32 1, i32 2} +!10 = !{i32 
32, i32 33} +!11 = !{i32 1, i32 1025} +!12 = !{i32 0, i32 1024} +!13 = !{!14, !16} +!14 = distinct !{!14, !15, !"__omp_outlined__: %.global_tid."} +!15 = distinct !{!15, !"__omp_outlined__"} +!16 = distinct !{!16, !15, !"__omp_outlined__: %.bound_tid."} + +; __CLANG_OFFLOAD_BUNDLE____END__ openmp-nvptx64-nvida-cuda + +; __CLANG_OFFLOAD_BUNDLE____START__ host-x86_64-unknown-linux-gnu +; ModuleID = '/tmp/jdoerfert/target_offload_not_SPMD-16350b.bc' +source_filename = "/home/jdoerfert/projects/llvm-project/llvm/test/Transforms/OpenMP/target_offload_not_SPMD.c" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } +%struct.__tgt_offload_entry = type { i8*, i8*, i64, i32, i32 } +%struct.__tgt_device_image = type { i8*, i8*, %struct.__tgt_offload_entry*, %struct.__tgt_offload_entry* } +%struct.__tgt_bin_desc = type { i32, %struct.__tgt_device_image*, %struct.__tgt_offload_entry*, %struct.__tgt_offload_entry* } + +$.omp_offloading.descriptor_reg.nvptx64-nvida-cuda = comdat any + +@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr global %struct.ident_t { i32 0, i32 2050, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@1 = private unnamed_addr global %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@2 = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@.__omp_offloading_2b_142c58b_foo_l10.region_id = weak constant i8 0 +@.offload_sizes = private unnamed_addr constant [2 x i64] [i64 40, i64 40] +@.offload_maptypes = private unnamed_addr constant [2 x i64] [i64 35, i64 35] +@.str.3 = private unnamed_addr constant [33 x i8] c"dis[%3i] = %4i\09\09team[%3i] = %4i\0A\00", align 1 +@.omp_offloading.entry_name = internal unnamed_addr constant [36 x i8] c"__omp_offloading_2b_142c58b_foo_l10\00" +@.omp_offloading.entry.__omp_offloading_2b_142c58b_foo_l10 = weak constant %struct.__tgt_offload_entry { i8* @.__omp_offloading_2b_142c58b_foo_l10.region_id, i8* getelementptr inbounds ([36 x i8], [36 x i8]* @.omp_offloading.entry_name, i32 0, i32 0), i64 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +@.omp_offloading.entries_begin = external constant %struct.__tgt_offload_entry +@.omp_offloading.entries_end = external constant %struct.__tgt_offload_entry +@.omp_offloading.img_start.nvptx64-nvida-cuda = extern_weak constant i8 +@.omp_offloading.img_end.nvptx64-nvida-cuda = extern_weak constant i8 +@.omp_offloading.device_images = internal unnamed_addr constant [1 x %struct.__tgt_device_image] [%struct.__tgt_device_image { i8* @.omp_offloading.img_start.nvptx64-nvida-cuda, i8* @.omp_offloading.img_end.nvptx64-nvida-cuda, %struct.__tgt_offload_entry* @.omp_offloading.entries_begin, %struct.__tgt_offload_entry* @.omp_offloading.entries_end }], comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cuda), align 8 +@.omp_offloading.descriptor = internal constant %struct.__tgt_bin_desc { i32 1, %struct.__tgt_device_image* getelementptr inbounds ([1 x %struct.__tgt_device_image], [1 x %struct.__tgt_device_image]* @.omp_offloading.device_images, i32 0, i32 0), %struct.__tgt_offload_entry* @.omp_offloading.entries_begin, %struct.__tgt_offload_entry* @.omp_offloading.entries_end }, 
comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cuda), align 8 +@__dso_handle = external hidden global i8 +@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 0, void ()* @.omp_offloading.descriptor_reg.nvptx64-nvida-cuda, i8* bitcast (void ()* @.omp_offloading.descriptor_reg.nvptx64-nvida-cuda to i8*) }] + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local void @foo(i32* %dis, i32* %team) #0 { +entry: + %dis.addr = alloca i32*, align 8 + %team.addr = alloca i32*, align 8 + %.offload_baseptrs = alloca [2 x i8*], align 8 + %.offload_ptrs = alloca [2 x i8*], align 8 + store i32* %dis, i32** %dis.addr, align 8 + store i32* %team, i32** %team.addr, align 8 + %0 = load i32*, i32** %dis.addr, align 8 + %1 = load i32*, i32** %team.addr, align 8 + %2 = load i32*, i32** %dis.addr, align 8 + %3 = load i32*, i32** %dis.addr, align 8 + %arrayidx = getelementptr inbounds i32, i32* %3, i64 0 + %4 = load i32*, i32** %team.addr, align 8 + %5 = load i32*, i32** %team.addr, align 8 + %arrayidx1 = getelementptr inbounds i32, i32* %5, i64 0 + %6 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i32 0, i32 0 + %7 = bitcast i8** %6 to i32** + store i32* %2, i32** %7, align 8 + %8 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i32 0, i32 0 + %9 = bitcast i8** %8 to i32** + store i32* %arrayidx, i32** %9, align 8 + %10 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i32 0, i32 1 + %11 = bitcast i8** %10 to i32** + store i32* %4, i32** %11, align 8 + %12 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i32 0, i32 1 + %13 = bitcast i8** %12 to i32** + store i32* %arrayidx1, i32** %13, align 8 + %14 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_baseptrs, i32 0, i32 0 + %15 = getelementptr inbounds [2 x i8*], [2 x i8*]* %.offload_ptrs, i32 0, i32 0 + %16 = call i32 @__tgt_target_teams(i64 -1, i8* @.__omp_offloading_2b_142c58b_foo_l10.region_id, i32 2, i8** %14, i8** %15, i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_sizes, i32 0, i32 0), i64* getelementptr inbounds ([2 x i64], [2 x i64]* @.offload_maptypes, i32 0, i32 0), i32 3, i32 0) + %17 = icmp ne i32 %16, 0 + br i1 %17, label %omp_offload.failed, label %omp_offload.cont + +omp_offload.failed: ; preds = %entry + call void @__omp_offloading_2b_142c58b_foo_l10(i32* %0, i32* %1) #4 + br label %omp_offload.cont + +omp_offload.cont: ; preds = %omp_offload.failed, %entry + ret void +} + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define internal void @__omp_offloading_2b_142c58b_foo_l10(i32* %dis, i32* %team) #1 { +entry: + %dis.addr = alloca i32*, align 8 + %team.addr = alloca i32*, align 8 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + store i32* %dis, i32** %dis.addr, align 8 + store i32* %team, i32** %team.addr, align 8 + %1 = call i32 @__kmpc_push_num_teams(%struct.ident_t* @2, i32 %0, i32 3, i32 0) + %2 = load i32*, i32** %dis.addr, align 8 + %3 = load i32*, i32** %team.addr, align 8 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%struct.ident_t* @2, i32 2, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*, i32*)* @.omp_outlined. 
to void (i32*, i32*, ...)*), i32* %2, i32* %3) + ret void +} + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define internal void @.omp_outlined.(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* %dis, i32* %team) #1 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %dis.addr = alloca i32*, align 8 + %team.addr = alloca i32*, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.comb.lb = alloca i32, align 4 + %.omp.comb.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i32* %dis, i32** %dis.addr, align 8 + store i32* %team, i32** %team.addr, align 8 + store i32 0, i32* %.omp.comb.lb, align 4 + store i32 9, i32* %.omp.comb.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %0 = load i32*, i32** %.global_tid..addr, align 8 + %1 = load i32, i32* %0, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %1, i32 92, i32* %.omp.is_last, i32* %.omp.comb.lb, i32* %.omp.comb.ub, i32* %.omp.stride, i32 1, i32 1) + %2 = load i32, i32* %.omp.comb.ub, align 4 + %cmp = icmp sgt i32 %2, 9 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + br label %cond.end + +cond.false: ; preds = %entry + %3 = load i32, i32* %.omp.comb.ub, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 9, %cond.true ], [ %3, %cond.false ] + store i32 %cond, i32* %.omp.comb.ub, align 4 + %4 = load i32, i32* %.omp.comb.lb, align 4 + store i32 %4, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %cond.end + %5 = load i32, i32* %.omp.iv, align 4 + %6 = load i32, i32* %.omp.comb.ub, align 4 + %cmp1 = icmp sle i32 %5, %6 + br i1 %cmp1, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %7 = load i32, i32* %.omp.comb.lb, align 4 + %8 = zext i32 %7 to i64 + %9 = load i32, i32* %.omp.comb.ub, align 4 + %10 = zext i32 %9 to i64 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* @2, i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64, i64, i32**)* @.omp_outlined..1 to void (i32*, i32*, ...)*), i64 %8, i64 %10, i32** %dis.addr) + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.inner.for.body + %11 = load i32, i32* %.omp.iv, align 4 + %12 = load i32, i32* %.omp.stride, align 4 + %add = add nsw i32 %11, %12 + store i32 %add, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %1) + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* @2, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32**)* @.omp_outlined..2 to void (i32*, i32*, ...)*), i32** %dis.addr) + %13 = load i32*, i32** %team.addr, align 8 + %call = call i32 @omp_get_team_num() + %idxprom = sext i32 %call to i64 + %arrayidx = getelementptr inbounds i32, i32* %13, i64 %idxprom + %14 = load i32, i32* %arrayidx, align 4 + %add2 = add nsw i32 %14, 1 + store i32 %add2, i32* %arrayidx, align 4 + ret void +} + +declare dso_local void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define internal void @.omp_outlined..1(i32* noalias %.global_tid., i32* noalias %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., i32** dereferenceable(8) %dis) #1 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %.previous.lb..addr = alloca i64, align 8 + %.previous.ub..addr = alloca i64, align 8 + %dis.addr = alloca i32**, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i64 %.previous.lb., i64* %.previous.lb..addr, align 8 + store i64 %.previous.ub., i64* %.previous.ub..addr, align 8 + store i32** %dis, i32*** %dis.addr, align 8 + %0 = load i32**, i32*** %dis.addr, align 8 + store i32 0, i32* %.omp.lb, align 4 + store i32 9, i32* %.omp.ub, align 4 + %1 = load i64, i64* %.previous.lb..addr, align 8 + %conv = trunc i64 %1 to i32 + %2 = load i64, i64* %.previous.ub..addr, align 8 + %conv1 = trunc i64 %2 to i32 + store i32 %conv, i32* %.omp.lb, align 4 + store i32 %conv1, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %3 = load i32*, i32** %.global_tid..addr, align 8 + %4 = load i32, i32* %3, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %4, i32 34, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %5 = load i32, i32* %.omp.ub, align 4 + %cmp = icmp sgt i32 %5, 9 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + br label %cond.end + +cond.false: ; preds = %entry + %6 = load i32, i32* %.omp.ub, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 9, %cond.true ], [ %6, %cond.false ] + store i32 %cond, i32* %.omp.ub, align 4 + %7 = load i32, i32* %.omp.lb, align 4 + store i32 %7, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %cond.end + %8 = load i32, i32* %.omp.iv, align 4 + %9 = load i32, i32* %.omp.ub, align 4 + %cmp3 = icmp sle i32 %8, %9 + br i1 %cmp3, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %10 = load i32, i32* %.omp.iv, align 4 + %mul = mul nsw i32 %10, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %i, align 4 + %11 = load i32*, i32** %0, align 8 + %12 = load i32, i32* %i, align 4 + %idxprom = sext i32 %12 to i64 + %arrayidx = getelementptr inbounds i32, i32* %11, i64 %idxprom + %13 = atomicrmw add i32* %arrayidx, i32 1 monotonic + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + 
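+; Note: this is the host fallback lowering of the same loop body (used when
+; offloading fails); the `omp atomic` update is again the monotonic
+; `atomicrmw add` in %omp.inner.for.body above.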
+omp.inner.for.inc: ; preds = %omp.body.continue + %14 = load i32, i32* %.omp.iv, align 4 + %add5 = add nsw i32 %14, 1 + store i32 %add5, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %4) + ret void +} + +declare dso_local void @__kmpc_for_static_fini(%struct.ident_t*, i32) + +declare !callback !3 dso_local void @__kmpc_fork_call(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define internal void @.omp_outlined..2(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32** dereferenceable(8) %dis) #1 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %dis.addr = alloca i32**, align 8 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i32** %dis, i32*** %dis.addr, align 8 + %0 = load i32**, i32*** %dis.addr, align 8 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %1 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %1, 10 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %2 = load i32*, i32** %0, align 8 + %3 = load i32, i32* %i, align 4 + %idxprom = sext i32 %3 to i64 + %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom + %4 = atomicrmw add i32* %arrayidx, i32 1 monotonic + br label %for.inc + +for.inc: ; preds = %for.body + %5 = load i32, i32* %i, align 4 + %inc = add nsw i32 %5, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + ret void +} + +declare dso_local i32 @omp_get_team_num() #2 + +declare dso_local i32 @__kmpc_global_thread_num(%struct.ident_t*) + +declare dso_local i32 @__kmpc_push_num_teams(%struct.ident_t*, i32, i32, i32) + +declare !callback !3 dso_local void @__kmpc_fork_teams(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
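+; Note: the `!callback !3` annotations on __kmpc_fork_call and
+; __kmpc_fork_teams (with !4 = {i64 2, i64 -1, i64 -1, i1 true}) describe the
+; outlined-function operand as a callback the runtime invokes with the
+; trailing variadic arguments, letting interprocedural passes see through
+; these runtime calls.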
+ +declare dso_local i32 @__tgt_target_teams(i64, i8*, i32, i8**, i8**, i64*, i64*, i32, i32) + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local i32 @main() #0 { +entry: + %retval = alloca i32, align 4 + %dis = alloca [10 x i32], align 16 + %team = alloca [10 x i32], align 16 + %i = alloca i32, align 4 + %i4 = alloca i32, align 4 + store i32 0, i32* %retval, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %0, 10 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load i32, i32* %i, align 4 + %2 = load i32, i32* %i, align 4 + %idxprom = sext i32 %2 to i64 + %arrayidx = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 %idxprom + store i32 %1, i32* %arrayidx, align 4 + %3 = load i32, i32* %i, align 4 + %idxprom1 = sext i32 %3 to i64 + %arrayidx2 = getelementptr inbounds [10 x i32], [10 x i32]* %team, i64 0, i64 %idxprom1 + store i32 0, i32* %arrayidx2, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %4 = load i32, i32* %i, align 4 + %inc = add nsw i32 %4, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %arraydecay = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i32 0, i32 0 + %arraydecay3 = getelementptr inbounds [10 x i32], [10 x i32]* %team, i32 0, i32 0 + call void @foo(i32* %arraydecay, i32* %arraydecay3) + store i32 0, i32* %i4, align 4 + br label %for.cond5 + +for.cond5: ; preds = %for.inc12, %for.end + %5 = load i32, i32* %i4, align 4 + %cmp6 = icmp slt i32 %5, 10 + br i1 %cmp6, label %for.body7, label %for.end14 + +for.body7: ; preds = %for.cond5 + %6 = load i32, i32* %i4, align 4 + %7 = load i32, i32* %i4, align 4 + %idxprom8 = sext i32 %7 to i64 + %arrayidx9 = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 %idxprom8 + %8 = load i32, i32* %arrayidx9, align 4 + %9 = load i32, i32* %i4, align 4 + %10 = load i32, i32* %i4, align 4 + %idxprom10 = sext i32 %10 to i64 + %arrayidx11 = getelementptr inbounds [10 x i32], [10 x i32]* %team, i64 0, i64 %idxprom10 + %11 = load i32, i32* %arrayidx11, align 4 + %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([33 x i8], [33 x i8]* @.str.3, i32 0, i32 0), i32 %6, i32 %8, i32 %9, i32 %11) + br label %for.inc12 + +for.inc12: ; preds = %for.body7 + %12 = load i32, i32* %i4, align 4 + %inc13 = add nsw i32 %12, 1 + store i32 %inc13, i32* %i4, align 4 + br label %for.cond5 + +for.end14: ; preds = %for.cond5 + ret i32 0 +} + +declare dso_local i32 @printf(i8*, ...) 
#2 + +; Function Attrs: noinline nounwind uwtable +define internal void @.omp_offloading.descriptor_unreg(i8*) #3 section ".text.startup" comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cuda) { +entry: + %.addr = alloca i8*, align 8 + store i8* %0, i8** %.addr, align 8 + %1 = call i32 @__tgt_unregister_lib(%struct.__tgt_bin_desc* @.omp_offloading.descriptor) + ret void +} + +declare dso_local i32 @__tgt_unregister_lib(%struct.__tgt_bin_desc*) + +; Function Attrs: noinline nounwind uwtable +define linkonce hidden void @.omp_offloading.descriptor_reg.nvptx64-nvida-cuda() #3 section ".text.startup" comdat { +entry: + %0 = call i32 @__tgt_register_lib(%struct.__tgt_bin_desc* @.omp_offloading.descriptor) + %1 = call i32 @__cxa_atexit(void (i8*)* @.omp_offloading.descriptor_unreg, i8* bitcast (%struct.__tgt_bin_desc* @.omp_offloading.descriptor to i8*), i8* @__dso_handle) #4 + ret void +} + +declare dso_local i32 @__tgt_register_lib(%struct.__tgt_bin_desc*) + +; Function Attrs: nounwind +declare dso_local i32 @__cxa_atexit(void (i8*)*, i8*, i8*) #4 + +attributes #0 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { noinline norecurse nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { nounwind } + +!omp_offload.info = !{!0} +!llvm.module.flags = !{!1} +!llvm.ident = !{!2} + +!0 = !{i32 0, i32 43, i32 21153163, !"foo", i32 10, i32 0} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{!"clang version 9.0.0 "} +!3 = !{!4} +!4 = !{i64 2, i64 -1, i64 -1, i1 true} + +; __CLANG_OFFLOAD_BUNDLE____END__ host-x86_64-unknown-linux-gnu Index: 
SPMD_examples/v0.3/target_offload_is_SPMD.O3.new.ll =================================================================== --- /dev/null +++ SPMD_examples/v0.3/target_offload_is_SPMD.O3.new.ll @@ -0,0 +1,736 @@ + +; __CLANG_OFFLOAD_BUNDLE____START__ openmp-nvptx64-nvida-cud +; ModuleID = '../SPMD_examples/v0.3/target_offload_is_SPMD.c' +source_filename = "../SPMD_examples/v0.3/target_offload_is_SPMD.c" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvida-cud" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } +%omp.shared.struct = type { i64, i64, i32* } +%omp.shared.struct.0 = type { i64, i64, i32* } + +@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2050, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@2 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@__omp_offloading_18_2852ec2_foo_l10_exec_mode = weak constant i8 0 +@llvm.compiler.used = appending global [1 x i8*] [i8* @__omp_offloading_18_2852ec2_foo_l10_exec_mode], section "llvm.metadata" + +; Function Attrs: norecurse nounwind +define weak void @__omp_offloading_18_2852ec2_foo_l10(i32* %dis) local_unnamed_addr #0 { +entry: + %.omp.comb.lb.i = alloca i32, align 4 + %.omp.comb.ub.i = alloca i32, align 4 + %.omp.stride.i = alloca i32, align 4 + %.omp.is_last.i = alloca i32, align 4 + %.captured.i = alloca %omp.shared.struct, align 8 + %.omp.comb.lb4.i = alloca i32, align 4 + %.omp.comb.ub5.i = alloca i32, align 4 + %.omp.stride6.i = alloca i32, align 4 + %.omp.is_last7.i = alloca i32, align 4 + %.captured19.i = alloca %omp.shared.struct.0, align 8 + %0 = tail call i16 @__kmpc_generic_kernel_init(i16 1, i16 1, i16 1, i16 0) #2 + %1 = icmp eq i16 %0, 1 + br i1 %1, label %.execute, label %.exit + +.execute: ; preds = %entry + %2 = tail call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @2) #2 + %3 = bitcast %omp.shared.struct* %.captured.i to i8* + call void @llvm.lifetime.start.p0i8(i64 24, i8* nonnull %3) + %4 = bitcast %omp.shared.struct.0* %.captured19.i to i8* + call void @llvm.lifetime.start.p0i8(i64 24, i8* nonnull %4) + %5 = bitcast i32* %.omp.comb.lb.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %5) #2 + store i32 0, i32* %.omp.comb.lb.i, align 4, !tbaa !5 + %6 = bitcast i32* %.omp.comb.ub.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %6) #2 + store i32 9, i32* %.omp.comb.ub.i, align 4, !tbaa !5 + %7 = bitcast i32* %.omp.stride.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %7) #2 + store i32 1, i32* %.omp.stride.i, align 4, !tbaa !5 + %8 = bitcast i32* %.omp.is_last.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %8) #2 + store i32 0, i32* %.omp.is_last.i, align 4, !tbaa !5 + call void @__kmpc_for_static_init_4(%struct.ident_t* nonnull @0, i32 %2, i32 92, i32* nonnull %.omp.is_last.i, i32* nonnull %.omp.comb.lb.i, i32* nonnull %.omp.comb.ub.i, i32* nonnull %.omp.stride.i, i32 1, i32 1) #2 + %9 = load i32, i32* %.omp.comb.ub.i, align 4, !tbaa !5 + %10 = icmp slt i32 %9, 9 + %cond.i = select i1 %10, i32 %9, i32 9 + store i32 %cond.i, i32* %.omp.comb.ub.i, align 4, !tbaa !5 + %11 
= load i32, i32* %.omp.comb.lb.i, align 4, !tbaa !5 + %cmp13.i = icmp sgt i32 %11, %cond.i + br i1 %cmp13.i, label %omp.loop.exit.i, label %omp.inner.for.body.lr.ph.i + +omp.inner.for.body.lr.ph.i: ; preds = %.execute + %12 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i64 0, i32 0 + %13 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i64 0, i32 1 + %14 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i64 0, i32 2 + %15 = zext i32 %11 to i64 + %16 = zext i32 %cond.i to i64 + store i64 %15, i64* %12, align 8 + store i64 %16, i64* %13, align 8 + store i32* %dis, i32** %14, align 8 + call fastcc void @__omp_outlined__1_wrapper(i8* %3) + %17 = load i32, i32* %.omp.stride.i, align 4, !tbaa !5 + %add.i4 = add nsw i32 %17, %11 + %18 = load i32, i32* %.omp.comb.ub.i, align 4, !tbaa !5 + %cmp1.i5 = icmp sgt i32 %add.i4, %18 + br i1 %cmp1.i5, label %omp.loop.exit.i, label %omp.inner.for.body.omp.inner.for.body_crit_edge.i + +omp.inner.for.body.omp.inner.for.body_crit_edge.i: ; preds = %omp.inner.for.body.lr.ph.i, %omp.inner.for.body.omp.inner.for.body_crit_edge.i + %19 = phi i32 [ %23, %omp.inner.for.body.omp.inner.for.body_crit_edge.i ], [ %18, %omp.inner.for.body.lr.ph.i ] + %add.i6 = phi i32 [ %add.i, %omp.inner.for.body.omp.inner.for.body_crit_edge.i ], [ %add.i4, %omp.inner.for.body.lr.ph.i ] + %.pre.i = load i32, i32* %.omp.comb.lb.i, align 4 + %20 = zext i32 %.pre.i to i64 + %21 = zext i32 %19 to i64 + store i64 %20, i64* %12, align 8 + store i64 %21, i64* %13, align 8 + store i32* %dis, i32** %14, align 8 + call fastcc void @__omp_outlined__1_wrapper(i8* %3) + %22 = load i32, i32* %.omp.stride.i, align 4, !tbaa !5 + %add.i = add nsw i32 %22, %add.i6 + %23 = load i32, i32* %.omp.comb.ub.i, align 4, !tbaa !5 + %cmp1.i = icmp sgt i32 %add.i, %23 + br i1 %cmp1.i, label %omp.loop.exit.i, label %omp.inner.for.body.omp.inner.for.body_crit_edge.i + +omp.loop.exit.i: ; preds = %omp.inner.for.body.omp.inner.for.body_crit_edge.i, %omp.inner.for.body.lr.ph.i, %.execute + call void @__kmpc_for_static_fini(%struct.ident_t* nonnull @0, i32 %2) #2 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %8) #2 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %7) #2 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %6) #2 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %5) #2 + %24 = bitcast i32* %.omp.comb.lb4.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %24) #2 + store i32 0, i32* %.omp.comb.lb4.i, align 4, !tbaa !5 + %25 = bitcast i32* %.omp.comb.ub5.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %25) #2 + store i32 9, i32* %.omp.comb.ub5.i, align 4, !tbaa !5 + %26 = bitcast i32* %.omp.stride6.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %26) #2 + store i32 1, i32* %.omp.stride6.i, align 4, !tbaa !5 + %27 = bitcast i32* %.omp.is_last7.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %27) #2 + store i32 0, i32* %.omp.is_last7.i, align 4, !tbaa !5 + call void @__kmpc_for_static_init_4(%struct.ident_t* nonnull @0, i32 %2, i32 92, i32* nonnull %.omp.is_last7.i, i32* nonnull %.omp.comb.lb4.i, i32* nonnull %.omp.comb.ub5.i, i32* nonnull %.omp.stride6.i, i32 1, i32 1) #2 + %28 = load i32, i32* %.omp.comb.ub5.i, align 4, !tbaa !5 + %29 = icmp slt i32 %28, 9 + %cond13.i = select i1 %29, i32 %28, i32 9 + store i32 %cond13.i, i32* %.omp.comb.ub5.i, align 4, !tbaa !5 + %30 = load i32, i32* %.omp.comb.lb4.i, align 4, !tbaa !5 + 
%cmp151.i = icmp sgt i32 %30, %cond13.i + br i1 %cmp151.i, label %__omp_outlined__.exit, label %omp.inner.for.body17.lr.ph.i + +omp.inner.for.body17.lr.ph.i: ; preds = %omp.loop.exit.i + %31 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %.captured19.i, i64 0, i32 0 + %32 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %.captured19.i, i64 0, i32 1 + %33 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %.captured19.i, i64 0, i32 2 + %34 = zext i32 %30 to i64 + %35 = zext i32 %cond13.i to i64 + store i64 %34, i64* %31, align 8 + store i64 %35, i64* %32, align 8 + store i32* %dis, i32** %33, align 8 + call fastcc void @__omp_outlined__2_wrapper(i8* %4) + %36 = load i32, i32* %.omp.stride6.i, align 4, !tbaa !5 + %add21.i1 = add nsw i32 %36, %30 + %37 = load i32, i32* %.omp.comb.ub5.i, align 4, !tbaa !5 + %cmp15.i2 = icmp sgt i32 %add21.i1, %37 + br i1 %cmp15.i2, label %__omp_outlined__.exit, label %omp.inner.for.body17.omp.inner.for.body17_crit_edge.i + +omp.inner.for.body17.omp.inner.for.body17_crit_edge.i: ; preds = %omp.inner.for.body17.lr.ph.i, %omp.inner.for.body17.omp.inner.for.body17_crit_edge.i + %38 = phi i32 [ %42, %omp.inner.for.body17.omp.inner.for.body17_crit_edge.i ], [ %37, %omp.inner.for.body17.lr.ph.i ] + %add21.i3 = phi i32 [ %add21.i, %omp.inner.for.body17.omp.inner.for.body17_crit_edge.i ], [ %add21.i1, %omp.inner.for.body17.lr.ph.i ] + %.pre5.i = load i32, i32* %.omp.comb.lb4.i, align 4 + %39 = zext i32 %.pre5.i to i64 + %40 = zext i32 %38 to i64 + store i64 %39, i64* %31, align 8 + store i64 %40, i64* %32, align 8 + store i32* %dis, i32** %33, align 8 + call fastcc void @__omp_outlined__2_wrapper(i8* %4) + %41 = load i32, i32* %.omp.stride6.i, align 4, !tbaa !5 + %add21.i = add nsw i32 %41, %add21.i3 + %42 = load i32, i32* %.omp.comb.ub5.i, align 4, !tbaa !5 + %cmp15.i = icmp sgt i32 %add21.i, %42 + br i1 %cmp15.i, label %__omp_outlined__.exit, label %omp.inner.for.body17.omp.inner.for.body17_crit_edge.i + +__omp_outlined__.exit: ; preds = %omp.inner.for.body17.omp.inner.for.body17_crit_edge.i, %omp.inner.for.body17.lr.ph.i, %omp.loop.exit.i + call void @__kmpc_for_static_fini(%struct.ident_t* nonnull @0, i32 %2) #2 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %27) #2 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %26) #2 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %25) #2 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %24) #2 + call void @llvm.lifetime.end.p0i8(i64 24, i8* nonnull %3) + call void @llvm.lifetime.end.p0i8(i64 24, i8* nonnull %4) + call void @__kmpc_generic_kernel_deinit(i16 1, i16 1) #2 + br label %.exit + +.exit: ; preds = %__omp_outlined__.exit, %entry + ret void +} + +declare i16 @__kmpc_generic_kernel_init(i16, i16, i16, i16) local_unnamed_addr + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1 + +declare void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) local_unnamed_addr + +declare void @__kmpc_for_static_fini(%struct.ident_t*, i32) local_unnamed_addr + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1 + +; Function Attrs: norecurse nounwind +define internal fastcc void @__omp_outlined__1_wrapper(i8* nocapture readonly %payload) unnamed_addr #0 { +entry: + %.omp.lb.i = alloca i32, align 4 + %.omp.ub.i = alloca i32, align 4 + %.omp.stride.i = alloca i32, align 4 + %.omp.is_last.i = alloca i32, align 4 + %0 = 
tail call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @2) #2 + %1 = bitcast i8* %payload to i64* + %2 = load i64, i64* %1, align 1 + %3 = getelementptr inbounds i8, i8* %payload, i64 8 + %4 = bitcast i8* %3 to i64* + %5 = load i64, i64* %4, align 1 + %6 = getelementptr inbounds i8, i8* %payload, i64 16 + %7 = bitcast i8* %6 to i32** + %8 = load i32*, i32** %7, align 1 + %9 = bitcast i32* %.omp.lb.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %9) #2 + %10 = bitcast i32* %.omp.ub.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %10) #2 + %conv.i = trunc i64 %2 to i32 + %conv1.i = trunc i64 %5 to i32 + store i32 %conv.i, i32* %.omp.lb.i, align 4, !tbaa !5 + store i32 %conv1.i, i32* %.omp.ub.i, align 4, !tbaa !5 + %11 = bitcast i32* %.omp.stride.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %11) #2 + store i32 1, i32* %.omp.stride.i, align 4, !tbaa !5 + %12 = bitcast i32* %.omp.is_last.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %12) #2 + store i32 0, i32* %.omp.is_last.i, align 4, !tbaa !5 + call void @__kmpc_for_static_init_4(%struct.ident_t* nonnull @1, i32 %0, i32 33, i32* nonnull %.omp.is_last.i, i32* nonnull %.omp.lb.i, i32* nonnull %.omp.ub.i, i32* nonnull %.omp.stride.i, i32 1, i32 1) #2 + %13 = load i32, i32* %.omp.lb.i, align 4, !tbaa !5 + %conv21.i = sext i32 %13 to i64 + %cmp2.i = icmp ult i64 %5, %conv21.i + br i1 %cmp2.i, label %__omp_outlined__1.exit, label %omp.inner.for.body.lr.ph.i + +omp.inner.for.body.lr.ph.i: ; preds = %entry + %14 = load i32, i32* %.omp.stride.i, align 4, !tbaa !5 + %15 = sext i32 %14 to i64 + br label %omp.inner.for.body.i + +omp.inner.for.body.i: ; preds = %omp.inner.for.body.i, %omp.inner.for.body.lr.ph.i + %indvars.iv.i = phi i64 [ %conv21.i, %omp.inner.for.body.lr.ph.i ], [ %indvars.iv.next.i, %omp.inner.for.body.i ] + %arrayidx.i = getelementptr inbounds i32, i32* %8, i64 %indvars.iv.i + %16 = atomicrmw add i32* %arrayidx.i, i32 1 monotonic + %indvars.iv.next.i = add i64 %indvars.iv.i, %15 + %cmp.i = icmp ugt i64 %indvars.iv.next.i, %5 + br i1 %cmp.i, label %__omp_outlined__1.exit, label %omp.inner.for.body.i + +__omp_outlined__1.exit: ; preds = %omp.inner.for.body.i, %entry + call void @__kmpc_for_static_fini(%struct.ident_t* nonnull @0, i32 %0) #2 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %12) #2 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %11) #2 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %10) #2 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %9) #2 + ret void +} + +declare i32 @__kmpc_global_thread_num(%struct.ident_t*) local_unnamed_addr + +; Function Attrs: norecurse nounwind +define internal fastcc void @__omp_outlined__2_wrapper(i8* nocapture readonly %payload) unnamed_addr #0 { +entry: + %.omp.lb.i = alloca i32, align 4 + %.omp.ub.i = alloca i32, align 4 + %.omp.stride.i = alloca i32, align 4 + %.omp.is_last.i = alloca i32, align 4 + %0 = tail call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @2) #2 + %1 = bitcast i8* %payload to i64* + %2 = load i64, i64* %1, align 1 + %3 = getelementptr inbounds i8, i8* %payload, i64 8 + %4 = bitcast i8* %3 to i64* + %5 = load i64, i64* %4, align 1 + %6 = getelementptr inbounds i8, i8* %payload, i64 16 + %7 = bitcast i8* %6 to i32** + %8 = load i32*, i32** %7, align 1 + %9 = bitcast i32* %.omp.lb.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %9) #2 + %10 = bitcast i32* %.omp.ub.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull 
%10) #2 + %conv.i = trunc i64 %2 to i32 + %conv1.i = trunc i64 %5 to i32 + store i32 %conv.i, i32* %.omp.lb.i, align 4, !tbaa !5 + store i32 %conv1.i, i32* %.omp.ub.i, align 4, !tbaa !5 + %11 = bitcast i32* %.omp.stride.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %11) #2 + store i32 1, i32* %.omp.stride.i, align 4, !tbaa !5 + %12 = bitcast i32* %.omp.is_last.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %12) #2 + store i32 0, i32* %.omp.is_last.i, align 4, !tbaa !5 + call void @__kmpc_for_static_init_4(%struct.ident_t* nonnull @1, i32 %0, i32 33, i32* nonnull %.omp.is_last.i, i32* nonnull %.omp.lb.i, i32* nonnull %.omp.ub.i, i32* nonnull %.omp.stride.i, i32 1, i32 1) #2 + %13 = load i32, i32* %.omp.lb.i, align 4, !tbaa !5 + %conv21.i = sext i32 %13 to i64 + %cmp2.i = icmp ult i64 %5, %conv21.i + br i1 %cmp2.i, label %__omp_outlined__2.exit, label %omp.inner.for.body.lr.ph.i + +omp.inner.for.body.lr.ph.i: ; preds = %entry + %14 = load i32, i32* %.omp.stride.i, align 4, !tbaa !5 + %15 = sext i32 %14 to i64 + br label %omp.inner.for.body.i + +omp.inner.for.body.i: ; preds = %omp.inner.for.body.i, %omp.inner.for.body.lr.ph.i + %indvars.iv.i = phi i64 [ %conv21.i, %omp.inner.for.body.lr.ph.i ], [ %indvars.iv.next.i, %omp.inner.for.body.i ] + %arrayidx.i = getelementptr inbounds i32, i32* %8, i64 %indvars.iv.i + %16 = atomicrmw add i32* %arrayidx.i, i32 1 monotonic + %indvars.iv.next.i = add i64 %indvars.iv.i, %15 + %cmp.i = icmp ugt i64 %indvars.iv.next.i, %5 + br i1 %cmp.i, label %__omp_outlined__2.exit, label %omp.inner.for.body.i + +__omp_outlined__2.exit: ; preds = %omp.inner.for.body.i, %entry + call void @__kmpc_for_static_fini(%struct.ident_t* nonnull @0, i32 %0) #2 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %12) #2 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %11) #2 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %10) #2 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %9) #2 + ret void +} + +declare void @__kmpc_generic_kernel_deinit(i16, i16) local_unnamed_addr + +attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind } +attributes #2 = { nounwind } + +!omp_offload.info = !{!0} +!nvvm.annotations = !{!1} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = !{i32 0, i32 24, i32 42282690, !"foo", i32 10, i32 0} +!1 = !{void (i32*)* @__omp_offloading_18_2852ec2_foo_l10, !"kernel", i32 1} +!2 = !{i32 1, !"wchar_size", i32 4} +!3 = !{i32 7, !"PIC Level", i32 2} +!4 = !{!"clang version 9.0.0 "} +!5 = !{!6, !6, i64 0} +!6 = !{!"int", !7, i64 0} +!7 = !{!"omnipotent char", !8, i64 0} +!8 = !{!"Simple C/C++ TBAA"} + +; __CLANG_OFFLOAD_BUNDLE____END__ openmp-nvptx64-nvida-cud + +; __CLANG_OFFLOAD_BUNDLE____START__ host-x86_64-unknown-linux-gnu +; ModuleID = '/tmp/johannes/target_offload_is_SPMD-241dc4.bc' +source_filename = "../SPMD_examples/v0.3/target_offload_is_SPMD.c" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.ident_t = type { i32, i32, i32, 
i32, i8* } +%struct.__tgt_offload_entry = type { i8*, i8*, i64, i32, i32 } +%struct.__tgt_device_image = type { i8*, i8*, %struct.__tgt_offload_entry*, %struct.__tgt_offload_entry* } +%struct.__tgt_bin_desc = type { i32, %struct.__tgt_device_image*, %struct.__tgt_offload_entry*, %struct.__tgt_offload_entry* } + +$.omp_offloading.descriptor_reg.nvptx64-nvida-cud = comdat any + +@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr global %struct.ident_t { i32 0, i32 2050, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@1 = private unnamed_addr global %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@2 = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@.__omp_offloading_18_2852ec2_foo_l10.region_id = weak constant i8 0 +@.offload_sizes = private unnamed_addr constant [1 x i64] [i64 40] +@.offload_maptypes = private unnamed_addr constant [1 x i64] [i64 35] +@.str.3 = private unnamed_addr constant [16 x i8] c"dis[%3i] = %4i\0A\00", align 1 +@.omp_offloading.entry_name = internal unnamed_addr constant [36 x i8] c"__omp_offloading_18_2852ec2_foo_l10\00" +@.omp_offloading.entry.__omp_offloading_18_2852ec2_foo_l10 = weak local_unnamed_addr constant %struct.__tgt_offload_entry { i8* @.__omp_offloading_18_2852ec2_foo_l10.region_id, i8* getelementptr inbounds ([36 x i8], [36 x i8]* @.omp_offloading.entry_name, i32 0, i32 0), i64 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +@.omp_offloading.entries_begin = external constant %struct.__tgt_offload_entry +@.omp_offloading.entries_end = external constant %struct.__tgt_offload_entry +@.omp_offloading.img_start.nvptx64-nvida-cud = extern_weak constant i8 +@.omp_offloading.img_end.nvptx64-nvida-cud = extern_weak constant i8 +@.omp_offloading.device_images = internal unnamed_addr constant [1 x %struct.__tgt_device_image] [%struct.__tgt_device_image { i8* @.omp_offloading.img_start.nvptx64-nvida-cud, i8* @.omp_offloading.img_end.nvptx64-nvida-cud, %struct.__tgt_offload_entry* @.omp_offloading.entries_begin, %struct.__tgt_offload_entry* @.omp_offloading.entries_end }], comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cud), align 8 +@.omp_offloading.descriptor = internal constant %struct.__tgt_bin_desc { i32 1, %struct.__tgt_device_image* getelementptr inbounds ([1 x %struct.__tgt_device_image], [1 x %struct.__tgt_device_image]* @.omp_offloading.device_images, i32 0, i32 0), %struct.__tgt_offload_entry* @.omp_offloading.entries_begin, %struct.__tgt_offload_entry* @.omp_offloading.entries_end }, comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cud), align 8 +@__dso_handle = external hidden global i8 +@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 0, void ()* @.omp_offloading.descriptor_reg.nvptx64-nvida-cud, i8* bitcast (void ()* @.omp_offloading.descriptor_reg.nvptx64-nvida-cud to i8*) }] + +; Function Attrs: nounwind uwtable +define dso_local void @foo(i32* %dis) local_unnamed_addr #0 { +entry: + %.offload_baseptrs = alloca [1 x i8*], align 8 + %.offload_ptrs = alloca [1 x i8*], align 8 + %0 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs, i64 0, i64 0 + %1 = bitcast [1 x i8*]* %.offload_baseptrs to i32** + store i32* %dis, i32** %1, align 8 + %2 = getelementptr inbounds [1 
x i8*], [1 x i8*]* %.offload_ptrs, i64 0, i64 0 + %3 = bitcast [1 x i8*]* %.offload_ptrs to i32** + store i32* %dis, i32** %3, align 8 + %4 = call i32 @__tgt_target_teams(i64 -1, i8* nonnull @.__omp_offloading_18_2852ec2_foo_l10.region_id, i32 1, i8** nonnull %0, i8** nonnull %2, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0), i32 3, i32 0) #4 + %5 = icmp eq i32 %4, 0 + br i1 %5, label %omp_offload.cont, label %omp_offload.failed + +omp_offload.failed: ; preds = %entry + %6 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @2) #4 + %7 = call i32 @__kmpc_push_num_teams(%struct.ident_t* nonnull @2, i32 %6, i32 3, i32 0) #4 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%struct.ident_t* nonnull @2, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined. to void (i32*, i32*, ...)*), i32* %dis) #4 + br label %omp_offload.cont + +omp_offload.cont: ; preds = %entry, %omp_offload.failed + ret void +} + +; Function Attrs: norecurse nounwind uwtable +define internal void @.omp_outlined.(i32* noalias nocapture readonly %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* %dis) #1 { +entry: + %.omp.comb.lb = alloca i32, align 4 + %.omp.comb.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %.omp.comb.lb4 = alloca i32, align 4 + %.omp.comb.ub5 = alloca i32, align 4 + %.omp.stride6 = alloca i32, align 4 + %.omp.is_last7 = alloca i32, align 4 + %0 = bitcast i32* %.omp.comb.lb to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %0) #4 + store i32 0, i32* %.omp.comb.lb, align 4, !tbaa !3 + %1 = bitcast i32* %.omp.comb.ub to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %1) #4 + store i32 9, i32* %.omp.comb.ub, align 4, !tbaa !3 + %2 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %2) #4 + store i32 1, i32* %.omp.stride, align 4, !tbaa !3 + %3 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %3) #4 + store i32 0, i32* %.omp.is_last, align 4, !tbaa !3 + %4 = load i32, i32* %.global_tid., align 4, !tbaa !3 + call void @__kmpc_for_static_init_4(%struct.ident_t* nonnull @0, i32 %4, i32 92, i32* nonnull %.omp.is_last, i32* nonnull %.omp.comb.lb, i32* nonnull %.omp.comb.ub, i32* nonnull %.omp.stride, i32 1, i32 1) #4 + %5 = load i32, i32* %.omp.comb.ub, align 4, !tbaa !3 + %6 = icmp slt i32 %5, 9 + %cond = select i1 %6, i32 %5, i32 9 + store i32 %cond, i32* %.omp.comb.ub, align 4, !tbaa !3 + %7 = load i32, i32* %.omp.comb.lb, align 4, !tbaa !3 + %cmp132 = icmp sgt i32 %7, %cond + br i1 %cmp132, label %omp.loop.exit, label %omp.inner.for.body.preheader + +omp.inner.for.body.preheader: ; preds = %entry + %8 = zext i32 %7 to i64 + %9 = zext i32 %cond to i64 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* nonnull @2, i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64, i64, i32*)* @.omp_outlined..1 to void (i32*, i32*, ...)*), i64 %8, i64 %9, i32* %dis) #4 + %10 = load i32, i32* %.omp.stride, align 4, !tbaa !3 + %add38 = add nsw i32 %10, %7 + %11 = load i32, i32* %.omp.comb.ub, align 4, !tbaa !3 + %cmp139 = icmp sgt i32 %add38, %11 + br i1 %cmp139, label %omp.loop.exit, label %omp.inner.for.body.omp.inner.for.body_crit_edge + +omp.inner.for.body.omp.inner.for.body_crit_edge: ; preds = %omp.inner.for.body.preheader, %omp.inner.for.body.omp.inner.for.body_crit_edge + %12 = phi i32 [ %16, %omp.inner.for.body.omp.inner.for.body_crit_edge ], [ %11, %omp.inner.for.body.preheader ] + %add40 = phi i32 [ %add, %omp.inner.for.body.omp.inner.for.body_crit_edge ], [ %add38, %omp.inner.for.body.preheader ] + %.pre = load i32, i32* %.omp.comb.lb, align 4 + %13 = zext i32 %.pre to i64 + %14 = zext i32 %12 to i64 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @2, i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64, i64, i32*)* @.omp_outlined..1 to void (i32*, i32*, ...)*), i64 %13, i64 %14, i32* %dis) #4 + %15 = load i32, i32* %.omp.stride, align 4, !tbaa !3 + %add = add nsw i32 %15, %add40 + %16 = load i32, i32* %.omp.comb.ub, align 4, !tbaa !3 + %cmp1 = icmp sgt i32 %add, %16 + br i1 %cmp1, label %omp.loop.exit, label %omp.inner.for.body.omp.inner.for.body_crit_edge + +omp.loop.exit: ; preds = %omp.inner.for.body.omp.inner.for.body_crit_edge, %omp.inner.for.body.preheader, %entry + call void @__kmpc_for_static_fini(%struct.ident_t* nonnull @0, i32 %4) #4 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %3) #4 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %2) #4 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %1) #4 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %0) #4 + %17 = bitcast i32* %.omp.comb.lb4 to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %17) #4 + store i32 0, i32* %.omp.comb.lb4, align 4, !tbaa !3 + %18 = bitcast i32* %.omp.comb.ub5 to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %18) #4 + store i32 9, i32* %.omp.comb.ub5, align 4, !tbaa !3 + %19 = bitcast i32* %.omp.stride6 to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %19) #4 + store i32 1, i32* %.omp.stride6, align 4, !tbaa !3 + %20 = bitcast i32* %.omp.is_last7 to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %20) #4 + store i32 0, i32* %.omp.is_last7, align 4, !tbaa !3 + call void @__kmpc_for_static_init_4(%struct.ident_t* nonnull @0, i32 %4, i32 92, i32* nonnull %.omp.is_last7, i32* nonnull %.omp.comb.lb4, i32* nonnull %.omp.comb.ub5, i32* nonnull %.omp.stride6, i32 1, i32 1) #4 + %21 = load i32, i32* %.omp.comb.ub5, align 4, !tbaa !3 + %22 = icmp slt i32 %21, 9 + %cond13 = select i1 %22, i32 %21, i32 9 + store i32 %cond13, i32* %.omp.comb.ub5, align 4, !tbaa !3 + %23 = load i32, i32* %.omp.comb.lb4, align 4, !tbaa !3 + %cmp1530 = icmp sgt i32 %23, %cond13 + br i1 %cmp1530, label %omp.loop.exit21, label %omp.inner.for.body17.preheader + +omp.inner.for.body17.preheader: ; preds = %omp.loop.exit + %24 = zext i32 %23 to i64 + %25 = zext i32 %cond13 to i64 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* nonnull @2, i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64, i64, i32*)* @.omp_outlined..2 to void (i32*, i32*, ...)*), i64 %24, i64 %25, i32* %dis) #4 + %26 = load i32, i32* %.omp.stride6, align 4, !tbaa !3 + %add1935 = add nsw i32 %26, %23 + %27 = load i32, i32* %.omp.comb.ub5, align 4, !tbaa !3 + %cmp1536 = icmp sgt i32 %add1935, %27 + br i1 %cmp1536, label %omp.loop.exit21, label %omp.inner.for.body17.omp.inner.for.body17_crit_edge + +omp.inner.for.body17.omp.inner.for.body17_crit_edge: ; preds = %omp.inner.for.body17.preheader, %omp.inner.for.body17.omp.inner.for.body17_crit_edge + %28 = phi i32 [ %32, %omp.inner.for.body17.omp.inner.for.body17_crit_edge ], [ %27, %omp.inner.for.body17.preheader ] + %add1937 = phi i32 [ %add19, %omp.inner.for.body17.omp.inner.for.body17_crit_edge ], [ %add1935, %omp.inner.for.body17.preheader ] + %.pre34 = load i32, i32* %.omp.comb.lb4, align 4 + %29 = zext i32 %.pre34 to i64 + %30 = zext i32 %28 to i64 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @2, i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64, i64, i32*)* @.omp_outlined..2 to void (i32*, i32*, ...)*), i64 %29, i64 %30, i32* %dis) #4 + %31 = load i32, i32* %.omp.stride6, align 4, !tbaa !3 + %add19 = add nsw i32 %31, %add1937 + %32 = load i32, i32* %.omp.comb.ub5, align 4, !tbaa !3 + %cmp15 = icmp sgt i32 %add19, %32 + br i1 %cmp15, label %omp.loop.exit21, label %omp.inner.for.body17.omp.inner.for.body17_crit_edge + +omp.loop.exit21: ; preds = %omp.inner.for.body17.omp.inner.for.body17_crit_edge, %omp.inner.for.body17.preheader, %omp.loop.exit + call void @__kmpc_for_static_fini(%struct.ident_t* nonnull @0, i32 %4) #4 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %20) #4 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %19) #4 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %18) #4 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %17) #4 + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #2 + +declare dso_local void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) local_unnamed_addr + +; Function Attrs: norecurse nounwind uwtable +define internal void @.omp_outlined..1(i32* noalias nocapture readonly %.global_tid., i32* noalias nocapture readnone %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., i32* nocapture %dis) #1 { +entry: + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %0 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %0) #4 + %1 = bitcast i32* %.omp.ub to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %1) #4 + %conv = trunc i64 %.previous.lb. to i32 + %conv1 = trunc i64 %.previous.ub. 
to i32 + store i32 %conv, i32* %.omp.lb, align 4, !tbaa !3 + store i32 %conv1, i32* %.omp.ub, align 4, !tbaa !3 + %2 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %2) #4 + store i32 1, i32* %.omp.stride, align 4, !tbaa !3 + %3 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %3) #4 + store i32 0, i32* %.omp.is_last, align 4, !tbaa !3 + %4 = load i32, i32* %.global_tid., align 4, !tbaa !3 + call void @__kmpc_for_static_init_4(%struct.ident_t* nonnull @1, i32 %4, i32 34, i32* nonnull %.omp.is_last, i32* nonnull %.omp.lb, i32* nonnull %.omp.ub, i32* nonnull %.omp.stride, i32 1, i32 1) #4 + %5 = load i32, i32* %.omp.ub, align 4, !tbaa !3 + %6 = icmp slt i32 %5, 9 + %cond = select i1 %6, i32 %5, i32 9 + store i32 %cond, i32* %.omp.ub, align 4, !tbaa !3 + %7 = load i32, i32* %.omp.lb, align 4, !tbaa !3 + %cmp310 = icmp sgt i32 %7, %cond + br i1 %cmp310, label %omp.loop.exit, label %omp.inner.for.body.preheader + +omp.inner.for.body.preheader: ; preds = %entry + %8 = sext i32 %7 to i64 + %9 = sext i32 %cond to i64 + br label %omp.inner.for.body + +omp.inner.for.body: ; preds = %omp.inner.for.body.preheader, %omp.inner.for.body + %indvars.iv = phi i64 [ %8, %omp.inner.for.body.preheader ], [ %indvars.iv.next, %omp.inner.for.body ] + %arrayidx = getelementptr inbounds i32, i32* %dis, i64 %indvars.iv + %10 = atomicrmw add i32* %arrayidx, i32 1 monotonic + %indvars.iv.next = add nsw i64 %indvars.iv, 1 + %cmp3 = icmp slt i64 %indvars.iv, %9 + br i1 %cmp3, label %omp.inner.for.body, label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.body, %entry + call void @__kmpc_for_static_fini(%struct.ident_t* nonnull @0, i32 %4) #4 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %3) #4 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %2) #4 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %1) #4 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %0) #4 + ret void +} + +declare dso_local void @__kmpc_for_static_fini(%struct.ident_t*, i32) local_unnamed_addr + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #2 + +declare !callback !7 dso_local void @__kmpc_fork_call(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) local_unnamed_addr + +; Function Attrs: norecurse nounwind uwtable +define internal void @.omp_outlined..2(i32* noalias nocapture readonly %.global_tid., i32* noalias nocapture readnone %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., i32* nocapture %dis) #1 { +entry: + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %0 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %0) #4 + %1 = bitcast i32* %.omp.ub to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %1) #4 + %conv = trunc i64 %.previous.lb. to i32 + %conv1 = trunc i64 %.previous.ub. 
to i32 + store i32 %conv, i32* %.omp.lb, align 4, !tbaa !3 + store i32 %conv1, i32* %.omp.ub, align 4, !tbaa !3 + %2 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %2) #4 + store i32 1, i32* %.omp.stride, align 4, !tbaa !3 + %3 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %3) #4 + store i32 0, i32* %.omp.is_last, align 4, !tbaa !3 + %4 = load i32, i32* %.global_tid., align 4, !tbaa !3 + call void @__kmpc_for_static_init_4(%struct.ident_t* nonnull @1, i32 %4, i32 34, i32* nonnull %.omp.is_last, i32* nonnull %.omp.lb, i32* nonnull %.omp.ub, i32* nonnull %.omp.stride, i32 1, i32 1) #4 + %5 = load i32, i32* %.omp.ub, align 4, !tbaa !3 + %6 = icmp slt i32 %5, 9 + %cond = select i1 %6, i32 %5, i32 9 + store i32 %cond, i32* %.omp.ub, align 4, !tbaa !3 + %7 = load i32, i32* %.omp.lb, align 4, !tbaa !3 + %cmp310 = icmp sgt i32 %7, %cond + br i1 %cmp310, label %omp.loop.exit, label %omp.inner.for.body.preheader + +omp.inner.for.body.preheader: ; preds = %entry + %8 = sext i32 %7 to i64 + %9 = sext i32 %cond to i64 + br label %omp.inner.for.body + +omp.inner.for.body: ; preds = %omp.inner.for.body.preheader, %omp.inner.for.body + %indvars.iv = phi i64 [ %8, %omp.inner.for.body.preheader ], [ %indvars.iv.next, %omp.inner.for.body ] + %arrayidx = getelementptr inbounds i32, i32* %dis, i64 %indvars.iv + %10 = atomicrmw add i32* %arrayidx, i32 1 monotonic + %indvars.iv.next = add nsw i64 %indvars.iv, 1 + %cmp3 = icmp slt i64 %indvars.iv, %9 + br i1 %cmp3, label %omp.inner.for.body, label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.body, %entry + call void @__kmpc_for_static_fini(%struct.ident_t* nonnull @0, i32 %4) #4 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %3) #4 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %2) #4 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %1) #4 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %0) #4 + ret void +} + +declare dso_local i32 @__kmpc_global_thread_num(%struct.ident_t*) local_unnamed_addr + +declare dso_local i32 @__kmpc_push_num_teams(%struct.ident_t*, i32, i32, i32) local_unnamed_addr + +declare !callback !7 dso_local void @__kmpc_fork_teams(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
local_unnamed_addr + +declare dso_local i32 @__tgt_target_teams(i64, i8*, i32, i8**, i8**, i64*, i64*, i32, i32) local_unnamed_addr + +; Function Attrs: nounwind uwtable +define dso_local i32 @main() local_unnamed_addr #0 { +entry: + %.offload_baseptrs.i = alloca [1 x i8*], align 8 + %.offload_ptrs.i = alloca [1 x i8*], align 8 + %dis = alloca [10 x i32], align 16 + %0 = bitcast [10 x i32]* %dis to i8* + call void @llvm.lifetime.start.p0i8(i64 40, i8* nonnull %0) #4 + %arrayidx = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 0 + %arrayidx.1 = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 1 + %arrayidx.2 = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 2 + %arrayidx.3 = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 3 + %1 = bitcast [10 x i32]* %dis to <4 x i32>* + store <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32>* %1, align 16, !tbaa !3 + %arrayidx.4 = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 4 + %arrayidx.5 = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 5 + %arrayidx.6 = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 6 + %arrayidx.7 = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 7 + %2 = bitcast i32* %arrayidx.4 to <4 x i32>* + store <4 x i32> <i32 4, i32 5, i32 6, i32 7>, <4 x i32>* %2, align 16, !tbaa !3 + %arrayidx.8 = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 8 + store i32 8, i32* %arrayidx.8, align 16, !tbaa !3 + %arrayidx.9 = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 9 + store i32 9, i32* %arrayidx.9, align 4, !tbaa !3 + %3 = bitcast [1 x i8*]* %.offload_baseptrs.i to i8* + call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %3) + %4 = bitcast [1 x i8*]* %.offload_ptrs.i to i8* + call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %4) + %5 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs.i, i64 0, i64 0 + %6 = bitcast [1 x i8*]* %.offload_baseptrs.i to i32** + store i32* %arrayidx, i32** %6, align 8 + %7 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_ptrs.i, i64 0, i64 0 + %8 = bitcast [1 x i8*]* %.offload_ptrs.i to i32** + store i32* %arrayidx, i32** %8, align 8 + %9 = call i32 @__tgt_target_teams(i64 -1, i8* nonnull @.__omp_offloading_18_2852ec2_foo_l10.region_id, i32 1, i8** nonnull %5, i8** nonnull %7, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0), i32 3, i32 0) #4 + %10 = icmp eq i32 %9, 0 + br i1 %10, label %foo.exit, label %omp_offload.failed.i + +omp_offload.failed.i: ; preds = %entry + %11 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @2) #4 + %12 = call i32 @__kmpc_push_num_teams(%struct.ident_t* nonnull @2, i32 %11, i32 3, i32 0) #4 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%struct.ident_t* nonnull @2, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined. to void (i32*, i32*, ...)*), i32* nonnull %arrayidx) #4 + br label %foo.exit + +foo.exit: ; preds = %entry, %omp_offload.failed.i + call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %3) + call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %4) + %13 = load i32, i32* %arrayidx, align 16, !tbaa !3 + %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 0, i32 %13) + %14 = load i32, i32* %arrayidx.1, align 4, !tbaa !3 + %call.1 = call i32 (i8*, ...)
@printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 1, i32 %14) + %15 = load i32, i32* %arrayidx.2, align 8, !tbaa !3 + %call.2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 2, i32 %15) + %16 = load i32, i32* %arrayidx.3, align 4, !tbaa !3 + %call.3 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 3, i32 %16) + %17 = load i32, i32* %arrayidx.4, align 16, !tbaa !3 + %call.4 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 4, i32 %17) + %18 = load i32, i32* %arrayidx.5, align 4, !tbaa !3 + %call.5 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 5, i32 %18) + %19 = load i32, i32* %arrayidx.6, align 8, !tbaa !3 + %call.6 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 6, i32 %19) + %20 = load i32, i32* %arrayidx.7, align 4, !tbaa !3 + %call.7 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 7, i32 %20) + %21 = load i32, i32* %arrayidx.8, align 16, !tbaa !3 + %call.8 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 8, i32 %21) + %22 = load i32, i32* %arrayidx.9, align 4, !tbaa !3 + %call.9 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.3, i64 0, i64 0), i32 9, i32 %22) + call void @llvm.lifetime.end.p0i8(i64 40, i8* nonnull %0) #4 + ret i32 0 +} + +; Function Attrs: nounwind +declare dso_local i32 @printf(i8* nocapture readonly, ...) local_unnamed_addr #3 + +; Function Attrs: nounwind uwtable +define internal void @.omp_offloading.descriptor_unreg(i8* nocapture readnone) #0 section ".text.startup" comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cud) { +entry: + %1 = tail call i32 @__tgt_unregister_lib(%struct.__tgt_bin_desc* nonnull @.omp_offloading.descriptor) #4 + ret void +} + +declare dso_local i32 @__tgt_unregister_lib(%struct.__tgt_bin_desc*) local_unnamed_addr + +; Function Attrs: nounwind uwtable +define linkonce hidden void @.omp_offloading.descriptor_reg.nvptx64-nvida-cud() #0 section ".text.startup" comdat { +entry: + %0 = tail call i32 @__tgt_register_lib(%struct.__tgt_bin_desc* nonnull @.omp_offloading.descriptor) #4 + %1 = tail call i32 @__cxa_atexit(void (i8*)* nonnull @.omp_offloading.descriptor_unreg, i8* bitcast (%struct.__tgt_bin_desc* @.omp_offloading.descriptor to i8*), i8* nonnull @__dso_handle) #4 + ret void +} + +declare dso_local i32 @__tgt_register_lib(%struct.__tgt_bin_desc*) local_unnamed_addr + +; Function Attrs: nounwind +declare dso_local i32 @__cxa_atexit(void (i8*)*, i8*, i8*) local_unnamed_addr #4 + +attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" 
"no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { argmemonly nounwind } +attributes #3 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { nounwind } + +!omp_offload.info = !{!0} +!llvm.module.flags = !{!1} +!llvm.ident = !{!2} + +!0 = !{i32 0, i32 24, i32 42282690, !"foo", i32 10, i32 0} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{!"clang version 9.0.0 "} +!3 = !{!4, !4, i64 0} +!4 = !{!"int", !5, i64 0} +!5 = !{!"omnipotent char", !6, i64 0} +!6 = !{!"Simple C/C++ TBAA"} +!7 = !{!8} +!8 = !{i64 2, i64 -1, i64 -1, i1 true} + +; __CLANG_OFFLOAD_BUNDLE____END__ host-x86_64-unknown-linux-gnu Index: SPMD_examples/v0.3/target_offload_is_SPMD.c =================================================================== --- /dev/null +++ SPMD_examples/v0.3/target_offload_is_SPMD.c @@ -0,0 +1,36 @@ +#include +#include +#include + +#define N 10 +#define TEAMS 3 + +void foo(int* dis) { + + #pragma omp target teams num_teams(TEAMS) map(tofrom:dis[:N]) + { + #pragma omp distribute parallel for firstprivate(dis) + for (int i = 0; i < N; i++) + #pragma omp atomic + dis[i] += 1; + + #pragma omp distribute parallel for firstprivate(dis) + for (int i = 0; i < N; i++) + #pragma omp atomic + dis[i] += 1; + } +} + +int main() { + int dis[N]; + + for (int i = 0; i < N; i++) + dis[i] = i; + + foo(dis); + + for (int i = 0; i < N; i++) + printf("dis[%3i] = %4i\n", i, dis[i]); + + return 0; +} Index: SPMD_examples/v0.3/target_offload_not_SPMD.O0.new.ll =================================================================== --- /dev/null +++ SPMD_examples/v0.3/target_offload_not_SPMD.O0.new.ll @@ -0,0 +1,610 @@ + +; __CLANG_OFFLOAD_BUNDLE____START__ openmp-nvptx64-nvida-cud +; ModuleID = '../SPMD_examples/v0.3/target_offload_not_SPMD.c' +source_filename = "../SPMD_examples/v0.3/target_offload_not_SPMD.c" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvida-cud" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } +%omp.shared.struct = type { i64, i64, i32* } + +@"_openmp_kernel_static_glob_rd$ptr" = internal addrspace(3) global i8* null +@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2050, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@2 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@__omp_offloading_18_2852fc0_foo_l10_exec_mode = weak constant i8 1 +@llvm.compiler.used = appending global [1 x i8*] [i8* @__omp_offloading_18_2852fc0_foo_l10_exec_mode], section "llvm.metadata" + +; 
Function Attrs: noinline norecurse nounwind optnone +define weak void @__omp_offloading_18_2852fc0_foo_l10(i32* %dis) #0 { +entry: + %.global_tid..addr.i = alloca i32*, align 8 + %.bound_tid..addr.i = alloca i32*, align 8 + %dis.addr.i = alloca i32*, align 8 + %.omp.iv.i = alloca i32, align 4 + %tmp.i = alloca i32, align 4 + %.omp.comb.lb.i = alloca i32, align 4 + %.omp.comb.ub.i = alloca i32, align 4 + %.omp.stride.i = alloca i32, align 4 + %.omp.is_last.i = alloca i32, align 4 + %i.i = alloca i32, align 4 + %.zero.addr.i = alloca i32, align 4 + %.captured.i = alloca %omp.shared.struct, align 8 + %dis.addr = alloca i32*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. = alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i32* %dis, i32** %dis.addr, align 8 + %0 = call i16 @__kmpc_generic_kernel_init(i16 0, i16 1, i16 1, i16 0) + %1 = icmp eq i16 %0, 1 + br i1 %1, label %.execute, label %.exit + +.execute: ; preds = %entry + %2 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + %3 = load i32*, i32** %dis.addr, align 8 + store i32 %2, i32* %.threadid_temp., align 4 + store i32 0, i32* %.zero.addr.i, align 4, !noalias !5 + store i32* %.threadid_temp., i32** %.global_tid..addr.i, align 8, !noalias !5 + store i32* %.zero.addr, i32** %.bound_tid..addr.i, align 8, !noalias !5 + store i32* %3, i32** %dis.addr.i, align 8, !noalias !5 + store i32 0, i32* %.omp.comb.lb.i, align 4, !noalias !5 + store i32 9, i32* %.omp.comb.ub.i, align 4, !noalias !5 + store i32 1, i32* %.omp.stride.i, align 4, !noalias !5 + store i32 0, i32* %.omp.is_last.i, align 4, !noalias !5 + %4 = load i32*, i32** %.global_tid..addr.i, align 8, !noalias !5 + %5 = load i32, i32* %4, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %5, i32 92, i32* %.omp.is_last.i, i32* %.omp.comb.lb.i, i32* %.omp.comb.ub.i, i32* %.omp.stride.i, i32 1, i32 1) #3 + %6 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !5 + %cmp.i = icmp sgt i32 %6, 9 + br i1 %cmp.i, label %cond.true.i, label %cond.false.i + +cond.true.i: ; preds = %.execute + br label %cond.end.i + +cond.false.i: ; preds = %.execute + %7 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !5 + br label %cond.end.i + +cond.end.i: ; preds = %cond.false.i, %cond.true.i + %cond.i = phi i32 [ 9, %cond.true.i ], [ %7, %cond.false.i ] + store i32 %cond.i, i32* %.omp.comb.ub.i, align 4, !noalias !5 + %8 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !5 + store i32 %8, i32* %.omp.iv.i, align 4, !noalias !5 + br label %omp.inner.for.cond.i + +omp.inner.for.cond.i: ; preds = %omp.inner.for.body.i, %cond.end.i + %9 = load i32, i32* %.omp.iv.i, align 4, !noalias !5 + %10 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !5 + %cmp1.i = icmp sle i32 %9, %10 + br i1 %cmp1.i, label %omp.inner.for.body.i, label %__omp_outlined__.exit + +omp.inner.for.body.i: ; preds = %omp.inner.for.cond.i + %11 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !5 + %12 = zext i32 %11 to i64 + %13 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !5 + %14 = zext i32 %13 to i64 + %15 = load i32*, i32** %dis.addr.i, align 8, !noalias !5 + %16 = bitcast %omp.shared.struct* %.captured.i to i8* + %17 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i32 0, i32 0 + store i64 %12, i64* %17, !noalias !5 + %18 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i32 0, i32 1 + store i64 %14, i64* %18, !noalias !5 + %19 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, 
i32 0, i32 2 + store i32* %15, i32** %19, !noalias !5 + call void @__kmpc_generic_kernel_parallel(i8* bitcast (void (i8*)* @__omp_outlined__1_wrapper to i8*), i8* %16, i16 24, i16 1) #3 + %20 = load i32, i32* %.omp.iv.i, align 4, !noalias !5 + %21 = load i32, i32* %.omp.stride.i, align 4, !noalias !5 + %add.i = add nsw i32 %20, %21 + store i32 %add.i, i32* %.omp.iv.i, align 4, !noalias !5 + br label %omp.inner.for.cond.i + +__omp_outlined__.exit: ; preds = %omp.inner.for.cond.i + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %5) #3 + %22 = load i32*, i32** %dis.addr.i, align 8, !noalias !5 + %call.i = call i32 @omp_get_team_num() #3 + %idxprom.i = sext i32 %call.i to i64 + %arrayidx.i = getelementptr inbounds i32, i32* %22, i64 %idxprom.i + %23 = load i32, i32* %arrayidx.i, align 4 + %add2.i = add nsw i32 %23, 1 + store i32 %add2.i, i32* %arrayidx.i, align 4 + br label %.omp.deinit + +.omp.deinit: ; preds = %__omp_outlined__.exit + call void @__kmpc_generic_kernel_deinit(i16 0, i16 1) + br label %.exit + +.exit: ; preds = %.omp.deinit, %entry + ret void +} + +declare i16 @__kmpc_generic_kernel_init(i16, i16, i16, i16) + +declare void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) + +; Function Attrs: noinline norecurse nounwind optnone +define internal void @__omp_outlined__1(i32* noalias %.global_tid., i32* noalias %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., i32* %dis) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %.previous.lb..addr = alloca i64, align 8 + %.previous.ub..addr = alloca i64, align 8 + %dis.addr = alloca i32*, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i64 %.previous.lb., i64* %.previous.lb..addr, align 8 + store i64 %.previous.ub., i64* %.previous.ub..addr, align 8 + store i32* %dis, i32** %dis.addr, align 8 + store i32 0, i32* %.omp.lb, align 4 + store i32 9, i32* %.omp.ub, align 4 + %0 = load i64, i64* %.previous.lb..addr, align 8 + %conv = trunc i64 %0 to i32 + %1 = load i64, i64* %.previous.ub..addr, align 8 + %conv1 = trunc i64 %1 to i32 + store i32 %conv, i32* %.omp.lb, align 4 + store i32 %conv1, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %2 = load i32*, i32** %.global_tid..addr, align 8 + %3 = load i32, i32* %2, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %3, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %4 = load i32, i32* %.omp.lb, align 4 + store i32 %4, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %entry + %5 = load i32, i32* %.omp.iv, align 4 + %conv2 = sext i32 %5 to i64 + %6 = load i64, i64* %.previous.ub..addr, align 8 + %cmp = icmp ule i64 %conv2, %6 + br i1 %cmp, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %7 = load i32, i32* %.omp.iv, align 4 + %mul = mul nsw i32 %7, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %i, align 4 + %8 = load i32*, i32** %dis.addr, align 8 + %9 = load i32, i32* %i, align 4 + %idxprom = sext i32 %9 to i64 + %arrayidx = 
getelementptr inbounds i32, i32* %8, i64 %idxprom + %10 = atomicrmw add i32* %arrayidx, i32 1 monotonic + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %11 = load i32, i32* %.omp.iv, align 4 + %12 = load i32, i32* %.omp.stride, align 4 + %add4 = add nsw i32 %11, %12 + store i32 %add4, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %3) + ret void +} + +declare void @__kmpc_for_static_fini(%struct.ident_t*, i32) + +; Function Attrs: noinline norecurse nounwind +define internal void @__omp_outlined__1_wrapper(i8* %payload) #1 { +entry: + %.addr = alloca i8*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. = alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i8* %payload, i8** %.addr, align 8 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + store i32 %0, i32* %.threadid_temp., align 4 + %1 = load i8*, i8** %.addr, align 8 + %2 = bitcast i8* %1 to %omp.shared.struct* + %3 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %2, i32 0, i32 0 + %4 = load i64, i64* %3, align 1 + %5 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %2, i32 0, i32 1 + %6 = load i64, i64* %5, align 1 + %7 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %2, i32 0, i32 2 + %8 = load i32*, i32** %7, align 1 + call void @__omp_outlined__1(i32* %.threadid_temp., i32* %.zero.addr, i64 %4, i64 %6, i32* %8) #3 + ret void +} + +declare i32 @__kmpc_global_thread_num(%struct.ident_t*) + +declare void @__kmpc_generic_kernel_parallel(i8*, i8*, i16, i16) + +declare i32 @omp_get_team_num() #2 + +declare void @__kmpc_generic_kernel_deinit(i16, i16) + +attributes #0 = { noinline norecurse nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { noinline norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { nounwind } + +!omp_offload.info = !{!0} +!nvvm.annotations = !{!1} +!llvm.module.flags = !{!2, !3} 
+!llvm.ident = !{!4} + +!0 = !{i32 0, i32 24, i32 42282944, !"foo", i32 10, i32 0} +!1 = !{void (i32*)* @__omp_offloading_18_2852fc0_foo_l10, !"kernel", i32 1} +!2 = !{i32 1, !"wchar_size", i32 4} +!3 = !{i32 7, !"PIC Level", i32 2} +!4 = !{!"clang version 9.0.0 "} +!5 = !{!6, !8} +!6 = distinct !{!6, !7, !"__omp_outlined__: %.global_tid."} +!7 = distinct !{!7, !"__omp_outlined__"} +!8 = distinct !{!8, !7, !"__omp_outlined__: %.bound_tid."} + +; __CLANG_OFFLOAD_BUNDLE____END__ openmp-nvptx64-nvida-cud + +; __CLANG_OFFLOAD_BUNDLE____START__ host-x86_64-unknown-linux-gnu +; ModuleID = '/tmp/johannes/target_offload_not_SPMD-9b84f7.bc' +source_filename = "../SPMD_examples/v0.3/target_offload_not_SPMD.c" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } +%struct.__tgt_offload_entry = type { i8*, i8*, i64, i32, i32 } +%struct.__tgt_device_image = type { i8*, i8*, %struct.__tgt_offload_entry*, %struct.__tgt_offload_entry* } +%struct.__tgt_bin_desc = type { i32, %struct.__tgt_device_image*, %struct.__tgt_offload_entry*, %struct.__tgt_offload_entry* } + +$.omp_offloading.descriptor_reg.nvptx64-nvida-cud = comdat any + +@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr global %struct.ident_t { i32 0, i32 2050, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@1 = private unnamed_addr global %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@2 = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@.__omp_offloading_18_2852fc0_foo_l10.region_id = weak constant i8 0 +@.offload_sizes = private unnamed_addr constant [1 x i64] [i64 40] +@.offload_maptypes = private unnamed_addr constant [1 x i64] [i64 35] +@.str.2 = private unnamed_addr constant [16 x i8] c"dis[%3i] = %4i\0A\00", align 1 +@.omp_offloading.entry_name = internal unnamed_addr constant [36 x i8] c"__omp_offloading_18_2852fc0_foo_l10\00" +@.omp_offloading.entry.__omp_offloading_18_2852fc0_foo_l10 = weak constant %struct.__tgt_offload_entry { i8* @.__omp_offloading_18_2852fc0_foo_l10.region_id, i8* getelementptr inbounds ([36 x i8], [36 x i8]* @.omp_offloading.entry_name, i32 0, i32 0), i64 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +@.omp_offloading.entries_begin = external constant %struct.__tgt_offload_entry +@.omp_offloading.entries_end = external constant %struct.__tgt_offload_entry +@.omp_offloading.img_start.nvptx64-nvida-cud = extern_weak constant i8 +@.omp_offloading.img_end.nvptx64-nvida-cud = extern_weak constant i8 +@.omp_offloading.device_images = internal unnamed_addr constant [1 x %struct.__tgt_device_image] [%struct.__tgt_device_image { i8* @.omp_offloading.img_start.nvptx64-nvida-cud, i8* @.omp_offloading.img_end.nvptx64-nvida-cud, %struct.__tgt_offload_entry* @.omp_offloading.entries_begin, %struct.__tgt_offload_entry* @.omp_offloading.entries_end }], comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cud), align 8 +@.omp_offloading.descriptor = internal constant %struct.__tgt_bin_desc { i32 1, %struct.__tgt_device_image* getelementptr inbounds ([1 x %struct.__tgt_device_image], [1 x %struct.__tgt_device_image]* @.omp_offloading.device_images, i32 0, i32 0), %struct.__tgt_offload_entry* 
@.omp_offloading.entries_begin, %struct.__tgt_offload_entry* @.omp_offloading.entries_end }, comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cud), align 8 +@__dso_handle = external hidden global i8 +@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 0, void ()* @.omp_offloading.descriptor_reg.nvptx64-nvida-cud, i8* bitcast (void ()* @.omp_offloading.descriptor_reg.nvptx64-nvida-cud to i8*) }] + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local void @foo(i32* %dis) #0 { +entry: + %dis.addr = alloca i32*, align 8 + %.offload_baseptrs = alloca [1 x i8*], align 8 + %.offload_ptrs = alloca [1 x i8*], align 8 + store i32* %dis, i32** %dis.addr, align 8 + %0 = load i32*, i32** %dis.addr, align 8 + %1 = load i32*, i32** %dis.addr, align 8 + %2 = load i32*, i32** %dis.addr, align 8 + %arrayidx = getelementptr inbounds i32, i32* %2, i64 0 + %3 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs, i32 0, i32 0 + %4 = bitcast i8** %3 to i32** + store i32* %1, i32** %4, align 8 + %5 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_ptrs, i32 0, i32 0 + %6 = bitcast i8** %5 to i32** + store i32* %arrayidx, i32** %6, align 8 + %7 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs, i32 0, i32 0 + %8 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_ptrs, i32 0, i32 0 + %9 = call i32 @__tgt_target_teams(i64 -1, i8* @.__omp_offloading_18_2852fc0_foo_l10.region_id, i32 1, i8** %7, i8** %8, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes, i32 0, i32 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i32 0, i32 0), i32 3, i32 0) + %10 = icmp ne i32 %9, 0 + br i1 %10, label %omp_offload.failed, label %omp_offload.cont + +omp_offload.failed: ; preds = %entry + call void @__omp_offloading_18_2852fc0_foo_l10(i32* %0) #4 + br label %omp_offload.cont + +omp_offload.cont: ; preds = %omp_offload.failed, %entry + ret void +} + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define internal void @__omp_offloading_18_2852fc0_foo_l10(i32* %dis) #1 { +entry: + %dis.addr = alloca i32*, align 8 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + store i32* %dis, i32** %dis.addr, align 8 + %1 = call i32 @__kmpc_push_num_teams(%struct.ident_t* @2, i32 %0, i32 3, i32 0) + %2 = load i32*, i32** %dis.addr, align 8 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%struct.ident_t* @2, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined. 
to void (i32*, i32*, ...)*), i32* %2) + ret void +} + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define internal void @.omp_outlined.(i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* %dis) #1 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %dis.addr = alloca i32*, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.comb.lb = alloca i32, align 4 + %.omp.comb.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i32* %dis, i32** %dis.addr, align 8 + store i32 0, i32* %.omp.comb.lb, align 4 + store i32 9, i32* %.omp.comb.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %0 = load i32*, i32** %.global_tid..addr, align 8 + %1 = load i32, i32* %0, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %1, i32 92, i32* %.omp.is_last, i32* %.omp.comb.lb, i32* %.omp.comb.ub, i32* %.omp.stride, i32 1, i32 1) + %2 = load i32, i32* %.omp.comb.ub, align 4 + %cmp = icmp sgt i32 %2, 9 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + br label %cond.end + +cond.false: ; preds = %entry + %3 = load i32, i32* %.omp.comb.ub, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 9, %cond.true ], [ %3, %cond.false ] + store i32 %cond, i32* %.omp.comb.ub, align 4 + %4 = load i32, i32* %.omp.comb.lb, align 4 + store i32 %4, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %cond.end + %5 = load i32, i32* %.omp.iv, align 4 + %6 = load i32, i32* %.omp.comb.ub, align 4 + %cmp1 = icmp sle i32 %5, %6 + br i1 %cmp1, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %7 = load i32, i32* %.omp.comb.lb, align 4 + %8 = zext i32 %7 to i64 + %9 = load i32, i32* %.omp.comb.ub, align 4 + %10 = zext i32 %9 to i64 + %11 = load i32*, i32** %dis.addr, align 8 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* @2, i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64, i64, i32*)* @.omp_outlined..1 to void (i32*, i32*, ...)*), i64 %8, i64 %10, i32* %11) + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.inner.for.body + %12 = load i32, i32* %.omp.iv, align 4 + %13 = load i32, i32* %.omp.stride, align 4 + %add = add nsw i32 %12, %13 + store i32 %add, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %1) + %14 = load i32*, i32** %dis.addr, align 8 + %call = call i32 @omp_get_team_num() + %idxprom = sext i32 %call to i64 + %arrayidx = getelementptr inbounds i32, i32* %14, i64 %idxprom + %15 = load i32, i32* %arrayidx, align 4 + %add2 = add nsw i32 %15, 1 + store i32 %add2, i32* %arrayidx, align 4 + ret void +} + +declare dso_local void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) + +; Function Attrs: noinline norecurse nounwind optnone uwtable +define internal void @.omp_outlined..1(i32* noalias %.global_tid., i32* noalias %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., i32* %dis) #1 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %.previous.lb..addr = alloca i64, align 8 + %.previous.ub..addr = alloca i64, align 8 + %dis.addr = alloca i32*, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i64 %.previous.lb., i64* %.previous.lb..addr, align 8 + store i64 %.previous.ub., i64* %.previous.ub..addr, align 8 + store i32* %dis, i32** %dis.addr, align 8 + store i32 0, i32* %.omp.lb, align 4 + store i32 9, i32* %.omp.ub, align 4 + %0 = load i64, i64* %.previous.lb..addr, align 8 + %conv = trunc i64 %0 to i32 + %1 = load i64, i64* %.previous.ub..addr, align 8 + %conv1 = trunc i64 %1 to i32 + store i32 %conv, i32* %.omp.lb, align 4 + store i32 %conv1, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %2 = load i32*, i32** %.global_tid..addr, align 8 + %3 = load i32, i32* %2, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %3, i32 34, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %4 = load i32, i32* %.omp.ub, align 4 + %cmp = icmp sgt i32 %4, 9 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + br label %cond.end + +cond.false: ; preds = %entry + %5 = load i32, i32* %.omp.ub, align 4 + br label %cond.end + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i32 [ 9, %cond.true ], [ %5, %cond.false ] + store i32 %cond, i32* %.omp.ub, align 4 + %6 = load i32, i32* %.omp.lb, align 4 + store i32 %6, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %cond.end + %7 = load i32, i32* %.omp.iv, align 4 + %8 = load i32, i32* %.omp.ub, align 4 + %cmp3 = icmp sle i32 %7, %8 + br i1 %cmp3, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %9 = load i32, i32* %.omp.iv, align 4 + %mul = mul nsw i32 
%9, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %i, align 4 + %10 = load i32*, i32** %dis.addr, align 8 + %11 = load i32, i32* %i, align 4 + %idxprom = sext i32 %11 to i64 + %arrayidx = getelementptr inbounds i32, i32* %10, i64 %idxprom + %12 = atomicrmw add i32* %arrayidx, i32 1 monotonic + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %13 = load i32, i32* %.omp.iv, align 4 + %add5 = add nsw i32 %13, 1 + store i32 %add5, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %3) + ret void +} + +declare dso_local void @__kmpc_for_static_fini(%struct.ident_t*, i32) + +declare !callback !3 dso_local void @__kmpc_fork_call(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) + +declare dso_local i32 @omp_get_team_num() #2 + +declare dso_local i32 @__kmpc_global_thread_num(%struct.ident_t*) + +declare dso_local i32 @__kmpc_push_num_teams(%struct.ident_t*, i32, i32, i32) + +declare !callback !3 dso_local void @__kmpc_fork_teams(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) + +declare dso_local i32 @__tgt_target_teams(i64, i8*, i32, i8**, i8**, i64*, i64*, i32, i32) + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local i32 @main() #0 { +entry: + %retval = alloca i32, align 4 + %dis = alloca [10 x i32], align 16 + %i = alloca i32, align 4 + %i1 = alloca i32, align 4 + store i32 0, i32* %retval, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %0, 10 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load i32, i32* %i, align 4 + %2 = load i32, i32* %i, align 4 + %idxprom = sext i32 %2 to i64 + %arrayidx = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 %idxprom + store i32 %1, i32* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %3 = load i32, i32* %i, align 4 + %inc = add nsw i32 %3, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %arraydecay = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i32 0, i32 0 + call void @foo(i32* %arraydecay) + store i32 0, i32* %i1, align 4 + br label %for.cond2 + +for.cond2: ; preds = %for.inc7, %for.end + %4 = load i32, i32* %i1, align 4 + %cmp3 = icmp slt i32 %4, 10 + br i1 %cmp3, label %for.body4, label %for.end9 + +for.body4: ; preds = %for.cond2 + %5 = load i32, i32* %i1, align 4 + %6 = load i32, i32* %i1, align 4 + %idxprom5 = sext i32 %6 to i64 + %arrayidx6 = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 %idxprom5 + %7 = load i32, i32* %arrayidx6, align 4 + %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i32 0, i32 0), i32 %5, i32 %7) + br label %for.inc7 + +for.inc7: ; preds = %for.body4 + %8 = load i32, i32* %i1, align 4 + %inc8 = add nsw i32 %8, 1 + store i32 %inc8, i32* %i1, align 4 + br label %for.cond2 + +for.end9: ; preds = %for.cond2 + ret i32 0 +} + +declare dso_local i32 @printf(i8*, ...) 
#2
+
+; Function Attrs: noinline nounwind uwtable
+define internal void @.omp_offloading.descriptor_unreg(i8*) #3 section ".text.startup" comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cuda) {
+entry:
+  %.addr = alloca i8*, align 8
+  store i8* %0, i8** %.addr, align 8
+  %1 = call i32 @__tgt_unregister_lib(%struct.__tgt_bin_desc* @.omp_offloading.descriptor)
+  ret void
+}
+
+declare dso_local i32 @__tgt_unregister_lib(%struct.__tgt_bin_desc*)
+
+; Function Attrs: noinline nounwind uwtable
+define linkonce hidden void @.omp_offloading.descriptor_reg.nvptx64-nvida-cuda() #3 section ".text.startup" comdat {
+entry:
+  %0 = call i32 @__tgt_register_lib(%struct.__tgt_bin_desc* @.omp_offloading.descriptor)
+  %1 = call i32 @__cxa_atexit(void (i8*)* @.omp_offloading.descriptor_unreg, i8* bitcast (%struct.__tgt_bin_desc* @.omp_offloading.descriptor to i8*), i8* @__dso_handle) #4
+  ret void
+}
+
+declare dso_local i32 @__tgt_register_lib(%struct.__tgt_bin_desc*)
+
+; Function Attrs: nounwind
+declare dso_local i32 @__cxa_atexit(void (i8*)*, i8*, i8*) #4
+
+attributes #0 = { noinline nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { noinline norecurse nounwind optnone uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #4 = { nounwind }
+
+!omp_offload.info = !{!0}
+!llvm.module.flags = !{!1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 0, i32 24, i32 42282944, !"foo", i32 10, i32 0}
+!1 = !{i32 1, !"wchar_size", i32 4}
+!2 = !{!"clang version 9.0.0 "}
+!3 = !{!4}
+!4 = !{i64 2, i64 -1, i64 -1, i1 true}
+
+; __CLANG_OFFLOAD_BUNDLE____END__ host-x86_64-unknown-linux-gnu
Index:
SPMD_examples/v0.3/target_offload_not_SPMD.O3.new.ll =================================================================== --- /dev/null +++ SPMD_examples/v0.3/target_offload_not_SPMD.O3.new.ll @@ -0,0 +1,576 @@ + +; __CLANG_OFFLOAD_BUNDLE____START__ openmp-nvptx64-nvida-cud +; ModuleID = '../SPMD_examples/v0.3/target_offload_not_SPMD.c' +source_filename = "../SPMD_examples/v0.3/target_offload_not_SPMD.c" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvida-cud" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } +%omp.shared.struct = type { i64, i64, i32* } + +@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2050, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@2 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@__omp_offloading_18_2852fc0_foo_l10_exec_mode = weak constant i8 1 +@llvm.compiler.used = appending global [1 x i8*] [i8* @__omp_offloading_18_2852fc0_foo_l10_exec_mode], section "llvm.metadata" + +; Function Attrs: norecurse nounwind +define weak void @__omp_offloading_18_2852fc0_foo_l10(i32* %dis) local_unnamed_addr #0 { +entry: + %work_fn.addr = alloca i8*, align 8 + %.omp.comb.lb.i = alloca i32, align 4 + %.omp.comb.ub.i = alloca i32, align 4 + %.omp.stride.i = alloca i32, align 4 + %.omp.is_last.i = alloca i32, align 4 + %.captured.i = alloca %omp.shared.struct, align 8 + %thread_kind = tail call i16 @__kmpc_generic_kernel_init(i16 0, i16 0, i16 1, i16 0) #3 + switch i16 %thread_kind, label %.exit [ + i16 -1, label %worker.wait.preheader + i16 1, label %.execute + ] + +worker.wait.preheader: ; preds = %entry + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) #3 + %is_active7 = call i1 @__kmpc_kernel_parallel(i8** nonnull %work_fn.addr, i16 1) #3 + %Work_fn.addr_cast = bitcast i8** %work_fn.addr to void (i8*)** + %work_fn8 = load void (i8*)*, void (i8*)** %Work_fn.addr_cast, align 8 + %no_work9 = icmp eq void (i8*)* %work_fn8, null + br i1 %no_work9, label %master_check, label %worker.active_check + +worker.active_check: ; preds = %worker.wait.preheader, %worker.inactive + %work_fn11 = phi void (i8*)* [ %work_fn, %worker.inactive ], [ %work_fn8, %worker.wait.preheader ] + %is_active10 = phi i1 [ %is_active, %worker.inactive ], [ %is_active7, %worker.wait.preheader ] + br i1 %is_active10, label %worker.active, label %worker.inactive + +worker.active: ; preds = %worker.active_check + %0 = call i8* @__kmpc_get_shared_variables() #3 + %par_fn_check = icmp eq void (i8*)* %work_fn11, @__omp_outlined__1_wrapper + br i1 %par_fn_check, label %worker.execute.__omp_outlined__1_wrapper, label %worker.check.next5 + +worker.execute.__omp_outlined__1_wrapper: ; preds = %worker.active + call void @__omp_outlined__1_wrapper(i8* %0) + br label %worker.parallel_end + +worker.check.next5: ; preds = %worker.active + call void %work_fn11(i8* %0) #3 + br label %worker.parallel_end + +worker.parallel_end: ; preds = %worker.execute.__omp_outlined__1_wrapper, %worker.check.next5 + call void @__kmpc_kernel_end_parallel() #3 + br label %worker.inactive + +worker.inactive: ; preds = %worker.active_check, 
%worker.parallel_end + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) #3 + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) #3 + %is_active = call i1 @__kmpc_kernel_parallel(i8** nonnull %work_fn.addr, i16 1) #3 + %work_fn = load void (i8*)*, void (i8*)** %Work_fn.addr_cast, align 8 + %no_work = icmp eq void (i8*)* %work_fn, null + br i1 %no_work, label %master_check, label %worker.active_check + +master_check: ; preds = %worker.inactive, %worker.wait.preheader + %1 = icmp eq i16 %thread_kind, 1 + br i1 %1, label %.execute, label %.exit + +.execute: ; preds = %entry, %master_check + %2 = tail call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @2) #3 + %3 = bitcast %omp.shared.struct* %.captured.i to i8* + call void @llvm.lifetime.start.p0i8(i64 24, i8* nonnull %3) + %4 = bitcast i32* %.omp.comb.lb.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %4) #3 + store i32 0, i32* %.omp.comb.lb.i, align 4, !tbaa !5 + %5 = bitcast i32* %.omp.comb.ub.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %5) #3 + store i32 9, i32* %.omp.comb.ub.i, align 4, !tbaa !5 + %6 = bitcast i32* %.omp.stride.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %6) #3 + store i32 1, i32* %.omp.stride.i, align 4, !tbaa !5 + %7 = bitcast i32* %.omp.is_last.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %7) #3 + store i32 0, i32* %.omp.is_last.i, align 4, !tbaa !5 + call void @__kmpc_for_static_init_4(%struct.ident_t* nonnull @0, i32 %2, i32 92, i32* nonnull %.omp.is_last.i, i32* nonnull %.omp.comb.lb.i, i32* nonnull %.omp.comb.ub.i, i32* nonnull %.omp.stride.i, i32 1, i32 1) #3 + %8 = load i32, i32* %.omp.comb.ub.i, align 4, !tbaa !5 + %9 = icmp slt i32 %8, 9 + %cond.i = select i1 %9, i32 %8, i32 9 + store i32 %cond.i, i32* %.omp.comb.ub.i, align 4, !tbaa !5 + %10 = load i32, i32* %.omp.comb.lb.i, align 4, !tbaa !5 + %cmp11.i = icmp sgt i32 %10, %cond.i + br i1 %cmp11.i, label %__omp_outlined__.exit, label %omp.inner.for.body.lr.ph.i + +omp.inner.for.body.lr.ph.i: ; preds = %.execute + %11 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i64 0, i32 0 + %12 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i64 0, i32 1 + %13 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i64 0, i32 2 + %14 = zext i32 %10 to i64 + %15 = zext i32 %cond.i to i64 + store i64 %14, i64* %11, align 8 + store i64 %15, i64* %12, align 8 + store i32* %dis, i32** %13, align 8 + call void @__kmpc_generic_kernel_parallel(i8* bitcast (void (i8*)* @__omp_outlined__1_wrapper to i8*), i8* nonnull %3, i16 24, i16 1) #3 + %16 = load i32, i32* %.omp.stride.i, align 4, !tbaa !5 + %add.i1 = add nsw i32 %16, %10 + %17 = load i32, i32* %.omp.comb.ub.i, align 4, !tbaa !5 + %cmp1.i2 = icmp sgt i32 %add.i1, %17 + br i1 %cmp1.i2, label %__omp_outlined__.exit, label %omp.inner.for.body.omp.inner.for.body_crit_edge.i + +omp.inner.for.body.omp.inner.for.body_crit_edge.i: ; preds = %omp.inner.for.body.lr.ph.i, %omp.inner.for.body.omp.inner.for.body_crit_edge.i + %18 = phi i32 [ %22, %omp.inner.for.body.omp.inner.for.body_crit_edge.i ], [ %17, %omp.inner.for.body.lr.ph.i ] + %add.i3 = phi i32 [ %add.i, %omp.inner.for.body.omp.inner.for.body_crit_edge.i ], [ %add.i1, %omp.inner.for.body.lr.ph.i ] + %.pre.i = load i32, i32* %.omp.comb.lb.i, align 4 + %19 = zext i32 %.pre.i to i64 + %20 = zext i32 %18 to i64 + store i64 %19, i64* %11, align 8 + store i64 %20, i64* %12, 
align 8 + store i32* %dis, i32** %13, align 8 + call void @__kmpc_generic_kernel_parallel(i8* bitcast (void (i8*)* @__omp_outlined__1_wrapper to i8*), i8* nonnull %3, i16 24, i16 1) #3 + %21 = load i32, i32* %.omp.stride.i, align 4, !tbaa !5 + %add.i = add nsw i32 %21, %add.i3 + %22 = load i32, i32* %.omp.comb.ub.i, align 4, !tbaa !5 + %cmp1.i = icmp sgt i32 %add.i, %22 + br i1 %cmp1.i, label %__omp_outlined__.exit, label %omp.inner.for.body.omp.inner.for.body_crit_edge.i + +__omp_outlined__.exit: ; preds = %omp.inner.for.body.omp.inner.for.body_crit_edge.i, %omp.inner.for.body.lr.ph.i, %.execute + call void @__kmpc_for_static_fini(%struct.ident_t* nonnull @0, i32 %2) #3 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %7) #3 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %6) #3 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %5) #3 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %4) #3 + %call.i = call i32 @omp_get_team_num() #3 + %idxprom.i = sext i32 %call.i to i64 + %arrayidx.i = getelementptr inbounds i32, i32* %dis, i64 %idxprom.i + %23 = load i32, i32* %arrayidx.i, align 4, !tbaa !5 + %add2.i = add nsw i32 %23, 1 + store i32 %add2.i, i32* %arrayidx.i, align 4, !tbaa !5 + call void @llvm.lifetime.end.p0i8(i64 24, i8* nonnull %3) + call void @__kmpc_generic_kernel_deinit(i16 0, i16 1) #3 + br label %.exit + +.exit: ; preds = %entry, %__omp_outlined__.exit, %master_check + ret void +} + +declare i16 @__kmpc_generic_kernel_init(i16, i16, i16, i16) local_unnamed_addr + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1 + +declare void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) local_unnamed_addr + +declare void @__kmpc_for_static_fini(%struct.ident_t*, i32) local_unnamed_addr + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1 + +; Function Attrs: norecurse nounwind +define internal void @__omp_outlined__1_wrapper(i8* nocapture readonly %payload) #0 { +entry: + %.omp.lb.i = alloca i32, align 4 + %.omp.ub.i = alloca i32, align 4 + %.omp.stride.i = alloca i32, align 4 + %.omp.is_last.i = alloca i32, align 4 + %0 = tail call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @2) #3 + %1 = bitcast i8* %payload to i64* + %2 = load i64, i64* %1, align 1 + %3 = getelementptr inbounds i8, i8* %payload, i64 8 + %4 = bitcast i8* %3 to i64* + %5 = load i64, i64* %4, align 1 + %6 = getelementptr inbounds i8, i8* %payload, i64 16 + %7 = bitcast i8* %6 to i32** + %8 = load i32*, i32** %7, align 1 + %9 = bitcast i32* %.omp.lb.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %9) #3 + %10 = bitcast i32* %.omp.ub.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %10) #3 + %conv.i = trunc i64 %2 to i32 + %conv1.i = trunc i64 %5 to i32 + store i32 %conv.i, i32* %.omp.lb.i, align 4, !tbaa !5 + store i32 %conv1.i, i32* %.omp.ub.i, align 4, !tbaa !5 + %11 = bitcast i32* %.omp.stride.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %11) #3 + store i32 1, i32* %.omp.stride.i, align 4, !tbaa !5 + %12 = bitcast i32* %.omp.is_last.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %12) #3 + store i32 0, i32* %.omp.is_last.i, align 4, !tbaa !5 + call void @__kmpc_for_static_init_4(%struct.ident_t* nonnull @1, i32 %0, i32 33, i32* nonnull %.omp.is_last.i, i32* nonnull %.omp.lb.i, i32* nonnull %.omp.ub.i, i32* nonnull %.omp.stride.i, i32 1, i32 1) #3 + %13 = load i32, i32* 
%.omp.lb.i, align 4, !tbaa !5 + %conv21.i = sext i32 %13 to i64 + %cmp2.i = icmp ult i64 %5, %conv21.i + br i1 %cmp2.i, label %__omp_outlined__1.exit, label %omp.inner.for.body.lr.ph.i + +omp.inner.for.body.lr.ph.i: ; preds = %entry + %14 = load i32, i32* %.omp.stride.i, align 4, !tbaa !5 + %15 = sext i32 %14 to i64 + br label %omp.inner.for.body.i + +omp.inner.for.body.i: ; preds = %omp.inner.for.body.i, %omp.inner.for.body.lr.ph.i + %indvars.iv.i = phi i64 [ %conv21.i, %omp.inner.for.body.lr.ph.i ], [ %indvars.iv.next.i, %omp.inner.for.body.i ] + %arrayidx.i = getelementptr inbounds i32, i32* %8, i64 %indvars.iv.i + %16 = atomicrmw add i32* %arrayidx.i, i32 1 monotonic + %indvars.iv.next.i = add i64 %indvars.iv.i, %15 + %cmp.i = icmp ugt i64 %indvars.iv.next.i, %5 + br i1 %cmp.i, label %__omp_outlined__1.exit, label %omp.inner.for.body.i + +__omp_outlined__1.exit: ; preds = %omp.inner.for.body.i, %entry + call void @__kmpc_for_static_fini(%struct.ident_t* nonnull @0, i32 %0) #3 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %12) #3 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %11) #3 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %10) #3 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %9) #3 + ret void +} + +declare i32 @__kmpc_global_thread_num(%struct.ident_t*) local_unnamed_addr + +declare void @__kmpc_generic_kernel_parallel(i8*, i8*, i16, i16) local_unnamed_addr + +declare i32 @omp_get_team_num() local_unnamed_addr #2 + +declare void @__kmpc_generic_kernel_deinit(i16, i16) local_unnamed_addr + +declare void @__kmpc_barrier_simple_spmd(%struct.ident_t*, i32) local_unnamed_addr + +declare i1 @__kmpc_kernel_parallel(i8**, i16) local_unnamed_addr + +declare i8* @__kmpc_get_shared_variables() local_unnamed_addr + +declare void @__kmpc_kernel_end_parallel() local_unnamed_addr + +attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind } +attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { nounwind } + +!omp_offload.info = !{!0} +!nvvm.annotations = !{!1} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = !{i32 0, i32 24, i32 42282944, !"foo", i32 10, i32 0} +!1 = !{void (i32*)* @__omp_offloading_18_2852fc0_foo_l10, !"kernel", i32 1} +!2 = !{i32 1, !"wchar_size", i32 4} +!3 = !{i32 7, !"PIC Level", i32 2} +!4 = !{!"clang version 9.0.0 "} +!5 = !{!6, !6, i64 0} +!6 = !{!"int", !7, i64 0} +!7 = !{!"omnipotent char", !8, i64 0} +!8 = !{!"Simple C/C++ TBAA"} + +; __CLANG_OFFLOAD_BUNDLE____END__ openmp-nvptx64-nvida-cud + +; __CLANG_OFFLOAD_BUNDLE____START__ host-x86_64-unknown-linux-gnu +; ModuleID = '/tmp/johannes/target_offload_not_SPMD-ce0dd6.bc' 
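; Note on the two bundles in this file: the device module above implements the
; generic kernel state machine (every thread calls __kmpc_generic_kernel_init
; and is told whether it is the master or a worker), while the host module
; below drives the offload. On the host, __tgt_target_teams launches the
; registered device image and returns 0 on success; any other value triggers
; the compiler-emitted fallback that runs the same region on the host through
; __kmpc_push_num_teams and __kmpc_fork_teams (see @foo and @main below). A
; minimal C sketch of that host pattern, using hypothetical helper names in
; place of the libomptarget entry points:
;
;   void launch_target_region(int *dis) {
;     /* try the device first; 0 means the kernel ran there */
;     if (tgt_target_teams(region_id, /*num_teams=*/3, dis) != 0) {
;       push_num_teams(3);            /* hypothetical wrapper         */
;       fork_teams(outlined_fn, dis); /* host fallback of the region  */
;     }
;   }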
+source_filename = "../SPMD_examples/v0.3/target_offload_not_SPMD.c" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } +%struct.__tgt_offload_entry = type { i8*, i8*, i64, i32, i32 } +%struct.__tgt_device_image = type { i8*, i8*, %struct.__tgt_offload_entry*, %struct.__tgt_offload_entry* } +%struct.__tgt_bin_desc = type { i32, %struct.__tgt_device_image*, %struct.__tgt_offload_entry*, %struct.__tgt_offload_entry* } + +$.omp_offloading.descriptor_reg.nvptx64-nvida-cud = comdat any + +@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr global %struct.ident_t { i32 0, i32 2050, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@1 = private unnamed_addr global %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@2 = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@.__omp_offloading_18_2852fc0_foo_l10.region_id = weak constant i8 0 +@.offload_sizes = private unnamed_addr constant [1 x i64] [i64 40] +@.offload_maptypes = private unnamed_addr constant [1 x i64] [i64 35] +@.str.2 = private unnamed_addr constant [16 x i8] c"dis[%3i] = %4i\0A\00", align 1 +@.omp_offloading.entry_name = internal unnamed_addr constant [36 x i8] c"__omp_offloading_18_2852fc0_foo_l10\00" +@.omp_offloading.entry.__omp_offloading_18_2852fc0_foo_l10 = weak local_unnamed_addr constant %struct.__tgt_offload_entry { i8* @.__omp_offloading_18_2852fc0_foo_l10.region_id, i8* getelementptr inbounds ([36 x i8], [36 x i8]* @.omp_offloading.entry_name, i32 0, i32 0), i64 0, i32 0, i32 0 }, section ".omp_offloading.entries", align 1 +@.omp_offloading.entries_begin = external constant %struct.__tgt_offload_entry +@.omp_offloading.entries_end = external constant %struct.__tgt_offload_entry +@.omp_offloading.img_start.nvptx64-nvida-cud = extern_weak constant i8 +@.omp_offloading.img_end.nvptx64-nvida-cud = extern_weak constant i8 +@.omp_offloading.device_images = internal unnamed_addr constant [1 x %struct.__tgt_device_image] [%struct.__tgt_device_image { i8* @.omp_offloading.img_start.nvptx64-nvida-cud, i8* @.omp_offloading.img_end.nvptx64-nvida-cud, %struct.__tgt_offload_entry* @.omp_offloading.entries_begin, %struct.__tgt_offload_entry* @.omp_offloading.entries_end }], comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cud), align 8 +@.omp_offloading.descriptor = internal constant %struct.__tgt_bin_desc { i32 1, %struct.__tgt_device_image* getelementptr inbounds ([1 x %struct.__tgt_device_image], [1 x %struct.__tgt_device_image]* @.omp_offloading.device_images, i32 0, i32 0), %struct.__tgt_offload_entry* @.omp_offloading.entries_begin, %struct.__tgt_offload_entry* @.omp_offloading.entries_end }, comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cud), align 8 +@__dso_handle = external hidden global i8 +@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 0, void ()* @.omp_offloading.descriptor_reg.nvptx64-nvida-cud, i8* bitcast (void ()* @.omp_offloading.descriptor_reg.nvptx64-nvida-cud to i8*) }] + +; Function Attrs: nounwind uwtable +define dso_local void @foo(i32* %dis) local_unnamed_addr #0 { +entry: + %.offload_baseptrs = alloca [1 x i8*], align 8 + %.offload_ptrs = alloca [1 x i8*], 
align 8 + %0 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs, i64 0, i64 0 + %1 = bitcast [1 x i8*]* %.offload_baseptrs to i32** + store i32* %dis, i32** %1, align 8 + %2 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_ptrs, i64 0, i64 0 + %3 = bitcast [1 x i8*]* %.offload_ptrs to i32** + store i32* %dis, i32** %3, align 8 + %4 = call i32 @__tgt_target_teams(i64 -1, i8* nonnull @.__omp_offloading_18_2852fc0_foo_l10.region_id, i32 1, i8** nonnull %0, i8** nonnull %2, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0), i32 3, i32 0) #5 + %5 = icmp eq i32 %4, 0 + br i1 %5, label %omp_offload.cont, label %omp_offload.failed + +omp_offload.failed: ; preds = %entry + %6 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @2) #5 + %7 = call i32 @__kmpc_push_num_teams(%struct.ident_t* nonnull @2, i32 %6, i32 3, i32 0) #5 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%struct.ident_t* nonnull @2, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined. to void (i32*, i32*, ...)*), i32* %dis) #5 + br label %omp_offload.cont + +omp_offload.cont: ; preds = %entry, %omp_offload.failed + ret void +} + +; Function Attrs: norecurse nounwind uwtable +define internal void @.omp_outlined.(i32* noalias nocapture readonly %.global_tid., i32* noalias nocapture readnone %.bound_tid., i32* %dis) #1 { +entry: + %.omp.comb.lb = alloca i32, align 4 + %.omp.comb.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %0 = bitcast i32* %.omp.comb.lb to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %0) #5 + store i32 0, i32* %.omp.comb.lb, align 4, !tbaa !3 + %1 = bitcast i32* %.omp.comb.ub to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %1) #5 + store i32 9, i32* %.omp.comb.ub, align 4, !tbaa !3 + %2 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %2) #5 + store i32 1, i32* %.omp.stride, align 4, !tbaa !3 + %3 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %3) #5 + store i32 0, i32* %.omp.is_last, align 4, !tbaa !3 + %4 = load i32, i32* %.global_tid., align 4, !tbaa !3 + call void @__kmpc_for_static_init_4(%struct.ident_t* nonnull @0, i32 %4, i32 92, i32* nonnull %.omp.is_last, i32* nonnull %.omp.comb.lb, i32* nonnull %.omp.comb.ub, i32* nonnull %.omp.stride, i32 1, i32 1) #5 + %5 = load i32, i32* %.omp.comb.ub, align 4, !tbaa !3 + %6 = icmp slt i32 %5, 9 + %cond = select i1 %6, i32 %5, i32 9 + store i32 %cond, i32* %.omp.comb.ub, align 4, !tbaa !3 + %7 = load i32, i32* %.omp.comb.lb, align 4, !tbaa !3 + %cmp17 = icmp sgt i32 %7, %cond + br i1 %cmp17, label %omp.loop.exit, label %omp.inner.for.body.preheader + +omp.inner.for.body.preheader: ; preds = %entry + %8 = zext i32 %7 to i64 + %9 = zext i32 %cond to i64 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
@__kmpc_fork_call(%struct.ident_t* nonnull @2, i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64, i64, i32*)* @.omp_outlined..1 to void (i32*, i32*, ...)*), i64 %8, i64 %9, i32* %dis) #5 + %10 = load i32, i32* %.omp.stride, align 4, !tbaa !3 + %add9 = add nsw i32 %10, %7 + %11 = load i32, i32* %.omp.comb.ub, align 4, !tbaa !3 + %cmp110 = icmp sgt i32 %add9, %11 + br i1 %cmp110, label %omp.loop.exit, label %omp.inner.for.body.omp.inner.for.body_crit_edge + +omp.inner.for.body.omp.inner.for.body_crit_edge: ; preds = %omp.inner.for.body.preheader, %omp.inner.for.body.omp.inner.for.body_crit_edge + %12 = phi i32 [ %16, %omp.inner.for.body.omp.inner.for.body_crit_edge ], [ %11, %omp.inner.for.body.preheader ] + %add11 = phi i32 [ %add, %omp.inner.for.body.omp.inner.for.body_crit_edge ], [ %add9, %omp.inner.for.body.preheader ] + %.pre = load i32, i32* %.omp.comb.lb, align 4 + %13 = zext i32 %.pre to i64 + %14 = zext i32 %12 to i64 + call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @2, i32 3, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i64, i64, i32*)* @.omp_outlined..1 to void (i32*, i32*, ...)*), i64 %13, i64 %14, i32* %dis) #5 + %15 = load i32, i32* %.omp.stride, align 4, !tbaa !3 + %add = add nsw i32 %15, %add11 + %16 = load i32, i32* %.omp.comb.ub, align 4, !tbaa !3 + %cmp1 = icmp sgt i32 %add, %16 + br i1 %cmp1, label %omp.loop.exit, label %omp.inner.for.body.omp.inner.for.body_crit_edge + +omp.loop.exit: ; preds = %omp.inner.for.body.omp.inner.for.body_crit_edge, %omp.inner.for.body.preheader, %entry + call void @__kmpc_for_static_fini(%struct.ident_t* nonnull @0, i32 %4) #5 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %3) #5 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %2) #5 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %1) #5 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %0) #5 + %call = call i32 @omp_get_team_num() #5 + %idxprom = sext i32 %call to i64 + %arrayidx = getelementptr inbounds i32, i32* %dis, i64 %idxprom + %17 = load i32, i32* %arrayidx, align 4, !tbaa !3 + %add2 = add nsw i32 %17, 1 + store i32 %add2, i32* %arrayidx, align 4, !tbaa !3 + ret void +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #2 + +declare dso_local void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) local_unnamed_addr + +; Function Attrs: norecurse nounwind uwtable +define internal void @.omp_outlined..1(i32* noalias nocapture readonly %.global_tid., i32* noalias nocapture readnone %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., i32* nocapture %dis) #1 { +entry: + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %0 = bitcast i32* %.omp.lb to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %0) #5 + %1 = bitcast i32* %.omp.ub to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %1) #5 + %conv = trunc i64 %.previous.lb. to i32 + %conv1 = trunc i64 %.previous.ub. 
to i32 + store i32 %conv, i32* %.omp.lb, align 4, !tbaa !3 + store i32 %conv1, i32* %.omp.ub, align 4, !tbaa !3 + %2 = bitcast i32* %.omp.stride to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %2) #5 + store i32 1, i32* %.omp.stride, align 4, !tbaa !3 + %3 = bitcast i32* %.omp.is_last to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %3) #5 + store i32 0, i32* %.omp.is_last, align 4, !tbaa !3 + %4 = load i32, i32* %.global_tid., align 4, !tbaa !3 + call void @__kmpc_for_static_init_4(%struct.ident_t* nonnull @1, i32 %4, i32 34, i32* nonnull %.omp.is_last, i32* nonnull %.omp.lb, i32* nonnull %.omp.ub, i32* nonnull %.omp.stride, i32 1, i32 1) #5 + %5 = load i32, i32* %.omp.ub, align 4, !tbaa !3 + %6 = icmp slt i32 %5, 9 + %cond = select i1 %6, i32 %5, i32 9 + store i32 %cond, i32* %.omp.ub, align 4, !tbaa !3 + %7 = load i32, i32* %.omp.lb, align 4, !tbaa !3 + %cmp310 = icmp sgt i32 %7, %cond + br i1 %cmp310, label %omp.loop.exit, label %omp.inner.for.body.preheader + +omp.inner.for.body.preheader: ; preds = %entry + %8 = sext i32 %7 to i64 + %9 = sext i32 %cond to i64 + br label %omp.inner.for.body + +omp.inner.for.body: ; preds = %omp.inner.for.body.preheader, %omp.inner.for.body + %indvars.iv = phi i64 [ %8, %omp.inner.for.body.preheader ], [ %indvars.iv.next, %omp.inner.for.body ] + %arrayidx = getelementptr inbounds i32, i32* %dis, i64 %indvars.iv + %10 = atomicrmw add i32* %arrayidx, i32 1 monotonic + %indvars.iv.next = add nsw i64 %indvars.iv, 1 + %cmp3 = icmp slt i64 %indvars.iv, %9 + br i1 %cmp3, label %omp.inner.for.body, label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.body, %entry + call void @__kmpc_for_static_fini(%struct.ident_t* nonnull @0, i32 %4) #5 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %3) #5 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %2) #5 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %1) #5 + call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %0) #5 + ret void +} + +declare dso_local void @__kmpc_for_static_fini(%struct.ident_t*, i32) local_unnamed_addr + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #2 + +declare !callback !7 dso_local void @__kmpc_fork_call(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) local_unnamed_addr + +declare dso_local i32 @omp_get_team_num() local_unnamed_addr #3 + +declare dso_local i32 @__kmpc_global_thread_num(%struct.ident_t*) local_unnamed_addr + +declare dso_local i32 @__kmpc_push_num_teams(%struct.ident_t*, i32, i32, i32) local_unnamed_addr + +declare !callback !7 dso_local void @__kmpc_fork_teams(%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) 
local_unnamed_addr
+
+declare dso_local i32 @__tgt_target_teams(i64, i8*, i32, i8**, i8**, i64*, i64*, i32, i32) local_unnamed_addr
+
+; Function Attrs: nounwind uwtable
+define dso_local i32 @main() local_unnamed_addr #0 {
+entry:
+  %.offload_baseptrs.i = alloca [1 x i8*], align 8
+  %.offload_ptrs.i = alloca [1 x i8*], align 8
+  %dis = alloca [10 x i32], align 16
+  %0 = bitcast [10 x i32]* %dis to i8*
+  call void @llvm.lifetime.start.p0i8(i64 40, i8* nonnull %0) #5
+  %arrayidx = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 0
+  %arrayidx.1 = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 1
+  %arrayidx.2 = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 2
+  %arrayidx.3 = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 3
+  %1 = bitcast [10 x i32]* %dis to <4 x i32>*
+  store <4 x i32> <i32 0, i32 1, i32 2, i32 3>, <4 x i32>* %1, align 16, !tbaa !3
+  %arrayidx.4 = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 4
+  %arrayidx.5 = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 5
+  %arrayidx.6 = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 6
+  %arrayidx.7 = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 7
+  %2 = bitcast i32* %arrayidx.4 to <4 x i32>*
+  store <4 x i32> <i32 4, i32 5, i32 6, i32 7>, <4 x i32>* %2, align 16, !tbaa !3
+  %arrayidx.8 = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 8
+  store i32 8, i32* %arrayidx.8, align 16, !tbaa !3
+  %arrayidx.9 = getelementptr inbounds [10 x i32], [10 x i32]* %dis, i64 0, i64 9
+  store i32 9, i32* %arrayidx.9, align 4, !tbaa !3
+  %3 = bitcast [1 x i8*]* %.offload_baseptrs.i to i8*
+  call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %3)
+  %4 = bitcast [1 x i8*]* %.offload_ptrs.i to i8*
+  call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %4)
+  %5 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_baseptrs.i, i64 0, i64 0
+  %6 = bitcast [1 x i8*]* %.offload_baseptrs.i to i32**
+  store i32* %arrayidx, i32** %6, align 8
+  %7 = getelementptr inbounds [1 x i8*], [1 x i8*]* %.offload_ptrs.i, i64 0, i64 0
+  %8 = bitcast [1 x i8*]* %.offload_ptrs.i to i32**
+  store i32* %arrayidx, i32** %8, align 8
+  %9 = call i32 @__tgt_target_teams(i64 -1, i8* nonnull @.__omp_offloading_18_2852fc0_foo_l10.region_id, i32 1, i8** nonnull %5, i8** nonnull %7, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_sizes, i64 0, i64 0), i64* getelementptr inbounds ([1 x i64], [1 x i64]* @.offload_maptypes, i64 0, i64 0), i32 3, i32 0) #5
+  %10 = icmp eq i32 %9, 0
+  br i1 %10, label %foo.exit, label %omp_offload.failed.i
+
+omp_offload.failed.i: ; preds = %entry
+  %11 = call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @2) #5
+  %12 = call i32 @__kmpc_push_num_teams(%struct.ident_t* nonnull @2, i32 %11, i32 3, i32 0) #5
+  call void (%struct.ident_t*, i32, void (i32*, i32*, ...)*, ...) @__kmpc_fork_teams(%struct.ident_t* nonnull @2, i32 1, void (i32*, i32*, ...)* bitcast (void (i32*, i32*, i32*)* @.omp_outlined. to void (i32*, i32*, ...)*), i32* nonnull %arrayidx) #5
+  br label %foo.exit
+
+foo.exit: ; preds = %entry, %omp_offload.failed.i
+  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %3)
+  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %4)
+  %13 = load i32, i32* %arrayidx, align 16, !tbaa !3
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 0, i32 %13)
+  %14 = load i32, i32* %arrayidx.1, align 4, !tbaa !3
+  %call.1 = call i32 (i8*, ...)
@printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 1, i32 %14) + %15 = load i32, i32* %arrayidx.2, align 8, !tbaa !3 + %call.2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 2, i32 %15) + %16 = load i32, i32* %arrayidx.3, align 4, !tbaa !3 + %call.3 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 3, i32 %16) + %17 = load i32, i32* %arrayidx.4, align 16, !tbaa !3 + %call.4 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 4, i32 %17) + %18 = load i32, i32* %arrayidx.5, align 4, !tbaa !3 + %call.5 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 5, i32 %18) + %19 = load i32, i32* %arrayidx.6, align 8, !tbaa !3 + %call.6 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 6, i32 %19) + %20 = load i32, i32* %arrayidx.7, align 4, !tbaa !3 + %call.7 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 7, i32 %20) + %21 = load i32, i32* %arrayidx.8, align 16, !tbaa !3 + %call.8 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 8, i32 %21) + %22 = load i32, i32* %arrayidx.9, align 4, !tbaa !3 + %call.9 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.2, i64 0, i64 0), i32 9, i32 %22) + call void @llvm.lifetime.end.p0i8(i64 40, i8* nonnull %0) #5 + ret i32 0 +} + +; Function Attrs: nounwind +declare dso_local i32 @printf(i8* nocapture readonly, ...) local_unnamed_addr #4 + +; Function Attrs: nounwind uwtable +define internal void @.omp_offloading.descriptor_unreg(i8* nocapture readnone) #0 section ".text.startup" comdat($.omp_offloading.descriptor_reg.nvptx64-nvida-cud) { +entry: + %1 = tail call i32 @__tgt_unregister_lib(%struct.__tgt_bin_desc* nonnull @.omp_offloading.descriptor) #5 + ret void +} + +declare dso_local i32 @__tgt_unregister_lib(%struct.__tgt_bin_desc*) local_unnamed_addr + +; Function Attrs: nounwind uwtable +define linkonce hidden void @.omp_offloading.descriptor_reg.nvptx64-nvida-cud() #0 section ".text.startup" comdat { +entry: + %0 = tail call i32 @__tgt_register_lib(%struct.__tgt_bin_desc* nonnull @.omp_offloading.descriptor) #5 + %1 = tail call i32 @__cxa_atexit(void (i8*)* nonnull @.omp_offloading.descriptor_unreg, i8* bitcast (%struct.__tgt_bin_desc* @.omp_offloading.descriptor to i8*), i8* nonnull @__dso_handle) #5 + ret void +} + +declare dso_local i32 @__tgt_register_lib(%struct.__tgt_bin_desc*) local_unnamed_addr + +; Function Attrs: nounwind +declare dso_local i32 @__cxa_atexit(void (i8*)*, i8*, i8*) local_unnamed_addr #5 + +attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" 
"no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { argmemonly nounwind } +attributes #3 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #5 = { nounwind } + +!omp_offload.info = !{!0} +!llvm.module.flags = !{!1} +!llvm.ident = !{!2} + +!0 = !{i32 0, i32 24, i32 42282944, !"foo", i32 10, i32 0} +!1 = !{i32 1, !"wchar_size", i32 4} +!2 = !{!"clang version 9.0.0 "} +!3 = !{!4, !4, i64 0} +!4 = !{!"int", !5, i64 0} +!5 = !{!"omnipotent char", !6, i64 0} +!6 = !{!"Simple C/C++ TBAA"} +!7 = !{!8} +!8 = !{i64 2, i64 -1, i64 -1, i1 true} + +; __CLANG_OFFLOAD_BUNDLE____END__ host-x86_64-unknown-linux-gnu Index: SPMD_examples/v0.3/target_offload_not_SPMD.c =================================================================== --- /dev/null +++ SPMD_examples/v0.3/target_offload_not_SPMD.c @@ -0,0 +1,33 @@ +#include +#include +#include + +#define N 10 +#define TEAMS 3 + +void foo(int* dis) { + + #pragma omp target teams num_teams(TEAMS) map(tofrom:dis[:N]) + { + #pragma omp distribute parallel for firstprivate(dis) + for (int i = 0; i < N; i++) + #pragma omp atomic + dis[i] += 1; + + dis[omp_get_team_num()] += 1; + } +} + +int main() { + int dis[N]; + + for (int i = 0; i < N; i++) + dis[i] = i; + + foo(dis); + + for (int i = 0; i < N; i++) + printf("dis[%3i] = %4i\n", i, dis[i]); + + return 0; +} Index: clang/lib/CodeGen/CGOpenMPRuntime.h =================================================================== --- clang/lib/CodeGen/CGOpenMPRuntime.h +++ clang/lib/CodeGen/CGOpenMPRuntime.h @@ -211,6 +211,10 @@ ~DisableAutoDeclareTargetRAII(); }; + /// Emits \p Callee function call with arguments \p Args with location \p Loc. + void emitCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *Callee, + ArrayRef Args = llvm::None) const; + protected: CodeGenModule &CGM; StringRef FirstSeparator, Separator; @@ -270,10 +274,6 @@ // virtual StringRef getOutlinedHelperName() const { return ".omp_outlined."; } - /// Emits \p Callee function call with arguments \p Args with location \p Loc. - void emitCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *Callee, - ArrayRef Args = llvm::None) const; - /// Emits address of the word in a memory where current thread id is /// stored. 
   virtual Address emitThreadIDAddress(CodeGenFunction &CGF, SourceLocation Loc);
Index: clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
===================================================================
--- clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
+++ clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.h
@@ -34,47 +34,15 @@
     EM_Unknown,
   };
 
 private:
-  /// Parallel outlined function work for workers to execute.
-  llvm::SmallVector<llvm::Function *, 16> Work;
 
   struct EntryFunctionState {
     llvm::BasicBlock *ExitBB = nullptr;
   };
 
-  class WorkerFunctionState {
-  public:
-    llvm::Function *WorkerFn;
-    const CGFunctionInfo &CGFI;
-    SourceLocation Loc;
-
-    WorkerFunctionState(CodeGenModule &CGM, SourceLocation Loc);
-
-  private:
-    void createWorkerFunction(CodeGenModule &CGM);
-  };
-
   ExecutionMode getExecutionMode() const;
 
   bool requiresFullRuntime() const { return RequiresFullRuntime; }
 
-  /// Get barrier to synchronize all threads in a block.
-  void syncCTAThreads(CodeGenFunction &CGF);
-
-  /// Emit the worker function for the current target region.
-  void emitWorkerFunction(WorkerFunctionState &WST);
-
-  /// Helper for worker function. Emit body of worker loop.
-  void emitWorkerLoop(CodeGenFunction &CGF, WorkerFunctionState &WST);
-
-  /// Helper for non-SPMD target entry function. Guide the master and
-  /// worker threads to their respective locations.
-  void emitNonSPMDEntryHeader(CodeGenFunction &CGF, EntryFunctionState &EST,
-                              WorkerFunctionState &WST);
-
-  /// Signal termination of OMP execution for non-SPMD target entry
-  /// function.
-  void emitNonSPMDEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST);
-
   /// Helper for generic variables globalization prolog.
   void emitGenericVarsProlog(CodeGenFunction &CGF, SourceLocation Loc,
                              bool WithSPMDCheck = false);
@@ -82,12 +50,13 @@
   /// Helper for generic variables globalization epilog.
   void emitGenericVarsEpilog(CodeGenFunction &CGF, bool WithSPMDCheck = false);
 
-  /// Helper for SPMD mode target directive's entry function.
-  void emitSPMDEntryHeader(CodeGenFunction &CGF, EntryFunctionState &EST,
-                           const OMPExecutableDirective &D);
+  /// Helper for generic kernel mode target directive's entry function.
+  void emitGenericEntryHeader(CodeGenFunction &CGF, EntryFunctionState &EST,
+                              const OMPExecutableDirective &D, bool IsSPMD);
 
-  /// Signal termination of SPMD mode execution.
-  void emitSPMDEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST);
+  /// Signal termination of generic mode execution.
+  void emitGenericEntryFooter(CodeGenFunction &CGF, EntryFunctionState &EST,
+                              bool IsSPMD);
 
   //
   // Base class overrides.
@@ -99,20 +68,6 @@
                           uint64_t Size, int32_t Flags,
                           llvm::GlobalValue::LinkageTypes Linkage) override;
 
-  /// Emit outlined function specialized for the Fork-Join
-  /// programming model for applicable target directives on the NVPTX device.
-  /// \param D Directive to emit.
-  /// \param ParentName Name of the function that encloses the target region.
-  /// \param OutlinedFn Outlined function value to be defined by this call.
-  /// \param OutlinedFnID Outlined function ID value to be defined by this call.
-  /// \param IsOffloadEntry True if the outlined function is an offload entry.
-  /// An outlined function may not be an entry if, e.g. the if clause always
-  /// evaluates to false.
-  void emitNonSPMDKernel(const OMPExecutableDirective &D, StringRef ParentName,
-                         llvm::Function *&OutlinedFn,
-                         llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
-                         const RegionCodeGenTy &CodeGen);
-
   /// Emit outlined function specialized for the Single Program
   /// Multiple Data programming model for applicable target directives on the
   /// NVPTX device.
@@ -121,13 +76,14 @@
   /// \param OutlinedFn Outlined function value to be defined by this call.
   /// \param OutlinedFnID Outlined function ID value to be defined by this call.
   /// \param IsOffloadEntry True if the outlined function is an offload entry.
+  /// \param IsSPMD True if the kernel is known to be executed in SPMD mode.
   /// \param CodeGen Object containing the target statements.
   /// An outlined function may not be an entry if, e.g. the if clause always
   /// evaluates to false.
-  void emitSPMDKernel(const OMPExecutableDirective &D, StringRef ParentName,
-                      llvm::Function *&OutlinedFn,
-                      llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
-                      const RegionCodeGenTy &CodeGen);
+  void emitGenericKernel(const OMPExecutableDirective &D, StringRef ParentName,
+                         llvm::Function *&OutlinedFn,
+                         llvm::Constant *&OutlinedFnID, bool IsOffloadEntry,
+                         bool IsSPMD, const RegionCodeGenTy &CodeGen);
 
   /// Emit outlined function for 'target' directive on the NVPTX
   /// device.
@@ -145,21 +101,6 @@
                                   bool IsOffloadEntry,
                                   const RegionCodeGenTy &CodeGen) override;
 
-  /// Emits code for parallel or serial call of the \a OutlinedFn with
-  /// variables captured in a record which address is stored in \a
-  /// CapturedStruct.
-  /// This call is for the Non-SPMD Execution Mode.
-  /// \param OutlinedFn Outlined function to be run in parallel threads. Type of
-  /// this function is void(*)(kmp_int32 *, kmp_int32, struct context_vars*).
-  /// \param CapturedVars A pointer to the record with the references to
-  /// variables used in \a OutlinedFn function.
-  /// \param IfCond Condition in the associated 'if' clause, if it was
-  /// specified, nullptr otherwise.
-  void emitNonSPMDParallelCall(CodeGenFunction &CGF, SourceLocation Loc,
-                               llvm::Value *OutlinedFn,
-                               ArrayRef<llvm::Value *> CapturedVars,
-                               const Expr *IfCond);
-
   /// Emits code for parallel or serial call of the \a OutlinedFn with
   /// variables captured in a record which address is stored in \a
   /// CapturedStruct.
@@ -170,11 +111,12 @@
   /// variables used in \a OutlinedFn function.
   /// \param IfCond Condition in the associated 'if' clause, if it was
   /// specified, nullptr otherwise.
+  /// \param IsSPMD True if the kernel is known to be executed in SPMD mode.
   ///
-  void emitSPMDParallelCall(CodeGenFunction &CGF, SourceLocation Loc,
-                            llvm::Value *OutlinedFn,
-                            ArrayRef<llvm::Value *> CapturedVars,
-                            const Expr *IfCond);
+  void emitGenericParallelCall(CodeGenFunction &CGF, SourceLocation Loc,
+                               llvm::Value *OutlinedFn,
+                               ArrayRef<llvm::Value *> CapturedVars,
+                               const Expr *IfCond, bool IsSPMD);
 
 protected:
   /// Get the function name of an outlined region.
@@ -406,15 +348,20 @@
   /// true if we're definitely in the parallel region.
   bool IsInParallelRegion = false;
 
-  /// Map between an outlined function and its wrapper.
-  llvm::DenseMap<llvm::Function *, llvm::Function *> WrapperFunctionsMap;
+  /// Map between an outlined function and its wrapper + shared struct type.
+  struct WrapperInfo {
+    llvm::Function *WrapperFn;
+    llvm::StructType *SharedStructTy;
+  };
+
+  llvm::DenseMap<llvm::Function *, WrapperInfo> WrapperInfoMap;
 
   /// Emit function which wraps the outline parallel region
   /// and controls the parameters which are passed to this function.
/// The wrapper ensures that the outlined function is called /// with the correct arguments when data is shared. - llvm::Function *createParallelDataSharingWrapper( - llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D); + void createParallelDataSharingWrapper(llvm::Function *OutlinedParallelFn, + const OMPExecutableDirective &D); /// The data for the single globalized variable. struct MappedVarData { Index: clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp =================================================================== --- clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp +++ clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp @@ -24,25 +24,6 @@ namespace { enum OpenMPRTLFunctionNVPTX { - /// Call to void __kmpc_kernel_init(kmp_int32 thread_limit, - /// int16_t RequiresOMPRuntime); - OMPRTL_NVPTX__kmpc_kernel_init, - /// Call to void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized); - OMPRTL_NVPTX__kmpc_kernel_deinit, - /// Call to void __kmpc_spmd_kernel_init(kmp_int32 thread_limit, - /// int16_t RequiresOMPRuntime, int16_t RequiresDataSharing); - OMPRTL_NVPTX__kmpc_spmd_kernel_init, - /// Call to void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime); - OMPRTL_NVPTX__kmpc_spmd_kernel_deinit_v2, - /// Call to void __kmpc_kernel_prepare_parallel(void - /// *outlined_function, int16_t - /// IsOMPRuntimeInitialized); - OMPRTL_NVPTX__kmpc_kernel_prepare_parallel, - /// Call to bool __kmpc_kernel_parallel(void **outlined_function, - /// int16_t IsOMPRuntimeInitialized); - OMPRTL_NVPTX__kmpc_kernel_parallel, - /// Call to void __kmpc_kernel_end_parallel(); - OMPRTL_NVPTX__kmpc_kernel_end_parallel, /// Call to void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 /// global_tid); OMPRTL_NVPTX__kmpc_serialized_parallel, @@ -69,22 +50,11 @@ OMPRTL_NVPTX__kmpc_nvptx_teams_end_reduce_nowait_simple, /// Call to __kmpc_nvptx_end_reduce_nowait(int32_t global_tid); OMPRTL_NVPTX__kmpc_end_reduce_nowait, - /// Call to void __kmpc_data_sharing_init_stack(); - OMPRTL_NVPTX__kmpc_data_sharing_init_stack, - /// Call to void __kmpc_data_sharing_init_stack_spmd(); - OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd, /// Call to void* __kmpc_data_sharing_coalesced_push_stack(size_t size, /// int16_t UseSharedMemory); OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack, /// Call to void __kmpc_data_sharing_pop_stack(void *a); OMPRTL_NVPTX__kmpc_data_sharing_pop_stack, - /// Call to void __kmpc_begin_sharing_variables(void ***args, - /// size_t n_args); - OMPRTL_NVPTX__kmpc_begin_sharing_variables, - /// Call to void __kmpc_end_sharing_variables(); - OMPRTL_NVPTX__kmpc_end_sharing_variables, - /// Call to void __kmpc_get_shared_variables(void ***GlobalArgs) - OMPRTL_NVPTX__kmpc_get_shared_variables, /// Call to uint16_t __kmpc_parallel_level(ident_t *loc, kmp_int32 /// global_tid); OMPRTL_NVPTX__kmpc_parallel_level, @@ -101,6 +71,15 @@ /// Call to void __kmpc_barrier_simple_spmd(ident_t *loc, kmp_int32 /// global_tid); OMPRTL__kmpc_barrier_simple_spmd, + /// Call to int16_t __kmpc_generic_kernel_init(int16_t IsSPMD, int16_t + /// RequiresOMPRuntime, int16_t RequiresDataSharing) + OMPRTL_NVPTX__kmpc_generic_kernel_init, + /// Call to void __kmpc_generic_kernel_deinit(int16_t IsSPMD, int16_t + /// RequiredOMPRuntime) + OMPRTL_NVPTX__kmpc_generic_kernel_deinit, + /// Call to void __kmpc_generic_kernel_parallel(void *OutlinedFn, + /// void *Payload, int16_t PayloadBytes, int16_t RequiredOMPRuntime) + OMPRTL_NVPTX__kmpc_generic_kernel_parallel, }; /// Pre(post)-action for different OpenMP constructs 
specialized for NVPTX. @@ -160,11 +139,11 @@ } /// Constructor for SPMD mode. ExecutionRuntimeModesRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &ExecMode, - bool &RuntimeMode, bool FullRuntimeMode) + bool &RuntimeMode, bool FullRuntimeMode, bool IsSPMD) : ExecMode(ExecMode), RuntimeMode(&RuntimeMode) { SavedExecMode = ExecMode; SavedRuntimeMode = RuntimeMode; - ExecMode = CGOpenMPRuntimeNVPTX::EM_SPMD; + ExecMode = IsSPMD ? CGOpenMPRuntimeNVPTX::EM_SPMD : CGOpenMPRuntimeNVPTX::EM_NonSPMD; RuntimeMode = FullRuntimeMode; } ~ExecutionRuntimeModesRAII() { @@ -643,56 +622,6 @@ "nvptx_num_threads"); } -/// Get the value of the thread_limit clause in the teams directive. -/// For the 'generic' execution mode, the runtime encodes thread_limit in -/// the launch parameters, always starting thread_limit+warpSize threads per -/// CTA. The threads in the last warp are reserved for master execution. -/// For the 'spmd' execution mode, all threads in a CTA are part of the team. -static llvm::Value *getThreadLimit(CodeGenFunction &CGF, - bool IsInSPMDExecutionMode = false) { - CGBuilderTy &Bld = CGF.Builder; - return IsInSPMDExecutionMode - ? getNVPTXNumThreads(CGF) - : Bld.CreateNUWSub(getNVPTXNumThreads(CGF), getNVPTXWarpSize(CGF), - "thread_limit"); -} - -/// Get the thread id of the OMP master thread. -/// The master thread id is the first thread (lane) of the last warp in the -/// GPU block. Warp size is assumed to be some power of 2. -/// Thread id is 0 indexed. -/// E.g: If NumThreads is 33, master id is 32. -/// If NumThreads is 64, master id is 32. -/// If NumThreads is 1024, master id is 992. -static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) { - CGBuilderTy &Bld = CGF.Builder; - llvm::Value *NumThreads = getNVPTXNumThreads(CGF); - - // We assume that the warp size is a power of 2. - llvm::Value *Mask = Bld.CreateNUWSub(getNVPTXWarpSize(CGF), Bld.getInt32(1)); - - return Bld.CreateAnd(Bld.CreateNUWSub(NumThreads, Bld.getInt32(1)), - Bld.CreateNot(Mask), "master_tid"); -} - -CGOpenMPRuntimeNVPTX::WorkerFunctionState::WorkerFunctionState( - CodeGenModule &CGM, SourceLocation Loc) - : WorkerFn(nullptr), CGFI(CGM.getTypes().arrangeNullaryFunction()), - Loc(Loc) { - createWorkerFunction(CGM); -} - -void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction( - CodeGenModule &CGM) { - // Create an worker function with no arguments. - - WorkerFn = llvm::Function::Create( - CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, - /*placeholder=*/"_worker", &CGM.getModule()); - CGM.SetInternalFunctionAttributes(GlobalDecl(), WorkerFn, CGFI); - WorkerFn->setDoesNotRecurse(); -} - CGOpenMPRuntimeNVPTX::ExecutionMode CGOpenMPRuntimeNVPTX::getExecutionMode() const { return CurrentExecutionMode; @@ -1159,149 +1088,18 @@ "Unknown programming model for OpenMP directive on NVPTX target."); } -void CGOpenMPRuntimeNVPTX::emitNonSPMDKernel(const OMPExecutableDirective &D, +void CGOpenMPRuntimeNVPTX::emitGenericKernel(const OMPExecutableDirective &D, StringRef ParentName, llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID, - bool IsOffloadEntry, + bool IsOffloadEntry, bool IsSPMD, const RegionCodeGenTy &CodeGen) { - ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode); - EntryFunctionState EST; - WorkerFunctionState WST(CGM, D.getBeginLoc()); - Work.clear(); - WrapperFunctionsMap.clear(); - - // Emit target region as a standalone region. 
- class NVPTXPrePostActionTy : public PrePostActionTy { - CGOpenMPRuntimeNVPTX::EntryFunctionState &EST; - CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST; - - public: - NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX::EntryFunctionState &EST, - CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST) - : EST(EST), WST(WST) {} - void Enter(CodeGenFunction &CGF) override { - auto &RT = - static_cast(CGF.CGM.getOpenMPRuntime()); - RT.emitNonSPMDEntryHeader(CGF, EST, WST); - // Skip target region initialization. - RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true); - } - void Exit(CodeGenFunction &CGF) override { - auto &RT = - static_cast(CGF.CGM.getOpenMPRuntime()); - RT.clearLocThreadIdInsertPt(CGF); - RT.emitNonSPMDEntryFooter(CGF, EST); - } - } Action(EST, WST); - CodeGen.setAction(Action); - IsInTTDRegion = true; - // Reserve place for the globalized memory. - GlobalizedRecords.emplace_back(); - if (!KernelStaticGlobalized) { - KernelStaticGlobalized = new llvm::GlobalVariable( - CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false, - llvm::GlobalValue::InternalLinkage, - llvm::ConstantPointerNull::get(CGM.VoidPtrTy), - "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr, - llvm::GlobalValue::NotThreadLocal, - CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared)); - } - emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID, - IsOffloadEntry, CodeGen); - IsInTTDRegion = false; - - // Now change the name of the worker function to correspond to this target - // region's entry function. - WST.WorkerFn->setName(Twine(OutlinedFn->getName(), "_worker")); - - // Create the worker function - emitWorkerFunction(WST); -} - -// Setup NVPTX threads for master-worker OpenMP scheme. -void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryHeader(CodeGenFunction &CGF, - EntryFunctionState &EST, - WorkerFunctionState &WST) { - CGBuilderTy &Bld = CGF.Builder; - - llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker"); - llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck"); - llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master"); - EST.ExitBB = CGF.createBasicBlock(".exit"); - - llvm::Value *IsWorker = - Bld.CreateICmpULT(getNVPTXThreadID(CGF), getThreadLimit(CGF)); - Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB); - - CGF.EmitBlock(WorkerBB); - emitCall(CGF, WST.Loc, WST.WorkerFn); - CGF.EmitBranch(EST.ExitBB); - - CGF.EmitBlock(MasterCheckBB); - llvm::Value *IsMaster = - Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF)); - Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB); - - CGF.EmitBlock(MasterBB); - IsInTargetMasterThreadRegion = true; - // SEQUENTIAL (MASTER) REGION START - // First action in sequential region: - // Initialize the state of the OpenMP runtime library on the GPU. - // TODO: Optimize runtime initialization and pass in correct value. - llvm::Value *Args[] = {getThreadLimit(CGF), - Bld.getInt16(/*RequiresOMPRuntime=*/1)}; - CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), Args); - - // For data sharing, we need to initialize the stack. 
- CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction( - OMPRTL_NVPTX__kmpc_data_sharing_init_stack)); - - emitGenericVarsProlog(CGF, WST.Loc); -} - -void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryFooter(CodeGenFunction &CGF, - EntryFunctionState &EST) { - IsInTargetMasterThreadRegion = false; - if (!CGF.HaveInsertPoint()) - return; - - emitGenericVarsEpilog(CGF); - - if (!EST.ExitBB) - EST.ExitBB = CGF.createBasicBlock(".exit"); - - llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier"); - CGF.EmitBranch(TerminateBB); - - CGF.EmitBlock(TerminateBB); - // Signal termination condition. - // TODO: Optimize runtime initialization and pass in correct value. - llvm::Value *Args[] = {CGF.Builder.getInt16(/*IsOMPRuntimeInitialized=*/1)}; - CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_deinit), Args); - // Barrier to terminate worker threads. - syncCTAThreads(CGF); - // Master thread jumps to exit point. - CGF.EmitBranch(EST.ExitBB); - - CGF.EmitBlock(EST.ExitBB); - EST.ExitBB = nullptr; -} - -void CGOpenMPRuntimeNVPTX::emitSPMDKernel(const OMPExecutableDirective &D, - StringRef ParentName, - llvm::Function *&OutlinedFn, - llvm::Constant *&OutlinedFnID, - bool IsOffloadEntry, - const RegionCodeGenTy &CodeGen) { ExecutionRuntimeModesRAII ModeRAII( CurrentExecutionMode, RequiresFullRuntime, CGM.getLangOpts().OpenMPCUDAForceFullRuntime || - !supportsLightweightRuntime(CGM.getContext(), D)); + !supportsLightweightRuntime(CGM.getContext(), D), IsSPMD); EntryFunctionState EST; + WrapperInfoMap.clear(); // Emit target region as a standalone region. class NVPTXPrePostActionTy : public PrePostActionTy { @@ -1309,21 +1107,24 @@ CGOpenMPRuntimeNVPTX::EntryFunctionState &EST; const OMPExecutableDirective &D; + /// Flag that is set if this is already known to be executed in SPMD mode. + bool IsSPMD; + public: NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT, CGOpenMPRuntimeNVPTX::EntryFunctionState &EST, - const OMPExecutableDirective &D) - : RT(RT), EST(EST), D(D) {} + const OMPExecutableDirective &D, bool IsSPMD) + : RT(RT), EST(EST), D(D), IsSPMD(IsSPMD) {} void Enter(CodeGenFunction &CGF) override { - RT.emitSPMDEntryHeader(CGF, EST, D); + RT.emitGenericEntryHeader(CGF, EST, D, IsSPMD); // Skip target region initialization. RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true); } void Exit(CodeGenFunction &CGF) override { RT.clearLocThreadIdInsertPt(CGF); - RT.emitSPMDEntryFooter(CGF, EST); + RT.emitGenericEntryFooter(CGF, EST, IsSPMD); } - } Action(*this, EST, D); + } Action(*this, EST, D, IsSPMD); CodeGen.setAction(Action); IsInTTDRegion = true; // Reserve place for the globalized memory. @@ -1342,37 +1143,37 @@ IsInTTDRegion = false; } -void CGOpenMPRuntimeNVPTX::emitSPMDEntryHeader( +void CGOpenMPRuntimeNVPTX::emitGenericEntryHeader( CodeGenFunction &CGF, EntryFunctionState &EST, - const OMPExecutableDirective &D) { + const OMPExecutableDirective &D, bool IsSPMD) { CGBuilderTy &Bld = CGF.Builder; // Setup BBs in entry function. llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute"); EST.ExitBB = CGF.createBasicBlock(".exit"); - llvm::Value *Args[] = {getThreadLimit(CGF, /*IsInSPMDExecutionMode=*/true), + llvm::Value *Args[] = {/* IsSPMD = */ Bld.getInt16(IsSPMD ? 1 : 0), + /* UseSP */ Bld.getInt16(1), /*RequiresOMPRuntime=*/ Bld.getInt16(RequiresFullRuntime ? 
1 : 0), /*RequiresDataSharing=*/Bld.getInt16(0)}; - CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_init), Args); - - if (RequiresFullRuntime) { - // For data sharing, we need to initialize the stack. - CGF.EmitRuntimeCall(createNVPTXRuntimeFunction( - OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd)); - } + llvm::Value *ExecuteFlag = CGF.EmitRuntimeCall( + createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_generic_kernel_init), Args); + llvm::Value *ExecuteCnd = Bld.CreateICmpEQ(ExecuteFlag, Bld.getInt16(1)); - CGF.EmitBranch(ExecuteBB); + Bld.CreateCondBr(ExecuteCnd, ExecuteBB, EST.ExitBB); CGF.EmitBlock(ExecuteBB); IsInTargetMasterThreadRegion = true; + + if (!IsSPMD) + emitGenericVarsProlog(CGF, D.getBeginLoc()); } -void CGOpenMPRuntimeNVPTX::emitSPMDEntryFooter(CodeGenFunction &CGF, - EntryFunctionState &EST) { +void CGOpenMPRuntimeNVPTX::emitGenericEntryFooter(CodeGenFunction &CGF, + EntryFunctionState &EST, + bool IsSPMD) { IsInTargetMasterThreadRegion = false; if (!CGF.HaveInsertPoint()) return; @@ -1385,11 +1186,16 @@ CGF.EmitBlock(OMPDeInitBB); // DeInitialize the OMP state in the runtime; called by all active threads. - llvm::Value *Args[] = {/*RequiresOMPRuntime=*/ + llvm::Value *Args[] = {/* IsSPMD = */ CGF.Builder.getInt16( + IsSPMD ? 1 : 0), /* RequiresOMPRuntime = */ CGF.Builder.getInt16(RequiresFullRuntime ? 1 : 0)}; + + if (!IsSPMD) + emitGenericVarsEpilog(CGF); + CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction( - OMPRTL_NVPTX__kmpc_spmd_kernel_deinit_v2), Args); + createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_generic_kernel_deinit), + Args); CGF.EmitBranch(EST.ExitBB); CGF.EmitBlock(EST.ExitBB); @@ -1412,136 +1218,6 @@ CGM.addCompilerUsedGlobal(GVMode); } -void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) { - ASTContext &Ctx = CGM.getContext(); - - CodeGenFunction CGF(CGM, /*suppressNewContext=*/true); - CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, WST.CGFI, {}, - WST.Loc, WST.Loc); - emitWorkerLoop(CGF, WST); - CGF.FinishFunction(); -} - -void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF, - WorkerFunctionState &WST) { - // - // The workers enter this loop and wait for parallel work from the master. - // When the master encounters a parallel region it sets up the work + variable - // arguments, and wakes up the workers. The workers first check to see if - // they are required for the parallel region, i.e., within the # of requested - // parallel threads. The activated workers load the variable arguments and - // execute the parallel work. - // - - CGBuilderTy &Bld = CGF.Builder; - - llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work"); - llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers"); - llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel"); - llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel"); - llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel"); - llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit"); - - CGF.EmitBranch(AwaitBB); - - // Workers wait for work from master. 
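The removal of the old master/worker machinery continues below; before it does, it is worth spelling out what the merged entry header and footer emit instead: one kernel skeleton shared by both execution modes, with the mode decision pushed into the runtime. A C-style sketch with assumed extern declarations (illustrative only; the real artifact is the IR emitted by emitGenericEntryHeader/emitGenericEntryFooter above):

    #include <cstdint>

    extern "C" int16_t __kmpc_generic_kernel_init(int16_t IsSPMD, int16_t UseSM,
                                                  int16_t RequiresOMPRuntime,
                                                  int16_t RequiresDataSharing);
    extern "C" void __kmpc_generic_kernel_deinit(int16_t IsSPMD,
                                                 int16_t RequiresOMPRuntime);

    void kernel_skeleton(int16_t IsSPMD, int16_t FullRuntime) {
      // Every thread calls init; the returned flag says whether this thread
      // executes the target region body.
      int16_t Exec = __kmpc_generic_kernel_init(IsSPMD, /*UseSM=*/1, FullRuntime,
                                                /*RequiresDataSharing=*/0);
      if (Exec == 1) { // ".execute"; in non-SPMD mode only the master gets here
        // ... target region body; in non-SPMD mode the prolog/epilog for
        //     globalized variables wraps it (emitGenericVarsProlog/Epilog) ...
        __kmpc_generic_kernel_deinit(IsSPMD, FullRuntime); // ".omp.deinit"
      }
      // ".exit": non-executing threads branch directly to here.
    }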
- CGF.EmitBlock(AwaitBB); - // Wait for parallel work - syncCTAThreads(CGF); - - Address WorkFn = - CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrTy, /*Name=*/"work_fn"); - Address ExecStatus = - CGF.CreateDefaultAlignTempAlloca(CGF.Int8Ty, /*Name=*/"exec_status"); - CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0)); - CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy)); - - // TODO: Optimize runtime initialization and pass in correct value. - llvm::Value *Args[] = {WorkFn.getPointer(), - /*RequiresOMPRuntime=*/Bld.getInt16(1)}; - llvm::Value *Ret = CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_parallel), Args); - Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus); - - // On termination condition (workid == 0), exit loop. - llvm::Value *WorkID = Bld.CreateLoad(WorkFn); - llvm::Value *ShouldTerminate = Bld.CreateIsNull(WorkID, "should_terminate"); - Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB); - - // Activate requested workers. - CGF.EmitBlock(SelectWorkersBB); - llvm::Value *IsActive = - Bld.CreateIsNotNull(Bld.CreateLoad(ExecStatus), "is_active"); - Bld.CreateCondBr(IsActive, ExecuteBB, BarrierBB); - - // Signal start of parallel region. - CGF.EmitBlock(ExecuteBB); - // Skip initialization. - setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true); - - // Process work items: outlined parallel functions. - for (llvm::Function *W : Work) { - // Try to match this outlined function. - llvm::Value *ID = Bld.CreatePointerBitCastOrAddrSpaceCast(W, CGM.Int8PtrTy); - - llvm::Value *WorkFnMatch = - Bld.CreateICmpEQ(Bld.CreateLoad(WorkFn), ID, "work_match"); - - llvm::BasicBlock *ExecuteFNBB = CGF.createBasicBlock(".execute.fn"); - llvm::BasicBlock *CheckNextBB = CGF.createBasicBlock(".check.next"); - Bld.CreateCondBr(WorkFnMatch, ExecuteFNBB, CheckNextBB); - - // Execute this outlined function. - CGF.EmitBlock(ExecuteFNBB); - - // Insert call to work function via shared wrapper. The shared - // wrapper takes two arguments: - // - the parallelism level; - // - the thread ID; - emitCall(CGF, WST.Loc, W, - {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)}); - - // Go to end of parallel region. - CGF.EmitBranch(TerminateBB); - - CGF.EmitBlock(CheckNextBB); - } - // Default case: call to outlined function through pointer if the target - // region makes a declare target call that may contain an orphaned parallel - // directive. - auto *ParallelFnTy = - llvm::FunctionType::get(CGM.VoidTy, {CGM.Int16Ty, CGM.Int32Ty}, - /*isVarArg=*/false) - ->getPointerTo(); - llvm::Value *WorkFnCast = Bld.CreateBitCast(WorkID, ParallelFnTy); - // Insert call to work function via shared wrapper. The shared - // wrapper takes two arguments: - // - the parallelism level; - // - the thread ID; - emitCall(CGF, WST.Loc, WorkFnCast, - {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)}); - // Go to end of parallel region. - CGF.EmitBranch(TerminateBB); - - // Signal end of parallel region. - CGF.EmitBlock(TerminateBB); - CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_end_parallel), - llvm::None); - CGF.EmitBranch(BarrierBB); - - // All active and inactive workers wait at a barrier after parallel region. - CGF.EmitBlock(BarrierBB); - // Barrier after parallel region. - syncCTAThreads(CGF); - CGF.EmitBranch(AwaitBB); - - // Exit target region. - CGF.EmitBlock(ExitBB); - // Skip initialization. 
- clearLocThreadIdInsertPt(CGF); -} - /// Returns specified OpenMP runtime function for the current OpenMP /// implementation. Specialized for the NVPTX device. /// \param Function OpenMP runtime function. @@ -1550,66 +1226,6 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) { llvm::Constant *RTLFn = nullptr; switch (static_cast(Function)) { - case OMPRTL_NVPTX__kmpc_kernel_init: { - // Build void __kmpc_kernel_init(kmp_int32 thread_limit, int16_t - // RequiresOMPRuntime); - llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty}; - auto *FnTy = - llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); - RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_init"); - break; - } - case OMPRTL_NVPTX__kmpc_kernel_deinit: { - // Build void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized); - llvm::Type *TypeParams[] = {CGM.Int16Ty}; - auto *FnTy = - llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); - RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_deinit"); - break; - } - case OMPRTL_NVPTX__kmpc_spmd_kernel_init: { - // Build void __kmpc_spmd_kernel_init(kmp_int32 thread_limit, - // int16_t RequiresOMPRuntime, int16_t RequiresDataSharing); - llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty}; - auto *FnTy = - llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); - RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_init"); - break; - } - case OMPRTL_NVPTX__kmpc_spmd_kernel_deinit_v2: { - // Build void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime); - llvm::Type *TypeParams[] = {CGM.Int16Ty}; - auto *FnTy = - llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); - RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_deinit_v2"); - break; - } - case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: { - /// Build void __kmpc_kernel_prepare_parallel( - /// void *outlined_function, int16_t IsOMPRuntimeInitialized); - llvm::Type *TypeParams[] = {CGM.Int8PtrTy, CGM.Int16Ty}; - auto *FnTy = - llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); - RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_prepare_parallel"); - break; - } - case OMPRTL_NVPTX__kmpc_kernel_parallel: { - /// Build bool __kmpc_kernel_parallel(void **outlined_function, - /// int16_t IsOMPRuntimeInitialized); - llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy, CGM.Int16Ty}; - llvm::Type *RetTy = CGM.getTypes().ConvertType(CGM.getContext().BoolTy); - auto *FnTy = - llvm::FunctionType::get(RetTy, TypeParams, /*isVarArg*/ false); - RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_parallel"); - break; - } - case OMPRTL_NVPTX__kmpc_kernel_end_parallel: { - /// Build void __kmpc_kernel_end_parallel(); - auto *FnTy = - llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false); - RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_end_parallel"); - break; - } case OMPRTL_NVPTX__kmpc_serialized_parallel: { // Build void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 // global_tid); @@ -1707,21 +1323,6 @@ FnTy, /*Name=*/"__kmpc_nvptx_teams_end_reduce_nowait_simple"); break; } - case OMPRTL_NVPTX__kmpc_data_sharing_init_stack: { - /// Build void __kmpc_data_sharing_init_stack(); - auto *FnTy = - llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false); - RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack"); - break; - } - case OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd: { - /// Build void __kmpc_data_sharing_init_stack_spmd(); - auto *FnTy = - 
llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false); - RTLFn = - CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack_spmd"); - break; - } case OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack: { // Build void *__kmpc_data_sharing_coalesced_push_stack(size_t size, // int16_t UseSharedMemory); @@ -1741,30 +1342,6 @@ /*Name=*/"__kmpc_data_sharing_pop_stack"); break; } - case OMPRTL_NVPTX__kmpc_begin_sharing_variables: { - /// Build void __kmpc_begin_sharing_variables(void ***args, - /// size_t n_args); - llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo(), CGM.SizeTy}; - auto *FnTy = - llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); - RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_begin_sharing_variables"); - break; - } - case OMPRTL_NVPTX__kmpc_end_sharing_variables: { - /// Build void __kmpc_end_sharing_variables(); - auto *FnTy = - llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false); - RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_sharing_variables"); - break; - } - case OMPRTL_NVPTX__kmpc_get_shared_variables: { - /// Build void __kmpc_get_shared_variables(void ***GlobalArgs); - llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo()}; - auto *FnTy = - llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); - RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_shared_variables"); - break; - } case OMPRTL_NVPTX__kmpc_parallel_level: { // Build uint16_t __kmpc_parallel_level(ident_t *loc, kmp_int32 global_tid); llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty}; @@ -1819,6 +1396,35 @@ cast(RTLFn)->addFnAttr(llvm::Attribute::Convergent); break; } + case OMPRTL_NVPTX__kmpc_generic_kernel_init: { + // Build int16_t __kmpc_generic_kernel_init(int16_t IsSPMD, int16_t UseSM, + // int16_t RequiresOMPRuntime, int16_t RequiresDataSharing); + llvm::Type *TypeParams[] = {CGM.Int16Ty, CGM.Int16Ty, CGM.Int16Ty, + CGM.Int16Ty}; + auto *FnTy = + llvm::FunctionType::get(CGM.Int16Ty, TypeParams, /*isVarArg*/ false); + RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_generic_kernel_init"); + break; + } + case OMPRTL_NVPTX__kmpc_generic_kernel_deinit: { + // Build void __kmpc_generic_kernel_deinit(int16_t IsSPMD, int16_t + // RequiredOMPRuntime); + llvm::Type *TypeParams[] = {CGM.Int16Ty, CGM.Int16Ty}; + auto *FnTy = + llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); + RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_generic_kernel_deinit"); + break; + } + case OMPRTL_NVPTX__kmpc_generic_kernel_parallel: { + // Build void __kmpc_generic_kernel_parallel(void *OutlinedFnWrapper, void + // *Payload, int16_t PayloadBytes, int16_t RequiredOMPRuntime) + llvm::Type *TypeParams[] = {CGM.VoidPtrTy, CGM.VoidPtrTy, CGM.Int16Ty, + CGM.Int16Ty}; + auto *FnTy = + llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); + RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_generic_kernel_parallel"); + break; + } } return RTLFn; } @@ -1854,15 +1460,12 @@ assert(!ParentName.empty() && "Invalid target region parent name!"); - bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D); - if (Mode) - emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry, - CodeGen); - else - emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry, - CodeGen); + bool IsSPMD = supportsSPMDExecutionMode(CGM.getContext(), D); + + emitGenericKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry, + IsSPMD, CodeGen); - setPropertyExecutionMode(CGM, 
OutlinedFn->getName(), Mode); + setPropertyExecutionMode(CGM, OutlinedFn->getName(), IsSPMD); } namespace { @@ -1958,9 +1561,7 @@ IsInTTDRegion = PrevIsInTTDRegion; if (getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD && !IsInParallelRegion) { - llvm::Function *WrapperFun = - createParallelDataSharingWrapper(OutlinedFun, D); - WrapperFunctionsMap[OutlinedFun] = WrapperFun; + createParallelDataSharingWrapper(OutlinedFun, D); } return OutlinedFun; @@ -2450,15 +2051,13 @@ if (!CGF.HaveInsertPoint()) return; - if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) - emitSPMDParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond); - else - emitNonSPMDParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond); + bool IsSPMD = getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD; + emitGenericParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond, IsSPMD); } -void CGOpenMPRuntimeNVPTX::emitNonSPMDParallelCall( +void CGOpenMPRuntimeNVPTX::emitGenericParallelCall( CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn, - ArrayRef CapturedVars, const Expr *IfCond) { + ArrayRef CapturedVars, const Expr *IfCond, bool IsSPMD) { llvm::Function *Fn = cast(OutlinedFn); // Force inline this outlined function at its call site. @@ -2500,76 +2099,64 @@ auto &&L0ParallelGen = [this, CapturedVars, Fn](CodeGenFunction &CGF, PrePostActionTy &Action) { CGBuilderTy &Bld = CGF.Builder; - llvm::Function *WFn = WrapperFunctionsMap[Fn]; - assert(WFn && "Wrapper function does not exist!"); - llvm::Value *ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy); + const WrapperInfo &WFI = WrapperInfoMap[Fn]; + assert(WFI.WrapperFn && "Wrapper function does not exist!"); + + llvm::Value *PayloadBytes = llvm::Constant::getNullValue(CGM.Int16Ty); + llvm::Value *StructAlloca = llvm::Constant::getNullValue(CGM.VoidPtrTy); + if (WFI.SharedStructTy) { + StructAlloca = + CGF.CreateDefaultAlignTempAlloca(WFI.SharedStructTy, ".captured") + .getPointer(); + const llvm::DataLayout &DL = WFI.WrapperFn->getParent()->getDataLayout(); + PayloadBytes = Bld.getInt16(DL.getTypeAllocSize(WFI.SharedStructTy)); + } - // Prepare for parallel region. Indicate the outlined function. - llvm::Value *Args[] = {ID, /*RequiresOMPRuntime=*/Bld.getInt16(1)}; - CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel), - Args); + llvm::SmallVector Args; + Args.push_back(CGF.EmitCastToVoidPtr(WFI.WrapperFn)); + Args.push_back(CGF.EmitCastToVoidPtr(StructAlloca)); + Args.push_back(PayloadBytes); + Args.push_back(/* RequiresOMPRuntime */Bld.getInt16(1)); // Create a private scope that will globalize the arguments // passed from the outside of the target region. CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF); - // There's something to share. - if (!CapturedVars.empty()) { - // Prepare for parallel region. Indicate the outlined function. - Address SharedArgs = - CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "shared_arg_refs"); - llvm::Value *SharedArgsPtr = SharedArgs.getPointer(); - - llvm::Value *DataSharingArgs[] = { - SharedArgsPtr, - llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())}; - CGF.EmitRuntimeCall(createNVPTXRuntimeFunction( - OMPRTL_NVPTX__kmpc_begin_sharing_variables), - DataSharingArgs); - - // Store variable address in a list of references to pass to workers. 
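The removed sharing code continues below. For reference while reading the replacement emission logic, the three generic entry points added to createNVPTXRuntimeFunction above take over from the mode-specific init/deinit and work-sharing functions that are deleted in this patch. Their C signatures, reconstructed from the builder comments (a sketch of assumed declarations, not the shipped runtime header):

    #include <cstdint>

    extern "C" {
    // Returns a per-thread flag the generated kernel compares against 1 to
    // decide whether the thread executes the target region body.
    int16_t __kmpc_generic_kernel_init(int16_t IsSPMD, int16_t UseSM,
                                       int16_t RequiresOMPRuntime,
                                       int16_t RequiresDataSharing);
    void __kmpc_generic_kernel_deinit(int16_t IsSPMD, int16_t RequiresOMPRuntime);
    // Hands the runtime an outlined wrapper plus a by-value payload of
    // PayloadBytes bytes that holds the captured variables.
    void __kmpc_generic_kernel_parallel(void *OutlinedFnWrapper, void *Payload,
                                        int16_t PayloadBytes,
                                        int16_t RequiresOMPRuntime);
    }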
-      unsigned Idx = 0;
-      ASTContext &Ctx = CGF.getContext();
-      Address SharedArgListAddress = CGF.EmitLoadOfPointer(
-          SharedArgs, Ctx.getPointerType(Ctx.getPointerType(Ctx.VoidPtrTy))
-                          .castAs<PointerType>());
-      for (llvm::Value *V : CapturedVars) {
-        Address Dst = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx,
-                                                 CGF.getPointerSize());
-        llvm::Value *PtrV;
-        if (V->getType()->isIntegerTy())
-          PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
-        else
-          PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
-        CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
-                              Ctx.getPointerType(Ctx.VoidPtrTy));
-        ++Idx;
-      }
-    }
-
-    // Activate workers. This barrier is used by the master to signal
-    // work for the workers.
-    syncCTAThreads(CGF);
+    assert((CapturedVars.empty() || WFI.SharedStructTy) &&
+           "Expected the shared struct type to be set!");
+    assert((CapturedVars.empty() ||
+            CapturedVars.size() == WFI.SharedStructTy->getNumElements()) &&
+           "#elements in shared struct type is not the number of captured "
+           "variables!");
 
-    // OpenMP [2.5, Parallel Construct, p.49]
-    // There is an implied barrier at the end of a parallel region. After the
-    // end of a parallel region, only the master thread of the team resumes
-    // execution of the enclosing task region.
-    //
-    // The master waits at this barrier until all workers are done.
-    syncCTAThreads(CGF);
-
-    if (!CapturedVars.empty())
-      CGF.EmitRuntimeCall(
-          createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_sharing_variables));
+    unsigned Idx = 0;
+    for (llvm::Value *V : CapturedVars) {
+      llvm::Value *GEP = Bld.CreateStructGEP(StructAlloca, Idx++);
+      Bld.CreateDefaultAlignedStore(V, GEP);
+    }
 
-    // Remember for post-processing in worker loop.
-    Work.emplace_back(WFn);
+    CGF.EmitRuntimeCall(
+        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_generic_kernel_parallel),
+        Args);
   };
 
-  auto &&LNParallelGen = [this, Loc, &SeqGen, &L0ParallelGen](
-                             CodeGenFunction &CGF, PrePostActionTy &Action) {
+  auto &&LNParallelGen = [this, Loc, &CodeGen, &SeqGen, &L0ParallelGen, IsSPMD,
+                          &ThreadIDAddr](CodeGenFunction &CGF,
+                                         PrePostActionTy &Action) {
+    if (IsSPMD) {
+      if (IsInTargetMasterThreadRegion) {
+        // In the worker we need to use the real thread id.
+        ThreadIDAddr = emitThreadIDAddress(CGF, Loc);
+        CodeGen(CGF, Action);
+      } else {
+        // If we are not in the target region, it is definitely L2 parallelism
+        // or more, because in SPMD mode we always have an L1 parallel level,
+        // so we don't need to check for orphaned directives.
+        SeqGen(CGF, Action);
+      }
+      return;
+    }
+
     if (IsInParallelRegion) {
       SeqGen(CGF, Action);
     } else if (IsInTargetMasterThreadRegion) {
@@ -2623,76 +2210,6 @@
     }
   }
 
-void CGOpenMPRuntimeNVPTX::emitSPMDParallelCall(
-    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
-    ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
-  // Just call the outlined function to execute the parallel region.
-  // OutlinedFn(&GTid, &zero, CapturedStruct);
-  //
-  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
-
-  Address ZeroAddr = CGF.CreateMemTemp(CGF.getContext().getIntTypeForBitwidth(
-                                           /*DestWidth=*/32, /*Signed=*/1),
-                                       ".zero.addr");
-  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
-  // ThreadId for serialized parallels is 0.
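The hunk above replaces the pointer-list hand-off (__kmpc_begin_sharing_variables and friends) with a by-value payload: the caller copies every captured value into a stack struct and passes the runtime a (wrapper, payload, size) triple; the wrapper reverses the packing. A self-contained sketch with a hypothetical struct layout and names (the real struct type is derived per region from the outlined function's argument types in createParallelDataSharingWrapper below):

    #include <cstdint>

    extern "C" void __kmpc_generic_kernel_parallel(void *OutlinedFnWrapper,
                                                   void *Payload,
                                                   int16_t PayloadBytes,
                                                   int16_t RequiresOMPRuntime);

    // Example layout for a region capturing loop bounds and one pointer.
    struct SharedStruct {
      int64_t LB, UB;
      double *C;
    };

    static void outlined(int32_t *Tid, int32_t *BTid, int64_t LB, int64_t UB,
                         double *C) { /* parallel region body */ }

    // Wrapper: unpack the payload and forward it to the outlined function.
    extern "C" void outlined_wrapper(void *Payload) {
      auto *S = static_cast<SharedStruct *>(Payload);
      int32_t Zero = 0;
      outlined(&Zero, &Zero, S->LB, S->UB, S->C);
    }

    void caller(double *C) {
      SharedStruct S{0, 1023, C}; // the ".captured" alloca in the emitted code
      __kmpc_generic_kernel_parallel((void *)&outlined_wrapper, &S,
                                     (int16_t)sizeof(S),
                                     /*RequiresOMPRuntime=*/1);
    }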
- Address ThreadIDAddr = ZeroAddr; - auto &&CodeGen = [this, OutlinedFn, CapturedVars, Loc, ZeroAddr, - &ThreadIDAddr](CodeGenFunction &CGF, - PrePostActionTy &Action) { - Action.Enter(CGF); - - llvm::SmallVector OutlinedFnArgs; - OutlinedFnArgs.push_back(ThreadIDAddr.getPointer()); - OutlinedFnArgs.push_back(ZeroAddr.getPointer()); - OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end()); - emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs); - }; - auto &&SeqGen = [this, &CodeGen, Loc](CodeGenFunction &CGF, - PrePostActionTy &) { - - RegionCodeGenTy RCG(CodeGen); - llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc); - llvm::Value *ThreadID = getThreadID(CGF, Loc); - llvm::Value *Args[] = {RTLoc, ThreadID}; - - NVPTXActionTy Action( - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel), - Args, - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel), - Args); - RCG.setAction(Action); - RCG(CGF); - }; - - if (IsInTargetMasterThreadRegion) { - // In the worker need to use the real thread id. - ThreadIDAddr = emitThreadIDAddress(CGF, Loc); - RegionCodeGenTy RCG(CodeGen); - RCG(CGF); - } else { - // If we are not in the target region, it is definitely L2 parallelism or - // more, because for SPMD mode we always has L1 parallel level, sowe don't - // need to check for orphaned directives. - RegionCodeGenTy RCG(SeqGen); - RCG(CGF); - } -} - -void CGOpenMPRuntimeNVPTX::syncCTAThreads(CodeGenFunction &CGF) { - // Always emit simple barriers! - if (!CGF.HaveInsertPoint()) - return; - // Build call __kmpc_barrier_simple_spmd(nullptr, 0); - // This function does not use parameters, so we can emit just default values. - llvm::Value *Args[] = { - llvm::ConstantPointerNull::get( - cast(getIdentTyPointerTy())), - llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)}; - CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction(OMPRTL__kmpc_barrier_simple_spmd), Args); -} - void CGOpenMPRuntimeNVPTX::emitBarrierCall(CodeGenFunction &CGF, SourceLocation Loc, OpenMPDirectiveKind Kind, bool, @@ -4059,29 +3576,17 @@ CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs); } -/// Emit function which wraps the outline parallel region -/// and controls the arguments which are passed to this function. -/// The wrapper ensures that the outlined function is called -/// with the correct arguments when data is shared. -llvm::Function *CGOpenMPRuntimeNVPTX::createParallelDataSharingWrapper( +void CGOpenMPRuntimeNVPTX::createParallelDataSharingWrapper( llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) { ASTContext &Ctx = CGM.getContext(); const auto &CS = *D.getCapturedStmt(OMPD_parallel); // Create a function that takes as argument the source thread. 
FunctionArgList WrapperArgs; - QualType Int16QTy = - Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false); - QualType Int32QTy = - Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false); - ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(), - /*Id=*/nullptr, Int16QTy, - ImplicitParamDecl::Other); - ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(), - /*Id=*/nullptr, Int32QTy, + ImplicitParamDecl PayloadArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(), + /*Id=*/nullptr, Ctx.VoidPtrTy, ImplicitParamDecl::Other); - WrapperArgs.emplace_back(&ParallelLevelArg); - WrapperArgs.emplace_back(&WrapperArg); + WrapperArgs.emplace_back(&PayloadArg); const CGFunctionInfo &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs); @@ -4096,35 +3601,29 @@ CodeGenFunction CGF(CGM, /*suppressNewContext=*/true); CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs, D.getBeginLoc(), D.getBeginLoc()); - - const auto *RD = CS.getCapturedRecordDecl(); - auto CurField = RD->field_begin(); + Fn->arg_begin()->setName("payload"); Address ZeroAddr = CGF.CreateMemTemp( CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1), /*Name*/ ".zero.addr"); CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0)); + + setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true); + // Get the array of arguments. SmallVector Args; - - Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).getPointer()); + Args.emplace_back(emitThreadIDAddress(CGF, D.getBeginLoc()).getPointer()); Args.emplace_back(ZeroAddr.getPointer()); CGBuilderTy &Bld = CGF.Builder; - auto CI = CS.capture_begin(); - // Use global memory for data sharing. // Handle passing of global args to workers. - Address GlobalArgs = - CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args"); - llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer(); - llvm::Value *DataSharingArgs[] = {GlobalArgsPtr}; - CGF.EmitRuntimeCall( - createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_get_shared_variables), - DataSharingArgs); + Address GlobalArgs = CGF.GetAddrOfLocalVar(&PayloadArg); // Retrieve the shared variables from the list of references returned // by the runtime. Pass the variables to the outlined function. 
+ llvm::StructType *StructTy = nullptr; + llvm::Value *StructPtr = nullptr; Address SharedArgListAddress = Address::invalid(); if (CS.capture_size() > 0 || isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) { @@ -4133,56 +3632,38 @@ .getPointerType(CGF.getContext().getPointerType( CGF.getContext().VoidPtrTy)) .castAs()); + + llvm::SmallVector StructMemberTypes; + auto ArgIt = OutlinedParallelFn->arg_begin() + 1; + auto ArgEnd = OutlinedParallelFn->arg_end(); + while (++ArgIt != ArgEnd) + StructMemberTypes.push_back(ArgIt->getType()); + + StructTy = llvm::StructType::create(OutlinedParallelFn->getContext(), + StructMemberTypes, "omp.shared.struct"); + SharedArgListAddress = Bld.CreatePointerBitCastOrAddrSpaceCast( + SharedArgListAddress, StructTy->getPointerTo()); + StructPtr = SharedArgListAddress.getPointer(); } + unsigned Idx = 0; if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) { - Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx, - CGF.getPointerSize()); - Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast( - Src, CGF.SizeTy->getPointerTo()); - llvm::Value *LB = CGF.EmitLoadOfScalar( - TypedAddress, - /*Volatile=*/false, - CGF.getContext().getPointerType(CGF.getContext().getSizeType()), - cast(D).getLowerBoundVariable()->getExprLoc()); + llvm::Value *LB = + Bld.CreateAlignedLoad(Bld.CreateStructGEP(StructPtr, Idx++), 1); + llvm::Value *UB = + Bld.CreateAlignedLoad(Bld.CreateStructGEP(StructPtr, Idx++), 1); Args.emplace_back(LB); - ++Idx; - Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx, - CGF.getPointerSize()); - TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast( - Src, CGF.SizeTy->getPointerTo()); - llvm::Value *UB = CGF.EmitLoadOfScalar( - TypedAddress, - /*Volatile=*/false, - CGF.getContext().getPointerType(CGF.getContext().getSizeType()), - cast(D).getUpperBoundVariable()->getExprLoc()); Args.emplace_back(UB); - ++Idx; - } - if (CS.capture_size() > 0) { - ASTContext &CGFContext = CGF.getContext(); - for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) { - QualType ElemTy = CurField->getType(); - Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx, - CGF.getPointerSize()); - Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast( - Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy))); - llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress, - /*Volatile=*/false, - CGFContext.getPointerType(ElemTy), - CI->getLocation()); - if (CI->capturesVariableByCopy() && - !CI->getCapturedVar()->getType()->isAnyPointerType()) { - Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(), - CI->getLocation()); - } - Args.emplace_back(Arg); - } } + for (unsigned I = 0, E = CS.capture_size(); I < E; ++I) + Args.emplace_back( + Bld.CreateAlignedLoad(Bld.CreateStructGEP(StructPtr, Idx++), 1)); emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args); CGF.FinishFunction(); - return Fn; + + WrapperInfoMap[OutlinedParallelFn] = WrapperInfo({Fn, StructTy}); + clearLocThreadIdInsertPt(CGF); } void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF, Index: llvm/include/llvm/InitializePasses.h =================================================================== --- llvm/include/llvm/InitializePasses.h +++ llvm/include/llvm/InitializePasses.h @@ -290,6 +290,7 @@ void initializeObjCARCContractPass(PassRegistry&); void initializeObjCARCExpandPass(PassRegistry&); void initializeObjCARCOptPass(PassRegistry&); +void initializeOpenMPOptPass(PassRegistry&); 
void initializeOptimizationRemarkEmitterWrapperPassPass(PassRegistry&); void initializeOptimizePHIsPass(PassRegistry&); void initializePAEvalPass(PassRegistry&); Index: llvm/include/llvm/LinkAllPasses.h =================================================================== --- llvm/include/llvm/LinkAllPasses.h +++ llvm/include/llvm/LinkAllPasses.h @@ -147,6 +147,7 @@ (void) llvm::createObjCARCExpandPass(); (void) llvm::createObjCARCContractPass(); (void) llvm::createObjCARCOptPass(); + (void) llvm::createOpenMPOptPass(); (void) llvm::createPAEvalPass(); (void) llvm::createPromoteMemoryToRegisterPass(); (void) llvm::createDemoteRegisterToMemoryPass(); Index: llvm/include/llvm/Transforms/IPO.h =================================================================== --- llvm/include/llvm/Transforms/IPO.h +++ llvm/include/llvm/Transforms/IPO.h @@ -156,6 +156,11 @@ /// ModulePass *createIPConstantPropagationPass(); +//===----------------------------------------------------------------------===// +/// createOpenMPOpt - This pass performs OpenMP specific optimizations. +/// +ModulePass *createOpenMPOptPass(); + //===----------------------------------------------------------------------===// /// createIPSCCPPass - This pass propagates constants from call sites into the /// bodies of functions, and keeps track of whether basic blocks are executable Index: llvm/lib/Transforms/IPO/CMakeLists.txt =================================================================== --- llvm/lib/Transforms/IPO/CMakeLists.txt +++ llvm/lib/Transforms/IPO/CMakeLists.txt @@ -25,6 +25,7 @@ LoopExtractor.cpp LowerTypeTests.cpp MergeFunctions.cpp + OpenMPOpt.cpp PartialInlining.cpp PassManagerBuilder.cpp PruneEH.cpp Index: llvm/lib/Transforms/IPO/IPO.cpp =================================================================== --- llvm/lib/Transforms/IPO/IPO.cpp +++ llvm/lib/Transforms/IPO/IPO.cpp @@ -35,6 +35,7 @@ initializeGlobalSplitPass(Registry); initializeHotColdSplittingLegacyPassPass(Registry); initializeIPCPPass(Registry); + initializeOpenMPOptPass(Registry); initializeAlwaysInlinerLegacyPassPass(Registry); initializeSimpleInlinerPass(Registry); initializeInferFunctionAttrsLegacyPassPass(Registry); Index: llvm/lib/Transforms/IPO/OpenMPOpt.cpp =================================================================== --- /dev/null +++ llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -0,0 +1,428 @@ +//===-- IPO/OpenMPOpt.cpp - Collection of OpenMP specific optimizations ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements interprocedural OpenMP-specific optimizations. For
+// now, this is the "SPMD-zation" of offloaded GPU kernels: generic-mode
+// kernels are converted to SPMD mode where legal, and kernels that have to
+// stay in generic mode get a custom, specialized state machine.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "openmp-opt"
+
+static cl::opt<bool> BuildCustomStateMachines(
+    "openmp-opt-build-custom-state-machines", cl::ZeroOrMore,
+    cl::desc("Build custom state machines for non-SPMD kernels."), cl::Hidden,
+    cl::init(true));
+
+STATISTIC(NumKernelsConvertedToSPMD,
+          "Number of GPU kernels converted to SPMD mode");
+STATISTIC(NumCustomStateMachinesCreated,
+          "Number of custom GPU kernel non-SPMD mode state machines created");
+STATISTIC(NumCustomStateMachinesNoFallback,
+          "Number of custom GPU kernel non-SPMD mode state machines without "
+          "fallback");
+
+static Type *getOrCreateStructIdentTypePtr(Module &M) {
+  // TODO create if not present!
+  return M.getTypeByName("struct.ident_t")->getPointerTo();
+}
+
+// TODO: Simplify function declaration
+static Function *getOrCreateFn(Type *RT, const char *Name, Module &M) {
+  Function *Fn = M.getFunction(Name);
+  if (!Fn) {
+    FunctionType *FType = FunctionType::get(RT, {}, false);
+    Fn = Function::Create(FType, llvm::GlobalVariable::ExternalLinkage, Name,
+                          M);
+  }
+  return Fn;
+}
+static Function *getOrCreateFn(Type *RT, Type *T0, Type *T1, const char *Name,
+                               Module &M) {
+  Function *Fn = M.getFunction(Name);
+  if (!Fn) {
+    FunctionType *FType = FunctionType::get(RT, {T0, T1}, false);
+    Fn = Function::Create(FType, llvm::GlobalVariable::ExternalLinkage, Name,
+                          M);
+  }
+  return Fn;
+}
+static Function *getOrCreateSimpleSPMDBarrierFn(Module &M) {
+  static const char *Name = "__kmpc_barrier_simple_spmd";
+  Function *Fn = M.getFunction(Name);
+  if (!Fn) {
+    LLVMContext &Ctx = M.getContext();
+    FunctionType *FType = FunctionType::get(
+        Type::getVoidTy(Ctx),
+        {getOrCreateStructIdentTypePtr(M), Type::getInt32Ty(Ctx)}, false);
+    Fn = Function::Create(FType, llvm::GlobalVariable::ExternalLinkage, Name,
+                          M);
+  }
+  return Fn;
+}
+
+// TODO: This should be done via attributes.
+static bool isIgnoredCall(Instruction *I) {
+  CallInst *CI = dyn_cast<CallInst>(I);
+  if (!CI || !CI->getCalledFunction())
+    return false;
+
+  return StringSwitch<bool>(CI->getCalledFunction()->getName())
+      .Case("omp_get_team_number", true)
+      .Case("__kmpc_global_thread_num", true)
+      .Case("__kmpc_for_static_init_4", true)
+      .Case("__kmpc_for_static_fini", true)
+      .Case("__kmpc_get_team_static_memory", true)
+      .Case("__kmpc_restore_team_static_memory", true)
+      .Case("llvm.nvvm.read.ptx.sreg.ntid.x", true)
+      .Case("llvm.lifetime.start.p0i8", true)
+      .Case("llvm.lifetime.end.p0i8", true)
+      .Default(false);
+}
+
+static bool isSPMDRelatedRTCall(Instruction *I) {
+  CallInst *CI = dyn_cast<CallInst>(I);
+  if (!CI || !CI->getCalledFunction())
+    return false;
+
+  return StringSwitch<bool>(CI->getCalledFunction()->getName())
+      .Case("__kmpc_generic_kernel_init", true)
+      .Case("__kmpc_generic_kernel_parallel", true)
+      .Case("__kmpc_generic_kernel_deinit", true)
+      .Default(false);
+}
+
+static void
+createCustomStateMachine(Module &M,
+                         SmallVectorImpl<Instruction *> &SideEffectInst,
+                         SmallVectorImpl<CallInst *> &RTCalls) {
+
+  // TODO use reachability to eliminate the loop and if-cascade
+
+  SmallVector<CallInst *, 8> ParallelRTCalls;
+  CallInst *InitCI = nullptr;
+  for (CallInst *CI : RTCalls) {
+    const auto &CalleeName = CI->getCalledFunction()->getName();
+    if (CalleeName.equals("__kmpc_generic_kernel_init")) {
+      assert(!InitCI && "Found multiple kernel init calls!");
+      InitCI = CI;
+      continue;
+    }
+    if (CalleeName.equals("__kmpc_generic_kernel_parallel")) {
+      ParallelRTCalls.push_back(CI);
+    }
+  }
+
+  assert(InitCI && "No kernel init call found");
+
+  // TODO: Warn or eliminate the offloading if no parallel regions are present.
+
+  ConstantInt *UseSM = dyn_cast<ConstantInt>(InitCI->getArgOperand(1));
+  if (!UseSM || !UseSM->isOne()) {
+    LLVM_DEBUG(dbgs() << "No custom state machine because of " << *InitCI
+                      << "\n");
+    return;
+  }
+
+  InitCI->setName("thread_kind");
+  LLVMContext &Ctx = InitCI->getContext();
+  Function *KernelFn = InitCI->getFunction();
+  Type *VoidTy = Type::getVoidTy(Ctx);
+  Type *BoolTy = Type::getInt1Ty(Ctx);
+  Type *I16Ty = Type::getInt16Ty(Ctx);
+  Type *VoidPtrTy = Type::getInt8PtrTy(Ctx);
+  AllocaInst *WorkFnAI =
+      new AllocaInst(VoidPtrTy, 0,
+                     "work_fn.addr", &KernelFn->getEntryBlock().front());
+
+  Instruction *IP = InitCI->getNextNode();
+  Constant *ConstZero = ConstantInt::getSigned(UseSM->getType(), 0);
+  Constant *ConstMOne = ConstantInt::getSigned(UseSM->getType(), -1);
+  InitCI->setArgOperand(1, ConstZero);
+  Instruction *WorkerCnd =
+      new ICmpInst(IP, ICmpInst::ICMP_EQ, InitCI, ConstMOne, "is_worker");
+
+  Instruction *WaitTI = SplitBlockAndInsertIfThen(WorkerCnd, IP, false);
+  BasicBlock *WaitBB = WaitTI->getParent();
+  WaitBB->setName("worker.wait");
+  IP->getParent()->setName("master_check");
+
+  Function *SimpleBarrierFn = getOrCreateSimpleSPMDBarrierFn(M);
+
+  auto AI = SimpleBarrierFn->arg_begin();
+  Instruction *BarrierCall =
+      CallInst::Create(SimpleBarrierFn,
+                       {Constant::getNullValue((AI++)->getType()),
+                        Constant::getNullValue((AI)->getType())},
+                       "", WaitTI);
+
+  Function *KernelParallelFn = getOrCreateFn(
+      BoolTy, VoidPtrTy->getPointerTo(), I16Ty, "__kmpc_kernel_parallel", M);
+
+  Value *RequiresOMPRuntime = InitCI->getArgOperand(2);
+  Instruction *ActiveCnd = CallInst::Create(
+      KernelParallelFn, {WorkFnAI, RequiresOMPRuntime}, "is_active", WaitTI);
+
+  Type *WorkFnPrototype =
+      FunctionType::get(VoidTy, {VoidPtrTy}, false)->getPointerTo();
+  Value *WorkFnAICast = BitCastInst::CreatePointerBitCastOrAddrSpaceCast(
+      WorkFnAI,
+      WorkFnPrototype->getPointerTo(), "Work_fn.addr_cast", WaitTI);
+  Value *WorkFn = new LoadInst(WorkFnAICast, "work_fn", WaitTI);
+
+  Instruction *WorkFnCnd =
+      new ICmpInst(WaitTI, ICmpInst::ICMP_EQ, WorkFn,
+                   Constant::getNullValue(WorkFn->getType()), "no_work");
+
+  Instruction *FinishedTI = SplitBlockAndInsertIfThen(WorkFnCnd, WaitTI, false);
+  FinishedTI->getParent()->setName("worker.finished");
+  WaitTI->getParent()->setName("worker.active_check");
+
+  Instruction *ActiveTI = SplitBlockAndInsertIfThen(ActiveCnd, WaitTI, false);
+  ActiveTI->getParent()->setName("worker.active");
+  WaitTI->getParent()->setName("worker.inactive");
+
+  Function *KernelGetSharedVars =
+      getOrCreateFn(VoidPtrTy, "__kmpc_get_shared_variables", M);
+  Value *SharedVars = CallInst::Create(KernelGetSharedVars, "", ActiveTI);
+
+  BasicBlock *ExecuteBB = ActiveTI->getParent();
+  BasicBlock *ParallelEndBB = SplitBlock(ExecuteBB, ActiveTI);
+  ParallelEndBB->setName("worker.parallel_end");
+
+  Function *KernelEndParallelFn =
+      getOrCreateFn(VoidTy, "__kmpc_kernel_end_parallel", M);
+  CallInst::Create(KernelEndParallelFn, "", ActiveTI);
+
+  // A fallback is required if we might not see all parallel regions
+  // (__kmpc_generic_kernel_parallel calls). This could be the case if there is
+  // an unknown function call with side effects in the target region.
+  bool RequiresFallback = std::any_of(
+      SideEffectInst.begin(), SideEffectInst.end(), [](Instruction *I) {
+        return (isa<CallInst>(I) && I->mayHaveSideEffects() &&
+                !isIgnoredCall(I));
+      });
+
+  auto MayContainParallelKernelCall = [](Function &F) {
+    for (Instruction &I : instructions(F)) {
+      if (!isa<CallInst>(I) || !I.mayHaveSideEffects())
+        continue;
+      if (isIgnoredCall(&I))
+        continue;
+      if (isSPMDRelatedRTCall(&I) &&
+          !cast<CallInst>(I).getCalledFunction()->getName().equals(
+              "__kmpc_generic_kernel_parallel"))
+        continue;
+      return true;
+    }
+    return false;
+  };
+
+  IP = ExecuteBB->getTerminator();
+  for (CallInst *ParCI : ParallelRTCalls) {
+    Function *ParFn =
+        dyn_cast<Function>(ParCI->getArgOperand(0)->stripPointerCasts());
+    // We also need to check the parallel regions (behind the
+    // __kmpc_generic_kernel_parallel calls).
+    if (!ParFn) {
+      RequiresFallback = true;
+      continue;
+    }
+    RequiresFallback |= MayContainParallelKernelCall(*ParFn);
+
+    Value *ParFnCnd =
+        new ICmpInst(IP, ICmpInst::ICMP_EQ, WorkFn, ParFn, "par_fn_check");
+    Instruction *ParFnTI = SplitBlockAndInsertIfThen(ParFnCnd, IP, false);
+    IP->getParent()->setName("worker.check.next");
+    ParFnTI->getParent()->setName("worker.execute." + ParFn->getName());
+    CallInst::Create(ParFn, {SharedVars}, "", ParFnTI);
+    ParFnTI->setSuccessor(0, ParallelEndBB);
+  }
+
+  if (RequiresFallback) {
+    CallInst::Create(WorkFn, {SharedVars}, "", IP);
+  }
+
+  BarrierCall->clone()->insertBefore(WaitTI);
+
+  FinishedTI->setSuccessor(0, WaitTI->getSuccessor(0));
+  WaitTI->setSuccessor(0, WaitBB);
+  // TODO: Add the new loop to LI!
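The code above splices a worker state machine directly into the kernel entry block; its shape is easiest to see in C-like form. A sketch under assumed declarations: the block names in the comments match the setName calls above, __kmpc_kernel_parallel, __kmpc_get_shared_variables, and __kmpc_kernel_end_parallel are the pre-existing runtime hooks the machine reuses, and par_fn_0/par_fn_1 stand in for the known parallel region wrappers:

    #include <cstdint>

    extern "C" int16_t __kmpc_generic_kernel_init(int16_t, int16_t, int16_t,
                                                  int16_t);
    extern "C" void __kmpc_barrier_simple_spmd(void *Ident, int32_t Tid);
    extern "C" bool __kmpc_kernel_parallel(void **WorkFn, int16_t RT);
    extern "C" void *__kmpc_get_shared_variables();
    extern "C" void __kmpc_kernel_end_parallel();
    extern "C" void par_fn_0(void *SharedVars); // hypothetical region wrappers
    extern "C" void par_fn_1(void *SharedVars);

    void kernel_entry() {
      // UseSM is rewritten to 0: the runtime must not run its own machine.
      int16_t thread_kind = __kmpc_generic_kernel_init(0, /*UseSM=*/0, 1, 0);
      if (thread_kind == -1) {                      // "is_worker"
        for (;;) {                                  // "worker.wait"
          __kmpc_barrier_simple_spmd(nullptr, 0);
          void *work_fn = nullptr;
          bool is_active = __kmpc_kernel_parallel(&work_fn, /*RT=*/1);
          if (!work_fn)                             // "worker.finished"
            break;
          if (is_active) {                          // "worker.active"
            void *shared = __kmpc_get_shared_variables();
            if (work_fn == (void *)&par_fn_0)       // "worker.execute.*"
              par_fn_0(shared);
            else if (work_fn == (void *)&par_fn_1)  // "worker.check.next"
              par_fn_1(shared);
            else                                    // fallback, only if needed
              ((void (*)(void *))work_fn)(shared);
            __kmpc_kernel_end_parallel();           // "worker.parallel_end"
          }
          __kmpc_barrier_simple_spmd(nullptr, 0);   // "worker.inactive"
        }
        return; // workers never fall through into the master code
      }
      // "master_check": the master continues into the sequential kernel body.
    }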
+
+  NumCustomStateMachinesCreated++;
+  NumCustomStateMachinesNoFallback += !RequiresFallback;
+}
+
+static void collectNonParallelGlobalSideEffectsInKernel(
+    CallInst *CInst, SmallVectorImpl<Instruction *> &SideEffectInst,
+    SmallVectorImpl<CallInst *> &RTCalls) {
+
+  SmallVector<Instruction *, 16> Worklist;
+  SmallPtrSet<BasicBlock *, 16> Visited;
+
+  Worklist.push_back(CInst);
+  while (!Worklist.empty()) {
+    Instruction *I = Worklist.pop_back_val();
+
+    if (isSPMDRelatedRTCall(I))
+      RTCalls.push_back(cast<CallInst>(I));
+    else if (I->mayHaveSideEffects() || I->mayReadFromMemory())
+      SideEffectInst.push_back(I);
+
+    if (!I->isTerminator()) {
+      Worklist.push_back(I->getNextNode());
+      continue;
+    }
+
+    for (BasicBlock *SuccBB : successors(I))
+      if (Visited.insert(SuccBB).second)
+        Worklist.push_back(&SuccBB->front());
+  }
+}
+
+static bool
+guardAllSideEffects(Module &M,
+                    SmallVectorImpl<Instruction *> &SideEffectInst) {
+  bool Guarded = true;
+  const DataLayout &DL = M.getDataLayout();
+  for (Instruction *I : SideEffectInst) {
+    if (CallInst *CI = dyn_cast<CallInst>(I)) {
+      if (isIgnoredCall(CI))
+        continue;
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+      if (isa<AllocaInst>(
+              SI->getPointerOperand()->stripInBoundsConstantOffsets()))
+        continue;
+    } else if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+      if (isSafeToLoadUnconditionally(LI->getPointerOperand(),
+                                      LI->getAlignment(), DL))
+        continue;
+    }
+    LLVM_DEBUG(dbgs() << "Non-SPMD side effect found: " << *I << "\n");
+    Guarded = false;
+  }
+  return Guarded;
+}
+
+static bool convertGPUKernelsToSPMD(Module &M) {
+  bool Changed = false;
+
+  Function *GenericKernelInitFn = M.getFunction("__kmpc_generic_kernel_init");
+
+  // If the kernel init function is not present or unused, we are done.
+  if (!GenericKernelInitFn || GenericKernelInitFn->getNumUses() == 0)
+    return Changed;
+
+  LLVMContext &Ctx = M.getContext();
+  for (const Use &U : GenericKernelInitFn->uses()) {
+    CallSite CS(U.getUser());
+
+    // Filter out non-callee uses.
+    if (!CS || !CS.isCallee(&U))
+      continue;
+
+    // Filter out non call-inst uses.
+    if (!isa<CallInst>(CS.getInstruction()))
+      continue;
+
+    auto *CInst = cast<CallInst>(CS.getInstruction());
+
+    // Filter out all but explicit non-SPMD cases.
+    Value *IsSPMDConstVal = CInst->getArgOperand(0);
+    if (!isa<ConstantInt>(IsSPMDConstVal) ||
+        !cast<ConstantInt>(IsSPMDConstVal)->isZero())
+      continue;
+
+    Function *KernelFn = CInst->getFunction();
+
+    // For now we require the init call to be in the entry block, not strictly
+    // necessary but it makes things easier.
+    if (CInst->getParent() != &KernelFn->getEntryBlock())
+      continue;
+
+    // Traverse the kernel from the init to the deinit call and determine if
+    // there are any global side effects outside of parallel sections. If so,
+    // we cannot compute the kernel in SPMD mode (right now).
+    SmallVector<Instruction *, 16> SideEffectInst;
+    SmallVector<CallInst *, 8> RTCalls;
+    collectNonParallelGlobalSideEffectsInKernel(CInst, SideEffectInst, RTCalls);
+    if (!guardAllSideEffects(M, SideEffectInst)) {
+      if (BuildCustomStateMachines)
+        createCustomStateMachine(M, SideEffectInst, RTCalls);
+      continue;
+    }
+
+    ConstantInt *COne = ConstantInt::get(IntegerType::getInt16Ty(Ctx), 1);
+    for (CallInst *RTCall : RTCalls) {
+      if (RTCall->getCalledFunction()->getName().equals(
+              "__kmpc_generic_kernel_parallel")) {
+        Value *Callee = RTCall->getArgOperand(0)->stripPointerCasts();
+        Value *Payload = RTCall->getArgOperand(1);
+        CallInst::Create(Callee, {Payload}, "", RTCall);
+        RTCall->eraseFromParent();
+        continue;
+      }
+
+      assert(RTCall->getArgOperand(0)->getType()->isIntegerTy(16) &&
+             "IsSPMD flag with int16_t expected!");
+      assert(isa<ConstantInt>(IsSPMDConstVal) &&
+             "Constant IsSPMD flag expected!");
+      assert(cast<ConstantInt>(IsSPMDConstVal)->isZero() &&
+             "Consistent IsSPMD flags expected!");
+
+      RTCall->setArgOperand(0, COne);
+      continue;
+    }
+
+    GlobalVariable *ExecMode =
+        M.getGlobalVariable((KernelFn->getName() + "_exec_mode").str());
+    assert(ExecMode &&
+           "Assumed to find an execution mode hint among the globals");
+    assert(ExecMode->getInitializer()->isOneValue() &&
+           "Assumed generic execution mode prior to 'SPMD'-zation");
+    ExecMode->setInitializer(
+        Constant::getNullValue(ExecMode->getInitializer()->getType()));
+
+    NumKernelsConvertedToSPMD++;
+
+    Changed = true;
+  }
+
+  return Changed;
+}
+
+namespace {
+/// OpenMPOpt - The interprocedural OpenMP optimization pass
+struct OpenMPOpt : public ModulePass {
+  static char ID; // Pass identification, replacement for typeid
+  OpenMPOpt() : ModulePass(ID) {
+    initializeOpenMPOptPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnModule(Module &M) override {
+    bool Changed = false;
+    Changed |= convertGPUKernelsToSPMD(M);
+    return Changed;
+  }
+};
+} // namespace
+
+char OpenMPOpt::ID = 0;
+INITIALIZE_PASS(OpenMPOpt, "openmp-opt", "OpenMP specific optimizations", false,
+                false)
+
+ModulePass *llvm::createOpenMPOptPass() { return new OpenMPOpt(); }
Index: llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
===================================================================
--- llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -546,6 +546,8 @@
   addExtensionsToPM(EP_CGSCCOptimizerLate, MPM);
   addFunctionSimplificationPasses(MPM);
 
+  MPM.add(createOpenMPOptPass());
+
   // FIXME: This is a HACK! The inliner pass above implicitly creates a CGSCC
   // pass manager that we are specifically trying to avoid. To prevent this
   // we must insert a no-op module pass to reset the pass manager.
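With the registration hooks above in place, the default pipeline picks the pass up via the PassManagerBuilder hunk, and INITIALIZE_PASS registers it under the name "openmp-opt", so it should also be runnable standalone through opt with the legacy pass manager. A minimal programmatic driver sketch:

    #include "llvm/IR/LegacyPassManager.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Transforms/IPO.h"

    // Run only the new OpenMP optimization pass over a module.
    void runOpenMPOpt(llvm::Module &M) {
      llvm::legacy::PassManager PM;
      PM.add(llvm::createOpenMPOptPass());
      PM.run(M);
    }

The reference IR below shows the pass's end result on the example kernel: the init/deinit calls carry IsSPMD = 1, the __kmpc_generic_kernel_parallel calls have been replaced by direct calls to the wrappers, and the *_exec_mode global is flipped to 0.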
Index: llvm/test/Transforms/OpenMP/target_offload_late_SPMD.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/OpenMP/target_offload_late_SPMD.ll
@@ -0,0 +1,344 @@
+; ModuleID = '/tmp/target_offload_new.ll'
+source_filename = "../llvm/test/Transforms/OpenMP/target_offload_to_SPMD.c"
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvida-cuda"
+
+%struct.ident_t = type { i32, i32, i32, i32, i8* }
+%omp.shared.struct = type { i64, i64, double*, i32*, float* }
+%omp.shared.struct.0 = type { i64, i64, double*, i32*, float* }
+
+@"_openmp_kernel_static_glob_rd$ptr" = internal addrspace(3) global i8* null
+@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
+@0 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2050, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8
+@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8
+@2 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8
+@__omp_offloading_18_280394b_foo_l3_exec_mode = weak constant i8 0
+@llvm.compiler.used = appending global [1 x i8*] [i8* @__omp_offloading_18_280394b_foo_l3_exec_mode], section "llvm.metadata"
+
+; Function Attrs: norecurse nounwind
+define weak void @__omp_offloading_18_280394b_foo_l3(i32* %a, float* %b, double* %c) #0 {
+entry:
+  %.omp.comb.lb.i = alloca i32, align 4
+  %.omp.comb.ub.i = alloca i32, align 4
+  %.omp.stride.i = alloca i32, align 4
+  %.omp.is_last.i = alloca i32, align 4
+  %.captured.i = alloca %omp.shared.struct, align 8
+  %.omp.comb.lb4.i = alloca i32, align 4
+  %.omp.comb.ub5.i = alloca i32, align 4
+  %.omp.stride6.i = alloca i32, align 4
+  %.omp.is_last7.i = alloca i32, align 4
+  %.captured18.i = alloca %omp.shared.struct.0, align 8
+  %0 = call i16 @__kmpc_generic_kernel_init(i16 1, i16 1, i16 1, i16 0)
+  %1 = icmp eq i16 %0, 0
+  br i1 %1, label %.execute, label %.exit
+
+.execute:                                         ; preds = %entry
+  %2 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2)
+  store i32 0, i32* %.omp.comb.lb.i, align 4, !noalias !5
+  store i32 1023, i32* %.omp.comb.ub.i, align 4, !noalias !5
+  store i32 1, i32* %.omp.stride.i, align 4, !noalias !5
+  store i32 0, i32* %.omp.is_last.i, align 4, !noalias !5
+  call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %2, i32 92, i32* %.omp.is_last.i, i32* %.omp.comb.lb.i, i32* %.omp.comb.ub.i, i32* %.omp.stride.i, i32 1, i32 1) #2
+  %3 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !5
+  %cmp.i = icmp sgt i32 %3, 1023
+  br i1 %cmp.i, label %cond.true.i, label %cond.false.i
+
+cond.true.i:                                      ; preds = %.execute
+  br label %cond.end.i
+
+cond.false.i:                                     ; preds = %.execute
+  %4 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !5
+  br label %cond.end.i
+
+cond.end.i:                                       ; preds = %cond.false.i, %cond.true.i
+  %cond.i = phi i32 [ 1023, %cond.true.i ], [ %4, %cond.false.i ]
+  store i32 %cond.i, i32* %.omp.comb.ub.i, align 4, !noalias !5
+  %5 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !5
+  br label %omp.inner.for.cond.i
+
+omp.inner.for.cond.i:                             ; preds = %omp.inner.for.body.i, %cond.end.i
+  %.omp.iv.i.0 = phi i32 [ %5, %cond.end.i ], [ %add.i, %omp.inner.for.body.i ]
+  %6 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !5
+  %cmp1.i = icmp sle i32 %.omp.iv.i.0,
%6 + br i1 %cmp1.i, label %omp.inner.for.body.i, label %omp.inner.for.end.i + +omp.inner.for.body.i: ; preds = %omp.inner.for.cond.i + %7 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !5 + %8 = zext i32 %7 to i64 + %9 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !5 + %10 = zext i32 %9 to i64 + %11 = bitcast %omp.shared.struct* %.captured.i to i8* + %12 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i32 0, i32 0 + store i64 %8, i64* %12, !noalias !5 + %13 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i32 0, i32 1 + store i64 %10, i64* %13, !noalias !5 + %14 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i32 0, i32 2 + store double* %c, double** %14, !noalias !5 + %15 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i32 0, i32 3 + store i32* %a, i32** %15, !noalias !5 + %16 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i32 0, i32 4 + store float* %b, float** %16, !noalias !5 + call void @__omp_outlined__1_wrapper(i8* %11) + %17 = load i32, i32* %.omp.stride.i, align 4, !noalias !5 + %add.i = add nsw i32 %.omp.iv.i.0, %17 + br label %omp.inner.for.cond.i + +omp.inner.for.end.i: ; preds = %omp.inner.for.cond.i + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %2) #2 + store i32 0, i32* %.omp.comb.lb4.i, align 4, !noalias !5 + store i32 1023, i32* %.omp.comb.ub5.i, align 4, !noalias !5 + store i32 1, i32* %.omp.stride6.i, align 4, !noalias !5 + store i32 0, i32* %.omp.is_last7.i, align 4, !noalias !5 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %2, i32 92, i32* %.omp.is_last7.i, i32* %.omp.comb.lb4.i, i32* %.omp.comb.ub5.i, i32* %.omp.stride6.i, i32 1, i32 1) #2 + %18 = load i32, i32* %.omp.comb.ub5.i, align 4, !noalias !5 + %cmp9.i = icmp sgt i32 %18, 1023 + br i1 %cmp9.i, label %cond.true10.i, label %cond.false11.i + +cond.true10.i: ; preds = %omp.inner.for.end.i + br label %cond.end12.i + +cond.false11.i: ; preds = %omp.inner.for.end.i + %19 = load i32, i32* %.omp.comb.ub5.i, align 4, !noalias !5 + br label %cond.end12.i + +cond.end12.i: ; preds = %cond.false11.i, %cond.true10.i + %cond13.i = phi i32 [ 1023, %cond.true10.i ], [ %19, %cond.false11.i ] + store i32 %cond13.i, i32* %.omp.comb.ub5.i, align 4, !noalias !5 + %20 = load i32, i32* %.omp.comb.lb4.i, align 4, !noalias !5 + br label %omp.inner.for.cond14.i + +omp.inner.for.cond14.i: ; preds = %omp.inner.for.body16.i, %cond.end12.i + %.omp.iv2.i.0 = phi i32 [ %20, %cond.end12.i ], [ %add20.i, %omp.inner.for.body16.i ] + %21 = load i32, i32* %.omp.comb.ub5.i, align 4, !noalias !5 + %cmp15.i = icmp sle i32 %.omp.iv2.i.0, %21 + br i1 %cmp15.i, label %omp.inner.for.body16.i, label %__omp_outlined__.exit + +omp.inner.for.body16.i: ; preds = %omp.inner.for.cond14.i + %22 = load i32, i32* %.omp.comb.lb4.i, align 4, !noalias !5 + %23 = zext i32 %22 to i64 + %24 = load i32, i32* %.omp.comb.ub5.i, align 4, !noalias !5 + %25 = zext i32 %24 to i64 + %26 = bitcast %omp.shared.struct.0* %.captured18.i to i8* + %27 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %.captured18.i, i32 0, i32 0 + store i64 %23, i64* %27, !noalias !5 + %28 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %.captured18.i, i32 0, i32 1 + store i64 %25, i64* %28, !noalias !5 + %29 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %.captured18.i, i32 0, i32 2 + store double* %c, double** %29, !noalias !5 + %30 = 
getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %.captured18.i, i32 0, i32 3 + store i32* %a, i32** %30, !noalias !5 + %31 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %.captured18.i, i32 0, i32 4 + store float* %b, float** %31, !noalias !5 + call void @__omp_outlined__2_wrapper(i8* %26) + %32 = load i32, i32* %.omp.stride6.i, align 4, !noalias !5 + %add20.i = add nsw i32 %.omp.iv2.i.0, %32 + br label %omp.inner.for.cond14.i + +__omp_outlined__.exit: ; preds = %omp.inner.for.cond14.i + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %2) #2 + br label %.omp.deinit + +.omp.deinit: ; preds = %__omp_outlined__.exit + call void @__kmpc_generic_kernel_deinit(i16 1, i16 1) + br label %.exit + +.exit: ; preds = %.omp.deinit, %entry + ret void +} + +declare i16 @__kmpc_generic_kernel_init(i16, i16, i16, i16) + +declare void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) + +declare void @__kmpc_for_static_fini(%struct.ident_t*, i32) + +; Function Attrs: norecurse nounwind +define internal void @__omp_outlined__1_wrapper(i8* %payload) #0 { +entry: + %.omp.lb.i = alloca i32, align 4 + %.omp.ub.i = alloca i32, align 4 + %.omp.stride.i = alloca i32, align 4 + %.omp.is_last.i = alloca i32, align 4 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + %1 = bitcast i8* %payload to %omp.shared.struct* + %2 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %1, i32 0, i32 0 + %3 = load i64, i64* %2, align 1 + %4 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %1, i32 0, i32 1 + %5 = load i64, i64* %4, align 1 + %6 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %1, i32 0, i32 2 + %7 = load double*, double** %6, align 1 + %8 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %1, i32 0, i32 3 + %9 = load i32*, i32** %8, align 1 + %10 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %1, i32 0, i32 4 + %11 = load float*, float** %10, align 1 + %12 = bitcast i32* %.omp.lb.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %12) + %13 = bitcast i32* %.omp.ub.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %13) + %14 = bitcast i32* %.omp.stride.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %14) + %15 = bitcast i32* %.omp.is_last.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %15) + store i32 0, i32* %.omp.lb.i, align 4, !noalias !9 + store i32 1023, i32* %.omp.ub.i, align 4, !noalias !9 + %conv.i = trunc i64 %3 to i32 + %conv1.i = trunc i64 %5 to i32 + store i32 %conv.i, i32* %.omp.lb.i, align 4, !noalias !9 + store i32 %conv1.i, i32* %.omp.ub.i, align 4, !noalias !9 + store i32 1, i32* %.omp.stride.i, align 4, !noalias !9 + store i32 0, i32* %.omp.is_last.i, align 4, !noalias !9 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %0, i32 33, i32* %.omp.is_last.i, i32* %.omp.lb.i, i32* %.omp.ub.i, i32* %.omp.stride.i, i32 1, i32 1) #2, !noalias !9 + %16 = load i32, i32* %.omp.lb.i, align 4, !noalias !9 + br label %omp.inner.for.cond.i + +omp.inner.for.cond.i: ; preds = %omp.inner.for.body.i, %entry + %.omp.iv.0.i = phi i32 [ %16, %entry ], [ %add12.i, %omp.inner.for.body.i ] + %conv2.i = sext i32 %.omp.iv.0.i to i64 + %cmp.i = icmp ule i64 %conv2.i, %5 + br i1 %cmp.i, label %omp.inner.for.body.i, label %__omp_outlined__1.exit + +omp.inner.for.body.i: ; preds = %omp.inner.for.cond.i + %idxprom.i = sext i32 %.omp.iv.0.i to i64 + %arrayidx.i = getelementptr inbounds i32, i32* %9, i64 
%idxprom.i + %17 = load i32, i32* %arrayidx.i, align 4, !noalias !9 + %conv4.i = sitofp i32 %17 to float + %idxprom5.i = sext i32 %.omp.iv.0.i to i64 + %arrayidx6.i = getelementptr inbounds float, float* %11, i64 %idxprom5.i + %18 = load float, float* %arrayidx6.i, align 4, !noalias !9 + %mul7.i = fmul float %conv4.i, %18 + %conv8.i = fpext float %mul7.i to double + %idxprom9.i = sext i32 %.omp.iv.0.i to i64 + %arrayidx10.i = getelementptr inbounds double, double* %7, i64 %idxprom9.i + %19 = load double, double* %arrayidx10.i, align 8, !noalias !9 + %add11.i = fadd double %19, %conv8.i + store double %add11.i, double* %arrayidx10.i, align 8, !noalias !9 + %20 = load i32, i32* %.omp.stride.i, align 4, !noalias !9 + %add12.i = add nsw i32 %.omp.iv.0.i, %20 + br label %omp.inner.for.cond.i + +__omp_outlined__1.exit: ; preds = %omp.inner.for.cond.i + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %0) #2, !noalias !9 + %21 = bitcast i32* %.omp.lb.i to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %21) + %22 = bitcast i32* %.omp.ub.i to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %22) + %23 = bitcast i32* %.omp.stride.i to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %23) + %24 = bitcast i32* %.omp.is_last.i to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %24) + ret void +} + +declare i32 @__kmpc_global_thread_num(%struct.ident_t*) + +declare void @__kmpc_generic_kernel_parallel(i8*, i8*, i16, i16) + +; Function Attrs: norecurse nounwind +define internal void @__omp_outlined__2_wrapper(i8* %payload) #0 { +entry: + %.omp.lb.i = alloca i32, align 4 + %.omp.ub.i = alloca i32, align 4 + %.omp.stride.i = alloca i32, align 4 + %.omp.is_last.i = alloca i32, align 4 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + %1 = bitcast i8* %payload to %omp.shared.struct.0* + %2 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %1, i32 0, i32 0 + %3 = load i64, i64* %2, align 1 + %4 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %1, i32 0, i32 1 + %5 = load i64, i64* %4, align 1 + %6 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %1, i32 0, i32 2 + %7 = load double*, double** %6, align 1 + %8 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %1, i32 0, i32 3 + %9 = load i32*, i32** %8, align 1 + %10 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %1, i32 0, i32 4 + %11 = load float*, float** %10, align 1 + %12 = bitcast i32* %.omp.lb.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %12) + %13 = bitcast i32* %.omp.ub.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %13) + %14 = bitcast i32* %.omp.stride.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %14) + %15 = bitcast i32* %.omp.is_last.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %15) + store i32 0, i32* %.omp.lb.i, align 4, !noalias !12 + store i32 1023, i32* %.omp.ub.i, align 4, !noalias !12 + %conv.i = trunc i64 %3 to i32 + %conv1.i = trunc i64 %5 to i32 + store i32 %conv.i, i32* %.omp.lb.i, align 4, !noalias !12 + store i32 %conv1.i, i32* %.omp.ub.i, align 4, !noalias !12 + store i32 1, i32* %.omp.stride.i, align 4, !noalias !12 + store i32 0, i32* %.omp.is_last.i, align 4, !noalias !12 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %0, i32 33, i32* %.omp.is_last.i, i32* %.omp.lb.i, i32* %.omp.ub.i, i32* %.omp.stride.i, i32 1, i32 1) #2, !noalias !12 + %16 = load i32, i32* %.omp.lb.i, align 4, !noalias !12 + br label %omp.inner.for.cond.i + 
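+; Worksharing loop of the second outlined parallel region: each thread walks
+; its static chunk and updates c[i] += (double)((float)a[i] * b[i]).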
+omp.inner.for.cond.i:                             ; preds = %omp.inner.for.body.i, %entry
+  %.omp.iv.0.i = phi i32 [ %16, %entry ], [ %add12.i, %omp.inner.for.body.i ]
+  %conv2.i = sext i32 %.omp.iv.0.i to i64
+  %cmp.i = icmp ule i64 %conv2.i, %5
+  br i1 %cmp.i, label %omp.inner.for.body.i, label %__omp_outlined__2.exit
+
+omp.inner.for.body.i:                             ; preds = %omp.inner.for.cond.i
+  %idxprom.i = sext i32 %.omp.iv.0.i to i64
+  %arrayidx.i = getelementptr inbounds i32, i32* %9, i64 %idxprom.i
+  %17 = load i32, i32* %arrayidx.i, align 4, !noalias !12
+  %conv4.i = sitofp i32 %17 to float
+  %idxprom5.i = sext i32 %.omp.iv.0.i to i64
+  %arrayidx6.i = getelementptr inbounds float, float* %11, i64 %idxprom5.i
+  %18 = load float, float* %arrayidx6.i, align 4, !noalias !12
+  %mul7.i = fmul float %conv4.i, %18
+  %conv8.i = fpext float %mul7.i to double
+  %idxprom9.i = sext i32 %.omp.iv.0.i to i64
+  %arrayidx10.i = getelementptr inbounds double, double* %7, i64 %idxprom9.i
+  %19 = load double, double* %arrayidx10.i, align 8, !noalias !12
+  %add11.i = fadd double %19, %conv8.i
+  store double %add11.i, double* %arrayidx10.i, align 8, !noalias !12
+  %20 = load i32, i32* %.omp.stride.i, align 4, !noalias !12
+  %add12.i = add nsw i32 %.omp.iv.0.i, %20
+  br label %omp.inner.for.cond.i
+
+__omp_outlined__2.exit:                           ; preds = %omp.inner.for.cond.i
+  call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %0) #2, !noalias !12
+  %21 = bitcast i32* %.omp.lb.i to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %21)
+  %22 = bitcast i32* %.omp.ub.i to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %22)
+  %23 = bitcast i32* %.omp.stride.i to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %23)
+  %24 = bitcast i32* %.omp.is_last.i to i8*
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %24)
+  ret void
+}
+
+declare void @__kmpc_generic_kernel_deinit(i16, i16)
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
+
+attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { argmemonly nounwind }
+attributes #2 = { nounwind }
+
+!omp_offload.info = !{!0}
+!nvvm.annotations = !{!1}
+!llvm.module.flags = !{!2, !3}
+!llvm.ident = !{!4}
+
+!0 = !{i32 0, i32 24, i32 41957707, !"foo", i32 3, i32 0}
+!1 = !{void (i32*, float*, double*)* @__omp_offloading_18_280394b_foo_l3, !"kernel", i32 1}
+!2 = !{i32 1, !"wchar_size", i32 4}
+!3 = !{i32 7, !"PIC Level", i32 2}
+!4 = !{!"clang version 9.0.0 "}
+!5 = !{!6, !8}
+!6 = distinct !{!6, !7, !"__omp_outlined__: %.global_tid."}
+!7 = distinct !{!7, !"__omp_outlined__"}
+!8 = distinct !{!8, !7, !"__omp_outlined__: %.bound_tid."}
+!9 = !{!10}
+!10 = distinct !{!10, !11, !"__omp_outlined__1: %.global_tid."}
+!11 = distinct !{!11, !"__omp_outlined__1"}
+!12 = !{!13}
+!13 = distinct !{!13, !14, !"__omp_outlined__2: %.global_tid."}
+!14 = distinct !{!14, !"__omp_outlined__2"}
Index: llvm/test/Transforms/OpenMP/target_offload_new.ll
===================================================================
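Note: the device-side modules in the files below were compiled from
../llvm/test/Transforms/OpenMP/target_offload_to_SPMD.c, which is not included
in this patch. A minimal sketch consistent with the IR (two combined
"distribute parallel for" loops over 1024 elements inside a single
"target teams" region, each computing c[i] += a[i] * b[i]) would look as
follows; the map clauses are an assumption, only the loop structure and the
element computation are confirmed by the IR:

  #define N 1024

  /* Hypothetical reconstruction, not part of the patch. */
  void foo(int *a, float *b, double *c) {
    #pragma omp target teams map(to: a[:N], b[:N]) map(tofrom: c[:N])
    {
      #pragma omp distribute parallel for
      for (int i = 0; i < N; i++)
        c[i] += a[i] * b[i];   /* int * float -> float, widened to double */

      #pragma omp distribute parallel for
      for (int i = 0; i < N; i++)
        c[i] += a[i] * b[i];
    }
  }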
--- /dev/null +++ llvm/test/Transforms/OpenMP/target_offload_new.ll @@ -0,0 +1,468 @@ + +; __CLANG_OFFLOAD_BUNDLE____START__ openmp-nvptx64-nvida-cud +; ModuleID = '../llvm/test/Transforms/OpenMP/target_offload_to_SPMD.c' +source_filename = "../llvm/test/Transforms/OpenMP/target_offload_to_SPMD.c" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvida-cud" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } +%omp.shared.struct = type { i64, i64, double*, i32*, float* } +%omp.shared.struct.0 = type { i64, i64, double*, i32*, float* } + +@"_openmp_kernel_static_glob_rd$ptr" = internal addrspace(3) global i8* null +@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2050, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@2 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@__omp_offloading_18_280394b_foo_l3_exec_mode = weak constant i8 1 +@llvm.compiler.used = appending global [1 x i8*] [i8* @__omp_offloading_18_280394b_foo_l3_exec_mode], section "llvm.metadata" + +; Function Attrs: norecurse nounwind +define weak void @__omp_offloading_18_280394b_foo_l3(i32* %a, float* %b, double* %c) #0 { +entry: + %.global_tid..addr.i = alloca i32*, align 8 + %.bound_tid..addr.i = alloca i32*, align 8 + %a.addr.i = alloca i32*, align 8 + %b.addr.i = alloca float*, align 8 + %c.addr.i = alloca double*, align 8 + %.omp.iv.i = alloca i32, align 4 + %tmp.i = alloca i32, align 4 + %.omp.comb.lb.i = alloca i32, align 4 + %.omp.comb.ub.i = alloca i32, align 4 + %.omp.stride.i = alloca i32, align 4 + %.omp.is_last.i = alloca i32, align 4 + %i.i = alloca i32, align 4 + %.zero.addr.i = alloca i32, align 4 + %.captured.i = alloca %omp.shared.struct, align 8 + %.omp.iv2.i = alloca i32, align 4 + %tmp3.i = alloca i32, align 4 + %.omp.comb.lb4.i = alloca i32, align 4 + %.omp.comb.ub5.i = alloca i32, align 4 + %.omp.stride6.i = alloca i32, align 4 + %.omp.is_last7.i = alloca i32, align 4 + %i8.i = alloca i32, align 4 + %.zero.addr17.i = alloca i32, align 4 + %.captured18.i = alloca %omp.shared.struct.0, align 8 + %a.addr = alloca i32*, align 8 + %b.addr = alloca float*, align 8 + %c.addr = alloca double*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. 
= alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i32* %a, i32** %a.addr, align 8 + store float* %b, float** %b.addr, align 8 + store double* %c, double** %c.addr, align 8 + %0 = call i16 @__kmpc_generic_kernel_init(i16 0, i16 1, i16 1, i16 0) + %1 = icmp eq i16 %0, 0 + br i1 %1, label %.execute, label %.exit + +.execute: ; preds = %entry + %2 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + %3 = load i32*, i32** %a.addr, align 8 + %4 = load float*, float** %b.addr, align 8 + %5 = load double*, double** %c.addr, align 8 + store i32 %2, i32* %.threadid_temp., align 4 + store i32 0, i32* %.zero.addr17.i, align 4, !noalias !5 + store i32 0, i32* %.zero.addr.i, align 4, !noalias !5 + store i32* %.threadid_temp., i32** %.global_tid..addr.i, align 8, !noalias !5 + store i32* %.zero.addr, i32** %.bound_tid..addr.i, align 8, !noalias !5 + store i32* %3, i32** %a.addr.i, align 8, !noalias !5 + store float* %4, float** %b.addr.i, align 8, !noalias !5 + store double* %5, double** %c.addr.i, align 8, !noalias !5 + store i32 0, i32* %.omp.comb.lb.i, align 4, !noalias !5 + store i32 1023, i32* %.omp.comb.ub.i, align 4, !noalias !5 + store i32 1, i32* %.omp.stride.i, align 4, !noalias !5 + store i32 0, i32* %.omp.is_last.i, align 4, !noalias !5 + %6 = load i32*, i32** %.global_tid..addr.i, align 8, !noalias !5 + %7 = load i32, i32* %6, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %7, i32 92, i32* %.omp.is_last.i, i32* %.omp.comb.lb.i, i32* %.omp.comb.ub.i, i32* %.omp.stride.i, i32 1, i32 1) #2 + %8 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !5 + %cmp.i = icmp sgt i32 %8, 1023 + br i1 %cmp.i, label %cond.true.i, label %cond.false.i + +cond.true.i: ; preds = %.execute + br label %cond.end.i + +cond.false.i: ; preds = %.execute + %9 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !5 + br label %cond.end.i + +cond.end.i: ; preds = %cond.false.i, %cond.true.i + %cond.i = phi i32 [ 1023, %cond.true.i ], [ %9, %cond.false.i ] + store i32 %cond.i, i32* %.omp.comb.ub.i, align 4, !noalias !5 + %10 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !5 + store i32 %10, i32* %.omp.iv.i, align 4, !noalias !5 + br label %omp.inner.for.cond.i + +omp.inner.for.cond.i: ; preds = %omp.inner.for.body.i, %cond.end.i + %11 = load i32, i32* %.omp.iv.i, align 4, !noalias !5 + %12 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !5 + %cmp1.i = icmp sle i32 %11, %12 + br i1 %cmp1.i, label %omp.inner.for.body.i, label %omp.inner.for.end.i + +omp.inner.for.body.i: ; preds = %omp.inner.for.cond.i + %13 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !5 + %14 = zext i32 %13 to i64 + %15 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !5 + %16 = zext i32 %15 to i64 + %17 = load double*, double** %c.addr.i, align 8, !noalias !5 + %18 = load i32*, i32** %a.addr.i, align 8, !noalias !5 + %19 = load float*, float** %b.addr.i, align 8, !noalias !5 + %20 = bitcast %omp.shared.struct* %.captured.i to i8* + %21 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i32 0, i32 0 + store i64 %14, i64* %21, !noalias !5 + %22 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i32 0, i32 1 + store i64 %16, i64* %22, !noalias !5 + %23 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i32 0, i32 2 + store double* %17, double** %23, !noalias !5 + %24 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i32 0, i32 3 + store i32* %18, i32** %24, !noalias !5 + %25 = 
getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i32 0, i32 4 + store float* %19, float** %25, !noalias !5 + call void @__kmpc_generic_kernel_parallel(i8* bitcast (void (i8*)* @__omp_outlined__1_wrapper to i8*), i8* %20, i16 40, i16 1) #2 + %26 = load i32, i32* %.omp.iv.i, align 4, !noalias !5 + %27 = load i32, i32* %.omp.stride.i, align 4, !noalias !5 + %add.i = add nsw i32 %26, %27 + store i32 %add.i, i32* %.omp.iv.i, align 4, !noalias !5 + br label %omp.inner.for.cond.i + +omp.inner.for.end.i: ; preds = %omp.inner.for.cond.i + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %7) #2 + store i32 0, i32* %.omp.comb.lb4.i, align 4, !noalias !5 + store i32 1023, i32* %.omp.comb.ub5.i, align 4, !noalias !5 + store i32 1, i32* %.omp.stride6.i, align 4, !noalias !5 + store i32 0, i32* %.omp.is_last7.i, align 4, !noalias !5 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %7, i32 92, i32* %.omp.is_last7.i, i32* %.omp.comb.lb4.i, i32* %.omp.comb.ub5.i, i32* %.omp.stride6.i, i32 1, i32 1) #2 + %28 = load i32, i32* %.omp.comb.ub5.i, align 4, !noalias !5 + %cmp9.i = icmp sgt i32 %28, 1023 + br i1 %cmp9.i, label %cond.true10.i, label %cond.false11.i + +cond.true10.i: ; preds = %omp.inner.for.end.i + br label %cond.end12.i + +cond.false11.i: ; preds = %omp.inner.for.end.i + %29 = load i32, i32* %.omp.comb.ub5.i, align 4, !noalias !5 + br label %cond.end12.i + +cond.end12.i: ; preds = %cond.false11.i, %cond.true10.i + %cond13.i = phi i32 [ 1023, %cond.true10.i ], [ %29, %cond.false11.i ] + store i32 %cond13.i, i32* %.omp.comb.ub5.i, align 4, !noalias !5 + %30 = load i32, i32* %.omp.comb.lb4.i, align 4, !noalias !5 + store i32 %30, i32* %.omp.iv2.i, align 4, !noalias !5 + br label %omp.inner.for.cond14.i + +omp.inner.for.cond14.i: ; preds = %omp.inner.for.body16.i, %cond.end12.i + %31 = load i32, i32* %.omp.iv2.i, align 4, !noalias !5 + %32 = load i32, i32* %.omp.comb.ub5.i, align 4, !noalias !5 + %cmp15.i = icmp sle i32 %31, %32 + br i1 %cmp15.i, label %omp.inner.for.body16.i, label %__omp_outlined__.exit + +omp.inner.for.body16.i: ; preds = %omp.inner.for.cond14.i + %33 = load i32, i32* %.omp.comb.lb4.i, align 4, !noalias !5 + %34 = zext i32 %33 to i64 + %35 = load i32, i32* %.omp.comb.ub5.i, align 4, !noalias !5 + %36 = zext i32 %35 to i64 + %37 = load double*, double** %c.addr.i, align 8, !noalias !5 + %38 = load i32*, i32** %a.addr.i, align 8, !noalias !5 + %39 = load float*, float** %b.addr.i, align 8, !noalias !5 + %40 = bitcast %omp.shared.struct.0* %.captured18.i to i8* + %41 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %.captured18.i, i32 0, i32 0 + store i64 %34, i64* %41, !noalias !5 + %42 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %.captured18.i, i32 0, i32 1 + store i64 %36, i64* %42, !noalias !5 + %43 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %.captured18.i, i32 0, i32 2 + store double* %37, double** %43, !noalias !5 + %44 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %.captured18.i, i32 0, i32 3 + store i32* %38, i32** %44, !noalias !5 + %45 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %.captured18.i, i32 0, i32 4 + store float* %39, float** %45, !noalias !5 + call void @__kmpc_generic_kernel_parallel(i8* bitcast (void (i8*)* @__omp_outlined__2_wrapper to i8*), i8* %40, i16 40, i16 1) #2 + %46 = load i32, i32* %.omp.iv2.i, align 4, !noalias !5 + %47 = load i32, i32* %.omp.stride6.i, align 4, !noalias !5 + %add20.i 
= add nsw i32 %46, %47 + store i32 %add20.i, i32* %.omp.iv2.i, align 4, !noalias !5 + br label %omp.inner.for.cond14.i + +__omp_outlined__.exit: ; preds = %omp.inner.for.cond14.i + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %7) #2 + br label %.omp.deinit + +.omp.deinit: ; preds = %__omp_outlined__.exit + call void @__kmpc_generic_kernel_deinit(i16 0, i16 1) + br label %.exit + +.exit: ; preds = %.omp.deinit, %entry + ret void +} + +declare i16 @__kmpc_generic_kernel_init(i16, i16, i16, i16) + +declare void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) + +; Function Attrs: norecurse nounwind +define internal void @__omp_outlined__1(i32* noalias %.global_tid., i32* noalias %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., double* %c, i32* %a, float* %b) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %.previous.lb..addr = alloca i64, align 8 + %.previous.ub..addr = alloca i64, align 8 + %c.addr = alloca double*, align 8 + %a.addr = alloca i32*, align 8 + %b.addr = alloca float*, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i64 %.previous.lb., i64* %.previous.lb..addr, align 8 + store i64 %.previous.ub., i64* %.previous.ub..addr, align 8 + store double* %c, double** %c.addr, align 8 + store i32* %a, i32** %a.addr, align 8 + store float* %b, float** %b.addr, align 8 + store i32 0, i32* %.omp.lb, align 4 + store i32 1023, i32* %.omp.ub, align 4 + %0 = load i64, i64* %.previous.lb..addr, align 8 + %conv = trunc i64 %0 to i32 + %1 = load i64, i64* %.previous.ub..addr, align 8 + %conv1 = trunc i64 %1 to i32 + store i32 %conv, i32* %.omp.lb, align 4 + store i32 %conv1, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %2 = load i32*, i32** %.global_tid..addr, align 8 + %3 = load i32, i32* %2, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %3, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %4 = load i32, i32* %.omp.lb, align 4 + store i32 %4, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %entry + %5 = load i32, i32* %.omp.iv, align 4 + %conv2 = sext i32 %5 to i64 + %6 = load i64, i64* %.previous.ub..addr, align 8 + %cmp = icmp ule i64 %conv2, %6 + br i1 %cmp, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %7 = load i32, i32* %.omp.iv, align 4 + %mul = mul nsw i32 %7, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %i, align 4 + %8 = load i32*, i32** %a.addr, align 8 + %9 = load i32, i32* %i, align 4 + %idxprom = sext i32 %9 to i64 + %arrayidx = getelementptr inbounds i32, i32* %8, i64 %idxprom + %10 = load i32, i32* %arrayidx, align 4 + %conv4 = sitofp i32 %10 to float + %11 = load float*, float** %b.addr, align 8 + %12 = load i32, i32* %i, align 4 + %idxprom5 = sext i32 %12 to i64 + %arrayidx6 = getelementptr inbounds float, float* %11, i64 %idxprom5 + %13 = load float, float* %arrayidx6, align 4 + %mul7 = fmul float %conv4, %13 + %conv8 = fpext float %mul7 to double + %14 = load double*, double** %c.addr, align 8 + 
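+  ; a[i] and b[i] were loaded above; the remaining loads and the store below
+  ; complete c[i] += (double)((float)a[i] * b[i]) for this iteration.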
%15 = load i32, i32* %i, align 4 + %idxprom9 = sext i32 %15 to i64 + %arrayidx10 = getelementptr inbounds double, double* %14, i64 %idxprom9 + %16 = load double, double* %arrayidx10, align 8 + %add11 = fadd double %16, %conv8 + store double %add11, double* %arrayidx10, align 8 + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %17 = load i32, i32* %.omp.iv, align 4 + %18 = load i32, i32* %.omp.stride, align 4 + %add12 = add nsw i32 %17, %18 + store i32 %add12, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %3) + ret void +} + +declare void @__kmpc_for_static_fini(%struct.ident_t*, i32) + +; Function Attrs: norecurse nounwind +define internal void @__omp_outlined__1_wrapper(i8* %payload) #1 { +entry: + %.addr = alloca i8*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. = alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i8* %payload, i8** %.addr, align 8 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + store i32 %0, i32* %.threadid_temp., align 4 + %1 = load i8*, i8** %.addr, align 8 + %2 = bitcast i8* %1 to %omp.shared.struct* + %3 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %2, i32 0, i32 0 + %4 = load i64, i64* %3, align 1 + %5 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %2, i32 0, i32 1 + %6 = load i64, i64* %5, align 1 + %7 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %2, i32 0, i32 2 + %8 = load double*, double** %7, align 1 + %9 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %2, i32 0, i32 3 + %10 = load i32*, i32** %9, align 1 + %11 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %2, i32 0, i32 4 + %12 = load float*, float** %11, align 1 + call void @__omp_outlined__1(i32* %.threadid_temp., i32* %.zero.addr, i64 %4, i64 %6, double* %8, i32* %10, float* %12) #2 + ret void +} + +declare i32 @__kmpc_global_thread_num(%struct.ident_t*) + +declare void @__kmpc_generic_kernel_parallel(i8*, i8*, i16, i16) + +; Function Attrs: norecurse nounwind +define internal void @__omp_outlined__2(i32* noalias %.global_tid., i32* noalias %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., double* %c, i32* %a, float* %b) #0 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %.previous.lb..addr = alloca i64, align 8 + %.previous.ub..addr = alloca i64, align 8 + %c.addr = alloca double*, align 8 + %a.addr = alloca i32*, align 8 + %b.addr = alloca float*, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i64 %.previous.lb., i64* %.previous.lb..addr, align 8 + store i64 %.previous.ub., i64* %.previous.ub..addr, align 8 + store double* %c, double** %c.addr, align 8 + store i32* %a, i32** %a.addr, align 8 + store float* %b, float** %b.addr, align 8 + store i32 0, i32* %.omp.lb, align 4 + store i32 1023, i32* %.omp.ub, align 4 + %0 = load i64, i64* %.previous.lb..addr, align 8 + %conv = trunc i64 %0 to 
i32 + %1 = load i64, i64* %.previous.ub..addr, align 8 + %conv1 = trunc i64 %1 to i32 + store i32 %conv, i32* %.omp.lb, align 4 + store i32 %conv1, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %2 = load i32*, i32** %.global_tid..addr, align 8 + %3 = load i32, i32* %2, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %3, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %4 = load i32, i32* %.omp.lb, align 4 + store i32 %4, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %entry + %5 = load i32, i32* %.omp.iv, align 4 + %conv2 = sext i32 %5 to i64 + %6 = load i64, i64* %.previous.ub..addr, align 8 + %cmp = icmp ule i64 %conv2, %6 + br i1 %cmp, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %7 = load i32, i32* %.omp.iv, align 4 + %mul = mul nsw i32 %7, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %i, align 4 + %8 = load i32*, i32** %a.addr, align 8 + %9 = load i32, i32* %i, align 4 + %idxprom = sext i32 %9 to i64 + %arrayidx = getelementptr inbounds i32, i32* %8, i64 %idxprom + %10 = load i32, i32* %arrayidx, align 4 + %conv4 = sitofp i32 %10 to float + %11 = load float*, float** %b.addr, align 8 + %12 = load i32, i32* %i, align 4 + %idxprom5 = sext i32 %12 to i64 + %arrayidx6 = getelementptr inbounds float, float* %11, i64 %idxprom5 + %13 = load float, float* %arrayidx6, align 4 + %mul7 = fmul float %conv4, %13 + %conv8 = fpext float %mul7 to double + %14 = load double*, double** %c.addr, align 8 + %15 = load i32, i32* %i, align 4 + %idxprom9 = sext i32 %15 to i64 + %arrayidx10 = getelementptr inbounds double, double* %14, i64 %idxprom9 + %16 = load double, double* %arrayidx10, align 8 + %add11 = fadd double %16, %conv8 + store double %add11, double* %arrayidx10, align 8 + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %17 = load i32, i32* %.omp.iv, align 4 + %18 = load i32, i32* %.omp.stride, align 4 + %add12 = add nsw i32 %17, %18 + store i32 %add12, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %3) + ret void +} + +; Function Attrs: norecurse nounwind +define internal void @__omp_outlined__2_wrapper(i8* %payload) #1 { +entry: + %.addr = alloca i8*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. 
= alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i8* %payload, i8** %.addr, align 8 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + store i32 %0, i32* %.threadid_temp., align 4 + %1 = load i8*, i8** %.addr, align 8 + %2 = bitcast i8* %1 to %omp.shared.struct.0* + %3 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %2, i32 0, i32 0 + %4 = load i64, i64* %3, align 1 + %5 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %2, i32 0, i32 1 + %6 = load i64, i64* %5, align 1 + %7 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %2, i32 0, i32 2 + %8 = load double*, double** %7, align 1 + %9 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %2, i32 0, i32 3 + %10 = load i32*, i32** %9, align 1 + %11 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %2, i32 0, i32 4 + %12 = load float*, float** %11, align 1 + call void @__omp_outlined__2(i32* %.threadid_temp., i32* %.zero.addr, i64 %4, i64 %6, double* %8, i32* %10, float* %12) #2 + ret void +} + +declare void @__kmpc_generic_kernel_deinit(i16, i16) + +attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind } + +!omp_offload.info = !{!0} +!nvvm.annotations = !{!1} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = !{i32 0, i32 24, i32 41957707, !"foo", i32 3, i32 0} +!1 = !{void (i32*, float*, double*)* @__omp_offloading_18_280394b_foo_l3, !"kernel", i32 1} +!2 = !{i32 1, !"wchar_size", i32 4} +!3 = !{i32 7, !"PIC Level", i32 2} +!4 = !{!"clang version 9.0.0 "} +!5 = !{!6, !8} +!6 = distinct !{!6, !7, !"__omp_outlined__: %.global_tid."} +!7 = distinct !{!7, !"__omp_outlined__"} +!8 = distinct !{!8, !7, !"__omp_outlined__: %.bound_tid."} + +; __CLANG_OFFLOAD_BUNDLE____END__ openmp-nvptx64-nvida-cud + Index: llvm/test/Transforms/OpenMP/target_offload_no_SPMD_custom_sm.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/OpenMP/target_offload_no_SPMD_custom_sm.ll @@ -0,0 +1,399 @@ +; ModuleID = '/tmp/target_offload_new.ll' +source_filename = "../llvm/test/Transforms/OpenMP/target_offload_to_SPMD.c" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvida-cud" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } +%omp.shared.struct = type { i64, i64, double*, i32*, float* } +%omp.shared.struct.0 = type { i64, i64, double*, i32*, float* } + +@"_openmp_kernel_static_glob_rd$ptr" = internal addrspace(3) global i8* null +@.str = 
private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2050, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@2 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@__omp_offloading_18_280394b_foo_l3_exec_mode = weak constant i8 1 +@llvm.compiler.used = appending global [1 x i8*] [i8* @__omp_offloading_18_280394b_foo_l3_exec_mode], section "llvm.metadata" + +; Function Attrs: norecurse nounwind +define weak void @__omp_offloading_18_280394b_foo_l3(i32* %a, float* %b, double* %c) #0 { +entry: + %work_fn.addr = alloca i8* + %.omp.comb.lb.i = alloca i32, align 4 + %.omp.comb.ub.i = alloca i32, align 4 + %.omp.stride.i = alloca i32, align 4 + %.omp.is_last.i = alloca i32, align 4 + %.captured.i = alloca %omp.shared.struct, align 8 + %.omp.comb.lb4.i = alloca i32, align 4 + %.omp.comb.ub5.i = alloca i32, align 4 + %.omp.stride6.i = alloca i32, align 4 + %.omp.is_last7.i = alloca i32, align 4 + %.captured18.i = alloca %omp.shared.struct.0, align 8 + %thread_kind = call i16 @__kmpc_generic_kernel_init(i16 0, i16 0, i16 1, i16 0) + %is_worker = icmp eq i16 %thread_kind, -1 + br i1 %is_worker, label %worker.wait, label %master_check + +worker.wait: ; preds = %worker.inactive, %entry + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) + %is_active = call i1 @__kmpc_kernel_parallel(i8** %work_fn.addr, i16 1) + %Work_fn.addr_cast = bitcast i8** %work_fn.addr to void (i8*)** + %work_fn = load void (i8*)*, void (i8*)** %Work_fn.addr_cast + %no_work = icmp eq void (i8*)* %work_fn, null + br i1 %no_work, label %worker.finished, label %worker.active_check + +worker.finished: ; preds = %worker.wait + br label %master_check + +worker.active_check: ; preds = %worker.wait + br i1 %is_active, label %worker.active, label %worker.inactive + +worker.active: ; preds = %worker.active_check + %0 = call i8* @__kmpc_get_shared_variables() + %par_fn_check = icmp eq void (i8*)* %work_fn, @__omp_outlined__2_wrapper + br i1 %par_fn_check, label %worker.execute.__omp_outlined__2_wrapper, label %worker.check.next + +worker.execute.__omp_outlined__2_wrapper: ; preds = %worker.active + call void @__omp_outlined__2_wrapper(i8* %0) + br label %worker.parallel_end + +worker.check.next: ; preds = %worker.active + %par_fn_check1 = icmp eq void (i8*)* %work_fn, @__omp_outlined__1_wrapper + br i1 %par_fn_check1, label %worker.execute.__omp_outlined__1_wrapper, label %worker.check.next2 + +worker.execute.__omp_outlined__1_wrapper: ; preds = %worker.check.next + call void @__omp_outlined__1_wrapper(i8* %0) + br label %worker.parallel_end + +worker.check.next2: ; preds = %worker.check.next + br label %worker.parallel_end + +worker.parallel_end: ; preds = %worker.execute.__omp_outlined__1_wrapper, %worker.execute.__omp_outlined__2_wrapper, %worker.check.next2 + call void @__kmpc_kernel_end_parallel() + br label %worker.inactive + +worker.inactive: ; preds = %worker.active_check, %worker.parallel_end + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) + br label %worker.wait + +master_check: ; preds = %worker.finished, %entry + %1 = icmp eq i16 %thread_kind, 0 + br i1 %1, label %.execute, label 
%.exit + +.execute: ; preds = %master_check + %2 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + store i32 0, i32* %.omp.comb.lb.i, align 4, !noalias !5 + store i32 1023, i32* %.omp.comb.ub.i, align 4, !noalias !5 + store i32 1, i32* %.omp.stride.i, align 4, !noalias !5 + store i32 0, i32* %.omp.is_last.i, align 4, !noalias !5 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %2, i32 92, i32* %.omp.is_last.i, i32* %.omp.comb.lb.i, i32* %.omp.comb.ub.i, i32* %.omp.stride.i, i32 1, i32 1) #2 + %3 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !5 + %cmp.i = icmp sgt i32 %3, 1023 + br i1 %cmp.i, label %cond.true.i, label %cond.false.i + +cond.true.i: ; preds = %.execute + br label %cond.end.i + +cond.false.i: ; preds = %.execute + %4 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !5 + br label %cond.end.i + +cond.end.i: ; preds = %cond.false.i, %cond.true.i + %cond.i = phi i32 [ 1023, %cond.true.i ], [ %4, %cond.false.i ] + store i32 %cond.i, i32* %.omp.comb.ub.i, align 4, !noalias !5 + %5 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !5 + br label %omp.inner.for.cond.i + +omp.inner.for.cond.i: ; preds = %omp.inner.for.body.i, %cond.end.i + %.omp.iv.i.0 = phi i32 [ %5, %cond.end.i ], [ %add.i, %omp.inner.for.body.i ] + %6 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !5 + %cmp1.i = icmp sle i32 %.omp.iv.i.0, %6 + br i1 %cmp1.i, label %omp.inner.for.body.i, label %omp.inner.for.end.i + +omp.inner.for.body.i: ; preds = %omp.inner.for.cond.i + %7 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !5 + %8 = zext i32 %7 to i64 + %9 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !5 + %10 = zext i32 %9 to i64 + %11 = bitcast %omp.shared.struct* %.captured.i to i8* + %12 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i32 0, i32 0 + store i64 %8, i64* %12, !noalias !5 + %13 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i32 0, i32 1 + store i64 %10, i64* %13, !noalias !5 + %14 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i32 0, i32 2 + store double* %c, double** %14, !noalias !5 + %15 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i32 0, i32 3 + store i32* %a, i32** %15, !noalias !5 + %16 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %.captured.i, i32 0, i32 4 + store float* %b, float** %16, !noalias !5 + call void @__kmpc_generic_kernel_parallel(i8* bitcast (void (i8*)* @__omp_outlined__1_wrapper to i8*), i8* %11, i16 40, i16 1) #2 + %17 = load i32, i32* %.omp.stride.i, align 4, !noalias !5 + %add.i = add nsw i32 %.omp.iv.i.0, %17 + br label %omp.inner.for.cond.i + +omp.inner.for.end.i: ; preds = %omp.inner.for.cond.i + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %2) #2 + store i32 0, i32* %.omp.comb.lb4.i, align 4, !noalias !5 + store i32 1023, i32* %.omp.comb.ub5.i, align 4, !noalias !5 + store i32 1, i32* %.omp.stride6.i, align 4, !noalias !5 + store i32 0, i32* %.omp.is_last7.i, align 4, !noalias !5 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %2, i32 92, i32* %.omp.is_last7.i, i32* %.omp.comb.lb4.i, i32* %.omp.comb.ub5.i, i32* %.omp.stride6.i, i32 1, i32 1) #2 + %18 = load i32, i32* %.omp.comb.ub5.i, align 4, !noalias !5 + %cmp9.i = icmp sgt i32 %18, 1023 + br i1 %cmp9.i, label %cond.true10.i, label %cond.false11.i + +cond.true10.i: ; preds = %omp.inner.for.end.i + br label %cond.end12.i + +cond.false11.i: ; preds = %omp.inner.for.end.i + %19 = load i32, i32* 
%.omp.comb.ub5.i, align 4, !noalias !5 + br label %cond.end12.i + +cond.end12.i: ; preds = %cond.false11.i, %cond.true10.i + %cond13.i = phi i32 [ 1023, %cond.true10.i ], [ %19, %cond.false11.i ] + store i32 %cond13.i, i32* %.omp.comb.ub5.i, align 4, !noalias !5 + %20 = load i32, i32* %.omp.comb.lb4.i, align 4, !noalias !5 + br label %omp.inner.for.cond14.i + +omp.inner.for.cond14.i: ; preds = %omp.inner.for.body16.i, %cond.end12.i + %.omp.iv2.i.0 = phi i32 [ %20, %cond.end12.i ], [ %add20.i, %omp.inner.for.body16.i ] + %21 = load i32, i32* %.omp.comb.ub5.i, align 4, !noalias !5 + %cmp15.i = icmp sle i32 %.omp.iv2.i.0, %21 + br i1 %cmp15.i, label %omp.inner.for.body16.i, label %__omp_outlined__.exit + +omp.inner.for.body16.i: ; preds = %omp.inner.for.cond14.i + %22 = load i32, i32* %.omp.comb.lb4.i, align 4, !noalias !5 + %23 = zext i32 %22 to i64 + %24 = load i32, i32* %.omp.comb.ub5.i, align 4, !noalias !5 + %25 = zext i32 %24 to i64 + %26 = bitcast %omp.shared.struct.0* %.captured18.i to i8* + %27 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %.captured18.i, i32 0, i32 0 + store i64 %23, i64* %27, !noalias !5 + %28 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %.captured18.i, i32 0, i32 1 + store i64 %25, i64* %28, !noalias !5 + %29 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %.captured18.i, i32 0, i32 2 + store double* %c, double** %29, !noalias !5 + %30 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %.captured18.i, i32 0, i32 3 + store i32* %a, i32** %30, !noalias !5 + %31 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %.captured18.i, i32 0, i32 4 + store float* %b, float** %31, !noalias !5 + call void @__kmpc_generic_kernel_parallel(i8* bitcast (void (i8*)* @__omp_outlined__2_wrapper to i8*), i8* %26, i16 40, i16 1) #2 + %32 = load i32, i32* %.omp.stride6.i, align 4, !noalias !5 + %add20.i = add nsw i32 %.omp.iv2.i.0, %32 + br label %omp.inner.for.cond14.i + +__omp_outlined__.exit: ; preds = %omp.inner.for.cond14.i + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %2) #2 + br label %.omp.deinit + +.omp.deinit: ; preds = %__omp_outlined__.exit + call void @__kmpc_generic_kernel_deinit(i16 0, i16 1) + br label %.exit + +.exit: ; preds = %.omp.deinit, %master_check + ret void +} + +declare i16 @__kmpc_generic_kernel_init(i16, i16, i16, i16) + +declare void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) + +declare void @__kmpc_for_static_fini(%struct.ident_t*, i32) + +; Function Attrs: norecurse nounwind +define internal void @__omp_outlined__1_wrapper(i8* %payload) #0 { +entry: + %.omp.lb.i = alloca i32, align 4 + %.omp.ub.i = alloca i32, align 4 + %.omp.stride.i = alloca i32, align 4 + %.omp.is_last.i = alloca i32, align 4 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + %1 = bitcast i8* %payload to %omp.shared.struct* + %2 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %1, i32 0, i32 0 + %3 = load i64, i64* %2, align 1 + %4 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %1, i32 0, i32 1 + %5 = load i64, i64* %4, align 1 + %6 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %1, i32 0, i32 2 + %7 = load double*, double** %6, align 1 + %8 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %1, i32 0, i32 3 + %9 = load i32*, i32** %8, align 1 + %10 = getelementptr inbounds %omp.shared.struct, %omp.shared.struct* %1, i32 0, i32 4 + %11 = load 
float*, float** %10, align 1 + %12 = bitcast i32* %.omp.lb.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %12) + %13 = bitcast i32* %.omp.ub.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %13) + %14 = bitcast i32* %.omp.stride.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %14) + %15 = bitcast i32* %.omp.is_last.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %15) + store i32 0, i32* %.omp.lb.i, align 4, !noalias !9 + store i32 1023, i32* %.omp.ub.i, align 4, !noalias !9 + %conv.i = trunc i64 %3 to i32 + %conv1.i = trunc i64 %5 to i32 + store i32 %conv.i, i32* %.omp.lb.i, align 4, !noalias !9 + store i32 %conv1.i, i32* %.omp.ub.i, align 4, !noalias !9 + store i32 1, i32* %.omp.stride.i, align 4, !noalias !9 + store i32 0, i32* %.omp.is_last.i, align 4, !noalias !9 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %0, i32 33, i32* %.omp.is_last.i, i32* %.omp.lb.i, i32* %.omp.ub.i, i32* %.omp.stride.i, i32 1, i32 1) #2, !noalias !9 + %16 = load i32, i32* %.omp.lb.i, align 4, !noalias !9 + br label %omp.inner.for.cond.i + +omp.inner.for.cond.i: ; preds = %omp.inner.for.body.i, %entry + %.omp.iv.0.i = phi i32 [ %16, %entry ], [ %add12.i, %omp.inner.for.body.i ] + %conv2.i = sext i32 %.omp.iv.0.i to i64 + %cmp.i = icmp ule i64 %conv2.i, %5 + br i1 %cmp.i, label %omp.inner.for.body.i, label %__omp_outlined__1.exit + +omp.inner.for.body.i: ; preds = %omp.inner.for.cond.i + %idxprom.i = sext i32 %.omp.iv.0.i to i64 + %arrayidx.i = getelementptr inbounds i32, i32* %9, i64 %idxprom.i + %17 = load i32, i32* %arrayidx.i, align 4, !noalias !9 + %conv4.i = sitofp i32 %17 to float + %idxprom5.i = sext i32 %.omp.iv.0.i to i64 + %arrayidx6.i = getelementptr inbounds float, float* %11, i64 %idxprom5.i + %18 = load float, float* %arrayidx6.i, align 4, !noalias !9 + %mul7.i = fmul float %conv4.i, %18 + %conv8.i = fpext float %mul7.i to double + %idxprom9.i = sext i32 %.omp.iv.0.i to i64 + %arrayidx10.i = getelementptr inbounds double, double* %7, i64 %idxprom9.i + %19 = load double, double* %arrayidx10.i, align 8, !noalias !9 + %add11.i = fadd double %19, %conv8.i + store double %add11.i, double* %arrayidx10.i, align 8, !noalias !9 + %20 = load i32, i32* %.omp.stride.i, align 4, !noalias !9 + %add12.i = add nsw i32 %.omp.iv.0.i, %20 + br label %omp.inner.for.cond.i + +__omp_outlined__1.exit: ; preds = %omp.inner.for.cond.i + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %0) #2, !noalias !9 + %21 = bitcast i32* %.omp.lb.i to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %21) + %22 = bitcast i32* %.omp.ub.i to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %22) + %23 = bitcast i32* %.omp.stride.i to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %23) + %24 = bitcast i32* %.omp.is_last.i to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %24) + ret void +} + +declare i32 @__kmpc_global_thread_num(%struct.ident_t*) + +declare void @__kmpc_generic_kernel_parallel(i8*, i8*, i16, i16) + +; Function Attrs: norecurse nounwind +define internal void @__omp_outlined__2_wrapper(i8* %payload) #0 { +entry: + %.omp.lb.i = alloca i32, align 4 + %.omp.ub.i = alloca i32, align 4 + %.omp.stride.i = alloca i32, align 4 + %.omp.is_last.i = alloca i32, align 4 + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + %1 = bitcast i8* %payload to %omp.shared.struct.0* + %2 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %1, i32 0, i32 0 + %3 = load i64, i64* %2, align 1 + %4 = getelementptr inbounds %omp.shared.struct.0, 
%omp.shared.struct.0* %1, i32 0, i32 1 + %5 = load i64, i64* %4, align 1 + %6 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %1, i32 0, i32 2 + %7 = load double*, double** %6, align 1 + %8 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %1, i32 0, i32 3 + %9 = load i32*, i32** %8, align 1 + %10 = getelementptr inbounds %omp.shared.struct.0, %omp.shared.struct.0* %1, i32 0, i32 4 + %11 = load float*, float** %10, align 1 + %12 = bitcast i32* %.omp.lb.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %12) + %13 = bitcast i32* %.omp.ub.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %13) + %14 = bitcast i32* %.omp.stride.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %14) + %15 = bitcast i32* %.omp.is_last.i to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %15) + store i32 0, i32* %.omp.lb.i, align 4, !noalias !12 + store i32 1023, i32* %.omp.ub.i, align 4, !noalias !12 + %conv.i = trunc i64 %3 to i32 + %conv1.i = trunc i64 %5 to i32 + store i32 %conv.i, i32* %.omp.lb.i, align 4, !noalias !12 + store i32 %conv1.i, i32* %.omp.ub.i, align 4, !noalias !12 + store i32 1, i32* %.omp.stride.i, align 4, !noalias !12 + store i32 0, i32* %.omp.is_last.i, align 4, !noalias !12 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %0, i32 33, i32* %.omp.is_last.i, i32* %.omp.lb.i, i32* %.omp.ub.i, i32* %.omp.stride.i, i32 1, i32 1) #2, !noalias !12 + %16 = load i32, i32* %.omp.lb.i, align 4, !noalias !12 + br label %omp.inner.for.cond.i + +omp.inner.for.cond.i: ; preds = %omp.inner.for.body.i, %entry + %.omp.iv.0.i = phi i32 [ %16, %entry ], [ %add12.i, %omp.inner.for.body.i ] + %conv2.i = sext i32 %.omp.iv.0.i to i64 + %cmp.i = icmp ule i64 %conv2.i, %5 + br i1 %cmp.i, label %omp.inner.for.body.i, label %__omp_outlined__2.exit + +omp.inner.for.body.i: ; preds = %omp.inner.for.cond.i + %idxprom.i = sext i32 %.omp.iv.0.i to i64 + %arrayidx.i = getelementptr inbounds i32, i32* %9, i64 %idxprom.i + %17 = load i32, i32* %arrayidx.i, align 4, !noalias !12 + %conv4.i = sitofp i32 %17 to float + %idxprom5.i = sext i32 %.omp.iv.0.i to i64 + %arrayidx6.i = getelementptr inbounds float, float* %11, i64 %idxprom5.i + %18 = load float, float* %arrayidx6.i, align 4, !noalias !12 + %mul7.i = fmul float %conv4.i, %18 + %conv8.i = fpext float %mul7.i to double + %idxprom9.i = sext i32 %.omp.iv.0.i to i64 + %arrayidx10.i = getelementptr inbounds double, double* %7, i64 %idxprom9.i + %19 = load double, double* %arrayidx10.i, align 8, !noalias !12 + %add11.i = fadd double %19, %conv8.i + store double %add11.i, double* %arrayidx10.i, align 8, !noalias !12 + %20 = load i32, i32* %.omp.stride.i, align 4, !noalias !12 + %add12.i = add nsw i32 %.omp.iv.0.i, %20 + br label %omp.inner.for.cond.i + +__omp_outlined__2.exit: ; preds = %omp.inner.for.cond.i + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %0) #2, !noalias !12 + %21 = bitcast i32* %.omp.lb.i to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %21) + %22 = bitcast i32* %.omp.ub.i to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %22) + %23 = bitcast i32* %.omp.stride.i to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %23) + %24 = bitcast i32* %.omp.is_last.i to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %24) + ret void +} + +declare void @__kmpc_generic_kernel_deinit(i16, i16) + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1 + +; Function Attrs: argmemonly nounwind +declare void 
@llvm.lifetime.end.p0i8(i64, i8* nocapture) #1 + +declare void @__kmpc_barrier_simple_spmd(%struct.ident_t*, i32) + +declare i1 @__kmpc_kernel_parallel(i8**, i16) + +declare i8* @__kmpc_get_shared_variables() + +declare void @__kmpc_kernel_end_parallel() + +attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind } +attributes #2 = { nounwind } + +!omp_offload.info = !{!0} +!nvvm.annotations = !{!1} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = !{i32 0, i32 24, i32 41957707, !"foo", i32 3, i32 0} +!1 = !{void (i32*, float*, double*)* @__omp_offloading_18_280394b_foo_l3, !"kernel", i32 1} +!2 = !{i32 1, !"wchar_size", i32 4} +!3 = !{i32 7, !"PIC Level", i32 2} +!4 = !{!"clang version 9.0.0 "} +!5 = !{!6, !8} +!6 = distinct !{!6, !7, !"__omp_outlined__: %.global_tid."} +!7 = distinct !{!7, !"__omp_outlined__"} +!8 = distinct !{!8, !7, !"__omp_outlined__: %.bound_tid."} +!9 = !{!10} +!10 = distinct !{!10, !11, !"__omp_outlined__1: %.global_tid."} +!11 = distinct !{!11, !"__omp_outlined__1"} +!12 = !{!13} +!13 = distinct !{!13, !14, !"__omp_outlined__2: %.global_tid."} +!14 = distinct !{!14, !"__omp_outlined__2"} Index: llvm/test/Transforms/OpenMP/target_offload_old.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/OpenMP/target_offload_old.ll @@ -0,0 +1,600 @@ + +; __CLANG_OFFLOAD_BUNDLE____START__ openmp-nvptx64-nvida-cud +; ModuleID = '../llvm/test/Transforms/OpenMP/target_offload_to_SPMD.c' +source_filename = "../llvm/test/Transforms/OpenMP/target_offload_to_SPMD.c" +target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64" +target triple = "nvptx64-nvida-cud" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } + +@"_openmp_kernel_static_glob_rd$ptr" = internal addrspace(3) global i8* null +@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2050, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 514, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@2 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@__omp_offloading_18_280394b_foo_l3_exec_mode = weak constant i8 1 +@llvm.compiler.used = appending global [1 x i8*] [i8* @__omp_offloading_18_280394b_foo_l3_exec_mode], section "llvm.metadata" + +; Function Attrs: norecurse nounwind +define internal void @__omp_offloading_18_280394b_foo_l3_worker() #0 { +entry: + %work_fn = alloca i8*, align 8 + %exec_status = alloca i8, align 1 + store i8* null, i8** %work_fn, align 8 + store i8 0, i8* %exec_status, align 1 + br label %.await.work + +.await.work: ; preds = %.barrier.parallel, %entry + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) + %0 = call i1 @__kmpc_kernel_parallel(i8** %work_fn, i16 
1) + %1 = zext i1 %0 to i8 + store i8 %1, i8* %exec_status, align 1 + %2 = load i8*, i8** %work_fn, align 8 + %should_terminate = icmp eq i8* %2, null + br i1 %should_terminate, label %.exit, label %.select.workers + +.select.workers: ; preds = %.await.work + %3 = load i8, i8* %exec_status, align 1 + %is_active = icmp ne i8 %3, 0 + br i1 %is_active, label %.execute.parallel, label %.barrier.parallel + +.execute.parallel: ; preds = %.select.workers + %4 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + %5 = load i8*, i8** %work_fn, align 8 + %work_match = icmp eq i8* %5, bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*) + br i1 %work_match, label %.execute.fn, label %.check.next + +.execute.fn: ; preds = %.execute.parallel + call void @__omp_outlined__1_wrapper(i16 0, i32 %4) #4 + br label %.terminate.parallel + +.check.next: ; preds = %.execute.parallel + %6 = load i8*, i8** %work_fn, align 8 + %work_match1 = icmp eq i8* %6, bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*) + br i1 %work_match1, label %.execute.fn2, label %.check.next3 + +.execute.fn2: ; preds = %.check.next + call void @__omp_outlined__2_wrapper(i16 0, i32 %4) #4 + br label %.terminate.parallel + +.check.next3: ; preds = %.check.next + %7 = bitcast i8* %2 to void (i16, i32)* + call void %7(i16 0, i32 %4) + br label %.terminate.parallel + +.terminate.parallel: ; preds = %.check.next3, %.execute.fn2, %.execute.fn + call void @__kmpc_kernel_end_parallel() + br label %.barrier.parallel + +.barrier.parallel: ; preds = %.terminate.parallel, %.select.workers + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) + br label %.await.work + +.exit: ; preds = %.await.work + ret void +} + +; Function Attrs: norecurse nounwind +define weak void @__omp_offloading_18_280394b_foo_l3(i32* %a, float* %b, double* %c) #1 { +entry: + %.global_tid..addr.i = alloca i32*, align 8 + %.bound_tid..addr.i = alloca i32*, align 8 + %a.addr.i = alloca i32*, align 8 + %b.addr.i = alloca float*, align 8 + %c.addr.i = alloca double*, align 8 + %.omp.iv.i = alloca i32, align 4 + %tmp.i = alloca i32, align 4 + %.omp.comb.lb.i = alloca i32, align 4 + %.omp.comb.ub.i = alloca i32, align 4 + %.omp.stride.i = alloca i32, align 4 + %.omp.is_last.i = alloca i32, align 4 + %i.i = alloca i32, align 4 + %.zero.addr.i = alloca i32, align 4 + %shared_arg_refs.i = alloca i8**, align 8 + %.omp.iv2.i = alloca i32, align 4 + %tmp3.i = alloca i32, align 4 + %.omp.comb.lb4.i = alloca i32, align 4 + %.omp.comb.ub5.i = alloca i32, align 4 + %.omp.stride6.i = alloca i32, align 4 + %.omp.is_last7.i = alloca i32, align 4 + %i8.i = alloca i32, align 4 + %.zero.addr17.i = alloca i32, align 4 + %shared_arg_refs18.i = alloca i8**, align 8 + %a.addr = alloca i32*, align 8 + %b.addr = alloca float*, align 8 + %c.addr = alloca double*, align 8 + %.zero.addr = alloca i32, align 4 + %.threadid_temp. 
= alloca i32, align 4 + store i32 0, i32* %.zero.addr, align 4 + store i32* %a, i32** %a.addr, align 8 + store float* %b, float** %b.addr, align 8 + store double* %c, double** %c.addr, align 8 + %nvptx_warp_size = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + %nvptx_num_threads = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + %thread_limit = sub nuw i32 %nvptx_num_threads, %nvptx_warp_size + %nvptx_tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %0 = icmp ult i32 %nvptx_tid, %thread_limit + br i1 %0, label %.worker, label %.mastercheck + +.worker: ; preds = %entry + call void @__omp_offloading_18_280394b_foo_l3_worker() #4 + br label %.exit + +.mastercheck: ; preds = %entry + %nvptx_num_threads1 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + %nvptx_warp_size2 = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + %1 = sub nuw i32 %nvptx_warp_size2, 1 + %2 = xor i32 %1, -1 + %3 = sub nuw i32 %nvptx_num_threads1, 1 + %master_tid = and i32 %3, %2 + %nvptx_tid3 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %4 = icmp eq i32 %nvptx_tid3, %master_tid + br i1 %4, label %.master, label %.exit + +.master: ; preds = %.mastercheck + %nvptx_warp_size4 = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + %nvptx_num_threads5 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + %thread_limit6 = sub nuw i32 %nvptx_num_threads5, %nvptx_warp_size4 + call void @__kmpc_kernel_init(i32 %thread_limit6, i16 1) + call void @__kmpc_data_sharing_init_stack() + %5 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) + %6 = load i32*, i32** %a.addr, align 8 + %7 = load float*, float** %b.addr, align 8 + %8 = load double*, double** %c.addr, align 8 + store i32 %5, i32* %.threadid_temp., align 4 + store i32 0, i32* %.zero.addr17.i, align 4, !noalias !5 + store i32 0, i32* %.zero.addr.i, align 4, !noalias !5 + store i32* %.threadid_temp., i32** %.global_tid..addr.i, align 8, !noalias !5 + store i32* %.zero.addr, i32** %.bound_tid..addr.i, align 8, !noalias !5 + store i32* %6, i32** %a.addr.i, align 8, !noalias !5 + store float* %7, float** %b.addr.i, align 8, !noalias !5 + store double* %8, double** %c.addr.i, align 8, !noalias !5 + store i32 0, i32* %.omp.comb.lb.i, align 4, !noalias !5 + store i32 1023, i32* %.omp.comb.ub.i, align 4, !noalias !5 + store i32 1, i32* %.omp.stride.i, align 4, !noalias !5 + store i32 0, i32* %.omp.is_last.i, align 4, !noalias !5 + %9 = load i32*, i32** %.global_tid..addr.i, align 8, !noalias !5 + %10 = load i32, i32* %9, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %10, i32 92, i32* %.omp.is_last.i, i32* %.omp.comb.lb.i, i32* %.omp.comb.ub.i, i32* %.omp.stride.i, i32 1, i32 1) #4 + %11 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !5 + %cmp.i = icmp sgt i32 %11, 1023 + br i1 %cmp.i, label %cond.true.i, label %cond.false.i + +cond.true.i: ; preds = %.master + br label %cond.end.i + +cond.false.i: ; preds = %.master + %12 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !5 + br label %cond.end.i + +cond.end.i: ; preds = %cond.false.i, %cond.true.i + %cond.i = phi i32 [ 1023, %cond.true.i ], [ %12, %cond.false.i ] + store i32 %cond.i, i32* %.omp.comb.ub.i, align 4, !noalias !5 + %13 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !5 + store i32 %13, i32* %.omp.iv.i, align 4, !noalias !5 + br label %omp.inner.for.cond.i + +omp.inner.for.cond.i: ; preds = %omp.inner.for.body.i, %cond.end.i + %14 = load i32, i32* %.omp.iv.i, align 4, !noalias !5 + %15 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !5 + %cmp1.i = icmp sle i32 %14, %15 + br i1 %cmp1.i, label 
%omp.inner.for.body.i, label %omp.inner.for.end.i + +omp.inner.for.body.i: ; preds = %omp.inner.for.cond.i + %16 = load i32, i32* %.omp.comb.lb.i, align 4, !noalias !5 + %17 = zext i32 %16 to i64 + %18 = load i32, i32* %.omp.comb.ub.i, align 4, !noalias !5 + %19 = zext i32 %18 to i64 + %20 = load double*, double** %c.addr.i, align 8, !noalias !5 + %21 = load i32*, i32** %a.addr.i, align 8, !noalias !5 + %22 = load float*, float** %b.addr.i, align 8, !noalias !5 + call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i16 1) #4 + call void @__kmpc_begin_sharing_variables(i8*** %shared_arg_refs.i, i64 5) #4 + %23 = load i8**, i8*** %shared_arg_refs.i, align 8, !noalias !5 + %24 = inttoptr i64 %17 to i8* + store i8* %24, i8** %23, align 8 + %25 = getelementptr inbounds i8*, i8** %23, i64 1 + %26 = inttoptr i64 %19 to i8* + store i8* %26, i8** %25, align 8 + %27 = getelementptr inbounds i8*, i8** %23, i64 2 + %28 = bitcast double* %20 to i8* + store i8* %28, i8** %27, align 8 + %29 = getelementptr inbounds i8*, i8** %23, i64 3 + %30 = bitcast i32* %21 to i8* + store i8* %30, i8** %29, align 8 + %31 = getelementptr inbounds i8*, i8** %23, i64 4 + %32 = bitcast float* %22 to i8* + store i8* %32, i8** %31, align 8 + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) #4 + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) #4 + call void @__kmpc_end_sharing_variables() #4 + %33 = load i32, i32* %.omp.iv.i, align 4, !noalias !5 + %34 = load i32, i32* %.omp.stride.i, align 4, !noalias !5 + %add.i = add nsw i32 %33, %34 + store i32 %add.i, i32* %.omp.iv.i, align 4, !noalias !5 + br label %omp.inner.for.cond.i + +omp.inner.for.end.i: ; preds = %omp.inner.for.cond.i + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %10) #4 + store i32 0, i32* %.omp.comb.lb4.i, align 4, !noalias !5 + store i32 1023, i32* %.omp.comb.ub5.i, align 4, !noalias !5 + store i32 1, i32* %.omp.stride6.i, align 4, !noalias !5 + store i32 0, i32* %.omp.is_last7.i, align 4, !noalias !5 + call void @__kmpc_for_static_init_4(%struct.ident_t* @0, i32 %10, i32 92, i32* %.omp.is_last7.i, i32* %.omp.comb.lb4.i, i32* %.omp.comb.ub5.i, i32* %.omp.stride6.i, i32 1, i32 1) #4 + %35 = load i32, i32* %.omp.comb.ub5.i, align 4, !noalias !5 + %cmp9.i = icmp sgt i32 %35, 1023 + br i1 %cmp9.i, label %cond.true10.i, label %cond.false11.i + +cond.true10.i: ; preds = %omp.inner.for.end.i + br label %cond.end12.i + +cond.false11.i: ; preds = %omp.inner.for.end.i + %36 = load i32, i32* %.omp.comb.ub5.i, align 4, !noalias !5 + br label %cond.end12.i + +cond.end12.i: ; preds = %cond.false11.i, %cond.true10.i + %cond13.i = phi i32 [ 1023, %cond.true10.i ], [ %36, %cond.false11.i ] + store i32 %cond13.i, i32* %.omp.comb.ub5.i, align 4, !noalias !5 + %37 = load i32, i32* %.omp.comb.lb4.i, align 4, !noalias !5 + store i32 %37, i32* %.omp.iv2.i, align 4, !noalias !5 + br label %omp.inner.for.cond14.i + +omp.inner.for.cond14.i: ; preds = %omp.inner.for.body16.i, %cond.end12.i + %38 = load i32, i32* %.omp.iv2.i, align 4, !noalias !5 + %39 = load i32, i32* %.omp.comb.ub5.i, align 4, !noalias !5 + %cmp15.i = icmp sle i32 %38, %39 + br i1 %cmp15.i, label %omp.inner.for.body16.i, label %__omp_outlined__.exit + +omp.inner.for.body16.i: ; preds = %omp.inner.for.cond14.i + %40 = load i32, i32* %.omp.comb.lb4.i, align 4, !noalias !5 + %41 = zext i32 %40 to i64 + %42 = load i32, i32* %.omp.comb.ub5.i, align 4, !noalias !5 + %43 = zext i32 %42 to i64 + %44 = load double*, 
double** %c.addr.i, align 8, !noalias !5 + %45 = load i32*, i32** %a.addr.i, align 8, !noalias !5 + %46 = load float*, float** %b.addr.i, align 8, !noalias !5 + call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i16 1) #4 + call void @__kmpc_begin_sharing_variables(i8*** %shared_arg_refs18.i, i64 5) #4 + %47 = load i8**, i8*** %shared_arg_refs18.i, align 8, !noalias !5 + %48 = inttoptr i64 %41 to i8* + store i8* %48, i8** %47, align 8 + %49 = getelementptr inbounds i8*, i8** %47, i64 1 + %50 = inttoptr i64 %43 to i8* + store i8* %50, i8** %49, align 8 + %51 = getelementptr inbounds i8*, i8** %47, i64 2 + %52 = bitcast double* %44 to i8* + store i8* %52, i8** %51, align 8 + %53 = getelementptr inbounds i8*, i8** %47, i64 3 + %54 = bitcast i32* %45 to i8* + store i8* %54, i8** %53, align 8 + %55 = getelementptr inbounds i8*, i8** %47, i64 4 + %56 = bitcast float* %46 to i8* + store i8* %56, i8** %55, align 8 + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) #4 + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) #4 + call void @__kmpc_end_sharing_variables() #4 + %57 = load i32, i32* %.omp.iv2.i, align 4, !noalias !5 + %58 = load i32, i32* %.omp.stride6.i, align 4, !noalias !5 + %add20.i = add nsw i32 %57, %58 + store i32 %add20.i, i32* %.omp.iv2.i, align 4, !noalias !5 + br label %omp.inner.for.cond14.i + +__omp_outlined__.exit: ; preds = %omp.inner.for.cond14.i + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %10) #4 + br label %.termination.notifier + +.termination.notifier: ; preds = %__omp_outlined__.exit + call void @__kmpc_kernel_deinit(i16 1) + call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0) + br label %.exit + +.exit: ; preds = %.termination.notifier, %.mastercheck, %.worker + ret void +} + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.warpsize() #2 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #2 + +; Function Attrs: nounwind readnone +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #2 + +declare void @__kmpc_kernel_init(i32, i16) + +declare void @__kmpc_data_sharing_init_stack() + +declare void @__kmpc_for_static_init_4(%struct.ident_t*, i32, i32, i32*, i32*, i32*, i32*, i32, i32) + +; Function Attrs: norecurse nounwind +define internal void @__omp_outlined__1(i32* noalias %.global_tid., i32* noalias %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., double* %c, i32* %a, float* %b) #1 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %.previous.lb..addr = alloca i64, align 8 + %.previous.ub..addr = alloca i64, align 8 + %c.addr = alloca double*, align 8 + %a.addr = alloca i32*, align 8 + %b.addr = alloca float*, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i64 %.previous.lb., i64* %.previous.lb..addr, align 8 + store i64 %.previous.ub., i64* %.previous.ub..addr, align 8 + store double* %c, double** %c.addr, align 8 + store i32* %a, i32** %a.addr, align 8 + store float* %b, float** %b.addr, align 8 + store i32 0, i32* %.omp.lb, align 4 + store i32 1023, i32* %.omp.ub, align 4 + %0 = load i64, i64* %.previous.lb..addr, align 8 
+ %conv = trunc i64 %0 to i32 + %1 = load i64, i64* %.previous.ub..addr, align 8 + %conv1 = trunc i64 %1 to i32 + store i32 %conv, i32* %.omp.lb, align 4 + store i32 %conv1, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %2 = load i32*, i32** %.global_tid..addr, align 8 + %3 = load i32, i32* %2, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %3, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %4 = load i32, i32* %.omp.lb, align 4 + store i32 %4, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %entry + %5 = load i32, i32* %.omp.iv, align 4 + %conv2 = sext i32 %5 to i64 + %6 = load i64, i64* %.previous.ub..addr, align 8 + %cmp = icmp ule i64 %conv2, %6 + br i1 %cmp, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %7 = load i32, i32* %.omp.iv, align 4 + %mul = mul nsw i32 %7, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %i, align 4 + %8 = load i32*, i32** %a.addr, align 8 + %9 = load i32, i32* %i, align 4 + %idxprom = sext i32 %9 to i64 + %arrayidx = getelementptr inbounds i32, i32* %8, i64 %idxprom + %10 = load i32, i32* %arrayidx, align 4 + %conv4 = sitofp i32 %10 to float + %11 = load float*, float** %b.addr, align 8 + %12 = load i32, i32* %i, align 4 + %idxprom5 = sext i32 %12 to i64 + %arrayidx6 = getelementptr inbounds float, float* %11, i64 %idxprom5 + %13 = load float, float* %arrayidx6, align 4 + %mul7 = fmul float %conv4, %13 + %conv8 = fpext float %mul7 to double + %14 = load double*, double** %c.addr, align 8 + %15 = load i32, i32* %i, align 4 + %idxprom9 = sext i32 %15 to i64 + %arrayidx10 = getelementptr inbounds double, double* %14, i64 %idxprom9 + %16 = load double, double* %arrayidx10, align 8 + %add11 = fadd double %16, %conv8 + store double %add11, double* %arrayidx10, align 8 + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %17 = load i32, i32* %.omp.iv, align 4 + %18 = load i32, i32* %.omp.stride, align 4 + %add12 = add nsw i32 %17, %18 + store i32 %add12, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %3) + ret void +} + +declare void @__kmpc_for_static_fini(%struct.ident_t*, i32) + +; Function Attrs: norecurse nounwind +define internal void @__omp_outlined__1_wrapper(i16 zeroext, i32) #0 { +entry: + %.addr = alloca i16, align 2 + %.addr1 = alloca i32, align 4 + %.zero.addr = alloca i32, align 4 + %global_args = alloca i8**, align 8 + store i32 0, i32* %.zero.addr, align 4 + store i16 %0, i16* %.addr, align 2 + store i32 %1, i32* %.addr1, align 4 + call void @__kmpc_get_shared_variables(i8*** %global_args) + %2 = load i8**, i8*** %global_args, align 8 + %3 = getelementptr inbounds i8*, i8** %2, i64 0 + %4 = bitcast i8** %3 to i64* + %5 = load i64, i64* %4, align 8 + %6 = getelementptr inbounds i8*, i8** %2, i64 1 + %7 = bitcast i8** %6 to i64* + %8 = load i64, i64* %7, align 8 + %9 = getelementptr inbounds i8*, i8** %2, i64 2 + %10 = bitcast i8** %9 to double** + %11 = load double*, double** %10, align 8 + %12 = getelementptr inbounds i8*, i8** %2, i64 3 + %13 = bitcast i8** %12 to i32** + %14 = load i32*, i32** 
%13, align 8 + %15 = getelementptr inbounds i8*, i8** %2, i64 4 + %16 = bitcast i8** %15 to float** + %17 = load float*, float** %16, align 8 + call void @__omp_outlined__1(i32* %.addr1, i32* %.zero.addr, i64 %5, i64 %8, double* %11, i32* %14, float* %17) #4 + ret void +} + +declare void @__kmpc_get_shared_variables(i8***) + +declare void @__kmpc_kernel_prepare_parallel(i8*, i16) + +declare void @__kmpc_begin_sharing_variables(i8***, i64) + +; Function Attrs: convergent +declare void @__kmpc_barrier_simple_spmd(%struct.ident_t*, i32) #3 + +declare void @__kmpc_end_sharing_variables() + +; Function Attrs: norecurse nounwind +define internal void @__omp_outlined__2(i32* noalias %.global_tid., i32* noalias %.bound_tid., i64 %.previous.lb., i64 %.previous.ub., double* %c, i32* %a, float* %b) #1 { +entry: + %.global_tid..addr = alloca i32*, align 8 + %.bound_tid..addr = alloca i32*, align 8 + %.previous.lb..addr = alloca i64, align 8 + %.previous.ub..addr = alloca i64, align 8 + %c.addr = alloca double*, align 8 + %a.addr = alloca i32*, align 8 + %b.addr = alloca float*, align 8 + %.omp.iv = alloca i32, align 4 + %tmp = alloca i32, align 4 + %.omp.lb = alloca i32, align 4 + %.omp.ub = alloca i32, align 4 + %.omp.stride = alloca i32, align 4 + %.omp.is_last = alloca i32, align 4 + %i = alloca i32, align 4 + store i32* %.global_tid., i32** %.global_tid..addr, align 8 + store i32* %.bound_tid., i32** %.bound_tid..addr, align 8 + store i64 %.previous.lb., i64* %.previous.lb..addr, align 8 + store i64 %.previous.ub., i64* %.previous.ub..addr, align 8 + store double* %c, double** %c.addr, align 8 + store i32* %a, i32** %a.addr, align 8 + store float* %b, float** %b.addr, align 8 + store i32 0, i32* %.omp.lb, align 4 + store i32 1023, i32* %.omp.ub, align 4 + %0 = load i64, i64* %.previous.lb..addr, align 8 + %conv = trunc i64 %0 to i32 + %1 = load i64, i64* %.previous.ub..addr, align 8 + %conv1 = trunc i64 %1 to i32 + store i32 %conv, i32* %.omp.lb, align 4 + store i32 %conv1, i32* %.omp.ub, align 4 + store i32 1, i32* %.omp.stride, align 4 + store i32 0, i32* %.omp.is_last, align 4 + %2 = load i32*, i32** %.global_tid..addr, align 8 + %3 = load i32, i32* %2, align 4 + call void @__kmpc_for_static_init_4(%struct.ident_t* @1, i32 %3, i32 33, i32* %.omp.is_last, i32* %.omp.lb, i32* %.omp.ub, i32* %.omp.stride, i32 1, i32 1) + %4 = load i32, i32* %.omp.lb, align 4 + store i32 %4, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.cond: ; preds = %omp.inner.for.inc, %entry + %5 = load i32, i32* %.omp.iv, align 4 + %conv2 = sext i32 %5 to i64 + %6 = load i64, i64* %.previous.ub..addr, align 8 + %cmp = icmp ule i64 %conv2, %6 + br i1 %cmp, label %omp.inner.for.body, label %omp.inner.for.end + +omp.inner.for.body: ; preds = %omp.inner.for.cond + %7 = load i32, i32* %.omp.iv, align 4 + %mul = mul nsw i32 %7, 1 + %add = add nsw i32 0, %mul + store i32 %add, i32* %i, align 4 + %8 = load i32*, i32** %a.addr, align 8 + %9 = load i32, i32* %i, align 4 + %idxprom = sext i32 %9 to i64 + %arrayidx = getelementptr inbounds i32, i32* %8, i64 %idxprom + %10 = load i32, i32* %arrayidx, align 4 + %conv4 = sitofp i32 %10 to float + %11 = load float*, float** %b.addr, align 8 + %12 = load i32, i32* %i, align 4 + %idxprom5 = sext i32 %12 to i64 + %arrayidx6 = getelementptr inbounds float, float* %11, i64 %idxprom5 + %13 = load float, float* %arrayidx6, align 4 + %mul7 = fmul float %conv4, %13 + %conv8 = fpext float %mul7 to double + %14 = load double*, double** %c.addr, align 8 + %15 = load i32, i32* %i, 
align 4 + %idxprom9 = sext i32 %15 to i64 + %arrayidx10 = getelementptr inbounds double, double* %14, i64 %idxprom9 + %16 = load double, double* %arrayidx10, align 8 + %add11 = fadd double %16, %conv8 + store double %add11, double* %arrayidx10, align 8 + br label %omp.body.continue + +omp.body.continue: ; preds = %omp.inner.for.body + br label %omp.inner.for.inc + +omp.inner.for.inc: ; preds = %omp.body.continue + %17 = load i32, i32* %.omp.iv, align 4 + %18 = load i32, i32* %.omp.stride, align 4 + %add12 = add nsw i32 %17, %18 + store i32 %add12, i32* %.omp.iv, align 4 + br label %omp.inner.for.cond + +omp.inner.for.end: ; preds = %omp.inner.for.cond + br label %omp.loop.exit + +omp.loop.exit: ; preds = %omp.inner.for.end + call void @__kmpc_for_static_fini(%struct.ident_t* @0, i32 %3) + ret void +} + +; Function Attrs: norecurse nounwind +define internal void @__omp_outlined__2_wrapper(i16 zeroext, i32) #0 { +entry: + %.addr = alloca i16, align 2 + %.addr1 = alloca i32, align 4 + %.zero.addr = alloca i32, align 4 + %global_args = alloca i8**, align 8 + store i32 0, i32* %.zero.addr, align 4 + store i16 %0, i16* %.addr, align 2 + store i32 %1, i32* %.addr1, align 4 + call void @__kmpc_get_shared_variables(i8*** %global_args) + %2 = load i8**, i8*** %global_args, align 8 + %3 = getelementptr inbounds i8*, i8** %2, i64 0 + %4 = bitcast i8** %3 to i64* + %5 = load i64, i64* %4, align 8 + %6 = getelementptr inbounds i8*, i8** %2, i64 1 + %7 = bitcast i8** %6 to i64* + %8 = load i64, i64* %7, align 8 + %9 = getelementptr inbounds i8*, i8** %2, i64 2 + %10 = bitcast i8** %9 to double** + %11 = load double*, double** %10, align 8 + %12 = getelementptr inbounds i8*, i8** %2, i64 3 + %13 = bitcast i8** %12 to i32** + %14 = load i32*, i32** %13, align 8 + %15 = getelementptr inbounds i8*, i8** %2, i64 4 + %16 = bitcast i8** %15 to float** + %17 = load float*, float** %16, align 8 + call void @__omp_outlined__2(i32* %.addr1, i32* %.zero.addr, i64 %5, i64 %8, double* %11, i32* %14, float* %17) #4 + ret void +} + +declare i32 @__kmpc_global_thread_num(%struct.ident_t*) + +declare void @__kmpc_kernel_deinit(i16) + +declare i1 @__kmpc_kernel_parallel(i8**, i16) + +declare void @__kmpc_kernel_end_parallel() + +attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind readnone } +attributes #3 = { convergent } +attributes #4 = { nounwind } + +!omp_offload.info = !{!0} +!nvvm.annotations = !{!1} +!llvm.module.flags = !{!2, !3} +!llvm.ident = !{!4} + +!0 = !{i32 0, i32 24, i32 41957707, !"foo", i32 3, i32 0} +!1 = !{void 
(i32*, float*, double*)* @__omp_offloading_18_280394b_foo_l3, !"kernel", i32 1} +!2 = !{i32 1, !"wchar_size", i32 4} +!3 = !{i32 7, !"PIC Level", i32 2} +!4 = !{!"clang version 9.0.0 (http://llvm.org/git/clang.git c6f1d4e0e14fbd11f4cc61068c429a067faf86ef) (http://llvm.org/git/llvm.git 0f783294e2ea6fe630e7655f303b4bc33bfd6167)"} +!5 = !{!6, !8} +!6 = distinct !{!6, !7, !"__omp_outlined__: %.global_tid."} +!7 = distinct !{!7, !"__omp_outlined__"} +!8 = distinct !{!8, !7, !"__omp_outlined__: %.bound_tid."} + +; __CLANG_OFFLOAD_BUNDLE____END__ openmp-nvptx64-nvida-cuda
Index: openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
===================================================================
--- openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
+++ openmp/libomptarget/deviceRTLs/nvptx/src/data_sharing.cu
@@ -346,7 +346,7 @@
   // statically allocated shared memory slots. The size of a shared memory
   // slot is pre-determined to be 256 bytes.
   data_sharing_init_stack_common();
-  omptarget_nvptx_globalArgs.Init();
+  omptarget_nvptx_globalArgBuffer.Init();
 }
 
 // Initialize data sharing data structure. This function needs to be called
@@ -506,14 +506,11 @@
   }
 }
 
-// Begin a data sharing context. Maintain a list of references to shared
-// variables. This list of references to shared variables will be passed
-// to one or more threads.
-// In L0 data sharing this is called by master thread.
-// In L1 data sharing this is called by active warp master thread.
-EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs) {
-  omptarget_nvptx_globalArgs.EnsureSize(nArgs);
-  *GlobalArgs = omptarget_nvptx_globalArgs.GetArgs();
+/// Ensure the data sharing context has at least \p NumBytes available and
+/// return a pointer to the beginning of the shared memory.
+EXTERN char *__kmpc_begin_sharing_variables(size_t NumBytes) {
+  omptarget_nvptx_globalArgBuffer.EnsureSize(NumBytes);
+  return omptarget_nvptx_globalArgBuffer.begin();
 }
 
 // End a data sharing context. There is no need to have a list of refs
@@ -523,7 +520,7 @@
 // In L0 data sharing this is called by master thread.
 // In L1 data sharing this is called by active warp master thread.
 EXTERN void __kmpc_end_sharing_variables() {
-  omptarget_nvptx_globalArgs.DeInit();
+  omptarget_nvptx_globalArgBuffer.DeInit();
 }
 
 // This function will return a list of references to global variables. This
@@ -531,8 +528,8 @@
 // members of this list will be passed to the outlined parallel function
 // preserving the order.
 // Called by all workers.
-EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs) {
-  *GlobalArgs = omptarget_nvptx_globalArgs.GetArgs();
+EXTERN char *__kmpc_get_shared_variables() {
+  return omptarget_nvptx_globalArgBuffer.begin();
 }
 
 // This function is used to init static memory manager. This manager is used to
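The interface change above replaces the old void** argument list with a raw byte buffer. A minimal sketch of how the two sides pair up under the new scheme (not part of the patch; the Payload struct and the master_share/worker_use helpers are illustrative assumptions):

  #include <stddef.h>
  #include <stdint.h>
  #include <string.h>

  extern "C" char *__kmpc_begin_sharing_variables(size_t NumBytes);
  extern "C" char *__kmpc_get_shared_variables();

  // Hypothetical payload of one parallel region: loop bounds plus a captured
  // pointer, packed as plain bytes instead of a list of void* references.
  struct Payload {
    int64_t LB, UB;
    double *C;
  };

  // Master: reserve enough bytes in the shared buffer and copy the payload in.
  static void master_share(int64_t LB, int64_t UB, double *C) {
    char *Buf = __kmpc_begin_sharing_variables(sizeof(Payload));
    Payload P = {LB, UB, C};
    memcpy(Buf, &P, sizeof(P));
  }

  // Worker: the same bytes come back through a single pointer; no per-argument
  // stores and loads as with the old GetArgs() list.
  static void worker_use() {
    Payload *P = (Payload *)__kmpc_get_shared_variables();
    (void)P; // ... execute the outlined body with P->LB, P->UB, P->C ...
  }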
Index: openmp/libomptarget/deviceRTLs/nvptx/src/interface.h
===================================================================
--- openmp/libomptarget/deviceRTLs/nvptx/src/interface.h
+++ openmp/libomptarget/deviceRTLs/nvptx/src/interface.h
@@ -513,16 +513,15 @@
                                          int32_t *LaneId, int32_t *NumLanes);
 EXTERN void __kmpc_kernel_end_convergent_simd(void *buffer);
-
 EXTERN void __kmpc_data_sharing_init_stack();
 EXTERN void __kmpc_data_sharing_init_stack_spmd();
 EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t size,
                                                       int16_t UseSharedMemory);
 EXTERN void *__kmpc_data_sharing_push_stack(size_t size,
                                             int16_t UseSharedMemory);
 EXTERN void __kmpc_data_sharing_pop_stack(void *a);
-EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs);
+EXTERN char *__kmpc_begin_sharing_variables(size_t NumBytes);
 EXTERN void __kmpc_end_sharing_variables();
-EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs);
+EXTERN char *__kmpc_get_shared_variables();
 
 // The slot used for data sharing by the master and worker threads. We use a
 // complete (default size) version and an incomplete one so that we allow sizes
@@ -560,4 +559,20 @@
 EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode,
                                               int16_t is_shared);
 
+/// Generic kernel initialization which defers to __kmpc_spmd_kernel_init if \p
+/// IsSPMD is true, and to __kmpc_kernel_init otherwise.
+EXTERN int16_t __kmpc_generic_kernel_init(int16_t IsSPMD, int16_t UseSM,
+                                          int16_t RequiresOMPRuntime,
+                                          int16_t RequiresDataSharing);
+/// Generic kernel de-initialization counterpart to __kmpc_generic_kernel_init.
+EXTERN void __kmpc_generic_kernel_deinit(int16_t IsSPMD,
+                                         int16_t RequiredOMPRuntime);
+/// Run \p OutlinedFn in parallel, sharing \p PayloadBytes bytes of \p Payload.
+///
+/// NOTE: Changing this type will require changes in the Clang NVPTX code
+/// generation as well as the LLVM OpenMPOpt pass!
+EXTERN void __kmpc_generic_kernel_parallel(void *OutlinedFn, void *Payload,
+                                           int16_t PayloadBytes,
+                                           int16_t RequiredOMPRuntime);
+
 #endif
Index: openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
===================================================================
--- openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
+++ openmp/libomptarget/deviceRTLs/nvptx/src/omp_data.cu
@@ -62,4 +62,5 @@
 ////////////////////////////////////////////////////////////////////////////////
 // Data sharing related variables.
 ////////////////////////////////////////////////////////////////////////////////
-__device__ __shared__ omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs;
+__device__ __shared__ omptarget_nvptx_SharedBuffer
+    omptarget_nvptx_globalArgBuffer;
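The new generic kernel entry points declared above compose as follows; this is a sketch of the kernel skeleton a compiler could emit on top of them (the shape and the kernel_entry name are assumptions, not code emitted by this patch):

  #include <stdint.h>

  extern "C" int16_t __kmpc_generic_kernel_init(int16_t IsSPMD, int16_t UseSM,
                                                int16_t RequiresOMPRuntime,
                                                int16_t RequiresDataSharing);
  extern "C" void __kmpc_generic_kernel_deinit(int16_t IsSPMD,
                                               int16_t RequiredOMPRuntime);

  __global__ void kernel_entry(/* captured arguments */) {
    // Workers do not get past the init call while work remains: with UseSM
    // they are trapped in the runtime state machine and return -1 only at
    // termination. The master returns 1; surplus threads return 0.
    int16_t Filter = __kmpc_generic_kernel_init(/*IsSPMD=*/0, /*UseSM=*/1,
                                                /*RequiresOMPRuntime=*/1,
                                                /*RequiresDataSharing=*/0);
    if (Filter != 1)
      return;

    // Master-only team code runs here; parallel regions are funneled through
    // __kmpc_generic_kernel_parallel (see omptarget-nvptx.cu below).

    __kmpc_generic_kernel_deinit(/*IsSPMD=*/0, /*RequiredOMPRuntime=*/1);
  }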
Index: openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
===================================================================
--- openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
+++ openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.h
@@ -64,45 +64,45 @@
 #define __SYNCTHREADS() __SYNCTHREADS_N(0)
 
-// arguments needed for L0 parallelism only.
-class omptarget_nvptx_SharedArgs {
+// Shared byte buffer needed for L0 parallelism only.
+class omptarget_nvptx_SharedBuffer {
 public:
   // All these methods must be called by the master thread only.
   INLINE void Init() {
-    args = buffer;
-    nArgs = MAX_SHARED_ARGS;
+    UsedBuffer = &FixedBuffer[0];
+    NumBytes = MAX_SHARED_BYTES;
   }
   INLINE void DeInit() {
-    // Free any memory allocated for outlined parallel function with a large
-    // number of arguments.
-    if (nArgs > MAX_SHARED_ARGS) {
-      SafeFree(args, (char *)"new extended args");
+    // Free any memory allocated for an outlined parallel function with a
+    // large payload.
+    if (NumBytes > MAX_SHARED_BYTES) {
+      SafeFree(UsedBuffer, (char *)"deinit extended shared buffer");
       Init();
     }
   }
-  INLINE void EnsureSize(size_t size) {
-    if (size > nArgs) {
-      if (nArgs > MAX_SHARED_ARGS) {
-        SafeFree(args, (char *)"new extended args");
+  INLINE void EnsureSize(size_t RequestedBytes) {
+    if (RequestedBytes > NumBytes) {
+      if (NumBytes > MAX_SHARED_BYTES) {
+        SafeFree(UsedBuffer, (char *)"new extended shared buffer");
       }
-      args = (void **) SafeMalloc(size * sizeof(void *),
-                                  (char *)"new extended args");
-      nArgs = size;
+      UsedBuffer = (char *)SafeMalloc(RequestedBytes,
+                                      (char *)"new extended shared buffer");
+      NumBytes = RequestedBytes;
     }
   }
   // Called by all threads.
-  INLINE void **GetArgs() const { return args; };
+  INLINE char *begin() const { return UsedBuffer; };
 private:
-  // buffer of pre-allocated arguments.
-  void *buffer[MAX_SHARED_ARGS];
-  // pointer to arguments buffer.
-  // starts off as a pointer to 'buffer' but can be dynamically allocated.
-  void **args;
-  // starts off as MAX_SHARED_ARGS but can increase in size.
-  uint32_t nArgs;
+  // Pre-allocated byte buffer.
+  char FixedBuffer[MAX_SHARED_BYTES];
+  // Pointer to the buffer in use; starts off pointing at 'FixedBuffer' but
+  // can be dynamically allocated for larger payloads.
+  char *UsedBuffer;
+  // Capacity in bytes; starts off as MAX_SHARED_BYTES but can increase.
+  uint32_t NumBytes;
 };
 
-extern __device__ __shared__ omptarget_nvptx_SharedArgs
-    omptarget_nvptx_globalArgs;
+extern __device__ __shared__ omptarget_nvptx_SharedBuffer
+    omptarget_nvptx_globalArgBuffer;
 
 // Data sharing related quantities, need to match what is used in the compiler.
 enum DATA_SHARING_SIZES {
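The buffer above serves payloads of up to MAX_SHARED_BYTES from the static FixedBuffer and falls back to SafeMalloc for anything larger; DeInit frees only if the buffer actually grew. A sketch of one full sharing cycle, assuming the declarations from omptarget-nvptx.h are in scope (the share_one_region helper is hypothetical):

  #include <string.h>

  static void share_one_region(const void *Payload, size_t PayloadBytes) {
    // Master: grow the buffer if needed (a no-op for small payloads) and
    // fill it with the payload bytes.
    omptarget_nvptx_globalArgBuffer.EnsureSize(PayloadBytes);
    memcpy(omptarget_nvptx_globalArgBuffer.begin(), Payload, PayloadBytes);

    // ... workers read the bytes through the same begin() pointer ...

    // Master: release the dynamic allocation, if any, and re-arm FixedBuffer.
    omptarget_nvptx_globalArgBuffer.DeInit();
  }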
Index: openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
===================================================================
--- openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
+++ openmp/libomptarget/deviceRTLs/nvptx/src/omptarget-nvptx.cu
@@ -182,3 +182,128 @@
   PRINT0(LD_IO | LD_PAR, "call to __kmpc_is_spmd_exec_mode\n");
   return isSPMDMode();
 }
+
+typedef void (*WorkFnTy)(void *);
+
+INLINE static void
+__kmpc_generic_kernel_state_machine(int16_t IsOMPRuntimeInitialized) {
+
+  do {
+    void *WorkFn = 0;
+
+    __kmpc_barrier_simple_spmd(NULL, 0);
+
+    bool IsActive = __kmpc_kernel_parallel(&WorkFn, IsOMPRuntimeInitialized);
+
+    // If there is nothing more to do, break out of the state machine by
+    // returning to the caller.
+    if (!WorkFn)
+      return;
+
+    if (IsActive) {
+      char *Args = omptarget_nvptx_globalArgBuffer.begin();
+
+      ((WorkFnTy)WorkFn)((void *)Args);
+
+      __kmpc_kernel_end_parallel();
+    }
+
+    __kmpc_barrier_simple_spmd(NULL, 0);
+
+  } while (true);
+}
+
+/// Filter out worker threads: if \p UseSM is true they enter the state
+/// machine in __kmpc_generic_kernel_state_machine and stay trapped until the
+/// kernel terminates; the master and surplus threads return immediately. The
+/// return value is 1 for the master, -1 for workers, and 0 otherwise.
+INLINE static int16_t
+__kmpc_generic_kernel_thread_filter(unsigned ThreadLimit, int16_t UseSM,
+                                    int16_t IsOMPRuntimeInitialized) {
+
+  unsigned TId = GetThreadIdInBlock();
+  bool IsWorker = TId < ThreadLimit;
+
+  if (IsWorker) {
+    if (UseSM)
+      __kmpc_generic_kernel_state_machine(IsOMPRuntimeInitialized);
+    return -1;
+  }
+
+  return TId == GetMasterThreadID();
+}
+
+EXTERN int16_t __kmpc_generic_kernel_init(int16_t IsSPMD, int16_t UseSM,
+                                          int16_t RequiresOMPRuntime,
+                                          int16_t RequiresDataSharing) {
+  unsigned NumThreads = GetNumberOfThreadsInBlock();
+
+  // Handle the SPMD case first.
+  if (IsSPMD) {
+
+    __kmpc_spmd_kernel_init(NumThreads, RequiresOMPRuntime,
+                            RequiresDataSharing);
+
+    // TODO: This was copied from the clang code but it seems odd that we use
+    // RequiresOMPRuntime and not RequiresDataSharing. The latter seems to be
+    // always false anyway.
+    //
+    // For data sharing, we need to initialize the stack.
+    if (RequiresOMPRuntime)
+      __kmpc_data_sharing_init_stack_spmd();
+
+    return 1;
+  }
+
+  unsigned ThreadLimit = NumThreads - WARPSIZE;
+  int16_t FilterVal = __kmpc_generic_kernel_thread_filter(
+      ThreadLimit, UseSM, RequiresOMPRuntime);
+
+  if (FilterVal == 1) {
+    __kmpc_kernel_init(ThreadLimit, RequiresOMPRuntime);
+    __kmpc_data_sharing_init_stack();
+  }
+
+  return FilterVal;
+}
+
+EXTERN void __kmpc_generic_kernel_deinit(int16_t IsSPMD,
+                                         int16_t RequiredOMPRuntime) {
+  if (IsSPMD) {
+    __kmpc_spmd_kernel_deinit_v2(RequiredOMPRuntime);
+  } else {
+    // TODO port vars epilog to the runtime.
+
+    __kmpc_kernel_deinit(RequiredOMPRuntime);
+
+    // Barrier to terminate worker threads.
+    __kmpc_barrier_simple_spmd(NULL, 0);
+  }
+}
+
+EXTERN void __kmpc_generic_kernel_parallel(void *OutlinedFn, void *Payload,
+                                           int16_t PayloadBytes,
+                                           int16_t RequiredOMPRuntime) {
+  __kmpc_kernel_prepare_parallel(OutlinedFn, RequiredOMPRuntime);
+
+  if (PayloadBytes) {
+    omptarget_nvptx_globalArgBuffer.EnsureSize(PayloadBytes);
+    char *Args = omptarget_nvptx_globalArgBuffer.begin();
+    memcpy(Args, Payload, PayloadBytes);
+  }
+
+  // Activate workers. This barrier is used by the master to signal
+  // work for the workers.
+  __kmpc_barrier_simple_spmd(NULL, 0);
+
+  // OpenMP [2.5, Parallel Construct, p.49]
+  // There is an implied barrier at the end of a parallel region. After the
+  // end of a parallel region, only the master thread of the team resumes
+  // execution of the enclosing task region.
+  //
+  // The master waits at this barrier until all workers are done.
+  __kmpc_barrier_simple_spmd(NULL, 0);
+
+  if (PayloadBytes)
+    __kmpc_end_sharing_variables();
+}
Index: openmp/libomptarget/deviceRTLs/nvptx/src/option.h
===================================================================
--- openmp/libomptarget/deviceRTLs/nvptx/src/option.h
+++ openmp/libomptarget/deviceRTLs/nvptx/src/option.h
@@ -27,9 +27,9 @@
 // region to synchronize with each other.
 #define L1_BARRIER (1)
 
-// Maximum number of preallocated arguments to an outlined parallel/simd function.
-// Anything more requires dynamic memory allocation.
-#define MAX_SHARED_ARGS 20
+// Maximum number of preallocated bytes sharable with an outlined parallel/simd
+// function; anything more requires dynamic memory allocation.
+#define MAX_SHARED_BYTES (20 * sizeof(void *))
 
 // Maximum number of omp state objects per SM allocated statically in global
 // memory.
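Putting the runtime pieces together, a hypothetical master-side launch of one parallel region; the wrapper name and the payload layout are illustrative assumptions and not generated by this patch:

  #include <stdint.h>

  extern "C" void __kmpc_generic_kernel_parallel(void *OutlinedFn,
                                                 void *Payload,
                                                 int16_t PayloadBytes,
                                                 int16_t RequiredOMPRuntime);

  // Assumed worker-side wrapper matching WorkFnTy: it unpacks the payload.
  extern "C" void outlined_wrapper(void *Payload);

  struct ChunkPayload {
    int64_t LB, UB;
    double *C;
    int *A;
    float *B;
  };

  static void master_launch(int64_t LB, int64_t UB, double *C, int *A,
                            float *B) {
    ChunkPayload P = {LB, UB, C, A, B};
    // Announces outlined_wrapper to the workers, copies P into the shared
    // buffer, releases the workers at the first barrier, and waits for them
    // at the second one before ending the sharing context.
    __kmpc_generic_kernel_parallel((void *)&outlined_wrapper, &P,
                                   (int16_t)sizeof(P),
                                   /*RequiredOMPRuntime=*/1);
  }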