diff --git a/clang/test/OpenMP/declare_target_codegen_globalization.cpp b/clang/test/OpenMP/declare_target_codegen_globalization.cpp --- a/clang/test/OpenMP/declare_target_codegen_globalization.cpp +++ b/clang/test/OpenMP/declare_target_codegen_globalization.cpp @@ -31,7 +31,7 @@ // CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 // CHECK1-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 // CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 true, i1 false, i1 true) +// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 2, i1 false, i1 true) // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK1: user_code.entry: @@ -41,7 +41,7 @@ // CHECK1-NEXT: store i8* [[TMP4]], i8** [[TMP3]], align 8 // CHECK1-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** // CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP5]], i64 1) -// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true) +// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) // CHECK1-NEXT: ret void // CHECK1: worker.exit: // CHECK1-NEXT: ret void diff --git a/clang/test/OpenMP/nvptx_SPMD_codegen.cpp b/clang/test/OpenMP/nvptx_SPMD_codegen.cpp --- a/clang/test/OpenMP/nvptx_SPMD_codegen.cpp +++ b/clang/test/OpenMP/nvptx_SPMD_codegen.cpp @@ -21,28 +21,28 @@ // CHECK-NOT: @__omp_offloading_{{.+}}_exec_mode = weak constant i8 1 void foo() { -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 false) // CHECK-DAG: [[DISTR_LIGHT]] // CHECK-DAG: [[FOR_LIGHT]] // CHECK-DAG: [[LIGHT]] -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 false) // CHECK-DAG: [[DISTR_LIGHT]] // CHECK-DAG: [[FOR_LIGHT]] // CHECK-DAG: [[LIGHT]] -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 false) // CHECK-DAG: [[DISTR_LIGHT]] // CHECK-DAG: [[FOR_LIGHT]] // CHECK-DAG: [[LIGHT]] -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) // CHECK-DAG: [[DISTR_FULL]] // CHECK-DAG: [[FULL]] -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) // CHECK-DAG: [[DISTR_FULL]] // CHECK-DAG: [[FULL]] -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) // CHECK-DAG: [[DISTR_FULL]] // CHECK-DAG: [[FULL]] -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) // CHECK-DAG: [[DISTR_FULL]] // CHECK-DAG: [[FULL]] #pragma omp target teams distribute parallel for simd if(a) @@ -67,28 +67,28 @@ for (int i = 0; i < 10; ++i) ; int a; -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 false) // CHECK-DAG: [[DISTR_LIGHT]] // CHECK-DAG: [[FOR_LIGHT]] // CHECK-DAG: [[LIGHT]] -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 false) // CHECK-DAG: [[DISTR_LIGHT]] // CHECK-DAG: [[FOR_LIGHT]] // CHECK-DAG: [[LIGHT]] -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 false) // CHECK-DAG: [[DISTR_LIGHT]] // CHECK-DAG: [[FOR_LIGHT]] // CHECK-DAG: [[LIGHT]] -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) // CHECK-DAG: [[DISTR_FULL]] // CHECK-DAG: [[FULL]] -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) // CHECK-DAG: [[DISTR_FULL]] // CHECK-DAG: [[FULL]] -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) // CHECK-DAG: [[DISTR_FULL]] // CHECK-DAG: [[FULL]] -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) // CHECK-DAG: [[DISTR_FULL]] // CHECK-DAG: [[FULL]] #pragma omp target teams distribute parallel for lastprivate(a) @@ -112,25 +112,25 @@ #pragma omp target teams distribute parallel for schedule(guided) for (int i = 0; i < 10; ++i) ; -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 false) // CHECK-DAG: [[DISTR_LIGHT]] // CHECK-DAG: [[FOR_LIGHT]] // CHECK-DAG: [[LIGHT]] -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 false) // CHECK-DAG: [[DISTR_LIGHT]] // CHECK-DAG: [[FOR_LIGHT]] // CHECK-DAG: [[LIGHT]] // CHECK: call i32 @__kmpc_target_init( -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) // CHECK-DAG: [[DISTR_FULL]] // CHECK-DAG: [[FULL]] -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) // CHECK-DAG: [[DISTR_FULL]] // CHECK-DAG: [[FULL]] -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) // CHECK-DAG: [[DISTR_FULL]] // CHECK-DAG: [[FULL]] -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) // CHECK-DAG: [[DISTR_FULL]] // CHECK-DAG: [[FULL]] #pragma omp target teams @@ -172,28 +172,28 @@ #pragma omp distribute parallel for simd schedule(guided) for (int i = 0; i < 10; ++i) ; -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 false) // CHECK-DAG: [[DISTR_LIGHT]] // CHECK-DAG: [[FOR_LIGHT]] // CHECK-DAG: [[LIGHT]] -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 false) // CHECK-DAG: [[DISTR_LIGHT]] // CHECK-DAG: [[FOR_LIGHT]] // CHECK-DAG: [[LIGHT]] -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 false) // CHECK-DAG: [[DISTR_LIGHT]] // CHECK-DAG: [[FOR_LIGHT]] // CHECK-DAG: [[LIGHT]] -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) // CHECK-DAG: [[DISTR_FULL]] // CHECK-DAG: [[FULL]] -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) // CHECK-DAG: [[DISTR_FULL]] // CHECK-DAG: [[FULL]] -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) // CHECK-DAG: [[DISTR_FULL]] // CHECK-DAG: [[FULL]] -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) // CHECK-DAG: [[DISTR_FULL]] // CHECK-DAG: [[FULL]] #pragma omp target teams @@ -224,28 +224,28 @@ #pragma omp distribute parallel for schedule(guided) for (int i = 0; i < 10; ++i) ; -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 false) // CHECK-DAG: [[DISTR_LIGHT]] // CHECK-DAG: [[FOR_LIGHT]] // CHECK-DAG: [[LIGHT]] -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 false) // CHECK-DAG: [[DISTR_LIGHT]] // CHECK-DAG: [[FOR_LIGHT]] // CHECK-DAG: [[LIGHT]] -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 false) // CHECK-DAG: [[DISTR_LIGHT]] // CHECK-DAG: [[FOR_LIGHT]] // CHECK-DAG: [[LIGHT]] -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) // CHECK-DAG: [[DISTR_FULL]] // CHECK-DAG: [[FULL]] -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) // CHECK-DAG: [[DISTR_FULL]] // CHECK-DAG: [[FULL]] -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) // CHECK-DAG: [[DISTR_FULL]] // CHECK-DAG: [[FULL]] -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) // CHECK-DAG: [[DISTR_FULL]] // CHECK-DAG: [[FULL]] #pragma omp target @@ -283,22 +283,22 @@ #pragma omp distribute parallel for schedule(guided) for (int i = 0; i < 10; ++i) ; -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 false) // CHECK-DAG: [[FOR_LIGHT]] // CHECK-DAG: [[LIGHT]] -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 false) // CHECK-DAG: [[FOR_LIGHT]] // CHECK-DAG: [[LIGHT]] -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 false) // CHECK-DAG: [[FOR_LIGHT]] // CHECK-DAG: [[LIGHT]] -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) // CHECK-DAG: [[FULL]] -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) // CHECK-DAG: [[FULL]] -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) // CHECK-DAG: [[FULL]] -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) // CHECK-DAG: [[FULL]] #pragma omp target parallel for if(a) for (int i = 0; i < 10; ++i) @@ -321,28 +321,28 @@ #pragma omp target parallel for schedule(guided) for (int i = 0; i < 10; ++i) ; -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 false) // CHECK-DAG: [[FOR_LIGHT]] // CHECK-DAG: [[LIGHT]] // CHECK-DAG: [[BAR_LIGHT]] -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 false) // CHECK-DAG: [[FOR_LIGHT]] // CHECK-DAG: [[LIGHT]] // CHECK-DAG: [[BAR_LIGHT]] -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 false) // CHECK-DAG: [[FOR_LIGHT]] // CHECK-DAG: [[LIGHT]] // CHECK-DAG: [[BAR_LIGHT]] -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) // CHECK-DAG: [[FULL]] // CHECK-DAG: [[BAR_FULL]] -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) // CHECK-DAG: [[FULL]] // CHECK-DAG: [[BAR_FULL]] -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) // CHECK-DAG: [[FULL]] // CHECK-DAG: [[BAR_FULL]] -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) // CHECK-DAG: [[FULL]] // CHECK-DAG: [[BAR_FULL]] #pragma omp target parallel if(a) @@ -373,27 +373,27 @@ #pragma omp for simd schedule(guided) for (int i = 0; i < 10; ++i) ; -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) // CHECK-DAG: [[FULL]] // CHECK-DAG: [[BAR_FULL]] -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 false) // CHECK-DAG: [[FOR_LIGHT]] // CHECK-DAG: [[LIGHT]] // CHECK-DAG: [[BAR_LIGHT]] -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 false) // CHECK-DAG: [[FOR_LIGHT]] // CHECK-DAG: [[LIGHT]] // CHECK-DAG: [[BAR_LIGHT]] -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) // CHECK-DAG: [[FULL]] // CHECK-DAG: [[BAR_FULL]] -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) // CHECK-DAG: [[FULL]] // CHECK-DAG: [[BAR_FULL]] -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) // CHECK-DAG: [[FULL]] // CHECK-DAG: [[BAR_FULL]] -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) // CHECK-DAG: [[FULL]] // CHECK-DAG: [[BAR_FULL]] #pragma omp target @@ -431,22 +431,22 @@ #pragma omp for simd schedule(guided) for (int i = 0; i < 10; ++i) ; -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 false) // CHECK-DAG: [[FOR_LIGHT]] // CHECK-DAG: [[LIGHT]] -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 false) // CHECK-DAG: [[FOR_LIGHT]] // CHECK-DAG: [[LIGHT]] -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 false) // CHECK-DAG: [[FOR_LIGHT]] // CHECK-DAG: [[LIGHT]] -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) // CHECK-DAG: [[FULL]] -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) // CHECK-DAG: [[FULL]] -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) // CHECK-DAG: [[FULL]] -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) // CHECK-DAG: [[FULL]] #pragma omp target #pragma omp parallel for diff --git a/clang/test/OpenMP/nvptx_data_sharing.cpp b/clang/test/OpenMP/nvptx_data_sharing.cpp --- a/clang/test/OpenMP/nvptx_data_sharing.cpp +++ b/clang/test/OpenMP/nvptx_data_sharing.cpp @@ -393,7 +393,7 @@ // CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 // CHECK-NEXT: [[C:%.*]] = alloca i32, align 4 // CHECK-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [2 x i8*], align 8 -// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 false, i1 true, i1 true) +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 1, i1 true, i1 true) // CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK: user_code.entry: @@ -420,7 +420,7 @@ // CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP9]], i64 2) // CHECK-NEXT: call void @__kmpc_free_shared(i8* [[B]], i64 4) // CHECK-NEXT: call void @__kmpc_free_shared(i8* [[A]], i64 4) -// CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK-NEXT: ret void // CHECK: worker.exit: // CHECK-NEXT: ret void diff --git a/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp b/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp --- a/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp +++ b/clang/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp @@ -2995,7 +2995,7 @@ // CHECK4-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 8 // CHECK4-NEXT: [[CONV:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32* // CHECK4-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 8 -// CHECK4-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 true, i1 false, i1 false) +// CHECK4-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 2, i1 false, i1 false) // CHECK4-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP4]], -1 // CHECK4-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK4: user_code.entry: @@ -3006,7 +3006,7 @@ // CHECK4-NEXT: [[TMP7:%.*]] = load i64, i64* [[ARGC_CASTED]], align 8 // CHECK4-NEXT: store i32 [[TMP5]], i32* [[DOTTHREADID_TEMP_]], align 4 // CHECK4-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]], [10 x i32]* [[TMP1]], i32* [[TMP2]], i64 [[TMP7]], [10 x i32]* [[TMP3]]) #[[ATTR1:[0-9]+]] -// CHECK4-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) +// CHECK4-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) // CHECK4-NEXT: ret void // CHECK4: worker.exit: // CHECK4-NEXT: ret void @@ -3328,7 +3328,7 @@ // CHECK5-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 // CHECK5-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4 // CHECK5-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 -// CHECK5-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 true, i1 false, i1 false) +// CHECK5-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 2, i1 false, i1 false) // CHECK5-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP4]], -1 // CHECK5-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK5: user_code.entry: @@ -3338,7 +3338,7 @@ // CHECK5-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARGC_CASTED]], align 4 // CHECK5-NEXT: store i32 [[TMP5]], i32* [[DOTTHREADID_TEMP_]], align 4 // CHECK5-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]], [10 x i32]* [[TMP1]], i32* [[TMP2]], i32 [[TMP7]], [10 x i32]* [[TMP3]]) #[[ATTR1:[0-9]+]] -// CHECK5-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) +// CHECK5-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) // CHECK5-NEXT: ret void // CHECK5: worker.exit: // CHECK5-NEXT: ret void @@ -3651,7 +3651,7 @@ // CHECK6-NEXT: [[TMP1:%.*]] = load [10 x i32]*, [10 x i32]** [[C_ADDR]], align 4 // CHECK6-NEXT: [[TMP2:%.*]] = load i32*, i32** [[A_ADDR]], align 4 // CHECK6-NEXT: [[TMP3:%.*]] = load [10 x i32]*, [10 x i32]** [[D_ADDR]], align 4 -// CHECK6-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 true, i1 false, i1 false) +// CHECK6-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 2, i1 false, i1 false) // CHECK6-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP4]], -1 // CHECK6-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK6: user_code.entry: @@ -3661,7 +3661,7 @@ // CHECK6-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARGC_CASTED]], align 4 // CHECK6-NEXT: store i32 [[TMP5]], i32* [[DOTTHREADID_TEMP_]], align 4 // CHECK6-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]], [10 x i32]* [[TMP1]], i32* [[TMP2]], i32 [[TMP7]], [10 x i32]* [[TMP3]]) #[[ATTR1:[0-9]+]] -// CHECK6-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) +// CHECK6-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) // CHECK6-NEXT: ret void // CHECK6: worker.exit: // CHECK6-NEXT: ret void diff --git a/clang/test/OpenMP/nvptx_force_full_runtime_SPMD_codegen.cpp b/clang/test/OpenMP/nvptx_force_full_runtime_SPMD_codegen.cpp --- a/clang/test/OpenMP/nvptx_force_full_runtime_SPMD_codegen.cpp +++ b/clang/test/OpenMP/nvptx_force_full_runtime_SPMD_codegen.cpp @@ -11,13 +11,13 @@ // CHECK-NOT: @__omp_offloading_{{.+}}_exec_mode = weak constant i8 1 void foo() { -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) #pragma omp target teams distribute parallel for simd for (int i = 0; i < 10; ++i) ; @@ -40,13 +40,13 @@ for (int i = 0; i < 10; ++i) ; int a; -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) #pragma omp target teams distribute parallel for lastprivate(a) for (int i = 0; i < 10; ++i) a = i; @@ -68,13 +68,13 @@ #pragma omp target teams distribute parallel for schedule(guided) for (int i = 0; i < 10; ++i) ; -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) #pragma omp target teams #pragma omp distribute parallel for simd for (int i = 0; i < 10; ++i) @@ -103,13 +103,13 @@ #pragma omp distribute parallel for simd schedule(guided) for (int i = 0; i < 10; ++i) ; -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) #pragma omp target teams #pragma omp distribute parallel for for (int i = 0; i < 10; ++i) @@ -138,13 +138,13 @@ #pragma omp distribute parallel for schedule(guided) for (int i = 0; i < 10; ++i) ; -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) #pragma omp target #pragma omp teams #pragma omp distribute parallel for @@ -180,13 +180,13 @@ #pragma omp distribute parallel for schedule(guided) for (int i = 0; i < 10; ++i) ; -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) #pragma omp target parallel for for (int i = 0; i < 10; ++i) ; @@ -208,13 +208,13 @@ #pragma omp target parallel for schedule(guided) for (int i = 0; i < 10; ++i) ; -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) #pragma omp target parallel #pragma omp for simd for (int i = 0; i < 10; ++i) @@ -243,13 +243,13 @@ #pragma omp for simd schedule(guided) for (int i = 0; i < 10; ++i) ; -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) #pragma omp target #pragma omp parallel #pragma omp for simd ordered @@ -285,13 +285,13 @@ #pragma omp for simd schedule(guided) for (int i = 0; i < 10; ++i) ; -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 true) #pragma omp target #pragma omp parallel for for (int i = 0; i < 10; ++i) diff --git a/clang/test/OpenMP/nvptx_multi_target_parallel_codegen.cpp b/clang/test/OpenMP/nvptx_multi_target_parallel_codegen.cpp --- a/clang/test/OpenMP/nvptx_multi_target_parallel_codegen.cpp +++ b/clang/test/OpenMP/nvptx_multi_target_parallel_codegen.cpp @@ -29,14 +29,14 @@ // CHECK1-SAME: () #[[ATTR0:[0-9]+]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 true, i1 false, i1 true) +// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 2, i1 false, i1 true) // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK1: user_code.entry: // CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) // CHECK1-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** // CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP2]], i64 0) -// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true) +// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) // CHECK1-NEXT: ret void // CHECK1: worker.exit: // CHECK1-NEXT: ret void @@ -66,12 +66,12 @@ // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23 // CHECK1-SAME: () #[[ATTR1]] { // CHECK1-NEXT: entry: -// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK1: user_code.entry: // CHECK1-NEXT: call void @_Z3usev() #[[ATTR6]] -// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK1-NEXT: ret void // CHECK1: worker.exit: // CHECK1-NEXT: ret void @@ -107,14 +107,14 @@ // CHECK2-SAME: () #[[ATTR0:[0-9]+]] { // CHECK2-NEXT: entry: // CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4 -// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 true, i1 false, i1 true) +// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 2, i1 false, i1 true) // CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK2: user_code.entry: // CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) // CHECK2-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** // CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP2]], i32 0) -// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true) +// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) // CHECK2-NEXT: ret void // CHECK2: worker.exit: // CHECK2-NEXT: ret void @@ -144,12 +144,12 @@ // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23 // CHECK2-SAME: () #[[ATTR1]] { // CHECK2-NEXT: entry: -// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) // CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK2: user_code.entry: // CHECK2-NEXT: call void @_Z3usev() #[[ATTR6]] -// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK2-NEXT: ret void // CHECK2: worker.exit: // CHECK2-NEXT: ret void @@ -185,14 +185,14 @@ // CHECK3-SAME: () #[[ATTR0:[0-9]+]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 true, i1 false, i1 true) +// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 2, i1 false, i1 true) // CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK3: user_code.entry: // CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) // CHECK3-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** // CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP1]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP2]], i32 0) -// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true) +// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) // CHECK3-NEXT: ret void // CHECK3: worker.exit: // CHECK3-NEXT: ret void @@ -222,12 +222,12 @@ // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23 // CHECK3-SAME: () #[[ATTR1]] { // CHECK3-NEXT: entry: -// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) // CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK3: user_code.entry: // CHECK3-NEXT: call void @_Z3usev() #[[ATTR6]] -// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK3-NEXT: ret void // CHECK3: worker.exit: // CHECK3-NEXT: ret void diff --git a/clang/test/OpenMP/nvptx_nested_parallel_codegen.cpp b/clang/test/OpenMP/nvptx_nested_parallel_codegen.cpp --- a/clang/test/OpenMP/nvptx_nested_parallel_codegen.cpp +++ b/clang/test/OpenMP/nvptx_nested_parallel_codegen.cpp @@ -40,7 +40,7 @@ // CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 // CHECK1-NEXT: store i32* [[C]], i32** [[C_ADDR]], align 8 // CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[C_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 false, i1 true, i1 true) +// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 1, i1 true, i1 true) // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK1: user_code.entry: @@ -52,7 +52,7 @@ // CHECK1-NEXT: store i8* [[TMP4]], i8** [[TMP3]], align 8 // CHECK1-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** // CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP5]], i64 1) -// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK1-NEXT: ret void // CHECK1: worker.exit: // CHECK1-NEXT: ret void @@ -173,7 +173,7 @@ // CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 // CHECK2-NEXT: store i32* [[C]], i32** [[C_ADDR]], align 4 // CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[C_ADDR]], align 4 -// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 false, i1 true, i1 true) +// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 1, i1 true, i1 true) // CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK2: user_code.entry: @@ -185,7 +185,7 @@ // CHECK2-NEXT: store i8* [[TMP4]], i8** [[TMP3]], align 4 // CHECK2-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** // CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP5]], i32 1) -// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK2-NEXT: ret void // CHECK2: worker.exit: // CHECK2-NEXT: ret void @@ -306,7 +306,7 @@ // CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 // CHECK3-NEXT: store i32* [[C]], i32** [[C_ADDR]], align 4 // CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[C_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 false, i1 true, i1 true) +// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 1, i1 true, i1 true) // CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK3: user_code.entry: @@ -318,7 +318,7 @@ // CHECK3-NEXT: store i8* [[TMP4]], i8** [[TMP3]], align 4 // CHECK3-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** // CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP5]], i32 1) -// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK3-NEXT: ret void // CHECK3: worker.exit: // CHECK3-NEXT: ret void diff --git a/clang/test/OpenMP/nvptx_parallel_codegen.cpp b/clang/test/OpenMP/nvptx_parallel_codegen.cpp --- a/clang/test/OpenMP/nvptx_parallel_codegen.cpp +++ b/clang/test/OpenMP/nvptx_parallel_codegen.cpp @@ -1446,7 +1446,7 @@ // CHECK1-NEXT: [[CAPTURED_VARS_ADDRS2:%.*]] = alloca [0 x i8*], align 8 // CHECK1-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 // CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* -// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 false, i1 true, i1 true) +// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 1, i1 true, i1 true) // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK1: user_code.entry: @@ -1460,7 +1460,7 @@ // CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[CONV]], align 8 // CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP5]], 1 // CHECK1-NEXT: store i32 [[ADD]], i32* [[CONV]], align 8 -// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK1-NEXT: ret void // CHECK1: worker.exit: // CHECK1-NEXT: ret void @@ -1563,7 +1563,7 @@ // CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[A_ADDR]] to i32* // CHECK1-NEXT: [[CONV2:%.*]] = bitcast i64* [[AA_ADDR]] to i16* // CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK1: user_code.entry: @@ -1585,7 +1585,7 @@ // CHECK1-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 // CHECK1-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP8]], 1 // CHECK1-NEXT: store i32 [[ADD6]], i32* [[ARRAYIDX]], align 4 -// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK1-NEXT: ret void // CHECK1: worker.exit: // CHECK1-NEXT: ret void @@ -1628,7 +1628,7 @@ // CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 // CHECK1-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 // CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* -// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK1: user_code.entry: @@ -1646,7 +1646,7 @@ // CHECK1-NEXT: [[INC:%.*]] = add nsw i32 [[TMP6]], 1 // CHECK1-NEXT: store i32 [[INC]], i32* [[A_ON_STACK]], align 4 // CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[A1]], i64 4) -// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK1-NEXT: ret void // CHECK1: worker.exit: // CHECK1-NEXT: ret void @@ -1721,7 +1721,7 @@ // CHECK2-NEXT: [[CAPTURED_VARS_ADDRS1:%.*]] = alloca [0 x i8*], align 4 // CHECK2-NEXT: [[CAPTURED_VARS_ADDRS2:%.*]] = alloca [0 x i8*], align 4 // CHECK2-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 -// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 false, i1 true, i1 true) +// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 1, i1 true, i1 true) // CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK2: user_code.entry: @@ -1735,7 +1735,7 @@ // CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[A_ADDR]], align 4 // CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP5]], 1 // CHECK2-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4 -// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK2-NEXT: ret void // CHECK2: worker.exit: // CHECK2-NEXT: ret void @@ -1836,7 +1836,7 @@ // CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 // CHECK2-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16* // CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) // CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK2: user_code.entry: @@ -1858,7 +1858,7 @@ // CHECK2-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 // CHECK2-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP8]], 1 // CHECK2-NEXT: store i32 [[ADD4]], i32* [[ARRAYIDX]], align 4 -// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK2-NEXT: ret void // CHECK2: worker.exit: // CHECK2-NEXT: ret void @@ -1900,7 +1900,7 @@ // CHECK2-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 // CHECK2-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 -// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) // CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK2: user_code.entry: @@ -1918,7 +1918,7 @@ // CHECK2-NEXT: [[INC:%.*]] = add nsw i32 [[TMP6]], 1 // CHECK2-NEXT: store i32 [[INC]], i32* [[A_ON_STACK]], align 4 // CHECK2-NEXT: call void @__kmpc_free_shared(i8* [[A1]], i32 4) -// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK2-NEXT: ret void // CHECK2: worker.exit: // CHECK2-NEXT: ret void diff --git a/clang/test/OpenMP/nvptx_parallel_for_codegen.cpp b/clang/test/OpenMP/nvptx_parallel_for_codegen.cpp --- a/clang/test/OpenMP/nvptx_parallel_for_codegen.cpp +++ b/clang/test/OpenMP/nvptx_parallel_for_codegen.cpp @@ -465,7 +465,7 @@ // CHECK-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 // CHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* // CHECK-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 -// CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 false, i1 true, i1 true) +// CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 1, i1 true, i1 true) // CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK: user_code.entry: @@ -487,7 +487,7 @@ // CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], 1 // CHECK-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX]], align 4 // CHECK-NEXT: call void @__kmpc_free_shared(i8* [[D]], i64 4) -// CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK-NEXT: ret void // CHECK: worker.exit: // CHECK-NEXT: ret void @@ -584,7 +584,7 @@ // // // CHECK-LABEL: define {{[^@]+}}@__omp_outlined___wrapper -// CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 // CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 @@ -601,6 +601,6 @@ // CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 1 // CHECK-NEXT: [[TMP7:%.*]] = bitcast i8** [[TMP6]] to i32** // CHECK-NEXT: [[TMP8:%.*]] = load i32*, i32** [[TMP7]], align 8 -// CHECK-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP5]], i32* [[TMP8]]) #[[ATTR1:[0-9]+]] +// CHECK-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP5]], i32* [[TMP8]]) #[[ATTR2:[0-9]+]] // CHECK-NEXT: ret void // diff --git a/clang/test/OpenMP/nvptx_target_codegen.cpp b/clang/test/OpenMP/nvptx_target_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_codegen.cpp @@ -153,7 +153,7 @@ // CHECK1-NEXT: store i32* [[PTR1]], i32** [[PTR1_ADDR]], align 8 // CHECK1-NEXT: store i32** [[PTR2]], i32*** [[PTR2_ADDR]], align 8 // CHECK1-NEXT: [[TMP0:%.*]] = load i32**, i32*** [[PTR2_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 true, i1 false, i1 true) +// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 2, i1 false, i1 true) // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK1: user_code.entry: @@ -166,7 +166,7 @@ // CHECK1-NEXT: store i8* [[TMP6]], i8** [[TMP5]], align 8 // CHECK1-NEXT: [[TMP7:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** // CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32**, i32**)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP7]], i64 2) -// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true) +// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) // CHECK1-NEXT: ret void // CHECK1: worker.exit: // CHECK1-NEXT: ret void @@ -195,11 +195,11 @@ // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l39 // CHECK1-SAME: () #[[ATTR1]] { // CHECK1-NEXT: entry: -// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK1: user_code.entry: -// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK1-NEXT: ret void // CHECK1: worker.exit: // CHECK1-NEXT: ret void @@ -211,7 +211,7 @@ // CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca i64, align 8 // CHECK1-NEXT: store i64 [[AA]], i64* [[AA_ADDR]], align 8 // CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[AA_ADDR]] to i16* -// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK1: user_code.entry: @@ -225,7 +225,7 @@ // CHECK1-NEXT: [[ADD4:%.*]] = add nsw i32 [[CONV3]], 2 // CHECK1-NEXT: [[CONV5:%.*]] = trunc i32 [[ADD4]] to i16 // CHECK1-NEXT: store i16 [[CONV5]], i16* [[CONV]], align 8 -// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK1-NEXT: ret void // CHECK1: worker.exit: // CHECK1-NEXT: ret void @@ -261,7 +261,7 @@ // CHECK1-NEXT: [[TMP5:%.*]] = load i64, i64* [[VLA_ADDR4]], align 8 // CHECK1-NEXT: [[TMP6:%.*]] = load double*, double** [[CN_ADDR]], align 8 // CHECK1-NEXT: [[TMP7:%.*]] = load %struct.TT*, %struct.TT** [[D_ADDR]], align 8 -// CHECK1-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +// CHECK1-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP8]], -1 // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK1: user_code.entry: @@ -305,7 +305,7 @@ // CHECK1-NEXT: [[TMP17:%.*]] = load i64, i64* [[CALL]], align 8 // CHECK1-NEXT: [[ADD22:%.*]] = add nsw i64 [[TMP17]], 1 // CHECK1-NEXT: store i64 [[ADD22]], i64* [[CALL]], align 8 -// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK1-NEXT: ret void // CHECK1: worker.exit: // CHECK1-NEXT: ret void @@ -338,7 +338,7 @@ // CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[AA_ADDR]] to i16* // CHECK1-NEXT: [[CONV2:%.*]] = bitcast i64* [[AAA_ADDR]] to i8* // CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK1: user_code.entry: @@ -359,7 +359,7 @@ // CHECK1-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 // CHECK1-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP5]], 1 // CHECK1-NEXT: store i32 [[ADD9]], i32* [[ARRAYIDX]], align 4 -// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK1-NEXT: ret void // CHECK1: worker.exit: // CHECK1-NEXT: ret void @@ -383,7 +383,7 @@ // CHECK1-NEXT: [[TMP1:%.*]] = load i64, i64* [[VLA_ADDR]], align 8 // CHECK1-NEXT: [[TMP2:%.*]] = load i64, i64* [[VLA_ADDR2]], align 8 // CHECK1-NEXT: [[TMP3:%.*]] = load i16*, i16** [[C_ADDR]], align 8 -// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP4]], -1 // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK1: user_code.entry: @@ -406,7 +406,7 @@ // CHECK1-NEXT: [[CONV8:%.*]] = fptosi double [[TMP8]] to i32 // CHECK1-NEXT: [[A9:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP0]], i32 0, i32 0 // CHECK1-NEXT: [[CALL:%.*]] = call i32 @_Z3baziRd(i32 [[CONV8]], double* nonnull align 8 dereferenceable(8) [[A9]]) #[[ATTR6]] -// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK1-NEXT: ret void // CHECK1: worker.exit: // CHECK1-NEXT: ret void @@ -439,7 +439,7 @@ // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16unreachable_callv_l142 // CHECK1-SAME: () #[[ATTR1]] { // CHECK1-NEXT: entry: -// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK1: user_code.entry: @@ -448,7 +448,7 @@ // CHECK1: worker.exit: // CHECK1-NEXT: ret void // CHECK1: 1: -// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK1-NEXT: ret void // // @@ -464,7 +464,7 @@ // CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* // CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[AA_ADDR]] to i16* // CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK1: user_code.entry: @@ -480,7 +480,7 @@ // CHECK1-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 // CHECK1-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP4]], 1 // CHECK1-NEXT: store i32 [[ADD5]], i32* [[ARRAYIDX]], align 4 -// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK1-NEXT: ret void // CHECK1: worker.exit: // CHECK1-NEXT: ret void @@ -540,7 +540,7 @@ // CHECK2-NEXT: store i32* [[PTR1]], i32** [[PTR1_ADDR]], align 4 // CHECK2-NEXT: store i32** [[PTR2]], i32*** [[PTR2_ADDR]], align 4 // CHECK2-NEXT: [[TMP0:%.*]] = load i32**, i32*** [[PTR2_ADDR]], align 4 -// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 true, i1 false, i1 true) +// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 2, i1 false, i1 true) // CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK2: user_code.entry: @@ -553,7 +553,7 @@ // CHECK2-NEXT: store i8* [[TMP6]], i8** [[TMP5]], align 4 // CHECK2-NEXT: [[TMP7:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** // CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32**, i32**)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP7]], i32 2) -// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true) +// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) // CHECK2-NEXT: ret void // CHECK2: worker.exit: // CHECK2-NEXT: ret void @@ -582,11 +582,11 @@ // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l39 // CHECK2-SAME: () #[[ATTR1]] { // CHECK2-NEXT: entry: -// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) // CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK2: user_code.entry: -// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK2-NEXT: ret void // CHECK2: worker.exit: // CHECK2-NEXT: ret void @@ -598,7 +598,7 @@ // CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca i32, align 4 // CHECK2-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4 // CHECK2-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16* -// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) // CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK2: user_code.entry: @@ -612,7 +612,7 @@ // CHECK2-NEXT: [[ADD4:%.*]] = add nsw i32 [[CONV3]], 2 // CHECK2-NEXT: [[CONV5:%.*]] = trunc i32 [[ADD4]] to i16 // CHECK2-NEXT: store i16 [[CONV5]], i16* [[CONV]], align 4 -// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK2-NEXT: ret void // CHECK2: worker.exit: // CHECK2-NEXT: ret void @@ -647,7 +647,7 @@ // CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[VLA_ADDR4]], align 4 // CHECK2-NEXT: [[TMP6:%.*]] = load double*, double** [[CN_ADDR]], align 4 // CHECK2-NEXT: [[TMP7:%.*]] = load %struct.TT*, %struct.TT** [[D_ADDR]], align 4 -// CHECK2-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +// CHECK2-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) // CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP8]], -1 // CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK2: user_code.entry: @@ -691,7 +691,7 @@ // CHECK2-NEXT: [[TMP17:%.*]] = load i64, i64* [[CALL]], align 8 // CHECK2-NEXT: [[ADD21:%.*]] = add nsw i64 [[TMP17]], 1 // CHECK2-NEXT: store i64 [[ADD21]], i64* [[CALL]], align 8 -// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK2-NEXT: ret void // CHECK2: worker.exit: // CHECK2-NEXT: ret void @@ -723,7 +723,7 @@ // CHECK2-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16* // CHECK2-NEXT: [[CONV1:%.*]] = bitcast i32* [[AAA_ADDR]] to i8* // CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) // CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK2: user_code.entry: @@ -744,7 +744,7 @@ // CHECK2-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 // CHECK2-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP5]], 1 // CHECK2-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX]], align 4 -// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK2-NEXT: ret void // CHECK2: worker.exit: // CHECK2-NEXT: ret void @@ -767,7 +767,7 @@ // CHECK2-NEXT: [[TMP1:%.*]] = load i32, i32* [[VLA_ADDR]], align 4 // CHECK2-NEXT: [[TMP2:%.*]] = load i32, i32* [[VLA_ADDR2]], align 4 // CHECK2-NEXT: [[TMP3:%.*]] = load i16*, i16** [[C_ADDR]], align 4 -// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) // CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP4]], -1 // CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK2: user_code.entry: @@ -790,7 +790,7 @@ // CHECK2-NEXT: [[CONV7:%.*]] = fptosi double [[TMP8]] to i32 // CHECK2-NEXT: [[A8:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP0]], i32 0, i32 0 // CHECK2-NEXT: [[CALL:%.*]] = call i32 @_Z3baziRd(i32 [[CONV7]], double* nonnull align 8 dereferenceable(8) [[A8]]) #[[ATTR6]] -// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK2-NEXT: ret void // CHECK2: worker.exit: // CHECK2-NEXT: ret void @@ -823,7 +823,7 @@ // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16unreachable_callv_l142 // CHECK2-SAME: () #[[ATTR1]] { // CHECK2-NEXT: entry: -// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) // CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK2: user_code.entry: @@ -832,7 +832,7 @@ // CHECK2: worker.exit: // CHECK2-NEXT: ret void // CHECK2: 1: -// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK2-NEXT: ret void // // @@ -847,7 +847,7 @@ // CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 // CHECK2-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16* // CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) // CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK2: user_code.entry: @@ -863,7 +863,7 @@ // CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 // CHECK2-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP4]], 1 // CHECK2-NEXT: store i32 [[ADD4]], i32* [[ARRAYIDX]], align 4 -// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK2-NEXT: ret void // CHECK2: worker.exit: // CHECK2-NEXT: ret void @@ -923,7 +923,7 @@ // CHECK3-NEXT: store i32* [[PTR1]], i32** [[PTR1_ADDR]], align 4 // CHECK3-NEXT: store i32** [[PTR2]], i32*** [[PTR2_ADDR]], align 4 // CHECK3-NEXT: [[TMP0:%.*]] = load i32**, i32*** [[PTR2_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 true, i1 false, i1 true) +// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 2, i1 false, i1 true) // CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK3: user_code.entry: @@ -936,7 +936,7 @@ // CHECK3-NEXT: store i8* [[TMP6]], i8** [[TMP5]], align 4 // CHECK3-NEXT: [[TMP7:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** // CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32**, i32**)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP7]], i32 2) -// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true) +// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) // CHECK3-NEXT: ret void // CHECK3: worker.exit: // CHECK3-NEXT: ret void @@ -965,11 +965,11 @@ // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3fooi_l39 // CHECK3-SAME: () #[[ATTR1]] { // CHECK3-NEXT: entry: -// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) // CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK3: user_code.entry: -// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK3-NEXT: ret void // CHECK3: worker.exit: // CHECK3-NEXT: ret void @@ -981,7 +981,7 @@ // CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca i32, align 4 // CHECK3-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4 // CHECK3-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16* -// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) // CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK3: user_code.entry: @@ -995,7 +995,7 @@ // CHECK3-NEXT: [[ADD4:%.*]] = add nsw i32 [[CONV3]], 2 // CHECK3-NEXT: [[CONV5:%.*]] = trunc i32 [[ADD4]] to i16 // CHECK3-NEXT: store i16 [[CONV5]], i16* [[CONV]], align 4 -// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK3-NEXT: ret void // CHECK3: worker.exit: // CHECK3-NEXT: ret void @@ -1030,7 +1030,7 @@ // CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[VLA_ADDR4]], align 4 // CHECK3-NEXT: [[TMP6:%.*]] = load double*, double** [[CN_ADDR]], align 4 // CHECK3-NEXT: [[TMP7:%.*]] = load %struct.TT*, %struct.TT** [[D_ADDR]], align 4 -// CHECK3-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +// CHECK3-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) // CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP8]], -1 // CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK3: user_code.entry: @@ -1074,7 +1074,7 @@ // CHECK3-NEXT: [[TMP17:%.*]] = load i64, i64* [[CALL]], align 8 // CHECK3-NEXT: [[ADD21:%.*]] = add nsw i64 [[TMP17]], 1 // CHECK3-NEXT: store i64 [[ADD21]], i64* [[CALL]], align 8 -// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK3-NEXT: ret void // CHECK3: worker.exit: // CHECK3-NEXT: ret void @@ -1106,7 +1106,7 @@ // CHECK3-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16* // CHECK3-NEXT: [[CONV1:%.*]] = bitcast i32* [[AAA_ADDR]] to i8* // CHECK3-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) // CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK3: user_code.entry: @@ -1127,7 +1127,7 @@ // CHECK3-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 // CHECK3-NEXT: [[ADD8:%.*]] = add nsw i32 [[TMP5]], 1 // CHECK3-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX]], align 4 -// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK3-NEXT: ret void // CHECK3: worker.exit: // CHECK3-NEXT: ret void @@ -1150,7 +1150,7 @@ // CHECK3-NEXT: [[TMP1:%.*]] = load i32, i32* [[VLA_ADDR]], align 4 // CHECK3-NEXT: [[TMP2:%.*]] = load i32, i32* [[VLA_ADDR2]], align 4 // CHECK3-NEXT: [[TMP3:%.*]] = load i16*, i16** [[C_ADDR]], align 4 -// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) // CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP4]], -1 // CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK3: user_code.entry: @@ -1173,7 +1173,7 @@ // CHECK3-NEXT: [[CONV7:%.*]] = fptosi double [[TMP8]] to i32 // CHECK3-NEXT: [[A8:%.*]] = getelementptr inbounds [[STRUCT_S1]], %struct.S1* [[TMP0]], i32 0, i32 0 // CHECK3-NEXT: [[CALL:%.*]] = call i32 @_Z3baziRd(i32 [[CONV7]], double* nonnull align 8 dereferenceable(8) [[A8]]) #[[ATTR6]] -// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK3-NEXT: ret void // CHECK3: worker.exit: // CHECK3-NEXT: ret void @@ -1206,7 +1206,7 @@ // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16unreachable_callv_l142 // CHECK3-SAME: () #[[ATTR1]] { // CHECK3-NEXT: entry: -// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) // CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK3: user_code.entry: @@ -1215,7 +1215,7 @@ // CHECK3: worker.exit: // CHECK3-NEXT: ret void // CHECK3: 1: -// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK3-NEXT: ret void // // @@ -1230,7 +1230,7 @@ // CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 // CHECK3-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16* // CHECK3-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) // CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK3: user_code.entry: @@ -1246,7 +1246,7 @@ // CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 // CHECK3-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP4]], 1 // CHECK3-NEXT: store i32 [[ADD4]], i32* [[ARRAYIDX]], align 4 -// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK3-NEXT: ret void // CHECK3: worker.exit: // CHECK3-NEXT: ret void diff --git a/clang/test/OpenMP/nvptx_target_parallel_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_parallel_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_parallel_codegen.cpp @@ -58,7 +58,7 @@ // CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 // CHECK1-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 8 // CHECK1-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 true, i1 false, i1 true) +// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 2, i1 false, i1 true) // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK1: user_code.entry: @@ -68,7 +68,7 @@ // CHECK1-NEXT: store i8* [[TMP4]], i8** [[TMP3]], align 8 // CHECK1-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** // CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP5]], i64 1) -// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true) +// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) // CHECK1-NEXT: ret void // CHECK1: worker.exit: // CHECK1-NEXT: ret void @@ -105,7 +105,7 @@ // CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 // CHECK1-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 8 // CHECK1-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 -// CHECK1-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 true) +// CHECK1-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 true) // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP3]], -1 // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK1: user_code.entry: @@ -121,7 +121,7 @@ // CHECK1-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 8 // CHECK1-NEXT: [[TMP11:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** // CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP11]], i64 3) -// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true) +// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) // CHECK1-NEXT: ret void // CHECK1: worker.exit: // CHECK1-NEXT: ret void @@ -165,7 +165,7 @@ // CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 // CHECK2-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4 // CHECK2-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 -// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 true, i1 false, i1 true) +// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 2, i1 false, i1 true) // CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK2: user_code.entry: @@ -175,7 +175,7 @@ // CHECK2-NEXT: store i8* [[TMP4]], i8** [[TMP3]], align 4 // CHECK2-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** // CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP5]], i32 1) -// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true) +// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) // CHECK2-NEXT: ret void // CHECK2: worker.exit: // CHECK2-NEXT: ret void @@ -212,7 +212,7 @@ // CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 // CHECK2-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 // CHECK2-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK2-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 true) +// CHECK2-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 true) // CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP3]], -1 // CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK2: user_code.entry: @@ -228,7 +228,7 @@ // CHECK2-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 // CHECK2-NEXT: [[TMP11:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** // CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP11]], i32 3) -// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true) +// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) // CHECK2-NEXT: ret void // CHECK2: worker.exit: // CHECK2-NEXT: ret void @@ -272,7 +272,7 @@ // CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 // CHECK3-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4 // CHECK3-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 true, i1 false, i1 true) +// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 2, i1 false, i1 true) // CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK3: user_code.entry: @@ -282,7 +282,7 @@ // CHECK3-NEXT: store i8* [[TMP4]], i8** [[TMP3]], align 4 // CHECK3-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** // CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP5]], i32 1) -// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true) +// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) // CHECK3-NEXT: ret void // CHECK3: worker.exit: // CHECK3-NEXT: ret void @@ -319,7 +319,7 @@ // CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 // CHECK3-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 // CHECK3-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 true) +// CHECK3-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 true) // CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP3]], -1 // CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK3: user_code.entry: @@ -335,7 +335,7 @@ // CHECK3-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 // CHECK3-NEXT: [[TMP11:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** // CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP11]], i32 3) -// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true) +// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) // CHECK3-NEXT: ret void // CHECK3: worker.exit: // CHECK3-NEXT: ret void @@ -379,7 +379,7 @@ // CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 // CHECK4-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 8 // CHECK4-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 8 -// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 true, i1 false, i1 true) +// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 2, i1 false, i1 true) // CHECK4-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK4-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK4: user_code.entry: @@ -389,7 +389,7 @@ // CHECK4-NEXT: store i8* [[TMP4]], i8** [[TMP3]], align 8 // CHECK4-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** // CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP5]], i64 1) -// CHECK4-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true) +// CHECK4-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) // CHECK4-NEXT: ret void // CHECK4: worker.exit: // CHECK4-NEXT: ret void @@ -426,7 +426,7 @@ // CHECK4-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 // CHECK4-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 8 // CHECK4-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 -// CHECK4-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 true) +// CHECK4-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 true) // CHECK4-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP3]], -1 // CHECK4-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK4: user_code.entry: @@ -442,7 +442,7 @@ // CHECK4-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 8 // CHECK4-NEXT: [[TMP11:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** // CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP11]], i64 3) -// CHECK4-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true) +// CHECK4-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) // CHECK4-NEXT: ret void // CHECK4: worker.exit: // CHECK4-NEXT: ret void @@ -486,7 +486,7 @@ // CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 // CHECK5-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4 // CHECK5-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 -// CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 true, i1 false, i1 true) +// CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 2, i1 false, i1 true) // CHECK5-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK5-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK5: user_code.entry: @@ -496,7 +496,7 @@ // CHECK5-NEXT: store i8* [[TMP4]], i8** [[TMP3]], align 4 // CHECK5-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** // CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP5]], i32 1) -// CHECK5-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true) +// CHECK5-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) // CHECK5-NEXT: ret void // CHECK5: worker.exit: // CHECK5-NEXT: ret void @@ -533,7 +533,7 @@ // CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 // CHECK5-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 // CHECK5-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 true) +// CHECK5-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 true) // CHECK5-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP3]], -1 // CHECK5-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK5: user_code.entry: @@ -549,7 +549,7 @@ // CHECK5-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 // CHECK5-NEXT: [[TMP11:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** // CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP11]], i32 3) -// CHECK5-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true) +// CHECK5-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) // CHECK5-NEXT: ret void // CHECK5: worker.exit: // CHECK5-NEXT: ret void @@ -593,7 +593,7 @@ // CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 // CHECK6-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4 // CHECK6-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 -// CHECK6-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 true, i1 false, i1 true) +// CHECK6-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 2, i1 false, i1 true) // CHECK6-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK6-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK6: user_code.entry: @@ -603,7 +603,7 @@ // CHECK6-NEXT: store i8* [[TMP4]], i8** [[TMP3]], align 4 // CHECK6-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** // CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP5]], i32 1) -// CHECK6-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true) +// CHECK6-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) // CHECK6-NEXT: ret void // CHECK6: worker.exit: // CHECK6-NEXT: ret void @@ -640,7 +640,7 @@ // CHECK6-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 // CHECK6-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 // CHECK6-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK6-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 true) +// CHECK6-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 true) // CHECK6-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP3]], -1 // CHECK6-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK6: user_code.entry: @@ -656,7 +656,7 @@ // CHECK6-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 // CHECK6-NEXT: [[TMP11:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** // CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP11]], i32 3) -// CHECK6-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true) +// CHECK6-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) // CHECK6-NEXT: ret void // CHECK6: worker.exit: // CHECK6-NEXT: ret void diff --git a/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_parallel_num_threads_codegen.cpp @@ -53,7 +53,7 @@ // CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 // CHECK1-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 8 // CHECK1-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 true, i1 false, i1 true) +// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 2, i1 false, i1 true) // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK1: user_code.entry: @@ -63,7 +63,7 @@ // CHECK1-NEXT: store i8* [[TMP4]], i8** [[TMP3]], align 8 // CHECK1-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** // CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP5]], i64 1) -// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true) +// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) // CHECK1-NEXT: ret void // CHECK1: worker.exit: // CHECK1-NEXT: ret void @@ -103,7 +103,7 @@ // CHECK1-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 8 // CHECK1-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 // CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[DOTCAPTURE_EXPR__ADDR]] to i32* -// CHECK1-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 true) +// CHECK1-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 true) // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP3]], -1 // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK1: user_code.entry: @@ -120,7 +120,7 @@ // CHECK1-NEXT: store i8* [[TMP11]], i8** [[TMP10]], align 8 // CHECK1-NEXT: [[TMP12:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** // CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP12]], i64 3) -// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true) +// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) // CHECK1-NEXT: ret void // CHECK1: worker.exit: // CHECK1-NEXT: ret void @@ -164,7 +164,7 @@ // CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 // CHECK2-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4 // CHECK2-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 -// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 true, i1 false, i1 true) +// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 2, i1 false, i1 true) // CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK2: user_code.entry: @@ -174,7 +174,7 @@ // CHECK2-NEXT: store i8* [[TMP4]], i8** [[TMP3]], align 4 // CHECK2-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** // CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP5]], i32 1) -// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true) +// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) // CHECK2-NEXT: ret void // CHECK2: worker.exit: // CHECK2-NEXT: ret void @@ -213,7 +213,7 @@ // CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 // CHECK2-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 // CHECK2-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK2-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 true) +// CHECK2-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 true) // CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP3]], -1 // CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK2: user_code.entry: @@ -230,7 +230,7 @@ // CHECK2-NEXT: store i8* [[TMP11]], i8** [[TMP10]], align 4 // CHECK2-NEXT: [[TMP12:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** // CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP12]], i32 3) -// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true) +// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) // CHECK2-NEXT: ret void // CHECK2: worker.exit: // CHECK2-NEXT: ret void @@ -274,7 +274,7 @@ // CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 // CHECK3-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4 // CHECK3-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 true, i1 false, i1 true) +// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 2, i1 false, i1 true) // CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK3: user_code.entry: @@ -284,7 +284,7 @@ // CHECK3-NEXT: store i8* [[TMP4]], i8** [[TMP3]], align 4 // CHECK3-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** // CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP5]], i32 1) -// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true) +// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) // CHECK3-NEXT: ret void // CHECK3: worker.exit: // CHECK3-NEXT: ret void @@ -323,7 +323,7 @@ // CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 // CHECK3-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 // CHECK3-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK3-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 true) +// CHECK3-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 true) // CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP3]], -1 // CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK3: user_code.entry: @@ -340,7 +340,7 @@ // CHECK3-NEXT: store i8* [[TMP11]], i8** [[TMP10]], align 4 // CHECK3-NEXT: [[TMP12:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** // CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP12]], i32 3) -// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true) +// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) // CHECK3-NEXT: ret void // CHECK3: worker.exit: // CHECK3-NEXT: ret void @@ -384,7 +384,7 @@ // CHECK4-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 // CHECK4-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 8 // CHECK4-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 8 -// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 true, i1 false, i1 true) +// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 2, i1 false, i1 true) // CHECK4-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK4-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK4: user_code.entry: @@ -394,7 +394,7 @@ // CHECK4-NEXT: store i8* [[TMP4]], i8** [[TMP3]], align 8 // CHECK4-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** // CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP5]], i64 1) -// CHECK4-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true) +// CHECK4-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) // CHECK4-NEXT: ret void // CHECK4: worker.exit: // CHECK4-NEXT: ret void @@ -434,7 +434,7 @@ // CHECK4-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 8 // CHECK4-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 // CHECK4-NEXT: [[CONV:%.*]] = bitcast i64* [[DOTCAPTURE_EXPR__ADDR]] to i32* -// CHECK4-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 true) +// CHECK4-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 true) // CHECK4-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP3]], -1 // CHECK4-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK4: user_code.entry: @@ -451,7 +451,7 @@ // CHECK4-NEXT: store i8* [[TMP11]], i8** [[TMP10]], align 8 // CHECK4-NEXT: [[TMP12:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** // CHECK4-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP12]], i64 3) -// CHECK4-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true) +// CHECK4-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) // CHECK4-NEXT: ret void // CHECK4: worker.exit: // CHECK4-NEXT: ret void @@ -495,7 +495,7 @@ // CHECK5-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 // CHECK5-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4 // CHECK5-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 -// CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 true, i1 false, i1 true) +// CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 2, i1 false, i1 true) // CHECK5-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK5-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK5: user_code.entry: @@ -505,7 +505,7 @@ // CHECK5-NEXT: store i8* [[TMP4]], i8** [[TMP3]], align 4 // CHECK5-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** // CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP5]], i32 1) -// CHECK5-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true) +// CHECK5-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) // CHECK5-NEXT: ret void // CHECK5: worker.exit: // CHECK5-NEXT: ret void @@ -544,7 +544,7 @@ // CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 // CHECK5-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 // CHECK5-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK5-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 true) +// CHECK5-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 true) // CHECK5-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP3]], -1 // CHECK5-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK5: user_code.entry: @@ -561,7 +561,7 @@ // CHECK5-NEXT: store i8* [[TMP11]], i8** [[TMP10]], align 4 // CHECK5-NEXT: [[TMP12:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** // CHECK5-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP12]], i32 3) -// CHECK5-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true) +// CHECK5-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) // CHECK5-NEXT: ret void // CHECK5: worker.exit: // CHECK5-NEXT: ret void @@ -605,7 +605,7 @@ // CHECK6-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 // CHECK6-NEXT: store i16* [[AA]], i16** [[AA_ADDR]], align 4 // CHECK6-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 -// CHECK6-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 true, i1 false, i1 true) +// CHECK6-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 2, i1 false, i1 true) // CHECK6-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK6-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK6: user_code.entry: @@ -615,7 +615,7 @@ // CHECK6-NEXT: store i8* [[TMP4]], i8** [[TMP3]], align 4 // CHECK6-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** // CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i16*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP5]], i32 1) -// CHECK6-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true) +// CHECK6-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) // CHECK6-NEXT: ret void // CHECK6: worker.exit: // CHECK6-NEXT: ret void @@ -654,7 +654,7 @@ // CHECK6-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 // CHECK6-NEXT: [[TMP1:%.*]] = load i16*, i16** [[AA_ADDR]], align 4 // CHECK6-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK6-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 true) +// CHECK6-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 true) // CHECK6-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP3]], -1 // CHECK6-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK6: user_code.entry: @@ -671,7 +671,7 @@ // CHECK6-NEXT: store i8* [[TMP11]], i8** [[TMP10]], align 4 // CHECK6-NEXT: [[TMP12:%.*]] = bitcast [3 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** // CHECK6-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP4]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*, [10 x i32]*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP12]], i32 3) -// CHECK6-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true) +// CHECK6-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) // CHECK6-NEXT: ret void // CHECK6: worker.exit: // CHECK6-NEXT: ret void diff --git a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp @@ -1,3 +1,4 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ // Test target codegen - host bc file has to be created first. // RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc // RUN: %clang_cc1 -verify -fopenmp -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 @@ -9,12 +10,8 @@ #define HEADER // Check for the data transfer medium in shared memory to transfer the reduction list to the first warp. -// CHECK-DAG: [[TRANSFER_STORAGE:@.+]] = weak addrspace([[SHARED_ADDRSPACE:[0-9]+]]) global [32 x i32] // Check that the execution mode of all 3 target regions is set to Spmd Mode. -// CHECK-DAG: {{@__omp_offloading_.+l27}}_exec_mode = weak constant i8 2 -// CHECK-DAG: {{@__omp_offloading_.+l32}}_exec_mode = weak constant i8 2 -// CHECK-DAG: {{@__omp_offloading_.+l38}}_exec_mode = weak constant i8 2 template tx ftemplate(int n) { @@ -52,741 +49,3746 @@ return a; } -// CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l27}}( -// -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call void @__kmpc_target_deinit({{.*}}, i1 true, i1 true) -// -// // define internal void [[PFN]]( -// CHECK: store double {{[0\.e\+]+}}, double* [[E:%.+]], align -// CHECK: [[EV:%.+]] = load double, double* [[E]], align -// CHECK: [[ADD:%.+]] = fadd double [[EV]], 5 -// CHECK: store double [[ADD]], double* [[E]], align -// CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [1 x i8*]* [[RL:%.+]], i{{32|64}} 0, i{{32|64}} 0 -// CHECK: [[E_CAST:%.+]] = bitcast double* [[E]] to i8* -// CHECK: store i8* [[E_CAST]], i8** [[PTR1]], align -// CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8* -// CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @{{.+}}, i32 {{.+}}, i32 1, i{{32|64}} {{4|8}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]]) -// CHECK: [[CMP:%.+]] = icmp eq i32 [[RET]], 1 -// CHECK: br i1 [[CMP]], label -// CHECK: [[E_INV:%.+]] = load double, double* [[E_IN:%.+]], align -// CHECK: [[EV:%.+]] = load double, double* [[E]], align -// CHECK: [[ADD:%.+]] = fadd double [[E_INV]], [[EV]] -// CHECK: store double [[ADD]], double* [[E_IN]], align -// CHECK: call void @__kmpc_nvptx_end_reduce_nowait( -// CHECK: br label -// -// CHECK: ret -// // Reduction function -// CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8* %0, i8* %1) -// CHECK: [[VAR_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0 -// CHECK: [[VAR_RHS_VOID:%.+]] = load i8*, i8** [[VAR_RHS_REF]], -// CHECK: [[VAR_RHS:%.+]] = bitcast i8* [[VAR_RHS_VOID]] to double* -// -// CHECK: [[VAR_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0 -// CHECK: [[VAR_LHS_VOID:%.+]] = load i8*, i8** [[VAR_LHS_REF]], -// CHECK: [[VAR_LHS:%.+]] = bitcast i8* [[VAR_LHS_VOID]] to double* -// -// CHECK: [[VAR_LHS_VAL:%.+]] = load double, double* [[VAR_LHS]], -// CHECK: [[VAR_RHS_VAL:%.+]] = load double, double* [[VAR_RHS]], -// CHECK: [[RES:%.+]] = fadd double [[VAR_LHS_VAL]], [[VAR_RHS_VAL]] -// CHECK: store double [[RES]], double* [[VAR_LHS]], -// CHECK: ret void -// // Shuffle and reduce function -// CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8* %0, i16 {{.*}}, i16 {{.*}}, i16 {{.*}}) -// CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align -// CHECK: [[REMOTE_ELT:%.+]] = alloca double -// -// CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align -// CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align -// CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align -// -// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 -// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], -// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 -// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double* -// -// CHECK: [[ELT_CAST:%.+]] = bitcast double* [[ELT]] to i64* -// CHECK: [[REMOTE_ELT_CAST:%.+]] = bitcast double* [[REMOTE_ELT]] to i64* -// CHECK: [[ELT_VAL:%.+]] = load i64, i64* [[ELT_CAST]], align -// CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16 -// CHECK: [[REMOTE_ELT_VAL64:%.+]] = call i64 @__kmpc_shuffle_int64(i64 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]]) -// -// CHECK: store i64 [[REMOTE_ELT_VAL64]], i64* [[REMOTE_ELT_CAST]], align -// CHECK: [[REMOTE_ELT_VOID:%.+]] = bitcast double* [[REMOTE_ELT]] to i8* -// CHECK: store i8* [[REMOTE_ELT_VOID]], i8** [[REMOTE_ELT_REF]], align -// // Condition to reduce -// CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0 -// -// CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 -// CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]] -// CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]] -// -// CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2 -// CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1 -// CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0 -// CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]] -// CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0 -// CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]] -// -// CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]] -// CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]] -// CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]] -// -// CHECK: [[DO_REDUCE]] -// CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8* -// CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8* -// CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]]) -// CHECK: br label {{%?}}[[REDUCE_CONT:.+]] -// -// CHECK: [[REDUCE_ELSE]] -// CHECK: br label {{%?}}[[REDUCE_CONT]] -// -// CHECK: [[REDUCE_CONT]] // Now check if we should just copy over the remote reduction list -// CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 -// CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]] -// CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]] -// CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] -// -// CHECK: [[DO_COPY]] -// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 -// CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], -// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 -// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], -// CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to double* -// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double* -// CHECK: [[REMOTE_ELT_VAL:%.+]] = load double, double* [[REMOTE_ELT]], align -// CHECK: store double [[REMOTE_ELT_VAL]], double* [[ELT]], align -// CHECK: br label {{%?}}[[COPY_CONT:.+]] -// -// CHECK: [[COPY_ELSE]] -// CHECK: br label {{%?}}[[COPY_CONT]] -// -// CHECK: [[COPY_CONT]] -// CHECK: void -// // Inter warp copy function -// CHECK: define internal void [[WARP_COPY_FN]](i8* %0, i32 %1) -// CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31 -// CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5 -// CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]* -// CHECK: store i32 0, i32* [[CNT_ADDR:%.+]], -// CHECK: br label -// CHECK: [[CNT:%.+]] = load i32, i32* [[CNT_ADDR]], -// CHECK: [[DONE_COPY:%.+]] = icmp ult i32 [[CNT]], 2 -// CHECK: br i1 [[DONE_COPY]], label -// CHECK: call void @__kmpc_barrier(%struct.ident_t* @ -// CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 -// CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] -// // [[DO_COPY]] -// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 -// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], -// CHECK: [[BASE_ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* -// CHECK: [[ELT:%.+]] = getelementptr i32, i32* [[BASE_ELT]], i32 [[CNT]] -// -// CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] -// CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], -// CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], -// CHECK: br label {{%?}}[[COPY_CONT:.+]] -// -// CHECK: [[COPY_ELSE]] -// CHECK: br label {{%?}}[[COPY_CONT]] -// // Barrier after copy to shared memory storage medium. -// CHECK: [[COPY_CONT]] -// CHECK: call void @__kmpc_barrier(%struct.ident_t* @ -// CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32* -// // Read into warp 0. -// CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] -// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] -// -// CHECK: [[DO_READ]] -// CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] -// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 -// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], -// CHECK: [[ELT_BASE:%.+]] = bitcast i8* [[ELT_VOID]] to i32* -// CHECK: [[ELT:%.+]] = getelementptr i32, i32* [[ELT_BASE]], i32 [[CNT]] -// CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], -// CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], -// CHECK: br label {{%?}}[[READ_CONT:.+]] -// -// CHECK: [[READ_ELSE]] -// CHECK: br label {{%?}}[[READ_CONT]] -// -// CHECK: [[READ_CONT]] -// CHECK: [[NEXT:%.+]] = add nsw i32 [[CNT]], 1 -// CHECK: store i32 [[NEXT]], i32* [[CNT_ADDR]], -// CHECK: br label -// CHECK: ret -// CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l32}}( -// -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call void @__kmpc_target_deinit({{.*}}, i1 true, i1 true) -// -// // define internal void [[PFN1]]( -// CHECK: store float {{1\.[0e\+]+}}, float* [[D:%.+]], align -// CHECK: [[C_VAL:%.+]] = load i8, i8* [[C:%.+]], align -// CHECK: [[CONV:%.+]] = sext i8 [[C_VAL]] to i32 -// CHECK: [[XOR:%.+]] = xor i32 [[CONV]], 2 -// CHECK: [[TRUNC:%.+]] = trunc i32 [[XOR]] to i8 -// CHECK: store i8 [[TRUNC]], i8* [[C]], align -// CHECK: [[DV:%.+]] = load float, float* [[D]], align -// CHECK: [[MUL:%.+]] = fmul float [[DV]], {{[0-9e\.\+]+}} -// CHECK: store float [[MUL]], float* [[D]], align -// CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [2 x i8*]* [[RL:%.+]], i{{32|64}} 0, i{{32|64}} 0 -// CHECK: store i8* [[C]], i8** [[PTR1]], align -// CHECK: [[PTR2:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RL]], i{{32|64}} 0, i{{32|64}} 1 -// CHECK: [[D_CAST:%.+]] = bitcast float* [[D]] to i8* -// CHECK: store i8* [[D_CAST]], i8** [[PTR2]], align -// CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8* -// CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @{{.+}}, i32 {{.+}}, i32 2, i{{32|64}} {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]]) -// CHECK: [[CMP:%.+]] = icmp eq i32 [[RET]], 1 -// CHECK: br i1 [[CMP]], label -// CHECK: [[C_INV8:%.+]] = load i8, i8* [[C_IN:%.+]], align -// CHECK: [[C_INV:%.+]] = sext i8 [[C_INV8]] to i32 -// CHECK: [[CV8:%.+]] = load i8, i8* [[C]], align -// CHECK: [[CV:%.+]] = sext i8 [[CV8]] to i32 -// CHECK: [[XOR:%.+]] = xor i32 [[C_INV]], [[CV]] -// CHECK: [[TRUNC:%.+]] = trunc i32 [[XOR]] to i8 -// CHECK: store i8 [[TRUNC]], i8* [[C_IN]], align -// CHECK: [[D_INV:%.+]] = load float, float* [[D_IN:%.+]], align -// CHECK: [[DV:%.+]] = load float, float* [[D]], align -// CHECK: [[MUL:%.+]] = fmul float [[D_INV]], [[DV]] -// CHECK: store float [[MUL]], float* [[D_IN]], align -// CHECK: call void @__kmpc_nvptx_end_reduce_nowait( -// CHECK: br label -// -// CHECK: ret -// // Reduction function -// CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8* %0, i8* %1) -// CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0 -// CHECK: [[VAR1_RHS:%.+]] = load i8*, i8** [[VAR1_RHS_REF]], -// -// CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0 -// CHECK: [[VAR1_LHS:%.+]] = load i8*, i8** [[VAR1_LHS_REF]], -// -// CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS]], i{{32|64}} 0, i{{32|64}} 1 -// CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]], -// CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to float* -// -// CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS]], i{{32|64}} 0, i{{32|64}} 1 -// CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]], -// CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to float* -// -// CHECK: [[VAR1_LHS_VAL8:%.+]] = load i8, i8* [[VAR1_LHS]], -// CHECK: [[VAR1_LHS_VAL:%.+]] = sext i8 [[VAR1_LHS_VAL8]] to i32 -// CHECK: [[VAR1_RHS_VAL8:%.+]] = load i8, i8* [[VAR1_RHS]], -// CHECK: [[VAR1_RHS_VAL:%.+]] = sext i8 [[VAR1_RHS_VAL8]] to i32 -// CHECK: [[XOR:%.+]] = xor i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]] -// CHECK: [[RES:%.+]] = trunc i32 [[XOR]] to i8 -// CHECK: store i8 [[RES]], i8* [[VAR1_LHS]], -// -// CHECK: [[VAR2_LHS_VAL:%.+]] = load float, float* [[VAR2_LHS]], -// CHECK: [[VAR2_RHS_VAL:%.+]] = load float, float* [[VAR2_RHS]], -// CHECK: [[RES:%.+]] = fmul float [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]] -// CHECK: store float [[RES]], float* [[VAR2_LHS]], -// CHECK: ret void -// // Shuffle and reduce function -// CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8* %0, i16 {{.*}}, i16 {{.*}}, i16 {{.*}}) -// CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align -// CHECK: [[REMOTE_ELT1:%.+]] = alloca i8 -// CHECK: [[REMOTE_ELT2:%.+]] = alloca float -// -// CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align -// CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align -// CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align -// -// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 -// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], -// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 -// CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align -// -// CHECK: [[ELT_CAST:%.+]] = sext i8 [[ELT_VAL]] to i32 -// CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16 -// CHECK: [[REMOTE_ELT1_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]]) -// CHECK: [[REMOTE_ELT1_VAL:%.+]] = trunc i32 [[REMOTE_ELT1_VAL32]] to i8 -// -// CHECK: store i8 [[REMOTE_ELT1_VAL]], i8* [[REMOTE_ELT1]], align -// CHECK: store i8* [[REMOTE_ELT1]], i8** [[REMOTE_ELT_REF]], align -// -// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 -// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], -// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 -// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float* -// -// CHECK: [[ELT_CAST:%.+]] = bitcast float* [[ELT]] to i32* -// CHECK: [[REMOTE_ELT2_CAST:%.+]] = bitcast float* [[REMOTE_ELT2]] to i32* -// CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT_CAST]], align -// CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16 -// CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]]) -// -// CHECK: store i32 [[REMOTE_ELT2_VAL32]], i32* [[REMOTE_ELT2_CAST]], align -// CHECK: [[REMOTE_ELT2C:%.+]] = bitcast float* [[REMOTE_ELT2]] to i8* -// CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align -// // Condition to reduce -// CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0 -// -// CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 -// CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]] -// CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]] -// -// CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2 -// CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1 -// CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0 -// CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]] -// CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0 -// CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]] -// -// CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]] -// CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]] -// CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]] -// -// CHECK: [[DO_REDUCE]] -// CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8* -// CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8* -// CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]]) -// CHECK: br label {{%?}}[[REDUCE_CONT:.+]] -// -// CHECK: [[REDUCE_ELSE]] -// CHECK: br label {{%?}}[[REDUCE_CONT]] -// -// CHECK: [[REDUCE_CONT]] // Now check if we should just copy over the remote reduction list -// CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 -// CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]] -// CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]] -// CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] -// -// CHECK: [[DO_COPY]] -// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 -// CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], -// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 -// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], -// CHECK: [[REMOTE_ELT_VAL:%.+]] = load i8, i8* [[REMOTE_ELT_VOID]], align -// CHECK: store i8 [[REMOTE_ELT_VAL]], i8* [[ELT_VOID]], align -// -// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 -// CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], -// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 -// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], -// CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to float* -// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float* -// CHECK: [[REMOTE_ELT_VAL:%.+]] = load float, float* [[REMOTE_ELT]], align -// CHECK: store float [[REMOTE_ELT_VAL]], float* [[ELT]], align -// CHECK: br label {{%?}}[[COPY_CONT:.+]] -// -// CHECK: [[COPY_ELSE]] -// CHECK: br label {{%?}}[[COPY_CONT]] -// -// CHECK: [[COPY_CONT]] -// CHECK: void -// // Inter warp copy function -// CHECK: define internal void [[WARP_COPY_FN]](i8* %0, i32 %1) -// CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31 -// CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5 -// CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]* -// CHECK: call void @__kmpc_barrier(%struct.ident_t* @ -// CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 -// CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] -// // [[DO_COPY]] -// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 -// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], -// -// CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] -// CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i8 addrspace([[SHARED_ADDRSPACE]])* -// CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align -// CHECK: store volatile i8 [[ELT_VAL]], i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align -// CHECK: br label {{%?}}[[COPY_CONT:.+]] -// -// CHECK: [[COPY_ELSE]] -// CHECK: br label {{%?}}[[COPY_CONT]] -// // Barrier after copy to shared memory storage medium. -// CHECK: [[COPY_CONT]] -// CHECK: call void @__kmpc_barrier(%struct.ident_t* @ -// CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32* -// // Read into warp 0. -// CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] -// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] -// -// CHECK: [[DO_READ]] -// CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] -// CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i8 addrspace([[SHARED_ADDRSPACE]])* -// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 -// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], -// CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i8, i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align -// CHECK: store i8 [[MEDIUM_ELT_VAL]], i8* [[ELT_VOID]], align -// CHECK: br label {{%?}}[[READ_CONT:.+]] -// -// CHECK: [[READ_ELSE]] -// CHECK: br label {{%?}}[[READ_CONT]] -// -// CHECK: [[READ_CONT]] -// CHECK: call void @__kmpc_barrier(%struct.ident_t* @ -// CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 -// CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] -// // [[DO_COPY]] -// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 -// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], -// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* -// -// CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] -// CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align -// CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align -// CHECK: br label {{%?}}[[COPY_CONT:.+]] -// -// CHECK: [[COPY_ELSE]] -// CHECK: br label {{%?}}[[COPY_CONT]] -// // Barrier after copy to shared memory storage medium. -// CHECK: [[COPY_CONT]] -// CHECK: call void @__kmpc_barrier(%struct.ident_t* @ -// CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32* -// // Read into warp 0. -// CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] -// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] -// -// CHECK: [[DO_READ]] -// CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] -// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1 -// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], -// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* -// CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align -// CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align -// CHECK: br label {{%?}}[[READ_CONT:.+]] -// -// CHECK: [[READ_ELSE]] -// CHECK: br label {{%?}}[[READ_CONT]] -// -// CHECK: [[READ_CONT]] -// CHECK: ret -// CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l38}}( -// -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 true) -// CHECK: call void @__kmpc_target_deinit({{.*}}, i1 true, i1 true) -// -// // define internal void [[PFN2]]( -// CHECK: store i32 0, i32* [[A:%.+]], align -// CHECK: store i16 -32768, i16* [[B:%.+]], align -// CHECK: [[A_VAL:%.+]] = load i32, i32* [[A:%.+]], align -// CHECK: [[OR:%.+]] = or i32 [[A_VAL]], 1 -// CHECK: store i32 [[OR]], i32* [[A]], align -// CHECK: [[BV16:%.+]] = load i16, i16* [[B]], align -// CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32 -// CHECK: [[CMP:%.+]] = icmp sgt i32 99, [[BV]] -// CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]] -// -// CHECK: [[DO_MAX]] -// CHECK: br label {{%?}}[[MAX_CONT:.+]] -// -// CHECK: [[MAX_ELSE]] -// CHECK: [[BV:%.+]] = load i16, i16* [[B]], align -// CHECK: [[MAX:%.+]] = sext i16 [[BV]] to i32 -// CHECK: br label {{%?}}[[MAX_CONT]] -// -// CHECK: [[MAX_CONT]] -// CHECK: [[B_LVALUE:%.+]] = phi i32 [ 99, %[[DO_MAX]] ], [ [[MAX]], %[[MAX_ELSE]] ] -// CHECK: [[TRUNC:%.+]] = trunc i32 [[B_LVALUE]] to i16 -// CHECK: store i16 [[TRUNC]], i16* [[B]], align -// CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [2 x i8*]* [[RL:%.+]], i{{32|64}} 0, i{{32|64}} 0 -// CHECK: [[A_CAST:%.+]] = bitcast i32* [[A]] to i8* -// CHECK: store i8* [[A_CAST]], i8** [[PTR1]], align -// CHECK: [[PTR2:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RL]], i{{32|64}} 0, i{{32|64}} 1 -// CHECK: [[B_CAST:%.+]] = bitcast i16* [[B]] to i8* -// CHECK: store i8* [[B_CAST]], i8** [[PTR2]], align -// CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8* -// CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @{{.+}}, i32 {{.+}}, i32 2, i{{32|64}} {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]]) -// CHECK: [[CMP:%.+]] = icmp eq i32 [[RET]], 1 -// CHECK: br i1 [[CMP]], label -// CHECK: [[A_INV:%.+]] = load i32, i32* [[A_IN:%.+]], align -// CHECK: [[AV:%.+]] = load i32, i32* [[A]], align -// CHECK: [[OR:%.+]] = or i32 [[A_INV]], [[AV]] -// CHECK: store i32 [[OR]], i32* [[A_IN]], align -// CHECK: [[B_INV16:%.+]] = load i16, i16* [[B_IN:%.+]], align -// CHECK: [[B_INV:%.+]] = sext i16 [[B_INV16]] to i32 -// CHECK: [[BV16:%.+]] = load i16, i16* [[B]], align -// CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32 -// CHECK: [[CMP:%.+]] = icmp sgt i32 [[B_INV]], [[BV]] -// CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]] -// -// CHECK: [[DO_MAX]] -// CHECK: [[MAX1:%.+]] = load i16, i16* [[B_IN]], align -// CHECK: br label {{%?}}[[MAX_CONT:.+]] -// -// CHECK: [[MAX_ELSE]] -// CHECK: [[MAX2:%.+]] = load i16, i16* [[B]], align -// CHECK: br label {{%?}}[[MAX_CONT]] -// -// CHECK: [[MAX_CONT]] -// CHECK: [[B_MAX:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ] -// CHECK: store i16 [[B_MAX]], i16* [[B_IN]], align -// CHECK: call void @__kmpc_nvptx_end_reduce_nowait( -// CHECK: br label -// -// CHECK: ret -// // Reduction function -// CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8* %0, i8* %1) -// CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i{{32|64}} 0, i{{32|64}} 0 -// CHECK: [[VAR1_RHS_VOID:%.+]] = load i8*, i8** [[VAR1_RHS_REF]], -// CHECK: [[VAR1_RHS:%.+]] = bitcast i8* [[VAR1_RHS_VOID]] to i32* -// -// CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i{{32|64}} 0, i{{32|64}} 0 -// CHECK: [[VAR1_LHS_VOID:%.+]] = load i8*, i8** [[VAR1_LHS_REF]], -// CHECK: [[VAR1_LHS:%.+]] = bitcast i8* [[VAR1_LHS_VOID]] to i32* -// -// CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS]], i{{32|64}} 0, i{{32|64}} 1 -// CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]], -// CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to i16* -// -// CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS]], i{{32|64}} 0, i{{32|64}} 1 -// CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]], -// CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to i16* -// -// CHECK: [[VAR1_LHS_VAL:%.+]] = load i32, i32* [[VAR1_LHS]], -// CHECK: [[VAR1_RHS_VAL:%.+]] = load i32, i32* [[VAR1_RHS]], -// CHECK: [[OR:%.+]] = or i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]] -// CHECK: store i32 [[OR]], i32* [[VAR1_LHS]], -// -// CHECK: [[VAR2_LHS_VAL16:%.+]] = load i16, i16* [[VAR2_LHS]], -// CHECK: [[VAR2_LHS_VAL:%.+]] = sext i16 [[VAR2_LHS_VAL16]] to i32 -// CHECK: [[VAR2_RHS_VAL16:%.+]] = load i16, i16* [[VAR2_RHS]], -// CHECK: [[VAR2_RHS_VAL:%.+]] = sext i16 [[VAR2_RHS_VAL16]] to i32 -// -// CHECK: [[CMP:%.+]] = icmp sgt i32 [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]] -// CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]] -// -// CHECK: [[DO_MAX]] -// CHECK: [[MAX1:%.+]] = load i16, i16* [[VAR2_LHS]], align -// CHECK: br label {{%?}}[[MAX_CONT:.+]] -// -// CHECK: [[MAX_ELSE]] -// CHECK: [[MAX2:%.+]] = load i16, i16* [[VAR2_RHS]], align -// CHECK: br label {{%?}}[[MAX_CONT]] -// -// CHECK: [[MAX_CONT]] -// CHECK: [[MAXV:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ] -// CHECK: store i16 [[MAXV]], i16* [[VAR2_LHS]], -// CHECK: ret void -// // Shuffle and reduce function -// CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8* %0, i16 {{.*}}, i16 {{.*}}, i16 {{.*}}) -// CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align -// CHECK: [[REMOTE_ELT1:%.+]] = alloca i32 -// CHECK: [[REMOTE_ELT2:%.+]] = alloca i16 -// -// CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align -// CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align -// CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align -// -// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 -// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], -// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 -// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* -// CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align -// -// CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16 -// CHECK: [[REMOTE_ELT1_VAL:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]]) -// -// CHECK: store i32 [[REMOTE_ELT1_VAL]], i32* [[REMOTE_ELT1]], align -// CHECK: [[REMOTE_ELT1C:%.+]] = bitcast i32* [[REMOTE_ELT1]] to i8* -// CHECK: store i8* [[REMOTE_ELT1C]], i8** [[REMOTE_ELT_REF]], align -// -// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 -// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], -// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 -// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16* -// CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align -// -// CHECK: [[ELT_CAST:%.+]] = sext i16 [[ELT_VAL]] to i32 -// CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() -// CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16 -// CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]]) -// CHECK: [[REMOTE_ELT2_VAL:%.+]] = trunc i32 [[REMOTE_ELT2_VAL32]] to i16 -// -// CHECK: store i16 [[REMOTE_ELT2_VAL]], i16* [[REMOTE_ELT2]], align -// CHECK: [[REMOTE_ELT2C:%.+]] = bitcast i16* [[REMOTE_ELT2]] to i8* -// CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align -// // Condition to reduce -// CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0 -// -// CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 -// CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]] -// CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]] -// -// CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2 -// CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1 -// CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0 -// CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]] -// CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0 -// CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]] -// -// CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]] -// CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]] -// CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]] -// -// CHECK: [[DO_REDUCE]] -// CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8* -// CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8* -// CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]]) -// CHECK: br label {{%?}}[[REDUCE_CONT:.+]] -// -// CHECK: [[REDUCE_ELSE]] -// CHECK: br label {{%?}}[[REDUCE_CONT]] -// -// CHECK: [[REDUCE_CONT]] // Now check if we should just copy over the remote reduction list -// CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 -// CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]] -// CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]] -// CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] -// -// CHECK: [[DO_COPY]] -// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 -// CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], -// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 -// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], -// CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i32* -// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* -// CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[REMOTE_ELT]], align -// CHECK: store i32 [[REMOTE_ELT_VAL]], i32* [[ELT]], align -// -// CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 -// CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], -// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 -// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], -// CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i16* -// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16* -// CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[REMOTE_ELT]], align -// CHECK: store i16 [[REMOTE_ELT_VAL]], i16* [[ELT]], align -// CHECK: br label {{%?}}[[COPY_CONT:.+]] -// -// CHECK: [[COPY_ELSE]] -// CHECK: br label {{%?}}[[COPY_CONT]] -// -// CHECK: [[COPY_CONT]] -// CHECK: void -// // Inter warp copy function -// CHECK: define internal void [[WARP_COPY_FN]](i8* %0, i32 %1) -// CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31 -// CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5 -// CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]* -// CHECK: call void @__kmpc_barrier(%struct.ident_t* @ -// CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 -// CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] -// // [[DO_COPY]] -// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 0 -// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], -// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* -// -// CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] -// CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align -// CHECK: store volatile i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align -// CHECK: br label {{%?}}[[COPY_CONT:.+]] -// -// CHECK: [[COPY_ELSE]] -// CHECK: br label {{%?}}[[COPY_CONT]] -// // Barrier after copy to shared memory storage medium. -// CHECK: [[COPY_CONT]] -// CHECK: call void @__kmpc_barrier(%struct.ident_t* @ -// CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32* -// // Read into warp 0. -// CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] -// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] -// -// CHECK: [[DO_READ]] -// CHECK: [[MEDIUM_ELT:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] -// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 0 -// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], -// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* -// CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align -// CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align -// CHECK: br label {{%?}}[[READ_CONT:.+]] -// -// CHECK: [[READ_ELSE]] -// CHECK: br label {{%?}}[[READ_CONT]] -// -// CHECK: [[READ_CONT]] -// CHECK: call void @__kmpc_barrier(%struct.ident_t* @ -// CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 -// CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] -// // [[DO_COPY]] -// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i{{32|64}} 0, i{{32|64}} 1 -// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], -// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16* -// -// CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] -// CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])* -// CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align -// CHECK: store volatile i16 [[ELT_VAL]], i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align -// CHECK: br label {{%?}}[[COPY_CONT:.+]] -// -// CHECK: [[COPY_ELSE]] -// CHECK: br label {{%?}}[[COPY_CONT]] -// // Barrier after copy to shared memory storage medium. -// CHECK: [[COPY_CONT]] -// CHECK: call void @__kmpc_barrier(%struct.ident_t* @ -// CHECK: [[ACTIVE_WARPS:%.+]] = load i32, i32* -// // Read into warp 0. -// CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] -// CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] -// -// CHECK: [[DO_READ]] -// CHECK: [[MEDIUM_ELT32:%.+]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] -// CHECK: [[MEDIUM_ELT:%.+]] = bitcast i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT32]] to i16 addrspace([[SHARED_ADDRSPACE]])* -// CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i{{32|64}} 0, i{{32|64}} 1 -// CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], -// CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16* -// CHECK: [[MEDIUM_ELT_VAL:%.+]] = load volatile i16, i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align -// CHECK: store i16 [[MEDIUM_ELT_VAL]], i16* [[ELT]], align -// CHECK: br label {{%?}}[[READ_CONT:.+]] -// -// CHECK: [[READ_ELSE]] -// CHECK: br label {{%?}}[[READ_CONT]] -// -// CHECK: [[READ_CONT]] -// CHECK: ret #endif +// +// +// +// +// +// +// +// +// +// +// +// +// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l27 +// CHECK-64-SAME: (double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-64-NEXT: entry: +// CHECK-64-NEXT: [[E_ADDR:%.*]] = alloca double*, align 8 +// CHECK-64-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 +// CHECK-64-NEXT: store double* [[E]], double** [[E_ADDR]], align 8 +// CHECK-64-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 8 +// CHECK-64-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 2, i1 false, i1 true) +// CHECK-64-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 +// CHECK-64-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +// CHECK-64: user_code.entry: +// CHECK-64-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) +// CHECK-64-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK-64-NEXT: [[TMP4:%.*]] = bitcast double* [[TMP0]] to i8* +// CHECK-64-NEXT: store i8* [[TMP4]], i8** [[TMP3]], align 8 +// CHECK-64-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK-64-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, double*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP5]], i64 1) +// CHECK-64-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) +// CHECK-64-NEXT: ret void +// CHECK-64: worker.exit: +// CHECK-64-NEXT: ret void +// +// +// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK-64-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1:[0-9]+]] { +// CHECK-64-NEXT: entry: +// CHECK-64-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK-64-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK-64-NEXT: [[E_ADDR:%.*]] = alloca double*, align 8 +// CHECK-64-NEXT: [[E1:%.*]] = alloca double, align 8 +// CHECK-64-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 8 +// CHECK-64-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK-64-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK-64-NEXT: store double* [[E]], double** [[E_ADDR]], align 8 +// CHECK-64-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 8 +// CHECK-64-NEXT: store double 0.000000e+00, double* [[E1]], align 8 +// CHECK-64-NEXT: [[TMP1:%.*]] = load double, double* [[E1]], align 8 +// CHECK-64-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], 5.000000e+00 +// CHECK-64-NEXT: store double [[ADD]], double* [[E1]], align 8 +// CHECK-64-NEXT: [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK-64-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 +// CHECK-64-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 +// CHECK-64-NEXT: [[TMP5:%.*]] = bitcast double* [[E1]] to i8* +// CHECK-64-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 8 +// CHECK-64-NEXT: [[TMP6:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK-64-NEXT: [[TMP7:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]], i32 1, i64 8, i8* [[TMP6]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func) +// CHECK-64-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 1 +// CHECK-64-NEXT: br i1 [[TMP8]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK-64: .omp.reduction.then: +// CHECK-64-NEXT: [[TMP9:%.*]] = load double, double* [[TMP0]], align 8 +// CHECK-64-NEXT: [[TMP10:%.*]] = load double, double* [[E1]], align 8 +// CHECK-64-NEXT: [[ADD2:%.*]] = fadd double [[TMP9]], [[TMP10]] +// CHECK-64-NEXT: store double [[ADD2]], double* [[TMP0]], align 8 +// CHECK-64-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP3]]) +// CHECK-64-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK-64: .omp.reduction.done: +// CHECK-64-NEXT: ret void +// +// +// CHECK-64-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func +// CHECK-64-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK-64-NEXT: entry: +// CHECK-64-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 +// CHECK-64-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK-64-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK-64-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK-64-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 8 +// CHECK-64-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8 +// CHECK-64-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 +// CHECK-64-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK-64-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK-64-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK-64-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK-64-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]* +// CHECK-64-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK-64-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK-64-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK-64-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0 +// CHECK-64-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8 +// CHECK-64-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 +// CHECK-64-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to double* +// CHECK-64-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[TMP12]], i64 1 +// CHECK-64-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8* +// CHECK-64-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP12]] to i64* +// CHECK-64-NEXT: [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64* +// CHECK-64-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8 +// CHECK-64-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK-64-NEXT: [[TMP18:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK-64-NEXT: [[TMP19:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP18]]) +// CHECK-64-NEXT: store i64 [[TMP19]], i64* [[TMP16]], align 8 +// CHECK-64-NEXT: [[TMP20:%.*]] = getelementptr i64, i64* [[TMP15]], i64 1 +// CHECK-64-NEXT: [[TMP21:%.*]] = getelementptr i64, i64* [[TMP16]], i64 1 +// CHECK-64-NEXT: [[TMP22:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8* +// CHECK-64-NEXT: store i8* [[TMP22]], i8** [[TMP11]], align 8 +// CHECK-64-NEXT: [[TMP23:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK-64-NEXT: [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK-64-NEXT: [[TMP25:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK-64-NEXT: [[TMP26:%.*]] = and i1 [[TMP24]], [[TMP25]] +// CHECK-64-NEXT: [[TMP27:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK-64-NEXT: [[TMP28:%.*]] = and i16 [[TMP6]], 1 +// CHECK-64-NEXT: [[TMP29:%.*]] = icmp eq i16 [[TMP28]], 0 +// CHECK-64-NEXT: [[TMP30:%.*]] = and i1 [[TMP27]], [[TMP29]] +// CHECK-64-NEXT: [[TMP31:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK-64-NEXT: [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]] +// CHECK-64-NEXT: [[TMP33:%.*]] = or i1 [[TMP23]], [[TMP26]] +// CHECK-64-NEXT: [[TMP34:%.*]] = or i1 [[TMP33]], [[TMP32]] +// CHECK-64-NEXT: br i1 [[TMP34]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK-64: then: +// CHECK-64-NEXT: [[TMP35:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8* +// CHECK-64-NEXT: [[TMP36:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK-64-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP35]], i8* [[TMP36]]) #[[ATTR4:[0-9]+]] +// CHECK-64-NEXT: br label [[IFCONT:%.*]] +// CHECK-64: else: +// CHECK-64-NEXT: br label [[IFCONT]] +// CHECK-64: ifcont: +// CHECK-64-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK-64-NEXT: [[TMP38:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK-64-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]] +// CHECK-64-NEXT: br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK-64: then4: +// CHECK-64-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 +// CHECK-64-NEXT: [[TMP41:%.*]] = load i8*, i8** [[TMP40]], align 8 +// CHECK-64-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i64 0, i64 0 +// CHECK-64-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 8 +// CHECK-64-NEXT: [[TMP44:%.*]] = bitcast i8* [[TMP41]] to double* +// CHECK-64-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP43]] to double* +// CHECK-64-NEXT: [[TMP46:%.*]] = load double, double* [[TMP44]], align 8 +// CHECK-64-NEXT: store double [[TMP46]], double* [[TMP45]], align 8 +// CHECK-64-NEXT: br label [[IFCONT6:%.*]] +// CHECK-64: else5: +// CHECK-64-NEXT: br label [[IFCONT6]] +// CHECK-64: ifcont6: +// CHECK-64-NEXT: ret void +// +// +// CHECK-64-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func +// CHECK-64-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { +// CHECK-64-NEXT: entry: +// CHECK-64-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 +// CHECK-64-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK-64-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4 +// CHECK-64-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK-64-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 +// CHECK-64-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK-64-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK-64-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK-64-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK-64-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK-64-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK-64-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK-64-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* +// CHECK-64-NEXT: store i32 0, i32* [[DOTCNT_ADDR]], align 4 +// CHECK-64-NEXT: br label [[PRECOND:%.*]] +// CHECK-64: precond: +// CHECK-64-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4 +// CHECK-64-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP5]], 2 +// CHECK-64-NEXT: br i1 [[TMP6]], label [[BODY:%.*]], label [[EXIT:%.*]] +// CHECK-64: body: +// CHECK-64-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3:[0-9]+]], i32 [[TMP2]]) +// CHECK-64-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK-64-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK-64: then: +// CHECK-64-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0 +// CHECK-64-NEXT: [[TMP8:%.*]] = load i8*, i8** [[TMP7]], align 8 +// CHECK-64-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32* +// CHECK-64-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 [[TMP5]] +// CHECK-64-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK-64-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK-64-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4 +// CHECK-64-NEXT: br label [[IFCONT:%.*]] +// CHECK-64: else: +// CHECK-64-NEXT: br label [[IFCONT]] +// CHECK-64: ifcont: +// CHECK-64-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]]) +// CHECK-64-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK-64-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP13]] +// CHECK-64-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK-64: then4: +// CHECK-64-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK-64-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i64 0, i64 0 +// CHECK-64-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 8 +// CHECK-64-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32* +// CHECK-64-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP17]], i32 [[TMP5]] +// CHECK-64-NEXT: [[TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4 +// CHECK-64-NEXT: store i32 [[TMP19]], i32* [[TMP18]], align 4 +// CHECK-64-NEXT: br label [[IFCONT6:%.*]] +// CHECK-64: else5: +// CHECK-64-NEXT: br label [[IFCONT6]] +// CHECK-64: ifcont6: +// CHECK-64-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP5]], 1 +// CHECK-64-NEXT: store i32 [[TMP20]], i32* [[DOTCNT_ADDR]], align 4 +// CHECK-64-NEXT: br label [[PRECOND]] +// CHECK-64: exit: +// CHECK-64-NEXT: ret void +// +// +// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l32 +// CHECK-64-SAME: (i8* nonnull align 1 dereferenceable(1) [[C:%.*]], float* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR0]] { +// CHECK-64-NEXT: entry: +// CHECK-64-NEXT: [[C_ADDR:%.*]] = alloca i8*, align 8 +// CHECK-64-NEXT: [[D_ADDR:%.*]] = alloca float*, align 8 +// CHECK-64-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8 +// CHECK-64-NEXT: store i8* [[C]], i8** [[C_ADDR]], align 8 +// CHECK-64-NEXT: store float* [[D]], float** [[D_ADDR]], align 8 +// CHECK-64-NEXT: [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 8 +// CHECK-64-NEXT: [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 8 +// CHECK-64-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 true) +// CHECK-64-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP2]], -1 +// CHECK-64-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +// CHECK-64: user_code.entry: +// CHECK-64-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK-64-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK-64-NEXT: store i8* [[TMP0]], i8** [[TMP4]], align 8 +// CHECK-64-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK-64-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP1]] to i8* +// CHECK-64-NEXT: store i8* [[TMP6]], i8** [[TMP5]], align 8 +// CHECK-64-NEXT: [[TMP7:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK-64-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i8*, float*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP7]], i64 2) +// CHECK-64-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) +// CHECK-64-NEXT: ret void +// CHECK-64: worker.exit: +// CHECK-64-NEXT: ret void +// +// +// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK-64-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8* nonnull align 1 dereferenceable(1) [[C:%.*]], float* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] { +// CHECK-64-NEXT: entry: +// CHECK-64-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK-64-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK-64-NEXT: [[C_ADDR:%.*]] = alloca i8*, align 8 +// CHECK-64-NEXT: [[D_ADDR:%.*]] = alloca float*, align 8 +// CHECK-64-NEXT: [[C1:%.*]] = alloca i8, align 1 +// CHECK-64-NEXT: [[D2:%.*]] = alloca float, align 4 +// CHECK-64-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8 +// CHECK-64-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK-64-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK-64-NEXT: store i8* [[C]], i8** [[C_ADDR]], align 8 +// CHECK-64-NEXT: store float* [[D]], float** [[D_ADDR]], align 8 +// CHECK-64-NEXT: [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 8 +// CHECK-64-NEXT: [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 8 +// CHECK-64-NEXT: store i8 0, i8* [[C1]], align 1 +// CHECK-64-NEXT: store float 1.000000e+00, float* [[D2]], align 4 +// CHECK-64-NEXT: [[TMP2:%.*]] = load i8, i8* [[C1]], align 1 +// CHECK-64-NEXT: [[CONV:%.*]] = sext i8 [[TMP2]] to i32 +// CHECK-64-NEXT: [[XOR:%.*]] = xor i32 [[CONV]], 2 +// CHECK-64-NEXT: [[CONV3:%.*]] = trunc i32 [[XOR]] to i8 +// CHECK-64-NEXT: store i8 [[CONV3]], i8* [[C1]], align 1 +// CHECK-64-NEXT: [[TMP3:%.*]] = load float, float* [[D2]], align 4 +// CHECK-64-NEXT: [[MUL:%.*]] = fmul float [[TMP3]], 3.300000e+01 +// CHECK-64-NEXT: store float [[MUL]], float* [[D2]], align 4 +// CHECK-64-NEXT: [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK-64-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4 +// CHECK-64-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 +// CHECK-64-NEXT: store i8* [[C1]], i8** [[TMP6]], align 8 +// CHECK-64-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1 +// CHECK-64-NEXT: [[TMP8:%.*]] = bitcast float* [[D2]] to i8* +// CHECK-64-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 8 +// CHECK-64-NEXT: [[TMP9:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK-64-NEXT: [[TMP10:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB2]], i32 [[TMP5]], i32 2, i64 16, i8* [[TMP9]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func3, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func4) +// CHECK-64-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 1 +// CHECK-64-NEXT: br i1 [[TMP11]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK-64: .omp.reduction.then: +// CHECK-64-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP0]], align 1 +// CHECK-64-NEXT: [[CONV4:%.*]] = sext i8 [[TMP12]] to i32 +// CHECK-64-NEXT: [[TMP13:%.*]] = load i8, i8* [[C1]], align 1 +// CHECK-64-NEXT: [[CONV5:%.*]] = sext i8 [[TMP13]] to i32 +// CHECK-64-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]] +// CHECK-64-NEXT: [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8 +// CHECK-64-NEXT: store i8 [[CONV7]], i8* [[TMP0]], align 1 +// CHECK-64-NEXT: [[TMP14:%.*]] = load float, float* [[TMP1]], align 4 +// CHECK-64-NEXT: [[TMP15:%.*]] = load float, float* [[D2]], align 4 +// CHECK-64-NEXT: [[MUL8:%.*]] = fmul float [[TMP14]], [[TMP15]] +// CHECK-64-NEXT: store float [[MUL8]], float* [[TMP1]], align 4 +// CHECK-64-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]]) +// CHECK-64-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK-64: .omp.reduction.done: +// CHECK-64-NEXT: ret void +// +// +// CHECK-64-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func3 +// CHECK-64-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR2]] { +// CHECK-64-NEXT: entry: +// CHECK-64-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 +// CHECK-64-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK-64-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK-64-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK-64-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 8 +// CHECK-64-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1 +// CHECK-64-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4 +// CHECK-64-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 +// CHECK-64-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK-64-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK-64-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK-64-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK-64-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* +// CHECK-64-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK-64-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK-64-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK-64-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0 +// CHECK-64-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8 +// CHECK-64-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 +// CHECK-64-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[TMP10]], i64 1 +// CHECK-64-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP10]], align 1 +// CHECK-64-NEXT: [[TMP14:%.*]] = sext i8 [[TMP13]] to i32 +// CHECK-64-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK-64-NEXT: [[TMP15:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK-64-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP14]], i16 [[TMP7]], i16 [[TMP15]]) +// CHECK-64-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8 +// CHECK-64-NEXT: store i8 [[TMP17]], i8* [[DOTOMP_REDUCTION_ELEMENT]], align 1 +// CHECK-64-NEXT: [[TMP18:%.*]] = getelementptr i8, i8* [[TMP10]], i64 1 +// CHECK-64-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i64 1 +// CHECK-64-NEXT: store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 8 +// CHECK-64-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1 +// CHECK-64-NEXT: [[TMP21:%.*]] = load i8*, i8** [[TMP20]], align 8 +// CHECK-64-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1 +// CHECK-64-NEXT: [[TMP23:%.*]] = bitcast i8* [[TMP21]] to float* +// CHECK-64-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[TMP23]], i64 1 +// CHECK-64-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to i8* +// CHECK-64-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP23]] to i32* +// CHECK-64-NEXT: [[TMP27:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32* +// CHECK-64-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK-64-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK-64-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 +// CHECK-64-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) +// CHECK-64-NEXT: store i32 [[TMP30]], i32* [[TMP27]], align 4 +// CHECK-64-NEXT: [[TMP31:%.*]] = getelementptr i32, i32* [[TMP26]], i64 1 +// CHECK-64-NEXT: [[TMP32:%.*]] = getelementptr i32, i32* [[TMP27]], i64 1 +// CHECK-64-NEXT: [[TMP33:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* +// CHECK-64-NEXT: store i8* [[TMP33]], i8** [[TMP22]], align 8 +// CHECK-64-NEXT: [[TMP34:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK-64-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK-64-NEXT: [[TMP36:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK-64-NEXT: [[TMP37:%.*]] = and i1 [[TMP35]], [[TMP36]] +// CHECK-64-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK-64-NEXT: [[TMP39:%.*]] = and i16 [[TMP6]], 1 +// CHECK-64-NEXT: [[TMP40:%.*]] = icmp eq i16 [[TMP39]], 0 +// CHECK-64-NEXT: [[TMP41:%.*]] = and i1 [[TMP38]], [[TMP40]] +// CHECK-64-NEXT: [[TMP42:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK-64-NEXT: [[TMP43:%.*]] = and i1 [[TMP41]], [[TMP42]] +// CHECK-64-NEXT: [[TMP44:%.*]] = or i1 [[TMP34]], [[TMP37]] +// CHECK-64-NEXT: [[TMP45:%.*]] = or i1 [[TMP44]], [[TMP43]] +// CHECK-64-NEXT: br i1 [[TMP45]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK-64: then: +// CHECK-64-NEXT: [[TMP46:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* +// CHECK-64-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK-64-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP46]], i8* [[TMP47]]) #[[ATTR4]] +// CHECK-64-NEXT: br label [[IFCONT:%.*]] +// CHECK-64: else: +// CHECK-64-NEXT: br label [[IFCONT]] +// CHECK-64: ifcont: +// CHECK-64-NEXT: [[TMP48:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK-64-NEXT: [[TMP49:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK-64-NEXT: [[TMP50:%.*]] = and i1 [[TMP48]], [[TMP49]] +// CHECK-64-NEXT: br i1 [[TMP50]], label [[THEN6:%.*]], label [[ELSE7:%.*]] +// CHECK-64: then6: +// CHECK-64-NEXT: [[TMP51:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 +// CHECK-64-NEXT: [[TMP52:%.*]] = load i8*, i8** [[TMP51]], align 8 +// CHECK-64-NEXT: [[TMP53:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0 +// CHECK-64-NEXT: [[TMP54:%.*]] = load i8*, i8** [[TMP53]], align 8 +// CHECK-64-NEXT: [[TMP55:%.*]] = load i8, i8* [[TMP52]], align 1 +// CHECK-64-NEXT: store i8 [[TMP55]], i8* [[TMP54]], align 1 +// CHECK-64-NEXT: [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1 +// CHECK-64-NEXT: [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 8 +// CHECK-64-NEXT: [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1 +// CHECK-64-NEXT: [[TMP59:%.*]] = load i8*, i8** [[TMP58]], align 8 +// CHECK-64-NEXT: [[TMP60:%.*]] = bitcast i8* [[TMP57]] to float* +// CHECK-64-NEXT: [[TMP61:%.*]] = bitcast i8* [[TMP59]] to float* +// CHECK-64-NEXT: [[TMP62:%.*]] = load float, float* [[TMP60]], align 4 +// CHECK-64-NEXT: store float [[TMP62]], float* [[TMP61]], align 4 +// CHECK-64-NEXT: br label [[IFCONT8:%.*]] +// CHECK-64: else7: +// CHECK-64-NEXT: br label [[IFCONT8]] +// CHECK-64: ifcont8: +// CHECK-64-NEXT: ret void +// +// +// CHECK-64-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func4 +// CHECK-64-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { +// CHECK-64-NEXT: entry: +// CHECK-64-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 +// CHECK-64-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK-64-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK-64-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 +// CHECK-64-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK-64-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK-64-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK-64-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK-64-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK-64-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK-64-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK-64-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK-64-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]]) +// CHECK-64-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK-64-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK-64: then: +// CHECK-64-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0 +// CHECK-64-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 8 +// CHECK-64-NEXT: [[TMP7:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK-64-NEXT: [[TMP8:%.*]] = bitcast i32 addrspace(3)* [[TMP7]] to i8 addrspace(3)* +// CHECK-64-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP6]], align 1 +// CHECK-64-NEXT: store volatile i8 [[TMP9]], i8 addrspace(3)* [[TMP8]], align 1 +// CHECK-64-NEXT: br label [[IFCONT:%.*]] +// CHECK-64: else: +// CHECK-64-NEXT: br label [[IFCONT]] +// CHECK-64: ifcont: +// CHECK-64-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]]) +// CHECK-64-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK-64-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] +// CHECK-64-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK-64: then4: +// CHECK-64-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK-64-NEXT: [[TMP12:%.*]] = bitcast i32 addrspace(3)* [[TMP11]] to i8 addrspace(3)* +// CHECK-64-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0 +// CHECK-64-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 8 +// CHECK-64-NEXT: [[TMP15:%.*]] = load volatile i8, i8 addrspace(3)* [[TMP12]], align 1 +// CHECK-64-NEXT: store i8 [[TMP15]], i8* [[TMP14]], align 1 +// CHECK-64-NEXT: br label [[IFCONT6:%.*]] +// CHECK-64: else5: +// CHECK-64-NEXT: br label [[IFCONT6]] +// CHECK-64: ifcont6: +// CHECK-64-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]]) +// CHECK-64-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK-64-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] +// CHECK-64: then8: +// CHECK-64-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1 +// CHECK-64-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 8 +// CHECK-64-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i32* +// CHECK-64-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK-64-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP18]], align 4 +// CHECK-64-NEXT: store volatile i32 [[TMP20]], i32 addrspace(3)* [[TMP19]], align 4 +// CHECK-64-NEXT: br label [[IFCONT10:%.*]] +// CHECK-64: else9: +// CHECK-64-NEXT: br label [[IFCONT10]] +// CHECK-64: ifcont10: +// CHECK-64-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]]) +// CHECK-64-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK-64-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP21]] +// CHECK-64-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] +// CHECK-64: then12: +// CHECK-64-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK-64-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1 +// CHECK-64-NEXT: [[TMP24:%.*]] = load i8*, i8** [[TMP23]], align 8 +// CHECK-64-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP24]] to i32* +// CHECK-64-NEXT: [[TMP26:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP22]], align 4 +// CHECK-64-NEXT: store i32 [[TMP26]], i32* [[TMP25]], align 4 +// CHECK-64-NEXT: br label [[IFCONT14:%.*]] +// CHECK-64: else13: +// CHECK-64-NEXT: br label [[IFCONT14]] +// CHECK-64: ifcont14: +// CHECK-64-NEXT: ret void +// +// +// CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l38 +// CHECK-64-SAME: (i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR0]] { +// CHECK-64-NEXT: entry: +// CHECK-64-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK-64-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 8 +// CHECK-64-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8 +// CHECK-64-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK-64-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 8 +// CHECK-64-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CHECK-64-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 8 +// CHECK-64-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 true) +// CHECK-64-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP2]], -1 +// CHECK-64-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +// CHECK-64: user_code.entry: +// CHECK-64-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK-64-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 +// CHECK-64-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP0]] to i8* +// CHECK-64-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 8 +// CHECK-64-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1 +// CHECK-64-NEXT: [[TMP7:%.*]] = bitcast i16* [[TMP1]] to i8* +// CHECK-64-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 8 +// CHECK-64-NEXT: [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK-64-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP8]], i64 2) +// CHECK-64-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) +// CHECK-64-NEXT: ret void +// CHECK-64: worker.exit: +// CHECK-64-NEXT: ret void +// +// +// CHECK-64-LABEL: define {{[^@]+}}@__omp_outlined__5 +// CHECK-64-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] { +// CHECK-64-NEXT: entry: +// CHECK-64-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK-64-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8 +// CHECK-64-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8 +// CHECK-64-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 8 +// CHECK-64-NEXT: [[A1:%.*]] = alloca i32, align 4 +// CHECK-64-NEXT: [[B2:%.*]] = alloca i16, align 2 +// CHECK-64-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 8 +// CHECK-64-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK-64-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 +// CHECK-64-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8 +// CHECK-64-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 8 +// CHECK-64-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 +// CHECK-64-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 8 +// CHECK-64-NEXT: store i32 0, i32* [[A1]], align 4 +// CHECK-64-NEXT: store i16 -32768, i16* [[B2]], align 2 +// CHECK-64-NEXT: [[TMP2:%.*]] = load i32, i32* [[A1]], align 4 +// CHECK-64-NEXT: [[OR:%.*]] = or i32 [[TMP2]], 1 +// CHECK-64-NEXT: store i32 [[OR]], i32* [[A1]], align 4 +// CHECK-64-NEXT: [[TMP3:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK-64-NEXT: [[CONV:%.*]] = sext i16 [[TMP3]] to i32 +// CHECK-64-NEXT: [[CMP:%.*]] = icmp sgt i32 99, [[CONV]] +// CHECK-64-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK-64: cond.true: +// CHECK-64-NEXT: br label [[COND_END:%.*]] +// CHECK-64: cond.false: +// CHECK-64-NEXT: [[TMP4:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK-64-NEXT: [[CONV3:%.*]] = sext i16 [[TMP4]] to i32 +// CHECK-64-NEXT: br label [[COND_END]] +// CHECK-64: cond.end: +// CHECK-64-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ] +// CHECK-64-NEXT: [[CONV4:%.*]] = trunc i32 [[COND]] to i16 +// CHECK-64-NEXT: store i16 [[CONV4]], i16* [[B2]], align 2 +// CHECK-64-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK-64-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK-64-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0 +// CHECK-64-NEXT: [[TMP8:%.*]] = bitcast i32* [[A1]] to i8* +// CHECK-64-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 8 +// CHECK-64-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1 +// CHECK-64-NEXT: [[TMP10:%.*]] = bitcast i16* [[B2]] to i8* +// CHECK-64-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 8 +// CHECK-64-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK-64-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB2]], i32 [[TMP6]], i32 2, i64 16, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func7, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func8) +// CHECK-64-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1 +// CHECK-64-NEXT: br i1 [[TMP13]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK-64: .omp.reduction.then: +// CHECK-64-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK-64-NEXT: [[TMP15:%.*]] = load i32, i32* [[A1]], align 4 +// CHECK-64-NEXT: [[OR5:%.*]] = or i32 [[TMP14]], [[TMP15]] +// CHECK-64-NEXT: store i32 [[OR5]], i32* [[TMP0]], align 4 +// CHECK-64-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK-64-NEXT: [[CONV6:%.*]] = sext i16 [[TMP16]] to i32 +// CHECK-64-NEXT: [[TMP17:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK-64-NEXT: [[CONV7:%.*]] = sext i16 [[TMP17]] to i32 +// CHECK-64-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]] +// CHECK-64-NEXT: br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]] +// CHECK-64: cond.true9: +// CHECK-64-NEXT: [[TMP18:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK-64-NEXT: br label [[COND_END11:%.*]] +// CHECK-64: cond.false10: +// CHECK-64-NEXT: [[TMP19:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK-64-NEXT: br label [[COND_END11]] +// CHECK-64: cond.end11: +// CHECK-64-NEXT: [[COND12:%.*]] = phi i16 [ [[TMP18]], [[COND_TRUE9]] ], [ [[TMP19]], [[COND_FALSE10]] ] +// CHECK-64-NEXT: store i16 [[COND12]], i16* [[TMP1]], align 2 +// CHECK-64-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]]) +// CHECK-64-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK-64: .omp.reduction.done: +// CHECK-64-NEXT: ret void +// +// +// CHECK-64-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func7 +// CHECK-64-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR2]] { +// CHECK-64-NEXT: entry: +// CHECK-64-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 +// CHECK-64-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK-64-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK-64-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK-64-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 8 +// CHECK-64-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4 +// CHECK-64-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2 +// CHECK-64-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 +// CHECK-64-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK-64-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK-64-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK-64-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK-64-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* +// CHECK-64-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK-64-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK-64-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK-64-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0 +// CHECK-64-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 8 +// CHECK-64-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 +// CHECK-64-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32* +// CHECK-64-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i64 1 +// CHECK-64-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8* +// CHECK-64-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4 +// CHECK-64-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK-64-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK-64-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]]) +// CHECK-64-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4 +// CHECK-64-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i64 1 +// CHECK-64-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i64 1 +// CHECK-64-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8* +// CHECK-64-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 8 +// CHECK-64-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1 +// CHECK-64-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 8 +// CHECK-64-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1 +// CHECK-64-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16* +// CHECK-64-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i64 1 +// CHECK-64-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8* +// CHECK-64-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2 +// CHECK-64-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32 +// CHECK-64-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK-64-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 +// CHECK-64-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) +// CHECK-64-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16 +// CHECK-64-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2 +// CHECK-64-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i64 1 +// CHECK-64-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i64 1 +// CHECK-64-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* +// CHECK-64-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 8 +// CHECK-64-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK-64-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK-64-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK-64-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]] +// CHECK-64-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK-64-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1 +// CHECK-64-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0 +// CHECK-64-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]] +// CHECK-64-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK-64-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]] +// CHECK-64-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]] +// CHECK-64-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]] +// CHECK-64-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK-64: then: +// CHECK-64-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* +// CHECK-64-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK-64-NEXT: call void @"_omp$reduction$reduction_func6"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR4]] +// CHECK-64-NEXT: br label [[IFCONT:%.*]] +// CHECK-64: else: +// CHECK-64-NEXT: br label [[IFCONT]] +// CHECK-64: ifcont: +// CHECK-64-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK-64-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK-64-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]] +// CHECK-64-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]] +// CHECK-64: then6: +// CHECK-64-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 0 +// CHECK-64-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 8 +// CHECK-64-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 0 +// CHECK-64-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 8 +// CHECK-64-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32* +// CHECK-64-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32* +// CHECK-64-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4 +// CHECK-64-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4 +// CHECK-64-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i64 0, i64 1 +// CHECK-64-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 8 +// CHECK-64-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i64 0, i64 1 +// CHECK-64-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 8 +// CHECK-64-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16* +// CHECK-64-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16* +// CHECK-64-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2 +// CHECK-64-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2 +// CHECK-64-NEXT: br label [[IFCONT8:%.*]] +// CHECK-64: else7: +// CHECK-64-NEXT: br label [[IFCONT8]] +// CHECK-64: ifcont8: +// CHECK-64-NEXT: ret void +// +// +// CHECK-64-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func8 +// CHECK-64-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { +// CHECK-64-NEXT: entry: +// CHECK-64-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 8 +// CHECK-64-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK-64-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK-64-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 8 +// CHECK-64-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK-64-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK-64-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK-64-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK-64-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK-64-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK-64-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 8 +// CHECK-64-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK-64-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]]) +// CHECK-64-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK-64-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK-64: then: +// CHECK-64-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0 +// CHECK-64-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 8 +// CHECK-64-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* +// CHECK-64-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK-64-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK-64-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4 +// CHECK-64-NEXT: br label [[IFCONT:%.*]] +// CHECK-64: else: +// CHECK-64-NEXT: br label [[IFCONT]] +// CHECK-64: ifcont: +// CHECK-64-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]]) +// CHECK-64-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK-64-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] +// CHECK-64-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK-64: then4: +// CHECK-64-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK-64-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 0 +// CHECK-64-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 8 +// CHECK-64-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32* +// CHECK-64-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4 +// CHECK-64-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4 +// CHECK-64-NEXT: br label [[IFCONT6:%.*]] +// CHECK-64: else5: +// CHECK-64-NEXT: br label [[IFCONT6]] +// CHECK-64: ifcont6: +// CHECK-64-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]]) +// CHECK-64-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK-64-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] +// CHECK-64: then8: +// CHECK-64-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1 +// CHECK-64-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 8 +// CHECK-64-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16* +// CHECK-64-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK-64-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)* +// CHECK-64-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2 +// CHECK-64-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2 +// CHECK-64-NEXT: br label [[IFCONT10:%.*]] +// CHECK-64: else9: +// CHECK-64-NEXT: br label [[IFCONT10]] +// CHECK-64: ifcont10: +// CHECK-64-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]]) +// CHECK-64-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK-64-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]] +// CHECK-64-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] +// CHECK-64: then12: +// CHECK-64-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK-64-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)* +// CHECK-64-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i64 0, i64 1 +// CHECK-64-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 8 +// CHECK-64-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16* +// CHECK-64-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2 +// CHECK-64-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2 +// CHECK-64-NEXT: br label [[IFCONT14:%.*]] +// CHECK-64: else13: +// CHECK-64-NEXT: br label [[IFCONT14]] +// CHECK-64: ifcont14: +// CHECK-64-NEXT: ret void +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l27 +// CHECK-32-SAME: (double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4 +// CHECK-32-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 +// CHECK-32-NEXT: store double* [[E]], double** [[E_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 2, i1 false, i1 true) +// CHECK-32-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 +// CHECK-32-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +// CHECK-32: user_code.entry: +// CHECK-32-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) +// CHECK-32-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP4:%.*]] = bitcast double* [[TMP0]] to i8* +// CHECK-32-NEXT: store i8* [[TMP4]], i8** [[TMP3]], align 4 +// CHECK-32-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK-32-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, double*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP5]], i32 1) +// CHECK-32-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) +// CHECK-32-NEXT: ret void +// CHECK-32: worker.exit: +// CHECK-32-NEXT: ret void +// +// +// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK-32-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1:[0-9]+]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK-32-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK-32-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4 +// CHECK-32-NEXT: [[E1:%.*]] = alloca double, align 8 +// CHECK-32-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4 +// CHECK-32-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK-32-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK-32-NEXT: store double* [[E]], double** [[E_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4 +// CHECK-32-NEXT: store double 0.000000e+00, double* [[E1]], align 8 +// CHECK-32-NEXT: [[TMP1:%.*]] = load double, double* [[E1]], align 8 +// CHECK-32-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], 5.000000e+00 +// CHECK-32-NEXT: store double [[ADD]], double* [[E1]], align 8 +// CHECK-32-NEXT: [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK-32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 +// CHECK-32-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP5:%.*]] = bitcast double* [[E1]] to i8* +// CHECK-32-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4 +// CHECK-32-NEXT: [[TMP6:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK-32-NEXT: [[TMP7:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]], i32 1, i32 4, i8* [[TMP6]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func) +// CHECK-32-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 1 +// CHECK-32-NEXT: br i1 [[TMP8]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK-32: .omp.reduction.then: +// CHECK-32-NEXT: [[TMP9:%.*]] = load double, double* [[TMP0]], align 8 +// CHECK-32-NEXT: [[TMP10:%.*]] = load double, double* [[E1]], align 8 +// CHECK-32-NEXT: [[ADD2:%.*]] = fadd double [[TMP9]], [[TMP10]] +// CHECK-32-NEXT: store double [[ADD2]], double* [[TMP0]], align 8 +// CHECK-32-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP3]]) +// CHECK-32-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK-32: .omp.reduction.done: +// CHECK-32-NEXT: ret void +// +// +// CHECK-32-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func +// CHECK-32-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK-32-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK-32-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK-32-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK-32-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 4 +// CHECK-32-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8 +// CHECK-32-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK-32-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK-32-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK-32-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK-32-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK-32-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]* +// CHECK-32-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK-32-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK-32-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK-32-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 +// CHECK-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to double* +// CHECK-32-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[TMP12]], i32 1 +// CHECK-32-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8* +// CHECK-32-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP12]] to i64* +// CHECK-32-NEXT: [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64* +// CHECK-32-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8 +// CHECK-32-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK-32-NEXT: [[TMP18:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK-32-NEXT: [[TMP19:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP18]]) +// CHECK-32-NEXT: store i64 [[TMP19]], i64* [[TMP16]], align 8 +// CHECK-32-NEXT: [[TMP20:%.*]] = getelementptr i64, i64* [[TMP15]], i32 1 +// CHECK-32-NEXT: [[TMP21:%.*]] = getelementptr i64, i64* [[TMP16]], i32 1 +// CHECK-32-NEXT: [[TMP22:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8* +// CHECK-32-NEXT: store i8* [[TMP22]], i8** [[TMP11]], align 4 +// CHECK-32-NEXT: [[TMP23:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK-32-NEXT: [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK-32-NEXT: [[TMP25:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK-32-NEXT: [[TMP26:%.*]] = and i1 [[TMP24]], [[TMP25]] +// CHECK-32-NEXT: [[TMP27:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK-32-NEXT: [[TMP28:%.*]] = and i16 [[TMP6]], 1 +// CHECK-32-NEXT: [[TMP29:%.*]] = icmp eq i16 [[TMP28]], 0 +// CHECK-32-NEXT: [[TMP30:%.*]] = and i1 [[TMP27]], [[TMP29]] +// CHECK-32-NEXT: [[TMP31:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK-32-NEXT: [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]] +// CHECK-32-NEXT: [[TMP33:%.*]] = or i1 [[TMP23]], [[TMP26]] +// CHECK-32-NEXT: [[TMP34:%.*]] = or i1 [[TMP33]], [[TMP32]] +// CHECK-32-NEXT: br i1 [[TMP34]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK-32: then: +// CHECK-32-NEXT: [[TMP35:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8* +// CHECK-32-NEXT: [[TMP36:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK-32-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP35]], i8* [[TMP36]]) #[[ATTR4:[0-9]+]] +// CHECK-32-NEXT: br label [[IFCONT:%.*]] +// CHECK-32: else: +// CHECK-32-NEXT: br label [[IFCONT]] +// CHECK-32: ifcont: +// CHECK-32-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK-32-NEXT: [[TMP38:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK-32-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]] +// CHECK-32-NEXT: br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK-32: then4: +// CHECK-32-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP41:%.*]] = load i8*, i8** [[TMP40]], align 4 +// CHECK-32-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 4 +// CHECK-32-NEXT: [[TMP44:%.*]] = bitcast i8* [[TMP41]] to double* +// CHECK-32-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP43]] to double* +// CHECK-32-NEXT: [[TMP46:%.*]] = load double, double* [[TMP44]], align 8 +// CHECK-32-NEXT: store double [[TMP46]], double* [[TMP45]], align 8 +// CHECK-32-NEXT: br label [[IFCONT6:%.*]] +// CHECK-32: else5: +// CHECK-32-NEXT: br label [[IFCONT6]] +// CHECK-32: ifcont6: +// CHECK-32-NEXT: ret void +// +// +// CHECK-32-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func +// CHECK-32-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK-32-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK-32-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK-32-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK-32-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK-32-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK-32-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK-32-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK-32-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK-32-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK-32-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* +// CHECK-32-NEXT: store i32 0, i32* [[DOTCNT_ADDR]], align 4 +// CHECK-32-NEXT: br label [[PRECOND:%.*]] +// CHECK-32: precond: +// CHECK-32-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP5]], 2 +// CHECK-32-NEXT: br i1 [[TMP6]], label [[BODY:%.*]], label [[EXIT:%.*]] +// CHECK-32: body: +// CHECK-32-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3:[0-9]+]], i32 [[TMP2]]) +// CHECK-32-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK-32-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK-32: then: +// CHECK-32-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP8:%.*]] = load i8*, i8** [[TMP7]], align 4 +// CHECK-32-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32* +// CHECK-32-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 [[TMP5]] +// CHECK-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK-32-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK-32-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4 +// CHECK-32-NEXT: br label [[IFCONT:%.*]] +// CHECK-32: else: +// CHECK-32-NEXT: br label [[IFCONT]] +// CHECK-32: ifcont: +// CHECK-32-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]]) +// CHECK-32-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK-32-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP13]] +// CHECK-32-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK-32: then4: +// CHECK-32-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK-32-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 4 +// CHECK-32-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32* +// CHECK-32-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP17]], i32 [[TMP5]] +// CHECK-32-NEXT: [[TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4 +// CHECK-32-NEXT: store i32 [[TMP19]], i32* [[TMP18]], align 4 +// CHECK-32-NEXT: br label [[IFCONT6:%.*]] +// CHECK-32: else5: +// CHECK-32-NEXT: br label [[IFCONT6]] +// CHECK-32: ifcont6: +// CHECK-32-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP5]], 1 +// CHECK-32-NEXT: store i32 [[TMP20]], i32* [[DOTCNT_ADDR]], align 4 +// CHECK-32-NEXT: br label [[PRECOND]] +// CHECK-32: exit: +// CHECK-32-NEXT: ret void +// +// +// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l32 +// CHECK-32-SAME: (i8* nonnull align 1 dereferenceable(1) [[C:%.*]], float* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR0]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[C_ADDR:%.*]] = alloca i8*, align 4 +// CHECK-32-NEXT: [[D_ADDR:%.*]] = alloca float*, align 4 +// CHECK-32-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4 +// CHECK-32-NEXT: store i8* [[C]], i8** [[C_ADDR]], align 4 +// CHECK-32-NEXT: store float* [[D]], float** [[D_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 true) +// CHECK-32-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP2]], -1 +// CHECK-32-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +// CHECK-32: user_code.entry: +// CHECK-32-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK-32-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK-32-NEXT: store i8* [[TMP0]], i8** [[TMP4]], align 4 +// CHECK-32-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP1]] to i8* +// CHECK-32-NEXT: store i8* [[TMP6]], i8** [[TMP5]], align 4 +// CHECK-32-NEXT: [[TMP7:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK-32-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i8*, float*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP7]], i32 2) +// CHECK-32-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) +// CHECK-32-NEXT: ret void +// CHECK-32: worker.exit: +// CHECK-32-NEXT: ret void +// +// +// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK-32-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8* nonnull align 1 dereferenceable(1) [[C:%.*]], float* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK-32-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK-32-NEXT: [[C_ADDR:%.*]] = alloca i8*, align 4 +// CHECK-32-NEXT: [[D_ADDR:%.*]] = alloca float*, align 4 +// CHECK-32-NEXT: [[C1:%.*]] = alloca i8, align 1 +// CHECK-32-NEXT: [[D2:%.*]] = alloca float, align 4 +// CHECK-32-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK-32-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK-32-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK-32-NEXT: store i8* [[C]], i8** [[C_ADDR]], align 4 +// CHECK-32-NEXT: store float* [[D]], float** [[D_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 4 +// CHECK-32-NEXT: store i8 0, i8* [[C1]], align 1 +// CHECK-32-NEXT: store float 1.000000e+00, float* [[D2]], align 4 +// CHECK-32-NEXT: [[TMP2:%.*]] = load i8, i8* [[C1]], align 1 +// CHECK-32-NEXT: [[CONV:%.*]] = sext i8 [[TMP2]] to i32 +// CHECK-32-NEXT: [[XOR:%.*]] = xor i32 [[CONV]], 2 +// CHECK-32-NEXT: [[CONV3:%.*]] = trunc i32 [[XOR]] to i8 +// CHECK-32-NEXT: store i8 [[CONV3]], i8* [[C1]], align 1 +// CHECK-32-NEXT: [[TMP3:%.*]] = load float, float* [[D2]], align 4 +// CHECK-32-NEXT: [[MUL:%.*]] = fmul float [[TMP3]], 3.300000e+01 +// CHECK-32-NEXT: store float [[MUL]], float* [[D2]], align 4 +// CHECK-32-NEXT: [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK-32-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4 +// CHECK-32-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK-32-NEXT: store i8* [[C1]], i8** [[TMP6]], align 4 +// CHECK-32-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP8:%.*]] = bitcast float* [[D2]] to i8* +// CHECK-32-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4 +// CHECK-32-NEXT: [[TMP9:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK-32-NEXT: [[TMP10:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB2]], i32 [[TMP5]], i32 2, i32 8, i8* [[TMP9]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func3, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func4) +// CHECK-32-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 1 +// CHECK-32-NEXT: br i1 [[TMP11]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK-32: .omp.reduction.then: +// CHECK-32-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP0]], align 1 +// CHECK-32-NEXT: [[CONV4:%.*]] = sext i8 [[TMP12]] to i32 +// CHECK-32-NEXT: [[TMP13:%.*]] = load i8, i8* [[C1]], align 1 +// CHECK-32-NEXT: [[CONV5:%.*]] = sext i8 [[TMP13]] to i32 +// CHECK-32-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]] +// CHECK-32-NEXT: [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8 +// CHECK-32-NEXT: store i8 [[CONV7]], i8* [[TMP0]], align 1 +// CHECK-32-NEXT: [[TMP14:%.*]] = load float, float* [[TMP1]], align 4 +// CHECK-32-NEXT: [[TMP15:%.*]] = load float, float* [[D2]], align 4 +// CHECK-32-NEXT: [[MUL8:%.*]] = fmul float [[TMP14]], [[TMP15]] +// CHECK-32-NEXT: store float [[MUL8]], float* [[TMP1]], align 4 +// CHECK-32-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]]) +// CHECK-32-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK-32: .omp.reduction.done: +// CHECK-32-NEXT: ret void +// +// +// CHECK-32-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func3 +// CHECK-32-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR2]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK-32-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK-32-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK-32-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK-32-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK-32-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1 +// CHECK-32-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4 +// CHECK-32-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK-32-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK-32-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK-32-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK-32-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK-32-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* +// CHECK-32-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK-32-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK-32-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK-32-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 +// CHECK-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1 +// CHECK-32-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP10]], align 1 +// CHECK-32-NEXT: [[TMP14:%.*]] = sext i8 [[TMP13]] to i32 +// CHECK-32-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK-32-NEXT: [[TMP15:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK-32-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP14]], i16 [[TMP7]], i16 [[TMP15]]) +// CHECK-32-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8 +// CHECK-32-NEXT: store i8 [[TMP17]], i8* [[DOTOMP_REDUCTION_ELEMENT]], align 1 +// CHECK-32-NEXT: [[TMP18:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1 +// CHECK-32-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 +// CHECK-32-NEXT: store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 4 +// CHECK-32-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP21:%.*]] = load i8*, i8** [[TMP20]], align 4 +// CHECK-32-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP23:%.*]] = bitcast i8* [[TMP21]] to float* +// CHECK-32-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[TMP23]], i32 1 +// CHECK-32-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to i8* +// CHECK-32-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP23]] to i32* +// CHECK-32-NEXT: [[TMP27:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32* +// CHECK-32-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK-32-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK-32-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 +// CHECK-32-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) +// CHECK-32-NEXT: store i32 [[TMP30]], i32* [[TMP27]], align 4 +// CHECK-32-NEXT: [[TMP31:%.*]] = getelementptr i32, i32* [[TMP26]], i32 1 +// CHECK-32-NEXT: [[TMP32:%.*]] = getelementptr i32, i32* [[TMP27]], i32 1 +// CHECK-32-NEXT: [[TMP33:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* +// CHECK-32-NEXT: store i8* [[TMP33]], i8** [[TMP22]], align 4 +// CHECK-32-NEXT: [[TMP34:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK-32-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK-32-NEXT: [[TMP36:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK-32-NEXT: [[TMP37:%.*]] = and i1 [[TMP35]], [[TMP36]] +// CHECK-32-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK-32-NEXT: [[TMP39:%.*]] = and i16 [[TMP6]], 1 +// CHECK-32-NEXT: [[TMP40:%.*]] = icmp eq i16 [[TMP39]], 0 +// CHECK-32-NEXT: [[TMP41:%.*]] = and i1 [[TMP38]], [[TMP40]] +// CHECK-32-NEXT: [[TMP42:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK-32-NEXT: [[TMP43:%.*]] = and i1 [[TMP41]], [[TMP42]] +// CHECK-32-NEXT: [[TMP44:%.*]] = or i1 [[TMP34]], [[TMP37]] +// CHECK-32-NEXT: [[TMP45:%.*]] = or i1 [[TMP44]], [[TMP43]] +// CHECK-32-NEXT: br i1 [[TMP45]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK-32: then: +// CHECK-32-NEXT: [[TMP46:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* +// CHECK-32-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK-32-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP46]], i8* [[TMP47]]) #[[ATTR4]] +// CHECK-32-NEXT: br label [[IFCONT:%.*]] +// CHECK-32: else: +// CHECK-32-NEXT: br label [[IFCONT]] +// CHECK-32: ifcont: +// CHECK-32-NEXT: [[TMP48:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK-32-NEXT: [[TMP49:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK-32-NEXT: [[TMP50:%.*]] = and i1 [[TMP48]], [[TMP49]] +// CHECK-32-NEXT: br i1 [[TMP50]], label [[THEN6:%.*]], label [[ELSE7:%.*]] +// CHECK-32: then6: +// CHECK-32-NEXT: [[TMP51:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP52:%.*]] = load i8*, i8** [[TMP51]], align 4 +// CHECK-32-NEXT: [[TMP53:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP54:%.*]] = load i8*, i8** [[TMP53]], align 4 +// CHECK-32-NEXT: [[TMP55:%.*]] = load i8, i8* [[TMP52]], align 1 +// CHECK-32-NEXT: store i8 [[TMP55]], i8* [[TMP54]], align 1 +// CHECK-32-NEXT: [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 4 +// CHECK-32-NEXT: [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP59:%.*]] = load i8*, i8** [[TMP58]], align 4 +// CHECK-32-NEXT: [[TMP60:%.*]] = bitcast i8* [[TMP57]] to float* +// CHECK-32-NEXT: [[TMP61:%.*]] = bitcast i8* [[TMP59]] to float* +// CHECK-32-NEXT: [[TMP62:%.*]] = load float, float* [[TMP60]], align 4 +// CHECK-32-NEXT: store float [[TMP62]], float* [[TMP61]], align 4 +// CHECK-32-NEXT: br label [[IFCONT8:%.*]] +// CHECK-32: else7: +// CHECK-32-NEXT: br label [[IFCONT8]] +// CHECK-32: ifcont8: +// CHECK-32-NEXT: ret void +// +// +// CHECK-32-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func4 +// CHECK-32-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK-32-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK-32-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK-32-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK-32-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK-32-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK-32-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK-32-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK-32-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK-32-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK-32-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK-32-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]]) +// CHECK-32-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK-32-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK-32: then: +// CHECK-32-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 +// CHECK-32-NEXT: [[TMP7:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK-32-NEXT: [[TMP8:%.*]] = bitcast i32 addrspace(3)* [[TMP7]] to i8 addrspace(3)* +// CHECK-32-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP6]], align 1 +// CHECK-32-NEXT: store volatile i8 [[TMP9]], i8 addrspace(3)* [[TMP8]], align 1 +// CHECK-32-NEXT: br label [[IFCONT:%.*]] +// CHECK-32: else: +// CHECK-32-NEXT: br label [[IFCONT]] +// CHECK-32: ifcont: +// CHECK-32-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]]) +// CHECK-32-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK-32-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] +// CHECK-32-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK-32: then4: +// CHECK-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK-32-NEXT: [[TMP12:%.*]] = bitcast i32 addrspace(3)* [[TMP11]] to i8 addrspace(3)* +// CHECK-32-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4 +// CHECK-32-NEXT: [[TMP15:%.*]] = load volatile i8, i8 addrspace(3)* [[TMP12]], align 1 +// CHECK-32-NEXT: store i8 [[TMP15]], i8* [[TMP14]], align 1 +// CHECK-32-NEXT: br label [[IFCONT6:%.*]] +// CHECK-32: else5: +// CHECK-32-NEXT: br label [[IFCONT6]] +// CHECK-32: ifcont6: +// CHECK-32-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]]) +// CHECK-32-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK-32-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] +// CHECK-32: then8: +// CHECK-32-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 +// CHECK-32-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i32* +// CHECK-32-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK-32-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP18]], align 4 +// CHECK-32-NEXT: store volatile i32 [[TMP20]], i32 addrspace(3)* [[TMP19]], align 4 +// CHECK-32-NEXT: br label [[IFCONT10:%.*]] +// CHECK-32: else9: +// CHECK-32-NEXT: br label [[IFCONT10]] +// CHECK-32: ifcont10: +// CHECK-32-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]]) +// CHECK-32-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK-32-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP21]] +// CHECK-32-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] +// CHECK-32: then12: +// CHECK-32-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK-32-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP24:%.*]] = load i8*, i8** [[TMP23]], align 4 +// CHECK-32-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP24]] to i32* +// CHECK-32-NEXT: [[TMP26:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP22]], align 4 +// CHECK-32-NEXT: store i32 [[TMP26]], i32* [[TMP25]], align 4 +// CHECK-32-NEXT: br label [[IFCONT14:%.*]] +// CHECK-32: else13: +// CHECK-32-NEXT: br label [[IFCONT14]] +// CHECK-32: ifcont14: +// CHECK-32-NEXT: ret void +// +// +// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l38 +// CHECK-32-SAME: (i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR0]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK-32-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4 +// CHECK-32-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4 +// CHECK-32-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK-32-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 true) +// CHECK-32-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP2]], -1 +// CHECK-32-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +// CHECK-32: user_code.entry: +// CHECK-32-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK-32-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP0]] to i8* +// CHECK-32-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4 +// CHECK-32-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP7:%.*]] = bitcast i16* [[TMP1]] to i8* +// CHECK-32-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4 +// CHECK-32-NEXT: [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK-32-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP8]], i32 2) +// CHECK-32-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) +// CHECK-32-NEXT: ret void +// CHECK-32: worker.exit: +// CHECK-32-NEXT: ret void +// +// +// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__5 +// CHECK-32-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK-32-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK-32-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK-32-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4 +// CHECK-32-NEXT: [[A1:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[B2:%.*]] = alloca i16, align 2 +// CHECK-32-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK-32-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK-32-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK-32-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK-32-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4 +// CHECK-32-NEXT: store i32 0, i32* [[A1]], align 4 +// CHECK-32-NEXT: store i16 -32768, i16* [[B2]], align 2 +// CHECK-32-NEXT: [[TMP2:%.*]] = load i32, i32* [[A1]], align 4 +// CHECK-32-NEXT: [[OR:%.*]] = or i32 [[TMP2]], 1 +// CHECK-32-NEXT: store i32 [[OR]], i32* [[A1]], align 4 +// CHECK-32-NEXT: [[TMP3:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK-32-NEXT: [[CONV:%.*]] = sext i16 [[TMP3]] to i32 +// CHECK-32-NEXT: [[CMP:%.*]] = icmp sgt i32 99, [[CONV]] +// CHECK-32-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK-32: cond.true: +// CHECK-32-NEXT: br label [[COND_END:%.*]] +// CHECK-32: cond.false: +// CHECK-32-NEXT: [[TMP4:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK-32-NEXT: [[CONV3:%.*]] = sext i16 [[TMP4]] to i32 +// CHECK-32-NEXT: br label [[COND_END]] +// CHECK-32: cond.end: +// CHECK-32-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ] +// CHECK-32-NEXT: [[CONV4:%.*]] = trunc i32 [[COND]] to i16 +// CHECK-32-NEXT: store i16 [[CONV4]], i16* [[B2]], align 2 +// CHECK-32-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK-32-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK-32-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP8:%.*]] = bitcast i32* [[A1]] to i8* +// CHECK-32-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4 +// CHECK-32-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP10:%.*]] = bitcast i16* [[B2]] to i8* +// CHECK-32-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK-32-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK-32-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB2]], i32 [[TMP6]], i32 2, i32 8, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func7, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func8) +// CHECK-32-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1 +// CHECK-32-NEXT: br i1 [[TMP13]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK-32: .omp.reduction.then: +// CHECK-32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK-32-NEXT: [[TMP15:%.*]] = load i32, i32* [[A1]], align 4 +// CHECK-32-NEXT: [[OR5:%.*]] = or i32 [[TMP14]], [[TMP15]] +// CHECK-32-NEXT: store i32 [[OR5]], i32* [[TMP0]], align 4 +// CHECK-32-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK-32-NEXT: [[CONV6:%.*]] = sext i16 [[TMP16]] to i32 +// CHECK-32-NEXT: [[TMP17:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK-32-NEXT: [[CONV7:%.*]] = sext i16 [[TMP17]] to i32 +// CHECK-32-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]] +// CHECK-32-NEXT: br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]] +// CHECK-32: cond.true9: +// CHECK-32-NEXT: [[TMP18:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK-32-NEXT: br label [[COND_END11:%.*]] +// CHECK-32: cond.false10: +// CHECK-32-NEXT: [[TMP19:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK-32-NEXT: br label [[COND_END11]] +// CHECK-32: cond.end11: +// CHECK-32-NEXT: [[COND12:%.*]] = phi i16 [ [[TMP18]], [[COND_TRUE9]] ], [ [[TMP19]], [[COND_FALSE10]] ] +// CHECK-32-NEXT: store i16 [[COND12]], i16* [[TMP1]], align 2 +// CHECK-32-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]]) +// CHECK-32-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK-32: .omp.reduction.done: +// CHECK-32-NEXT: ret void +// +// +// CHECK-32-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func7 +// CHECK-32-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR2]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK-32-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK-32-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK-32-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK-32-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK-32-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2 +// CHECK-32-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK-32-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK-32-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK-32-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK-32-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK-32-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* +// CHECK-32-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK-32-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK-32-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK-32-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 +// CHECK-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32* +// CHECK-32-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 +// CHECK-32-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8* +// CHECK-32-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4 +// CHECK-32-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK-32-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK-32-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]]) +// CHECK-32-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4 +// CHECK-32-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 +// CHECK-32-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 +// CHECK-32-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8* +// CHECK-32-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4 +// CHECK-32-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4 +// CHECK-32-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16* +// CHECK-32-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 +// CHECK-32-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8* +// CHECK-32-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2 +// CHECK-32-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32 +// CHECK-32-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK-32-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 +// CHECK-32-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) +// CHECK-32-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16 +// CHECK-32-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2 +// CHECK-32-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 +// CHECK-32-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1 +// CHECK-32-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* +// CHECK-32-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4 +// CHECK-32-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK-32-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK-32-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK-32-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]] +// CHECK-32-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK-32-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1 +// CHECK-32-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0 +// CHECK-32-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]] +// CHECK-32-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK-32-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]] +// CHECK-32-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]] +// CHECK-32-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]] +// CHECK-32-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK-32: then: +// CHECK-32-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* +// CHECK-32-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK-32-NEXT: call void @"_omp$reduction$reduction_func6"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR4]] +// CHECK-32-NEXT: br label [[IFCONT:%.*]] +// CHECK-32: else: +// CHECK-32-NEXT: br label [[IFCONT]] +// CHECK-32: ifcont: +// CHECK-32-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK-32-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK-32-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]] +// CHECK-32-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]] +// CHECK-32: then6: +// CHECK-32-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4 +// CHECK-32-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4 +// CHECK-32-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32* +// CHECK-32-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32* +// CHECK-32-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4 +// CHECK-32-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4 +// CHECK-32-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4 +// CHECK-32-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4 +// CHECK-32-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16* +// CHECK-32-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16* +// CHECK-32-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2 +// CHECK-32-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2 +// CHECK-32-NEXT: br label [[IFCONT8:%.*]] +// CHECK-32: else7: +// CHECK-32-NEXT: br label [[IFCONT8]] +// CHECK-32: ifcont8: +// CHECK-32-NEXT: ret void +// +// +// CHECK-32-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func8 +// CHECK-32-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK-32-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK-32-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK-32-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK-32-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK-32-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK-32-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK-32-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK-32-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK-32-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK-32-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK-32-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]]) +// CHECK-32-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK-32-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK-32: then: +// CHECK-32-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 +// CHECK-32-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* +// CHECK-32-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK-32-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK-32-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4 +// CHECK-32-NEXT: br label [[IFCONT:%.*]] +// CHECK-32: else: +// CHECK-32-NEXT: br label [[IFCONT]] +// CHECK-32: ifcont: +// CHECK-32-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]]) +// CHECK-32-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK-32-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] +// CHECK-32-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK-32: then4: +// CHECK-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK-32-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 +// CHECK-32-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32* +// CHECK-32-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4 +// CHECK-32-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4 +// CHECK-32-NEXT: br label [[IFCONT6:%.*]] +// CHECK-32: else5: +// CHECK-32-NEXT: br label [[IFCONT6]] +// CHECK-32: ifcont6: +// CHECK-32-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]]) +// CHECK-32-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK-32-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] +// CHECK-32: then8: +// CHECK-32-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 +// CHECK-32-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16* +// CHECK-32-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK-32-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)* +// CHECK-32-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2 +// CHECK-32-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2 +// CHECK-32-NEXT: br label [[IFCONT10:%.*]] +// CHECK-32: else9: +// CHECK-32-NEXT: br label [[IFCONT10]] +// CHECK-32: ifcont10: +// CHECK-32-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]]) +// CHECK-32-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK-32-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]] +// CHECK-32-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] +// CHECK-32: then12: +// CHECK-32-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK-32-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)* +// CHECK-32-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4 +// CHECK-32-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16* +// CHECK-32-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2 +// CHECK-32-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2 +// CHECK-32-NEXT: br label [[IFCONT14:%.*]] +// CHECK-32: else13: +// CHECK-32-NEXT: br label [[IFCONT14]] +// CHECK-32: ifcont14: +// CHECK-32-NEXT: ret void +// +// +// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l27 +// CHECK-32-SAME: (double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR0]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4 +// CHECK-32-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 +// CHECK-32-NEXT: store double* [[E]], double** [[E_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 true) +// CHECK-32-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 +// CHECK-32-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +// CHECK-32: user_code.entry: +// CHECK-32-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK-32-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP4:%.*]] = bitcast double* [[TMP0]] to i8* +// CHECK-32-NEXT: store i8* [[TMP4]], i8** [[TMP3]], align 4 +// CHECK-32-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK-32-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, double*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP5]], i32 1) +// CHECK-32-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) +// CHECK-32-NEXT: ret void +// CHECK-32: worker.exit: +// CHECK-32-NEXT: ret void +// +// +// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK-32-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK-32-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK-32-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4 +// CHECK-32-NEXT: [[E1:%.*]] = alloca double, align 8 +// CHECK-32-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4 +// CHECK-32-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK-32-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK-32-NEXT: store double* [[E]], double** [[E_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4 +// CHECK-32-NEXT: store double 0.000000e+00, double* [[E1]], align 8 +// CHECK-32-NEXT: [[TMP1:%.*]] = load double, double* [[E1]], align 8 +// CHECK-32-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], 5.000000e+00 +// CHECK-32-NEXT: store double [[ADD]], double* [[E1]], align 8 +// CHECK-32-NEXT: [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK-32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 +// CHECK-32-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP5:%.*]] = bitcast double* [[E1]] to i8* +// CHECK-32-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4 +// CHECK-32-NEXT: [[TMP6:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK-32-NEXT: [[TMP7:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]], i32 1, i32 4, i8* [[TMP6]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func) +// CHECK-32-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 1 +// CHECK-32-NEXT: br i1 [[TMP8]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK-32: .omp.reduction.then: +// CHECK-32-NEXT: [[TMP9:%.*]] = load double, double* [[TMP0]], align 8 +// CHECK-32-NEXT: [[TMP10:%.*]] = load double, double* [[E1]], align 8 +// CHECK-32-NEXT: [[ADD2:%.*]] = fadd double [[TMP9]], [[TMP10]] +// CHECK-32-NEXT: store double [[ADD2]], double* [[TMP0]], align 8 +// CHECK-32-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP3]]) +// CHECK-32-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK-32: .omp.reduction.done: +// CHECK-32-NEXT: ret void +// +// +// CHECK-32-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func +// CHECK-32-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR2]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK-32-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK-32-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK-32-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK-32-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 4 +// CHECK-32-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8 +// CHECK-32-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK-32-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK-32-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK-32-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK-32-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK-32-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]* +// CHECK-32-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK-32-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK-32-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK-32-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 +// CHECK-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to double* +// CHECK-32-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[TMP12]], i32 1 +// CHECK-32-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8* +// CHECK-32-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP12]] to i64* +// CHECK-32-NEXT: [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64* +// CHECK-32-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8 +// CHECK-32-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK-32-NEXT: [[TMP18:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK-32-NEXT: [[TMP19:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP18]]) +// CHECK-32-NEXT: store i64 [[TMP19]], i64* [[TMP16]], align 8 +// CHECK-32-NEXT: [[TMP20:%.*]] = getelementptr i64, i64* [[TMP15]], i32 1 +// CHECK-32-NEXT: [[TMP21:%.*]] = getelementptr i64, i64* [[TMP16]], i32 1 +// CHECK-32-NEXT: [[TMP22:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8* +// CHECK-32-NEXT: store i8* [[TMP22]], i8** [[TMP11]], align 4 +// CHECK-32-NEXT: [[TMP23:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK-32-NEXT: [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK-32-NEXT: [[TMP25:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK-32-NEXT: [[TMP26:%.*]] = and i1 [[TMP24]], [[TMP25]] +// CHECK-32-NEXT: [[TMP27:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK-32-NEXT: [[TMP28:%.*]] = and i16 [[TMP6]], 1 +// CHECK-32-NEXT: [[TMP29:%.*]] = icmp eq i16 [[TMP28]], 0 +// CHECK-32-NEXT: [[TMP30:%.*]] = and i1 [[TMP27]], [[TMP29]] +// CHECK-32-NEXT: [[TMP31:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK-32-NEXT: [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]] +// CHECK-32-NEXT: [[TMP33:%.*]] = or i1 [[TMP23]], [[TMP26]] +// CHECK-32-NEXT: [[TMP34:%.*]] = or i1 [[TMP33]], [[TMP32]] +// CHECK-32-NEXT: br i1 [[TMP34]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK-32: then: +// CHECK-32-NEXT: [[TMP35:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8* +// CHECK-32-NEXT: [[TMP36:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK-32-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP35]], i8* [[TMP36]]) #[[ATTR4]] +// CHECK-32-NEXT: br label [[IFCONT:%.*]] +// CHECK-32: else: +// CHECK-32-NEXT: br label [[IFCONT]] +// CHECK-32: ifcont: +// CHECK-32-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK-32-NEXT: [[TMP38:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK-32-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]] +// CHECK-32-NEXT: br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK-32: then4: +// CHECK-32-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP41:%.*]] = load i8*, i8** [[TMP40]], align 4 +// CHECK-32-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 4 +// CHECK-32-NEXT: [[TMP44:%.*]] = bitcast i8* [[TMP41]] to double* +// CHECK-32-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP43]] to double* +// CHECK-32-NEXT: [[TMP46:%.*]] = load double, double* [[TMP44]], align 8 +// CHECK-32-NEXT: store double [[TMP46]], double* [[TMP45]], align 8 +// CHECK-32-NEXT: br label [[IFCONT6:%.*]] +// CHECK-32: else5: +// CHECK-32-NEXT: br label [[IFCONT6]] +// CHECK-32: ifcont6: +// CHECK-32-NEXT: ret void +// +// +// CHECK-32-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func +// CHECK-32-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK-32-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK-32-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK-32-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK-32-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK-32-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK-32-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK-32-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK-32-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK-32-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK-32-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* +// CHECK-32-NEXT: store i32 0, i32* [[DOTCNT_ADDR]], align 4 +// CHECK-32-NEXT: br label [[PRECOND:%.*]] +// CHECK-32: precond: +// CHECK-32-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP5]], 2 +// CHECK-32-NEXT: br i1 [[TMP6]], label [[BODY:%.*]], label [[EXIT:%.*]] +// CHECK-32: body: +// CHECK-32-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]]) +// CHECK-32-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK-32-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK-32: then: +// CHECK-32-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP8:%.*]] = load i8*, i8** [[TMP7]], align 4 +// CHECK-32-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32* +// CHECK-32-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 [[TMP5]] +// CHECK-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK-32-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK-32-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4 +// CHECK-32-NEXT: br label [[IFCONT:%.*]] +// CHECK-32: else: +// CHECK-32-NEXT: br label [[IFCONT]] +// CHECK-32: ifcont: +// CHECK-32-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]]) +// CHECK-32-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK-32-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP13]] +// CHECK-32-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK-32: then4: +// CHECK-32-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK-32-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 4 +// CHECK-32-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32* +// CHECK-32-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP17]], i32 [[TMP5]] +// CHECK-32-NEXT: [[TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4 +// CHECK-32-NEXT: store i32 [[TMP19]], i32* [[TMP18]], align 4 +// CHECK-32-NEXT: br label [[IFCONT6:%.*]] +// CHECK-32: else5: +// CHECK-32-NEXT: br label [[IFCONT6]] +// CHECK-32: ifcont6: +// CHECK-32-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP5]], 1 +// CHECK-32-NEXT: store i32 [[TMP20]], i32* [[DOTCNT_ADDR]], align 4 +// CHECK-32-NEXT: br label [[PRECOND]] +// CHECK-32: exit: +// CHECK-32-NEXT: ret void +// +// +// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l32 +// CHECK-32-SAME: (i8* nonnull align 1 dereferenceable(1) [[C:%.*]], float* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR0]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[C_ADDR:%.*]] = alloca i8*, align 4 +// CHECK-32-NEXT: [[D_ADDR:%.*]] = alloca float*, align 4 +// CHECK-32-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4 +// CHECK-32-NEXT: store i8* [[C]], i8** [[C_ADDR]], align 4 +// CHECK-32-NEXT: store float* [[D]], float** [[D_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 true) +// CHECK-32-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP2]], -1 +// CHECK-32-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +// CHECK-32: user_code.entry: +// CHECK-32-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK-32-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK-32-NEXT: store i8* [[TMP0]], i8** [[TMP4]], align 4 +// CHECK-32-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP1]] to i8* +// CHECK-32-NEXT: store i8* [[TMP6]], i8** [[TMP5]], align 4 +// CHECK-32-NEXT: [[TMP7:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK-32-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i8*, float*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP7]], i32 2) +// CHECK-32-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) +// CHECK-32-NEXT: ret void +// CHECK-32: worker.exit: +// CHECK-32-NEXT: ret void +// +// +// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK-32-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8* nonnull align 1 dereferenceable(1) [[C:%.*]], float* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK-32-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK-32-NEXT: [[C_ADDR:%.*]] = alloca i8*, align 4 +// CHECK-32-NEXT: [[D_ADDR:%.*]] = alloca float*, align 4 +// CHECK-32-NEXT: [[C1:%.*]] = alloca i8, align 1 +// CHECK-32-NEXT: [[D2:%.*]] = alloca float, align 4 +// CHECK-32-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK-32-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK-32-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK-32-NEXT: store i8* [[C]], i8** [[C_ADDR]], align 4 +// CHECK-32-NEXT: store float* [[D]], float** [[D_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 4 +// CHECK-32-NEXT: store i8 0, i8* [[C1]], align 1 +// CHECK-32-NEXT: store float 1.000000e+00, float* [[D2]], align 4 +// CHECK-32-NEXT: [[TMP2:%.*]] = load i8, i8* [[C1]], align 1 +// CHECK-32-NEXT: [[CONV:%.*]] = sext i8 [[TMP2]] to i32 +// CHECK-32-NEXT: [[XOR:%.*]] = xor i32 [[CONV]], 2 +// CHECK-32-NEXT: [[CONV3:%.*]] = trunc i32 [[XOR]] to i8 +// CHECK-32-NEXT: store i8 [[CONV3]], i8* [[C1]], align 1 +// CHECK-32-NEXT: [[TMP3:%.*]] = load float, float* [[D2]], align 4 +// CHECK-32-NEXT: [[MUL:%.*]] = fmul float [[TMP3]], 3.300000e+01 +// CHECK-32-NEXT: store float [[MUL]], float* [[D2]], align 4 +// CHECK-32-NEXT: [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK-32-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4 +// CHECK-32-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK-32-NEXT: store i8* [[C1]], i8** [[TMP6]], align 4 +// CHECK-32-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP8:%.*]] = bitcast float* [[D2]] to i8* +// CHECK-32-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4 +// CHECK-32-NEXT: [[TMP9:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK-32-NEXT: [[TMP10:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB2]], i32 [[TMP5]], i32 2, i32 8, i8* [[TMP9]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func3, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func4) +// CHECK-32-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 1 +// CHECK-32-NEXT: br i1 [[TMP11]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK-32: .omp.reduction.then: +// CHECK-32-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP0]], align 1 +// CHECK-32-NEXT: [[CONV4:%.*]] = sext i8 [[TMP12]] to i32 +// CHECK-32-NEXT: [[TMP13:%.*]] = load i8, i8* [[C1]], align 1 +// CHECK-32-NEXT: [[CONV5:%.*]] = sext i8 [[TMP13]] to i32 +// CHECK-32-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]] +// CHECK-32-NEXT: [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8 +// CHECK-32-NEXT: store i8 [[CONV7]], i8* [[TMP0]], align 1 +// CHECK-32-NEXT: [[TMP14:%.*]] = load float, float* [[TMP1]], align 4 +// CHECK-32-NEXT: [[TMP15:%.*]] = load float, float* [[D2]], align 4 +// CHECK-32-NEXT: [[MUL8:%.*]] = fmul float [[TMP14]], [[TMP15]] +// CHECK-32-NEXT: store float [[MUL8]], float* [[TMP1]], align 4 +// CHECK-32-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]]) +// CHECK-32-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK-32: .omp.reduction.done: +// CHECK-32-NEXT: ret void +// +// +// CHECK-32-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func3 +// CHECK-32-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR2]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK-32-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK-32-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK-32-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK-32-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK-32-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1 +// CHECK-32-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4 +// CHECK-32-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK-32-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK-32-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK-32-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK-32-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK-32-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* +// CHECK-32-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK-32-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK-32-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK-32-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 +// CHECK-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1 +// CHECK-32-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP10]], align 1 +// CHECK-32-NEXT: [[TMP14:%.*]] = sext i8 [[TMP13]] to i32 +// CHECK-32-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK-32-NEXT: [[TMP15:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK-32-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP14]], i16 [[TMP7]], i16 [[TMP15]]) +// CHECK-32-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8 +// CHECK-32-NEXT: store i8 [[TMP17]], i8* [[DOTOMP_REDUCTION_ELEMENT]], align 1 +// CHECK-32-NEXT: [[TMP18:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1 +// CHECK-32-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 +// CHECK-32-NEXT: store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 4 +// CHECK-32-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP21:%.*]] = load i8*, i8** [[TMP20]], align 4 +// CHECK-32-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP23:%.*]] = bitcast i8* [[TMP21]] to float* +// CHECK-32-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[TMP23]], i32 1 +// CHECK-32-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to i8* +// CHECK-32-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP23]] to i32* +// CHECK-32-NEXT: [[TMP27:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32* +// CHECK-32-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK-32-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK-32-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 +// CHECK-32-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) +// CHECK-32-NEXT: store i32 [[TMP30]], i32* [[TMP27]], align 4 +// CHECK-32-NEXT: [[TMP31:%.*]] = getelementptr i32, i32* [[TMP26]], i32 1 +// CHECK-32-NEXT: [[TMP32:%.*]] = getelementptr i32, i32* [[TMP27]], i32 1 +// CHECK-32-NEXT: [[TMP33:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* +// CHECK-32-NEXT: store i8* [[TMP33]], i8** [[TMP22]], align 4 +// CHECK-32-NEXT: [[TMP34:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK-32-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK-32-NEXT: [[TMP36:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK-32-NEXT: [[TMP37:%.*]] = and i1 [[TMP35]], [[TMP36]] +// CHECK-32-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK-32-NEXT: [[TMP39:%.*]] = and i16 [[TMP6]], 1 +// CHECK-32-NEXT: [[TMP40:%.*]] = icmp eq i16 [[TMP39]], 0 +// CHECK-32-NEXT: [[TMP41:%.*]] = and i1 [[TMP38]], [[TMP40]] +// CHECK-32-NEXT: [[TMP42:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK-32-NEXT: [[TMP43:%.*]] = and i1 [[TMP41]], [[TMP42]] +// CHECK-32-NEXT: [[TMP44:%.*]] = or i1 [[TMP34]], [[TMP37]] +// CHECK-32-NEXT: [[TMP45:%.*]] = or i1 [[TMP44]], [[TMP43]] +// CHECK-32-NEXT: br i1 [[TMP45]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK-32: then: +// CHECK-32-NEXT: [[TMP46:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* +// CHECK-32-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK-32-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP46]], i8* [[TMP47]]) #[[ATTR4]] +// CHECK-32-NEXT: br label [[IFCONT:%.*]] +// CHECK-32: else: +// CHECK-32-NEXT: br label [[IFCONT]] +// CHECK-32: ifcont: +// CHECK-32-NEXT: [[TMP48:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK-32-NEXT: [[TMP49:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK-32-NEXT: [[TMP50:%.*]] = and i1 [[TMP48]], [[TMP49]] +// CHECK-32-NEXT: br i1 [[TMP50]], label [[THEN6:%.*]], label [[ELSE7:%.*]] +// CHECK-32: then6: +// CHECK-32-NEXT: [[TMP51:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP52:%.*]] = load i8*, i8** [[TMP51]], align 4 +// CHECK-32-NEXT: [[TMP53:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP54:%.*]] = load i8*, i8** [[TMP53]], align 4 +// CHECK-32-NEXT: [[TMP55:%.*]] = load i8, i8* [[TMP52]], align 1 +// CHECK-32-NEXT: store i8 [[TMP55]], i8* [[TMP54]], align 1 +// CHECK-32-NEXT: [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 4 +// CHECK-32-NEXT: [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP59:%.*]] = load i8*, i8** [[TMP58]], align 4 +// CHECK-32-NEXT: [[TMP60:%.*]] = bitcast i8* [[TMP57]] to float* +// CHECK-32-NEXT: [[TMP61:%.*]] = bitcast i8* [[TMP59]] to float* +// CHECK-32-NEXT: [[TMP62:%.*]] = load float, float* [[TMP60]], align 4 +// CHECK-32-NEXT: store float [[TMP62]], float* [[TMP61]], align 4 +// CHECK-32-NEXT: br label [[IFCONT8:%.*]] +// CHECK-32: else7: +// CHECK-32-NEXT: br label [[IFCONT8]] +// CHECK-32: ifcont8: +// CHECK-32-NEXT: ret void +// +// +// CHECK-32-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func4 +// CHECK-32-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK-32-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK-32-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK-32-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK-32-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK-32-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK-32-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK-32-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK-32-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK-32-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK-32-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK-32-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]]) +// CHECK-32-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK-32-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK-32: then: +// CHECK-32-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 +// CHECK-32-NEXT: [[TMP7:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK-32-NEXT: [[TMP8:%.*]] = bitcast i32 addrspace(3)* [[TMP7]] to i8 addrspace(3)* +// CHECK-32-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP6]], align 1 +// CHECK-32-NEXT: store volatile i8 [[TMP9]], i8 addrspace(3)* [[TMP8]], align 1 +// CHECK-32-NEXT: br label [[IFCONT:%.*]] +// CHECK-32: else: +// CHECK-32-NEXT: br label [[IFCONT]] +// CHECK-32: ifcont: +// CHECK-32-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]]) +// CHECK-32-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK-32-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] +// CHECK-32-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK-32: then4: +// CHECK-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK-32-NEXT: [[TMP12:%.*]] = bitcast i32 addrspace(3)* [[TMP11]] to i8 addrspace(3)* +// CHECK-32-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4 +// CHECK-32-NEXT: [[TMP15:%.*]] = load volatile i8, i8 addrspace(3)* [[TMP12]], align 1 +// CHECK-32-NEXT: store i8 [[TMP15]], i8* [[TMP14]], align 1 +// CHECK-32-NEXT: br label [[IFCONT6:%.*]] +// CHECK-32: else5: +// CHECK-32-NEXT: br label [[IFCONT6]] +// CHECK-32: ifcont6: +// CHECK-32-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]]) +// CHECK-32-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK-32-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] +// CHECK-32: then8: +// CHECK-32-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 +// CHECK-32-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i32* +// CHECK-32-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK-32-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP18]], align 4 +// CHECK-32-NEXT: store volatile i32 [[TMP20]], i32 addrspace(3)* [[TMP19]], align 4 +// CHECK-32-NEXT: br label [[IFCONT10:%.*]] +// CHECK-32: else9: +// CHECK-32-NEXT: br label [[IFCONT10]] +// CHECK-32: ifcont10: +// CHECK-32-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]]) +// CHECK-32-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK-32-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP21]] +// CHECK-32-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] +// CHECK-32: then12: +// CHECK-32-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK-32-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP24:%.*]] = load i8*, i8** [[TMP23]], align 4 +// CHECK-32-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP24]] to i32* +// CHECK-32-NEXT: [[TMP26:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP22]], align 4 +// CHECK-32-NEXT: store i32 [[TMP26]], i32* [[TMP25]], align 4 +// CHECK-32-NEXT: br label [[IFCONT14:%.*]] +// CHECK-32: else13: +// CHECK-32-NEXT: br label [[IFCONT14]] +// CHECK-32: ifcont14: +// CHECK-32-NEXT: ret void +// +// +// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l38 +// CHECK-32-SAME: (i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR0]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK-32-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4 +// CHECK-32-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4 +// CHECK-32-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK-32-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 true) +// CHECK-32-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP2]], -1 +// CHECK-32-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +// CHECK-32: user_code.entry: +// CHECK-32-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK-32-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP0]] to i8* +// CHECK-32-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4 +// CHECK-32-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP7:%.*]] = bitcast i16* [[TMP1]] to i8* +// CHECK-32-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4 +// CHECK-32-NEXT: [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK-32-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP8]], i32 2) +// CHECK-32-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) +// CHECK-32-NEXT: ret void +// CHECK-32: worker.exit: +// CHECK-32-NEXT: ret void +// +// +// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__5 +// CHECK-32-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK-32-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK-32-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK-32-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4 +// CHECK-32-NEXT: [[A1:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[B2:%.*]] = alloca i16, align 2 +// CHECK-32-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK-32-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK-32-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK-32-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK-32-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4 +// CHECK-32-NEXT: store i32 0, i32* [[A1]], align 4 +// CHECK-32-NEXT: store i16 -32768, i16* [[B2]], align 2 +// CHECK-32-NEXT: [[TMP2:%.*]] = load i32, i32* [[A1]], align 4 +// CHECK-32-NEXT: [[OR:%.*]] = or i32 [[TMP2]], 1 +// CHECK-32-NEXT: store i32 [[OR]], i32* [[A1]], align 4 +// CHECK-32-NEXT: [[TMP3:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK-32-NEXT: [[CONV:%.*]] = sext i16 [[TMP3]] to i32 +// CHECK-32-NEXT: [[CMP:%.*]] = icmp sgt i32 99, [[CONV]] +// CHECK-32-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK-32: cond.true: +// CHECK-32-NEXT: br label [[COND_END:%.*]] +// CHECK-32: cond.false: +// CHECK-32-NEXT: [[TMP4:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK-32-NEXT: [[CONV3:%.*]] = sext i16 [[TMP4]] to i32 +// CHECK-32-NEXT: br label [[COND_END]] +// CHECK-32: cond.end: +// CHECK-32-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ] +// CHECK-32-NEXT: [[CONV4:%.*]] = trunc i32 [[COND]] to i16 +// CHECK-32-NEXT: store i16 [[CONV4]], i16* [[B2]], align 2 +// CHECK-32-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK-32-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK-32-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP8:%.*]] = bitcast i32* [[A1]] to i8* +// CHECK-32-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4 +// CHECK-32-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP10:%.*]] = bitcast i16* [[B2]] to i8* +// CHECK-32-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK-32-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK-32-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB2]], i32 [[TMP6]], i32 2, i32 8, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func7, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func8) +// CHECK-32-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1 +// CHECK-32-NEXT: br i1 [[TMP13]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK-32: .omp.reduction.then: +// CHECK-32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK-32-NEXT: [[TMP15:%.*]] = load i32, i32* [[A1]], align 4 +// CHECK-32-NEXT: [[OR5:%.*]] = or i32 [[TMP14]], [[TMP15]] +// CHECK-32-NEXT: store i32 [[OR5]], i32* [[TMP0]], align 4 +// CHECK-32-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK-32-NEXT: [[CONV6:%.*]] = sext i16 [[TMP16]] to i32 +// CHECK-32-NEXT: [[TMP17:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK-32-NEXT: [[CONV7:%.*]] = sext i16 [[TMP17]] to i32 +// CHECK-32-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]] +// CHECK-32-NEXT: br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]] +// CHECK-32: cond.true9: +// CHECK-32-NEXT: [[TMP18:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK-32-NEXT: br label [[COND_END11:%.*]] +// CHECK-32: cond.false10: +// CHECK-32-NEXT: [[TMP19:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK-32-NEXT: br label [[COND_END11]] +// CHECK-32: cond.end11: +// CHECK-32-NEXT: [[COND12:%.*]] = phi i16 [ [[TMP18]], [[COND_TRUE9]] ], [ [[TMP19]], [[COND_FALSE10]] ] +// CHECK-32-NEXT: store i16 [[COND12]], i16* [[TMP1]], align 2 +// CHECK-32-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]]) +// CHECK-32-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK-32: .omp.reduction.done: +// CHECK-32-NEXT: ret void +// +// +// CHECK-32-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func7 +// CHECK-32-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR2]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK-32-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK-32-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK-32-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK-32-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK-32-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2 +// CHECK-32-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK-32-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK-32-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK-32-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK-32-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK-32-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* +// CHECK-32-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK-32-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK-32-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK-32-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 +// CHECK-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32* +// CHECK-32-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 +// CHECK-32-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8* +// CHECK-32-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4 +// CHECK-32-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK-32-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK-32-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]]) +// CHECK-32-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4 +// CHECK-32-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 +// CHECK-32-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 +// CHECK-32-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8* +// CHECK-32-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4 +// CHECK-32-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4 +// CHECK-32-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16* +// CHECK-32-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 +// CHECK-32-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8* +// CHECK-32-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2 +// CHECK-32-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32 +// CHECK-32-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK-32-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 +// CHECK-32-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) +// CHECK-32-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16 +// CHECK-32-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2 +// CHECK-32-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 +// CHECK-32-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1 +// CHECK-32-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* +// CHECK-32-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4 +// CHECK-32-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK-32-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK-32-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK-32-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]] +// CHECK-32-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK-32-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1 +// CHECK-32-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0 +// CHECK-32-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]] +// CHECK-32-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK-32-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]] +// CHECK-32-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]] +// CHECK-32-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]] +// CHECK-32-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK-32: then: +// CHECK-32-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* +// CHECK-32-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK-32-NEXT: call void @"_omp$reduction$reduction_func6"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR4]] +// CHECK-32-NEXT: br label [[IFCONT:%.*]] +// CHECK-32: else: +// CHECK-32-NEXT: br label [[IFCONT]] +// CHECK-32: ifcont: +// CHECK-32-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK-32-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK-32-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]] +// CHECK-32-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]] +// CHECK-32: then6: +// CHECK-32-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4 +// CHECK-32-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4 +// CHECK-32-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32* +// CHECK-32-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32* +// CHECK-32-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4 +// CHECK-32-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4 +// CHECK-32-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4 +// CHECK-32-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4 +// CHECK-32-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16* +// CHECK-32-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16* +// CHECK-32-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2 +// CHECK-32-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2 +// CHECK-32-NEXT: br label [[IFCONT8:%.*]] +// CHECK-32: else7: +// CHECK-32-NEXT: br label [[IFCONT8]] +// CHECK-32: ifcont8: +// CHECK-32-NEXT: ret void +// +// +// CHECK-32-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func8 +// CHECK-32-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK-32-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK-32-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK-32-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK-32-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK-32-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK-32-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK-32-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK-32-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK-32-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK-32-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK-32-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]]) +// CHECK-32-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK-32-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK-32: then: +// CHECK-32-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 +// CHECK-32-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* +// CHECK-32-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK-32-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK-32-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4 +// CHECK-32-NEXT: br label [[IFCONT:%.*]] +// CHECK-32: else: +// CHECK-32-NEXT: br label [[IFCONT]] +// CHECK-32: ifcont: +// CHECK-32-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]]) +// CHECK-32-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK-32-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] +// CHECK-32-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK-32: then4: +// CHECK-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK-32-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 +// CHECK-32-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32* +// CHECK-32-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4 +// CHECK-32-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4 +// CHECK-32-NEXT: br label [[IFCONT6:%.*]] +// CHECK-32: else5: +// CHECK-32-NEXT: br label [[IFCONT6]] +// CHECK-32: ifcont6: +// CHECK-32-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]]) +// CHECK-32-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK-32-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] +// CHECK-32: then8: +// CHECK-32-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 +// CHECK-32-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16* +// CHECK-32-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK-32-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)* +// CHECK-32-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2 +// CHECK-32-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2 +// CHECK-32-NEXT: br label [[IFCONT10:%.*]] +// CHECK-32: else9: +// CHECK-32-NEXT: br label [[IFCONT10]] +// CHECK-32: ifcont10: +// CHECK-32-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]]) +// CHECK-32-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK-32-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]] +// CHECK-32-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] +// CHECK-32: then12: +// CHECK-32-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK-32-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)* +// CHECK-32-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4 +// CHECK-32-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16* +// CHECK-32-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2 +// CHECK-32-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2 +// CHECK-32-NEXT: br label [[IFCONT14:%.*]] +// CHECK-32: else13: +// CHECK-32-NEXT: br label [[IFCONT14]] +// CHECK-32: ifcont14: +// CHECK-32-NEXT: ret void +// +// +// +// +// +// +// +// +// +// +// +// +// +// +// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l27 +// CHECK-32-SAME: (double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR0]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4 +// CHECK-32-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 +// CHECK-32-NEXT: store double* [[E]], double** [[E_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 true) +// CHECK-32-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 +// CHECK-32-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +// CHECK-32: user_code.entry: +// CHECK-32-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK-32-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP4:%.*]] = bitcast double* [[TMP0]] to i8* +// CHECK-32-NEXT: store i8* [[TMP4]], i8** [[TMP3]], align 4 +// CHECK-32-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK-32-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, double*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP5]], i32 1) +// CHECK-32-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) +// CHECK-32-NEXT: ret void +// CHECK-32: worker.exit: +// CHECK-32-NEXT: ret void +// +// +// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK-32-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK-32-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK-32-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4 +// CHECK-32-NEXT: [[E1:%.*]] = alloca double, align 8 +// CHECK-32-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4 +// CHECK-32-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK-32-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK-32-NEXT: store double* [[E]], double** [[E_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4 +// CHECK-32-NEXT: store double 0.000000e+00, double* [[E1]], align 8 +// CHECK-32-NEXT: [[TMP1:%.*]] = load double, double* [[E1]], align 8 +// CHECK-32-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], 5.000000e+00 +// CHECK-32-NEXT: store double [[ADD]], double* [[E1]], align 8 +// CHECK-32-NEXT: [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK-32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 +// CHECK-32-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP5:%.*]] = bitcast double* [[E1]] to i8* +// CHECK-32-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4 +// CHECK-32-NEXT: [[TMP6:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK-32-NEXT: [[TMP7:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]], i32 1, i32 4, i8* [[TMP6]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func) +// CHECK-32-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 1 +// CHECK-32-NEXT: br i1 [[TMP8]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK-32: .omp.reduction.then: +// CHECK-32-NEXT: [[TMP9:%.*]] = load double, double* [[TMP0]], align 8 +// CHECK-32-NEXT: [[TMP10:%.*]] = load double, double* [[E1]], align 8 +// CHECK-32-NEXT: [[ADD2:%.*]] = fadd double [[TMP9]], [[TMP10]] +// CHECK-32-NEXT: store double [[ADD2]], double* [[TMP0]], align 8 +// CHECK-32-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP3]]) +// CHECK-32-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK-32: .omp.reduction.done: +// CHECK-32-NEXT: ret void +// +// +// CHECK-32-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func +// CHECK-32-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR2]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK-32-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK-32-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK-32-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK-32-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 4 +// CHECK-32-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8 +// CHECK-32-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK-32-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK-32-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK-32-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK-32-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK-32-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]* +// CHECK-32-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK-32-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK-32-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK-32-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 +// CHECK-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to double* +// CHECK-32-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[TMP12]], i32 1 +// CHECK-32-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8* +// CHECK-32-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP12]] to i64* +// CHECK-32-NEXT: [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64* +// CHECK-32-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8 +// CHECK-32-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK-32-NEXT: [[TMP18:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK-32-NEXT: [[TMP19:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP18]]) +// CHECK-32-NEXT: store i64 [[TMP19]], i64* [[TMP16]], align 8 +// CHECK-32-NEXT: [[TMP20:%.*]] = getelementptr i64, i64* [[TMP15]], i32 1 +// CHECK-32-NEXT: [[TMP21:%.*]] = getelementptr i64, i64* [[TMP16]], i32 1 +// CHECK-32-NEXT: [[TMP22:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8* +// CHECK-32-NEXT: store i8* [[TMP22]], i8** [[TMP11]], align 4 +// CHECK-32-NEXT: [[TMP23:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK-32-NEXT: [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK-32-NEXT: [[TMP25:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK-32-NEXT: [[TMP26:%.*]] = and i1 [[TMP24]], [[TMP25]] +// CHECK-32-NEXT: [[TMP27:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK-32-NEXT: [[TMP28:%.*]] = and i16 [[TMP6]], 1 +// CHECK-32-NEXT: [[TMP29:%.*]] = icmp eq i16 [[TMP28]], 0 +// CHECK-32-NEXT: [[TMP30:%.*]] = and i1 [[TMP27]], [[TMP29]] +// CHECK-32-NEXT: [[TMP31:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK-32-NEXT: [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]] +// CHECK-32-NEXT: [[TMP33:%.*]] = or i1 [[TMP23]], [[TMP26]] +// CHECK-32-NEXT: [[TMP34:%.*]] = or i1 [[TMP33]], [[TMP32]] +// CHECK-32-NEXT: br i1 [[TMP34]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK-32: then: +// CHECK-32-NEXT: [[TMP35:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8* +// CHECK-32-NEXT: [[TMP36:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK-32-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP35]], i8* [[TMP36]]) #[[ATTR4]] +// CHECK-32-NEXT: br label [[IFCONT:%.*]] +// CHECK-32: else: +// CHECK-32-NEXT: br label [[IFCONT]] +// CHECK-32: ifcont: +// CHECK-32-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK-32-NEXT: [[TMP38:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK-32-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]] +// CHECK-32-NEXT: br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK-32: then4: +// CHECK-32-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP41:%.*]] = load i8*, i8** [[TMP40]], align 4 +// CHECK-32-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 4 +// CHECK-32-NEXT: [[TMP44:%.*]] = bitcast i8* [[TMP41]] to double* +// CHECK-32-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP43]] to double* +// CHECK-32-NEXT: [[TMP46:%.*]] = load double, double* [[TMP44]], align 8 +// CHECK-32-NEXT: store double [[TMP46]], double* [[TMP45]], align 8 +// CHECK-32-NEXT: br label [[IFCONT6:%.*]] +// CHECK-32: else5: +// CHECK-32-NEXT: br label [[IFCONT6]] +// CHECK-32: ifcont6: +// CHECK-32-NEXT: ret void +// +// +// CHECK-32-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func +// CHECK-32-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK-32-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK-32-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK-32-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK-32-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK-32-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK-32-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK-32-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK-32-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK-32-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK-32-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* +// CHECK-32-NEXT: store i32 0, i32* [[DOTCNT_ADDR]], align 4 +// CHECK-32-NEXT: br label [[PRECOND:%.*]] +// CHECK-32: precond: +// CHECK-32-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP5]], 2 +// CHECK-32-NEXT: br i1 [[TMP6]], label [[BODY:%.*]], label [[EXIT:%.*]] +// CHECK-32: body: +// CHECK-32-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]]) +// CHECK-32-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK-32-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK-32: then: +// CHECK-32-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP8:%.*]] = load i8*, i8** [[TMP7]], align 4 +// CHECK-32-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32* +// CHECK-32-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 [[TMP5]] +// CHECK-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK-32-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK-32-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4 +// CHECK-32-NEXT: br label [[IFCONT:%.*]] +// CHECK-32: else: +// CHECK-32-NEXT: br label [[IFCONT]] +// CHECK-32: ifcont: +// CHECK-32-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]]) +// CHECK-32-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK-32-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP13]] +// CHECK-32-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK-32: then4: +// CHECK-32-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK-32-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 4 +// CHECK-32-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32* +// CHECK-32-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP17]], i32 [[TMP5]] +// CHECK-32-NEXT: [[TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4 +// CHECK-32-NEXT: store i32 [[TMP19]], i32* [[TMP18]], align 4 +// CHECK-32-NEXT: br label [[IFCONT6:%.*]] +// CHECK-32: else5: +// CHECK-32-NEXT: br label [[IFCONT6]] +// CHECK-32: ifcont6: +// CHECK-32-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP5]], 1 +// CHECK-32-NEXT: store i32 [[TMP20]], i32* [[DOTCNT_ADDR]], align 4 +// CHECK-32-NEXT: br label [[PRECOND]] +// CHECK-32: exit: +// CHECK-32-NEXT: ret void +// +// +// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l32 +// CHECK-32-SAME: (i8* nonnull align 1 dereferenceable(1) [[C:%.*]], float* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR0]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[C_ADDR:%.*]] = alloca i8*, align 4 +// CHECK-32-NEXT: [[D_ADDR:%.*]] = alloca float*, align 4 +// CHECK-32-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4 +// CHECK-32-NEXT: store i8* [[C]], i8** [[C_ADDR]], align 4 +// CHECK-32-NEXT: store float* [[D]], float** [[D_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 true) +// CHECK-32-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP2]], -1 +// CHECK-32-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +// CHECK-32: user_code.entry: +// CHECK-32-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK-32-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK-32-NEXT: store i8* [[TMP0]], i8** [[TMP4]], align 4 +// CHECK-32-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP1]] to i8* +// CHECK-32-NEXT: store i8* [[TMP6]], i8** [[TMP5]], align 4 +// CHECK-32-NEXT: [[TMP7:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK-32-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i8*, float*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP7]], i32 2) +// CHECK-32-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) +// CHECK-32-NEXT: ret void +// CHECK-32: worker.exit: +// CHECK-32-NEXT: ret void +// +// +// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK-32-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8* nonnull align 1 dereferenceable(1) [[C:%.*]], float* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK-32-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK-32-NEXT: [[C_ADDR:%.*]] = alloca i8*, align 4 +// CHECK-32-NEXT: [[D_ADDR:%.*]] = alloca float*, align 4 +// CHECK-32-NEXT: [[C1:%.*]] = alloca i8, align 1 +// CHECK-32-NEXT: [[D2:%.*]] = alloca float, align 4 +// CHECK-32-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK-32-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK-32-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK-32-NEXT: store i8* [[C]], i8** [[C_ADDR]], align 4 +// CHECK-32-NEXT: store float* [[D]], float** [[D_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 4 +// CHECK-32-NEXT: store i8 0, i8* [[C1]], align 1 +// CHECK-32-NEXT: store float 1.000000e+00, float* [[D2]], align 4 +// CHECK-32-NEXT: [[TMP2:%.*]] = load i8, i8* [[C1]], align 1 +// CHECK-32-NEXT: [[CONV:%.*]] = sext i8 [[TMP2]] to i32 +// CHECK-32-NEXT: [[XOR:%.*]] = xor i32 [[CONV]], 2 +// CHECK-32-NEXT: [[CONV3:%.*]] = trunc i32 [[XOR]] to i8 +// CHECK-32-NEXT: store i8 [[CONV3]], i8* [[C1]], align 1 +// CHECK-32-NEXT: [[TMP3:%.*]] = load float, float* [[D2]], align 4 +// CHECK-32-NEXT: [[MUL:%.*]] = fmul float [[TMP3]], 3.300000e+01 +// CHECK-32-NEXT: store float [[MUL]], float* [[D2]], align 4 +// CHECK-32-NEXT: [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK-32-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4 +// CHECK-32-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK-32-NEXT: store i8* [[C1]], i8** [[TMP6]], align 4 +// CHECK-32-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP8:%.*]] = bitcast float* [[D2]] to i8* +// CHECK-32-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4 +// CHECK-32-NEXT: [[TMP9:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK-32-NEXT: [[TMP10:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB2]], i32 [[TMP5]], i32 2, i32 8, i8* [[TMP9]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func3, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func4) +// CHECK-32-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 1 +// CHECK-32-NEXT: br i1 [[TMP11]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK-32: .omp.reduction.then: +// CHECK-32-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP0]], align 1 +// CHECK-32-NEXT: [[CONV4:%.*]] = sext i8 [[TMP12]] to i32 +// CHECK-32-NEXT: [[TMP13:%.*]] = load i8, i8* [[C1]], align 1 +// CHECK-32-NEXT: [[CONV5:%.*]] = sext i8 [[TMP13]] to i32 +// CHECK-32-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]] +// CHECK-32-NEXT: [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8 +// CHECK-32-NEXT: store i8 [[CONV7]], i8* [[TMP0]], align 1 +// CHECK-32-NEXT: [[TMP14:%.*]] = load float, float* [[TMP1]], align 4 +// CHECK-32-NEXT: [[TMP15:%.*]] = load float, float* [[D2]], align 4 +// CHECK-32-NEXT: [[MUL8:%.*]] = fmul float [[TMP14]], [[TMP15]] +// CHECK-32-NEXT: store float [[MUL8]], float* [[TMP1]], align 4 +// CHECK-32-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]]) +// CHECK-32-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK-32: .omp.reduction.done: +// CHECK-32-NEXT: ret void +// +// +// CHECK-32-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func3 +// CHECK-32-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR2]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK-32-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK-32-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK-32-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK-32-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK-32-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1 +// CHECK-32-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4 +// CHECK-32-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK-32-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK-32-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK-32-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK-32-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK-32-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* +// CHECK-32-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK-32-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK-32-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK-32-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 +// CHECK-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1 +// CHECK-32-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP10]], align 1 +// CHECK-32-NEXT: [[TMP14:%.*]] = sext i8 [[TMP13]] to i32 +// CHECK-32-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK-32-NEXT: [[TMP15:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK-32-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP14]], i16 [[TMP7]], i16 [[TMP15]]) +// CHECK-32-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8 +// CHECK-32-NEXT: store i8 [[TMP17]], i8* [[DOTOMP_REDUCTION_ELEMENT]], align 1 +// CHECK-32-NEXT: [[TMP18:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1 +// CHECK-32-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 +// CHECK-32-NEXT: store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 4 +// CHECK-32-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP21:%.*]] = load i8*, i8** [[TMP20]], align 4 +// CHECK-32-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP23:%.*]] = bitcast i8* [[TMP21]] to float* +// CHECK-32-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[TMP23]], i32 1 +// CHECK-32-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to i8* +// CHECK-32-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP23]] to i32* +// CHECK-32-NEXT: [[TMP27:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32* +// CHECK-32-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK-32-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK-32-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 +// CHECK-32-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) +// CHECK-32-NEXT: store i32 [[TMP30]], i32* [[TMP27]], align 4 +// CHECK-32-NEXT: [[TMP31:%.*]] = getelementptr i32, i32* [[TMP26]], i32 1 +// CHECK-32-NEXT: [[TMP32:%.*]] = getelementptr i32, i32* [[TMP27]], i32 1 +// CHECK-32-NEXT: [[TMP33:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* +// CHECK-32-NEXT: store i8* [[TMP33]], i8** [[TMP22]], align 4 +// CHECK-32-NEXT: [[TMP34:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK-32-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK-32-NEXT: [[TMP36:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK-32-NEXT: [[TMP37:%.*]] = and i1 [[TMP35]], [[TMP36]] +// CHECK-32-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK-32-NEXT: [[TMP39:%.*]] = and i16 [[TMP6]], 1 +// CHECK-32-NEXT: [[TMP40:%.*]] = icmp eq i16 [[TMP39]], 0 +// CHECK-32-NEXT: [[TMP41:%.*]] = and i1 [[TMP38]], [[TMP40]] +// CHECK-32-NEXT: [[TMP42:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK-32-NEXT: [[TMP43:%.*]] = and i1 [[TMP41]], [[TMP42]] +// CHECK-32-NEXT: [[TMP44:%.*]] = or i1 [[TMP34]], [[TMP37]] +// CHECK-32-NEXT: [[TMP45:%.*]] = or i1 [[TMP44]], [[TMP43]] +// CHECK-32-NEXT: br i1 [[TMP45]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK-32: then: +// CHECK-32-NEXT: [[TMP46:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* +// CHECK-32-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK-32-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP46]], i8* [[TMP47]]) #[[ATTR4]] +// CHECK-32-NEXT: br label [[IFCONT:%.*]] +// CHECK-32: else: +// CHECK-32-NEXT: br label [[IFCONT]] +// CHECK-32: ifcont: +// CHECK-32-NEXT: [[TMP48:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK-32-NEXT: [[TMP49:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK-32-NEXT: [[TMP50:%.*]] = and i1 [[TMP48]], [[TMP49]] +// CHECK-32-NEXT: br i1 [[TMP50]], label [[THEN6:%.*]], label [[ELSE7:%.*]] +// CHECK-32: then6: +// CHECK-32-NEXT: [[TMP51:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP52:%.*]] = load i8*, i8** [[TMP51]], align 4 +// CHECK-32-NEXT: [[TMP53:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP54:%.*]] = load i8*, i8** [[TMP53]], align 4 +// CHECK-32-NEXT: [[TMP55:%.*]] = load i8, i8* [[TMP52]], align 1 +// CHECK-32-NEXT: store i8 [[TMP55]], i8* [[TMP54]], align 1 +// CHECK-32-NEXT: [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 4 +// CHECK-32-NEXT: [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP59:%.*]] = load i8*, i8** [[TMP58]], align 4 +// CHECK-32-NEXT: [[TMP60:%.*]] = bitcast i8* [[TMP57]] to float* +// CHECK-32-NEXT: [[TMP61:%.*]] = bitcast i8* [[TMP59]] to float* +// CHECK-32-NEXT: [[TMP62:%.*]] = load float, float* [[TMP60]], align 4 +// CHECK-32-NEXT: store float [[TMP62]], float* [[TMP61]], align 4 +// CHECK-32-NEXT: br label [[IFCONT8:%.*]] +// CHECK-32: else7: +// CHECK-32-NEXT: br label [[IFCONT8]] +// CHECK-32: ifcont8: +// CHECK-32-NEXT: ret void +// +// +// CHECK-32-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func4 +// CHECK-32-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK-32-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK-32-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK-32-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK-32-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK-32-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK-32-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK-32-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK-32-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK-32-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK-32-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK-32-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]]) +// CHECK-32-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK-32-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK-32: then: +// CHECK-32-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 +// CHECK-32-NEXT: [[TMP7:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK-32-NEXT: [[TMP8:%.*]] = bitcast i32 addrspace(3)* [[TMP7]] to i8 addrspace(3)* +// CHECK-32-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP6]], align 1 +// CHECK-32-NEXT: store volatile i8 [[TMP9]], i8 addrspace(3)* [[TMP8]], align 1 +// CHECK-32-NEXT: br label [[IFCONT:%.*]] +// CHECK-32: else: +// CHECK-32-NEXT: br label [[IFCONT]] +// CHECK-32: ifcont: +// CHECK-32-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]]) +// CHECK-32-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK-32-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] +// CHECK-32-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK-32: then4: +// CHECK-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK-32-NEXT: [[TMP12:%.*]] = bitcast i32 addrspace(3)* [[TMP11]] to i8 addrspace(3)* +// CHECK-32-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4 +// CHECK-32-NEXT: [[TMP15:%.*]] = load volatile i8, i8 addrspace(3)* [[TMP12]], align 1 +// CHECK-32-NEXT: store i8 [[TMP15]], i8* [[TMP14]], align 1 +// CHECK-32-NEXT: br label [[IFCONT6:%.*]] +// CHECK-32: else5: +// CHECK-32-NEXT: br label [[IFCONT6]] +// CHECK-32: ifcont6: +// CHECK-32-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]]) +// CHECK-32-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK-32-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] +// CHECK-32: then8: +// CHECK-32-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 +// CHECK-32-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i32* +// CHECK-32-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK-32-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP18]], align 4 +// CHECK-32-NEXT: store volatile i32 [[TMP20]], i32 addrspace(3)* [[TMP19]], align 4 +// CHECK-32-NEXT: br label [[IFCONT10:%.*]] +// CHECK-32: else9: +// CHECK-32-NEXT: br label [[IFCONT10]] +// CHECK-32: ifcont10: +// CHECK-32-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]]) +// CHECK-32-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK-32-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP21]] +// CHECK-32-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] +// CHECK-32: then12: +// CHECK-32-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK-32-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP24:%.*]] = load i8*, i8** [[TMP23]], align 4 +// CHECK-32-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP24]] to i32* +// CHECK-32-NEXT: [[TMP26:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP22]], align 4 +// CHECK-32-NEXT: store i32 [[TMP26]], i32* [[TMP25]], align 4 +// CHECK-32-NEXT: br label [[IFCONT14:%.*]] +// CHECK-32: else13: +// CHECK-32-NEXT: br label [[IFCONT14]] +// CHECK-32: ifcont14: +// CHECK-32-NEXT: ret void +// +// +// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l38 +// CHECK-32-SAME: (i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR0]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK-32-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4 +// CHECK-32-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4 +// CHECK-32-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK-32-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 true) +// CHECK-32-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP2]], -1 +// CHECK-32-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +// CHECK-32: user_code.entry: +// CHECK-32-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK-32-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP0]] to i8* +// CHECK-32-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4 +// CHECK-32-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP7:%.*]] = bitcast i16* [[TMP1]] to i8* +// CHECK-32-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4 +// CHECK-32-NEXT: [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK-32-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP8]], i32 2) +// CHECK-32-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) +// CHECK-32-NEXT: ret void +// CHECK-32: worker.exit: +// CHECK-32-NEXT: ret void +// +// +// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__5 +// CHECK-32-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK-32-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK-32-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK-32-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4 +// CHECK-32-NEXT: [[A1:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[B2:%.*]] = alloca i16, align 2 +// CHECK-32-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK-32-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK-32-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK-32-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK-32-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4 +// CHECK-32-NEXT: store i32 0, i32* [[A1]], align 4 +// CHECK-32-NEXT: store i16 -32768, i16* [[B2]], align 2 +// CHECK-32-NEXT: [[TMP2:%.*]] = load i32, i32* [[A1]], align 4 +// CHECK-32-NEXT: [[OR:%.*]] = or i32 [[TMP2]], 1 +// CHECK-32-NEXT: store i32 [[OR]], i32* [[A1]], align 4 +// CHECK-32-NEXT: [[TMP3:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK-32-NEXT: [[CONV:%.*]] = sext i16 [[TMP3]] to i32 +// CHECK-32-NEXT: [[CMP:%.*]] = icmp sgt i32 99, [[CONV]] +// CHECK-32-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK-32: cond.true: +// CHECK-32-NEXT: br label [[COND_END:%.*]] +// CHECK-32: cond.false: +// CHECK-32-NEXT: [[TMP4:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK-32-NEXT: [[CONV3:%.*]] = sext i16 [[TMP4]] to i32 +// CHECK-32-NEXT: br label [[COND_END]] +// CHECK-32: cond.end: +// CHECK-32-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ] +// CHECK-32-NEXT: [[CONV4:%.*]] = trunc i32 [[COND]] to i16 +// CHECK-32-NEXT: store i16 [[CONV4]], i16* [[B2]], align 2 +// CHECK-32-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK-32-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK-32-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP8:%.*]] = bitcast i32* [[A1]] to i8* +// CHECK-32-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4 +// CHECK-32-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP10:%.*]] = bitcast i16* [[B2]] to i8* +// CHECK-32-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK-32-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK-32-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB2]], i32 [[TMP6]], i32 2, i32 8, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func7, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func8) +// CHECK-32-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1 +// CHECK-32-NEXT: br i1 [[TMP13]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK-32: .omp.reduction.then: +// CHECK-32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK-32-NEXT: [[TMP15:%.*]] = load i32, i32* [[A1]], align 4 +// CHECK-32-NEXT: [[OR5:%.*]] = or i32 [[TMP14]], [[TMP15]] +// CHECK-32-NEXT: store i32 [[OR5]], i32* [[TMP0]], align 4 +// CHECK-32-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK-32-NEXT: [[CONV6:%.*]] = sext i16 [[TMP16]] to i32 +// CHECK-32-NEXT: [[TMP17:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK-32-NEXT: [[CONV7:%.*]] = sext i16 [[TMP17]] to i32 +// CHECK-32-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]] +// CHECK-32-NEXT: br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]] +// CHECK-32: cond.true9: +// CHECK-32-NEXT: [[TMP18:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK-32-NEXT: br label [[COND_END11:%.*]] +// CHECK-32: cond.false10: +// CHECK-32-NEXT: [[TMP19:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK-32-NEXT: br label [[COND_END11]] +// CHECK-32: cond.end11: +// CHECK-32-NEXT: [[COND12:%.*]] = phi i16 [ [[TMP18]], [[COND_TRUE9]] ], [ [[TMP19]], [[COND_FALSE10]] ] +// CHECK-32-NEXT: store i16 [[COND12]], i16* [[TMP1]], align 2 +// CHECK-32-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]]) +// CHECK-32-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK-32: .omp.reduction.done: +// CHECK-32-NEXT: ret void +// +// +// CHECK-32-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func7 +// CHECK-32-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR2]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK-32-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK-32-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK-32-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK-32-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK-32-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2 +// CHECK-32-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK-32-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK-32-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK-32-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK-32-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK-32-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* +// CHECK-32-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK-32-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK-32-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK-32-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 +// CHECK-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32* +// CHECK-32-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 +// CHECK-32-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8* +// CHECK-32-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4 +// CHECK-32-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK-32-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK-32-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]]) +// CHECK-32-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4 +// CHECK-32-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 +// CHECK-32-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 +// CHECK-32-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8* +// CHECK-32-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4 +// CHECK-32-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4 +// CHECK-32-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16* +// CHECK-32-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 +// CHECK-32-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8* +// CHECK-32-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2 +// CHECK-32-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32 +// CHECK-32-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK-32-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 +// CHECK-32-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) +// CHECK-32-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16 +// CHECK-32-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2 +// CHECK-32-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 +// CHECK-32-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1 +// CHECK-32-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* +// CHECK-32-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4 +// CHECK-32-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK-32-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK-32-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK-32-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]] +// CHECK-32-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK-32-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1 +// CHECK-32-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0 +// CHECK-32-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]] +// CHECK-32-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK-32-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]] +// CHECK-32-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]] +// CHECK-32-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]] +// CHECK-32-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK-32: then: +// CHECK-32-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* +// CHECK-32-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK-32-NEXT: call void @"_omp$reduction$reduction_func6"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR4]] +// CHECK-32-NEXT: br label [[IFCONT:%.*]] +// CHECK-32: else: +// CHECK-32-NEXT: br label [[IFCONT]] +// CHECK-32: ifcont: +// CHECK-32-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK-32-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK-32-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]] +// CHECK-32-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]] +// CHECK-32: then6: +// CHECK-32-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4 +// CHECK-32-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4 +// CHECK-32-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32* +// CHECK-32-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32* +// CHECK-32-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4 +// CHECK-32-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4 +// CHECK-32-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4 +// CHECK-32-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4 +// CHECK-32-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16* +// CHECK-32-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16* +// CHECK-32-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2 +// CHECK-32-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2 +// CHECK-32-NEXT: br label [[IFCONT8:%.*]] +// CHECK-32: else7: +// CHECK-32-NEXT: br label [[IFCONT8]] +// CHECK-32: ifcont8: +// CHECK-32-NEXT: ret void +// +// +// CHECK-32-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func8 +// CHECK-32-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK-32-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK-32-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK-32-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK-32-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK-32-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK-32-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK-32-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK-32-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK-32-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK-32-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK-32-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]]) +// CHECK-32-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK-32-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK-32: then: +// CHECK-32-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 +// CHECK-32-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* +// CHECK-32-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK-32-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK-32-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4 +// CHECK-32-NEXT: br label [[IFCONT:%.*]] +// CHECK-32: else: +// CHECK-32-NEXT: br label [[IFCONT]] +// CHECK-32: ifcont: +// CHECK-32-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]]) +// CHECK-32-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK-32-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] +// CHECK-32-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK-32: then4: +// CHECK-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK-32-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 +// CHECK-32-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32* +// CHECK-32-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4 +// CHECK-32-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4 +// CHECK-32-NEXT: br label [[IFCONT6:%.*]] +// CHECK-32: else5: +// CHECK-32-NEXT: br label [[IFCONT6]] +// CHECK-32: ifcont6: +// CHECK-32-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]]) +// CHECK-32-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK-32-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] +// CHECK-32: then8: +// CHECK-32-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 +// CHECK-32-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16* +// CHECK-32-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK-32-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)* +// CHECK-32-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2 +// CHECK-32-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2 +// CHECK-32-NEXT: br label [[IFCONT10:%.*]] +// CHECK-32: else9: +// CHECK-32-NEXT: br label [[IFCONT10]] +// CHECK-32: ifcont10: +// CHECK-32-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]]) +// CHECK-32-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK-32-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]] +// CHECK-32-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] +// CHECK-32: then12: +// CHECK-32-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK-32-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)* +// CHECK-32-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4 +// CHECK-32-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16* +// CHECK-32-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2 +// CHECK-32-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2 +// CHECK-32-NEXT: br label [[IFCONT14:%.*]] +// CHECK-32: else13: +// CHECK-32-NEXT: br label [[IFCONT14]] +// CHECK-32: ifcont14: +// CHECK-32-NEXT: ret void +// +// +// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l27 +// CHECK-32-SAME: (double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR0]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4 +// CHECK-32-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 +// CHECK-32-NEXT: store double* [[E]], double** [[E_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 true) +// CHECK-32-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 +// CHECK-32-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +// CHECK-32: user_code.entry: +// CHECK-32-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK-32-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP4:%.*]] = bitcast double* [[TMP0]] to i8* +// CHECK-32-NEXT: store i8* [[TMP4]], i8** [[TMP3]], align 4 +// CHECK-32-NEXT: [[TMP5:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK-32-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP2]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, double*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP5]], i32 1) +// CHECK-32-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) +// CHECK-32-NEXT: ret void +// CHECK-32: worker.exit: +// CHECK-32-NEXT: ret void +// +// +// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__ +// CHECK-32-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], double* nonnull align 8 dereferenceable(8) [[E:%.*]]) #[[ATTR1]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK-32-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK-32-NEXT: [[E_ADDR:%.*]] = alloca double*, align 4 +// CHECK-32-NEXT: [[E1:%.*]] = alloca double, align 8 +// CHECK-32-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [1 x i8*], align 4 +// CHECK-32-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK-32-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK-32-NEXT: store double* [[E]], double** [[E_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4 +// CHECK-32-NEXT: store double 0.000000e+00, double* [[E1]], align 8 +// CHECK-32-NEXT: [[TMP1:%.*]] = load double, double* [[E1]], align 8 +// CHECK-32-NEXT: [[ADD:%.*]] = fadd double [[TMP1]], 5.000000e+00 +// CHECK-32-NEXT: store double [[ADD]], double* [[E1]], align 8 +// CHECK-32-NEXT: [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK-32-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4 +// CHECK-32-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP5:%.*]] = bitcast double* [[E1]] to i8* +// CHECK-32-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4 +// CHECK-32-NEXT: [[TMP6:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK-32-NEXT: [[TMP7:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]], i32 1, i32 4, i8* [[TMP6]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func) +// CHECK-32-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP7]], 1 +// CHECK-32-NEXT: br i1 [[TMP8]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK-32: .omp.reduction.then: +// CHECK-32-NEXT: [[TMP9:%.*]] = load double, double* [[TMP0]], align 8 +// CHECK-32-NEXT: [[TMP10:%.*]] = load double, double* [[E1]], align 8 +// CHECK-32-NEXT: [[ADD2:%.*]] = fadd double [[TMP9]], [[TMP10]] +// CHECK-32-NEXT: store double [[ADD2]], double* [[TMP0]], align 8 +// CHECK-32-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP3]]) +// CHECK-32-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK-32: .omp.reduction.done: +// CHECK-32-NEXT: ret void +// +// +// CHECK-32-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func +// CHECK-32-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR2]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK-32-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK-32-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK-32-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK-32-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [1 x i8*], align 4 +// CHECK-32-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca double, align 8 +// CHECK-32-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK-32-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK-32-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK-32-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK-32-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK-32-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [1 x i8*]* +// CHECK-32-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK-32-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK-32-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK-32-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 +// CHECK-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to double* +// CHECK-32-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[TMP12]], i32 1 +// CHECK-32-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to i8* +// CHECK-32-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP12]] to i64* +// CHECK-32-NEXT: [[TMP16:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i64* +// CHECK-32-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP15]], align 8 +// CHECK-32-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK-32-NEXT: [[TMP18:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK-32-NEXT: [[TMP19:%.*]] = call i64 @__kmpc_shuffle_int64(i64 [[TMP17]], i16 [[TMP7]], i16 [[TMP18]]) +// CHECK-32-NEXT: store i64 [[TMP19]], i64* [[TMP16]], align 8 +// CHECK-32-NEXT: [[TMP20:%.*]] = getelementptr i64, i64* [[TMP15]], i32 1 +// CHECK-32-NEXT: [[TMP21:%.*]] = getelementptr i64, i64* [[TMP16]], i32 1 +// CHECK-32-NEXT: [[TMP22:%.*]] = bitcast double* [[DOTOMP_REDUCTION_ELEMENT]] to i8* +// CHECK-32-NEXT: store i8* [[TMP22]], i8** [[TMP11]], align 4 +// CHECK-32-NEXT: [[TMP23:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK-32-NEXT: [[TMP24:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK-32-NEXT: [[TMP25:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK-32-NEXT: [[TMP26:%.*]] = and i1 [[TMP24]], [[TMP25]] +// CHECK-32-NEXT: [[TMP27:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK-32-NEXT: [[TMP28:%.*]] = and i16 [[TMP6]], 1 +// CHECK-32-NEXT: [[TMP29:%.*]] = icmp eq i16 [[TMP28]], 0 +// CHECK-32-NEXT: [[TMP30:%.*]] = and i1 [[TMP27]], [[TMP29]] +// CHECK-32-NEXT: [[TMP31:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK-32-NEXT: [[TMP32:%.*]] = and i1 [[TMP30]], [[TMP31]] +// CHECK-32-NEXT: [[TMP33:%.*]] = or i1 [[TMP23]], [[TMP26]] +// CHECK-32-NEXT: [[TMP34:%.*]] = or i1 [[TMP33]], [[TMP32]] +// CHECK-32-NEXT: br i1 [[TMP34]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK-32: then: +// CHECK-32-NEXT: [[TMP35:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8* +// CHECK-32-NEXT: [[TMP36:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK-32-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP35]], i8* [[TMP36]]) #[[ATTR4]] +// CHECK-32-NEXT: br label [[IFCONT:%.*]] +// CHECK-32: else: +// CHECK-32-NEXT: br label [[IFCONT]] +// CHECK-32: ifcont: +// CHECK-32-NEXT: [[TMP37:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK-32-NEXT: [[TMP38:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK-32-NEXT: [[TMP39:%.*]] = and i1 [[TMP37]], [[TMP38]] +// CHECK-32-NEXT: br i1 [[TMP39]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK-32: then4: +// CHECK-32-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP41:%.*]] = load i8*, i8** [[TMP40]], align 4 +// CHECK-32-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP43:%.*]] = load i8*, i8** [[TMP42]], align 4 +// CHECK-32-NEXT: [[TMP44:%.*]] = bitcast i8* [[TMP41]] to double* +// CHECK-32-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP43]] to double* +// CHECK-32-NEXT: [[TMP46:%.*]] = load double, double* [[TMP44]], align 8 +// CHECK-32-NEXT: store double [[TMP46]], double* [[TMP45]], align 8 +// CHECK-32-NEXT: br label [[IFCONT6:%.*]] +// CHECK-32: else5: +// CHECK-32-NEXT: br label [[IFCONT6]] +// CHECK-32: ifcont6: +// CHECK-32-NEXT: ret void +// +// +// CHECK-32-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func +// CHECK-32-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK-32-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[DOTCNT_ADDR:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK-32-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK-32-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK-32-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK-32-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK-32-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK-32-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK-32-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK-32-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK-32-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [1 x i8*]* +// CHECK-32-NEXT: store i32 0, i32* [[DOTCNT_ADDR]], align 4 +// CHECK-32-NEXT: br label [[PRECOND:%.*]] +// CHECK-32: precond: +// CHECK-32-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTCNT_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP5]], 2 +// CHECK-32-NEXT: br i1 [[TMP6]], label [[BODY:%.*]], label [[EXIT:%.*]] +// CHECK-32: body: +// CHECK-32-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]]) +// CHECK-32-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK-32-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK-32: then: +// CHECK-32-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP8:%.*]] = load i8*, i8** [[TMP7]], align 4 +// CHECK-32-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to i32* +// CHECK-32-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 [[TMP5]] +// CHECK-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK-32-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP10]], align 4 +// CHECK-32-NEXT: store volatile i32 [[TMP12]], i32 addrspace(3)* [[TMP11]], align 4 +// CHECK-32-NEXT: br label [[IFCONT:%.*]] +// CHECK-32: else: +// CHECK-32-NEXT: br label [[IFCONT]] +// CHECK-32: ifcont: +// CHECK-32-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]]) +// CHECK-32-NEXT: [[TMP13:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK-32-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP13]] +// CHECK-32-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK-32: then4: +// CHECK-32-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK-32-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP16:%.*]] = load i8*, i8** [[TMP15]], align 4 +// CHECK-32-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to i32* +// CHECK-32-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP17]], i32 [[TMP5]] +// CHECK-32-NEXT: [[TMP19:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP14]], align 4 +// CHECK-32-NEXT: store i32 [[TMP19]], i32* [[TMP18]], align 4 +// CHECK-32-NEXT: br label [[IFCONT6:%.*]] +// CHECK-32: else5: +// CHECK-32-NEXT: br label [[IFCONT6]] +// CHECK-32: ifcont6: +// CHECK-32-NEXT: [[TMP20:%.*]] = add nsw i32 [[TMP5]], 1 +// CHECK-32-NEXT: store i32 [[TMP20]], i32* [[DOTCNT_ADDR]], align 4 +// CHECK-32-NEXT: br label [[PRECOND]] +// CHECK-32: exit: +// CHECK-32-NEXT: ret void +// +// +// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l32 +// CHECK-32-SAME: (i8* nonnull align 1 dereferenceable(1) [[C:%.*]], float* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR0]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[C_ADDR:%.*]] = alloca i8*, align 4 +// CHECK-32-NEXT: [[D_ADDR:%.*]] = alloca float*, align 4 +// CHECK-32-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4 +// CHECK-32-NEXT: store i8* [[C]], i8** [[C_ADDR]], align 4 +// CHECK-32-NEXT: store float* [[D]], float** [[D_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 true) +// CHECK-32-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP2]], -1 +// CHECK-32-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +// CHECK-32: user_code.entry: +// CHECK-32-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK-32-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK-32-NEXT: store i8* [[TMP0]], i8** [[TMP4]], align 4 +// CHECK-32-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP1]] to i8* +// CHECK-32-NEXT: store i8* [[TMP6]], i8** [[TMP5]], align 4 +// CHECK-32-NEXT: [[TMP7:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK-32-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i8*, float*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP7]], i32 2) +// CHECK-32-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) +// CHECK-32-NEXT: ret void +// CHECK-32: worker.exit: +// CHECK-32-NEXT: ret void +// +// +// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__1 +// CHECK-32-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i8* nonnull align 1 dereferenceable(1) [[C:%.*]], float* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK-32-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK-32-NEXT: [[C_ADDR:%.*]] = alloca i8*, align 4 +// CHECK-32-NEXT: [[D_ADDR:%.*]] = alloca float*, align 4 +// CHECK-32-NEXT: [[C1:%.*]] = alloca i8, align 1 +// CHECK-32-NEXT: [[D2:%.*]] = alloca float, align 4 +// CHECK-32-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK-32-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK-32-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK-32-NEXT: store i8* [[C]], i8** [[C_ADDR]], align 4 +// CHECK-32-NEXT: store float* [[D]], float** [[D_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP0:%.*]] = load i8*, i8** [[C_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP1:%.*]] = load float*, float** [[D_ADDR]], align 4 +// CHECK-32-NEXT: store i8 0, i8* [[C1]], align 1 +// CHECK-32-NEXT: store float 1.000000e+00, float* [[D2]], align 4 +// CHECK-32-NEXT: [[TMP2:%.*]] = load i8, i8* [[C1]], align 1 +// CHECK-32-NEXT: [[CONV:%.*]] = sext i8 [[TMP2]] to i32 +// CHECK-32-NEXT: [[XOR:%.*]] = xor i32 [[CONV]], 2 +// CHECK-32-NEXT: [[CONV3:%.*]] = trunc i32 [[XOR]] to i8 +// CHECK-32-NEXT: store i8 [[CONV3]], i8* [[C1]], align 1 +// CHECK-32-NEXT: [[TMP3:%.*]] = load float, float* [[D2]], align 4 +// CHECK-32-NEXT: [[MUL:%.*]] = fmul float [[TMP3]], 3.300000e+01 +// CHECK-32-NEXT: store float [[MUL]], float* [[D2]], align 4 +// CHECK-32-NEXT: [[TMP4:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK-32-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 4 +// CHECK-32-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK-32-NEXT: store i8* [[C1]], i8** [[TMP6]], align 4 +// CHECK-32-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP8:%.*]] = bitcast float* [[D2]] to i8* +// CHECK-32-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4 +// CHECK-32-NEXT: [[TMP9:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK-32-NEXT: [[TMP10:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB2]], i32 [[TMP5]], i32 2, i32 8, i8* [[TMP9]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func3, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func4) +// CHECK-32-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 1 +// CHECK-32-NEXT: br i1 [[TMP11]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK-32: .omp.reduction.then: +// CHECK-32-NEXT: [[TMP12:%.*]] = load i8, i8* [[TMP0]], align 1 +// CHECK-32-NEXT: [[CONV4:%.*]] = sext i8 [[TMP12]] to i32 +// CHECK-32-NEXT: [[TMP13:%.*]] = load i8, i8* [[C1]], align 1 +// CHECK-32-NEXT: [[CONV5:%.*]] = sext i8 [[TMP13]] to i32 +// CHECK-32-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]] +// CHECK-32-NEXT: [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8 +// CHECK-32-NEXT: store i8 [[CONV7]], i8* [[TMP0]], align 1 +// CHECK-32-NEXT: [[TMP14:%.*]] = load float, float* [[TMP1]], align 4 +// CHECK-32-NEXT: [[TMP15:%.*]] = load float, float* [[D2]], align 4 +// CHECK-32-NEXT: [[MUL8:%.*]] = fmul float [[TMP14]], [[TMP15]] +// CHECK-32-NEXT: store float [[MUL8]], float* [[TMP1]], align 4 +// CHECK-32-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]]) +// CHECK-32-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK-32: .omp.reduction.done: +// CHECK-32-NEXT: ret void +// +// +// CHECK-32-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func3 +// CHECK-32-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR2]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK-32-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK-32-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK-32-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK-32-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK-32-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i8, align 1 +// CHECK-32-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca float, align 4 +// CHECK-32-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK-32-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK-32-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK-32-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK-32-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK-32-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* +// CHECK-32-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK-32-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK-32-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK-32-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 +// CHECK-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1 +// CHECK-32-NEXT: [[TMP13:%.*]] = load i8, i8* [[TMP10]], align 1 +// CHECK-32-NEXT: [[TMP14:%.*]] = sext i8 [[TMP13]] to i32 +// CHECK-32-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK-32-NEXT: [[TMP15:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK-32-NEXT: [[TMP16:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP14]], i16 [[TMP7]], i16 [[TMP15]]) +// CHECK-32-NEXT: [[TMP17:%.*]] = trunc i32 [[TMP16]] to i8 +// CHECK-32-NEXT: store i8 [[TMP17]], i8* [[DOTOMP_REDUCTION_ELEMENT]], align 1 +// CHECK-32-NEXT: [[TMP18:%.*]] = getelementptr i8, i8* [[TMP10]], i32 1 +// CHECK-32-NEXT: [[TMP19:%.*]] = getelementptr i8, i8* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 +// CHECK-32-NEXT: store i8* [[DOTOMP_REDUCTION_ELEMENT]], i8** [[TMP11]], align 4 +// CHECK-32-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP21:%.*]] = load i8*, i8** [[TMP20]], align 4 +// CHECK-32-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP23:%.*]] = bitcast i8* [[TMP21]] to float* +// CHECK-32-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[TMP23]], i32 1 +// CHECK-32-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to i8* +// CHECK-32-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP23]] to i32* +// CHECK-32-NEXT: [[TMP27:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i32* +// CHECK-32-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP26]], align 4 +// CHECK-32-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK-32-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 +// CHECK-32-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) +// CHECK-32-NEXT: store i32 [[TMP30]], i32* [[TMP27]], align 4 +// CHECK-32-NEXT: [[TMP31:%.*]] = getelementptr i32, i32* [[TMP26]], i32 1 +// CHECK-32-NEXT: [[TMP32:%.*]] = getelementptr i32, i32* [[TMP27]], i32 1 +// CHECK-32-NEXT: [[TMP33:%.*]] = bitcast float* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* +// CHECK-32-NEXT: store i8* [[TMP33]], i8** [[TMP22]], align 4 +// CHECK-32-NEXT: [[TMP34:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK-32-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK-32-NEXT: [[TMP36:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK-32-NEXT: [[TMP37:%.*]] = and i1 [[TMP35]], [[TMP36]] +// CHECK-32-NEXT: [[TMP38:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK-32-NEXT: [[TMP39:%.*]] = and i16 [[TMP6]], 1 +// CHECK-32-NEXT: [[TMP40:%.*]] = icmp eq i16 [[TMP39]], 0 +// CHECK-32-NEXT: [[TMP41:%.*]] = and i1 [[TMP38]], [[TMP40]] +// CHECK-32-NEXT: [[TMP42:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK-32-NEXT: [[TMP43:%.*]] = and i1 [[TMP41]], [[TMP42]] +// CHECK-32-NEXT: [[TMP44:%.*]] = or i1 [[TMP34]], [[TMP37]] +// CHECK-32-NEXT: [[TMP45:%.*]] = or i1 [[TMP44]], [[TMP43]] +// CHECK-32-NEXT: br i1 [[TMP45]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK-32: then: +// CHECK-32-NEXT: [[TMP46:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* +// CHECK-32-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK-32-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP46]], i8* [[TMP47]]) #[[ATTR4]] +// CHECK-32-NEXT: br label [[IFCONT:%.*]] +// CHECK-32: else: +// CHECK-32-NEXT: br label [[IFCONT]] +// CHECK-32: ifcont: +// CHECK-32-NEXT: [[TMP48:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK-32-NEXT: [[TMP49:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK-32-NEXT: [[TMP50:%.*]] = and i1 [[TMP48]], [[TMP49]] +// CHECK-32-NEXT: br i1 [[TMP50]], label [[THEN6:%.*]], label [[ELSE7:%.*]] +// CHECK-32: then6: +// CHECK-32-NEXT: [[TMP51:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP52:%.*]] = load i8*, i8** [[TMP51]], align 4 +// CHECK-32-NEXT: [[TMP53:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP54:%.*]] = load i8*, i8** [[TMP53]], align 4 +// CHECK-32-NEXT: [[TMP55:%.*]] = load i8, i8* [[TMP52]], align 1 +// CHECK-32-NEXT: store i8 [[TMP55]], i8* [[TMP54]], align 1 +// CHECK-32-NEXT: [[TMP56:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP57:%.*]] = load i8*, i8** [[TMP56]], align 4 +// CHECK-32-NEXT: [[TMP58:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP59:%.*]] = load i8*, i8** [[TMP58]], align 4 +// CHECK-32-NEXT: [[TMP60:%.*]] = bitcast i8* [[TMP57]] to float* +// CHECK-32-NEXT: [[TMP61:%.*]] = bitcast i8* [[TMP59]] to float* +// CHECK-32-NEXT: [[TMP62:%.*]] = load float, float* [[TMP60]], align 4 +// CHECK-32-NEXT: store float [[TMP62]], float* [[TMP61]], align 4 +// CHECK-32-NEXT: br label [[IFCONT8:%.*]] +// CHECK-32: else7: +// CHECK-32-NEXT: br label [[IFCONT8]] +// CHECK-32: ifcont8: +// CHECK-32-NEXT: ret void +// +// +// CHECK-32-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func4 +// CHECK-32-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK-32-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK-32-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK-32-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK-32-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK-32-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK-32-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK-32-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK-32-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK-32-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK-32-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK-32-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]]) +// CHECK-32-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK-32-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK-32: then: +// CHECK-32-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 +// CHECK-32-NEXT: [[TMP7:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK-32-NEXT: [[TMP8:%.*]] = bitcast i32 addrspace(3)* [[TMP7]] to i8 addrspace(3)* +// CHECK-32-NEXT: [[TMP9:%.*]] = load i8, i8* [[TMP6]], align 1 +// CHECK-32-NEXT: store volatile i8 [[TMP9]], i8 addrspace(3)* [[TMP8]], align 1 +// CHECK-32-NEXT: br label [[IFCONT:%.*]] +// CHECK-32: else: +// CHECK-32-NEXT: br label [[IFCONT]] +// CHECK-32: ifcont: +// CHECK-32-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]]) +// CHECK-32-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK-32-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] +// CHECK-32-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK-32: then4: +// CHECK-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK-32-NEXT: [[TMP12:%.*]] = bitcast i32 addrspace(3)* [[TMP11]] to i8 addrspace(3)* +// CHECK-32-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP14:%.*]] = load i8*, i8** [[TMP13]], align 4 +// CHECK-32-NEXT: [[TMP15:%.*]] = load volatile i8, i8 addrspace(3)* [[TMP12]], align 1 +// CHECK-32-NEXT: store i8 [[TMP15]], i8* [[TMP14]], align 1 +// CHECK-32-NEXT: br label [[IFCONT6:%.*]] +// CHECK-32: else5: +// CHECK-32-NEXT: br label [[IFCONT6]] +// CHECK-32: ifcont6: +// CHECK-32-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]]) +// CHECK-32-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK-32-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] +// CHECK-32: then8: +// CHECK-32-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 +// CHECK-32-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i32* +// CHECK-32-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK-32-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP18]], align 4 +// CHECK-32-NEXT: store volatile i32 [[TMP20]], i32 addrspace(3)* [[TMP19]], align 4 +// CHECK-32-NEXT: br label [[IFCONT10:%.*]] +// CHECK-32: else9: +// CHECK-32-NEXT: br label [[IFCONT10]] +// CHECK-32: ifcont10: +// CHECK-32-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]]) +// CHECK-32-NEXT: [[TMP21:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK-32-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP21]] +// CHECK-32-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] +// CHECK-32: then12: +// CHECK-32-NEXT: [[TMP22:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK-32-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP24:%.*]] = load i8*, i8** [[TMP23]], align 4 +// CHECK-32-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP24]] to i32* +// CHECK-32-NEXT: [[TMP26:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP22]], align 4 +// CHECK-32-NEXT: store i32 [[TMP26]], i32* [[TMP25]], align 4 +// CHECK-32-NEXT: br label [[IFCONT14:%.*]] +// CHECK-32: else13: +// CHECK-32-NEXT: br label [[IFCONT14]] +// CHECK-32: ifcont14: +// CHECK-32-NEXT: ret void +// +// +// CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIcET_i_l38 +// CHECK-32-SAME: (i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR0]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK-32-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4 +// CHECK-32-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4 +// CHECK-32-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK-32-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 true) +// CHECK-32-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP2]], -1 +// CHECK-32-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +// CHECK-32: user_code.entry: +// CHECK-32-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK-32-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP0]] to i8* +// CHECK-32-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4 +// CHECK-32-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP7:%.*]] = bitcast i16* [[TMP1]] to i8* +// CHECK-32-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 4 +// CHECK-32-NEXT: [[TMP8:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** +// CHECK-32-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP3]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i16*)* @__omp_outlined__5 to i8*), i8* null, i8** [[TMP8]], i32 2) +// CHECK-32-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) +// CHECK-32-NEXT: ret void +// CHECK-32: worker.exit: +// CHECK-32-NEXT: ret void +// +// +// CHECK-32-LABEL: define {{[^@]+}}@__omp_outlined__5 +// CHECK-32-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]], i16* nonnull align 2 dereferenceable(2) [[B:%.*]]) #[[ATTR1]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK-32-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 4 +// CHECK-32-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 4 +// CHECK-32-NEXT: [[B_ADDR:%.*]] = alloca i16*, align 4 +// CHECK-32-NEXT: [[A1:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[B2:%.*]] = alloca i16, align 2 +// CHECK-32-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK-32-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK-32-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 4 +// CHECK-32-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 +// CHECK-32-NEXT: store i16* [[B]], i16** [[B_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 +// CHECK-32-NEXT: [[TMP1:%.*]] = load i16*, i16** [[B_ADDR]], align 4 +// CHECK-32-NEXT: store i32 0, i32* [[A1]], align 4 +// CHECK-32-NEXT: store i16 -32768, i16* [[B2]], align 2 +// CHECK-32-NEXT: [[TMP2:%.*]] = load i32, i32* [[A1]], align 4 +// CHECK-32-NEXT: [[OR:%.*]] = or i32 [[TMP2]], 1 +// CHECK-32-NEXT: store i32 [[OR]], i32* [[A1]], align 4 +// CHECK-32-NEXT: [[TMP3:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK-32-NEXT: [[CONV:%.*]] = sext i16 [[TMP3]] to i32 +// CHECK-32-NEXT: [[CMP:%.*]] = icmp sgt i32 99, [[CONV]] +// CHECK-32-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK-32: cond.true: +// CHECK-32-NEXT: br label [[COND_END:%.*]] +// CHECK-32: cond.false: +// CHECK-32-NEXT: [[TMP4:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK-32-NEXT: [[CONV3:%.*]] = sext i16 [[TMP4]] to i32 +// CHECK-32-NEXT: br label [[COND_END]] +// CHECK-32: cond.end: +// CHECK-32-NEXT: [[COND:%.*]] = phi i32 [ 99, [[COND_TRUE]] ], [ [[CONV3]], [[COND_FALSE]] ] +// CHECK-32-NEXT: [[CONV4:%.*]] = trunc i32 [[COND]] to i16 +// CHECK-32-NEXT: store i16 [[CONV4]], i16* [[B2]], align 2 +// CHECK-32-NEXT: [[TMP5:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 4 +// CHECK-32-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4 +// CHECK-32-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP8:%.*]] = bitcast i32* [[A1]] to i8* +// CHECK-32-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4 +// CHECK-32-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP10:%.*]] = bitcast i16* [[B2]] to i8* +// CHECK-32-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 4 +// CHECK-32-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* +// CHECK-32-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(%struct.ident_t* @[[GLOB2]], i32 [[TMP6]], i32 2, i32 8, i8* [[TMP11]], void (i8*, i16, i16, i16)* @_omp_reduction_shuffle_and_reduce_func7, void (i8*, i32)* @_omp_reduction_inter_warp_copy_func8) +// CHECK-32-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP12]], 1 +// CHECK-32-NEXT: br i1 [[TMP13]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]] +// CHECK-32: .omp.reduction.then: +// CHECK-32-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP0]], align 4 +// CHECK-32-NEXT: [[TMP15:%.*]] = load i32, i32* [[A1]], align 4 +// CHECK-32-NEXT: [[OR5:%.*]] = or i32 [[TMP14]], [[TMP15]] +// CHECK-32-NEXT: store i32 [[OR5]], i32* [[TMP0]], align 4 +// CHECK-32-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK-32-NEXT: [[CONV6:%.*]] = sext i16 [[TMP16]] to i32 +// CHECK-32-NEXT: [[TMP17:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK-32-NEXT: [[CONV7:%.*]] = sext i16 [[TMP17]] to i32 +// CHECK-32-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[CONV6]], [[CONV7]] +// CHECK-32-NEXT: br i1 [[CMP8]], label [[COND_TRUE9:%.*]], label [[COND_FALSE10:%.*]] +// CHECK-32: cond.true9: +// CHECK-32-NEXT: [[TMP18:%.*]] = load i16, i16* [[TMP1]], align 2 +// CHECK-32-NEXT: br label [[COND_END11:%.*]] +// CHECK-32: cond.false10: +// CHECK-32-NEXT: [[TMP19:%.*]] = load i16, i16* [[B2]], align 2 +// CHECK-32-NEXT: br label [[COND_END11]] +// CHECK-32: cond.end11: +// CHECK-32-NEXT: [[COND12:%.*]] = phi i16 [ [[TMP18]], [[COND_TRUE9]] ], [ [[TMP19]], [[COND_FALSE10]] ] +// CHECK-32-NEXT: store i16 [[COND12]], i16* [[TMP1]], align 2 +// CHECK-32-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP6]]) +// CHECK-32-NEXT: br label [[DOTOMP_REDUCTION_DONE]] +// CHECK-32: .omp.reduction.done: +// CHECK-32-NEXT: ret void +// +// +// CHECK-32-LABEL: define {{[^@]+}}@_omp_reduction_shuffle_and_reduce_func7 +// CHECK-32-SAME: (i8* [[TMP0:%.*]], i16 signext [[TMP1:%.*]], i16 signext [[TMP2:%.*]], i16 signext [[TMP3:%.*]]) #[[ATTR2]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK-32-NEXT: [[DOTADDR1:%.*]] = alloca i16, align 2 +// CHECK-32-NEXT: [[DOTADDR2:%.*]] = alloca i16, align 2 +// CHECK-32-NEXT: [[DOTADDR3:%.*]] = alloca i16, align 2 +// CHECK-32-NEXT: [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST:%.*]] = alloca [2 x i8*], align 4 +// CHECK-32-NEXT: [[DOTOMP_REDUCTION_ELEMENT:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[DOTOMP_REDUCTION_ELEMENT4:%.*]] = alloca i16, align 2 +// CHECK-32-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK-32-NEXT: store i16 [[TMP1]], i16* [[DOTADDR1]], align 2 +// CHECK-32-NEXT: store i16 [[TMP2]], i16* [[DOTADDR2]], align 2 +// CHECK-32-NEXT: store i16 [[TMP3]], i16* [[DOTADDR3]], align 2 +// CHECK-32-NEXT: [[TMP4:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK-32-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to [2 x i8*]* +// CHECK-32-NEXT: [[TMP6:%.*]] = load i16, i16* [[DOTADDR1]], align 2 +// CHECK-32-NEXT: [[TMP7:%.*]] = load i16, i16* [[DOTADDR2]], align 2 +// CHECK-32-NEXT: [[TMP8:%.*]] = load i16, i16* [[DOTADDR3]], align 2 +// CHECK-32-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP10:%.*]] = load i8*, i8** [[TMP9]], align 4 +// CHECK-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP10]] to i32* +// CHECK-32-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 +// CHECK-32-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to i8* +// CHECK-32-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP12]], align 4 +// CHECK-32-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK-32-NEXT: [[TMP16:%.*]] = trunc i32 [[NVPTX_WARP_SIZE]] to i16 +// CHECK-32-NEXT: [[TMP17:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP15]], i16 [[TMP7]], i16 [[TMP16]]) +// CHECK-32-NEXT: store i32 [[TMP17]], i32* [[DOTOMP_REDUCTION_ELEMENT]], align 4 +// CHECK-32-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i32 1 +// CHECK-32-NEXT: [[TMP19:%.*]] = getelementptr i32, i32* [[DOTOMP_REDUCTION_ELEMENT]], i32 1 +// CHECK-32-NEXT: [[TMP20:%.*]] = bitcast i32* [[DOTOMP_REDUCTION_ELEMENT]] to i8* +// CHECK-32-NEXT: store i8* [[TMP20]], i8** [[TMP11]], align 4 +// CHECK-32-NEXT: [[TMP21:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP22:%.*]] = load i8*, i8** [[TMP21]], align 4 +// CHECK-32-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP22]] to i16* +// CHECK-32-NEXT: [[TMP25:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 +// CHECK-32-NEXT: [[TMP26:%.*]] = bitcast i16* [[TMP25]] to i8* +// CHECK-32-NEXT: [[TMP27:%.*]] = load i16, i16* [[TMP24]], align 2 +// CHECK-32-NEXT: [[TMP28:%.*]] = sext i16 [[TMP27]] to i32 +// CHECK-32-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() +// CHECK-32-NEXT: [[TMP29:%.*]] = trunc i32 [[NVPTX_WARP_SIZE5]] to i16 +// CHECK-32-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_shuffle_int32(i32 [[TMP28]], i16 [[TMP7]], i16 [[TMP29]]) +// CHECK-32-NEXT: [[TMP31:%.*]] = trunc i32 [[TMP30]] to i16 +// CHECK-32-NEXT: store i16 [[TMP31]], i16* [[DOTOMP_REDUCTION_ELEMENT4]], align 2 +// CHECK-32-NEXT: [[TMP32:%.*]] = getelementptr i16, i16* [[TMP24]], i32 1 +// CHECK-32-NEXT: [[TMP33:%.*]] = getelementptr i16, i16* [[DOTOMP_REDUCTION_ELEMENT4]], i32 1 +// CHECK-32-NEXT: [[TMP34:%.*]] = bitcast i16* [[DOTOMP_REDUCTION_ELEMENT4]] to i8* +// CHECK-32-NEXT: store i8* [[TMP34]], i8** [[TMP23]], align 4 +// CHECK-32-NEXT: [[TMP35:%.*]] = icmp eq i16 [[TMP8]], 0 +// CHECK-32-NEXT: [[TMP36:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK-32-NEXT: [[TMP37:%.*]] = icmp ult i16 [[TMP6]], [[TMP7]] +// CHECK-32-NEXT: [[TMP38:%.*]] = and i1 [[TMP36]], [[TMP37]] +// CHECK-32-NEXT: [[TMP39:%.*]] = icmp eq i16 [[TMP8]], 2 +// CHECK-32-NEXT: [[TMP40:%.*]] = and i16 [[TMP6]], 1 +// CHECK-32-NEXT: [[TMP41:%.*]] = icmp eq i16 [[TMP40]], 0 +// CHECK-32-NEXT: [[TMP42:%.*]] = and i1 [[TMP39]], [[TMP41]] +// CHECK-32-NEXT: [[TMP43:%.*]] = icmp sgt i16 [[TMP7]], 0 +// CHECK-32-NEXT: [[TMP44:%.*]] = and i1 [[TMP42]], [[TMP43]] +// CHECK-32-NEXT: [[TMP45:%.*]] = or i1 [[TMP35]], [[TMP38]] +// CHECK-32-NEXT: [[TMP46:%.*]] = or i1 [[TMP45]], [[TMP44]] +// CHECK-32-NEXT: br i1 [[TMP46]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK-32: then: +// CHECK-32-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* +// CHECK-32-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* +// CHECK-32-NEXT: call void @"_omp$reduction$reduction_func6"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR4]] +// CHECK-32-NEXT: br label [[IFCONT:%.*]] +// CHECK-32: else: +// CHECK-32-NEXT: br label [[IFCONT]] +// CHECK-32: ifcont: +// CHECK-32-NEXT: [[TMP49:%.*]] = icmp eq i16 [[TMP8]], 1 +// CHECK-32-NEXT: [[TMP50:%.*]] = icmp uge i16 [[TMP6]], [[TMP7]] +// CHECK-32-NEXT: [[TMP51:%.*]] = and i1 [[TMP49]], [[TMP50]] +// CHECK-32-NEXT: br i1 [[TMP51]], label [[THEN6:%.*]], label [[ELSE7:%.*]] +// CHECK-32: then6: +// CHECK-32-NEXT: [[TMP52:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP53:%.*]] = load i8*, i8** [[TMP52]], align 4 +// CHECK-32-NEXT: [[TMP54:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP55:%.*]] = load i8*, i8** [[TMP54]], align 4 +// CHECK-32-NEXT: [[TMP56:%.*]] = bitcast i8* [[TMP53]] to i32* +// CHECK-32-NEXT: [[TMP57:%.*]] = bitcast i8* [[TMP55]] to i32* +// CHECK-32-NEXT: [[TMP58:%.*]] = load i32, i32* [[TMP56]], align 4 +// CHECK-32-NEXT: store i32 [[TMP58]], i32* [[TMP57]], align 4 +// CHECK-32-NEXT: [[TMP59:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP60:%.*]] = load i8*, i8** [[TMP59]], align 4 +// CHECK-32-NEXT: [[TMP61:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP5]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP62:%.*]] = load i8*, i8** [[TMP61]], align 4 +// CHECK-32-NEXT: [[TMP63:%.*]] = bitcast i8* [[TMP60]] to i16* +// CHECK-32-NEXT: [[TMP64:%.*]] = bitcast i8* [[TMP62]] to i16* +// CHECK-32-NEXT: [[TMP65:%.*]] = load i16, i16* [[TMP63]], align 2 +// CHECK-32-NEXT: store i16 [[TMP65]], i16* [[TMP64]], align 2 +// CHECK-32-NEXT: br label [[IFCONT8:%.*]] +// CHECK-32: else7: +// CHECK-32-NEXT: br label [[IFCONT8]] +// CHECK-32: ifcont8: +// CHECK-32-NEXT: ret void +// +// +// CHECK-32-LABEL: define {{[^@]+}}@_omp_reduction_inter_warp_copy_func8 +// CHECK-32-SAME: (i8* [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { +// CHECK-32-NEXT: entry: +// CHECK-32-NEXT: [[DOTADDR:%.*]] = alloca i8*, align 4 +// CHECK-32-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK-32-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) +// CHECK-32-NEXT: store i8* [[TMP0]], i8** [[DOTADDR]], align 4 +// CHECK-32-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK-32-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK-32-NEXT: [[NVPTX_TID2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK-32-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID2]], 31 +// CHECK-32-NEXT: [[NVPTX_TID3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK-32-NEXT: [[NVPTX_WARP_ID:%.*]] = ashr i32 [[NVPTX_TID3]], 5 +// CHECK-32-NEXT: [[TMP3:%.*]] = load i8*, i8** [[DOTADDR]], align 4 +// CHECK-32-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to [2 x i8*]* +// CHECK-32-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]]) +// CHECK-32-NEXT: [[WARP_MASTER:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK-32-NEXT: br i1 [[WARP_MASTER]], label [[THEN:%.*]], label [[ELSE:%.*]] +// CHECK-32: then: +// CHECK-32-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP6:%.*]] = load i8*, i8** [[TMP5]], align 4 +// CHECK-32-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to i32* +// CHECK-32-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK-32-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP7]], align 4 +// CHECK-32-NEXT: store volatile i32 [[TMP9]], i32 addrspace(3)* [[TMP8]], align 4 +// CHECK-32-NEXT: br label [[IFCONT:%.*]] +// CHECK-32: else: +// CHECK-32-NEXT: br label [[IFCONT]] +// CHECK-32: ifcont: +// CHECK-32-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]]) +// CHECK-32-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK-32-NEXT: [[IS_ACTIVE_THREAD:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP10]] +// CHECK-32-NEXT: br i1 [[IS_ACTIVE_THREAD]], label [[THEN4:%.*]], label [[ELSE5:%.*]] +// CHECK-32: then4: +// CHECK-32-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK-32-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 0 +// CHECK-32-NEXT: [[TMP13:%.*]] = load i8*, i8** [[TMP12]], align 4 +// CHECK-32-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to i32* +// CHECK-32-NEXT: [[TMP15:%.*]] = load volatile i32, i32 addrspace(3)* [[TMP11]], align 4 +// CHECK-32-NEXT: store i32 [[TMP15]], i32* [[TMP14]], align 4 +// CHECK-32-NEXT: br label [[IFCONT6:%.*]] +// CHECK-32: else5: +// CHECK-32-NEXT: br label [[IFCONT6]] +// CHECK-32: ifcont6: +// CHECK-32-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]]) +// CHECK-32-NEXT: [[WARP_MASTER7:%.*]] = icmp eq i32 [[NVPTX_LANE_ID]], 0 +// CHECK-32-NEXT: br i1 [[WARP_MASTER7]], label [[THEN8:%.*]], label [[ELSE9:%.*]] +// CHECK-32: then8: +// CHECK-32-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP17:%.*]] = load i8*, i8** [[TMP16]], align 4 +// CHECK-32-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP17]] to i16* +// CHECK-32-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_WARP_ID]] +// CHECK-32-NEXT: [[TMP20:%.*]] = bitcast i32 addrspace(3)* [[TMP19]] to i16 addrspace(3)* +// CHECK-32-NEXT: [[TMP21:%.*]] = load i16, i16* [[TMP18]], align 2 +// CHECK-32-NEXT: store volatile i16 [[TMP21]], i16 addrspace(3)* [[TMP20]], align 2 +// CHECK-32-NEXT: br label [[IFCONT10:%.*]] +// CHECK-32: else9: +// CHECK-32-NEXT: br label [[IFCONT10]] +// CHECK-32: ifcont10: +// CHECK-32-NEXT: call void @__kmpc_barrier(%struct.ident_t* @[[GLOB3]], i32 [[TMP2]]) +// CHECK-32-NEXT: [[TMP22:%.*]] = load i32, i32* [[DOTADDR1]], align 4 +// CHECK-32-NEXT: [[IS_ACTIVE_THREAD11:%.*]] = icmp ult i32 [[NVPTX_TID]], [[TMP22]] +// CHECK-32-NEXT: br i1 [[IS_ACTIVE_THREAD11]], label [[THEN12:%.*]], label [[ELSE13:%.*]] +// CHECK-32: then12: +// CHECK-32-NEXT: [[TMP23:%.*]] = getelementptr inbounds [32 x i32], [32 x i32] addrspace(3)* @__openmp_nvptx_data_transfer_temporary_storage, i64 0, i32 [[NVPTX_TID]] +// CHECK-32-NEXT: [[TMP24:%.*]] = bitcast i32 addrspace(3)* [[TMP23]] to i16 addrspace(3)* +// CHECK-32-NEXT: [[TMP25:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[TMP4]], i32 0, i32 1 +// CHECK-32-NEXT: [[TMP26:%.*]] = load i8*, i8** [[TMP25]], align 4 +// CHECK-32-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to i16* +// CHECK-32-NEXT: [[TMP28:%.*]] = load volatile i16, i16 addrspace(3)* [[TMP24]], align 2 +// CHECK-32-NEXT: store i16 [[TMP28]], i16* [[TMP27]], align 2 +// CHECK-32-NEXT: br label [[IFCONT14:%.*]] +// CHECK-32: else13: +// CHECK-32-NEXT: br label [[IFCONT14]] +// CHECK-32: ifcont14: +// CHECK-32-NEXT: ret void +// diff --git a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp --- a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp +++ b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen_tbaa_PR46146.cpp @@ -36,14 +36,14 @@ // CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 // CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 false, i1 true, i1 true) +// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 1, i1 true, i1 true) // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK1: user_code.entry: // CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) // CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA8:![0-9]+]] // CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR1:[0-9]+]] -// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK1-NEXT: ret void // CHECK1: worker.exit: // CHECK1-NEXT: ret void @@ -603,14 +603,14 @@ // CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 // CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK1: user_code.entry: // CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) // CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA8]] // CHECK1-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR1]] -// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK1-NEXT: ret void // CHECK1: worker.exit: // CHECK1-NEXT: ret void @@ -1270,14 +1270,14 @@ // CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 // CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 false, i1 true, i1 true) +// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 1, i1 true, i1 true) // CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK2: user_code.entry: // CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) // CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA8:![0-9]+]] // CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR1:[0-9]+]] -// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK2-NEXT: ret void // CHECK2: worker.exit: // CHECK2-NEXT: ret void @@ -1837,14 +1837,14 @@ // CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 // CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) // CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK2: user_code.entry: // CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) // CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA8]] // CHECK2-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR1]] -// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK2-NEXT: ret void // CHECK2: worker.exit: // CHECK2-NEXT: ret void @@ -2504,14 +2504,14 @@ // CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 // CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 // CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 false, i1 true, i1 true) +// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 1, i1 true, i1 true) // CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK3: user_code.entry: // CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) // CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA8:![0-9]+]] // CHECK3-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR1:[0-9]+]] -// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK3-NEXT: ret void // CHECK3: worker.exit: // CHECK3-NEXT: ret void @@ -3071,14 +3071,14 @@ // CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 // CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 // CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) // CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK3: user_code.entry: // CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) // CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4, !tbaa [[TBAA8]] // CHECK3-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR1]] -// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK3-NEXT: ret void // CHECK3: worker.exit: // CHECK3-NEXT: ret void diff --git a/clang/test/OpenMP/nvptx_target_printf_codegen.c b/clang/test/OpenMP/nvptx_target_printf_codegen.c --- a/clang/test/OpenMP/nvptx_target_printf_codegen.c +++ b/clang/test/OpenMP/nvptx_target_printf_codegen.c @@ -48,7 +48,7 @@ // CHECK-64-NEXT: entry: // CHECK-64-NEXT: [[FMT:%.*]] = alloca i8*, align 8 // CHECK-64-NEXT: [[TMP:%.*]] = alloca [[PRINTF_ARGS:%.*]], align 8 -// CHECK-64-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 false, i1 true, i1 true) +// CHECK-64-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 1, i1 true, i1 true) // CHECK-64-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK-64-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK-64: user_code.entry: @@ -62,7 +62,7 @@ // CHECK-64-NEXT: store double 3.000000e+00, double* [[TMP4]], align 8 // CHECK-64-NEXT: [[TMP5:%.*]] = bitcast %printf_args* [[TMP]] to i8* // CHECK-64-NEXT: [[TMP6:%.*]] = call i32 @vprintf(i8* [[TMP1]], i8* [[TMP5]]) -// CHECK-64-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK-64-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK-64-NEXT: ret void // CHECK-64: worker.exit: // CHECK-64-NEXT: ret void @@ -71,12 +71,12 @@ // CHECK-64-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_CheckNoArgs_l25 // CHECK-64-SAME: () #[[ATTR0]] { // CHECK-64-NEXT: entry: -// CHECK-64-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +// CHECK-64-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) // CHECK-64-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK-64-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK-64: user_code.entry: // CHECK-64-NEXT: [[TMP1:%.*]] = call i32 @vprintf(i8* getelementptr inbounds ([14 x i8], [14 x i8]* @.str1, i64 0, i64 0), i8* null) -// CHECK-64-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK-64-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK-64-NEXT: ret void // CHECK-64: worker.exit: // CHECK-64-NEXT: ret void @@ -89,7 +89,7 @@ // CHECK-64-NEXT: [[TMP:%.*]] = alloca [[PRINTF_ARGS_0:%.*]], align 8 // CHECK-64-NEXT: store i64 [[FOO]], i64* [[FOO_ADDR]], align 8 // CHECK-64-NEXT: [[CONV:%.*]] = bitcast i64* [[FOO_ADDR]] to i32* -// CHECK-64-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +// CHECK-64-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) // CHECK-64-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK-64-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK-64: user_code.entry: @@ -105,7 +105,7 @@ // CHECK-64: worker.exit: // CHECK-64-NEXT: ret void // CHECK-64: if.end: -// CHECK-64-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK-64-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK-64-NEXT: ret void // // @@ -117,7 +117,7 @@ // CHECK-32-NEXT: entry: // CHECK-32-NEXT: [[FMT:%.*]] = alloca i8*, align 4 // CHECK-32-NEXT: [[TMP:%.*]] = alloca [[PRINTF_ARGS:%.*]], align 8 -// CHECK-32-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 false, i1 true, i1 true) +// CHECK-32-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 1, i1 true, i1 true) // CHECK-32-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK-32-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK-32: user_code.entry: @@ -131,7 +131,7 @@ // CHECK-32-NEXT: store double 3.000000e+00, double* [[TMP4]], align 8 // CHECK-32-NEXT: [[TMP5:%.*]] = bitcast %printf_args* [[TMP]] to i8* // CHECK-32-NEXT: [[TMP6:%.*]] = call i32 @vprintf(i8* [[TMP1]], i8* [[TMP5]]) -// CHECK-32-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK-32-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK-32-NEXT: ret void // CHECK-32: worker.exit: // CHECK-32-NEXT: ret void @@ -140,12 +140,12 @@ // CHECK-32-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_CheckNoArgs_l25 // CHECK-32-SAME: () #[[ATTR0]] { // CHECK-32-NEXT: entry: -// CHECK-32-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +// CHECK-32-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) // CHECK-32-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK-32-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK-32: user_code.entry: // CHECK-32-NEXT: [[TMP1:%.*]] = call i32 @vprintf(i8* getelementptr inbounds ([14 x i8], [14 x i8]* @.str1, i32 0, i32 0), i8* null) -// CHECK-32-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK-32-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK-32-NEXT: ret void // CHECK-32: worker.exit: // CHECK-32-NEXT: ret void @@ -157,7 +157,7 @@ // CHECK-32-NEXT: [[FOO_ADDR:%.*]] = alloca i32, align 4 // CHECK-32-NEXT: [[TMP:%.*]] = alloca [[PRINTF_ARGS_0:%.*]], align 8 // CHECK-32-NEXT: store i32 [[FOO]], i32* [[FOO_ADDR]], align 4 -// CHECK-32-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +// CHECK-32-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) // CHECK-32-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK-32-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK-32: user_code.entry: @@ -173,6 +173,6 @@ // CHECK-32: worker.exit: // CHECK-32-NEXT: ret void // CHECK-32: if.end: -// CHECK-32-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK-32-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK-32-NEXT: ret void // diff --git a/clang/test/OpenMP/nvptx_target_simd_codegen.cpp b/clang/test/OpenMP/nvptx_target_simd_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_simd_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_simd_codegen.cpp @@ -61,32 +61,32 @@ } // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+l32}}( -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 false) // CHECK-NOT: call void @__kmpc_for_static_init // CHECK-NOT: call void @__kmpc_for_static_fini -// CHECK: call void @__kmpc_target_deinit({{.*}}, i1 true, i1 false) +// CHECK: call void @__kmpc_target_deinit({{.*}}, i8 2, i1 false) // CHECK: ret void // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+l37}}( -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 false) // CHECK-NOT: call void @__kmpc_for_static_init // CHECK-NOT: call void @__kmpc_for_static_fini -// CHECK: call void @__kmpc_target_deinit({{.*}}, i1 true, i1 false) +// CHECK: call void @__kmpc_target_deinit({{.*}}, i8 2, i1 false) // CHECK: ret void // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+l42}}( -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 false) // CHECK-NOT: call void @__kmpc_for_static_init // CHECK-NOT: call void @__kmpc_for_static_fini -// CHECK: call void @__kmpc_target_deinit({{.*}}, i1 true, i1 false) +// CHECK: call void @__kmpc_target_deinit({{.*}}, i8 2, i1 false) // CHECK: ret void // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+l47}}( -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 false) // CHECK-NOT: call void @__kmpc_for_static_init // CHECK-NOT: call void @__kmpc_for_static_fini // CHECK-NOT: call void @__kmpc_nvptx_end_reduce_nowait( -// CHECK: call void @__kmpc_target_deinit({{.*}}, i1 true, i1 false) +// CHECK: call void @__kmpc_target_deinit({{.*}}, i8 2, i1 false) // CHECK: ret void #endif diff --git a/clang/test/OpenMP/nvptx_target_teams_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_teams_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_teams_codegen.cpp @@ -59,7 +59,7 @@ // CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 // CHECK1-NEXT: store i64 [[A]], i64* [[A_ADDR]], align 8 // CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i8* -// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 false, i1 true, i1 true) +// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 1, i1 true, i1 true) // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK1: user_code.entry: @@ -70,7 +70,7 @@ // CHECK1-NEXT: [[TMP3:%.*]] = load i64, i64* [[A_CASTED]], align 8 // CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 // CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]]) #[[ATTR1:[0-9]+]] -// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK1-NEXT: ret void // CHECK1: worker.exit: // CHECK1-NEXT: ret void @@ -100,7 +100,7 @@ // CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 // CHECK1-NEXT: store i64 [[AA]], i64* [[AA_ADDR]], align 8 // CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[AA_ADDR]] to i16* -// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK1: user_code.entry: @@ -111,7 +111,7 @@ // CHECK1-NEXT: [[TMP3:%.*]] = load i64, i64* [[AA_CASTED]], align 8 // CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 // CHECK1-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]]) #[[ATTR1]] -// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK1-NEXT: ret void // CHECK1: worker.exit: // CHECK1-NEXT: ret void @@ -141,7 +141,7 @@ // CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 // CHECK1-NEXT: store i64 [[AA]], i64* [[AA_ADDR]], align 8 // CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[AA_ADDR]] to i16* -// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 true) +// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 true) // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK1: user_code.entry: @@ -152,7 +152,7 @@ // CHECK1-NEXT: [[TMP3:%.*]] = load i64, i64* [[AA_CASTED]], align 8 // CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 // CHECK1-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP3]]) #[[ATTR1]] -// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true) +// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) // CHECK1-NEXT: ret void // CHECK1: worker.exit: // CHECK1-NEXT: ret void @@ -224,7 +224,7 @@ // CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 // CHECK2-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 // CHECK2-NEXT: [[CONV:%.*]] = bitcast i32* [[A_ADDR]] to i8* -// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 false, i1 true, i1 true) +// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 1, i1 true, i1 true) // CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK2: user_code.entry: @@ -235,7 +235,7 @@ // CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[A_CASTED]], align 4 // CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 // CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]]) #[[ATTR1:[0-9]+]] -// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK2-NEXT: ret void // CHECK2: worker.exit: // CHECK2-NEXT: ret void @@ -265,7 +265,7 @@ // CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 // CHECK2-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4 // CHECK2-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16* -// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) // CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK2: user_code.entry: @@ -276,7 +276,7 @@ // CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[AA_CASTED]], align 4 // CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 // CHECK2-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]]) #[[ATTR1]] -// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK2-NEXT: ret void // CHECK2: worker.exit: // CHECK2-NEXT: ret void @@ -306,7 +306,7 @@ // CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 // CHECK2-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4 // CHECK2-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16* -// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 true) +// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 true) // CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK2: user_code.entry: @@ -317,7 +317,7 @@ // CHECK2-NEXT: [[TMP3:%.*]] = load i32, i32* [[AA_CASTED]], align 4 // CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 // CHECK2-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]]) #[[ATTR1]] -// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true) +// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) // CHECK2-NEXT: ret void // CHECK2: worker.exit: // CHECK2-NEXT: ret void @@ -389,7 +389,7 @@ // CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 // CHECK3-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 // CHECK3-NEXT: [[CONV:%.*]] = bitcast i32* [[A_ADDR]] to i8* -// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 false, i1 true, i1 true) +// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 1, i1 true, i1 true) // CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK3: user_code.entry: @@ -400,7 +400,7 @@ // CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[A_CASTED]], align 4 // CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 // CHECK3-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]]) #[[ATTR1:[0-9]+]] -// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK3-NEXT: ret void // CHECK3: worker.exit: // CHECK3-NEXT: ret void @@ -430,7 +430,7 @@ // CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 // CHECK3-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4 // CHECK3-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16* -// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) // CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK3: user_code.entry: @@ -441,7 +441,7 @@ // CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[AA_CASTED]], align 4 // CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 // CHECK3-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]]) #[[ATTR1]] -// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK3-NEXT: ret void // CHECK3: worker.exit: // CHECK3-NEXT: ret void @@ -471,7 +471,7 @@ // CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 // CHECK3-NEXT: store i32 [[AA]], i32* [[AA_ADDR]], align 4 // CHECK3-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16* -// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 true) +// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 true) // CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK3: user_code.entry: @@ -482,7 +482,7 @@ // CHECK3-NEXT: [[TMP3:%.*]] = load i32, i32* [[AA_CASTED]], align 4 // CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 // CHECK3-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP3]]) #[[ATTR1]] -// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true) +// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) // CHECK3-NEXT: ret void // CHECK3: worker.exit: // CHECK3-NEXT: ret void diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_teams_distribute_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_teams_distribute_codegen.cpp @@ -580,14 +580,14 @@ // CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 // CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 // CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 false, i1 true, i1 true) +// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 1, i1 true, i1 true) // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK1: user_code.entry: // CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) // CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR1:[0-9]+]] -// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR4:[0-9]+]] +// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK1-NEXT: ret void // CHECK1: worker.exit: // CHECK1-NEXT: ret void @@ -692,7 +692,7 @@ // CHECK1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0 // CHECK1-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** // CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8 -// CHECK1-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR1]] +// CHECK1-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR4]] // CHECK1-NEXT: ret void // // @@ -702,14 +702,14 @@ // CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 // CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 false, i1 true, i1 true) +// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 1, i1 true, i1 true) // CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK2: user_code.entry: // CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) // CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR1:[0-9]+]] -// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR4:[0-9]+]] +// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK2-NEXT: ret void // CHECK2: worker.exit: // CHECK2-NEXT: ret void @@ -814,7 +814,7 @@ // CHECK2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0 // CHECK2-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** // CHECK2-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4 -// CHECK2-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR1]] +// CHECK2-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR4]] // CHECK2-NEXT: ret void // // @@ -824,14 +824,14 @@ // CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 // CHECK3-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 // CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 false, i1 true, i1 true) +// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 1, i1 true, i1 true) // CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK3: user_code.entry: // CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) // CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK3-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR1:[0-9]+]] -// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK3-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]]) #[[ATTR4:[0-9]+]] +// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK3-NEXT: ret void // CHECK3: worker.exit: // CHECK3-NEXT: ret void @@ -936,6 +936,6 @@ // CHECK3-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i32 0 // CHECK3-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32** // CHECK3-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 4 -// CHECK3-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR1]] +// CHECK3-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR4]] // CHECK3-NEXT: ret void // diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_codegen.cpp @@ -18484,7 +18484,7 @@ // CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* // CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 // CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* -// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 true, i1 false, i1 false) +// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 2, i1 false, i1 false) // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK1: user_code.entry: @@ -18498,8 +18498,8 @@ // CHECK1-NEXT: store i32 [[TMP5]], i32* [[CONV3]], align 4 // CHECK1-NEXT: [[TMP6:%.*]] = load i64, i64* [[L_CASTED]], align 8 // CHECK1-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP4]], [1000 x i32]* [[TMP0]], i64 [[TMP6]]) #[[ATTR1:[0-9]+]] -// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) +// CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP4]], [1000 x i32]* [[TMP0]], i64 [[TMP6]]) #[[ATTR2:[0-9]+]] +// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) // CHECK1-NEXT: ret void // CHECK1: worker.exit: // CHECK1-NEXT: ret void @@ -18795,7 +18795,7 @@ // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l34 -// CHECK1-SAME: (i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK1-SAME: (i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR4:[0-9]+]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 // CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 @@ -18807,7 +18807,7 @@ // CHECK1-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 // CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* // CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 false) +// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 false) // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK1: user_code.entry: @@ -18817,8 +18817,8 @@ // CHECK1-NEXT: store i32 [[TMP3]], i32* [[CONV1]], align 4 // CHECK1-NEXT: [[TMP4:%.*]] = load i64, i64* [[N_CASTED]], align 8 // CHECK1-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK1-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP4]], [1000 x i16]* [[TMP0]]) #[[ATTR1]] -// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) +// CHECK1-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP4]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] +// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) // CHECK1-NEXT: ret void // CHECK1: worker.exit: // CHECK1-NEXT: ret void @@ -19062,14 +19062,14 @@ // CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 // CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 // CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 false) +// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 false) // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK1: user_code.entry: // CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB4]]) // CHECK1-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK1-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR1]] -// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) +// CHECK1-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] +// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) // CHECK1-NEXT: ret void // CHECK1: worker.exit: // CHECK1-NEXT: ret void @@ -19253,7 +19253,7 @@ // CHECK1-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 // CHECK1-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 // CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* -// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 false) +// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 false) // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK1: user_code.entry: @@ -19263,8 +19263,8 @@ // CHECK1-NEXT: store i32 [[TMP3]], i32* [[CONV1]], align 4 // CHECK1-NEXT: [[TMP4:%.*]] = load i64, i64* [[F_CASTED]], align 8 // CHECK1-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK1-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i64 [[TMP4]]) #[[ATTR1]] -// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) +// CHECK1-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i64 [[TMP4]]) #[[ATTR2]] +// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) // CHECK1-NEXT: ret void // CHECK1: worker.exit: // CHECK1-NEXT: ret void @@ -19486,7 +19486,7 @@ // CHECK1-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 // CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* // CHECK1-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 false) +// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 false) // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK1: user_code.entry: @@ -19496,8 +19496,8 @@ // CHECK1-NEXT: store i32 [[TMP3]], i32* [[CONV1]], align 4 // CHECK1-NEXT: [[TMP4:%.*]] = load i64, i64* [[N_CASTED]], align 8 // CHECK1-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK1-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP4]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR1]] -// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) +// CHECK1-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP4]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]] +// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) // CHECK1-NEXT: ret void // CHECK1: worker.exit: // CHECK1-NEXT: ret void @@ -19805,7 +19805,7 @@ // CHECK1-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8 // CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* // CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 false) +// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 false) // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK1: user_code.entry: @@ -19816,8 +19816,8 @@ // CHECK1-NEXT: [[TMP4:%.*]] = load i64, i64* [[N_CASTED]], align 8 // CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[V_ADDR]], align 8 // CHECK1-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK1-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP4]], [1000 x i32]* [[TMP0]], i32* [[TMP5]]) #[[ATTR1]] -// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) +// CHECK1-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP4]], [1000 x i32]* [[TMP0]], i32* [[TMP5]]) #[[ATTR2]] +// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) // CHECK1-NEXT: ret void // CHECK1: worker.exit: // CHECK1-NEXT: ret void @@ -20078,7 +20078,7 @@ // CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* // CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 // CHECK2-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* -// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 true, i1 false, i1 false) +// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 2, i1 false, i1 false) // CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK2: user_code.entry: @@ -20092,8 +20092,8 @@ // CHECK2-NEXT: store i32 [[TMP5]], i32* [[CONV3]], align 4 // CHECK2-NEXT: [[TMP6:%.*]] = load i64, i64* [[L_CASTED]], align 8 // CHECK2-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP4]], [1000 x i32]* [[TMP0]], i64 [[TMP6]]) #[[ATTR1:[0-9]+]] -// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) +// CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP4]], [1000 x i32]* [[TMP0]], i64 [[TMP6]]) #[[ATTR2:[0-9]+]] +// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) // CHECK2-NEXT: ret void // CHECK2: worker.exit: // CHECK2-NEXT: ret void @@ -20389,7 +20389,7 @@ // // // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l34 -// CHECK2-SAME: (i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK2-SAME: (i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR4:[0-9]+]] { // CHECK2-NEXT: entry: // CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 // CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 @@ -20401,7 +20401,7 @@ // CHECK2-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 // CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* // CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 false) +// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 false) // CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK2: user_code.entry: @@ -20411,8 +20411,8 @@ // CHECK2-NEXT: store i32 [[TMP3]], i32* [[CONV1]], align 4 // CHECK2-NEXT: [[TMP4:%.*]] = load i64, i64* [[N_CASTED]], align 8 // CHECK2-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK2-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP4]], [1000 x i16]* [[TMP0]]) #[[ATTR1]] -// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) +// CHECK2-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP4]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] +// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) // CHECK2-NEXT: ret void // CHECK2: worker.exit: // CHECK2-NEXT: ret void @@ -20656,14 +20656,14 @@ // CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 // CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 // CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 -// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 false) +// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 false) // CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK2: user_code.entry: // CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB4]]) // CHECK2-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK2-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR1]] -// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) +// CHECK2-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] +// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) // CHECK2-NEXT: ret void // CHECK2: worker.exit: // CHECK2-NEXT: ret void @@ -20847,7 +20847,7 @@ // CHECK2-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 // CHECK2-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 // CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* -// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 false) +// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 false) // CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK2: user_code.entry: @@ -20857,8 +20857,8 @@ // CHECK2-NEXT: store i32 [[TMP3]], i32* [[CONV1]], align 4 // CHECK2-NEXT: [[TMP4:%.*]] = load i64, i64* [[F_CASTED]], align 8 // CHECK2-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK2-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i64 [[TMP4]]) #[[ATTR1]] -// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) +// CHECK2-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i64 [[TMP4]]) #[[ATTR2]] +// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) // CHECK2-NEXT: ret void // CHECK2: worker.exit: // CHECK2-NEXT: ret void @@ -21080,7 +21080,7 @@ // CHECK2-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 8 // CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* // CHECK2-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 -// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 false) +// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 false) // CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK2: user_code.entry: @@ -21090,8 +21090,8 @@ // CHECK2-NEXT: store i32 [[TMP3]], i32* [[CONV1]], align 4 // CHECK2-NEXT: [[TMP4:%.*]] = load i64, i64* [[N_CASTED]], align 8 // CHECK2-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK2-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP4]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR1]] -// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) +// CHECK2-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP4]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]] +// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) // CHECK2-NEXT: ret void // CHECK2: worker.exit: // CHECK2-NEXT: ret void @@ -21394,7 +21394,7 @@ // CHECK2-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 8 // CHECK2-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* // CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 -// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 false) +// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 false) // CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK2: user_code.entry: @@ -21405,8 +21405,8 @@ // CHECK2-NEXT: [[TMP4:%.*]] = load i64, i64* [[N_CASTED]], align 8 // CHECK2-NEXT: [[TMP5:%.*]] = load i32*, i32** [[V_ADDR]], align 8 // CHECK2-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK2-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP4]], [1000 x i32]* [[TMP0]], i32* [[TMP5]]) #[[ATTR1]] -// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) +// CHECK2-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP4]], [1000 x i32]* [[TMP0]], i32* [[TMP5]]) #[[ATTR2]] +// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) // CHECK2-NEXT: ret void // CHECK2: worker.exit: // CHECK2-NEXT: ret void @@ -21665,7 +21665,7 @@ // CHECK3-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 // CHECK3-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 // CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 true, i1 false, i1 false) +// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 2, i1 false, i1 false) // CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK3: user_code.entry: @@ -21677,8 +21677,8 @@ // CHECK3-NEXT: store i32 [[TMP5]], i32* [[L_CASTED]], align 4 // CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[L_CASTED]], align 4 // CHECK3-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK3-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP4]], [1000 x i32]* [[TMP0]], i32 [[TMP6]]) #[[ATTR1:[0-9]+]] -// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) +// CHECK3-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP4]], [1000 x i32]* [[TMP0]], i32 [[TMP6]]) #[[ATTR2:[0-9]+]] +// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) // CHECK3-NEXT: ret void // CHECK3: worker.exit: // CHECK3-NEXT: ret void @@ -21961,7 +21961,7 @@ // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l34 -// CHECK3-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK3-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR4:[0-9]+]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 // CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 @@ -21972,7 +21972,7 @@ // CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 // CHECK3-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 // CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 false) +// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 false) // CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK3: user_code.entry: @@ -21981,8 +21981,8 @@ // CHECK3-NEXT: store i32 [[TMP3]], i32* [[N_CASTED]], align 4 // CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[N_CASTED]], align 4 // CHECK3-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK3-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP4]], [1000 x i16]* [[TMP0]]) #[[ATTR1]] -// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) +// CHECK3-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP4]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] +// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) // CHECK3-NEXT: ret void // CHECK3: worker.exit: // CHECK3-NEXT: ret void @@ -22217,14 +22217,14 @@ // CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 // CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 // CHECK3-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 false) +// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 false) // CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK3: user_code.entry: // CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB4]]) // CHECK3-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK3-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR1]] -// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) +// CHECK3-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] +// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) // CHECK3-NEXT: ret void // CHECK3: worker.exit: // CHECK3-NEXT: ret void @@ -22401,7 +22401,7 @@ // CHECK3-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 // CHECK3-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 // CHECK3-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 false) +// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 false) // CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK3: user_code.entry: @@ -22410,8 +22410,8 @@ // CHECK3-NEXT: store i32 [[TMP3]], i32* [[F_CASTED]], align 4 // CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[F_CASTED]], align 4 // CHECK3-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK3-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP4]]) #[[ATTR1]] -// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) +// CHECK3-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP4]]) #[[ATTR2]] +// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) // CHECK3-NEXT: ret void // CHECK3: worker.exit: // CHECK3-NEXT: ret void @@ -22622,7 +22622,7 @@ // CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 // CHECK3-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 // CHECK3-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 false) +// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 false) // CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK3: user_code.entry: @@ -22631,8 +22631,8 @@ // CHECK3-NEXT: store i32 [[TMP3]], i32* [[N_CASTED]], align 4 // CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[N_CASTED]], align 4 // CHECK3-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK3-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP4]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR1]] -// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) +// CHECK3-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP4]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]] +// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) // CHECK3-NEXT: ret void // CHECK3: worker.exit: // CHECK3-NEXT: ret void @@ -22939,7 +22939,7 @@ // CHECK3-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 // CHECK3-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 // CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 false) +// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 false) // CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK3: user_code.entry: @@ -22949,8 +22949,8 @@ // CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[N_CASTED]], align 4 // CHECK3-NEXT: [[TMP5:%.*]] = load i32*, i32** [[V_ADDR]], align 4 // CHECK3-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK3-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP4]], [1000 x i32]* [[TMP0]], i32* [[TMP5]]) #[[ATTR1]] -// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) +// CHECK3-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP4]], [1000 x i32]* [[TMP0]], i32* [[TMP5]]) #[[ATTR2]] +// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) // CHECK3-NEXT: ret void // CHECK3: worker.exit: // CHECK3-NEXT: ret void @@ -23199,7 +23199,7 @@ // CHECK4-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 // CHECK4-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 // CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 true, i1 false, i1 false) +// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 2, i1 false, i1 false) // CHECK4-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK4-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK4: user_code.entry: @@ -23211,8 +23211,8 @@ // CHECK4-NEXT: store i32 [[TMP5]], i32* [[L_CASTED]], align 4 // CHECK4-NEXT: [[TMP6:%.*]] = load i32, i32* [[L_CASTED]], align 4 // CHECK4-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK4-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP4]], [1000 x i32]* [[TMP0]], i32 [[TMP6]]) #[[ATTR1:[0-9]+]] -// CHECK4-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) +// CHECK4-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP4]], [1000 x i32]* [[TMP0]], i32 [[TMP6]]) #[[ATTR2:[0-9]+]] +// CHECK4-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) // CHECK4-NEXT: ret void // CHECK4: worker.exit: // CHECK4-NEXT: ret void @@ -23495,7 +23495,7 @@ // // // CHECK4-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l34 -// CHECK4-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK4-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR4:[0-9]+]] { // CHECK4-NEXT: entry: // CHECK4-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 // CHECK4-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 @@ -23506,7 +23506,7 @@ // CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 // CHECK4-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 // CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 false) +// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 false) // CHECK4-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK4-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK4: user_code.entry: @@ -23515,8 +23515,8 @@ // CHECK4-NEXT: store i32 [[TMP3]], i32* [[N_CASTED]], align 4 // CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[N_CASTED]], align 4 // CHECK4-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK4-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP4]], [1000 x i16]* [[TMP0]]) #[[ATTR1]] -// CHECK4-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) +// CHECK4-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP4]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] +// CHECK4-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) // CHECK4-NEXT: ret void // CHECK4: worker.exit: // CHECK4-NEXT: ret void @@ -23751,14 +23751,14 @@ // CHECK4-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 // CHECK4-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 // CHECK4-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 false) +// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 false) // CHECK4-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK4-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK4: user_code.entry: // CHECK4-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB4]]) // CHECK4-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK4-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR1]] -// CHECK4-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) +// CHECK4-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] +// CHECK4-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) // CHECK4-NEXT: ret void // CHECK4: worker.exit: // CHECK4-NEXT: ret void @@ -23935,7 +23935,7 @@ // CHECK4-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 // CHECK4-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 // CHECK4-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 false) +// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 false) // CHECK4-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK4-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK4: user_code.entry: @@ -23944,8 +23944,8 @@ // CHECK4-NEXT: store i32 [[TMP3]], i32* [[F_CASTED]], align 4 // CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[F_CASTED]], align 4 // CHECK4-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK4-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP4]]) #[[ATTR1]] -// CHECK4-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) +// CHECK4-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP4]]) #[[ATTR2]] +// CHECK4-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) // CHECK4-NEXT: ret void // CHECK4: worker.exit: // CHECK4-NEXT: ret void @@ -24156,7 +24156,7 @@ // CHECK4-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 // CHECK4-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 // CHECK4-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 false) +// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 false) // CHECK4-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK4-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK4: user_code.entry: @@ -24165,8 +24165,8 @@ // CHECK4-NEXT: store i32 [[TMP3]], i32* [[N_CASTED]], align 4 // CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[N_CASTED]], align 4 // CHECK4-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK4-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP4]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR1]] -// CHECK4-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) +// CHECK4-NEXT: call void @__omp_outlined__8(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP4]], [10 x [10 x i32]]* [[TMP0]]) #[[ATTR2]] +// CHECK4-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) // CHECK4-NEXT: ret void // CHECK4: worker.exit: // CHECK4-NEXT: ret void @@ -24473,7 +24473,7 @@ // CHECK4-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 // CHECK4-NEXT: store i32* [[V]], i32** [[V_ADDR]], align 4 // CHECK4-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 false) +// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 false) // CHECK4-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK4-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK4: user_code.entry: @@ -24483,8 +24483,8 @@ // CHECK4-NEXT: [[TMP4:%.*]] = load i32, i32* [[N_CASTED]], align 4 // CHECK4-NEXT: [[TMP5:%.*]] = load i32*, i32** [[V_ADDR]], align 4 // CHECK4-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK4-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP4]], [1000 x i32]* [[TMP0]], i32* [[TMP5]]) #[[ATTR1]] -// CHECK4-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) +// CHECK4-NEXT: call void @__omp_outlined__10(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP4]], [1000 x i32]* [[TMP0]], i32* [[TMP5]]) #[[ATTR2]] +// CHECK4-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) // CHECK4-NEXT: ret void // CHECK4: worker.exit: // CHECK4-NEXT: ret void diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_generic_mode_codegen.cpp @@ -46,7 +46,7 @@ // CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32* // CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 // CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[DOTCAPTURE_EXPR__ADDR]] to i32* -// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 true, i1 false, i1 false) +// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 2, i1 false, i1 false) // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK1: user_code.entry: @@ -61,7 +61,7 @@ // CHECK1-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR__CASTED]], align 8 // CHECK1-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 // CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP4]], i32* [[TMP0]], i64 [[TMP6]]) #[[ATTR3:[0-9]+]] -// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) +// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) // CHECK1-NEXT: ret void // CHECK1: worker.exit: // CHECK1-NEXT: ret void @@ -304,10 +304,10 @@ // CHECK1-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP19]], 1 // CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK1-NEXT: store i32 [[ADD]], i32* [[I7]], align 4 -// CHECK1-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I7]]) #[[ATTR4:[0-9]+]] -// CHECK1-NEXT: [[CALL13:%.*]] = call i32 @_Z3fooPi(i32* [[TMP0]]) #[[ATTR4]] +// CHECK1-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I7]]) #[[ATTR5:[0-9]+]] +// CHECK1-NEXT: [[CALL13:%.*]] = call i32 @_Z3fooPi(i32* [[TMP0]]) #[[ATTR5]] // CHECK1-NEXT: [[ADD14:%.*]] = add nsw i32 [[CALL]], [[CALL13]] -// CHECK1-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[CONV]]) #[[ATTR4]] +// CHECK1-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[CONV]]) #[[ATTR5]] // CHECK1-NEXT: [[ADD16:%.*]] = add nsw i32 [[ADD14]], [[CALL15]] // CHECK1-NEXT: store i32 [[ADD16]], i32* [[TMP0]], align 4 // CHECK1-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] @@ -354,7 +354,7 @@ // CHECK2-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 // CHECK2-NEXT: store i32 [[DOTCAPTURE_EXPR_]], i32* [[DOTCAPTURE_EXPR__ADDR]], align 4 // CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 true, i1 false, i1 false) +// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 2, i1 false, i1 false) // CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK2: user_code.entry: @@ -367,7 +367,7 @@ // CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR__CASTED]], align 4 // CHECK2-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 // CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP4]], i32* [[TMP0]], i32 [[TMP6]]) #[[ATTR3:[0-9]+]] -// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) +// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) // CHECK2-NEXT: ret void // CHECK2: worker.exit: // CHECK2-NEXT: ret void @@ -598,10 +598,10 @@ // CHECK2-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP19]], 1 // CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK2-NEXT: store i32 [[ADD]], i32* [[I4]], align 4 -// CHECK2-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I4]]) #[[ATTR4:[0-9]+]] -// CHECK2-NEXT: [[CALL8:%.*]] = call i32 @_Z3fooPi(i32* [[TMP0]]) #[[ATTR4]] +// CHECK2-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I4]]) #[[ATTR5:[0-9]+]] +// CHECK2-NEXT: [[CALL8:%.*]] = call i32 @_Z3fooPi(i32* [[TMP0]]) #[[ATTR5]] // CHECK2-NEXT: [[ADD9:%.*]] = add nsw i32 [[CALL]], [[CALL8]] -// CHECK2-NEXT: [[CALL10:%.*]] = call i32 @_Z3fooPi(i32* [[ARGC_ADDR]]) #[[ATTR4]] +// CHECK2-NEXT: [[CALL10:%.*]] = call i32 @_Z3fooPi(i32* [[ARGC_ADDR]]) #[[ATTR5]] // CHECK2-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD9]], [[CALL10]] // CHECK2-NEXT: store i32 [[ADD11]], i32* [[TMP0]], align 4 // CHECK2-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] @@ -648,7 +648,7 @@ // CHECK3-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 // CHECK3-NEXT: store i32 [[DOTCAPTURE_EXPR_]], i32* [[DOTCAPTURE_EXPR__ADDR]], align 4 // CHECK3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 true, i1 false, i1 false) +// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 2, i1 false, i1 false) // CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK3: user_code.entry: @@ -661,7 +661,7 @@ // CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR__CASTED]], align 4 // CHECK3-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 // CHECK3-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP4]], i32* [[TMP0]], i32 [[TMP6]]) #[[ATTR3:[0-9]+]] -// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) +// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) // CHECK3-NEXT: ret void // CHECK3: worker.exit: // CHECK3-NEXT: ret void @@ -892,10 +892,10 @@ // CHECK3-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP19]], 1 // CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK3-NEXT: store i32 [[ADD]], i32* [[I4]], align 4 -// CHECK3-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I4]]) #[[ATTR4:[0-9]+]] -// CHECK3-NEXT: [[CALL8:%.*]] = call i32 @_Z3fooPi(i32* [[TMP0]]) #[[ATTR4]] +// CHECK3-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I4]]) #[[ATTR5:[0-9]+]] +// CHECK3-NEXT: [[CALL8:%.*]] = call i32 @_Z3fooPi(i32* [[TMP0]]) #[[ATTR5]] // CHECK3-NEXT: [[ADD9:%.*]] = add nsw i32 [[CALL]], [[CALL8]] -// CHECK3-NEXT: [[CALL10:%.*]] = call i32 @_Z3fooPi(i32* [[ARGC_ADDR]]) #[[ATTR4]] +// CHECK3-NEXT: [[CALL10:%.*]] = call i32 @_Z3fooPi(i32* [[ARGC_ADDR]]) #[[ATTR5]] // CHECK3-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD9]], [[CALL10]] // CHECK3-NEXT: store i32 [[ADD11]], i32* [[TMP0]], align 4 // CHECK3-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] @@ -944,7 +944,7 @@ // CHECK4-NEXT: [[CONV:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32* // CHECK4-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8 // CHECK4-NEXT: [[CONV1:%.*]] = bitcast i64* [[DOTCAPTURE_EXPR__ADDR]] to i32* -// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 true, i1 false, i1 false) +// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 2, i1 false, i1 false) // CHECK4-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK4-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK4: user_code.entry: @@ -959,7 +959,7 @@ // CHECK4-NEXT: [[TMP6:%.*]] = load i64, i64* [[DOTCAPTURE_EXPR__CASTED]], align 8 // CHECK4-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 // CHECK4-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP4]], i32* [[TMP0]], i64 [[TMP6]]) #[[ATTR3:[0-9]+]] -// CHECK4-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) +// CHECK4-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) // CHECK4-NEXT: ret void // CHECK4: worker.exit: // CHECK4-NEXT: ret void @@ -1202,10 +1202,10 @@ // CHECK4-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP19]], 1 // CHECK4-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK4-NEXT: store i32 [[ADD]], i32* [[I7]], align 4 -// CHECK4-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I7]]) #[[ATTR4:[0-9]+]] -// CHECK4-NEXT: [[CALL13:%.*]] = call i32 @_Z3fooPi(i32* [[TMP0]]) #[[ATTR4]] +// CHECK4-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I7]]) #[[ATTR5:[0-9]+]] +// CHECK4-NEXT: [[CALL13:%.*]] = call i32 @_Z3fooPi(i32* [[TMP0]]) #[[ATTR5]] // CHECK4-NEXT: [[ADD14:%.*]] = add nsw i32 [[CALL]], [[CALL13]] -// CHECK4-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[CONV]]) #[[ATTR4]] +// CHECK4-NEXT: [[CALL15:%.*]] = call i32 @_Z3fooPi(i32* [[CONV]]) #[[ATTR5]] // CHECK4-NEXT: [[ADD16:%.*]] = add nsw i32 [[ADD14]], [[CALL15]] // CHECK4-NEXT: store i32 [[ADD16]], i32* [[TMP0]], align 4 // CHECK4-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] @@ -1252,7 +1252,7 @@ // CHECK5-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 // CHECK5-NEXT: store i32 [[DOTCAPTURE_EXPR_]], i32* [[DOTCAPTURE_EXPR__ADDR]], align 4 // CHECK5-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 true, i1 false, i1 false) +// CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 2, i1 false, i1 false) // CHECK5-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK5-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK5: user_code.entry: @@ -1265,7 +1265,7 @@ // CHECK5-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR__CASTED]], align 4 // CHECK5-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 // CHECK5-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP4]], i32* [[TMP0]], i32 [[TMP6]]) #[[ATTR3:[0-9]+]] -// CHECK5-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) +// CHECK5-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) // CHECK5-NEXT: ret void // CHECK5: worker.exit: // CHECK5-NEXT: ret void @@ -1496,10 +1496,10 @@ // CHECK5-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP19]], 1 // CHECK5-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK5-NEXT: store i32 [[ADD]], i32* [[I4]], align 4 -// CHECK5-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I4]]) #[[ATTR4:[0-9]+]] -// CHECK5-NEXT: [[CALL8:%.*]] = call i32 @_Z3fooPi(i32* [[TMP0]]) #[[ATTR4]] +// CHECK5-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I4]]) #[[ATTR5:[0-9]+]] +// CHECK5-NEXT: [[CALL8:%.*]] = call i32 @_Z3fooPi(i32* [[TMP0]]) #[[ATTR5]] // CHECK5-NEXT: [[ADD9:%.*]] = add nsw i32 [[CALL]], [[CALL8]] -// CHECK5-NEXT: [[CALL10:%.*]] = call i32 @_Z3fooPi(i32* [[ARGC_ADDR]]) #[[ATTR4]] +// CHECK5-NEXT: [[CALL10:%.*]] = call i32 @_Z3fooPi(i32* [[ARGC_ADDR]]) #[[ATTR5]] // CHECK5-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD9]], [[CALL10]] // CHECK5-NEXT: store i32 [[ADD11]], i32* [[TMP0]], align 4 // CHECK5-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] @@ -1546,7 +1546,7 @@ // CHECK6-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 4 // CHECK6-NEXT: store i32 [[DOTCAPTURE_EXPR_]], i32* [[DOTCAPTURE_EXPR__ADDR]], align 4 // CHECK6-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 4 -// CHECK6-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 true, i1 false, i1 false) +// CHECK6-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 2, i1 false, i1 false) // CHECK6-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK6-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK6: user_code.entry: @@ -1559,7 +1559,7 @@ // CHECK6-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTCAPTURE_EXPR__CASTED]], align 4 // CHECK6-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 // CHECK6-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP4]], i32* [[TMP0]], i32 [[TMP6]]) #[[ATTR3:[0-9]+]] -// CHECK6-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) +// CHECK6-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) // CHECK6-NEXT: ret void // CHECK6: worker.exit: // CHECK6-NEXT: ret void @@ -1790,10 +1790,10 @@ // CHECK6-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP19]], 1 // CHECK6-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] // CHECK6-NEXT: store i32 [[ADD]], i32* [[I4]], align 4 -// CHECK6-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I4]]) #[[ATTR4:[0-9]+]] -// CHECK6-NEXT: [[CALL8:%.*]] = call i32 @_Z3fooPi(i32* [[TMP0]]) #[[ATTR4]] +// CHECK6-NEXT: [[CALL:%.*]] = call i32 @_Z3fooPi(i32* [[I4]]) #[[ATTR5:[0-9]+]] +// CHECK6-NEXT: [[CALL8:%.*]] = call i32 @_Z3fooPi(i32* [[TMP0]]) #[[ATTR5]] // CHECK6-NEXT: [[ADD9:%.*]] = add nsw i32 [[CALL]], [[CALL8]] -// CHECK6-NEXT: [[CALL10:%.*]] = call i32 @_Z3fooPi(i32* [[ARGC_ADDR]]) #[[ATTR4]] +// CHECK6-NEXT: [[CALL10:%.*]] = call i32 @_Z3fooPi(i32* [[ARGC_ADDR]]) #[[ATTR5]] // CHECK6-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD9]], [[CALL10]] // CHECK6-NEXT: store i32 [[ADD11]], i32* [[TMP0]], align 4 // CHECK6-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_teams_distribute_parallel_for_simd_codegen.cpp @@ -9397,7 +9397,7 @@ // CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* // CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 8 // CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[L_ADDR]] to i32* -// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 true, i1 false, i1 false) +// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 2, i1 false, i1 false) // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK1: user_code.entry: @@ -9411,8 +9411,8 @@ // CHECK1-NEXT: store i32 [[TMP5]], i32* [[CONV3]], align 4 // CHECK1-NEXT: [[TMP6:%.*]] = load i64, i64* [[L_CASTED]], align 8 // CHECK1-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP4]], [1000 x i32]* [[TMP0]], i64 [[TMP6]]) #[[ATTR1:[0-9]+]] -// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) +// CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP4]], [1000 x i32]* [[TMP0]], i64 [[TMP6]]) #[[ATTR2:[0-9]+]] +// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) // CHECK1-NEXT: ret void // CHECK1: worker.exit: // CHECK1-NEXT: ret void @@ -9732,7 +9732,7 @@ // // // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l32 -// CHECK1-SAME: (i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK1-SAME: (i64 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR4:[0-9]+]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8 // CHECK1-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 8 @@ -9744,7 +9744,7 @@ // CHECK1-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 8 // CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32* // CHECK1-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 false) +// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 false) // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK1: user_code.entry: @@ -9754,8 +9754,8 @@ // CHECK1-NEXT: store i32 [[TMP3]], i32* [[CONV1]], align 4 // CHECK1-NEXT: [[TMP4:%.*]] = load i64, i64* [[N_CASTED]], align 8 // CHECK1-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK1-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP4]], [1000 x i16]* [[TMP0]]) #[[ATTR1]] -// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) +// CHECK1-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i64 [[TMP4]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] +// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) // CHECK1-NEXT: ret void // CHECK1: worker.exit: // CHECK1-NEXT: ret void @@ -10023,14 +10023,14 @@ // CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 // CHECK1-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8 // CHECK1-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8 -// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 false) +// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 false) // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK1: user_code.entry: // CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB4]]) // CHECK1-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK1-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR1]] -// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) +// CHECK1-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] +// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) // CHECK1-NEXT: ret void // CHECK1: worker.exit: // CHECK1-NEXT: ret void @@ -10228,7 +10228,7 @@ // CHECK1-NEXT: store i64 [[F]], i64* [[F_ADDR]], align 8 // CHECK1-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 8 // CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[F_ADDR]] to i32* -// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 false) +// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 false) // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK1: user_code.entry: @@ -10238,8 +10238,8 @@ // CHECK1-NEXT: store i32 [[TMP3]], i32* [[CONV1]], align 4 // CHECK1-NEXT: [[TMP4:%.*]] = load i64, i64* [[F_CASTED]], align 8 // CHECK1-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK1-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i64 [[TMP4]]) #[[ATTR1]] -// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) +// CHECK1-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i64 [[TMP4]]) #[[ATTR2]] +// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) // CHECK1-NEXT: ret void // CHECK1: worker.exit: // CHECK1-NEXT: ret void @@ -10479,7 +10479,7 @@ // CHECK2-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 // CHECK2-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 // CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 true, i1 false, i1 false) +// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 2, i1 false, i1 false) // CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK2: user_code.entry: @@ -10491,8 +10491,8 @@ // CHECK2-NEXT: store i32 [[TMP5]], i32* [[L_CASTED]], align 4 // CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[L_CASTED]], align 4 // CHECK2-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP4]], [1000 x i32]* [[TMP0]], i32 [[TMP6]]) #[[ATTR1:[0-9]+]] -// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) +// CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP4]], [1000 x i32]* [[TMP0]], i32 [[TMP6]]) #[[ATTR2:[0-9]+]] +// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) // CHECK2-NEXT: ret void // CHECK2: worker.exit: // CHECK2-NEXT: ret void @@ -10799,7 +10799,7 @@ // // // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l32 -// CHECK2-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK2-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR4:[0-9]+]] { // CHECK2-NEXT: entry: // CHECK2-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 // CHECK2-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 @@ -10810,7 +10810,7 @@ // CHECK2-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 // CHECK2-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 // CHECK2-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 false) +// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 false) // CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK2: user_code.entry: @@ -10819,8 +10819,8 @@ // CHECK2-NEXT: store i32 [[TMP3]], i32* [[N_CASTED]], align 4 // CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[N_CASTED]], align 4 // CHECK2-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK2-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP4]], [1000 x i16]* [[TMP0]]) #[[ATTR1]] -// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) +// CHECK2-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP4]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] +// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) // CHECK2-NEXT: ret void // CHECK2: worker.exit: // CHECK2-NEXT: ret void @@ -11079,14 +11079,14 @@ // CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 // CHECK2-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 // CHECK2-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 false) +// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 false) // CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK2: user_code.entry: // CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB4]]) // CHECK2-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK2-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR1]] -// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) +// CHECK2-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] +// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) // CHECK2-NEXT: ret void // CHECK2: worker.exit: // CHECK2-NEXT: ret void @@ -11277,7 +11277,7 @@ // CHECK2-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 // CHECK2-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 // CHECK2-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 false) +// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 false) // CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK2: user_code.entry: @@ -11286,8 +11286,8 @@ // CHECK2-NEXT: store i32 [[TMP3]], i32* [[F_CASTED]], align 4 // CHECK2-NEXT: [[TMP4:%.*]] = load i32, i32* [[F_CASTED]], align 4 // CHECK2-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK2-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP4]]) #[[ATTR1]] -// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) +// CHECK2-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP4]]) #[[ATTR2]] +// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) // CHECK2-NEXT: ret void // CHECK2: worker.exit: // CHECK2-NEXT: ret void @@ -11517,7 +11517,7 @@ // CHECK3-NEXT: store [1000 x i32]* [[A]], [1000 x i32]** [[A_ADDR]], align 4 // CHECK3-NEXT: store i32 [[L]], i32* [[L_ADDR]], align 4 // CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i32]*, [1000 x i32]** [[A_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 true, i1 false, i1 false) +// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 2, i1 false, i1 false) // CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK3: user_code.entry: @@ -11529,8 +11529,8 @@ // CHECK3-NEXT: store i32 [[TMP5]], i32* [[L_CASTED]], align 4 // CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[L_CASTED]], align 4 // CHECK3-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK3-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP4]], [1000 x i32]* [[TMP0]], i32 [[TMP6]]) #[[ATTR1:[0-9]+]] -// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) +// CHECK3-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP4]], [1000 x i32]* [[TMP0]], i32 [[TMP6]]) #[[ATTR2:[0-9]+]] +// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) // CHECK3-NEXT: ret void // CHECK3: worker.exit: // CHECK3-NEXT: ret void @@ -11837,7 +11837,7 @@ // // // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l32 -// CHECK3-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK3-SAME: (i32 [[N:%.*]], [1000 x i16]* nonnull align 2 dereferenceable(2000) [[AA:%.*]]) #[[ATTR4:[0-9]+]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 // CHECK3-NEXT: [[AA_ADDR:%.*]] = alloca [1000 x i16]*, align 4 @@ -11848,7 +11848,7 @@ // CHECK3-NEXT: store i32 [[N]], i32* [[N_ADDR]], align 4 // CHECK3-NEXT: store [1000 x i16]* [[AA]], [1000 x i16]** [[AA_ADDR]], align 4 // CHECK3-NEXT: [[TMP0:%.*]] = load [1000 x i16]*, [1000 x i16]** [[AA_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 false) +// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 false) // CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK3: user_code.entry: @@ -11857,8 +11857,8 @@ // CHECK3-NEXT: store i32 [[TMP3]], i32* [[N_CASTED]], align 4 // CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[N_CASTED]], align 4 // CHECK3-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK3-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP4]], [1000 x i16]* [[TMP0]]) #[[ATTR1]] -// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) +// CHECK3-NEXT: call void @__omp_outlined__2(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32 [[TMP4]], [1000 x i16]* [[TMP0]]) #[[ATTR2]] +// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) // CHECK3-NEXT: ret void // CHECK3: worker.exit: // CHECK3-NEXT: ret void @@ -12117,14 +12117,14 @@ // CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 // CHECK3-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 4 // CHECK3-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 false) +// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 false) // CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK3: user_code.entry: // CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB4]]) // CHECK3-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK3-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR1]] -// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) +// CHECK3-NEXT: call void @__omp_outlined__4(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP0]]) #[[ATTR2]] +// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) // CHECK3-NEXT: ret void // CHECK3: worker.exit: // CHECK3-NEXT: ret void @@ -12315,7 +12315,7 @@ // CHECK3-NEXT: store [10 x [10 x i32]]* [[C]], [10 x [10 x i32]]** [[C_ADDR]], align 4 // CHECK3-NEXT: store i32 [[F]], i32* [[F_ADDR]], align 4 // CHECK3-NEXT: [[TMP0:%.*]] = load [10 x [10 x i32]]*, [10 x [10 x i32]]** [[C_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 false) +// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 false) // CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK3: user_code.entry: @@ -12324,8 +12324,8 @@ // CHECK3-NEXT: store i32 [[TMP3]], i32* [[F_CASTED]], align 4 // CHECK3-NEXT: [[TMP4:%.*]] = load i32, i32* [[F_CASTED]], align 4 // CHECK3-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK3-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP4]]) #[[ATTR1]] -// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) +// CHECK3-NEXT: call void @__omp_outlined__6(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], [10 x [10 x i32]]* [[TMP0]], i32 [[TMP4]]) #[[ATTR2]] +// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 false) // CHECK3-NEXT: ret void // CHECK3: worker.exit: // CHECK3-NEXT: ret void diff --git a/clang/test/OpenMP/nvptx_target_teams_distribute_simd_codegen.cpp b/clang/test/OpenMP/nvptx_target_teams_distribute_simd_codegen.cpp --- a/clang/test/OpenMP/nvptx_target_teams_distribute_simd_codegen.cpp +++ b/clang/test/OpenMP/nvptx_target_teams_distribute_simd_codegen.cpp @@ -70,24 +70,24 @@ } // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l37( -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false) -// CHECK: call void @__kmpc_target_deinit({{.*}}, i1 true, i1 false) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 false) +// CHECK: call void @__kmpc_target_deinit({{.*}}, i8 2, i1 false) // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91, // CHECK: call void @__kmpc_for_static_fini( // CHECK: ret void // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l43( -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false) -// CHECK: call void @__kmpc_target_deinit({{.*}}, i1 true, i1 false) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 false) +// CHECK: call void @__kmpc_target_deinit({{.*}}, i8 2, i1 false) // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91, // CHECK: call void @__kmpc_for_static_fini( // CHECK: ret void // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l48( -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false) -// CHECK: call void @__kmpc_target_deinit({{.*}}, i1 true, i1 false) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 false) +// CHECK: call void @__kmpc_target_deinit({{.*}}, i8 2, i1 false) // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91, // CHECK: call void @__kmpc_for_static_fini( @@ -95,8 +95,8 @@ // CHECK: define {{.*}}void {{@__omp_offloading_.+}}_l53({{.+}}, i{{32|64}} [[F_IN:%.+]]) // CHECK: store {{.+}} [[F_IN]], {{.+}}* {{.+}}, -// CHECK: call i32 @__kmpc_target_init({{.*}}, i1 true, i1 false, i1 false) -// CHECK: call void @__kmpc_target_deinit({{.*}}, i1 true, i1 false) +// CHECK: call i32 @__kmpc_target_init({{.*}}, i8 2, i1 false, i1 false) +// CHECK: call void @__kmpc_target_deinit({{.*}}, i8 2, i1 false) // CHECK: store {{.+}} 99, {{.+}}* [[COMB_UB:%.+]], align // CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91, {{.+}}, {{.+}}, {{.+}}* [[COMB_UB]], diff --git a/clang/test/OpenMP/nvptx_teams_codegen.cpp b/clang/test/OpenMP/nvptx_teams_codegen.cpp --- a/clang/test/OpenMP/nvptx_teams_codegen.cpp +++ b/clang/test/OpenMP/nvptx_teams_codegen.cpp @@ -899,7 +899,7 @@ // CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 // CHECK1-NEXT: store i64 [[ARGC]], i64* [[ARGC_ADDR]], align 8 // CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32* -// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 false, i1 true, i1 true) +// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 1, i1 true, i1 true) // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK1: user_code.entry: @@ -909,9 +909,9 @@ // CHECK1-NEXT: store i32 [[TMP1]], i32* [[ARGC_ON_STACK]], align 4 // CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) // CHECK1-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[ARGC_ON_STACK]]) #[[ATTR1:[0-9]+]] +// CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[ARGC_ON_STACK]]) #[[ATTR2:[0-9]+]] // CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[ARGC1]], i64 4) -// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK1-NEXT: ret void // CHECK1: worker.exit: // CHECK1-NEXT: ret void @@ -939,7 +939,7 @@ // CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 // CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 // CHECK1-NEXT: store i8** [[ARGC]], i8*** [[ARGC_ADDR]], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK1: user_code.entry: @@ -949,9 +949,9 @@ // CHECK1-NEXT: store i8** [[TMP1]], i8*** [[ARGC_ON_STACK]], align 8 // CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) // CHECK1-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK1-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8*** [[ARGC_ON_STACK]]) #[[ATTR1]] +// CHECK1-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8*** [[ARGC_ON_STACK]]) #[[ATTR2]] // CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[ARGC1]], i64 8) -// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK1-NEXT: ret void // CHECK1: worker.exit: // CHECK1-NEXT: ret void @@ -979,7 +979,7 @@ // CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 // CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 // CHECK2-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 -// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 false, i1 true, i1 true) +// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 1, i1 true, i1 true) // CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK2: user_code.entry: @@ -989,9 +989,9 @@ // CHECK2-NEXT: store i32 [[TMP1]], i32* [[ARGC_ON_STACK]], align 4 // CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) // CHECK2-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[ARGC_ON_STACK]]) #[[ATTR1:[0-9]+]] +// CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[ARGC_ON_STACK]]) #[[ATTR2:[0-9]+]] // CHECK2-NEXT: call void @__kmpc_free_shared(i8* [[ARGC1]], i32 4) -// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK2-NEXT: ret void // CHECK2: worker.exit: // CHECK2-NEXT: ret void @@ -1019,7 +1019,7 @@ // CHECK2-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 // CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 // CHECK2-NEXT: store i8** [[ARGC]], i8*** [[ARGC_ADDR]], align 4 -// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) // CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK2: user_code.entry: @@ -1029,9 +1029,9 @@ // CHECK2-NEXT: store i8** [[TMP1]], i8*** [[ARGC_ON_STACK]], align 4 // CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) // CHECK2-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK2-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8*** [[ARGC_ON_STACK]]) #[[ATTR1]] +// CHECK2-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8*** [[ARGC_ON_STACK]]) #[[ATTR2]] // CHECK2-NEXT: call void @__kmpc_free_shared(i8* [[ARGC1]], i32 4) -// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK2-NEXT: ret void // CHECK2: worker.exit: // CHECK2-NEXT: ret void @@ -1066,7 +1066,7 @@ // CHECK3-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* // CHECK3-NEXT: [[CONV1:%.*]] = bitcast i64* [[B_ADDR]] to i32* // CHECK3-NEXT: [[CONV2:%.*]] = bitcast i64* [[ARGC_ADDR]] to i32* -// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 false, i1 true, i1 true) +// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 1, i1 true, i1 true) // CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK3: user_code.entry: @@ -1076,9 +1076,9 @@ // CHECK3-NEXT: store i32 [[TMP1]], i32* [[ARGC_ON_STACK]], align 4 // CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) // CHECK3-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK3-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[ARGC_ON_STACK]]) #[[ATTR1:[0-9]+]] +// CHECK3-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[ARGC_ON_STACK]]) #[[ATTR2:[0-9]+]] // CHECK3-NEXT: call void @__kmpc_free_shared(i8* [[ARGC3]], i64 4) -// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK3-NEXT: ret void // CHECK3: worker.exit: // CHECK3-NEXT: ret void @@ -1112,7 +1112,7 @@ // CHECK3-NEXT: store i8** [[ARGC]], i8*** [[ARGC_ADDR]], align 8 // CHECK3-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* // CHECK3-NEXT: [[CONV1:%.*]] = bitcast i64* [[B_ADDR]] to i32* -// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) // CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK3: user_code.entry: @@ -1122,9 +1122,9 @@ // CHECK3-NEXT: store i8** [[TMP1]], i8*** [[ARGC_ON_STACK]], align 8 // CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) // CHECK3-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK3-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8*** [[ARGC_ON_STACK]]) #[[ATTR1]] +// CHECK3-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8*** [[ARGC_ON_STACK]]) #[[ATTR2]] // CHECK3-NEXT: call void @__kmpc_free_shared(i8* [[ARGC2]], i64 8) -// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK3-NEXT: ret void // CHECK3: worker.exit: // CHECK3-NEXT: ret void @@ -1156,7 +1156,7 @@ // CHECK4-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 // CHECK4-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4 // CHECK4-NEXT: store i32 [[ARGC]], i32* [[ARGC_ADDR]], align 4 -// CHECK4-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 false, i1 true, i1 true) +// CHECK4-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 1, i1 true, i1 true) // CHECK4-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK4-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK4: user_code.entry: @@ -1166,9 +1166,9 @@ // CHECK4-NEXT: store i32 [[TMP1]], i32* [[ARGC_ON_STACK]], align 4 // CHECK4-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) // CHECK4-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK4-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[ARGC_ON_STACK]]) #[[ATTR1:[0-9]+]] +// CHECK4-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[ARGC_ON_STACK]]) #[[ATTR2:[0-9]+]] // CHECK4-NEXT: call void @__kmpc_free_shared(i8* [[ARGC1]], i32 4) -// CHECK4-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK4-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK4-NEXT: ret void // CHECK4: worker.exit: // CHECK4-NEXT: ret void @@ -1200,7 +1200,7 @@ // CHECK4-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 // CHECK4-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4 // CHECK4-NEXT: store i8** [[ARGC]], i8*** [[ARGC_ADDR]], align 4 -// CHECK4-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +// CHECK4-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) // CHECK4-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK4-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK4: user_code.entry: @@ -1210,9 +1210,9 @@ // CHECK4-NEXT: store i8** [[TMP1]], i8*** [[ARGC_ON_STACK]], align 4 // CHECK4-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) // CHECK4-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK4-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8*** [[ARGC_ON_STACK]]) #[[ATTR1]] +// CHECK4-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8*** [[ARGC_ON_STACK]]) #[[ATTR2]] // CHECK4-NEXT: call void @__kmpc_free_shared(i8* [[ARGC1]], i32 4) -// CHECK4-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK4-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK4-NEXT: ret void // CHECK4: worker.exit: // CHECK4-NEXT: ret void diff --git a/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp b/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp --- a/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp +++ b/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp @@ -4215,7 +4215,7 @@ // CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 // CHECK1-NEXT: store i64 [[E]], i64* [[E_ADDR]], align 8 // CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[E_ADDR]] to double* -// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 false, i1 true, i1 true) +// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 1, i1 true, i1 true) // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK1: user_code.entry: @@ -4225,9 +4225,9 @@ // CHECK1-NEXT: store double [[TMP1]], double* [[E_ON_STACK]], align 8 // CHECK1-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) // CHECK1-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], double* [[E_ON_STACK]]) #[[ATTR1:[0-9]+]] +// CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], double* [[E_ON_STACK]]) #[[ATTR4:[0-9]+]] // CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[E1]], i64 8) -// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK1-NEXT: ret void // CHECK1: worker.exit: // CHECK1-NEXT: ret void @@ -4323,7 +4323,7 @@ // CHECK1: then: // CHECK1-NEXT: [[TMP35:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8* // CHECK1-NEXT: [[TMP36:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* -// CHECK1-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP35]], i8* [[TMP36]]) #[[ATTR1]] +// CHECK1-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP35]], i8* [[TMP36]]) #[[ATTR4]] // CHECK1-NEXT: br label [[IFCONT:%.*]] // CHECK1: else: // CHECK1-NEXT: br label [[IFCONT]] @@ -4453,7 +4453,7 @@ // CHECK1-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 8 // CHECK1-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* // CHECK1-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 -// CHECK1-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP9]], i8* [[TMP10]]) #[[ATTR1]] +// CHECK1-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP9]], i8* [[TMP10]]) #[[ATTR4]] // CHECK1-NEXT: ret void // // @@ -4501,7 +4501,7 @@ // CHECK1-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 8 // CHECK1-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* // CHECK1-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 -// CHECK1-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP10]], i8* [[TMP9]]) #[[ATTR1]] +// CHECK1-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP10]], i8* [[TMP9]]) #[[ATTR4]] // CHECK1-NEXT: ret void // // @@ -4517,7 +4517,7 @@ // CHECK1-NEXT: store i64 [[D]], i64* [[D_ADDR]], align 8 // CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[C_ADDR]] to i8* // CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[D_ADDR]] to float* -// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK1: user_code.entry: @@ -4530,10 +4530,10 @@ // CHECK1-NEXT: store float [[TMP2]], float* [[D_ON_STACK]], align 4 // CHECK1-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) // CHECK1-NEXT: store i32 [[TMP3]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK1-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8* [[C2]], float* [[D_ON_STACK]]) #[[ATTR1]] +// CHECK1-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8* [[C2]], float* [[D_ON_STACK]]) #[[ATTR4]] // CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[D3]], i64 4) // CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[C2]], i64 1) -// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK1-NEXT: ret void // CHECK1: worker.exit: // CHECK1-NEXT: ret void @@ -4664,7 +4664,7 @@ // CHECK1: then: // CHECK1-NEXT: [[TMP46:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* // CHECK1-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* -// CHECK1-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP46]], i8* [[TMP47]]) #[[ATTR1]] +// CHECK1-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP46]], i8* [[TMP47]]) #[[ATTR4]] // CHECK1-NEXT: br label [[IFCONT:%.*]] // CHECK1: else: // CHECK1-NEXT: br label [[IFCONT]] @@ -4825,7 +4825,7 @@ // CHECK1-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 8 // CHECK1-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* // CHECK1-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 -// CHECK1-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP11]], i8* [[TMP12]]) #[[ATTR1]] +// CHECK1-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP11]], i8* [[TMP12]]) #[[ATTR4]] // CHECK1-NEXT: ret void // // @@ -4883,7 +4883,7 @@ // CHECK1-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 8 // CHECK1-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* // CHECK1-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 -// CHECK1-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP12]], i8* [[TMP11]]) #[[ATTR1]] +// CHECK1-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP12]], i8* [[TMP11]]) #[[ATTR4]] // CHECK1-NEXT: ret void // // @@ -4899,14 +4899,14 @@ // CHECK1-NEXT: store i64 [[B]], i64* [[B_ADDR]], align 8 // CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[A_ADDR]] to i32* // CHECK1-NEXT: [[CONV1:%.*]] = bitcast i64* [[B_ADDR]] to i16* -// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 true) +// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 true) // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK1: user_code.entry: // CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) // CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK1-NEXT: call void @__omp_outlined__9(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[CONV]], i16* [[CONV1]]) #[[ATTR1]] -// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true) +// CHECK1-NEXT: call void @__omp_outlined__9(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[CONV]], i16* [[CONV1]]) #[[ATTR4]] +// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) // CHECK1-NEXT: ret void // CHECK1: worker.exit: // CHECK1-NEXT: ret void @@ -5118,7 +5118,7 @@ // CHECK1: then: // CHECK1-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* // CHECK1-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* -// CHECK1-NEXT: call void @"_omp$reduction$reduction_func11"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR1]] +// CHECK1-NEXT: call void @"_omp$reduction$reduction_func11"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR4]] // CHECK1-NEXT: br label [[IFCONT:%.*]] // CHECK1: else: // CHECK1-NEXT: br label [[IFCONT]] @@ -5296,7 +5296,7 @@ // CHECK1: then: // CHECK1-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* // CHECK1-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* -// CHECK1-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR1]] +// CHECK1-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR4]] // CHECK1-NEXT: br label [[IFCONT:%.*]] // CHECK1: else: // CHECK1-NEXT: br label [[IFCONT]] @@ -5463,7 +5463,7 @@ // CHECK1-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 8 // CHECK1-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* // CHECK1-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 -// CHECK1-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP12]], i8* [[TMP13]]) #[[ATTR1]] +// CHECK1-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP12]], i8* [[TMP13]]) #[[ATTR4]] // CHECK1-NEXT: ret void // // @@ -5523,7 +5523,7 @@ // CHECK1-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 8 // CHECK1-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* // CHECK1-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 8 -// CHECK1-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP13]], i8* [[TMP12]]) #[[ATTR1]] +// CHECK1-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP13]], i8* [[TMP12]]) #[[ATTR4]] // CHECK1-NEXT: ret void // // @@ -5537,7 +5537,7 @@ // CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 // CHECK2-NEXT: store double* [[E]], double** [[E_ADDR]], align 4 // CHECK2-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4 -// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 false, i1 true, i1 true) +// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 1, i1 true, i1 true) // CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK2: user_code.entry: @@ -5545,8 +5545,8 @@ // CHECK2-NEXT: [[TMP3:%.*]] = load double, double* [[TMP0]], align 8 // CHECK2-NEXT: store double [[TMP3]], double* [[E1]], align 8 // CHECK2-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], double* [[E1]]) #[[ATTR1:[0-9]+]] -// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], double* [[E1]]) #[[ATTR4:[0-9]+]] +// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK2-NEXT: ret void // CHECK2: worker.exit: // CHECK2-NEXT: ret void @@ -5642,7 +5642,7 @@ // CHECK2: then: // CHECK2-NEXT: [[TMP35:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8* // CHECK2-NEXT: [[TMP36:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* -// CHECK2-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP35]], i8* [[TMP36]]) #[[ATTR1]] +// CHECK2-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP35]], i8* [[TMP36]]) #[[ATTR4]] // CHECK2-NEXT: br label [[IFCONT:%.*]] // CHECK2: else: // CHECK2-NEXT: br label [[IFCONT]] @@ -5772,7 +5772,7 @@ // CHECK2-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 // CHECK2-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* // CHECK2-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK2-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP9]], i8* [[TMP10]]) #[[ATTR1]] +// CHECK2-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP9]], i8* [[TMP10]]) #[[ATTR4]] // CHECK2-NEXT: ret void // // @@ -5820,7 +5820,7 @@ // CHECK2-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 // CHECK2-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* // CHECK2-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK2-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP10]], i8* [[TMP9]]) #[[ATTR1]] +// CHECK2-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP10]], i8* [[TMP9]]) #[[ATTR4]] // CHECK2-NEXT: ret void // // @@ -5836,7 +5836,7 @@ // CHECK2-NEXT: store i32 [[D]], i32* [[D_ADDR]], align 4 // CHECK2-NEXT: [[CONV:%.*]] = bitcast i32* [[C_ADDR]] to i8* // CHECK2-NEXT: [[CONV1:%.*]] = bitcast i32* [[D_ADDR]] to float* -// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) // CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK2: user_code.entry: @@ -5849,10 +5849,10 @@ // CHECK2-NEXT: store float [[TMP2]], float* [[D_ON_STACK]], align 4 // CHECK2-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) // CHECK2-NEXT: store i32 [[TMP3]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK2-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8* [[C2]], float* [[D_ON_STACK]]) #[[ATTR1]] +// CHECK2-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8* [[C2]], float* [[D_ON_STACK]]) #[[ATTR4]] // CHECK2-NEXT: call void @__kmpc_free_shared(i8* [[D3]], i32 4) // CHECK2-NEXT: call void @__kmpc_free_shared(i8* [[C2]], i32 1) -// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK2-NEXT: ret void // CHECK2: worker.exit: // CHECK2-NEXT: ret void @@ -5983,7 +5983,7 @@ // CHECK2: then: // CHECK2-NEXT: [[TMP46:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* // CHECK2-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* -// CHECK2-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP46]], i8* [[TMP47]]) #[[ATTR1]] +// CHECK2-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP46]], i8* [[TMP47]]) #[[ATTR4]] // CHECK2-NEXT: br label [[IFCONT:%.*]] // CHECK2: else: // CHECK2-NEXT: br label [[IFCONT]] @@ -6144,7 +6144,7 @@ // CHECK2-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4 // CHECK2-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* // CHECK2-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK2-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP11]], i8* [[TMP12]]) #[[ATTR1]] +// CHECK2-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP11]], i8* [[TMP12]]) #[[ATTR4]] // CHECK2-NEXT: ret void // // @@ -6202,7 +6202,7 @@ // CHECK2-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4 // CHECK2-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* // CHECK2-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK2-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP12]], i8* [[TMP11]]) #[[ATTR1]] +// CHECK2-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP12]], i8* [[TMP11]]) #[[ATTR4]] // CHECK2-NEXT: ret void // // @@ -6217,14 +6217,14 @@ // CHECK2-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 // CHECK2-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4 // CHECK2-NEXT: [[CONV:%.*]] = bitcast i32* [[B_ADDR]] to i16* -// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 true) +// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 true) // CHECK2-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK2-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK2: user_code.entry: // CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) // CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK2-NEXT: call void @__omp_outlined__9(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[A_ADDR]], i16* [[CONV]]) #[[ATTR1]] -// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true) +// CHECK2-NEXT: call void @__omp_outlined__9(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[A_ADDR]], i16* [[CONV]]) #[[ATTR4]] +// CHECK2-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) // CHECK2-NEXT: ret void // CHECK2: worker.exit: // CHECK2-NEXT: ret void @@ -6436,7 +6436,7 @@ // CHECK2: then: // CHECK2-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* // CHECK2-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* -// CHECK2-NEXT: call void @"_omp$reduction$reduction_func11"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR1]] +// CHECK2-NEXT: call void @"_omp$reduction$reduction_func11"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR4]] // CHECK2-NEXT: br label [[IFCONT:%.*]] // CHECK2: else: // CHECK2-NEXT: br label [[IFCONT]] @@ -6614,7 +6614,7 @@ // CHECK2: then: // CHECK2-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* // CHECK2-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* -// CHECK2-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR1]] +// CHECK2-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR4]] // CHECK2-NEXT: br label [[IFCONT:%.*]] // CHECK2: else: // CHECK2-NEXT: br label [[IFCONT]] @@ -6781,7 +6781,7 @@ // CHECK2-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4 // CHECK2-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* // CHECK2-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK2-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP12]], i8* [[TMP13]]) #[[ATTR1]] +// CHECK2-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP12]], i8* [[TMP13]]) #[[ATTR4]] // CHECK2-NEXT: ret void // // @@ -6841,7 +6841,7 @@ // CHECK2-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4 // CHECK2-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* // CHECK2-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK2-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP13]], i8* [[TMP12]]) #[[ATTR1]] +// CHECK2-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP13]], i8* [[TMP12]]) #[[ATTR4]] // CHECK2-NEXT: ret void // // @@ -6855,7 +6855,7 @@ // CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 // CHECK3-NEXT: store double* [[E]], double** [[E_ADDR]], align 4 // CHECK3-NEXT: [[TMP0:%.*]] = load double*, double** [[E_ADDR]], align 4 -// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 false, i1 true, i1 true) +// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 1, i1 true, i1 true) // CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 // CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK3: user_code.entry: @@ -6863,8 +6863,8 @@ // CHECK3-NEXT: [[TMP3:%.*]] = load double, double* [[TMP0]], align 8 // CHECK3-NEXT: store double [[TMP3]], double* [[E1]], align 8 // CHECK3-NEXT: store i32 [[TMP2]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK3-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], double* [[E1]]) #[[ATTR1:[0-9]+]] -// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK3-NEXT: call void @__omp_outlined__(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], double* [[E1]]) #[[ATTR4:[0-9]+]] +// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK3-NEXT: ret void // CHECK3: worker.exit: // CHECK3-NEXT: ret void @@ -6960,7 +6960,7 @@ // CHECK3: then: // CHECK3-NEXT: [[TMP35:%.*]] = bitcast [1 x i8*]* [[TMP5]] to i8* // CHECK3-NEXT: [[TMP36:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* -// CHECK3-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP35]], i8* [[TMP36]]) #[[ATTR1]] +// CHECK3-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP35]], i8* [[TMP36]]) #[[ATTR4]] // CHECK3-NEXT: br label [[IFCONT:%.*]] // CHECK3: else: // CHECK3-NEXT: br label [[IFCONT]] @@ -7090,7 +7090,7 @@ // CHECK3-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 // CHECK3-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* // CHECK3-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK3-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP9]], i8* [[TMP10]]) #[[ATTR1]] +// CHECK3-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP9]], i8* [[TMP10]]) #[[ATTR4]] // CHECK3-NEXT: ret void // // @@ -7138,7 +7138,7 @@ // CHECK3-NEXT: store i8* [[TMP8]], i8** [[TMP6]], align 4 // CHECK3-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* // CHECK3-NEXT: [[TMP10:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK3-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP10]], i8* [[TMP9]]) #[[ATTR1]] +// CHECK3-NEXT: call void @"_omp$reduction$reduction_func"(i8* [[TMP10]], i8* [[TMP9]]) #[[ATTR4]] // CHECK3-NEXT: ret void // // @@ -7154,7 +7154,7 @@ // CHECK3-NEXT: store i32 [[D]], i32* [[D_ADDR]], align 4 // CHECK3-NEXT: [[CONV:%.*]] = bitcast i32* [[C_ADDR]] to i8* // CHECK3-NEXT: [[CONV1:%.*]] = bitcast i32* [[D_ADDR]] to float* -// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) // CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK3: user_code.entry: @@ -7167,10 +7167,10 @@ // CHECK3-NEXT: store float [[TMP2]], float* [[D_ON_STACK]], align 4 // CHECK3-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) // CHECK3-NEXT: store i32 [[TMP3]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK3-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8* [[C2]], float* [[D_ON_STACK]]) #[[ATTR1]] +// CHECK3-NEXT: call void @__omp_outlined__1(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i8* [[C2]], float* [[D_ON_STACK]]) #[[ATTR4]] // CHECK3-NEXT: call void @__kmpc_free_shared(i8* [[D3]], i32 4) // CHECK3-NEXT: call void @__kmpc_free_shared(i8* [[C2]], i32 1) -// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) // CHECK3-NEXT: ret void // CHECK3: worker.exit: // CHECK3-NEXT: ret void @@ -7301,7 +7301,7 @@ // CHECK3: then: // CHECK3-NEXT: [[TMP46:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* // CHECK3-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* -// CHECK3-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP46]], i8* [[TMP47]]) #[[ATTR1]] +// CHECK3-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP46]], i8* [[TMP47]]) #[[ATTR4]] // CHECK3-NEXT: br label [[IFCONT:%.*]] // CHECK3: else: // CHECK3-NEXT: br label [[IFCONT]] @@ -7462,7 +7462,7 @@ // CHECK3-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4 // CHECK3-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* // CHECK3-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK3-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP11]], i8* [[TMP12]]) #[[ATTR1]] +// CHECK3-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP11]], i8* [[TMP12]]) #[[ATTR4]] // CHECK3-NEXT: ret void // // @@ -7520,7 +7520,7 @@ // CHECK3-NEXT: store i8* [[TMP10]], i8** [[TMP8]], align 4 // CHECK3-NEXT: [[TMP11:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* // CHECK3-NEXT: [[TMP12:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK3-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP12]], i8* [[TMP11]]) #[[ATTR1]] +// CHECK3-NEXT: call void @"_omp$reduction$reduction_func2"(i8* [[TMP12]], i8* [[TMP11]]) #[[ATTR4]] // CHECK3-NEXT: ret void // // @@ -7535,14 +7535,14 @@ // CHECK3-NEXT: store i32 [[A]], i32* [[A_ADDR]], align 4 // CHECK3-NEXT: store i32 [[B]], i32* [[B_ADDR]], align 4 // CHECK3-NEXT: [[CONV:%.*]] = bitcast i32* [[B_ADDR]] to i16* -// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 true, i1 false, i1 true) +// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 2, i1 false, i1 true) // CHECK3-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 // CHECK3-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] // CHECK3: user_code.entry: // CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) // CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 -// CHECK3-NEXT: call void @__omp_outlined__9(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[A_ADDR]], i16* [[CONV]]) #[[ATTR1]] -// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true) +// CHECK3-NEXT: call void @__omp_outlined__9(i32* [[DOTTHREADID_TEMP_]], i32* [[DOTZERO_ADDR]], i32* [[A_ADDR]], i16* [[CONV]]) #[[ATTR4]] +// CHECK3-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) // CHECK3-NEXT: ret void // CHECK3: worker.exit: // CHECK3-NEXT: ret void @@ -7754,7 +7754,7 @@ // CHECK3: then: // CHECK3-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* // CHECK3-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* -// CHECK3-NEXT: call void @"_omp$reduction$reduction_func11"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR1]] +// CHECK3-NEXT: call void @"_omp$reduction$reduction_func11"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR4]] // CHECK3-NEXT: br label [[IFCONT:%.*]] // CHECK3: else: // CHECK3-NEXT: br label [[IFCONT]] @@ -7932,7 +7932,7 @@ // CHECK3: then: // CHECK3-NEXT: [[TMP47:%.*]] = bitcast [2 x i8*]* [[TMP5]] to i8* // CHECK3-NEXT: [[TMP48:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_REMOTE_REDUCE_LIST]] to i8* -// CHECK3-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR1]] +// CHECK3-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP47]], i8* [[TMP48]]) #[[ATTR4]] // CHECK3-NEXT: br label [[IFCONT:%.*]] // CHECK3: else: // CHECK3-NEXT: br label [[IFCONT]] @@ -8099,7 +8099,7 @@ // CHECK3-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4 // CHECK3-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* // CHECK3-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK3-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP12]], i8* [[TMP13]]) #[[ATTR1]] +// CHECK3-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP12]], i8* [[TMP13]]) #[[ATTR4]] // CHECK3-NEXT: ret void // // @@ -8159,6 +8159,6 @@ // CHECK3-NEXT: store i8* [[TMP11]], i8** [[TMP9]], align 4 // CHECK3-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[DOTOMP_REDUCTION_RED_LIST]] to i8* // CHECK3-NEXT: [[TMP13:%.*]] = load i8*, i8** [[DOTADDR2]], align 4 -// CHECK3-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP13]], i8* [[TMP12]]) #[[ATTR1]] +// CHECK3-NEXT: call void @"_omp$reduction$reduction_func14"(i8* [[TMP13]], i8* [[TMP12]]) #[[ATTR4]] // CHECK3-NEXT: ret void // diff --git a/clang/test/OpenMP/target_parallel_debug_codegen.cpp b/clang/test/OpenMP/target_parallel_debug_codegen.cpp --- a/clang/test/OpenMP/target_parallel_debug_codegen.cpp +++ b/clang/test/OpenMP/target_parallel_debug_codegen.cpp @@ -95,7 +95,7 @@ // CHECK1-NEXT: [[TMP6:%.*]] = addrspacecast i8 addrspace(1)* [[TMP5]] to i8*, !dbg [[DBG47]] // CHECK1-NEXT: store i8* [[TMP6]], i8** [[_TMP2]], align 8, !dbg [[DBG47]] // CHECK1-NEXT: [[TMP7:%.*]] = load i8*, i8** [[_TMP2]], align 8, !dbg [[DBG47]] -// CHECK1-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 true, i1 false, i1 true), !dbg [[DBG47]] +// CHECK1-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 2, i1 false, i1 true), !dbg [[DBG47]] // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP8]], -1, !dbg [[DBG47]] // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]], !dbg [[DBG47]] // CHECK1: user_code.entry: @@ -117,7 +117,7 @@ // CHECK1-NEXT: store i8* [[TMP7]], i8** [[TMP18]], align 8, !dbg [[DBG48]] // CHECK1-NEXT: [[TMP19:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**, !dbg [[DBG48]] // CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP9]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, [10 x [10 x [10 x i32]]]*, i64, [10 x [10 x i32]]*, i8*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP19]], i64 4), !dbg [[DBG48]] -// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB5:[0-9]+]], i1 true, i1 true), !dbg [[DBG49:![0-9]+]] +// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB5:[0-9]+]], i8 2, i1 true), !dbg [[DBG49:![0-9]+]] // CHECK1-NEXT: ret void, !dbg [[DBG51:![0-9]+]] // CHECK1: worker.exit: // CHECK1-NEXT: ret void, !dbg [[DBG47]] @@ -316,7 +316,7 @@ // CHECK1-NEXT: [[TMP7:%.*]] = addrspacecast i8 addrspace(1)* [[TMP6]] to i8*, !dbg [[DBG137]] // CHECK1-NEXT: store i8* [[TMP7]], i8** [[_TMP2]], align 8, !dbg [[DBG137]] // CHECK1-NEXT: [[TMP8:%.*]] = load i8*, i8** [[_TMP2]], align 8, !dbg [[DBG137]] -// CHECK1-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB7:[0-9]+]], i1 true, i1 false, i1 true), !dbg [[DBG137]] +// CHECK1-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB7:[0-9]+]], i8 2, i1 false, i1 true), !dbg [[DBG137]] // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP9]], -1, !dbg [[DBG137]] // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]], !dbg [[DBG137]] // CHECK1: user_code.entry: @@ -338,7 +338,7 @@ // CHECK1-NEXT: store i8* [[TMP8]], i8** [[TMP19]], align 8, !dbg [[DBG138]] // CHECK1-NEXT: [[TMP20:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**, !dbg [[DBG138]] // CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB9]], i32 [[TMP10]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, [10 x [10 x [10 x i32]]]*, i64, [10 x [10 x i32]]*, i8*)* @__omp_outlined__2 to i8*), i8* null, i8** [[TMP20]], i64 4), !dbg [[DBG138]] -// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB11:[0-9]+]], i1 true, i1 true), !dbg [[DBG139:![0-9]+]] +// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB11:[0-9]+]], i8 2, i1 true), !dbg [[DBG139:![0-9]+]] // CHECK1-NEXT: ret void, !dbg [[DBG141:![0-9]+]] // CHECK1: worker.exit: // CHECK1-NEXT: ret void, !dbg [[DBG137]] @@ -531,7 +531,7 @@ // CHECK1-NEXT: [[TMP10:%.*]] = addrspacecast i8 addrspace(1)* [[TMP9]] to i8*, !dbg [[DBG212]] // CHECK1-NEXT: store i8* [[TMP10]], i8** [[_TMP3]], align 8, !dbg [[DBG212]] // CHECK1-NEXT: [[TMP11:%.*]] = load i8*, i8** [[_TMP3]], align 8, !dbg [[DBG212]] -// CHECK1-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB13:[0-9]+]], i1 true, i1 false, i1 true), !dbg [[DBG212]] +// CHECK1-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB13:[0-9]+]], i8 2, i1 false, i1 true), !dbg [[DBG212]] // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP12]], -1, !dbg [[DBG212]] // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]], !dbg [[DBG212]] // CHECK1: user_code.entry: @@ -549,7 +549,7 @@ // CHECK1-NEXT: store i8* [[TMP11]], i8** [[TMP20]], align 8, !dbg [[DBG213]] // CHECK1-NEXT: [[TMP21:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**, !dbg [[DBG213]] // CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB15]], i32 [[TMP13]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, [10 x [10 x [10 x i32]]]*, i32*, [10 x [10 x i32]]*, i8*)* @__omp_outlined__4 to i8*), i8* null, i8** [[TMP21]], i64 4), !dbg [[DBG213]] -// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB17:[0-9]+]], i1 true, i1 true), !dbg [[DBG214:![0-9]+]] +// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB17:[0-9]+]], i8 2, i1 true), !dbg [[DBG214:![0-9]+]] // CHECK1-NEXT: ret void, !dbg [[DBG216:![0-9]+]] // CHECK1: worker.exit: // CHECK1-NEXT: ret void, !dbg [[DBG212]] diff --git a/clang/test/OpenMP/target_parallel_for_debug_codegen.cpp b/clang/test/OpenMP/target_parallel_for_debug_codegen.cpp --- a/clang/test/OpenMP/target_parallel_for_debug_codegen.cpp +++ b/clang/test/OpenMP/target_parallel_for_debug_codegen.cpp @@ -89,7 +89,7 @@ // CHECK1-NEXT: [[TMP6:%.*]] = addrspacecast i8 addrspace(1)* [[TMP5]] to i8*, !dbg [[DBG41]] // CHECK1-NEXT: store i8* [[TMP6]], i8** [[_TMP2]], align 8, !dbg [[DBG41]] // CHECK1-NEXT: [[TMP7:%.*]] = load i8*, i8** [[_TMP2]], align 8, !dbg [[DBG41]] -// CHECK1-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 true, i1 false, i1 false), !dbg [[DBG41]] +// CHECK1-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 2, i1 false, i1 false), !dbg [[DBG41]] // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP8]], -1, !dbg [[DBG41]] // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]], !dbg [[DBG41]] // CHECK1: user_code.entry: @@ -114,7 +114,7 @@ // CHECK1-NEXT: [[TMP20:%.*]] = zext i1 [[TOBOOL]] to i32, !dbg [[DBG42]] // CHECK1-NEXT: [[TMP21:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**, !dbg [[DBG42]] // CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB6]], i32 [[TMP9]], i32 [[TMP20]], i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, [10 x [10 x [10 x i32]]]*, i64, [10 x [10 x i32]]*, i8*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP21]], i64 4), !dbg [[DBG42]] -// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB8:[0-9]+]], i1 true, i1 false), !dbg [[DBG45:![0-9]+]] +// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB8:[0-9]+]], i8 2, i1 false), !dbg [[DBG45:![0-9]+]] // CHECK1-NEXT: ret void, !dbg [[DBG46:![0-9]+]] // CHECK1: worker.exit: // CHECK1-NEXT: ret void, !dbg [[DBG41]] @@ -392,7 +392,7 @@ // CHECK1-NEXT: [[TMP7:%.*]] = addrspacecast i8 addrspace(1)* [[TMP6]] to i8*, !dbg [[DBG146]] // CHECK1-NEXT: store i8* [[TMP7]], i8** [[_TMP2]], align 8, !dbg [[DBG146]] // CHECK1-NEXT: [[TMP8:%.*]] = load i8*, i8** [[_TMP2]], align 8, !dbg [[DBG146]] -// CHECK1-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB10:[0-9]+]], i1 true, i1 false, i1 false), !dbg [[DBG146]] +// CHECK1-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB10:[0-9]+]], i8 2, i1 false, i1 false), !dbg [[DBG146]] // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP9]], -1, !dbg [[DBG146]] // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]], !dbg [[DBG146]] // CHECK1: user_code.entry: @@ -414,7 +414,7 @@ // CHECK1-NEXT: store i8* [[TMP8]], i8** [[TMP19]], align 8, !dbg [[DBG147]] // CHECK1-NEXT: [[TMP20:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**, !dbg [[DBG147]] // CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB15]], i32 [[TMP10]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, [10 x [10 x [10 x i32]]]*, i64, [10 x [10 x i32]]*, i8*)* @__omp_outlined__2 to i8*), i8* null, i8** [[TMP20]], i64 4), !dbg [[DBG147]] -// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB17:[0-9]+]], i1 true, i1 false), !dbg [[DBG148:![0-9]+]] +// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB17:[0-9]+]], i8 2, i1 false), !dbg [[DBG148:![0-9]+]] // CHECK1-NEXT: ret void, !dbg [[DBG150:![0-9]+]] // CHECK1: worker.exit: // CHECK1-NEXT: ret void, !dbg [[DBG146]] @@ -680,7 +680,7 @@ // CHECK1-NEXT: [[TMP10:%.*]] = addrspacecast i8 addrspace(1)* [[TMP9]] to i8*, !dbg [[DBG236]] // CHECK1-NEXT: store i8* [[TMP10]], i8** [[_TMP3]], align 8, !dbg [[DBG236]] // CHECK1-NEXT: [[TMP11:%.*]] = load i8*, i8** [[_TMP3]], align 8, !dbg [[DBG236]] -// CHECK1-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB19:[0-9]+]], i1 true, i1 false, i1 false), !dbg [[DBG236]] +// CHECK1-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB19:[0-9]+]], i8 2, i1 false, i1 false), !dbg [[DBG236]] // CHECK1-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP12]], -1, !dbg [[DBG236]] // CHECK1-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]], !dbg [[DBG236]] // CHECK1: user_code.entry: @@ -698,7 +698,7 @@ // CHECK1-NEXT: store i8* [[TMP11]], i8** [[TMP20]], align 8, !dbg [[DBG237]] // CHECK1-NEXT: [[TMP21:%.*]] = bitcast [4 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**, !dbg [[DBG237]] // CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB24]], i32 [[TMP13]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, [10 x [10 x [10 x i32]]]*, i32*, [10 x [10 x i32]]*, i8*)* @__omp_outlined__4 to i8*), i8* null, i8** [[TMP21]], i64 4), !dbg [[DBG237]] -// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB26:[0-9]+]], i1 true, i1 false), !dbg [[DBG238:![0-9]+]] +// CHECK1-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB26:[0-9]+]], i8 2, i1 false), !dbg [[DBG238:![0-9]+]] // CHECK1-NEXT: ret void, !dbg [[DBG240:![0-9]+]] // CHECK1: worker.exit: // CHECK1-NEXT: ret void, !dbg [[DBG236]] diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def --- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def +++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def @@ -414,8 +414,8 @@ /* Int */ Int32, /* kmp_task_t */ VoidPtr) /// OpenMP Device runtime functions -__OMP_RTL(__kmpc_target_init, false, Int32, IdentPtr, Int1, Int1, Int1) -__OMP_RTL(__kmpc_target_deinit, false, Void, IdentPtr, Int1, Int1) +__OMP_RTL(__kmpc_target_init, false, Int32, IdentPtr, Int8, Int1, Int1) +__OMP_RTL(__kmpc_target_deinit, false, Void, IdentPtr, Int8, Int1) __OMP_RTL(__kmpc_kernel_prepare_parallel, false, Void, VoidPtr) __OMP_RTL(__kmpc_parallel_51, false, Void, IdentPtr, Int32, Int32, Int32, Int32, VoidPtr, VoidPtr, VoidPtrPtr, SizeTy) diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -2761,7 +2761,9 @@ Constant *SrcLocStr = getOrCreateSrcLocStr(Loc); Value *Ident = getOrCreateIdent(SrcLocStr); - ConstantInt *IsSPMDVal = ConstantInt::getBool(Int32->getContext(), IsSPMD); + ConstantInt *IsSPMDVal = ConstantInt::getSigned( + IntegerType::getInt8Ty(Int8->getContext()), + IsSPMD ? OMP_TGT_EXEC_MODE_SPMD : OMP_TGT_EXEC_MODE_GENERIC); ConstantInt *UseGenericStateMachine = ConstantInt::getBool(Int32->getContext(), !IsSPMD); ConstantInt *RequiresFullRuntimeVal = @@ -2812,7 +2814,9 @@ Constant *SrcLocStr = getOrCreateSrcLocStr(Loc); Value *Ident = getOrCreateIdent(SrcLocStr); - ConstantInt *IsSPMDVal = ConstantInt::getBool(Int32->getContext(), IsSPMD); + ConstantInt *IsSPMDVal = ConstantInt::getSigned( + IntegerType::getInt8Ty(Int8->getContext()), + IsSPMD ? OMP_TGT_EXEC_MODE_SPMD : OMP_TGT_EXEC_MODE_GENERIC); ConstantInt *RequiresFullRuntimeVal = ConstantInt::getBool(Int32->getContext(), RequiresFullRuntime); diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -2580,10 +2580,9 @@ CB = CB ? OpenMPOpt::getCallIfRegularCall(*CB, &RFI) : nullptr; if (!CB) return false; - const int InitIsSPMDArgNo = 1; - auto *IsSPMDModeCI = - dyn_cast(CB->getOperand(InitIsSPMDArgNo)); - return IsSPMDModeCI && IsSPMDModeCI->isZero(); + const int InitModeArgNo = 1; + auto *ModeCI = dyn_cast(CB->getOperand(InitModeArgNo)); + return ModeCI && (ModeCI->getSExtValue() & OMP_TGT_EXEC_MODE_GENERIC); } if (C->isZero()) { @@ -2941,7 +2940,7 @@ return FalseVal; }; - Attributor::SimplifictionCallbackTy IsSPMDModeSimplifyCB = + Attributor::SimplifictionCallbackTy ModeSimplifyCB = [&](const IRPosition &IRP, const AbstractAttribute *AA, bool &UsedAssumedInformation) -> Optional { // IRP represents the "SPMDCompatibilityTracker" argument of an @@ -2957,8 +2956,10 @@ } else { UsedAssumedInformation = false; } - auto *Val = ConstantInt::getBool(IRP.getAnchorValue().getContext(), - SPMDCompatibilityTracker.isAssumed()); + auto *Val = ConstantInt::getSigned( + IntegerType::getInt8Ty(IRP.getAnchorValue().getContext()), + SPMDCompatibilityTracker.isAssumed() ? OMP_TGT_EXEC_MODE_SPMD + : OMP_TGT_EXEC_MODE_GENERIC); return Val; }; @@ -2983,8 +2984,8 @@ return Val; }; - constexpr const int InitIsSPMDArgNo = 1; - constexpr const int DeinitIsSPMDArgNo = 1; + constexpr const int InitModeArgNo = 1; + constexpr const int DeinitModeArgNo = 1; constexpr const int InitUseStateMachineArgNo = 2; constexpr const int InitRequiresFullRuntimeArgNo = 3; constexpr const int DeinitRequiresFullRuntimeArgNo = 2; @@ -2992,11 +2993,11 @@ IRPosition::callsite_argument(*KernelInitCB, InitUseStateMachineArgNo), StateMachineSimplifyCB); A.registerSimplificationCallback( - IRPosition::callsite_argument(*KernelInitCB, InitIsSPMDArgNo), - IsSPMDModeSimplifyCB); + IRPosition::callsite_argument(*KernelInitCB, InitModeArgNo), + ModeSimplifyCB); A.registerSimplificationCallback( - IRPosition::callsite_argument(*KernelDeinitCB, DeinitIsSPMDArgNo), - IsSPMDModeSimplifyCB); + IRPosition::callsite_argument(*KernelDeinitCB, DeinitModeArgNo), + ModeSimplifyCB); A.registerSimplificationCallback( IRPosition::callsite_argument(*KernelInitCB, InitRequiresFullRuntimeArgNo), @@ -3007,9 +3008,9 @@ IsGenericModeSimplifyCB); // Check if we know we are in SPMD-mode already. - ConstantInt *IsSPMDArg = - dyn_cast(KernelInitCB->getArgOperand(InitIsSPMDArgNo)); - if (IsSPMDArg && !IsSPMDArg->isZero()) + ConstantInt *ModeArg = + dyn_cast(KernelInitCB->getArgOperand(InitModeArgNo)); + if (ModeArg && (ModeArg->getSExtValue() & OMP_TGT_EXEC_MODE_SPMD)) SPMDCompatibilityTracker.indicateOptimisticFixpoint(); // This is a generic region but SPMDization is disabled so stop tracking. else if (DisableOpenMPOptSPMDization) @@ -3298,21 +3299,24 @@ ExecModeVal | OMP_TGT_EXEC_MODE_GENERIC_SPMD)); // Next rewrite the init and deinit calls to indicate we use SPMD-mode now. - const int InitIsSPMDArgNo = 1; - const int DeinitIsSPMDArgNo = 1; + const int InitModeArgNo = 1; + const int DeinitModeArgNo = 1; const int InitUseStateMachineArgNo = 2; const int InitRequiresFullRuntimeArgNo = 3; const int DeinitRequiresFullRuntimeArgNo = 2; auto &Ctx = getAnchorValue().getContext(); - A.changeUseAfterManifest(KernelInitCB->getArgOperandUse(InitIsSPMDArgNo), - *ConstantInt::getBool(Ctx, 1)); + A.changeUseAfterManifest( + KernelInitCB->getArgOperandUse(InitModeArgNo), + *ConstantInt::getSigned(IntegerType::getInt8Ty(Ctx), + OMP_TGT_EXEC_MODE_SPMD)); A.changeUseAfterManifest( KernelInitCB->getArgOperandUse(InitUseStateMachineArgNo), *ConstantInt::getBool(Ctx, 0)); A.changeUseAfterManifest( - KernelDeinitCB->getArgOperandUse(DeinitIsSPMDArgNo), - *ConstantInt::getBool(Ctx, 1)); + KernelDeinitCB->getArgOperandUse(DeinitModeArgNo), + *ConstantInt::getSigned(IntegerType::getInt8Ty(Ctx), + OMP_TGT_EXEC_MODE_SPMD)); A.changeUseAfterManifest( KernelInitCB->getArgOperandUse(InitRequiresFullRuntimeArgNo), *ConstantInt::getBool(Ctx, 0)); @@ -3337,7 +3341,7 @@ assert(ReachedKnownParallelRegions.isValidState() && "Custom state machine with invalid parallel region states?"); - const int InitIsSPMDArgNo = 1; + const int InitModeArgNo = 1; const int InitUseStateMachineArgNo = 2; // Check if the current configuration is non-SPMD and generic state machine. @@ -3346,14 +3350,14 @@ // we give up. ConstantInt *UseStateMachine = dyn_cast( KernelInitCB->getArgOperand(InitUseStateMachineArgNo)); - ConstantInt *IsSPMD = - dyn_cast(KernelInitCB->getArgOperand(InitIsSPMDArgNo)); + ConstantInt *Mode = + dyn_cast(KernelInitCB->getArgOperand(InitModeArgNo)); // If we are stuck with generic mode, try to create a custom device (=GPU) // state machine which is specialized for the parallel regions that are // reachable by the kernel. - if (!UseStateMachine || UseStateMachine->isZero() || !IsSPMD || - !IsSPMD->isZero()) + if (!UseStateMachine || UseStateMachine->isZero() || !Mode || + (Mode->getSExtValue() & OMP_TGT_EXEC_MODE_GENERIC)) return ChangeStatus::UNCHANGED; // If not SPMD mode, indicate we use a custom state machine now. diff --git a/llvm/test/Transforms/OpenMP/always_inline_device.ll b/llvm/test/Transforms/OpenMP/always_inline_device.ll --- a/llvm/test/Transforms/OpenMP/always_inline_device.ll +++ b/llvm/test/Transforms/OpenMP/always_inline_device.ll @@ -12,32 +12,32 @@ ; CHECK: Function Attrs: convergent norecurse nounwind ; CHECK-LABEL: @__omp_offloading_fd02_c0934fc2_foo_l4( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 false, i1 false, i1 true) +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 1, i1 true, i1 true) ; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; CHECK: user_code.entry: -; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) ; CHECK-NEXT: ret void ; CHECK: worker.exit: ; CHECK-NEXT: ret void ; entry: - %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i1 false, i1 true, i1 true) + %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i8 1, i1 true, i1 true) %exec_user_code = icmp eq i32 %0, -1 br i1 %exec_user_code, label %user_code.entry, label %worker.exit user_code.entry: ; preds = %entry call void @bar() #2 - call void @__kmpc_target_deinit(%struct.ident_t* @1, i1 false, i1 true) + call void @__kmpc_target_deinit(%struct.ident_t* @1, i8 1, i1 true) ret void worker.exit: ; preds = %entry ret void } -declare i32 @__kmpc_target_init(%struct.ident_t*, i1, i1, i1) +declare i32 @__kmpc_target_init(%struct.ident_t*, i8, i1, i1) -declare void @__kmpc_target_deinit(%struct.ident_t*, i1, i1) +declare void @__kmpc_target_deinit(%struct.ident_t*, i8, i1) ; Function Attrs: convergent nounwind define hidden void @bar() #1 { diff --git a/llvm/test/Transforms/OpenMP/custom_state_machines.ll b/llvm/test/Transforms/OpenMP/custom_state_machines.ll --- a/llvm/test/Transforms/OpenMP/custom_state_machines.ll +++ b/llvm/test/Transforms/OpenMP/custom_state_machines.ll @@ -141,7 +141,7 @@ %.zero.addr = alloca i32, align 4 %.threadid_temp. = alloca i32, align 4 store i32 0, i32* %.zero.addr, align 4 - %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i1 false, i1 true, i1 true) + %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i8 1, i1 true, i1 true) %exec_user_code = icmp eq i32 %0, -1 br i1 %exec_user_code, label %user_code.entry, label %worker.exit @@ -149,14 +149,14 @@ %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) store i32 %1, i32* %.threadid_temp., align 4 call void @__omp_outlined__(i32* %.threadid_temp., i32* %.zero.addr) #3 - call void @__kmpc_target_deinit(%struct.ident_t* @1, i1 false, i1 true) + call void @__kmpc_target_deinit(%struct.ident_t* @1, i8 1, i1 true) ret void worker.exit: ; preds = %entry ret void } -declare i32 @__kmpc_target_init(%struct.ident_t*, i1, i1, i1) +declare i32 @__kmpc_target_init(%struct.ident_t*, i8, i1, i1) define internal void @__omp_outlined__(i32* noalias %.global_tid., i32* noalias %.bound_tid.) #0 { entry: @@ -190,14 +190,14 @@ declare i32 @__kmpc_global_thread_num(%struct.ident_t*) #3 -declare void @__kmpc_target_deinit(%struct.ident_t*, i1, i1) +declare void @__kmpc_target_deinit(%struct.ident_t*, i8, i1) define weak void @__omp_offloading_14_a36502b_simple_state_machine_l22() #0 { entry: %.zero.addr = alloca i32, align 4 %.threadid_temp. = alloca i32, align 4 store i32 0, i32* %.zero.addr, align 4 - %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i1 false, i1 true, i1 true) + %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i8 1, i1 true, i1 true) %exec_user_code = icmp eq i32 %0, -1 br i1 %exec_user_code, label %user_code.entry, label %worker.exit @@ -205,7 +205,7 @@ %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) store i32 %1, i32* %.threadid_temp., align 4 call void @__omp_outlined__1(i32* %.threadid_temp., i32* %.zero.addr) #3 - call void @__kmpc_target_deinit(%struct.ident_t* @1, i1 false, i1 true) + call void @__kmpc_target_deinit(%struct.ident_t* @1, i8 1, i1 true) ret void worker.exit: ; preds = %entry @@ -292,7 +292,7 @@ %.zero.addr = alloca i32, align 4 %.threadid_temp. = alloca i32, align 4 store i32 0, i32* %.zero.addr, align 4 - %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i1 false, i1 true, i1 true) + %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i8 1, i1 true, i1 true) %exec_user_code = icmp eq i32 %0, -1 br i1 %exec_user_code, label %user_code.entry, label %worker.exit @@ -300,7 +300,7 @@ %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) store i32 %1, i32* %.threadid_temp., align 4 call void @__omp_outlined__4(i32* %.threadid_temp., i32* %.zero.addr) #3 - call void @__kmpc_target_deinit(%struct.ident_t* @1, i1 false, i1 true) + call void @__kmpc_target_deinit(%struct.ident_t* @1, i8 1, i1 true) ret void worker.exit: ; preds = %entry @@ -372,7 +372,7 @@ %.zero.addr = alloca i32, align 4 %.threadid_temp. = alloca i32, align 4 store i32 0, i32* %.zero.addr, align 4 - %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i1 false, i1 true, i1 true) + %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i8 1, i1 true, i1 true) %exec_user_code = icmp eq i32 %0, -1 br i1 %exec_user_code, label %user_code.entry, label %worker.exit @@ -380,7 +380,7 @@ %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) store i32 %1, i32* %.threadid_temp., align 4 call void @__omp_outlined__6(i32* %.threadid_temp., i32* %.zero.addr) #3 - call void @__kmpc_target_deinit(%struct.ident_t* @1, i1 false, i1 true) + call void @__kmpc_target_deinit(%struct.ident_t* @1, i8 1, i1 true) ret void worker.exit: ; preds = %entry @@ -460,7 +460,7 @@ %.zero.addr = alloca i32, align 4 %.threadid_temp. = alloca i32, align 4 store i32 0, i32* %.zero.addr, align 4 - %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i1 false, i1 true, i1 true) + %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i8 1, i1 true, i1 true) %exec_user_code = icmp eq i32 %0, -1 br i1 %exec_user_code, label %user_code.entry, label %worker.exit @@ -468,7 +468,7 @@ %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) store i32 %1, i32* %.threadid_temp., align 4 call void @__omp_outlined__9(i32* %.threadid_temp., i32* %.zero.addr) #3 - call void @__kmpc_target_deinit(%struct.ident_t* @1, i1 false, i1 true) + call void @__kmpc_target_deinit(%struct.ident_t* @1, i8 1, i1 true) ret void worker.exit: ; preds = %entry @@ -546,7 +546,7 @@ %.zero.addr = alloca i32, align 4 %.threadid_temp. = alloca i32, align 4 store i32 0, i32* %.zero.addr, align 4 - %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i1 false, i1 true, i1 true) + %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i8 1, i1 true, i1 true) %exec_user_code = icmp eq i32 %0, -1 br i1 %exec_user_code, label %user_code.entry, label %worker.exit @@ -554,7 +554,7 @@ %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) store i32 %1, i32* %.threadid_temp., align 4 call void @__omp_outlined__12(i32* %.threadid_temp., i32* %.zero.addr) #3 - call void @__kmpc_target_deinit(%struct.ident_t* @1, i1 false, i1 true) + call void @__kmpc_target_deinit(%struct.ident_t* @1, i8 1, i1 true) ret void worker.exit: ; preds = %entry @@ -635,7 +635,7 @@ %.zero.addr = alloca i32, align 4 %.threadid_temp. = alloca i32, align 4 store i32 0, i32* %.zero.addr, align 4 - %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i1 false, i1 true, i1 true) + %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i8 1, i1 true, i1 true) %exec_user_code = icmp eq i32 %0, -1 br i1 %exec_user_code, label %user_code.entry, label %worker.exit @@ -643,7 +643,7 @@ %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) store i32 %1, i32* %.threadid_temp., align 4 call void @__omp_outlined__15(i32* %.threadid_temp., i32* %.zero.addr) #3 - call void @__kmpc_target_deinit(%struct.ident_t* @1, i1 false, i1 true) + call void @__kmpc_target_deinit(%struct.ident_t* @1, i8 1, i1 true) ret void worker.exit: ; preds = %entry @@ -690,7 +690,7 @@ %.zero.addr = alloca i32, align 4 %.threadid_temp. = alloca i32, align 4 store i32 0, i32* %.zero.addr, align 4 - %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i1 false, i1 true, i1 true) + %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i8 1, i1 true, i1 true) %exec_user_code = icmp eq i32 %0, -1 br i1 %exec_user_code, label %user_code.entry, label %worker.exit @@ -698,7 +698,7 @@ %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) store i32 %1, i32* %.threadid_temp., align 4 call void @__omp_outlined__16(i32* %.threadid_temp., i32* %.zero.addr) #3 - call void @__kmpc_target_deinit(%struct.ident_t* @1, i1 false, i1 true) + call void @__kmpc_target_deinit(%struct.ident_t* @1, i8 1, i1 true) ret void worker.exit: ; preds = %entry @@ -847,13 +847,14 @@ ; AMDGPU-NEXT: entry: ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 false, i1 false, i1 true) +; AMDGPU-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 1, i1 true, i1 true) ; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; AMDGPU: user_code.entry: ; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3:[0-9]+]] ; AMDGPU-NEXT: call void @__omp_outlined__(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; AMDGPU-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; AMDGPU-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) ; AMDGPU-NEXT: ret void ; AMDGPU: worker.exit: ; AMDGPU-NEXT: ret void @@ -908,51 +909,17 @@ ; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_l22 ; AMDGPU-SAME: () #[[ATTR0]] { ; AMDGPU-NEXT: entry: -; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8, addrspace(5) ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) -; AMDGPU-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; AMDGPU-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; AMDGPU: worker_state_machine.begin: -; AMDGPU-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast i8* addrspace(5)* [[WORKER_WORK_FN_ADDR]] to i8** -; AMDGPU-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR_GENERIC]]) -; AMDGPU-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 -; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* -; AMDGPU-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null -; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; AMDGPU: worker_state_machine.finished: -; AMDGPU-NEXT: ret void -; AMDGPU: worker_state_machine.is_active.check: -; AMDGPU-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; AMDGPU: worker_state_machine.parallel_region.check: -; AMDGPU-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__2_wrapper.ID to void (i16, i32)*) -; AMDGPU-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] -; AMDGPU: worker_state_machine.parallel_region.execute: -; AMDGPU-NEXT: call void @__omp_outlined__2_wrapper(i16 0, i32 [[TMP0]]) -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; AMDGPU: worker_state_machine.parallel_region.check1: -; AMDGPU-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK3:%.*]] -; AMDGPU: worker_state_machine.parallel_region.execute2: -; AMDGPU-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP0]]) -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; AMDGPU: worker_state_machine.parallel_region.check3: -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; AMDGPU: worker_state_machine.parallel_region.end: -; AMDGPU-NEXT: call void @__kmpc_kernel_end_parallel() -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; AMDGPU: worker_state_machine.done.barrier: -; AMDGPU-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; AMDGPU: thread.user_code.check: +; AMDGPU-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) ; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; AMDGPU: user_code.entry: ; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] ; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 ; AMDGPU-NEXT: call void @__omp_outlined__1(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; AMDGPU-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; AMDGPU-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) ; AMDGPU-NEXT: ret void ; AMDGPU: worker.exit: ; AMDGPU-NEXT: ret void @@ -970,10 +937,10 @@ ; AMDGPU-NEXT: call void @unknown_no_openmp() #[[ATTR9]] ; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 ; AMDGPU-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; AMDGPU-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef @__omp_outlined__2_wrapper.ID, i8** noundef [[TMP1]], i64 noundef 0) +; AMDGPU-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0) ; AMDGPU-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR8]] ; AMDGPU-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8** -; AMDGPU-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* noundef @__omp_outlined__3_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0) +; AMDGPU-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0) ; AMDGPU-NEXT: ret void ; ; @@ -1033,57 +1000,17 @@ ; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39 ; AMDGPU-SAME: () #[[ATTR0]] { ; AMDGPU-NEXT: entry: -; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8, addrspace(5) ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) -; AMDGPU-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; AMDGPU-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; AMDGPU: worker_state_machine.begin: -; AMDGPU-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast i8* addrspace(5)* [[WORKER_WORK_FN_ADDR]] to i8** -; AMDGPU-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR_GENERIC]]) -; AMDGPU-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 -; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* -; AMDGPU-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null -; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; AMDGPU: worker_state_machine.finished: -; AMDGPU-NEXT: ret void -; AMDGPU: worker_state_machine.is_active.check: -; AMDGPU-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; AMDGPU: worker_state_machine.parallel_region.check: -; AMDGPU-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], @__omp_outlined__17_wrapper -; AMDGPU-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] -; AMDGPU: worker_state_machine.parallel_region.execute: -; AMDGPU-NEXT: call void @__omp_outlined__17_wrapper(i16 0, i32 [[TMP0]]) -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; AMDGPU: worker_state_machine.parallel_region.check1: -; AMDGPU-NEXT: [[WORKER_CHECK_PARALLEL_REGION4:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__5_wrapper.ID to void (i16, i32)*) -; AMDGPU-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION4]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK3:%.*]] -; AMDGPU: worker_state_machine.parallel_region.execute2: -; AMDGPU-NEXT: call void @__omp_outlined__5_wrapper(i16 0, i32 [[TMP0]]) -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; AMDGPU: worker_state_machine.parallel_region.check3: -; AMDGPU-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE5:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK6:%.*]] -; AMDGPU: worker_state_machine.parallel_region.execute5: -; AMDGPU-NEXT: call void @__omp_outlined__18_wrapper(i16 0, i32 [[TMP0]]) -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; AMDGPU: worker_state_machine.parallel_region.check6: -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; AMDGPU: worker_state_machine.parallel_region.end: -; AMDGPU-NEXT: call void @__kmpc_kernel_end_parallel() -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; AMDGPU: worker_state_machine.done.barrier: -; AMDGPU-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; AMDGPU: thread.user_code.check: +; AMDGPU-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) ; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; AMDGPU: user_code.entry: ; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] ; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 ; AMDGPU-NEXT: call void @__omp_outlined__4(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; AMDGPU-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; AMDGPU-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) ; AMDGPU-NEXT: ret void ; AMDGPU: worker.exit: ; AMDGPU-NEXT: ret void @@ -1102,7 +1029,7 @@ ; AMDGPU-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR8]] ; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 ; AMDGPU-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; AMDGPU-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__5 to i8*), i8* noundef @__omp_outlined__5_wrapper.ID, i8** noundef [[TMP1]], i64 noundef 0) +; AMDGPU-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__5 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__5_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0) ; AMDGPU-NEXT: call void @simple_state_machine_interprocedural_after.internalized() #[[ATTR8]] ; AMDGPU-NEXT: ret void ; @@ -1181,53 +1108,17 @@ ; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55 ; AMDGPU-SAME: () #[[ATTR0]] { ; AMDGPU-NEXT: entry: -; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8, addrspace(5) ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) -; AMDGPU-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; AMDGPU-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; AMDGPU: worker_state_machine.begin: -; AMDGPU-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast i8* addrspace(5)* [[WORKER_WORK_FN_ADDR]] to i8** -; AMDGPU-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR_GENERIC]]) -; AMDGPU-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 -; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* -; AMDGPU-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null -; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; AMDGPU: worker_state_machine.finished: -; AMDGPU-NEXT: ret void -; AMDGPU: worker_state_machine.is_active.check: -; AMDGPU-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; AMDGPU: worker_state_machine.parallel_region.check: -; AMDGPU-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__7_wrapper.ID to void (i16, i32)*) -; AMDGPU-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] -; AMDGPU: worker_state_machine.parallel_region.execute: -; AMDGPU-NEXT: call void @__omp_outlined__7_wrapper(i16 0, i32 [[TMP0]]) -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; AMDGPU: worker_state_machine.parallel_region.check1: -; AMDGPU-NEXT: [[WORKER_CHECK_PARALLEL_REGION4:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__8_wrapper.ID to void (i16, i32)*) -; AMDGPU-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION4]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] -; AMDGPU: worker_state_machine.parallel_region.execute2: -; AMDGPU-NEXT: call void @__omp_outlined__8_wrapper(i16 0, i32 [[TMP0]]) -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; AMDGPU: worker_state_machine.parallel_region.fallback.execute: -; AMDGPU-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]]) -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; AMDGPU: worker_state_machine.parallel_region.end: -; AMDGPU-NEXT: call void @__kmpc_kernel_end_parallel() -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; AMDGPU: worker_state_machine.done.barrier: -; AMDGPU-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; AMDGPU: thread.user_code.check: +; AMDGPU-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) ; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; AMDGPU: user_code.entry: ; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] ; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 ; AMDGPU-NEXT: call void @__omp_outlined__6(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; AMDGPU-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; AMDGPU-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) ; AMDGPU-NEXT: ret void ; AMDGPU: worker.exit: ; AMDGPU-NEXT: ret void @@ -1244,10 +1135,10 @@ ; AMDGPU-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 ; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 ; AMDGPU-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; AMDGPU-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__7 to i8*), i8* noundef @__omp_outlined__7_wrapper.ID, i8** noundef [[TMP1]], i64 noundef 0) +; AMDGPU-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__7 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__7_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0) ; AMDGPU-NEXT: [[CALL:%.*]] = call i32 @unknown() #[[ATTR10]] ; AMDGPU-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8** -; AMDGPU-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__8 to i8*), i8* noundef @__omp_outlined__8_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0) +; AMDGPU-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__8 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__8_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0) ; AMDGPU-NEXT: ret void ; ; @@ -1307,51 +1198,17 @@ ; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66 ; AMDGPU-SAME: () #[[ATTR0]] { ; AMDGPU-NEXT: entry: -; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8, addrspace(5) ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) -; AMDGPU-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; AMDGPU-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; AMDGPU: worker_state_machine.begin: -; AMDGPU-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast i8* addrspace(5)* [[WORKER_WORK_FN_ADDR]] to i8** -; AMDGPU-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR_GENERIC]]) -; AMDGPU-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 -; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* -; AMDGPU-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null -; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; AMDGPU: worker_state_machine.finished: -; AMDGPU-NEXT: ret void -; AMDGPU: worker_state_machine.is_active.check: -; AMDGPU-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; AMDGPU: worker_state_machine.parallel_region.check: -; AMDGPU-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__10_wrapper.ID to void (i16, i32)*) -; AMDGPU-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] -; AMDGPU: worker_state_machine.parallel_region.execute: -; AMDGPU-NEXT: call void @__omp_outlined__10_wrapper(i16 0, i32 [[TMP0]]) -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; AMDGPU: worker_state_machine.parallel_region.check1: -; AMDGPU-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK3:%.*]] -; AMDGPU: worker_state_machine.parallel_region.execute2: -; AMDGPU-NEXT: call void @__omp_outlined__11_wrapper(i16 0, i32 [[TMP0]]) -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; AMDGPU: worker_state_machine.parallel_region.check3: -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; AMDGPU: worker_state_machine.parallel_region.end: -; AMDGPU-NEXT: call void @__kmpc_kernel_end_parallel() -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; AMDGPU: worker_state_machine.done.barrier: -; AMDGPU-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; AMDGPU: thread.user_code.check: +; AMDGPU-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) ; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; AMDGPU: user_code.entry: ; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] ; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 ; AMDGPU-NEXT: call void @__omp_outlined__9(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; AMDGPU-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; AMDGPU-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) ; AMDGPU-NEXT: ret void ; AMDGPU: worker.exit: ; AMDGPU-NEXT: ret void @@ -1368,10 +1225,10 @@ ; AMDGPU-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 ; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 ; AMDGPU-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; AMDGPU-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__10 to i8*), i8* noundef @__omp_outlined__10_wrapper.ID, i8** noundef [[TMP1]], i64 noundef 0) +; AMDGPU-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__10 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__10_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0) ; AMDGPU-NEXT: call void @unknown_no_openmp() #[[ATTR9]] ; AMDGPU-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8** -; AMDGPU-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__11 to i8*), i8* noundef @__omp_outlined__11_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0) +; AMDGPU-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__11 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__11_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0) ; AMDGPU-NEXT: ret void ; ; @@ -1431,51 +1288,17 @@ ; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_pure_l77 ; AMDGPU-SAME: () #[[ATTR0]] { ; AMDGPU-NEXT: entry: -; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8, addrspace(5) ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) -; AMDGPU-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; AMDGPU-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; AMDGPU: worker_state_machine.begin: -; AMDGPU-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast i8* addrspace(5)* [[WORKER_WORK_FN_ADDR]] to i8** -; AMDGPU-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR_GENERIC]]) -; AMDGPU-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 -; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* -; AMDGPU-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null -; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; AMDGPU: worker_state_machine.finished: -; AMDGPU-NEXT: ret void -; AMDGPU: worker_state_machine.is_active.check: -; AMDGPU-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; AMDGPU: worker_state_machine.parallel_region.check: -; AMDGPU-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__13_wrapper.ID to void (i16, i32)*) -; AMDGPU-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] -; AMDGPU: worker_state_machine.parallel_region.execute: -; AMDGPU-NEXT: call void @__omp_outlined__13_wrapper(i16 0, i32 [[TMP0]]) -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; AMDGPU: worker_state_machine.parallel_region.check1: -; AMDGPU-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK3:%.*]] -; AMDGPU: worker_state_machine.parallel_region.execute2: -; AMDGPU-NEXT: call void @__omp_outlined__14_wrapper(i16 0, i32 [[TMP0]]) -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; AMDGPU: worker_state_machine.parallel_region.check3: -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; AMDGPU: worker_state_machine.parallel_region.end: -; AMDGPU-NEXT: call void @__kmpc_kernel_end_parallel() -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; AMDGPU: worker_state_machine.done.barrier: -; AMDGPU-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; AMDGPU: thread.user_code.check: +; AMDGPU-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) ; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; AMDGPU: user_code.entry: ; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] ; AMDGPU-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 ; AMDGPU-NEXT: call void @__omp_outlined__12(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; AMDGPU-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; AMDGPU-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) ; AMDGPU-NEXT: ret void ; AMDGPU: worker.exit: ; AMDGPU-NEXT: ret void @@ -1493,9 +1316,9 @@ ; AMDGPU-NEXT: call void @unknown_no_openmp() #[[ATTR9]] ; AMDGPU-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 ; AMDGPU-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; AMDGPU-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__13 to i8*), i8* noundef @__omp_outlined__13_wrapper.ID, i8** noundef [[TMP1]], i64 noundef 0) +; AMDGPU-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__13 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__13_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0) ; AMDGPU-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8** -; AMDGPU-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__14 to i8*), i8* noundef @__omp_outlined__14_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0) +; AMDGPU-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__14 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__14_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0) ; AMDGPU-NEXT: ret void ; ; @@ -1555,46 +1378,16 @@ ; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92 ; AMDGPU-SAME: () #[[ATTR0]] { ; AMDGPU-NEXT: entry: -; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8, addrspace(5) ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) -; AMDGPU-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; AMDGPU-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; AMDGPU: worker_state_machine.begin: -; AMDGPU-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast i8* addrspace(5)* [[WORKER_WORK_FN_ADDR]] to i8** -; AMDGPU-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR_GENERIC]]) -; AMDGPU-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 -; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* -; AMDGPU-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null -; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; AMDGPU: worker_state_machine.finished: -; AMDGPU-NEXT: ret void -; AMDGPU: worker_state_machine.is_active.check: -; AMDGPU-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; AMDGPU: worker_state_machine.parallel_region.check: -; AMDGPU-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], @__omp_outlined__19_wrapper -; AMDGPU-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] -; AMDGPU: worker_state_machine.parallel_region.execute: -; AMDGPU-NEXT: call void @__omp_outlined__19_wrapper(i16 0, i32 [[TMP0]]) -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; AMDGPU: worker_state_machine.parallel_region.fallback.execute: -; AMDGPU-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]]) -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; AMDGPU: worker_state_machine.parallel_region.end: -; AMDGPU-NEXT: call void @__kmpc_kernel_end_parallel() -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; AMDGPU: worker_state_machine.done.barrier: -; AMDGPU-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; AMDGPU: thread.user_code.check: +; AMDGPU-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) ; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; AMDGPU: user_code.entry: ; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] ; AMDGPU-NEXT: call void @__omp_outlined__15(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; AMDGPU-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; AMDGPU-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) ; AMDGPU-NEXT: ret void ; AMDGPU: worker.exit: ; AMDGPU-NEXT: ret void @@ -1655,40 +1448,16 @@ ; AMDGPU-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112 ; AMDGPU-SAME: () #[[ATTR0]] { ; AMDGPU-NEXT: entry: -; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8, addrspace(5) ; AMDGPU-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) -; AMDGPU-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; AMDGPU-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; AMDGPU: worker_state_machine.begin: -; AMDGPU-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_GENERIC:%.*]] = addrspacecast i8* addrspace(5)* [[WORKER_WORK_FN_ADDR]] to i8** -; AMDGPU-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR_GENERIC]]) -; AMDGPU-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR_GENERIC]], align 8 -; AMDGPU-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* -; AMDGPU-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null -; AMDGPU-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; AMDGPU: worker_state_machine.finished: -; AMDGPU-NEXT: ret void -; AMDGPU: worker_state_machine.is_active.check: -; AMDGPU-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; AMDGPU: worker_state_machine.parallel_region.fallback.execute: -; AMDGPU-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]]) -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; AMDGPU: worker_state_machine.parallel_region.end: -; AMDGPU-NEXT: call void @__kmpc_kernel_end_parallel() -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; AMDGPU: worker_state_machine.done.barrier: -; AMDGPU-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; AMDGPU-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; AMDGPU: thread.user_code.check: +; AMDGPU-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) ; AMDGPU-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; AMDGPU-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; AMDGPU: user_code.entry: ; AMDGPU-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] ; AMDGPU-NEXT: call void @__omp_outlined__16(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; AMDGPU-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; AMDGPU-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) ; AMDGPU-NEXT: ret void ; AMDGPU: worker.exit: ; AMDGPU-NEXT: ret void @@ -1817,13 +1586,14 @@ ; NVPTX-NEXT: entry: ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 false, i1 false, i1 true) +; NVPTX-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 1, i1 true, i1 true) ; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; NVPTX: user_code.entry: ; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3:[0-9]+]] ; NVPTX-NEXT: call void @__omp_outlined__(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; NVPTX-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; NVPTX-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) ; NVPTX-NEXT: ret void ; NVPTX: worker.exit: ; NVPTX-NEXT: ret void @@ -1878,50 +1648,17 @@ ; NVPTX-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_l22 ; NVPTX-SAME: () #[[ATTR0]] { ; NVPTX-NEXT: entry: -; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) -; NVPTX-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; NVPTX-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; NVPTX: worker_state_machine.begin: -; NVPTX-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) -; NVPTX-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 -; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* -; NVPTX-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null -; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; NVPTX: worker_state_machine.finished: -; NVPTX-NEXT: ret void -; NVPTX: worker_state_machine.is_active.check: -; NVPTX-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; NVPTX: worker_state_machine.parallel_region.check: -; NVPTX-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__2_wrapper.ID to void (i16, i32)*) -; NVPTX-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] -; NVPTX: worker_state_machine.parallel_region.execute: -; NVPTX-NEXT: call void @__omp_outlined__2_wrapper(i16 0, i32 [[TMP0]]) -; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; NVPTX: worker_state_machine.parallel_region.check1: -; NVPTX-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK3:%.*]] -; NVPTX: worker_state_machine.parallel_region.execute2: -; NVPTX-NEXT: call void @__omp_outlined__3_wrapper(i16 0, i32 [[TMP0]]) -; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; NVPTX: worker_state_machine.parallel_region.check3: -; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; NVPTX: worker_state_machine.parallel_region.end: -; NVPTX-NEXT: call void @__kmpc_kernel_end_parallel() -; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; NVPTX: worker_state_machine.done.barrier: -; NVPTX-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; NVPTX: thread.user_code.check: +; NVPTX-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) ; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; NVPTX: user_code.entry: ; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] ; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 ; NVPTX-NEXT: call void @__omp_outlined__1(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; NVPTX-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; NVPTX-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) ; NVPTX-NEXT: ret void ; NVPTX: worker.exit: ; NVPTX-NEXT: ret void @@ -1939,10 +1676,10 @@ ; NVPTX-NEXT: call void @unknown_no_openmp() #[[ATTR9]] ; NVPTX-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 ; NVPTX-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; NVPTX-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef @__omp_outlined__2_wrapper.ID, i8** noundef [[TMP1]], i64 noundef 0) +; NVPTX-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0) ; NVPTX-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR8]] ; NVPTX-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8** -; NVPTX-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* noundef @__omp_outlined__3_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0) +; NVPTX-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0) ; NVPTX-NEXT: ret void ; ; @@ -2002,56 +1739,17 @@ ; NVPTX-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_interprocedural_l39 ; NVPTX-SAME: () #[[ATTR0]] { ; NVPTX-NEXT: entry: -; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) -; NVPTX-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; NVPTX-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; NVPTX: worker_state_machine.begin: -; NVPTX-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) -; NVPTX-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 -; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* -; NVPTX-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null -; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; NVPTX: worker_state_machine.finished: -; NVPTX-NEXT: ret void -; NVPTX: worker_state_machine.is_active.check: -; NVPTX-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; NVPTX: worker_state_machine.parallel_region.check: -; NVPTX-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], @__omp_outlined__17_wrapper -; NVPTX-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] -; NVPTX: worker_state_machine.parallel_region.execute: -; NVPTX-NEXT: call void @__omp_outlined__17_wrapper(i16 0, i32 [[TMP0]]) -; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; NVPTX: worker_state_machine.parallel_region.check1: -; NVPTX-NEXT: [[WORKER_CHECK_PARALLEL_REGION4:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__5_wrapper.ID to void (i16, i32)*) -; NVPTX-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION4]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK3:%.*]] -; NVPTX: worker_state_machine.parallel_region.execute2: -; NVPTX-NEXT: call void @__omp_outlined__5_wrapper(i16 0, i32 [[TMP0]]) -; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; NVPTX: worker_state_machine.parallel_region.check3: -; NVPTX-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE5:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK6:%.*]] -; NVPTX: worker_state_machine.parallel_region.execute5: -; NVPTX-NEXT: call void @__omp_outlined__18_wrapper(i16 0, i32 [[TMP0]]) -; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; NVPTX: worker_state_machine.parallel_region.check6: -; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; NVPTX: worker_state_machine.parallel_region.end: -; NVPTX-NEXT: call void @__kmpc_kernel_end_parallel() -; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; NVPTX: worker_state_machine.done.barrier: -; NVPTX-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; NVPTX: thread.user_code.check: +; NVPTX-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) ; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; NVPTX: user_code.entry: ; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] ; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 ; NVPTX-NEXT: call void @__omp_outlined__4(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; NVPTX-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; NVPTX-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) ; NVPTX-NEXT: ret void ; NVPTX: worker.exit: ; NVPTX-NEXT: ret void @@ -2070,7 +1768,7 @@ ; NVPTX-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR8]] ; NVPTX-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 ; NVPTX-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; NVPTX-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__5 to i8*), i8* noundef @__omp_outlined__5_wrapper.ID, i8** noundef [[TMP1]], i64 noundef 0) +; NVPTX-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__5 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__5_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0) ; NVPTX-NEXT: call void @simple_state_machine_interprocedural_after.internalized() #[[ATTR8]] ; NVPTX-NEXT: ret void ; @@ -2149,52 +1847,17 @@ ; NVPTX-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_with_fallback_l55 ; NVPTX-SAME: () #[[ATTR0]] { ; NVPTX-NEXT: entry: -; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) -; NVPTX-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; NVPTX-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; NVPTX: worker_state_machine.begin: -; NVPTX-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) -; NVPTX-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 -; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* -; NVPTX-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null -; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; NVPTX: worker_state_machine.finished: -; NVPTX-NEXT: ret void -; NVPTX: worker_state_machine.is_active.check: -; NVPTX-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; NVPTX: worker_state_machine.parallel_region.check: -; NVPTX-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__7_wrapper.ID to void (i16, i32)*) -; NVPTX-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] -; NVPTX: worker_state_machine.parallel_region.execute: -; NVPTX-NEXT: call void @__omp_outlined__7_wrapper(i16 0, i32 [[TMP0]]) -; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; NVPTX: worker_state_machine.parallel_region.check1: -; NVPTX-NEXT: [[WORKER_CHECK_PARALLEL_REGION4:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__8_wrapper.ID to void (i16, i32)*) -; NVPTX-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION4]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] -; NVPTX: worker_state_machine.parallel_region.execute2: -; NVPTX-NEXT: call void @__omp_outlined__8_wrapper(i16 0, i32 [[TMP0]]) -; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; NVPTX: worker_state_machine.parallel_region.fallback.execute: -; NVPTX-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]]) -; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; NVPTX: worker_state_machine.parallel_region.end: -; NVPTX-NEXT: call void @__kmpc_kernel_end_parallel() -; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; NVPTX: worker_state_machine.done.barrier: -; NVPTX-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; NVPTX: thread.user_code.check: +; NVPTX-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) ; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; NVPTX: user_code.entry: ; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] ; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 ; NVPTX-NEXT: call void @__omp_outlined__6(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; NVPTX-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; NVPTX-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) ; NVPTX-NEXT: ret void ; NVPTX: worker.exit: ; NVPTX-NEXT: ret void @@ -2211,10 +1874,10 @@ ; NVPTX-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 ; NVPTX-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 ; NVPTX-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; NVPTX-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__7 to i8*), i8* noundef @__omp_outlined__7_wrapper.ID, i8** noundef [[TMP1]], i64 noundef 0) +; NVPTX-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__7 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__7_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0) ; NVPTX-NEXT: [[CALL:%.*]] = call i32 @unknown() #[[ATTR10]] ; NVPTX-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8** -; NVPTX-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__8 to i8*), i8* noundef @__omp_outlined__8_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0) +; NVPTX-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__8 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__8_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0) ; NVPTX-NEXT: ret void ; ; @@ -2274,50 +1937,17 @@ ; NVPTX-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_no_openmp_attr_l66 ; NVPTX-SAME: () #[[ATTR0]] { ; NVPTX-NEXT: entry: -; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) -; NVPTX-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; NVPTX-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; NVPTX: worker_state_machine.begin: -; NVPTX-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) -; NVPTX-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 -; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* -; NVPTX-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null -; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; NVPTX: worker_state_machine.finished: -; NVPTX-NEXT: ret void -; NVPTX: worker_state_machine.is_active.check: -; NVPTX-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; NVPTX: worker_state_machine.parallel_region.check: -; NVPTX-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__10_wrapper.ID to void (i16, i32)*) -; NVPTX-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] -; NVPTX: worker_state_machine.parallel_region.execute: -; NVPTX-NEXT: call void @__omp_outlined__10_wrapper(i16 0, i32 [[TMP0]]) -; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; NVPTX: worker_state_machine.parallel_region.check1: -; NVPTX-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK3:%.*]] -; NVPTX: worker_state_machine.parallel_region.execute2: -; NVPTX-NEXT: call void @__omp_outlined__11_wrapper(i16 0, i32 [[TMP0]]) -; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; NVPTX: worker_state_machine.parallel_region.check3: -; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; NVPTX: worker_state_machine.parallel_region.end: -; NVPTX-NEXT: call void @__kmpc_kernel_end_parallel() -; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; NVPTX: worker_state_machine.done.barrier: -; NVPTX-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; NVPTX: thread.user_code.check: +; NVPTX-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) ; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; NVPTX: user_code.entry: ; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] ; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 ; NVPTX-NEXT: call void @__omp_outlined__9(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; NVPTX-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; NVPTX-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) ; NVPTX-NEXT: ret void ; NVPTX: worker.exit: ; NVPTX-NEXT: ret void @@ -2334,10 +1964,10 @@ ; NVPTX-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8 ; NVPTX-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 ; NVPTX-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; NVPTX-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__10 to i8*), i8* noundef @__omp_outlined__10_wrapper.ID, i8** noundef [[TMP1]], i64 noundef 0) +; NVPTX-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__10 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__10_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0) ; NVPTX-NEXT: call void @unknown_no_openmp() #[[ATTR9]] ; NVPTX-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8** -; NVPTX-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__11 to i8*), i8* noundef @__omp_outlined__11_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0) +; NVPTX-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__11 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__11_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0) ; NVPTX-NEXT: ret void ; ; @@ -2397,50 +2027,17 @@ ; NVPTX-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_pure_l77 ; NVPTX-SAME: () #[[ATTR0]] { ; NVPTX-NEXT: entry: -; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) -; NVPTX-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; NVPTX-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; NVPTX: worker_state_machine.begin: -; NVPTX-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) -; NVPTX-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 -; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* -; NVPTX-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null -; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; NVPTX: worker_state_machine.finished: -; NVPTX-NEXT: ret void -; NVPTX: worker_state_machine.is_active.check: -; NVPTX-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; NVPTX: worker_state_machine.parallel_region.check: -; NVPTX-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__13_wrapper.ID to void (i16, i32)*) -; NVPTX-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] -; NVPTX: worker_state_machine.parallel_region.execute: -; NVPTX-NEXT: call void @__omp_outlined__13_wrapper(i16 0, i32 [[TMP0]]) -; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; NVPTX: worker_state_machine.parallel_region.check1: -; NVPTX-NEXT: br i1 true, label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK3:%.*]] -; NVPTX: worker_state_machine.parallel_region.execute2: -; NVPTX-NEXT: call void @__omp_outlined__14_wrapper(i16 0, i32 [[TMP0]]) -; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; NVPTX: worker_state_machine.parallel_region.check3: -; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; NVPTX: worker_state_machine.parallel_region.end: -; NVPTX-NEXT: call void @__kmpc_kernel_end_parallel() -; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; NVPTX: worker_state_machine.done.barrier: -; NVPTX-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; NVPTX: thread.user_code.check: +; NVPTX-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) ; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; NVPTX: user_code.entry: ; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] ; NVPTX-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 ; NVPTX-NEXT: call void @__omp_outlined__12(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; NVPTX-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; NVPTX-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) ; NVPTX-NEXT: ret void ; NVPTX: worker.exit: ; NVPTX-NEXT: ret void @@ -2458,9 +2055,9 @@ ; NVPTX-NEXT: call void @unknown_no_openmp() #[[ATTR9]] ; NVPTX-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 ; NVPTX-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -; NVPTX-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__13 to i8*), i8* noundef @__omp_outlined__13_wrapper.ID, i8** noundef [[TMP1]], i64 noundef 0) +; NVPTX-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__13 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__13_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0) ; NVPTX-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8** -; NVPTX-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__14 to i8*), i8* noundef @__omp_outlined__14_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0) +; NVPTX-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__14 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__14_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0) ; NVPTX-NEXT: ret void ; ; @@ -2520,45 +2117,16 @@ ; NVPTX-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_simple_state_machine_interprocedural_nested_recursive_l92 ; NVPTX-SAME: () #[[ATTR0]] { ; NVPTX-NEXT: entry: -; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) -; NVPTX-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; NVPTX-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; NVPTX: worker_state_machine.begin: -; NVPTX-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) -; NVPTX-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 -; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* -; NVPTX-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null -; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; NVPTX: worker_state_machine.finished: -; NVPTX-NEXT: ret void -; NVPTX: worker_state_machine.is_active.check: -; NVPTX-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; NVPTX: worker_state_machine.parallel_region.check: -; NVPTX-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], @__omp_outlined__19_wrapper -; NVPTX-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] -; NVPTX: worker_state_machine.parallel_region.execute: -; NVPTX-NEXT: call void @__omp_outlined__19_wrapper(i16 0, i32 [[TMP0]]) -; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; NVPTX: worker_state_machine.parallel_region.fallback.execute: -; NVPTX-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]]) -; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END]] -; NVPTX: worker_state_machine.parallel_region.end: -; NVPTX-NEXT: call void @__kmpc_kernel_end_parallel() -; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; NVPTX: worker_state_machine.done.barrier: -; NVPTX-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; NVPTX: thread.user_code.check: +; NVPTX-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) ; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; NVPTX: user_code.entry: ; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] ; NVPTX-NEXT: call void @__omp_outlined__15(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; NVPTX-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; NVPTX-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) ; NVPTX-NEXT: ret void ; NVPTX: worker.exit: ; NVPTX-NEXT: ret void @@ -2619,39 +2187,16 @@ ; NVPTX-LABEL: define {{[^@]+}}@__omp_offloading_14_a36502b_no_state_machine_weak_callee_l112 ; NVPTX-SAME: () #[[ATTR0]] { ; NVPTX-NEXT: entry: -; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 ; NVPTX-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) -; NVPTX-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 -; NVPTX-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] -; NVPTX: worker_state_machine.begin: -; NVPTX-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-NEXT: [[WORKER_IS_ACTIVE:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORKER_WORK_FN_ADDR]]) -; NVPTX-NEXT: [[WORKER_WORK_FN:%.*]] = load i8*, i8** [[WORKER_WORK_FN_ADDR]], align 8 -; NVPTX-NEXT: [[WORKER_WORK_FN_ADDR_CAST:%.*]] = bitcast i8* [[WORKER_WORK_FN]] to void (i16, i32)* -; NVPTX-NEXT: [[WORKER_IS_DONE:%.*]] = icmp eq i8* [[WORKER_WORK_FN]], null -; NVPTX-NEXT: br i1 [[WORKER_IS_DONE]], label [[WORKER_STATE_MACHINE_FINISHED:%.*]], label [[WORKER_STATE_MACHINE_IS_ACTIVE_CHECK:%.*]] -; NVPTX: worker_state_machine.finished: -; NVPTX-NEXT: ret void -; NVPTX: worker_state_machine.is_active.check: -; NVPTX-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] -; NVPTX: worker_state_machine.parallel_region.fallback.execute: -; NVPTX-NEXT: call void [[WORKER_WORK_FN_ADDR_CAST]](i16 0, i32 [[TMP0]]) -; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] -; NVPTX: worker_state_machine.parallel_region.end: -; NVPTX-NEXT: call void @__kmpc_kernel_end_parallel() -; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_DONE_BARRIER]] -; NVPTX: worker_state_machine.done.barrier: -; NVPTX-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]]) -; NVPTX-NEXT: br label [[WORKER_STATE_MACHINE_BEGIN]] -; NVPTX: thread.user_code.check: +; NVPTX-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) ; NVPTX-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; NVPTX-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; NVPTX: user_code.entry: ; NVPTX-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] ; NVPTX-NEXT: call void @__omp_outlined__16(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; NVPTX-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; NVPTX-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) ; NVPTX-NEXT: ret void ; NVPTX: worker.exit: ; NVPTX-NEXT: ret void @@ -2780,13 +2325,14 @@ ; AMDGPU-DISABLED-NEXT: entry: ; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 false, i1 true, i1 true) +; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 1, i1 true, i1 true) ; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; AMDGPU-DISABLED: user_code.entry: ; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3:[0-9]+]] ; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) ; AMDGPU-DISABLED-NEXT: ret void ; AMDGPU-DISABLED: worker.exit: ; AMDGPU-DISABLED-NEXT: ret void @@ -2843,14 +2389,15 @@ ; AMDGPU-DISABLED-NEXT: entry: ; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) ; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; AMDGPU-DISABLED: user_code.entry: ; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] ; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 ; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__1(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) ; AMDGPU-DISABLED-NEXT: ret void ; AMDGPU-DISABLED: worker.exit: ; AMDGPU-DISABLED-NEXT: ret void @@ -2933,14 +2480,15 @@ ; AMDGPU-DISABLED-NEXT: entry: ; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) ; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; AMDGPU-DISABLED: user_code.entry: ; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] ; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 ; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__4(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) ; AMDGPU-DISABLED-NEXT: ret void ; AMDGPU-DISABLED: worker.exit: ; AMDGPU-DISABLED-NEXT: ret void @@ -3040,14 +2588,15 @@ ; AMDGPU-DISABLED-NEXT: entry: ; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) ; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; AMDGPU-DISABLED: user_code.entry: ; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] ; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 ; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__6(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) ; AMDGPU-DISABLED-NEXT: ret void ; AMDGPU-DISABLED: worker.exit: ; AMDGPU-DISABLED-NEXT: ret void @@ -3129,14 +2678,15 @@ ; AMDGPU-DISABLED-NEXT: entry: ; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) ; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; AMDGPU-DISABLED: user_code.entry: ; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] ; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 ; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__9(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) ; AMDGPU-DISABLED-NEXT: ret void ; AMDGPU-DISABLED: worker.exit: ; AMDGPU-DISABLED-NEXT: ret void @@ -3218,14 +2768,15 @@ ; AMDGPU-DISABLED-NEXT: entry: ; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) ; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; AMDGPU-DISABLED: user_code.entry: ; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] ; AMDGPU-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 ; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__12(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) ; AMDGPU-DISABLED-NEXT: ret void ; AMDGPU-DISABLED: worker.exit: ; AMDGPU-DISABLED-NEXT: ret void @@ -3307,13 +2858,14 @@ ; AMDGPU-DISABLED-NEXT: entry: ; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) ; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; AMDGPU-DISABLED: user_code.entry: ; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] ; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__15(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) ; AMDGPU-DISABLED-NEXT: ret void ; AMDGPU-DISABLED: worker.exit: ; AMDGPU-DISABLED-NEXT: ret void @@ -3376,13 +2928,14 @@ ; AMDGPU-DISABLED-NEXT: entry: ; AMDGPU-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; AMDGPU-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +; AMDGPU-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; AMDGPU-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) ; AMDGPU-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; AMDGPU-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; AMDGPU-DISABLED: user_code.entry: ; AMDGPU-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] ; AMDGPU-DISABLED-NEXT: call void @__omp_outlined__16(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; AMDGPU-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) ; AMDGPU-DISABLED-NEXT: ret void ; AMDGPU-DISABLED: worker.exit: ; AMDGPU-DISABLED-NEXT: ret void @@ -3511,13 +3064,14 @@ ; NVPTX-DISABLED-NEXT: entry: ; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 false, i1 true, i1 true) +; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 1, i1 true, i1 true) ; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; NVPTX-DISABLED: user_code.entry: ; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3:[0-9]+]] ; NVPTX-DISABLED-NEXT: call void @__omp_outlined__(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) ; NVPTX-DISABLED-NEXT: ret void ; NVPTX-DISABLED: worker.exit: ; NVPTX-DISABLED-NEXT: ret void @@ -3574,14 +3128,15 @@ ; NVPTX-DISABLED-NEXT: entry: ; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) ; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; NVPTX-DISABLED: user_code.entry: ; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] ; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 ; NVPTX-DISABLED-NEXT: call void @__omp_outlined__1(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) ; NVPTX-DISABLED-NEXT: ret void ; NVPTX-DISABLED: worker.exit: ; NVPTX-DISABLED-NEXT: ret void @@ -3664,14 +3219,15 @@ ; NVPTX-DISABLED-NEXT: entry: ; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) ; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; NVPTX-DISABLED: user_code.entry: ; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] ; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 ; NVPTX-DISABLED-NEXT: call void @__omp_outlined__4(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) ; NVPTX-DISABLED-NEXT: ret void ; NVPTX-DISABLED: worker.exit: ; NVPTX-DISABLED-NEXT: ret void @@ -3771,14 +3327,15 @@ ; NVPTX-DISABLED-NEXT: entry: ; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) ; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; NVPTX-DISABLED: user_code.entry: ; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] ; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 ; NVPTX-DISABLED-NEXT: call void @__omp_outlined__6(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) ; NVPTX-DISABLED-NEXT: ret void ; NVPTX-DISABLED: worker.exit: ; NVPTX-DISABLED-NEXT: ret void @@ -3860,14 +3417,15 @@ ; NVPTX-DISABLED-NEXT: entry: ; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) ; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; NVPTX-DISABLED: user_code.entry: ; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] ; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 ; NVPTX-DISABLED-NEXT: call void @__omp_outlined__9(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) ; NVPTX-DISABLED-NEXT: ret void ; NVPTX-DISABLED: worker.exit: ; NVPTX-DISABLED-NEXT: ret void @@ -3949,14 +3507,15 @@ ; NVPTX-DISABLED-NEXT: entry: ; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) ; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; NVPTX-DISABLED: user_code.entry: ; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] ; NVPTX-DISABLED-NEXT: store i32 [[TMP1]], i32* [[DOTTHREADID_TEMP_]], align 4 ; NVPTX-DISABLED-NEXT: call void @__omp_outlined__12(i32* noalias nocapture noundef nonnull readonly align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) ; NVPTX-DISABLED-NEXT: ret void ; NVPTX-DISABLED: worker.exit: ; NVPTX-DISABLED-NEXT: ret void @@ -4038,13 +3597,14 @@ ; NVPTX-DISABLED-NEXT: entry: ; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) ; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; NVPTX-DISABLED: user_code.entry: ; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] ; NVPTX-DISABLED-NEXT: call void @__omp_outlined__15(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) ; NVPTX-DISABLED-NEXT: ret void ; NVPTX-DISABLED: worker.exit: ; NVPTX-DISABLED-NEXT: ret void @@ -4107,13 +3667,14 @@ ; NVPTX-DISABLED-NEXT: entry: ; NVPTX-DISABLED-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 ; NVPTX-DISABLED-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4 -; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +; NVPTX-DISABLED-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +; NVPTX-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) ; NVPTX-DISABLED-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; NVPTX-DISABLED-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; NVPTX-DISABLED: user_code.entry: ; NVPTX-DISABLED-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) #[[ATTR3]] ; NVPTX-DISABLED-NEXT: call void @__omp_outlined__16(i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTTHREADID_TEMP_]], i32* noalias nocapture noundef nonnull readnone align 4 dereferenceable(4) [[DOTZERO_ADDR]]) #[[ATTR3]] -; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) +; NVPTX-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 1, i1 true) ; NVPTX-DISABLED-NEXT: ret void ; NVPTX-DISABLED: worker.exit: ; NVPTX-DISABLED-NEXT: ret void diff --git a/llvm/test/Transforms/OpenMP/custom_state_machines_remarks.ll b/llvm/test/Transforms/OpenMP/custom_state_machines_remarks.ll --- a/llvm/test/Transforms/OpenMP/custom_state_machines_remarks.ll +++ b/llvm/test/Transforms/OpenMP/custom_state_machines_remarks.ll @@ -14,7 +14,7 @@ ;; unknown(); ;; } ;; } -;; +;; ;; void test_fallback(void) { ;; #pragma omp target teams ;; { @@ -60,7 +60,7 @@ define weak void @__omp_offloading_2a_d80d3d_test_fallback_l11() local_unnamed_addr #0 !dbg !15 { entry: %captured_vars_addrs.i.i = alloca [0 x i8*], align 8 - %0 = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @1, i1 false, i1 true, i1 true) #3, !dbg !18 + %0 = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @1, i8 1, i1 true, i1 true) #3, !dbg !18 %exec_user_code = icmp eq i32 %0, -1, !dbg !18 br i1 %exec_user_code, label %user_code.entry, label %common.ret, !dbg !18 @@ -77,11 +77,11 @@ call void @__kmpc_parallel_51(%struct.ident_t* noundef nonnull @13, i32 %3, i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** noundef nonnull %4, i64 noundef 0) #3, !dbg !23 call void @llvm.lifetime.end.p0i8(i64 0, i8* nonnull %2) #3, !dbg !26 call void @unknown() #6, !dbg !27 - call void @__kmpc_target_deinit(%struct.ident_t* nonnull @5, i1 false, i1 true) #3, !dbg !28 + call void @__kmpc_target_deinit(%struct.ident_t* nonnull @5, i8 1, i1 true) #3, !dbg !28 br label %common.ret } -declare i32 @__kmpc_target_init(%struct.ident_t*, i1, i1, i1) local_unnamed_addr +declare i32 @__kmpc_target_init(%struct.ident_t*, i8, i1, i1) local_unnamed_addr ; Function Attrs: convergent declare void @unknown() local_unnamed_addr #1 @@ -99,13 +99,13 @@ ; Function Attrs: nounwind declare i32 @__kmpc_global_thread_num(%struct.ident_t*) local_unnamed_addr #3 -declare void @__kmpc_target_deinit(%struct.ident_t*, i1, i1) local_unnamed_addr +declare void @__kmpc_target_deinit(%struct.ident_t*, i8, i1) local_unnamed_addr ; Function Attrs: norecurse nounwind define weak void @__omp_offloading_2a_d80d3d_test_no_fallback_l20() local_unnamed_addr #4 !dbg !32 { entry: %captured_vars_addrs.i2.i = alloca [0 x i8*], align 8 - %0 = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @7, i1 false, i1 true, i1 true) #3, !dbg !33 + %0 = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @7, i8 1, i1 true, i1 true) #3, !dbg !33 %exec_user_code = icmp eq i32 %0, -1, !dbg !33 br i1 %exec_user_code, label %user_code.entry, label %common.ret, !dbg !33 @@ -130,7 +130,7 @@ call void @llvm.lifetime.end.p0i8(i64 0, i8* nonnull %2) #3, !dbg !45 call void @no_openmp() call void @no_parallelism() - call void @__kmpc_target_deinit(%struct.ident_t* nonnull @11, i1 false, i1 true) #3, !dbg !46 + call void @__kmpc_target_deinit(%struct.ident_t* nonnull @11, i8 1, i1 true) #3, !dbg !46 br label %common.ret } diff --git a/llvm/test/Transforms/OpenMP/deduplication_target.ll b/llvm/test/Transforms/OpenMP/deduplication_target.ll --- a/llvm/test/Transforms/OpenMP/deduplication_target.ll +++ b/llvm/test/Transforms/OpenMP/deduplication_target.ll @@ -20,37 +20,37 @@ ; CHECK-SAME: () #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 true, i1 false, i1 true) +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i8 2, i1 false, i1 true) ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) ; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; CHECK: user_code.entry: -; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 true) +; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i8 2, i1 true) ; CHECK-NEXT: ret void ; CHECK: worker.exit: ; CHECK-NEXT: ret void ; entry: %captured_vars_addrs = alloca [0 x i8*], align 8 - %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i1 true, i1 false, i1 true) + %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i8 2, i1 false, i1 true) %exec_user_code = icmp eq i32 %0, -1 br i1 %exec_user_code, label %user_code.entry, label %worker.exit user_code.entry: ; preds = %entry %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) %2 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2) - call void @__kmpc_target_deinit(%struct.ident_t* @1, i1 true, i1 true) + call void @__kmpc_target_deinit(%struct.ident_t* @1, i8 2, i1 true) ret void worker.exit: ; preds = %entry ret void } -declare i32 @__kmpc_target_init(%struct.ident_t*, i1, i1, i1) +declare i32 @__kmpc_target_init(%struct.ident_t*, i8, i1, i1) declare i32 @__kmpc_global_thread_num(%struct.ident_t*) #1 -declare void @__kmpc_target_deinit(%struct.ident_t*, i1, i1) +declare void @__kmpc_target_deinit(%struct.ident_t*, i8, i1) attributes #0 = { convergent noinline norecurse nounwind "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ptx32,+sm_20" } attributes #1 = { nounwind } diff --git a/llvm/test/Transforms/OpenMP/fold_generic_main_thread.ll b/llvm/test/Transforms/OpenMP/fold_generic_main_thread.ll --- a/llvm/test/Transforms/OpenMP/fold_generic_main_thread.ll +++ b/llvm/test/Transforms/OpenMP/fold_generic_main_thread.ll @@ -10,7 +10,7 @@ define void @kernel() { ; CHECK-LABEL: define {{[^@]+}}@kernel() { -; CHECK-NEXT: [[CALL:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @[[GLOB1:[0-9]+]], i1 false, i1 false, i1 false) +; CHECK-NEXT: [[CALL:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @[[GLOB1:[0-9]+]], i8 1, i1 false, i1 false) ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], -1 ; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ; CHECK: if.then: @@ -20,11 +20,11 @@ ; CHECK-NEXT: call void @bar() ; CHECK-NEXT: br label [[IF_END]] ; CHECK: if.end: -; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* null, i1 false, i1 true) +; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* null, i8 1, i1 true) ; CHECK-NEXT: ret void ; ; CHECK-DISABLED-LABEL: define {{[^@]+}}@kernel() { -; CHECK-DISABLED-NEXT: [[CALL:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @[[GLOB1:[0-9]+]], i1 false, i1 false, i1 false) +; CHECK-DISABLED-NEXT: [[CALL:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @[[GLOB1:[0-9]+]], i8 1, i1 false, i1 false) ; CHECK-DISABLED-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], -1 ; CHECK-DISABLED-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ; CHECK-DISABLED: if.then: @@ -34,10 +34,10 @@ ; CHECK-DISABLED-NEXT: call void @bar() ; CHECK-DISABLED-NEXT: br label [[IF_END]] ; CHECK-DISABLED: if.end: -; CHECK-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* null, i1 false, i1 true) +; CHECK-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* null, i8 1, i1 true) ; CHECK-DISABLED-NEXT: ret void ; - %call = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @1, i1 false, i1 false, i1 false) + %call = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @1, i8 1, i1 false, i1 false) %cmp = icmp eq i32 %call, -1 br i1 %cmp, label %if.then, label %if.else if.then: @@ -47,7 +47,7 @@ call void @bar() br label %if.end if.end: - call void @__kmpc_target_deinit(%struct.ident_t* null, i1 false, i1 true) + call void @__kmpc_target_deinit(%struct.ident_t* null, i8 1, i1 true) ret void } @@ -135,9 +135,9 @@ declare i32 @__kmpc_get_hardware_thread_id() -declare i32 @__kmpc_target_init(%struct.ident_t*, i1, i1, i1) +declare i32 @__kmpc_target_init(%struct.ident_t*, i8, i1, i1) -declare void @__kmpc_target_deinit(%struct.ident_t*, i1, i1) +declare void @__kmpc_target_deinit(%struct.ident_t*, i8, i1) !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!3, !4, !5, !6} diff --git a/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll b/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll --- a/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll +++ b/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll @@ -11,25 +11,23 @@ ; CHECK: @[[KERNEL0_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 ; CHECK: @[[G:[a-zA-Z0-9_$"\\.-]+]] = external global i32 ; CHECK: @[[KERNEL1_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 -; CHECK: @[[KERNEL2_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 3 -; CHECK: @[[GLOB0:[0-9]+]] = private unnamed_addr constant [23 x i8] c" -; CHECK: @[[GLOB1:[0-9]+]] = private unnamed_addr constant [[STRUCT_IDENT_T:%.*]] { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @[[GLOB0]], i32 0, i32 0) }, align 8 +; CHECK: @[[KERNEL2_EXEC_MODE:[a-zA-Z0-9_$"\\.-]+]] = weak constant i8 1 ;. define weak void @kernel0() #0 { ; CHECK-LABEL: define {{[^@]+}}@kernel0 ; CHECK-SAME: () #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 true, i1 false, i1 false) +; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 2, i1 false, i1 false) ; CHECK-NEXT: call void @helper0() #[[ATTR1:[0-9]+]] ; CHECK-NEXT: call void @helper1() #[[ATTR1]] ; CHECK-NEXT: call void @helper2() #[[ATTR1]] -; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* null, i1 true, i1 false) +; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* null, i8 2, i1 false) ; CHECK-NEXT: ret void ; - %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 true, i1 false, i1 false) + %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 2, i1 false, i1 false) call void @helper0() call void @helper1() call void @helper2() - call void @__kmpc_target_deinit(%struct.ident_t* null, i1 true, i1 false) + call void @__kmpc_target_deinit(%struct.ident_t* null, i8 2, i1 false) ret void } @@ -38,14 +36,14 @@ define weak void @kernel1() #0 { ; CHECK-LABEL: define {{[^@]+}}@kernel1 ; CHECK-SAME: () #[[ATTR0]] { -; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 true, i1 false, i1 false) +; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 2, i1 false, i1 false) ; CHECK-NEXT: call void @helper1() #[[ATTR1]] -; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* null, i1 false, i1 false) +; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* null, i8 2, i1 false) ; CHECK-NEXT: ret void ; - %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 true, i1 false, i1 false) + %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 2, i1 false, i1 false) call void @helper1() - call void @__kmpc_target_deinit(%struct.ident_t* null, i1 false, i1 false) + call void @__kmpc_target_deinit(%struct.ident_t* null, i8 2, i1 false) ret void } @@ -56,7 +54,7 @@ ; CHECK-SAME: () #[[ATTR0]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 true, i1 false, i1 false) +; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 1, i1 true, i1 true) ; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[I]], -1 ; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] ; CHECK: common.ret: @@ -68,12 +66,12 @@ ; CHECK-NEXT: call void @helper1() #[[ATTR1]] ; CHECK-NEXT: call void @helper2() #[[ATTR1]] ; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* null, i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP1]], i64 0) -; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* null, i1 true, i1 false) +; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* null, i8 1, i1 true) ; CHECK-NEXT: ret void ; entry: %captured_vars_addrs = alloca [0 x i8*], align 8 - %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 false, i1 true, i1 true) + %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 1, i1 true, i1 true) %exec_user_code = icmp eq i32 %i, -1 br i1 %exec_user_code, label %user_code.entry, label %common.ret @@ -87,27 +85,14 @@ call void @helper1() call void @helper2() call void @__kmpc_parallel_51(%struct.ident_t* null, i32 %0, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** %1, i64 0) - call void @__kmpc_target_deinit(%struct.ident_t* null, i1 false, i1 true) + call void @__kmpc_target_deinit(%struct.ident_t* null, i8 1, i1 true) ret void } define internal void @helper0() { ; CHECK-LABEL: define {{[^@]+}}@helper0 ; CHECK-SAME: () #[[ATTR1]] { -; CHECK-NEXT: br label [[REGION_CHECK_TID:%.*]] -; CHECK: region.check.tid: -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() -; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[TMP2]], label [[REGION_GUARDED:%.*]], label [[REGION_BARRIER:%.*]] -; CHECK: region.guarded: ; CHECK-NEXT: store i32 666, i32* @G, align 4 -; CHECK-NEXT: br label [[REGION_GUARDED_END:%.*]] -; CHECK: region.guarded.end: -; CHECK-NEXT: br label [[REGION_BARRIER]] -; CHECK: region.barrier: -; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]]) -; CHECK-NEXT: br label [[REGION_EXIT:%.*]] -; CHECK: region.exit: ; CHECK-NEXT: ret void ; %threadLimit = call i32 @__kmpc_get_hardware_num_threads_in_block() @@ -137,20 +122,7 @@ define internal void @helper2() { ; CHECK-LABEL: define {{[^@]+}}@helper2 ; CHECK-SAME: () #[[ATTR1]] { -; CHECK-NEXT: br label [[REGION_CHECK_TID:%.*]] -; CHECK: region.check.tid: -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_get_hardware_thread_id_in_block() -; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP1]], 0 -; CHECK-NEXT: br i1 [[TMP2]], label [[REGION_GUARDED:%.*]], label [[REGION_BARRIER:%.*]] -; CHECK: region.guarded: ; CHECK-NEXT: store i32 666, i32* @G, align 4 -; CHECK-NEXT: br label [[REGION_GUARDED_END:%.*]] -; CHECK: region.guarded.end: -; CHECK-NEXT: br label [[REGION_BARRIER]] -; CHECK: region.barrier: -; CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* @[[GLOB1]], i32 [[TMP1]]) -; CHECK-NEXT: br label [[REGION_EXIT:%.*]] -; CHECK: region.exit: ; CHECK-NEXT: ret void ; %threadLimit = call i32 @__kmpc_get_hardware_num_threads_in_block() @@ -179,8 +151,8 @@ } declare i32 @__kmpc_get_hardware_num_threads_in_block() -declare i32 @__kmpc_target_init(%struct.ident_t*, i1 zeroext, i1 zeroext, i1 zeroext) #1 -declare void @__kmpc_target_deinit(%struct.ident_t* nocapture readnone, i1 zeroext, i1 zeroext) #1 +declare i32 @__kmpc_target_init(%struct.ident_t*, i8, i1 zeroext, i1 zeroext) #1 +declare void @__kmpc_target_deinit(%struct.ident_t* nocapture readnone, i8, i1 zeroext) #1 declare void @__kmpc_parallel_51(%struct.ident_t*, i32, i32, i32, i32, i8*, i8*, i8**, i64) declare i32 @__kmpc_global_thread_num(%struct.ident_t*) @@ -200,7 +172,6 @@ ; CHECK: attributes #[[ATTR0]] = { "omp_target_num_teams"="777" "omp_target_thread_limit"="666" } ; CHECK: attributes #[[ATTR1]] = { nounwind } ; CHECK: attributes #[[ATTR2:[0-9]+]] = { alwaysinline } -; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nounwind } ;. ; CHECK: [[META0:![0-9]+]] = !{i32 7, !"openmp", i32 50} ; CHECK: [[META1:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} diff --git a/llvm/test/Transforms/OpenMP/global_constructor.ll b/llvm/test/Transforms/OpenMP/global_constructor.ll --- a/llvm/test/Transforms/OpenMP/global_constructor.ll +++ b/llvm/test/Transforms/OpenMP/global_constructor.ll @@ -10,7 +10,7 @@ define weak void @__omp_offloading_fd02_85283c04_main_l11(double* nonnull align 8 dereferenceable(8) %X) local_unnamed_addr { entry: - %0 = tail call i32 @__kmpc_target_init(%struct.ident_t* nonnull @1, i1 true, i1 false, i1 false) #0 + %0 = tail call i32 @__kmpc_target_init(%struct.ident_t* nonnull @1, i8 2, i1 false, i1 false) #0 %exec_user_code = icmp eq i32 %0, -1 br i1 %exec_user_code, label %user_code.entry, label %common.ret @@ -29,13 +29,13 @@ region.barrier: tail call void @__kmpc_barrier_simple_spmd(%struct.ident_t* nonnull @1, i32 %2) - tail call void @__kmpc_target_deinit(%struct.ident_t* nonnull @1, i1 true, i1 false) #0 + tail call void @__kmpc_target_deinit(%struct.ident_t* nonnull @1, i8 2, i1 false) #0 br label %common.ret } -declare i32 @__kmpc_target_init(%struct.ident_t*, i1, i1, i1) local_unnamed_addr +declare i32 @__kmpc_target_init(%struct.ident_t*, i8, i1, i1) local_unnamed_addr -declare void @__kmpc_target_deinit(%struct.ident_t*, i1, i1) local_unnamed_addr +declare void @__kmpc_target_deinit(%struct.ident_t*, i8, i1) local_unnamed_addr define internal void @__omp_offloading__fd02_85283c04_Device_l6_ctor() { entry: @@ -78,7 +78,7 @@ ; CHECK-LABEL: define {{[^@]+}}@__omp_offloading_fd02_85283c04_main_l11 ; CHECK-SAME: (double* nonnull align 8 dereferenceable(8) [[X:%.*]]) local_unnamed_addr { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @__kmpc_target_init(%struct.ident_t* nonnull @[[GLOB1:[0-9]+]], i1 true, i1 false, i1 false) #[[ATTR1:[0-9]+]] +; CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @__kmpc_target_init(%struct.ident_t* nonnull @[[GLOB1:[0-9]+]], i8 2, i1 false, i1 false) #[[ATTR1:[0-9]+]] ; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] ; CHECK: common.ret: @@ -93,6 +93,6 @@ ; CHECK-NEXT: br label [[REGION_BARRIER]] ; CHECK: region.barrier: ; CHECK-NEXT: tail call void @__kmpc_barrier_simple_spmd(%struct.ident_t* nonnull @[[GLOB1]], i32 [[TMP2]]) #[[ATTR1]] -; CHECK-NEXT: tail call void @__kmpc_target_deinit(%struct.ident_t* nonnull @[[GLOB1]], i1 true, i1 false) #[[ATTR1]] +; CHECK-NEXT: tail call void @__kmpc_target_deinit(%struct.ident_t* nonnull @[[GLOB1]], i8 2, i1 false) #[[ATTR1]] ; CHECK-NEXT: br label [[COMMON_RET]] ; diff --git a/llvm/test/Transforms/OpenMP/gpu_state_machine_function_ptr_replacement.ll b/llvm/test/Transforms/OpenMP/gpu_state_machine_function_ptr_replacement.ll --- a/llvm/test/Transforms/OpenMP/gpu_state_machine_function_ptr_replacement.ll +++ b/llvm/test/Transforms/OpenMP/gpu_state_machine_function_ptr_replacement.ll @@ -48,7 +48,7 @@ %.zero.addr = alloca i32, align 4 %.threadid_temp. = alloca i32, align 4 store i32 0, i32* %.zero.addr, align 4 - %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i1 false, i1 true, i1 true) + %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i8 1, i1 true, i1 true) %exec_user_code = icmp eq i32 %0, -1 br i1 %exec_user_code, label %user_code.entry, label %worker.exit @@ -56,14 +56,14 @@ %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) store i32 %1, i32* %.threadid_temp., align 4 call void @__omp_outlined__(i32* %.threadid_temp., i32* %.zero.addr) - call void @__kmpc_target_deinit(%struct.ident_t* @1, i1 false, i1 true) + call void @__kmpc_target_deinit(%struct.ident_t* @1, i8 1, i1 true) ret void worker.exit: ; preds = %entry ret void } -declare i32 @__kmpc_target_init(%struct.ident_t*, i1, i1, i1) +declare i32 @__kmpc_target_init(%struct.ident_t*, i8, i1, i1) declare void @unknown() define internal void @__omp_outlined__(i32* noalias %.global_tid., i32* noalias %.bound_tid.) { @@ -146,7 +146,7 @@ declare i32 @__kmpc_global_thread_num(%struct.ident_t*) -declare void @__kmpc_target_deinit(%struct.ident_t*, i1, i1) +declare void @__kmpc_target_deinit(%struct.ident_t*, i8, i1) define internal void @__omp_outlined__3(i32* noalias %.global_tid., i32* noalias %.bound_tid.) { entry: diff --git a/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll b/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll --- a/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll +++ b/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll @@ -21,18 +21,18 @@ ;. define weak void @is_spmd() { ; CHECK-LABEL: define {{[^@]+}}@is_spmd() { -; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 true, i1 false, i1 false) +; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 2, i1 false, i1 false) ; CHECK-NEXT: call void @is_spmd_helper1() ; CHECK-NEXT: call void @is_spmd_helper2() ; CHECK-NEXT: call void @is_mixed_helper() -; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* null, i1 true, i1 false) +; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* null, i8 2, i1 false) ; CHECK-NEXT: ret void ; - %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 true, i1 false, i1 false) + %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 2, i1 false, i1 false) call void @is_spmd_helper1() call void @is_spmd_helper2() call void @is_mixed_helper() - call void @__kmpc_target_deinit(%struct.ident_t* null, i1 true, i1 false) + call void @__kmpc_target_deinit(%struct.ident_t* null, i8 2, i1 false) ret void } @@ -40,7 +40,7 @@ ; CHECK-LABEL: define {{[^@]+}}@will_be_spmd() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 true, i1 false, i1 false) +; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 2, i1 false, i1 false) ; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[I]], -1 ; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] ; CHECK: common.ret: @@ -50,12 +50,12 @@ ; CHECK-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** ; CHECK-NEXT: call void @is_spmd_helper2() ; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* null, i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP1]], i64 0) -; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* null, i1 true, i1 false) +; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* null, i8 2, i1 false) ; CHECK-NEXT: ret void ; entry: %captured_vars_addrs = alloca [0 x i8*], align 8 - %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 false, i1 true, i1 true) + %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 1, i1 true, i1 true) %exec_user_code = icmp eq i32 %i, -1 br i1 %exec_user_code, label %user_code.entry, label %common.ret @@ -67,41 +67,41 @@ %1 = bitcast [0 x i8*]* %captured_vars_addrs to i8** call void @is_spmd_helper2() call void @__kmpc_parallel_51(%struct.ident_t* null, i32 %0, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** %1, i64 0) - call void @__kmpc_target_deinit(%struct.ident_t* null, i1 false, i1 true) + call void @__kmpc_target_deinit(%struct.ident_t* null, i8 1, i1 true) ret void } define weak void @non_spmd() { ; CHECK-LABEL: define {{[^@]+}}@non_spmd() { -; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 false, i1 false, i1 false) +; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 1, i1 false, i1 false) ; CHECK-NEXT: call void @is_generic_helper1() ; CHECK-NEXT: call void @is_generic_helper2() ; CHECK-NEXT: call void @is_mixed_helper() -; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* null, i1 false, i1 false) +; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* null, i8 1, i1 false) ; CHECK-NEXT: ret void ; - %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 false, i1 false, i1 false) + %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 1, i1 false, i1 false) call void @is_generic_helper1() call void @is_generic_helper2() call void @is_mixed_helper() - call void @__kmpc_target_deinit(%struct.ident_t* null, i1 false, i1 false) + call void @__kmpc_target_deinit(%struct.ident_t* null, i8 1, i1 false) ret void } define weak void @will_not_be_spmd() { ; CHECK-LABEL: define {{[^@]+}}@will_not_be_spmd() { -; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 false, i1 false, i1 false) +; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 1, i1 false, i1 false) ; CHECK-NEXT: call void @is_generic_helper1() ; CHECK-NEXT: call void @is_generic_helper2() ; CHECK-NEXT: call void @is_mixed_helper() -; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* null, i1 false, i1 false) +; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* null, i8 1, i1 false) ; CHECK-NEXT: ret void ; - %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 false, i1 false, i1 false) + %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 1, i1 false, i1 false) call void @is_generic_helper1() call void @is_generic_helper2() call void @is_mixed_helper() - call void @__kmpc_target_deinit(%struct.ident_t* null, i1 false, i1 false) + call void @__kmpc_target_deinit(%struct.ident_t* null, i8 1, i1 false) ret void } @@ -197,8 +197,8 @@ declare void @spmd_compatible() "llvm.assume"="ompx_spmd_amenable" declare i8 @__kmpc_is_spmd_exec_mode() -declare i32 @__kmpc_target_init(%struct.ident_t*, i1 zeroext, i1 zeroext, i1 zeroext) -declare void @__kmpc_target_deinit(%struct.ident_t* nocapture readnone, i1 zeroext, i1 zeroext) +declare i32 @__kmpc_target_init(%struct.ident_t*, i8, i1 zeroext, i1 zeroext) +declare void @__kmpc_target_deinit(%struct.ident_t* nocapture readnone, i8, i1 zeroext) declare void @__kmpc_parallel_51(%struct.ident_t*, i32, i32, i32, i32, i8*, i8*, i8**, i64) declare i32 @__kmpc_global_thread_num(%struct.ident_t*) declare void @foo() diff --git a/llvm/test/Transforms/OpenMP/parallel_level_fold.ll b/llvm/test/Transforms/OpenMP/parallel_level_fold.ll --- a/llvm/test/Transforms/OpenMP/parallel_level_fold.ll +++ b/llvm/test/Transforms/OpenMP/parallel_level_fold.ll @@ -19,46 +19,46 @@ ;. define weak void @none_spmd() { ; CHECK-LABEL: define {{[^@]+}}@none_spmd() { -; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 false, i1 false, i1 false) +; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 1, i1 false, i1 false) ; CHECK-NEXT: call void @none_spmd_helper() ; CHECK-NEXT: call void @mixed_helper() -; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* null, i1 false, i1 false) +; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* null, i8 1, i1 false) ; CHECK-NEXT: ret void ; - %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 false, i1 false, i1 false) + %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 1, i1 false, i1 false) call void @none_spmd_helper() call void @mixed_helper() - call void @__kmpc_target_deinit(%struct.ident_t* null, i1 false, i1 false) + call void @__kmpc_target_deinit(%struct.ident_t* null, i8 1, i1 false) ret void } define weak void @spmd() { ; CHECK-LABEL: define {{[^@]+}}@spmd() { -; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 true, i1 false, i1 false) +; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 2, i1 false, i1 false) ; CHECK-NEXT: call void @spmd_helper() ; CHECK-NEXT: call void @mixed_helper() -; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* null, i1 true, i1 false) +; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* null, i8 2, i1 false) ; CHECK-NEXT: ret void ; - %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 true, i1 false, i1 false) + %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 2, i1 false, i1 false) call void @spmd_helper() call void @mixed_helper() - call void @__kmpc_target_deinit(%struct.ident_t* null, i1 true, i1 false) + call void @__kmpc_target_deinit(%struct.ident_t* null, i8 2, i1 false) ret void } define weak void @parallel() { ; CHECK-LABEL: define {{[^@]+}}@parallel() { -; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* align 1073741824 null, i1 true, i1 false, i1 false) +; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* align 1073741824 null, i8 2, i1 false, i1 false) ; CHECK-NEXT: call void @spmd_helper() ; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noalias noundef align 1073741824 null, i32 noundef 0, i32 noundef 0, i32 noundef 0, i32 noundef 0, i8* noalias noundef align 1073741824 null, i8* noalias noundef align 1073741824 null, i8** noalias noundef align 1073741824 null, i64 noundef 0) -; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* null, i1 true, i1 false) +; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* null, i8 2, i1 false) ; CHECK-NEXT: ret void ; - %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 true, i1 false, i1 false) + %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i8 2, i1 false, i1 false) call void @spmd_helper() call void @__kmpc_parallel_51(%struct.ident_t* null, i32 0, i32 0, i32 0, i32 0, i8* null, i8* null, i8** null, i64 0) - call void @__kmpc_target_deinit(%struct.ident_t* null, i1 true, i1 false) + call void @__kmpc_target_deinit(%struct.ident_t* null, i8 2, i1 false) ret void } @@ -130,8 +130,8 @@ declare void @foo() declare void @bar() declare i8 @__kmpc_parallel_level() -declare i32 @__kmpc_target_init(%struct.ident_t*, i1 zeroext, i1 zeroext, i1 zeroext) #1 -declare void @__kmpc_target_deinit(%struct.ident_t* nocapture readnone, i1 zeroext, i1 zeroext) #1 +declare i32 @__kmpc_target_init(%struct.ident_t*, i8 zeroext, i1 zeroext, i1 zeroext) #1 +declare void @__kmpc_target_deinit(%struct.ident_t* nocapture readnone, i8 zeroext, i1 zeroext) #1 !llvm.module.flags = !{!0, !1} !nvvm.annotations = !{!2, !3, !4} diff --git a/llvm/test/Transforms/OpenMP/replace_globalization.ll b/llvm/test/Transforms/OpenMP/replace_globalization.ll --- a/llvm/test/Transforms/OpenMP/replace_globalization.ll +++ b/llvm/test/Transforms/OpenMP/replace_globalization.ll @@ -17,7 +17,7 @@ define dso_local void @foo() { entry: - %c = call i32 @__kmpc_target_init(%struct.ident_t* @1, i1 false, i1 true, i1 true) + %c = call i32 @__kmpc_target_init(%struct.ident_t* @1, i8 1, i1 true, i1 true) %x = call i8* @__kmpc_alloc_shared(i64 4) call void @unknown_no_openmp() %x_on_stack = bitcast i8* %x to i32* @@ -29,7 +29,7 @@ } define void @bar() { - %c = call i32 @__kmpc_target_init(%struct.ident_t* @1, i1 false, i1 true, i1 true) + %c = call i32 @__kmpc_target_init(%struct.ident_t* @1, i8 1, i1 true, i1 true) call void @unknown_no_openmp() call void @baz() call void @qux() @@ -40,7 +40,7 @@ define internal void @baz() { entry: - %call = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @1, i1 false, i1 false, i1 true) + %call = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @1, i8 1, i1 false, i1 true) call void @unknown_no_openmp() %cmp = icmp eq i32 %call, -1 br i1 %cmp, label %master, label %exit @@ -57,7 +57,7 @@ define internal void @qux() { entry: - %call = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @1, i1 false, i1 true, i1 true) + %call = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @1, i8 1, i1 true, i1 true) call void @unknown_no_openmp() %0 = icmp eq i32 %call, -1 br i1 %0, label %master, label %exit @@ -74,7 +74,7 @@ define internal void @negative_qux_spmd() { entry: - %call = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @1, i1 true, i1 true, i1 true) + %call = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @1, i8 2, i1 true, i1 true) call void @unknown_no_openmp() %0 = icmp eq i32 %call, -1 br i1 %0, label %master, label %exit @@ -106,9 +106,9 @@ declare i32 @llvm.nvvm.read.ptx.sreg.warpsize() -declare i32 @__kmpc_target_init(%struct.ident_t*, i1, i1, i1) +declare i32 @__kmpc_target_init(%struct.ident_t*, i8, i1, i1) -declare void @__kmpc_target_deinit(%struct.ident_t*, i1, i1) +declare void @__kmpc_target_deinit(%struct.ident_t*, i8, i1) declare void @unknown_no_openmp() "llvm.assume"="omp_no_openmp" @@ -138,7 +138,7 @@ ;. ; CHECK-LABEL: define {{[^@]+}}@foo() { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[C:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 false, i1 true) +; CHECK-NEXT: [[C:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 false, i1 true) ; CHECK-NEXT: [[X:%.*]] = call i8* @__kmpc_alloc_shared(i64 4) #[[ATTR4:[0-9]+]] ; CHECK-NEXT: call void @unknown_no_openmp() ; CHECK-NEXT: call void @use.internalized(i8* nofree writeonly [[X]]) #[[ATTR5:[0-9]+]] @@ -148,7 +148,7 @@ ; ; ; CHECK-LABEL: define {{[^@]+}}@bar() { -; CHECK-NEXT: [[C:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i1 false, i1 true, i1 true) +; CHECK-NEXT: [[C:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1]], i8 1, i1 true, i1 true) ; CHECK-NEXT: call void @unknown_no_openmp() ; CHECK-NEXT: call void @baz() ; CHECK-NEXT: call void @qux() @@ -159,7 +159,7 @@ ; ; CHECK-LABEL: define {{[^@]+}}@baz() { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[CALL:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* noundef nonnull @[[GLOB1]], i1 noundef false, i1 noundef false, i1 noundef true) +; CHECK-NEXT: [[CALL:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* noundef nonnull @[[GLOB1]], i8 noundef 1, i1 noundef false, i1 noundef true) ; CHECK-NEXT: call void @unknown_no_openmp() ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[CALL]], -1 ; CHECK-NEXT: br i1 [[CMP]], label [[MASTER:%.*]], label [[EXIT:%.*]] @@ -172,7 +172,7 @@ ; ; CHECK-LABEL: define {{[^@]+}}@qux() { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[CALL:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* noundef nonnull @[[GLOB1]], i1 noundef false, i1 noundef true, i1 noundef true) +; CHECK-NEXT: [[CALL:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* noundef nonnull @[[GLOB1]], i8 noundef 1, i1 noundef true, i1 noundef true) ; CHECK-NEXT: call void @unknown_no_openmp() ; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[CALL]], -1 ; CHECK-NEXT: br i1 [[TMP0]], label [[MASTER:%.*]], label [[EXIT:%.*]] @@ -185,7 +185,7 @@ ; ; CHECK-LABEL: define {{[^@]+}}@negative_qux_spmd() { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[CALL:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* noundef nonnull @[[GLOB1]], i1 noundef true, i1 noundef true, i1 noundef true) +; CHECK-NEXT: [[CALL:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* noundef nonnull @[[GLOB1]], i8 noundef 2, i1 noundef true, i1 noundef true) ; CHECK-NEXT: call void @unknown_no_openmp() ; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[CALL]], -1 ; CHECK-NEXT: br i1 [[TMP0]], label [[MASTER:%.*]], label [[EXIT:%.*]] diff --git a/llvm/test/Transforms/OpenMP/single_threaded_execution.ll b/llvm/test/Transforms/OpenMP/single_threaded_execution.ll --- a/llvm/test/Transforms/OpenMP/single_threaded_execution.ll +++ b/llvm/test/Transforms/OpenMP/single_threaded_execution.ll @@ -14,7 +14,7 @@ ; CHECK-NOT: [openmp-opt] Basic block @kernel if.else is executed by a single thread. ; CHECK-NOT: [openmp-opt] Basic block @kernel if.end is executed by a single thread. define void @kernel() { - %call = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @1, i1 false, i1 false, i1 false) + %call = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @1, i8 1, i1 false, i1 false) %cmp = icmp eq i32 %call, -1 br i1 %cmp, label %if.then, label %if.else if.then: @@ -22,7 +22,7 @@ if.else: br label %if.end if.end: - call void @__kmpc_target_deinit(%struct.ident_t* null, i1 false, i1 true) + call void @__kmpc_target_deinit(%struct.ident_t* null, i8 1, i1 true) ret void } @@ -105,9 +105,9 @@ declare void @__kmpc_kernel_prepare_parallel(i8*) -declare i32 @__kmpc_target_init(%struct.ident_t*, i1, i1, i1) +declare i32 @__kmpc_target_init(%struct.ident_t*, i8, i1, i1) -declare void @__kmpc_target_deinit(%struct.ident_t*, i1, i1) +declare void @__kmpc_target_deinit(%struct.ident_t*, i8, i1) attributes #0 = { cold noinline } diff --git a/llvm/test/Transforms/OpenMP/spmdization_guarding.ll b/llvm/test/Transforms/OpenMP/spmdization_guarding.ll --- a/llvm/test/Transforms/OpenMP/spmdization_guarding.ll +++ b/llvm/test/Transforms/OpenMP/spmdization_guarding.ll @@ -59,7 +59,7 @@ ; CHECK-SAME: (i32* [[X:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[N_ADDR_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[N]] to i32 -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @[[GLOB1]], i1 true, i1 false, i1 false) #[[ATTR4:[0-9]+]] +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @[[GLOB1]], i8 2, i1 false, i1 false) #[[ATTR4:[0-9]+]] ; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; CHECK: user_code.entry: @@ -166,7 +166,7 @@ ; CHECK-NEXT: [[CALL14_I:%.*]] = call i32 @no_openmp(i32* nonnull [[X]]) #[[ATTR7]], !noalias !8 ; CHECK-NEXT: [[CALL15_I:%.*]] = call i32 @no_openmp(i32* nonnull [[X]]) #[[ATTR7]], !noalias !8 ; CHECK-NEXT: [[CALL16_I:%.*]] = call i32 @no_openmp(i32* nonnull [[X]]) #[[ATTR7]], !noalias !8 -; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* nonnull @[[GLOB1]], i1 true, i1 false) #[[ATTR4]] +; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* nonnull @[[GLOB1]], i8 2, i1 false) #[[ATTR4]] ; CHECK-NEXT: ret void ; CHECK: worker.exit: ; CHECK-NEXT: ret void @@ -176,7 +176,7 @@ ; CHECK-DISABLED-NEXT: entry: ; CHECK-DISABLED-NEXT: [[WORKER_WORK_FN_ADDR:%.*]] = alloca i8*, align 8 ; CHECK-DISABLED-NEXT: [[N_ADDR_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[N]] to i32 -; CHECK-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @[[GLOB1]], i1 false, i1 false, i1 true) #[[ATTR4:[0-9]+]] +; CHECK-DISABLED-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @[[GLOB1]], i8 1, i1 false, i1 true) #[[ATTR4:[0-9]+]] ; CHECK-DISABLED-NEXT: [[THREAD_IS_WORKER:%.*]] = icmp ne i32 [[TMP0]], -1 ; CHECK-DISABLED-NEXT: br i1 [[THREAD_IS_WORKER]], label [[WORKER_STATE_MACHINE_BEGIN:%.*]], label [[THREAD_USER_CODE_CHECK:%.*]] ; CHECK-DISABLED: worker_state_machine.begin: @@ -245,14 +245,14 @@ ; CHECK-DISABLED-NEXT: [[CALL14_I:%.*]] = call i32 @no_openmp(i32* nonnull [[X]]) #[[ATTR7]], !noalias !8 ; CHECK-DISABLED-NEXT: [[CALL15_I:%.*]] = call i32 @no_openmp(i32* nonnull [[X]]) #[[ATTR7]], !noalias !8 ; CHECK-DISABLED-NEXT: [[CALL16_I:%.*]] = call i32 @no_openmp(i32* nonnull [[X]]) #[[ATTR7]], !noalias !8 -; CHECK-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* nonnull @[[GLOB1]], i1 false, i1 true) #[[ATTR4]] +; CHECK-DISABLED-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* nonnull @[[GLOB1]], i8 1, i1 true) #[[ATTR4]] ; CHECK-DISABLED-NEXT: ret void ; CHECK-DISABLED: worker.exit: ; CHECK-DISABLED-NEXT: ret void ; entry: %N.addr.sroa.0.0.extract.trunc = trunc i64 %N to i32 - %0 = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @1, i1 false, i1 true, i1 true) #3 + %0 = call i32 @__kmpc_target_init(%struct.ident_t* nonnull @1, i8 1, i1 true, i1 true) #3 %exec_user_code = icmp eq i32 %0, -1 br i1 %exec_user_code, label %user_code.entry, label %worker.exit @@ -298,7 +298,7 @@ %call14.i = call i32 @no_openmp(i32* nonnull %x) #5, !noalias !8 %call15.i = call i32 @no_openmp(i32* nonnull %x) #5, !noalias !8 %call16.i = call i32 @no_openmp(i32* nonnull %x) #5, !noalias !8 - call void @__kmpc_target_deinit(%struct.ident_t* nonnull @1, i1 false, i1 true) #3 + call void @__kmpc_target_deinit(%struct.ident_t* nonnull @1, i8 1, i1 true) #3 ret void worker.exit: ; preds = %entry @@ -330,7 +330,7 @@ declare void @__kmpc_parallel_51(%struct.ident_t*, i32, i32, i32, i32, i8*, i8*, i8**, i64) -declare i32 @__kmpc_target_init(%struct.ident_t*, i1, i1, i1) +declare i32 @__kmpc_target_init(%struct.ident_t*, i8, i1, i1) ; Function Attrs: convergent declare i32 @no_openmp(i32*) #1 @@ -341,7 +341,7 @@ ; Function Attrs: nounwind declare i32 @__kmpc_global_thread_num(%struct.ident_t*) #3 -declare void @__kmpc_target_deinit(%struct.ident_t*, i1, i1) +declare void @__kmpc_target_deinit(%struct.ident_t*, i8, i1) ; Function Attrs: inaccessiblememonly nofree nosync nounwind willreturn declare void @llvm.experimental.noalias.scope.decl(metadata) #4 diff --git a/openmp/libomptarget/deviceRTLs/common/include/target.h b/openmp/libomptarget/deviceRTLs/common/include/target.h --- a/openmp/libomptarget/deviceRTLs/common/include/target.h +++ b/openmp/libomptarget/deviceRTLs/common/include/target.h @@ -36,7 +36,7 @@ /// /// \code /// void kernel(...) { -/// ThreadKind = __kmpc_target_init(Ident, /* IsSPMD */ false, +/// ThreadKind = __kmpc_target_init(Ident, /* Mode */ 1, /// /* UseGenericStateMachine */ true, /// /* RequiresFullRuntime */ ... ); /// if (ThreadKind == -1) { @@ -50,7 +50,7 @@ /// /// \code /// void kernel(...) { -/// ThreadKind = __kmpc_target_init(Ident, /* IsSPMD */ false, +/// ThreadKind = __kmpc_target_init(Ident, /* Mode */ 1, /// /* UseGenericStateMachine */ false, /// /* RequiresFullRuntime */ ... ); /// if (ThreadKind == -1) { @@ -72,7 +72,7 @@ /// /// \param Ident Source location identification, can be NULL. /// -int32_t __kmpc_target_init(ident_t *Ident, bool IsSPMD, +int32_t __kmpc_target_init(ident_t *Ident, int8_t Mode, bool UseGenericStateMachine, bool RequiresFullRuntime); @@ -86,7 +86,7 @@ /// /// \param Ident Source location identification, can be NULL. /// -void __kmpc_target_deinit(ident_t *Ident, bool IsSPMD, +void __kmpc_target_deinit(ident_t *Ident, int8_t Mode, bool RequiresFullRuntime); ///} diff --git a/openmp/libomptarget/deviceRTLs/common/omptarget.h b/openmp/libomptarget/deviceRTLs/common/omptarget.h --- a/openmp/libomptarget/deviceRTLs/common/omptarget.h +++ b/openmp/libomptarget/deviceRTLs/common/omptarget.h @@ -256,7 +256,7 @@ extern omptarget_nvptx_ThreadPrivateContext * EXTERN_SHARED(omptarget_nvptx_threadPrivateContext); -extern uint32_t EXTERN_SHARED(execution_param); +extern int8_t EXTERN_SHARED(execution_param); extern void *EXTERN_SHARED(ReductionScratchpadPtr); //////////////////////////////////////////////////////////////////////////////// diff --git a/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu b/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu --- a/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu @@ -56,7 +56,7 @@ //////////////////////////////////////////////////////////////////////////////// // OpenMP kernel execution parameters //////////////////////////////////////////////////////////////////////////////// -uint32_t SHARED(execution_param); +int8_t SHARED(execution_param); //////////////////////////////////////////////////////////////////////////////// // Scratchpad for teams reduction. diff --git a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu --- a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu @@ -38,7 +38,7 @@ if (threadIdInBlock != GetMasterThreadID()) return; - setExecutionParameters(Generic, RuntimeInitialized); + setExecutionParameters(OMP_TGT_EXEC_MODE_GENERIC, OMP_TGT_RUNTIME_INITIALIZED); ASSERT0(LT_FUSSY, threadIdInBlock == GetMasterThreadID(), "__kmpc_kernel_init() must be called by team master warp only!"); PRINT0(LD_IO, "call to __kmpc_kernel_init for master\n"); @@ -85,8 +85,9 @@ static void __kmpc_spmd_kernel_init(bool RequiresFullRuntime) { PRINT0(LD_IO, "call to __kmpc_spmd_kernel_init\n"); - setExecutionParameters(Spmd, RequiresFullRuntime ? RuntimeInitialized - : RuntimeUninitialized); + setExecutionParameters(OMP_TGT_EXEC_MODE_SPMD, + RequiresFullRuntime ? OMP_TGT_RUNTIME_INITIALIZED + : OMP_TGT_RUNTIME_UNINITIALIZED); int threadId = __kmpc_get_hardware_thread_id_in_block(); if (threadId == 0) { usedSlotIdx = __kmpc_impl_smid() % MAX_SM; @@ -160,7 +161,7 @@ // Return true if the current target region is executed in SPMD mode. EXTERN int8_t __kmpc_is_spmd_exec_mode() { - return (execution_param & ModeMask) == Spmd; + return execution_param & OMP_TGT_EXEC_MODE_SPMD; } EXTERN int8_t __kmpc_is_generic_main_thread(kmp_int32 Tid) { @@ -202,9 +203,10 @@ } EXTERN -int32_t __kmpc_target_init(ident_t *Ident, bool IsSPMD, +int32_t __kmpc_target_init(ident_t *Ident, int8_t Mode, bool UseGenericStateMachine, bool RequiresFullRuntime) { + const bool IsSPMD = Mode & OMP_TGT_EXEC_MODE_SPMD; int TId = __kmpc_get_hardware_thread_id_in_block(); if (IsSPMD) __kmpc_spmd_kernel_init(RequiresFullRuntime); @@ -226,13 +228,13 @@ } EXTERN -void __kmpc_target_deinit(ident_t *Ident, bool IsSPMD, - bool RequiresFullRuntime) { +void __kmpc_target_deinit(ident_t *Ident, int8_t Mode, + bool RequiresFullRuntime) { + const bool IsSPMD = Mode & OMP_TGT_EXEC_MODE_SPMD; if (IsSPMD) __kmpc_spmd_kernel_deinit(RequiresFullRuntime); else __kmpc_generic_kernel_deinit(); } - #pragma omp end declare target diff --git a/openmp/libomptarget/deviceRTLs/common/src/support.cu b/openmp/libomptarget/deviceRTLs/common/src/support.cu --- a/openmp/libomptarget/deviceRTLs/common/src/support.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/support.cu @@ -19,19 +19,18 @@ // Execution Parameters //////////////////////////////////////////////////////////////////////////////// -void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode) { +void setExecutionParameters(OMPTgtExecModeFlags EMode, + OMPTgtRuntimeModeFlags RMode) { execution_param = EMode; execution_param |= RMode; } -bool isGenericMode() { return (execution_param & ModeMask) == Generic; } +bool isGenericMode() { return execution_param & OMP_TGT_EXEC_MODE_GENERIC; } -bool isRuntimeUninitialized() { - return (execution_param & RuntimeMask) == RuntimeUninitialized; -} +bool isRuntimeUninitialized() { return !isRuntimeInitialized(); } bool isRuntimeInitialized() { - return (execution_param & RuntimeMask) == RuntimeInitialized; + return execution_param & OMP_TGT_RUNTIME_INITIALIZED; } //////////////////////////////////////////////////////////////////////////////// diff --git a/openmp/libomptarget/deviceRTLs/common/support.h b/openmp/libomptarget/deviceRTLs/common/support.h --- a/openmp/libomptarget/deviceRTLs/common/support.h +++ b/openmp/libomptarget/deviceRTLs/common/support.h @@ -19,19 +19,18 @@ //////////////////////////////////////////////////////////////////////////////// // Execution Parameters //////////////////////////////////////////////////////////////////////////////// -enum ExecutionMode { - Spmd = 0x00u, - Generic = 0x01u, - ModeMask = 0x01u, +enum OMPTgtExecModeFlags : int8_t { + OMP_TGT_EXEC_MODE_GENERIC = 1 << 0, + OMP_TGT_EXEC_MODE_SPMD = 1 << 1 }; -enum RuntimeMode { - RuntimeInitialized = 0x00u, - RuntimeUninitialized = 0x02u, - RuntimeMask = 0x02u, +enum OMPTgtRuntimeModeFlags : int8_t { + OMP_TGT_RUNTIME_UNINITIALIZED = 0, + OMP_TGT_RUNTIME_INITIALIZED = 1 << 2 }; -void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode); +void setExecutionParameters(OMPTgtExecModeFlags EMode, + OMPTgtRuntimeModeFlags RMode); bool isGenericMode(); bool isRuntimeUninitialized(); bool isRuntimeInitialized(); diff --git a/openmp/libomptarget/deviceRTLs/interface.h b/openmp/libomptarget/deviceRTLs/interface.h --- a/openmp/libomptarget/deviceRTLs/interface.h +++ b/openmp/libomptarget/deviceRTLs/interface.h @@ -416,10 +416,10 @@ int32_t cancelVal); // non standard -EXTERN int32_t __kmpc_target_init(ident_t *Ident, bool IsSPMD, +EXTERN int32_t __kmpc_target_init(ident_t *Ident, int8_t Mode, bool UseGenericStateMachine, bool RequiresFullRuntime); -EXTERN void __kmpc_target_deinit(ident_t *Ident, bool IsSPMD, +EXTERN void __kmpc_target_deinit(ident_t *Ident, int8_t Mode, bool RequiresFullRuntime); EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn); EXTERN bool __kmpc_kernel_parallel(void **WorkFn);