diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h @@ -427,6 +427,20 @@ /// true if we're definitely in the parallel region. bool IsInParallelRegion = false; + struct StateMode { + ExecutionMode SavedExecutionMode = EM_Unknown; + bool SavedIsInTargetMasterThreadRegion = false; + bool SavedIsInTTDRegion = false; + bool SavedIsInParallelRegion = false; + StateMode(ExecutionMode SavedExecutionMode, + bool SavedIsInTargetMasterThreadRegion, bool SavedIsInTTDRegion, + bool SavedIsInParallelRegion) + : SavedExecutionMode(SavedExecutionMode), + SavedIsInTargetMasterThreadRegion(SavedIsInTargetMasterThreadRegion), + SavedIsInTTDRegion(SavedIsInTTDRegion), + SavedIsInParallelRegion(SavedIsInParallelRegion) {} + }; + llvm::DenseMap, StateMode> SavedExecutionModes; /// Map between an outlined function and its wrapper. llvm::DenseMap WrapperFunctionsMap; diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -4147,9 +4147,6 @@ void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF, const Decl *D) { - if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic) - return; - assert(D && "Expected function or captured|block decl."); assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 && "Function is registered already."); @@ -4168,6 +4165,18 @@ getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) return; } + if (!NeedToDelayGlobalization) { + SavedExecutionModes.try_emplace(D, CurrentExecutionMode, + IsInTargetMasterThreadRegion, IsInTTDRegion, + IsInParallelRegion); + CurrentExecutionMode = EM_Unknown; + IsInTargetMasterThreadRegion = false; + IsInTTDRegion = false; + IsInParallelRegion = false; + } + if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic) + return; + if (!Body) return; CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second); @@ -4293,6 +4302,14 @@ } void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) { + auto I = SavedExecutionModes.find(CGF.CurCodeDecl); + if (I != SavedExecutionModes.end()) { + CurrentExecutionMode = I->second.SavedExecutionMode; + IsInTargetMasterThreadRegion = I->second.SavedIsInTargetMasterThreadRegion; + IsInTTDRegion = I->second.SavedIsInTTDRegion; + IsInParallelRegion = I->second.SavedIsInParallelRegion; + SavedExecutionModes.erase(CGF.CurCodeDecl); + } FunctionGlobalizedDecls.erase(CGF.CurFn); CGOpenMPRuntime::functionFinished(CGF); } diff --git a/clang/test/OpenMP/declare_target_codegen_globalization.cpp b/clang/test/OpenMP/declare_target_codegen_globalization.cpp --- a/clang/test/OpenMP/declare_target_codegen_globalization.cpp +++ b/clang/test/OpenMP/declare_target_codegen_globalization.cpp @@ -37,11 +37,14 @@ // CHECK: define {{.*}}[[BAR]]() // CHECK: alloca i32, // CHECK: [[A_LOCAL_ADDR:%.+]] = alloca i32, +// CHECK: [[PL:%.+]] = call i16 @__kmpc_parallel_level( +// CHECK: [[IS_IN_PARALLEL:%.+]] = icmp eq i16 [[PL]], 0 // CHECK: [[RES:%.+]] = call i8 @__kmpc_is_spmd_exec_mode() // CHECK: [[IS_SPMD:%.+]] = icmp ne i8 [[RES]], 0 // CHECK: br i1 [[IS_SPMD]], label // CHECK: br label -// CHECK: [[RES:%.+]] = call i8* @__kmpc_data_sharing_coalesced_push_stack(i64 128, i16 0) +// CHECK: [[SZ:%.+]] = select i1 [[IS_IN_PARALLEL]], i64 4, i64 128 +// CHECK: [[RES:%.+]] = call i8* @__kmpc_data_sharing_coalesced_push_stack(i64 [[SZ]], i16 0) // CHECK: [[GLOBALS:%.+]] = bitcast i8* [[RES]] to [[GLOBAL_ST:%.+]]* // CHECK: br label // CHECK: [[ITEMS:%.+]] = phi [[GLOBAL_ST]]* [ null, {{.+}} ], [ [[GLOBALS]], {{.+}} ] @@ -49,7 +52,9 @@ // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() // CHECK: [[LID:%.+]] = and i32 [[TID]], 31 // CHECK: [[A_GLOBAL_ADDR:%.+]] = getelementptr inbounds [32 x i32], [32 x i32]* [[A_ADDR]], i32 0, i32 [[LID]] -// CHECK: [[A_ADDR:%.+]] = select i1 [[IS_SPMD]], i32* [[A_LOCAL_ADDR]], i32* [[A_GLOBAL_ADDR]] +// CHECK: [[A_GLOBAL_PARALLEL_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 0 +// CHECK: [[A_PARALLEL_ADDR:%.+]] = select i1 [[IS_IN_PARALLEL]], i32* [[A_GLOBAL_PARALLEL_ADDR]], i32* [[A_GLOBAL_ADDR]] +// CHECK: [[A_ADDR:%.+]] = select i1 [[IS_SPMD]], i32* [[A_LOCAL_ADDR]], i32* [[A_PARALLEL_ADDR]] // CHECK: call {{.*}}[[FOO]](i32* nonnull align {{[0-9]+}} dereferenceable{{.*}} [[A_ADDR]]) // CHECK: br i1 [[IS_SPMD]], label // CHECK: [[BC:%.+]] = bitcast [[GLOBAL_ST]]* [[ITEMS]] to i8* diff --git a/clang/test/OpenMP/nvptx_multi_target_parallel_codegen.cpp b/clang/test/OpenMP/nvptx_multi_target_parallel_codegen.cpp --- a/clang/test/OpenMP/nvptx_multi_target_parallel_codegen.cpp +++ b/clang/test/OpenMP/nvptx_multi_target_parallel_codegen.cpp @@ -34,9 +34,9 @@ // CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK1-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK1: .execute: -// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) // CHECK1-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP1]], i64 0) +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP1]], i64 0) // CHECK1-NEXT: br label [[DOTOMP_DEINIT:%.*]] // CHECK1: .omp.deinit: // CHECK1-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) @@ -60,9 +60,9 @@ // CHECK1-SAME: () #[[ATTR2:[0-9]+]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) // CHECK1-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP1]], i64 0) +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP1]], i64 0) // CHECK1-NEXT: ret void // // @@ -77,8 +77,23 @@ // CHECK1-NEXT: ret void // // +// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper +// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] { +// CHECK1-NEXT: entry: +// CHECK1-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK1-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK1-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8 +// CHECK1-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK1-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK1-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK1-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK1-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR5:[0-9]+]] +// CHECK1-NEXT: ret void +// +// // CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23_worker -// CHECK1-SAME: () #[[ATTR5:[0-9]+]] { +// CHECK1-SAME: () #[[ATTR4]] { // CHECK1-NEXT: entry: // CHECK1-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8 // CHECK1-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 @@ -98,7 +113,7 @@ // CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 // CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] // CHECK1: .execute.parallel: -// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) +// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) // CHECK1-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* // CHECK1-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) // CHECK1-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] @@ -122,7 +137,7 @@ // CHECK1-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] // CHECK1-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] // CHECK1: .worker: -// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23_worker() #[[ATTR4:[0-9]+]] +// CHECK1-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23_worker() #[[ATTR5]] // CHECK1-NEXT: br label [[DOTEXIT:%.*]] // CHECK1: .mastercheck: // CHECK1-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() @@ -159,9 +174,9 @@ // CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK2-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK2: .execute: -// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) // CHECK2-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP1]], i32 0) +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP1]], i32 0) // CHECK2-NEXT: br label [[DOTOMP_DEINIT:%.*]] // CHECK2: .omp.deinit: // CHECK2-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) @@ -185,9 +200,9 @@ // CHECK2-SAME: () #[[ATTR2:[0-9]+]] { // CHECK2-NEXT: entry: // CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4 -// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) // CHECK2-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP1]], i32 0) +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP1]], i32 0) // CHECK2-NEXT: ret void // // @@ -202,8 +217,23 @@ // CHECK2-NEXT: ret void // // +// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper +// CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] { +// CHECK2-NEXT: entry: +// CHECK2-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK2-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK2-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 +// CHECK2-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK2-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK2-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK2-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK2-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR5:[0-9]+]] +// CHECK2-NEXT: ret void +// +// // CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23_worker -// CHECK2-SAME: () #[[ATTR5:[0-9]+]] { +// CHECK2-SAME: () #[[ATTR4]] { // CHECK2-NEXT: entry: // CHECK2-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 // CHECK2-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 @@ -223,7 +253,7 @@ // CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 // CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] // CHECK2: .execute.parallel: -// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) +// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) // CHECK2-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* // CHECK2-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) // CHECK2-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] @@ -247,7 +277,7 @@ // CHECK2-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] // CHECK2-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] // CHECK2: .worker: -// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23_worker() #[[ATTR4:[0-9]+]] +// CHECK2-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23_worker() #[[ATTR5]] // CHECK2-NEXT: br label [[DOTEXIT:%.*]] // CHECK2: .mastercheck: // CHECK2-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() @@ -284,9 +314,9 @@ // CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack_spmd() // CHECK3-NEXT: br label [[DOTEXECUTE:%.*]] // CHECK3: .execute: -// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) // CHECK3-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP1]], i32 0) +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* null, i8** [[TMP1]], i32 0) // CHECK3-NEXT: br label [[DOTOMP_DEINIT:%.*]] // CHECK3: .omp.deinit: // CHECK3-NEXT: call void @__kmpc_spmd_kernel_deinit_v2(i16 1) @@ -310,9 +340,9 @@ // CHECK3-SAME: () #[[ATTR2:[0-9]+]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) // CHECK3-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* null, i8** [[TMP1]], i32 0) +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP1]], i32 0) // CHECK3-NEXT: ret void // // @@ -327,8 +357,23 @@ // CHECK3-NEXT: ret void // // +// CHECK3-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper +// CHECK3-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] { +// CHECK3-NEXT: entry: +// CHECK3-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2 +// CHECK3-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4 +// CHECK3-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 4 +// CHECK3-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4 +// CHECK3-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2 +// CHECK3-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4 +// CHECK3-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]]) +// CHECK3-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]]) #[[ATTR5:[0-9]+]] +// CHECK3-NEXT: ret void +// +// // CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23_worker -// CHECK3-SAME: () #[[ATTR5:[0-9]+]] { +// CHECK3-SAME: () #[[ATTR4]] { // CHECK3-NEXT: entry: // CHECK3-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 4 // CHECK3-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1 @@ -348,7 +393,7 @@ // CHECK3-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 // CHECK3-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] // CHECK3: .execute.parallel: -// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) +// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]]) // CHECK3-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)* // CHECK3-NEXT: call void [[TMP5]](i16 0, i32 [[TMP4]]) // CHECK3-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]] @@ -372,7 +417,7 @@ // CHECK3-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]] // CHECK3-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]] // CHECK3: .worker: -// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23_worker() #[[ATTR4:[0-9]+]] +// CHECK3-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l23_worker() #[[ATTR5]] // CHECK3-NEXT: br label [[DOTEXIT:%.*]] // CHECK3: .mastercheck: // CHECK3-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() diff --git a/clang/test/OpenMP/nvptx_nested_parallel_codegen.cpp b/clang/test/OpenMP/nvptx_nested_parallel_codegen.cpp --- a/clang/test/OpenMP/nvptx_nested_parallel_codegen.cpp +++ b/clang/test/OpenMP/nvptx_nested_parallel_codegen.cpp @@ -54,7 +54,7 @@ // CHECK1-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 // CHECK1-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] // CHECK1: .execute.parallel: -// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// CHECK1-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) // CHECK1-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8 // CHECK1-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*) // CHECK1-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] @@ -114,14 +114,14 @@ // CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] // CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) // CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK1-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK1-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) // CHECK1-NEXT: call void @_Z3usePi(i32* [[TMP0]]) #[[ATTR7:[0-9]+]] -// CHECK1-NEXT: call void @__kmpc_push_num_threads(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 2) +// CHECK1-NEXT: call void @__kmpc_push_num_threads(%struct.ident_t* @[[GLOB2]], i32 [[TMP6]], i32 2) // CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 // CHECK1-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP0]] to i8* // CHECK1-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 8 // CHECK1-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP9]], i64 1) +// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP6]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP9]], i64 1) // CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] // CHECK1: .termination.notifier: // CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1) @@ -136,7 +136,7 @@ // CHECK1-NEXT: entry: // CHECK1-NEXT: [[C_ADDR:%.*]] = alloca i32*, align 8 // CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8 -// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) // CHECK1-NEXT: store i32* [[C]], i32** [[C_ADDR]], align 8 // CHECK1-NEXT: call void @__kmpc_push_num_threads(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 2) // CHECK1-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0 @@ -260,7 +260,7 @@ // CHECK2-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 // CHECK2-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] // CHECK2: .execute.parallel: -// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// CHECK2-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) // CHECK2-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 // CHECK2-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*) // CHECK2-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] @@ -320,14 +320,14 @@ // CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] // CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) // CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK2-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK2-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) // CHECK2-NEXT: call void @_Z3usePi(i32* [[TMP0]]) #[[ATTR7:[0-9]+]] -// CHECK2-NEXT: call void @__kmpc_push_num_threads(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 2) +// CHECK2-NEXT: call void @__kmpc_push_num_threads(%struct.ident_t* @[[GLOB2]], i32 [[TMP6]], i32 2) // CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 // CHECK2-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP0]] to i8* // CHECK2-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4 // CHECK2-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP9]], i32 1) +// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP6]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP9]], i32 1) // CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] // CHECK2: .termination.notifier: // CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1) @@ -342,7 +342,7 @@ // CHECK2-NEXT: entry: // CHECK2-NEXT: [[C_ADDR:%.*]] = alloca i32*, align 4 // CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 -// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) // CHECK2-NEXT: store i32* [[C]], i32** [[C_ADDR]], align 4 // CHECK2-NEXT: call void @__kmpc_push_num_threads(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 2) // CHECK2-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 @@ -466,7 +466,7 @@ // CHECK3-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0 // CHECK3-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]] // CHECK3: .execute.parallel: -// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) +// CHECK3-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]]) // CHECK3-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 4 // CHECK3-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*) // CHECK3-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]] @@ -526,14 +526,14 @@ // CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]] // CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1) // CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack() -// CHECK3-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK3-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]]) // CHECK3-NEXT: call void @_Z3usePi(i32* [[TMP0]]) #[[ATTR7:[0-9]+]] -// CHECK3-NEXT: call void @__kmpc_push_num_threads(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 2) +// CHECK3-NEXT: call void @__kmpc_push_num_threads(%struct.ident_t* @[[GLOB2]], i32 [[TMP6]], i32 2) // CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0 // CHECK3-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP0]] to i8* // CHECK3-NEXT: store i8* [[TMP8]], i8** [[TMP7]], align 4 // CHECK3-NEXT: [[TMP9:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** -// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP9]], i32 1) +// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP6]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP9]], i32 1) // CHECK3-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]] // CHECK3: .termination.notifier: // CHECK3-NEXT: call void @__kmpc_kernel_deinit(i16 1) @@ -548,7 +548,7 @@ // CHECK3-NEXT: entry: // CHECK3-NEXT: [[C_ADDR:%.*]] = alloca i32*, align 4 // CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 4 -// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]]) +// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]]) // CHECK3-NEXT: store i32* [[C]], i32** [[C_ADDR]], align 4 // CHECK3-NEXT: call void @__kmpc_push_num_threads(%struct.ident_t* @[[GLOB1]], i32 [[TMP0]], i32 2) // CHECK3-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0