Save and restore the OpenMP GPU codegen state around each emitted function.

emitFunctionProlog() records CurrentExecutionMode and the three region flags
in SavedExecutionModes (keyed by the declaration being emitted) and resets
them, unless globalization is delayed for that declaration. functionFinished()
restores the saved values and drops the map entry. The data-sharing-mode early
return in emitFunctionProlog() moves below the save, so the state is saved for
every data-sharing mode. The lit test expectations are updated to match.

NOTE(review): this patch was recovered from a whitespace-collapsed copy.
Blank context lines and indentation were reconstructed from the hunk line
counts, and the template arguments of the two llvm::DenseMap declarations in
CGOpenMPRuntimeGPU.h were restored by inference. Confirm both against the
tree before applying.

diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.h
@@ -427,6 +427,24 @@
 
   /// true if we're definitely in the parallel region.
   bool IsInParallelRegion = false;
+  /// Snapshot of the execution mode and region flags, taken when emission of a
+  /// function starts so that they can be put back once it is finished.
+  struct StateMode {
+    ExecutionMode SavedExecutionMode = EM_Unknown;
+    bool SavedIsInTargetMasterThreadRegion = false;
+    bool SavedIsInTTDRegion = false;
+    bool SavedIsInParallelRegion = false;
+    StateMode(ExecutionMode SavedExecutionMode,
+              bool SavedIsInTargetMasterThreadRegion, bool SavedIsInTTDRegion,
+              bool SavedIsInParallelRegion)
+        : SavedExecutionMode(SavedExecutionMode),
+          SavedIsInTargetMasterThreadRegion(SavedIsInTargetMasterThreadRegion),
+          SavedIsInTTDRegion(SavedIsInTTDRegion),
+          SavedIsInParallelRegion(SavedIsInParallelRegion) {}
+  };
+  /// State saved by emitFunctionProlog for each declaration being emitted;
+  /// the entry is restored and dropped again in functionFinished.
+  llvm::DenseMap<CanonicalDeclPtr<const Decl>, StateMode> SavedExecutionModes;
 
   /// Map between an outlined function and its wrapper.
   llvm::DenseMap<llvm::Function *, llvm::Function *> WrapperFunctionsMap;
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -4311,9 +4311,6 @@
 
 void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF,
                                             const Decl *D) {
-  if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic)
-    return;
-
   assert(D && "Expected function or captured|block decl.");
   assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
          "Function is registered already.");
@@ -4332,6 +4329,20 @@
         getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
       return;
   }
+  // Stash the current mode and region flags under this declaration and reset
+  // them for the body being emitted; functionFinished() restores them.
+  if (!NeedToDelayGlobalization) {
+    SavedExecutionModes.try_emplace(D, CurrentExecutionMode,
+                                    IsInTargetMasterThreadRegion, IsInTTDRegion,
+                                    IsInParallelRegion);
+    CurrentExecutionMode = EM_Unknown;
+    IsInTargetMasterThreadRegion = false;
+    IsInTTDRegion = false;
+    IsInParallelRegion = false;
+  }
+  if (getDataSharingMode(CGM) != CGOpenMPRuntimeGPU::Generic)
+    return;
+
   if (!Body)
     return;
   CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
@@ -4457,6 +4468,15 @@
 }
 
 void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) {
+  // Put back whatever emitFunctionProlog() saved for this declaration.
+  auto I = SavedExecutionModes.find(CGF.CurCodeDecl);
+  if (I != SavedExecutionModes.end()) {
+    CurrentExecutionMode = I->second.SavedExecutionMode;
+    IsInTargetMasterThreadRegion = I->second.SavedIsInTargetMasterThreadRegion;
+    IsInTTDRegion = I->second.SavedIsInTTDRegion;
+    IsInParallelRegion = I->second.SavedIsInParallelRegion;
+    SavedExecutionModes.erase(I);
+  }
   FunctionGlobalizedDecls.erase(CGF.CurFn);
   CGOpenMPRuntime::functionFinished(CGF);
 }
diff --git a/clang/test/OpenMP/declare_target_codegen_globalization.cpp b/clang/test/OpenMP/declare_target_codegen_globalization.cpp
--- a/clang/test/OpenMP/declare_target_codegen_globalization.cpp
+++ b/clang/test/OpenMP/declare_target_codegen_globalization.cpp
@@ -37,11 +37,14 @@
 // CHECK: define {{.*}}[[BAR]]()
 // CHECK: alloca i32,
 // CHECK: [[A_LOCAL_ADDR:%.+]] = alloca i32,
+// CHECK: [[PL:%.+]] = call i16 @__kmpc_parallel_level(
+// CHECK: [[IS_IN_PARALLEL:%.+]] = icmp eq i16 [[PL]], 0
 // CHECK: [[RES:%.+]] = call i8 @__kmpc_is_spmd_exec_mode()
 // CHECK: [[IS_SPMD:%.+]] = icmp ne i8 [[RES]], 0
 // CHECK: br i1 [[IS_SPMD]], label
 // CHECK: br label
-// CHECK: [[RES:%.+]] = call i8* @__kmpc_data_sharing_coalesced_push_stack(i64 128, i16 0)
+// CHECK: [[SZ:%.+]] = select i1 [[IS_IN_PARALLEL]], i64 4, i64 128
+// CHECK: [[RES:%.+]] = call i8* @__kmpc_data_sharing_coalesced_push_stack(i64 [[SZ]], i16 0)
 // CHECK: [[GLOBALS:%.+]] = bitcast i8* [[RES]] to [[GLOBAL_ST:%.+]]*
 // CHECK: br label
 // CHECK: [[ITEMS:%.+]] = phi [[GLOBAL_ST]]* [ null, {{.+}} ], [ [[GLOBALS]], {{.+}} ]
@@ -49,7 +52,9 @@
 // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
 // CHECK: [[LID:%.+]] = and i32 [[TID]], 31
 // CHECK: [[A_GLOBAL_ADDR:%.+]] = getelementptr inbounds [32 x i32], [32 x i32]* [[A_ADDR]], i32 0, i32 [[LID]]
-// CHECK: [[A_ADDR:%.+]] = select i1 [[IS_SPMD]], i32* [[A_LOCAL_ADDR]], i32* [[A_GLOBAL_ADDR]]
+// CHECK: [[A_GLOBAL_PARALLEL_ADDR:%.+]] = getelementptr inbounds %{{.+}}, %{{.+}}* %{{.+}}, i32 0, i32 0
+// CHECK: [[A_PARALLEL_ADDR:%.+]] = select i1 [[IS_IN_PARALLEL]], i32* [[A_GLOBAL_PARALLEL_ADDR]], i32* [[A_GLOBAL_ADDR]]
+// CHECK: [[A_ADDR:%.+]] = select i1 [[IS_SPMD]], i32* [[A_LOCAL_ADDR]], i32* [[A_PARALLEL_ADDR]]
 // CHECK: call {{.*}}[[FOO]](i32* nonnull align {{[0-9]+}} dereferenceable{{.*}} [[A_ADDR]])
 // CHECK: br i1 [[IS_SPMD]], label
 // CHECK: [[BC:%.+]] = bitcast [[GLOBAL_ST]]* [[ITEMS]] to i8*
diff --git a/clang/test/OpenMP/remarks_parallel_in_multiple_target_state_machines.c b/clang/test/OpenMP/remarks_parallel_in_multiple_target_state_machines.c
--- a/clang/test/OpenMP/remarks_parallel_in_multiple_target_state_machines.c
+++ b/clang/test/OpenMP/remarks_parallel_in_multiple_target_state_machines.c
@@ -98,5 +98,5 @@
   }
 }
 
-// all-remark@* 3 {{OpenMP runtime call __kmpc_global_thread_num moved to}}
-// all-remark@* 3 {{OpenMP runtime call __kmpc_global_thread_num deduplicated}}
+// all-remark@* 5 {{OpenMP runtime call __kmpc_global_thread_num moved to}}
+// all-remark@* 12 {{OpenMP runtime call __kmpc_global_thread_num deduplicated}}
diff --git a/clang/test/OpenMP/remarks_parallel_in_target_state_machine.c b/clang/test/OpenMP/remarks_parallel_in_target_state_machine.c
--- a/clang/test/OpenMP/remarks_parallel_in_target_state_machine.c
+++ b/clang/test/OpenMP/remarks_parallel_in_target_state_machine.c
@@ -44,4 +44,4 @@
 }
 
 // expected-remark@* {{OpenMP runtime call __kmpc_global_thread_num moved to}}
-// expected-remark@* {{OpenMP runtime call __kmpc_global_thread_num deduplicated}}
+// expected-remark@* 2 {{OpenMP runtime call __kmpc_global_thread_num deduplicated}}