diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -599,6 +599,12 @@ return true; } + /// Returns true if this kernel may contain any OpenMP parallel regions. + bool mayContainParallelRegion() { + return !ReachedKnownParallelRegions.empty() || + !ReachedUnknownParallelRegions.empty(); + } + /// Return empty set as the best state of potential values. static KernelInfoState getBestState() { return KernelInfoState(true); } @@ -3003,7 +3009,7 @@ // If we can we change the execution mode to SPMD-mode otherwise we build a // custom state machine. - if (!changeToSPMDMode(A)) + if (!mayContainParallelRegion() || !changeToSPMDMode(A)) buildCustomStateMachine(A); return ChangeStatus::CHANGED; @@ -3308,8 +3314,7 @@ // happen if there simply are no parallel regions. In the resulting kernel // all worker threads will simply exit right away, leaving the main thread // to do the work alone. 
- if (ReachedKnownParallelRegions.empty() && - ReachedUnknownParallelRegions.empty()) { + if (!mayContainParallelRegion()) { ++NumOpenMPTargetRegionKernelsWithoutStateMachine; auto Remark = [&](OptimizationRemark OR) { diff --git a/llvm/test/Transforms/OpenMP/always_inline_device.ll b/llvm/test/Transforms/OpenMP/always_inline_device.ll --- a/llvm/test/Transforms/OpenMP/always_inline_device.ll +++ b/llvm/test/Transforms/OpenMP/always_inline_device.ll @@ -12,11 +12,11 @@ ; CHECK: Function Attrs: convergent norecurse nounwind ; CHECK-LABEL: @__omp_offloading_fd02_c0934fc2_foo_l4( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 true, i1 false, i1 false) +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* @[[GLOB1:[0-9]+]], i1 false, i1 false, i1 true) ; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP0]], -1 ; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] ; CHECK: user_code.entry: -; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 true, i1 false) +; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* @[[GLOB1]], i1 false, i1 true) ; CHECK-NEXT: ret void ; CHECK: worker.exit: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll b/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll --- a/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll +++ b/llvm/test/Transforms/OpenMP/get_hardware_num_threads_in_block_fold.ll @@ -54,18 +54,40 @@ define weak void @kernel2() #0 { ; CHECK-LABEL: define {{[^@]+}}@kernel2 ; CHECK-SAME: () #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 ; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 true, i1 false, i1 false) +; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[I]], -1 +; 
CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] +; CHECK: common.ret: +; CHECK-NEXT: ret void +; CHECK: user_code.entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* null) #[[ATTR1]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** ; CHECK-NEXT: call void @helper0() #[[ATTR1]] ; CHECK-NEXT: call void @helper1() #[[ATTR1]] ; CHECK-NEXT: call void @helper2() #[[ATTR1]] +; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* null, i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP1]], i64 0) ; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* null, i1 true, i1 false) ; CHECK-NEXT: ret void ; - %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 false, i1 false, i1 false) +entry: + %captured_vars_addrs = alloca [0 x i8*], align 8 + %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 false, i1 true, i1 true) + %exec_user_code = icmp eq i32 %i, -1 + br i1 %exec_user_code, label %user_code.entry, label %common.ret + +common.ret: + ret void + +user_code.entry: + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* null) + %1 = bitcast [0 x i8*]* %captured_vars_addrs to i8** call void @helper0() call void @helper1() call void @helper2() - call void @__kmpc_target_deinit(%struct.ident_t* null, i1 false, i1 false) + call void @__kmpc_parallel_51(%struct.ident_t* null, i32 %0, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** %1, i64 0) + call void @__kmpc_target_deinit(%struct.ident_t* null, i1 false, i1 true) ret void } @@ -136,9 +158,31 @@ ret void } +define internal void @__omp_outlined__(i32* noalias %.global_tid., i32* noalias %.bound_tid.) 
{ +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__ +; CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: ret void +; +entry: + ret void +} + +define internal void @__omp_outlined___wrapper(i16 zeroext %0, i32 %1) { +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined___wrapper +; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: ret void +; +entry: + ret void +} + declare i32 @__kmpc_get_hardware_num_threads_in_block() declare i32 @__kmpc_target_init(%struct.ident_t*, i1 zeroext, i1 zeroext, i1 zeroext) #1 declare void @__kmpc_target_deinit(%struct.ident_t* nocapture readnone, i1 zeroext, i1 zeroext) #1 +declare void @__kmpc_parallel_51(%struct.ident_t*, i32, i32, i32, i32, i8*, i8*, i8**, i64) +declare i32 @__kmpc_global_thread_num(%struct.ident_t*) !llvm.module.flags = !{!0, !1} @@ -155,7 +199,8 @@ ;. ; CHECK: attributes #[[ATTR0]] = { "omp_target_num_teams"="777" "omp_target_thread_limit"="666" } ; CHECK: attributes #[[ATTR1]] = { nounwind } -; CHECK: attributes #[[ATTR2:[0-9]+]] = { convergent nounwind } +; CHECK: attributes #[[ATTR2:[0-9]+]] = { alwaysinline } +; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nounwind } ;. 
; CHECK: [[META0:![0-9]+]] = !{i32 7, !"openmp", i32 50} ; CHECK: [[META1:![0-9]+]] = !{i32 7, !"openmp-device", i32 50} diff --git a/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll b/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll --- a/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll +++ b/llvm/test/Transforms/OpenMP/is_spmd_exec_mode_fold.ll @@ -38,14 +38,36 @@ define weak void @will_be_spmd() { ; CHECK-LABEL: define {{[^@]+}}@will_be_spmd() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8 ; CHECK-NEXT: [[I:%.*]] = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 true, i1 false, i1 false) +; CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[I]], -1 +; CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[COMMON_RET:%.*]] +; CHECK: common.ret: +; CHECK-NEXT: ret void +; CHECK: user_code.entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* null) #[[ATTR2:[0-9]+]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** ; CHECK-NEXT: call void @is_spmd_helper2() +; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* null, i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP1]], i64 0) ; CHECK-NEXT: call void @__kmpc_target_deinit(%struct.ident_t* null, i1 true, i1 false) ; CHECK-NEXT: ret void ; - %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 false, i1 false, i1 false) +entry: + %captured_vars_addrs = alloca [0 x i8*], align 8 + %i = call i32 @__kmpc_target_init(%struct.ident_t* null, i1 false, i1 true, i1 true) + %exec_user_code = icmp eq i32 %i, -1 + br i1 %exec_user_code, label %user_code.entry, label %common.ret + +common.ret: + ret void + +user_code.entry: + %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* null) + %1 = bitcast [0 x i8*]* %captured_vars_addrs to 
i8** call void @is_spmd_helper2() - call void @__kmpc_target_deinit(%struct.ident_t* null, i1 false, i1 false) + call void @__kmpc_parallel_51(%struct.ident_t* null, i32 %0, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** %1, i64 0) + call void @__kmpc_target_deinit(%struct.ident_t* null, i1 false, i1 true) ret void } @@ -153,10 +175,32 @@ ret void } +define internal void @__omp_outlined__(i32* noalias %.global_tid., i32* noalias %.bound_tid.) { +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined__ +; CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: ret void +; +entry: + ret void +} + +define internal void @__omp_outlined___wrapper(i16 zeroext %0, i32 %1) { +; CHECK-LABEL: define {{[^@]+}}@__omp_outlined___wrapper +; CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: ret void +; +entry: + ret void +} + declare void @spmd_compatible() "llvm.assume"="ompx_spmd_amenable" declare i8 @__kmpc_is_spmd_exec_mode() -declare i32 @__kmpc_target_init(%struct.ident_t*, i1 zeroext, i1 zeroext, i1 zeroext) #1 -declare void @__kmpc_target_deinit(%struct.ident_t* nocapture readnone, i1 zeroext, i1 zeroext) #1 +declare i32 @__kmpc_target_init(%struct.ident_t*, i1 zeroext, i1 zeroext, i1 zeroext) +declare void @__kmpc_target_deinit(%struct.ident_t* nocapture readnone, i1 zeroext, i1 zeroext) +declare void @__kmpc_parallel_51(%struct.ident_t*, i32, i32, i32, i32, i8*, i8*, i8**, i64) +declare i32 @__kmpc_global_thread_num(%struct.ident_t*) declare void @foo() declare void @bar() @@ -171,6 +215,8 @@ !5 = !{void ()* @will_not_be_spmd, !"kernel", i32 1} ;. ; CHECK: attributes #[[ATTR0:[0-9]+]] = { "llvm.assume"="ompx_spmd_amenable" } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { alwaysinline } +; CHECK: attributes #[[ATTR2]] = { nounwind } ;. 
; CHECK: [[META0:![0-9]+]] = !{i32 7, !"openmp", i32 50} ; CHECK: [[META1:![0-9]+]] = !{i32 7, !"openmp-device", i32 50}