diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -253,6 +253,7 @@ LANGOPT(OpenMPThreadSubscription , 1, 0, "Assume work-shared loops do not have more iterations than participating threads.") LANGOPT(OpenMPTeamSubscription , 1, 0, "Assume distributed loops do not have more iterations than participating teams.") LANGOPT(OpenMPNoThreadState , 1, 0, "Assume that no thread in a parallel region will modify an ICV.") +LANGOPT(OpenMPNoNestedParallelism , 1, 0, "Assume that no thread in a parallel region will encounter a parallel region") LANGOPT(OpenMPOffloadMandatory , 1, 0, "Assert that offloading is mandatory and do not create a host fallback.") LANGOPT(NoGPULib , 1, 0, "Indicate a build without the standard GPU libraries.") LANGOPT(RenderScript , 1, 0, "RenderScript") diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -2580,6 +2580,10 @@ Flags<[CC1Option, NoArgumentUnused, HelpHidden]>, HelpText<"Assert no thread in a parallel region modifies an ICV">, MarshallingInfoFlag>; +def fopenmp_assume_no_nested_parallelism : Flag<["-"], "fopenmp-assume-no-nested-parallelism">, Group, + Flags<[CC1Option, NoArgumentUnused, HelpHidden]>, + HelpText<"Assert no nested parallel regions in the GPU">, + MarshallingInfoFlag>; def fopenmp_offload_mandatory : Flag<["-"], "fopenmp-offload-mandatory">, Group, Flags<[CC1Option, NoArgumentUnused]>, HelpText<"Do not create a host fallback if offloading to the device fails.">, diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -1213,6 +1213,8 @@ "__omp_rtl_assume_threads_oversubscription"); OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoThreadState, "__omp_rtl_assume_no_thread_state"); + OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoNestedParallelism, + "__omp_rtl_assume_no_nested_parallelism"); } void CGOpenMPRuntimeGPU::emitProcBindClause(CodeGenFunction &CGF, diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -6132,6 +6132,8 @@ CmdArgs.push_back("-fopenmp-assume-threads-oversubscription"); if (Args.hasArg(options::OPT_fopenmp_assume_no_thread_state)) CmdArgs.push_back("-fopenmp-assume-no-thread-state"); + if (Args.hasArg(options::OPT_fopenmp_assume_no_nested_parallelism)) + CmdArgs.push_back("-fopenmp-assume-no-nested-parallelism"); if (Args.hasArg(options::OPT_fopenmp_offload_mandatory)) CmdArgs.push_back("-fopenmp-offload-mandatory"); break; @@ -8417,29 +8419,6 @@ } } - // Get the AMDGPU math libraries. - // FIXME: This method is bad, remove once AMDGPU has a proper math library - // (see AMDGCN::OpenMPLinker::constructLLVMLinkCommand). - for (auto &I : llvm::make_range(OpenMPTCRange.first, OpenMPTCRange.second)) { - const ToolChain *TC = I.second; - - if (!TC->getTriple().isAMDGPU() || Args.hasArg(options::OPT_nogpulib)) - continue; - - const ArgList &TCArgs = C.getArgsForToolChain(TC, "", Action::OFK_OpenMP); - StringRef Arch = TCArgs.getLastArgValue(options::OPT_march_EQ); - const toolchains::ROCMToolChain RocmTC(TC->getDriver(), TC->getTriple(), - TCArgs); - - SmallVector BCLibs = - RocmTC.getCommonDeviceLibNames(TCArgs, Arch.str()); - - for (StringRef LibName : BCLibs) - CmdArgs.push_back(Args.MakeArgString( - "--bitcode-library=" + Action::GetOffloadKindName(Action::OFK_OpenMP) + - "-" + TC->getTripleString() + "-" + Arch + "=" + LibName)); - } - if (D.isUsingLTO(/* IsOffload */ true)) { // Pass in the optimization level to use for LTO. if (const Arg *A = Args.getLastArg(options::OPT_O_Group)) { diff --git a/clang/test/OpenMP/target_globals_codegen.cpp b/clang/test/OpenMP/target_globals_codegen.cpp --- a/clang/test/OpenMP/target_globals_codegen.cpp +++ b/clang/test/OpenMP/target_globals_codegen.cpp @@ -7,6 +7,7 @@ // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-threads-oversubscription -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-THREADS // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-teams-oversubscription -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-TEAMS // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-no-thread-state -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-STATE +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-no-nested-parallelism -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-NESTED // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -nogpulib -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-RUNTIME // RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-teams-oversubscription -fopenmp-is-device -o - | FileCheck %s --check-prefix=CHECK-RUNTIME // expected-no-diagnostics @@ -19,36 +20,49 @@ // CHECK: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0 // CHECK: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0 // CHECK: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0 +// CHECK: @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden constant i32 0 //. // CHECK-EQ: @__omp_rtl_debug_kind = weak_odr hidden constant i32 111 // CHECK-EQ: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0 // CHECK-EQ: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0 // CHECK-EQ: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0 +// CHECK-EQ: @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden constant i32 0 //. // CHECK-DEFAULT: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0 // CHECK-DEFAULT: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0 // CHECK-DEFAULT: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0 // CHECK-DEFAULT: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0 +// CHECK-DEFAULT: @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden constant i32 0 //. // CHECK-THREADS: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0 // CHECK-THREADS: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0 // CHECK-THREADS: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 1 // CHECK-THREADS: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0 +// CHECK-THREADS: @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden constant i32 0 //. // CHECK-TEAMS: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0 // CHECK-TEAMS: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 1 // CHECK-TEAMS: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0 // CHECK-TEAMS: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0 +// CHECK-TEAMS: @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden constant i32 0 //. // CHECK-STATE: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0 // CHECK-STATE: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0 // CHECK-STATE: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0 // CHECK-STATE: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 1 +// CHECK-STATE: @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden constant i32 0 +//. +// CHECK-NESTED: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0 +// CHECK-NESTED: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 0 +// CHECK-NESTED: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0 +// CHECK-NESTED: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0 +// CHECK-NESTED: @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden constant i32 1 //. // CHECK-RUNTIME-NOT: @__omp_rtl_debug_kind = weak_odr hidden constant i32 0 // CHECK-RUNTIME-NOT: @__omp_rtl_assume_teams_oversubscription = weak_odr hidden constant i32 1 // CHECK-RUNTIME-NOT: @__omp_rtl_assume_threads_oversubscription = weak_odr hidden constant i32 0 // CHECK-RUNTIME-NOT: @__omp_rtl_assume_no_thread_state = weak_odr hidden constant i32 0 +// CHECK-RUNTIME-NOT: @__omp_rtl_assume_no_nested_parallelism = weak_odr hidden constant i32 0 //. void foo() { #pragma omp target diff --git a/openmp/libomptarget/DeviceRTL/include/Configuration.h b/openmp/libomptarget/DeviceRTL/include/Configuration.h --- a/openmp/libomptarget/DeviceRTL/include/Configuration.h +++ b/openmp/libomptarget/DeviceRTL/include/Configuration.h @@ -44,6 +44,10 @@ /// explicitly disabled by the user. bool mayUseThreadStates(); +/// Indicates if this kernel may require data environments for nested +/// parallelism, or if it was explicitly disabled by the user. +bool mayUseNestedParallelism(); + } // namespace config } // namespace _OMP diff --git a/openmp/libomptarget/DeviceRTL/src/Configuration.cpp b/openmp/libomptarget/DeviceRTL/src/Configuration.cpp --- a/openmp/libomptarget/DeviceRTL/src/Configuration.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Configuration.cpp @@ -23,6 +23,7 @@ // defined by CGOpenMPRuntimeGPU extern uint32_t __omp_rtl_debug_kind; extern uint32_t __omp_rtl_assume_no_thread_state; +extern uint32_t __omp_rtl_assume_no_nested_parallelism; // TODO: We want to change the name as soon as the old runtime is gone. // This variable should be visibile to the plugin so we override the default @@ -52,4 +53,8 @@ bool config::mayUseThreadStates() { return !__omp_rtl_assume_no_thread_state; } +bool config::mayUseNestedParallelism() { + return !__omp_rtl_assume_no_nested_parallelism; +} + #pragma omp end declare target diff --git a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp --- a/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp +++ b/openmp/libomptarget/DeviceRTL/src/Parallelism.cpp @@ -86,11 +86,16 @@ uint32_t TId = mapping::getThreadIdInBlock(); + // Assert the parallelism level is zero if disabled by the user. + if (!config::mayUseNestedParallelism()) + ASSERT(icv::Level == 0 && "nested parallelism while disabled"); + // Handle the serialized case first, same for SPMD/non-SPMD: // 1) if-clause(0) - // 2) nested parallel regions - // 3) parallel in task or other thread state inducing construct - if (OMP_UNLIKELY(!if_expr || icv::Level || state::HasThreadState)) { + // 2) parallel in task or other thread state inducing construct + // 3) nested parallel regions + if (OMP_UNLIKELY(!if_expr || state::HasThreadState || + (icv::Level && config::mayUseNestedParallelism()))) { state::DateEnvironmentRAII DERAII(ident); ++icv::Level; invokeMicrotask(TId, 0, fn, args, nargs);