diff --git a/clang/test/OpenMP/remarks_parallel_in_multiple_target_state_machines.c b/clang/test/OpenMP/remarks_parallel_in_multiple_target_state_machines.c
new file mode 100644
--- /dev/null
+++ b/clang/test/OpenMP/remarks_parallel_in_multiple_target_state_machines.c
@@ -0,0 +1,102 @@
+// RUN: %clang_cc1 -verify=host -Rpass=openmp -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify=all,safe -Rpass=openmp -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
+// RUN: %clang_cc1 -fexperimental-new-pass-manager -verify=all,safe -Rpass=openmp -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
+
+// host-no-diagnostics
+
+void bar1(void) {
+#pragma omp parallel // #0
+  // all-remark@#0 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}}
+  // safe-remark@#0 {{Parallel region is not known to be called from a unique single target region, maybe the surrounding function has external linkage?; will not attempt to rewrite the state machine use.}}
+  // force-remark@#0 {{[UNSAFE] Parallel region is not known to be called from a unique single target region, maybe the surrounding function has external linkage?; will rewrite the state machine use due to command line flag, this can lead to undefined behavior if the parallel region is called from a target region outside this translation unit.}}
+  // force-remark@#0 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__2_wrapper, kernel ID: }}
+  {
+  }
+}
+void bar2(void) {
+#pragma omp parallel // #1
+  // all-remark@#1 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}}
+  // safe-remark@#1 {{Parallel region is not known to be called from a unique single target region, maybe the surrounding function has external linkage?; will not attempt to rewrite the state machine use.}}
+  // force-remark@#1 {{[UNSAFE] Parallel region is not known to be called from a unique single target region, maybe the surrounding function has external linkage?; will rewrite the state machine use due to command line flag, this can lead to undefined behavior if the parallel region is called from a target region outside this translation unit.}}
+  // force-remark@#1 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__6_wrapper, kernel ID: }}
+  {
+  }
+}
+
+void foo1(void) {
+#pragma omp target teams // #2
+  // all-remark@#2 {{Target region containing the parallel region that is specialized. (parallel region ID: __omp_outlined__1_wrapper, kernel ID: __omp_offloading_22}}
+  // all-remark@#2 {{Target region containing the parallel region that is specialized. (parallel region ID: __omp_outlined__3_wrapper, kernel ID: __omp_offloading_22}}
+  {
+#pragma omp parallel // #3
+    // all-remark@#3 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}}
+    // all-remark@#3 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__1_wrapper, kernel ID: __omp_offloading_22}}
+    {
+    }
+    bar1();
+#pragma omp parallel // #4
+    // all-remark@#4 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}}
+    // all-remark@#4 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__3_wrapper, kernel ID: __omp_offloading_22}}
+    {
+    }
+  }
+}
+
+void foo2(void) {
+#pragma omp target teams // #5
+  // all-remark@#5 {{Target region containing the parallel region that is specialized. (parallel region ID: __omp_outlined__5_wrapper, kernel ID: __omp_offloading_22}}
+  // all-remark@#5 {{Target region containing the parallel region that is specialized. (parallel region ID: __omp_outlined__7_wrapper, kernel ID: __omp_offloading_22}}
+  {
+#pragma omp parallel // #6
+    // all-remark@#6 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}}
+    // all-remark@#6 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__5_wrapper, kernel ID: __omp_offloading_22}}
+    {
+    }
+    bar1();
+    bar2();
+#pragma omp parallel // #7
+    // all-remark@#7 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}}
+    // all-remark@#7 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__7_wrapper, kernel ID: __omp_offloading_22}}
+    {
+    }
+    bar1();
+    bar2();
+  }
+}
+
+void foo3(void) {
+#pragma omp target teams // #8
+  // all-remark@#8 {{Target region containing the parallel region that is specialized. (parallel region ID: __omp_outlined__9_wrapper, kernel ID: __omp_offloading_22}}
+  // all-remark@#8 {{Target region containing the parallel region that is specialized. (parallel region ID: __omp_outlined__10_wrapper, kernel ID: __omp_offloading_22}}
+  {
+#pragma omp parallel // #9
+    // all-remark@#9 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}}
+    // all-remark@#9 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__9_wrapper, kernel ID: __omp_offloading_22}}
+    {
+    }
+    bar1();
+    bar2();
+#pragma omp parallel // #10
+    // all-remark@#10 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}}
+    // all-remark@#10 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__10_wrapper, kernel ID: __omp_offloading_22}}
+    {
+    }
+    bar1();
+    bar2();
+  }
+}
+
+void spmd(void) {
+  // Verify we do not emit the remarks above for "SPMD" regions.
+#pragma omp target teams
+#pragma omp parallel
+  {
+  }
+
+#pragma omp target teams distribute parallel for
+  for (int i = 0; i < 100; ++i) {
+  }
+}
+
+// all-remark@* 3 {{OpenMP runtime call __kmpc_global_thread_num moved to}}
+// all-remark@* 3 {{OpenMP runtime call __kmpc_global_thread_num deduplicated}}
diff --git a/clang/test/OpenMP/remarks_parallel_in_target_state_machine.c b/clang/test/OpenMP/remarks_parallel_in_target_state_machine.c
new file mode 100644
--- /dev/null
+++ b/clang/test/OpenMP/remarks_parallel_in_target_state_machine.c
@@ -0,0 +1,47 @@
+// RUN: %clang_cc1 -verify=host -Rpass=openmp -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -Rpass=openmp -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
+// RUN: %clang_cc1 -fexperimental-new-pass-manager -verify -Rpass=openmp -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
+
+// host-no-diagnostics
+
+void bar(void) {
+#pragma omp parallel // #1 \
+  // expected-remark@#1 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}} \
+  // expected-remark@#1 {{Parallel region is not known to be called from a unique single target region, maybe the surrounding function has external linkage?; will not attempt to rewrite the state machine use.}}
+  {
+  }
+}
+
+void foo(void) {
+#pragma omp target teams // #2 \
+  // expected-remark@#2 {{Target region containing the parallel region that is specialized. (parallel region ID: __omp_outlined__1_wrapper, kernel ID: __omp_offloading_22}} \
+  // expected-remark@#2 {{Target region containing the parallel region that is specialized. (parallel region ID: __omp_outlined__3_wrapper, kernel ID: __omp_offloading_22}}
+  {
+#pragma omp parallel // #3 \
+  // expected-remark@#3 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}} \
+  // expected-remark@#3 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__1_wrapper, kernel ID: __omp_offloading_22}}
+    {
+    }
+    bar();
+#pragma omp parallel // #4 \
+  // expected-remark@#4 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nested inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}} \
+  // expected-remark@#4 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__3_wrapper, kernel ID: __omp_offloading_22}}
+    {
+    }
+  }
+}
+
+void spmd(void) {
+  // Verify we do not emit the remarks above for "SPMD" regions.
+#pragma omp target teams
+#pragma omp parallel
+  {
+  }
+
+#pragma omp target teams distribute parallel for
+  for (int i = 0; i < 100; ++i) {
+  }
+}
+
+// expected-remark@* {{OpenMP runtime call __kmpc_global_thread_num moved to}}
+// expected-remark@* {{OpenMP runtime call __kmpc_global_thread_num deduplicated}}
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -1033,6 +1033,7 @@
   // Check if the function is used in a __kmpc_kernel_prepare_parallel call at
   // all.
   bool UnknownUse = false;
+  bool KernelPrepareUse = false;
   unsigned NumDirectCalls = 0;

   SmallVector<Use *, 8> ToBeReplacedStateMachineUses;
@@ -1047,33 +1048,92 @@
         ToBeReplacedStateMachineUses.push_back(&U);
         return;
       }
-      if (OpenMPOpt::getCallIfRegularCall(*U.getUser(),
-                                          &KernelPrepareParallelRFI)) {
+      if (!KernelPrepareUse && OpenMPOpt::getCallIfRegularCall(
+                                   *U.getUser(), &KernelPrepareParallelRFI)) {
+        KernelPrepareUse = true;
         ToBeReplacedStateMachineUses.push_back(&U);
         return;
       }
       UnknownUse = true;
     });

-    // If this ever hits, we should investigate.
-    if (UnknownUse || NumDirectCalls != 1)
+    // Do not emit a remark if we haven't seen a __kmpc_kernel_prepare_parallel
+    // use.
+    if (!KernelPrepareUse)
       continue;

-    // TODO: This is not a necessary restriction and should be lifted.
-    if (ToBeReplacedStateMachineUses.size() != 2)
+    {
+      auto Remark = [&](OptimizationRemark OR) {
+        return OR << "Found a parallel region that is called in a target "
+                     "region but not part of a combined target construct nor "
+                     "nested inside a target construct without intermediate "
+                     "code. This can lead to excessive register usage for "
+                     "unrelated target regions in the same translation unit "
+                     "due to spurious call edges assumed by ptxas.";
+      };
+      emitRemarkOnFunction(F, "OpenMPParallelRegionInNonSPMD", Remark);
+    }
+
+    // If this ever hits, we should investigate.
+    // TODO: Checking the number of uses is not a necessary restriction and
+    // should be lifted.
+    if (UnknownUse || NumDirectCalls != 1 ||
+        ToBeReplacedStateMachineUses.size() != 2) {
+      {
+        auto Remark = [&](OptimizationRemark OR) {
+          return OR << "Parallel region is used in "
+                    << (UnknownUse ? "unknown" : "unexpected")
+                    << " ways; will not attempt to rewrite the state machine.";
+        };
+        emitRemarkOnFunction(F, "OpenMPParallelRegionInNonSPMD", Remark);
+      }
       continue;
+    }

     // Even if we have __kmpc_kernel_prepare_parallel calls, we (for now) give
     // up if the function is not called from a unique kernel.
     Kernel K = getUniqueKernelFor(*F);
-    if (!K)
+    if (!K) {
+      {
+        auto Remark = [&](OptimizationRemark OR) {
+          return OR << "Parallel region is not known to be called from a "
+                       "unique single target region, maybe the surrounding "
+                       "function has external linkage?; will not attempt to "
+                       "rewrite the state machine use.";
+        };
+        emitRemarkOnFunction(F, "OpenMPParallelRegionInMultipleKernels",
+                             Remark);
+      }
       continue;
+    }

     // We now know F is a parallel body function called only from the kernel K.
     // We also identified the state machine uses in which we replace the
     // function pointer by a new global symbol for identification purposes. This
     // ensures only direct calls to the function are left.
+    {
+      auto RemarkParallelRegion = [&](OptimizationRemark OR) {
+        return OR << "Specialize parallel region that is only reached from a "
+                     "single target region to avoid spurious call edges and "
+                     "excessive register usage in other target regions. "
+                     "(parallel region ID: "
+                  << ore::NV("OpenMPParallelRegion", F->getName())
+                  << ", kernel ID: "
+                  << ore::NV("OpenMPTargetRegion", K->getName()) << ")";
+      };
+      emitRemarkOnFunction(F, "OpenMPParallelRegionInNonSPMD",
+                           RemarkParallelRegion);
+      auto RemarkKernel = [&](OptimizationRemark OR) {
+        return OR << "Target region containing the parallel region that is "
+                     "specialized. (parallel region ID: "
+                  << ore::NV("OpenMPParallelRegion", F->getName())
+                  << ", kernel ID: "
+                  << ore::NV("OpenMPTargetRegion", K->getName()) << ")";
+      };
+      emitRemarkOnFunction(K, "OpenMPParallelRegionInNonSPMD", RemarkKernel);
+    }
+
     Module &M = *F->getParent();
     Type *Int8Ty = Type::getInt8Ty(M.getContext());
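
For reference, a minimal stand-alone reproducer for the new remarks, outside the lit tests above. This is an illustrative sketch, not part of the patch: the file name, function names, and driver invocation are made up here, while the flags mirror the cc1 RUN lines and the code mirrors the bar1()/foo1() pattern in the tests.

// reproducer.c -- a sketch; compile with something like:
//   clang -O2 -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda -Rpass=openmp reproducer.c
// The parallel region in helper() is expected to trigger the "Found a parallel
// region that is called in a target region but not part of a combined target
// construct ..." remark, and, because helper() has external linkage and could
// be reached from target regions in other translation units, also the
// "Parallel region is not known to be called from a unique single target
// region ..." remark.
void helper(void) {
#pragma omp parallel // Called from a target region, but not uniquely attributable to one.
  {
  }
}

int main(void) {
#pragma omp target teams // Non-SPMD target region; reaches helper()'s parallel region indirectly.
  {
    helper();
  }
  return 0;
}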