diff --git a/clang/test/OpenMP/remarks_parallel_in_target_state_machine.c b/clang/test/OpenMP/remarks_parallel_in_target_state_machine.c
--- a/clang/test/OpenMP/remarks_parallel_in_target_state_machine.c
+++ b/clang/test/OpenMP/remarks_parallel_in_target_state_machine.c
@@ -1,31 +1,35 @@
-// RUN: %clang_cc1 -verify=host -Rpass=openmp -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
-// RUN: %clang_cc1 -verify -Rpass=openmp -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
-// RUN: %clang_cc1 -fexperimental-new-pass-manager -verify -Rpass=openmp -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
+// RUN: %clang_cc1 -verify=host -Rpass=openmp -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify=all,safe -Rpass=openmp -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
+// RUN: %clang_cc1 -fexperimental-new-pass-manager -verify=all,safe -Rpass=openmp -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
+// RUN: %clang_cc1 -verify=all,force -mllvm -openmp-unsafe-assume-no-external-target-regions -Rpass=openmp -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out
+// RUN: %clang_cc1 -fexperimental-new-pass-manager -verify=all,force -mllvm -openmp-unsafe-assume-no-external-target-regions -Rpass=openmp -fopenmp -O2 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o %t.out

 // host-no-diagnostics

 void bar(void) {
 #pragma omp parallel // #1 \
-    // expected-remark@#1 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nesed inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}} \
-    // expected-remark@#1 {{Parallel region is not known to be called from a unique single target region, maybe the surrounding function has external linkage?; will not attempt to rewrite the state machine use.}}
+    // all-remark@#1 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nesed inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}} \
+    // safe-remark@#1 {{Parallel region is not known to be called from a unique single target region, maybe the surrounding function has external linkage?; will not attempt to rewrite the state machine use.}} \
+    // force-remark@#1 {{[UNSAFE] Parallel region is not known to be called from a unique single target region, maybe the surrounding function has external linkage?; will rewrite the state machine use due to command line flag, this can lead to undefined behavior if the parallel region is called from a target region outside this translation unit.}} \
+    // force-remark@#1 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__2_wrapper, kernel ID: }}
   {
   }
 }

 void foo(void) {
 #pragma omp target teams // #2 \
-    // expected-remark@#2 {{Target region containing the parallel region that is specialized. (parallel region ID: __omp_outlined__1_wrapper, kernel ID: __omp_offloading_22}} \
-    // expected-remark@#2 {{Target region containing the parallel region that is specialized. (parallel region ID: __omp_outlined__3_wrapper, kernel ID: __omp_offloading_22}}
+    // all-remark@#2 {{Target region containing the parallel region that is specialized. (parallel region ID: __omp_outlined__1_wrapper, kernel ID: __omp_offloading_22}} \
+    // all-remark@#2 {{Target region containing the parallel region that is specialized. (parallel region ID: __omp_outlined__3_wrapper, kernel ID: __omp_offloading_22}}
   {
 #pragma omp parallel // #3 \
-    // expected-remark@#3 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nesed inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}} \
-    // expected-remark@#3 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__1_wrapper, kernel ID: __omp_offloading_22}}
+    // all-remark@#3 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nesed inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}} \
+    // all-remark@#3 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__1_wrapper, kernel ID: __omp_offloading_22}}
     {
     }
     bar();
 #pragma omp parallel // #4 \
-    // expected-remark@#4 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nesed inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}} \
-    // expected-remark@#4 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__3_wrapper, kernel ID: __omp_offloading_22}}
+    // all-remark@#4 {{Found a parallel region that is called in a target region but not part of a combined target construct nor nesed inside a target construct without intermediate code. This can lead to excessive register usage for unrelated target regions in the same translation unit due to spurious call edges assumed by ptxas.}} \
+    // all-remark@#4 {{Specialize parallel region that is only reached from a single target region to avoid spurious call edges and excessive register usage in other target regions. (parallel region ID: __omp_outlined__3_wrapper, kernel ID: __omp_offloading_22}}
     {
     }
   }
@@ -43,5 +47,5 @@
 }
 }

-// expected-remark@* {{OpenMP runtime call __kmpc_global_thread_num moved to}}
-// expected-remark@* {{OpenMP runtime call __kmpc_global_thread_num deduplicated}}
+// all-remark@* {{OpenMP runtime call __kmpc_global_thread_num moved to}}
+// all-remark@* {{OpenMP runtime call __kmpc_global_thread_num deduplicated}}
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -36,6 +36,12 @@
     "openmp-opt-disable", cl::ZeroOrMore,
     cl::desc("Disable OpenMP specific optimizations."), cl::Hidden,
     cl::init(false));
+static cl::opt<bool> UnsafeAssumeNoExternalTargetRegions(
+    "openmp-unsafe-assume-no-external-target-regions", cl::ZeroOrMore,
+    cl::desc("[UNSAFE!] Assume no external target regions when specializing "
+             "OpenMP parallel regions to avoid spurious register usage issues, "
+             "combine with `-Rpass=openmp`"),
+    cl::Hidden, cl::init(false));

 static cl::opt<bool> PrintICVValues("openmp-print-icv-values", cl::init(false),
                                     cl::Hidden);
@@ -1096,15 +1102,23 @@
       if (!K) {
         {
           auto Remark = [&](OptimizationRemark OR) {
-            return OR << "Parallel region is not known to be called from a "
+            return OR << (UnsafeAssumeNoExternalTargetRegions ? "[UNSAFE] " : "")
+                      << "Parallel region is not known to be called from a "
                          "unique single target region, maybe the surrounding "
-                         "function has external linkage?; will not attempt to "
-                         "rewrite the state machine use.";
+                         "function has external linkage?"
+                      << (UnsafeAssumeNoExternalTargetRegions
+                              ? "; will rewrite the state machine use due to "
+                                "command line flag, this can lead to undefined "
+                                "behavior if the parallel region is called from "
+                                "a target region outside this translation unit."
+                              : "; will not attempt to rewrite the state machine "
+                                "use.");
           };
           emitRemarkOnFunction(F, "OpenMPParallelRegionInMultipleKernesl",
                                Remark);
         }
-        continue;
+        if (!UnsafeAssumeNoExternalTargetRegions)
+          continue;
       }

       // We now know F is a parallel body function called only from the kernel K.
@@ -1120,10 +1134,12 @@
                          "(parallel region ID: "
                   << ore::NV("OpenMPParallelRegion", F->getName())
                   << ", kernel ID: "
-                  << ore::NV("OpenMPTargetRegion", K->getName()) << ")";
+                  << ore::NV("OpenMPTargetRegion", K ? K->getName() : "")
+                  << ")";
       };
       emitRemarkOnFunction(F, "OpenMPParallelRegionInNonSPMD",
                            RemarkParalleRegion);
+
       auto RemarkKernel = [&](OptimizationRemark OR) {
         return OR << "Target region containing the parallel region that is "
                      "specialized. (parallel region ID: "
@@ -1131,7 +1147,8 @@
                   << ", kernel ID: "
                   << ore::NV("OpenMPTargetRegion", K->getName()) << ")";
       };
-      emitRemarkOnFunction(K, "OpenMPParallelRegionInNonSPMD", RemarkKernel);
+      if (K)
+        emitRemarkOnFunction(K, "OpenMPParallelRegionInNonSPMD", RemarkKernel);
     }

     Module &M = *F->getParent();
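
Note: the new option is a hidden LLVM cl::opt, so it is reached from clang through the -mllvm passthrough together with -Rpass=openmp, exactly as the added RUN lines do. The standalone sketch below is only an illustration of the pattern the patch relies on (the option name, message text, and helper in the sketch are made-up stand-ins, not the OpenMPOpt.cpp identifiers): one off-by-default cl::opt<bool> that both switches the wording of the emitted diagnostic and relaxes the early bail-out.

// Minimal sketch of the cl::opt gating pattern; all names here are
// illustrative, not the actual OpenMPOpt code.
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

// Hidden, off-by-default escape hatch, analogous in spirit to
// -openmp-unsafe-assume-no-external-target-regions.
static cl::opt<bool> AssumeNoExternalUses(
    "sketch-assume-no-external-uses", cl::ZeroOrMore,
    cl::desc("[UNSAFE!] Assume no external uses when specializing."),
    cl::Hidden, cl::init(false));

// Mirrors the control flow of the patch: report with flag-dependent wording,
// and only proceed past the safety check when the flag is set.
static void maybeSpecialize(StringRef Region, bool KnownUniqueKernel) {
  if (!KnownUniqueKernel) {
    errs() << (AssumeNoExternalUses ? "[UNSAFE] " : "")
           << Region << " is not known to have a unique kernel"
           << (AssumeNoExternalUses
                   ? "; rewriting anyway because the flag is set.\n"
                   : "; not rewriting.\n");
    if (!AssumeNoExternalUses)
      return; // Safe default: bail out.
  }
  errs() << "specializing " << Region << "\n";
}

int main(int argc, char **argv) {
  cl::ParseCommandLineOptions(argc, argv);
  maybeSpecialize("__omp_outlined__1_wrapper", /*KnownUniqueKernel=*/false);
  return 0;
}

Running the sketch with -sketch-assume-no-external-uses flips both the message and the control flow, which is the same shape as the [UNSAFE] remark wording and the gated continue in the diff above.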