diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -4018,9 +4018,13 @@ "omp_no_openmp" "omp_no_openmp_routines" "omp_no_parallelism" + "omp_no_external_caller_in_target_region" -The OpenMP standard defines the meaning of OpenMP assumptions ("omp_XYZ" is -spelled "XYZ" in the `OpenMP 5.1 Standard`_). +The OpenMP standard defines the meaning of the first three OpenMP assumptions +("omp_XYZ" is spelled "XYZ" in the `OpenMP 5.1 Standard`_) while other are +clang extensions spelled described `here +`_ and spelled +without the `omp` in the OpenMP assume directive syntax. .. _`OpenMP 5.1 Standard`: https://www.openmp.org/spec-html/5.1/openmpsu37.html#x56-560002.5.2 diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -22,6 +22,7 @@ #include "llvm/Analysis/ValueTracking.h" #include "llvm/Frontend/OpenMP/OMPConstants.h" #include "llvm/Frontend/OpenMP/OMPIRBuilder.h" +#include "llvm/IR/Assumptions.h" #include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/IPO.h" @@ -1450,6 +1451,12 @@ } }; +/// The "omp_no_external_caller_in_target_region" assumption guarantees that +/// there are no external caller of a function which are inside an OpenMP +/// target region. +static KnownAssumptionString + NoExternalCallerInTargetRegion("omp_no_external_caller_in_target_region"); + Kernel OpenMPOpt::getUniqueKernelFor(Function &F) { if (!OMPInfoCache.ModuleSlice.count(&F)) return nullptr; @@ -1469,7 +1476,8 @@ } CachedKernel = nullptr; - if (!F.hasLocalLinkage()) { + if (!F.hasLocalLinkage() && + !hasAssumption(F, NoExternalCallerInTargetRegion)) { // See https://openmp.llvm.org/remarks/OptimizationRemarks.html auto Remark = [&](OptimizationRemark OR) { diff --git a/llvm/test/Transforms/OpenMP/gpu_state_machine_function_ptr_replacement.ll b/llvm/test/Transforms/OpenMP/gpu_state_machine_function_ptr_replacement.ll --- a/llvm/test/Transforms/OpenMP/gpu_state_machine_function_ptr_replacement.ll +++ b/llvm/test/Transforms/OpenMP/gpu_state_machine_function_ptr_replacement.ll @@ -7,12 +7,18 @@ ; #pragma omp parallel ; { } ; } +; __attribute__((assume("no_external_callers"))) +; void baz(void) { +; #pragma omp parallel +; { } +; } ; void foo(void) { ; #pragma omp target teams ; { ; #pragma omp parallel ; {} ; bar(); +; baz(); ; #pragma omp parallel ; {} ; } @@ -23,13 +29,16 @@ ; another kernel. ; CHECK-DAG: @__omp_outlined__1_wrapper.ID = private constant i8 undef +; CHECK-DAG: @__omp_outlined__2b_wrapper.ID = private constant i8 undef ; CHECK-DAG: @__omp_outlined__3_wrapper.ID = private constant i8 undef ; CHECK-DAG: icmp eq i8* %5, @__omp_outlined__1_wrapper.ID +; CHECK-DAG: icmp eq i8* %b6, @__omp_outlined__2b_wrapper.ID ; CHECK-DAG: icmp eq i8* %7, @__omp_outlined__3_wrapper.ID ; CHECK-DAG: call void @__kmpc_kernel_prepare_parallel(i8* @__omp_outlined__1_wrapper.ID) -; CHECK-DAG: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void ()* @__omp_outlined__2_wrapper to i8*)) +; CHECK-DAG: call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void ()* @__omp_outlined__2a_wrapper to i8*)) +; CHECK-DAG: call void @__kmpc_kernel_prepare_parallel(i8* @__omp_outlined__2b_wrapper.ID) ; CHECK-DAG: call void @__kmpc_kernel_prepare_parallel(i8* @__omp_outlined__3_wrapper.ID) @@ -69,11 +78,20 @@ .check.next: ; preds = %.execute.parallel %6 = load i8*, i8** %work_fn, align 8 - %work_match1 = icmp eq i8* %6, bitcast (void ()* @__omp_outlined__2_wrapper to i8*) - br i1 %work_match1, label %.execute.fn2, label %.check.next3 + %work_match1 = icmp eq i8* %6, bitcast (void ()* @__omp_outlined__2a_wrapper to i8*) + br i1 %work_match1, label %.execute.fn2a, label %.check.next2 + +.execute.fn2a: ; preds = %.check.next + call void @__omp_outlined__2a_wrapper() + br label %.terminate.parallel + +.check.next2: ; preds = %.execute.parallel + %b6 = load i8*, i8** %work_fn, align 8 + %work_match1b = icmp eq i8* %b6, bitcast (void ()* @__omp_outlined__2b_wrapper to i8*) + br i1 %work_match1b, label %.execute.fn2b, label %.check.next3 -.execute.fn2: ; preds = %.check.next - call void @__omp_outlined__2_wrapper() +.execute.fn2b: ; preds = %.check.next + call void @__omp_outlined__2b_wrapper() br label %.terminate.parallel .check.next3: ; preds = %.check.next @@ -111,6 +129,7 @@ define internal void @__omp_outlined__() { call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void ()* @__omp_outlined__1_wrapper to i8*)) call void @bar() + call void @baz() call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void ()* @__omp_outlined__3_wrapper to i8*)) ret void } @@ -125,11 +144,20 @@ } define hidden void @bar() { - call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void ()* @__omp_outlined__2_wrapper to i8*)) + call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void ()* @__omp_outlined__2a_wrapper to i8*)) + ret void +} + +define hidden void @baz() #0 { + call void @__kmpc_kernel_prepare_parallel(i8* bitcast (void ()* @__omp_outlined__2b_wrapper to i8*)) + ret void +} + +define internal void @__omp_outlined__2a_wrapper() { ret void } -define internal void @__omp_outlined__2_wrapper() { +define internal void @__omp_outlined__2b_wrapper() { ret void } @@ -147,6 +175,7 @@ declare i32 @__kmpc_global_thread_num(%struct.ident_t* nocapture readnone) +attributes #0 = { "llvm.assume"="abc,omp_no_external_caller_in_target_region,123" } !nvvm.annotations = !{!0} diff --git a/openmp/docs/remarks/OptimizationRemarks.rst b/openmp/docs/remarks/OptimizationRemarks.rst --- a/openmp/docs/remarks/OptimizationRemarks.rst +++ b/openmp/docs/remarks/OptimizationRemarks.rst @@ -25,6 +25,13 @@ modify the linkage and thereby help optimization with a `static` or `__attribute__((internal))` function annotation. If changing the linkage is impossible, e.g., because there are outside callers on the host, one can split -the function into an external visible interface which is not compiled for -the target and an internal implementation which is compiled for the target -and should be called from within the target region. +the function into an external visible interface which is not compiled for the +target and an internal implementation which is compiled for the target and +should be called from within the target region. Finally, we provide an +assumption for the target side which can be spelled either as +`__attribute__((assume("omp_no_external_caller_in_target_region")))` or as +OpenMP assumption, i.a., `#pragma omp begin assume +ext_no_external_caller_in_target_region`. The assumption effectively allows the +compiler to assume no caller outside the current translation unit will call the +function from a target region, hence the function is not called from outside +translation units on the device.