diff --git a/openmp/runtime/src/kmp_csupport.cpp b/openmp/runtime/src/kmp_csupport.cpp --- a/openmp/runtime/src/kmp_csupport.cpp +++ b/openmp/runtime/src/kmp_csupport.cpp @@ -559,9 +559,38 @@ __ompt_get_task_info_internal(1, NULL, &parent_task_data, NULL, NULL, NULL); if (ompt_enabled.ompt_callback_parallel_end) { + // This is executed in one of the following cases: + // 1. gcc is used to generate the code of a serialized parallel region + // and this function is called from within the internal implementation + // of the runtime (__kmp_join_call inside kmp_runtime.cpp). + // 2. clang is used to generate the code of a serialized parallel region + // and the omp parallel directive contains the clause if(0). + // This function is invoked directly from the application code + // and the call is generated by the clang compiler. + // For the first case, ompt_parallel_invoker_runtime should be passed + // to the dispatched callback, while for the second case, + // ompt_parallel_invoker_program should be passed. + // Invoking this function from within the runtime is inherently wrong, + // because it introduces unnecessary complexity. Namely, to distinguish + // between the above cases, it is required to be able to dynamically + // determine the caller of the function. + // To do this, check whether return_address exists. Note that it doesn't + // exist if clang is used, since the return address was set inside + // __kmpc_serialized_parallel, which has already finished, so the + // corresponding guard is destroyed and the address is cleared. + + // Determine whether clang is used and the function is invoked + // directly by the application (second case). + bool clang_if0 = this_thr->th.ompt_thread_info.return_address == NULL; + int invoker = clang_if0 ? ompt_parallel_invoker_program + : ompt_parallel_invoker_runtime; + // This is obviously a hack. It would be smarter to refactor the runtime + // to avoid invoking __kmpc_* functions from within the runtime itself.
+ // The return_address maintenance should be fixed inside + // __kmpc_serialized_parallel too. ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( &(serial_team->t.ompt_team_info.parallel_data), parent_task_data, - ompt_parallel_invoker_program | ompt_parallel_team, + invoker | ompt_parallel_team, OMPT_LOAD_RETURN_ADDRESS(global_tid)); } __ompt_lw_taskteam_unlink(this_thr); diff --git a/openmp/runtime/src/ompt-internal.h b/openmp/runtime/src/ompt-internal.h --- a/openmp/runtime/src/ompt-internal.h +++ b/openmp/runtime/src/ompt-internal.h @@ -20,9 +20,12 @@ #define _OMP_EXTERN extern "C" -#define OMPT_INVOKER(x) \ - ((x == fork_context_gnu) ? ompt_parallel_invoker_program \ - : ompt_parallel_invoker_runtime) +// Invoker is equal to ompt_parallel_invoker_program only if clang +// generated the code for a serialized parallel region whose omp parallel +// directive contains an if(0) clause. For all other cases, the implicit tasks +// of the corresponding regions are invoked by the runtime, so use the +// ompt_parallel_invoker_runtime flag below; the argument x is ignored. +#define OMPT_INVOKER(x) ompt_parallel_invoker_runtime #define ompt_callback(e) e##_callback diff --git a/openmp/runtime/test/ompt/parallel/invoker.c b/openmp/runtime/test/ompt/parallel/invoker.c new file mode 100644 --- /dev/null +++ b/openmp/runtime/test/ompt/parallel/invoker.c @@ -0,0 +1,73 @@ +// RUN: %libomp-compile-and-run | FileCheck %s +// REQUIRES: ompt + + +#include "callback.h" + + +#define DELAY 1000000 +#define COUNT 10 + +// burn some CPU cycles +void burn_CPU(int count) { + int j, k; + volatile float x; + int jmax; + + jmax = 7 * count; + + for (j = 0; j < jmax; j++) { + x = 0; + for (k = 0; k < DELAY; k++) { + x = x + 1.0; + } + } +} + +int main() { +#pragma omp parallel num_threads(2) + { + burn_CPU(COUNT); + } + + // Check if libomp supports the callbacks for this test.
+ // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end' + + + // CHECK: {{^}}0: NULL_POINTER=[[NULL:.*$]] + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_initial_task_begin + + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin + // CHECK-SAME: parallel_id=[[PARALLEL_ID_1:[0-9]+]] + // CHECK-SAME: invoker=2 + + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_end + // CHECK-SAME: parallel_id=[[PARALLEL_ID_1]] + // CHECK-SAME: invoker=2 + + + // Consider the callstacks of the master thread while executing the burn_CPU + // function when the code is generated by gcc and clang respectively: + // 1) gcc is used to generate the code. + // #0 burn_CPU (count=10) + // #1 0x0000000000402b8b in main._omp_fn.0 () + // #2 0x00007ffff7b601f8 in __kmp_api_GOMP_parallel + // #3 0x0000000000402b6e in main () + // 2) clang is used to generate the code. + // #0 burn_CPU (count=10) + // #1 0x000000000040325a in .omp_outlined._debug__ + // #2 0x000000000040327d in .omp_outlined. + // #3 0x00007ffff7b862c3 in __kmp_invoke_microtask + // #4 0x00007ffff7acac6e in __kmp_invoke_task_func + // #5 0x00007ffff7abea3d in __kmp_fork_call + // #6 0x00007ffff7aa7930 in __kmpc_fork_call + // #7 0x0000000000403231 in main () + // One can notice that, no matter what compiler of the previous two is used, + // the outlined function that corresponds to the implicit tasks is invoked + // by the runtime.
So the invoker should take value of + // ompt_parallel_invoker_runtime==2 + + + return 0; +} diff --git a/openmp/runtime/test/ompt/parallel/invoker_if0_clang.c b/openmp/runtime/test/ompt/parallel/invoker_if0_clang.c new file mode 100644 --- /dev/null +++ b/openmp/runtime/test/ompt/parallel/invoker_if0_clang.c @@ -0,0 +1,63 @@ +// RUN: %libomp-compile-and-run | FileCheck %s +// REQUIRES: ompt +// UNSUPPORTED: gcc + +#include "callback.h" + + +#define DELAY 1000000 +#define COUNT 10 + +// burn some CPU cycles +void burn_CPU(int count) { + int j, k; + volatile float x; + int jmax; + + jmax = 7 * count; + + for (j = 0; j < jmax; j++) { + x = 0; + for (k = 0; k < DELAY; k++) { + x = x + 1.0; + } + } +} + +int main() { +#pragma omp parallel if(0) + { + burn_CPU(COUNT); + } + + // Check if libomp supports the callbacks for this test. + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end' + + + // CHECK: {{^}}0: NULL_POINTER=[[NULL:.*$]] + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_initial_task_begin + + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin + // CHECK-SAME: parallel_id=[[PARALLEL_ID_1:[0-9]+]] + // CHECK-SAME: invoker=1 + + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_end + // CHECK-SAME: parallel_id=[[PARALLEL_ID_1]] + // CHECK-SAME: invoker=1 + + + // Consider the callstack of the master thread while executing the burn_CPU + // function when the code is generated by clang: + // #0 burn_CPU (count=10) + // #1 0x000000000040327a in .omp_outlined._debug__ + // #2 0x000000000040329d in .omp_outlined.
+ // #3 0x000000000040323c in main () + + // One can notice that the outlined function that corresponds to the implicit + // tasks is invoked directly by the application, so the invoker should take + // value of ompt_parallel_invoker_program==1 + + + return 0; +} diff --git a/openmp/runtime/test/ompt/parallel/invoker_if0_gcc.c b/openmp/runtime/test/ompt/parallel/invoker_if0_gcc.c new file mode 100644 --- /dev/null +++ b/openmp/runtime/test/ompt/parallel/invoker_if0_gcc.c @@ -0,0 +1,63 @@ +// RUN: %libomp-compile-and-run | FileCheck %s +// REQUIRES: ompt +// UNSUPPORTED: clang + +#include "callback.h" + + +#define DELAY 1000000 +#define COUNT 10 + +// burn some CPU cycles +void burn_CPU(int count) { + int j, k; + volatile float x; + int jmax; + + jmax = 7 * count; + + for (j = 0; j < jmax; j++) { + x = 0; + for (k = 0; k < DELAY; k++) { + x = x + 1.0; + } + } +} + +int main() { +#pragma omp parallel if(0) + { + burn_CPU(COUNT); + } + + // Check if libomp supports the callbacks for this test.
+ // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end' + + + // CHECK: {{^}}0: NULL_POINTER=[[NULL:.*$]] + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_initial_task_begin + + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin + // CHECK-SAME: parallel_id=[[PARALLEL_ID_1:[0-9]+]] + // CHECK-SAME: invoker=2 + + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_end + // CHECK-SAME: parallel_id=[[PARALLEL_ID_1]] + // CHECK-SAME: invoker=2 + + + // Consider the callstack of the master thread while executing the burn_CPU + // function when the code is generated by gcc: + // #0 burn_CPU (count=10) + // #1 0x0000000000402b8b in main._omp_fn.0 () + // #2 0x00007ffff7b601f8 in __kmp_api_GOMP_parallel + // #3 0x0000000000402b6e in main () + + // One can notice that the outlined function that corresponds to the implicit + // tasks is invoked by the runtime, so the invoker should take value of + // ompt_parallel_invoker_runtime==2 + + + return 0; +} diff --git a/openmp/runtime/test/ompt/parallel/invoker_serialized.c b/openmp/runtime/test/ompt/parallel/invoker_serialized.c new file mode 100644 --- /dev/null +++ b/openmp/runtime/test/ompt/parallel/invoker_serialized.c @@ -0,0 +1,73 @@ +// RUN: %libomp-compile-and-run | FileCheck %s +// REQUIRES: ompt + + +#include "callback.h" + + +#define DELAY 1000000 +#define COUNT 10 + +// burn some CPU cycles +void burn_CPU(int count) { + int j, k; + volatile float x; + int jmax; + + jmax = 7 * count; + + for (j = 0; j < jmax; j++) { + x = 0; + for (k = 0; k < DELAY; k++) { + x = x + 1.0; + } + } +} + +int main() { +#pragma omp parallel num_threads(1) + { + burn_CPU(COUNT); + } + + // Check if libomp supports the callbacks for this test.
+ // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_begin' + // CHECK-NOT: {{^}}0: Could not register callback 'ompt_callback_parallel_end' + + + // CHECK: {{^}}0: NULL_POINTER=[[NULL:.*$]] + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_initial_task_begin + + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_begin + // CHECK-SAME: parallel_id=[[PARALLEL_ID_1:[0-9]+]] + // CHECK-SAME: invoker=2 + + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_parallel_end + // CHECK-SAME: parallel_id=[[PARALLEL_ID_1]] + // CHECK-SAME: invoker=2 + + + // Consider the callstacks of the master thread while executing the burn_CPU + // function when the code is generated by gcc and clang respectively: + // 1) gcc is used to generate the code. + // #0 burn_CPU (count=10) + // #1 0x0000000000402b8b in main._omp_fn.0 () + // #2 0x00007ffff7b601f8 in __kmp_api_GOMP_parallel + // #3 0x0000000000402b6e in main () + + // 2) clang is used to generate the code. + // #0 burn_CPU (count=10) + // #1 0x000000000040325a in .omp_outlined._debug__ + // #2 0x000000000040327d in .omp_outlined. + // #3 0x00007ffff7b862c3 in __kmp_invoke_microtask () + // #4 0x00007ffff7abd746 in __kmp_fork_call + // #5 0x00007ffff7aa7930 in __kmpc_fork_call + // #6 0x0000000000403231 in main () + // One can notice that, no matter what compiler of the previous two is used, + // the outlined function that corresponds to the implicit tasks is invoked + // by the runtime. So the invoker should take value of + // ompt_parallel_invoker_runtime==2 + + + return 0; +}