Index: openmp/runtime/src/kmp_barrier.cpp =================================================================== --- openmp/runtime/src/kmp_barrier.cpp +++ openmp/runtime/src/kmp_barrier.cpp @@ -15,9 +15,7 @@ #include "kmp_itt.h" #include "kmp_os.h" #include "kmp_stats.h" -#if OMPT_SUPPORT #include "ompt-specific.h" -#endif #if KMP_MIC #include <immintrin.h> @@ -128,8 +126,11 @@ gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team), team->t.t_id, i)); ANNOTATE_REDUCE_AFTER(reduce); + OMPT_REDUCTION_DECL(this_thr, gtid); + OMPT_REDUCTION_BEGIN; (*reduce)(this_thr->th.th_local.reduce_data, other_threads[i]->th.th_local.reduce_data); + OMPT_REDUCTION_END; ANNOTATE_REDUCE_BEFORE(reduce); ANNOTATE_REDUCE_BEFORE(&team->t.t_bar); } @@ -355,8 +356,11 @@ gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid)); ANNOTATE_REDUCE_AFTER(reduce); + OMPT_REDUCTION_DECL(this_thr, gtid); + OMPT_REDUCTION_BEGIN; (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data); + OMPT_REDUCTION_END; ANNOTATE_REDUCE_BEFORE(reduce); ANNOTATE_REDUCE_BEFORE(&team->t.t_bar); } @@ -600,8 +604,11 @@ gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team), team->t.t_id, child_tid)); ANNOTATE_REDUCE_AFTER(reduce); + OMPT_REDUCTION_DECL(this_thr, gtid); + OMPT_REDUCTION_BEGIN; (*reduce)(this_thr->th.th_local.reduce_data, child_thr->th.th_local.reduce_data); + OMPT_REDUCTION_END; ANNOTATE_REDUCE_BEFORE(reduce); ANNOTATE_REDUCE_BEFORE(&team->t.t_bar); } @@ -912,6 +919,8 @@ flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj)); if (reduce) { ANNOTATE_REDUCE_AFTER(reduce); + OMPT_REDUCTION_DECL(this_thr, gtid); + OMPT_REDUCTION_BEGIN; for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids; ++child_tid) { KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += " @@ -923,6 +932,7 @@ (*reduce)(this_thr->th.th_local.reduce_data, other_threads[child_tid]->th.th_local.reduce_data); } + OMPT_REDUCTION_END; ANNOTATE_REDUCE_BEFORE(reduce); 
ANNOTATE_REDUCE_BEFORE(&team->t.t_bar); } Index: openmp/runtime/src/kmp_csupport.cpp =================================================================== --- openmp/runtime/src/kmp_csupport.cpp +++ openmp/runtime/src/kmp_csupport.cpp @@ -18,10 +18,7 @@ #include "kmp_itt.h" #include "kmp_lock.h" #include "kmp_stats.h" - -#if OMPT_SUPPORT #include "ompt-specific.h" -#endif #define MAX_MESSAGE 512 @@ -3429,13 +3426,18 @@ loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck); __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method); + OMPT_REDUCTION_DECL(th, global_tid); if (packed_reduction_method == critical_reduce_block) { + OMPT_REDUCTION_BEGIN; + __kmp_enter_critical_section_reduce_block(loc, global_tid, lck); retval = 1; } else if (packed_reduction_method == empty_reduce_block) { + OMPT_REDUCTION_BEGIN; + // usage: if team size == 1, no synchronization is required ( Intel // platforms only ) retval = 1; @@ -3536,15 +3538,20 @@ packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid); + OMPT_REDUCTION_DECL(__kmp_thread_from_gtid(global_tid), global_tid); + if (packed_reduction_method == critical_reduce_block) { __kmp_end_critical_section_reduce_block(loc, global_tid, lck); + OMPT_REDUCTION_END; } else if (packed_reduction_method == empty_reduce_block) { // usage: if team size == 1, no synchronization is required ( on Intel // platforms only ) + OMPT_REDUCTION_END; + } else if (packed_reduction_method == atomic_reduce_block) { // neither master nor other workers should get here @@ -3556,6 +3563,7 @@ tree_reduce_block)) { // only master gets here + // OMPT: tree reduction is annotated in the barrier code } else { @@ -3629,13 +3637,17 @@ loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck); __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method); + OMPT_REDUCTION_DECL(th, global_tid); + if (packed_reduction_method == critical_reduce_block) { + OMPT_REDUCTION_BEGIN; __kmp_enter_critical_section_reduce_block(loc, 
global_tid, lck); retval = 1; } else if (packed_reduction_method == empty_reduce_block) { + OMPT_REDUCTION_BEGIN; // usage: if team size == 1, no synchronization is required ( Intel // platforms only ) retval = 1; @@ -3723,10 +3735,13 @@ // this barrier should be visible to a customer and to the threading profile // tool (it's a terminating barrier on constructs if NOWAIT not specified) + OMPT_REDUCTION_DECL(th, global_tid); if (packed_reduction_method == critical_reduce_block) { __kmp_end_critical_section_reduce_block(loc, global_tid, lck); + OMPT_REDUCTION_END; + // TODO: implicit barrier: should be exposed #if OMPT_SUPPORT ompt_frame_t *ompt_frame; @@ -3749,6 +3764,8 @@ } else if (packed_reduction_method == empty_reduce_block) { + OMPT_REDUCTION_END; + // usage: if team size==1, no synchronization is required (Intel platforms only) // TODO: implicit barrier: should be exposed Index: openmp/runtime/src/ompt-event-specific.h =================================================================== --- openmp/runtime/src/ompt-event-specific.h +++ openmp/runtime/src/ompt-event-specific.h @@ -99,7 +99,7 @@ #define ompt_callback_cancel_implemented ompt_event_MAY_ALWAYS_OPTIONAL -#define ompt_callback_reduction_implemented ompt_event_UNIMPLEMENTED +#define ompt_callback_reduction_implemented ompt_event_MAY_ALWAYS_OPTIONAL #define ompt_callback_dispatch_implemented ompt_event_UNIMPLEMENTED Index: openmp/runtime/src/ompt-specific.h =================================================================== --- openmp/runtime/src/ompt-specific.h +++ openmp/runtime/src/ompt-specific.h @@ -15,6 +15,7 @@ #include "kmp.h" +#if OMPT_SUPPORT /***************************************************************************** * forward declarations ****************************************************************************/ @@ -101,5 +102,30 @@ inline const char *ompt_get_runtime_version() { return &__kmp_version_lib_ver[KMP_VERSION_MAGIC_LEN]; } +#endif // OMPT_SUPPORT + +// macros providing the 
OMPT callbacks for reduction clause +#if OMPT_SUPPORT && OMPT_OPTIONAL +#define OMPT_REDUCTION_DECL(this_thr, gtid) \ + ompt_data_t *my_task_data = OMPT_CUR_TASK_DATA(this_thr); \ + ompt_data_t *my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr); \ + void *return_address = OMPT_LOAD_RETURN_ADDRESS(gtid); +#define OMPT_REDUCTION_BEGIN \ + if (ompt_enabled.enabled && ompt_enabled.ompt_callback_reduction) { \ + ompt_callbacks.ompt_callback(ompt_callback_reduction)( \ + ompt_sync_region_reduction, ompt_scope_begin, my_parallel_data, \ + my_task_data, return_address); \ + } +#define OMPT_REDUCTION_END \ + if (ompt_enabled.enabled && ompt_enabled.ompt_callback_reduction) { \ + ompt_callbacks.ompt_callback(ompt_callback_reduction)( \ + ompt_sync_region_reduction, ompt_scope_end, my_parallel_data, \ + my_task_data, return_address); \ + } +#else // OMPT_SUPPORT && OMPT_OPTIONAL +#define OMPT_REDUCTION_DECL(this_thr, gtid) +#define OMPT_REDUCTION_BEGIN +#define OMPT_REDUCTION_END +#endif // ! OMPT_SUPPORT && OMPT_OPTIONAL #endif Index: openmp/runtime/test/ompt/callback.h =================================================================== --- openmp/runtime/test/ompt/callback.h +++ openmp/runtime/test/ompt/callback.h @@ -358,6 +358,9 @@ printf("%" PRIu64 ": ompt_event_taskgroup_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra); break; case ompt_sync_region_reduction: + printf("ompt_sync_region_reduction should never be passed to " + "on_ompt_callback_sync_region\n"); + exit(-1); break; } break; @@ -377,6 +380,9 @@ printf("%" PRIu64 ": ompt_event_taskgroup_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, (parallel_data)?parallel_data->value:0, task_data->value, codeptr_ra); break; case ompt_sync_region_reduction: + printf("ompt_sync_region_reduction should never be passed to " + "on_ompt_callback_sync_region\n"); + exit(-1); 
break; } break; @@ -409,6 +415,9 @@ printf("%" PRIu64 ": ompt_event_wait_taskgroup_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra); break; case ompt_sync_region_reduction: + printf("ompt_sync_region_reduction should never be passed to " + "on_ompt_callback_sync_region_wait\n"); + exit(-1); break; } break; @@ -428,12 +437,38 @@ printf("%" PRIu64 ": ompt_event_wait_taskgroup_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, (parallel_data)?parallel_data->value:0, task_data->value, codeptr_ra); break; case ompt_sync_region_reduction: + printf("ompt_sync_region_reduction should never be passed to " + "on_ompt_callback_sync_region_wait\n"); + exit(-1); break; } break; } } +static void on_ompt_callback_reduction(ompt_sync_region_t kind, + ompt_scope_endpoint_t endpoint, + ompt_data_t *parallel_data, + ompt_data_t *task_data, + const void *codeptr_ra) { + switch (endpoint) { + case ompt_scope_begin: + printf("%" PRIu64 ": ompt_event_reduction_begin: parallel_id=%" PRIu64 + ", task_id=%" PRIu64 ", codeptr_ra=%p\n", + ompt_get_thread_data()->value, + (parallel_data) ? parallel_data->value : 0, task_data->value, + codeptr_ra); + break; + case ompt_scope_end: + printf("%" PRIu64 ": ompt_event_reduction_end: parallel_id=%" PRIu64 + ", task_id=%" PRIu64 ", codeptr_ra=%p\n", + ompt_get_thread_data()->value, + (parallel_data) ? 
parallel_data->value : 0, task_data->value, + codeptr_ra); + break; + } +} + static void on_ompt_callback_flush( ompt_data_t *thread_data, @@ -784,6 +819,7 @@ register_callback(ompt_callback_nest_lock); register_callback(ompt_callback_sync_region); register_callback_t(ompt_callback_sync_region_wait, ompt_callback_sync_region_t); + register_callback_t(ompt_callback_reduction, ompt_callback_sync_region_t); register_callback(ompt_callback_control_tool); register_callback(ompt_callback_flush); register_callback(ompt_callback_cancel); Index: openmp/runtime/test/ompt/synchronization/reduction/empty_reduce.c =================================================================== --- /dev/null +++ openmp/runtime/test/ompt/synchronization/reduction/empty_reduce.c @@ -0,0 +1,37 @@ +// RUN: %libomp-compile-and-run | FileCheck %s +// RUN: %libomp-compile -DNOWAIT && %libomp-run | FileCheck %s +// REQUIRES: ompt +#include "callback.h" +#include <omp.h> + +#ifdef NOWAIT +#define FOR_CLAUSE nowait +#else +#define FOR_CLAUSE +#endif + +int main() { + int sum = 0; + int i; +#pragma omp parallel num_threads(1) +#pragma omp for reduction(+ : sum) FOR_CLAUSE + for (i = 0; i < 10000; i++) { + sum += i; + } + + // CHECK: 0: NULL_POINTER=[[NULL:.*$]] + + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: + // CHECK-SAME: parallel_id=[[PARALLEL_ID:[0-9]+]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: + // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID:[0-9]+]] + + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_reduction_begin: + // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID]], + // CHECK-SAME: codeptr_ra= + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_reduction_end: + // CHECK-SAME: parallel_id=[[PARALLEL_ID]], + // CHECK-SAME: task_id=[[TASK_ID]], codeptr_ra= + + return 0; +} Index: openmp/runtime/test/ompt/synchronization/reduction/tree_reduce.c =================================================================== --- /dev/null +++ 
openmp/runtime/test/ompt/synchronization/reduction/tree_reduce.c @@ -0,0 +1,47 @@ +// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s +// REQUIRES: ompt +#include "callback.h" +#include <omp.h> + +#ifdef NOWAIT +#define FOR_CLAUSE nowait +#else +#define FOR_CLAUSE +#endif + +int main() { + int sum = 0; + int i; +#pragma omp parallel num_threads(5) +#pragma omp for reduction(+ : sum) FOR_CLAUSE + for (i = 0; i < 10000; i++) { + sum += i; + } + + // CHECK: 0: NULL_POINTER=[[NULL:.*$]] + + // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin: + // CHECK-SAME: parallel_id=[[PARALLEL_ID:[0-9]+]] + // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: + // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID:[0-9]+]] + + // order and distribution to threads not determined + // CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_begin: + // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}} + // CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_end: + // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}} + // CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_begin: + // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}} + // CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_end: + // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}} + // CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_begin: + // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}} + // CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_end: + // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}} + // CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_begin: + // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}} + // CHECK: {{^}}{{[0-f]+}}: ompt_event_reduction_end: + // CHECK-SAME: parallel_id=[[PARALLEL_ID]], task_id={{[0-9]+}} + + return 0; +}