diff --git a/openmp/runtime/src/kmp_barrier.cpp b/openmp/runtime/src/kmp_barrier.cpp
--- a/openmp/runtime/src/kmp_barrier.cpp
+++ b/openmp/runtime/src/kmp_barrier.cpp
@@ -128,8 +128,25 @@
                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(i, team),
                  team->t.t_id, i));
         ANNOTATE_REDUCE_AFTER(reduce);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+        ompt_data_t *my_task_data = OMPT_CUR_TASK_DATA(this_thr);
+        ompt_data_t *my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
+        void *return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
+        if (ompt_enabled.enabled && ompt_enabled.ompt_callback_reduction) {
+          ompt_callbacks.ompt_callback(ompt_callback_reduction)(
+              ompt_sync_region_reduction, ompt_scope_begin, my_parallel_data,
+              my_task_data, return_address);
+        }
+#endif
         (*reduce)(this_thr->th.th_local.reduce_data,
                   other_threads[i]->th.th_local.reduce_data);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+        if (ompt_enabled.enabled && ompt_enabled.ompt_callback_reduction) {
+          ompt_callbacks.ompt_callback(ompt_callback_reduction)(
+              ompt_sync_region_reduction, ompt_scope_end, my_parallel_data,
+              my_task_data, return_address);
+        }
+#endif
         ANNOTATE_REDUCE_BEFORE(reduce);
         ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
       }
@@ -355,8 +372,25 @@
                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                  team->t.t_id, child_tid));
         ANNOTATE_REDUCE_AFTER(reduce);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+        ompt_data_t *my_task_data = OMPT_CUR_TASK_DATA(this_thr);
+        ompt_data_t *my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
+        void *return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
+        if (ompt_enabled.enabled && ompt_enabled.ompt_callback_reduction) {
+          ompt_callbacks.ompt_callback(ompt_callback_reduction)(
+              ompt_sync_region_reduction, ompt_scope_begin, my_parallel_data,
+              my_task_data, return_address);
+        }
+#endif
         (*reduce)(this_thr->th.th_local.reduce_data,
                   child_thr->th.th_local.reduce_data);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+        if (ompt_enabled.enabled && ompt_enabled.ompt_callback_reduction) {
+          ompt_callbacks.ompt_callback(ompt_callback_reduction)(
+              ompt_sync_region_reduction, ompt_scope_end, my_parallel_data,
+              my_task_data, return_address);
+        }
+#endif
         ANNOTATE_REDUCE_BEFORE(reduce);
         ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
       }
@@ -600,8 +634,25 @@
                  gtid, team->t.t_id, tid, __kmp_gtid_from_tid(child_tid, team),
                  team->t.t_id, child_tid));
         ANNOTATE_REDUCE_AFTER(reduce);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+        ompt_data_t *my_task_data = OMPT_CUR_TASK_DATA(this_thr);
+        ompt_data_t *my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
+        void *return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
+        if (ompt_enabled.enabled && ompt_enabled.ompt_callback_reduction) {
+          ompt_callbacks.ompt_callback(ompt_callback_reduction)(
+              ompt_sync_region_reduction, ompt_scope_begin, my_parallel_data,
+              my_task_data, return_address);
+        }
+#endif
         (*reduce)(this_thr->th.th_local.reduce_data,
                   child_thr->th.th_local.reduce_data);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+        if (ompt_enabled.enabled && ompt_enabled.ompt_callback_reduction) {
+          ompt_callbacks.ompt_callback(ompt_callback_reduction)(
+              ompt_sync_region_reduction, ompt_scope_end, my_parallel_data,
+              my_task_data, return_address);
+        }
+#endif
         ANNOTATE_REDUCE_BEFORE(reduce);
         ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
       }
@@ -912,6 +963,16 @@
       flag.wait(this_thr, FALSE USE_ITT_BUILD_ARG(itt_sync_obj));
       if (reduce) {
         ANNOTATE_REDUCE_AFTER(reduce);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+        ompt_data_t *my_task_data = OMPT_CUR_TASK_DATA(this_thr);
+        ompt_data_t *my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
+        void *return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
+        if (ompt_enabled.enabled && ompt_enabled.ompt_callback_reduction) {
+          ompt_callbacks.ompt_callback(ompt_callback_reduction)(
+              ompt_sync_region_reduction, ompt_scope_begin, my_parallel_data,
+              my_task_data, return_address);
+        }
+#endif
         for (child_tid = tid + 1; child_tid <= tid + thr_bar->leaf_kids;
              ++child_tid) {
           KA_TRACE(100, ("__kmp_hierarchical_barrier_gather: T#%d(%d:%d) += "
@@ -923,6 +984,13 @@
           (*reduce)(this_thr->th.th_local.reduce_data,
                     other_threads[child_tid]->th.th_local.reduce_data);
         }
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+        if (ompt_enabled.enabled && ompt_enabled.ompt_callback_reduction) {
+          ompt_callbacks.ompt_callback(ompt_callback_reduction)(
+              ompt_sync_region_reduction, ompt_scope_end, my_parallel_data,
+              my_task_data, return_address);
+        }
+#endif
         ANNOTATE_REDUCE_BEFORE(reduce);
         ANNOTATE_REDUCE_BEFORE(&team->t.t_bar);
       }
diff --git a/openmp/runtime/src/kmp_csupport.cpp b/openmp/runtime/src/kmp_csupport.cpp
--- a/openmp/runtime/src/kmp_csupport.cpp
+++ b/openmp/runtime/src/kmp_csupport.cpp
@@ -3429,13 +3429,35 @@
       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  kmp_info_t *this_thr = __kmp_threads[global_tid];
+  ompt_data_t *my_task_data = OMPT_CUR_TASK_DATA(this_thr);
+  ompt_data_t *my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
+  void *return_address = OMPT_LOAD_RETURN_ADDRESS(global_tid);
+#endif

   if (packed_reduction_method == critical_reduce_block) {

+#if OMPT_SUPPORT && OMPT_OPTIONAL
+    if (ompt_enabled.enabled && ompt_enabled.ompt_callback_reduction) {
+      ompt_callbacks.ompt_callback(ompt_callback_reduction)(
+          ompt_sync_region_reduction, ompt_scope_begin, my_parallel_data,
+          my_task_data, return_address);
+    }
+#endif
+
     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
     retval = 1;

   } else if (packed_reduction_method == empty_reduce_block) {

+#if OMPT_SUPPORT && OMPT_OPTIONAL
+    if (ompt_enabled.enabled && ompt_enabled.ompt_callback_reduction) {
+      ompt_callbacks.ompt_callback(ompt_callback_reduction)(
+          ompt_sync_region_reduction, ompt_scope_begin, my_parallel_data,
+          my_task_data, return_address);
+    }
+#endif
+
     // usage: if team size == 1, no synchronization is required ( Intel
     // platforms only )
     retval = 1;
@@ -3536,15 +3558,38 @@
   packed_reduction_method = __KMP_GET_REDUCTION_METHOD(global_tid);

+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  kmp_info_t *this_thr = __kmp_threads[global_tid];
+  ompt_data_t *my_task_data = OMPT_CUR_TASK_DATA(this_thr);
+  ompt_data_t *my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
+  void *return_address = OMPT_LOAD_RETURN_ADDRESS(global_tid);
+#endif
+
   if (packed_reduction_method == critical_reduce_block) {

     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+    if (ompt_enabled.enabled && ompt_enabled.ompt_callback_reduction) {
+      ompt_callbacks.ompt_callback(ompt_callback_reduction)(
+          ompt_sync_region_reduction, ompt_scope_end, my_parallel_data,
+          my_task_data, return_address);
+    }
+#endif

   } else if (packed_reduction_method == empty_reduce_block) {

     // usage: if team size == 1, no synchronization is required ( on Intel
     // platforms only )

+#if OMPT_SUPPORT && OMPT_OPTIONAL
+
+    if (ompt_enabled.enabled && ompt_enabled.ompt_callback_reduction) {
+      ompt_callbacks.ompt_callback(ompt_callback_reduction)(
+          ompt_sync_region_reduction, ompt_scope_end, my_parallel_data,
+          my_task_data, return_address);
+    }
+#endif
+
   } else if (packed_reduction_method == atomic_reduce_block) {

     // neither master nor other workers should get here
@@ -3556,6 +3601,7 @@
                                    tree_reduce_block)) {

     // only master gets here
+    // OMPT: tree reduction is annotated in the barrier code

   } else {
@@ -3629,13 +3675,34 @@
       loc, global_tid, num_vars, reduce_size, reduce_data, reduce_func, lck);
   __KMP_SET_REDUCTION_METHOD(global_tid, packed_reduction_method);
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  kmp_info_t *this_thr = __kmp_threads[global_tid];
+  ompt_data_t *my_task_data = OMPT_CUR_TASK_DATA(this_thr);
+  ompt_data_t *my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
+  void *return_address = OMPT_LOAD_RETURN_ADDRESS(global_tid);
+#endif
+
   if (packed_reduction_method == critical_reduce_block) {

+#if OMPT_SUPPORT && OMPT_OPTIONAL
+    if (ompt_enabled.enabled && ompt_enabled.ompt_callback_reduction) {
+      ompt_callbacks.ompt_callback(ompt_callback_reduction)(
+          ompt_sync_region_reduction, ompt_scope_begin, my_parallel_data,
+          my_task_data, return_address);
+    }
+#endif
     __kmp_enter_critical_section_reduce_block(loc, global_tid, lck);
     retval = 1;

   } else if (packed_reduction_method == empty_reduce_block) {

+#if OMPT_SUPPORT && OMPT_OPTIONAL
+    if (ompt_enabled.enabled && ompt_enabled.ompt_callback_reduction) {
+      ompt_callbacks.ompt_callback(ompt_callback_reduction)(
+          ompt_sync_region_reduction, ompt_scope_begin, my_parallel_data,
+          my_task_data, return_address);
+    }
+#endif
     // usage: if team size == 1, no synchronization is required ( Intel
     // platforms only )
     retval = 1;
@@ -3723,9 +3790,23 @@
   // this barrier should be visible to a customer and to the threading profile
   // tool (it's a terminating barrier on constructs if NOWAIT not specified)
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+  kmp_info_t *this_thr = __kmp_threads[global_tid];
+  ompt_data_t *my_task_data = OMPT_CUR_TASK_DATA(this_thr);
+  ompt_data_t *my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
+  void *return_address = OMPT_LOAD_RETURN_ADDRESS(global_tid);
+#endif

   if (packed_reduction_method == critical_reduce_block) {
     __kmp_end_critical_section_reduce_block(loc, global_tid, lck);
+
+#if OMPT_SUPPORT && OMPT_OPTIONAL
+    if (ompt_enabled.enabled && ompt_enabled.ompt_callback_reduction) {
+      ompt_callbacks.ompt_callback(ompt_callback_reduction)(
+          ompt_sync_region_reduction, ompt_scope_end, my_parallel_data,
+          my_task_data, return_address);
+    }
+#endif
     // TODO: implicit barrier: should be exposed
 #if OMPT_SUPPORT
@@ -3749,6 +3830,14 @@

   } else if (packed_reduction_method == empty_reduce_block) {

+#if OMPT_SUPPORT && OMPT_OPTIONAL
+    if (ompt_enabled.enabled && ompt_enabled.ompt_callback_reduction) {
+      ompt_callbacks.ompt_callback(ompt_callback_reduction)(
+          ompt_sync_region_reduction, ompt_scope_end, my_parallel_data,
+          my_task_data, return_address);
+    }
+#endif
+
     // usage: if team size==1, no synchronization is required (Intel platforms only)

     // TODO: implicit barrier: should be exposed
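
Note (not part of the patch): every hunk in the two files above repeats one idiom — capture the OMPT context of the reducing thread once, then bracket the actual reduction work with a guarded begin/end pair. Distilled, with explanatory comments added (a schematic fragment using the names from the patch, not a drop-in runtime function):

  #if OMPT_SUPPORT && OMPT_OPTIONAL
    // OMPT context: the reducing thread's implicit task, its team's parallel
    // region, and the return address of the runtime entry point, which lets a
    // tool attribute the event to a source location in user code.
    ompt_data_t *my_task_data = OMPT_CUR_TASK_DATA(this_thr);
    ompt_data_t *my_parallel_data = OMPT_CUR_TEAM_DATA(this_thr);
    void *return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
    // Dispatch only when a tool is attached and has registered the callback.
    if (ompt_enabled.enabled && ompt_enabled.ompt_callback_reduction) {
      ompt_callbacks.ompt_callback(ompt_callback_reduction)(
          ompt_sync_region_reduction, ompt_scope_begin, my_parallel_data,
          my_task_data, return_address);
    }
  #endif
    // ... perform the reduction: (*reduce)(...), the critical section, or
    // nothing at all in the empty_reduce_block case ...
  #if OMPT_SUPPORT && OMPT_OPTIONAL
    // Matching end event with the same context.
    if (ompt_enabled.enabled && ompt_enabled.ompt_callback_reduction) {
      ompt_callbacks.ompt_callback(ompt_callback_reduction)(
          ompt_sync_region_reduction, ompt_scope_end, my_parallel_data,
          my_task_data, return_address);
    }
  #endif
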
diff --git a/openmp/runtime/src/ompt-event-specific.h b/openmp/runtime/src/ompt-event-specific.h
--- a/openmp/runtime/src/ompt-event-specific.h
+++ b/openmp/runtime/src/ompt-event-specific.h
@@ -99,7 +99,7 @@

 #define ompt_callback_cancel_implemented ompt_event_MAY_ALWAYS_OPTIONAL

-#define ompt_callback_reduction_implemented ompt_event_UNIMPLEMENTED
+#define ompt_callback_reduction_implemented ompt_event_MAY_ALWAYS_OPTIONAL

 #define ompt_callback_dispatch_implemented ompt_event_UNIMPLEMENTED
diff --git a/openmp/runtime/test/ompt/callback.h b/openmp/runtime/test/ompt/callback.h
--- a/openmp/runtime/test/ompt/callback.h
+++ b/openmp/runtime/test/ompt/callback.h
@@ -408,7 +408,7 @@
     case ompt_sync_region_taskgroup:
       printf("%" PRIu64 ": ompt_event_wait_taskgroup_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, parallel_data->value, task_data->value, codeptr_ra);
       break;
-    case ompt_sync_region_reduction:
+    default:
       break;
   }
   break;
@@ -427,13 +427,32 @@
     case ompt_sync_region_taskgroup:
       printf("%" PRIu64 ": ompt_event_wait_taskgroup_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, (parallel_data)?parallel_data->value:0, task_data->value, codeptr_ra);
       break;
-    case ompt_sync_region_reduction:
+    default:
       break;
   }
   break;
  }
}

+static void
+on_ompt_callback_reduction(
+  ompt_sync_region_kind_t kind,
+  ompt_scope_endpoint_t endpoint,
+  ompt_data_t *parallel_data,
+  ompt_data_t *task_data,
+  const void *codeptr_ra)
+{
+  switch(endpoint)
+  {
+    case ompt_scope_begin:
+      printf("%" PRIu64 ": ompt_event_reduction_begin: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, (parallel_data)?parallel_data->value:0, task_data->value, codeptr_ra);
+      break;
+    case ompt_scope_end:
+      printf("%" PRIu64 ": ompt_event_reduction_end: parallel_id=%" PRIu64 ", task_id=%" PRIu64 ", codeptr_ra=%p\n", ompt_get_thread_data()->value, (parallel_data)?parallel_data->value:0, task_data->value, codeptr_ra);
+      break;
+  }
+}
+
 static void
 on_ompt_callback_flush(
   ompt_data_t *thread_data,
@@ -784,6 +803,7 @@
   register_callback(ompt_callback_nest_lock);
   register_callback(ompt_callback_sync_region);
   register_callback_t(ompt_callback_sync_region_wait, ompt_callback_sync_region_t);
+  register_callback_t(ompt_callback_reduction, ompt_callback_sync_region_t);
   register_callback(ompt_callback_control_tool);
   register_callback(ompt_callback_flush);
   register_callback(ompt_callback_cancel);
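
Note (not part of the patch): ompt_callback_reduction has no dedicated callback type; it reuses the sync-region signature, which is why the test harness registers it via register_callback_t(ompt_callback_reduction, ompt_callback_sync_region_t). For reference, OpenMP 5.0's omp-tools.h declares that type as follows (the test header above still uses the older ompt_sync_region_kind_t spelling for the first parameter):

  typedef void (*ompt_callback_sync_region_t)(
    ompt_sync_region_t kind,        // here: ompt_sync_region_reduction
    ompt_scope_endpoint_t endpoint, // ompt_scope_begin or ompt_scope_end
    ompt_data_t *parallel_data,
    ompt_data_t *task_data,
    const void *codeptr_ra          // runtime-entry return address
  );
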
diff --git a/openmp/runtime/test/ompt/synchronization/reduction/empty_reduce.c b/openmp/runtime/test/ompt/synchronization/reduction/empty_reduce.c
new file mode 100644
--- /dev/null
+++ b/openmp/runtime/test/ompt/synchronization/reduction/empty_reduce.c
@@ -0,0 +1,34 @@
+// RUN: %libomp-compile-and-run | FileCheck %s
+// RUN: %libomp-compile -DNOWAIT && %libomp-run | FileCheck %s
+// REQUIRES: ompt
+#include "callback.h"
+#include <omp.h>
+
+#ifdef NOWAIT
+  #define FOR_CLAUSE nowait
+#else
+  #define FOR_CLAUSE
+#endif
+
+int main()
+{
+  int sum = 0;
+  int i;
+  #pragma omp parallel num_threads(1)
+  #pragma omp for reduction(+:sum) FOR_CLAUSE
+  for(i = 0; i < 10000; i++)
+  {
+    sum += i;
+  }
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID:[0-9]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID:[0-9]+]]
+
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_reduction_begin: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID]], codeptr_ra=
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_reduction_end: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID]], codeptr_ra=
+
+  return 0;
+}
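
Note (not part of the patch): the two new tests exercise different runtime paths. empty_reduce.c above forces num_threads(1), so the runtime is expected to select the empty_reduce_block path instrumented in kmp_csupport.cpp, and both events fire on the master thread with known parallel and task IDs. tree_reduce.c below runs five threads, so the reduction is expected to take place in the barrier gather code instrumented in kmp_barrier.cpp; which thread combines which partial results is not deterministic, hence the unordered CHECK-DAG lines.
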
diff --git a/openmp/runtime/test/ompt/synchronization/reduction/tree_reduce.c b/openmp/runtime/test/ompt/synchronization/reduction/tree_reduce.c
new file mode 100644
--- /dev/null
+++ b/openmp/runtime/test/ompt/synchronization/reduction/tree_reduce.c
@@ -0,0 +1,40 @@
+// RUN: %libomp-compile-and-run | %sort-threads | FileCheck %s
+// REQUIRES: ompt
+#include "callback.h"
+#include <omp.h>
+
+#ifdef NOWAIT
+  #define FOR_CLAUSE nowait
+#else
+  #define FOR_CLAUSE
+#endif
+
+int main()
+{
+  int sum = 0;
+  int i;
+  #pragma omp parallel num_threads(5)
+  #pragma omp for reduction(+:sum) FOR_CLAUSE
+  for(i = 0; i < 10000; i++)
+  {
+    sum += i;
+  }
+
+  // CHECK: 0: NULL_POINTER=[[NULL:.*$]]
+
+  // CHECK: {{^}}[[MASTER_ID:[0-9]+]]: ompt_event_parallel_begin:
+  // CHECK-SAME: parallel_id=[[PARALLEL_ID:[0-9]+]]
+  // CHECK: {{^}}[[MASTER_ID]]: ompt_event_implicit_task_begin: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID:[0-9]+]]
+
+  // order and distribution to threads not determined
+  // CHECK-DAG: {{^}}{{[0-f]+}}: ompt_event_reduction_begin: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID]], codeptr_ra=
+  // CHECK-DAG: {{^}}{{[0-f]+}}: ompt_event_reduction_end: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID]], codeptr_ra=
+  // CHECK-DAG: {{^}}{{[0-f]+}}: ompt_event_reduction_begin: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID]], codeptr_ra=
+  // CHECK-DAG: {{^}}{{[0-f]+}}: ompt_event_reduction_end: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID]], codeptr_ra=
+  // CHECK-DAG: {{^}}{{[0-f]+}}: ompt_event_reduction_begin: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID]], codeptr_ra=
+  // CHECK-DAG: {{^}}{{[0-f]+}}: ompt_event_reduction_end: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID]], codeptr_ra=
+  // CHECK-DAG: {{^}}{{[0-f]+}}: ompt_event_reduction_begin: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID]], codeptr_ra=
+  // CHECK-DAG: {{^}}{{[0-f]+}}: ompt_event_reduction_end: parallel_id=[[PARALLEL_ID]], task_id=[[TASK_ID]], codeptr_ra=
+
+  return 0;
+}
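
Note (not part of the patch): with ompt_callback_reduction_implemented now ompt_event_MAY_ALWAYS_OPTIONAL, any first-party OMPT tool can subscribe to these events, not just the test harness. A minimal standalone tool might look like the sketch below — written against OpenMP 5.0 omp-tools.h naming, with hypothetical names (on_reduction, tool_initialize, tool_finalize):

  #include <stdio.h>
  #include <omp-tools.h>

  // Matches ompt_callback_sync_region_t; for this event, kind is always
  // ompt_sync_region_reduction.
  static void on_reduction(ompt_sync_region_t kind,
                           ompt_scope_endpoint_t endpoint,
                           ompt_data_t *parallel_data,
                           ompt_data_t *task_data,
                           const void *codeptr_ra) {
    (void)kind; (void)parallel_data; (void)task_data;
    printf("reduction %s: codeptr_ra=%p\n",
           endpoint == ompt_scope_begin ? "begin" : "end",
           (void *)codeptr_ra);
  }

  static int tool_initialize(ompt_function_lookup_t lookup,
                             int initial_device_num, ompt_data_t *tool_data) {
    (void)initial_device_num; (void)tool_data;
    // Obtain the runtime entry point, then subscribe to reduction events.
    ompt_set_callback_t set_callback =
        (ompt_set_callback_t)lookup("ompt_set_callback");
    set_callback(ompt_callback_reduction, (ompt_callback_t)on_reduction);
    return 1; // non-zero keeps the tool active
  }

  static void tool_finalize(ompt_data_t *tool_data) { (void)tool_data; }

  // The runtime looks for this symbol in libraries named in
  // OMP_TOOL_LIBRARIES (or linked/preloaded into the program).
  ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version,
                                            const char *runtime_version) {
    (void)omp_version; (void)runtime_version;
    static ompt_start_tool_result_t result = {&tool_initialize, &tool_finalize,
                                              {0}};
    return &result;
  }

Built with, e.g., cc -shared -fPIC -o reduction_tool.so reduction_tool.c and run with OMP_TOOL_LIBRARIES=./reduction_tool.so, this should print one begin/end pair around each reduction path instrumented above.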