Index: runtime/src/kmp_barrier.cpp =================================================================== --- runtime/src/kmp_barrier.cpp +++ runtime/src/kmp_barrier.cpp @@ -57,7 +57,7 @@ #if USE_ITT_BUILD && USE_ITT_NOTIFY // Barrier imbalance - save arrive time to the thread - if(__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) { + if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) { this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp(); } #endif @@ -97,7 +97,7 @@ USE_ITT_BUILD_ARG(itt_sync_obj) ); #if USE_ITT_BUILD && USE_ITT_NOTIFY // Barrier imbalance - write min of the thread time and the other thread time to the thread. - if (__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) { + if (__kmp_forkjoin_frames_mode == 2) { this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time, other_threads[i]->th.th_bar_min_time); } @@ -234,7 +234,7 @@ #if USE_ITT_BUILD && USE_ITT_NOTIFY // Barrier imbalance - save arrive time to the thread - if(__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) { + if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) { this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp(); } #endif @@ -262,7 +262,7 @@ USE_ITT_BUILD_ARG(itt_sync_obj) ); #if USE_ITT_BUILD && USE_ITT_NOTIFY // Barrier imbalance - write min of the thread time and a child time to the thread. 
- if (__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) { + if (__kmp_forkjoin_frames_mode == 2) { this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time, child_thr->th.th_bar_min_time); } @@ -432,7 +432,7 @@ #if USE_ITT_BUILD && USE_ITT_NOTIFY // Barrier imbalance - save arrive time to the thread - if(__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) { + if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) { this_thr->th.th_bar_arrive_time = this_thr->th.th_bar_min_time = __itt_get_timestamp(); } #endif @@ -485,7 +485,7 @@ USE_ITT_BUILD_ARG(itt_sync_obj) ); #if USE_ITT_BUILD && USE_ITT_NOTIFY // Barrier imbalance - write min of the thread time and a child time to the thread. - if (__kmp_forkjoin_frames_mode == 2 || __kmp_forkjoin_frames_mode == 3) { + if (__kmp_forkjoin_frames_mode == 2) { this_thr->th.th_bar_min_time = KMP_MIN(this_thr->th.th_bar_min_time, child_thr->th.th_bar_min_time); } @@ -743,6 +743,13 @@ gtid, team->t.t_id, tid, bt)); KMP_DEBUG_ASSERT(this_thr == other_threads[this_thr->th.th_info.ds.ds_tid]); +#if USE_ITT_BUILD && USE_ITT_NOTIFY + // Barrier imbalance - save arrive time to the thread + if(__kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 2) { + this_thr->th.th_bar_arrive_time = __itt_get_timestamp(); + } +#endif + (void)__kmp_init_hierarchical_barrier_thread(bt, thr_bar, nproc, gtid, tid, team); if (thr_bar->my_level) { // not a leaf (my_level==0 means leaf) @@ -1113,24 +1120,29 @@ __kmp_itt_barrier_middle(gtid, itt_sync_obj); #endif /* USE_ITT_BUILD */ #if USE_ITT_BUILD && USE_ITT_NOTIFY - // Barrier - report frame end - if (__itt_frame_submit_v3_ptr && __kmp_forkjoin_frames_mode) { + // Barrier - report frame end (only if active_level == 1) + if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode && +#if OMP_40_ENABLED + this_thr->th.th_teams_microtask == NULL && +#endif + team->t.t_active_level == 1) + { kmp_uint64 cur_time = 
__itt_get_timestamp(); - kmp_info_t **other_threads = this_thr->th.th_team->t.t_threads; + kmp_info_t **other_threads = team->t.t_threads; int nproc = this_thr->th.th_team_nproc; int i; - // Initialize with master's wait time - kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time; switch(__kmp_forkjoin_frames_mode) { case 1: __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc); this_thr->th.th_frame_time = cur_time; break; - case 2: + case 2: // AC 2015-01-19: currently does not work for hierarchical (to be fixed) __kmp_itt_frame_submit(gtid, this_thr->th.th_bar_min_time, cur_time, 1, loc, nproc); break; case 3: if( __itt_metadata_add_ptr ) { + // Initialize with master's wait time + kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time; for (i=1; i<nproc; ++i) { delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time ); } @@ -1355,14 +1367,17 @@ # if USE_ITT_BUILD && USE_ITT_NOTIFY // Join barrier - report frame end - if (__itt_frame_submit_v3_ptr && __kmp_forkjoin_frames_mode) { + if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && __kmp_forkjoin_frames_mode && +#if OMP_40_ENABLED + this_thr->th.th_teams_microtask == NULL && +#endif + team->t.t_active_level == 1) + { kmp_uint64 cur_time = __itt_get_timestamp(); ident_t * loc = team->t.t_ident; - kmp_info_t **other_threads = this_thr->th.th_team->t.t_threads; + kmp_info_t **other_threads = team->t.t_threads; int nproc = this_thr->th.th_team_nproc; int i; - // Initialize with master's wait time - kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time; switch(__kmp_forkjoin_frames_mode) { case 1: __kmp_itt_frame_submit(gtid, this_thr->th.th_frame_time, cur_time, 0, loc, nproc); @@ -1372,6 +1387,8 @@ break; case 3: if( __itt_metadata_add_ptr ) { + // Initialize with master's wait time + kmp_uint64 delta = cur_time - this_thr->th.th_bar_arrive_time; for (i=1; i<nproc; ++i) { delta += ( cur_time - other_threads[i]->th.th_bar_arrive_time ); } Index: runtime/src/kmp_csupport.c =================================================================== --- 
runtime/src/kmp_csupport.c +++ runtime/src/kmp_csupport.c @@ -509,27 +509,30 @@ #if USE_ITT_BUILD kmp_uint64 cur_time = 0; #if USE_ITT_NOTIFY - if( __itt_get_timestamp_ptr ) { + if ( __itt_get_timestamp_ptr ) { cur_time = __itt_get_timestamp(); } #endif /* USE_ITT_NOTIFY */ - // Report the barrier - if( ( __kmp_forkjoin_frames_mode == 1 || __kmp_forkjoin_frames_mode == 3 ) && __itt_frame_submit_v3_ptr ) { - if( this_thr->th.th_team->t.t_level == 0 ) { - __kmp_itt_frame_submit( global_tid, this_thr->th.th_frame_time_serialized, cur_time, 0, loc, this_thr->th.th_team_nproc, 0 ); - } - } - // Mark the end of the "parallel" region for VTune. Only use one of frame notification scheme at the moment. - if ( ( __itt_frame_end_v3_ptr && __kmp_forkjoin_frames && ! __kmp_forkjoin_frames_mode ) || KMP_ITT_DEBUG ) - { - this_thr->th.th_ident = loc; - __kmp_itt_region_joined( global_tid, 1 ); - } - if ( ( __itt_frame_submit_v3_ptr && __kmp_forkjoin_frames_mode == 3 ) || KMP_ITT_DEBUG ) - { + if ( this_thr->th.th_team->t.t_level == 0 +#if OMP_40_ENABLED + && this_thr->th.th_teams_microtask == NULL +#endif + ) { + // Report the barrier this_thr->th.th_ident = loc; - // Since barrier frame for serialized region is equal to the region we use the same begin timestamp as for the barrier. - __kmp_itt_frame_submit( global_tid, serial_team->t.t_region_time, cur_time, 0, loc, this_thr->th.th_team_nproc, 2 ); + if ( ( __itt_frame_submit_v3_ptr || KMP_ITT_DEBUG ) && + ( __kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 1 ) ) + { + __kmp_itt_frame_submit( global_tid, this_thr->th.th_frame_time_serialized, + cur_time, 0, loc, this_thr->th.th_team_nproc, 0 ); + if ( __kmp_forkjoin_frames_mode == 3 ) + // Since barrier frame for serialized region is equal to the region we use the same begin timestamp as for the barrier. 
+ __kmp_itt_frame_submit( global_tid, serial_team->t.t_region_time, + cur_time, 0, loc, this_thr->th.th_team_nproc, 2 ); + } else if ( ( __itt_frame_end_v3_ptr || KMP_ITT_DEBUG ) && + ! __kmp_forkjoin_frames_mode && __kmp_forkjoin_frames ) + // Mark the end of the "parallel" region for VTune. Only use one of frame notification scheme at the moment. + __kmp_itt_region_joined( global_tid, 1 ); } #endif /* USE_ITT_BUILD */ Index: runtime/src/kmp_dispatch.cpp =================================================================== --- runtime/src/kmp_dispatch.cpp +++ runtime/src/kmp_dispatch.cpp @@ -628,6 +628,12 @@ #if USE_ITT_BUILD kmp_uint64 cur_chunk = chunk; + int itt_need_metadata_reporting = __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && + KMP_MASTER_GTID(gtid) && +#if OMP_40_ENABLED + th->th.th_teams_microtask == NULL && +#endif + team->t.t_active_level == 1; #endif if ( ! active ) { pr = reinterpret_cast< dispatch_private_info_template< T >* > @@ -864,9 +870,8 @@ } #if USE_ITT_BUILD // Calculate chunk for metadata report - if( __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 ) { + if ( itt_need_metadata_reporting ) cur_chunk = limit - init + 1; - } #endif if ( st == 1 ) { pr->u.p.lb = lb + init; @@ -1119,16 +1124,10 @@ if ( pr->ordered ) { __kmp_itt_ordered_init( gtid ); }; // if -#endif /* USE_ITT_BUILD */ - }; // if - -#if USE_ITT_BUILD - // Report loop metadata - if( __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 ) { - kmp_uint32 tid = __kmp_tid_from_gtid( gtid ); - if (KMP_MASTER_TID(tid)) { + // Report loop metadata + if ( itt_need_metadata_reporting ) { + // Only report metadata by master of active team at level 1 kmp_uint64 schedtype = 0; - switch ( schedule ) { case kmp_sch_static_chunked: case kmp_sch_static_balanced:// Chunk is calculated in the switch above @@ -1151,8 +1150,8 @@ } __kmp_itt_metadata_loop(loc, schedtype, tc, cur_chunk); } - } #endif /* USE_ITT_BUILD */ + }; // if #ifdef KMP_DEBUG { Index: 
runtime/src/kmp_runtime.c =================================================================== --- runtime/src/kmp_runtime.c +++ runtime/src/kmp_runtime.c @@ -800,6 +800,16 @@ /* TODO: Should this be acquire or release? */ status = KMP_COMPARE_AND_STORE_ACQ32(&team->t.t_construct, old_this, th->th.th_local.this_construct); +#if USE_ITT_BUILD + if ( __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && KMP_MASTER_GTID(gtid) && +#if OMP_40_ENABLED + th->th.th_teams_microtask == NULL && +#endif + team->t.t_active_level == 1 ) + { // Only report metadata by master of active team at level 1 + __kmp_itt_metadata_single( id_ref ); + } +#endif /* USE_ITT_BUILD */ } if( __kmp_env_consistency_check ) { @@ -813,10 +823,6 @@ if ( status ) { __kmp_itt_single_start( gtid ); } - if( __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && KMP_MASTER_GTID(gtid)) { - __kmp_itt_metadata_single( id_ref ); - } - #endif /* USE_ITT_BUILD */ return status; } @@ -1394,22 +1400,26 @@ #if USE_ITT_BUILD // Mark the start of the "parallel" region for VTune. Only use one of frame notification scheme at the moment - if ( ( __itt_frame_begin_v3_ptr && __kmp_forkjoin_frames && ! __kmp_forkjoin_frames_mode ) || KMP_ITT_DEBUG ) - { - this_thr->th.th_ident = loc; - // 0 - no barriers; 1 - serialized parallel - __kmp_itt_region_forking( global_tid, this_thr->th.th_team_nproc, 0, 1 ); - } - // Save the start of the "parallel" region for VTune. This is the join barrier begin at the same time. 
- if( ( ( __kmp_forkjoin_frames_mode == 1 || __kmp_forkjoin_frames_mode == 3 ) && - __itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr ) || KMP_ITT_DEBUG ) - { - this_thr->th.th_ident = loc; + if ( serial_team->t.t_level == 1 +#if OMP_40_ENABLED + && this_thr->th.th_teams_microtask == NULL +#endif + ) { #if USE_ITT_NOTIFY - if( this_thr->th.th_team->t.t_level == 1 ) { - serial_team->t.t_region_time = this_thr->th.th_frame_time_serialized = __itt_get_timestamp(); - } + // Save the start of the "parallel" region for VTune. This is the frame begin at the same time. + if ( ( __itt_get_timestamp_ptr || KMP_ITT_DEBUG ) && + ( __kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 1 ) ) + { + serial_team->t.t_region_time = this_thr->th.th_frame_time_serialized = __itt_get_timestamp(); + } else // only one notification scheme (either "submit" or "forking/joined", not both) #endif + if ( ( __itt_frame_begin_v3_ptr || KMP_ITT_DEBUG ) && + __kmp_forkjoin_frames && ! __kmp_forkjoin_frames_mode ) + { + this_thr->th.th_ident = loc; + // 0 - no barriers; 1 - serialized parallel + __kmp_itt_region_forking( global_tid, this_thr->th.th_team_nproc, 0, 1 ); + } } #endif /* USE_ITT_BUILD */ } @@ -1872,36 +1882,30 @@ #if USE_ITT_BUILD - // Mark start of "parallel" region for VTune. Only use one of frame notification scheme at the moment. 
- if ((__itt_frame_begin_v3_ptr && __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) || KMP_ITT_DEBUG) { + if ( team->t.t_active_level == 1 // only report frames at level 1 # if OMP_40_ENABLED - if (!master_th->th.th_teams_microtask || microtask == (microtask_t)__kmp_teams_master) - // Either not in teams or the outer fork of the teams construct + && !master_th->th.th_teams_microtask // not in teams construct # endif /* OMP_40_ENABLED */ - { - __kmp_itt_region_forking(gtid, team->t.t_nproc, 0); - } - } + ) { #if USE_ITT_NOTIFY - kmp_uint64 tmp_time = 0; - if (((__kmp_forkjoin_frames_mode == 1 || __kmp_forkjoin_frames_mode == 3) && __itt_frame_submit_v3_ptr) || KMP_ITT_DEBUG) { - if (!(team->t.t_active_level > 1)) { -# if OMP_40_ENABLED - if (!master_th->th.th_teams_microtask || microtask == (microtask_t)__kmp_teams_master) { - // Either not in teams or the outer fork of the teams construct -# endif /* OMP_40_ENABLED */ - if ( __itt_get_timestamp_ptr ) - tmp_time = __itt_get_timestamp(); - // Internal fork - report frame begin + if ( ( __itt_frame_submit_v3_ptr || KMP_ITT_DEBUG ) && + ( __kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 1 ) ) + { + kmp_uint64 tmp_time = 0; + if ( __itt_get_timestamp_ptr ) + tmp_time = __itt_get_timestamp(); + // Internal fork - report frame begin master_th->th.th_frame_time = tmp_time; - if ( __kmp_forkjoin_frames_mode==3 ) - team->t.t_region_time = tmp_time; -# if OMP_40_ENABLED - } -# endif /* OMP_40_ENABLED */ + if ( __kmp_forkjoin_frames_mode == 3 ) + team->t.t_region_time = tmp_time; + } else // only one notification scheme (either "submit" or "forking/joined", not both) +#endif /* USE_ITT_NOTIFY */ + if ( ( __itt_frame_begin_v3_ptr || KMP_ITT_DEBUG ) && + __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode ) + { // Mark start of "parallel" region for VTune. 
+ __kmp_itt_region_forking(gtid, team->t.t_nproc, 0); } } -#endif /* USE_ITT_NOTIFY */ #endif /* USE_ITT_BUILD */ /* now go on and do the work */ @@ -2027,31 +2031,21 @@ __kmp_itt_stack_caller_destroy( (__itt_caller)team->t.t_stack_id ); // destroy the stack stitching id after join barrier } - // Mark end of "parallel" region for VTune. Only use one of frame notification scheme at the moment. - if ( ( __itt_frame_end_v3_ptr && __kmp_forkjoin_frames && ! __kmp_forkjoin_frames_mode ) || KMP_ITT_DEBUG ) { + // Mark end of "parallel" region for VTune. + if ( team->t.t_active_level == 1 # if OMP_40_ENABLED - if ( !master_th->th.th_teams_microtask /* not in teams */ || - ( !exit_teams && team->t.t_level == master_th->th.th_teams_level ) ) - // Either not in teams or exiting teams region - // (teams is a frame and no other frames inside the teams) + && !master_th->th.th_teams_microtask /* not in teams construct */ # endif /* OMP_40_ENABLED */ - { - master_th->th.th_ident = loc; - __kmp_itt_region_joined( gtid ); - } - } - if ( ( __itt_frame_submit_v3_ptr && __kmp_forkjoin_frames_mode == 3 ) || KMP_ITT_DEBUG ) { -# if OMP_40_ENABLED - if ( !master_th->th.th_teams_microtask /* not in teams */ || - ( !exit_teams && team->t.t_level == master_th->th.th_teams_level ) ) - // Either not in teams or exiting teams region - // (teams is a frame and no other frames inside the teams) -# endif /* OMP_40_ENABLED */ - { + ) { master_th->th.th_ident = loc; - __kmp_itt_frame_submit( gtid, team->t.t_region_time, master_th->th.th_frame_time, 0, loc, master_th->th.th_team_nproc, 1 ); - } - } + // only one notification scheme (either "submit" or "forking/joined", not both) + if ( ( __itt_frame_submit_v3_ptr || KMP_ITT_DEBUG ) && __kmp_forkjoin_frames_mode == 3 ) + __kmp_itt_frame_submit( gtid, team->t.t_region_time, master_th->th.th_frame_time, + 0, loc, master_th->th.th_team_nproc, 1 ); + else if ( ( __itt_frame_end_v3_ptr || KMP_ITT_DEBUG ) && + ! 
__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames ) + __kmp_itt_region_joined( gtid ); + } // active_level == 1 #endif /* USE_ITT_BUILD */ #if OMP_40_ENABLED Index: runtime/src/kmp_sched.cpp =================================================================== --- runtime/src/kmp_sched.cpp +++ runtime/src/kmp_sched.cpp @@ -88,6 +88,7 @@ register kmp_uint32 nth; register UT trip_count; register kmp_team_t *team; + register kmp_info_t *th = __kmp_threads[ gtid ]; KMP_DEBUG_ASSERT( plastiter && plower && pupper && pstride ); KE_TRACE( 10, ("__kmpc_for_static_init called (%d)\n", global_tid)); @@ -139,13 +140,13 @@ if ( schedtype > kmp_ord_upper ) { // we are in DISTRIBUTE construct schedtype += kmp_sch_static - kmp_distribute_static; // AC: convert to usual schedule type - tid = __kmp_threads[ gtid ]->th.th_team->t.t_master_tid; - team = __kmp_threads[ gtid ]->th.th_team->t.t_parent; + tid = th->th.th_team->t.t_master_tid; + team = th->th.th_team->t.t_parent; } else #endif { tid = __kmp_tid_from_gtid( global_tid ); - team = __kmp_threads[ gtid ]->th.th_team; + team = th->th.th_team; } /* determine if "for" loop is an active worksharing construct */ @@ -282,7 +283,12 @@ #if USE_ITT_BUILD // Report loop metadata - if ( KMP_MASTER_TID(tid) && __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 ) { + if ( KMP_MASTER_TID(tid) && __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && +#if OMP_40_ENABLED + th->th.th_teams_microtask == NULL && +#endif + team->t.t_active_level == 1 ) + { kmp_uint64 cur_chunk = chunk; // Calculate chunk in case it was not specified; it is specified for kmp_sch_static_chunked if ( schedtype == kmp_sch_static ) {