Index: openmp/trunk/runtime/src/kmp_tasking.c
===================================================================
--- openmp/trunk/runtime/src/kmp_tasking.c
+++ openmp/trunk/runtime/src/kmp_tasking.c
@@ -1806,17 +1806,17 @@
                                 int *thread_finished
                                 USE_ITT_BUILD_ARG(void * itt_sync_obj), kmp_int32 is_constrained)
 {
-    kmp_task_team_t *     task_team;
+    kmp_task_team_t *     task_team = thread->th.th_task_team;
     kmp_thread_data_t *   threads_data;
     kmp_task_t *          task;
+    kmp_info_t *          other_thread;
     kmp_taskdata_t *      current_task = thread -> th.th_current_task;
     volatile kmp_uint32 * unfinished_threads;
-    kmp_int32             nthreads, last_stolen, k, tid;
+    kmp_int32             nthreads, victim=-2, use_own_tasks=1, new_victim=0, tid=thread->th.th_info.ds.ds_tid;

     KMP_DEBUG_ASSERT( __kmp_tasking_mode != tskm_immediate_exec );
     KMP_DEBUG_ASSERT( thread == __kmp_threads[ gtid ] );

-    task_team = thread -> th.th_task_team;
     if (task_team == NULL) return FALSE;

     KA_TRACE(15, ("__kmp_execute_tasks_template(enter): T#%d final_spin=%d *thread_finished=%d\n",
@@ -1834,277 +1834,154 @@
 #endif
     KMP_DEBUG_ASSERT( (int)(TCR_4(*unfinished_threads)) >= 0 );

-    // Choose tasks from our own work queue.
-  start:
-    while (( task = __kmp_remove_my_task( thread, gtid, task_team, is_constrained )) != NULL ) {
-#if USE_ITT_BUILD && USE_ITT_NOTIFY
-        if ( __itt_sync_create_ptr || KMP_ITT_DEBUG ) {
-            if ( itt_sync_obj == NULL ) {
-                // we are at fork barrier where we could not get the object reliably
-                itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier );
-            }
-            __kmp_itt_task_starting( itt_sync_obj );
-        }
-#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
-        __kmp_invoke_task( gtid, task, current_task );
-#if USE_ITT_BUILD
-        if ( itt_sync_obj != NULL )
-            __kmp_itt_task_finished( itt_sync_obj );
-#endif /* USE_ITT_BUILD */
-
-        // If this thread is only partway through the barrier and the condition
-        // is met, then return now, so that the barrier gather/release pattern can proceed.
-        // If this thread is in the last spin loop in the barrier, waiting to be
-        // released, we know that the termination condition will not be satisified,
-        // so don't waste any cycles checking it.
-        if (flag == NULL || (!final_spin && flag->done_check())) {
-            KA_TRACE(15, ("__kmp_execute_tasks_template(exit #1): T#%d spin condition satisfied\n", gtid) );
-            return TRUE;
-        }
-        if (thread->th.th_task_team == NULL) break;
-        KMP_YIELD( __kmp_library == library_throughput ); // Yield before executing next task
-    }
-
-    // This thread's work queue is empty. If we are in the final spin loop
-    // of the barrier, check and see if the termination condition is satisfied.
 #if OMP_41_ENABLED
-    // The work queue may be empty but there might be proxy tasks still executing
-    if (final_spin && TCR_4(current_task -> td_incomplete_child_tasks) == 0)
 #else
-    if (final_spin)
 #endif
-    {
-        // First, decrement the #unfinished threads, if that has not already
-        // been done. This decrement might be to the spin location, and
-        // result in the termination condition being satisfied.
-        if (! *thread_finished) {
-            kmp_uint32 count;
-
-            count = KMP_TEST_THEN_DEC32( (kmp_int32 *)unfinished_threads ) - 1;
-            KA_TRACE(20, ("__kmp_execute_tasks_template(dec #1): T#%d dec unfinished_threads to %d task_team=%p\n",
-                          gtid, count, task_team) );
-            *thread_finished = TRUE;
-        }
-
-        // It is now unsafe to reference thread->th.th_team !!!
-        // Decrementing task_team->tt.tt_unfinished_threads can allow the master
-        // thread to pass through the barrier, where it might reset each thread's
-        // th.th_team field for the next parallel region.
-        // If we can steal more work, we know that this has not happened yet.
-        if (flag != NULL && flag->done_check()) {
-            KA_TRACE(15, ("__kmp_execute_tasks_template(exit #2): T#%d spin condition satisfied\n", gtid) );
-            return TRUE;
-        }
-    }
-
-    if (thread->th.th_task_team == NULL) return FALSE;
 #if OMP_41_ENABLED
-    // check if there are other threads to steal from, otherwise go back
-    if ( nthreads == 1 )
-        goto start;
 #endif
+    while (1) { // Outer loop keeps trying to find tasks in case of single thread getting tasks from target constructs
+        while (1) { // Inner loop to find a task and execute it
+            task = NULL;
+            if (use_own_tasks) { // check on own queue first
+                task = __kmp_remove_my_task( thread, gtid, task_team, is_constrained );
+            }
+            if ((task == NULL) && (nthreads > 1)) { // Steal a task
+                int asleep = 1;
+                use_own_tasks = 0;
+                // Try to steal from the last place I stole from successfully.
+                if (victim == -2) { // haven't stolen anything yet
+                    victim = threads_data[tid].td.td_deque_last_stolen;
+                    if (victim != -1) // if we have a last stolen from victim, get the thread
+                        other_thread = threads_data[victim].td.td_thr;
+                }
+                if (victim != -1) { // found last victim
+                    asleep = 0;
+                }
+                else if (!new_victim) { // no recent steals and we haven't already used a new victim; select a random thread
+                    do { // Find a different thread to steal work from.
+                        // Pick a random thread. Initial plan was to cycle through all the threads, and only return if
+                        // we tried to steal from every thread, and failed. Arch says that's not such a great idea.
+                        victim = __kmp_get_random(thread) % (nthreads - 1);
+                        if (victim >= tid) {
+                            ++victim; // Adjusts random distribution to exclude self
+                        }
+                        // Found a potential victim
+                        other_thread = threads_data[victim].td.td_thr;
+                        // There is a slight chance that __kmp_enable_tasking() did not wake up all threads
+                        // waiting at the barrier. If victim is sleeping, then wake it up. Since we were going to
+                        // pay the cache miss penalty for referencing another thread's kmp_info_t struct anyway,
+                        // the check shouldn't cost too much performance at this point. In extra barrier mode, tasks
+                        // do not sleep at the separate tasking barrier, so this isn't a problem.
+                        asleep = 0;
+                        if ( ( __kmp_tasking_mode == tskm_task_teams ) &&
+                             (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) &&
+                             (TCR_PTR(other_thread->th.th_sleep_loc) != NULL)) {
+                            asleep = 1;
+                            __kmp_null_resume_wrapper(__kmp_gtid_from_thread(other_thread), other_thread->th.th_sleep_loc);
+                            // A sleeping thread should not have any tasks on its queue. There is a slight
+                            // possibility that it resumes, steals a task from another thread, which spawns more
+                            // tasks, all in the time that it takes this thread to check => don't write an assertion
+                            // that the victim's queue is empty. Try stealing from a different thread.
+                        }
+                    } while (asleep);
+                }
-    // Try to steal from the last place I stole from successfully.
-    tid = thread -> th.th_info.ds.ds_tid;//__kmp_tid_from_gtid( gtid );
-    last_stolen = threads_data[ tid ].td.td_deque_last_stolen;
+                if (!asleep) {
+                    // We have a victim to try to steal from
+                    task = __kmp_steal_task(other_thread, gtid, task_team, unfinished_threads, thread_finished, is_constrained);
+                }
+                if (task != NULL) { // set last stolen to victim
+                    if (threads_data[tid].td.td_deque_last_stolen != victim) {
+                        threads_data[tid].td.td_deque_last_stolen = victim;
+                        // The pre-refactored code did not try more than 1 successful new victim,
+                        // unless the last one generated more local tasks; new_victim keeps track of this
+                        new_victim = 1;
+                    }
+                }
+                else { // No tasks found; unset last_stolen
+                    KMP_CHECK_UPDATE(threads_data[tid].td.td_deque_last_stolen, -1);
+                    victim = -2; // no successful victim found
+                }
+            }
-    if (last_stolen != -1) {
-        kmp_info_t *other_thread = threads_data[last_stolen].td.td_thr;
+            if (task == NULL) // break out of tasking loop
+                break;
-        while ((task = __kmp_steal_task( other_thread, gtid, task_team, unfinished_threads,
-                                         thread_finished, is_constrained )) != NULL)
-        {
+            // Found a task; execute it
 #if USE_ITT_BUILD && USE_ITT_NOTIFY
             if ( __itt_sync_create_ptr || KMP_ITT_DEBUG ) {
-                if ( itt_sync_obj == NULL ) {
-                    // we are at fork barrier where we could not get the object reliably
-                    itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier );
+                if ( itt_sync_obj == NULL ) { // we are at fork barrier where we could not get the object reliably
+                    itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier );
                 }
                 __kmp_itt_task_starting( itt_sync_obj );
             }
 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
             __kmp_invoke_task( gtid, task, current_task );
 #if USE_ITT_BUILD
-            if ( itt_sync_obj != NULL )
-                __kmp_itt_task_finished( itt_sync_obj );
+            if ( itt_sync_obj != NULL ) __kmp_itt_task_finished( itt_sync_obj );
 #endif /* USE_ITT_BUILD */
-
-            // Check to see if this thread can proceed.
+            // If this thread is only partway through the barrier and the condition is met, then return now,
+            // so that the barrier gather/release pattern can proceed. If this thread is in the last spin loop
+            // in the barrier, waiting to be released, we know that the termination condition will not be
+            // satisfied, so don't waste any cycles checking it.
             if (flag == NULL || (!final_spin && flag->done_check())) {
-                KA_TRACE(15, ("__kmp_execute_tasks_template(exit #3): T#%d spin condition satisfied\n",
-                              gtid) );
+                KA_TRACE(15, ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n", gtid) );
                 return TRUE;
             }
-
-            if (thread->th.th_task_team == NULL) break;
+            if (thread->th.th_task_team == NULL) {
+                break;
+            }
             KMP_YIELD( __kmp_library == library_throughput ); // Yield before executing next task
-            // If the execution of the stolen task resulted in more tasks being
-            // placed on our run queue, then restart the whole process.
-            if (TCR_4(threads_data[ tid ].td.td_deque_ntasks) != 0) {
-                KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned other tasks, restart\n",
-                              gtid) );
-                goto start;
+            // If execution of a stolen task results in more tasks being placed on our run queue, reset use_own_tasks
+            if (!use_own_tasks && TCR_4(threads_data[tid].td.td_deque_ntasks) != 0) {
+                KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned other tasks, restart\n", gtid));
+                use_own_tasks = 1;
+                new_victim = 0;
             }
         }
-        // Don't give priority to stealing from this thread anymore.
-        threads_data[ tid ].td.td_deque_last_stolen = -1;
-
-        // The victims's work queue is empty. If we are in the final spin loop
-        // of the barrier, check and see if the termination condition is satisfied.
+        // The task source has been exhausted. If in final spin loop of barrier, check if termination condition is satisfied.
 #if OMP_41_ENABLED
         // The work queue may be empty but there might be proxy tasks still executing
-        if (final_spin && TCR_4(current_task -> td_incomplete_child_tasks) == 0)
+        if (final_spin && TCR_4(current_task->td_incomplete_child_tasks) == 0)
 #else
         if (final_spin)
 #endif
         {
-            // First, decrement the #unfinished threads, if that has not already
-            // been done. This decrement might be to the spin location, and
-            // result in the termination condition being satisfied.
+            // First, decrement the #unfinished threads, if that has not already been done. This decrement
+            // might be to the spin location, and result in the termination condition being satisfied.
             if (! *thread_finished) {
                 kmp_uint32 count;

                 count = KMP_TEST_THEN_DEC32( (kmp_int32 *)unfinished_threads ) - 1;
-                KA_TRACE(20, ("__kmp_execute_tasks_template(dec #2): T#%d dec unfinished_threads to %d "
-                              "task_team=%p\n", gtid, count, task_team) );
+                KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d dec unfinished_threads to %d task_team=%p\n",
+                              gtid, count, task_team) );
                 *thread_finished = TRUE;
             }

-            // If __kmp_tasking_mode != tskm_immediate_exec
-            // then it is now unsafe to reference thread->th.th_team !!!
-            // Decrementing task_team->tt.tt_unfinished_threads can allow the master
-            // thread to pass through the barrier, where it might reset each thread's
-            // th.th_team field for the next parallel region.
+            // It is now unsafe to reference thread->th.th_team !!!
+            // Decrementing task_team->tt.tt_unfinished_threads can allow the master thread to pass through
+            // the barrier, where it might reset each thread's th.th_team field for the next parallel region.
             // If we can steal more work, we know that this has not happened yet.
             if (flag != NULL && flag->done_check()) {
-                KA_TRACE(15, ("__kmp_execute_tasks_template(exit #4): T#%d spin condition satisfied\n",
-                              gtid) );
+                KA_TRACE(15, ("__kmp_execute_tasks_template: T#%d spin condition satisfied\n", gtid) );
                 return TRUE;
             }
         }
-        if (thread->th.th_task_team == NULL) return FALSE;
-    }
-
-    // Find a different thread to steal work from. Pick a random thread.
-    // My initial plan was to cycle through all the threads, and only return
-    // if we tried to steal from every thread, and failed. Arch says that's
-    // not such a great idea.
-    // GEH - need yield code in this loop for throughput library mode?
-  new_victim:
-    k = __kmp_get_random( thread ) % (nthreads - 1);
-    if ( k >= thread -> th.th_info.ds.ds_tid ) {
-        ++k; // Adjusts random distribution to exclude self
-    }
-    {
-        kmp_info_t *other_thread = threads_data[k].td.td_thr;
-        int first;
-
-        // There is a slight chance that __kmp_enable_tasking() did not wake up
-        // all threads waiting at the barrier. If this thread is sleeping, then
-        // wake it up. Since we were going to pay the cache miss penalty
-        // for referencing another thread's kmp_info_t struct anyway, the check
-        // shouldn't cost too much performance at this point.
-        // In extra barrier mode, tasks do not sleep at the separate tasking
-        // barrier, so this isn't a problem.
-        if ( ( __kmp_tasking_mode == tskm_task_teams ) &&
-             (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) &&
-             (TCR_PTR(other_thread->th.th_sleep_loc) != NULL))
-        {
-            __kmp_null_resume_wrapper(__kmp_gtid_from_thread(other_thread), other_thread->th.th_sleep_loc);
-            // A sleeping thread should not have any tasks on it's queue.
-            // There is a slight possibility that it resumes, steals a task from
-            // another thread, which spawns more tasks, all in the time that it takes
-            // this thread to check => don't write an assertion that the victim's
-            // queue is empty. Try stealing from a different thread.
-            goto new_victim;
-        }
-
-        // Now try to steal work from the selected thread
-        first = TRUE;
-        while ((task = __kmp_steal_task( other_thread, gtid, task_team, unfinished_threads,
-                                         thread_finished, is_constrained )) != NULL)
-        {
 #if USE_ITT_BUILD && USE_ITT_NOTIFY
-            if ( __itt_sync_create_ptr || KMP_ITT_DEBUG ) {
-                if ( itt_sync_obj == NULL ) {
-                    // we are at fork barrier where we could not get the object reliably
-                    itt_sync_obj = __kmp_itt_barrier_object( gtid, bs_forkjoin_barrier );
-                }
-                __kmp_itt_task_starting( itt_sync_obj );
-            }
 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */
-            __kmp_invoke_task( gtid, task, current_task );
 #if USE_ITT_BUILD
-            if ( itt_sync_obj != NULL )
-                __kmp_itt_task_finished( itt_sync_obj );
 #endif /* USE_ITT_BUILD */
-
-            // Try stealing from this victim again, in the future.
-            if (first) {
-                threads_data[ tid ].td.td_deque_last_stolen = k;
-                first = FALSE;
-            }
-
-            // Check to see if this thread can proceed.
-            if (flag == NULL || (!final_spin && flag->done_check())) {
-                KA_TRACE(15, ("__kmp_execute_tasks_template(exit #5): T#%d spin condition satisfied\n",
-                              gtid) );
-                return TRUE;
-            }
-            if (thread->th.th_task_team == NULL) break;
-            KMP_YIELD( __kmp_library == library_throughput ); // Yield before executing next task
-            // If the execution of the stolen task resulted in more tasks being
-            // placed on our run queue, then restart the whole process.
-            if (TCR_4(threads_data[ tid ].td.td_deque_ntasks) != 0) {
-                KA_TRACE(20, ("__kmp_execute_tasks_template: T#%d stolen task spawned other tasks, restart\n",
-                              gtid) );
-                goto start;
-            }
+        // If this thread's task team is NULL, master has recognized that there are no more tasks; bail out
+        if (thread->th.th_task_team == NULL) {
+            KA_TRACE(15, ("__kmp_execute_tasks_template: T#%d no more tasks\n", gtid) );
+            return FALSE;
+        }
-        // The victims's work queue is empty. If we are in the final spin loop
-        // of the barrier, check and see if the termination condition is satisfied.
-        // Going on and finding a new victim to steal from is expensive, as it
-        // involves a lot of cache misses, so we definitely want to re-check the
-        // termination condition before doing that.
 #if OMP_41_ENABLED
-        // The work queue may be empty but there might be proxy tasks still executing
-        if (final_spin && TCR_4(current_task -> td_incomplete_child_tasks) == 0)
-#else
-        if (final_spin)
+        // We could be getting tasks from target constructs; if this is the only thread, keep trying to execute
+        // tasks from own queue
+        if (nthreads == 1)
+            use_own_tasks = 1;
+        else
 #endif
         {
-            // First, decrement the #unfinished threads, if that has not already
-            // been done. This decrement might be to the spin location, and
-            // result in the termination condition being satisfied.
-            if (! *thread_finished) {
-                kmp_uint32 count;
-
-                count = KMP_TEST_THEN_DEC32( (kmp_int32 *)unfinished_threads ) - 1;
-                KA_TRACE(20, ("__kmp_execute_tasks_template(dec #3): T#%d dec unfinished_threads to %d; "
-                              "task_team=%p\n",
-                              gtid, count, task_team) );
-                *thread_finished = TRUE;
-            }
-
-            // If __kmp_tasking_mode != tskm_immediate_exec,
-            // then it is now unsafe to reference thread->th.th_team !!!
-            // Decrementing task_team->tt.tt_unfinished_threads can allow the master
-            // thread to pass through the barrier, where it might reset each thread's
-            // th.th_team field for the next parallel region.
-            // If we can steal more work, we know that this has not happened yet.
-            if (flag != NULL && flag->done_check()) {
-                KA_TRACE(15, ("__kmp_execute_tasks_template(exit #6): T#%d spin condition satisfied\n", gtid) );
-                return TRUE;
-            }
+            KA_TRACE(15, ("__kmp_execute_tasks_template: T#%d can't find work\n", gtid) );
+            return FALSE;
         }
-        if (thread->th.th_task_team == NULL) return FALSE;
     }
-
-    KA_TRACE(15, ("__kmp_execute_tasks_template(exit #7): T#%d can't find work\n", gtid) );
-    return FALSE;
 }

 int __kmp_execute_tasks_32(kmp_info_t *thread, kmp_int32 gtid, kmp_flag_32 *flag, int final_spin,
Index: openmp/trunk/runtime/test/lit.cfg
===================================================================
--- openmp/trunk/runtime/test/lit.cfg
+++ openmp/trunk/runtime/test/lit.cfg
@@ -26,6 +26,9 @@
     else:
        config.environment[name] = path

+for name,value in os.environ.items():
+    config.environment[name] = value
+
 # name: The name of this test suite.
 config.name = 'libomp'
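
Note on the victim-selection logic introduced in the kmp_tasking.c hunk: drawing a random index from [0, nthreads-2] and incrementing any result at or above the calling thread's tid spreads steals uniformly across the other nthreads-1 threads while never picking the caller itself. Below is a minimal standalone C sketch of that trick; pick_victim is a hypothetical helper written for illustration only, and rand() stands in for the runtime's __kmp_get_random().

/* Standalone sketch (not part of the patch) of the victim-selection trick
   used in __kmp_execute_tasks_template above. */
#include <stdio.h>
#include <stdlib.h>

static int pick_victim(int tid, int nthreads)
{
    /* Draw from [0, nthreads-2], then skip over our own id: results below
       tid map to threads 0..tid-1, results at or above tid map to threads
       tid+1..nthreads-1, so each other thread is equally likely. */
    int victim = rand() % (nthreads - 1);
    if (victim >= tid)
        ++victim; /* adjusts random distribution to exclude self */
    return victim;
}

int main(void)
{
    enum { TID = 1, NTHREADS = 4, TRIALS = 30000 };
    int counts[NTHREADS] = {0};
    srand(42);
    for (int i = 0; i < TRIALS; ++i)
        ++counts[pick_victim(TID, NTHREADS)];
    /* Expect roughly TRIALS/3 for threads 0, 2, and 3, and exactly 0 for
       thread 1 (the calling thread). */
    for (int v = 0; v < NTHREADS; ++v)
        printf("victim %d chosen %d times\n", v, counts[v]);
    return 0;
}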