Index: openmp/trunk/runtime/src/kmp.h
===================================================================
--- openmp/trunk/runtime/src/kmp.h
+++ openmp/trunk/runtime/src/kmp.h
@@ -35,10 +35,6 @@
 #define TASK_CURRENT_NOT_QUEUED  0
 #define TASK_CURRENT_QUEUED      1
 
-#define TASK_DEQUE_BITS          8  // Used solely to define TASK_DEQUE_SIZE and TASK_DEQUE_MASK.
-#define TASK_DEQUE_SIZE          ( 1 << TASK_DEQUE_BITS )
-#define TASK_DEQUE_MASK          ( TASK_DEQUE_SIZE - 1 )
-
 #ifdef BUILD_TIED_TASK_STACK
 #define TASK_STACK_EMPTY         0  // entries when the stack is empty
 
@@ -2223,6 +2219,7 @@
     // Used only in __kmp_execute_tasks_template, maybe not avail until task is queued?
     kmp_bootstrap_lock_t td_deque_lock;    // Lock for accessing deque
     kmp_taskdata_t **    td_deque;         // Deque of tasks encountered by td_thr, dynamically allocated
+    kmp_int32            td_deque_size;    // Size of deque
     kmp_uint32           td_deque_head;    // Head of deque (will wrap)
     kmp_uint32           td_deque_tail;    // Tail of deque (will wrap)
     kmp_int32            td_deque_ntasks;  // Number of tasks in deque
@@ -2233,6 +2230,12 @@
 #endif // BUILD_TIED_TASK_STACK
 } kmp_base_thread_data_t;
 
+#define TASK_DEQUE_BITS          8  // Used solely to define INITIAL_TASK_DEQUE_SIZE
+#define INITIAL_TASK_DEQUE_SIZE  ( 1 << TASK_DEQUE_BITS )
+
+#define TASK_DEQUE_SIZE(td)      ((td).td_deque_size)
+#define TASK_DEQUE_MASK(td)      ((td).td_deque_size - 1)
+
 typedef union KMP_ALIGN_CACHE kmp_thread_data {
     kmp_base_thread_data_t td;
     double                 td_align;  /* use worst case alignment */
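In kmp.h the deque size becomes a per-thread field: TASK_DEQUE_BITS now only seeds INITIAL_TASK_DEQUE_SIZE, and TASK_DEQUE_SIZE/TASK_DEQUE_MASK read td_deque_size at run time instead of expanding to compile-time constants. The masking still requires the size to stay a power of two, which the doubling reallocation later in the patch preserves. A minimal standalone sketch of that invariant, with hypothetical RING_* names that are not part of the patch:

    /* Illustration only: with a power-of-two size, "& mask" wraps an
       index exactly like "% size", one AND instead of a division. */
    #include <stdio.h>

    #define RING_BITS 3                  /* toy analogue of TASK_DEQUE_BITS */
    #define RING_SIZE (1 << RING_BITS)   /* 8, always a power of two */
    #define RING_MASK (RING_SIZE - 1)    /* 7 == 0b111 */

    int main(void) {
        unsigned i = 5;
        for (int step = 0; step < 10; ++step) {  /* prints 5 6 7 0 1 2 3 4 5 6 */
            printf("%u ", i);
            i = (i + 1) & RING_MASK;             /* same as (i + 1) % RING_SIZE */
        }
        printf("\n");
        return 0;
    }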
Index: openmp/trunk/runtime/src/kmp_debugger.c
===================================================================
--- openmp/trunk/runtime/src/kmp_debugger.c
+++ openmp/trunk/runtime/src/kmp_debugger.c
@@ -93,7 +93,7 @@
         sizeof( void * ),
         OMP_LOCK_T_SIZE < sizeof(void *),
         bs_last_barrier,
-        TASK_DEQUE_SIZE,
+        INITIAL_TASK_DEQUE_SIZE,
 
         // thread structure information
         sizeof( kmp_base_info_t ),
@@ -222,6 +222,7 @@
         // thread_data_t.
         sizeof( kmp_thread_data_t ),
         offset_and_size_of( kmp_base_thread_data_t, td_deque ),
+        offset_and_size_of( kmp_base_thread_data_t, td_deque_size ),
         offset_and_size_of( kmp_base_thread_data_t, td_deque_head ),
         offset_and_size_of( kmp_base_thread_data_t, td_deque_tail ),
         offset_and_size_of( kmp_base_thread_data_t, td_deque_ntasks ),
Index: openmp/trunk/runtime/src/kmp_omp.h
===================================================================
--- openmp/trunk/runtime/src/kmp_omp.h
+++ openmp/trunk/runtime/src/kmp_omp.h
@@ -218,6 +218,7 @@
     /* kmp_thread_data_t */
     kmp_int32          hd_sizeof_struct;
     offset_and_size_t  hd_deque;
+    offset_and_size_t  hd_deque_size;
     offset_and_size_t  hd_deque_head;
     offset_and_size_t  hd_deque_tail;
     offset_and_size_t  hd_deque_ntasks;
Index: openmp/trunk/runtime/src/kmp_tasking.c
===================================================================
--- openmp/trunk/runtime/src/kmp_tasking.c
+++ openmp/trunk/runtime/src/kmp_tasking.c
@@ -305,7 +305,7 @@
     }
 
     // Check if deque is full
-    if ( TCR_4(thread_data -> td.td_deque_ntasks) >= TASK_DEQUE_SIZE )
+    if ( TCR_4(thread_data -> td.td_deque_ntasks) >= TASK_DEQUE_SIZE(thread_data->td) )
     {
         KA_TRACE(20, ( "__kmp_push_task: T#%d deque is full; returning TASK_NOT_PUSHED for task %p\n",
                        gtid, taskdata ) );
@@ -317,7 +317,7 @@
 #if OMP_41_ENABLED
     // Need to recheck as we can get a proxy task from a thread outside of OpenMP
-    if ( TCR_4(thread_data -> td.td_deque_ntasks) >= TASK_DEQUE_SIZE )
+    if ( TCR_4(thread_data -> td.td_deque_ntasks) >= TASK_DEQUE_SIZE(thread_data->td) )
     {
         __kmp_release_bootstrap_lock( & thread_data -> td.td_deque_lock );
         KA_TRACE(20, ( "__kmp_push_task: T#%d deque is full on 2nd check; returning TASK_NOT_PUSHED for task %p\n",
@@ -326,12 +326,12 @@
     }
 #else
     // Must have room since no thread can add tasks but calling thread
-    KMP_DEBUG_ASSERT( TCR_4(thread_data -> td.td_deque_ntasks) < TASK_DEQUE_SIZE );
+    KMP_DEBUG_ASSERT( TCR_4(thread_data -> td.td_deque_ntasks) < TASK_DEQUE_SIZE(thread_data->td) );
 #endif
 
     thread_data -> td.td_deque[ thread_data -> td.td_deque_tail ] = taskdata;  // Push taskdata
     // Wrap index.
-    thread_data -> td.td_deque_tail = ( thread_data -> td.td_deque_tail + 1 ) & TASK_DEQUE_MASK;
+    thread_data -> td.td_deque_tail = ( thread_data -> td.td_deque_tail + 1 ) & TASK_DEQUE_MASK(thread_data->td);
     TCW_4(thread_data -> td.td_deque_ntasks, TCR_4(thread_data -> td.td_deque_ntasks) + 1);  // Adjust task count
 
     __kmp_release_bootstrap_lock( & thread_data -> td.td_deque_lock );
@@ -1641,7 +1641,7 @@
         return NULL;
     }
 
-    tail = ( thread_data -> td.td_deque_tail - 1 ) & TASK_DEQUE_MASK;  // Wrap index.
+    tail = ( thread_data -> td.td_deque_tail - 1 ) & TASK_DEQUE_MASK(thread_data->td);  // Wrap index.
     taskdata = thread_data -> td.td_deque[ tail ];
 
     if (is_constrained) {
@@ -1735,10 +1735,10 @@
     if ( !is_constrained ) {
         taskdata = victim_td -> td.td_deque[ victim_td -> td.td_deque_head ];
         // Bump head pointer and Wrap.
-        victim_td -> td.td_deque_head = ( victim_td -> td.td_deque_head + 1 ) & TASK_DEQUE_MASK;
+        victim_td -> td.td_deque_head = ( victim_td -> td.td_deque_head + 1 ) & TASK_DEQUE_MASK(victim_td->td);
     } else {
         // While we have postponed tasks let's steal from tail of the deque (smaller tasks)
-        kmp_int32 tail = ( victim_td -> td.td_deque_tail - 1 ) & TASK_DEQUE_MASK;  // Wrap index.
+        kmp_int32 tail = ( victim_td -> td.td_deque_tail - 1 ) & TASK_DEQUE_MASK(victim_td->td);  // Wrap index.
         taskdata = victim_td -> td.td_deque[ tail ];
         // we need to check if the candidate obeys task scheduling constraint:
         // only child of current task can be scheduled
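The call sites above change only mechanically: every fixed TASK_DEQUE_SIZE/TASK_DEQUE_MASK use becomes a per-deque lookup, while the access discipline is unchanged. The owner pushes and pops at the tail, and a thief normally steals from the head (or, under the task scheduling constraint, inspects the tail). A toy sketch of that discipline with hypothetical toy_* names, assuming a fixed power-of-two capacity, a non-empty deque, and no locking:

    #include <stdio.h>

    #define SIZE 8
    #define MASK (SIZE - 1)

    typedef struct {
        int buf[SIZE];
        unsigned head, tail, ntasks;   /* mirrors td_deque_head/tail/ntasks */
    } toy_deque;

    static void toy_push(toy_deque *d, int task) {   /* owner side */
        d->buf[d->tail] = task;
        d->tail = (d->tail + 1) & MASK;              /* wrap, as with TASK_DEQUE_MASK */
        d->ntasks++;
    }

    static int toy_pop_tail(toy_deque *d) {          /* owner: newest task */
        d->tail = (d->tail - 1) & MASK;
        d->ntasks--;
        return d->buf[d->tail];
    }

    static int toy_steal_head(toy_deque *d) {        /* thief: oldest task */
        int task = d->buf[d->head];
        d->head = (d->head + 1) & MASK;
        d->ntasks--;
        return task;
    }

    int main(void) {
        toy_deque d = { {0}, 0, 0, 0 };
        for (int t = 1; t <= 3; ++t) toy_push(&d, t);
        printf("owner pops %d, thief steals %d\n",
               toy_pop_tail(&d), toy_steal_head(&d));  /* 3 and 1 */
        return 0;
    }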
@@ -2267,14 +2267,42 @@
     KMP_DEBUG_ASSERT( thread_data -> td.td_deque_tail == 0 );
 
     KE_TRACE( 10, ( "__kmp_alloc_task_deque: T#%d allocating deque[%d] for thread_data %p\n",
-                    __kmp_gtid_from_thread( thread ), TASK_DEQUE_SIZE, thread_data ) );
+                    __kmp_gtid_from_thread( thread ), INITIAL_TASK_DEQUE_SIZE, thread_data ) );
 
     // Allocate space for task deque, and zero the deque
     // Cannot use __kmp_thread_calloc() because threads not around for
     // kmp_reap_task_team( ).
     thread_data -> td.td_deque = (kmp_taskdata_t **)
-            __kmp_allocate( TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
+            __kmp_allocate( INITIAL_TASK_DEQUE_SIZE * sizeof(kmp_taskdata_t *));
+    thread_data -> td.td_deque_size = INITIAL_TASK_DEQUE_SIZE;
 }
 
+//------------------------------------------------------------------------------
+// __kmp_realloc_task_deque:
+// Re-allocates a task deque for a particular thread, copies the content from
+// the old deque, and adjusts the necessary data structures relating to the deque.
+// This operation must be done with the deque_lock held.
+
+static void __kmp_realloc_task_deque ( kmp_info_t *thread, kmp_thread_data_t *thread_data )
+{
+    kmp_int32 size = TASK_DEQUE_SIZE(thread_data->td);
+    kmp_int32 new_size = 2 * size;
+
+    KE_TRACE( 10, ( "__kmp_realloc_task_deque: T#%d reallocating deque[from %d to %d] for thread_data %p\n",
+                    __kmp_gtid_from_thread( thread ), size, new_size, thread_data ) );
+
+    kmp_taskdata_t ** new_deque = (kmp_taskdata_t **) __kmp_allocate( new_size * sizeof(kmp_taskdata_t *));
+
+    int i, j;
+    for ( i = thread_data->td.td_deque_head, j = 0; j < size; i = (i+1) & TASK_DEQUE_MASK(thread_data->td), j++ )
+        new_deque[j] = thread_data->td.td_deque[i];
+
+    __kmp_free(thread_data->td.td_deque);
+
+    thread_data -> td.td_deque_head = 0;
+    thread_data -> td.td_deque_tail = size;
+    thread_data -> td.td_deque = new_deque;
+    thread_data -> td.td_deque_size = new_size;
+}
 
 //------------------------------------------------------------------------------
 // __kmp_free_task_deque:
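__kmp_realloc_task_deque doubles the buffer and compacts it: entries are copied in deque order starting at the old head, reading with the old mask, so the new deque starts with head == 0 and tail == the old size. A self-contained sketch of just that copy, with a hypothetical grow_ring() helper and plain calloc/free standing in for __kmp_allocate/__kmp_free:

    #include <stdio.h>
    #include <stdlib.h>

    typedef struct {
        void **buf;
        unsigned size, head, tail, ntasks;  /* size is always a power of two */
    } ring;

    /* Grow a FULL ring (ntasks == size), mirroring the patch's loop. */
    static int grow_ring(ring *r) {
        unsigned old_size = r->size;
        void **nbuf = calloc(2 * old_size, sizeof(void *));
        if (!nbuf) return -1;

        /* Walk from the old head, wrapping with the OLD mask, exactly as
           the patch does with TASK_DEQUE_MASK(thread_data->td). */
        for (unsigned i = r->head, j = 0; j < old_size;
             i = (i + 1) & (old_size - 1), j++)
            nbuf[j] = r->buf[i];

        free(r->buf);
        r->buf  = nbuf;
        r->head = 0;            /* entries now start at slot 0 */
        r->tail = old_size;     /* the ring held exactly old_size entries */
        r->size = 2 * old_size;
        return 0;
    }

    int main(void) {
        int items[4] = {10, 11, 12, 13};
        ring r = { calloc(4, sizeof(void *)), 4, 2, 2, 4 };  /* full, head == tail == 2 */
        for (unsigned j = 0; j < 4; j++)
            r.buf[(r.head + j) & (r.size - 1)] = &items[j];  /* oldest entry at head */
        if (grow_ring(&r) == 0)
            printf("head=%u tail=%u size=%u first=%d\n",
                   r.head, r.tail, r.size, *(int *)r.buf[0]); /* head=0 tail=4 size=8 first=10 */
        free(r.buf);
        return 0;
    }

Setting tail to the old size is exact only because the runtime grows a deque when it is full; the sketch encodes that same assumption.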
@@ -2769,7 +2797,7 @@
    Because of this, __kmp_push_task needs to check if there's space after getting the lock */
-static bool __kmp_give_task ( kmp_info_t *thread, kmp_int32 tid, kmp_task_t * task )
+static bool __kmp_give_task ( kmp_info_t *thread, kmp_int32 tid, kmp_task_t * task, kmp_int32 pass )
 {
     kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(task);
     kmp_task_team_t * task_team = taskdata->td_task_team;
@@ -2789,23 +2817,37 @@
         return result;
     }
 
-    if ( TCR_4(thread_data -> td.td_deque_ntasks) >= TASK_DEQUE_SIZE )
+    if ( TCR_4(thread_data -> td.td_deque_ntasks) >= TASK_DEQUE_SIZE(thread_data->td) )
     {
         KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to thread %d.\n", taskdata, tid ) );
-        return result;
-    }
-
-    __kmp_acquire_bootstrap_lock( & thread_data-> td.td_deque_lock );
-
-    if ( TCR_4(thread_data -> td.td_deque_ntasks) >= TASK_DEQUE_SIZE )
-    {
-        KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to thread %d.\n", taskdata, tid ) );
-        goto release_and_exit;
+
+        // if this deque is bigger than the pass ratio give a chance to another thread
+        if ( TASK_DEQUE_SIZE(thread_data->td)/INITIAL_TASK_DEQUE_SIZE >= pass ) return result;
+
+        __kmp_acquire_bootstrap_lock( & thread_data-> td.td_deque_lock );
+        __kmp_realloc_task_deque(thread, thread_data);
+
+    } else {
+
+        __kmp_acquire_bootstrap_lock( & thread_data-> td.td_deque_lock );
+
+        if ( TCR_4(thread_data -> td.td_deque_ntasks) >= TASK_DEQUE_SIZE(thread_data->td) )
+        {
+            KA_TRACE(30, ("__kmp_give_task: queue is full while giving task %p to thread %d.\n", taskdata, tid ) );
+
+            // if this deque is bigger than the pass ratio give a chance to another thread
+            if ( TASK_DEQUE_SIZE(thread_data->td)/INITIAL_TASK_DEQUE_SIZE >= pass )
+                goto release_and_exit;
+
+            __kmp_realloc_task_deque(thread, thread_data);
+        }
     }
 
+    // lock is held here, and there is space in the deque
+
     thread_data -> td.td_deque[ thread_data -> td.td_deque_tail ] = taskdata;
     // Wrap index.
-    thread_data -> td.td_deque_tail = ( thread_data -> td.td_deque_tail + 1 ) & TASK_DEQUE_MASK;
+    thread_data -> td.td_deque_tail = ( thread_data -> td.td_deque_tail + 1 ) & TASK_DEQUE_MASK(thread_data->td);
     TCW_4(thread_data -> td.td_deque_ntasks, TCR_4(thread_data -> td.td_deque_ntasks) + 1);
 
     result = true;
@@ -2919,14 +2961,21 @@
     kmp_team_t * team = taskdata->td_team;
     kmp_int32 nthreads = team->t.t_nproc;
     kmp_info_t *thread;
-    kmp_int32 k = 0;
+
+    //This should be similar to start_k = __kmp_get_random( thread ) % nthreads but we cannot use __kmp_get_random here
+    kmp_int32 start_k = 0;
+    kmp_int32 pass = 1;
+    kmp_int32 k = start_k;
 
     do {
-        //This should be similar to k = __kmp_get_random( thread ) % nthreads but we cannot use __kmp_get_random here
         //For now we're just linearly trying to find a thread
-        k = (k+1) % nthreads;
         thread = team->t.t_threads[k];
-    } while ( !__kmp_give_task( thread, k, ptask ) );
+        k = (k+1) % nthreads;
+
+        // we did a full pass through all the threads
+        if ( k == start_k ) pass = pass << 1;
+
+    } while ( !__kmp_give_task( thread, k, ptask, pass ) );
 
     __kmp_second_top_half_finish_proxy(taskdata);
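The last hunk is the policy that spreads growth across the team: the loop offers the proxy task to threads round-robin, doubling pass after each full sweep, and (per the give_task hunk above) a thread only reallocates its deque while size/INITIAL_TASK_DEQUE_SIZE is still below pass; otherwise it declines so another thread gets a chance. A condensed, self-contained sketch with hypothetical names (locking and the thread/tid bookkeeping are simplified relative to the patch):

    #include <stdbool.h>
    #include <stdio.h>

    #define INITIAL_SIZE 8
    #define NTHREADS 4

    static unsigned deque_size[NTHREADS]   = {8, 16, 32, 8};
    static unsigned deque_ntasks[NTHREADS] = {8, 16, 32, 8};  /* all full */

    /* Accept if there is room; otherwise grow only while this deque's
       size ratio is still below the current pass threshold. */
    static bool give_task(int tid, int pass) {
        if (deque_ntasks[tid] < deque_size[tid]) {
            deque_ntasks[tid]++;
            return true;
        }
        if (deque_size[tid] / INITIAL_SIZE >= (unsigned)pass)
            return false;             /* give another thread a chance */
        deque_size[tid] *= 2;         /* stands in for __kmp_realloc_task_deque */
        deque_ntasks[tid]++;
        return true;
    }

    int main(void) {
        int start_k = 0, pass = 1, k = start_k, target;
        do {
            target = k;
            k = (k + 1) % NTHREADS;
            if (k == start_k) pass <<= 1;  /* full sweep done: relax threshold */
        } while (!give_task(target, pass));
        printf("task landed on thread %d (pass %d)\n", target, pass);
        return 0;
    }

With the sizes above, every deque refuses on the first sweep (each ratio is already >= 1), and on the second sweep the smallest deque grows from 8 to 16 and accepts, which is exactly the "prefer the least-grown deque" effect the pass ratio is meant to produce.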