diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h --- a/openmp/runtime/src/kmp.h +++ b/openmp/runtime/src/kmp.h @@ -1665,14 +1665,12 @@ kmp_int32 lb; kmp_int32 st; kmp_int32 tc; - kmp_int32 static_steal_counter; /* for static_steal only; maybe better to put - after ub */ - kmp_lock_t *th_steal_lock; // lock used for chunk stealing - // KMP_ALIGN( 16 ) ensures ( if the KMP_ALIGN macro is turned on ) + kmp_lock_t *steal_lock; // lock used for chunk stealing + // KMP_ALIGN(32) ensures (if the KMP_ALIGN macro is turned on) // a) parm3 is properly aligned and - // b) all parm1-4 are in the same cache line. + // b) all parm1-4 are on the same cache line. // Because of parm1-4 are used together, performance seems to be better - // if they are in the same line (not measured though). + // if they are on the same cache line (not measured though). struct KMP_ALIGN(32) { // AC: changed 16 to 32 in order to simplify template kmp_int32 parm1; // structures in kmp_dispatch.cpp. This should @@ -1684,9 +1682,6 @@ kmp_uint32 ordered_lower; kmp_uint32 ordered_upper; #if KMP_OS_WINDOWS - // This var can be placed in the hole between 'tc' and 'parm1', instead of - // 'static_steal_counter'. It would be nice to measure execution times. - // Conditional if/endif can be removed at all. kmp_int32 last_upper; #endif /* KMP_OS_WINDOWS */ } dispatch_private_info32_t; @@ -1698,9 +1693,7 @@ kmp_int64 lb; /* lower-bound */ kmp_int64 st; /* stride */ kmp_int64 tc; /* trip count (number of iterations) */ - kmp_int64 static_steal_counter; /* for static_steal only; maybe better to put - after ub */ - kmp_lock_t *th_steal_lock; // lock used for chunk stealing + kmp_lock_t *steal_lock; // lock used for chunk stealing /* parm[1-4] are used in different ways by different scheduling algorithms */ // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on ) @@ -1719,9 +1712,6 @@ kmp_uint64 ordered_lower; kmp_uint64 ordered_upper; #if KMP_OS_WINDOWS - // This var can be placed in the hole between 'tc' and 'parm1', instead of - // 'static_steal_counter'. It would be nice to measure execution times. - // Conditional if/endif can be removed at all. kmp_int64 last_upper; #endif /* KMP_OS_WINDOWS */ } dispatch_private_info64_t; @@ -1775,9 +1765,8 @@ } u; enum sched_type schedule; /* scheduling algorithm */ kmp_sched_flags_t flags; /* flags (e.g., ordered, nomerge, etc.) 
*/ + std::atomic steal_flag; // static_steal only, state of a buffer kmp_int32 ordered_bumped; - // To retain the structure size after making ordered_iteration scalar - kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 3]; // Stack of buffers for nest of serial regions struct dispatch_private_info *next; kmp_int32 type_size; /* the size of types in private_info */ @@ -1792,7 +1781,7 @@ /* chunk index under dynamic, number of idle threads under static-steal; iteration index otherwise */ volatile kmp_uint32 iteration; - volatile kmp_uint32 num_done; + volatile kmp_int32 num_done; volatile kmp_uint32 ordered_iteration; // Dummy to retain the structure size after making ordered_iteration scalar kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 1]; @@ -1802,7 +1791,7 @@ /* chunk index under dynamic, number of idle threads under static-steal; iteration index otherwise */ volatile kmp_uint64 iteration; - volatile kmp_uint64 num_done; + volatile kmp_int64 num_done; volatile kmp_uint64 ordered_iteration; // Dummy to retain the structure size after making ordered_iteration scalar kmp_int64 ordered_dummy[KMP_MAX_ORDERED - 3]; @@ -1838,7 +1827,7 @@ dispatch_private_info_t *th_dispatch_pr_current; dispatch_private_info_t *th_disp_buffer; - kmp_int32 th_disp_index; + kmp_uint32 th_disp_index; kmp_int32 th_doacross_buf_idx; // thread's doacross buffer index volatile kmp_uint32 *th_doacross_flags; // pointer to shared array of flags kmp_int64 *th_doacross_info; // info on loop bounds diff --git a/openmp/runtime/src/kmp_dispatch.h b/openmp/runtime/src/kmp_dispatch.h --- a/openmp/runtime/src/kmp_dispatch.h +++ b/openmp/runtime/src/kmp_dispatch.h @@ -74,8 +74,7 @@ T lb; ST st; // signed UT tc; // unsigned - T static_steal_counter; // for static_steal only; maybe better to put after ub - kmp_lock_t *th_steal_lock; // lock used for chunk stealing + kmp_lock_t *steal_lock; // lock used for chunk stealing /* parm[1-4] are used in different ways by different scheduling algorithms */ // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on ) @@ -134,9 +133,8 @@ } u; enum sched_type schedule; /* scheduling algorithm */ kmp_sched_flags_t flags; /* flags (e.g., ordered, nomerge, etc.) 
*/ + std::atomic steal_flag; // static_steal only, state of a buffer kmp_uint32 ordered_bumped; - // to retain the structure size after making order - kmp_int32 ordered_dummy[KMP_MAX_ORDERED - 3]; dispatch_private_info *next; /* stack of buffers for nest of serial regions */ kmp_uint32 type_size; #if KMP_USE_HIER_SCHED @@ -153,10 +151,11 @@ // dispatch_shared_info{32,64}_t types template struct dispatch_shared_infoXX_template { typedef typename traits_t::unsigned_t UT; + typedef typename traits_t::signed_t ST; /* chunk index under dynamic, number of idle threads under static-steal; iteration index otherwise */ volatile UT iteration; - volatile UT num_done; + volatile ST num_done; volatile UT ordered_iteration; // to retain the structure size making ordered_iteration scalar UT ordered_dummy[KMP_MAX_ORDERED - 3]; diff --git a/openmp/runtime/src/kmp_dispatch.cpp b/openmp/runtime/src/kmp_dispatch.cpp --- a/openmp/runtime/src/kmp_dispatch.cpp +++ b/openmp/runtime/src/kmp_dispatch.cpp @@ -90,6 +90,22 @@ return monotonicity; } +#if KMP_STATIC_STEAL_ENABLED +enum { // values for steal_flag (possible states of private per-loop buffer) + UNUSED = 0, + CLAIMED = 1, // owner thread started initialization + READY = 2, // available for stealing + THIEF = 3 // finished by owner, or claimed by thief + // possible state changes: + // 0 -> 1 owner only, sync + // 0 -> 3 thief only, sync + // 1 -> 2 owner only, async + // 2 -> 3 owner only, async + // 3 -> 2 owner only, async + // 3 -> 0 last thread finishing the loop, async +}; +#endif + // Initialize a dispatch_private_info_template buffer for a particular // type of schedule,chunk. The loop description is found in lb (lower bound), // ub (upper bound), and st (stride). nproc is the number of threads relevant @@ -187,6 +203,8 @@ schedule = team->t.t_sched.r_sched_type; monotonicity = __kmp_get_monotonicity(loc, schedule, use_hier); schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); + if (pr->flags.ordered) // correct monotonicity for ordered loop if needed + monotonicity = SCHEDULE_MONOTONIC; // Detail the schedule if needed (global controls are differentiated // appropriately) if (schedule == kmp_sch_guided_chunked) { @@ -346,7 +364,7 @@ } switch (schedule) { -#if (KMP_STATIC_STEAL_ENABLED) +#if KMP_STATIC_STEAL_ENABLED case kmp_sch_static_steal: { T ntc, init; @@ -359,32 +377,37 @@ KMP_COUNT_BLOCK(OMP_LOOP_STATIC_STEAL); T id = tid; T small_chunk, extras; - + kmp_uint32 old = UNUSED; + int claimed = pr->steal_flag.compare_exchange_strong(old, CLAIMED); + if (traits_t::type_size > 4) { + // AC: TODO: check if 16-byte CAS available and use it to + // improve performance (probably wait for explicit request + // before spending time on this). + // For now use dynamically allocated per-private-buffer lock, + // free memory in __kmp_dispatch_next when status==0. + pr->u.p.steal_lock = (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t)); + __kmp_init_lock(pr->u.p.steal_lock); + } small_chunk = ntc / nproc; extras = ntc % nproc; init = id * small_chunk + (id < extras ? id : extras); pr->u.p.count = init; - pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0); - - pr->u.p.parm2 = lb; + if (claimed) { // are we succeeded in claiming own buffer? + pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0); + // Other threads will inspect steal_flag when searching for a victim. + // READY means other threads may steal from this thread from now on. 
+ KMP_ATOMIC_ST_REL(&pr->steal_flag, READY); + } else { + // other thread has stolen whole our range + KMP_DEBUG_ASSERT(pr->steal_flag == THIEF); + pr->u.p.ub = init; // mark there is no iterations to work on + } + pr->u.p.parm2 = ntc; // save number of chunks // parm3 is the number of times to attempt stealing which is - // proportional to the number of chunks per thread up until - // the maximum value of nproc. - pr->u.p.parm3 = KMP_MIN(small_chunk + extras, nproc); + // nproc (just a heuristics, could be optimized later on). + pr->u.p.parm3 = nproc; pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid - pr->u.p.st = st; - if (traits_t::type_size > 4) { - // AC: TODO: check if 16-byte CAS available and use it to - // improve performance (probably wait for explicit request - // before spending time on this). - // For now use dynamically allocated per-thread lock, - // free memory in __kmp_dispatch_next when status==0. - KMP_DEBUG_ASSERT(pr->u.p.th_steal_lock == NULL); - pr->u.p.th_steal_lock = - (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t)); - __kmp_init_lock(pr->u.p.th_steal_lock); - } break; } else { /* too few chunks: switching to kmp_sch_dynamic_chunked */ @@ -881,6 +904,18 @@ &team->t.t_disp_buffer[my_buffer_index % __kmp_dispatch_num_buffers]); KD_TRACE(10, ("__kmp_dispatch_init: T#%d my_buffer_index:%d\n", gtid, my_buffer_index)); + if (sh->buffer_index != my_buffer_index) { // too many loops in progress? + KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d" + " sh->buffer_index:%d\n", + gtid, my_buffer_index, sh->buffer_index)); + __kmp_wait(&sh->buffer_index, my_buffer_index, + __kmp_eq USE_ITT_BUILD_ARG(NULL)); + // Note: KMP_WAIT() cannot be used there: buffer index and + // my_buffer_index are *always* 32-bit integers. + KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d " + "sh->buffer_index:%d\n", + gtid, my_buffer_index, sh->buffer_index)); + } } __kmp_dispatch_init_algorithm(loc, gtid, pr, schedule, lb, ub, st, @@ -897,24 +932,6 @@ th->th.th_dispatch->th_deo_fcn = __kmp_dispatch_deo; th->th.th_dispatch->th_dxo_fcn = __kmp_dispatch_dxo; } - } - - if (active) { - /* The name of this buffer should be my_buffer_index when it's free to use - * it */ - - KD_TRACE(100, ("__kmp_dispatch_init: T#%d before wait: my_buffer_index:%d " - "sh->buffer_index:%d\n", - gtid, my_buffer_index, sh->buffer_index)); - __kmp_wait(&sh->buffer_index, my_buffer_index, - __kmp_eq USE_ITT_BUILD_ARG(NULL)); - // Note: KMP_WAIT() cannot be used there: buffer index and - // my_buffer_index are *always* 32-bit integers. - KMP_MB(); /* is this necessary? */ - KD_TRACE(100, ("__kmp_dispatch_init: T#%d after wait: my_buffer_index:%d " - "sh->buffer_index:%d\n", - gtid, my_buffer_index, sh->buffer_index)); - th->th.th_dispatch->th_dispatch_pr_current = (dispatch_private_info_t *)pr; th->th.th_dispatch->th_dispatch_sh_current = CCAST(dispatch_shared_info_t *, (volatile dispatch_shared_info_t *)sh); @@ -978,21 +995,6 @@ __kmp_str_free(&buff); } #endif -#if (KMP_STATIC_STEAL_ENABLED) - // It cannot be guaranteed that after execution of a loop with some other - // schedule kind all the parm3 variables will contain the same value. Even if - // all parm3 will be the same, it still exists a bad case like using 0 and 1 - // rather than program life-time increment. So the dedicated variable is - // required. The 'static_steal_counter' is used. - if (pr->schedule == kmp_sch_static_steal) { - // Other threads will inspect this variable when searching for a victim. 
- // This is a flag showing that other threads may steal from this thread - // since then. - volatile T *p = &pr->u.p.static_steal_counter; - *p = *p + 1; - } -#endif // ( KMP_STATIC_STEAL_ENABLED ) - #if OMPT_SUPPORT && OMPT_OPTIONAL if (ompt_enabled.ompt_callback_work) { ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); @@ -1200,10 +1202,10 @@ } switch (pr->schedule) { -#if (KMP_STATIC_STEAL_ENABLED) +#if KMP_STATIC_STEAL_ENABLED case kmp_sch_static_steal: { T chunk = pr->u.p.parm1; - + UT nchunks = pr->u.p.parm2; KD_TRACE(100, ("__kmp_dispatch_next_algorithm: T#%d kmp_sch_static_steal case\n", gtid)); @@ -1211,11 +1213,12 @@ trip = pr->u.p.tc - 1; if (traits_t::type_size > 4) { - // use lock for 8-byte and CAS for 4-byte induction - // variable. TODO (optional): check and use 16-byte CAS - kmp_lock_t *lck = pr->u.p.th_steal_lock; + // use lock for 8-byte induction variable. + // TODO (optional): check presence and use 16-byte CAS + kmp_lock_t *lck = pr->u.p.steal_lock; KMP_DEBUG_ASSERT(lck != NULL); if (pr->u.p.count < (UT)pr->u.p.ub) { + KMP_DEBUG_ASSERT(pr->steal_flag == READY); __kmp_acquire_lock(lck, gtid); // try to get own chunk of iterations init = (pr->u.p.count)++; @@ -1225,76 +1228,125 @@ status = 0; // no own chunks } if (!status) { // try to steal - kmp_info_t **other_threads = team->t.t_threads; + kmp_lock_t *lckv; // victim buffer's lock T while_limit = pr->u.p.parm3; T while_index = 0; - T id = pr->u.p.static_steal_counter; // loop id int idx = (th->th.th_dispatch->th_disp_index - 1) % __kmp_dispatch_num_buffers; // current loop index // note: victim thread can potentially execute another loop - // TODO: algorithm of searching for a victim - // should be cleaned up and measured + KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive while ((!status) && (while_limit != ++while_index)) { - dispatch_private_info_template *victim; + dispatch_private_info_template *v; T remaining; - T victimIdx = pr->u.p.parm4; - T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1; - victim = reinterpret_cast *>( - &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]); - KMP_DEBUG_ASSERT(victim); - while ((victim == pr || id != victim->u.p.static_steal_counter) && - oldVictimIdx != victimIdx) { - victimIdx = (victimIdx + 1) % nproc; - victim = reinterpret_cast *>( - &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]); - KMP_DEBUG_ASSERT(victim); + T victimId = pr->u.p.parm4; + T oldVictimId = victimId ? 
victimId - 1 : nproc - 1; + v = reinterpret_cast *>( + &team->t.t_dispatch[victimId].th_disp_buffer[idx]); + KMP_DEBUG_ASSERT(v); + while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) && + oldVictimId != victimId) { + victimId = (victimId + 1) % nproc; + v = reinterpret_cast *>( + &team->t.t_dispatch[victimId].th_disp_buffer[idx]); + KMP_DEBUG_ASSERT(v); } - if (victim == pr || id != victim->u.p.static_steal_counter) { + if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) { + // if (nchunks == sh->u.s.iteration) + // break; // nothing to steal + // else continue; // try once more (nproc attempts in total) - // no victim is ready yet to participate in stealing - // because no victim passed kmp_init_dispatch yet } - if (victim->u.p.count + 2 > (UT)victim->u.p.ub) { - pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid - continue; // not enough chunks to steal, goto next victim + if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) { + kmp_uint32 old = UNUSED; + // try to steal whole range from inactive victim + status = v->steal_flag.compare_exchange_strong(old, THIEF); + if (status) { + // initialize self buffer with victim's whole range of chunks + T id = victimId; + T small_chunk, extras; + small_chunk = nchunks / nproc; // chunks per thread + extras = nchunks % nproc; + init = id * small_chunk + (id < extras ? id : extras); + __kmp_acquire_lock(lck, gtid); + pr->u.p.count = init + 1; // exclude one we execute immediately + pr->u.p.ub = init + small_chunk + (id < extras ? 1 : 0); + __kmp_release_lock(lck, gtid); + pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid +// no need to reinitialize other thread invariants: lb, st, etc. +#ifdef KMP_DEBUG + { + char *buff; + // create format specifiers before the debug output + buff = __kmp_str_format( + "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, " + "count:%%%s ub:%%%s\n", + traits_t::spec, traits_t::spec); + KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub)); + __kmp_str_free(&buff); + } +#endif + // activate non-empty buffer and let others steal from us + if (pr->u.p.count < (UT)pr->u.p.ub) + KMP_ATOMIC_ST_REL(&pr->steal_flag, READY); + break; + } } - - lck = victim->u.p.th_steal_lock; - KMP_ASSERT(lck != NULL); - __kmp_acquire_lock(lck, gtid); - limit = victim->u.p.ub; // keep initial ub - if (victim->u.p.count >= limit || - (remaining = limit - victim->u.p.count) < 2) { - __kmp_release_lock(lck, gtid); - pr->u.p.parm4 = (victimIdx + 1) % nproc; // next victim - continue; // not enough chunks to steal + if (KMP_ATOMIC_LD_RLX(&v->steal_flag) != READY || + v->u.p.count >= (UT)v->u.p.ub) { + pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid + continue; // no chunks to steal, try next victim } - // stealing succeeded, reduce victim's ub by 1/4 of undone chunks or - // by 1 - if (remaining > 3) { + lckv = v->u.p.steal_lock; + KMP_ASSERT(lckv != NULL); + __kmp_acquire_lock(lckv, gtid); + limit = v->u.p.ub; // keep initial ub + if (v->u.p.count >= limit) { + __kmp_release_lock(lckv, gtid); + pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim tid + continue; // no chunks to steal, try next victim + } + + // stealing succeded, reduce victim's ub by 1/4 of undone chunks + // TODO: is this heuristics good enough?? 
+ remaining = limit - v->u.p.count; + if (remaining > 7) { // steal 1/4 of remaining KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, remaining >> 2); - init = (victim->u.p.ub -= (remaining >> 2)); + init = (v->u.p.ub -= (remaining >> 2)); } else { - // steal 1 chunk of 2 or 3 remaining + // steal 1 chunk of 1..7 remaining KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, 1); - init = (victim->u.p.ub -= 1); + init = (v->u.p.ub -= 1); } - __kmp_release_lock(lck, gtid); - + __kmp_release_lock(lckv, gtid); +#ifdef KMP_DEBUG + { + char *buff; + // create format specifiers before the debug output + buff = __kmp_str_format( + "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, " + "count:%%%s ub:%%%s\n", + traits_t::spec, traits_t::spec); + KD_TRACE(10, (buff, gtid, victimId, init, limit)); + __kmp_str_free(&buff); + } +#endif KMP_DEBUG_ASSERT(init + 1 <= limit); - pr->u.p.parm4 = victimIdx; // remember victim to steal from + pr->u.p.parm4 = victimId; // remember victim to steal from status = 1; - while_index = 0; - // now update own count and ub with stolen range but init chunk - __kmp_acquire_lock(pr->u.p.th_steal_lock, gtid); + // now update own count and ub with stolen range excluding init chunk + __kmp_acquire_lock(lck, gtid); pr->u.p.count = init + 1; pr->u.p.ub = limit; - __kmp_release_lock(pr->u.p.th_steal_lock, gtid); + __kmp_release_lock(lck, gtid); + // activate non-empty buffer and let others steal from us + if (init + 1 < limit) + KMP_ATOMIC_ST_REL(&pr->steal_flag, READY); } // while (search for victim) } // if (try to find victim and steal) } else { // 4-byte induction variable, use 8-byte CAS for pair (count, ub) + // as all operations on pair (count, ub) must be done atomically typedef union { struct { UT count; @@ -1302,86 +1354,132 @@ } p; kmp_int64 b; } union_i4; - // All operations on 'count' or 'ub' must be combined atomically - // together. - { - union_i4 vold, vnew; + union_i4 vold, vnew; + if (pr->u.p.count < (UT)pr->u.p.ub) { + KMP_DEBUG_ASSERT(pr->steal_flag == READY); vold.b = *(volatile kmp_int64 *)(&pr->u.p.count); - vnew = vold; - vnew.p.count++; - while (!KMP_COMPARE_AND_STORE_ACQ64( + vnew.b = vold.b; + vnew.p.count++; // get chunk from head of self range + while (!KMP_COMPARE_AND_STORE_REL64( (volatile kmp_int64 *)&pr->u.p.count, *VOLATILE_CAST(kmp_int64 *) & vold.b, *VOLATILE_CAST(kmp_int64 *) & vnew.b)) { KMP_CPU_PAUSE(); vold.b = *(volatile kmp_int64 *)(&pr->u.p.count); - vnew = vold; + vnew.b = vold.b; vnew.p.count++; } - vnew = vold; - init = vnew.p.count; - status = (init < (UT)vnew.p.ub); + init = vold.p.count; + status = (init < (UT)vold.p.ub); + } else { + status = 0; // no own chunks } - - if (!status) { - kmp_info_t **other_threads = team->t.t_threads; + if (!status) { // try to steal T while_limit = pr->u.p.parm3; T while_index = 0; - T id = pr->u.p.static_steal_counter; // loop id int idx = (th->th.th_dispatch->th_disp_index - 1) % __kmp_dispatch_num_buffers; // current loop index // note: victim thread can potentially execute another loop - // TODO: algorithm of searching for a victim - // should be cleaned up and measured + KMP_ATOMIC_ST_REL(&pr->steal_flag, THIEF); // mark self buffer inactive while ((!status) && (while_limit != ++while_index)) { - dispatch_private_info_template *victim; - union_i4 vold, vnew; + dispatch_private_info_template *v; T remaining; - T victimIdx = pr->u.p.parm4; - T oldVictimIdx = victimIdx ? 
victimIdx - 1 : nproc - 1; - victim = reinterpret_cast *>( - &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]); - KMP_DEBUG_ASSERT(victim); - while ((victim == pr || id != victim->u.p.static_steal_counter) && - oldVictimIdx != victimIdx) { - victimIdx = (victimIdx + 1) % nproc; - victim = reinterpret_cast *>( - &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]); - KMP_DEBUG_ASSERT(victim); + T victimId = pr->u.p.parm4; + T oldVictimId = victimId ? victimId - 1 : nproc - 1; + v = reinterpret_cast *>( + &team->t.t_dispatch[victimId].th_disp_buffer[idx]); + KMP_DEBUG_ASSERT(v); + while ((v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) && + oldVictimId != victimId) { + victimId = (victimId + 1) % nproc; + v = reinterpret_cast *>( + &team->t.t_dispatch[victimId].th_disp_buffer[idx]); + KMP_DEBUG_ASSERT(v); } - if (victim == pr || id != victim->u.p.static_steal_counter) { + if (v == pr || KMP_ATOMIC_LD_RLX(&v->steal_flag) == THIEF) { + // if (nchunks == sh->u.s.iteration) + // break; // nothing to steal + // else continue; // try once more (nproc attempts in total) - // no victim is ready yet to participate in stealing - // because no victim passed kmp_init_dispatch yet } - pr->u.p.parm4 = victimIdx; // new victim found - while (1) { // CAS loop if victim has enough chunks to steal - vold.b = *(volatile kmp_int64 *)(&victim->u.p.count); - vnew = vold; - - KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip); - if (vnew.p.count >= (UT)vnew.p.ub || - (remaining = vnew.p.ub - vnew.p.count) < 2) { - pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start victim id - break; // not enough chunks to steal, goto next victim + if (KMP_ATOMIC_LD_RLX(&v->steal_flag) == UNUSED) { + kmp_uint32 old = UNUSED; + // try to steal whole range from inactive victim + status = v->steal_flag.compare_exchange_strong(old, THIEF); + if (status) { + // initialize self buffer with victim's whole range of chunks + T id = victimId; + T small_chunk, extras; + small_chunk = nchunks / nproc; // chunks per thread + extras = nchunks % nproc; + init = id * small_chunk + (id < extras ? id : extras); + vnew.p.count = init + 1; + vnew.p.ub = init + small_chunk + (id < extras ? 1 : 0); +// write pair (count, ub) at once atomically +#if KMP_ARCH_X86 + KMP_XCHG_FIXED64((volatile kmp_int64 *)(&pr->u.p.count), vnew.b); +#else + *(volatile kmp_int64 *)(&pr->u.p.count) = vnew.b; +#endif + pr->u.p.parm4 = (id + 1) % nproc; // remember neighbour tid +// no need to initialize other thread invariants: lb, st, etc. 
+#ifdef KMP_DEBUG + { + char *buff; + // create format specifiers before the debug output + buff = __kmp_str_format( + "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, " + "count:%%%s ub:%%%s\n", + traits_t::spec, traits_t::spec); + KD_TRACE(10, (buff, gtid, id, pr->u.p.count, pr->u.p.ub)); + __kmp_str_free(&buff); + } +#endif + // activate non-empty buffer and let others steal from us + if (pr->u.p.count < (UT)pr->u.p.ub) + KMP_ATOMIC_ST_REL(&pr->steal_flag, READY); + break; } - if (remaining > 3) { - // try to steal 1/4 of remaining - vnew.p.ub -= remaining >> 2; + } + while (1) { // CAS loop with check if victim still has enough chunks + // many threads may be stealing concurrently from same victim + vold.b = *(volatile kmp_int64 *)(&v->u.p.count); + if (KMP_ATOMIC_LD_ACQ(&v->steal_flag) != READY || + vold.p.count >= (UT)vold.p.ub) { + pr->u.p.parm4 = (victimId + 1) % nproc; // shift start victim id + break; // no chunks to steal, try next victim + } + vnew.b = vold.b; + remaining = vold.p.ub - vold.p.count; + // try to steal 1/4 of remaining + // TODO: is this heuristics good enough?? + if (remaining > 7) { + vnew.p.ub -= remaining >> 2; // steal from tail of victim's range } else { - vnew.p.ub -= 1; // steal 1 chunk of 2 or 3 remaining + vnew.p.ub -= 1; // steal 1 chunk of 1..7 remaining } KMP_DEBUG_ASSERT((vnew.p.ub - 1) * (UT)chunk <= trip); - // TODO: Should this be acquire or release? - if (KMP_COMPARE_AND_STORE_ACQ64( - (volatile kmp_int64 *)&victim->u.p.count, + if (KMP_COMPARE_AND_STORE_REL64( + (volatile kmp_int64 *)&v->u.p.count, *VOLATILE_CAST(kmp_int64 *) & vold.b, *VOLATILE_CAST(kmp_int64 *) & vnew.b)) { - // stealing succeeded + // stealing succedded +#ifdef KMP_DEBUG + { + char *buff; + // create format specifiers before the debug output + buff = __kmp_str_format( + "__kmp_dispatch_next: T#%%d stolen chunks from T#%%d, " + "count:%%%s ub:%%%s\n", + traits_t::spec, traits_t::spec); + KD_TRACE(10, (buff, gtid, victimId, vnew.p.ub, vold.p.ub)); + __kmp_str_free(&buff); + } +#endif KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_stolen, vold.p.ub - vnew.p.ub); status = 1; - while_index = 0; + pr->u.p.parm4 = victimId; // keep victim id // now update own count and ub init = vnew.p.ub; vold.p.count = init + 1; @@ -1390,6 +1488,9 @@ #else *(volatile kmp_int64 *)(&pr->u.p.count) = vold.b; #endif + // activate non-empty buffer and let others steal from us + if (vold.p.count < (UT)vold.p.ub) + KMP_ATOMIC_ST_REL(&pr->steal_flag, READY); break; } // if (check CAS result) KMP_CPU_PAUSE(); // CAS failed, repeatedly attempt @@ -1403,13 +1504,16 @@ if (p_st != NULL) *p_st = 0; } else { - start = pr->u.p.parm2; + start = pr->u.p.lb; init *= chunk; limit = chunk + init - 1; incr = pr->u.p.st; KMP_COUNT_DEVELOPER_VALUE(FOR_static_steal_chunks, 1); KMP_DEBUG_ASSERT(init <= trip); + // keep track of done chunks for possible early exit from stealing + // TODO: count executed chunks locally with rare update of shared location + // test_then_inc((volatile ST *)&sh->u.s.iteration); if ((last = (limit >= trip)) != 0) limit = trip; if (p_st != NULL) @@ -1422,15 +1526,10 @@ *p_lb = start + init * incr; *p_ub = start + limit * incr; } - - if (pr->flags.ordered) { - pr->u.p.ordered_lower = init; - pr->u.p.ordered_upper = limit; - } // if } // if break; } // case -#endif // ( KMP_STATIC_STEAL_ENABLED ) +#endif // KMP_STATIC_STEAL_ENABLED case kmp_sch_static_balanced: { KD_TRACE( 10, @@ -2075,16 +2174,15 @@ th->th.th_info.ds.ds_tid); // status == 0: no more iterations to execute if (status == 0) { - UT 
num_done; - - num_done = test_then_inc((volatile ST *)&sh->u.s.num_done); + ST num_done; + num_done = test_then_inc(&sh->u.s.num_done); #ifdef KMP_DEBUG { char *buff; // create format specifiers before the debug output buff = __kmp_str_format( "__kmp_dispatch_next: T#%%d increment num_done:%%%s\n", - traits_t::spec); + traits_t::spec); KD_TRACE(10, (buff, gtid, sh->u.s.num_done)); __kmp_str_free(&buff); } @@ -2093,28 +2191,31 @@ #if KMP_USE_HIER_SCHED pr->flags.use_hier = FALSE; #endif - if ((ST)num_done == th->th.th_team_nproc - 1) { -#if (KMP_STATIC_STEAL_ENABLED) - if (pr->schedule == kmp_sch_static_steal && - traits_t::type_size > 4) { + if (num_done == th->th.th_team_nproc - 1) { +#if KMP_STATIC_STEAL_ENABLED + if (pr->schedule == kmp_sch_static_steal) { int i; int idx = (th->th.th_dispatch->th_disp_index - 1) % __kmp_dispatch_num_buffers; // current loop index - kmp_info_t **other_threads = team->t.t_threads; // loop complete, safe to destroy locks used for stealing for (i = 0; i < th->th.th_team_nproc; ++i) { dispatch_private_info_template *buf = reinterpret_cast *>( - &other_threads[i]->th.th_dispatch->th_disp_buffer[idx]); - kmp_lock_t *lck = buf->u.p.th_steal_lock; - KMP_ASSERT(lck != NULL); - __kmp_destroy_lock(lck); - __kmp_free(lck); - buf->u.p.th_steal_lock = NULL; + &team->t.t_dispatch[i].th_disp_buffer[idx]); + KMP_ASSERT(buf->steal_flag == THIEF); // buffer must be inactive + KMP_ATOMIC_ST_RLX(&buf->steal_flag, UNUSED); + if (traits_t::type_size > 4) { + // destroy locks used for stealing + kmp_lock_t *lck = buf->u.p.steal_lock; + KMP_ASSERT(lck != NULL); + __kmp_destroy_lock(lck); + __kmp_free(lck); + buf->u.p.steal_lock = NULL; + } } } #endif - /* NOTE: release this buffer to be reused */ + /* NOTE: release shared buffer to be reused */ KMP_MB(); /* Flush all pending memory write invalidates. */ @@ -2126,8 +2227,6 @@ sh->u.s.ordered_iteration = 0; } - KMP_MB(); /* Flush all pending memory write invalidates. 
*/ - sh->buffer_index += __kmp_dispatch_num_buffers; KD_TRACE(100, ("__kmp_dispatch_next: T#%d change buffer_index:%d\n", gtid, sh->buffer_index)); diff --git a/openmp/runtime/src/kmp_dispatch_hier.h b/openmp/runtime/src/kmp_dispatch_hier.h --- a/openmp/runtime/src/kmp_dispatch_hier.h +++ b/openmp/runtime/src/kmp_dispatch_hier.h @@ -924,7 +924,7 @@ T lb, T ub, typename traits_t::signed_t st) { int tid, gtid, num_hw_threads, num_threads_per_layer1, active; - int my_buffer_index; + unsigned int my_buffer_index; kmp_info_t *th; kmp_team_t *team; dispatch_private_info_template *pr; diff --git a/openmp/runtime/src/kmp_settings.cpp b/openmp/runtime/src/kmp_settings.cpp --- a/openmp/runtime/src/kmp_settings.cpp +++ b/openmp/runtime/src/kmp_settings.cpp @@ -3985,8 +3985,11 @@ else if (!__kmp_strcasecmp_with_sentinel("static", ptr, *delim)) sched = kmp_sch_static; #if KMP_STATIC_STEAL_ENABLED - else if (!__kmp_strcasecmp_with_sentinel("static_steal", ptr, *delim)) - sched = kmp_sch_static_steal; + else if (!__kmp_strcasecmp_with_sentinel("static_steal", ptr, *delim)) { + // replace static_steal with dynamic to better cope with ordered loops + sched = kmp_sch_dynamic_chunked; + sched_modifier = sched_type::kmp_sch_modifier_nonmonotonic; + } #endif else { // If there is no proper schedule kind, then this schedule is invalid diff --git a/openmp/runtime/test/env/kmp_set_dispatch_buf.c b/openmp/runtime/test/env/kmp_set_dispatch_buf.c --- a/openmp/runtime/test/env/kmp_set_dispatch_buf.c +++ b/openmp/runtime/test/env/kmp_set_dispatch_buf.c @@ -9,7 +9,7 @@ // RUN: env KMP_DISP_NUM_BUFFERS=3 %libomp-run // RUN: env KMP_DISP_NUM_BUFFERS=4 %libomp-run // RUN: env KMP_DISP_NUM_BUFFERS=7 %libomp-run -// UNSUPPORTED: clang-11, clang-12, clang-13 +// UNSUPPORTED: clang-11, clang-12 #include #include #include @@ -78,5 +78,9 @@ num_failed++; } } + if (num_failed == 0) + printf("passed\n"); + else + printf("failed %d\n", num_failed); return num_failed; } diff --git a/openmp/runtime/test/worksharing/for/kmp_set_dispatch_buf.c b/openmp/runtime/test/worksharing/for/kmp_set_dispatch_buf.c --- a/openmp/runtime/test/worksharing/for/kmp_set_dispatch_buf.c +++ b/openmp/runtime/test/worksharing/for/kmp_set_dispatch_buf.c @@ -3,7 +3,7 @@ // RUN: %libomp-run 1 && %libomp-run 2 && %libomp-run 5 // RUN: %libomp-compile -DMY_SCHEDULE=guided && %libomp-run 7 // RUN: %libomp-run 1 && %libomp-run 2 && %libomp-run 5 -// UNSUPPORTED: clang-11, clang-12, clang-13 +// UNSUPPORTED: clang-11, clang-12 #include #include #include @@ -88,5 +88,9 @@ num_failed++; } } + if (num_failed == 0) + printf("passed\n"); + else + printf("failed %d\n", num_failed); return num_failed; } diff --git a/openmp/runtime/test/worksharing/for/omp_for_schedule_runtime.c b/openmp/runtime/test/worksharing/for/omp_for_schedule_runtime.c --- a/openmp/runtime/test/worksharing/for/omp_for_schedule_runtime.c +++ b/openmp/runtime/test/worksharing/for/omp_for_schedule_runtime.c @@ -8,8 +8,8 @@ // RUN: env OMP_SCHEDULE=auto %libomp-run 4 1 // RUN: env OMP_SCHEDULE=trapezoidal %libomp-run 101 1 // RUN: env OMP_SCHEDULE=trapezoidal,13 %libomp-run 101 13 -// RUN: env OMP_SCHEDULE=static_steal %libomp-run 102 1 -// RUN: env OMP_SCHEDULE=static_steal,14 %libomp-run 102 14 +// RUN: env OMP_SCHEDULE=static_steal %libomp-run 2 1 +// RUN: env OMP_SCHEDULE=static_steal,14 %libomp-run 2 14 #include #include diff --git a/openmp/runtime/test/worksharing/for/omp_par_in_loop.c b/openmp/runtime/test/worksharing/for/omp_par_in_loop.c new file mode 100644 --- /dev/null +++ 
b/openmp/runtime/test/worksharing/for/omp_par_in_loop.c
@@ -0,0 +1,28 @@
+// RUN: %libomp-compile-and-run
+//
+#include <stdio.h>
+#include
+#include
+#include <omp.h>
+
+#define TYPE long
+#define MAX_ITER (TYPE)((TYPE)1000000)
+#define EVERY (TYPE)((TYPE)100000)
+
+int main(int argc, char* argv[]) {
+  TYPE x = MAX_ITER;
+  omp_set_max_active_levels(2);
+  omp_set_num_threads(2);
+  #pragma omp parallel for schedule(nonmonotonic:dynamic,1)
+  for (TYPE i = 0; i < x; i++) {
+    int tid = omp_get_thread_num();
+    omp_set_num_threads(1);
+    #pragma omp parallel proc_bind(spread)
+    {
+      if (i % EVERY == (TYPE)0)
+        printf("Outer thread %d at iter %ld\n", tid, i);
+    }
+  }
+  printf("passed\n");
+  return 0;
+}
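
The state machine behind the new steal_flag is easiest to see in isolation. Below is a minimal stand-alone model of the protocol kmp_sch_static_steal uses in this patch; BufferModel and the two helpers are illustrative names only, and the runtime operates on a field of dispatch_private_info through its KMP_ATOMIC_* wrappers rather than through helpers like these.

// Minimal model of the steal_flag life cycle (UNUSED -> CLAIMED -> READY,
// with THIEF marking a drained or taken-over buffer).
#include <atomic>
#include <cstdint>

enum : uint32_t { UNUSED = 0, CLAIMED = 1, READY = 2, THIEF = 3 };

struct BufferModel {
  std::atomic<uint32_t> steal_flag{UNUSED};
  // the (count, ub) chunk range of the owner would live here as well
};

// Owner: claim the buffer (0 -> 1), fill in its chunk range, then publish it
// (1 -> 2) so other threads may start stealing from it.
bool owner_claim(BufferModel &buf) {
  uint32_t old = UNUSED;
  if (!buf.steal_flag.compare_exchange_strong(old, CLAIMED))
    return false; // a thief already took the whole range (flag is THIEF)
  // ... initialize count/ub with the owner's range of chunks ...
  buf.steal_flag.store(READY, std::memory_order_release);
  return true;
}

// Thief: take over a buffer whose owner has not reached the loop yet (0 -> 3).
bool thief_take_unused(BufferModel &victim) {
  uint32_t old = UNUSED;
  return victim.steal_flag.compare_exchange_strong(old, THIEF);
}

// The owner marks its buffer THIEF once it runs dry and starts stealing
// (2 -> 3), reactivates it with READY after a successful steal (3 -> 2), and
// the last thread finishing the loop resets every buffer to UNUSED (3 -> 0).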
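
The init/ub arithmetic used when a thread claims its own buffer (and again when a thief takes over an UNUSED buffer) partitions nchunks chunks into nproc contiguous ranges, the first nchunks % nproc threads receiving one extra chunk. A small self-contained check of that arithmetic, with made-up values for nchunks and nproc:

// Verifies that the per-thread ranges [init, ub) tile [0, nchunks) exactly.
#include <cassert>
#include <cstdio>

int main() {
  const long nchunks = 23, nproc = 5;
  const long small_chunk = nchunks / nproc; // chunks every thread gets
  const long extras = nchunks % nproc;      // first 'extras' threads get +1
  long expected_start = 0;
  for (long id = 0; id < nproc; ++id) {
    long init = id * small_chunk + (id < extras ? id : extras);
    long ub = init + small_chunk + (id < extras ? 1 : 0);
    assert(init == expected_start); // contiguous: no gaps, no overlap
    expected_start = ub;
    printf("thread %ld owns chunks [%ld, %ld)\n", id, init, ub);
  }
  assert(expected_start == nchunks); // every chunk is owned exactly once
  return 0;
}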
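
Victim selection in __kmp_dispatch_next_algorithm starts at the remembered neighbour (parm4), walks the team round-robin skipping the current thread and buffers already drained (marked THIEF), and stops after one full circle; the enclosing loop retries up to parm3 == nproc times. A compact sketch of one scan; select_victim and flag_of are illustrative, while the runtime indexes team->t.t_dispatch[victimId].th_disp_buffer[idx] directly:

// Returns the chosen victim id, or -1 when every candidate is the current
// thread or a drained buffer.
#include <cstdint>

enum : uint32_t { UNUSED = 0, CLAIMED = 1, READY = 2, THIEF = 3 };

template <class FlagOf> // FlagOf: callable mapping a thread id to its steal_flag
int select_victim(int self, int start, int nproc, FlagOf flag_of) {
  int id = start;
  int last = (start == 0) ? nproc - 1 : start - 1; // stop after a full circle
  while ((id == self || flag_of(id) == THIEF) && id != last)
    id = (id + 1) % nproc;
  if (id == self || flag_of(id) == THIEF)
    return -1; // nothing worth stealing from right now; caller may retry
  return id;
}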
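
For 4-byte induction variables the pair (count, ub) stays packed in a single 64-bit word, so taking a chunk from the head of the thread's own range and stealing from the tail of a victim's range are each one compare-and-swap. A simplified sketch of both operations, including the steal-a-quarter-of-the-remainder heuristic; std::atomic stands in for the runtime's volatile fields and KMP_COMPARE_AND_STORE_REL64, and PackedRange mirrors the union_i4 trick:

// count/ub packed so both can be read, compared and written atomically.
#include <atomic>
#include <cstdint>

union PackedRange {
  struct { uint32_t count, ub; } p;
  int64_t b;
};

// Owner: pop the chunk at 'count' from the head of its own range, or fail.
bool pop_own_chunk(std::atomic<int64_t> &range, uint32_t &chunk_idx) {
  PackedRange vold, vnew;
  vold.b = range.load(std::memory_order_relaxed);
  do {
    if (vold.p.count >= vold.p.ub)
      return false;                 // own range is empty
    vnew.b = vold.b;
    vnew.p.count++;                 // claim the chunk at the head
  } while (!range.compare_exchange_weak(vold.b, vnew.b,
                                        std::memory_order_release));
  chunk_idx = vold.p.count;         // the chunk we just claimed
  return true;
}

// Thief: shrink the victim's ub, taking about a quarter of what remains
// (at least one chunk). The stolen chunks are [first, last).
bool steal_from_tail(std::atomic<int64_t> &victim,
                     uint32_t &first, uint32_t &last) {
  PackedRange vold, vnew;
  vold.b = victim.load(std::memory_order_relaxed);
  do {
    if (vold.p.count >= vold.p.ub)
      return false;                 // victim has nothing left to steal
    uint32_t remaining = vold.p.ub - vold.p.count;
    vnew.b = vold.b;
    vnew.p.ub -= (remaining > 7) ? (remaining >> 2) : 1;
  } while (!victim.compare_exchange_weak(vold.b, vnew.b,
                                         std::memory_order_release));
  first = vnew.p.ub;
  last = vold.p.ub;
  return true;
}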
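
For 8-byte induction variables a single CAS cannot cover the (count, ub) pair, so each buffer carries a dynamically allocated steal_lock and both the owner and thieves update the range under it. A sketch with std::mutex standing in for kmp_lock_t; Range64 and the helper names are illustrative:

#include <cstdint>
#include <mutex>

struct Range64 {
  std::mutex lock;    // stands in for *steal_lock
  uint64_t count = 0; // next chunk index in the owned range
  uint64_t ub = 0;    // one past the last owned chunk
};

// Owner: take the next chunk from the head of its own range.
bool pop_own_chunk(Range64 &r, uint64_t &chunk_idx) {
  std::lock_guard<std::mutex> g(r.lock);
  if (r.count >= r.ub)
    return false;
  chunk_idx = r.count++;
  return true;
}

// Thief: move about a quarter of the victim's remaining chunks (at least one)
// from the victim's tail into the thief's own range. The runtime additionally
// pops the first stolen chunk for immediate execution.
bool steal_chunks(Range64 &victim, Range64 &self) {
  uint64_t first, last;
  {
    std::lock_guard<std::mutex> g(victim.lock);
    if (victim.count >= victim.ub)
      return false;                          // nothing left to steal
    uint64_t remaining = victim.ub - victim.count;
    uint64_t take = (remaining > 7) ? (remaining >> 2) : 1;
    victim.ub -= take;                       // cut the victim's tail
    first = victim.ub;
    last = first + take;
  }                                          // victim's lock released here
  std::lock_guard<std::mutex> g(self.lock);
  self.count = first;                        // stolen range becomes our own
  self.ub = last;
  return true;
}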
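
Separately from stealing, the patch moves the wait for a free dispatch buffer into __kmp_dispatch_init, before the buffer is initialized, and makes num_done signed; the recycling protocol itself is unchanged. A compact model of that protocol, with std::atomic standing in for the runtime's volatile fields, __kmp_wait and test_then_inc, and with illustrative names throughout:

#include <atomic>

struct SharedBuffer {
  std::atomic<int> buffer_index{0}; // loop index this slot currently serves
                                    // (slot s starts at s in the runtime)
  std::atomic<int> num_done{0};     // threads that have finished the loop
};

// Loop entry: my_buffer_index is the thread's monotonically growing loop
// counter; the slot used is my_buffer_index % __kmp_dispatch_num_buffers.
void acquire_buffer(SharedBuffer &sh, int my_buffer_index) {
  while (sh.buffer_index.load(std::memory_order_acquire) != my_buffer_index) {
    // too many loops in flight: wait until the previous user releases the slot
  }
}

// Loop exit: the last of nproc threads to finish resets the slot and bumps
// buffer_index by the number of buffers, releasing it for a later loop.
void release_buffer(SharedBuffer &sh, int nproc, int num_buffers) {
  if (sh.num_done.fetch_add(1) == nproc - 1) {
    sh.num_done.store(0);
    // ...reset steal flags / destroy steal locks here, as in the patch...
    sh.buffer_index.fetch_add(num_buffers, std::memory_order_release);
  }
}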