diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h
--- a/openmp/runtime/src/kmp.h
+++ b/openmp/runtime/src/kmp.h
@@ -1548,7 +1548,7 @@
   kmp_int32 tc;
   kmp_int32 static_steal_counter; /* for static_steal only; maybe better to put
                                      after ub */
-
+  kmp_lock_t *th_steal_lock; // lock used for chunk stealing
   // KMP_ALIGN( 16 ) ensures ( if the KMP_ALIGN macro is turned on )
   //    a) parm3 is properly aligned and
   //    b) all parm1-4 are in the same cache line.
@@ -1581,7 +1581,7 @@
   kmp_int64 tc; /* trip count (number of iterations) */
   kmp_int64 static_steal_counter; /* for static_steal only; maybe better to put
                                      after ub */
-
+  kmp_lock_t *th_steal_lock; // lock used for chunk stealing
   /* parm[1-4] are used in different ways by different scheduling algorithms */
 
   // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
@@ -1722,11 +1722,7 @@
   kmp_int32 th_disp_index;
   kmp_int32 th_doacross_buf_idx; // thread's doacross buffer index
   volatile kmp_uint32 *th_doacross_flags; // pointer to shared array of flags
-  union { // we can use union here because doacross cannot be used in
-    // nonmonotonic loops
-    kmp_int64 *th_doacross_info; // info on loop bounds
-    kmp_lock_t *th_steal_lock; // lock used for chunk stealing (8-byte variable)
-  };
+  kmp_int64 *th_doacross_info; // info on loop bounds
 #if KMP_USE_INTERNODE_ALIGNMENT
   char more_padding[INTERNODE_CACHE_LINE];
 #endif
diff --git a/openmp/runtime/src/kmp_dispatch.h b/openmp/runtime/src/kmp_dispatch.h
--- a/openmp/runtime/src/kmp_dispatch.h
+++ b/openmp/runtime/src/kmp_dispatch.h
@@ -75,7 +75,7 @@
   ST st; // signed
   UT tc; // unsigned
   T static_steal_counter; // for static_steal only; maybe better to put after ub
-
+  kmp_lock_t *th_steal_lock; // lock used for chunk stealing
   /* parm[1-4] are used in different ways by different scheduling algorithms */
 
   // KMP_ALIGN( 32 ) ensures ( if the KMP_ALIGN macro is turned on )
diff --git a/openmp/runtime/src/kmp_dispatch.cpp b/openmp/runtime/src/kmp_dispatch.cpp
--- a/openmp/runtime/src/kmp_dispatch.cpp
+++ b/openmp/runtime/src/kmp_dispatch.cpp
@@ -372,10 +372,10 @@
         // before spending time on this).
         // For now use dynamically allocated per-thread lock,
         // free memory in __kmp_dispatch_next when status==0.
-        KMP_DEBUG_ASSERT(th->th.th_dispatch->th_steal_lock == NULL);
-        th->th.th_dispatch->th_steal_lock =
+        KMP_DEBUG_ASSERT(pr->u.p.th_steal_lock == NULL);
+        pr->u.p.th_steal_lock =
             (kmp_lock_t *)__kmp_allocate(sizeof(kmp_lock_t));
-        __kmp_init_lock(th->th.th_dispatch->th_steal_lock);
+        __kmp_init_lock(pr->u.p.th_steal_lock);
       }
       break;
     } else {
@@ -968,7 +968,7 @@
   // all parm3 will be the same, it still exists a bad case like using 0 and 1
   // rather than program life-time increment. So the dedicated variable is
   // required. The 'static_steal_counter' is used.
-  if (schedule == kmp_sch_static_steal) {
+  if (pr->schedule == kmp_sch_static_steal) {
     // Other threads will inspect this variable when searching for a victim.
     // This is a flag showing that other threads may steal from this thread
     // since then.
@@ -1195,7 +1195,7 @@
     if (traits_t<T>::type_size > 4) {
       // use lock for 8-byte and CAS for 4-byte induction
       // variable. TODO (optional): check and use 16-byte CAS
-      kmp_lock_t *lck = th->th.th_dispatch->th_steal_lock;
+      kmp_lock_t *lck = pr->u.p.th_steal_lock;
       KMP_DEBUG_ASSERT(lck != NULL);
       if (pr->u.p.count < (UT)pr->u.p.ub) {
         __kmp_acquire_lock(lck, gtid);
@@ -1210,37 +1210,38 @@
         kmp_info_t **other_threads = team->t.t_threads;
         int while_limit = pr->u.p.parm3;
         int while_index = 0;
+        T id = pr->u.p.static_steal_counter; // loop id
+        int idx = (th->th.th_dispatch->th_disp_index - 1) %
+                  __kmp_dispatch_num_buffers; // current loop index
+        // note: victim thread can potentially execute another loop
         // TODO: algorithm of searching for a victim
        // should be cleaned up and measured
         while ((!status) && (while_limit != ++while_index)) {
+          dispatch_private_info_template<T> *victim;
           T remaining;
           T victimIdx = pr->u.p.parm4;
           T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
-          dispatch_private_info_template<T> *victim =
-              reinterpret_cast<dispatch_private_info_template<T> *>(
-                  other_threads[victimIdx]
-                      ->th.th_dispatch->th_dispatch_pr_current);
-          while ((victim == NULL || victim == pr ||
-                  (*(volatile T *)&victim->u.p.static_steal_counter !=
-                   *(volatile T *)&pr->u.p.static_steal_counter)) &&
+          victim = reinterpret_cast<dispatch_private_info_template<T> *>(
+              &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
+          KMP_DEBUG_ASSERT(victim);
+          while ((victim == pr || id != victim->u.p.static_steal_counter) &&
                  oldVictimIdx != victimIdx) {
             victimIdx = (victimIdx + 1) % nproc;
             victim = reinterpret_cast<dispatch_private_info_template<T> *>(
-                other_threads[victimIdx]
-                    ->th.th_dispatch->th_dispatch_pr_current);
+                &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
+            KMP_DEBUG_ASSERT(victim);
           }
-          if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
-                          *(volatile T *)&pr->u.p.static_steal_counter)) {
+          if (victim == pr || id != victim->u.p.static_steal_counter) {
             continue; // try once more (nproc attempts in total)
             // no victim is ready yet to participate in stealing
-            // because all victims are still in kmp_init_dispatch
+            // because no victim passed kmp_init_dispatch yet
           }
           if (victim->u.p.count + 2 > (UT)victim->u.p.ub) {
             pr->u.p.parm4 = (victimIdx + 1) % nproc; // shift start tid
             continue; // not enough chunks to steal, goto next victim
           }
-          lck = other_threads[victimIdx]->th.th_dispatch->th_steal_lock;
+          lck = victim->u.p.th_steal_lock;
           KMP_ASSERT(lck != NULL);
           __kmp_acquire_lock(lck, gtid);
           limit = victim->u.p.ub; // keep initial ub
@@ -1268,10 +1269,10 @@
           status = 1;
           while_index = 0;
           // now update own count and ub with stolen range but init chunk
-          __kmp_acquire_lock(th->th.th_dispatch->th_steal_lock, gtid);
+          __kmp_acquire_lock(pr->u.p.th_steal_lock, gtid);
           pr->u.p.count = init + 1;
           pr->u.p.ub = limit;
-          __kmp_release_lock(th->th.th_dispatch->th_steal_lock, gtid);
+          __kmp_release_lock(pr->u.p.th_steal_lock, gtid);
         } // while (search for victim)
       } // if (try to find victim and steal)
     } else {
@@ -1308,32 +1309,32 @@
         kmp_info_t **other_threads = team->t.t_threads;
         int while_limit = pr->u.p.parm3;
         int while_index = 0;
-
+        T id = pr->u.p.static_steal_counter; // loop id
+        int idx = (th->th.th_dispatch->th_disp_index - 1) %
+                  __kmp_dispatch_num_buffers; // current loop index
+        // note: victim thread can potentially execute another loop
         // TODO: algorithm of searching for a victim
         // should be cleaned up and measured
         while ((!status) && (while_limit != ++while_index)) {
+          dispatch_private_info_template<T> *victim;
           union_i4 vold, vnew;
           kmp_int32 remaining;
           T victimIdx = pr->u.p.parm4;
           T oldVictimIdx = victimIdx ? victimIdx - 1 : nproc - 1;
-          dispatch_private_info_template<T> *victim =
-              reinterpret_cast<dispatch_private_info_template<T> *>(
-                  other_threads[victimIdx]
-                      ->th.th_dispatch->th_dispatch_pr_current);
-          while ((victim == NULL || victim == pr ||
-                  (*(volatile T *)&victim->u.p.static_steal_counter !=
-                   *(volatile T *)&pr->u.p.static_steal_counter)) &&
+          victim = reinterpret_cast<dispatch_private_info_template<T> *>(
+              &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
+          KMP_DEBUG_ASSERT(victim);
+          while ((victim == pr || id != victim->u.p.static_steal_counter) &&
                  oldVictimIdx != victimIdx) {
             victimIdx = (victimIdx + 1) % nproc;
             victim = reinterpret_cast<dispatch_private_info_template<T> *>(
-                other_threads[victimIdx]
-                    ->th.th_dispatch->th_dispatch_pr_current);
+                &other_threads[victimIdx]->th.th_dispatch->th_disp_buffer[idx]);
+            KMP_DEBUG_ASSERT(victim);
           }
-          if (!victim || (*(volatile T *)&victim->u.p.static_steal_counter !=
-                          *(volatile T *)&pr->u.p.static_steal_counter)) {
+          if (victim == pr || id != victim->u.p.static_steal_counter) {
             continue; // try once more (nproc attempts in total)
             // no victim is ready yet to participate in stealing
-            // because all victims are still in kmp_init_dispatch
+            // because no victim passed kmp_init_dispatch yet
           }
           pr->u.p.parm4 = victimIdx; // new victim found
           while (1) { // CAS loop if victim has enough chunks to steal
@@ -2068,14 +2069,19 @@
       if (pr->schedule == kmp_sch_static_steal &&
           traits_t<T>::type_size > 4) {
         int i;
+        int idx = (th->th.th_dispatch->th_disp_index - 1) %
+                  __kmp_dispatch_num_buffers; // current loop index
         kmp_info_t **other_threads = team->t.t_threads;
         // loop complete, safe to destroy locks used for stealing
         for (i = 0; i < th->th.th_team_nproc; ++i) {
-          kmp_lock_t *lck = other_threads[i]->th.th_dispatch->th_steal_lock;
+          dispatch_private_info_template<T> *buf =
+              reinterpret_cast<dispatch_private_info_template<T> *>(
+                  &other_threads[i]->th.th_dispatch->th_disp_buffer[idx]);
+          kmp_lock_t *lck = buf->u.p.th_steal_lock;
           KMP_ASSERT(lck != NULL);
           __kmp_destroy_lock(lck);
           __kmp_free(lck);
-          other_threads[i]->th.th_dispatch->th_steal_lock = NULL;
+          buf->u.p.th_steal_lock = NULL;
         }
       }
 #endif
diff --git a/openmp/runtime/test/worksharing/for/omp_nonmonotonic_nowait.c b/openmp/runtime/test/worksharing/for/omp_nonmonotonic_nowait.c
new file mode 100644
--- /dev/null
+++ b/openmp/runtime/test/worksharing/for/omp_nonmonotonic_nowait.c
@@ -0,0 +1,34 @@
+// RUN: %libomp-compile-and-run
+
+// The test checks nonmonotonic scheduling works correctly when threads
+// may execute different loops concurrently.
+
+#include <stdio.h>
+#include <omp.h>
+
+#define N 200
+#define C 20
+int main()
+{
+  int i, l0 = 0, l1 = 0;
+  #pragma omp parallel num_threads(8)
+  {
+    #pragma omp for schedule(nonmonotonic:dynamic,C) nowait
+    for (i = 0; i < N; ++i) {
+      #pragma omp atomic
+      l0++;
+    }
+    #pragma omp for schedule(nonmonotonic:dynamic,C) nowait
+    for (i = 0; i < N * N; ++i) {
+      #pragma omp atomic
+      l1++;
+    }
+  }
+  if (l0 != N || l1 != N * N) {
+    printf("failed l0 = %d, l1 = %d, should be %d %d\n", l0, l1, N, N * N);
+    return 1;
+  } else {
+    printf("passed\n");
+    return 0;
+  }
+}