diff --git a/openmp/runtime/src/dllexports b/openmp/runtime/src/dllexports --- a/openmp/runtime/src/dllexports +++ b/openmp/runtime/src/dllexports @@ -371,6 +371,7 @@ __kmpc_doacross_fini 264 __kmpc_taskloop 266 __kmpc_critical_with_hint 270 + __kmpc_taskloop_5 285 %endif kmpc_aligned_malloc 265 kmpc_set_disp_num_buffers 267 diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h --- a/openmp/runtime/src/kmp.h +++ b/openmp/runtime/src/kmp.h @@ -3783,6 +3783,12 @@ kmp_uint64 *ub, kmp_int64 st, kmp_int32 nogroup, kmp_int32 sched, kmp_uint64 grainsize, void *task_dup); +KMP_EXPORT void __kmpc_taskloop_5(ident_t *loc, kmp_int32 gtid, + kmp_task_t *task, kmp_int32 if_val, + kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, + kmp_int32 nogroup, kmp_int32 sched, + kmp_uint64 grainsize, kmp_int32 modifier, + void *task_dup); KMP_EXPORT void *__kmpc_task_reduction_init(int gtid, int num_data, void *data); KMP_EXPORT void *__kmpc_taskred_init(int gtid, int num_data, void *data); KMP_EXPORT void *__kmpc_task_reduction_get_th_data(int gtid, void *tg, void *d); diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp --- a/openmp/runtime/src/kmp_tasking.cpp +++ b/openmp/runtime/src/kmp_tasking.cpp @@ -4142,6 +4142,7 @@ // num_tasks Number of tasks to execute // grainsize Number of loop iterations per task // extras Number of chunks with grainsize+1 iterations +// last_chunk Reduction of grainsize for last task // tc Iterations count // task_dup Tasks duplication routine // codeptr_ra Return address for OMPT events @@ -4149,7 +4150,7 @@ kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, kmp_uint64 ub_glob, kmp_uint64 num_tasks, kmp_uint64 grainsize, kmp_uint64 extras, - kmp_uint64 tc, + kmp_int64 last_chunk, kmp_uint64 tc, #if OMPT_SUPPORT void *codeptr_ra, #endif @@ -4167,13 +4168,14 @@ kmp_task_t *next_task; kmp_int32 lastpriv = 0; - KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras); + KMP_DEBUG_ASSERT( + tc == num_tasks * grainsize + (last_chunk < 0 ? 
last_chunk : extras)); KMP_DEBUG_ASSERT(num_tasks > extras); KMP_DEBUG_ASSERT(num_tasks > 0); KA_TRACE(20, ("__kmp_taskloop_linear: T#%d: %lld tasks, grainsize %lld, " - "extras %lld, i=%lld,%lld(%d)%lld, dup %p\n", - gtid, num_tasks, grainsize, extras, lower, upper, ub_glob, st, - task_dup)); + "extras %lld, last_chunk %lld, i=%lld,%lld(%d)%lld, dup %p\n", + gtid, num_tasks, grainsize, extras, last_chunk, lower, upper, + ub_glob, st, task_dup)); // Launch num_tasks tasks, assign grainsize iterations each task for (i = 0; i < num_tasks; ++i) { @@ -4185,6 +4187,9 @@ --extras; // first extras iterations get bigger chunk (grainsize+1) } upper = lower + st * chunk_minus_1; + if (upper > *ub) { + upper = *ub; + } if (i == num_tasks - 1) { // schedule the last task, set lastprivate flag if needed if (st == 1) { // most common case @@ -4248,6 +4253,7 @@ kmp_uint64 num_tasks; kmp_uint64 grainsize; kmp_uint64 extras; + kmp_int64 last_chunk; kmp_uint64 tc; kmp_uint64 num_t_min; #if OMPT_SUPPORT @@ -4257,7 +4263,8 @@ void __kmp_taskloop_recur(ident_t *, int, kmp_task_t *, kmp_uint64 *, kmp_uint64 *, kmp_int64, kmp_uint64, kmp_uint64, - kmp_uint64, kmp_uint64, kmp_uint64, kmp_uint64, + kmp_uint64, kmp_uint64, kmp_int64, kmp_uint64, + kmp_uint64, #if OMPT_SUPPORT void *, #endif @@ -4277,6 +4284,7 @@ kmp_uint64 num_tasks = p->num_tasks; kmp_uint64 grainsize = p->grainsize; kmp_uint64 extras = p->extras; + kmp_int64 last_chunk = p->last_chunk; kmp_uint64 tc = p->tc; kmp_uint64 num_t_min = p->num_t_min; #if OMPT_SUPPORT @@ -4285,22 +4293,23 @@ #if KMP_DEBUG kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); KMP_DEBUG_ASSERT(task != NULL); - KA_TRACE(20, ("__kmp_taskloop_task: T#%d, task %p: %lld tasks, grainsize" - " %lld, extras %lld, i=%lld,%lld(%d), dup %p\n", - gtid, taskdata, num_tasks, grainsize, extras, *lb, *ub, st, - task_dup)); + KA_TRACE(20, + ("__kmp_taskloop_task: T#%d, task %p: %lld tasks, grainsize" + " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%d), dup %p\n", + gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub, + st, task_dup)); #endif KMP_DEBUG_ASSERT(num_tasks * 2 + 1 > num_t_min); if (num_tasks > num_t_min) __kmp_taskloop_recur(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks, - grainsize, extras, tc, num_t_min, + grainsize, extras, last_chunk, tc, num_t_min, #if OMPT_SUPPORT codeptr_ra, #endif task_dup); else __kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks, - grainsize, extras, tc, + grainsize, extras, last_chunk, tc, #if OMPT_SUPPORT codeptr_ra, #endif @@ -4323,6 +4332,7 @@ // num_tasks Number of tasks to execute // grainsize Number of loop iterations per task // extras Number of chunks with grainsize+1 iterations +// last_chunk Reduction of grainsize for last task // tc Iterations count // num_t_min Threshold to launch tasks recursively // task_dup Tasks duplication routine @@ -4331,7 +4341,8 @@ kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, kmp_uint64 ub_glob, kmp_uint64 num_tasks, kmp_uint64 grainsize, kmp_uint64 extras, - kmp_uint64 tc, kmp_uint64 num_t_min, + kmp_int64 last_chunk, kmp_uint64 tc, + kmp_uint64 num_t_min, #if OMPT_SUPPORT void *codeptr_ra, #endif @@ -4339,10 +4350,11 @@ kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); KMP_DEBUG_ASSERT(task != NULL); KMP_DEBUG_ASSERT(num_tasks > num_t_min); - KA_TRACE(20, ("__kmp_taskloop_recur: T#%d, task %p: %lld tasks, grainsize" - " %lld, extras %lld, i=%lld,%lld(%d), dup %p\n", - gtid, taskdata, num_tasks, grainsize, extras, *lb, *ub, st, - task_dup)); + KA_TRACE(20, + 
("__kmp_taskloop_recur: T#%d, task %p: %lld tasks, grainsize" + " %lld, extras %lld, last_chunk %lld, i=%lld,%lld(%d), dup %p\n", + gtid, taskdata, num_tasks, grainsize, extras, last_chunk, *lb, *ub, + st, task_dup)); p_task_dup_t ptask_dup = (p_task_dup_t)task_dup; kmp_uint64 lower = *lb; kmp_info_t *thread = __kmp_threads[gtid]; @@ -4353,16 +4365,23 @@ size_t upper_offset = (char *)ub - (char *)task; // remember offset of ub in the task structure - KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras); + KMP_DEBUG_ASSERT( + tc == num_tasks * grainsize + (last_chunk < 0 ? last_chunk : extras)); KMP_DEBUG_ASSERT(num_tasks > extras); KMP_DEBUG_ASSERT(num_tasks > 0); // split the loop in two halves kmp_uint64 lb1, ub0, tc0, tc1, ext0, ext1; + kmp_int64 last_chunk0 = 0, last_chunk1 = 0; kmp_uint64 gr_size0 = grainsize; kmp_uint64 n_tsk0 = num_tasks >> 1; // num_tasks/2 to execute kmp_uint64 n_tsk1 = num_tasks - n_tsk0; // to schedule as a task - if (n_tsk0 <= extras) { + if (last_chunk < 0) { + ext0 = ext1 = 0; + last_chunk1 = last_chunk; + tc0 = grainsize * n_tsk0; + tc1 = tc - tc0; + } else if (n_tsk0 <= extras) { gr_size0++; // integrate extras into grainsize ext0 = 0; // no extra iters in 1st half ext1 = extras - n_tsk0; // remaining extras @@ -4404,6 +4423,7 @@ p->num_tasks = n_tsk1; p->grainsize = grainsize; p->extras = ext1; + p->last_chunk = last_chunk1; p->tc = tc1; p->num_t_min = num_t_min; #if OMPT_SUPPORT @@ -4420,44 +4440,28 @@ // execute the 1st half of current subrange if (n_tsk0 > num_t_min) __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, gr_size0, - ext0, tc0, num_t_min, + ext0, last_chunk0, tc0, num_t_min, #if OMPT_SUPPORT codeptr_ra, #endif task_dup); else __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0, - gr_size0, ext0, tc0, + gr_size0, ext0, last_chunk0, tc0, #if OMPT_SUPPORT codeptr_ra, #endif task_dup); - KA_TRACE(40, ("__kmpc_taskloop_recur(exit): T#%d\n", gtid)); + KA_TRACE(40, ("__kmp_taskloop_recur(exit): T#%d\n", gtid)); } -/*! -@ingroup TASKING -@param loc Source location information -@param gtid Global thread ID -@param task Task structure -@param if_val Value of the if clause -@param lb Pointer to loop lower bound in task structure -@param ub Pointer to loop upper bound in task structure -@param st Loop stride -@param nogroup Flag, 1 if no taskgroup needs to be added, 0 otherwise -@param sched Schedule specified 0/1/2 for none/grainsize/num_tasks -@param grainsize Schedule value if specified -@param task_dup Tasks duplication routine - -Execute the taskloop construct. 
-*/ -void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val, - kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup, - int sched, kmp_uint64 grainsize, void *task_dup) { +static void __kmp_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val, + kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, + int nogroup, int sched, kmp_uint64 grainsize, + int modifier, void *task_dup) { kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); KMP_DEBUG_ASSERT(task != NULL); - __kmp_assert_valid_gtid(gtid); if (nogroup == 0) { #if OMPT_SUPPORT && OMPT_OPTIONAL OMPT_STORE_RETURN_ADDRESS(gtid); @@ -4474,13 +4478,16 @@ kmp_uint64 upper = task_bounds.get_ub(); kmp_uint64 ub_glob = upper; // global upper used to calc lastprivate flag kmp_uint64 num_tasks = 0, extras = 0; + kmp_int64 last_chunk = + 0; // reduce grainsize of last task by last_chunk in strict mode kmp_uint64 num_tasks_min = __kmp_taskloop_min_tasks; kmp_info_t *thread = __kmp_threads[gtid]; kmp_taskdata_t *current_task = thread->th.th_current_task; - KA_TRACE(20, ("__kmpc_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, " - "grain %llu(%d), dup %p\n", - gtid, taskdata, lower, upper, st, grainsize, sched, task_dup)); + KA_TRACE(20, ("__kmp_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, " + "grain %llu(%d, %d), dup %p\n", + gtid, taskdata, lower, upper, st, grainsize, sched, modifier, + task_dup)); // compute trip count if (st == 1) { // most common case @@ -4491,7 +4498,7 @@ tc = (upper - lower) / st + 1; } if (tc == 0) { - KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d zero-trip loop\n", gtid)); + KA_TRACE(20, ("__kmp_taskloop(exit): T#%d zero-trip loop\n", gtid)); // free the pattern task and exit __kmp_task_start(gtid, task, current_task); // do not execute anything for zero-trip loop @@ -4533,20 +4540,28 @@ break; case 1: // grainsize provided if (grainsize > tc) { - num_tasks = 1; // too big grainsize requested, adjust values - grainsize = tc; + num_tasks = 1; + grainsize = tc; // too big grainsize requested, adjust values extras = 0; } else { - num_tasks = tc / grainsize; - // adjust grainsize for balanced distribution of iterations - grainsize = tc / num_tasks; - extras = tc % num_tasks; + if (modifier) { + num_tasks = (tc + grainsize - 1) / grainsize; + last_chunk = tc - (num_tasks * grainsize); + extras = 0; + } else { + num_tasks = tc / grainsize; + // adjust grainsize for balanced distribution of iterations + grainsize = tc / num_tasks; + extras = tc % num_tasks; + } } break; default: KMP_ASSERT2(0, "unknown scheduling of taskloop"); } - KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras); + + KMP_DEBUG_ASSERT( + tc == num_tasks * grainsize + (last_chunk < 0 ? 
last_chunk : extras)); KMP_DEBUG_ASSERT(num_tasks > extras); KMP_DEBUG_ASSERT(num_tasks > 0); // ========================================================================= @@ -4558,7 +4573,7 @@ taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied // always start serial tasks linearly __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks, - grainsize, extras, tc, + grainsize, extras, last_chunk, tc, #if OMPT_SUPPORT OMPT_GET_RETURN_ADDRESS(0), #endif @@ -4566,21 +4581,23 @@ // !taskdata->td_flags.native => currently force linear spawning of tasks // for GOMP_taskloop } else if (num_tasks > num_tasks_min && !taskdata->td_flags.native) { - KA_TRACE(20, ("__kmpc_taskloop: T#%d, go recursive: tc %llu, #tasks %llu" - "(%lld), grain %llu, extras %llu\n", - gtid, tc, num_tasks, num_tasks_min, grainsize, extras)); + KA_TRACE(20, ("__kmp_taskloop: T#%d, go recursive: tc %llu, #tasks %llu" + "(%lld), grain %llu, extras %llu, last_chunk %lld\n", + gtid, tc, num_tasks, num_tasks_min, grainsize, extras, + last_chunk)); __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks, - grainsize, extras, tc, num_tasks_min, + grainsize, extras, last_chunk, tc, num_tasks_min, #if OMPT_SUPPORT OMPT_GET_RETURN_ADDRESS(0), #endif task_dup); } else { - KA_TRACE(20, ("__kmpc_taskloop: T#%d, go linear: tc %llu, #tasks %llu" - "(%lld), grain %llu, extras %llu\n", - gtid, tc, num_tasks, num_tasks_min, grainsize, extras)); + KA_TRACE(20, ("__kmp_taskloop: T#%d, go linear: tc %llu, #tasks %llu" + "(%lld), grain %llu, extras %llu, last_chunk %lld\n", + gtid, tc, num_tasks, num_tasks_min, grainsize, extras, + last_chunk)); __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks, - grainsize, extras, tc, + grainsize, extras, last_chunk, tc, #if OMPT_SUPPORT OMPT_GET_RETURN_ADDRESS(0), #endif @@ -4601,5 +4618,59 @@ #endif __kmpc_end_taskgroup(loc, gtid); } + KA_TRACE(20, ("__kmp_taskloop(exit): T#%d\n", gtid)); +} + +/*! +@ingroup TASKING +@param loc Source location information +@param gtid Global thread ID +@param task Task structure +@param if_val Value of the if clause +@param lb Pointer to loop lower bound in task structure +@param ub Pointer to loop upper bound in task structure +@param st Loop stride +@param nogroup Flag, 1 if nogroup clause specified, 0 otherwise +@param sched Schedule specified 0/1/2 for none/grainsize/num_tasks +@param grainsize Schedule value if specified +@param task_dup Tasks duplication routine + +Execute the taskloop construct. +*/ +void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val, + kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup, + int sched, kmp_uint64 grainsize, void *task_dup) { + __kmp_assert_valid_gtid(gtid); + KA_TRACE(20, ("__kmpc_taskloop(enter): T#%d\n", gtid)); + __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize, + 0, task_dup); KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid)); } + +/*! 
+@ingroup TASKING
+@param loc  Source location information
+@param gtid  Global thread ID
+@param task  Task structure
+@param if_val  Value of the if clause
+@param lb  Pointer to loop lower bound in task structure
+@param ub  Pointer to loop upper bound in task structure
+@param st  Loop stride
+@param nogroup  Flag, 1 if nogroup clause specified, 0 otherwise
+@param sched  Schedule specified 0/1/2 for none/grainsize/num_tasks
+@param grainsize  Schedule value if specified
+@param modifier  Modifier 'strict' for sched, 1 if present, 0 otherwise
+@param task_dup  Tasks duplication routine
+
+Execute the taskloop construct.
+*/
+void __kmpc_taskloop_5(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
+                       kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
+                       int nogroup, int sched, kmp_uint64 grainsize,
+                       int modifier, void *task_dup) {
+  __kmp_assert_valid_gtid(gtid);
+  KA_TRACE(20, ("__kmpc_taskloop_5(enter): T#%d\n", gtid));
+  __kmp_taskloop(loc, gtid, task, if_val, lb, ub, st, nogroup, sched, grainsize,
+                 modifier, task_dup);
+  KA_TRACE(20, ("__kmpc_taskloop_5(exit): T#%d\n", gtid));
+}
diff --git a/openmp/runtime/test/tasking/kmp_taskloop_5.c b/openmp/runtime/test/tasking/kmp_taskloop_5.c
new file mode 100644
--- /dev/null
+++ b/openmp/runtime/test/tasking/kmp_taskloop_5.c
@@ -0,0 +1,167 @@
+// RUN: %libomp-compile-and-run
+// RUN: %libomp-compile && env KMP_TASKLOOP_MIN_TASKS=1 %libomp-run
+
+#include <stdio.h>
+#include <omp.h>
+#include "omp_my_sleep.h"
+
+#define N 4
+#define ST 3
+#define UB 118
+#define LB 0
+
+// globals
+int counter;
+int task_count;
+
+// Compiler-generated code (emulation)
+typedef struct ident {
+  void* dummy;
+} ident_t;
+
+typedef struct shar {
+  int *pcounter;
+  int *pj;
+  int *ptask_count;
+} *pshareds;
+
+typedef struct task {
+  pshareds shareds;
+  int(* routine)(int,struct task*);
+  int part_id;
+  unsigned long long lb; // library always uses ULONG
+  unsigned long long ub;
+  int st;
+  int last;
+  int i;
+  int j;
+  int th;
+} *ptask, kmp_task_t;
+
+typedef int(* task_entry_t)( int, ptask );
+
+void
+__task_dup_entry(ptask task_dst, ptask task_src, int lastpriv)
+{
+// setup lastprivate flag
+  task_dst->last = lastpriv;
+// could be constructor calls here...
+} + +// OpenMP RTL interfaces +typedef unsigned long long kmp_uint64; +typedef long long kmp_int64; + +#ifdef __cplusplus +extern "C" { +#endif +void +__kmpc_taskloop_5(ident_t *loc, int gtid, kmp_task_t *task, int if_val, + kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, + int nogroup, int sched, kmp_int64 grainsize, int modifier, + void *task_dup); +ptask +__kmpc_omp_task_alloc(ident_t *loc, int gtid, int flags, + size_t sizeof_kmp_task_t, size_t sizeof_shareds, + task_entry_t task_entry); +void __kmpc_atomic_fixed4_add(void *id_ref, int gtid, int * lhs, int rhs); +int __kmpc_global_thread_num(void *id_ref); +#ifdef __cplusplus +} +#endif + +// User's code +int task_entry(int gtid, ptask task) +{ + pshareds pshar = task->shareds; + __kmpc_atomic_fixed4_add(NULL, gtid, pshar->ptask_count, 1); + + for (task->i = task->lb; task->i <= (int)task->ub; task->i += task->st) { + task->th = omp_get_thread_num(); + __kmpc_atomic_fixed4_add(NULL,gtid,pshar->pcounter,1); + task->j = task->i; + } + my_sleep( 0.1 ); // sleep 100 ms in order to allow other threads to steal tasks + if (task->last) { + *(pshar->pj) = task->j; // lastprivate + } + return 0; +} + +void task_loop(int sched_type, int sched_val, int modifier) +{ + int i, j, gtid = __kmpc_global_thread_num(NULL); + ptask task; + pshareds psh; + omp_set_dynamic(0); + counter = 0; + task_count = 0; + #pragma omp parallel num_threads(N) + { + #pragma omp master + { + int gtid = __kmpc_global_thread_num(NULL); + task = __kmpc_omp_task_alloc(NULL, gtid, 1, sizeof(struct task), + sizeof(struct shar), &task_entry); + psh = task->shareds; + psh->pcounter = &counter; + psh->ptask_count = &task_count; + psh->pj = &j; + task->lb = LB; + task->ub = UB; + task->st = ST; + + __kmpc_taskloop_5( + NULL, // location + gtid, // gtid + task, // task structure + 1, // if clause value + &task->lb, // lower bound + &task->ub, // upper bound + ST, // loop increment + 0, // 1 if nogroup specified + sched_type, // schedule type: 0-none, 1-grainsize, 2-num_tasks + sched_val, // schedule value (ignored for type 0) + modifier, // strict modifier + (void*)&__task_dup_entry // tasks duplication routine + ); + } // end master + } // end parallel +// check results + int tc; + if (ST == 1) { // most common case + tc = UB - LB + 1; + } else if (ST < 0) { + tc = (LB - UB) / (-ST) + 1; + } else { // ST > 0 + tc = (UB - LB) / ST + 1; + } + int count; + if (sched_type == 1) { + count = (sched_val > tc) ? 1 : (tc + sched_val - 1) / sched_val; + } else { + count = (sched_val > tc) ? tc : sched_val; + } + if (j != LB + (tc - 1) * ST) { + printf("Error in lastprivate, %d != %d\n", j, LB + (tc - 1) * ST); + exit(1); + } + if (counter != tc) { + printf("Error, counter %d != %d\n", counter, tc); + exit(1); + } + if (task_count != count) { + printf("Error, task count %d != %d\n", task_count, count); + exit(1); + } +} + +int main(int argc, char *argv[]) { + task_loop(1, 6, 1); // create 7 tasks + task_loop(2, 6, 1); // create 6 tasks + task_loop(1, 50, 1); // create 1 task + task_loop(2, 50, 1); // create 40 tasks + + printf("Test passed\n"); + return 0; +}
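
For reference, below is a minimal standalone sketch (an illustration, not part of
the patch) of the strict-grainsize chunking that __kmp_taskloop performs when
'modifier' is nonzero, i.e. for grainsize(strict: G). The numbers mirror the
test above (LB=0, UB=118, ST=3, grainsize 6): num_tasks is rounded up, extras
is forced to 0, and last_chunk records how many iterations short the final
task's chunk runs.

// sketch.c: strict-grainsize chunk layout, mirroring the modifier branch
// of __kmp_taskloop's 'case 1' (grainsize provided).
#include <assert.h>
#include <stdio.h>

int main(void) {
  long long lb = 0, ub = 118, st = 3;          // bounds from the test above
  long long tc = (ub - lb) / st + 1;           // trip count: 40 iterations
  long long grainsize = 6;                     // grainsize(strict: 6)
  long long num_tasks = (tc + grainsize - 1) / grainsize; // ceil(40/6) = 7
  long long last_chunk = tc - num_tasks * grainsize;      // 40 - 42 = -2
  // Strict mode: every task runs exactly 'grainsize' iterations except the
  // last, whose chunk is shortened by -last_chunk (here 6 + (-2) = 4 iters).
  // This is the invariant the patched KMP_DEBUG_ASSERTs check.
  assert(tc == num_tasks * grainsize + last_chunk);
  printf("%lld tasks: %lld x %lld iters + 1 x %lld iters\n",
         num_tasks, num_tasks - 1, grainsize, grainsize + last_chunk);
  return 0;
}

This prints "7 tasks: 6 x 6 iters + 1 x 4 iters", matching the
"task_loop(1, 6, 1); // create 7 tasks" case in the test's main(). Without the
strict modifier, the non-strict branch would instead rebalance to 6 tasks of
6-7 iterations each (num_tasks = tc / grainsize, extras = tc % num_tasks).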