Index: runtime/src/kmp.h
===================================================================
--- runtime/src/kmp.h
+++ runtime/src/kmp.h
@@ -2050,8 +2050,10 @@
 // specified, defaults to 0 otherwise
 #endif
 #if OMP_45_ENABLED
-extern kmp_int32 __kmp_max_task_priority; // Set via OMP_MAX_TASK_PRIORITY if
-// specified, defaults to 0 otherwise
+// Set via OMP_MAX_TASK_PRIORITY if specified, defaults to 0 otherwise
+extern kmp_int32 __kmp_max_task_priority;
+// Set via KMP_TASKLOOP_MIN_TASKS if specified, defaults to 0 otherwise
+extern kmp_uint64 __kmp_taskloop_min_tasks;
 #endif

 /* NOTE: kmp_taskdata_t and kmp_task_t structures allocated in single block with
Index: runtime/src/kmp_global.cpp
===================================================================
--- runtime/src/kmp_global.cpp
+++ runtime/src/kmp_global.cpp
@@ -301,6 +301,7 @@
 kmp_tasking_mode_t __kmp_tasking_mode = tskm_task_teams;
 #if OMP_45_ENABLED
 kmp_int32 __kmp_max_task_priority = 0;
+kmp_uint64 __kmp_taskloop_min_tasks = 0;
 #endif

 /* This check ensures that the compiler is passing the correct data type for the
Index: runtime/src/kmp_settings.cpp
===================================================================
--- runtime/src/kmp_settings.cpp
+++ runtime/src/kmp_settings.cpp
@@ -1167,6 +1167,20 @@
                                              char const *name, void *data) {
   __kmp_stg_print_int(buffer, name, __kmp_max_task_priority);
 } // __kmp_stg_print_max_task_priority
+
+// KMP_TASKLOOP_MIN_TASKS
+// taskloop threshold to switch from recursive to linear task creation
+static void __kmp_stg_parse_taskloop_min_tasks(char const *name,
+                                               char const *value, void *data) {
+  int tmp;
+  __kmp_stg_parse_int(name, value, 0, INT_MAX, &tmp);
+  __kmp_taskloop_min_tasks = tmp;
+} // __kmp_stg_parse_taskloop_min_tasks
+
+static void __kmp_stg_print_taskloop_min_tasks(kmp_str_buf_t *buffer,
+                                               char const *name, void *data) {
+  __kmp_stg_print_int(buffer, name, __kmp_taskloop_min_tasks);
+} // __kmp_stg_print_taskloop_min_tasks
 #endif // OMP_45_ENABLED

 // -----------------------------------------------------------------------------
@@ -4375,6 +4389,8 @@
 #if OMP_45_ENABLED
     {"OMP_MAX_TASK_PRIORITY", __kmp_stg_parse_max_task_priority,
      __kmp_stg_print_max_task_priority, NULL, 0, 0},
+    {"KMP_TASKLOOP_MIN_TASKS", __kmp_stg_parse_taskloop_min_tasks,
+     __kmp_stg_print_taskloop_min_tasks, NULL, 0, 0},
 #endif
     {"OMP_THREAD_LIMIT", __kmp_stg_parse_all_threads,
      __kmp_stg_print_all_threads, NULL, 0, 0},
Index: runtime/src/kmp_tasking.cpp
===================================================================
--- runtime/src/kmp_tasking.cpp
+++ runtime/src/kmp_tasking.cpp
@@ -3245,6 +3245,7 @@
                             0);
   }
   taskdata->td_alloc_thread = thread;
+  taskdata->td_parent = parent_task;
   taskdata->td_taskgroup =
       parent_task
           ->td_taskgroup; // task inherits the taskgroup from the parent task
@@ -3273,32 +3274,37 @@
   return task;
 }

-// Routine optionally generated by th ecompiler for setting the lastprivate flag
+// Routine optionally generated by the compiler for setting the lastprivate flag
 // and calling needed constructors for private/firstprivate objects
 // (used to form taskloop tasks from pattern task)
+// Parameters: dest task, src task, lastprivate flag.
 typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);

 // __kmp_taskloop_linear: Start tasks of the taskloop linearly
 //
 // loc        Source location information
 // gtid       Global thread ID
-// task       Task with whole loop iteration range
-// lb         Pointer to loop lower bound
-// ub         Pointer to loop upper bound
+// task       Pattern task, exposes the loop iteration range
+// lb         Pointer to loop lower bound in task structure
+// ub         Pointer to loop upper bound in task structure
 // st         Loop stride
-// sched      Schedule specified 0/1/2 for none/grainsize/num_tasks
-// grainsize  Schedule value if specified
+// ub_glob    Global upper bound (used for lastprivate check)
+// num_tasks  Number of tasks to execute
+// grainsize  Number of loop iterations per task
+// extras     Number of chunks with grainsize+1 iterations
+// tc         Iterations count
 // task_dup   Tasks duplication routine
 void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
                            kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
-                           int sched, kmp_uint64 grainsize, void *task_dup) {
+                           kmp_uint64 ub_glob, kmp_uint64 num_tasks,
+                           kmp_uint64 grainsize, kmp_uint64 extras,
+                           kmp_uint64 tc, void *task_dup) {
   KMP_COUNT_BLOCK(OMP_TASKLOOP);
   KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling);
   p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
-  kmp_uint64 tc;
   kmp_uint64 lower = *lb; // compiler provides global bounds here
   kmp_uint64 upper = *ub;
-  kmp_uint64 i, num_tasks = 0, extras = 0;
+  kmp_uint64 i;
   kmp_info_t *thread = __kmp_threads[gtid];
   kmp_taskdata_t *current_task = thread->th.th_current_task;
   kmp_task_t *next_task;
@@ -3308,6 +3314,254 @@
   size_t upper_offset =
       (char *)ub - (char *)task; // remember offset of ub in the task structure

+  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
+  KMP_DEBUG_ASSERT(num_tasks > extras);
+  KMP_DEBUG_ASSERT(num_tasks > 0);
+  KA_TRACE(20, ("__kmp_taskloop_linear: T#%d: %lld tasks, grainsize %lld, "
+                "extras %lld, i=%lld,%lld(%d)%lld, dup %p\n", gtid, num_tasks,
+                grainsize, extras, lower, upper, ub_glob, st, task_dup));
+
+  // Launch num_tasks tasks, assign grainsize iterations each task
+  for (i = 0; i < num_tasks; ++i) {
+    kmp_uint64 chunk_minus_1;
+    if (extras == 0) {
+      chunk_minus_1 = grainsize - 1;
+    } else {
+      chunk_minus_1 = grainsize;
+      --extras; // first extras iterations get bigger chunk (grainsize+1)
+    }
+    upper = lower + st * chunk_minus_1;
+    if (i == num_tasks - 1) {
+      // schedule the last task, set lastprivate flag if needed
+      if (st == 1) { // most common case
+        KMP_DEBUG_ASSERT(upper == *ub);
+        if (upper == ub_glob)
+          lastpriv = 1;
+      } else if (st > 0) { // positive loop stride
+        KMP_DEBUG_ASSERT((kmp_uint64)st > *ub - upper);
+        if ((kmp_uint64)st > ub_glob - upper)
+          lastpriv = 1;
+      } else { // negative loop stride
+        KMP_DEBUG_ASSERT(upper + st < *ub);
+        if (upper - ub_glob < (kmp_uint64)(-st))
+          lastpriv = 1;
+      }
+    }
+    next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
+    // adjust task-specific bounds
+    *(kmp_uint64 *)((char *)next_task + lower_offset) = lower;
+    *(kmp_uint64 *)((char *)next_task + upper_offset) = upper;
+    if (ptask_dup != NULL) // set lastprivate flag, construct firstprivates, etc.
+      ptask_dup(next_task, task, lastpriv);
+    KA_TRACE(40, ("__kmp_taskloop_linear: T#%d; task %p: lower %lld, "
+                  "upper %lld (offsets %p %p)\n",
+                  gtid, next_task, lower, upper, lower_offset, upper_offset));
+    __kmp_omp_task(gtid, next_task, true); // schedule new task
+    lower = upper + st; // adjust lower bound for the next iteration
+  }
+  // free the pattern task and exit
+  __kmp_task_start(gtid, task, current_task); // make internal bookkeeping
+  // do not execute the pattern task, just do internal bookkeeping
+  __kmp_task_finish(gtid, task, current_task);
+}
+
+// Structure to keep taskloop parameters for auxiliary task
+// kept in the shareds of the task structure.
+typedef struct __taskloop_params {
+  kmp_task_t *task;
+  kmp_uint64 *lb;
+  kmp_uint64 *ub;
+  void *task_dup;
+  kmp_int64 st;
+  kmp_uint64 ub_glob;
+  kmp_uint64 num_tasks;
+  kmp_uint64 grainsize;
+  kmp_uint64 extras;
+  kmp_uint64 tc;
+  kmp_uint64 num_t_min;
+} __taskloop_params_t;
+
+void __kmp_taskloop_recur(ident_t *, int, kmp_task_t *, kmp_uint64 *,
+                          kmp_uint64 *, kmp_int64, kmp_uint64, kmp_uint64,
+                          kmp_uint64, kmp_uint64, kmp_uint64, kmp_uint64,
+                          void *);
+
+// Execute part of the taskloop submitted as a task.
+int __kmp_taskloop_task(int gtid, void *ptask) {
+  __taskloop_params_t *p =
+      (__taskloop_params_t *)((kmp_task_t *)ptask)->shareds;
+  kmp_task_t *task = p->task;
+  kmp_uint64 *lb = p->lb;
+  kmp_uint64 *ub = p->ub;
+  void *task_dup = p->task_dup;
+  // p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
+  kmp_int64 st = p->st;
+  kmp_uint64 ub_glob = p->ub_glob;
+  kmp_uint64 num_tasks = p->num_tasks;
+  kmp_uint64 grainsize = p->grainsize;
+  kmp_uint64 extras = p->extras;
+  kmp_uint64 tc = p->tc;
+  kmp_uint64 num_t_min = p->num_t_min;
+#if KMP_DEBUG
+  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
+  KMP_DEBUG_ASSERT(task != NULL);
+  KA_TRACE(20, ("__kmp_taskloop_task: T#%d, task %p: %lld tasks, grainsize"
+                " %lld, extras %lld, i=%lld,%lld(%d), dup %p\n", gtid, taskdata,
+                num_tasks, grainsize, extras, *lb, *ub, st, task_dup));
+#endif
+  KMP_DEBUG_ASSERT(num_tasks*2+1 > num_t_min);
+  if (num_tasks > num_t_min)
+    __kmp_taskloop_recur(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
+                         grainsize, extras, tc, num_t_min, task_dup);
+  else
+    __kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
+                          grainsize, extras, tc, task_dup);
+
+  KA_TRACE(40, ("__kmp_taskloop_task(exit): T#%d\n", gtid));
+  return 0;
+}
+
+// Schedule part of the taskloop as a task,
+// execute the rest of the taskloop.
+//
+// loc        Source location information
+// gtid       Global thread ID
+// task       Pattern task, exposes the loop iteration range
+// lb         Pointer to loop lower bound in task structure
+// ub         Pointer to loop upper bound in task structure
+// st         Loop stride
+// ub_glob    Global upper bound (used for lastprivate check)
+// num_tasks  Number of tasks to execute
+// grainsize  Number of loop iterations per task
+// extras     Number of chunks with grainsize+1 iterations
+// tc         Iterations count
+// num_t_min  Threshold to launch tasks recursively
+// task_dup   Tasks duplication routine
+void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
+                          kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
+                          kmp_uint64 ub_glob, kmp_uint64 num_tasks,
+                          kmp_uint64 grainsize, kmp_uint64 extras,
+                          kmp_uint64 tc, kmp_uint64 num_t_min, void *task_dup) {
+#if KMP_DEBUG
+  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
+  KMP_DEBUG_ASSERT(task != NULL);
+  KMP_DEBUG_ASSERT(num_tasks > num_t_min);
+  KA_TRACE(20, ("__kmp_taskloop_recur: T#%d, task %p: %lld tasks, grainsize"
+                " %lld, extras %lld, i=%lld,%lld(%d), dup %p\n", gtid, taskdata,
+                num_tasks, grainsize, extras, *lb, *ub, st, task_dup));
+#endif
+  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
+  kmp_uint64 lower = *lb;
+  kmp_uint64 upper = *ub;
+  kmp_info_t *thread = __kmp_threads[gtid];
+  // kmp_taskdata_t *current_task = thread->th.th_current_task;
+  kmp_task_t *next_task;
+  kmp_int32 lastpriv = 0;
+  size_t lower_offset =
+      (char *)lb - (char *)task; // remember offset of lb in the task structure
+  size_t upper_offset =
+      (char *)ub - (char *)task; // remember offset of ub in the task structure
+
+  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
+  KMP_DEBUG_ASSERT(num_tasks > extras);
+  KMP_DEBUG_ASSERT(num_tasks > 0);
+
+  // split the loop in two halves
+  kmp_uint64 lb1, ub0, tc0, tc1, ext0, ext1;
+  kmp_uint64 gr_size0 = grainsize;
+  kmp_uint64 n_tsk0 = num_tasks >> 1; // num_tasks/2 to execute
+  kmp_uint64 n_tsk1 = num_tasks - n_tsk0; // to schedule as a task
+  if (n_tsk0 <= extras) {
+    gr_size0++; // integrate extras into grainsize
+    ext0 = 0; // no extra iters in 1st half
+    ext1 = extras - n_tsk0; // remaining extras
+    tc0 = gr_size0 * n_tsk0;
+    tc1 = tc - tc0;
+  } else { // n_tsk0 > extras
+    ext1 = 0; // no extra iters in 2nd half
+    ext0 = extras;
+    tc1 = grainsize * n_tsk1;
+    tc0 = tc - tc1;
+  }
+  ub0 = lower + st * (tc0 - 1);
+  lb1 = ub0 + st;
+
+  // create pattern task for 2nd half of the loop
+  next_task = __kmp_task_dup_alloc(thread, task); // duplicate the task
+  // adjust lower bound (upper bound is not changed) for the 2nd half
+  *(kmp_uint64 *)((char *)next_task + lower_offset) = lb1;
+  if (ptask_dup != NULL) // construct firstprivates, etc.
+    ptask_dup(next_task, task, 0);
+  *ub = ub0; // adjust upper bound for the 1st half
+
+  // create auxiliary task for 2nd half of the loop
+  kmp_task_t *new_task =
+      __kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void *),
+                            sizeof(__taskloop_params_t), &__kmp_taskloop_task);
+  __taskloop_params_t *p = (__taskloop_params_t *)new_task->shareds;
+  p->task = next_task;
+  p->lb = (kmp_uint64 *)((char *)next_task + lower_offset);
+  p->ub = (kmp_uint64 *)((char *)next_task + upper_offset);
+  p->task_dup = task_dup;
+  p->st = st;
+  p->ub_glob = ub_glob;
+  p->num_tasks = n_tsk1;
+  p->grainsize = grainsize;
+  p->extras = ext1;
+  p->tc = tc1;
+  p->num_t_min = num_t_min;
+  __kmp_omp_task(gtid, new_task, true); // schedule new task
+
+  // execute the 1st half of current subrange
+  if (n_tsk0 > num_t_min)
+    __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0,
+                         gr_size0, ext0, tc0, num_t_min, task_dup);
+  else
+    __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0,
+                          gr_size0, ext0, tc0, task_dup);
+
+  KA_TRACE(40, ("__kmp_taskloop_recur(exit): T#%d\n", gtid));
+}
+
+/*!
+@ingroup TASKING
+@param loc       Source location information
+@param gtid      Global thread ID
+@param task      Task structure
+@param if_val    Value of the if clause
+@param lb        Pointer to loop lower bound in task structure
+@param ub        Pointer to loop upper bound in task structure
+@param st        Loop stride
+@param nogroup   Flag, 1 if nogroup clause specified, 0 otherwise
+@param sched     Schedule specified 0/1/2 for none/grainsize/num_tasks
+@param grainsize Schedule value if specified
+@param task_dup  Tasks duplication routine
+
+Execute the taskloop construct.
+*/
+void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
+                     kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup,
+                     int sched, kmp_uint64 grainsize, void *task_dup) {
+  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
+  KMP_DEBUG_ASSERT(task != NULL);
+
+  KA_TRACE(20, ("__kmpc_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, "
+                "grain %llu(%d), dup %p\n", gtid, taskdata, *lb, *ub, st,
+                grainsize, sched, task_dup));
+
+  if (nogroup == 0)
+    __kmpc_taskgroup(loc, gtid);
+
+  // =========================================================================
+  // calculate loop parameters
+  kmp_uint64 tc;
+  kmp_uint64 lower = *lb; // compiler provides global bounds here
+  kmp_uint64 upper = *ub;
+  kmp_uint64 ub_glob = upper; // global upper used to calc lastprivate flag
+  kmp_uint64 num_tasks = 0, extras = 0;
+  kmp_uint64 num_tasks_min = __kmp_taskloop_min_tasks;
+  kmp_info_t *thread = __kmp_threads[gtid];
+  kmp_taskdata_t *current_task = thread->th.th_current_task;
   // compute trip count
   if (st == 1) { // most common case
     tc = upper - lower + 1;
@@ -3324,6 +3578,10 @@
     __kmp_task_finish(gtid, task, current_task);
     return;
   }
+  if (num_tasks_min == 0)
+    // TODO: can we choose a better default heuristic?
+    num_tasks_min = KMP_MIN(thread->th.th_team_nproc * 10,
+                            INITIAL_TASK_DEQUE_SIZE);

   // compute num_tasks/grainsize based on the input provided
   switch (sched) {
@@ -3348,9 +3606,8 @@
       extras = 0;
     } else {
       num_tasks = tc / grainsize;
-      grainsize =
-          tc /
-          num_tasks; // adjust grainsize for balanced distribution of iterations
+      // adjust grainsize for balanced distribution of iterations
+      grainsize = tc / num_tasks;
       extras = tc % num_tasks;
     }
     break;
@@ -3360,95 +3617,32 @@
   KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
   KMP_DEBUG_ASSERT(num_tasks > extras);
   KMP_DEBUG_ASSERT(num_tasks > 0);
-  KA_TRACE(20, ("__kmpc_taskloop: T#%d will launch: num_tasks %lld, grainsize "
-                "%lld, extras %lld\n",
-                gtid, num_tasks, grainsize, extras));
-
-  // Main loop, launch num_tasks tasks, assign grainsize iterations each task
-  for (i = 0; i < num_tasks; ++i) {
-    kmp_uint64 chunk_minus_1;
-    if (extras == 0) {
-      chunk_minus_1 = grainsize - 1;
-    } else {
-      chunk_minus_1 = grainsize;
-      --extras; // first extras iterations get bigger chunk (grainsize+1)
-    }
-    upper = lower + st * chunk_minus_1;
-    if (i == num_tasks - 1) {
-      // schedule the last task, set lastprivate flag
-      lastpriv = 1;
-#if KMP_DEBUG
-      if (st == 1)
-        KMP_DEBUG_ASSERT(upper == *ub);
-      else if (st > 0)
-        KMP_DEBUG_ASSERT(upper + st > *ub);
-      else
-        KMP_DEBUG_ASSERT(upper + st < *ub);
-#endif
-    }
-    next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
-    *(kmp_uint64 *)((char *)next_task + lower_offset) =
-        lower; // adjust task-specific bounds
-    *(kmp_uint64 *)((char *)next_task + upper_offset) = upper;
-    if (ptask_dup != NULL)
-      ptask_dup(next_task, task,
-                lastpriv); // set lastprivate flag, construct fistprivates, etc.
-    KA_TRACE(20, ("__kmpc_taskloop: T#%d schedule task %p: lower %lld, upper "
-                  "%lld (offsets %p %p)\n",
-                  gtid, next_task, lower, upper, lower_offset, upper_offset));
-    __kmp_omp_task(gtid, next_task, true); // schedule new task
-    lower = upper + st; // adjust lower bound for the next iteration
-  }
-  // free the pattern task and exit
-  __kmp_task_start(gtid, task, current_task);
-  // do not execute the pattern task, just do bookkeeping
-  __kmp_task_finish(gtid, task, current_task);
-}
-
-/*!
-@ingroup TASKING
-@param loc       Source location information
-@param gtid      Global thread ID
-@param task      Task structure
-@param if_val    Value of the if clause
-@param lb        Pointer to loop lower bound
-@param ub        Pointer to loop upper bound
-@param st        Loop stride
-@param nogroup   Flag, 1 if nogroup clause specified, 0 otherwise
-@param sched     Schedule specified 0/1/2 for none/grainsize/num_tasks
-@param grainsize Schedule value if specified
-@param task_dup  Tasks duplication routine
-
-Execute the taskloop construct.
-*/
-void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
-                     kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup,
-                     int sched, kmp_uint64 grainsize, void *task_dup) {
-  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
-  KMP_DEBUG_ASSERT(task != NULL);
-
-  KA_TRACE(10, ("__kmpc_taskloop(enter): T#%d, pattern task %p, lb %lld ub "
-                "%lld st %lld, grain %llu(%d)\n",
-                gtid, taskdata, *lb, *ub, st, grainsize, sched));
+  // =========================================================================
   // check if clause value first
   if (if_val == 0) { // if(0) specified, mark task as serial
     taskdata->td_flags.task_serial = 1;
     taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied
-  }
-  if (nogroup == 0) {
-    __kmpc_taskgroup(loc, gtid);
-  }
-
-  if (1 /* AC: use some heuristic here to choose task scheduling method */) {
-    __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, sched, grainsize,
-                          task_dup);
+    // always start serial tasks linearly
+    __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
+                          grainsize, extras, tc, task_dup);
+  } else if (num_tasks > num_tasks_min) {
+    KA_TRACE(20, ("__kmpc_taskloop: T#%d, go recursive: tc %llu, #tasks %llu"
+                  "(%lld), grain %llu, extras %llu\n", gtid, tc, num_tasks,
+                  num_tasks_min, grainsize, extras));
+    __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
+                         grainsize, extras, tc, num_tasks_min, task_dup);
+  } else {
+    KA_TRACE(20, ("__kmpc_taskloop: T#%d, go linear: tc %llu, #tasks %llu"
+                  "(%lld), grain %llu, extras %llu\n", gtid, tc, num_tasks,
+                  num_tasks_min, grainsize, extras));
+    __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
+                          grainsize, extras, tc, task_dup);
   }
-  if (nogroup == 0) {
+  if (nogroup == 0)
     __kmpc_end_taskgroup(loc, gtid);
-  }
-  KA_TRACE(10, ("__kmpc_taskloop(exit): T#%d\n", gtid));
+  KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid));
 }
 #endif
Index: runtime/test/tasking/kmp_taskloop.c
===================================================================
--- runtime/test/tasking/kmp_taskloop.c
+++ runtime/test/tasking/kmp_taskloop.c
@@ -1,4 +1,5 @@
 // RUN: %libomp-compile-and-run
+// RUN: %libomp-compile && env KMP_TASKLOOP_MIN_TASKS=1 %libomp-run
 #include <stdio.h>
 #include <omp.h>
 #include "omp_my_sleep.h"
Index: runtime/test/tasking/omp_taskloop_grainsize.c
===================================================================
--- /dev/null
+++ runtime/test/tasking/omp_taskloop_grainsize.c
@@ -0,0 +1,106 @@
+// RUN: %libomp-compile-and-run
+// RUN: %libomp-compile && env KMP_TASKLOOP_MIN_TASKS=1 %libomp-run
+/*
+ * Test for taskloop
+ * Method: calculate how many times the iteration space is dispatched
+ *   and judge if each dispatch has the requested grainsize.
+ * It is possible for two adjacent chunks to be executed by the same thread.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <omp.h>
+#include "omp_testsuite.h"
+
+#define CFDMAX_SIZE 1120
+
+int test_omp_taskloop_grainsize()
+{
+  int i, grainsize, count, tmp_count, num_off;
+  int result = 0;
+  int *tmp, *tids, *tidsArray;
+
+  tidsArray = (int *)malloc(sizeof(int) * CFDMAX_SIZE);
+  tids = tidsArray;
+
+  for (grainsize = 1; grainsize < 48; ++grainsize) {
+    fprintf(stderr, "Grainsize %d\n", grainsize);
+    count = tmp_count = num_off = 0;
+
+    for (i = 0; i < CFDMAX_SIZE; ++i) {
+      tids[i] = -1;
+    }
+
+    #pragma omp parallel shared(tids)
+    {
+      #pragma omp master
+      #pragma omp taskloop grainsize(grainsize)
+      for (i = 0; i < CFDMAX_SIZE; i++) {
+        tids[i] = omp_get_thread_num();
+      }
+    }
+
+    for (i = 0; i < CFDMAX_SIZE; ++i) {
+      if (tids[i] == -1) {
+        fprintf(stderr, " Iteration %d not touched!\n", i);
+        result++;
+      }
+    }
+
+    for (i = 0; i < CFDMAX_SIZE - 1; ++i) {
+      if (tids[i] != tids[i + 1]) {
+        count++;
+      }
+    }
+
+    tmp = (int *)malloc(sizeof(int) * (count + 1));
+    tmp[0] = 1;
+
+    for (i = 0; i < CFDMAX_SIZE - 1; ++i) {
+      if (tmp_count > count) {
+        printf("--------------------\nTest internal error: list too "
+               "small!!!\n--------------------\n");
+        break;
+      }
+      if (tids[i] != tids[i + 1]) {
+        tmp_count++;
+        tmp[tmp_count] = 1;
+      } else {
+        tmp[tmp_count]++;
+      }
+    }
+
+    // check that the grainsize clause was honored
+    int num_tasks = CFDMAX_SIZE / grainsize;
+    int multiple1 = CFDMAX_SIZE / num_tasks;
+    int multiple2 = CFDMAX_SIZE / num_tasks + 1;
+    for (i = 0; i < count; i++) {
+      // two adjacent chunks may be assigned to the same thread
+      if (tmp[i] % multiple1 != 0 && tmp[i] % multiple2 != 0) {
+        num_off++;
+      }
+    }
+
+    if (num_off > 1) {
+      fprintf(stderr, " The number of bad chunks is %d\n", num_off);
+      result++;
+    } else {
+      fprintf(stderr, " Everything ok\n");
+    }
+
+    free(tmp);
+  }
+  free(tidsArray);
+  return (result == 0);
+}
+
+int main()
+{
+  int i;
+  int num_failed = 0;
+
+  for (i = 0; i < REPETITIONS; i++) {
+    if (!test_omp_taskloop_grainsize()) {
+      num_failed++;
+    }
+  }
+  return num_failed;
+}
Index: runtime/test/tasking/omp_taskloop_num_tasks.c
===================================================================
--- /dev/null
+++ runtime/test/tasking/omp_taskloop_num_tasks.c
@@ -0,0 +1,67 @@
+// RUN: %libomp-compile-and-run
+// RUN: %libomp-compile && env KMP_TASKLOOP_MIN_TASKS=1 %libomp-run
+/*
+ * Test for taskloop
+ * Method: calculate how many times the iteration space is dispatched
+ *   and check that the number of dispatches does not exceed num_tasks.
+ * It is possible for two adjacent chunks to be executed by the same thread.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <omp.h>
+#include "omp_testsuite.h"
+
+#define CFDMAX_SIZE 1120
+
+int test_omp_taskloop_num_tasks()
+{
+  int i;
+  int *tids;
+  int *tidsArray;
+  int count;
+  int result = 0;
+  int num_tasks;
+
+  for (num_tasks = 1; num_tasks < 120; ++num_tasks) {
+    count = 0;
+    tidsArray = (int *)malloc(sizeof(int) * CFDMAX_SIZE);
+    tids = tidsArray;
+
+    #pragma omp parallel shared(tids)
+    {
+      int i;
+      #pragma omp master
+      #pragma omp taskloop num_tasks(num_tasks)
+      for (i = 0; i < CFDMAX_SIZE; i++) {
+        tids[i] = omp_get_thread_num();
+      }
+    }
+
+    for (i = 0; i < CFDMAX_SIZE - 1; ++i) {
+      if (tids[i] != tids[i + 1]) {
+        count++;
+      }
+    }
+
+    if (count > num_tasks) {
+      fprintf(stderr, "counted too many tasks: (wanted %d, got %d)\n",
+              num_tasks, count);
+      result++;
+    }
+    free(tidsArray); // release the per-iteration buffer before reallocating
+  }
+
+  return (result == 0);
+}
+
+int main()
+{
+  int i;
+  int num_failed = 0;
+
+  for (i = 0; i < REPETITIONS; i++) {
+    if (!test_omp_taskloop_num_tasks()) {
+      num_failed++;
+    }
+  }
+  return num_failed;
+}
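
A note on the chunking math in __kmpc_taskloop: for the grainsize clause, the runtime first derives num_tasks = tc / grainsize, then rebalances grainsize to tc / num_tasks and keeps the remainder in extras, so the invariant tc == num_tasks * grainsize + extras asserted throughout the patch holds by construction, with the first extras tasks running grainsize + 1 iterations. A standalone sketch of just that branch (hypothetical compute_chunking helper; it mirrors the patch, it is not the runtime sources):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

// Hypothetical standalone mirror of the grainsize -> (num_tasks, grainsize,
// extras) computation in __kmpc_taskloop; only the branch shown in the diff.
static void compute_chunking(uint64_t tc, uint64_t grainsize,
                             uint64_t *num_tasks, uint64_t *grain,
                             uint64_t *extras) {
  if (grainsize > tc) {
    *grain = tc; // too big grainsize requested: one task gets everything
    *num_tasks = 1;
    *extras = 0;
  } else {
    *num_tasks = tc / grainsize;
    *grain = tc / *num_tasks;  // rebalance iterations across tasks
    *extras = tc % *num_tasks; // first 'extras' tasks run grain+1 iterations
  }
  // the invariant every scheduling path relies on
  assert(tc == *num_tasks * *grain + *extras);
  assert(*num_tasks > *extras);
}

int main(void) {
  uint64_t n, g, e;
  compute_chunking(1120, 13, &n, &g, &e); // 1120 iterations, grainsize(13)
  // 1120/13 = 86 tasks, rebalanced grain 1120/86 = 13, extras 1120%86 = 2,
  // so 2 tasks run 14 iterations and 84 run 13: 2*14 + 84*13 = 1120
  printf("num_tasks=%llu grain=%llu extras=%llu\n", (unsigned long long)n,
         (unsigned long long)g, (unsigned long long)e);
  return 0;
}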
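
__kmp_taskloop_recur must re-establish the same invariant in both halves of the split: when the first half has no more than extras tasks, each of its tasks absorbs one extra iteration (gr_size0 = grainsize + 1, ext0 = 0) and the remaining extras travel with the deferred half; otherwise all extras fit in the first half. A brute-force check of that bookkeeping (a sketch reusing the patch's variable names; the split/main drivers are hypothetical):

#include <assert.h>
#include <stdint.h>

// Hypothetical mirror of the two-way split in __kmp_taskloop_recur: checks
// that tc == num_tasks * grainsize + extras survives in both halves.
static void split(uint64_t num_tasks, uint64_t grainsize, uint64_t extras,
                  uint64_t tc) {
  uint64_t gr_size0 = grainsize;
  uint64_t n_tsk0 = num_tasks >> 1;     // first half, executed in place
  uint64_t n_tsk1 = num_tasks - n_tsk0; // second half, deferred as a task
  uint64_t tc0, tc1, ext0, ext1;
  if (n_tsk0 <= extras) {
    gr_size0++;             // every first-half task absorbs one extra iteration
    ext0 = 0;
    ext1 = extras - n_tsk0; // leftovers stay with the deferred half
    tc0 = gr_size0 * n_tsk0;
    tc1 = tc - tc0;
  } else {
    ext1 = 0;
    ext0 = extras;          // all extras fit in the first half
    tc1 = grainsize * n_tsk1;
    tc0 = tc - tc1;
  }
  assert(tc0 == n_tsk0 * gr_size0 + ext0);  // first-half invariant
  assert(tc1 == n_tsk1 * grainsize + ext1); // second-half invariant
}

int main(void) {
  // brute-force the invariant over a small parameter space
  for (uint64_t nt = 1; nt <= 64; ++nt)
    for (uint64_t gs = 1; gs <= 16; ++gs)
      for (uint64_t ex = 0; ex < nt; ++ex) // num_tasks > extras always holds
        split(nt, gs, ex, nt * gs + ex);
  return 0;
}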
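
The reworked lastprivate handling is also worth spelling out: after a recursive split, __kmp_taskloop_linear may receive an interior subrange, so the last task of a linear launch no longer automatically owns the lastprivate copy; only the chunk that reaches the global bound ub_glob does. A sketch of the three stride cases (hypothetical is_last_chunk helper; assumes ub_glob names the last iteration actually executed, as the compiler-normalized bounds do):

#include <assert.h>
#include <stdint.h>

// Hypothetical mirror of the last-chunk test in __kmp_taskloop_linear: the
// lastprivate flag is set only for the chunk containing the global last
// iteration ub_glob, which matters once recursive splitting hands this
// routine only a subrange of the loop.
static int is_last_chunk(uint64_t upper, uint64_t ub_glob, int64_t st) {
  if (st == 1) // most common case
    return upper == ub_glob;
  if (st > 0)  // positive stride: no further iteration fits below ub_glob
    return (uint64_t)st > ub_glob - upper;
  // negative stride: ub_glob is the smallest iteration value reached
  return upper - ub_glob < (uint64_t)(-st);
}

int main(void) {
  // loop: for (i = 0; i <= 9; i++) -> ub_glob == 9
  assert(is_last_chunk(9, 9, 1));
  // loop: for (i = 0; i <= 10; i += 3) -> iterations 0,3,6,9; ub_glob == 9
  assert(is_last_chunk(9, 9, 3));  // chunk ending at 9 is the last
  assert(!is_last_chunk(6, 9, 3)); // 6 + 3 <= 9, another chunk follows
  // loop: for (i = 10; i >= 0; i -= 4) -> iterations 10,6,2; ub_glob == 2
  assert(is_last_chunk(2, 2, -4));
  assert(!is_last_chunk(6, 2, -4)); // 6 - 4 >= 2, another chunk follows
  return 0;
}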
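
Usage note: KMP_TASKLOOP_MIN_TASKS sets the task-count threshold below which the runtime creates taskloop tasks linearly instead of by recursive halving; when unset it stays 0 and the runtime picks KMP_MIN(th_team_nproc * 10, INITIAL_TASK_DEQUE_SIZE) at dispatch time. Both paths can be exercised against any taskloop program, e.g. with KMP_TASKLOOP_MIN_TASKS=1 as the new RUN lines do. A minimal user-side example (assumes an OpenMP 4.5 compiler; run as `KMP_TASKLOOP_MIN_TASKS=1 ./a.out` to force the recursive path):

#include <omp.h>
#include <stdio.h>

#define N 1000

int main(void) {
  int tid[N];
  #pragma omp parallel
  #pragma omp master
  #pragma omp taskloop grainsize(10)
  for (int i = 0; i < N; i++)
    tid[i] = omp_get_thread_num();
  // with grainsize(10) the runtime creates N/10 = 100 tasks; whether they are
  // created linearly or by recursive halving depends on KMP_TASKLOOP_MIN_TASKS
  printf("first/last iterations ran on threads %d and %d\n", tid[0],
         tid[N - 1]);
  return 0;
}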