Index: openmp/trunk/runtime/src/exports_so.txt =================================================================== --- openmp/trunk/runtime/src/exports_so.txt +++ openmp/trunk/runtime/src/exports_so.txt @@ -117,5 +117,7 @@ } GOMP_2.0; GOMP_4.0 { } GOMP_3.0; +GOMP_4.5 { +} GOMP_4.0; # end of file # Index: openmp/trunk/runtime/src/kmp.h =================================================================== --- openmp/trunk/runtime/src/kmp.h +++ openmp/trunk/runtime/src/kmp.h @@ -2261,8 +2261,16 @@ #if OMP_45_ENABLED kmp_task_team_t *td_task_team; kmp_int32 td_size_alloc; // The size of task structure, including shareds etc. +#if defined(KMP_GOMP_COMPAT) + // 4 or 8 byte integers for the loop bounds in GOMP_taskloop + kmp_int32 td_size_loop_bounds; +#endif #endif // OMP_45_ENABLED kmp_taskdata_t *td_last_tied; // keep tied task for task scheduling constraint +#if defined(KMP_GOMP_COMPAT) && OMP_45_ENABLED + // GOMP sends in a copy function for copy constructors + void (*td_copy_func)(void *, void *); +#endif #if OMPT_SUPPORT ompt_task_info_t ompt_task_info; #endif Index: openmp/trunk/runtime/src/kmp_ftn_os.h =================================================================== --- openmp/trunk/runtime/src/kmp_ftn_os.h +++ openmp/trunk/runtime/src/kmp_ftn_os.h @@ -613,4 +613,8 @@ #define KMP_API_NAME_GOMP_TARGET_UPDATE GOMP_target_update #define KMP_API_NAME_GOMP_TEAMS GOMP_teams +// All GOMP_4.5 symbols +#define KMP_API_NAME_GOMP_TASKLOOP GOMP_taskloop +#define KMP_API_NAME_GOMP_TASKLOOP_ULL GOMP_taskloop_ull + #endif /* KMP_FTN_OS_H */ Index: openmp/trunk/runtime/src/kmp_gsupport.cpp =================================================================== --- openmp/trunk/runtime/src/kmp_gsupport.cpp +++ openmp/trunk/runtime/src/kmp_gsupport.cpp @@ -1413,6 +1413,137 @@ } #endif // OMP_40_ENABLED +#if OMP_45_ENABLED + +// Task duplication function which copies src to dest (both are +// preallocated task structures) +static void __kmp_gomp_task_dup(kmp_task_t *dest, kmp_task_t 
*src, + kmp_int32 last_private) { + kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(src); + if (taskdata->td_copy_func) { + (taskdata->td_copy_func)(dest->shareds, src->shareds); + } +} + +#ifdef __cplusplus +} // extern "C" +#endif + +template <typename T> +void __GOMP_taskloop(void (*func)(void *), void *data, + void (*copy_func)(void *, void *), long arg_size, + long arg_align, unsigned gomp_flags, + unsigned long num_tasks, int priority, T start, T end, + T step) { + typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32); + MKLOC(loc, "GOMP_taskloop"); + int sched; + T *loop_bounds; + int gtid = __kmp_entry_gtid(); + kmp_int32 flags = 0; + int if_val = gomp_flags & (1u << 10); + int nogroup = gomp_flags & (1u << 11); + int up = gomp_flags & (1u << 8); + p_task_dup_t task_dup = NULL; + kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags; +#ifdef KMP_DEBUG + { + const char *buff; + buff = __kmp_str_format( + "GOMP_taskloop: T#%%d: func:%%p data:%%p copy_func:%%p " + "arg_size:%%ld arg_align:%%ld gomp_flags:0x%%x num_tasks:%%lu " + "priority:%%d start:%%%s end:%%%s step:%%%s\n", + traits_t<T>::spec, traits_t<T>::spec, traits_t<T>::spec); + KA_TRACE(20, (buff, gtid, func, data, copy_func, arg_size, arg_align, + gomp_flags, num_tasks, priority, start, end, step)); + __kmp_str_free(&buff); + } +#endif + KMP_ASSERT((size_t)arg_size >= 2 * sizeof(T)); + KMP_ASSERT(arg_align > 0); + // The low-order bit is the "untied" flag + if (!(gomp_flags & 1)) { + input_flags->tiedness = 1; + } + // The second low-order bit is the "final" flag + if (gomp_flags & 2) { + input_flags->final = 1; + } + // Negative step flag + if (!up) { + // If step is flagged as negative, but isn't properly sign extended + // Then manually sign extend it. Could be a short, int, char embedded + // in a long. So cannot assume any cast. 
+ if (step > 0) { + for (int i = sizeof(T) * CHAR_BIT - 1; i >= 0L; --i) { + // break at the first 1 bit + if (step & ((T)1 << i)) + break; + step |= ((T)1 << i); + } + } + } + input_flags->native = 1; + // Figure out if none/grainsize/num_tasks clause specified + if (num_tasks > 0) { + if (gomp_flags & (1u << 9)) + sched = 1; // grainsize specified + else + sched = 2; // num_tasks specified + // neither grainsize nor num_tasks specified + } else { + sched = 0; + } + + // __kmp_task_alloc() sets up all other flags + kmp_task_t *task = + __kmp_task_alloc(&loc, gtid, input_flags, sizeof(kmp_task_t), + arg_size + arg_align - 1, (kmp_routine_entry_t)func); + kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); + taskdata->td_copy_func = copy_func; + taskdata->td_size_loop_bounds = sizeof(T); + + // re-align shareds if needed and setup firstprivate copy constructors + // through the task_dup mechanism + task->shareds = (void *)((((size_t)task->shareds) + arg_align - 1) / + arg_align * arg_align); + if (copy_func) { + task_dup = __kmp_gomp_task_dup; + } + KMP_MEMCPY(task->shareds, data, arg_size); + + loop_bounds = (T *)task->shareds; + loop_bounds[0] = start; + loop_bounds[1] = end + (up ? 
-1 : 1); + __kmpc_taskloop(&loc, gtid, task, if_val, (kmp_uint64 *)&(loop_bounds[0]), + (kmp_uint64 *)&(loop_bounds[1]), (kmp_int64)step, nogroup, + sched, (kmp_uint64)num_tasks, (void *)task_dup); +} + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TASKLOOP)( + void (*func)(void *), void *data, void (*copy_func)(void *, void *), + long arg_size, long arg_align, unsigned gomp_flags, unsigned long num_tasks, + int priority, long start, long end, long step) { + __GOMP_taskloop<long>(func, data, copy_func, arg_size, arg_align, gomp_flags, + num_tasks, priority, start, end, step); +} + +void KMP_EXPAND_NAME(KMP_API_NAME_GOMP_TASKLOOP_ULL)( + void (*func)(void *), void *data, void (*copy_func)(void *, void *), + long arg_size, long arg_align, unsigned gomp_flags, unsigned long num_tasks, + int priority, unsigned long long start, unsigned long long end, + unsigned long long step) { + __GOMP_taskloop<unsigned long long>(func, data, copy_func, arg_size, + arg_align, gomp_flags, num_tasks, + priority, start, end, step); +} + +#endif + /* The following sections of code create aliases for the GOMP_* functions, then create versioned symbols using the assembler directive .symver. This is only pertinent for ELF .so library. 
The KMP_VERSION_SYMBOL macro is defined in @@ -1521,6 +1652,11 @@ KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_TEAMS, 40, "GOMP_4.0"); #endif +#if OMP_45_ENABLED +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_TASKLOOP, 45, "GOMP_4.5"); +KMP_VERSION_SYMBOL(KMP_API_NAME_GOMP_TASKLOOP_ULL, 45, "GOMP_4.5"); +#endif + #endif // KMP_USE_VERSION_SYMBOLS #ifdef __cplusplus Index: openmp/trunk/runtime/src/kmp_os.h =================================================================== --- openmp/trunk/runtime/src/kmp_os.h +++ openmp/trunk/runtime/src/kmp_os.h @@ -209,6 +209,14 @@ static const unsigned_t min_value = 0x00000000; static const int type_size = sizeof(unsigned_t); }; +// long +template <> struct traits_t<signed long> { + typedef signed long signed_t; + typedef unsigned long unsigned_t; + typedef long double floating_t; + static char const *spec; + static const int type_size = sizeof(signed_t); +}; // long long template <> struct traits_t<signed long long> { typedef signed long long signed_t; Index: openmp/trunk/runtime/src/kmp_sched.cpp =================================================================== --- openmp/trunk/runtime/src/kmp_sched.cpp +++ openmp/trunk/runtime/src/kmp_sched.cpp @@ -35,6 +35,7 @@ char const *traits_t<unsigned int>::spec = "u"; char const *traits_t<long long>::spec = "lld"; char const *traits_t<unsigned long long>::spec = "llu"; +char const *traits_t<long>::spec = "ld"; //------------------------------------------------------------------------- #endif Index: openmp/trunk/runtime/src/kmp_tasking.cpp =================================================================== --- openmp/trunk/runtime/src/kmp_tasking.cpp +++ openmp/trunk/runtime/src/kmp_tasking.cpp @@ -3559,6 +3559,112 @@ // Parameters: dest task, src task, lastprivate flag. typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32); +KMP_BUILD_ASSERT(sizeof(long) == 4 || sizeof(long) == 8); + +// class to encapsulate manipulating loop bounds in a taskloop task. +// this abstracts away the Intel vs GOMP taskloop interface for setting/getting +// the loop bound variables. 
+class kmp_taskloop_bounds_t { + kmp_task_t *task; + const kmp_taskdata_t *taskdata; + size_t lower_offset; + size_t upper_offset; + +public: + kmp_taskloop_bounds_t(kmp_task_t *_task, kmp_uint64 *lb, kmp_uint64 *ub) + : task(_task), taskdata(KMP_TASK_TO_TASKDATA(task)), + lower_offset((char *)lb - (char *)task), + upper_offset((char *)ub - (char *)task) { + KMP_DEBUG_ASSERT((char *)lb > (char *)_task); + KMP_DEBUG_ASSERT((char *)ub > (char *)_task); + } + kmp_taskloop_bounds_t(kmp_task_t *_task, const kmp_taskloop_bounds_t &bounds) + : task(_task), taskdata(KMP_TASK_TO_TASKDATA(_task)), + lower_offset(bounds.lower_offset), upper_offset(bounds.upper_offset) {} + size_t get_lower_offset() const { return lower_offset; } + size_t get_upper_offset() const { return upper_offset; } + kmp_uint64 get_lb() const { + kmp_int64 retval; +#if defined(KMP_GOMP_COMPAT) + // Intel task just returns the lower bound normally + if (!taskdata->td_flags.native) { + retval = *(kmp_int64 *)((char *)task + lower_offset); + } else { + // GOMP task has to take into account the sizeof(long) + if (taskdata->td_size_loop_bounds == 4) { + kmp_int32 *lb = RCAST(kmp_int32 *, task->shareds); + retval = (kmp_int64)*lb; + } else { + kmp_int64 *lb = RCAST(kmp_int64 *, task->shareds); + retval = (kmp_int64)*lb; + } + } +#else + retval = *(kmp_int64 *)((char *)task + lower_offset); +#endif // defined(KMP_GOMP_COMPAT) + return retval; + } + kmp_uint64 get_ub() const { + kmp_int64 retval; +#if defined(KMP_GOMP_COMPAT) + // Intel task just returns the upper bound normally + if (!taskdata->td_flags.native) { + retval = *(kmp_int64 *)((char *)task + upper_offset); + } else { + // GOMP task has to take into account the sizeof(long) + if (taskdata->td_size_loop_bounds == 4) { + kmp_int32 *ub = RCAST(kmp_int32 *, task->shareds) + 1; + retval = (kmp_int64)*ub; + } else { + kmp_int64 *ub = RCAST(kmp_int64 *, task->shareds) + 1; + retval = (kmp_int64)*ub; + } + } +#else + retval = *(kmp_int64 *)((char *)task + 
upper_offset); +#endif // defined(KMP_GOMP_COMPAT) + return retval; + } + void set_lb(kmp_uint64 lb) { +#if defined(KMP_GOMP_COMPAT) + // Intel task just sets the lower bound normally + if (!taskdata->td_flags.native) { + *(kmp_uint64 *)((char *)task + lower_offset) = lb; + } else { + // GOMP task has to take into account the sizeof(long) + if (taskdata->td_size_loop_bounds == 4) { + kmp_uint32 *lower = RCAST(kmp_uint32 *, task->shareds); + *lower = (kmp_uint32)lb; + } else { + kmp_uint64 *lower = RCAST(kmp_uint64 *, task->shareds); + *lower = (kmp_uint64)lb; + } + } +#else + *(kmp_uint64 *)((char *)task + lower_offset) = lb; +#endif // defined(KMP_GOMP_COMPAT) + } + void set_ub(kmp_uint64 ub) { +#if defined(KMP_GOMP_COMPAT) + // Intel task just sets the upper bound normally + if (!taskdata->td_flags.native) { + *(kmp_uint64 *)((char *)task + upper_offset) = ub; + } else { + // GOMP task has to take into account the sizeof(long) + if (taskdata->td_size_loop_bounds == 4) { + kmp_uint32 *upper = RCAST(kmp_uint32 *, task->shareds) + 1; + *upper = (kmp_uint32)ub; + } else { + kmp_uint64 *upper = RCAST(kmp_uint64 *, task->shareds) + 1; + *upper = (kmp_uint64)ub; + } + } +#else + *(kmp_uint64 *)((char *)task + upper_offset) = ub; +#endif // defined(KMP_GOMP_COMPAT) + } +}; + // __kmp_taskloop_linear: Start tasks of the taskloop linearly // // loc Source location information @@ -3581,17 +3687,15 @@ KMP_COUNT_BLOCK(OMP_TASKLOOP); KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling); p_task_dup_t ptask_dup = (p_task_dup_t)task_dup; - kmp_uint64 lower = *lb; // compiler provides global bounds here - kmp_uint64 upper = *ub; + // compiler provides global bounds here + kmp_taskloop_bounds_t task_bounds(task, lb, ub); + kmp_uint64 lower = task_bounds.get_lb(); + kmp_uint64 upper = task_bounds.get_ub(); kmp_uint64 i; kmp_info_t *thread = __kmp_threads[gtid]; kmp_taskdata_t *current_task = thread->th.th_current_task; kmp_task_t *next_task; kmp_int32 lastpriv = 0; - size_t 
lower_offset = - (char *)lb - (char *)task; // remember offset of lb in the task structure - size_t upper_offset = - (char *)ub - (char *)task; // remember offset of ub in the task structure KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras); KMP_DEBUG_ASSERT(num_tasks > extras); @@ -3628,14 +3732,25 @@ } } next_task = __kmp_task_dup_alloc(thread, task); // allocate new task + kmp_taskdata_t *next_taskdata = KMP_TASK_TO_TASKDATA(next_task); + kmp_taskloop_bounds_t next_task_bounds = + kmp_taskloop_bounds_t(next_task, task_bounds); + // adjust task-specific bounds - *(kmp_uint64 *)((char *)next_task + lower_offset) = lower; - *(kmp_uint64 *)((char *)next_task + upper_offset) = upper; + next_task_bounds.set_lb(lower); + if (next_taskdata->td_flags.native) { + next_task_bounds.set_ub(upper + (st > 0 ? 1 : -1)); + } else { + next_task_bounds.set_ub(upper); + } if (ptask_dup != NULL) // set lastprivate flag, construct fistprivates, etc. ptask_dup(next_task, task, lastpriv); - KA_TRACE(40, ("__kmp_taskloop_linear: T#%d; task %p: lower %lld, " - "upper %lld (offsets %p %p)\n", - gtid, next_task, lower, upper, lower_offset, upper_offset)); + KA_TRACE(40, + ("__kmp_taskloop_linear: T#%d; task #%llu: task %p: lower %lld, " + "upper %lld stride %lld, (offsets %p %p)\n", + gtid, i, next_task, lower, upper, st, + next_task_bounds.get_lower_offset(), + next_task_bounds.get_upper_offset())); __kmp_omp_task(gtid, next_task, true); // schedule new task lower = upper + st; // adjust lower bound for the next iteration } @@ -3827,10 +3942,6 @@ kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task); KMP_DEBUG_ASSERT(task != NULL); - KA_TRACE(20, ("__kmpc_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, " - "grain %llu(%d), dup %p\n", - gtid, taskdata, *lb, *ub, st, grainsize, sched, task_dup)); - #if OMPT_SUPPORT && OMPT_OPTIONAL ompt_team_info_t *team_info = __ompt_get_teaminfo(0, NULL); ompt_task_info_t *task_info = __ompt_get_task_info_object(0); @@ -3850,15 +3961,21 @@ // 
========================================================================= // calculate loop parameters + kmp_taskloop_bounds_t task_bounds(task, lb, ub); kmp_uint64 tc; - kmp_uint64 lower = *lb; // compiler provides global bounds here - kmp_uint64 upper = *ub; + // compiler provides global bounds here + kmp_uint64 lower = task_bounds.get_lb(); + kmp_uint64 upper = task_bounds.get_ub(); kmp_uint64 ub_glob = upper; // global upper used to calc lastprivate flag kmp_uint64 num_tasks = 0, extras = 0; kmp_uint64 num_tasks_min = __kmp_taskloop_min_tasks; kmp_info_t *thread = __kmp_threads[gtid]; kmp_taskdata_t *current_task = thread->th.th_current_task; + KA_TRACE(20, ("__kmpc_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, " + "grain %llu(%d), dup %p\n", + gtid, taskdata, lower, upper, st, grainsize, sched, task_dup)); + // compute trip count if (st == 1) { // most common case tc = upper - lower + 1; @@ -3917,6 +4034,7 @@ // ========================================================================= // check if clause value first + // Also require GOMP_taskloop to reduce to linear (taskdata->td_flags.native) if (if_val == 0) { // if(0) specified, mark task as serial taskdata->td_flags.task_serial = 1; taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied @@ -3926,7 +4044,9 @@ // always start serial tasks linearly __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks, grainsize, extras, tc, task_dup); - } else if (num_tasks > num_tasks_min) { + // !taskdata->td_flags.native => currently force linear spawning of tasks + // for GOMP_taskloop + } else if (num_tasks > num_tasks_min && !taskdata->td_flags.native) { KA_TRACE(20, ("__kmpc_taskloop: T#%d, go recursive: tc %llu, #tasks %llu" "(%lld), grain %llu, extras %llu\n", gtid, tc, num_tasks, num_tasks_min, grainsize, extras)); Index: openmp/trunk/runtime/test/tasking/omp_taskloop_grainsize.c =================================================================== --- 
openmp/trunk/runtime/test/tasking/omp_taskloop_grainsize.c +++ openmp/trunk/runtime/test/tasking/omp_taskloop_grainsize.c @@ -1,7 +1,5 @@ // RUN: %libomp-compile-and-run // RUN: %libomp-compile && env KMP_TASKLOOP_MIN_TASKS=1 %libomp-run -// UNSUPPORTED: gcc -// We do not yet have the GOMP interface for taskloop /* * Test for taskloop * Method: caculate how many times the iteration space is dispatched Index: openmp/trunk/runtime/test/tasking/omp_taskloop_num_tasks.c =================================================================== --- openmp/trunk/runtime/test/tasking/omp_taskloop_num_tasks.c +++ openmp/trunk/runtime/test/tasking/omp_taskloop_num_tasks.c @@ -1,7 +1,5 @@ // RUN: %libomp-compile-and-run // RUN: %libomp-compile && env KMP_TASKLOOP_MIN_TASKS=1 %libomp-run -// UNSUPPORTED: gcc -// We do not yet have the GOMP interface for taskloop /* * Test for taskloop * Method: caculate how many times the iteration space is dispatched