Index: openmp/trunk/runtime/src/dllexports =================================================================== --- openmp/trunk/runtime/src/dllexports +++ openmp/trunk/runtime/src/dllexports @@ -393,6 +393,7 @@ __kmpc_doacross_wait 262 __kmpc_doacross_post 263 __kmpc_doacross_fini 264 + __kmpc_taskloop 266 %endif %endif Index: openmp/trunk/runtime/src/kmp.h =================================================================== --- openmp/trunk/runtime/src/kmp.h +++ openmp/trunk/runtime/src/kmp.h @@ -2205,11 +2205,7 @@ #endif #if OMP_41_ENABLED kmp_task_team_t * td_task_team; -#endif -#if KMP_HAVE_QUAD - _Quad td_dummy; // Align structure 16-byte size since allocated just before kmp_task_t -#else - kmp_uint32 td_dummy[2]; + kmp_int32 td_size_alloc; // The size of task structure, including shareds etc. #endif }; // struct kmp_taskdata @@ -3478,7 +3474,9 @@ KMP_EXPORT void __kmpc_proxy_task_completed( kmp_int32 gtid, kmp_task_t *ptask ); KMP_EXPORT void __kmpc_proxy_task_completed_ooo ( kmp_task_t *ptask ); - +KMP_EXPORT void __kmpc_taskloop(ident_t *loc, kmp_int32 gtid, kmp_task_t *task, kmp_int32 if_val, + kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, + kmp_int32 nogroup, kmp_int32 sched, kmp_uint64 grainsize, void * task_dup ); #endif #endif Index: openmp/trunk/runtime/src/kmp_tasking.c =================================================================== --- openmp/trunk/runtime/src/kmp_tasking.c +++ openmp/trunk/runtime/src/kmp_tasking.c @@ -1000,6 +1000,7 @@ #if OMP_41_ENABLED taskdata->td_flags.proxy = flags->proxy; taskdata->td_task_team = thread->th.th_task_team; + taskdata->td_size_alloc = shareds_offset + sizeof_shareds; #endif taskdata->td_flags.tasktype = TASK_EXPLICIT; @@ -2877,4 +2878,231 @@ KA_TRACE(10, ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n", taskdata ) ); } +//--------------------------------------------------------------------------------- +// __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task for taskloop +// +// thread: allocating thread +// task_src: pointer to source task to be duplicated +// returns: a pointer to the allocated kmp_task_t structure (task). +kmp_task_t * +__kmp_task_dup_alloc( kmp_info_t *thread, kmp_task_t *task_src ) +{ + kmp_task_t *task; + kmp_taskdata_t *taskdata; + kmp_taskdata_t *taskdata_src; + kmp_taskdata_t *parent_task = thread->th.th_current_task; + size_t shareds_offset; + size_t task_size; + + KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread, task_src) ); + taskdata_src = KMP_TASK_TO_TASKDATA( task_src ); + KMP_DEBUG_ASSERT( taskdata_src->td_flags.proxy == TASK_FULL ); // it should not be proxy task + KMP_DEBUG_ASSERT( taskdata_src->td_flags.tasktype == TASK_EXPLICIT ); + task_size = taskdata_src->td_size_alloc; + + // Allocate a kmp_taskdata_t block and a kmp_task_t block. 
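+    // Note (grounded in this patch): the kmp_taskdata_t, the kmp_task_t and the
+    // shareds block live in one contiguous allocation, so the td_size_alloc value
+    // recorded at allocation time is enough to duplicate the whole task with the
+    // single memcpy below.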
+    KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread, task_size) );
+    #if USE_FAST_MEMORY
+    taskdata = (kmp_taskdata_t *)__kmp_fast_allocate( thread, task_size );
+    #else
+    taskdata = (kmp_taskdata_t *)__kmp_thread_malloc( thread, task_size );
+    #endif /* USE_FAST_MEMORY */
+    KMP_MEMCPY(taskdata, taskdata_src, task_size);
+
+    task = KMP_TASKDATA_TO_TASK(taskdata);
+
+    // Initialize new task (only specific fields not affected by memcpy)
+    taskdata->td_task_id = KMP_GEN_TASK_ID();
+    if( task->shareds != NULL ) { // need to set up the shareds pointer
+        shareds_offset = (char*)task_src->shareds - (char*)taskdata_src;
+        task->shareds = &((char*)taskdata)[shareds_offset];
+        KMP_DEBUG_ASSERT( (((kmp_uintptr_t)task->shareds) & (sizeof(void*)-1)) == 0 );
+    }
+    taskdata->td_alloc_thread = thread;
+    taskdata->td_taskgroup = parent_task->td_taskgroup; // task inherits the taskgroup from the parent task
+
+    // Only need to keep track of child task counts if team parallel and tasking not serialized
+    if ( !( taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser ) ) {
+        KMP_TEST_THEN_INC32( (kmp_int32 *)(& parent_task->td_incomplete_child_tasks) );
+        if ( parent_task->td_taskgroup )
+            KMP_TEST_THEN_INC32( (kmp_int32 *)(& parent_task->td_taskgroup->count) );
+        // Only need to keep track of allocated child tasks for explicit tasks,
+        // since implicit tasks are not deallocated
+        if ( taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT )
+            KMP_TEST_THEN_INC32( (kmp_int32 *)(& taskdata->td_parent->td_allocated_child_tasks) );
+    }
+
+    KA_TRACE(20, ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n",
+                  thread, taskdata, taskdata->td_parent) );
+#if OMPT_SUPPORT
+    __kmp_task_init_ompt(taskdata, thread->th.th_info.ds.ds_gtid, (void*)task->routine);
+#endif
+    return task;
+}
+
+// Routine optionally generated by the compiler for setting the lastprivate flag
+// and calling needed constructors for private/firstprivate objects
+// (used to form taskloop tasks from the pattern task)
+typedef void(*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
+
+//---------------------------------------------------------------------------------
+// __kmp_taskloop_linear: Launch the tasks of the taskloop one by one (linearly)
+//
+// loc        Source location information
+// gtid       Global thread ID
+// task       Task with the whole loop iteration range
+// lb         Pointer to loop lower bound
+// ub         Pointer to loop upper bound
+// st         Loop stride
+// sched      Schedule specified 0/1/2 for none/grainsize/num_tasks
+// grainsize  Schedule value if specified
+// task_dup   Task duplication routine
+void
+__kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
+                kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
+                int sched, kmp_uint64 grainsize, void *task_dup )
+{
+    p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
+    kmp_uint64 tc;
+    kmp_uint64 lower = *lb; // compiler provides global bounds here
+    kmp_uint64 upper = *ub;
+    kmp_uint64 i, num_tasks, extras;
+    kmp_info_t *thread = __kmp_threads[gtid];
+    kmp_taskdata_t *current_task = thread->th.th_current_task;
+    kmp_task_t *next_task;
+    kmp_int32 lastpriv = 0;
+    size_t lower_offset = (char*)lb - (char*)task; // remember offset of lb in the task structure
+    size_t upper_offset = (char*)ub - (char*)task; // remember offset of ub in the task structure
+
+    // compute trip count
+    if( st == 1 ) {        // most common case
+        tc = upper - lower + 1;
+    } else if( st < 0 ) {
+        tc = (lower - upper) / (-st) + 1;
+    } else {               // st > 0
+        tc = (upper - lower) / st + 1;
+    }
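+    // Worked example (values taken from the kmp_taskloop.c test below):
+    // lb=0, ub=N*GRAIN*STRIDE-2=118, st=3  =>  tc = (118 - 0)/3 + 1 = 40,
+    // i.e. the pattern task covers N*GRAIN = 40 logical iterations.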
+    if( tc == 0 ) {
+        // free the pattern task and exit
+        __kmp_task_start( gtid, task, current_task );
+        // do not execute anything for zero-trip loop
+        __kmp_task_finish( gtid, task, current_task );
+        return;
+    }
+
+    // compute num_tasks/grainsize based on the input provided
+    switch( sched ) {
+    case 0: // no schedule clause specified, we can choose the default
+        // let's try to schedule (team_size*10) tasks
+        grainsize = thread->th.th_team_nproc * 10;
+        // fall through to the num_tasks case, treating this value as num_tasks
+    case 2: // num_tasks provided
+        if( grainsize > tc ) {
+            num_tasks = tc;   // too big num_tasks requested, adjust values
+            grainsize = 1;
+            extras = 0;
+        } else {
+            num_tasks = grainsize;
+            grainsize = tc / num_tasks;
+            extras = tc % num_tasks;
+        }
+        break;
+    case 1: // grainsize provided
+        if( grainsize > tc ) {
+            num_tasks = 1;    // too big grainsize requested, adjust values
+            grainsize = tc;
+            extras = 0;
+        } else {
+            num_tasks = tc / grainsize;
+            grainsize = tc / num_tasks; // adjust grainsize for balanced distribution of iterations
+            extras = tc % num_tasks;
+        }
+        break;
+    default:
+        KMP_ASSERT2(0, "unknown scheduling of taskloop");
+    }
+    KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
+    KMP_DEBUG_ASSERT(num_tasks > extras);
+    KMP_DEBUG_ASSERT(num_tasks > 0);
+
+    // Main loop: launch num_tasks tasks, assigning grainsize iterations to each task
+    for( i = 0; i < num_tasks; ++i ) {
+        kmp_uint64 chunk_minus_1;
+        if( extras == 0 ) {
+            chunk_minus_1 = grainsize - 1;
+        } else {
+            chunk_minus_1 = grainsize;
+            --extras; // the first 'extras' tasks get a bigger chunk (grainsize+1) of iterations
+        }
+        upper = lower + st * chunk_minus_1;
+        if( i == num_tasks - 1 ) {
+            // schedule the last task, set lastprivate flag
+            lastpriv = 1;
+#if KMP_DEBUG
+            if( st == 1 )
+                KMP_DEBUG_ASSERT(upper == *ub);
+            else if( st > 0 )
+                KMP_DEBUG_ASSERT(upper+st > *ub);
+            else
+                KMP_DEBUG_ASSERT(upper+st < *ub);
+#endif
+        }
+        next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
+        *(kmp_uint64*)((char*)next_task + lower_offset) = lower; // adjust task-specific bounds
+        *(kmp_uint64*)((char*)next_task + upper_offset) = upper;
+        if( ptask_dup != NULL )
+            ptask_dup(next_task, task, lastpriv); // set lastprivate flag, construct firstprivates, etc.
+        __kmp_omp_task(gtid, next_task, true); // schedule new task
+        lower = upper + st; // adjust lower bound for the next iteration
+    }
+    // free the pattern task and exit
+    __kmp_task_start( gtid, task, current_task );
+    // do not execute the pattern task, just do bookkeeping
+    __kmp_task_finish( gtid, task, current_task );
+}
+
+/*!
+@ingroup TASKING
+@param loc       Source location information
+@param gtid      Global thread ID
+@param task      Task structure
+@param if_val    Value of the if clause
+@param lb        Pointer to loop lower bound
+@param ub        Pointer to loop upper bound
+@param st        Loop stride
+@param nogroup   Flag, 1 if nogroup clause specified, 0 otherwise
+@param sched     Schedule specified 0/1/2 for none/grainsize/num_tasks
+@param grainsize Schedule value if specified
+@param task_dup  Task duplication routine
+
+Execute the taskloop construct.
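+
+For illustration only (names here are hypothetical), a compiler might lower
+@code
+    #pragma omp taskloop num_tasks(n)
+    for( i = lb; i <= ub; i += st ) ...
+@endcode
+into a pattern task created via __kmpc_omp_task_alloc() whose privates hold the
+bounds, followed by a call of the form
+@code
+    __kmpc_taskloop(loc, gtid, task, 1, &task->lb, &task->ub, st,
+                    0 /*nogroup*/, 2 /*num_tasks*/, n, (void*)&task_dup);
+@endcode
+The kmp_taskloop.c test below emulates exactly this sequence by hand.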
+*/ +void +__kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val, + kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, + int nogroup, int sched, kmp_uint64 grainsize, void *task_dup ) +{ + kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(task); + KMP_DEBUG_ASSERT( task != NULL ); + + KA_TRACE(10, ("__kmpc_taskloop(enter): T#%d, pattern task %p, lb %lld ub %lld st %lld, grain %llu(%d)\n", + gtid, taskdata, *lb, *ub, st, grainsize, sched)); + + // check if clause value first + if( if_val == 0 ) { // if(0) specified, mark task as serial + taskdata->td_flags.task_serial = 1; + taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied + } + if( nogroup == 0 ) { + __kmpc_taskgroup( loc, gtid ); + } + + if( 1 /* AC: use some heuristic here to choose task scheduling method */ ) { + __kmp_taskloop_linear( loc, gtid, task, lb, ub, st, sched, grainsize, task_dup ); + } + + if( nogroup == 0 ) { + __kmpc_end_taskgroup( loc, gtid ); + } + KA_TRACE(10, ("__kmpc_taskloop(exit): T#%d\n", gtid)); +} + #endif Index: openmp/trunk/runtime/test/tasking/kmp_taskloop.c =================================================================== --- openmp/trunk/runtime/test/tasking/kmp_taskloop.c +++ openmp/trunk/runtime/test/tasking/kmp_taskloop.c @@ -0,0 +1,158 @@ +// RUN: %libomp-compile-and-run +#include +#include +#include "omp_my_sleep.h" + +#define N 4 +#define GRAIN 10 +#define STRIDE 3 + +// globals +int th_counter[N]; +int counter; + + +// Compiler-generated code (emulation) +typedef struct ident { + void* dummy; +} ident_t; + +typedef struct shar { + int(*pth_counter)[N]; + int *pcounter; + int *pj; +} *pshareds; + +typedef struct task { + pshareds shareds; + int(* routine)(int,struct task*); + int part_id; +// privates: + unsigned long long lb; // library always uses ULONG + unsigned long long ub; + int st; + int last; + int i; + int j; + int th; +} *ptask, kmp_task_t; + +typedef int(* task_entry_t)( int, ptask ); + +void +__task_dup_entry(ptask task_dst, ptask task_src, int lastpriv) +{ +// setup lastprivate flag + task_dst->last = lastpriv; +// could be constructor calls here... 
+} + + +// OpenMP RTL interfaces +typedef unsigned long long kmp_uint64; +typedef long long kmp_int64; + +#ifdef __cplusplus +extern "C" { +#endif +void +__kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val, + kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, + int nogroup, int sched, kmp_int64 grainsize, void *task_dup ); +ptask +__kmpc_omp_task_alloc( ident_t *loc, int gtid, int flags, + size_t sizeof_kmp_task_t, size_t sizeof_shareds, + task_entry_t task_entry ); +void __kmpc_atomic_fixed4_add(void *id_ref, int gtid, int * lhs, int rhs); +int __kmpc_global_thread_num(void *id_ref); +#ifdef __cplusplus +} +#endif + + +// User's code +int task_entry(int gtid, ptask task) +{ + pshareds pshar = task->shareds; + for( task->i = task->lb; task->i <= (int)task->ub; task->i += task->st ) { + task->th = omp_get_thread_num(); + __kmpc_atomic_fixed4_add(NULL,gtid,pshar->pcounter,1); + __kmpc_atomic_fixed4_add(NULL,gtid,&((*pshar->pth_counter)[task->th]),1); + task->j = task->i; + } + my_sleep( 0.1 ); // sleep 100 ms in order to allow other threads to steal tasks + if( task->last ) { + *(pshar->pj) = task->j; // lastprivate + } + return 0; +} + +int main() +{ + int i, j, gtid = __kmpc_global_thread_num(NULL); + ptask task; + pshareds psh; + omp_set_dynamic(0); + counter = 0; + for( i=0; ishareds; + psh->pth_counter = &th_counter; + psh->pcounter = &counter; + psh->pj = &j; + task->lb = 0; + task->ub = N*GRAIN*STRIDE-2; + task->st = STRIDE; + + __kmpc_taskloop( + NULL, // location + gtid, // gtid + task, // task structure + 1, // if clause value + &task->lb, // lower bound + &task->ub, // upper bound + STRIDE, // loop increment + 0, // 1 if nogroup specified + 2, // schedule type: 0-none, 1-grainsize, 2-num_tasks + N, // schedule value (ignored for type 0) + (void*)&__task_dup_entry // tasks duplication routine + ); + } // end master + } // end parallel +// check results + if( j != N*GRAIN*STRIDE-STRIDE ) { + printf("Error in lastprivate, %d != %d\n",j,N*GRAIN*STRIDE-STRIDE); + return 1; + } + if( counter != N*GRAIN ) { + printf("Error, counter %d != %d\n",counter,N*GRAIN); + return 1; + } + for( i=0; i