Index: runtime/src/kmp.h =================================================================== --- runtime/src/kmp.h +++ runtime/src/kmp.h @@ -334,10 +334,12 @@ #if OMP_45_ENABLED /* static with chunk adjustment (e.g., simd) */ kmp_sch_static_balanced_chunked = 45, + kmp_sch_guided_simd = 46, /**< guided with chunk adjustment */ + kmp_sch_runtime_simd = 47, /**< runtime with chunk adjustment */ #endif /* accessible only through KMP_SCHEDULE environment variable */ - kmp_sch_upper = 46, /**< upper bound for unordered values */ + kmp_sch_upper = 48, /**< upper bound for unordered values */ kmp_ord_lower = 64, /**< lower bound for ordered values, must be power of 2 */ kmp_ord_static_chunked = 65, Index: runtime/src/kmp_dispatch.cpp =================================================================== --- runtime/src/kmp_dispatch.cpp +++ runtime/src/kmp_dispatch.cpp @@ -681,6 +681,35 @@ schedule = kmp_sch_guided_iterative_chunked; KMP_WARNING(DispatchManyThreads); } + if (schedule == kmp_sch_runtime_simd) { + // compiler provides simd_width in the chunk parameter + schedule = team->t.t_sched.r_sched_type; + // Detail the schedule if needed (global controls are differentiated + // appropriately) + if (schedule == kmp_sch_static || schedule == kmp_sch_auto || + schedule == __kmp_static) { + schedule = kmp_sch_static_balanced_chunked; + } else { + if (schedule == kmp_sch_guided_chunked || schedule == __kmp_guided) { + schedule = kmp_sch_guided_simd; + } + chunk = team->t.t_sched.chunk * chunk; + } +#if USE_ITT_BUILD + cur_chunk = chunk; +#endif +#ifdef KMP_DEBUG + { + const char *buff; + // create format specifiers before the debug output + buff = __kmp_str_format("__kmp_dispatch_init: T#%%d new: schedule:%%d" + " chunk:%%%s\n", + traits_t::spec); + KD_TRACE(10, (buff, gtid, schedule, chunk)); + __kmp_str_free(&buff); + } +#endif + } pr->u.p.parm1 = chunk; } KMP_ASSERT2((kmp_sch_lower < schedule && schedule < kmp_sch_upper), @@ -878,7 +907,21 @@ } break; } // case - case 
kmp_sch_guided_iterative_chunked: { + case kmp_sch_static_balanced_chunked: { + // similar to balanced, but chunk adjusted to multiple of simd width + T nth = th->th.th_team_nproc; + KD_TRACE(100, ("__kmp_dispatch_init: T#%d runtime(simd:static)" + " -> falling-through to static_greedy\n", + gtid)); + schedule = kmp_sch_static_greedy; + if (nth > 1) + pr->u.p.parm1 = ((tc + nth - 1) / nth + chunk - 1) & ~(chunk - 1); + else + pr->u.p.parm1 = tc; + break; + } // case + case kmp_sch_guided_iterative_chunked: + case kmp_sch_guided_simd: { T nproc = th->th.th_team_nproc; KD_TRACE(100, ("__kmp_dispatch_init: T#%d kmp_sch_guided_iterative_chunked" " case\n", @@ -1140,6 +1183,7 @@ break; case kmp_sch_guided_iterative_chunked: case kmp_sch_guided_analytical_chunked: + case kmp_sch_guided_simd: schedtype = 2; break; default: @@ -1991,6 +2035,89 @@ } // case break; + case kmp_sch_guided_simd: { + // same as iterative but curr-chunk adjusted to be multiple of given + // chunk + T chunk = pr->u.p.parm1; + KD_TRACE(100, ("__kmp_dispatch_next: T#%d kmp_sch_guided_simd case\n", + gtid)); + trip = pr->u.p.tc; + // Start atomic part of calculations + while (1) { + ST remaining; // signed, because can be < 0 + init = sh->u.s.iteration; // shared value + remaining = trip - init; + if (remaining <= 0) { // AC: need to compare with 0 first + status = 0; // nothing to do, don't try atomic op + break; + } + KMP_DEBUG_ASSERT(init % chunk == 0); + // compare with K*nproc*(chunk+1), K=2 by default + if ((T)remaining < pr->u.p.parm2) { + // use dynamic-style shcedule + // atomically inrement iterations, get old value + init = test_then_add((ST *)&sh->u.s.iteration, (ST)chunk); + remaining = trip - init; + if (remaining <= 0) { + status = 0; // all iterations got by other threads + } else { + // got some iterations to work on + status = 1; + if ((T)remaining > chunk) { + limit = init + chunk - 1; + } else { + last = 1; // the last chunk + limit = init + remaining - 1; + } // if + } // if + 
break; + } // if + // divide by K*nproc + UT span = remaining * (*(double *)&pr->u.p.parm3); + UT rem = span % chunk; + if (rem) // adjust so that span%chunk == 0 + span += chunk - rem; + limit = init + span; + if (compare_and_swap((ST *)&sh->u.s.iteration, (ST)init, + (ST)limit)) { + // CAS was successful, chunk obtained + status = 1; + --limit; + break; + } // if + } // while + if (status != 0) { + start = pr->u.p.lb; + incr = pr->u.p.st; + if (p_st != NULL) + *p_st = incr; + *p_lb = start + init * incr; + *p_ub = start + limit * incr; + if (pr->ordered) { + pr->u.p.ordered_lower = init; + pr->u.p.ordered_upper = limit; +#ifdef KMP_DEBUG + { + const char *buff; + // create format specifiers before the debug output + buff = __kmp_str_format("__kmp_dispatch_next: T#%%d " + "ordered_lower:%%%s ordered_upper:%%%s\n", + traits_t::spec, traits_t::spec); + KD_TRACE(1000, (buff, gtid, pr->u.p.ordered_lower, + pr->u.p.ordered_upper)); + __kmp_str_free(&buff); + } +#endif + } // if + } else { + *p_lb = 0; + *p_ub = 0; + if (p_st != NULL) + *p_st = 0; + } // if + } // case + break; + case kmp_sch_guided_analytical_chunked: { T chunkspec = pr->u.p.parm1; UT chunkIdx; Index: runtime/src/kmp_runtime.cpp =================================================================== --- runtime/src/kmp_runtime.cpp +++ runtime/src/kmp_runtime.cpp @@ -2744,7 +2744,7 @@ __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std - kmp_sched_lower - 2]; } - if (kind == kmp_sched_auto) { + if (kind == kmp_sched_auto || chunk < 1) { // ignore parameter chunk for schedule auto thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK; } else { Index: runtime/test/worksharing/for/kmp_sch_simd_guided.c =================================================================== --- runtime/test/worksharing/for/kmp_sch_simd_guided.c +++ runtime/test/worksharing/for/kmp_sch_simd_guided.c @@ -0,0 +1,410 @@ +// RUN: %libomp-compile-and-run +/* + Test for the 'schedule(simd:guided)' clause. 
+ Compiler needs to generate a dynamic dispatching and pass the schedule + value 46 to the OpenMP RTL. Test uses numerous loop parameter combinations. +*/ +#include +#include + +#if defined(WIN32) || defined(_WIN32) +#include +#define delay() Sleep(1); +#else +#include +#define delay() usleep(10); +#endif + +// uncomment for debug diagnostics: +//#define DEBUG + +#define SIMD_LEN 4 + +// --------------------------------------------------------------------------- +// Various definitions copied from OpenMP RTL +enum sched { + kmp_sch_static_balanced_chunked = 45, + kmp_sch_guided_simd = 46, + kmp_sch_runtime_simd = 47, +}; +typedef unsigned u32; +typedef long long i64; +typedef unsigned long long u64; +typedef struct { + int reserved_1; + int flags; + int reserved_2; + int reserved_3; + char *psource; +} id; + +extern int __kmpc_global_thread_num(id*); +extern void __kmpc_barrier(id*, int gtid); +extern void __kmpc_dispatch_init_4(id*, int, enum sched, int, int, int, int); +extern void __kmpc_dispatch_init_8(id*, int, enum sched, i64, i64, i64, i64); +extern int __kmpc_dispatch_next_4(id*, int, void*, void*, void*, void*); +extern int __kmpc_dispatch_next_8(id*, int, void*, void*, void*, void*); +// End of definitions copied from OpenMP RTL. 
+// --------------------------------------------------------------------------- +static id loc = {0, 2, 0, 0, ";file;func;0;0;;"}; + +// --------------------------------------------------------------------------- +int run_loop_64(i64 loop_lb, i64 loop_ub, i64 loop_st, int loop_chunk) { + int err = 0; + static int volatile loop_sync = 0; + i64 lb; // Chunk lower bound + i64 ub; // Chunk upper bound + i64 st; // Chunk stride + int rc; + int tid = omp_get_thread_num(); + int gtid = tid; + int last; +#if DEBUG + printf("run_loop_<%d>(lb=%d, ub=%d, st=%d, ch=%d)\n", + (int)sizeof(i64), gtid, tid, + (int)loop_lb, (int)loop_ub, (int)loop_st, loop_chunk); +#endif + // Don't test degenerate cases that should have been discovered by codegen + if (loop_st == 0) + return 0; + if (loop_st > 0 ? loop_lb > loop_ub : loop_lb < loop_ub) + return 0; + + __kmpc_dispatch_init_8(&loc, gtid, kmp_sch_guided_simd, + loop_lb, loop_ub, loop_st, loop_chunk); + if (tid == 0) { + // Let the master thread handle the chunks alone + int chunk; // No of current chunk + i64 next_lb; // Lower bound of the next chunk + i64 last_ub; // Upper bound of the last processed chunk + u64 cur; // Number of interations in current chunk + u64 max; // Max allowed iterations for current chunk + int undersized = 0; + + chunk = 0; + next_lb = loop_lb; + max = (loop_ub - loop_lb) / loop_st + 1; + // The first chunk can consume all iterations + while (__kmpc_dispatch_next_8(&loc, gtid, &last, &lb, &ub, &st)) { + ++ chunk; +#if DEBUG + printf("chunk=%d, lb=%d, ub=%d\n", chunk, (int)lb, (int)ub); +#endif + // Check if previous chunk (it is not the final chunk) is undersized + if (undersized) { + printf("Error with chunk %d\n", chunk); + err++; + } + // Check lower and upper bounds + if (lb != next_lb) { + printf("Error with lb %d, %d, ch %d\n", (int)lb, (int)next_lb, chunk); + err++; + } + if (loop_st > 0) { + if (!(ub <= loop_ub)) { + printf("Error with ub %d, %d, ch %d\n", (int)ub, (int)loop_ub, chunk); + err++; + } 
+ if (!(lb <= ub)) { + printf("Error with bounds %d, %d, %d\n", (int)lb, (int)ub, chunk); + err++; + } + } else { + if (!(ub >= loop_ub)) { + printf("Error with ub %d, %d, %d\n", (int)ub, (int)loop_ub, chunk); + err++; + } + if (!(lb >= ub)) { + printf("Error with bounds %d, %d, %d\n", (int)lb, (int)ub, chunk); + err++; + } + }; // if + // Stride should not change + if (!(st == loop_st)) { + printf("Error with st %d, %d, ch %d\n", (int)st, (int)loop_st, chunk); + err++; + } + cur = (ub - lb) / loop_st + 1; + // Guided scheduling uses FP computations, so current chunk may + // be a bit bigger (+1) than allowed maximum + if (!(cur <= max + 1)) { + printf("Error with iter %d, %d\n", cur, max); + err++; + } + // Update maximum for the next chunk + if (cur < max) + max = cur; + next_lb = ub + loop_st; + last_ub = ub; + undersized = (cur < loop_chunk); + }; // while + // Must have at least one chunk + if (!(chunk > 0)) { + printf("Error with chunk %d\n", chunk); + err++; + } + // Must have the right last iteration index + if (loop_st > 0) { + if (!(last_ub <= loop_ub)) { + printf("Error with last1 %d, %d, ch %d\n", + (int)last_ub, (int)loop_ub, chunk); + err++; + } + if (!(last_ub + loop_st > loop_ub)) { + printf("Error with last2 %d, %d, %d, ch %d\n", + (int)last_ub, (int)loop_st, (int)loop_ub, chunk); + err++; + } + } else { + if (!(last_ub >= loop_ub)) { + printf("Error with last1 %d, %d, ch %d\n", + (int)last_ub, (int)loop_ub, chunk); + err++; + } + if (!(last_ub + loop_st < loop_ub)) { + printf("Error with last2 %d, %d, %d, ch %d\n", + (int)last_ub, (int)loop_st, (int)loop_ub, chunk); + err++; + } + }; // if + // Let non-master threads go + loop_sync = 1; + } else { + int i; + // Workers wait for master thread to finish, then call __kmpc_dispatch_next + for (i = 0; i < 1000000; ++ i) { + if (loop_sync != 0) { + break; + }; // if + }; // for i + while (loop_sync == 0) { + delay(); + }; // while + // At this moment we do not have any more chunks -- all the chunks 
already + // processed by master thread + rc = __kmpc_dispatch_next_8(&loc, gtid, &last, &lb, &ub, &st); + if (rc) { + printf("Error return value\n"); + err++; + } + }; // if + + __kmpc_barrier(&loc, gtid); + if (tid == 0) { + loop_sync = 0; // Restore original state +#if DEBUG + printf("run_loop_64(): at the end\n"); +#endif + }; // if + __kmpc_barrier(&loc, gtid); + return err; +} // run_loop + +// --------------------------------------------------------------------------- +int run_loop_32(int loop_lb, int loop_ub, int loop_st, int loop_chunk) { + int err = 0; + static int volatile loop_sync = 0; + int lb; // Chunk lower bound + int ub; // Chunk upper bound + int st; // Chunk stride + int rc; + int tid = omp_get_thread_num(); + int gtid = tid; + int last; +#if DEBUG + printf("run_loop_<%d>(lb=%d, ub=%d, st=%d, ch=%d)\n", + (int)sizeof(int), gtid, tid, + (int)loop_lb, (int)loop_ub, (int)loop_st, loop_chunk); +#endif + // Don't test degenerate cases that should have been discovered by codegen + if (loop_st == 0) + return 0; + if (loop_st > 0 ? 
loop_lb > loop_ub : loop_lb < loop_ub) + return 0; + + __kmpc_dispatch_init_4(&loc, gtid, kmp_sch_guided_simd, + loop_lb, loop_ub, loop_st, loop_chunk); + if (tid == 0) { + // Let the master thread handle the chunks alone + int chunk; // No of current chunk + int next_lb; // Lower bound of the next chunk + int last_ub; // Upper bound of the last processed chunk + u64 cur; // Number of interations in current chunk + u64 max; // Max allowed iterations for current chunk + int undersized = 0; + + chunk = 0; + next_lb = loop_lb; + max = (loop_ub - loop_lb) / loop_st + 1; + // The first chunk can consume all iterations + while (__kmpc_dispatch_next_4(&loc, gtid, &last, &lb, &ub, &st)) { + ++ chunk; +#if DEBUG + printf("chunk=%d, lb=%d, ub=%d\n", chunk, (int)lb, (int)ub); +#endif + // Check if previous chunk (it is not the final chunk) is undersized + if (undersized) { + printf("Error with chunk %d\n", chunk); + err++; + } + // Check lower and upper bounds + if (lb != next_lb) { + printf("Error with lb %d, %d, ch %d\n", (int)lb, (int)next_lb, chunk); + err++; + } + if (loop_st > 0) { + if (!(ub <= loop_ub)) { + printf("Error with ub %d, %d, ch %d\n", (int)ub, (int)loop_ub, chunk); + err++; + } + if (!(lb <= ub)) { + printf("Error with bounds %d, %d, %d\n", (int)lb, (int)ub, chunk); + err++; + } + } else { + if (!(ub >= loop_ub)) { + printf("Error with ub %d, %d, %d\n", (int)ub, (int)loop_ub, chunk); + err++; + } + if (!(lb >= ub)) { + printf("Error with bounds %d, %d, %d\n", (int)lb, (int)ub, chunk); + err++; + } + }; // if + // Stride should not change + if (!(st == loop_st)) { + printf("Error with st %d, %d, ch %d\n", (int)st, (int)loop_st, chunk); + err++; + } + cur = (ub - lb) / loop_st + 1; + // Guided scheduling uses FP computations, so current chunk may + // be a bit bigger (+1) than allowed maximum + if (!(cur <= max + 1)) { + printf("Error with iter %d, %d\n", cur, max); + err++; + } + // Update maximum for the next chunk + if (cur < max) + max = cur; + next_lb = 
ub + loop_st; + last_ub = ub; + undersized = (cur < loop_chunk); + }; // while + // Must have at least one chunk + if (!(chunk > 0)) { + printf("Error with chunk %d\n", chunk); + err++; + } + // Must have the right last iteration index + if (loop_st > 0) { + if (!(last_ub <= loop_ub)) { + printf("Error with last1 %d, %d, ch %d\n", + (int)last_ub, (int)loop_ub, chunk); + err++; + } + if (!(last_ub + loop_st > loop_ub)) { + printf("Error with last2 %d, %d, %d, ch %d\n", + (int)last_ub, (int)loop_st, (int)loop_ub, chunk); + err++; + } + } else { + if (!(last_ub >= loop_ub)) { + printf("Error with last1 %d, %d, ch %d\n", + (int)last_ub, (int)loop_ub, chunk); + err++; + } + if (!(last_ub + loop_st < loop_ub)) { + printf("Error with last2 %d, %d, %d, ch %d\n", + (int)last_ub, (int)loop_st, (int)loop_ub, chunk); + err++; + } + }; // if + // Let non-master threads go + loop_sync = 1; + } else { + int i; + // Workers wait for master thread to finish, then call __kmpc_dispatch_next + for (i = 0; i < 1000000; ++ i) { + if (loop_sync != 0) { + break; + }; // if + }; // for i + while (loop_sync == 0) { + delay(); + }; // while + // At this moment we do not have any more chunks -- all the chunks already + // processed by the master thread + rc = __kmpc_dispatch_next_4(&loc, gtid, &last, &lb, &ub, &st); + if (rc) { + printf("Error return value\n"); + err++; + } + }; // if + + __kmpc_barrier(&loc, gtid); + if (tid == 0) { + loop_sync = 0; // Restore original state +#if DEBUG + printf("run_loop<>(): at the end\n"); +#endif + }; // if + __kmpc_barrier(&loc, gtid); + return err; +} // run_loop + +// --------------------------------------------------------------------------- +int run_64(int num_th) +{ + int err = 0; +#pragma omp parallel num_threads(num_th) + { + int chunk; + i64 st, lb, ub; + for (chunk = SIMD_LEN; chunk <= 3*SIMD_LEN; chunk += SIMD_LEN) { + for (st = 1; st <= 3; ++ st) { + for (lb = -3 * num_th * st; lb <= 3 * num_th * st; ++ lb) { + for (ub = lb; ub < lb + num_th * 
(chunk+1) * st; ++ ub) { + err += run_loop_64(lb, ub, st, chunk); + err += run_loop_64(ub, lb, -st, chunk); + }; // for ub + }; // for lb + }; // for st + }; // for chunk + } + return err; +} // run_all + +int run_32(int num_th) +{ + int err = 0; +#pragma omp parallel num_threads(num_th) + { + int chunk, st, lb, ub; + for (chunk = SIMD_LEN; chunk <= 3*SIMD_LEN; chunk += SIMD_LEN) { + for (st = 1; st <= 3; ++ st) { + for (lb = -3 * num_th * st; lb <= 3 * num_th * st; ++ lb) { + for (ub = lb; ub < lb + num_th * (chunk+1) * st; ++ ub) { + err += run_loop_32(lb, ub, st, chunk); + err += run_loop_32(ub, lb, -st, chunk); + }; // for ub + }; // for lb + }; // for st + }; // for chunk + } + return err; +} // run_all + +// --------------------------------------------------------------------------- +int main() +{ + int n, err = 0; + for (n = 1; n <= 4; ++ n) { + err += run_32(n); + err += run_64(n); + }; // for n + if (err) + printf("failed with %d errors\n", err); + else + printf("passed\n"); + return err; +} Index: runtime/test/worksharing/for/kmp_sch_simd_runtime_api.c =================================================================== --- runtime/test/worksharing/for/kmp_sch_simd_runtime_api.c +++ runtime/test/worksharing/for/kmp_sch_simd_runtime_api.c @@ -0,0 +1,221 @@ +// RUN: %libomp-compile-and-run + +// The test checks schedule(simd:runtime) +// in combination with omp_set_schedule() +#include +#include +#include + +#if defined(WIN32) || defined(_WIN32) +#include +#define delay() Sleep(1); +#define seten(a,b,c) _putenv_s((a),(b)) +#else +#include +#define delay() usleep(10); +#define seten(a,b,c) setenv((a),(b),(c)) +#endif + +#define SIMD_LEN 4 +int err = 0; + +// --------------------------------------------------------------------------- +// Various definitions copied from OpenMP RTL. 
+enum sched { + kmp_sch_static_balanced_chunked = 45, + kmp_sch_guided_simd = 46, + kmp_sch_runtime_simd = 47, +}; +typedef unsigned u32; +typedef long long i64; +typedef unsigned long long u64; +typedef struct { + int reserved_1; + int flags; + int reserved_2; + int reserved_3; + char *psource; +} id; + +#ifdef __cplusplus +extern "C" { +#endif + int __kmpc_global_thread_num(id*); + void __kmpc_barrier(id*, int gtid); + void __kmpc_dispatch_init_4(id*, int, enum sched, int, int, int, int); + void __kmpc_dispatch_init_8(id*, int, enum sched, i64, i64, i64, i64); + int __kmpc_dispatch_next_4(id*, int, void*, void*, void*, void*); + int __kmpc_dispatch_next_8(id*, int, void*, void*, void*, void*); +#ifdef __cplusplus +} // extern "C" +#endif +// End of definitions copied from OpenMP RTL. +// --------------------------------------------------------------------------- +static id loc = {0, 2, 0, 0, ";file;func;0;0;;"}; + +// --------------------------------------------------------------------------- +void +run_loop( + int loop_lb, // Loop lower bound. + int loop_ub, // Loop upper bound. + int loop_st, // Loop stride. + int lchunk +) { + static int volatile loop_sync = 0; + int lb; // Chunk lower bound. + int ub; // Chunk upper bound. + int st; // Chunk stride. + int rc; + int tid = omp_get_thread_num(); + int gtid = __kmpc_global_thread_num(&loc); + int last; + int tc = (loop_ub - loop_lb) / loop_st + 1; + int ch; + int no_chunk = 0; + if (lchunk == 0) { + no_chunk = 1; + lchunk = 1; + } + ch = lchunk * SIMD_LEN; +#if _DEBUG > 1 + printf("run_loop gtid %d tid %d (lb=%d, ub=%d, st=%d, ch=%d)\n", + gtid, tid, (int)loop_lb, (int)loop_ub, (int)loop_st, lchunk); +#endif + // Don't test degenerate cases that should have been discovered by codegen. + if (loop_st == 0) + return; + if (loop_st > 0 ? 
loop_lb > loop_ub : loop_lb < loop_ub) + return; + __kmpc_dispatch_init_4(&loc, gtid, kmp_sch_runtime_simd, + loop_lb, loop_ub, loop_st, SIMD_LEN); + { + // Let the master thread handle the chunks alone. + int chunk; // No of current chunk. + int last_ub; // Upper bound of the last processed chunk. + u64 cur; // Number of interations in current chunk. + u64 max; // Max allowed iterations for current chunk. + int undersized = 0; + last_ub = loop_ub; + chunk = 0; + max = (loop_ub - loop_lb) / loop_st + 1; + // The first chunk can consume all iterations. + while (__kmpc_dispatch_next_4(&loc, gtid, &last, &lb, &ub, &st)) { + ++ chunk; +#if _DEBUG + printf("th %d: chunk=%d, lb=%d, ub=%d ch %d\n", + tid, chunk, (int)lb, (int)ub, (int)(ub-lb+1)); +#endif + // Check if previous chunk (it is not the final chunk) is undersized. + if (undersized) + printf("Error with chunk %d, th %d, err %d\n", chunk, tid, ++err); + if (loop_st > 0) { + if (!(ub <= loop_ub)) + printf("Error with ub %d, %d, ch %d, err %d\n", + (int)ub, (int)loop_ub, chunk, ++err); + if (!(lb <= ub)) + printf("Error with bounds %d, %d, %d, err %d\n", + (int)lb, (int)ub, chunk, ++err); + } else { + if (!(ub >= loop_ub)) + printf("Error with ub %d, %d, %d, err %d\n", + (int)ub, (int)loop_ub, chunk, ++err); + if (!(lb >= ub)) + printf("Error with bounds %d, %d, %d, err %d\n", + (int)lb, (int)ub, chunk, ++err); + }; // if + // Stride should not change. + if (!(st == loop_st)) + printf("Error with st %d, %d, ch %d, err %d\n", + (int)st, (int)loop_st, chunk, ++err); + cur = ( ub - lb ) / loop_st + 1; + // Guided scheduling uses FP computations, so current chunk may + // be a bit bigger (+1) than allowed maximum. + if (!( cur <= max + 1)) + printf("Error with iter %d, %d, err %d\n", cur, max, ++err); + // Update maximum for the next chunk. 
+ if (last) { + if (!no_chunk && cur > ch) + printf("Error: too big last chunk %d (%d), tid %d, err %d\n", + (int)cur, ch, tid, ++err); + } else { + if (cur % ch) + printf("Error with chunk %d, %d, ch %d, tid %d, err %d\n", + chunk, (int)cur, ch, tid, ++err); + } + if (cur < max) + max = cur; + last_ub = ub; + undersized = (cur < ch); +#if _DEBUG > 1 + if (last) + printf("under%d cur %d, ch %d, tid %d, ub %d, lb %d, st %d =======\n", + undersized,cur,ch,tid,ub,lb,loop_st); +#endif + } // while + // Must have the right last iteration index. + if (loop_st > 0) { + if (!(last_ub <= loop_ub)) + printf("Error with last1 %d, %d, ch %d, err %d\n", + (int)last_ub, (int)loop_ub, chunk, ++err); + if (last && !(last_ub + loop_st > loop_ub)) + printf("Error with last2 %d, %d, %d, ch %d, err %d\n", + (int)last_ub, (int)loop_st, (int)loop_ub, chunk, ++err); + } else { + if (!(last_ub >= loop_ub)) + printf("Error with last1 %d, %d, ch %d, err %d\n", + (int)last_ub, (int)loop_ub, chunk, ++err); + if (last && !(last_ub + loop_st < loop_ub)) + printf("Error with last2 %d, %d, %d, ch %d, err %d\n", + (int)last_ub, (int)loop_st, (int)loop_ub, chunk, ++err); + } // if + } + __kmpc_barrier(&loc, gtid); +} // run_loop + +int main(int argc, char *argv[]) +{ + int chunk = 0; +// static (no chunk) + omp_set_schedule(omp_sched_static,0); +#pragma omp parallel// num_threads(num_th) + run_loop(0, 26, 1, chunk); + +// auto (chunk should be ignorted) + omp_set_schedule(omp_sched_auto,0); +#pragma omp parallel// num_threads(num_th) + run_loop(0, 26, 1, chunk); + +// static,1 + chunk = 1; + omp_set_schedule(omp_sched_static,1); +#pragma omp parallel// num_threads(num_th) + run_loop(0, 26, 1, chunk); + +// dynamic,1 + omp_set_schedule(omp_sched_dynamic,1); +#pragma omp parallel// num_threads(num_th) + run_loop(0, 26, 1, chunk); + +// guided,1 + omp_set_schedule(omp_sched_guided,1); +#pragma omp parallel// num_threads(num_th) + run_loop(0, 26, 1, chunk); + +// dynamic,0 - use default chunk size 1 + 
omp_set_schedule(omp_sched_dynamic,0); +#pragma omp parallel// num_threads(num_th) + run_loop(0, 26, 1, chunk); + +// guided,0 - use default chunk size 1 + omp_set_schedule(omp_sched_guided,0); +#pragma omp parallel// num_threads(num_th) + run_loop(0, 26, 1, chunk); + + if (err) { + printf("failed, err = %d\n", err); + return 1; + } else { + printf("passed\n"); + return 0; + } +} Index: runtime/test/worksharing/for/kmp_sch_simd_runtime_guided.c =================================================================== --- runtime/test/worksharing/for/kmp_sch_simd_runtime_guided.c +++ runtime/test/worksharing/for/kmp_sch_simd_runtime_guided.c @@ -0,0 +1,196 @@ +// RUN: %libomp-compile +// RUN: env OMP_SCHEDULE=guided %libomp-run +// RUN: env OMP_SCHEDULE=guided,1 %libomp-run 1 +// RUN: env OMP_SCHEDULE=guided,2 %libomp-run 2 +// RUN: env OMP_SCHEDULE=dynamic %libomp-run +// RUN: env OMP_SCHEDULE=dynamic,1 %libomp-run 1 +// RUN: env OMP_SCHEDULE=dynamic,2 %libomp-run 2 +// RUN: env OMP_SCHEDULE=auto %libomp-run + +// The test checks schedule(simd:runtime) +// in combination with OMP_SCHEDULE=guided[,chunk] +#include +#include +#include + +#if defined(WIN32) || defined(_WIN32) +#include +#define delay() Sleep(1); +#define seten(a,b,c) _putenv_s((a),(b)) +#else +#include +#define delay() usleep(10); +#define seten(a,b,c) setenv((a),(b),(c)) +#endif + +#define UBOUND 100 +#define SIMD_LEN 4 +int err = 0; + +// --------------------------------------------------------------------------- +// Various definitions copied from OpenMP RTL. 
+enum sched { + kmp_sch_static_balanced_chunked = 45, + kmp_sch_guided_simd = 46, + kmp_sch_runtime_simd = 47, +}; +typedef unsigned u32; +typedef long long i64; +typedef unsigned long long u64; +typedef struct { + int reserved_1; + int flags; + int reserved_2; + int reserved_3; + char *psource; +} id; + +#ifdef __cplusplus +extern "C" { +#endif + int __kmpc_global_thread_num(id*); + void __kmpc_barrier(id*, int gtid); + void __kmpc_dispatch_init_4(id*, int, enum sched, int, int, int, int); + void __kmpc_dispatch_init_8(id*, int, enum sched, i64, i64, i64, i64); + int __kmpc_dispatch_next_4(id*, int, void*, void*, void*, void*); + int __kmpc_dispatch_next_8(id*, int, void*, void*, void*, void*); +#ifdef __cplusplus +} // extern "C" +#endif +// End of definitions copied from OpenMP RTL. +// --------------------------------------------------------------------------- +static id loc = {0, 2, 0, 0, ";file;func;0;0;;"}; + +// --------------------------------------------------------------------------- +void +run_loop( + int loop_lb, // Loop lower bound. + int loop_ub, // Loop upper bound. + int loop_st, // Loop stride. + int lchunk +) { + static int volatile loop_sync = 0; + int lb; // Chunk lower bound. + int ub; // Chunk upper bound. + int st; // Chunk stride. + int rc; + int tid = omp_get_thread_num(); + int gtid = __kmpc_global_thread_num(&loc); + int last; + int tc = (loop_ub - loop_lb) / loop_st + 1; + int ch; + int no_chunk = 0; + if (lchunk == 0) { + no_chunk = 1; + lchunk = 1; + } + ch = lchunk * SIMD_LEN; +#if _DEBUG > 1 + printf("run_loop gtid %d tid %d (lb=%d, ub=%d, st=%d, ch=%d)\n", + gtid, tid, (int)loop_lb, (int)loop_ub, (int)loop_st, lchunk); +#endif + // Don't test degenerate cases that should have been discovered by codegen. + if (loop_st == 0) + return; + if (loop_st > 0 ? 
loop_lb > loop_ub : loop_lb < loop_ub) + return; + __kmpc_dispatch_init_4(&loc, gtid, kmp_sch_runtime_simd, + loop_lb, loop_ub, loop_st, SIMD_LEN); + { + // Let the master thread handle the chunks alone. + int chunk; // No of current chunk. + int last_ub; // Upper bound of the last processed chunk. + u64 cur; // Number of interations in current chunk. + u64 max; // Max allowed iterations for current chunk. + int undersized = 0; + last_ub = loop_ub; + chunk = 0; + max = (loop_ub - loop_lb) / loop_st + 1; + // The first chunk can consume all iterations. + while (__kmpc_dispatch_next_4(&loc, gtid, &last, &lb, &ub, &st)) { + ++ chunk; +#if _DEBUG + printf("th %d: chunk=%d, lb=%d, ub=%d ch %d\n", + tid, chunk, (int)lb, (int)ub, (int)(ub-lb+1)); +#endif + // Check if previous chunk (it is not the final chunk) is undersized. + if (undersized) + printf("Error with chunk %d, th %d, err %d\n", chunk, tid, ++err); + if (loop_st > 0) { + if (!(ub <= loop_ub)) + printf("Error with ub %d, %d, ch %d, err %d\n", + (int)ub, (int)loop_ub, chunk, ++err); + if (!(lb <= ub)) + printf("Error with bounds %d, %d, %d, err %d\n", + (int)lb, (int)ub, chunk, ++err); + } else { + if (!(ub >= loop_ub)) + printf("Error with ub %d, %d, %d, err %d\n", + (int)ub, (int)loop_ub, chunk, ++err); + if (!(lb >= ub)) + printf("Error with bounds %d, %d, %d, err %d\n", + (int)lb, (int)ub, chunk, ++err); + }; // if + // Stride should not change. + if (!(st == loop_st)) + printf("Error with st %d, %d, ch %d, err %d\n", + (int)st, (int)loop_st, chunk, ++err); + cur = ( ub - lb ) / loop_st + 1; + // Guided scheduling uses FP computations, so current chunk may + // be a bit bigger (+1) than allowed maximum. + if (!( cur <= max + 1)) + printf("Error with iter %d, %d, err %d\n", cur, max, ++err); + // Update maximum for the next chunk. 
+ if (!last && cur % ch) + printf("Error with chunk %d, %d, ch %d, tid %d, err %d\n", + chunk, (int)cur, ch, tid, ++err); + if (last && !no_chunk && cur > ch) + printf("Error: too big last chunk %d (%d), tid %d, err %d\n", + (int)cur, ch, tid, ++err); + if (cur < max) + max = cur; + last_ub = ub; + undersized = (cur < ch); +#if _DEBUG > 1 + if (last) + printf("under%d cur %d, ch %d, tid %d, ub %d, lb %d, st %d =======\n", + undersized,cur,ch,tid,ub,lb,loop_st); +#endif + } // while + // Must have the right last iteration index. + if (loop_st > 0) { + if (!(last_ub <= loop_ub)) + printf("Error with last1 %d, %d, ch %d, err %d\n", + (int)last_ub, (int)loop_ub, chunk, ++err); + if (last && !(last_ub + loop_st > loop_ub)) + printf("Error with last2 %d, %d, %d, ch %d, err %d\n", + (int)last_ub, (int)loop_st, (int)loop_ub, chunk, ++err); + } else { + if (!(last_ub >= loop_ub)) + printf("Error with last1 %d, %d, ch %d, err %d\n", + (int)last_ub, (int)loop_ub, chunk, ++err); + if (last && !(last_ub + loop_st < loop_ub)) + printf("Error with last2 %d, %d, %d, ch %d, err %d\n", + (int)last_ub, (int)loop_st, (int)loop_ub, chunk, ++err); + } // if + } + __kmpc_barrier(&loc, gtid); +} // run_loop + +int main(int argc, char *argv[]) +{ + int chunk = 0; + if (argc > 1) { + // expect chunk size as a parameter + chunk = atoi(argv[1]); + } +#pragma omp parallel //num_threads(num_th) + run_loop(0, UBOUND, 1, chunk); + if (err) { + printf("failed, err = %d\n", err); + return 1; + } else { + printf("passed\n"); + return 0; + } +} Index: runtime/test/worksharing/for/kmp_sch_simd_runtime_static.c =================================================================== --- runtime/test/worksharing/for/kmp_sch_simd_runtime_static.c +++ runtime/test/worksharing/for/kmp_sch_simd_runtime_static.c @@ -0,0 +1,201 @@ +// RUN: %libomp-compile && %libomp-run +// RUN: %libomp-run 1 && %libomp-run 2 + +// The test checks schedule(simd:runtime) +// in combination with OMP_SCHEDULE=static[,chunk] +#include 
// The header names are missing from the #include lines in this copy of the
// file; the set below is what the code in this file actually uses (printf,
// malloc/free/atoi/setenv, strlen/strcpy/strcat, omp_get_thread_num).
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <omp.h>

#if defined(WIN32) || defined(_WIN32)
#include <windows.h>
#define delay() Sleep(1)
#define seten(a,b,c) _putenv_s((a),(b))
#else
#include <unistd.h>
#define delay() usleep(10)
#define seten(a,b,c) setenv((a),(b),(c))
#endif

// SIMD width handed to the runtime as the chunk argument of dispatch_init.
#define SIMD_LEN 4

// Number of failed checks; main() reports failure when it is non-zero.
// NOTE(review): incremented by every thread of the parallel region without
// synchronization.
int err = 0;

// ---------------------------------------------------------------------------
// Various definitions copied from OpenMP RTL.
enum sched {
  kmp_sch_static_balanced_chunked = 45,
  kmp_sch_guided_simd = 46,
  kmp_sch_runtime_simd = 47,
};
typedef unsigned u32;
typedef long long i64;
typedef unsigned long long u64;
typedef struct {
  int reserved_1;
  int flags;
  int reserved_2;
  int reserved_3;
  char *psource;
} id;

#ifdef __cplusplus
extern "C" {
#endif
  int __kmpc_global_thread_num(id*);
  void __kmpc_barrier(id*, int gtid);
  void __kmpc_dispatch_init_4(id*, int, enum sched, int, int, int, int);
  void __kmpc_dispatch_init_8(id*, int, enum sched, i64, i64, i64, i64);
  int __kmpc_dispatch_next_4(id*, int, void*, void*, void*, void*);
  int __kmpc_dispatch_next_8(id*, int, void*, void*, void*, void*);
#ifdef __cplusplus
} // extern "C"
#endif
// End of definitions copied from OpenMP RTL.
// ---------------------------------------------------------------------------
static id loc = {0, 2, 0, 0, ";file;func;0;0;;"};

// ---------------------------------------------------------------------------
// Requests the loop [loop_lb, loop_ub] with stride loop_st from the runtime
// using kmp_sch_runtime_simd, then validates every chunk this thread gets:
//   - chunk bounds stay inside the loop bounds and are ordered by the stride;
//   - the returned stride equals loop_st;
//   - a chunk is at most one iteration larger than the smallest chunk seen;
//   - every non-last chunk is a multiple of ch = lchunk * SIMD_LEN, and the
//     last chunk is not larger than ch when a chunk size was requested;
//   - an undersized chunk is never followed by another chunk.
// Each failed check prints a message and bumps the global err counter.
// lchunk == 0 means "no chunk size given" (treated as 1 for computing ch).
// The function returns early, without reaching the barrier, for a zero
// stride or an empty iteration space.
void
run_loop(
    int loop_lb,   // Loop lower bound.
    int loop_ub,   // Loop upper bound.
    int loop_st,   // Loop stride.
    int lchunk
) {
  int lb;   // Chunk lower bound.
  int ub;   // Chunk upper bound.
  int st;   // Chunk stride.
  int tid = omp_get_thread_num();
  int gtid = __kmpc_global_thread_num(&loc);
  // Written only through the pointer passed to __kmpc_dispatch_next_4; start
  // from 0 so the checks after the loop never read an indeterminate value
  // when this thread receives no chunk at all.
  int last = 0;
  int ch;
  int no_chunk = 0;
  if (lchunk == 0) {
    no_chunk = 1;
    lchunk = 1;
  }
  ch = lchunk * SIMD_LEN;
#if _DEBUG > 1
  printf("run_loop gtid %d tid %d (lb=%d, ub=%d, st=%d, ch=%d)\n",
         gtid, tid, (int)loop_lb, (int)loop_ub, (int)loop_st, lchunk);
#endif
  // Don't test degenerate cases that should have been discovered by codegen.
  // Nothing above divides by loop_st, so the zero-stride guard is effective.
  if (loop_st == 0)
    return;
  if (loop_st > 0 ? loop_lb > loop_ub : loop_lb < loop_ub)
    return;
  __kmpc_dispatch_init_4(&loc, gtid, kmp_sch_runtime_simd,
                         loop_lb, loop_ub, loop_st, SIMD_LEN);
  {
    // Every thread of the enclosing parallel region pulls and checks its own
    // chunks.
    int chunk;    // No of current chunk.
    int last_ub;  // Upper bound of the last processed chunk.
    u64 cur;      // Number of iterations in current chunk.
    u64 max;      // Max allowed iterations for current chunk.
    int undersized = 0;
    last_ub = loop_ub;
    chunk = 0;
    // The first chunk can consume all iterations.
    max = (loop_ub - loop_lb) / loop_st + 1;
    while (__kmpc_dispatch_next_4(&loc, gtid, &last, &lb, &ub, &st)) {
      ++ chunk;
#if _DEBUG
      printf("th %d: chunk=%d, lb=%d, ub=%d ch %d\n",
             tid, chunk, (int)lb, (int)ub, (int)(ub-lb+1));
#endif
      // Check if previous chunk (it is not the final chunk) is undersized.
      if (undersized)
        printf("Error with chunk %d, th %d, err %d\n", chunk, tid, ++err);
      if (loop_st > 0) {
        if (!(ub <= loop_ub))
          printf("Error with ub %d, %d, ch %d, err %d\n",
                 (int)ub, (int)loop_ub, chunk, ++err);
        if (!(lb <= ub))
          printf("Error with bounds %d, %d, %d, err %d\n",
                 (int)lb, (int)ub, chunk, ++err);
      } else {
        if (!(ub >= loop_ub))
          printf("Error with ub %d, %d, %d, err %d\n",
                 (int)ub, (int)loop_ub, chunk, ++err);
        if (!(lb >= ub))
          printf("Error with bounds %d, %d, %d, err %d\n",
                 (int)lb, (int)ub, chunk, ++err);
      } // if
      // Stride should not change.
      if (!(st == loop_st))
        printf("Error with st %d, %d, ch %d, err %d\n",
               (int)st, (int)loop_st, chunk, ++err);
      cur = ( ub - lb ) / loop_st + 1;
      // Guided scheduling uses FP computations, so current chunk may
      // be a bit bigger (+1) than allowed maximum.
      // cur and max are u64; cast them to match the %d conversions.
      if (!( cur <= max + 1))
        printf("Error with iter %d, %d, err %d\n", (int)cur, (int)max, ++err);
      if (last) {
        if (!no_chunk && cur > ch)
          printf("Error: too big last chunk %d (%d), tid %d, err %d\n",
                 (int)cur, ch, tid, ++err);
      } else {
        if (cur % ch)
          printf("Error with chunk %d, %d, ch %d, tid %d, err %d\n",
                 chunk, (int)cur, ch, tid, ++err);
      }
      // Update maximum for the next chunk.
      if (cur < max)
        max = cur;
      last_ub = ub;
      undersized = (cur < ch);
#if _DEBUG > 1
      if (last)
        printf("under%d cur %d, ch %d, tid %d, ub %d, lb %d, st %d =======\n",
               undersized,(int)cur,ch,tid,ub,lb,loop_st);
#endif
    } // while
    // Must have the right last iteration index.
    if (loop_st > 0) {
      if (!(last_ub <= loop_ub))
        printf("Error with last1 %d, %d, ch %d, err %d\n",
               (int)last_ub, (int)loop_ub, chunk, ++err);
      if (last && !(last_ub + loop_st > loop_ub))
        printf("Error with last2 %d, %d, %d, ch %d, err %d\n",
               (int)last_ub, (int)loop_st, (int)loop_ub, chunk, ++err);
    } else {
      if (!(last_ub >= loop_ub))
        printf("Error with last1 %d, %d, ch %d, err %d\n",
               (int)last_ub, (int)loop_ub, chunk, ++err);
      if (last && !(last_ub + loop_st < loop_ub))
        printf("Error with last2 %d, %d, %d, ch %d, err %d\n",
               (int)last_ub, (int)loop_st, (int)loop_ub, chunk, ++err);
    } // if
  }
  __kmpc_barrier(&loc, gtid);
} // run_loop

// Sets OMP_SCHEDULE to "static" or, when a chunk size is given as argv[1],
// to "static,<argv[1]>", then runs run_loop(0, 26, 1, chunk) in a parallel
// region. Returns 1 if any check failed (or the schedule string could not be
// allocated), 0 otherwise.
int main(int argc, char *argv[])
{
  int chunk = 0;
  if (argc > 1) {
    // 8 == strlen("static,") + 1 for the terminating NUL.
    char *buf = malloc(8 + strlen(argv[1]));
    if (buf == NULL) {
      printf("failed, out of memory\n");
      return 1;
    }
    // expect chunk size as a parameter
    chunk = atoi(argv[1]);
    strcpy(buf,"static,");
    strcat(buf,argv[1]);
    seten("OMP_SCHEDULE",buf,1);
    printf("Testing schedule(simd:%s)\n", buf);
    free(buf);
  } else {
    seten("OMP_SCHEDULE","static",1);
    printf("Testing schedule(simd:static)\n");
  }
#pragma omp parallel// num_threads(num_th)
  run_loop(0, 26, 1, chunk);
  if (err) {
    printf("failed, err = %d\n", err);
    return 1;
  } else {
    printf("passed\n");
    return 0;
  }
}