diff --git a/openmp/runtime/src/kmp_dispatch.cpp b/openmp/runtime/src/kmp_dispatch.cpp
--- a/openmp/runtime/src/kmp_dispatch.cpp
+++ b/openmp/runtime/src/kmp_dispatch.cpp
@@ -642,9 +642,10 @@
     break;
   case kmp_sch_static_chunked:
   case kmp_sch_dynamic_chunked:
-    if (pr->u.p.parm1 <= 0) {
+    if (pr->u.p.parm1 <= 0)
       pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
-    }
+    else if (pr->u.p.parm1 > tc)
+      pr->u.p.parm1 = tc;
     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
                    "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
                    gtid));
diff --git a/openmp/runtime/src/kmp_sched.cpp b/openmp/runtime/src/kmp_sched.cpp
--- a/openmp/runtime/src/kmp_sched.cpp
+++ b/openmp/runtime/src/kmp_sched.cpp
@@ -301,7 +301,8 @@
       if (tid < trip_count) {
         *pupper = *plower = *plower + tid * incr;
       } else {
-        *plower = *pupper + incr;
+        // set bounds so non-active threads execute no iterations
+        *plower = *pupper + (incr > 0 ? 1 : -1);
       }
       if (plastiter != NULL)
         *plastiter = (tid == trip_count - 1);
@@ -345,15 +346,28 @@
   }
   case kmp_sch_static_chunked: {
     ST span;
-    if (chunk < 1) {
+    UT nchunks;
+    if (chunk < 1)
       chunk = 1;
-    }
+    else if ((UT)chunk > trip_count)
+      chunk = trip_count;
+    nchunks = (trip_count) / (UT)chunk + (trip_count % (UT)chunk ? 1 : 0);
     span = chunk * incr;
-    *pstride = span * nth;
-    *plower = *plower + (span * tid);
-    *pupper = *plower + span - incr;
+    if (nchunks < nth) {
+      *pstride = span * nchunks;
+      if (tid < nchunks) {
+        *plower = *plower + (span * tid);
+        *pupper = *plower + span - incr;
+      } else {
+        *plower = *pupper + (incr > 0 ? 1 : -1);
+      }
+    } else {
+      *pstride = span * nth;
+      *plower = *plower + (span * tid);
+      *pupper = *plower + span - incr;
+    }
     if (plastiter != NULL)
-      *plastiter = (tid == ((trip_count - 1) / (UT)chunk) % nth);
+      *plastiter = (tid == (nchunks - 1) % nth);
     break;
   }
   case kmp_sch_static_balanced_chunked: {
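For orientation, here is a standalone sketch of the bound computation that the patched kmp_sch_static_chunked case above performs. It is illustrative only and not part of the patch: the function name static_chunked_bounds, the plain int/unsigned types, and the inclusive-upper-bound trip-count formula are stand-ins for the runtime's templated T/UT/ST code.

#include <stdio.h>

// Illustrative stand-in for the patched kmp_sch_static_chunked case:
// clamp the chunk to the trip count, derive the number of chunks, and give
// threads that own no chunk an empty range (lower past upper).
static void static_chunked_bounds(int tid, int nth, int lb, int ub, int incr,
                                  int chunk, int *plower, int *pupper,
                                  int *pstride, int *plastiter) {
  // Inclusive bounds: iterations of "for (i = lb; i <= ub; i += incr)"
  unsigned trip_count = incr > 0 ? (unsigned)(ub - lb) / (unsigned)incr + 1
                                 : (unsigned)(lb - ub) / (unsigned)(-incr) + 1;
  if (chunk < 1)
    chunk = 1;
  else if ((unsigned)chunk > trip_count)
    chunk = (int)trip_count; // never hand out a chunk larger than the loop
  unsigned nchunks =
      trip_count / (unsigned)chunk + (trip_count % (unsigned)chunk ? 1 : 0);
  int span = chunk * incr;
  if (nchunks < (unsigned)nth) {
    // Fewer chunks than threads: only the first nchunks threads get work.
    *pstride = span * (int)nchunks;
    if ((unsigned)tid < nchunks) {
      *plower = lb + span * tid;
      *pupper = *plower + span - incr;
    } else {
      *plower = ub + (incr > 0 ? 1 : -1); // empty range for idle threads
      *pupper = ub;
    }
  } else {
    *pstride = span * nth;
    *plower = lb + span * tid;
    *pupper = *plower + span - incr;
  }
  *plastiter = ((unsigned)tid == (nchunks - 1) % (unsigned)nth);
}

int main(void) {
  // 10 iterations (i = 0, 1, ..., 9) with a chunk far larger than the loop,
  // split across 4 threads: only thread 0 should receive any iterations.
  for (int tid = 0; tid < 4; ++tid) {
    int lower, upper, stride, last;
    static_chunked_bounds(tid, 4, /*lb=*/0, /*ub=*/9, /*incr=*/1,
                          /*chunk=*/1000000, &lower, &upper, &stride, &last);
    printf("tid %d: lower=%d upper=%d stride=%d last=%d\n", tid, lower, upper,
           stride, last);
  }
  return 0;
}

The behavioral change this mirrors is that the chunk is capped at the trip count, so the number of chunks is well defined; a thread whose id is at or beyond nchunks gets lower set past upper (for a positive increment) and therefore executes no iterations, and the last-iteration flag belongs to thread (nchunks - 1) % nth.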
diff --git a/openmp/runtime/test/worksharing/for/omp_for_static_large_chunk.c b/openmp/runtime/test/worksharing/for/omp_for_static_large_chunk.c
new file
--- /dev/null
+++ b/openmp/runtime/test/worksharing/for/omp_for_static_large_chunk.c
@@ -0,0 +1,112 @@
+// RUN: %libomp-compile
+// RUN: env OMP_NUM_THREADS=4 %libomp-run 5 5005 500 1000000000
+// This test fails when compiled with gcc because gcc does not use any
+// runtime interface to calculate the iterations for a static loop schedule;
+// hence, the runtime is never involved.
+// XFAIL: gcc
+//
+// This test makes sure that large chunk sizes are handled correctly,
+// including the internal runtime calculations which incorporate the chunk size.
+#include <stdio.h>
+#include <stdlib.h>
+#include "omp_testsuite.h"
+
+#ifndef DEBUG_OUTPUT
+#define DEBUG_OUTPUT 0
+#endif
+
+// Used in qsort() to compare integers
+int compare_ints(const void *v1, const void *v2) {
+  int i1 = *(const int *)v1;
+  int i2 = *(const int *)v2;
+  return i1 - i2;
+}
+
+int main(int argc, char **argv) {
+  int i, j, lb, ub, stride, nthreads, chunk;
+  int num_iters = 0;
+  int counted_iters = 0;
+  int errs = 0;
+  if (argc != 5) {
+    fprintf(stderr, "error: incorrect number of arguments\n");
+    fprintf(stderr, "usage: %s <lb> <ub> <stride> <chunk>\n", argv[0]);
+    exit(EXIT_FAILURE);
+  }
+  lb = atoi(argv[1]);
+  ub = atoi(argv[2]);
+  stride = atoi(argv[3]);
+  chunk = atoi(argv[4]);
+  nthreads = omp_get_max_threads();
+  if (lb >= ub) {
+    fprintf(stderr, "error: lb must be less than ub\n");
+    exit(EXIT_FAILURE);
+  }
+  if (stride <= 0) {
+    fprintf(stderr, "error: stride must be a positive integer\n");
+    exit(EXIT_FAILURE);
+  }
+  if (chunk <= 0) {
+    fprintf(stderr, "error: chunk must be a positive integer\n");
+    exit(EXIT_FAILURE);
+  }
+  for (i = lb; i < ub; i += stride)
+    num_iters++;
+  // Per-thread record of the iterations each thread performed
+  int *iters = (int *)malloc(sizeof(int) * nthreads * num_iters);
+  // This will be the list of all iterations performed by every thread
+  int *final_iters = (int *)malloc(sizeof(int) * nthreads * num_iters);
+  for (i = 0; i < nthreads * num_iters; ++i) {
+    iters[i] = -1;
+    final_iters[i] = -1;
+  }
+
+  #pragma omp parallel num_threads(nthreads)
+  {
+    int j = 0;
+    int *my_iters = iters + omp_get_thread_num() * num_iters;
+    #pragma omp for schedule(static, chunk)
+    for (i = lb; i < ub; i += stride) {
+      #pragma omp atomic
+      counted_iters++;
+      my_iters[j++] = i;
+    }
+  }
+
+  // Put all iterations into final_iters, then sort them from lowest to highest
+  for (i = 0, j = 0; i < nthreads * num_iters; ++i) {
+    if (iters[i] != -1)
+      final_iters[j++] = iters[i];
+  }
+  if (j != counted_iters) {
+    fprintf(stderr, "error: wrong number of final iterations counted!\n");
+    exit(EXIT_FAILURE);
+  }
+  qsort(final_iters, j, sizeof(int), compare_ints);
+
+  // Check for the right number of iterations
+  if (counted_iters != num_iters) {
+    fprintf(stderr, "error: wrong number of iterations executed. Expected %d "
+                    "but executed %d\n",
+            num_iters, counted_iters);
+    exit(EXIT_FAILURE);
+  }
+
+#if DEBUG_OUTPUT
+  for (i = 0; i < num_iters; ++i)
+    printf("final_iters[%d] = %d\n", i, final_iters[i]);
+#endif
+
+  // Check that the iterations performed were correct
+  for (i = lb, j = 0; i < ub; i += stride, ++j) {
+    if (final_iters[j] != i) {
+      fprintf(stderr,
+              "error: iteration j=%d i=%d is incorrect. Expected %d but found %d\n",
+              j, i, i, final_iters[j]);
+      exit(EXIT_FAILURE);
+    }
+  }
+
+  free(iters);
+  free(final_iters);
+  return EXIT_SUCCESS;
+}
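As a quick hand check of the RUN line above (again a sketch, not part of the test): with lb=5, ub=5005 and stride=500 the loop runs 10 iterations (5, 505, ..., 4505), so under the patched runtime the requested chunk of 1000000000 is capped at 10, there is a single chunk, and with OMP_NUM_THREADS=4 only thread 0 should execute iterations.

#include <stdio.h>

int main(void) {
  // Parameters from the RUN line: lb ub stride chunk, with 4 threads.
  int lb = 5, ub = 5005, stride = 500, nth = 4;
  long long chunk = 1000000000;

  // Trip count computed the same way the test computes num_iters.
  unsigned trip_count = 0;
  for (int i = lb; i < ub; i += stride)
    trip_count++;

  // Chunk clamping and chunk count as in the patched runtime.
  if (chunk > trip_count)
    chunk = trip_count;
  unsigned nchunks =
      trip_count / (unsigned)chunk + (trip_count % (unsigned)chunk ? 1 : 0);

  printf("trip_count=%u clamped_chunk=%lld nchunks=%u\n", trip_count, chunk,
         nchunks);
  printf("threads with work: %u of %d (last iteration on thread %u)\n",
         nchunks < (unsigned)nth ? nchunks : (unsigned)nth, nth,
         (nchunks - 1) % (unsigned)nth);
  return 0;
}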