diff --git a/openmp/runtime/src/kmp_dispatch.cpp b/openmp/runtime/src/kmp_dispatch.cpp
--- a/openmp/runtime/src/kmp_dispatch.cpp
+++ b/openmp/runtime/src/kmp_dispatch.cpp
@@ -643,9 +643,10 @@
     break;
   case kmp_sch_static_chunked:
   case kmp_sch_dynamic_chunked:
-    if (pr->u.p.parm1 <= 0) {
+    if (pr->u.p.parm1 <= 0)
       pr->u.p.parm1 = KMP_DEFAULT_CHUNK;
-    }
+    else if (pr->u.p.parm1 > tc)
+      pr->u.p.parm1 = tc;
     KD_TRACE(100, ("__kmp_dispatch_init_algorithm: T#%d "
                    "kmp_sch_static_chunked/kmp_sch_dynamic_chunked cases\n",
                    gtid));
diff --git a/openmp/runtime/src/kmp_sched.cpp b/openmp/runtime/src/kmp_sched.cpp
--- a/openmp/runtime/src/kmp_sched.cpp
+++ b/openmp/runtime/src/kmp_sched.cpp
@@ -301,7 +301,8 @@
       if (tid < trip_count) {
         *pupper = *plower = *plower + tid * incr;
       } else {
-        *plower = *pupper + incr;
+        // set bounds so non-active threads execute no iterations
+        *plower = *pupper + (incr > 0 ? 1 : -1);
       }
       if (plastiter != NULL)
         *plastiter = (tid == trip_count - 1);
@@ -345,15 +346,28 @@
   }
   case kmp_sch_static_chunked: {
     ST span;
-    if (chunk < 1) {
+    UT nchunks;
+    if (chunk < 1)
       chunk = 1;
-    }
+    else if ((UT)chunk > trip_count)
+      chunk = trip_count;
+    nchunks = (trip_count) / (UT)chunk + (trip_count % (UT)chunk ? 1 : 0);
     span = chunk * incr;
-    *pstride = span * nth;
-    *plower = *plower + (span * tid);
-    *pupper = *plower + span - incr;
+    if (nchunks < nth) {
+      *pstride = span * nchunks;
+      if (tid < nchunks) {
+        *plower = *plower + (span * tid);
+        *pupper = *plower + span - incr;
+      } else {
+        *plower = *pupper + (incr > 0 ? 1 : -1);
+      }
+    } else {
+      *pstride = span * nth;
+      *plower = *plower + (span * tid);
+      *pupper = *plower + span - incr;
+    }
     if (plastiter != NULL)
-      *plastiter = (tid == ((trip_count - 1) / (UT)chunk) % nth);
+      *plastiter = (tid == (nchunks - 1) % nth);
     break;
   }
   case kmp_sch_static_balanced_chunked: {
diff --git a/openmp/runtime/test/worksharing/for/omp_for_static_large_chunk.c b/openmp/runtime/test/worksharing/for/omp_for_static_large_chunk.c
new file mode 100644
--- /dev/null
+++ b/openmp/runtime/test/worksharing/for/omp_for_static_large_chunk.c
@@ -0,0 +1,112 @@
+// RUN: %libomp-compile
+// RUN: env OMP_NUM_THREADS=4 %libomp-run 5 5005 500 1000000000
+// This test fails with gcc because the gcc compiler does not use any
+// runtime interface to calculate the iterations for a static loop schedule;
+// hence, the runtime is never involved.
+// XFAIL: gcc
+//
+// This test makes sure that large chunk sizes are handled correctly,
+// including internal runtime calculations which incorporate the chunk size.
+#include <stdio.h>
+#include <stdlib.h>
+#include "omp_testsuite.h"
+
+#ifndef DEBUG_OUTPUT
+#define DEBUG_OUTPUT 0
+#endif
+
+// Used in qsort() to compare integers
+int compare_ints(const void *v1, const void *v2) {
+  int i1 = *(const int *)v1;
+  int i2 = *(const int *)v2;
+  return i1 - i2;
+}
+
+int main(int argc, char **argv) {
+  int i, j, lb, ub, stride, nthreads, chunk;
+  int num_iters = 0;
+  int counted_iters = 0;
+  int errs = 0;
+  if (argc != 5) {
+    fprintf(stderr, "error: incorrect number of arguments\n");
+    fprintf(stderr, "usage: %s <lb> <ub> <stride> <chunk>\n", argv[0]);
+    exit(EXIT_FAILURE);
+  }
+  lb = atoi(argv[1]);
+  ub = atoi(argv[2]);
+  stride = atoi(argv[3]);
+  chunk = atoi(argv[4]);
+  nthreads = omp_get_max_threads();
+  if (lb >= ub) {
+    fprintf(stderr, "error: lb must be less than ub\n");
+    exit(EXIT_FAILURE);
+  }
+  if (stride <= 0) {
+    fprintf(stderr, "error: stride must be a positive integer\n");
+    exit(EXIT_FAILURE);
+  }
+  if (chunk <= 0) {
+    fprintf(stderr, "error: chunk must be a positive integer\n");
+    exit(EXIT_FAILURE);
+  }
+  for (i = lb; i < ub; i += stride)
+    num_iters++;
+  // Thread-private record of the iterations each thread performed
+  int *iters = (int *)malloc(sizeof(int) * nthreads * num_iters);
+  // This will be the list of all iterations performed by every thread
+  int *final_iters = (int *)malloc(sizeof(int) * nthreads * num_iters);
+  for (i = 0; i < nthreads * num_iters; ++i) {
+    iters[i] = -1;
+    final_iters[i] = -1;
+  }
+
+  #pragma omp parallel num_threads(nthreads)
+  {
+    int j = 0;
+    int *my_iters = iters + omp_get_thread_num() * num_iters;
+    #pragma omp for schedule(static, chunk)
+    for (i = lb; i < ub; i += stride) {
+      #pragma omp atomic
+      counted_iters++;
+      my_iters[j++] = i;
+    }
+  }
+
+  // Put all iterations into final_iters, then sort it from lowest to highest
+  for (i = 0, j = 0; i < nthreads * num_iters; ++i) {
+    if (iters[i] != -1)
+      final_iters[j++] = iters[i];
+  }
+  if (j != counted_iters) {
+    fprintf(stderr, "error: wrong number of final iterations counted!\n");
+    exit(EXIT_FAILURE);
+  }
+  qsort(final_iters, j, sizeof(int), compare_ints);
+
+  // Check for the right number of iterations
+  if (counted_iters != num_iters) {
+    fprintf(stderr, "error: wrong number of iterations executed. Expected %d "
+                    "but executed %d\n",
+            num_iters, counted_iters);
+    exit(EXIT_FAILURE);
+  }
+
+#if DEBUG_OUTPUT
+  for (i = 0; i < num_iters; ++i)
+    printf("final_iters[%d] = %d\n", i, final_iters[i]);
+#endif
+
+  // Check that the iterations performed were correct
+  for (i = lb, j = 0; i < ub; i += stride, ++j) {
+    if (final_iters[j] != i) {
+      fprintf(stderr,
+              "error: iteration j=%d i=%d is incorrect. Expect %d but see %d\n",
+              j, i, i, final_iters[j]);
+      exit(EXIT_FAILURE);
+    }
+  }
+
+  free(iters);
+  free(final_iters);
+  return EXIT_SUCCESS;
+}