Index: openmp/trunk/runtime/src/kmp.h
===================================================================
--- openmp/trunk/runtime/src/kmp.h
+++ openmp/trunk/runtime/src/kmp.h
@@ -2050,8 +2050,10 @@
 // specified, defaults to 0 otherwise
 #endif
 #if OMP_45_ENABLED
-extern kmp_int32 __kmp_max_task_priority; // Set via OMP_MAX_TASK_PRIORITY if
-// specified, defaults to 0 otherwise
+// Set via OMP_MAX_TASK_PRIORITY if specified, defaults to 0 otherwise
+extern kmp_int32 __kmp_max_task_priority;
+// Set via KMP_TASKLOOP_MIN_TASKS if specified, defaults to 0 otherwise
+extern kmp_uint64 __kmp_taskloop_min_tasks;
 #endif
 
 /* NOTE: kmp_taskdata_t and kmp_task_t structures allocated in single block with
Index: openmp/trunk/runtime/src/kmp_global.cpp
===================================================================
--- openmp/trunk/runtime/src/kmp_global.cpp
+++ openmp/trunk/runtime/src/kmp_global.cpp
@@ -301,6 +301,7 @@
 kmp_tasking_mode_t __kmp_tasking_mode = tskm_task_teams;
 #if OMP_45_ENABLED
 kmp_int32 __kmp_max_task_priority = 0;
+kmp_uint64 __kmp_taskloop_min_tasks = 0;
 #endif
 
 /* This check ensures that the compiler is passing the correct data type for the
Index: openmp/trunk/runtime/src/kmp_settings.cpp
===================================================================
--- openmp/trunk/runtime/src/kmp_settings.cpp
+++ openmp/trunk/runtime/src/kmp_settings.cpp
@@ -1167,6 +1167,20 @@
                                               char const *name, void *data) {
   __kmp_stg_print_int(buffer, name, __kmp_max_task_priority);
 } // __kmp_stg_print_max_task_priority
+
+// KMP_TASKLOOP_MIN_TASKS
+// taskloop threashold to switch from recursive to linear tasks creation
+static void __kmp_stg_parse_taskloop_min_tasks(char const *name,
+                                              char const *value, void *data) {
+  int tmp;
+  __kmp_stg_parse_int(name, value, 0, INT_MAX, &tmp);
+  __kmp_taskloop_min_tasks = tmp;
+} // __kmp_stg_parse_taskloop_min_tasks
+
+static void __kmp_stg_print_taskloop_min_tasks(kmp_str_buf_t *buffer,
+                                              char const *name, void *data) {
+  __kmp_stg_print_int(buffer, name, __kmp_taskloop_min_tasks);
+} // __kmp_stg_print_taskloop_min_tasks
 #endif // OMP_45_ENABLED
 
 // -----------------------------------------------------------------------------
@@ -4370,6 +4384,8 @@
 #if OMP_45_ENABLED
     {"OMP_MAX_TASK_PRIORITY", __kmp_stg_parse_max_task_priority,
      __kmp_stg_print_max_task_priority, NULL, 0, 0},
+    {"KMP_TASKLOOP_MIN_TASKS", __kmp_stg_parse_taskloop_min_tasks,
+     __kmp_stg_print_taskloop_min_tasks, NULL, 0, 0},
 #endif
     {"OMP_THREAD_LIMIT", __kmp_stg_parse_all_threads,
      __kmp_stg_print_all_threads, NULL, 0, 0},
Index: openmp/trunk/runtime/src/kmp_tasking.cpp
===================================================================
--- openmp/trunk/runtime/src/kmp_tasking.cpp
+++ openmp/trunk/runtime/src/kmp_tasking.cpp
@@ -3237,6 +3237,7 @@
                      0);
   }
   taskdata->td_alloc_thread = thread;
+  taskdata->td_parent = parent_task;
   taskdata->td_taskgroup =
       parent_task
           ->td_taskgroup; // task inherits the taskgroup from the parent task
@@ -3263,32 +3264,37 @@
   return task;
 }
 
-// Routine optionally generated by th ecompiler for setting the lastprivate flag
+// Routine optionally generated by the compiler for setting the lastprivate flag
 // and calling needed constructors for private/firstprivate objects
 // (used to form taskloop tasks from pattern task)
+// Parameters: dest task, src task, lastprivate flag.
 typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
 
 // __kmp_taskloop_linear: Start tasks of the taskloop linearly
 //
 // loc       Source location information
 // gtid      Global thread ID
-// task      Task with whole loop iteration range
-// lb        Pointer to loop lower bound
-// ub        Pointer to loop upper bound
+// task      Pattern task, exposes the loop iteration range
+// lb        Pointer to loop lower bound in task structure
+// ub        Pointer to loop upper bound in task structure
 // st        Loop stride
-// sched     Schedule specified 0/1/2 for none/grainsize/num_tasks
-// grainsize Schedule value if specified
+// ub_glob   Global upper bound (used for lastprivate check)
+// num_tasks Number of tasks to execute
+// grainsize Number of loop iterations per task
+// extras    Number of chunks with grainsize+1 iterations
+// tc        Iterations count
 // task_dup  Tasks duplication routine
 void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
                            kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
-                           int sched, kmp_uint64 grainsize, void *task_dup) {
+                           kmp_uint64 ub_glob, kmp_uint64 num_tasks,
+                           kmp_uint64 grainsize, kmp_uint64 extras,
+                           kmp_uint64 tc, void *task_dup) {
   KMP_COUNT_BLOCK(OMP_TASKLOOP);
   KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling);
   p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
-  kmp_uint64 tc;
   kmp_uint64 lower = *lb; // compiler provides global bounds here
   kmp_uint64 upper = *ub;
-  kmp_uint64 i, num_tasks = 0, extras = 0;
+  kmp_uint64 i;
   kmp_info_t *thread = __kmp_threads[gtid];
   kmp_taskdata_t *current_task = thread->th.th_current_task;
   kmp_task_t *next_task;
@@ -3298,6 +3304,254 @@
   size_t upper_offset =
       (char *)ub - (char *)task; // remember offset of ub in the task structure
 
+  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
+  KMP_DEBUG_ASSERT(num_tasks > extras);
+  KMP_DEBUG_ASSERT(num_tasks > 0);
+  KA_TRACE(20, ("__kmp_taskloop_linear: T#%d: %lld tasks, grainsize %lld, "
+                "extras %lld, i=%lld,%lld(%d)%lld, dup %p\n", gtid, num_tasks,
+                grainsize, extras, lower, upper, ub_glob, st, task_dup));
+
+  // Launch num_tasks tasks, assign grainsize iterations each task
+  for (i = 0; i < num_tasks; ++i) {
+    kmp_uint64 chunk_minus_1;
+    if (extras == 0) {
+      chunk_minus_1 = grainsize - 1;
+    } else {
+      chunk_minus_1 = grainsize;
+      --extras; // first extras iterations get bigger chunk (grainsize+1)
+    }
+    upper = lower + st * chunk_minus_1;
+    if (i == num_tasks - 1) {
+      // schedule the last task, set lastprivate flag if needed
+      if (st == 1) { // most common case
+        KMP_DEBUG_ASSERT(upper == *ub);
+        if (upper == ub_glob)
+          lastpriv = 1;
+      } else if (st > 0) { // positive loop stride
+        KMP_DEBUG_ASSERT((kmp_uint64)st > *ub - upper);
+        if ((kmp_uint64)st > ub_glob - upper)
+          lastpriv = 1;
+      } else {  // negative loop stride
+        KMP_DEBUG_ASSERT(upper + st < *ub);
+        if (upper - ub_glob < (kmp_uint64)(-st))
+          lastpriv = 1;
+      }
+    }
+    next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
+    // adjust task-specific bounds
+    *(kmp_uint64 *)((char *)next_task + lower_offset) = lower;
+    *(kmp_uint64 *)((char *)next_task + upper_offset) = upper;
+    if (ptask_dup != NULL) // set lastprivate flag, construct fistprivates, etc.
+      ptask_dup(next_task, task, lastpriv);
+    KA_TRACE(40, ("__kmp_taskloop_linear: T#%d; task %p: lower %lld, "
+                  "upper %lld (offsets %p %p)\n",
+                  gtid, next_task, lower, upper, lower_offset, upper_offset));
+    __kmp_omp_task(gtid, next_task, true); // schedule new task
+    lower = upper + st; // adjust lower bound for the next iteration
+  }
+  // free the pattern task and exit
+  __kmp_task_start(gtid, task, current_task); // make internal bookkeeping
+  // do not execute the pattern task, just do internal bookkeeping
+  __kmp_task_finish(gtid, task, current_task);
+}
+
+// Structure to keep taskloop parameters for auxiliary task
+// kept in the shareds of the task structure.
+typedef struct __taskloop_params {
+  kmp_task_t *task;
+  kmp_uint64 *lb;
+  kmp_uint64 *ub;
+  void *task_dup;
+  kmp_int64 st;
+  kmp_uint64 ub_glob;
+  kmp_uint64 num_tasks;
+  kmp_uint64 grainsize;
+  kmp_uint64 extras;
+  kmp_uint64 tc;
+  kmp_uint64 num_t_min;
+} __taskloop_params_t;
+
+void __kmp_taskloop_recur(ident_t *, int, kmp_task_t *, kmp_uint64 *,
+                          kmp_uint64 *, kmp_int64, kmp_uint64, kmp_uint64,
+                          kmp_uint64, kmp_uint64, kmp_uint64, kmp_uint64,
+                          void *);
+
+// Execute part of the the taskloop submitted as a task.
+int __kmp_taskloop_task(int gtid, void *ptask) {
+  __taskloop_params_t *p = (__taskloop_params_t*)((kmp_task_t*)ptask)->shareds;
+  kmp_task_t *task = p->task;
+  kmp_uint64 *lb = p->lb;
+  kmp_uint64 *ub = p->ub;
+  void *task_dup = p->task_dup;
+//  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
+  kmp_int64 st = p->st;
+  kmp_uint64 ub_glob = p->ub_glob;
+  kmp_uint64 num_tasks = p->num_tasks;
+  kmp_uint64 grainsize = p->grainsize;
+  kmp_uint64 extras = p->extras;
+  kmp_uint64 tc = p->tc;
+  kmp_uint64 num_t_min = p->num_t_min;
+#if KMP_DEBUG
+  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
+  KMP_DEBUG_ASSERT(task != NULL);
+  KA_TRACE(20, ("__kmp_taskloop_task: T#%d, task %p: %lld tasks, grainsize"
+                " %lld, extras %lld, i=%lld,%lld(%d), dup %p\n", gtid, taskdata,
+                num_tasks, grainsize, extras, *lb, *ub, st, task_dup));
+#endif
+  KMP_DEBUG_ASSERT(num_tasks*2+1 > num_t_min);
+  if (num_tasks > num_t_min)
+    __kmp_taskloop_recur(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
+                         grainsize, extras, tc, num_t_min, task_dup);
+  else
+    __kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
+                          grainsize, extras, tc, task_dup);
+
+  KA_TRACE(40, ("__kmp_taskloop_task(exit): T#%d\n", gtid));
+  return 0;
+}
+
+// Schedule part of the the taskloop as a task,
+// execute the rest of the the taskloop.
+//
+// loc       Source location information
+// gtid      Global thread ID
+// task      Pattern task, exposes the loop iteration range
+// lb        Pointer to loop lower bound in task structure
+// ub        Pointer to loop upper bound in task structure
+// st        Loop stride
+// ub_glob   Global upper bound (used for lastprivate check)
+// num_tasks Number of tasks to execute
+// grainsize Number of loop iterations per task
+// extras    Number of chunks with grainsize+1 iterations
+// tc        Iterations count
+// num_t_min Threashold to launch tasks recursively
+// task_dup  Tasks duplication routine
+void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
+                          kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
+                          kmp_uint64 ub_glob, kmp_uint64 num_tasks,
+                          kmp_uint64 grainsize, kmp_uint64 extras,
+                          kmp_uint64 tc, kmp_uint64 num_t_min, void *task_dup) {
+#if KMP_DEBUG
+  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
+  KMP_DEBUG_ASSERT(task != NULL);
+  KMP_DEBUG_ASSERT(num_tasks > num_t_min);
+  KA_TRACE(20, ("__kmp_taskloop_recur: T#%d, task %p: %lld tasks, grainsize"
+                " %lld, extras %lld, i=%lld,%lld(%d), dup %p\n", gtid, taskdata,
+                num_tasks, grainsize, extras, *lb, *ub, st, task_dup));
+#endif
+  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
+  kmp_uint64 lower = *lb;
+  kmp_uint64 upper = *ub;
+  kmp_info_t *thread = __kmp_threads[gtid];
+//  kmp_taskdata_t *current_task = thread->th.th_current_task;
+  kmp_task_t *next_task;
+  kmp_int32 lastpriv = 0;
+  size_t lower_offset =
+      (char *)lb - (char *)task; // remember offset of lb in the task structure
+  size_t upper_offset =
+      (char *)ub - (char *)task; // remember offset of ub in the task structure
+
+  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
+  KMP_DEBUG_ASSERT(num_tasks > extras);
+  KMP_DEBUG_ASSERT(num_tasks > 0);
+
+  // split the loop in two halves
+  kmp_uint64 lb1, ub0, tc0, tc1, ext0, ext1;
+  kmp_uint64 gr_size0 = grainsize;
+  kmp_uint64 n_tsk0 = num_tasks >> 1; // num_tasks/2 to execute
+  kmp_uint64 n_tsk1 = num_tasks - n_tsk0; // to schedule as a task
+  if (n_tsk0 <= extras) {
+    gr_size0++; // integrate extras into grainsize
+    ext0 = 0; // no extra iters in 1st half
+    ext1 = extras - n_tsk0; // remaining extras
+    tc0 = gr_size0 * n_tsk0;
+    tc1 = tc - tc0;
+  } else { // n_tsk0 > extras
+    ext1 = 0; // no extra iters in 2nd half
+    ext0 = extras;
+    tc1 = grainsize * n_tsk1;
+    tc0 = tc - tc1;
+  }
+  ub0 = lower + st * (tc0 - 1);
+  lb1 = ub0 + st;
+
+  // create pattern task for 2nd half of the loop
+  next_task = __kmp_task_dup_alloc(thread, task); // duplicate the task
+  // adjust lower bound (upper bound is not changed) for the 2nd half
+  *(kmp_uint64 *)((char *)next_task + lower_offset) = lb1;
+  if (ptask_dup != NULL) // construct fistprivates, etc.
+    ptask_dup(next_task, task, 0);
+  *ub = ub0; // adjust upper bound for the 1st half
+
+  // create auxiliary task for 2nd half of the loop
+  kmp_task_t *new_task =
+      __kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void*),
+                            sizeof(__taskloop_params_t), &__kmp_taskloop_task);
+  __taskloop_params_t * p = (__taskloop_params_t *)new_task->shareds;
+  p->task = next_task;
+  p->lb = (kmp_uint64 *)((char *)next_task + lower_offset);
+  p->ub = (kmp_uint64 *)((char *)next_task + upper_offset);
+  p->task_dup = task_dup;
+  p->st = st;
+  p->ub_glob = ub_glob;
+  p->num_tasks = n_tsk1;
+  p->grainsize = grainsize;
+  p->extras = ext1;
+  p->tc = tc1;
+  p->num_t_min = num_t_min;
+  __kmp_omp_task(gtid, new_task, true); // schedule new task
+
+  // execute the 1st half of current subrange
+  if (n_tsk0 > num_t_min)
+    __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0,
+                         gr_size0, ext0, tc0, num_t_min, task_dup);
+  else
+    __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0,
+                          gr_size0, ext0, tc0, task_dup);
+
+  KA_TRACE(40, ("__kmpc_taskloop_recur(exit): T#%d\n", gtid));
+}
+
+/*!
+@ingroup TASKING
+@param loc       Source location information
+@param gtid      Global thread ID
+@param task      Task structure
+@param if_val    Value of the if clause
+@param lb        Pointer to loop lower bound in task structure
+@param ub        Pointer to loop upper bound in task structure
+@param st        Loop stride
+@param nogroup   Flag, 1 if nogroup clause specified, 0 otherwise
+@param sched     Schedule specified 0/1/2 for none/grainsize/num_tasks
+@param grainsize Schedule value if specified
+@param task_dup  Tasks duplication routine
+
+Execute the taskloop construct.
+*/
+void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
+                     kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup,
+                     int sched, kmp_uint64 grainsize, void *task_dup) {
+  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
+  KMP_DEBUG_ASSERT(task != NULL);
+
+  KA_TRACE(20, ("__kmpc_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, "
+                "grain %llu(%d), dup %p\n", gtid, taskdata, *lb, *ub, st,
+                grainsize, sched, task_dup));
+
+  if (nogroup == 0)
+    __kmpc_taskgroup(loc, gtid);
+
+  // =========================================================================
+  // calculate loop parameters
+  kmp_uint64 tc;
+  kmp_uint64 lower = *lb; // compiler provides global bounds here
+  kmp_uint64 upper = *ub;
+  kmp_uint64 ub_glob = upper; // global upper used to calc lastprivate flag
+  kmp_uint64 num_tasks = 0, extras = 0;
+  kmp_uint64 num_tasks_min = __kmp_taskloop_min_tasks;
+  kmp_info_t *thread = __kmp_threads[gtid];
+  kmp_taskdata_t *current_task = thread->th.th_current_task;
+
   // compute trip count
   if (st == 1) { // most common case
     tc = upper - lower + 1;
@@ -3314,6 +3568,10 @@
     __kmp_task_finish(gtid, task, current_task);
     return;
   }
+  if (num_tasks_min == 0)
+    // TODO: can we choose better default heuristic?
+    num_tasks_min = KMP_MIN(thread->th.th_team_nproc * 10,
+                            INITIAL_TASK_DEQUE_SIZE);
 
   // compute num_tasks/grainsize based on the input provided
   switch (sched) {
@@ -3338,9 +3596,8 @@
       extras = 0;
     } else {
       num_tasks = tc / grainsize;
-      grainsize =
-          tc /
-          num_tasks; // adjust grainsize for balanced distribution of iterations
+      // adjust grainsize for balanced distribution of iterations
+      grainsize = tc / num_tasks;
       extras = tc % num_tasks;
     }
     break;
@@ -3350,95 +3607,32 @@
   KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
   KMP_DEBUG_ASSERT(num_tasks > extras);
   KMP_DEBUG_ASSERT(num_tasks > 0);
-  KA_TRACE(20, ("__kmpc_taskloop: T#%d will launch: num_tasks %lld, grainsize "
-                "%lld, extras %lld\n",
-                gtid, num_tasks, grainsize, extras));
-
-  // Main loop, launch num_tasks tasks, assign grainsize iterations each task
-  for (i = 0; i < num_tasks; ++i) {
-    kmp_uint64 chunk_minus_1;
-    if (extras == 0) {
-      chunk_minus_1 = grainsize - 1;
-    } else {
-      chunk_minus_1 = grainsize;
-      --extras; // first extras iterations get bigger chunk (grainsize+1)
-    }
-    upper = lower + st * chunk_minus_1;
-    if (i == num_tasks - 1) {
-      // schedule the last task, set lastprivate flag
-      lastpriv = 1;
-#if KMP_DEBUG
-      if (st == 1)
-        KMP_DEBUG_ASSERT(upper == *ub);
-      else if (st > 0)
-        KMP_DEBUG_ASSERT(upper + st > *ub);
-      else
-        KMP_DEBUG_ASSERT(upper + st < *ub);
-#endif
-    }
-    next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
-    *(kmp_uint64 *)((char *)next_task + lower_offset) =
-        lower; // adjust task-specific bounds
-    *(kmp_uint64 *)((char *)next_task + upper_offset) = upper;
-    if (ptask_dup != NULL)
-      ptask_dup(next_task, task,
-                lastpriv); // set lastprivate flag, construct fistprivates, etc.
-    KA_TRACE(20, ("__kmpc_taskloop: T#%d schedule task %p: lower %lld, upper "
-                  "%lld (offsets %p %p)\n",
-                  gtid, next_task, lower, upper, lower_offset, upper_offset));
-    __kmp_omp_task(gtid, next_task, true); // schedule new task
-    lower = upper + st; // adjust lower bound for the next iteration
-  }
-  // free the pattern task and exit
-  __kmp_task_start(gtid, task, current_task);
-  // do not execute the pattern task, just do bookkeeping
-  __kmp_task_finish(gtid, task, current_task);
-}
-
-/*!
-@ingroup TASKING
-@param loc       Source location information
-@param gtid      Global thread ID
-@param task      Task structure
-@param if_val    Value of the if clause
-@param lb        Pointer to loop lower bound
-@param ub        Pointer to loop upper bound
-@param st        Loop stride
-@param nogroup   Flag, 1 if nogroup clause specified, 0 otherwise
-@param sched     Schedule specified 0/1/2 for none/grainsize/num_tasks
-@param grainsize Schedule value if specified
-@param task_dup  Tasks duplication routine
-
-Execute the taskloop construct.
-*/
-void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
-                     kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup,
-                     int sched, kmp_uint64 grainsize, void *task_dup) {
-  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
-  KMP_DEBUG_ASSERT(task != NULL);
-
-  KA_TRACE(10, ("__kmpc_taskloop(enter): T#%d, pattern task %p, lb %lld ub "
-                "%lld st %lld, grain %llu(%d)\n",
-                gtid, taskdata, *lb, *ub, st, grainsize, sched));
+  // =========================================================================
 
   // check if clause value first
   if (if_val == 0) { // if(0) specified, mark task as serial
     taskdata->td_flags.task_serial = 1;
     taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied
-  }
-  if (nogroup == 0) {
-    __kmpc_taskgroup(loc, gtid);
-  }
-
-  if (1 /* AC: use some heuristic here to choose task scheduling method */) {
-    __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, sched, grainsize,
-                          task_dup);
+    // always start serial tasks linearly
+    __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
+                          grainsize, extras, tc, task_dup);
+  } else if (num_tasks > num_tasks_min) {
+    KA_TRACE(20, ("__kmpc_taskloop: T#%d, go recursive: tc %llu, #tasks %llu"
+                  "(%lld), grain %llu, extras %llu\n", gtid, tc, num_tasks,
+                  num_tasks_min, grainsize, extras));
+    __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
+                         grainsize, extras, tc, num_tasks_min, task_dup);
+  } else {
+    KA_TRACE(20, ("__kmpc_taskloop: T#%d, go linear: tc %llu, #tasks %llu"
+                  "(%lld), grain %llu, extras %llu\n", gtid, tc, num_tasks,
+                  num_tasks_min, grainsize, extras));
+    __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
+                          grainsize, extras, tc, task_dup);
   }
 
-  if (nogroup == 0) {
+  if (nogroup == 0)
     __kmpc_end_taskgroup(loc, gtid);
-  }
-  KA_TRACE(10, ("__kmpc_taskloop(exit): T#%d\n", gtid));
+  KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid));
 }
 
 #endif
Index: openmp/trunk/runtime/test/tasking/kmp_taskloop.c
===================================================================
--- openmp/trunk/runtime/test/tasking/kmp_taskloop.c
+++ openmp/trunk/runtime/test/tasking/kmp_taskloop.c
@@ -1,4 +1,5 @@
 // RUN: %libomp-compile-and-run
+// RUN: %libomp-compile && env KMP_TASKLOOP_MIN_TASKS=1 %libomp-run
 #include <stdio.h>
 #include <omp.h>
 #include "omp_my_sleep.h"
Index: openmp/trunk/runtime/test/tasking/omp_taskloop_grainsize.c
===================================================================
--- openmp/trunk/runtime/test/tasking/omp_taskloop_grainsize.c
+++ openmp/trunk/runtime/test/tasking/omp_taskloop_grainsize.c
@@ -0,0 +1,106 @@
+// RUN: %libomp-compile-and-run
+// RUN: %libomp-compile && env KMP_TASKLOOP_MIN_TASKS=1 %libomp-run
+/*
+ * Test for taskloop
+ * Method: caculate how many times the iteration space is dispatched
+ *     and judge if each dispatch has the requested grainsize
+ * It is possible for two adjacent chunks are executed by the same thread
+ */
+#include <stdio.h>
+#include <omp.h>
+#include <stdlib.h>
+#include "omp_testsuite.h"
+
+#define CFDMAX_SIZE 1120
+
+int test_omp_taskloop_grainsize()
+{
+  int i, grainsize, count, tmp_count, result, num_off;
+  int *tmp, *tids, *tidsArray;
+
+  tidsArray = (int *)malloc(sizeof(int) * CFDMAX_SIZE);
+  tids = tidsArray;
+
+  for (grainsize = 1; grainsize < 48; ++grainsize) {
+    fprintf(stderr, "Grainsize %d\n", grainsize);
+    count = tmp_count = num_off = 0;
+
+    for (i = 0; i < CFDMAX_SIZE; ++i) {
+      tids[i] = -1;
+    }
+
+    #pragma omp parallel shared(tids)
+    {
+      #pragma omp master
+      #pragma omp taskloop grainsize(grainsize)
+      for (i = 0; i < CFDMAX_SIZE; i++) {
+        tids[i] = omp_get_thread_num();
+      }
+    }
+
+    for (i = 0; i < CFDMAX_SIZE; ++i) {
+      if (tids[i] == -1) {
+        fprintf(stderr, "  Iteration %d not touched!\n", i);
+        result++;
+      }
+    }
+
+    for (i = 0; i < CFDMAX_SIZE - 1; ++i) {
+      if (tids[i] != tids[i + 1]) {
+        count++;
+      }
+    }
+
+    tmp = (int *)malloc(sizeof(int) * (count + 1));
+    tmp[0] = 1;
+
+    for (i = 0; i < CFDMAX_SIZE - 1; ++i) {
+      if (tmp_count > count) {
+        printf("--------------------\nTestinternal Error: List too "
+               "small!!!\n--------------------\n");
+        break;
+      }
+      if (tids[i] != tids[i + 1]) {
+        tmp_count++;
+        tmp[tmp_count] = 1;
+      } else {
+        tmp[tmp_count]++;
+      }
+    }
+
+    // is grainsize statement working?
+    int num_tasks = CFDMAX_SIZE / grainsize;
+    int multiple1 = CFDMAX_SIZE / num_tasks;
+    int multiple2 = CFDMAX_SIZE / num_tasks + 1;
+    for (i = 0; i < count; i++) {
+      // it is possible for 2 adjacent chunks assigned to a same thread
+      if (tmp[i] % multiple1 != 0 && tmp[i] % multiple2 != 0) {
+        num_off++;
+      }
+    }
+
+    if (num_off > 1) {
+      fprintf(stderr, "  The number of bad chunks is %d\n", num_off);
+      result++;
+    } else {
+      fprintf(stderr, "  Everything ok\n");
+    }
+
+    free(tmp);
+  }
+  free(tidsArray);
+  return (result==0);
+}
+
+int main()
+{
+  int i;
+  int num_failed=0;
+
+  for (i = 0; i < REPETITIONS; i++) {
+    if (!test_omp_taskloop_grainsize()) {
+      num_failed++;
+    }
+  }
+  return num_failed;
+}
Index: openmp/trunk/runtime/test/tasking/omp_taskloop_num_tasks.c
===================================================================
--- openmp/trunk/runtime/test/tasking/omp_taskloop_num_tasks.c
+++ openmp/trunk/runtime/test/tasking/omp_taskloop_num_tasks.c
@@ -0,0 +1,67 @@
+// RUN: %libomp-compile-and-run
+// RUN: %libomp-compile && env KMP_TASKLOOP_MIN_TASKS=1 %libomp-run
+/*
+ * Test for taskloop
+ * Method: caculate how many times the iteration space is dispatched
+ *     and judge if each dispatch has the requested grainsize
+ * It is possible for two adjacent chunks are executed by the same thread
+ */
+#include <stdio.h>
+#include <omp.h>
+#include <stdlib.h>
+#include "omp_testsuite.h"
+
+#define CFDMAX_SIZE 1120
+
+int test_omp_taskloop_num_tasks()
+{
+  int i;
+  int *tids;
+  int *tidsArray;
+  int count;
+  int result = 0;
+  int num_tasks;
+
+  for (num_tasks = 1; num_tasks < 120; ++num_tasks) {
+    count = 0;
+    tidsArray = (int *)malloc(sizeof(int) * CFDMAX_SIZE);
+    tids = tidsArray;
+
+    #pragma omp parallel shared(tids)
+    {
+      int i;
+      #pragma omp master
+      #pragma omp taskloop num_tasks(num_tasks)
+      for (i = 0; i < CFDMAX_SIZE; i++) {
+        tids[i] = omp_get_thread_num();
+      }
+    }
+
+    for (i = 0; i < CFDMAX_SIZE - 1; ++i) {
+      if (tids[i] != tids[i + 1]) {
+        count++;
+      }
+    }
+
+    if (count > num_tasks) {
+      fprintf(stderr, "counted too many tasks: (wanted %d, got %d)\n",
+              num_tasks, count);
+      result++;
+    }
+  }
+
+  return (result==0);
+}
+
+int main()
+{
+  int i;
+  int num_failed=0;
+
+  for (i = 0; i < REPETITIONS; i++) {
+    if (!test_omp_taskloop_num_tasks()) {
+      num_failed++;
+    }
+  }
+  return num_failed;
+}