Index: openmp/runtime/CMakeLists.txt
===================================================================
--- openmp/runtime/CMakeLists.txt
+++ openmp/runtime/CMakeLists.txt
@@ -288,6 +288,9 @@
   set (LIBOMP_USE_VERSION_SYMBOLS FALSE)
 endif()
 
+# Unshackled task support defaults to OFF
+set(LIBOMP_USE_UNSHACKLED_TASK FALSE CACHE BOOL "Use unshackled task?")
+
 # OMPT-support defaults to ON for OpenMP 5.0+ and if the requirements in
 # cmake/config-ix.cmake are fulfilled.
 set(OMPT_DEFAULT FALSE)
Index: openmp/runtime/src/kmp.h
===================================================================
--- openmp/runtime/src/kmp.h
+++ openmp/runtime/src/kmp.h
@@ -2235,7 +2235,14 @@
   unsigned priority_specified : 1; /* set if the compiler provides priority
                                       setting for the task */
   unsigned detachable : 1; /* 1 == can detach */
-  unsigned reserved : 9; /* reserved for compiler use */
+#if USE_UNSHACKLED_TASK
+  // 1 == unshackled task
+  // Although this is marked as a compiler flag, the compiler does not set it
+  // yet. All tasks created via the target task related interfaces set this
+  // flag. It may be used by other features in the future.
+  unsigned unshackled : 1;
+#endif
+  unsigned reserved : 8; /* reserved for compiler use */
 
   /* Library flags */ /* Total library flags must be 16 bits */
   unsigned tasktype : 1; /* task is either explicit(1) or implicit (0) */
@@ -2283,6 +2290,15 @@
   kmp_depnode_t
       *td_depnode; // Pointer to graph node if this task has dependencies
   kmp_task_team_t *td_task_team;
+#if USE_UNSHACKLED_TASK
+  // The task team of the parent task. Usually we could access it via
+  // parent_task->td_task_team, but it is possible that
+  // parent_task->td_task_team is nullptr because of late initialization.
+  // Sometimes we must use this pointer, and the td_task_team of the
+  // encountering thread is never nullptr, so we record it here when the task
+  // is created.
+  kmp_task_team_t *td_parent_task_team;
+#endif
   kmp_int32 td_size_alloc; // The size of task structure, including shareds etc.
 #if defined(KMP_GOMP_COMPAT)
   // 4 or 8 byte integers for the loop bounds in GOMP_taskloop
@@ -2354,6 +2370,11 @@
   KMP_ALIGN_CACHE
   std::atomic<kmp_int32> tt_unfinished_threads; /* #threads still active */
 
+#if USE_UNSHACKLED_TASK
+  KMP_ALIGN_CACHE
+  std::atomic<kmp_int32> tt_unfinished_unshackled_tasks;
+#endif
+
   KMP_ALIGN_CACHE
   volatile kmp_uint32
       tt_active; /* is the team still actively executing tasks */
@@ -2818,6 +2839,10 @@
 extern volatile int __kmp_init_monitor;
 #endif
 extern volatile int __kmp_init_user_locks;
+#if USE_UNSHACKLED_TASK
+// Set to TRUE when the unshackled team is being initialized
+extern volatile int __kmp_init_unshackled_threads;
+#endif
 extern int __kmp_init_counter;
 extern int __kmp_root_counter;
 extern int __kmp_version;
@@ -3048,7 +3073,13 @@
 static inline bool KMP_UBER_GTID(int gtid) {
   KMP_DEBUG_ASSERT(gtid >= KMP_GTID_MIN);
+#if USE_UNSHACKLED_TASK
+  KMP_DEBUG_ASSERT(gtid < (__kmp_init_unshackled_threads
+                               ? 2 * __kmp_threads_capacity
+                               : __kmp_threads_capacity));
+#else
   KMP_DEBUG_ASSERT(gtid < __kmp_threads_capacity);
+#endif
   return (gtid >= 0 && __kmp_root[gtid] && __kmp_threads[gtid] &&
           __kmp_threads[gtid] == __kmp_root[gtid]->r.r_uber_thread);
 }
@@ -3906,6 +3937,30 @@
 extern void __kmp_omp_display_env(int verbose);
 
+#if USE_UNSHACKLED_TASK
+// Master thread of the unshackled team
+extern kmp_info_t *__kmp_unshackled_master_thread;
+// Descriptors for the unshackled threads
+extern kmp_info_t **__kmp_unshackled_threads;
+extern int __kmp_unshackled_threads_num;
+
+extern void __kmp_unshackled_threads_initz_routine();
+extern void __kmp_initialize_unshackled_threads();
+extern void __kmp_do_initialize_unshackled_threads();
+extern void __kmp_unshackled_threads_initz_wait();
+extern void __kmp_unshackled_initz_release();
+extern void __kmp_unshackled_master_thread_wait();
+extern void __kmp_unshackled_worker_thread_wait();
+extern void __kmp_unshackled_worker_thread_signal();
+
+// Check whether a given thread is an unshackled thread
+#define KMP_UNSHACKLED_THREAD(gtid) ((gtid) >= __kmp_threads_capacity)
+// Map a gtid to an unshackled thread. The first unshackled thread, i.e. the
+// master thread, is skipped.
+#define KMP_GTID_TO_SHADOW_GTID(gtid)                                         \
+  ((gtid) % (__kmp_unshackled_threads_num - 1) + 1)
+#endif
+
 #ifdef __cplusplus
 }
 #endif
Index: openmp/runtime/src/kmp_config.h.cmake
===================================================================
--- openmp/runtime/src/kmp_config.h.cmake
+++ openmp/runtime/src/kmp_config.h.cmake
@@ -44,6 +44,8 @@
 #define OMPT_DEBUG LIBOMP_OMPT_DEBUG
 #cmakedefine01 LIBOMP_OMPT_SUPPORT
 #define OMPT_SUPPORT LIBOMP_OMPT_SUPPORT
+#cmakedefine01 LIBOMP_USE_UNSHACKLED_TASK
+#define USE_UNSHACKLED_TASK LIBOMP_USE_UNSHACKLED_TASK
 #cmakedefine01 LIBOMP_OMPT_OPTIONAL
 #define OMPT_OPTIONAL LIBOMP_OMPT_OPTIONAL
 #cmakedefine01 LIBOMP_USE_ADAPTIVE_LOCKS
Index: openmp/runtime/src/kmp_global.cpp
===================================================================
--- openmp/runtime/src/kmp_global.cpp
+++ openmp/runtime/src/kmp_global.cpp
@@ -51,6 +51,9 @@
     0; /* 1 - launched, 2 - actually started (Windows* OS only) */
 #endif
 volatile int __kmp_init_user_locks = FALSE;
+#if USE_UNSHACKLED_TASK
+volatile int __kmp_init_unshackled_threads = FALSE;
+#endif
 
 /* list of address of allocated caches for commons */
 kmp_cached_addr_t *__kmp_threadpriv_cache_list = NULL;
Index: openmp/runtime/src/kmp_runtime.cpp
===================================================================
--- openmp/runtime/src/kmp_runtime.cpp
+++ openmp/runtime/src/kmp_runtime.cpp
@@ -3611,6 +3611,12 @@
      serial initialization may be not a real initial thread). */
   capacity = __kmp_threads_capacity;
+#if USE_UNSHACKLED_TASK
+  // The capacity doubles if unshackled tasks are enabled and we are
+  // initializing the unshackled team
+  if (__kmp_init_unshackled_threads)
+    capacity *= 2;
+#endif
   if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
     --capacity;
   }
@@ -3627,15 +3633,30 @@
     }
   }
 
-  /* find an available thread slot */
-  /* Don't reassign the zero slot since we need that to only be used by initial
-     thread */
-  for (gtid = (initial_thread ? 0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL;
-       gtid++)
-    ;
-  KA_TRACE(1,
-           ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
-  KMP_ASSERT(gtid < __kmp_threads_capacity);
+#if USE_UNSHACKLED_TASK
+  if (!__kmp_init_unshackled_threads) {
+#endif
+    /* find an available thread slot */
+    /* Don't reassign the zero slot since we need that to only be used by
+       initial thread */
+    for (gtid = (initial_thread ? 0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL;
+         gtid++)
+      ;
+    KA_TRACE(
+        1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
+    KMP_ASSERT(gtid < __kmp_threads_capacity);
+#if USE_UNSHACKLED_TASK
+  } else {
+    // When initializing the unshackled team, we find the first empty slot in
+    // the second half of __kmp_threads
+    for (gtid = __kmp_threads_capacity; TCR_PTR(__kmp_threads[gtid]) != NULL;
+         gtid++)
+      ;
+    KA_TRACE(
+        1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid));
+    KMP_ASSERT(gtid < 2 * __kmp_threads_capacity);
+  }
+#endif
 
   /* update global accounting */
   __kmp_all_nth++;
@@ -4292,9 +4313,23 @@
 #endif
   KMP_MB();
-  for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
-    KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
+
+#if USE_UNSHACKLED_TASK
+  // If we're initializing the unshackled threads, the search starts at the end
+  // of the regular threads array, i.e. the start of the unshackled threads array
+  if (__kmp_init_unshackled_threads) {
+    for (new_gtid = __kmp_threads_capacity;
+         TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
+      KMP_DEBUG_ASSERT(new_gtid < 2 * __kmp_threads_capacity);
+    }
+  } else {
+#endif
+    for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) {
+      KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
+    }
+#if USE_UNSHACKLED_TASK
   }
+#endif
 
   /* allocate space for it. */
   new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));
@@ -6674,12 +6709,18 @@
   /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
    * expandable */
   /* Since allocation is cache-aligned, just add extra padding at the end */
-  size =
-      (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
-      CACHE_LINE;
+  size = (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity
+#if USE_UNSHACKLED_TASK
+             * 2
+#endif
+         + CACHE_LINE;
   __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
   __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
-                               sizeof(kmp_info_t *) * __kmp_threads_capacity);
+                               sizeof(kmp_info_t *) * __kmp_threads_capacity
+#if USE_UNSHACKLED_TASK
+                                   * 2
+#endif
+  );
 
   /* init thread counts */
   KMP_DEBUG_ASSERT(__kmp_all_nth ==
@@ -6951,6 +6992,10 @@
 
   KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));
   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
+
+#if USE_UNSHACKLED_TASK
+  __kmp_initialize_unshackled_threads();
+#endif
 }
 
 /* ------------------------------------------------------------------------ */
@@ -8297,7 +8342,6 @@
   }
 }
-
 void __kmp_omp_display_env(int verbose) {
   __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
   if (__kmp_init_serial == 0)
@@ -8305,3 +8349,48 @@
   __kmp_display_env_impl(!verbose, verbose);
   __kmp_release_bootstrap_lock(&__kmp_initz_lock);
 }
+
+#if USE_UNSHACKLED_TASK
+kmp_info_t **__kmp_unshackled_threads;
+kmp_info_t *__kmp_unshackled_master_thread;
+int __kmp_unshackled_threads_num;
+
+namespace {
+void __kmp_unshackled_wrapper_fn(int *gtid, int *, ...) {
+  // If this is the master thread, wait for the signal
+  if (__kmpc_master(nullptr, *gtid)) {
+    // First, unset the initial state and release the initial thread
+    __kmp_init_unshackled_threads = FALSE;
+    __kmp_unshackled_initz_release();
+    __kmp_unshackled_master_thread_wait();
+  }
+}
+} // namespace
+
+void __kmp_unshackled_threads_initz_routine() {
+  kmp_info_t *master_thread = nullptr;
+
+  // Create a new root for the unshackled team/threads
+  const int gtid = __kmp_register_root(TRUE);
+  __kmp_unshackled_master_thread = master_thread = __kmp_threads[gtid];
+  __kmp_unshackled_threads = &__kmp_threads[gtid];
+
+  // TODO: Determine how many unshackled threads to create
+  __kmp_unshackled_threads_num = 8;
+  master_thread->th.th_set_nproc = __kmp_unshackled_threads_num;
+
+  __kmpc_fork_call(nullptr, 0, __kmp_unshackled_wrapper_fn);
+}
+
+void __kmp_initialize_unshackled_threads() {
+  // Set the global variable indicating that we're initializing the unshackled
+  // team/threads
+  __kmp_init_unshackled_threads = TRUE;
+
+  __kmp_do_initialize_unshackled_threads();
+
+  // Wait here for the initialization of the unshackled team to finish
+  __kmp_unshackled_threads_initz_wait();
+}
+
+#endif
Index: openmp/runtime/src/kmp_tasking.cpp
===================================================================
--- openmp/runtime/src/kmp_tasking.cpp
+++ openmp/runtime/src/kmp_tasking.cpp
@@ -325,6 +325,16 @@
 static kmp_int32 __kmp_push_task(kmp_int32 gtid, kmp_task_t *task) {
   kmp_info_t *thread = __kmp_threads[gtid];
   kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
+
+#if USE_UNSHACKLED_TASK
+  // If the task is unshackled, push it into the deque of the corresponding
+  // unshackled thread
+  if (taskdata->td_flags.unshackled) {
+    thread = __kmp_unshackled_threads[KMP_GTID_TO_SHADOW_GTID(gtid)];
+    gtid = thread->th.th_info.ds.ds_gtid;
+  }
+#endif
+
   kmp_task_team_t *task_team = thread->th.th_task_team;
   kmp_int32 tid = __kmp_tid_from_gtid(gtid);
   kmp_thread_data_t *thread_data;
@@ -363,7 +373,8 @@
   // Find tasking deque specific to encountering thread
   thread_data = &task_team->tt.tt_threads_data[tid];
 
-  // No lock needed since only owner can allocate
+  // No lock needed even if the task is unshackled, because the deque for the
+  // unshackled thread's data has already been initialized
   if (thread_data->td.td_deque == NULL) {
     __kmp_alloc_task_deque(thread, thread_data);
   }
@@ -428,6 +439,12 @@
 
   __kmp_release_bootstrap_lock(&thread_data->td.td_deque_lock);
 
+#if USE_UNSHACKLED_TASK
+  // Signal one worker thread to execute the task
+  if (taskdata->td_flags.unshackled)
+    __kmp_unshackled_worker_thread_signal();
+#endif
+
   return TASK_SUCCESSFULLY_PUSHED;
 }
@@ -720,7 +737,6 @@
 #else /* ! USE_FAST_MEMORY */
   __kmp_thread_free(thread, taskdata);
 #endif
-
   KA_TRACE(20, ("__kmp_free_task: T#%d freed task %p\n", gtid, taskdata));
 }
@@ -930,7 +946,7 @@
 #endif
 
   // Only need to keep track of count if team parallel and tasking not
-  // serialized, or task is detachable and event has already been fulfilled
+  // serialized, or task is detachable and event has already been fulfilled
   if (!(taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser) ||
       taskdata->td_flags.detachable == TASK_DETACHABLE) {
     // Predecrement simulated by "- 1" calculation
@@ -939,6 +955,10 @@
     KMP_DEBUG_ASSERT(children >= 0);
     if (taskdata->td_taskgroup)
      KMP_ATOMIC_DEC(&taskdata->td_taskgroup->count);
+#if USE_UNSHACKLED_TASK
+    if (taskdata->td_flags.unshackled && taskdata->td_parent_task_team)
+      KMP_ATOMIC_DEC(&taskdata->td_parent_task_team->tt.tt_unfinished_unshackled_tasks);
+#endif
     __kmp_release_deps(gtid, taskdata);
   } else if (task_team && task_team->tt.tt_found_proxy_tasks) {
     // if we found proxy tasks there could exist a dependency chain
@@ -1180,6 +1200,9 @@
   kmp_task_t *task;
   kmp_taskdata_t *taskdata;
   kmp_info_t *thread = __kmp_threads[gtid];
+#if USE_UNSHACKLED_TASK
+  kmp_info_t *encountering_thread = thread;
+#endif
   kmp_team_t *team = thread->th.th_team;
   kmp_taskdata_t *parent_task = thread->th.th_current_task;
   size_t shareds_offset;
@@ -1187,6 +1210,24 @@
   if (!TCR_4(__kmp_init_middle))
     __kmp_middle_initialize();
 
+#if USE_UNSHACKLED_TASK
+  if (flags->unshackled) {
+    // Since unshackled threads are allocated via __kmpc_fork_call, we need to
+    // initialize the parallel runtime accordingly
+    if (!TCR_4(__kmp_init_parallel))
+      __kmp_parallel_initialize();
+
+    // An unshackled task encountered by a regular thread is pushed to the
+    // unshackled thread selected by KMP_GTID_TO_SHADOW_GTID(gtid)
+    if (!KMP_UNSHACKLED_THREAD(gtid)) {
+      thread = __kmp_unshackled_threads[KMP_GTID_TO_SHADOW_GTID(gtid)];
+      team = thread->th.th_team;
+      // We don't change the parent-child relation for an unshackled task,
+      // because we need it for per-task-region synchronization
+    }
+  }
+#endif
+
   KA_TRACE(10, ("__kmp_task_alloc(enter): T#%d loc=%p, flags=(0x%x) "
                 "sizeof_task=%ld sizeof_shared=%ld entry=%p\n",
                 gtid, loc_ref, *((kmp_int32 *)flags), sizeof_kmp_task_t,
@@ -1197,6 +1238,13 @@
     }
     flags->final = 1;
   }
+
+#if USE_UNSHACKLED_TASK
+  // An unshackled task is never final
+  if (flags->unshackled)
+    flags->final = 0;
+#endif
+
   if (flags->tiedness == TASK_UNTIED && !team->t.t_serialized) {
     // Untied task encountered causes the TSC algorithm to check entire deque of
     // the victim thread. If no untied task encountered, then checking the head
@@ -1259,11 +1307,23 @@
   // Avoid double allocation here by combining shareds with taskdata
 #if USE_FAST_MEMORY
+#if USE_UNSHACKLED_TASK
+  // To avoid a race condition without using a lock here, we allocate the
+  // memory from the encountering thread
+  taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(
+      encountering_thread, shareds_offset + sizeof_shareds);
+#else
   taskdata = (kmp_taskdata_t *)__kmp_fast_allocate(thread, shareds_offset +
                                                                sizeof_shareds);
+#endif
 #else /* ! USE_FAST_MEMORY */
+#if USE_UNSHACKLED_TASK
+  taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(
+      encountering_thread, shareds_offset + sizeof_shareds);
+#else
   taskdata = (kmp_taskdata_t *)__kmp_thread_malloc(thread, shareds_offset +
                                                                sizeof_shareds);
+#endif
 #endif /* USE_FAST_MEMORY */
   ANNOTATE_HAPPENS_AFTER(taskdata);
@@ -1310,6 +1370,10 @@
   taskdata->td_flags.destructors_thunk = flags->destructors_thunk;
   taskdata->td_flags.proxy = flags->proxy;
   taskdata->td_flags.detachable = flags->detachable;
+#if USE_UNSHACKLED_TASK
+  taskdata->td_flags.unshackled = flags->unshackled;
+  taskdata->td_parent_task_team = encountering_thread->th.th_task_team;
+#endif
   taskdata->td_task_team = thread->th.th_task_team;
   taskdata->td_size_alloc = shareds_offset + sizeof_shareds;
   taskdata->td_flags.tasktype = TASK_EXPLICIT;
@@ -1365,6 +1429,11 @@
     if (taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT) {
       KMP_ATOMIC_INC(&taskdata->td_parent->td_allocated_child_tasks);
     }
+#if USE_UNSHACKLED_TASK
+    if (flags->unshackled && taskdata->td_parent_task_team)
+      KMP_ATOMIC_INC(
+          &taskdata->td_parent_task_team->tt.tt_unfinished_unshackled_tasks);
+#endif
   }
 
   KA_TRACE(20, ("__kmp_task_alloc(exit): T#%d created task %p parent=%p\n",
@@ -1405,6 +1474,12 @@
                                           size_t sizeof_shareds,
                                           kmp_routine_entry_t task_entry,
                                           kmp_int64 device_id) {
+#if USE_UNSHACKLED_TASK
+  // All tasks allocated via this API should be unshackled and untied
+  kmp_tasking_flags_t *input_flags = (kmp_tasking_flags_t *)&flags;
+  input_flags->unshackled = TRUE;
+  input_flags->tiedness = FALSE;
+#endif
   return __kmpc_omp_task_alloc(loc_ref, gtid, flags, sizeof_kmp_task_t,
                                sizeof_shareds, task_entry);
 }
@@ -1870,6 +1945,13 @@
     must_wait = must_wait || (thread->th.th_task_team != NULL &&
                               thread->th.th_task_team->tt.tt_found_proxy_tasks);
+
+#if USE_UNSHACKLED_TASK
+    // If unshackled threads are enabled, we must always wait because there
+    // might be tasks outside of any parallel region
+    must_wait = true;
+#endif
+
     if (must_wait) {
       kmp_flag_32 flag(RCAST(std::atomic<kmp_uint32> *,
                              &(taskdata->td_incomplete_child_tasks)),
@@ -2827,7 +2909,13 @@
   thread->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
 
   threads_data = (kmp_thread_data_t *)TCR_PTR(task_team->tt.tt_threads_data);
+#if USE_UNSHACKLED_TASK
+  // This can happen when unshackled tasks are enabled
+  if (threads_data == nullptr)
+    return FALSE;
+#else
   KMP_DEBUG_ASSERT(threads_data != NULL);
+#endif
 
   nthreads = task_team->tt.tt_nproc;
   unfinished_threads = &(task_team->tt.tt_unfinished_threads);
@@ -2911,8 +2999,8 @@
       }
     }
 
-    if (task == NULL) // break out of tasking loop
-      break;
+    if (task == NULL)
+      break; // break out of tasking loop
 
     // Found a task; execute it
 #if USE_ITT_BUILD && USE_ITT_NOTIFY
@@ -3357,6 +3445,9 @@
   task_team->tt.tt_nproc = nthreads = team->t.t_nproc;
 
   KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_threads, nthreads);
+#if USE_UNSHACKLED_TASK
+  KMP_ATOMIC_ST_REL(&task_team->tt.tt_unfinished_unshackled_tasks, 0);
+#endif
   TCW_4(task_team->tt.tt_active, TRUE);
 
   KA_TRACE(20, ("__kmp_allocate_task_team: T#%d exiting; task_team = %p "
@@ -3508,6 +3599,22 @@
                     __kmp_gtid_from_thread(this_thr),
                     team->t.t_task_team[other_team],
                     ((team != NULL) ? team->t.t_id : -1), other_team));
+#if USE_UNSHACKLED_TASK
+      // For a regular thread, tasking is enabled when the first task is about
+      // to be pushed to a deque. For the unshackled master thread, however, we
+      // enable tasking ahead of time so that later operations can be performed
+      // without taking a lock, avoiding a race condition.
+      kmp_task_team_t *task_team = team->t.t_task_team[other_team];
+      if (this_thr == __kmp_unshackled_master_thread &&
+          !KMP_TASKING_ENABLED(task_team)) {
+        __kmp_enable_tasking(task_team, this_thr);
+        for (int i = 0; i < task_team->tt.tt_nproc; ++i) {
+          kmp_thread_data_t *thread_data = &task_team->tt.tt_threads_data[i];
+          if (!thread_data->td.td_deque)
+            __kmp_alloc_task_deque(__kmp_unshackled_threads[i], thread_data);
+        }
+      }
+#endif
     } else { // Leave the old task team struct in place for the upcoming region;
       // adjust as needed
       kmp_task_team_t *task_team = team->t.t_task_team[other_team];
@@ -3595,6 +3702,14 @@
     TCW_PTR(this_thr->th.th_task_team, NULL);
   }
+
+#if USE_UNSHACKLED_TASK
+  // We still need to wait here if there are any unfinished unshackled tasks.
+  // A simple spin loop here should not hurt.
+  if (task_team)
+    while (KMP_ATOMIC_LD_ACQ(&task_team->tt.tt_unfinished_unshackled_tasks))
+      ;
+#endif
 }
 
 // __kmp_tasking_barrier:
Index: openmp/runtime/src/kmp_wait_release.h
===================================================================
--- openmp/runtime/src/kmp_wait_release.h
+++ openmp/runtime/src/kmp_wait_release.h
@@ -381,6 +381,23 @@
       break;
     }
 
+#if USE_UNSHACKLED_TASK
+    // For an unshackled thread, if task_team is nullptr, it means the master
+    // thread has not released the barrier. We cannot wait here because once
+    // the master thread releases all child barriers, all unshackled threads
+    // are still sleeping. This leads to a problem that the following
+    // configuration, such as task team sync, will not be performed, so this
+    // thread does not have a task team. Usually that is not a problem.
+    // However, in the corner case where the first task encountered is an
+    // untied task, the check in __kmp_task_alloc will crash because it uses
+    // the task team pointer without checking whether it is nullptr,
+    // presumably assuming the task team has already been set up by then.
+    if (task_team && KMP_UNSHACKLED_THREAD(th_gtid)) {
+      __kmp_unshackled_worker_thread_wait();
+      continue;
+    }
+#endif
+
     // Don't suspend if KMP_BLOCKTIME is set to "infinite"
     if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME &&
         __kmp_pause_status != kmp_soft_paused)
Index: openmp/runtime/src/z_Linux_util.cpp
===================================================================
--- openmp/runtime/src/z_Linux_util.cpp
+++ openmp/runtime/src/z_Linux_util.cpp
@@ -25,6 +25,7 @@
 #include
 #endif
 #include <math.h> // HUGE_VAL.
+#include <semaphore.h>
 #include
 #include
 #include
@@ -2439,7 +2440,7 @@
                            ,
                            void **exit_frame_ptr
 #endif
-                           ) {
+) {
 #if OMPT_SUPPORT
   *exit_frame_ptr = OMPT_GET_FRAME_ADDRESS(0);
 #endif
@@ -2518,4 +2519,82 @@
 
 #endif
 
+#if USE_UNSHACKLED_TASK
+
+namespace {
+pthread_t __kmp_unshackled_master_thread_handle;
+
+// Condition variable for initializing the unshackled team
+pthread_cond_t __kmp_unshackled_threads_initz_cond_var;
+pthread_mutex_t __kmp_unshackled_threads_initz_lock;
+
+// Condition variable for the wrapper function of the master thread
+pthread_cond_t __kmp_unshackled_master_thread_cond_var;
+pthread_mutex_t __kmp_unshackled_master_thread_lock;
+
+sem_t __kmp_unshackled_task_sem;
+} // namespace
+
+void __kmp_unshackled_worker_thread_wait() {
+  if (sem_wait(&__kmp_unshackled_task_sem))
+    __kmp_fatal(KMP_MSG(CantRegisterNewThread));
+}
+
+void __kmp_do_initialize_unshackled_threads() {
+  // Initialize the condition variables
+  if (pthread_cond_init(&__kmp_unshackled_threads_initz_cond_var, nullptr))
+    __kmp_fatal(KMP_MSG(CantRegisterNewThread));
+  if (pthread_cond_init(&__kmp_unshackled_master_thread_cond_var, nullptr))
+    __kmp_fatal(KMP_MSG(CantRegisterNewThread));
+
+  if (sem_init(&__kmp_unshackled_task_sem, 0, 0))
+    __kmp_fatal(KMP_MSG(CantRegisterNewThread));
+
+  // Create a new thread to finish the initialization
+  if (pthread_create(
+          &__kmp_unshackled_master_thread_handle, nullptr,
+          [](void *) -> void * {
+            __kmp_unshackled_threads_initz_routine();
+            return nullptr;
+          },
+          nullptr)) {
+    __kmp_fatal(KMP_MSG(CantRegisterNewThread));
+  }
+}
+
+void __kmp_unshackled_threads_initz_wait() {
+  // The initial thread waits here for the initialization to complete. The
+  // condition variable is notified by the master thread of the unshackled team
+  if (pthread_cond_wait(&__kmp_unshackled_threads_initz_cond_var,
+                        &__kmp_unshackled_threads_initz_lock)) {
+    __kmp_fatal(KMP_MSG(CantRegisterNewThread));
+  }
+}
+
+void __kmp_unshackled_initz_release() {
+  // After all initialization, reset __kmp_init_unshackled_threads to FALSE
+  __kmp_init_unshackled_threads = FALSE;
+
+  // Notify the initial thread
+  if (pthread_cond_signal(&__kmp_unshackled_threads_initz_cond_var)) {
+    __kmp_fatal(KMP_MSG(CantRegisterNewThread));
+  }
+}
+
+void __kmp_unshackled_master_thread_wait() {
+  // The master thread of the unshackled team is blocked here. The condition
+  // variable can only be signaled in the destructor of the RTL
+  if (pthread_cond_wait(&__kmp_unshackled_master_thread_cond_var,
+                        &__kmp_unshackled_master_thread_lock)) {
+    __kmp_fatal(KMP_MSG(CantRegisterNewThread));
+  }
+}
+
+void __kmp_unshackled_worker_thread_signal() {
+  if (sem_post(&__kmp_unshackled_task_sem))
+    __kmp_fatal(KMP_MSG(CantRegisterNewThread));
+}
+
+#endif
+
 // end of file //
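Note on the deque selection: __kmp_push_task redirects an unshackled task to the deque of a shadow thread chosen by the KMP_UNSHACKLED_THREAD and KMP_GTID_TO_SHADOW_GTID macros added to kmp.h. Below is a minimal standalone sketch of that arithmetic; the capacity and thread-count constants are made-up placeholders for the runtime globals (__kmp_threads_capacity, __kmp_unshackled_threads_num), and only the mapping itself is taken from the patch.

#include <cstdio>

// Placeholder values standing in for the runtime globals (assumptions).
static const int kThreadsCapacity = 64;     // __kmp_threads_capacity
static const int kUnshackledThreadsNum = 8; // __kmp_unshackled_threads_num

// Mirrors KMP_UNSHACKLED_THREAD: gtids in the doubled upper half of the
// __kmp_threads array belong to unshackled threads.
static bool is_unshackled_gtid(int gtid) { return gtid >= kThreadsCapacity; }

// Mirrors KMP_GTID_TO_SHADOW_GTID: round-robin over the worker slots,
// skipping slot 0, which is reserved for the unshackled master thread.
static int gtid_to_shadow_gtid(int gtid) {
  return gtid % (kUnshackledThreadsNum - 1) + 1;
}

int main() {
  // Each encountering gtid lands on one of the worker slots 1..7, so
  // unshackled tasks pushed by different threads spread across the deques.
  for (int gtid = 0; gtid < 10; ++gtid)
    std::printf("gtid %2d -> shadow slot %d%s\n", gtid,
                gtid_to_shadow_gtid(gtid),
                is_unshackled_gtid(gtid) ? " (unshackled gtid)" : "");
  return 0;
}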
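Note on the initialization handshake: the helpers added to z_Linux_util.cpp block the initial thread in __kmp_unshackled_threads_initz_wait until __kmp_unshackled_initz_release signals the condition variable, but as written they call pthread_cond_wait without initializing or locking the corresponding mutex, which POSIX does not allow. The sketch below shows the same handshake in the conventional form (mutex held around both the wait and the signal, and a predicate guarding against spurious wakeups). All names here are illustrative stand-ins, not runtime symbols; compile with -pthread.

#include <cstdio>
#include <pthread.h>

// Illustrative handshake state (assumed names, not the patch's globals).
static pthread_mutex_t initz_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t initz_cond_var = PTHREAD_COND_INITIALIZER;
static bool initz_done = false; // predicate guarding against spurious wakeups

// Counterpart of __kmp_unshackled_threads_initz_wait: the initial thread
// blocks until the unshackled master thread finishes initialization.
static void initz_wait() {
  pthread_mutex_lock(&initz_lock);
  while (!initz_done)
    pthread_cond_wait(&initz_cond_var, &initz_lock);
  pthread_mutex_unlock(&initz_lock);
}

// Counterpart of __kmp_unshackled_initz_release: signal with the lock held.
static void initz_release() {
  pthread_mutex_lock(&initz_lock);
  initz_done = true;
  pthread_cond_signal(&initz_cond_var);
  pthread_mutex_unlock(&initz_lock);
}

static void *master_thread_fn(void *) {
  // ... unshackled team setup would happen here ...
  initz_release();
  return nullptr;
}

int main() {
  pthread_t handle;
  pthread_create(&handle, nullptr, master_thread_fn, nullptr);
  initz_wait(); // returns only after master_thread_fn has signaled
  std::printf("unshackled team initialized\n");
  pthread_join(handle, nullptr);
  return 0;
}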